Commit eedd24b2 authored by R Raaj, committed by Gines

OpenCL (Ubuntu/Windows) + MKL code improved (#452)

OpenCL support added, MKL support improved.
Parent 29b6697d
......@@ -75,6 +75,7 @@ tags
*.vs/
*x64/
3rdparty/windows/caffe/
3rdparty/windows/caffe_opencl/
3rdparty/windows/caffe3rdparty/
3rdparty/windows/opencv/
3rdparty/windows/freeglut/
......
Subproject commit 4b786a77665eea5e9730a1818364e756ff321b53
Subproject commit 569352cc8e6a3ce8e8b03c12c975779235522f19
......@@ -98,18 +98,35 @@ if (WIN32)
endif (${INSTRUCTION_SET} MATCHES "SSE")
endif (WIN32)
# Look for CUDA
find_package(CUDA)
# Look for OpenCL
set(OpenCL_FOUND FALSE)
find_package(OpenCL)
# Set the acceleration library
set(GPU_MODE CUDA CACHE STRING "Select the acceleration GPU library or CPU otherwise.")
# set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA CPU_ONLY)
# # Test for CUDA or OpenCL
# find_package(CUDA)
# if (CUDA_FOUND)
# Display proper options to user
# if (CUDA_FOUND AND OpenCL_FOUND)
# set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA OPENCL CPU_ONLY)
# elseif (CUDA_FOUND)
# set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA CPU_ONLY)
# elseif (OpenCL_FOUND)
# set_property(CACHE GPU_MODE PROPERTY STRINGS OPENCL CPU_ONLY)
# else ()
# set_property(CACHE GPU_MODE PROPERTY STRINGS CPU_ONLY)
# endif ()
# # set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA OPENCL CPU_ONLY)
set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA)
# Code to avoid a crash at compile time if OpenCL is not found
if (NOT OpenCL_FOUND)
set(OpenCL_LIBRARIES "")
endif (NOT OpenCL_FOUND)
# Required for OpenCL on Nvidia graphics cards
if (CUDA_FOUND AND OpenCL_FOUND AND ${CUDA_VERSION_MAJOR} LESS 9)
add_definitions(-DLOWER_CL_VERSION)
endif ()
# Handle desired GPU mode option
if (${GPU_MODE} MATCHES "CUDA")
# OpenPose flags
add_definitions(-DUSE_CUDA)
......@@ -118,14 +135,13 @@ elseif (${GPU_MODE} MATCHES "CPU_ONLY")
# OpenPose flag for Caffe
add_definitions(-DCPU_ONLY)
message(STATUS "Building CPU Only.")
elseif (${GPU_MODE} MATCHES "OPENCL")
# OpenPose flag for Caffe
add_definitions(-DUSE_OPENCL)
add_definitions(-DUSE_GREENTEA)
message(STATUS "Building with OpenCL.")
endif ()
if (${USE_MKL})
# OpenPose flags
add_definitions(-DUSE_MKL)
message(STATUS "Building with MKL support.")
endif (${USE_MKL})
# Intel branch with MKL Support
if (${GPU_MODE} MATCHES "CPU_ONLY")
if (UNIX AND NOT APPLE)
......@@ -133,6 +149,12 @@ if (${GPU_MODE} MATCHES "CPU_ONLY")
endif ()
endif ()
if (${USE_MKL})
# OpenPose flags
add_definitions(-DUSE_MKL)
message(STATUS "Building with MKL support.")
endif (${USE_MKL})
# Set/disable profiler
if (PROFILER_ENABLED)
add_definitions(-DPROFILER_ENABLED)
......@@ -169,6 +191,13 @@ option(BUILD_SHARED_LIBS "Build as shared lib" ON)
# Speed profiler
option(PROFILER_ENABLED "If enabled, OpenPose will be able to print out speed information at runtime." OFF)
# Threads
if (${GPU_MODE} MATCHES "OPENCL")
unset(CMAKE_THREAD_LIBS_INIT CACHE)
find_package(Threads)
else ()
set(CMAKE_THREAD_LIBS_INIT "")
endif ()
### FIND REQUIRED PACKAGES
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
......@@ -282,8 +311,12 @@ if (WIN32)
#TODO Add hashes
download_zip("opencv_310.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 1e5240a64b814b3c0b822f136be78ad7)
download_zip("caffe3rdparty_2017_07_14.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} ec0f800c8fb337e33304f3375bd06a80)
download_zip("caffe_2018_01_18.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 4b8e548cc7ea20abea472950dd5301bd)
download_zip("caffe3rdparty_2018_02_13.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 6653E07E3D2D7226D64FAD2DF0B407FB)
if (${GPU_MODE} MATCHES "OPENCL")
download_zip("caffe_opencl_2018_02_13.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 3ac3e1acf5ee6a4e57920be73053067a)
else ()
download_zip("caffe_2018_01_18.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 4b8e548cc7ea20abea472950dd5301bd)
endif ()
if (WITH_3D_RENDERER)
download_zip("freeglut_2018_01_14.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} BB182187285E06880F0EDE3A39530091)
endif (WITH_3D_RENDERER)
......@@ -295,8 +328,35 @@ if (WIN32)
find_library(GLOG_LIBRARY_RELEASE glog HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib)
find_library(GLOG_LIBRARY_DEBUG glogd HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib)
find_library(OpenCV_LIBS opencv_world310 HINTS ${FIND_LIB_PREFIX}/opencv/x64/vc14/lib)
find_library(Caffe_LIB caffe HINTS ${FIND_LIB_PREFIX}/caffe/lib)
find_library(Caffe_Proto_LIB caffeproto HINTS ${FIND_LIB_PREFIX}/caffe/lib)
if (${GPU_MODE} MATCHES "OPENCL")
SET(CAFFE_CL_CHECKED True CACHE BOOL "")
unset(Caffe_LIB CACHE)
unset(Caffe_Proto_LIB CACHE)
set(CL "_CL")
find_library(Caffe_LIB caffe HINTS ${FIND_LIB_PREFIX}/caffe_opencl/lib)
find_library(Caffe_Proto_LIB caffeproto HINTS ${FIND_LIB_PREFIX}/caffe_opencl/lib)
else ()
if (CAFFE_CL_CHECKED)
unset(Caffe_LIB CACHE)
unset(Caffe_Proto_LIB CACHE)
endif ()
set(CAFFE_CL_CHECKED 0)
set(CL "")
find_library(Caffe_LIB caffe HINTS ${FIND_LIB_PREFIX}/caffe/lib)
find_library(Caffe_Proto_LIB caffeproto HINTS ${FIND_LIB_PREFIX}/caffe/lib)
endif ()
if (${GPU_MODE} MATCHES "OPENCL")
unset(BOOST_SYSTEM_LIB_RELEASE CACHE)
unset(BOOST_SYSTEM_LIB_DEBUG CACHE)
find_library(BOOST_SYSTEM_LIB_RELEASE boost_system-vc140-mt-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib)
find_library(BOOST_SYSTEM_LIB_DEBUG boost_system-vc140-mt-gd-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib)
else ()
set(BOOST_SYSTEM_LIB_RELEASE "")
set(BOOST_SYSTEM_LIB_DEBUG "")
endif ()
if (WITH_3D_RENDERER)
find_library(GLUT_LIBRARY freeglut HINTS ${FIND_LIB_PREFIX}/freeglut/lib)
message(STATUS "\${GLUT_LIBRARY} = ${GLUT_LIBRARY}")
......@@ -306,7 +366,17 @@ if (WIN32)
endif (WITH_FLIR_CAMERA)
set(Caffe_LIBS ${Caffe_LIB};${Caffe_Proto_LIB})
set(OpenCV_INCLUDE_DIRS "3rdparty/windows/opencv/include")
set(Caffe_INCLUDE_DIRS "3rdparty/windows/caffe/include;3rdparty/windows/caffe/include2")
if (${GPU_MODE} MATCHES "OPENCL")
SET(CAFFE_CL_CHECKED2 True CACHE BOOL "")
unset(Caffe_INCLUDE_DIRS CACHE)
set(Caffe_INCLUDE_DIRS "3rdparty/windows/caffe_opencl/include;3rdparty/windows/caffe_opencl/include2" CACHE FILEPATH "Caffe_INCLUDE_DIRS")
else ()
if (CAFFE_CL_CHECKED2)
unset(Caffe_INCLUDE_DIRS CACHE)
endif ()
set(CAFFE_CL_CHECKED2 0)
set(Caffe_INCLUDE_DIRS "3rdparty/windows/caffe/include;3rdparty/windows/caffe/include2" CACHE FILEPATH "Caffe_INCLUDE_DIRS")
endif ()
set(Boost_INCLUDE_DIRS "3rdparty/windows/caffe3rdparty/include/boost-1_61")
set(WINDOWS_INCLUDE_DIRS "3rdparty/windows/caffe3rdparty/include")
if (WITH_3D_RENDERER)
......@@ -359,6 +429,7 @@ if (UNIX AND NOT APPLE)
set( MLIST ${rv} )
separate_arguments(MLIST)
list(GET MLIST 0 MKL_PATH)
message(STATUS ${MKL_PATH})
file(GLOB MKL_SO
"${MKL_PATH}lib/*"
)
......@@ -376,6 +447,9 @@ if (UNIX AND NOT APPLE)
endif ()
set(CAFFE_CPU_ONLY ON)
set(USE_CUDNN OFF)
elseif (${GPU_MODE} MATCHES "OPENCL")
execute_process(COMMAND git checkout opencl WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/3rdparty/caffe)
set(USE_CUDNN OFF)
endif ()
# Build Caffe
......@@ -443,7 +517,6 @@ if (UNIX AND NOT APPLE)
endif (UNIX AND NOT APPLE)
### PROJECT INCLUDES
# Specify the include directories
include_directories(
......@@ -452,6 +525,11 @@ include_directories(
${GLOG_INCLUDE_DIR}
${OpenCV_INCLUDE_DIRS})
if(USE_MKL)
include_directories(
"${MKL_PATH}/include/")
endif (USE_MKL)
if (Caffe_FOUND)
include_directories(
${Caffe_INCLUDE_DIRS})
......@@ -461,6 +539,10 @@ if (${GPU_MODE} MATCHES "CUDA")
include_directories(
${CUDA_INCLUDE_DIRS})
endif ()
if (${GPU_MODE} MATCHES "OPENCL")
include_directories(
${OpenCL_INCLUDE_DIRS})
endif ()
# 3D
if (WITH_3D_RENDERER)
include_directories(${GLUT_INCLUDE_DIRS})
......
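To clarify how the `GPU_MODE` option handled in the CMake hunks above reaches the sources: the build defines `USE_CUDA`, `USE_OPENCL`/`USE_GREENTEA`, or `CPU_ONLY`, and the C++ code dispatches on those macros. Below is a minimal sketch, not part of this commit; the helper name `setCaffeBackend` is hypothetical, and it simply mirrors the dispatch added to `netCaffe.cpp` further down in this diff.

```cpp
// Illustrative sketch only (not part of this commit): how the USE_OPENCL / USE_CUDA /
// CPU_ONLY macros defined by the CMake code above are typically consumed in the sources.
// The helper name setCaffeBackend is hypothetical; it assumes a Caffe-based build.
#include <caffe/caffe.hpp>
#include <openpose/core/common.hpp>

void setCaffeBackend(const int gpuId)
{
    #ifdef USE_OPENCL
        // OpenCL build (Caffe OpenCL branch): GPU mode on the selected OpenCL device
        caffe::Caffe::set_mode(caffe::Caffe::GPU);
        caffe::Caffe::SelectDevice(gpuId, true);
    #elif defined USE_CUDA
        // CUDA build: GPU mode on the selected CUDA device
        caffe::Caffe::set_mode(caffe::Caffe::GPU);
        caffe::Caffe::SetDevice(gpuId);
    #else
        // CPU_ONLY build
        caffe::Caffe::set_mode(caffe::Caffe::CPU);
        UNUSED(gpuId);
    #endif
}
```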
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LocalDebuggerEnvironment>PATH=%PATH%;3rdparty\windows\caffe_opencl\bin\;3rdparty\windows\opencv\x64\vc14\bin;3rdparty\windows\freeglut\bin\;3rdparty\windows\spinnaker\bin\</LocalDebuggerEnvironment>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
<LocalDebuggerWorkingDirectory>..\..\..</LocalDebuggerWorkingDirectory>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LocalDebuggerEnvironment>PATH=%PATH%;3rdparty\windows\caffe_opencl\bin\;3rdparty\windows\opencv\x64\vc14\bin;3rdparty\windows\freeglut\bin\;3rdparty\windows\spinnaker\bin\</LocalDebuggerEnvironment>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
<LocalDebuggerWorkingDirectory>..\..\..</LocalDebuggerWorkingDirectory>
</PropertyGroup>
</Project>
\ No newline at end of file
......@@ -90,13 +90,14 @@ The instructions in this section describe the steps to build OpenPose using CMak
2. [**CUDA 8**](https://developer.nvidia.com/cuda-80-ga2-download-archive):
- Ubuntu: Run `ubuntu/install_cuda.sh` or alternatively download and install it from their website.
- Windows: Install CUDA 8.0 after Visual Studio 2015 is installed to ensure that the CUDA installation generates all the necessary files for VS. If CUDA was already installed, re-install it after installing VS!
- **IMPORTANT**: As of a recent Windows update, you have to download and install the Nvidia [drivers](http://www.nvidia.com/Download/index.aspx) first, and then install CUDA without the Graphics Driver flag, or your system might hang.
3. [**cuDNN 5.1**](https://developer.nvidia.com/cudnn):
- Ubuntu: Run `ubuntu/install_cudnn.sh` or alternatively download and install it from their website.
- Windows (and Ubuntu if installing manually): To install it manually, just unzip it and copy (merge) the contents into the CUDA folder, usually `/usr/local/cuda/` in Ubuntu and `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0` in Windows.
3. Ubuntu - Other prerequisites:
4. Ubuntu - Other prerequisites:
- Caffe prerequisites: By default, OpenPose uses Caffe under the hood. If you have not used Caffe previously, install its dependencies by running `bash ./ubuntu/install_cmake.sh`.
- OpenCV must already be installed on your machine. It can be installed with `apt-get install libopencv-dev`. You can also use your own compiled OpenCV version.
4. Windows - **Microsoft Visual Studio (VS) 2015 Enterprise Update 3**:
5. Windows - **Microsoft Visual Studio (VS) 2015 Enterprise Update 3**:
- If **Visual Studio 2017 Community** is desired, note that we do not officially support it, but it might compile by first [enabling CUDA 8.0 in VS2017](https://stackoverflow.com/questions/43745099/using-cuda-with-visual-studio-2017?answertab=active#tab-top), or by using **VS2017 with CUDA 9** after checking the `.vcxproj` file and changing the necessary paths from CUDA 8 to 9.
- VS 2015 Enterprise Update 1 will give some compiler errors and VS 2015 Community has not been tested.
5. Windows - **Caffe, OpenCV, and Caffe prerequisites**:
......
......@@ -13,11 +13,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${OpenCL_LIBRARIES})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
set(EXAMPLE_FILES
handFromJsonTest.cpp)
handFromJsonTest.cpp
resizeTest.cpp)
foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
......@@ -13,11 +14,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${OpenCL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Tests")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
// ------------------------- OpenPose Resize Layer Testing -------------------------
#include <chrono> // `std::chrono::` functions and classes, e.g. std::chrono::milliseconds
// GFlags: DEFINE_bool, _int32, _int64, _uint64, _double, _string
#include <gflags/gflags.h>
// Allow Google Flags in Ubuntu 14
#ifndef GFLAGS_GFLAGS_H_
namespace gflags = google;
#endif
#include <openpose/headers.hpp>
#include <openpose/core/resizeAndMergeBase.hpp>
#ifdef USE_CUDA
#include <caffe/net.hpp>
#endif
#include <openpose/gpu/cuda.hpp>
DEFINE_string(image_path, "examples/media/COCO_val2014_000000000192.jpg", "Process the desired image.");
cv::Mat gpuResize(cv::Mat& img, cv::Size newSize)
{
#ifdef USE_CUDA
// Upload source to GPU
float* cpuPtr = &img.at<float>(0);
float* gpuPtr;
cudaMallocHost((void **)&gpuPtr, img.size().width * img.size().height * sizeof(float));
cudaMemcpy(gpuPtr, cpuPtr, img.size().width * img.size().height * sizeof(float),
cudaMemcpyHostToDevice);
// Upload destination to GPU
cv::Mat newImg = cv::Mat(newSize,CV_32FC1,cv::Scalar(0));
float* newCpuPtr = &newImg.at<float>(0);
float* newGpuPtr;
cudaMallocHost((void **)&newGpuPtr, newSize.width * newSize.height * sizeof(float));
cudaMemcpy(newGpuPtr, newCpuPtr, newSize.width * newSize.height * sizeof(float),
cudaMemcpyHostToDevice);
std::vector<const float*> sourcePtrs;
sourcePtrs.emplace_back(gpuPtr);
std::array<int, 4> targetSize = {1,1,newImg.size().height,newImg.size().width};
std::array<int, 4> sourceSize = {1,1,img.size().height,img.size().width};
std::vector<std::array<int, 4>> sourceSizes;
sourceSizes.emplace_back(sourceSize);
op::resizeAndMergeGpu(newGpuPtr, sourcePtrs, targetSize, sourceSizes);
cudaMemcpy(newCpuPtr, newGpuPtr, newImg.size().width * newImg.size().height * sizeof(float),
cudaMemcpyDeviceToHost);
cudaFree(gpuPtr);
cudaFree(newGpuPtr);
return newImg;
#else
op::error("OpenPose must be compiled with the `USE_CAFFE` & `USE_CUDA` macro definitions in order to run"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
cv::Mat cpuResize(cv::Mat& img, cv::Size newSize)
{
// Source CPU pointer
float* cpuPtr = &img.at<float>(0);
// Create destination image
cv::Mat newImg = cv::Mat(newSize,CV_32FC1,cv::Scalar(0));
std::vector<const float*> sourcePtrs;
sourcePtrs.emplace_back(cpuPtr);
std::array<int, 4> targetSize = {1,1,newImg.size().height,newImg.size().width};
std::array<int, 4> sourceSize = {1,1,img.size().height,img.size().width};
std::vector<std::array<int, 4>> sourceSizes;
sourceSizes.emplace_back(sourceSize);
op::resizeAndMergeCpu(&newImg.at<float>(0), sourcePtrs, targetSize, sourceSizes);
return newImg;
}
int resizeTest()
{
// Load image as grayscale
cv::Mat img = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_GRAYSCALE);
if(img.empty())
op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
img.convertTo(img, CV_32FC1);
img = cpuResize(img, cv::Size(img.size().width/4,img.size().height/4));
img*=0.005;
cv::Mat gpuImg = gpuResize(img, cv::Size(img.size().width*8,img.size().height*8));
cv::Mat cpuImg = cpuResize(img, cv::Size(img.size().width*8,img.size().height*8));
cv::imshow("gpuImg", gpuImg);
cv::imshow("cpuImg", cpuImg);
op::log("Done");
cv::waitKey(0);
return 0;
}
int main(int argc, char *argv[])
{
// Parsing command line flags
gflags::ParseCommandLineFlags(&argc, &argv, true);
// Running resizeTest
return resizeTest();
}
......@@ -16,11 +16,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY}
${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/AddModule")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
......@@ -14,11 +14,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries(${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries(${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${OpenCL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Pose")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
......@@ -16,11 +16,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Thread")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
......@@ -17,11 +17,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Wrapper")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
......@@ -6,10 +6,17 @@
namespace op
{
template <typename T>
OP_API void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
OP_API void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
template <typename T>
OP_API void nmsGpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
OP_API void nmsGpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
template <typename T>
OP_API void nmsOcl(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
const int gpuID = 0);
}
#endif // OPENPOSE_CORE_NMS_BASE_HPP
......@@ -19,7 +19,7 @@ namespace op
virtual void LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const int maxPeaks, const int outputChannels = -1);
const int maxPeaks, const int outputChannels = -1, const int gpuID = 0);
virtual inline const char* type() const { return "Nms"; }
......@@ -29,6 +29,8 @@ namespace op
virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Forward_ocl(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
......@@ -37,6 +39,7 @@ namespace op
private:
T mThreshold;
int mGpuID;
// PIMPL idiom
// http://www.cppsamples.com/common-tasks/pimpl.html
......
......@@ -16,6 +16,13 @@ namespace op
const std::array<int, 4>& targetSize,
const std::vector<std::array<int, 4>>& sourceSizes,
const std::vector<T>& scaleInputToNetInputs = {1.f});
template <typename T>
OP_API void resizeAndMergeOcl(T* targetPtr, const std::vector<const T*>& sourcePtrs,
const std::array<int, 4>& targetSize,
const std::vector<std::array<int, 4>>& sourceSizes,
const std::vector<T>& scaleInputToNetInputs = {1.f},
const int gpuID = 0);
}
#endif // OPENPOSE_CORE_RESIZE_AND_MERGE_BASE_HPP
......@@ -24,7 +24,8 @@ namespace op
virtual void LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const T netFactor, const T scaleFactor, const bool mergeFirstDimension = true);
const T netFactor, const T scaleFactor, const bool mergeFirstDimension = true,
const int gpuID = 0);
virtual inline const char* type() const { return "ResizeAndMerge"; }
......@@ -34,6 +35,8 @@ namespace op
virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Forward_ocl(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
......@@ -44,6 +47,7 @@ namespace op
std::vector<T> mScaleRatios;
std::vector<std::array<int, 4>> mBottomSizes;
std::array<int, 4> mTopSize;
int mGpuID;
DELETE_COPY(ResizeAndMergeCaffe);
};
......
This diff is collapsed.
......@@ -2,8 +2,8 @@
#define OPENPOSE_GPU_HEADERS_HPP
// gpu module
#include <openpose/gpu/enumClasses.hpp>
#include <openpose/gpu/cuda.hpp>
#include <openpose/gpu/enumClasses.hpp>
#include <openpose/gpu/gpu.hpp>
#endif // OPENPOSE_GPU_HEADERS_HPP
#ifndef OPENPOSE_CORE_OPENCL_HPP
#define OPENPOSE_CORE_OPENCL_HPP
#include <openpose/core/common.hpp>
#define MULTI_LINE_STRING(ARG) #ARG
#define CL_HPP_ENABLE_EXCEPTIONS
#ifdef LOWER_CL_VERSION
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#else
#define CL_HPP_MINIMUM_OPENCL_VERSION 200
#define CL_HPP_TARGET_OPENCL_VERSION 200
#endif
typedef struct _cl_buffer_region cl_buffer_region;
#define CL_DEVICE_TYPE_GPU (1 << 2)
namespace cl
{
class CommandQueue;
class Kernel;
}
// Singleton structure
// https://stackoverflow.com/questions/1008019/c-singleton-design-pattern
namespace op
{
class OP_API OpenCL
{
public:
static std::shared_ptr<OpenCL> getInstance(const int deviceId = 0, const int deviceType = CL_DEVICE_TYPE_GPU,
bool getFromVienna = false);
~OpenCL();
cl::CommandQueue& getQueue();
template <typename T>
bool buildKernelIntoManager(const std::string& kernelName, const std::string& src = "", bool isFile = false);
template <typename T>
cl::Kernel& getKernelFromManager(const std::string& kernelName, const std::string& src = "", bool isFile = false);
template <typename K, typename T>
inline K getKernelFunctorFromManager(const std::string& kernelName, const std::string& src = "", bool isFile = false)
{
return K(getKernelFromManager<T>(kernelName, src, isFile));
}
template <typename T> static void getBufferRegion(cl_buffer_region& region, const int origin, const int size);
static std::string clErrorToString(int err);
static int getTotalGPU();
private:
struct ImplCLManager;
std::unique_ptr<ImplCLManager> upImpl;
OpenCL(const int deviceId, const int deviceType, bool getFromVienna);
DELETE_COPY(OpenCL);
};
}
#endif // OPENPOSE_CORE_OPENCL_HPP
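As a usage illustration of the singleton declared above, the sketch below builds and launches a trivial kernel through the manager. It is a minimal sketch under stated assumptions, not OpenPose code: the kernel name `scaleKernel`, its source, and the helper `scaleOnGpu` are made up for the example, and it assumes OpenPose was built with `USE_OPENCL`.

```cpp
// Minimal sketch, assuming a USE_OPENCL build. The kernel name scaleKernel, its source,
// and the helper scaleOnGpu are illustrative assumptions, not part of this commit.
#include <string>
#include <vector>
#include <openpose/gpu/opencl.hcl>
#include <openpose/gpu/cl2.hpp>

const std::string scaleKernelSrc = MULTI_LINE_STRING(
    __kernel void scaleKernel(__global float* data, const float factor)
    {
        const int i = get_global_id(0);
        data[i] *= factor;
    }
);

void scaleOnGpu(std::vector<float>& data, const float factor, const int gpuId = 0)
{
    // Get (or lazily create) the per-device singleton
    auto openCL = op::OpenCL::getInstance(gpuId, CL_DEVICE_TYPE_GPU);
    // Upload the data to the device
    cl::Buffer buffer(openCL->getQueue(), data.begin(), data.end(), /*readOnly*/ false);
    // Build the kernel through the manager (cached after the first call) and wrap it as a functor
    auto scaleKernel = openCL->getKernelFunctorFromManager<cl::KernelFunctor<cl::Buffer, float>, float>(
        "scaleKernel", scaleKernelSrc);
    // Launch one work-item per element, then read the result back to the host
    scaleKernel(cl::EnqueueArgs(openCL->getQueue(), cl::NDRange(data.size())), buffer, factor);
    cl::copy(openCL->getQueue(), buffer, data.begin(), data.end());
}
```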
......@@ -35,9 +35,12 @@ if(UNIX AND NOT APPLE)
# Windows
elseif (WIN32)
set_property(TARGET openpose PROPERTY DEBUG_POSTFIX d)
target_link_libraries(openpose ${OpenCV_LIBS} ${Caffe_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB}
target_link_libraries(openpose ${OpenCV_LIBS} ${Caffe_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${OpenCL_LIBRARIES}
debug ${GFLAGS_LIBRARY_DEBUG} optimized ${GFLAGS_LIBRARY_RELEASE}
debug ${GLOG_LIBRARY_DEBUG} optimized ${GLOG_LIBRARY_RELEASE})
if (${GPU_MODE} MATCHES "OPENCL")
target_link_libraries(openpose debug ${BOOST_SYSTEM_LIB_DEBUG} optimized ${BOOST_SYSTEM_LIB_RELEASE})
endif (${GPU_MODE} MATCHES "OPENCL")
if (${DL_FRAMEWORK} MATCHES "CAFFE")
target_compile_definitions(openpose PRIVATE BOOST_ALL_NO_LIB)
endif(${DL_FRAMEWORK} MATCHES "CAFFE")
......
set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;mm;CPP;cl)
set(SOURCES_OP_CORE
array.cpp
cvMatToOpInput.cpp
......@@ -13,6 +14,7 @@ set(SOURCES_OP_CORE
netCaffe.cpp
nmsBase.cpp
nmsBase.cu
nmsBaseCL.cpp
nmsCaffe.cpp
opOutputToCvMat.cpp
point.cpp
......@@ -20,6 +22,7 @@ set(SOURCES_OP_CORE
renderer.cpp
resizeAndMergeBase.cpp
resizeAndMergeBase.cu
resizeAndMergeBaseCL.cpp
resizeAndMergeCaffe.cpp
scaleAndSizeExtractor.cpp)
......
......@@ -5,7 +5,13 @@
#include <caffe/net.hpp>
#include <glog/logging.h> // google::InitGoogleLogging
#endif
#include <openpose/gpu/cuda.hpp>
#ifdef USE_CUDA
#include <openpose/gpu/cuda.hpp>
#endif
#ifdef USE_OPENCL
#include <openpose/gpu/opencl.hcl>
#include <openpose/gpu/cl2.hpp>
#endif
#include <openpose/utilities/fileSystem.hpp>
#include <openpose/utilities/standard.hpp>
#include <openpose/core/netCaffe.hpp>
......@@ -14,6 +20,9 @@ namespace op
{
std::mutex sMutexNetCaffe;
std::atomic<bool> sGoogleLoggingInitialized{false};
#ifdef USE_OPENCL
std::atomic<bool> sOpenCLInitialized{false};
#endif
struct NetCaffe::ImplNetCaffe
{
......@@ -53,6 +62,25 @@ namespace op
sGoogleLoggingInitialized = true;
}
}
#ifdef USE_OPENCL
// Initialize OpenCL
if (!sOpenCLInitialized)
{
std::lock_guard<std::mutex> lock{sMutexNetCaffe};
if (!sOpenCLInitialized)
{
caffe::Caffe::set_mode(caffe::Caffe::GPU);
std::vector<int> devices;
const int maxNumberGpu = op::OpenCL::getTotalGPU();
for (auto i = 0; i < maxNumberGpu; i++)
devices.emplace_back(i);
caffe::Caffe::SetDevices(devices);
if (mGpuId >= maxNumberGpu)
error("Unexpected error. Please, notify us.", __LINE__, __FUNCTION__, __FILE__);
sOpenCLInitialized = true;
}
}
#endif
}
#endif
};
......@@ -109,18 +137,27 @@ namespace op
try
{
#ifdef USE_CAFFE
// Initialize net
#ifdef USE_CUDA
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SetDevice(upImpl->mGpuId);
#else
caffe::Caffe::set_mode(caffe::Caffe::CPU);
#endif
upImpl->upCaffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST});
upImpl->upCaffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel);
#ifdef USE_CUDA
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#endif
// Initialize net
#ifdef USE_OPENCL
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SelectDevice(upImpl->mGpuId, true);
upImpl->upCaffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST,
caffe::Caffe::GetDefaultDevice()});
upImpl->upCaffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel);
op::OpenCL::getInstance(upImpl->mGpuId, CL_DEVICE_TYPE_GPU, true);
#else
#ifdef USE_CUDA
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SetDevice(upImpl->mGpuId);
#else
caffe::Caffe::set_mode(caffe::Caffe::CPU);
#endif
upImpl->upCaffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST});
upImpl->upCaffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel);
#ifdef USE_CUDA
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#endif
#endif
// Set spOutputBlob
upImpl->spOutputBlob = upImpl->upCaffeNet->blob_by_name(upImpl->mLastBlobName);
if (upImpl->spOutputBlob == nullptr)
......@@ -159,6 +196,12 @@ namespace op
auto* gpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_gpu_data();
cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), inputData.getVolume() * sizeof(float),
cudaMemcpyHostToDevice);
#elif defined USE_OPENCL
auto* gpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_gpu_data();
cl::Buffer imageBuffer = cl::Buffer((cl_mem)gpuImagePtr, true);
op::OpenCL::getInstance(upImpl->mGpuId)->getQueue().enqueueWriteBuffer(imageBuffer, true, 0,
inputData.getVolume() * sizeof(float),
inputData.getConstPtr());
#else
auto* cpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_cpu_data();
std::copy(inputData.getConstPtr(), inputData.getConstPtr() + inputData.getVolume(), cpuImagePtr);
......
#include <algorithm>
#include <bitset>
#include <numeric>
#include <opencv2/opencv.hpp>
#ifdef USE_OPENCL
#include <openpose/gpu/opencl.hcl>
#include <openpose/gpu/cl2.hpp>
#endif
#include <openpose/core/common.hpp>
#include <openpose/core/nmsBase.hpp>
namespace op
{
#ifdef USE_OPENCL
const std::string nmsOclCommonFunctions = MULTI_LINE_STRING(
void nmsAccuratePeakPosition(__global const Type* sourcePtr, const int peakLocX, const int peakLocY,
const int width, const int height, Type* fx, Type* fy, Type* fscore)
{
Type xAcc = 0.f;
Type yAcc = 0.f;
Type scoreAcc = 0.f;
const int dWidth = 3;
const int dHeight = 3;
for (int dy = -dHeight ; dy <= dHeight ; dy++)
{
const int y = peakLocY + dy;
if (0 <= y && y < height) // Default height = 368
{
for (int dx = -dWidth ; dx <= dWidth ; dx++)
{
const int x = peakLocX + dx;
if (0 <= x && x < width) // Default width = 656
{
const Type score = sourcePtr[y * width + x];
if (score > 0)
{
xAcc += (Type)x*score;
yAcc += (Type)y*score;
scoreAcc += score;
}
}
}
}
}
*fx = xAcc / scoreAcc;
*fy = yAcc / scoreAcc;
*fscore = sourcePtr[peakLocY*width + peakLocX];
}
union DS {
struct {
short x;
short y;
float score;
} ds;
double dbl;
};
);
typedef cl::KernelFunctor<cl::Buffer, cl::Buffer, int, int, float, int> NMSRegisterKernelFunctor;
const std::string nmsRegisterKernel = MULTI_LINE_STRING(
__kernel void nmsRegisterKernel(__global int* kernelPtr, __global const Type* sourcePtr,
const int w, const int h, const Type threshold, const int debug)
{
int x = get_global_id(0);
int y = get_global_id(1);
int index = y*w + x;
if (0 < x && x < (w-1) && 0 < y && y < (h-1))
{
const Type value = sourcePtr[index];
if (value > threshold)
{
const Type topLeft = sourcePtr[(y-1)*w + x-1];
const Type top = sourcePtr[(y-1)*w + x];
const Type topRight = sourcePtr[(y-1)*w + x+1];
const Type left = sourcePtr[ y*w + x-1];
const Type right = sourcePtr[ y*w + x+1];
const Type bottomLeft = sourcePtr[(y+1)*w + x-1];
const Type bottom = sourcePtr[(y+1)*w + x];
const Type bottomRight = sourcePtr[(y+1)*w + x+1];
if (value > topLeft && value > top && value > topRight
&& value > left && value > right
&& value > bottomLeft && value > bottom && value > bottomRight)
{
//Type fx = 0; Type fy = 0; Type fscore = 0;
//nmsAccuratePeakPosition(sourcePtr, x, y, w, h, &fx, &fy, &fscore);
kernelPtr[index] = 1;
//if(debug) printf("%d %d \n", x,y);
}
else
kernelPtr[index] = 0;
}
else
kernelPtr[index] = 0;
}
else if (x == 0 || x == (w-1) || y == 0 || y == (h-1))
kernelPtr[index] = 0;
}
);
typedef cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, int, int, int, int> NMSWriteKernelFunctor;
const std::string nmsWriteKernel = MULTI_LINE_STRING(
__kernel void nmsWriteKernel(__global Type* targetPtr, __global int* kernelPtr, __global const Type* sourcePtr,
const int w, const int h, const int maxPeaks, const int debug)
{
int x = get_global_id(0);
int y = get_global_id(1);
int index = y*w + x;
if (index != 0){
int prev = kernelPtr[index-1];
int curr = kernelPtr[index];
if (curr < maxPeaks)
{
if (prev - curr)
{
Type fx = 0; Type fy = 0; Type fscore = 0;
nmsAccuratePeakPosition(sourcePtr, x, y, w, h, &fx, &fy, &fscore);
//if (debug) printf("C %d %d %d \n", x,y,kernelPtr[index]);
__global Type* output = &targetPtr[curr*3];
output[0] = fx; output[1] = fy; output[2] = fscore;
}
if (index + 1 == w*h)
{
__global Type* output = &targetPtr[0*3];
output[0] = curr;
}
}
else
{
if (index + 1 == w*h)
{
__global Type* output = &targetPtr[0*3];
output[0] = maxPeaks;
}
}
}
}
);
#endif
template <typename T>
void nmsOcl(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const int gpuID)
{
try
{
#ifdef USE_OPENCL
// Security checks
if (sourceSize.empty())
error("sourceSize cannot be empty.", __LINE__, __FUNCTION__, __FILE__);
if (targetSize.empty())
error("targetSize cannot be empty.", __LINE__, __FUNCTION__, __FILE__);
if (threshold < 0 || threshold > 1.0)
error("threshold value invalid.", __LINE__, __FUNCTION__, __FILE__);
//Forward_cpu(bottom, top);
const auto num = sourceSize[0];
const auto height = sourceSize[2];
const auto width = sourceSize[3];
const auto channels = targetSize[1];
const auto targetPeaks = targetSize[2]; // 97
const auto targetPeakVec = targetSize[3]; // 3
const auto imageOffset = height * width;
const auto targetChannelOffset = targetPeaks * targetPeakVec;
// Get Kernel
cl::Buffer sourcePtrBuffer = cl::Buffer((cl_mem)(sourcePtr), true);
cl::Buffer kernelPtrBuffer = cl::Buffer((cl_mem)(kernelPtr), true);
cl::Buffer targetPtrBuffer = cl::Buffer((cl_mem)(targetPtr), true);
auto nmsRegisterKernel = op::OpenCL::getInstance(gpuID)->getKernelFunctorFromManager
<op::NMSRegisterKernelFunctor, T>(
"nmsRegisterKernel",op::nmsOclCommonFunctions + op::nmsRegisterKernel);
auto nmsWriteKernel = op::OpenCL::getInstance(gpuID)->getKernelFunctorFromManager
<op::NMSWriteKernelFunctor, T>(
"nmsWriteKernel",op::nmsOclCommonFunctions + op::nmsWriteKernel);
// log("num_b: " + std::to_string(bottom->shape(0))); // = 1
// log("channel_b: " + std::to_string(bottom->shape(1))); // = 57 = 18 body parts + bkg + 19x2 PAFs
// log("height_b: " + std::to_string(bottom->shape(2))); // = 368 = height
// log("width_b: " + std::to_string(bottom->shape(3))); // = 656 = width
// log("num_t: " + std::to_string(top->shape(0))); // = 1
// log("channel_t: " + std::to_string(top->shape(1))); // = 18 = numberParts
// log("height_t: " + std::to_string(top->shape(2))); // = 97 = maxPeople + 1
// log("width_t: " + std::to_string(top->shape(3))); // = 3 = [x, y, score]
// log("");
// Temp DS
//cv::Mat kernelCPU(cv::Size(width, height),CV_32FC1,cv::Scalar(0));
std::vector<int> kernelCPU(imageOffset);
for (auto n = 0; n < num; n++)
{
for (auto c = 0; c < channels; c++)
{
// log("channel: " + std::to_string(c));
const auto offsetChannel = (n * channels + c);
// CL Data
cl_buffer_region kernelRegion, sourceRegion, targetRegion;
kernelRegion.origin = sizeof(int) * offsetChannel * imageOffset;
kernelRegion.size = sizeof(int) * imageOffset;
cl::Buffer kernelBuffer = kernelPtrBuffer.createSubBuffer(CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION,
&kernelRegion);
op::OpenCL::getBufferRegion<T>(sourceRegion, offsetChannel * imageOffset, imageOffset);
op::OpenCL::getBufferRegion<T>(targetRegion, offsetChannel * targetChannelOffset, targetChannelOffset);
cl::Buffer sourceBuffer = sourcePtrBuffer.createSubBuffer(CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION,
&sourceRegion);
cl::Buffer targetBuffer = targetPtrBuffer.createSubBuffer(CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION,
&targetRegion);
// Run Kernel to get 1-0 map
bool debug = false;
nmsRegisterKernel(cl::EnqueueArgs(op::OpenCL::getInstance(gpuID)->getQueue(), cl::NDRange(width, height)),
kernelBuffer, sourceBuffer, width, height, threshold, debug);
// This is a really bad approach. We need to write a custom accumulator to run on the GPU
// Download it to CPU
op::OpenCL::getInstance(gpuID)->getQueue().enqueueReadBuffer(kernelBuffer, CL_TRUE, 0,
sizeof(int) * width * height, &kernelCPU[0]);
// Compute partial sum in CPU
std::partial_sum(kernelCPU.begin(),kernelCPU.end(),kernelCPU.begin());
// Reupload to GPU
op::OpenCL::getInstance(gpuID)->getQueue().enqueueWriteBuffer(kernelBuffer, CL_TRUE, 0,
sizeof(int) * width * height, &kernelCPU[0]);
// Write Kernel
nmsWriteKernel(cl::EnqueueArgs(op::OpenCL::getInstance(gpuID)->getQueue(), cl::NDRange(width, height)),
targetBuffer, kernelBuffer, sourceBuffer, width, height, targetPeaks-1, debug);
}
}
#else
UNUSED(targetPtr);
UNUSED(kernelPtr);
UNUSED(sourcePtr);
UNUSED(threshold);
UNUSED(targetSize);
UNUSED(sourceSize);
UNUSED(gpuID);
error("OpenPose must be compiled with the `USE_OPENCL` macro definition in order to use this"
" functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
#ifdef USE_OPENCL
catch (const cl::Error& e)
{
error(std::string(e.what()) + " : " + op::OpenCL::clErrorToString(e.err()) + " ID: " +
std::to_string(gpuID), __LINE__, __FUNCTION__, __FILE__);
}
#endif
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
template void nmsOcl(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, int gpuID);
template void nmsOcl(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, int gpuID);
}
......@@ -13,9 +13,15 @@ namespace op
caffe::Blob<int> mKernelBlob;
std::array<int, 4> mBottomSize;
std::array<int, 4> mTopSize;
// Special Kernel for OpenCL NMS
#ifdef USE_OPENCL
std::shared_ptr<caffe::Blob<int>> mKernelBlobT;
#endif
#endif
ImplNmsCaffe(){};
ImplNmsCaffe()
{
}
};
template <typename T>
......@@ -63,7 +69,7 @@ namespace op
template <typename T>
void NmsCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const int maxPeaks, const int outputChannels)
const int maxPeaks, const int outputChannels, const int gpuID)
{
try
{
......@@ -82,6 +88,15 @@ namespace op
topBlob->Reshape(topShape);
upImpl->mKernelBlob.Reshape(bottomShape);
// Special Kernel for OpenCL NMS
#ifdef USE_OPENCL
upImpl->mKernelBlobT = {std::make_shared<caffe::Blob<int>>(1,1,1,1)};
upImpl->mKernelBlobT->Reshape(bottomShape);
// GPU ID
mGpuID = gpuID;
#else
UNUSED(mGpuID);
#endif
// Array sizes
upImpl->mTopSize = std::array<int, 4>{topBlob->shape(0), topBlob->shape(1),
topBlob->shape(2), topBlob->shape(3)};
......@@ -152,6 +167,27 @@ namespace op
}
}
template <typename T>
void NmsCaffe<T>::Forward_ocl(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
{
try
{
#if defined USE_CAFFE && defined USE_OPENCL
nmsOcl(top.at(0)->mutable_gpu_data(), upImpl->mKernelBlobT->mutable_gpu_data(),
bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mGpuID);
#else
UNUSED(bottom);
UNUSED(top);
error("OpenPose must be compiled with the `USE_CAFFE` & `USE_OPENCL` macro definitions in order to run"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
template <typename T>
void NmsCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom)
......
This diff is collapsed.
......@@ -51,7 +51,8 @@ namespace op
const std::vector<caffe::Blob<T>*>& top,
const T netFactor,
const T scaleFactor,
const bool mergeFirstDimension)
const bool mergeFirstDimension,
const int gpuID)
{
try
{
......@@ -81,6 +82,12 @@ namespace op
for (auto i = 0u ; i < mBottomSizes.size() ; i++)
mBottomSizes[i] = std::array<int, 4>{bottom[i]->shape(0), bottom[i]->shape(1),
bottom[i]->shape(2), bottom[i]->shape(3)};
#ifdef USE_OPENCL
// GPU ID
mGpuID = gpuID;
#else
UNUSED(mGpuID);
#endif
#else
UNUSED(bottom);
UNUSED(top);
......@@ -156,6 +163,31 @@ namespace op
}
}
template <typename T>
void ResizeAndMergeCaffe<T>::Forward_ocl(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top)
{
try
{
#if defined USE_CAFFE && defined USE_OPENCL
std::vector<const T*> sourcePtrs(bottom.size());
for (auto i = 0u ; i < sourcePtrs.size() ; i++)
sourcePtrs[i] = bottom[i]->gpu_data();
resizeAndMergeOcl(top.at(0)->mutable_gpu_data(), sourcePtrs, mTopSize, mBottomSizes,
mScaleRatios, mGpuID);
#else
UNUSED(bottom);
UNUSED(top);
error("OpenPose must be compiled with the `USE_CAFFE` & `USE_OPENCL` macro definitions in order to run"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
template <typename T>
void ResizeAndMergeCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top,
const std::vector<bool>& propagate_down,
......
......@@ -17,6 +17,7 @@ namespace op
{
#if defined USE_CAFFE
bool netInitialized;
const int mGpuId;
std::shared_ptr<NetCaffe> spNetCaffe;
std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
std::shared_ptr<MaximumCaffe<float>> spMaximumCaffe;
......@@ -27,6 +28,7 @@ namespace op
ImplFaceExtractorCaffe(const std::string& modelFolder, const int gpuId, const bool enableGoogleLogging) :
netInitialized{false},
mGpuId{gpuId},
spNetCaffe{std::make_shared<NetCaffe>(modelFolder + FACE_PROTOTXT, modelFolder + FACE_TRAINED_MODEL,
gpuId, enableGoogleLogging)},
spResizeAndMergeCaffe{std::make_shared<ResizeAndMergeCaffe<float>>()},
......@@ -78,14 +80,15 @@ namespace op
std::shared_ptr<MaximumCaffe<float>>& maximumCaffe,
boost::shared_ptr<caffe::Blob<float>>& caffeNetOutputBlob,
std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
std::shared_ptr<caffe::Blob<float>>& peaksBlob)
std::shared_ptr<caffe::Blob<float>>& peaksBlob,
const int gpuID)
{
try
{
// HeatMaps extractor blob and layer
const bool mergeFirstDimension = true;
resizeAndMergeCaffe->Reshape({caffeNetOutputBlob.get()}, {heatMapsBlob.get()},
FACE_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension);
FACE_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension, gpuID);
// Pose extractor blob and layer
maximumCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()});
// Cuda check
......@@ -241,7 +244,7 @@ namespace op
upImpl->netInitialized = true;
reshapeFaceExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spMaximumCaffe,
upImpl->spCaffeNetOutputBlob, upImpl->spHeatMapsBlob,
upImpl->spPeaksBlob);
upImpl->spPeaksBlob, upImpl->mGpuId);
}
// 2. Resize heat maps + merge different scales
......@@ -249,6 +252,9 @@ namespace op
upImpl->spResizeAndMergeCaffe->Forward_gpu({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
upImpl->spResizeAndMergeCaffe->Forward_ocl({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
#else
upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
......@@ -259,6 +265,9 @@ namespace op
upImpl->spMaximumCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()},
{upImpl->spPeaksBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
// CPU Version is already very fast (4ms) and data is sent to connectKeypoints as CPU for now anyway
upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
#else
upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()},
{upImpl->spPeaksBlob.get()});
......
set(SOURCES_OP_GPU
cuda.cpp
gpu.cpp)
gpu.cpp
opencl.cpp)
include(${CMAKE_SOURCE_DIR}/cmake/Utils.cmake)
prepend(SOURCES_OP_GPU_WITH_CP ${CMAKE_CURRENT_SOURCE_DIR} ${SOURCES_OP_GPU})
......
......@@ -2,7 +2,7 @@
#include <openpose/gpu/cuda.hpp>
#endif
#ifdef USE_OPENCL
#include <openpose/core/clManager.hpp>
#include <openpose/gpu/opencl.hcl>
#endif
#include <openpose/gpu/gpu.hpp>
......@@ -15,7 +15,7 @@ namespace op
#ifdef USE_CUDA
return getCudaGpuNumber();
#elif defined USE_OPENCL
return CLManager::getTotalGPU();
return OpenCL::getTotalGPU();
#else
error("OpenPose must be compiled with the `USE_CUDA` or `USE_OPENCL` macro definition in order to use"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
......
This diff is collapsed.
......@@ -18,6 +18,7 @@ namespace op
{
#if defined USE_CAFFE
bool netInitialized;
const int mGpuId;
std::shared_ptr<NetCaffe> spNetCaffe;
std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
std::shared_ptr<MaximumCaffe<float>> spMaximumCaffe;
......@@ -29,6 +30,7 @@ namespace op
ImplHandExtractorCaffe(const std::string& modelFolder, const int gpuId,
const bool enableGoogleLogging) :
netInitialized{false},
mGpuId{gpuId},
spNetCaffe{std::make_shared<NetCaffe>(modelFolder + HAND_PROTOTXT, modelFolder + HAND_TRAINED_MODEL,
gpuId, enableGoogleLogging)},
spResizeAndMergeCaffe{std::make_shared<ResizeAndMergeCaffe<float>>()},
......@@ -164,14 +166,15 @@ namespace op
std::shared_ptr<MaximumCaffe<float>>& maximumCaffe,
boost::shared_ptr<caffe::Blob<float>>& caffeNetOutputBlob,
std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
std::shared_ptr<caffe::Blob<float>>& peaksBlob)
std::shared_ptr<caffe::Blob<float>>& peaksBlob,
const int gpuID)
{
try
{
// HeatMaps extractor blob and layer
const bool mergeFirstDimension = true;
resizeAndMergeCaffe->Reshape({caffeNetOutputBlob.get()}, {heatMapsBlob.get()},
HAND_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension);
HAND_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension, gpuID);
// Pose extractor blob and layer
maximumCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()});
// Cuda check
......@@ -411,7 +414,7 @@ namespace op
upImpl->netInitialized = true;
reshapeFaceExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spMaximumCaffe,
upImpl->spCaffeNetOutputBlob, upImpl->spHeatMapsBlob,
upImpl->spPeaksBlob);
upImpl->spPeaksBlob, upImpl->mGpuId);
}
// 2. Resize heat maps + merge different scales
......@@ -419,6 +422,9 @@ namespace op
upImpl->spResizeAndMergeCaffe->Forward_gpu({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
upImpl->spResizeAndMergeCaffe->Forward_ocl({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
#else
upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
......@@ -428,6 +434,9 @@ namespace op
#ifdef USE_CUDA
upImpl->spMaximumCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
// CPU Version is already very fast (4ms) and data is sent to connectKeypoints as CPU for now anyway
upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
#else
upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
#endif
......
......@@ -5,6 +5,9 @@
#include <openpose/core/nmsCaffe.hpp>
#include <openpose/core/resizeAndMergeCaffe.hpp>
#include <openpose/gpu/cuda.hpp>
#ifdef USE_OPENCL
#include <openpose/gpu/opencl.hcl>
#endif
#include <openpose/pose/bodyPartConnectorCaffe.hpp>
#include <openpose/pose/poseParameters.hpp>
#include <openpose/utilities/check.hpp>
......@@ -75,7 +78,8 @@ namespace op
std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
std::shared_ptr<caffe::Blob<float>>& peaksBlob,
const float scaleInputToNetInput,
const PoseModel poseModel)
const PoseModel poseModel,
const int gpuID)
{
try
{
......@@ -83,10 +87,11 @@ namespace op
// Caffe modifies bottom - Heatmap gets resized
const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob);
resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, {heatMapsBlob.get()},
getPoseNetDecreaseFactor(poseModel), 1.f/scaleInputToNetInput);
getPoseNetDecreaseFactor(poseModel), 1.f/scaleInputToNetInput, true,
gpuID);
// Pose extractor blob and layer
nmsCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()}, getPoseMaxPeaks(poseModel),
getPoseNumberBodyParts(poseModel));
getPoseNumberBodyParts(poseModel), gpuID);
// Pose extractor blob and layer
bodyPartConnectorCaffe->Reshape({heatMapsBlob.get(), peaksBlob.get()});
// Cuda check
......@@ -239,7 +244,7 @@ namespace op
reshapePoseExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spNmsCaffe,
upImpl->spBodyPartConnectorCaffe, upImpl->spCaffeNetOutputBlobs,
upImpl->spHeatMapsBlob, upImpl->spPeaksBlob,
1.f, upImpl->mPoseModel);
1.f, upImpl->mPoseModel, upImpl->mGpuId);
// scaleInputToNetInputs[i], upImpl->mPoseModel);
}
}
......@@ -248,8 +253,13 @@ namespace op
const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spCaffeNetOutputBlobs);
const std::vector<float> floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end());
upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios);
#ifdef USE_CUDA
//upImpl->spResizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~20ms
upImpl->spResizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~5ms
#elif USE_OPENCL
//upImpl->spResizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~20ms
upImpl->spResizeAndMergeCaffe->Forward_ocl(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()});
#else
upImpl->spResizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~20ms
#endif
......@@ -257,8 +267,12 @@ namespace op
// 3. Get peaks by Non-Maximum Suppression
upImpl->spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold));
#ifdef USE_CUDA
//upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
upImpl->spNmsCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});// ~2ms
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
//upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
upImpl->spNmsCaffe->Forward_ocl({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
#else
upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
#endif
......