Commit eedd24b2 authored by R Raaj, committed by Gines

OpenCL (Ubuntu/Windows) + MKL code improved (#452)

OpenCL support added, MKL support improved.
Parent 29b6697d
......@@ -75,6 +75,7 @@ tags
*.vs/
*x64/
3rdparty/windows/caffe/
3rdparty/windows/caffe_opencl/
3rdparty/windows/caffe3rdparty/
3rdparty/windows/opencv/
3rdparty/windows/freeglut/
......
Subproject commit 4b786a77665eea5e9730a1818364e756ff321b53
Subproject commit 569352cc8e6a3ce8e8b03c12c975779235522f19
......@@ -98,18 +98,35 @@ if (WIN32)
endif (${INSTRUCTION_SET} MATCHES "SSE")
endif (WIN32)
# Look for CUDA
find_package(CUDA)
# Look for OpenCL
set(OpenCL_FOUND FALSE)
find_package(OpenCL)
# Set the acceleration library
set(GPU_MODE CUDA CACHE STRING "Select the acceleration GPU library or CPU otherwise.")
# set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA CPU_ONLY)
# # Test for CUDA or OpenCL
# find_package(CUDA)
# if (CUDA_FOUND)
# Display proper options to user
# if (CUDA_FOUND AND OpenCL_FOUND)
# set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA OPENCL CPU_ONLY)
# elseif (CUDA_FOUND)
# set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA CPU_ONLY)
# elseif (OpenCL_FOUND)
# set_property(CACHE GPU_MODE PROPERTY STRINGS OPENCL CPU_ONLY)
# else ()
# set_property(CACHE GPU_MODE PROPERTY STRINGS CPU_ONLY)
# endif ()
# # set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA OPENCL CPU_ONLY)
set_property(CACHE GPU_MODE PROPERTY STRINGS CUDA)
# Code to avoid a crash at compile time if OpenCL is not found
if (NOT OpenCL_FOUND)
set(OpenCL_LIBRARIES "")
endif (NOT OpenCL_FOUND)
# Required for OpenCL on Nvidia graphics cards
if (CUDA_FOUND AND OpenCL_FOUND AND ${CUDA_VERSION_MAJOR} LESS 9)
add_definitions(-DLOWER_CL_VERSION)
endif ()
# Handle desired GPU mode option
if (${GPU_MODE} MATCHES "CUDA")
# OpenPose flags
add_definitions(-DUSE_CUDA)
......@@ -118,14 +135,13 @@ elseif (${GPU_MODE} MATCHES "CPU_ONLY")
# OpenPose flag for Caffe
add_definitions(-DCPU_ONLY)
message(STATUS "Building CPU Only.")
elseif (${GPU_MODE} MATCHES "OPENCL")
# OpenPose flag for Caffe
add_definitions(-DUSE_OPENCL)
add_definitions(-DUSE_GREENTEA)
message(STATUS "Building with OpenCL.")
endif ()
if (${USE_MKL})
# OpenPose flags
add_definitions(-DUSE_MKL)
message(STATUS "Building with MKL support.")
endif (${USE_MKL})
# Intel branch with MKL Support
if (${GPU_MODE} MATCHES "CPU_ONLY")
if (UNIX AND NOT APPLE)
......@@ -133,6 +149,12 @@ if (${GPU_MODE} MATCHES "CPU_ONLY")
endif ()
endif ()
if (${USE_MKL})
# OpenPose flags
add_definitions(-DUSE_MKL)
message(STATUS "Building with MKL support.")
endif (${USE_MKL})
# Set/disable profiler
if (PROFILER_ENABLED)
add_definitions(-DPROFILER_ENABLED)
......@@ -169,6 +191,13 @@ option(BUILD_SHARED_LIBS "Build as shared lib" ON)
# Speed profiler
option(PROFILER_ENABLED "If enabled, OpenPose will be able to print out speed information at runtime." OFF)
# Threads
if (${GPU_MODE} MATCHES "OPENCL")
unset(CMAKE_THREAD_LIBS_INIT CACHE)
find_package(Threads)
else ()
set(CMAKE_THREAD_LIBS_INIT "")
endif ()
### FIND REQUIRED PACKAGES
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
......@@ -282,8 +311,12 @@ if (WIN32)
#TODO Add hashes
download_zip("opencv_310.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 1e5240a64b814b3c0b822f136be78ad7)
download_zip("caffe3rdparty_2017_07_14.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} ec0f800c8fb337e33304f3375bd06a80)
download_zip("caffe_2018_01_18.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 4b8e548cc7ea20abea472950dd5301bd)
download_zip("caffe3rdparty_2018_02_13.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 6653E07E3D2D7226D64FAD2DF0B407FB)
if (${GPU_MODE} MATCHES "OPENCL")
download_zip("caffe_opencl_2018_02_13.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 3ac3e1acf5ee6a4e57920be73053067a)
else ()
download_zip("caffe_2018_01_18.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} 4b8e548cc7ea20abea472950dd5301bd)
endif ()
if (WITH_3D_RENDERER)
download_zip("freeglut_2018_01_14.zip" ${OP_WIN_URL} ${FIND_LIB_PREFIX} BB182187285E06880F0EDE3A39530091)
endif (WITH_3D_RENDERER)
......@@ -295,8 +328,35 @@ if (WIN32)
find_library(GLOG_LIBRARY_RELEASE glog HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib)
find_library(GLOG_LIBRARY_DEBUG glogd HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib)
find_library(OpenCV_LIBS opencv_world310 HINTS ${FIND_LIB_PREFIX}/opencv/x64/vc14/lib)
find_library(Caffe_LIB caffe HINTS ${FIND_LIB_PREFIX}/caffe/lib)
find_library(Caffe_Proto_LIB caffeproto HINTS ${FIND_LIB_PREFIX}/caffe/lib)
if (${GPU_MODE} MATCHES "OPENCL")
SET(CAFFE_CL_CHECKED True CACHE BOOL "")
unset(Caffe_LIB CACHE)
unset(Caffe_Proto_LIB CACHE)
set(CL "_CL")
find_library(Caffe_LIB caffe HINTS ${FIND_LIB_PREFIX}/caffe_opencl/lib)
find_library(Caffe_Proto_LIB caffeproto HINTS ${FIND_LIB_PREFIX}/caffe_opencl/lib)
else ()
if (CAFFE_CL_CHECKED)
unset(Caffe_LIB CACHE)
unset(Caffe_Proto_LIB CACHE)
endif ()
set(CAFFE_CL_CHECKED 0)
set(CL "")
find_library(Caffe_LIB caffe HINTS ${FIND_LIB_PREFIX}/caffe/lib)
find_library(Caffe_Proto_LIB caffeproto HINTS ${FIND_LIB_PREFIX}/caffe/lib)
endif ()
if (${GPU_MODE} MATCHES "OPENCL")
unset(BOOST_SYSTEM_LIB_RELEASE CACHE)
unset(BOOST_SYSTEM_LIB_DEBUG CACHE)
find_library(BOOST_SYSTEM_LIB_RELEASE boost_system-vc140-mt-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib)
find_library(BOOST_SYSTEM_LIB_DEBUG boost_system-vc140-mt-gd-1_61 HINTS ${FIND_LIB_PREFIX}/caffe3rdparty/lib)
else ()
set(BOOST_SYSTEM_LIB_RELEASE "")
set(BOOST_SYSTEM_LIB_DEBUG "")
endif ()
if (WITH_3D_RENDERER)
find_library(GLUT_LIBRARY freeglut HINTS ${FIND_LIB_PREFIX}/freeglut/lib)
message(STATUS "\${GLUT_LIBRARY} = ${GLUT_LIBRARY}")
......@@ -306,7 +366,17 @@ if (WIN32)
endif (WITH_FLIR_CAMERA)
set(Caffe_LIBS ${Caffe_LIB};${Caffe_Proto_LIB})
set(OpenCV_INCLUDE_DIRS "3rdparty/windows/opencv/include")
set(Caffe_INCLUDE_DIRS "3rdparty/windows/caffe/include;3rdparty/windows/caffe/include2")
if (${GPU_MODE} MATCHES "OPENCL")
SET(CAFFE_CL_CHECKED2 True CACHE BOOL "")
unset(Caffe_INCLUDE_DIRS CACHE)
set(Caffe_INCLUDE_DIRS "3rdparty/windows/caffe_opencl/include;3rdparty/windows/caffe_opencl/include2" CACHE FILEPATH "Caffe_INCLUDE_DIRS")
else ()
if (CAFFE_CL_CHECKED2)
unset(Caffe_INCLUDE_DIRS CACHE)
endif ()
set(CAFFE_CL_CHECKED2 0)
set(Caffe_INCLUDE_DIRS "3rdparty/windows/caffe/include;3rdparty/windows/caffe/include2" CACHE FILEPATH "Caffe_INCLUDE_DIRS")
endif ()
set(Boost_INCLUDE_DIRS "3rdparty/windows/caffe3rdparty/include/boost-1_61")
set(WINDOWS_INCLUDE_DIRS "3rdparty/windows/caffe3rdparty/include")
if (WITH_3D_RENDERER)
......@@ -359,6 +429,7 @@ if (UNIX AND NOT APPLE)
set( MLIST ${rv} )
separate_arguments(MLIST)
list(GET MLIST 0 MKL_PATH)
message(STATUS ${MKL_PATH})
file(GLOB MKL_SO
"${MKL_PATH}lib/*"
)
......@@ -376,6 +447,9 @@ if (UNIX AND NOT APPLE)
endif ()
set(CAFFE_CPU_ONLY ON)
set(USE_CUDNN OFF)
elseif (${GPU_MODE} MATCHES "OPENCL")
execute_process(COMMAND git checkout opencl WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/3rdparty/caffe)
set(USE_CUDNN OFF)
endif ()
# Build Caffe
......@@ -443,7 +517,6 @@ if (UNIX AND NOT APPLE)
endif (UNIX AND NOT APPLE)
### PROJECT INCLUDES
# Specify the include directories
include_directories(
......@@ -452,6 +525,11 @@ include_directories(
${GLOG_INCLUDE_DIR}
${OpenCV_INCLUDE_DIRS})
if(USE_MKL)
include_directories(
"${MKL_PATH}/include/")
endif (USE_MKL)
if (Caffe_FOUND)
include_directories(
${Caffe_INCLUDE_DIRS})
......@@ -461,6 +539,10 @@ if (${GPU_MODE} MATCHES "CUDA")
include_directories(
${CUDA_INCLUDE_DIRS})
endif ()
if (${GPU_MODE} MATCHES "OPENCL")
include_directories(
${OpenCL_INCLUDE_DIRS})
endif ()
# 3D
if (WITH_3D_RENDERER)
include_directories(${GLUT_INCLUDE_DIRS})
......
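To clarify how the `GPU_MODE` option handled in the CMake hunks above reaches the sources: the build defines `USE_CUDA`, `USE_OPENCL`/`USE_GREENTEA`, or `CPU_ONLY`, and the C++ code dispatches on those macros. Below is a minimal sketch, not part of this commit; the helper name `setCaffeBackend` is hypothetical, and it simply mirrors the dispatch added to `netCaffe.cpp` further down in this diff.

```cpp
// Illustrative sketch only (not part of this commit): how the USE_OPENCL / USE_CUDA /
// CPU_ONLY macros defined by the CMake code above are typically consumed in the sources.
// The helper name setCaffeBackend is hypothetical; it assumes a Caffe-based build.
#include <caffe/caffe.hpp>
#include <openpose/core/common.hpp>

void setCaffeBackend(const int gpuId)
{
    #ifdef USE_OPENCL
        // OpenCL build (Caffe OpenCL branch): GPU mode on the selected OpenCL device
        caffe::Caffe::set_mode(caffe::Caffe::GPU);
        caffe::Caffe::SelectDevice(gpuId, true);
    #elif defined USE_CUDA
        // CUDA build: GPU mode on the selected CUDA device
        caffe::Caffe::set_mode(caffe::Caffe::GPU);
        caffe::Caffe::SetDevice(gpuId);
    #else
        // CPU_ONLY build
        caffe::Caffe::set_mode(caffe::Caffe::CPU);
        UNUSED(gpuId);
    #endif
}
```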
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LocalDebuggerEnvironment>PATH=%PATH%;3rdparty\windows\caffe_opencl\bin\;3rdparty\windows\opencv\x64\vc14\bin;3rdparty\windows\freeglut\bin\;3rdparty\windows\spinnaker\bin\</LocalDebuggerEnvironment>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
<LocalDebuggerWorkingDirectory>..\..\..</LocalDebuggerWorkingDirectory>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LocalDebuggerEnvironment>PATH=%PATH%;3rdparty\windows\caffe_opencl\bin\;3rdparty\windows\opencv\x64\vc14\bin;3rdparty\windows\freeglut\bin\;3rdparty\windows\spinnaker\bin\</LocalDebuggerEnvironment>
<DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
<LocalDebuggerWorkingDirectory>..\..\..</LocalDebuggerWorkingDirectory>
</PropertyGroup>
</Project>
\ No newline at end of file
......@@ -90,13 +90,14 @@ The instructions in this section describe the steps to build OpenPose using CMak
2. [**CUDA 8**](https://developer.nvidia.com/cuda-80-ga2-download-archive):
- Ubuntu: Run `ubuntu/install_cuda.sh` or alternatively download and install it from their website.
- Windows: Install CUDA 8.0 after Visual Studio 2015 is installed to ensure that the CUDA installation generates all the necessary files for VS. If CUDA was already installed, re-install it after installing VS!
- **IMPORTANT**: As of a recent Windows update, you have to download and install the Nvidia [drivers](http://www.nvidia.com/Download/index.aspx) first, and then install CUDA without the Graphics Driver flag, or your system might hang.
3. [**cuDNN 5.1**](https://developer.nvidia.com/cudnn):
- Ubuntu: Run `ubuntu/install_cudnn.sh` or alternatively download and install it from their website.
- Windows (and Ubuntu if installing manually): To install it manually, just unzip it and copy (merge) the contents into the CUDA folder, usually `/usr/local/cuda/` in Ubuntu and `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v8.0` in Windows.
3. Ubuntu - Other prerequisites:
4. Ubuntu - Other prerequisites:
- Caffe prerequisites: By default, OpenPose uses Caffe under the hood. If you have not used Caffe previously, install its dependencies by running `bash ./ubuntu/install_cmake.sh`.
- OpenCV must already be installed on your machine. It can be installed with `apt-get install libopencv-dev`. You can also use your own compiled OpenCV version.
4. Windows - **Microsoft Visual Studio (VS) 2015 Enterprise Update 3**:
5. Windows - **Microsoft Visual Studio (VS) 2015 Enterprise Update 3**:
- If **Visual Studio 2017 Community** is desired, note that we do not officially support it, but it might compile by first [enabling CUDA 8.0 in VS2017](https://stackoverflow.com/questions/43745099/using-cuda-with-visual-studio-2017?answertab=active#tab-top), or by using **VS2017 with CUDA 9** after checking the `.vcxproj` file and changing the necessary paths from CUDA 8 to 9.
- VS 2015 Enterprise Update 1 will give some compiler errors and VS 2015 Community has not been tested.
5. Windows - **Caffe, OpenCV, and Caffe prerequisites**:
......
......@@ -13,11 +13,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${OpenCL_LIBRARIES})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
set(EXAMPLE_FILES
handFromJsonTest.cpp)
handFromJsonTest.cpp
resizeTest.cpp)
foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
......@@ -13,11 +14,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${OpenCL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Tests")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
// ------------------------- OpenPose Resize Layer Testing -------------------------
#include <chrono> // `std::chrono::` functions and classes, e.g. std::chrono::milliseconds
// GFlags: DEFINE_bool, _int32, _int64, _uint64, _double, _string
#include <gflags/gflags.h>
// Allow Google Flags in Ubuntu 14
#ifndef GFLAGS_GFLAGS_H_
namespace gflags = google;
#endif
#include <openpose/headers.hpp>
#include <openpose/core/resizeAndMergeBase.hpp>
#ifdef USE_CUDA
#include <caffe/net.hpp>
#endif
#include <openpose/gpu/cuda.hpp>
DEFINE_string(image_path, "examples/media/COCO_val2014_000000000192.jpg", "Process the desired image.");
cv::Mat gpuResize(cv::Mat& img, cv::Size newSize)
{
#ifdef USE_CUDA
// Upload source to GPU
float* cpuPtr = &img.at<float>(0);
float* gpuPtr;
cudaMallocHost((void **)&gpuPtr, img.size().width * img.size().height * sizeof(float));
cudaMemcpy(gpuPtr, cpuPtr, img.size().width * img.size().height * sizeof(float),
cudaMemcpyHostToDevice);
// Upload destination to GPU
cv::Mat newImg = cv::Mat(newSize,CV_32FC1,cv::Scalar(0));
float* newCpuPtr = &newImg.at<float>(0);
float* newGpuPtr;
cudaMallocHost((void **)&newGpuPtr, newSize.width * newSize.height * sizeof(float));
cudaMemcpy(newGpuPtr, newCpuPtr, newSize.width * newSize.height * sizeof(float),
cudaMemcpyHostToDevice);
std::vector<const float*> sourcePtrs;
sourcePtrs.emplace_back(gpuPtr);
std::array<int, 4> targetSize = {1,1,newImg.size().height,newImg.size().width};
std::array<int, 4> sourceSize = {1,1,img.size().height,img.size().width};
std::vector<std::array<int, 4>> sourceSizes;
sourceSizes.emplace_back(sourceSize);
op::resizeAndMergeGpu(newGpuPtr, sourcePtrs, targetSize, sourceSizes);
cudaMemcpy(newCpuPtr, newGpuPtr, newImg.size().width * newImg.size().height * sizeof(float),
cudaMemcpyDeviceToHost);
cudaFree(gpuPtr);
cudaFree(newGpuPtr);
return newImg;
#else
op::error("OpenPose must be compiled with the `USE_CAFFE` & `USE_CUDA` macro definitions in order to run"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
cv::Mat cpuResize(cv::Mat& img, cv::Size newSize)
{
// Source CPU pointer
float* cpuPtr = &img.at<float>(0);
// Create destination image
cv::Mat newImg = cv::Mat(newSize,CV_32FC1,cv::Scalar(0));
std::vector<const float*> sourcePtrs;
sourcePtrs.emplace_back(cpuPtr);
std::array<int, 4> targetSize = {1,1,newImg.size().height,newImg.size().width};
std::array<int, 4> sourceSize = {1,1,img.size().height,img.size().width};
std::vector<std::array<int, 4>> sourceSizes;
sourceSizes.emplace_back(sourceSize);
op::resizeAndMergeCpu(&newImg.at<float>(0), sourcePtrs, targetSize, sourceSizes);
return newImg;
}
int resizeTest()
{
// Load image as grayscale
cv::Mat img = op::loadImage(FLAGS_image_path, CV_LOAD_IMAGE_GRAYSCALE);
if(img.empty())
op::error("Could not open or find the image: " + FLAGS_image_path, __LINE__, __FUNCTION__, __FILE__);
img.convertTo(img, CV_32FC1);
img = cpuResize(img, cv::Size(img.size().width/4,img.size().height/4));
img*=0.005;
cv::Mat gpuImg = gpuResize(img, cv::Size(img.size().width*8,img.size().height*8));
cv::Mat cpuImg = cpuResize(img, cv::Size(img.size().width*8,img.size().height*8));
cv::imshow("gpuImg", gpuImg);
cv::imshow("cpuImg", cpuImg);
op::log("Done");
cv::waitKey(0);
return 0;
}
int main(int argc, char *argv[])
{
// Parsing command line flags
gflags::ParseCommandLineFlags(&argc, &argv, true);
// Running resizeTest
return resizeTest();
}
......@@ -16,11 +16,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY}
${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/AddModule")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
......@@ -14,11 +14,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries(${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries(${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${OpenCL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Pose")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
......@@ -16,11 +16,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Thread")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
......@@ -17,11 +17,11 @@ foreach(EXAMPLE_FILE ${EXAMPLE_FILES})
message(STATUS "Adding Example ${EXE_NAME}")
add_executable(${EXE_NAME} ${EXAMPLE_FILE})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB})
target_link_libraries( ${EXE_NAME} openpose ${GLOG_LIBRARY} ${GFLAGS_LIBRARY} ${Caffe_LIBS} ${MKL_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
set_property(TARGET ${EXE_NAME} PROPERTY FOLDER "Examples/Tutorial/Wrapper")
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose.vcxproj.user
configure_file(${CMAKE_SOURCE_DIR}/cmake/OpenPose${CL}.vcxproj.user
${CMAKE_CURRENT_BINARY_DIR}/${EXE_NAME}.vcxproj.user @ONLY)
# Properties->General->Output Directory
set_property(TARGET ${EXE_NAME} PROPERTY RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/$(Platform)/$(Configuration))
......
......@@ -6,10 +6,17 @@
namespace op
{
template <typename T>
OP_API void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
OP_API void nmsCpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
template <typename T>
OP_API void nmsGpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold, const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
OP_API void nmsGpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize);
template <typename T>
OP_API void nmsOcl(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize,
const int gpuID = 0);
}
#endif // OPENPOSE_CORE_NMS_BASE_HPP
......@@ -19,7 +19,7 @@ namespace op
virtual void LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const int maxPeaks, const int outputChannels = -1);
const int maxPeaks, const int outputChannels = -1, const int gpuID = 0);
virtual inline const char* type() const { return "Nms"; }
......@@ -29,6 +29,8 @@ namespace op
virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Forward_ocl(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
......@@ -37,6 +39,7 @@ namespace op
private:
T mThreshold;
int mGpuID;
// PIMPL idiom
// http://www.cppsamples.com/common-tasks/pimpl.html
......
......@@ -16,6 +16,13 @@ namespace op
const std::array<int, 4>& targetSize,
const std::vector<std::array<int, 4>>& sourceSizes,
const std::vector<T>& scaleInputToNetInputs = {1.f});
template <typename T>
OP_API void resizeAndMergeOcl(T* targetPtr, const std::vector<const T*>& sourcePtrs,
const std::array<int, 4>& targetSize,
const std::vector<std::array<int, 4>>& sourceSizes,
const std::vector<T>& scaleInputToNetInputs = {1.f},
const int gpuID = 0);
}
#endif // OPENPOSE_CORE_RESIZE_AND_MERGE_BASE_HPP
......@@ -24,7 +24,8 @@ namespace op
virtual void LayerSetUp(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const T netFactor, const T scaleFactor, const bool mergeFirstDimension = true);
const T netFactor, const T scaleFactor, const bool mergeFirstDimension = true,
const int gpuID = 0);
virtual inline const char* type() const { return "ResizeAndMerge"; }
......@@ -34,6 +35,8 @@ namespace op
virtual void Forward_gpu(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Forward_ocl(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top);
virtual void Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom);
......@@ -44,6 +47,7 @@ namespace op
std::vector<T> mScaleRatios;
std::vector<std::array<int, 4>> mBottomSizes;
std::array<int, 4> mTopSize;
int mGpuID;
DELETE_COPY(ResizeAndMergeCaffe);
};
......
This diff is collapsed.
......@@ -2,8 +2,8 @@
#define OPENPOSE_GPU_HEADERS_HPP
// gpu module
#include <openpose/gpu/enumClasses.hpp>
#include <openpose/gpu/cuda.hpp>
#include <openpose/gpu/enumClasses.hpp>
#include <openpose/gpu/gpu.hpp>
#endif // OPENPOSE_GPU_HEADERS_HPP
#ifndef OPENPOSE_CORE_OPENCL_HPP
#define OPENPOSE_CORE_OPENCL_HPP
#include <openpose/core/common.hpp>
#define MULTI_LINE_STRING(ARG) #ARG
#define CL_HPP_ENABLE_EXCEPTIONS
#ifdef LOWER_CL_VERSION
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#else
#define CL_HPP_MINIMUM_OPENCL_VERSION 200
#define CL_HPP_TARGET_OPENCL_VERSION 200
#endif
typedef struct _cl_buffer_region cl_buffer_region;
#define CL_DEVICE_TYPE_GPU (1 << 2)
namespace cl
{
class CommandQueue;
class Kernel;
}
// Singleton structure
// https://stackoverflow.com/questions/1008019/c-singleton-design-pattern
namespace op
{
class OP_API OpenCL
{
public:
static std::shared_ptr<OpenCL> getInstance(const int deviceId = 0, const int deviceType = CL_DEVICE_TYPE_GPU,
bool getFromVienna = false);
~OpenCL();
cl::CommandQueue& getQueue();
template <typename T>
bool buildKernelIntoManager(const std::string& kernelName, const std::string& src = "", bool isFile = false);
template <typename T>
cl::Kernel& getKernelFromManager(const std::string& kernelName, const std::string& src = "", bool isFile = false);
template <typename K, typename T>
inline K getKernelFunctorFromManager(const std::string& kernelName, const std::string& src = "", bool isFile = false)
{
return K(getKernelFromManager<T>(kernelName, src, isFile));
}
template <typename T> static void getBufferRegion(cl_buffer_region& region, const int origin, const int size);
static std::string clErrorToString(int err);
static int getTotalGPU();
private:
struct ImplCLManager;
std::unique_ptr<ImplCLManager> upImpl;
OpenCL(const int deviceId, const int deviceType, bool getFromVienna);
DELETE_COPY(OpenCL);
};
}
#endif // OPENPOSE_CORE_OPENCL_HPP
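As a usage illustration of the singleton declared above, the sketch below builds and launches a trivial kernel through the manager. It is a minimal sketch under stated assumptions, not OpenPose code: the kernel name `scaleKernel`, its source, and the helper `scaleOnGpu` are made up for the example, and it assumes OpenPose was built with `USE_OPENCL`.

```cpp
// Minimal sketch, assuming a USE_OPENCL build. The kernel name scaleKernel, its source,
// and the helper scaleOnGpu are illustrative assumptions, not part of this commit.
#include <string>
#include <vector>
#include <openpose/gpu/opencl.hcl>
#include <openpose/gpu/cl2.hpp>

const std::string scaleKernelSrc = MULTI_LINE_STRING(
    __kernel void scaleKernel(__global float* data, const float factor)
    {
        const int i = get_global_id(0);
        data[i] *= factor;
    }
);

void scaleOnGpu(std::vector<float>& data, const float factor, const int gpuId = 0)
{
    // Get (or lazily create) the per-device singleton
    auto openCL = op::OpenCL::getInstance(gpuId, CL_DEVICE_TYPE_GPU);
    // Upload the data to the device
    cl::Buffer buffer(openCL->getQueue(), data.begin(), data.end(), /*readOnly*/ false);
    // Build the kernel through the manager (cached after the first call) and wrap it as a functor
    auto scaleKernel = openCL->getKernelFunctorFromManager<cl::KernelFunctor<cl::Buffer, float>, float>(
        "scaleKernel", scaleKernelSrc);
    // Launch one work-item per element, then read the result back to the host
    scaleKernel(cl::EnqueueArgs(openCL->getQueue(), cl::NDRange(data.size())), buffer, factor);
    cl::copy(openCL->getQueue(), buffer, data.begin(), data.end());
}
```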
......@@ -35,9 +35,12 @@ if(UNIX AND NOT APPLE)
# Windows
elseif (WIN32)
set_property(TARGET openpose PROPERTY DEBUG_POSTFIX d)
target_link_libraries(openpose ${OpenCV_LIBS} ${Caffe_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB}
target_link_libraries(openpose ${OpenCV_LIBS} ${Caffe_LIBS} ${GLUT_LIBRARY} ${SPINNAKER_LIB} ${OpenCL_LIBRARIES}
debug ${GFLAGS_LIBRARY_DEBUG} optimized ${GFLAGS_LIBRARY_RELEASE}
debug ${GLOG_LIBRARY_DEBUG} optimized ${GLOG_LIBRARY_RELEASE})
if (${GPU_MODE} MATCHES "OPENCL")
target_link_libraries(openpose debug ${BOOST_SYSTEM_LIB_DEBUG} optimized ${BOOST_SYSTEM_LIB_RELEASE})
endif (${GPU_MODE} MATCHES "OPENCL")
if (${DL_FRAMEWORK} MATCHES "CAFFE")
target_compile_definitions(openpose PRIVATE BOOST_ALL_NO_LIB)
endif(${DL_FRAMEWORK} MATCHES "CAFFE")
......
set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;mm;CPP;cl)
set(SOURCES_OP_CORE
array.cpp
cvMatToOpInput.cpp
......@@ -13,6 +14,7 @@ set(SOURCES_OP_CORE
netCaffe.cpp
nmsBase.cpp
nmsBase.cu
nmsBaseCL.cpp
nmsCaffe.cpp
opOutputToCvMat.cpp
point.cpp
......@@ -20,6 +22,7 @@ set(SOURCES_OP_CORE
renderer.cpp
resizeAndMergeBase.cpp
resizeAndMergeBase.cu
resizeAndMergeBaseCL.cpp
resizeAndMergeCaffe.cpp
scaleAndSizeExtractor.cpp)
......
......@@ -5,7 +5,13 @@
#include <caffe/net.hpp>
#include <glog/logging.h> // google::InitGoogleLogging
#endif
#include <openpose/gpu/cuda.hpp>
#ifdef USE_CUDA
#include <openpose/gpu/cuda.hpp>
#endif
#ifdef USE_OPENCL
#include <openpose/gpu/opencl.hcl>
#include <openpose/gpu/cl2.hpp>
#endif
#include <openpose/utilities/fileSystem.hpp>
#include <openpose/utilities/standard.hpp>
#include <openpose/core/netCaffe.hpp>
......@@ -14,6 +20,9 @@ namespace op
{
std::mutex sMutexNetCaffe;
std::atomic<bool> sGoogleLoggingInitialized{false};
#ifdef USE_OPENCL
std::atomic<bool> sOpenCLInitialized{false};
#endif
struct NetCaffe::ImplNetCaffe
{
......@@ -53,6 +62,25 @@ namespace op
sGoogleLoggingInitialized = true;
}
}
#ifdef USE_OPENCL
// Initialize OpenCL
if (!sOpenCLInitialized)
{
std::lock_guard<std::mutex> lock{sMutexNetCaffe};
if (!sOpenCLInitialized)
{
caffe::Caffe::set_mode(caffe::Caffe::GPU);
std::vector<int> devices;
const int maxNumberGpu = op::OpenCL::getTotalGPU();
for (auto i = 0; i < maxNumberGpu; i++)
devices.emplace_back(i);
caffe::Caffe::SetDevices(devices);
if (mGpuId >= maxNumberGpu)
error("Unexpected error. Please, notify us.", __LINE__, __FUNCTION__, __FILE__);
sOpenCLInitialized = true;
}
}
#endif
}
#endif
};
......@@ -109,18 +137,27 @@ namespace op
try
{
#ifdef USE_CAFFE
// Initialize net
#ifdef USE_CUDA
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SetDevice(upImpl->mGpuId);
#else
caffe::Caffe::set_mode(caffe::Caffe::CPU);
#endif
upImpl->upCaffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST});
upImpl->upCaffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel);
#ifdef USE_CUDA
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#endif
// Initialize net
#ifdef USE_OPENCL
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SelectDevice(upImpl->mGpuId, true);
upImpl->upCaffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST,
caffe::Caffe::GetDefaultDevice()});
upImpl->upCaffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel);
op::OpenCL::getInstance(upImpl->mGpuId, CL_DEVICE_TYPE_GPU, true);
#else
#ifdef USE_CUDA
caffe::Caffe::set_mode(caffe::Caffe::GPU);
caffe::Caffe::SetDevice(upImpl->mGpuId);
#else
caffe::Caffe::set_mode(caffe::Caffe::CPU);
#endif
upImpl->upCaffeNet.reset(new caffe::Net<float>{upImpl->mCaffeProto, caffe::TEST});
upImpl->upCaffeNet->CopyTrainedLayersFrom(upImpl->mCaffeTrainedModel);
#ifdef USE_CUDA
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#endif
#endif
// Set spOutputBlob
upImpl->spOutputBlob = upImpl->upCaffeNet->blob_by_name(upImpl->mLastBlobName);
if (upImpl->spOutputBlob == nullptr)
......@@ -159,6 +196,12 @@ namespace op
auto* gpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_gpu_data();
cudaMemcpy(gpuImagePtr, inputData.getConstPtr(), inputData.getVolume() * sizeof(float),
cudaMemcpyHostToDevice);
#elif defined USE_OPENCL
auto* gpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_gpu_data();
cl::Buffer imageBuffer = cl::Buffer((cl_mem)gpuImagePtr, true);
op::OpenCL::getInstance(upImpl->mGpuId)->getQueue().enqueueWriteBuffer(imageBuffer, true, 0,
inputData.getVolume() * sizeof(float),
inputData.getConstPtr());
#else
auto* cpuImagePtr = upImpl->upCaffeNet->blobs().at(0)->mutable_cpu_data();
std::copy(inputData.getConstPtr(), inputData.getConstPtr() + inputData.getVolume(), cpuImagePtr);
......
#include <algorithm>
#include <bitset>
#include <numeric>
#include <opencv2/opencv.hpp>
#ifdef USE_OPENCL
#include <openpose/gpu/opencl.hcl>
#include <openpose/gpu/cl2.hpp>
#endif
#include <openpose/core/common.hpp>
#include <openpose/core/nmsBase.hpp>
namespace op
{
#ifdef USE_OPENCL
const std::string nmsOclCommonFunctions = MULTI_LINE_STRING(
void nmsAccuratePeakPosition(__global const Type* sourcePtr, const int peakLocX, const int peakLocY,
const int width, const int height, Type* fx, Type* fy, Type* fscore)
{
Type xAcc = 0.f;
Type yAcc = 0.f;
Type scoreAcc = 0.f;
const int dWidth = 3;
const int dHeight = 3;
for (int dy = -dHeight ; dy <= dHeight ; dy++)
{
const int y = peakLocY + dy;
if (0 <= y && y < height) // Default height = 368
{
for (int dx = -dWidth ; dx <= dWidth ; dx++)
{
const int x = peakLocX + dx;
if (0 <= x && x < width) // Default width = 656
{
const Type score = sourcePtr[y * width + x];
if (score > 0)
{
xAcc += (Type)x*score;
yAcc += (Type)y*score;
scoreAcc += score;
}
}
}
}
}
*fx = xAcc / scoreAcc;
*fy = yAcc / scoreAcc;
*fscore = sourcePtr[peakLocY*width + peakLocX];
}
union DS {
struct {
short x;
short y;
float score;
} ds;
double dbl;
};
);
typedef cl::KernelFunctor<cl::Buffer, cl::Buffer, int, int, float, int> NMSRegisterKernelFunctor;
const std::string nmsRegisterKernel = MULTI_LINE_STRING(
__kernel void nmsRegisterKernel(__global int* kernelPtr, __global const Type* sourcePtr,
const int w, const int h, const Type threshold, const int debug)
{
int x = get_global_id(0);
int y = get_global_id(1);
int index = y*w + x;
if (0 < x && x < (w-1) && 0 < y && y < (h-1))
{
const Type value = sourcePtr[index];
if (value > threshold)
{
const Type topLeft = sourcePtr[(y-1)*w + x-1];
const Type top = sourcePtr[(y-1)*w + x];
const Type topRight = sourcePtr[(y-1)*w + x+1];
const Type left = sourcePtr[ y*w + x-1];
const Type right = sourcePtr[ y*w + x+1];
const Type bottomLeft = sourcePtr[(y+1)*w + x-1];
const Type bottom = sourcePtr[(y+1)*w + x];
const Type bottomRight = sourcePtr[(y+1)*w + x+1];
if (value > topLeft && value > top && value > topRight
&& value > left && value > right
&& value > bottomLeft && value > bottom && value > bottomRight)
{
//Type fx = 0; Type fy = 0; Type fscore = 0;
//nmsAccuratePeakPosition(sourcePtr, x, y, w, h, &fx, &fy, &fscore);
kernelPtr[index] = 1;
//if(debug) printf("%d %d \n", x,y);
}
else
kernelPtr[index] = 0;
}
else
kernelPtr[index] = 0;
}
else if (x == 0 || x == (w-1) || y == 0 || y == (h-1))
kernelPtr[index] = 0;
}
);
typedef cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, int, int, int, int> NMSWriteKernelFunctor;
const std::string nmsWriteKernel = MULTI_LINE_STRING(
__kernel void nmsWriteKernel(__global Type* targetPtr, __global int* kernelPtr, __global const Type* sourcePtr,
const int w, const int h, const int maxPeaks, const int debug)
{
int x = get_global_id(0);
int y = get_global_id(1);
int index = y*w + x;
if (index != 0){
int prev = kernelPtr[index-1];
int curr = kernelPtr[index];
if (curr < maxPeaks)
{
if (prev - curr)
{
Type fx = 0; Type fy = 0; Type fscore = 0;
nmsAccuratePeakPosition(sourcePtr, x, y, w, h, &fx, &fy, &fscore);
//if (debug) printf("C %d %d %d \n", x,y,kernelPtr[index]);
__global Type* output = &targetPtr[curr*3];
output[0] = fx; output[1] = fy; output[2] = fscore;
}
if (index + 1 == w*h)
{
__global Type* output = &targetPtr[0*3];
output[0] = curr;
}
}
else
{
if (index + 1 == w*h)
{
__global Type* output = &targetPtr[0*3];
output[0] = maxPeaks;
}
}
}
}
);
#endif
template <typename T>
void nmsOcl(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const int gpuID)
{
try
{
#ifdef USE_OPENCL
// Security checks
if (sourceSize.empty())
error("sourceSize cannot be empty.", __LINE__, __FUNCTION__, __FILE__);
if (targetSize.empty())
error("targetSize cannot be empty.", __LINE__, __FUNCTION__, __FILE__);
if (threshold < 0 || threshold > 1.0)
error("threshold value invalid.", __LINE__, __FUNCTION__, __FILE__);
//Forward_cpu(bottom, top);
const auto num = sourceSize[0];
const auto height = sourceSize[2];
const auto width = sourceSize[3];
const auto channels = targetSize[1];
const auto targetPeaks = targetSize[2]; // 97
const auto targetPeakVec = targetSize[3]; // 3
const auto imageOffset = height * width;
const auto targetChannelOffset = targetPeaks * targetPeakVec;
// Get Kernel
cl::Buffer sourcePtrBuffer = cl::Buffer((cl_mem)(sourcePtr), true);
cl::Buffer kernelPtrBuffer = cl::Buffer((cl_mem)(kernelPtr), true);
cl::Buffer targetPtrBuffer = cl::Buffer((cl_mem)(targetPtr), true);
auto nmsRegisterKernel = op::OpenCL::getInstance(gpuID)->getKernelFunctorFromManager
<op::NMSRegisterKernelFunctor, T>(
"nmsRegisterKernel",op::nmsOclCommonFunctions + op::nmsRegisterKernel);
auto nmsWriteKernel = op::OpenCL::getInstance(gpuID)->getKernelFunctorFromManager
<op::NMSWriteKernelFunctor, T>(
"nmsWriteKernel",op::nmsOclCommonFunctions + op::nmsWriteKernel);
// log("num_b: " + std::to_string(bottom->shape(0))); // = 1
// log("channel_b: " + std::to_string(bottom->shape(1))); // = 57 = 18 body parts + bkg + 19x2 PAFs
// log("height_b: " + std::to_string(bottom->shape(2))); // = 368 = height
// log("width_b: " + std::to_string(bottom->shape(3))); // = 656 = width
// log("num_t: " + std::to_string(top->shape(0))); // = 1
// log("channel_t: " + std::to_string(top->shape(1))); // = 18 = numberParts
// log("height_t: " + std::to_string(top->shape(2))); // = 97 = maxPeople + 1
// log("width_t: " + std::to_string(top->shape(3))); // = 3 = [x, y, score]
// log("");
// Temp DS
//cv::Mat kernelCPU(cv::Size(width, height),CV_32FC1,cv::Scalar(0));
std::vector<int> kernelCPU(imageOffset);
for (auto n = 0; n < num; n++)
{
for (auto c = 0; c < channels; c++)
{
// log("channel: " + std::to_string(c));
const auto offsetChannel = (n * channels + c);
// CL Data
cl_buffer_region kernelRegion, sourceRegion, targetRegion;
kernelRegion.origin = sizeof(int) * offsetChannel * imageOffset;
kernelRegion.size = sizeof(int) * imageOffset;
cl::Buffer kernelBuffer = kernelPtrBuffer.createSubBuffer(CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION,
&kernelRegion);
op::OpenCL::getBufferRegion<T>(sourceRegion, offsetChannel * imageOffset, imageOffset);
op::OpenCL::getBufferRegion<T>(targetRegion, offsetChannel * targetChannelOffset, targetChannelOffset);
cl::Buffer sourceBuffer = sourcePtrBuffer.createSubBuffer(CL_MEM_READ_ONLY,
CL_BUFFER_CREATE_TYPE_REGION,
&sourceRegion);
cl::Buffer targetBuffer = targetPtrBuffer.createSubBuffer(CL_MEM_READ_WRITE,
CL_BUFFER_CREATE_TYPE_REGION,
&targetRegion);
// Run Kernel to get 1-0 map
bool debug = false;
nmsRegisterKernel(cl::EnqueueArgs(op::OpenCL::getInstance(gpuID)->getQueue(), cl::NDRange(width, height)),
kernelBuffer, sourceBuffer, width, height, threshold, debug);
// This is a really bad approach. We need to write a custom accumulator to run on the GPU
// Download it to CPU
op::OpenCL::getInstance(gpuID)->getQueue().enqueueReadBuffer(kernelBuffer, CL_TRUE, 0,
sizeof(int) * width * height, &kernelCPU[0]);
// Compute partial sum in CPU
std::partial_sum(kernelCPU.begin(),kernelCPU.end(),kernelCPU.begin());
// Reupload to GPU
op::OpenCL::getInstance(gpuID)->getQueue().enqueueWriteBuffer(kernelBuffer, CL_TRUE, 0,
sizeof(int) * width * height, &kernelCPU[0]);
// Write Kernel
nmsWriteKernel(cl::EnqueueArgs(op::OpenCL::getInstance(gpuID)->getQueue(), cl::NDRange(width, height)),
targetBuffer, kernelBuffer, sourceBuffer, width, height, targetPeaks-1, debug);
}
}
#else
UNUSED(targetPtr);
UNUSED(kernelPtr);
UNUSED(sourcePtr);
UNUSED(threshold);
UNUSED(targetSize);
UNUSED(sourceSize);
UNUSED(gpuID);
error("OpenPose must be compiled with the `USE_OPENCL` macro definition in order to use this"
" functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
#ifdef USE_OPENCL
catch (const cl::Error& e)
{
error(std::string(e.what()) + " : " + op::OpenCL::clErrorToString(e.err()) + " ID: " +
std::to_string(gpuID), __LINE__, __FUNCTION__, __FILE__);
}
#endif
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
template void nmsOcl(float* targetPtr, int* kernelPtr, const float* const sourcePtr, const float threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, int gpuID);
template void nmsOcl(double* targetPtr, int* kernelPtr, const double* const sourcePtr, const double threshold,
const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, int gpuID);
}
......@@ -13,9 +13,15 @@ namespace op
caffe::Blob<int> mKernelBlob;
std::array<int, 4> mBottomSize;
std::array<int, 4> mTopSize;
// Special Kernel for OpenCL NMS
#ifdef USE_OPENCL
std::shared_ptr<caffe::Blob<int>> mKernelBlobT;
#endif
#endif
ImplNmsCaffe(){};
ImplNmsCaffe()
{
}
};
template <typename T>
......@@ -63,7 +69,7 @@ namespace op
template <typename T>
void NmsCaffe<T>::Reshape(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top,
const int maxPeaks, const int outputChannels)
const int maxPeaks, const int outputChannels, const int gpuID)
{
try
{
......@@ -82,6 +88,15 @@ namespace op
topBlob->Reshape(topShape);
upImpl->mKernelBlob.Reshape(bottomShape);
// Special Kernel for OpenCL NMS
#ifdef USE_OPENCL
upImpl->mKernelBlobT = {std::make_shared<caffe::Blob<int>>(1,1,1,1)};
upImpl->mKernelBlobT->Reshape(bottomShape);
// GPU ID
mGpuID = gpuID;
#else
UNUSED(mGpuID);
#endif
// Array sizes
upImpl->mTopSize = std::array<int, 4>{topBlob->shape(0), topBlob->shape(1),
topBlob->shape(2), topBlob->shape(3)};
......@@ -152,6 +167,27 @@ namespace op
}
}
template <typename T>
void NmsCaffe<T>::Forward_ocl(const std::vector<caffe::Blob<T>*>& bottom, const std::vector<caffe::Blob<T>*>& top)
{
try
{
#if defined USE_CAFFE && defined USE_OPENCL
nmsOcl(top.at(0)->mutable_gpu_data(), upImpl->mKernelBlobT->mutable_gpu_data(),
bottom.at(0)->gpu_data(), mThreshold, upImpl->mTopSize, upImpl->mBottomSize, mGpuID);
#else
UNUSED(bottom);
UNUSED(top);
error("OpenPose must be compiled with the `USE_CAFFE` & `USE_OPENCL` macro definitions in order to run"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
template <typename T>
void NmsCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top, const std::vector<bool>& propagate_down,
const std::vector<caffe::Blob<T>*>& bottom)
......
This diff is collapsed.
......@@ -51,7 +51,8 @@ namespace op
const std::vector<caffe::Blob<T>*>& top,
const T netFactor,
const T scaleFactor,
const bool mergeFirstDimension)
const bool mergeFirstDimension,
const int gpuID)
{
try
{
......@@ -81,6 +82,12 @@ namespace op
for (auto i = 0u ; i < mBottomSizes.size() ; i++)
mBottomSizes[i] = std::array<int, 4>{bottom[i]->shape(0), bottom[i]->shape(1),
bottom[i]->shape(2), bottom[i]->shape(3)};
#ifdef USE_OPENCL
// GPU ID
mGpuID = gpuID;
#else
UNUSED(mGpuID);
#endif
#else
UNUSED(bottom);
UNUSED(top);
......@@ -156,6 +163,31 @@ namespace op
}
}
template <typename T>
void ResizeAndMergeCaffe<T>::Forward_ocl(const std::vector<caffe::Blob<T>*>& bottom,
const std::vector<caffe::Blob<T>*>& top)
{
try
{
#if defined USE_CAFFE && defined USE_OPENCL
std::vector<const T*> sourcePtrs(bottom.size());
for (auto i = 0u ; i < sourcePtrs.size() ; i++)
sourcePtrs[i] = bottom[i]->gpu_data();
resizeAndMergeOcl(top.at(0)->mutable_gpu_data(), sourcePtrs, mTopSize, mBottomSizes,
mScaleRatios, mGpuID);
#else
UNUSED(bottom);
UNUSED(top);
error("OpenPose must be compiled with the `USE_CAFFE` & `USE_OPENCL` macro definitions in order to run"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
#endif
}
catch (const std::exception& e)
{
error(e.what(), __LINE__, __FUNCTION__, __FILE__);
}
}
template <typename T>
void ResizeAndMergeCaffe<T>::Backward_cpu(const std::vector<caffe::Blob<T>*>& top,
const std::vector<bool>& propagate_down,
......
......@@ -17,6 +17,7 @@ namespace op
{
#if defined USE_CAFFE
bool netInitialized;
const int mGpuId;
std::shared_ptr<NetCaffe> spNetCaffe;
std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
std::shared_ptr<MaximumCaffe<float>> spMaximumCaffe;
......@@ -27,6 +28,7 @@ namespace op
ImplFaceExtractorCaffe(const std::string& modelFolder, const int gpuId, const bool enableGoogleLogging) :
netInitialized{false},
mGpuId{gpuId},
spNetCaffe{std::make_shared<NetCaffe>(modelFolder + FACE_PROTOTXT, modelFolder + FACE_TRAINED_MODEL,
gpuId, enableGoogleLogging)},
spResizeAndMergeCaffe{std::make_shared<ResizeAndMergeCaffe<float>>()},
......@@ -78,14 +80,15 @@ namespace op
std::shared_ptr<MaximumCaffe<float>>& maximumCaffe,
boost::shared_ptr<caffe::Blob<float>>& caffeNetOutputBlob,
std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
std::shared_ptr<caffe::Blob<float>>& peaksBlob)
std::shared_ptr<caffe::Blob<float>>& peaksBlob,
const int gpuID)
{
try
{
// HeatMaps extractor blob and layer
const bool mergeFirstDimension = true;
resizeAndMergeCaffe->Reshape({caffeNetOutputBlob.get()}, {heatMapsBlob.get()},
FACE_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension);
FACE_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension, gpuID);
// Pose extractor blob and layer
maximumCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()});
// Cuda check
......@@ -241,7 +244,7 @@ namespace op
upImpl->netInitialized = true;
reshapeFaceExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spMaximumCaffe,
upImpl->spCaffeNetOutputBlob, upImpl->spHeatMapsBlob,
upImpl->spPeaksBlob);
upImpl->spPeaksBlob, upImpl->mGpuId);
}
// 2. Resize heat maps + merge different scales
......@@ -249,6 +252,9 @@ namespace op
upImpl->spResizeAndMergeCaffe->Forward_gpu({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
upImpl->spResizeAndMergeCaffe->Forward_ocl({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
#else
upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
......@@ -259,6 +265,9 @@ namespace op
upImpl->spMaximumCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()},
{upImpl->spPeaksBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
// CPU Version is already very fast (4ms) and data is sent to connectKeypoints as CPU for now anyway
upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
#else
upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()},
{upImpl->spPeaksBlob.get()});
......
set(SOURCES_OP_GPU
cuda.cpp
gpu.cpp)
gpu.cpp
opencl.cpp)
include(${CMAKE_SOURCE_DIR}/cmake/Utils.cmake)
prepend(SOURCES_OP_GPU_WITH_CP ${CMAKE_CURRENT_SOURCE_DIR} ${SOURCES_OP_GPU})
......
......@@ -2,7 +2,7 @@
#include <openpose/gpu/cuda.hpp>
#endif
#ifdef USE_OPENCL
#include <openpose/core/clManager.hpp>
#include <openpose/gpu/opencl.hcl>
#endif
#include <openpose/gpu/gpu.hpp>
......@@ -15,7 +15,7 @@ namespace op
#ifdef USE_CUDA
return getCudaGpuNumber();
#elif defined USE_OPENCL
return CLManager::getTotalGPU();
return OpenCL::getTotalGPU();
#else
error("OpenPose must be compiled with the `USE_CUDA` or `USE_OPENCL` macro definition in order to use"
" this functionality.", __LINE__, __FUNCTION__, __FILE__);
......
This diff is collapsed.
......@@ -18,6 +18,7 @@ namespace op
{
#if defined USE_CAFFE
bool netInitialized;
const int mGpuId;
std::shared_ptr<NetCaffe> spNetCaffe;
std::shared_ptr<ResizeAndMergeCaffe<float>> spResizeAndMergeCaffe;
std::shared_ptr<MaximumCaffe<float>> spMaximumCaffe;
......@@ -29,6 +30,7 @@ namespace op
ImplHandExtractorCaffe(const std::string& modelFolder, const int gpuId,
const bool enableGoogleLogging) :
netInitialized{false},
mGpuId{gpuId},
spNetCaffe{std::make_shared<NetCaffe>(modelFolder + HAND_PROTOTXT, modelFolder + HAND_TRAINED_MODEL,
gpuId, enableGoogleLogging)},
spResizeAndMergeCaffe{std::make_shared<ResizeAndMergeCaffe<float>>()},
......@@ -164,14 +166,15 @@ namespace op
std::shared_ptr<MaximumCaffe<float>>& maximumCaffe,
boost::shared_ptr<caffe::Blob<float>>& caffeNetOutputBlob,
std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
std::shared_ptr<caffe::Blob<float>>& peaksBlob)
std::shared_ptr<caffe::Blob<float>>& peaksBlob,
const int gpuID)
{
try
{
// HeatMaps extractor blob and layer
const bool mergeFirstDimension = true;
resizeAndMergeCaffe->Reshape({caffeNetOutputBlob.get()}, {heatMapsBlob.get()},
HAND_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension);
HAND_CCN_DECREASE_FACTOR, 1.f, mergeFirstDimension, gpuID);
// Pose extractor blob and layer
maximumCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()});
// Cuda check
......@@ -411,7 +414,7 @@ namespace op
upImpl->netInitialized = true;
reshapeFaceExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spMaximumCaffe,
upImpl->spCaffeNetOutputBlob, upImpl->spHeatMapsBlob,
upImpl->spPeaksBlob);
upImpl->spPeaksBlob, upImpl->mGpuId);
}
// 2. Resize heat maps + merge different scales
......@@ -419,6 +422,9 @@ namespace op
upImpl->spResizeAndMergeCaffe->Forward_gpu({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
upImpl->spResizeAndMergeCaffe->Forward_ocl({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
#else
upImpl->spResizeAndMergeCaffe->Forward_cpu({upImpl->spCaffeNetOutputBlob.get()},
{upImpl->spHeatMapsBlob.get()});
......@@ -428,6 +434,9 @@ namespace op
#ifdef USE_CUDA
upImpl->spMaximumCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
// CPU Version is already very fast (4ms) and data is sent to connectKeypoints as CPU for now anyway
upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
#else
upImpl->spMaximumCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
#endif
......
......@@ -5,6 +5,9 @@
#include <openpose/core/nmsCaffe.hpp>
#include <openpose/core/resizeAndMergeCaffe.hpp>
#include <openpose/gpu/cuda.hpp>
#ifdef USE_OPENCL
#include <openpose/gpu/opencl.hcl>
#endif
#include <openpose/pose/bodyPartConnectorCaffe.hpp>
#include <openpose/pose/poseParameters.hpp>
#include <openpose/utilities/check.hpp>
......@@ -75,7 +78,8 @@ namespace op
std::shared_ptr<caffe::Blob<float>>& heatMapsBlob,
std::shared_ptr<caffe::Blob<float>>& peaksBlob,
const float scaleInputToNetInput,
const PoseModel poseModel)
const PoseModel poseModel,
const int gpuID)
{
try
{
......@@ -83,10 +87,11 @@ namespace op
// Caffe modifies bottom - Heatmap gets resized
const auto caffeNetOutputBlobs = caffeNetSharedToPtr(caffeNetOutputBlob);
resizeAndMergeCaffe->Reshape(caffeNetOutputBlobs, {heatMapsBlob.get()},
getPoseNetDecreaseFactor(poseModel), 1.f/scaleInputToNetInput);
getPoseNetDecreaseFactor(poseModel), 1.f/scaleInputToNetInput, true,
gpuID);
// Pose extractor blob and layer
nmsCaffe->Reshape({heatMapsBlob.get()}, {peaksBlob.get()}, getPoseMaxPeaks(poseModel),
getPoseNumberBodyParts(poseModel));
getPoseNumberBodyParts(poseModel), gpuID);
// Pose extractor blob and layer
bodyPartConnectorCaffe->Reshape({heatMapsBlob.get(), peaksBlob.get()});
// Cuda check
......@@ -239,7 +244,7 @@ namespace op
reshapePoseExtractorCaffe(upImpl->spResizeAndMergeCaffe, upImpl->spNmsCaffe,
upImpl->spBodyPartConnectorCaffe, upImpl->spCaffeNetOutputBlobs,
upImpl->spHeatMapsBlob, upImpl->spPeaksBlob,
1.f, upImpl->mPoseModel);
1.f, upImpl->mPoseModel, upImpl->mGpuId);
// scaleInputToNetInputs[i], upImpl->mPoseModel);
}
}
......@@ -248,8 +253,13 @@ namespace op
const auto caffeNetOutputBlobs = caffeNetSharedToPtr(upImpl->spCaffeNetOutputBlobs);
const std::vector<float> floatScaleRatios(scaleInputToNetInputs.begin(), scaleInputToNetInputs.end());
upImpl->spResizeAndMergeCaffe->setScaleRatios(floatScaleRatios);
#ifdef USE_CUDA
//upImpl->spResizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~20ms
upImpl->spResizeAndMergeCaffe->Forward_gpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~5ms
#elif USE_OPENCL
//upImpl->spResizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~20ms
upImpl->spResizeAndMergeCaffe->Forward_ocl(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()});
#else
upImpl->spResizeAndMergeCaffe->Forward_cpu(caffeNetOutputBlobs, {upImpl->spHeatMapsBlob.get()}); // ~20ms
#endif
......@@ -257,8 +267,12 @@ namespace op
// 3. Get peaks by Non-Maximum Suppression
upImpl->spNmsCaffe->setThreshold((float)get(PoseProperty::NMSThreshold));
#ifdef USE_CUDA
//upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
upImpl->spNmsCaffe->Forward_gpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});// ~2ms
cudaCheck(__LINE__, __FUNCTION__, __FILE__);
#elif USE_OPENCL
//upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
upImpl->spNmsCaffe->Forward_ocl({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()});
#else
upImpl->spNmsCaffe->Forward_cpu({upImpl->spHeatMapsBlob.get()}, {upImpl->spPeaksBlob.get()}); // ~ 7ms
#endif
......