From 19b667b51d5591e5884160d7f12eaa35dea2ce0c Mon Sep 17 00:00:00 2001 From: gineshidalgo99 Date: Thu, 4 May 2017 20:36:38 -0400 Subject: [PATCH] Max render resolution from 720p to >32k images --- include/openpose/utilities/cuda.hpp | 17 ++++++- include/openpose/utilities/render.hu | 4 +- .../experimental/face/faceRenderGpu.cu | 8 ++-- .../experimental/hand/handRenderGpu.cu | 7 ++- src/openpose/pose/poseRenderGpu.cu | 44 +++++++------------ src/openpose/utilities/cuda.cpp | 27 +++++++++++- 6 files changed, 65 insertions(+), 42 deletions(-) diff --git a/include/openpose/utilities/cuda.hpp b/include/openpose/utilities/cuda.hpp index 25c632f8..2e9dbd0e 100644 --- a/include/openpose/utilities/cuda.hpp +++ b/include/openpose/utilities/cuda.hpp @@ -2,15 +2,30 @@ #define OPENPOSE__UTILITIES_CUDA_HPP #include +#include // std::pair +#include +#include +#include namespace op { + const auto CUDA_NUM_THREADS = 512u; + void cudaCheck(const int line = -1, const std::string& function = "", const std::string& file = ""); - inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired, const unsigned int numberCudaThreads) + inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired, const unsigned int numberCudaThreads = CUDA_NUM_THREADS) { return (totalRequired + numberCudaThreads - 1) / numberCudaThreads; } + + inline dim3 getNumberCudaBlocks(const cv::Size& frameSize, const dim3 numberCudaThreads = dim3{CUDA_NUM_THREADS, CUDA_NUM_THREADS, 1}) + { + return dim3{getNumberCudaBlocks(frameSize.width, numberCudaThreads.x), + getNumberCudaBlocks(frameSize.height, numberCudaThreads.y), + numberCudaThreads.z}; + } + + std::pair getNumberCudaThreadsAndBlocks(const cv::Size& frameSize); } #endif // OPENPOSE__UTILITIES_CUDA_HPP diff --git a/include/openpose/utilities/render.hu b/include/openpose/utilities/render.hu index 87d18299..6e8cc1eb 100644 --- a/include/openpose/utilities/render.hu +++ b/include/openpose/utilities/render.hu @@ -35,8 +35,8 @@ namespace op sharedMaxs[globalIdx].y = y; } } - const auto averageX = sharedMaxs[globalIdx].x-sharedMins[globalIdx].x; - const auto averageY = sharedMaxs[globalIdx].y-sharedMins[globalIdx].y; + const auto averageX = sharedMaxs[globalIdx].x - sharedMins[globalIdx].x; + const auto averageY = sharedMaxs[globalIdx].y - sharedMins[globalIdx].y; sharedScaleF[globalIdx] = fastTruncate((averageX + averageY) / 400.f, 0.33f, 1.f); // (averageX + averageY) / 2.f / 400.f const auto constantToAdd = 50.f; sharedMaxs[globalIdx].x += constantToAdd; diff --git a/src/openpose/experimental/face/faceRenderGpu.cu b/src/openpose/experimental/face/faceRenderGpu.cu index 7071add0..2f59501f 100644 --- a/src/openpose/experimental/face/faceRenderGpu.cu +++ b/src/openpose/experimental/face/faceRenderGpu.cu @@ -7,8 +7,7 @@ namespace op { - const auto THREADS_PER_BLOCK_1D = 32; - + const dim3 THREADS_PER_BLOCK{128, 128, 1}; __constant__ const unsigned char PART_PAIRS_GPU[] = FACE_PAIRS_TO_RENDER; __constant__ const float RGB_COLORS[] = { 255.f, 255.f, 255.f, @@ -48,9 +47,8 @@ namespace op if (numberFaces > 0) { const auto threshold = 0.5f; - dim3 threadsPerBlock = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D}; - dim3 numBlocks = dim3{getNumberCudaBlocks(frameSize.width, threadsPerBlock.x), getNumberCudaBlocks(frameSize.height, threadsPerBlock.y)}; - renderFaceParts<<>>(framePtr, frameSize.width, frameSize.height, facePtr, numberFaces, threshold, alphaColorToAdd); + const auto numBlocks = getNumberCudaBlocks(frameSize, THREADS_PER_BLOCK); + renderFaceParts<<>>(framePtr, frameSize.width, frameSize.height, facePtr, numberFaces, threshold, alphaColorToAdd); cudaCheck(__LINE__, __FUNCTION__, __FILE__); } } diff --git a/src/openpose/experimental/hand/handRenderGpu.cu b/src/openpose/experimental/hand/handRenderGpu.cu index 16a3c4ec..908e3c55 100644 --- a/src/openpose/experimental/hand/handRenderGpu.cu +++ b/src/openpose/experimental/hand/handRenderGpu.cu @@ -7,8 +7,6 @@ namespace op { - const auto THREADS_PER_BLOCK_1D = 32; - __constant__ const unsigned char PART_PAIRS_GPU[] = HAND_PAIRS_TO_RENDER; __constant__ const float RGB_COLORS[] = { 179.f, 0.f, 0.f, @@ -71,8 +69,9 @@ namespace op if (numberHands > 0) { const auto threshold = 0.05f; - dim3 threadsPerBlock = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D}; - dim3 numBlocks = dim3{getNumberCudaBlocks(frameSize.width, threadsPerBlock.x), getNumberCudaBlocks(frameSize.height, threadsPerBlock.y)}; + dim3 threadsPerBlock; + dim3 numBlocks; + std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize); renderHandsParts<<>>(framePtr, frameSize.width, frameSize.height, handsPtr, numberHands, threshold, alphaColorToAdd); cudaCheck(__LINE__, __FUNCTION__, __FILE__); } diff --git a/src/openpose/pose/poseRenderGpu.cu b/src/openpose/pose/poseRenderGpu.cu index 8a62e0f5..412b483d 100644 --- a/src/openpose/pose/poseRenderGpu.cu +++ b/src/openpose/pose/poseRenderGpu.cu @@ -1,4 +1,4 @@ -#include /// std::pair +#include // std::pair #include "openpose/pose/poseParameters.hpp" #include "openpose/utilities/errorAndLog.hpp" #include "openpose/utilities/cuda.hpp" @@ -8,7 +8,6 @@ namespace op { - const auto THREADS_PER_BLOCK_1D = 32u; __constant__ const unsigned char COCO_PAIRS_GPU[] = POSE_COCO_PAIRS_TO_RENDER; __constant__ const unsigned char MPI_PAIRS_GPU[] = POSE_MPI_PAIRS_TO_RENDER; __constant__ const float COCO_RGB_COLORS[] = { @@ -388,24 +387,9 @@ namespace op } } - inline std::pair getThreadsAndBlocks(const cv::Size& frameSize) - { - try - { - std::pair threadsAndBlocks; - threadsAndBlocks.first = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D}; - threadsAndBlocks.second = dim3{getNumberCudaBlocks(frameSize.width, threadsAndBlocks.first.x), getNumberCudaBlocks(frameSize.height, threadsAndBlocks.first.y)}; - return threadsAndBlocks; - } - catch (const std::exception& e) - { - error(e.what(), __LINE__, __FUNCTION__, __FILE__); - return std::make_pair(dim3{0,0,0}, dim3{0,0,0}); - } - } - - inline void renderKeyPointsPartAffinityAux(float* framePtr, const PoseModel poseModel, const cv::Size& frameSize, const float* const heatMapPtr, - const cv::Size& heatMapSize, const float scaleToKeepRatio, const int part, const int partsToRender, const float alphaBlending) + inline void renderKeyPointsPartAffinityAux(float* framePtr, const PoseModel poseModel, const cv::Size& frameSize, + const float* const heatMapPtr, const cv::Size& heatMapSize, const float scaleToKeepRatio, + const int part, const int partsToRender, const float alphaBlending) { try { @@ -415,9 +399,9 @@ namespace op const auto heatMapOffset = POSE_NUMBER_BODY_PARTS[(int)poseModel] * heatMapSize.area(); dim3 threadsPerBlock; dim3 numBlocks; - std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize); - renderPartAffinities<<>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width, heatMapSize.height, - scaleToKeepRatio, partsToRender, part, alphaBlending); + std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize); + renderPartAffinities<<>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width, + heatMapSize.height, scaleToKeepRatio, partsToRender, part, alphaBlending); cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) @@ -441,13 +425,15 @@ namespace op dim3 threadsPerBlock; dim3 numBlocks; - std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize); + std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize); const auto threshold = getThresholdForPose(poseModel); if (poseModel == PoseModel::COCO_18) - renderPoseCoco<<>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, threshold, googlyEyes, blendOriginalFrame, alphaBlending); + renderPoseCoco<<>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, + threshold, googlyEyes, blendOriginalFrame, alphaBlending); else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4) - renderPoseMpi29Parts<<>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, threshold, blendOriginalFrame, alphaBlending); + renderPoseMpi29Parts<<>>(framePtr, frameSize.width, frameSize.height, posePtr, + numberPeople, threshold, blendOriginalFrame, alphaBlending); else error("Unvalid Model.", __LINE__, __FUNCTION__, __FILE__); cudaCheck(__LINE__, __FUNCTION__, __FILE__); @@ -469,7 +455,7 @@ namespace op checkAlpha(alphaBlending); dim3 threadsPerBlock; dim3 numBlocks; - std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize); + std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize); const auto numberBodyParts = POSE_NUMBER_BODY_PARTS[(int)poseModel]; const auto heatMapOffset = numberBodyParts * heatMapSize.area(); @@ -493,12 +479,12 @@ namespace op checkAlpha(alphaBlending); dim3 threadsPerBlock; dim3 numBlocks; - std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize); + std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize); const auto numberBodyParts = POSE_NUMBER_BODY_PARTS[(int)poseModel]; const auto heatMapOffset = numberBodyParts * heatMapSize.area(); renderBodyPartHeatMaps<<>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width, heatMapSize.height, - scaleToKeepRatio, numberBodyParts, alphaBlending); + scaleToKeepRatio, numberBodyParts, alphaBlending); cudaCheck(__LINE__, __FUNCTION__, __FILE__); } catch (const std::exception& e) diff --git a/src/openpose/utilities/cuda.cpp b/src/openpose/utilities/cuda.cpp index 31f3588d..a0de64cf 100644 --- a/src/openpose/utilities/cuda.cpp +++ b/src/openpose/utilities/cuda.cpp @@ -1,14 +1,39 @@ #include #include #include "openpose/utilities/errorAndLog.hpp" +#include "openpose/utilities/fastMath.hpp" #include "openpose/utilities/cuda.hpp" namespace op { + const dim3 THREADS_PER_BLOCK_SMALL{32, 32, 1}; + const dim3 THREADS_PER_BLOCK_MEDIUM{128, 128, 1}; + const dim3 THREADS_PER_BLOCK_BIG{256, 256, 1}; + void cudaCheck(const int line, const std::string& function, const std::string& file) { const auto errorCode = cudaPeekAtLastError(); if(errorCode != cudaSuccess) - error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "):" + cudaGetErrorString(errorCode), line, function, file); + error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "): " + cudaGetErrorString(errorCode), line, function, file); + } + + std::pair getNumberCudaThreadsAndBlocks(const cv::Size& frameSize) + { + try + { + // Image <= 1280x720 --> THREADS_PER_BLOCK_SMALL + // Image < 16K --> THREADS_PER_BLOCK_MEDIUM + // Image > 16K --> THREADS_PER_BLOCK_BIG + const auto maxValue = fastMax(frameSize.width, frameSize.height); + const auto threadsPerBlock = (maxValue < 1281 ? THREADS_PER_BLOCK_SMALL + : (maxValue < 16384 ? THREADS_PER_BLOCK_MEDIUM + : THREADS_PER_BLOCK_BIG)); + return std::make_pair(threadsPerBlock, getNumberCudaBlocks(frameSize, threadsPerBlock)); + } + catch (const std::exception& e) + { + error(e.what(), __LINE__, __FUNCTION__, __FILE__); + return std::make_pair(dim3{1,1,1}, dim3{1,1,1}); + } } } -- GitLab