Max render resolution from 720p to >32k images

19b667b5 · gineshidalgo99 · 34d77981 · 19b667b5 · 19b667b5 · 19b667b5
6 changed files
--- a/include/openpose/utilities/cuda.hpp
+++ b/include/openpose/utilities/cuda.hpp
@@ -2,15 +2,30 @@
 #define OPENPOSE__UTILITIES_CUDA_HPP

 #include <string>
+#include <utility> // std::pair
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <opencv2/core/core.hpp>

 namespace op
 {
+    const auto CUDA_NUM_THREADS = 512u;
+
    void cudaCheck(const int line = -1, const std::string& function = "", const std::string& file = "");

-    inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired, const unsigned int numberCudaThreads)
+    inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired, const unsigned int numberCudaThreads = CUDA_NUM_THREADS)
    {
        return (totalRequired + numberCudaThreads - 1) / numberCudaThreads;
    }
+
+    inline dim3 getNumberCudaBlocks(const cv::Size& frameSize, const dim3 numberCudaThreads = dim3{CUDA_NUM_THREADS, CUDA_NUM_THREADS, 1})
+    {
+        return dim3{getNumberCudaBlocks(frameSize.width, numberCudaThreads.x),   // x: 1-D overload applied to the frame width
+                    getNumberCudaBlocks(frameSize.height, numberCudaThreads.y),  // y: 1-D overload applied to the frame height
+                    numberCudaThreads.z};                                        // z: copied unchanged from numberCudaThreads
+    }
+
+    std::pair<dim3, dim3> getNumberCudaThreadsAndBlocks(const cv::Size& frameSize);
 }

 #endif // OPENPOSE__UTILITIES_CUDA_HPP
--- a/include/openpose/utilities/render.hu
+++ b/include/openpose/utilities/render.hu
@@ -35,8 +35,8 @@ namespace op
                        sharedMaxs[globalIdx].y = y;
                }
            }
-            const auto averageX = sharedMaxs[globalIdx].x-sharedMins[globalIdx].x;
-            const auto averageY = sharedMaxs[globalIdx].y-sharedMins[globalIdx].y;
+            const auto averageX = sharedMaxs[globalIdx].x - sharedMins[globalIdx].x;
+            const auto averageY = sharedMaxs[globalIdx].y - sharedMins[globalIdx].y;
            sharedScaleF[globalIdx] = fastTruncate((averageX + averageY) / 400.f, 0.33f, 1.f);    // (averageX + averageY) / 2.f / 400.f
            const auto constantToAdd = 50.f;
            sharedMaxs[globalIdx].x += constantToAdd;

--- a/src/openpose/experimental/face/faceRenderGpu.cu
+++ b/src/openpose/experimental/face/faceRenderGpu.cu
@@ -7,8 +7,7 @@

 namespace op
 {
-    const auto THREADS_PER_BLOCK_1D = 32;
-
+    const dim3 THREADS_PER_BLOCK{128, 128, 1};
    __constant__ const unsigned char PART_PAIRS_GPU[] = FACE_PAIRS_TO_RENDER;
    __constant__ const float RGB_COLORS[] = {
        255.f,    255.f,    255.f,
@@ -48,9 +47,8 @@ namespace op
            if (numberFaces > 0)
            {
                const auto threshold = 0.5f;
-                dim3 threadsPerBlock = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D};
-                dim3 numBlocks = dim3{getNumberCudaBlocks(frameSize.width, threadsPerBlock.x), getNumberCudaBlocks(frameSize.height, threadsPerBlock.y)};
-                renderFaceParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, facePtr, numberFaces, threshold, alphaColorToAdd);
+                const auto numBlocks = getNumberCudaBlocks(frameSize, THREADS_PER_BLOCK);
+                renderFaceParts<<<THREADS_PER_BLOCK, numBlocks>>>(framePtr, frameSize.width, frameSize.height, facePtr, numberFaces, threshold, alphaColorToAdd);
                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            }
        }

--- a/src/openpose/experimental/hand/handRenderGpu.cu
+++ b/src/openpose/experimental/hand/handRenderGpu.cu
@@ -7,8 +7,6 @@

 namespace op
 {
-    const auto THREADS_PER_BLOCK_1D = 32;
-
    __constant__ const unsigned char PART_PAIRS_GPU[] = HAND_PAIRS_TO_RENDER;
    __constant__ const float RGB_COLORS[] = {
        179.f,    0.f,    0.f,
@@ -71,8 +69,9 @@ namespace op
            if (numberHands > 0)
            {
                const auto threshold = 0.05f;
-                dim3 threadsPerBlock = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D};
-                dim3 numBlocks = dim3{getNumberCudaBlocks(frameSize.width, threadsPerBlock.x), getNumberCudaBlocks(frameSize.height, threadsPerBlock.y)};
+                dim3 threadsPerBlock;
+                dim3 numBlocks;
+                std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
                renderHandsParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, handsPtr, numberHands, threshold, alphaColorToAdd);
                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            }

--- a/src/openpose/pose/poseRenderGpu.cu
+++ b/src/openpose/pose/poseRenderGpu.cu
-#include <utility> /// std::pair
+#include <utility> // std::pair
 #include "openpose/pose/poseParameters.hpp"
 #include "openpose/utilities/errorAndLog.hpp"
 #include "openpose/utilities/cuda.hpp"
@@ -8,7 +8,6 @@

 namespace op
 {
-    const auto THREADS_PER_BLOCK_1D = 32u;
    __constant__ const unsigned char COCO_PAIRS_GPU[] = POSE_COCO_PAIRS_TO_RENDER;
    __constant__ const unsigned char MPI_PAIRS_GPU[] = POSE_MPI_PAIRS_TO_RENDER;
    __constant__ const float COCO_RGB_COLORS[] = {
@@ -388,24 +387,9 @@ namespace op
        }
    }

-    inline std::pair<dim3, dim3> getThreadsAndBlocks(const cv::Size& frameSize)
-    {
-        try
-        {
-            std::pair<dim3, dim3> threadsAndBlocks;
-            threadsAndBlocks.first = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D};
-            threadsAndBlocks.second = dim3{getNumberCudaBlocks(frameSize.width, threadsAndBlocks.first.x), getNumberCudaBlocks(frameSize.height, threadsAndBlocks.first.y)};
-            return threadsAndBlocks;
-        }
-        catch (const std::exception& e)
-        {
-            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
-            return std::make_pair(dim3{0,0,0}, dim3{0,0,0});
-        }
-    }
-
-    inline void renderKeyPointsPartAffinityAux(float* framePtr, const PoseModel poseModel, const cv::Size& frameSize, const float* const heatMapPtr,
-                                         const cv::Size& heatMapSize, const float scaleToKeepRatio, const int part, const int partsToRender, const float alphaBlending)
+    inline void renderKeyPointsPartAffinityAux(float* framePtr, const PoseModel poseModel, const cv::Size& frameSize,
+                                               const float* const heatMapPtr, const cv::Size& heatMapSize, const float scaleToKeepRatio,
+                                               const int part, const int partsToRender, const float alphaBlending)
    {
        try
        {
@@ -415,9 +399,9 @@ namespace op
            const auto heatMapOffset = POSE_NUMBER_BODY_PARTS[(int)poseModel] * heatMapSize.area();
            dim3 threadsPerBlock;
            dim3 numBlocks;
-            std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
-            renderPartAffinities<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width, heatMapSize.height, 
-                                                                 scaleToKeepRatio, partsToRender, part, alphaBlending);
+            std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
+            renderPartAffinities<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width,
+                                                                 heatMapSize.height, scaleToKeepRatio, partsToRender, part, alphaBlending);
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        }
        catch (const std::exception& e)
@@ -441,13 +425,15 @@ namespace op

                dim3 threadsPerBlock;
                dim3 numBlocks;
-                std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
+                std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
                const auto threshold = getThresholdForPose(poseModel);

                if (poseModel == PoseModel::COCO_18)
-                    renderPoseCoco<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, threshold, googlyEyes, blendOriginalFrame, alphaBlending);
+                    renderPoseCoco<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, 
+                                                                   threshold, googlyEyes, blendOriginalFrame, alphaBlending);
                else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4)
-                    renderPoseMpi29Parts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, threshold, blendOriginalFrame, alphaBlending);
+                    renderPoseMpi29Parts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr,
+                                                                         numberPeople, threshold, blendOriginalFrame, alphaBlending);
                else
                    error("Unvalid Model.", __LINE__, __FUNCTION__, __FILE__);
                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
@@ -469,7 +455,7 @@ namespace op
            checkAlpha(alphaBlending);
            dim3 threadsPerBlock;
            dim3 numBlocks;
-            std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
+            std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
            const auto numberBodyParts = POSE_NUMBER_BODY_PARTS[(int)poseModel];
            const auto heatMapOffset = numberBodyParts * heatMapSize.area();

@@ -493,12 +479,12 @@ namespace op
            checkAlpha(alphaBlending);
            dim3 threadsPerBlock;
            dim3 numBlocks;
-            std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
+            std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
            const auto numberBodyParts = POSE_NUMBER_BODY_PARTS[(int)poseModel];
            const auto heatMapOffset = numberBodyParts * heatMapSize.area();

            renderBodyPartHeatMaps<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width, heatMapSize.height,
-                                                                    scaleToKeepRatio, numberBodyParts, alphaBlending);
+                                                                   scaleToKeepRatio, numberBodyParts, alphaBlending);
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        }
        catch (const std::exception& e)

--- a/src/openpose/utilities/cuda.cpp
+++ b/src/openpose/utilities/cuda.cpp
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include "openpose/utilities/errorAndLog.hpp"
+#include "openpose/utilities/fastMath.hpp"
 #include "openpose/utilities/cuda.hpp"

 namespace op
 {
+    const dim3 THREADS_PER_BLOCK_SMALL{32, 32, 1};
+    const dim3 THREADS_PER_BLOCK_MEDIUM{128, 128, 1};
+    const dim3 THREADS_PER_BLOCK_BIG{256, 256, 1};
+
    void cudaCheck(const int line, const std::string& function, const std::string& file)
    {
        const auto errorCode = cudaPeekAtLastError();
        if(errorCode != cudaSuccess)
-        	error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "):" + cudaGetErrorString(errorCode), line, function, file);
+            error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "): " + cudaGetErrorString(errorCode), line, function, file);
+    }
+
+    std::pair<dim3, dim3> getNumberCudaThreadsAndBlocks(const cv::Size& frameSize)
+    {
+        try
+        {
+            // maxValue <= 1280 (e.g. 1280x720) --> THREADS_PER_BLOCK_SMALL
+            // maxValue <  16384 (16K)          --> THREADS_PER_BLOCK_MEDIUM
+            // maxValue >= 16384 (16K)          --> THREADS_PER_BLOCK_BIG
+            const auto maxValue = fastMax(frameSize.width, frameSize.height);
+            const auto threadsPerBlock = (maxValue < 1281 ? THREADS_PER_BLOCK_SMALL
+                                          : (maxValue < 16384 ? THREADS_PER_BLOCK_MEDIUM
+                                             : THREADS_PER_BLOCK_BIG));
+            return std::make_pair(threadsPerBlock, getNumberCudaBlocks(frameSize, threadsPerBlock));
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+            return std::make_pair(dim3{1,1,1}, dim3{1,1,1});
+        }
    }
 }