From 19b667b51d5591e5884160d7f12eaa35dea2ce0c Mon Sep 17 00:00:00 2001
From: gineshidalgo99 <gineshidalgo99@gmail.com>
Date: Thu, 4 May 2017 20:36:38 -0400
Subject: [PATCH] Max render resolution from 720p to >32k images

---
 include/openpose/utilities/cuda.hpp           | 17 ++++++-
 include/openpose/utilities/render.hu          |  4 +-
 .../experimental/face/faceRenderGpu.cu        |  8 ++--
 .../experimental/hand/handRenderGpu.cu        |  7 ++-
 src/openpose/pose/poseRenderGpu.cu            | 44 +++++++------------
 src/openpose/utilities/cuda.cpp               | 27 +++++++++++-
 6 files changed, 65 insertions(+), 42 deletions(-)
diff --git a/include/openpose/utilities/cuda.hpp b/include/openpose/utilities/cuda.hpp
index 25c632f8..2e9dbd0e 100644
--- a/include/openpose/utilities/cuda.hpp
+++ b/include/openpose/utilities/cuda.hpp
@@ -2,15 +2,30 @@
 #define OPENPOSE__UTILITIES_CUDA_HPP
 
 #include <string>
+#include <utility> // std::pair
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <opencv2/core/core.hpp>
 
 namespace op
 {
+    const auto CUDA_NUM_THREADS = 512u; // default per-axis thread count used by the getNumberCudaBlocks overloads
+
     void cudaCheck(const int line = -1, const std::string& function = "", const std::string& file = "");
 
-    inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired, const unsigned int numberCudaThreads)
+    inline unsigned int getNumberCudaBlocks(const unsigned int totalRequired, const unsigned int numberCudaThreads = CUDA_NUM_THREADS)
     {
         return (totalRequired + numberCudaThreads - 1) / numberCudaThreads;
     }
+
+    // Per-axis block counts covering a 2-D frame: ceil(width / threads.x) by ceil(height / threads.y).
+    inline dim3 getNumberCudaBlocks(const cv::Size& frameSize, const dim3 numberCudaThreads = dim3{CUDA_NUM_THREADS, CUDA_NUM_THREADS, 1})
+    {
+        return dim3{getNumberCudaBlocks(frameSize.width, numberCudaThreads.x),
+                    getNumberCudaBlocks(frameSize.height, numberCudaThreads.y), 1u}; // a frame has no depth, so one block in z
+    }
+
+    std::pair<dim3, dim3> getNumberCudaThreadsAndBlocks(const cv::Size& frameSize);
 }
 
 #endif // OPENPOSE__UTILITIES_CUDA_HPP
diff --git a/include/openpose/utilities/render.hu b/include/openpose/utilities/render.hu
index 87d18299..6e8cc1eb 100644
--- a/include/openpose/utilities/render.hu
+++ b/include/openpose/utilities/render.hu
@@ -35,8 +35,8 @@ namespace op
                         sharedMaxs[globalIdx].y = y;
                 }
             }
-            const auto averageX = sharedMaxs[globalIdx].x-sharedMins[globalIdx].x;
-            const auto averageY = sharedMaxs[globalIdx].y-sharedMins[globalIdx].y;
+            const auto averageX = sharedMaxs[globalIdx].x - sharedMins[globalIdx].x;
+            const auto averageY = sharedMaxs[globalIdx].y - sharedMins[globalIdx].y;
             sharedScaleF[globalIdx] = fastTruncate((averageX + averageY) / 400.f, 0.33f, 1.f);    // (averageX + averageY) / 2.f / 400.f
             const auto constantToAdd = 50.f;
             sharedMaxs[globalIdx].x += constantToAdd;
diff --git a/src/openpose/experimental/face/faceRenderGpu.cu b/src/openpose/experimental/face/faceRenderGpu.cu
index 7071add0..2f59501f 100644
--- a/src/openpose/experimental/face/faceRenderGpu.cu
+++ b/src/openpose/experimental/face/faceRenderGpu.cu
@@ -7,8 +7,7 @@
 
 namespace op
 {
-    const auto THREADS_PER_BLOCK_1D = 32;
-
+    const dim3 THREADS_PER_BLOCK{128, 128, 1};
     __constant__ const unsigned char PART_PAIRS_GPU[] = FACE_PAIRS_TO_RENDER;
     __constant__ const float RGB_COLORS[] = {
         255.f,    255.f,    255.f,
@@ -48,9 +47,8 @@ namespace op
             if (numberFaces > 0)
             {
                 const auto threshold = 0.5f;
-                dim3 threadsPerBlock = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D};
-                dim3 numBlocks = dim3{getNumberCudaBlocks(frameSize.width, threadsPerBlock.x), getNumberCudaBlocks(frameSize.height, threadsPerBlock.y)};
-                renderFaceParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, facePtr, numberFaces, threshold, alphaColorToAdd);
+                const auto numBlocks = getNumberCudaBlocks(frameSize, THREADS_PER_BLOCK);
+                renderFaceParts<<<THREADS_PER_BLOCK, numBlocks>>>(framePtr, frameSize.width, frameSize.height, facePtr, numberFaces, threshold, alphaColorToAdd);
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             }
         }
diff --git a/src/openpose/experimental/hand/handRenderGpu.cu b/src/openpose/experimental/hand/handRenderGpu.cu
index 16a3c4ec..908e3c55 100644
--- a/src/openpose/experimental/hand/handRenderGpu.cu
+++ b/src/openpose/experimental/hand/handRenderGpu.cu
@@ -7,8 +7,6 @@
 
 namespace op
 {
-    const auto THREADS_PER_BLOCK_1D = 32;
-
     __constant__ const unsigned char PART_PAIRS_GPU[] = HAND_PAIRS_TO_RENDER;
     __constant__ const float RGB_COLORS[] = {
         179.f,    0.f,    0.f,
@@ -71,8 +69,9 @@ namespace op
             if (numberHands > 0)
             {
                 const auto threshold = 0.05f;
-                dim3 threadsPerBlock = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D};
-                dim3 numBlocks = dim3{getNumberCudaBlocks(frameSize.width, threadsPerBlock.x), getNumberCudaBlocks(frameSize.height, threadsPerBlock.y)};
+                dim3 threadsPerBlock;
+                dim3 numBlocks;
+                std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
                 renderHandsParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, handsPtr, numberHands, threshold, alphaColorToAdd);
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             }
diff --git a/src/openpose/pose/poseRenderGpu.cu b/src/openpose/pose/poseRenderGpu.cu
index 8a62e0f5..412b483d 100644
--- a/src/openpose/pose/poseRenderGpu.cu
+++ b/src/openpose/pose/poseRenderGpu.cu
@@ -1,4 +1,4 @@
-#include <utility> /// std::pair
+#include <utility> // std::pair
 #include "openpose/pose/poseParameters.hpp"
 #include "openpose/utilities/errorAndLog.hpp"
 #include "openpose/utilities/cuda.hpp"
@@ -8,7 +8,6 @@
 
 namespace op
 {
-    const auto THREADS_PER_BLOCK_1D = 32u;
     __constant__ const unsigned char COCO_PAIRS_GPU[] = POSE_COCO_PAIRS_TO_RENDER;
     __constant__ const unsigned char MPI_PAIRS_GPU[] = POSE_MPI_PAIRS_TO_RENDER;
     __constant__ const float COCO_RGB_COLORS[] = {
@@ -388,24 +387,9 @@ namespace op
         }
     }
 
-    inline std::pair<dim3, dim3> getThreadsAndBlocks(const cv::Size& frameSize)
-    {
-        try
-        {
-            std::pair<dim3, dim3> threadsAndBlocks;
-            threadsAndBlocks.first = dim3{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D};
-            threadsAndBlocks.second = dim3{getNumberCudaBlocks(frameSize.width, threadsAndBlocks.first.x), getNumberCudaBlocks(frameSize.height, threadsAndBlocks.first.y)};
-            return threadsAndBlocks;
-        }
-        catch (const std::exception& e)
-        {
-            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
-            return std::make_pair(dim3{0,0,0}, dim3{0,0,0});
-        }
-    }
-
-    inline void renderKeyPointsPartAffinityAux(float* framePtr, const PoseModel poseModel, const cv::Size& frameSize, const float* const heatMapPtr,
-                                         const cv::Size& heatMapSize, const float scaleToKeepRatio, const int part, const int partsToRender, const float alphaBlending)
+    inline void renderKeyPointsPartAffinityAux(float* framePtr, const PoseModel poseModel, const cv::Size& frameSize,
+                                               const float* const heatMapPtr, const cv::Size& heatMapSize, const float scaleToKeepRatio,
+                                               const int part, const int partsToRender, const float alphaBlending)
     {
         try
         {
@@ -415,9 +399,9 @@ namespace op
             const auto heatMapOffset = POSE_NUMBER_BODY_PARTS[(int)poseModel] * heatMapSize.area();
             dim3 threadsPerBlock;
             dim3 numBlocks;
-            std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
-            renderPartAffinities<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width, heatMapSize.height, 
-                                                                 scaleToKeepRatio, partsToRender, part, alphaBlending);
+            std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
+            renderPartAffinities<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width,
+                                                                 heatMapSize.height, scaleToKeepRatio, partsToRender, part, alphaBlending);
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
         }
         catch (const std::exception& e)
@@ -441,13 +425,15 @@ namespace op
 
                 dim3 threadsPerBlock;
                 dim3 numBlocks;
-                std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
+                std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
                 const auto threshold = getThresholdForPose(poseModel);
 
                 if (poseModel == PoseModel::COCO_18)
-                    renderPoseCoco<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, threshold, googlyEyes, blendOriginalFrame, alphaBlending);
+                    renderPoseCoco<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, 
+                                                                   threshold, googlyEyes, blendOriginalFrame, alphaBlending);
                 else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4)
-                    renderPoseMpi29Parts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr, numberPeople, threshold, blendOriginalFrame, alphaBlending);
+                    renderPoseMpi29Parts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, posePtr,
+                                                                         numberPeople, threshold, blendOriginalFrame, alphaBlending);
                 else
                     error("Unvalid Model.", __LINE__, __FUNCTION__, __FILE__);
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
@@ -469,7 +455,7 @@ namespace op
             checkAlpha(alphaBlending);
             dim3 threadsPerBlock;
             dim3 numBlocks;
-            std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
+            std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
             const auto numberBodyParts = POSE_NUMBER_BODY_PARTS[(int)poseModel];
             const auto heatMapOffset = numberBodyParts * heatMapSize.area();
 
@@ -493,12 +479,12 @@ namespace op
             checkAlpha(alphaBlending);
             dim3 threadsPerBlock;
             dim3 numBlocks;
-            std::tie(threadsPerBlock, numBlocks) = getThreadsAndBlocks(frameSize);
+            std::tie(threadsPerBlock, numBlocks) = getNumberCudaThreadsAndBlocks(frameSize);
             const auto numberBodyParts = POSE_NUMBER_BODY_PARTS[(int)poseModel];
             const auto heatMapOffset = numberBodyParts * heatMapSize.area();
 
             renderBodyPartHeatMaps<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.width, frameSize.height, heatMapPtr, heatMapSize.width, heatMapSize.height,
-                                                                    scaleToKeepRatio, numberBodyParts, alphaBlending);
+                                                                   scaleToKeepRatio, numberBodyParts, alphaBlending);
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
         }
         catch (const std::exception& e)
diff --git a/src/openpose/utilities/cuda.cpp b/src/openpose/utilities/cuda.cpp
index 31f3588d..a0de64cf 100644
--- a/src/openpose/utilities/cuda.cpp
+++ b/src/openpose/utilities/cuda.cpp
@@ -1,14 +1,39 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include "openpose/utilities/errorAndLog.hpp"
+#include "openpose/utilities/fastMath.hpp"
 #include "openpose/utilities/cuda.hpp"
 
 namespace op
 {
+    const dim3 THREADS_PER_BLOCK_SMALL{32, 32, 1};      // selected when the longest frame side is < 1281
+    const dim3 THREADS_PER_BLOCK_MEDIUM{128, 128, 1};   // selected when the longest frame side is in [1281, 16384)
+    const dim3 THREADS_PER_BLOCK_BIG{256, 256, 1};      // selected when the longest frame side is >= 16384
+
     void cudaCheck(const int line, const std::string& function, const std::string& file)
     {
         const auto errorCode = cudaPeekAtLastError();
         if(errorCode != cudaSuccess)
-        	error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "):" + cudaGetErrorString(errorCode), line, function, file);
+            error("Cuda check failed (" + std::to_string(errorCode) + " vs. " + std::to_string(cudaSuccess) + "): " + cudaGetErrorString(errorCode), line, function, file);
+    }
+
+    std::pair<dim3, dim3> getNumberCudaThreadsAndBlocks(const cv::Size& frameSize)
+    {
+        try
+        {
+            // Choose the per-axis size from the longest frame side (SMALL if <= 1280, MEDIUM if < 16384, else BIG)
+            // and pair it with the ceil-divided counts needed to cover the frame in x and y.
+            // NOTE(review): callers in this patch launch with <<<first, second>>>, so `first` lands in the CUDA
+            // grid slot and `second` in the block slot despite the names -- confirm this is intended.
+            const auto longestSide = fastMax(frameSize.width, frameSize.height);
+            const auto perAxisSize = (longestSide >= 16384 ? THREADS_PER_BLOCK_BIG
+                                      : (longestSide >= 1281 ? THREADS_PER_BLOCK_MEDIUM : THREADS_PER_BLOCK_SMALL));
+            return std::make_pair(perAxisSize, getNumberCudaBlocks(frameSize, perAxisSize));
+        }
+        catch (const std::exception& e)
+        {
+            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+            return std::make_pair(dim3{1,1,1}, dim3{1,1,1});
+        }
+    }
 }
-- 
GitLab