From 35f8c9d18f9d252acb3efcadec327661105c58b8 Mon Sep 17 00:00:00 2001
From: joker3212 <apatwal@andrew.cmu.edu>
Date: Fri, 3 May 2019 21:42:31 -0400
Subject: [PATCH] Render and resize speedup in CUDA (#1209)

---
 include/openpose/face/faceGpuRenderer.hpp   |   3 +
 include/openpose/face/renderFace.hpp        |   9 +-
 include/openpose/gpu/cuda.hu                |  24 +-
 include/openpose/hand/handGpuRenderer.hpp   |   3 +
 include/openpose/hand/renderHand.hpp        |  11 +-
 include/openpose/net/resizeAndMergeBase.hpp |  10 +-
 include/openpose/pose/poseGpuRenderer.hpp   |   3 +
 include/openpose/pose/renderPose.hpp        |  66 ++--
 include/openpose/utilities/render.hu        | 223 ++++++++++-
 src/openpose/face/faceGpuRenderer.cpp       |  23 +-
 src/openpose/face/renderFace.cu             |  42 +-
 src/openpose/hand/handGpuRenderer.cpp       |  21 +-
 src/openpose/hand/renderHand.cu             |  42 +-
 src/openpose/net/resizeAndMergeBase.cu      | 217 ++++++++++-
 src/openpose/pose/poseGpuRenderer.cpp       |  33 +-
 src/openpose/pose/renderPose.cu             | 406 +++++++++++---------
 16 files changed, 847 insertions(+), 289 deletions(-)
diff --git a/include/openpose/face/faceGpuRenderer.hpp b/include/openpose/face/faceGpuRenderer.hpp
index dba5ece0..f0d82a4b 100644
--- a/include/openpose/face/faceGpuRenderer.hpp
+++ b/include/openpose/face/faceGpuRenderer.hpp
@@ -23,6 +23,9 @@ namespace op
 
     private:
         float* pGpuFace; // GPU aux memory
+        float* pMaxPtr; // GPU aux memory
+        float* pMinPtr; // GPU aux memory
+        float* pScalePtr; // GPU aux memory
 
         DELETE_COPY(FaceGpuRenderer);
     };
diff --git a/include/openpose/face/renderFace.hpp b/include/openpose/face/renderFace.hpp
index 3582d660..ed7a90e4 100644
--- a/include/openpose/face/renderFace.hpp
+++ b/include/openpose/face/renderFace.hpp
@@ -6,10 +6,13 @@
 
 namespace op
 {
-    OP_API void renderFaceKeypointsCpu(Array<float>& frameArray, const Array<float>& faceKeypoints, const float renderThreshold);
+    OP_API void renderFaceKeypointsCpu(
+        Array<float>& frameArray, const Array<float>& faceKeypoints, const float renderThreshold);
 
-    void renderFaceKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const facePtr, const int numberPeople,
-                                const float renderThreshold, const float alphaColorToAdd = FACE_DEFAULT_ALPHA_KEYPOINT);
+    void renderFaceKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
+        const float* const facePtr, const int numberPeople, const float renderThreshold,
+        const float alphaColorToAdd = FACE_DEFAULT_ALPHA_KEYPOINT);
 }
 
 #endif // OPENPOSE_FACE_RENDER_FACE_HPP
diff --git a/include/openpose/gpu/cuda.hu b/include/openpose/gpu/cuda.hu
index f0d842ec..6e299c0a 100644
--- a/include/openpose/gpu/cuda.hu
+++ b/include/openpose/gpu/cuda.hu
@@ -121,8 +121,9 @@ namespace op
     }
 
     template <typename T>
-    inline __device__ T bicubicInterpolate(const T* const sourcePtr, const T xSource, const T ySource, const int widthSource,
-                                           const int heightSource, const int widthSourcePtr)
+    inline __device__ T bicubicInterpolate(
+        const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
+        const int widthSourcePtr)
     {
         int xIntArray[4];
         int yIntArray[4];
@@ -140,6 +141,25 @@ namespace op
         return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
     }
 
+    template <typename T>
+    inline __device__ T bicubicInterpolate8Times(
+        const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
+        const int widthSourcePtr, const int threadIdxX, const int threadIdxY)
+    {
+        // Now we only need dx and dy
+        const T dx = xSource - fastTruncateCuda(int(xSource + 1e-5), 0, widthSource - 1);
+        const T dy = ySource - fastTruncateCuda(int(ySource + 1e-5), 0, heightSource - 1);
+
+        T temp[4];
+        for (unsigned char i = 0; i < 4; i++)
+        {
+            const auto offset = 5 * (i + (threadIdxY > 3 ? 1 : 0)) + (threadIdxX > 3 ? 1 : 0);
+            temp[i] = cubicInterpolate(sourcePtr[offset + 0], sourcePtr[offset + 1],
+                                       sourcePtr[offset + 2], sourcePtr[offset + 3], dx);
+        }
+        return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
+    }
+
     template <typename T>
     inline __device__ T addWeighted(const T value1, const T value2, const T alphaValue2)
     {
diff --git a/include/openpose/hand/handGpuRenderer.hpp b/include/openpose/hand/handGpuRenderer.hpp
index a9eb77d4..47e59b08 100644
--- a/include/openpose/hand/handGpuRenderer.hpp
+++ b/include/openpose/hand/handGpuRenderer.hpp
@@ -23,6 +23,9 @@ namespace op
 
     private:
         float* pGpuHand; // GPU aux memory
+        float* pMaxPtr; // GPU aux memory
+        float* pMinPtr; // GPU aux memory
+        float* pScalePtr; // GPU aux memory
 
         DELETE_COPY(HandGpuRenderer);
     };
diff --git a/include/openpose/hand/renderHand.hpp b/include/openpose/hand/renderHand.hpp
index dea0d024..7610a8d8 100644
--- a/include/openpose/hand/renderHand.hpp
+++ b/include/openpose/hand/renderHand.hpp
@@ -6,12 +6,13 @@
 
 namespace op
 {
-    OP_API void renderHandKeypointsCpu(Array<float>& frameArray, const std::array<Array<float>, 2>& handKeypoints,
-                                       const float renderThreshold);
+    OP_API void renderHandKeypointsCpu(
+        Array<float>& frameArray, const std::array<Array<float>, 2>& handKeypoints, const float renderThreshold);
 
-    void renderHandKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const handsPtr,
-                                const int numberHands, const float renderThreshold,
-                                const float alphaColorToAdd = HAND_DEFAULT_ALPHA_KEYPOINT);
+    void renderHandKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
+        const float* const handsPtr, const int numberHands, const float renderThreshold,
+        const float alphaColorToAdd = HAND_DEFAULT_ALPHA_KEYPOINT);
 }
 
 #endif // OPENPOSE_HAND_GPU_HAND_RENDER_HPP
diff --git a/include/openpose/net/resizeAndMergeBase.hpp b/include/openpose/net/resizeAndMergeBase.hpp
index 75c72406..ddfc487f 100644
--- a/include/openpose/net/resizeAndMergeBase.hpp
+++ b/include/openpose/net/resizeAndMergeBase.hpp
@@ -22,6 +22,14 @@ namespace op
         T* targetPtr, const std::vector<const T*>& sourcePtrs, std::vector<T*>& sourceTempPtrs,
         const std::array<int, 4>& targetSize, const std::vector<std::array<int, 4>>& sourceSizes,
         const std::vector<T>& scaleInputToNetInputs = {1.f}, const int gpuID = 0);
-}
 
+    // Functions for the files cvMatToOpInput/Output
+    void resizeAndMergeRGBGPU(
+        float* targetPtr, const float* const srcPtr, const int sourceWidth, const int sourceHeight,
+        const int targetWidth, const int targetHeight, const float scaleFactor);
+
+    void reorderAndCast(
+        float* targetPtr, const unsigned char* const srcPtr, const int width, const int height);
+
+}
 #endif // OPENPOSE_NET_RESIZE_AND_MERGE_BASE_HPP
diff --git a/include/openpose/pose/poseGpuRenderer.hpp b/include/openpose/pose/poseGpuRenderer.hpp
index 4553d074..1238d9d4 100644
--- a/include/openpose/pose/poseGpuRenderer.hpp
+++ b/include/openpose/pose/poseGpuRenderer.hpp
@@ -31,6 +31,9 @@ namespace op
         const std::shared_ptr<PoseExtractorNet> spPoseExtractorNet;
         // Init with thread
         float* pGpuPose; // GPU aux memory
+        float* pMaxPtr; // GPU aux memory
+        float* pMinPtr; // GPU aux memory
+        float* pScalePtr; // GPU aux memory
 
         DELETE_COPY(PoseGpuRenderer);
     };
diff --git a/include/openpose/pose/renderPose.hpp b/include/openpose/pose/renderPose.hpp
index c72dd7f4..196b3617 100644
--- a/include/openpose/pose/renderPose.hpp
+++ b/include/openpose/pose/renderPose.hpp
@@ -8,39 +8,39 @@
 
 namespace op
 {
-    OP_API void renderPoseKeypointsCpu(Array<float>& frameArray, const Array<float>& poseKeypoints,
-                                       const PoseModel poseModel, const float renderThreshold,
-                                       const bool blendOriginalFrame = true);
-
-    void renderPoseKeypointsGpu(float* framePtr, const PoseModel poseModel, const int numberPeople,
-                                const Point<int>& frameSize, const float* const posePtr,
-                                const float renderThreshold, const bool googlyEyes = false,
-                                const bool blendOriginalFrame = true,
-                                const float alphaBlending = POSE_DEFAULT_ALPHA_KEYPOINT);
-
-    void renderPoseHeatMapGpu(float* frame, const Point<int>& frameSize, const float* const heatMapPtr,
-                              const Point<int>& heatMapSize, const float scaleToKeepRatio,
-                              const unsigned int part,
-                              const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
-
-    void renderPoseHeatMapsGpu(float* frame, const PoseModel poseModel, const Point<int>& frameSize,
-                               const float* const heatMapPtr, const Point<int>& heatMapSize,
-                               const float scaleToKeepRatio,
-                               const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
-
-    void renderPosePAFGpu(float* framePtr, const PoseModel poseModel, const Point<int>& frameSize,
-                          const float* const heatMapPtr, const Point<int>& heatMapSize,
-                          const float scaleToKeepRatio, const int part,
-                          const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
-
-    void renderPosePAFsGpu(float* framePtr, const PoseModel poseModel, const Point<int>& frameSize,
-                           const float* const heatMapPtr, const Point<int>& heatMapSize,
-                           const float scaleToKeepRatio,
-                           const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
-
-    void renderPoseDistanceGpu(float* framePtr, const Point<int>& frameSize, const float* const heatMapPtr,
-                               const Point<int>& heatMapSize, const float scaleToKeepRatio,
-                               const unsigned int part, const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+    OP_API void renderPoseKeypointsCpu(
+        Array<float>& frameArray, const Array<float>& poseKeypoints, const PoseModel poseModel,
+        const float renderThreshold, const bool blendOriginalFrame = true);
+
+    void renderPoseKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const PoseModel poseModel,
+        const int numberPeople, const Point<int>& frameSize, const float* const posePtr,
+        const float renderThreshold, const bool googlyEyes = false, const bool blendOriginalFrame = true,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_KEYPOINT);
+
+    void renderPoseHeatMapGpu(
+        float* frame, const Point<int>& frameSize, const float* const heatMapPtr, const Point<int>& heatMapSize,
+        const float scaleToKeepRatio, const unsigned int part,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+
+    void renderPoseHeatMapsGpu(
+        float* frame, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
+        const Point<int>& heatMapSize, const float scaleToKeepRatio,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+
+    void renderPosePAFGpu(
+        float* framePtr, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
+        const Point<int>& heatMapSize, const float scaleToKeepRatio, const int part,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+
+    void renderPosePAFsGpu(
+        float* framePtr, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
+        const Point<int>& heatMapSize, const float scaleToKeepRatio,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+
+    void renderPoseDistanceGpu(
+        float* framePtr, const Point<int>& frameSize, const float* const heatMapPtr, const Point<int>& heatMapSize,
+        const float scaleToKeepRatio, const unsigned int part, const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
 }
 
 #endif // OPENPOSE_POSE_RENDER_POSE_HPP
diff --git a/include/openpose/utilities/render.hu b/include/openpose/utilities/render.hu
index 4948f373..0ff4791f 100644
--- a/include/openpose/utilities/render.hu
+++ b/include/openpose/utilities/render.hu
@@ -3,16 +3,217 @@
 
 namespace op
 {
-    inline __device__ void renderKeypoints(float* targetPtr, float2* sharedMaxs, float2* sharedMins,
-                                           float* sharedScaleF, const int globalIdx, const int x, const int y,
-                                           const int targetWidth, const int targetHeight,
-                                           const float* const keypointsPtr, const unsigned int* const partPairsPtr,
-                                           const int numberPeople, const int numberParts, const int numberPartPairs,
-                                           const float* const rgbColorsPtr, const int numberColors, const float radius,
-                                           const float lineWidth, const float* const keypointScalePtr,
-                                           const int numberScales, const float threshold,
-                                           const float alphaColorToAdd, const bool blendOriginalFrame = true,
-                                           const int googlyEye1 = -1, const int googlyEye2 = -1)
+    __inline__ __device__ void getBoundingBoxPerPerson(
+        float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
+        const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
+    {
+        const auto globalIdx = threadIdx.x;
+        // const auto xIndex = 2*globalIdx;
+        // const auto yIndex = xIndex+1;
+        const auto xIndex = globalIdx;
+        const auto yIndex = numberPeople+globalIdx;
+
+        // Fill shared parameters
+        // if (globalIdx < numberPeople)
+        {
+            auto minValueX = (float)targetWidth;
+            auto minValueY = (float)targetHeight;
+            auto maxValueX = 0.f;
+            auto maxValueY = 0.f;
+            for (auto part = 0 ; part < numberParts ; part++)
+            {
+                const auto index = 3 * (globalIdx*numberParts + part);
+                const auto x = keypointsPtr[index];
+                const auto y = keypointsPtr[index+1];
+                const auto score = keypointsPtr[index+2];
+                if (score > threshold)
+                {
+                    if (x < minValueX)
+                        minValueX = x;
+                    if (x > maxValueX)
+                        maxValueX = x;
+                    if (y < minValueY)
+                        minValueY = y;
+                    if (y > maxValueY)
+                        maxValueY = y;
+                }
+            }
+            if (maxValueX != 0.f && maxValueY != 0.f)
+            {
+                const auto averageX = maxValueX - minValueX;
+                const auto averageY = maxValueY - minValueY;
+                // (averageX + averageY) / 2.f / 400.f
+                scalePtr[globalIdx] = fastTruncateCuda((averageX + averageY) / 400.f, 0.33f, 1.f);
+                const auto constantToAdd = 50.f;
+                maxValueX += constantToAdd;
+                maxValueY += constantToAdd;
+                minValueX -= constantToAdd;
+                minValueY -= constantToAdd;
+            }
+
+            minPtr[xIndex] = minValueX;
+            minPtr[yIndex] = minValueY;
+            maxPtr[xIndex] = maxValueX;
+            maxPtr[yIndex] = maxValueY;
+        }
+    }
+
+    __inline__ __device__ void renderKeypoints(
+        float* targetPtr, float* sharedMaxs, float* sharedMins, float* sharedScaleF, const float* const maxPtr,
+        const float* const minPtr, const float* const scalePtr, const int globalIdx, const int x, const int y,
+        const int targetWidth, const int targetHeight, const float* const keypointsPtr,
+        const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts,
+        const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius,
+        const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold,
+        const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1,
+        const int googlyEye2 = -1)
+    {
+        // Load shared memory
+        if (globalIdx < 2*numberPeople)
+        {
+            sharedMins[globalIdx] = minPtr[globalIdx];
+            sharedMaxs[globalIdx] = maxPtr[globalIdx];
+            if (globalIdx < numberPeople)
+                sharedScaleF[globalIdx] = scalePtr[globalIdx];
+        }
+        __syncthreads();
+
+        // Fill each (x,y) target pixel
+        if (x < targetWidth && y < targetHeight)
+        {
+            const auto baseIndex = 3*(y * targetWidth + x);
+            auto b = targetPtr[baseIndex];
+            auto g = targetPtr[baseIndex+1];
+            auto r = targetPtr[baseIndex+2];
+            if (!blendOriginalFrame)
+            {
+                b = 0.f;
+                g = 0.f;
+                r = 0.f;
+            }
+
+            const auto lineWidthSquared = lineWidth * lineWidth;
+            const auto radiusSquared = radius * radius;
+            for (auto person = 0; person < numberPeople; person++)
+            {
+                // Make sure person x,y in the limits
+                // Make sure person is not empty. Assume all joints are below threshold. Then
+                // maxs = 0 and mins = width/height. So if statement would be false
+                // const auto xIndex = 2*person;
+                // const auto yIndex = xIndex+1;
+                const auto xIndex = person;
+                const auto yIndex = numberPeople+person;
+                if (x <= sharedMaxs[xIndex] && x >= sharedMins[xIndex]
+                     && y <= sharedMaxs[yIndex] && y >= sharedMins[yIndex])
+                {
+                    // Part pair connections
+                    for (auto partPair = 0; partPair < numberPartPairs; partPair++)
+                    {
+                        const auto partA = partPairsPtr[2*partPair];
+                        const auto partB = partPairsPtr[2*partPair+1];
+                        const auto indexA = person*numberParts*3 + partA*3;
+                        const auto xA = keypointsPtr[indexA];
+                        const auto yA = keypointsPtr[indexA + 1];
+                        const auto scoreA = keypointsPtr[indexA + 2];
+                        const auto indexB = person*numberParts*3 + partB*3;
+                        const auto xB = keypointsPtr[indexB];
+                        const auto yB = keypointsPtr[indexB + 1];
+                        const auto scoreB = keypointsPtr[indexB + 2];
+
+                        if (scoreA > threshold && scoreB > threshold)
+                        {
+                            const auto keypointScale = keypointScalePtr[partB%numberScales]
+                                                     * keypointScalePtr[partB%numberScales]
+                                                     * keypointScalePtr[partB%numberScales];
+                            const auto lineWidthScaled = lineWidthSquared * keypointScale;
+                            const auto bSqrt = sharedScaleF[person] * sharedScaleF[person] * lineWidthScaled;
+
+                            const auto xP = (xA + xB) / 2.f;
+                            const auto yP = (yA + yB) / 2.f;
+                            const auto aSqrt = (xA - xP) * (xA - xP) + (yA - yP) * (yA - yP);
+
+                            const auto angle = atan2f(yB - yA, xB - xA);
+                            const auto sine = sinf(angle);
+                            const auto cosine = cosf(angle);
+                            const auto A = cosine * (x - xP) + sine * (y - yP);
+                            const auto B = sine * (x - xP) - cosine * (y - yP);
+
+                            const auto judge = A * A / aSqrt + B * B / bSqrt;
+                            const auto minV = 0.f;
+                            const auto maxV = 1.f;
+                            if (minV <= judge && judge <= maxV)
+                                // Before used partPair vs partB
+                                addColorWeighted(r, g, b, &rgbColorsPtr[(partB%numberColors)*3], alphaColorToAdd);
+                        }
+                    }
+
+                    // Part circles
+                    for (auto part = 0u; part < numberParts; part++)
+                    {
+                        const auto index = 3 * (person*numberParts + part);
+                        const auto localX = keypointsPtr[index];
+                        const auto localY = keypointsPtr[index + 1];
+                        const auto score = keypointsPtr[index + 2];
+
+                        if (score > threshold)
+                        {
+                            const auto keypointScale = keypointScalePtr[part%numberScales]
+                                                     * keypointScalePtr[part%numberScales]
+                                                     * keypointScalePtr[part%numberScales];
+                            const auto radiusScaled = radiusSquared * keypointScale;
+                            const auto dist2 = (x - localX) * (x - localX) + (y - localY) * (y - localY);
+                            // Googly eyes
+                            if (googlyEye1 == part || googlyEye2 == part)
+                            {
+                                const auto eyeRatio = 2.5f * sqrt(radiusScaled);
+                                const auto minr2 = sharedScaleF[person] * sharedScaleF[person]
+                                                 * (eyeRatio - 2) * (eyeRatio - 2);
+                                const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * eyeRatio * eyeRatio;
+                                if (dist2 <= maxr2)
+                                {
+                                    float colorToAdd [3] = {0., 0., 0.};
+                                    if (dist2 <= minr2)
+                                        for (auto& color : colorToAdd)
+                                            color = {255.f};
+                                    if (dist2 <= minr2*0.6f)
+                                    {
+                                        const auto dist3 = (x-4 - localX)
+                                                         * (x-4 - localX) + (y - localY+4) * (y - localY+4);
+                                        if (dist3 > 14.0625f) // 3.75f^2
+                                            for (auto& color : colorToAdd)
+                                                color = {0.f};
+                                    }
+                                    const auto alphaColorToAdd = 0.9f;
+                                    addColorWeighted(r, g, b, colorToAdd, alphaColorToAdd);
+                                }
+                            }
+                            // Other parts
+                            else
+                            {
+                                const auto minr2 = 0.f;
+                                const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * radiusScaled;
+                                if (minr2 <= dist2 && dist2 <= maxr2)
+                                    addColorWeighted(r, g, b, &rgbColorsPtr[(part%numberColors)*3], alphaColorToAdd);
+                            }
+
+                        }
+                    }
+                }
+            }
+            targetPtr[baseIndex] = b;
+            targetPtr[baseIndex+1] = g;
+            targetPtr[baseIndex+2] = r;
+        }
+    }
+
+    __inline__ __device__ void renderKeypointsOld(
+        float* targetPtr, float2* sharedMaxs, float2* sharedMins, float* sharedScaleF, const int globalIdx,
+        const int x, const int y, const int targetWidth, const int targetHeight, const float* const keypointsPtr,
+        const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts,
+        const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius,
+        const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold,
+        const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1,
+        const int googlyEye2 = -1)
     {
         // Fill shared parameters
         if (globalIdx < numberPeople)
@@ -52,7 +253,6 @@ namespace op
                 sharedMins[globalIdx].y -= constantToAdd;
             }
         }
-
         __syncthreads();
 
         // Fill each (x,y) target pixel
@@ -176,4 +376,5 @@ namespace op
     }
 }
 
+
 #endif // OPENPOSE_UTILITIES_RENDER_HU
diff --git a/src/openpose/face/faceGpuRenderer.cpp b/src/openpose/face/faceGpuRenderer.cpp
index be129e58..e0bf4d38 100644
--- a/src/openpose/face/faceGpuRenderer.cpp
+++ b/src/openpose/face/faceGpuRenderer.cpp
@@ -10,7 +10,11 @@ namespace op
 {
     FaceGpuRenderer::FaceGpuRenderer(const float renderThreshold, const float alphaKeypoint,
                                      const float alphaHeatMap) :
-        GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap}
+        GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap},
+        pGpuFace{nullptr},
+        pMaxPtr{nullptr},
+        pMinPtr{nullptr},
+        pScalePtr{nullptr}
     {
     }
 
@@ -20,7 +24,14 @@ namespace op
         {
             // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
             #ifdef USE_CUDA
-                cudaFree(pGpuFace);
+                if (pGpuFace != nullptr)
+                    cudaFree(pGpuFace);
+                if (pMaxPtr != nullptr)
+                    cudaFree(pMaxPtr);
+                if (pMinPtr != nullptr)
+                    cudaFree(pMinPtr);
+                if (pScalePtr != nullptr)
+                    cudaFree(pScalePtr);
             #endif
         }
         catch (const std::exception& e)
@@ -37,6 +48,9 @@ namespace op
             // GPU memory allocation for rendering
             #ifdef USE_CUDA
                 cudaMalloc((void**)(&pGpuFace), POSE_MAX_PEOPLE * FACE_NUMBER_PARTS * 3 * sizeof(float));
+                cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * FACE_NUMBER_PARTS);
+                cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * FACE_NUMBER_PARTS);
+                cudaMalloc((void**)&pScalePtr, sizeof(float) * FACE_NUMBER_PARTS);
             #endif
             log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
         }
@@ -63,8 +77,9 @@ namespace op
                     cudaMemcpy(pGpuFace, faceKeypoints.getConstPtr(),
                                faceKeypoints.getSize(0) * FACE_NUMBER_PARTS * 3 * sizeof(float),
                                cudaMemcpyHostToDevice);
-                    renderFaceKeypointsGpu(*spGpuMemory, frameSize, pGpuFace, faceKeypoints.getSize(0),
-                                           mRenderThreshold, getAlphaKeypoint());
+                    renderFaceKeypointsGpu(
+                        *spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, frameSize, pGpuFace, faceKeypoints.getSize(0),
+                        mRenderThreshold, getAlphaKeypoint());
                     // CUDA check
                     cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                 }
diff --git a/src/openpose/face/renderFace.cu b/src/openpose/face/renderFace.cu
index 898a3392..34b71776 100644
--- a/src/openpose/face/renderFace.cu
+++ b/src/openpose/face/renderFace.cu
@@ -10,17 +10,25 @@ namespace op
     __constant__ const float SCALES[] = {FACE_SCALES_RENDER_GPU};
     __constant__ const float COLORS[] = {FACE_COLORS_RENDER_GPU};
 
-    __global__ void renderFaceParts(float* targetPtr, const int targetWidth, const int targetHeight,
-                                    const float* const facePtr, const int numberPeople,
-                                    const float threshold, const float alphaColorToAdd)
+    __global__ void getBoundingBoxPerPersonFace(
+        float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
+        const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
+    {
+        getBoundingBoxPerPerson(
+            maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold);
+    }
+
+    __global__ void renderFaceParts(
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const facePtr, const int numberPeople, const float threshold, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[FACE_MAX_FACES];
-        __shared__ float2 sharedMaxs[FACE_MAX_FACES];
+        __shared__ float sharedMins[2*FACE_MAX_FACES];
+        __shared__ float sharedMaxs[2*FACE_MAX_FACES];
         __shared__ float sharedScaleF[FACE_MAX_FACES];
 
         // Other parameters
@@ -31,23 +39,33 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 250.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        facePtr, PART_PAIRS_GPU, numberPeople, FACE_NUMBER_PARTS, numberPartPairs, COLORS,
-                        numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
+            targetHeight, facePtr, PART_PAIRS_GPU, numberPeople, FACE_NUMBER_PARTS, numberPartPairs, COLORS,
+            numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
     }
 
-    void renderFaceKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const facePtr,
-                                const int numberPeople, const float renderThreshold, const float alphaColorToAdd)
+    void renderFaceKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
+        const float* const facePtr, const int numberPeople, const float renderThreshold, const float alphaColorToAdd)
     {
         try
         {
             if (numberPeople > 0)
             {
+                // Get bouding boxes
+                const dim3 threadsPerBlockBoundBox = {1, 1, 1};
+                const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
+                getBoundingBoxPerPersonFace<<<threadsPerBlockBoundBox, numBlocksBox>>>(
+                    maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, facePtr, numberPeople,
+                    FACE_NUMBER_PARTS, renderThreshold);
+                // Draw hands
                 dim3 threadsPerBlock;
                 dim3 numBlocks;
                 getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
-                renderFaceParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.x, frameSize.y, facePtr,
-                                                                numberPeople, renderThreshold, alphaColorToAdd);
+                renderFaceParts<<<threadsPerBlock, numBlocks>>>(
+                    framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, facePtr, numberPeople,
+                    renderThreshold, alphaColorToAdd);
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             }
         }
diff --git a/src/openpose/hand/handGpuRenderer.cpp b/src/openpose/hand/handGpuRenderer.cpp
index 8d02f433..f76cd107 100644
--- a/src/openpose/hand/handGpuRenderer.cpp
+++ b/src/openpose/hand/handGpuRenderer.cpp
@@ -10,7 +10,11 @@ namespace op
 {
     HandGpuRenderer::HandGpuRenderer(const float renderThreshold, const float alphaKeypoint,
                                      const float alphaHeatMap) :
-        GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap}
+        GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap},
+        pGpuHand{nullptr},
+        pMaxPtr{nullptr},
+        pMinPtr{nullptr},
+        pScalePtr{nullptr}
     {
     }
 
@@ -20,7 +24,14 @@ namespace op
         {
             // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
             #ifdef USE_CUDA
-                cudaFree(pGpuHand);
+                if (pGpuHand != nullptr)
+                    cudaFree(pGpuHand);
+                if (pMaxPtr != nullptr)
+                    cudaFree(pMaxPtr);
+                if (pMinPtr != nullptr)
+                    cudaFree(pMinPtr);
+                if (pScalePtr != nullptr)
+                    cudaFree(pScalePtr);
             #endif
         }
         catch (const std::exception& e)
@@ -37,6 +48,9 @@ namespace op
             // GPU memory allocation for rendering
             #ifdef USE_CUDA
                 cudaMalloc((void**)(&pGpuHand), HAND_MAX_HANDS * HAND_NUMBER_PARTS * 3 * sizeof(float));
+                cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * HAND_MAX_HANDS);
+                cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * HAND_MAX_HANDS);
+                cudaMalloc((void**)&pScalePtr, sizeof(float) * HAND_MAX_HANDS);
             #endif
             log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
         }
@@ -69,7 +83,8 @@ namespace op
                     cudaMemcpy(pGpuHand + handVolume, handKeypoints[1].getConstPtr(),
                                handVolume * sizeof(float), cudaMemcpyHostToDevice);
                     renderHandKeypointsGpu(
-                        *spGpuMemory, frameSize, pGpuHand, 2 * numberPeople, mRenderThreshold, getAlphaKeypoint());
+                        *spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, frameSize, pGpuHand, 2 * numberPeople,
+                        mRenderThreshold, getAlphaKeypoint());
                     // CUDA check
                     cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                 }
diff --git a/src/openpose/hand/renderHand.cu b/src/openpose/hand/renderHand.cu
index 8564ee16..d3f478db 100644
--- a/src/openpose/hand/renderHand.cu
+++ b/src/openpose/hand/renderHand.cu
@@ -10,17 +10,25 @@ namespace op
     __constant__ const float SCALES[] = {HAND_SCALES_RENDER_GPU};
     __constant__ const float COLORS[] = {HAND_COLORS_RENDER_GPU};
 
-    __global__ void renderHandsParts(float* targetPtr, const int targetWidth, const int targetHeight,
-                                     const float* const handsPtr, const int numberHands,
-                                     const float threshold, const float alphaColorToAdd)
+    __global__ void getBoundingBoxPerPersonHand(
+        float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
+        const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
+    {
+        getBoundingBoxPerPerson(
+            maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold);
+    }
+
+    __global__ void renderHandsParts(
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const handsPtr, const int numberHands, const float threshold, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[HAND_MAX_HANDS];
-        __shared__ float2 sharedMaxs[HAND_MAX_HANDS];
+        __shared__ float sharedMins[2*HAND_MAX_HANDS];
+        __shared__ float sharedMaxs[2*HAND_MAX_HANDS];
         __shared__ float sharedScaleF[HAND_MAX_HANDS];
 
         // Other parameters
@@ -31,23 +39,33 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 80.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        handsPtr, PART_PAIRS_GPU, numberHands, HAND_NUMBER_PARTS, numberPartPairs, COLORS,
-                        numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
+            targetHeight, handsPtr, PART_PAIRS_GPU, numberHands, HAND_NUMBER_PARTS, numberPartPairs, COLORS,
+            numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
     }
 
-    void renderHandKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const handsPtr,
-                                const int numberHands, const float renderThreshold, const float alphaColorToAdd)
+    void renderHandKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
+        const float* const handsPtr, const int numberHands, const float renderThreshold, const float alphaColorToAdd)
     {
         try
         {
             if (numberHands > 0)
             {
+                // Get bouding boxes
+                const dim3 threadsPerBlockBoundBox = {1, 1, 1};
+                const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
+                getBoundingBoxPerPersonHand<<<threadsPerBlockBoundBox, numBlocksBox>>>(
+                    maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, handsPtr, numberHands,
+                    HAND_NUMBER_PARTS, renderThreshold);
+                // Draw hands
                 dim3 threadsPerBlock;
                 dim3 numBlocks;
                 getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
-                renderHandsParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.x, frameSize.y, handsPtr,
-                                                                 numberHands, renderThreshold, alphaColorToAdd);
+                renderHandsParts<<<threadsPerBlock, numBlocks>>>(
+                    framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, handsPtr, numberHands,
+                    renderThreshold, alphaColorToAdd);
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             }
         }
diff --git a/src/openpose/net/resizeAndMergeBase.cu b/src/openpose/net/resizeAndMergeBase.cu
index 38dd2ba3..a52b8969 100644
--- a/src/openpose/net/resizeAndMergeBase.cu
+++ b/src/openpose/net/resizeAndMergeBase.cu
@@ -6,19 +6,111 @@ namespace op
 {
     const auto THREADS_PER_BLOCK_1D = 16u;
 
+    // template <typename T>
+    // __global__ void resizeKernelOld(
+    //     T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
+    //     const int targetHeight)
+    // {
+    //     const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    //     const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+    //     if (x < targetWidth && y < targetHeight)
+    //     {
+    //         const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
+    //         const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
+    //         targetPtr[y*targetWidth+x] = bicubicInterpolate(
+    //             sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
+    //     }
+    // }
+
+    template <typename T>
+    __global__ void resizeKernel(
+        T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
+        const int targetHeight, const int channels)
+    {
+        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
+
+        const auto sourceArea = sourceWidth * sourceHeight;
+        const auto targetArea = targetWidth * targetHeight;
+
+        if (x < targetWidth && y < targetHeight && channel < channels)
+        {
+            const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
+            const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
+            const T* sourcePtrChannel = sourcePtr + channel * sourceArea;
+            targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate(
+                sourcePtrChannel, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
+        }
+    }
+
     template <typename T>
-    __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight,
-                                 const int targetWidth, const int targetHeight)
+    __global__ void resizeAndPadKernel(
+        T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
+        const int targetHeight, const float rescaleFactor, const int channels)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
+
+        const auto sourceArea = sourceWidth * sourceHeight;
+        const auto targetArea = targetWidth * targetHeight;
+
+        if (x < targetWidth && y < targetHeight && channel < channels)
+        {
+            const T xSource = (x + T(0.5f)) * 1.0 / T(rescaleFactor) - T(0.5f);
+            const T ySource = (y + T(0.5f)) * 1.0 / T(rescaleFactor) - T(0.5f);
+            const T* sourcePtrChannel = sourcePtr + channel * sourceArea;
+            if (x < sourceWidth * rescaleFactor && y < sourceHeight * rescaleFactor)
+                targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate(
+                    sourcePtrChannel, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
+            else
+                targetPtr[channel * targetArea + y*targetWidth+x] = 0;
+        }
+    }
+
+
+        template <typename T>
+    __global__ void resize8TimesKernel(
+        T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
+        const int targetHeight, const unsigned int rescaleFactor)
+    {
+        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
+
+        // Load shared memory
+        // If resize >= 5, then #threads per block >= # elements of shared memory
+        __shared__ T sourcePtrShared[25];
+        const auto sharedLoadId = threadIdx.x + rescaleFactor*threadIdx.y;
+        if (sharedLoadId < 25)
+        {
+            const auto minTargetX = blockIdx.x * rescaleFactor;
+            const auto minSourceX = (minTargetX + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
+            const auto minSourceXInt_1 = int(minSourceX+ 1e-5);
+            const auto minSourceXInt = minSourceXInt_1 - 1;
+            const auto minTargetY = blockIdx.y * rescaleFactor;
+            const auto minSourceY = (minTargetY + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
+            const auto minSourceYInt_1 = int(minSourceY  + 1e-5);
+            const auto minSourceYInt = minSourceYInt_1 - 1;
+
+            const auto yClean = fastTruncateCuda(int(minSourceYInt+sharedLoadId/5 + 1e-5), 0, sourceHeight - 1);
+            const auto xClean = fastTruncateCuda(int(minSourceXInt+sharedLoadId%5 + 1e-5), 0, sourceWidth - 1);
+            const auto sourceArea = sourceWidth * sourceHeight;
+            const T* sourcePtrChannel = sourcePtr + channel * sourceArea;
+            sourcePtrShared[sharedLoadId] = sourcePtrChannel[yClean * sourceWidth + xClean];
+        }
+        // Wait here until shared memory has been loaded
+        __syncthreads();
 
         if (x < targetWidth && y < targetHeight)
         {
+            const auto targetArea = targetWidth * targetHeight;
             const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
             const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
-            targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight,
-                                                            sourceWidth);
+            targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate8Times(
+                sourcePtrShared, xSource, ySource, sourceWidth, sourceHeight, sourceWidth, threadIdx.x, threadIdx.y);
         }
     }
 
@@ -58,10 +150,53 @@ namespace op
         }
     }
 
+    __global__ void reorderAndCastKernel(
+        float* targetPtr, const unsigned char* const srcPtr, const int width, const int height)
+    {
+        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto c = (blockIdx.z * blockDim.z) + threadIdx.z;
+        if (x < width && y < height)
+        {
+            const auto channels = 3;
+            const auto originFramePtrOffsetY = y * width;
+            const auto channelOffset = c * width * height;
+            const auto targetIndex = channelOffset + y * width + x;
+            const auto srcIndex = (originFramePtrOffsetY + x) * channels + c;
+            targetPtr[targetIndex] =  float(srcPtr[srcIndex]) * (1/256.f) - 0.5f;
+        }
+    }
+
+    void reorderAndCast(float* targetPtr, const unsigned char* const srcPtr, const int width, const int height)
+    {
+        const dim3 threadsPerBlock{32, 1, 1};
+        const dim3 numBlocks{
+            getNumberCudaBlocks(width, threadsPerBlock.x),
+            getNumberCudaBlocks(height, threadsPerBlock.y),
+            getNumberCudaBlocks(3, threadsPerBlock.z)};
+        reorderAndCastKernel<<<numBlocks, threadsPerBlock>>>(targetPtr, srcPtr, width, height);
+    }
+
+    void resizeAndMergeRGBGPU(
+        float* targetPtr, const float* const srcPtr, const int sourceWidth, const int sourceHeight,
+        const int targetWidth, const int targetHeight, const float scaleFactor)
+
+    {
+        const auto channels = 3;
+        const dim3 threadsPerBlock{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D, 1};
+        const dim3 numBlocks{
+            getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
+            getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
+            getNumberCudaBlocks(channels, threadsPerBlock.z)};
+
+        resizeAndPadKernel<<<numBlocks, threadsPerBlock>>>(
+            targetPtr, srcPtr, sourceWidth, sourceHeight, targetWidth, targetHeight, scaleFactor, channels);
+    }
+
     template <typename T>
-    void resizeAndMergeGpu(T* targetPtr, const std::vector<const T*>& sourcePtrs, const std::array<int, 4>& targetSize,
-                           const std::vector<std::array<int, 4>>& sourceSizes,
-                           const std::vector<T>& scaleInputToNetInputs)
+    void resizeAndMergeGpu(
+        T* targetPtr, const std::vector<const T*>& sourcePtrs, const std::array<int, 4>& targetSize,
+        const std::vector<std::array<int, 4>>& sourceSizes, const std::vector<T>& scaleInputToNetInputs)
     {
         try
         {
@@ -90,20 +225,60 @@ namespace op
                 const auto num = sourceSize[0];
                 if (targetSize[0] > 1 || num == 1)
                 {
-                    const auto sourceChannelOffset = sourceHeight * sourceWidth;
-                    const auto targetChannelOffset = targetWidth * targetHeight;
-                    for (auto n = 0; n < num; n++)
-                    {
-                        const auto offsetBase = n*channels;
-                        for (auto c = 0 ; c < channels ; c++)
-                        {
-                            const auto offset = offsetBase + c;
-                            resizeKernel<<<numBlocks, threadsPerBlock>>>(targetPtr + offset * targetChannelOffset,
-                                                                         sourcePtrs.at(0) + offset * sourceChannelOffset,
-                                                                         sourceWidth, sourceHeight, targetWidth,
-                                                                         targetHeight);
-                        }
-                    }
+                    // // Profiling code
+                    // const auto REPS = 250;
+                    // double timeNormalize0 = 0.;
+                    // double timeNormalize1 = 0.;
+                    // double timeNormalize2 = 0.;
+                    // double timeNormalize3 = 0.;
+                    // // Non-optimized function
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    // const auto sourceChannelOffset = sourceHeight * sourceWidth;
+                    // const auto targetChannelOffset = targetWidth * targetHeight;
+                    // for (auto n = 0; n < num; n++)
+                    // {
+                    //     const auto offsetBase = n*channels;
+                    //     for (auto c = 0 ; c < channels ; c++)
+                    //     {
+                    //         const auto offset = offsetBase + c;
+                    //         resizeKernelOld<<<numBlocks, threadsPerBlock>>>(
+                    //             targetPtr + offset * targetChannelOffset,
+                    //             sourcePtrs.at(0) + offset * sourceChannelOffset,
+                    //             sourceWidth, sourceHeight, targetWidth, targetHeight);
+                    //     }
+                    // }
+                    // OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS);
+
+                    // Optimized function for any resize size (suboptimal for 8x resize)
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    const dim3 threadsPerBlock{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D, 1};
+                    const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
+                                         getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
+                                         getNumberCudaBlocks(num * channels, threadsPerBlock.z)};
+                    resizeKernel<<<numBlocks, threadsPerBlock>>>(
+                        targetPtr, sourcePtrs.at(0), sourceWidth, sourceHeight, targetWidth, targetHeight,
+                        num * channels);
+                    // OP_CUDA_PROFILE_END(timeNormalize2, 1e3, REPS);
+
+                    // // Optimized function for 8x resize
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    // if (targetWidth / sourceWidth != 8 || targetHeight / sourceHeight != 8)
+                    //     error("Kernel only implemented for 8x resize. Notify us if this error appears.",
+                    //         __LINE__, __FUNCTION__, __FILE__);
+                    // const auto rescaleFactor = (unsigned int) std::ceil((float)(targetHeight) / (float)(sourceHeight));
+
+                    // const dim3 threadsPerBlock{rescaleFactor, rescaleFactor, 1};
+                    // const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
+                    //                      getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
+                    //                      getNumberCudaBlocks(num * channels, threadsPerBlock.z)};
+                    // resize8TimesKernel<<<numBlocks, threadsPerBlock>>>(
+                    //     targetPtr, sourcePtrs.at(0), sourceWidth, sourceHeight, targetWidth, targetHeight, rescaleFactor);
+                    // OP_CUDA_PROFILE_END(timeNormalize3, 1e3, REPS);
+
+                    // Profiling code
+                    // log("  Res(ori)=" + std::to_string(timeNormalize1) + "ms");
+                    // log("  Res(new)=" + std::to_string(timeNormalize2) + "ms");
+                    // log("  Res(new8x)=" + std::to_string(timeNormalize3) + "ms");
                 }
                 // Old inefficient multi-scale merging
                 else
diff --git a/src/openpose/pose/poseGpuRenderer.cpp b/src/openpose/pose/poseGpuRenderer.cpp
index ab1fe86e..030d2cf4 100644
--- a/src/openpose/pose/poseGpuRenderer.cpp
+++ b/src/openpose/pose/poseGpuRenderer.cpp
@@ -22,7 +22,10 @@ namespace op
                     getNumberElementsToRender(poseModel)}, // mNumberElementsToRender
         PoseRenderer{poseModel},
         spPoseExtractorNet{poseExtractorNet},
-        pGpuPose{nullptr}
+        pGpuPose{nullptr},
+        pMaxPtr{nullptr},
+        pMinPtr{nullptr},
+        pScalePtr{nullptr}
     {
     }
 
@@ -32,7 +35,14 @@ namespace op
         {
             // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
             #ifdef USE_CUDA
-                cudaFree(pGpuPose);
+                if (pGpuPose != nullptr)
+                    cudaFree(pGpuPose);
+                if (pMaxPtr != nullptr)
+                    cudaFree(pMaxPtr);
+                if (pMinPtr != nullptr)
+                    cudaFree(pMinPtr);
+                if (pScalePtr != nullptr)
+                    cudaFree(pScalePtr);
             #endif
         }
         catch (const std::exception& e)
@@ -49,7 +59,10 @@ namespace op
             // GPU memory allocation for rendering
             #ifdef USE_CUDA
                 cudaMalloc((void**)(&pGpuPose),
-                           POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float));
+                    POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float));
+                cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * POSE_MAX_PEOPLE);
+                cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * POSE_MAX_PEOPLE);
+                cudaMalloc((void**)&pScalePtr, sizeof(float) * POSE_MAX_PEOPLE);
                 cudaCheck(__LINE__, __FUNCTION__, __FILE__);
             #endif
             log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
@@ -60,10 +73,9 @@ namespace op
         }
     }
 
-    std::pair<int, std::string> PoseGpuRenderer::renderPose(Array<float>& outputData,
-                                                            const Array<float>& poseKeypoints,
-                                                            const float scaleInputToOutput,
-                                                            const float scaleNetToOutput)
+    std::pair<int, std::string> PoseGpuRenderer::renderPose(
+        Array<float>& outputData, const Array<float>& poseKeypoints, const float scaleInputToOutput,
+        const float scaleNetToOutput)
     {
         try
         {
@@ -78,7 +90,6 @@ namespace op
                 if (numberPeople > 0 || elementRendered != 0 || !mBlendOriginalFrame)
                 {
                     cpuToGpuMemoryIfNotCopiedYet(outputData.getPtr(), outputData.getVolume());
-                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                     const auto numberBodyParts = getPoseNumberBodyParts(mPoseModel);
                     const auto hasBkg = addBkgChannel(mPoseModel);
                     const auto numberBodyPartsPlusBkg = numberBodyParts + (hasBkg ? 1 : 0);
@@ -96,9 +107,9 @@ namespace op
                                        poseKeypointsRescaled.getConstPtr(),
                                        numberPeople * numberBodyParts * 3 * sizeof(float),
                                        cudaMemcpyHostToDevice);
-                        renderPoseKeypointsGpu(*spGpuMemory, mPoseModel, numberPeople, frameSize, pGpuPose,
-                                               mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame,
-                                               getAlphaKeypoint());
+                        renderPoseKeypointsGpu(
+                            *spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, mPoseModel, numberPeople, frameSize, pGpuPose,
+                            mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame, getAlphaKeypoint());
                     }
                     else
                     {
diff --git a/src/openpose/pose/renderPose.cu b/src/openpose/pose/renderPose.cu
index 4afc1ff4..cd1cdb57 100644
--- a/src/openpose/pose/renderPose.cu
+++ b/src/openpose/pose/renderPose.cu
@@ -14,8 +14,6 @@ namespace op
     __constant__ const unsigned int BODY_19_PAIRS_GPU[] = {POSE_BODY_19_PAIRS_RENDER_GPU};
     __constant__ const unsigned int BODY_23_PAIRS_GPU[] = {POSE_BODY_23_PAIRS_RENDER_GPU};
     __constant__ const unsigned int BODY_25B_PAIRS_GPU[] = {POSE_BODY_25B_PAIRS_RENDER_GPU};
-    __constant__ const unsigned int BODY_65_PAIRS_GPU[] = {POSE_BODY_65_PAIRS_RENDER_GPU};
-    __constant__ const unsigned int BODY_95_PAIRS_GPU[] = {POSE_BODY_95_PAIRS_RENDER_GPU};
     __constant__ const unsigned int BODY_135_PAIRS_GPU[] = {POSE_BODY_135_PAIRS_RENDER_GPU};
     __constant__ const unsigned int MPI_PAIRS_GPU[] = {POSE_MPI_PAIRS_RENDER_GPU};
     __constant__ const unsigned int CAR_12_PAIRS_GPU[] = {POSE_CAR_12_PAIRS_RENDER_GPU};
@@ -26,8 +24,6 @@ namespace op
     __constant__ const float BODY_19_SCALES[] = {POSE_BODY_19_SCALES_RENDER_GPU};
     __constant__ const float BODY_23_SCALES[] = {POSE_BODY_23_SCALES_RENDER_GPU};
     __constant__ const float BODY_25B_SCALES[] = {POSE_BODY_25B_SCALES_RENDER_GPU};
-    __constant__ const float BODY_65_SCALES[] = {POSE_BODY_65_SCALES_RENDER_GPU};
-    __constant__ const float BODY_95_SCALES[] = {POSE_BODY_95_SCALES_RENDER_GPU};
     __constant__ const float BODY_135_SCALES[] = {POSE_BODY_135_SCALES_RENDER_GPU};
     __constant__ const float MPI_SCALES[] = {POSE_MPI_SCALES_RENDER_GPU};
     __constant__ const float CAR_12_SCALES[] = {POSE_CAR_12_SCALES_RENDER_GPU};
@@ -38,8 +34,6 @@ namespace op
     __constant__ const float BODY_19_COLORS[] = {POSE_BODY_19_COLORS_RENDER_GPU};
     __constant__ const float BODY_23_COLORS[] = {POSE_BODY_23_COLORS_RENDER_GPU};
     __constant__ const float BODY_25B_COLORS[] = {POSE_BODY_25B_COLORS_RENDER_GPU};
-    __constant__ const float BODY_65_COLORS[] = {POSE_BODY_65_COLORS_RENDER_GPU};
-    __constant__ const float BODY_95_COLORS[] = {POSE_BODY_95_COLORS_RENDER_GPU};
     __constant__ const float BODY_135_COLORS[] = {POSE_BODY_135_COLORS_RENDER_GPU};
     __constant__ const float MPI_COLORS[] = {POSE_MPI_COLORS_RENDER_GPU};
     __constant__ const float CAR_12_COLORS[] = {POSE_CAR_12_COLORS_RENDER_GPU};
@@ -124,18 +118,26 @@ namespace op
         colorPtr.z *= rad;
     }
 
+    __global__ void getBoundingBoxPerPersonPose(
+        float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
+        const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
+    {
+        getBoundingBoxPerPerson(
+            maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold);
+    }
+
     __global__ void renderPoseCoco(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
+        const bool blendOriginalFrame, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+        __shared__ float sharedMins[2*POSE_MAX_PEOPLE];
+        __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE];
         __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
 
         // Other parameters
@@ -146,24 +148,25 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, COCO_PAIRS_GPU, numberPeople, 18, numberPartPairs, COCO_COLORS,
-                        numberColors, radius, lineWidth, COCO_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 14 : -1), (googlyEyes ? 15 : -1));
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr,
+            globalIdx, x, y, targetWidth, targetHeight, posePtr, COCO_PAIRS_GPU, numberPeople, 18, numberPartPairs,
+            COCO_COLORS, numberColors, radius, lineWidth, COCO_SCALES, numberScales, threshold, alphaColorToAdd,
+            blendOriginalFrame, (googlyEyes ? 14 : -1), (googlyEyes ? 15 : -1));
     }
 
     __global__ void renderPoseBody19(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
+        const bool blendOriginalFrame, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+        __shared__ float sharedMins[2*POSE_MAX_PEOPLE];
+        __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE];
         __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
 
         // Other parameters
@@ -174,24 +177,26 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, BODY_19_PAIRS_GPU, numberPeople, 19, numberPartPairs, BODY_19_COLORS, numberColors,
-                        radius, lineWidth, BODY_19_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1));
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr,
+            globalIdx, x, y, targetWidth, targetHeight, posePtr, BODY_19_PAIRS_GPU, numberPeople, 19, numberPartPairs,
+            BODY_19_COLORS, numberColors, radius, lineWidth, BODY_19_SCALES, numberScales, threshold, alphaColorToAdd,
+            blendOriginalFrame, (googlyEyes ? 15 : -1),
+            (googlyEyes ? 16 : -1));
     }
 
     __global__ void renderPoseBody23(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
+        const bool blendOriginalFrame, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+        __shared__ float sharedMins[2*POSE_MAX_PEOPLE];
+        __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE];
         __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
 
         // Other parameters
@@ -202,24 +207,54 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, BODY_23_PAIRS_GPU, numberPeople, 23, numberPartPairs, BODY_23_COLORS, numberColors,
-                        radius, lineWidth, BODY_23_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 13 : -1), (googlyEyes ? 14 : -1));
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr,
+            globalIdx, x, y, targetWidth, targetHeight, posePtr, BODY_23_PAIRS_GPU, numberPeople, 23, numberPartPairs,
+            BODY_23_COLORS, numberColors, radius, lineWidth, BODY_23_SCALES, numberScales, threshold, alphaColorToAdd,
+            blendOriginalFrame, (googlyEyes ? 13 : -1), (googlyEyes ? 14 : -1));
     }
 
+    // __global__ void renderPoseBody25Old(
+    //     float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
+    //     const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
+    //     const float alphaColorToAdd)
+    // {
+    //     const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    //     const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+    //     const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
+
+    //     // Shared parameters
+    //     __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
+    //     __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+    //     __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
+
+    //     // Other parameters
+    //     const auto numberPartPairs = sizeof(BODY_25_PAIRS_GPU) / (2*sizeof(BODY_25_PAIRS_GPU[0]));
+    //     const auto numberScales = sizeof(BODY_25_SCALES) / sizeof(BODY_25_SCALES[0]);
+    //     const auto numberColors = sizeof(BODY_25_COLORS) / (3*sizeof(BODY_25_COLORS[0]));
+    //     const auto radius = fastMinCuda(targetWidth, targetHeight) / 100.f;
+    //     const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
+
+    //     // Render key points
+    //     renderKeypointsOld(
+    //         targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, posePtr,
+    //         BODY_25_PAIRS_GPU, numberPeople, 25, numberPartPairs, BODY_25_COLORS, numberColors, radius, lineWidth,
+    //         BODY_25_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame, (googlyEyes ? 15 : -1),
+    //         (googlyEyes ? 16 : -1));
+    // }
+
     __global__ void renderPoseBody25(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth,
+        const int targetHeight, const float* const posePtr, const int numberPeople, const float threshold,
+        const bool googlyEyes, const bool blendOriginalFrame, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+        __shared__ float sharedMins[2*POSE_MAX_PEOPLE];
+        __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE];
         __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
 
         // Other parameters
@@ -230,24 +265,26 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, BODY_25_PAIRS_GPU, numberPeople, 25, numberPartPairs, BODY_25_COLORS, numberColors,
-                        radius, lineWidth, BODY_25_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1));
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr,
+            globalIdx, x, y, targetWidth, targetHeight,
+            posePtr, BODY_25_PAIRS_GPU, numberPeople, 25, numberPartPairs, BODY_25_COLORS, numberColors,
+            radius, lineWidth, BODY_25_SCALES, numberScales, threshold, alphaColorToAdd,
+            blendOriginalFrame, (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1));
     }
 
     __global__ void renderPoseBody25b(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
+        const bool blendOriginalFrame, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+        __shared__ float sharedMins[2*POSE_MAX_PEOPLE];
+        __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE];
         __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
 
         // Other parameters
@@ -258,80 +295,54 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, BODY_25B_PAIRS_GPU, numberPeople, 25, numberPartPairs, BODY_25B_COLORS, numberColors,
-                        radius, lineWidth, BODY_25B_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1));
-    }
-
-    __global__ void renderPoseBody65(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
-    {
-        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
-        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
-        const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
-
-        // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
-        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
-
-        // Other parameters
-        const auto numberPartPairs = sizeof(BODY_65_PAIRS_GPU) / (2*sizeof(BODY_65_PAIRS_GPU[0]));
-        const auto numberScales = sizeof(BODY_65_SCALES) / sizeof(BODY_65_SCALES[0]);
-        const auto numberColors = sizeof(BODY_65_COLORS) / (3*sizeof(BODY_65_COLORS[0]));
-        const auto radius = fastMinCuda(targetWidth, targetHeight) / 100.f;
-        const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
-
-        // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, BODY_65_PAIRS_GPU, numberPeople, 65, numberPartPairs, BODY_65_COLORS, numberColors,
-                        radius, lineWidth, BODY_65_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1));
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr,
+            globalIdx, x, y, targetWidth, targetHeight, posePtr, BODY_25B_PAIRS_GPU, numberPeople, 25, numberPartPairs,
+            BODY_25B_COLORS, numberColors, radius, lineWidth, BODY_25B_SCALES, numberScales, threshold, alphaColorToAdd,
+            blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1));
     }
 
-    __global__ void renderPoseBody95(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
-    {
-        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
-        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
-        const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
-
-        // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
-        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
-
-        // Other parameters
-        const auto numberPartPairs = sizeof(BODY_95_PAIRS_GPU) / (2*sizeof(BODY_95_PAIRS_GPU[0]));
-        const auto numberScales = sizeof(BODY_95_SCALES) / sizeof(BODY_95_SCALES[0]);
-        const auto numberColors = sizeof(BODY_95_COLORS) / (3*sizeof(BODY_95_COLORS[0]));
-        const auto radius = fastMinCuda(targetWidth, targetHeight) / 100.f;
-        const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
-
-        // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, BODY_95_PAIRS_GPU, numberPeople, 95, numberPartPairs, BODY_95_COLORS, numberColors,
-                        radius, lineWidth, BODY_95_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1));
-    }
+    // __global__ void renderPoseBody135Old(
+    //     float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
+    //     const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
+    //     const float alphaColorToAdd)
+    // {
+    //     const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    //     const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+    //     const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
+
+    //     // Shared parameters
+    //     __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
+    //     __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+    //     __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
+
+    //     // Other parameters
+    //     const auto numberPartPairs = sizeof(BODY_135_PAIRS_GPU) / (2*sizeof(BODY_135_PAIRS_GPU[0]));
+    //     const auto numberScales = sizeof(BODY_135_SCALES) / sizeof(BODY_135_SCALES[0]);
+    //     const auto numberColors = sizeof(BODY_135_COLORS) / (3*sizeof(BODY_135_COLORS[0]));
+    //     const auto radius = fastMinCuda(targetWidth, targetHeight) / 100.f;
+    //     const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
+
+    //     // Render key points
+    //     renderKeypointsOld(
+    //         targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight, posePtr,
+    //         BODY_135_PAIRS_GPU, numberPeople, 135, numberPartPairs, BODY_135_COLORS, numberColors, radius, lineWidth,
+    //         BODY_135_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame, (googlyEyes ? 1 : -1),
+    //         (googlyEyes ? 2 : -1));
+    // }
 
     __global__ void renderPoseBody135(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
+        const bool blendOriginalFrame, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+        __shared__ float sharedMins[2*POSE_MAX_PEOPLE];
+        __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE];
         __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
 
         // Other parameters
@@ -342,23 +353,25 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, BODY_135_PAIRS_GPU, numberPeople, 135, numberPartPairs, BODY_135_COLORS, numberColors,
-                        radius, lineWidth, BODY_135_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1));
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
+            targetHeight, posePtr, BODY_135_PAIRS_GPU, numberPeople, 135, numberPartPairs, BODY_135_COLORS,
+            numberColors, radius, lineWidth, BODY_135_SCALES, numberScales, threshold, alphaColorToAdd,
+            blendOriginalFrame, (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1));
     }
 
     __global__ void renderPoseMpi29Parts(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool blendOriginalFrame, const float alphaColorToAdd)
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const posePtr, const int numberPeople, const float threshold, const bool blendOriginalFrame,
+        const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+        __shared__ float sharedMins[2*POSE_MAX_PEOPLE];
+        __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE];
         __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
 
         // Other parameters
@@ -369,23 +382,24 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, MPI_PAIRS_GPU, numberPeople, 15, numberPartPairs, MPI_COLORS, numberColors,
-                        radius, lineWidth, COCO_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame);
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
+            targetHeight, posePtr, MPI_PAIRS_GPU, numberPeople, 15, numberPartPairs, MPI_COLORS, numberColors,
+            radius, lineWidth, COCO_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame);
     }
 
     __global__ void renderPoseCar12(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
+        const bool blendOriginalFrame, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+        __shared__ float sharedMins[2*POSE_MAX_PEOPLE];
+        __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE];
         __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
 
         // Other parameters
@@ -396,24 +410,25 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, CAR_12_PAIRS_GPU, numberPeople, 12, numberPartPairs, CAR_12_COLORS, numberColors,
-                        radius, lineWidth, CAR_12_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 4 : -1), (googlyEyes ? 5 : -1));
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
+            targetHeight, posePtr, CAR_12_PAIRS_GPU, numberPeople, 12, numberPartPairs, CAR_12_COLORS, numberColors,
+            radius, lineWidth, CAR_12_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame,
+            (googlyEyes ? 4 : -1), (googlyEyes ? 5 : -1));
     }
 
     __global__ void renderPoseCar22(
-        float* targetPtr, const int targetWidth, const int targetHeight, const float* const posePtr,
-        const int numberPeople, const float threshold, const bool googlyEyes, const bool blendOriginalFrame,
-        const float alphaColorToAdd)
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
+        const bool blendOriginalFrame, const float alphaColorToAdd)
     {
         const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
         const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;
 
         // Shared parameters
-        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
-        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
+        __shared__ float sharedMins[2*POSE_MAX_PEOPLE];
+        __shared__ float sharedMaxs[2*POSE_MAX_PEOPLE];
         __shared__ float sharedScaleF[POSE_MAX_PEOPLE];
 
         // Other parameters
@@ -424,10 +439,11 @@ namespace op
         const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;
 
         // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        posePtr, CAR_22_PAIRS_GPU, numberPeople, 22, numberPartPairs, CAR_22_COLORS, numberColors,
-                        radius, lineWidth, CAR_22_SCALES, numberScales, threshold, alphaColorToAdd,
-                        blendOriginalFrame, (googlyEyes ? 6 : -1), (googlyEyes ? 7 : -1));
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
+            targetHeight, posePtr, CAR_22_PAIRS_GPU, numberPeople, 22, numberPartPairs, CAR_22_COLORS, numberColors,
+            radius, lineWidth, CAR_22_SCALES, numberScales, threshold, alphaColorToAdd, blendOriginalFrame,
+            (googlyEyes ? 6 : -1), (googlyEyes ? 7 : -1));
     }
 
     __global__ void renderBodyPartHeatMaps(float* targetPtr, const int targetWidth, const int targetHeight,
@@ -619,9 +635,10 @@ namespace op
         }
     }
 
-    void renderPoseKeypointsGpu(float* framePtr, const PoseModel poseModel, const int numberPeople,
-                                const Point<int>& frameSize, const float* const posePtr, const float renderThreshold,
-                                const bool googlyEyes, const bool blendOriginalFrame, const float alphaBlending)
+    void renderPoseKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const PoseModel poseModel,
+        const int numberPeople, const Point<int>& frameSize, const float* const posePtr,
+        const float renderThreshold, const bool googlyEyes, const bool blendOriginalFrame, const float alphaBlending)
     {
         try
         {
@@ -642,63 +659,110 @@ namespace op
                 getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
 
                 // Body pose
+                const dim3 threadsPerBlockBoundBox = {1, 1, 1};
+                const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
+                getBoundingBoxPerPersonPose<<<threadsPerBlockBoundBox, numBlocksBox>>>(
+                    maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                    getPoseNumberBodyParts(poseModel), renderThreshold);
                 if (poseModel == PoseModel::BODY_25 || poseModel == PoseModel::BODY_25D
                     || poseModel == PoseModel::BODY_25E)
+                {
+                    // const auto REPS = 1000;
+                    // double timeNormalize0 = 0.;
+                    // double timeNormalize1 = 0.;
+
+                    // // Non-optimized code
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    // renderPoseBody25Old<<<threadsPerBlock, numBlocks>>>(
+                    //     framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
+                    //     blendOriginalFrame, alphaBlending
+                    // );
+                    // OP_CUDA_PROFILE_END(timeNormalize0, 1e3, REPS);
+
+                    // Optimized code
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    // const dim3 threadsPerBlockBoundBox = {1, 1, 1};
+                    // const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
+                    // getBoundingBoxPerPersonPose<<<threadsPerBlockBoundBox, numBlocksBox>>>(
+                    //     maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, 25,
+                    //     renderThreshold);
                     renderPoseBody25<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
+                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                     );
+                    // OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS);
+
+                    // // Profiling code
+                    // log("  renderOld=" + std::to_string(timeNormalize0) + "ms");
+                    // log("  renderNew=" + std::to_string(timeNormalize1) + "ms");
+                }
                 else if (poseModel == PoseModel::COCO_18)
                     renderPoseCoco<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
+                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                     );
                 else if (poseModel == PoseModel::BODY_19 || poseModel == PoseModel::BODY_19E
                          || poseModel == PoseModel::BODY_19N || poseModel == PoseModel::BODY_19_X2)
                     renderPoseBody19<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
+                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                     );
                 else if (poseModel == PoseModel::BODY_23)
                     renderPoseBody23<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
+                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                     );
                 else if (poseModel == PoseModel::BODY_25B)
                     renderPoseBody25b<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
-                    );
-                else if (poseModel == PoseModel::BODY_65)
-                    renderPoseBody65<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
-                    );
-                else if (poseModel == PoseModel::BODY_95)
-                    renderPoseBody95<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
+                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                     );
                 else if (poseModel == PoseModel::BODY_135)
+                {
+                    // const auto REPS = 500;
+                    // double timeNormalize1 = 0.;
+                    // double timeNormalize2 = 0.;
+
+                    // // Non-optimized code
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    //  renderPoseBody135Old<<<threadsPerBlock, numBlocks>>>(
+                    //      framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
+                    //      blendOriginalFrame, alphaBlending
+                    // );
+                    // OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS);
+
+                    // Optimized code
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    // const dim3 threadsPerBlockBoundBox = {1, 1, 1};
+                    // const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
+                    // getBoundingBoxPerPersonPose<<<threadsPerBlockBoundBox, numBlocksBox>>>(
+                    //     maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople, 135,
+                    //     renderThreshold);
                     renderPoseBody135<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
+                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                     );
+                    // OP_CUDA_PROFILE_END(timeNormalize2, 1e3, REPS);
+
+                    // // Profiling code
+                    // log("  renderOld=" + std::to_string(timeNormalize1) + "ms");
+                    // log("  renderNew=" + std::to_string(timeNormalize2) + "ms");
+                }
                 else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4)
                     renderPoseMpi29Parts<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold,
-                        blendOriginalFrame, alphaBlending
+                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                        renderThreshold, blendOriginalFrame, alphaBlending
                     );
                 // Car pose
                 else if (poseModel == PoseModel::CAR_12)
                     renderPoseCar12<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
+                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                     );
                 else if (poseModel == PoseModel::CAR_22)
                     renderPoseCar22<<<threadsPerBlock, numBlocks>>>(
-                        framePtr, frameSize.x, frameSize.y, posePtr, numberPeople, renderThreshold, googlyEyes,
-                        blendOriginalFrame, alphaBlending
+                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
+                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                     );
                 // Unknown
                 else
-- 
GitLab