Render and resize speedup in CUDA (#1209)

35f8c9d1 · joker3212 · Gines · 5f4cf6bf · 35f8c9d1 · 35f8c9d1
16 changed file
--- a/include/openpose/face/faceGpuRenderer.hpp
+++ b/include/openpose/face/faceGpuRenderer.hpp
@@ -23,6 +23,9 @@ namespace op

    private:
        float* pGpuFace; // GPU aux memory
+        float* pMaxPtr; // GPU aux memory
+        float* pMinPtr; // GPU aux memory
+        float* pScalePtr; // GPU aux memory

        DELETE_COPY(FaceGpuRenderer);
    };

--- a/include/openpose/face/renderFace.hpp
+++ b/include/openpose/face/renderFace.hpp
@@ -6,10 +6,13 @@

 namespace op
 {
-    OP_API void renderFaceKeypointsCpu(Array<float>& frameArray, const Array<float>& faceKeypoints, const float renderThreshold);
+    OP_API void renderFaceKeypointsCpu(
+        Array<float>& frameArray, const Array<float>& faceKeypoints, const float renderThreshold);

-    void renderFaceKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const facePtr, const int numberPeople,
-                                const float renderThreshold, const float alphaColorToAdd = FACE_DEFAULT_ALPHA_KEYPOINT);
+    void renderFaceKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
+        const float* const facePtr, const int numberPeople, const float renderThreshold,
+        const float alphaColorToAdd = FACE_DEFAULT_ALPHA_KEYPOINT);
 }

 #endif // OPENPOSE_FACE_RENDER_FACE_HPP
--- a/include/openpose/gpu/cuda.hu
+++ b/include/openpose/gpu/cuda.hu
@@ -121,8 +121,9 @@ namespace op
    }

    template <typename T>
-    inline __device__ T bicubicInterpolate(const T* const sourcePtr, const T xSource, const T ySource, const int widthSource,
-                                           const int heightSource, const int widthSourcePtr)
+    inline __device__ T bicubicInterpolate(
+        const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
+        const int widthSourcePtr)
    {
        int xIntArray[4];
        int yIntArray[4];
@@ -140,6 +141,25 @@ namespace op
        return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
    }

+    template <typename T>
+    inline __device__ T bicubicInterpolate8Times(
+        const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource,
+        const int widthSourcePtr, const int threadIdxX, const int threadIdxY)
+    {
+        // Now we only need dx and dy
+        const T dx = xSource - fastTruncateCuda(int(xSource + 1e-5), 0, widthSource - 1);
+        const T dy = ySource - fastTruncateCuda(int(ySource + 1e-5), 0, heightSource - 1);
+
+        T temp[4];
+        for (unsigned char i = 0; i < 4; i++)
+        {
+            const auto offset = 5 * (i + (threadIdxY > 3 ? 1 : 0)) + (threadIdxX > 3 ? 1 : 0);
+            temp[i] = cubicInterpolate(sourcePtr[offset + 0], sourcePtr[offset + 1],
+                                       sourcePtr[offset + 2], sourcePtr[offset + 3], dx);
+        }
+        return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy);
+    }
+
    template <typename T>
    inline __device__ T addWeighted(const T value1, const T value2, const T alphaValue2)
    {

--- a/include/openpose/hand/handGpuRenderer.hpp
+++ b/include/openpose/hand/handGpuRenderer.hpp
@@ -23,6 +23,9 @@ namespace op

    private:
        float* pGpuHand; // GPU aux memory
+        float* pMaxPtr; // GPU aux memory
+        float* pMinPtr; // GPU aux memory
+        float* pScalePtr; // GPU aux memory

        DELETE_COPY(HandGpuRenderer);
    };

--- a/include/openpose/hand/renderHand.hpp
+++ b/include/openpose/hand/renderHand.hpp
@@ -6,12 +6,13 @@

 namespace op
 {
-    OP_API void renderHandKeypointsCpu(Array<float>& frameArray, const std::array<Array<float>, 2>& handKeypoints,
-                                       const float renderThreshold);
+    OP_API void renderHandKeypointsCpu(
+        Array<float>& frameArray, const std::array<Array<float>, 2>& handKeypoints, const float renderThreshold);

-    void renderHandKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const handsPtr,
-                                const int numberHands, const float renderThreshold,
-                                const float alphaColorToAdd = HAND_DEFAULT_ALPHA_KEYPOINT);
+    void renderHandKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
+        const float* const handsPtr, const int numberHands, const float renderThreshold,
+        const float alphaColorToAdd = HAND_DEFAULT_ALPHA_KEYPOINT);
 }

 #endif // OPENPOSE_HAND_GPU_HAND_RENDER_HPP
--- a/include/openpose/net/resizeAndMergeBase.hpp
+++ b/include/openpose/net/resizeAndMergeBase.hpp
@@ -22,6 +22,14 @@ namespace op
        T* targetPtr, const std::vector<const T*>& sourcePtrs, std::vector<T*>& sourceTempPtrs,
        const std::array<int, 4>& targetSize, const std::vector<std::array<int, 4>>& sourceSizes,
        const std::vector<T>& scaleInputToNetInputs = {1.f}, const int gpuID = 0);
-}

+    // Functions for the files cvMatToOpInput/Output
+    void resizeAndMergeRGBGPU(
+        float* targetPtr, const float* const srcPtr, const int sourceWidth, const int sourceHeight,
+        const int targetWidth, const int targetHeight, const float scaleFactor);
+
+    void reorderAndCast(
+        float* targetPtr, const unsigned char* const srcPtr, const int width, const int height);
+
+}
 #endif // OPENPOSE_NET_RESIZE_AND_MERGE_BASE_HPP
--- a/include/openpose/pose/poseGpuRenderer.hpp
+++ b/include/openpose/pose/poseGpuRenderer.hpp
@@ -31,6 +31,9 @@ namespace op
        const std::shared_ptr<PoseExtractorNet> spPoseExtractorNet;
        // Init with thread
        float* pGpuPose; // GPU aux memory
+        float* pMaxPtr; // GPU aux memory
+        float* pMinPtr; // GPU aux memory
+        float* pScalePtr; // GPU aux memory

        DELETE_COPY(PoseGpuRenderer);
    };

--- a/include/openpose/pose/renderPose.hpp
+++ b/include/openpose/pose/renderPose.hpp
@@ -8,39 +8,39 @@

 namespace op
 {
-    OP_API void renderPoseKeypointsCpu(Array<float>& frameArray, const Array<float>& poseKeypoints,
-                                       const PoseModel poseModel, const float renderThreshold,
-                                       const bool blendOriginalFrame = true);
-
-    void renderPoseKeypointsGpu(float* framePtr, const PoseModel poseModel, const int numberPeople,
-                                const Point<int>& frameSize, const float* const posePtr,
-                                const float renderThreshold, const bool googlyEyes = false,
-                                const bool blendOriginalFrame = true,
-                                const float alphaBlending = POSE_DEFAULT_ALPHA_KEYPOINT);
-
-    void renderPoseHeatMapGpu(float* frame, const Point<int>& frameSize, const float* const heatMapPtr,
-                              const Point<int>& heatMapSize, const float scaleToKeepRatio,
-                              const unsigned int part,
-                              const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
-
-    void renderPoseHeatMapsGpu(float* frame, const PoseModel poseModel, const Point<int>& frameSize,
-                               const float* const heatMapPtr, const Point<int>& heatMapSize,
-                               const float scaleToKeepRatio,
-                               const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
-
-    void renderPosePAFGpu(float* framePtr, const PoseModel poseModel, const Point<int>& frameSize,
-                          const float* const heatMapPtr, const Point<int>& heatMapSize,
-                          const float scaleToKeepRatio, const int part,
-                          const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
-
-    void renderPosePAFsGpu(float* framePtr, const PoseModel poseModel, const Point<int>& frameSize,
-                           const float* const heatMapPtr, const Point<int>& heatMapSize,
-                           const float scaleToKeepRatio,
-                           const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
-
-    void renderPoseDistanceGpu(float* framePtr, const Point<int>& frameSize, const float* const heatMapPtr,
-                               const Point<int>& heatMapSize, const float scaleToKeepRatio,
-                               const unsigned int part, const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+    OP_API void renderPoseKeypointsCpu(
+        Array<float>& frameArray, const Array<float>& poseKeypoints, const PoseModel poseModel,
+        const float renderThreshold, const bool blendOriginalFrame = true);
+
+    void renderPoseKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const PoseModel poseModel,
+        const int numberPeople, const Point<int>& frameSize, const float* const posePtr,
+        const float renderThreshold, const bool googlyEyes = false, const bool blendOriginalFrame = true,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_KEYPOINT);
+
+    void renderPoseHeatMapGpu(
+        float* frame, const Point<int>& frameSize, const float* const heatMapPtr, const Point<int>& heatMapSize,
+        const float scaleToKeepRatio, const unsigned int part,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+
+    void renderPoseHeatMapsGpu(
+        float* frame, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
+        const Point<int>& heatMapSize, const float scaleToKeepRatio,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+
+    void renderPosePAFGpu(
+        float* framePtr, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
+        const Point<int>& heatMapSize, const float scaleToKeepRatio, const int part,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+
+    void renderPosePAFsGpu(
+        float* framePtr, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
+        const Point<int>& heatMapSize, const float scaleToKeepRatio,
+        const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
+
+    void renderPoseDistanceGpu(
+        float* framePtr, const Point<int>& frameSize, const float* const heatMapPtr, const Point<int>& heatMapSize,
+        const float scaleToKeepRatio, const unsigned int part, const float alphaBlending = POSE_DEFAULT_ALPHA_HEAT_MAP);
 }

 #endif // OPENPOSE_POSE_RENDER_POSE_HPP
--- a/include/openpose/utilities/render.hu
+++ b/include/openpose/utilities/render.hu
@@ -3,16 +3,217 @@

 namespace op
 {
-    inline __device__ void renderKeypoints(float* targetPtr, float2* sharedMaxs, float2* sharedMins,
-                                           float* sharedScaleF, const int globalIdx, const int x, const int y,
-                                           const int targetWidth, const int targetHeight,
-                                           const float* const keypointsPtr, const unsigned int* const partPairsPtr,
-                                           const int numberPeople, const int numberParts, const int numberPartPairs,
-                                           const float* const rgbColorsPtr, const int numberColors, const float radius,
-                                           const float lineWidth, const float* const keypointScalePtr,
-                                           const int numberScales, const float threshold,
-                                           const float alphaColorToAdd, const bool blendOriginalFrame = true,
-                                           const int googlyEye1 = -1, const int googlyEye2 = -1)
+    __inline__ __device__ void getBoundingBoxPerPerson(
+        float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
+        const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
+    {
+        const auto globalIdx = threadIdx.x;
+        // const auto xIndex = 2*globalIdx;
+        // const auto yIndex = xIndex+1;
+        const auto xIndex = globalIdx;
+        const auto yIndex = numberPeople+globalIdx;
+
+        // Fill shared parameters
+        // if (globalIdx < numberPeople)
+        {
+            auto minValueX = (float)targetWidth;
+            auto minValueY = (float)targetHeight;
+            auto maxValueX = 0.f;
+            auto maxValueY = 0.f;
+            for (auto part = 0 ; part < numberParts ; part++)
+            {
+                const auto index = 3 * (globalIdx*numberParts + part);
+                const auto x = keypointsPtr[index];
+                const auto y = keypointsPtr[index+1];
+                const auto score = keypointsPtr[index+2];
+                if (score > threshold)
+                {
+                    if (x < minValueX)
+                        minValueX = x;
+                    if (x > maxValueX)
+                        maxValueX = x;
+                    if (y < minValueY)
+                        minValueY = y;
+                    if (y > maxValueY)
+                        maxValueY = y;
+                }
+            }
+            if (maxValueX != 0.f && maxValueY != 0.f)
+            {
+                const auto averageX = maxValueX - minValueX;
+                const auto averageY = maxValueY - minValueY;
+                // (averageX + averageY) / 2.f / 400.f
+                scalePtr[globalIdx] = fastTruncateCuda((averageX + averageY) / 400.f, 0.33f, 1.f);
+                const auto constantToAdd = 50.f;
+                maxValueX += constantToAdd;
+                maxValueY += constantToAdd;
+                minValueX -= constantToAdd;
+                minValueY -= constantToAdd;
+            }
+
+            minPtr[xIndex] = minValueX;
+            minPtr[yIndex] = minValueY;
+            maxPtr[xIndex] = maxValueX;
+            maxPtr[yIndex] = maxValueY;
+        }
+    }
+
+    __inline__ __device__ void renderKeypoints(
+        float* targetPtr, float* sharedMaxs, float* sharedMins, float* sharedScaleF, const float* const maxPtr,
+        const float* const minPtr, const float* const scalePtr, const int globalIdx, const int x, const int y,
+        const int targetWidth, const int targetHeight, const float* const keypointsPtr,
+        const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts,
+        const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius,
+        const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold,
+        const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1,
+        const int googlyEye2 = -1)
+    {
+        // Load shared memory
+        if (globalIdx < 2*numberPeople)
+        {
+            sharedMins[globalIdx] = minPtr[globalIdx];
+            sharedMaxs[globalIdx] = maxPtr[globalIdx];
+            if (globalIdx < numberPeople)
+                sharedScaleF[globalIdx] = scalePtr[globalIdx];
+        }
+        __syncthreads();
+
+        // Fill each (x,y) target pixel
+        if (x < targetWidth && y < targetHeight)
+        {
+            const auto baseIndex = 3*(y * targetWidth + x);
+            auto b = targetPtr[baseIndex];
+            auto g = targetPtr[baseIndex+1];
+            auto r = targetPtr[baseIndex+2];
+            if (!blendOriginalFrame)
+            {
+                b = 0.f;
+                g = 0.f;
+                r = 0.f;
+            }
+
+            const auto lineWidthSquared = lineWidth * lineWidth;
+            const auto radiusSquared = radius * radius;
+            for (auto person = 0; person < numberPeople; person++)
+            {
+                // Make sure person x,y in the limits
+                // Make sure person is not empty. Assume all joints are below threshold. Then
+                // maxs = 0 and mins = width/height. So if statement would be false
+                // const auto xIndex = 2*person;
+                // const auto yIndex = xIndex+1;
+                const auto xIndex = person;
+                const auto yIndex = numberPeople+person;
+                if (x <= sharedMaxs[xIndex] && x >= sharedMins[xIndex]
+                     && y <= sharedMaxs[yIndex] && y >= sharedMins[yIndex])
+                {
+                    // Part pair connections
+                    for (auto partPair = 0; partPair < numberPartPairs; partPair++)
+                    {
+                        const auto partA = partPairsPtr[2*partPair];
+                        const auto partB = partPairsPtr[2*partPair+1];
+                        const auto indexA = person*numberParts*3 + partA*3;
+                        const auto xA = keypointsPtr[indexA];
+                        const auto yA = keypointsPtr[indexA + 1];
+                        const auto scoreA = keypointsPtr[indexA + 2];
+                        const auto indexB = person*numberParts*3 + partB*3;
+                        const auto xB = keypointsPtr[indexB];
+                        const auto yB = keypointsPtr[indexB + 1];
+                        const auto scoreB = keypointsPtr[indexB + 2];
+
+                        if (scoreA > threshold && scoreB > threshold)
+                        {
+                            const auto keypointScale = keypointScalePtr[partB%numberScales]
+                                                     * keypointScalePtr[partB%numberScales]
+                                                     * keypointScalePtr[partB%numberScales];
+                            const auto lineWidthScaled = lineWidthSquared * keypointScale;
+                            const auto bSqrt = sharedScaleF[person] * sharedScaleF[person] * lineWidthScaled;
+
+                            const auto xP = (xA + xB) / 2.f;
+                            const auto yP = (yA + yB) / 2.f;
+                            const auto aSqrt = (xA - xP) * (xA - xP) + (yA - yP) * (yA - yP);
+
+                            const auto angle = atan2f(yB - yA, xB - xA);
+                            const auto sine = sinf(angle);
+                            const auto cosine = cosf(angle);
+                            const auto A = cosine * (x - xP) + sine * (y - yP);
+                            const auto B = sine * (x - xP) - cosine * (y - yP);
+
+                            const auto judge = A * A / aSqrt + B * B / bSqrt;
+                            const auto minV = 0.f;
+                            const auto maxV = 1.f;
+                            if (minV <= judge && judge <= maxV)
+                                // Before used partPair vs partB
+                                addColorWeighted(r, g, b, &rgbColorsPtr[(partB%numberColors)*3], alphaColorToAdd);
+                        }
+                    }
+
+                    // Part circles
+                    for (auto part = 0u; part < numberParts; part++)
+                    {
+                        const auto index = 3 * (person*numberParts + part);
+                        const auto localX = keypointsPtr[index];
+                        const auto localY = keypointsPtr[index + 1];
+                        const auto score = keypointsPtr[index + 2];
+
+                        if (score > threshold)
+                        {
+                            const auto keypointScale = keypointScalePtr[part%numberScales]
+                                                     * keypointScalePtr[part%numberScales]
+                                                     * keypointScalePtr[part%numberScales];
+                            const auto radiusScaled = radiusSquared * keypointScale;
+                            const auto dist2 = (x - localX) * (x - localX) + (y - localY) * (y - localY);
+                            // Googly eyes
+                            if (googlyEye1 == part || googlyEye2 == part)
+                            {
+                                const auto eyeRatio = 2.5f * sqrt(radiusScaled);
+                                const auto minr2 = sharedScaleF[person] * sharedScaleF[person]
+                                                 * (eyeRatio - 2) * (eyeRatio - 2);
+                                const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * eyeRatio * eyeRatio;
+                                if (dist2 <= maxr2)
+                                {
+                                    float colorToAdd [3] = {0., 0., 0.};
+                                    if (dist2 <= minr2)
+                                        for (auto& color : colorToAdd)
+                                            color = {255.f};
+                                    if (dist2 <= minr2*0.6f)
+                                    {
+                                        const auto dist3 = (x-4 - localX)
+                                                         * (x-4 - localX) + (y - localY+4) * (y - localY+4);
+                                        if (dist3 > 14.0625f) // 3.75f^2
+                                            for (auto& color : colorToAdd)
+                                                color = {0.f};
+                                    }
+                                    const auto alphaColorToAdd = 0.9f;
+                                    addColorWeighted(r, g, b, colorToAdd, alphaColorToAdd);
+                                }
+                            }
+                            // Other parts
+                            else
+                            {
+                                const auto minr2 = 0.f;
+                                const auto maxr2 = sharedScaleF[person] * sharedScaleF[person] * radiusScaled;
+                                if (minr2 <= dist2 && dist2 <= maxr2)
+                                    addColorWeighted(r, g, b, &rgbColorsPtr[(part%numberColors)*3], alphaColorToAdd);
+                            }
+
+                        }
+                    }
+                }
+            }
+            targetPtr[baseIndex] = b;
+            targetPtr[baseIndex+1] = g;
+            targetPtr[baseIndex+2] = r;
+        }
+    }
+
+    __inline__ __device__ void renderKeypointsOld(
+        float* targetPtr, float2* sharedMaxs, float2* sharedMins, float* sharedScaleF, const int globalIdx,
+        const int x, const int y, const int targetWidth, const int targetHeight, const float* const keypointsPtr,
+        const unsigned int* const partPairsPtr, const int numberPeople, const int numberParts,
+        const int numberPartPairs, const float* const rgbColorsPtr, const int numberColors, const float radius,
+        const float lineWidth, const float* const keypointScalePtr, const int numberScales, const float threshold,
+        const float alphaColorToAdd, const bool blendOriginalFrame = true, const int googlyEye1 = -1,
+        const int googlyEye2 = -1)
    {
        // Fill shared parameters
        if (globalIdx < numberPeople)
@@ -52,7 +253,6 @@ namespace op
                sharedMins[globalIdx].y -= constantToAdd;
            }
        }
-
        __syncthreads();

        // Fill each (x,y) target pixel
@@ -176,4 +376,5 @@ namespace op
    }
 }

+
 #endif // OPENPOSE_UTILITIES_RENDER_HU
--- a/src/openpose/face/faceGpuRenderer.cpp
+++ b/src/openpose/face/faceGpuRenderer.cpp
@@ -10,7 +10,11 @@ namespace op
 {
    FaceGpuRenderer::FaceGpuRenderer(const float renderThreshold, const float alphaKeypoint,
                                     const float alphaHeatMap) :
-        GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap}
+        GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap},
+        pGpuFace{nullptr},
+        pMaxPtr{nullptr},
+        pMinPtr{nullptr},
+        pScalePtr{nullptr}
    {
    }

@@ -20,7 +24,14 @@ namespace op
        {
            // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
            #ifdef USE_CUDA
-                cudaFree(pGpuFace);
+                if (pGpuFace != nullptr)
+                    cudaFree(pGpuFace);
+                if (pMaxPtr != nullptr)
+                    cudaFree(pMaxPtr);
+                if (pMinPtr != nullptr)
+                    cudaFree(pMinPtr);
+                if (pScalePtr != nullptr)
+                    cudaFree(pScalePtr);
            #endif
        }
        catch (const std::exception& e)
@@ -37,6 +48,9 @@ namespace op
            // GPU memory allocation for rendering
            #ifdef USE_CUDA
                cudaMalloc((void**)(&pGpuFace), POSE_MAX_PEOPLE * FACE_NUMBER_PARTS * 3 * sizeof(float));
+                cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * FACE_NUMBER_PARTS);
+                cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * FACE_NUMBER_PARTS);
+                cudaMalloc((void**)&pScalePtr, sizeof(float) * FACE_NUMBER_PARTS);
            #endif
            log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
        }
@@ -63,8 +77,9 @@ namespace op
                    cudaMemcpy(pGpuFace, faceKeypoints.getConstPtr(),
                               faceKeypoints.getSize(0) * FACE_NUMBER_PARTS * 3 * sizeof(float),
                               cudaMemcpyHostToDevice);
-                    renderFaceKeypointsGpu(*spGpuMemory, frameSize, pGpuFace, faceKeypoints.getSize(0),
-                                           mRenderThreshold, getAlphaKeypoint());
+                    renderFaceKeypointsGpu(
+                        *spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, frameSize, pGpuFace, faceKeypoints.getSize(0),
+                        mRenderThreshold, getAlphaKeypoint());
                    // CUDA check
                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                }

--- a/src/openpose/face/renderFace.cu
+++ b/src/openpose/face/renderFace.cu
@@ -10,17 +10,25 @@ namespace op
    __constant__ const float SCALES[] = {FACE_SCALES_RENDER_GPU};
    __constant__ const float COLORS[] = {FACE_COLORS_RENDER_GPU};

-    __global__ void renderFaceParts(float* targetPtr, const int targetWidth, const int targetHeight,
-                                    const float* const facePtr, const int numberPeople,
-                                    const float threshold, const float alphaColorToAdd)
+    __global__ void getBoundingBoxPerPersonFace(
+        float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
+        const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
+    {
+        getBoundingBoxPerPerson(
+            maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold);
+    }
+
+    __global__ void renderFaceParts(
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const facePtr, const int numberPeople, const float threshold, const float alphaColorToAdd)
    {
        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;

        // Shared parameters
-        __shared__ float2 sharedMins[FACE_MAX_FACES];
-        __shared__ float2 sharedMaxs[FACE_MAX_FACES];
+        __shared__ float sharedMins[2*FACE_MAX_FACES];
+        __shared__ float sharedMaxs[2*FACE_MAX_FACES];
        __shared__ float sharedScaleF[FACE_MAX_FACES];

        // Other parameters
@@ -31,23 +39,33 @@ namespace op
        const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 250.f;

        // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        facePtr, PART_PAIRS_GPU, numberPeople, FACE_NUMBER_PARTS, numberPartPairs, COLORS,
-                        numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
+            targetHeight, facePtr, PART_PAIRS_GPU, numberPeople, FACE_NUMBER_PARTS, numberPartPairs, COLORS,
+            numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
    }

-    void renderFaceKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const facePtr,
-                                const int numberPeople, const float renderThreshold, const float alphaColorToAdd)
+    void renderFaceKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
+        const float* const facePtr, const int numberPeople, const float renderThreshold, const float alphaColorToAdd)
    {
        try
        {
            if (numberPeople > 0)
            {
+                // Get bouding boxes
+                const dim3 threadsPerBlockBoundBox = {1, 1, 1};
+                const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
+                getBoundingBoxPerPersonFace<<<threadsPerBlockBoundBox, numBlocksBox>>>(
+                    maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, facePtr, numberPeople,
+                    FACE_NUMBER_PARTS, renderThreshold);
+                // Draw hands
                dim3 threadsPerBlock;
                dim3 numBlocks;
                getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
-                renderFaceParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.x, frameSize.y, facePtr,
-                                                                numberPeople, renderThreshold, alphaColorToAdd);
+                renderFaceParts<<<threadsPerBlock, numBlocks>>>(
+                    framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, facePtr, numberPeople,
+                    renderThreshold, alphaColorToAdd);
                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            }
        }

--- a/src/openpose/hand/handGpuRenderer.cpp
+++ b/src/openpose/hand/handGpuRenderer.cpp
@@ -10,7 +10,11 @@ namespace op
 {
    HandGpuRenderer::HandGpuRenderer(const float renderThreshold, const float alphaKeypoint,
                                     const float alphaHeatMap) :
-        GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap}
+        GpuRenderer{renderThreshold, alphaKeypoint, alphaHeatMap},
+        pGpuHand{nullptr},
+        pMaxPtr{nullptr},
+        pMinPtr{nullptr},
+        pScalePtr{nullptr}
    {
    }

@@ -20,7 +24,14 @@ namespace op
        {
            // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
            #ifdef USE_CUDA
-                cudaFree(pGpuHand);
+                if (pGpuHand != nullptr)
+                    cudaFree(pGpuHand);
+                if (pMaxPtr != nullptr)
+                    cudaFree(pMaxPtr);
+                if (pMinPtr != nullptr)
+                    cudaFree(pMinPtr);
+                if (pScalePtr != nullptr)
+                    cudaFree(pScalePtr);
            #endif
        }
        catch (const std::exception& e)
@@ -37,6 +48,9 @@ namespace op
            // GPU memory allocation for rendering
            #ifdef USE_CUDA
                cudaMalloc((void**)(&pGpuHand), HAND_MAX_HANDS * HAND_NUMBER_PARTS * 3 * sizeof(float));
+                cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * HAND_MAX_HANDS);
+                cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * HAND_MAX_HANDS);
+                cudaMalloc((void**)&pScalePtr, sizeof(float) * HAND_MAX_HANDS);
            #endif
            log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
        }
@@ -69,7 +83,8 @@ namespace op
                    cudaMemcpy(pGpuHand + handVolume, handKeypoints[1].getConstPtr(),
                               handVolume * sizeof(float), cudaMemcpyHostToDevice);
                    renderHandKeypointsGpu(
-                        *spGpuMemory, frameSize, pGpuHand, 2 * numberPeople, mRenderThreshold, getAlphaKeypoint());
+                        *spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, frameSize, pGpuHand, 2 * numberPeople,
+                        mRenderThreshold, getAlphaKeypoint());
                    // CUDA check
                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                }

--- a/src/openpose/hand/renderHand.cu
+++ b/src/openpose/hand/renderHand.cu
@@ -10,17 +10,25 @@ namespace op
    __constant__ const float SCALES[] = {HAND_SCALES_RENDER_GPU};
    __constant__ const float COLORS[] = {HAND_COLORS_RENDER_GPU};

-    __global__ void renderHandsParts(float* targetPtr, const int targetWidth, const int targetHeight,
-                                     const float* const handsPtr, const int numberHands,
-                                     const float threshold, const float alphaColorToAdd)
+    __global__ void getBoundingBoxPerPersonHand(
+        float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
+        const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
+    {
+        getBoundingBoxPerPerson(
+            maxPtr, minPtr, scalePtr, targetWidth, targetHeight, keypointsPtr, numberPeople, numberParts, threshold);
+    }
+
+    __global__ void renderHandsParts(
+        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
+        const float* const handsPtr, const int numberHands, const float threshold, const float alphaColorToAdd)
    {
        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;

        // Shared parameters
-        __shared__ float2 sharedMins[HAND_MAX_HANDS];
-        __shared__ float2 sharedMaxs[HAND_MAX_HANDS];
+        __shared__ float sharedMins[2*HAND_MAX_HANDS];
+        __shared__ float sharedMaxs[2*HAND_MAX_HANDS];
        __shared__ float sharedScaleF[HAND_MAX_HANDS];

        // Other parameters
@@ -31,23 +39,33 @@ namespace op
        const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 80.f;

        // Render key points
-        renderKeypoints(targetPtr, sharedMaxs, sharedMins, sharedScaleF, globalIdx, x, y, targetWidth, targetHeight,
-                        handsPtr, PART_PAIRS_GPU, numberHands, HAND_NUMBER_PARTS, numberPartPairs, COLORS,
-                        numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
+        renderKeypoints(
+            targetPtr, sharedMaxs, sharedMins, sharedScaleF, maxPtr, minPtr, scalePtr, globalIdx, x, y, targetWidth,
+            targetHeight, handsPtr, PART_PAIRS_GPU, numberHands, HAND_NUMBER_PARTS, numberPartPairs, COLORS,
+            numberColors, radius, lineWidth, SCALES, numberScales, threshold, alphaColorToAdd);
    }

-    void renderHandKeypointsGpu(float* framePtr, const Point<int>& frameSize, const float* const handsPtr,
-                                const int numberHands, const float renderThreshold, const float alphaColorToAdd)
+    void renderHandKeypointsGpu(
+        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const Point<int>& frameSize,
+        const float* const handsPtr, const int numberHands, const float renderThreshold, const float alphaColorToAdd)
    {
        try
        {
            if (numberHands > 0)
            {
+                // Get bouding boxes
+                const dim3 threadsPerBlockBoundBox = {1, 1, 1};
+                const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
+                getBoundingBoxPerPersonHand<<<threadsPerBlockBoundBox, numBlocksBox>>>(
+                    maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, handsPtr, numberHands,
+                    HAND_NUMBER_PARTS, renderThreshold);
+                // Draw hands
                dim3 threadsPerBlock;
                dim3 numBlocks;
                getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
-                renderHandsParts<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.x, frameSize.y, handsPtr,
-                                                                 numberHands, renderThreshold, alphaColorToAdd);
+                renderHandsParts<<<threadsPerBlock, numBlocks>>>(
+                    framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, handsPtr, numberHands,
+                    renderThreshold, alphaColorToAdd);
                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            }
        }

--- a/src/openpose/net/resizeAndMergeBase.cu
+++ b/src/openpose/net/resizeAndMergeBase.cu
@@ -6,19 +6,111 @@ namespace op
 {
    const auto THREADS_PER_BLOCK_1D = 16u;

+    // template <typename T>
+    // __global__ void resizeKernelOld(
+    //     T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
+    //     const int targetHeight)
+    // {
+    //     const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+    //     const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+
+    //     if (x < targetWidth && y < targetHeight)
+    //     {
+    //         const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
+    //         const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
+    //         targetPtr[y*targetWidth+x] = bicubicInterpolate(
+    //             sourcePtr, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
+    //     }
+    // }
+
+    template <typename T>
+    __global__ void resizeKernel(
+        T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
+        const int targetHeight, const int channels)
+    {
+        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
+
+        const auto sourceArea = sourceWidth * sourceHeight;
+        const auto targetArea = targetWidth * targetHeight;
+
+        if (x < targetWidth && y < targetHeight && channel < channels)
+        {
+            const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
+            const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
+            const T* sourcePtrChannel = sourcePtr + channel * sourceArea;
+            targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate(
+                sourcePtrChannel, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
+        }
+    }
+
    template <typename T>
-    __global__ void resizeKernel(T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight,
-                                 const int targetWidth, const int targetHeight)
+    __global__ void resizeAndPadKernel(
+        T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
+        const int targetHeight, const float rescaleFactor, const int channels)
    {
        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
+
+        const auto sourceArea = sourceWidth * sourceHeight;
+        const auto targetArea = targetWidth * targetHeight;
+
+        if (x < targetWidth && y < targetHeight && channel < channels)
+        {
+            const T xSource = (x + T(0.5f)) * 1.0 / T(rescaleFactor) - T(0.5f);
+            const T ySource = (y + T(0.5f)) * 1.0 / T(rescaleFactor) - T(0.5f);
+            const T* sourcePtrChannel = sourcePtr + channel * sourceArea;
+            if (x < sourceWidth * rescaleFactor && y < sourceHeight * rescaleFactor)
+                targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate(
+                    sourcePtrChannel, xSource, ySource, sourceWidth, sourceHeight, sourceWidth);
+            else
+                targetPtr[channel * targetArea + y*targetWidth+x] = 0;
+        }
+    }
+
+
+        template <typename T>
+    __global__ void resize8TimesKernel(
+        T* targetPtr, const T* const sourcePtr, const int sourceWidth, const int sourceHeight, const int targetWidth,
+        const int targetHeight, const unsigned int rescaleFactor)
+    {
+        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto channel = (blockIdx.z * blockDim.z) + threadIdx.z;
+
+        // Load shared memory
+        // If resize >= 5, then #threads per block >= # elements of shared memory
+        __shared__ T sourcePtrShared[25];
+        const auto sharedLoadId = threadIdx.x + rescaleFactor*threadIdx.y;
+        if (sharedLoadId < 25)
+        {
+            const auto minTargetX = blockIdx.x * rescaleFactor;
+            const auto minSourceX = (minTargetX + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
+            const auto minSourceXInt_1 = int(minSourceX+ 1e-5);
+            const auto minSourceXInt = minSourceXInt_1 - 1;
+            const auto minTargetY = blockIdx.y * rescaleFactor;
+            const auto minSourceY = (minTargetY + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
+            const auto minSourceYInt_1 = int(minSourceY  + 1e-5);
+            const auto minSourceYInt = minSourceYInt_1 - 1;
+
+            const auto yClean = fastTruncateCuda(int(minSourceYInt+sharedLoadId/5 + 1e-5), 0, sourceHeight - 1);
+            const auto xClean = fastTruncateCuda(int(minSourceXInt+sharedLoadId%5 + 1e-5), 0, sourceWidth - 1);
+            const auto sourceArea = sourceWidth * sourceHeight;
+            const T* sourcePtrChannel = sourcePtr + channel * sourceArea;
+            sourcePtrShared[sharedLoadId] = sourcePtrChannel[yClean * sourceWidth + xClean];
+        }
+        // Wait here until shared memory has been loaded
+        __syncthreads();

        if (x < targetWidth && y < targetHeight)
        {
+            const auto targetArea = targetWidth * targetHeight;
            const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
            const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
-            targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight,
-                                                            sourceWidth);
+            targetPtr[channel * targetArea + y*targetWidth+x] = bicubicInterpolate8Times(
+                sourcePtrShared, xSource, ySource, sourceWidth, sourceHeight, sourceWidth, threadIdx.x, threadIdx.y);
        }
    }

@@ -58,10 +150,53 @@ namespace op
        }
    }

+    __global__ void reorderAndCastKernel(
+        float* targetPtr, const unsigned char* const srcPtr, const int width, const int height)
+    {
+        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
+        const auto c = (blockIdx.z * blockDim.z) + threadIdx.z;
+        if (x < width && y < height)
+        {
+            const auto channels = 3;
+            const auto originFramePtrOffsetY = y * width;
+            const auto channelOffset = c * width * height;
+            const auto targetIndex = channelOffset + y * width + x;
+            const auto srcIndex = (originFramePtrOffsetY + x) * channels + c;
+            targetPtr[targetIndex] =  float(srcPtr[srcIndex]) * (1/256.f) - 0.5f;
+        }
+    }
+
+    void reorderAndCast(float* targetPtr, const unsigned char* const srcPtr, const int width, const int height)
+    {
+        const dim3 threadsPerBlock{32, 1, 1};
+        const dim3 numBlocks{
+            getNumberCudaBlocks(width, threadsPerBlock.x),
+            getNumberCudaBlocks(height, threadsPerBlock.y),
+            getNumberCudaBlocks(3, threadsPerBlock.z)};
+        reorderAndCastKernel<<<numBlocks, threadsPerBlock>>>(targetPtr, srcPtr, width, height);
+    }
+
+    void resizeAndMergeRGBGPU(
+        float* targetPtr, const float* const srcPtr, const int sourceWidth, const int sourceHeight,
+        const int targetWidth, const int targetHeight, const float scaleFactor)
+
+    {
+        const auto channels = 3;
+        const dim3 threadsPerBlock{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D, 1};
+        const dim3 numBlocks{
+            getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
+            getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
+            getNumberCudaBlocks(channels, threadsPerBlock.z)};
+
+        resizeAndPadKernel<<<numBlocks, threadsPerBlock>>>(
+            targetPtr, srcPtr, sourceWidth, sourceHeight, targetWidth, targetHeight, scaleFactor, channels);
+    }
+
    template <typename T>
-    void resizeAndMergeGpu(T* targetPtr, const std::vector<const T*>& sourcePtrs, const std::array<int, 4>& targetSize,
-                           const std::vector<std::array<int, 4>>& sourceSizes,
-                           const std::vector<T>& scaleInputToNetInputs)
+    void resizeAndMergeGpu(
+        T* targetPtr, const std::vector<const T*>& sourcePtrs, const std::array<int, 4>& targetSize,
+        const std::vector<std::array<int, 4>>& sourceSizes, const std::vector<T>& scaleInputToNetInputs)
    {
        try
        {
@@ -90,20 +225,60 @@ namespace op
                const auto num = sourceSize[0];
                if (targetSize[0] > 1 || num == 1)
                {
-                    const auto sourceChannelOffset = sourceHeight * sourceWidth;
-                    const auto targetChannelOffset = targetWidth * targetHeight;
-                    for (auto n = 0; n < num; n++)
-                    {
-                        const auto offsetBase = n*channels;
-                        for (auto c = 0 ; c < channels ; c++)
-                        {
-                            const auto offset = offsetBase + c;
-                            resizeKernel<<<numBlocks, threadsPerBlock>>>(targetPtr + offset * targetChannelOffset,
-                                                                         sourcePtrs.at(0) + offset * sourceChannelOffset,
-                                                                         sourceWidth, sourceHeight, targetWidth,
-                                                                         targetHeight);
-                        }
-                    }
+                    // // Profiling code
+                    // const auto REPS = 250;
+                    // double timeNormalize0 = 0.;
+                    // double timeNormalize1 = 0.;
+                    // double timeNormalize2 = 0.;
+                    // double timeNormalize3 = 0.;
+                    // // Non-optimized function
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    // const auto sourceChannelOffset = sourceHeight * sourceWidth;
+                    // const auto targetChannelOffset = targetWidth * targetHeight;
+                    // for (auto n = 0; n < num; n++)
+                    // {
+                    //     const auto offsetBase = n*channels;
+                    //     for (auto c = 0 ; c < channels ; c++)
+                    //     {
+                    //         const auto offset = offsetBase + c;
+                    //         resizeKernelOld<<<numBlocks, threadsPerBlock>>>(
+                    //             targetPtr + offset * targetChannelOffset,
+                    //             sourcePtrs.at(0) + offset * sourceChannelOffset,
+                    //             sourceWidth, sourceHeight, targetWidth, targetHeight);
+                    //     }
+                    // }
+                    // OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS);
+
+                    // Optimized function for any resize size (suboptimal for 8x resize)
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    const dim3 threadsPerBlock{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D, 1};
+                    const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
+                                         getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
+                                         getNumberCudaBlocks(num * channels, threadsPerBlock.z)};
+                    resizeKernel<<<numBlocks, threadsPerBlock>>>(
+                        targetPtr, sourcePtrs.at(0), sourceWidth, sourceHeight, targetWidth, targetHeight,
+                        num * channels);
+                    // OP_CUDA_PROFILE_END(timeNormalize2, 1e3, REPS);
+
+                    // // Optimized function for 8x resize
+                    // OP_CUDA_PROFILE_INIT(REPS);
+                    // if (targetWidth / sourceWidth != 8 || targetHeight / sourceHeight != 8)
+                    //     error("Kernel only implemented for 8x resize. Notify us if this error appears.",
+                    //         __LINE__, __FUNCTION__, __FILE__);
+                    // const auto rescaleFactor = (unsigned int) std::ceil((float)(targetHeight) / (float)(sourceHeight));
+
+                    // const dim3 threadsPerBlock{rescaleFactor, rescaleFactor, 1};
+                    // const dim3 numBlocks{getNumberCudaBlocks(targetWidth, threadsPerBlock.x),
+                    //                      getNumberCudaBlocks(targetHeight, threadsPerBlock.y),
+                    //                      getNumberCudaBlocks(num * channels, threadsPerBlock.z)};
+                    // resize8TimesKernel<<<numBlocks, threadsPerBlock>>>(
+                    //     targetPtr, sourcePtrs.at(0), sourceWidth, sourceHeight, targetWidth, targetHeight, rescaleFactor);
+                    // OP_CUDA_PROFILE_END(timeNormalize3, 1e3, REPS);
+
+                    // Profiling code
+                    // log("  Res(ori)=" + std::to_string(timeNormalize1) + "ms");
+                    // log("  Res(new)=" + std::to_string(timeNormalize2) + "ms");
+                    // log("  Res(new8x)=" + std::to_string(timeNormalize3) + "ms");
                }
                // Old inefficient multi-scale merging
                else

--- a/src/openpose/pose/poseGpuRenderer.cpp
+++ b/src/openpose/pose/poseGpuRenderer.cpp
@@ -22,7 +22,10 @@ namespace op
                    getNumberElementsToRender(poseModel)}, // mNumberElementsToRender
        PoseRenderer{poseModel},
        spPoseExtractorNet{poseExtractorNet},
-        pGpuPose{nullptr}
+        pGpuPose{nullptr},
+        pMaxPtr{nullptr},
+        pMinPtr{nullptr},
+        pScalePtr{nullptr}
    {
    }

@@ -32,7 +35,14 @@ namespace op
        {
            // Free CUDA pointers - Note that if pointers are 0 (i.e., nullptr), no operation is performed.
            #ifdef USE_CUDA
-                cudaFree(pGpuPose);
+                if (pGpuPose != nullptr)
+                    cudaFree(pGpuPose);
+                if (pMaxPtr != nullptr)
+                    cudaFree(pMaxPtr);
+                if (pMinPtr != nullptr)
+                    cudaFree(pMinPtr);
+                if (pScalePtr != nullptr)
+                    cudaFree(pScalePtr);
            #endif
        }
        catch (const std::exception& e)
@@ -49,7 +59,10 @@ namespace op
            // GPU memory allocation for rendering
            #ifdef USE_CUDA
                cudaMalloc((void**)(&pGpuPose),
-                           POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float));
+                    POSE_MAX_PEOPLE * getPoseNumberBodyParts(mPoseModel) * 3 * sizeof(float));
+                cudaMalloc((void**)&pMaxPtr, sizeof(float) * 2 * POSE_MAX_PEOPLE);
+                cudaMalloc((void**)&pMinPtr, sizeof(float) * 2 * POSE_MAX_PEOPLE);
+                cudaMalloc((void**)&pScalePtr, sizeof(float) * POSE_MAX_PEOPLE);
                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            #endif
            log("Finished initialization on thread.", Priority::Low, __LINE__, __FUNCTION__, __FILE__);
@@ -60,10 +73,9 @@ namespace op
        }
    }

-    std::pair<int, std::string> PoseGpuRenderer::renderPose(Array<float>& outputData,
-                                                            const Array<float>& poseKeypoints,
-                                                            const float scaleInputToOutput,
-                                                            const float scaleNetToOutput)
+    std::pair<int, std::string> PoseGpuRenderer::renderPose(
+        Array<float>& outputData, const Array<float>& poseKeypoints, const float scaleInputToOutput,
+        const float scaleNetToOutput)
    {
        try
        {
@@ -78,7 +90,6 @@ namespace op
                if (numberPeople > 0 || elementRendered != 0 || !mBlendOriginalFrame)
                {
                    cpuToGpuMemoryIfNotCopiedYet(outputData.getPtr(), outputData.getVolume());
-                    cudaCheck(__LINE__, __FUNCTION__, __FILE__);
                    const auto numberBodyParts = getPoseNumberBodyParts(mPoseModel);
                    const auto hasBkg = addBkgChannel(mPoseModel);
                    const auto numberBodyPartsPlusBkg = numberBodyParts + (hasBkg ? 1 : 0);
@@ -96,9 +107,9 @@ namespace op
                                       poseKeypointsRescaled.getConstPtr(),
                                       numberPeople * numberBodyParts * 3 * sizeof(float),
                                       cudaMemcpyHostToDevice);
-                        renderPoseKeypointsGpu(*spGpuMemory, mPoseModel, numberPeople, frameSize, pGpuPose,
-                                               mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame,
-                                               getAlphaKeypoint());
+                        renderPoseKeypointsGpu(
+                            *spGpuMemory, pMaxPtr, pMinPtr, pScalePtr, mPoseModel, numberPeople, frameSize, pGpuPose,
+                            mRenderThreshold, mShowGooglyEyes, mBlendOriginalFrame, getAlphaKeypoint());
                    }
                    else
                    {

--- a/src/openpose/pose/renderPose.cu
+++ b/src/openpose/pose/renderPose.cu