Reduced latency with CUDA cvMatToOpOutput

00b6842a · Gines Hidalgo · db415913 · 00b6842a · 00b6842a · 00b6842a
4 changed files
--- a/doc/release_notes.md
+++ b/doc/release_notes.md
@@ -267,6 +267,7 @@ OpenPose Library - Release Notes
        3. ~2-4x speedup for NMS.
        4. ~2x speedup for image resize.
        5. +25-30% speedup for rendering.
+        6. Reduced latency and increased speed by moving the resize in cvMatToOpOutput to CUDA. It generalizes better to a higher number of GPUs.
    3. Unity binding of OpenPose released. OpenPose adds the flag `BUILD_UNITY_SUPPORT` on CMake, which enables special Unity code so it can be built as a Unity plugin.
    4. If camera is unplugged, OpenPose GUI and command line will display a warning and try to reconnect it.
    5. Wrapper classes simplified and renamed. Wrapper renamed as WrapperT, and created Wrapper as the non-templated class equivalent.

--- a/include/openpose/wrapper/wrapperAuxiliary.hpp
+++ b/include/openpose/wrapper/wrapperAuxiliary.hpp
@@ -273,8 +273,7 @@ namespace op
                    cvMatToOpInputW = std::make_shared<WCvMatToOpInput<TDatumsSP>>(cvMatToOpInput);
                }
                // Note: We found that doing the resize on GPU speeds up OpenPose as a whole, for any number of GPUs
-                resizeOnCpu = true;
+                resizeOnCpu = false;
-                // resizeOnCpu = (numberGpuThreads < 3);
                if (addCvMatToOpOutput && (resizeOnCpu || !renderOutputGpu))
                {
                    const auto gpuResize = false;

--- a/src/openpose/core/cvMatToOpInput.cpp
+++ b/src/openpose/core/cvMatToOpInput.cpp
@@ -87,41 +87,46 @@ namespace op
                // CUDA version (if #Gpus > n)
                else
                {
-                    // (Re)Allocate temporary memory
+                    #ifdef USE_CUDA
-                    const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
+                        // (Re)Allocate temporary memory
-                    const unsigned int outputImageSize = 3 * netInputSizes[i].x * netInputSizes[i].y;
+                        const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
-                    if (pInputMaxSize < inputImageSize)
+                        const unsigned int outputImageSize = 3 * netInputSizes[i].x * netInputSizes[i].y;
-                    {
+                        if (pInputMaxSize < inputImageSize)
-                        pInputMaxSize = inputImageSize;
+                        {
-                        // Free temporary memory
+                            pInputMaxSize = inputImageSize;
-                        cudaFree(pInputImageCuda);
+                            // Free temporary memory
-                        cudaFree(pInputImageReorderedCuda);
+                            cudaFree(pInputImageCuda);
-                        // Re-allocate memory
+                            cudaFree(pInputImageReorderedCuda);
-                        cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
+                            // Re-allocate memory
-                        cudaMalloc((void**)&pInputImageReorderedCuda, sizeof(float) * inputImageSize);
+                            cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
-                    }
+                            cudaMalloc((void**)&pInputImageReorderedCuda, sizeof(float) * inputImageSize);
-                    if (pOutputMaxSize < outputImageSize)
+                        }
-                    {
+                        if (pOutputMaxSize < outputImageSize)
-                        pOutputMaxSize = outputImageSize;
+                        {
-                        // Free temporary memory
+                            pOutputMaxSize = outputImageSize;
-                        cudaFree(pOutputImageCuda);
+                            // Free temporary memory
-                        // Re-allocate memory
+                            cudaFree(pOutputImageCuda);
-                        cudaMalloc((void**)&pOutputImageCuda, sizeof(float) * outputImageSize);
+                            // Re-allocate memory
-                    }
+                            cudaMalloc((void**)&pOutputImageCuda, sizeof(float) * outputImageSize);
-                    // Copy image to GPU
+                        }
-                    cudaMemcpy(
+                        // Copy image to GPU
-                        pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize,
+                        cudaMemcpy(
-                        cudaMemcpyHostToDevice);
+                            pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize,
-                    // Resize image on GPU
+                            cudaMemcpyHostToDevice);
-                    reorderAndCast(pInputImageReorderedCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, 3);
+                        // Resize image on GPU
-                    resizeAndMergeRGBGPU(
+                        reorderAndCast(pInputImageReorderedCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, 3);
-                        pOutputImageCuda, pInputImageReorderedCuda, cvInputData.cols, cvInputData.rows,
+                        resizeAndMergeRGBGPU(
-                        netInputSizes[i].x, netInputSizes[i].y, (float)scaleInputToNetInputs[i]);
+                            pOutputImageCuda, pInputImageReorderedCuda, cvInputData.cols, cvInputData.rows,
-                    // Copy back to CPU
+                            netInputSizes[i].x, netInputSizes[i].y, (float)scaleInputToNetInputs[i]);
-                    inputNetData[i].reset({1, 3, netInputSizes.at(i).y, netInputSizes.at(i).x});
+                        // Copy back to CPU
-                    cudaMemcpy(
+                        inputNetData[i].reset({1, 3, netInputSizes.at(i).y, netInputSizes.at(i).x});
-                        inputNetData[i].getPtr(), pOutputImageCuda, sizeof(float) * outputImageSize,
+                        cudaMemcpy(
-                        cudaMemcpyDeviceToHost);
+                            inputNetData[i].getPtr(), pOutputImageCuda, sizeof(float) * outputImageSize,
+                            cudaMemcpyDeviceToHost);
+                    #else
+                        error("You need to compile OpenPose with CUDA support in order to use GPU resize.",
+                            __LINE__, __FUNCTION__, __FILE__);
+                    #endif
                }
            }
            return inputNetData;

--- a/src/openpose/core/cvMatToOpOutput.cpp
+++ b/src/openpose/core/cvMatToOpOutput.cpp
@@ -89,35 +89,40 @@ namespace op
            // CUDA version (if #Gpus > 3)
            else
            {
+                #ifdef USE_CUDA
 // Input image can be shared between this one and cvMatToOpInput.hpp
-                // (Free and re-)Allocate temporary memory
+                    // (Free and re-)Allocate temporary memory
-                const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
+                    const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
-                if (pInputMaxSize < inputImageSize)
+                    if (pInputMaxSize < inputImageSize)
-                {
+                    {
-                    pInputMaxSize = inputImageSize;
+                        pInputMaxSize = inputImageSize;
-                    cudaFree(pInputImageCuda);
+                        cudaFree(pInputImageCuda);
-                    cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
+                        cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
-                }
+                    }
-                // (Free and re-)Allocate temporary memory
+                    // (Free and re-)Allocate temporary memory
-                const unsigned int outputImageSize = 3 * outputResolution.x * outputResolution.y;
+                    const unsigned int outputImageSize = 3 * outputResolution.x * outputResolution.y;
-                if (*spOutputMaxSize < outputImageSize)
+                    if (*spOutputMaxSize < outputImageSize)
-                {
+                    {
-                    *spOutputMaxSize = outputImageSize;
+                        *spOutputMaxSize = outputImageSize;
-                    cudaFree(*spOutputImageCuda);
+                        cudaFree(*spOutputImageCuda);
-                    cudaMalloc((void**)spOutputImageCuda.get(), sizeof(float) * outputImageSize);
+                        cudaMalloc((void**)spOutputImageCuda.get(), sizeof(float) * outputImageSize);
-                }
+                    }
-                // Copy original image to GPU
+                    // Copy original image to GPU
-                cudaMemcpy(
+                    cudaMemcpy(
-                    pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize, cudaMemcpyHostToDevice);
+                        pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize, cudaMemcpyHostToDevice);
-                // Resize output image on GPU
+                    // Resize output image on GPU
-                resizeAndMergeRGBGPU(
+                    resizeAndMergeRGBGPU(
-                    *spOutputImageCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, outputResolution.x,
+                        *spOutputImageCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, outputResolution.x,
-                    outputResolution.y, (float)scaleInputToOutput);
+                        outputResolution.y, (float)scaleInputToOutput);
-                *spGpuMemoryAllocated = true;
+                    *spGpuMemoryAllocated = true;
-                // // No need to copy output image back to CPU
+                    // // No need to copy output image back to CPU
-                // cudaMemcpy(
+                    // cudaMemcpy(
-                //     outputData.getPtr(), *spOutputImageCuda, sizeof(float) * outputImageSize,
+                    //     outputData.getPtr(), *spOutputImageCuda, sizeof(float) * outputImageSize,
-                //     cudaMemcpyDeviceToHost);
+                    //     cudaMemcpyDeviceToHost);
+                #else
+                    error("You need to compile OpenPose with CUDA support in order to use GPU resize.",
+                        __LINE__, __FUNCTION__, __FILE__);
+                #endif
            }
            // Return result
            return outputData;