提交 00b6842a 编写于 作者: G Gines Hidalgo

Reduced latency with CUDA cvMatToOpOutput

上级 db415913
...@@ -267,6 +267,7 @@ OpenPose Library - Release Notes ...@@ -267,6 +267,7 @@ OpenPose Library - Release Notes
3. ~2-4x speedup for NMS. 3. ~2-4x speedup for NMS.
4. ~2x speedup for image resize. 4. ~2x speedup for image resize.
5. +25-30% speedup for rendering. 5. +25-30% speedup for rendering.
6. Reduced latency and increased speed by moving the resize in cvMatToOpOutput to CUDA. It scales better to a higher number of GPUs.
3. Unity binding of OpenPose released. OpenPose adds the flag `BUILD_UNITY_SUPPORT` on CMake, which enables special Unity code so it can be built as a Unity plugin. 3. Unity binding of OpenPose released. OpenPose adds the flag `BUILD_UNITY_SUPPORT` on CMake, which enables special Unity code so it can be built as a Unity plugin.
4. If camera is unplugged, OpenPose GUI and command line will display a warning and try to reconnect it. 4. If camera is unplugged, OpenPose GUI and command line will display a warning and try to reconnect it.
5. Wrapper classes simplified and renamed. Wrapper renamed as WrapperT, and created Wrapper as the non-templated class equivalent. 5. Wrapper classes simplified and renamed. Wrapper renamed as WrapperT, and created Wrapper as the non-templated class equivalent.
......
...@@ -273,8 +273,7 @@ namespace op ...@@ -273,8 +273,7 @@ namespace op
cvMatToOpInputW = std::make_shared<WCvMatToOpInput<TDatumsSP>>(cvMatToOpInput); cvMatToOpInputW = std::make_shared<WCvMatToOpInput<TDatumsSP>>(cvMatToOpInput);
} }
// Note: We realized that somehow doing it on GPU for any number of GPUs does speed up the whole OP // Note: We realized that somehow doing it on GPU for any number of GPUs does speed up the whole OP
resizeOnCpu = true; resizeOnCpu = false;
// resizeOnCpu = (numberGpuThreads < 3);
if (addCvMatToOpOutput && (resizeOnCpu || !renderOutputGpu)) if (addCvMatToOpOutput && (resizeOnCpu || !renderOutputGpu))
{ {
const auto gpuResize = false; const auto gpuResize = false;
......
...@@ -87,41 +87,46 @@ namespace op ...@@ -87,41 +87,46 @@ namespace op
// CUDA version (if #Gpus > n) // CUDA version (if #Gpus > n)
else else
{ {
// (Re)Allocate temporary memory #ifdef USE_CUDA
const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols; // (Re)Allocate temporary memory
const unsigned int outputImageSize = 3 * netInputSizes[i].x * netInputSizes[i].y; const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
if (pInputMaxSize < inputImageSize) const unsigned int outputImageSize = 3 * netInputSizes[i].x * netInputSizes[i].y;
{ if (pInputMaxSize < inputImageSize)
pInputMaxSize = inputImageSize; {
// Free temporary memory pInputMaxSize = inputImageSize;
cudaFree(pInputImageCuda); // Free temporary memory
cudaFree(pInputImageReorderedCuda); cudaFree(pInputImageCuda);
// Re-allocate memory cudaFree(pInputImageReorderedCuda);
cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize); // Re-allocate memory
cudaMalloc((void**)&pInputImageReorderedCuda, sizeof(float) * inputImageSize); cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
} cudaMalloc((void**)&pInputImageReorderedCuda, sizeof(float) * inputImageSize);
if (pOutputMaxSize < outputImageSize) }
{ if (pOutputMaxSize < outputImageSize)
pOutputMaxSize = outputImageSize; {
// Free temporary memory pOutputMaxSize = outputImageSize;
cudaFree(pOutputImageCuda); // Free temporary memory
// Re-allocate memory cudaFree(pOutputImageCuda);
cudaMalloc((void**)&pOutputImageCuda, sizeof(float) * outputImageSize); // Re-allocate memory
} cudaMalloc((void**)&pOutputImageCuda, sizeof(float) * outputImageSize);
// Copy image to GPU }
cudaMemcpy( // Copy image to GPU
pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize, cudaMemcpy(
cudaMemcpyHostToDevice); pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize,
// Resize image on GPU cudaMemcpyHostToDevice);
reorderAndCast(pInputImageReorderedCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, 3); // Resize image on GPU
resizeAndMergeRGBGPU( reorderAndCast(pInputImageReorderedCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, 3);
pOutputImageCuda, pInputImageReorderedCuda, cvInputData.cols, cvInputData.rows, resizeAndMergeRGBGPU(
netInputSizes[i].x, netInputSizes[i].y, (float)scaleInputToNetInputs[i]); pOutputImageCuda, pInputImageReorderedCuda, cvInputData.cols, cvInputData.rows,
// Copy back to CPU netInputSizes[i].x, netInputSizes[i].y, (float)scaleInputToNetInputs[i]);
inputNetData[i].reset({1, 3, netInputSizes.at(i).y, netInputSizes.at(i).x}); // Copy back to CPU
cudaMemcpy( inputNetData[i].reset({1, 3, netInputSizes.at(i).y, netInputSizes.at(i).x});
inputNetData[i].getPtr(), pOutputImageCuda, sizeof(float) * outputImageSize, cudaMemcpy(
cudaMemcpyDeviceToHost); inputNetData[i].getPtr(), pOutputImageCuda, sizeof(float) * outputImageSize,
cudaMemcpyDeviceToHost);
#else
error("You need to compile OpenPose with CUDA support in order to use GPU resize.",
__LINE__, __FUNCTION__, __FILE__);
#endif
} }
} }
return inputNetData; return inputNetData;
......
...@@ -89,35 +89,40 @@ namespace op ...@@ -89,35 +89,40 @@ namespace op
// CUDA version (if #Gpus > 3) // CUDA version (if #Gpus > 3)
else else
{ {
#ifdef USE_CUDA
// Input image can be shared between this one and cvMatToOpInput.hpp // Input image can be shared between this one and cvMatToOpInput.hpp
// (Free and re-)Allocate temporary memory // (Free and re-)Allocate temporary memory
const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols; const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
if (pInputMaxSize < inputImageSize) if (pInputMaxSize < inputImageSize)
{ {
pInputMaxSize = inputImageSize; pInputMaxSize = inputImageSize;
cudaFree(pInputImageCuda); cudaFree(pInputImageCuda);
cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize); cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
} }
// (Free and re-)Allocate temporary memory // (Free and re-)Allocate temporary memory
const unsigned int outputImageSize = 3 * outputResolution.x * outputResolution.y; const unsigned int outputImageSize = 3 * outputResolution.x * outputResolution.y;
if (*spOutputMaxSize < outputImageSize) if (*spOutputMaxSize < outputImageSize)
{ {
*spOutputMaxSize = outputImageSize; *spOutputMaxSize = outputImageSize;
cudaFree(*spOutputImageCuda); cudaFree(*spOutputImageCuda);
cudaMalloc((void**)spOutputImageCuda.get(), sizeof(float) * outputImageSize); cudaMalloc((void**)spOutputImageCuda.get(), sizeof(float) * outputImageSize);
} }
// Copy original image to GPU // Copy original image to GPU
cudaMemcpy( cudaMemcpy(
pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize, cudaMemcpyHostToDevice); pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize, cudaMemcpyHostToDevice);
// Resize output image on GPU // Resize output image on GPU
resizeAndMergeRGBGPU( resizeAndMergeRGBGPU(
*spOutputImageCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, outputResolution.x, *spOutputImageCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, outputResolution.x,
outputResolution.y, (float)scaleInputToOutput); outputResolution.y, (float)scaleInputToOutput);
*spGpuMemoryAllocated = true; *spGpuMemoryAllocated = true;
// // No need to copy output image back to CPU // // No need to copy output image back to CPU
// cudaMemcpy( // cudaMemcpy(
// outputData.getPtr(), *spOutputImageCuda, sizeof(float) * outputImageSize, // outputData.getPtr(), *spOutputImageCuda, sizeof(float) * outputImageSize,
// cudaMemcpyDeviceToHost); // cudaMemcpyDeviceToHost);
#else
error("You need to compile OpenPose with CUDA support in order to use GPU resize.",
__LINE__, __FUNCTION__, __FILE__);
#endif
} }
// Return result // Return result
return outputData; return outputData;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册