提交 00b6842a 编写于 作者: G Gines Hidalgo

Reduced latency with CUDA cvMatToOpOutput

上级 db415913
......@@ -267,6 +267,7 @@ OpenPose Library - Release Notes
3. ~2-4x speedup for NMS.
4. ~2x speedup for image resize.
5. +25-30% speedup for rendering.
6. Reduced latency and increased speed by moving the resize in cvMatToOpOutput to CUDA. It generalizes better to a higher number of GPUs.
3. Unity binding of OpenPose released. OpenPose adds the flag `BUILD_UNITY_SUPPORT` on CMake, which enables special Unity code so it can be built as a Unity plugin.
4. If camera is unplugged, OpenPose GUI and command line will display a warning and try to reconnect it.
5. Wrapper classes simplified and renamed. Wrapper renamed as WrapperT, and created Wrapper as the non-templated class equivalent.
......
......@@ -273,8 +273,7 @@ namespace op
cvMatToOpInputW = std::make_shared<WCvMatToOpInput<TDatumsSP>>(cvMatToOpInput);
}
                // Note: We realized that doing the resize on the GPU does speed up the whole OpenPose pipeline regardless of the number of GPUs
resizeOnCpu = true;
// resizeOnCpu = (numberGpuThreads < 3);
resizeOnCpu = false;
if (addCvMatToOpOutput && (resizeOnCpu || !renderOutputGpu))
{
const auto gpuResize = false;
......
......@@ -87,41 +87,46 @@ namespace op
// CUDA version (if #Gpus > n)
else
{
// (Re)Allocate temporary memory
const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
const unsigned int outputImageSize = 3 * netInputSizes[i].x * netInputSizes[i].y;
if (pInputMaxSize < inputImageSize)
{
pInputMaxSize = inputImageSize;
// Free temporary memory
cudaFree(pInputImageCuda);
cudaFree(pInputImageReorderedCuda);
// Re-allocate memory
cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
cudaMalloc((void**)&pInputImageReorderedCuda, sizeof(float) * inputImageSize);
}
if (pOutputMaxSize < outputImageSize)
{
pOutputMaxSize = outputImageSize;
// Free temporary memory
cudaFree(pOutputImageCuda);
// Re-allocate memory
cudaMalloc((void**)&pOutputImageCuda, sizeof(float) * outputImageSize);
}
// Copy image to GPU
cudaMemcpy(
pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize,
cudaMemcpyHostToDevice);
// Resize image on GPU
reorderAndCast(pInputImageReorderedCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, 3);
resizeAndMergeRGBGPU(
pOutputImageCuda, pInputImageReorderedCuda, cvInputData.cols, cvInputData.rows,
netInputSizes[i].x, netInputSizes[i].y, (float)scaleInputToNetInputs[i]);
// Copy back to CPU
inputNetData[i].reset({1, 3, netInputSizes.at(i).y, netInputSizes.at(i).x});
cudaMemcpy(
inputNetData[i].getPtr(), pOutputImageCuda, sizeof(float) * outputImageSize,
cudaMemcpyDeviceToHost);
#ifdef USE_CUDA
// (Re)Allocate temporary memory
const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
const unsigned int outputImageSize = 3 * netInputSizes[i].x * netInputSizes[i].y;
if (pInputMaxSize < inputImageSize)
{
pInputMaxSize = inputImageSize;
// Free temporary memory
cudaFree(pInputImageCuda);
cudaFree(pInputImageReorderedCuda);
// Re-allocate memory
cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
cudaMalloc((void**)&pInputImageReorderedCuda, sizeof(float) * inputImageSize);
}
if (pOutputMaxSize < outputImageSize)
{
pOutputMaxSize = outputImageSize;
// Free temporary memory
cudaFree(pOutputImageCuda);
// Re-allocate memory
cudaMalloc((void**)&pOutputImageCuda, sizeof(float) * outputImageSize);
}
// Copy image to GPU
cudaMemcpy(
pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize,
cudaMemcpyHostToDevice);
// Resize image on GPU
reorderAndCast(pInputImageReorderedCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, 3);
resizeAndMergeRGBGPU(
pOutputImageCuda, pInputImageReorderedCuda, cvInputData.cols, cvInputData.rows,
netInputSizes[i].x, netInputSizes[i].y, (float)scaleInputToNetInputs[i]);
// Copy back to CPU
inputNetData[i].reset({1, 3, netInputSizes.at(i).y, netInputSizes.at(i).x});
cudaMemcpy(
inputNetData[i].getPtr(), pOutputImageCuda, sizeof(float) * outputImageSize,
cudaMemcpyDeviceToHost);
#else
error("You need to compile OpenPose with CUDA support in order to use GPU resize.",
__LINE__, __FUNCTION__, __FILE__);
#endif
}
}
return inputNetData;
......
......@@ -89,35 +89,40 @@ namespace op
// CUDA version (if #Gpus > 3)
else
{
#ifdef USE_CUDA
// Input image can be shared between this one and cvMatToOpInput.hpp
// (Free and re-)Allocate temporary memory
const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
if (pInputMaxSize < inputImageSize)
{
pInputMaxSize = inputImageSize;
cudaFree(pInputImageCuda);
cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
}
// (Free and re-)Allocate temporary memory
const unsigned int outputImageSize = 3 * outputResolution.x * outputResolution.y;
if (*spOutputMaxSize < outputImageSize)
{
*spOutputMaxSize = outputImageSize;
cudaFree(*spOutputImageCuda);
cudaMalloc((void**)spOutputImageCuda.get(), sizeof(float) * outputImageSize);
}
// Copy original image to GPU
cudaMemcpy(
pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize, cudaMemcpyHostToDevice);
// Resize output image on GPU
resizeAndMergeRGBGPU(
*spOutputImageCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, outputResolution.x,
outputResolution.y, (float)scaleInputToOutput);
*spGpuMemoryAllocated = true;
// // No need to copy output image back to CPU
// cudaMemcpy(
// outputData.getPtr(), *spOutputImageCuda, sizeof(float) * outputImageSize,
// cudaMemcpyDeviceToHost);
// (Free and re-)Allocate temporary memory
const unsigned int inputImageSize = 3 * cvInputData.rows * cvInputData.cols;
if (pInputMaxSize < inputImageSize)
{
pInputMaxSize = inputImageSize;
cudaFree(pInputImageCuda);
cudaMalloc((void**)&pInputImageCuda, sizeof(unsigned char) * inputImageSize);
}
// (Free and re-)Allocate temporary memory
const unsigned int outputImageSize = 3 * outputResolution.x * outputResolution.y;
if (*spOutputMaxSize < outputImageSize)
{
*spOutputMaxSize = outputImageSize;
cudaFree(*spOutputImageCuda);
cudaMalloc((void**)spOutputImageCuda.get(), sizeof(float) * outputImageSize);
}
// Copy original image to GPU
cudaMemcpy(
pInputImageCuda, cvInputData.data, sizeof(unsigned char) * inputImageSize, cudaMemcpyHostToDevice);
// Resize output image on GPU
resizeAndMergeRGBGPU(
*spOutputImageCuda, pInputImageCuda, cvInputData.cols, cvInputData.rows, outputResolution.x,
outputResolution.y, (float)scaleInputToOutput);
*spGpuMemoryAllocated = true;
// // No need to copy output image back to CPU
// cudaMemcpy(
// outputData.getPtr(), *spOutputImageCuda, sizeof(float) * outputImageSize,
// cudaMemcpyDeviceToHost);
#else
error("You need to compile OpenPose with CUDA support in order to use GPU resize.",
__LINE__, __FUNCTION__, __FILE__);
#endif
}
// Return result
return outputData;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册