Commit 5c873aa9 authored by gineshidalgo99

ResizeAndMerge multi-scale more robust

Parent d1d483f3
@@ -113,7 +113,7 @@ OpenPose Library - Release Notes
## Current version (future OpenPose 1.2.0alpha)
## Current version (future OpenPose 1.2.0)
1. Main improvements:
1. Speed increase when processing images with different aspect ratios. E.g. ~20% increase over 3.7k COCO validation images on 1 scale.
2. Huge speed increase and memory reduction when processing multi-scale. E.g. over 3.7k COCO validation images on 4 scales: ~40% (~770 to ~450 sec) speed increase, ~25% memory reduction (from ~8.9 to ~6.7 GB / GPU).
@@ -13,8 +13,5 @@ OP_BIN=./build/examples/openpose/openpose.bin
# 1 scale
$OP_BIN --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1_test.json --no_display --render_pose 0
# # 3 scales
# $OP_BIN --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1_3.json --no_display --render_pose 0 --scale_number 3 --scale_gap 0.25
# 4 scales
$OP_BIN --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1_4_test.json --no_display --render_pose 0 --scale_number 4 --scale_gap 0.25 --net_resolution "1312x736"
@@ -32,5 +32,5 @@ $OP_BIN --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1.json --no_di
# # 4 scales
# $OP_BIN --num_gpu 1 --image_dir $IMAGE_FOLDER --write_coco_json ${JSON_FOLDER}1_4.json --no_display --render_pose 0 --scale_number 4 --scale_gap 0.25 --net_resolution "1312x736" --frame_last 3558
# Debugging - Rendered frames saved
# # Debugging - Rendered frames saved
# $OP_BIN --image_dir $IMAGE_FOLDER --write_images ${JSON_FOLDER}frameOutput --no_display
@@ -15,30 +15,46 @@ namespace op
if (x < targetWidth && y < targetHeight)
{
const T xSource = (x + 0.5f) * sourceWidth / T(targetWidth) - 0.5f;
const T ySource = (y + 0.5f) * sourceHeight / T(targetHeight) - 0.5f;
const T xSource = (x + T(0.5f)) * sourceWidth / T(targetWidth) - T(0.5f);
const T ySource = (y + T(0.5f)) * sourceHeight / T(targetHeight) - T(0.5f);
targetPtr[y*targetWidth+x] = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight,
sourceWidth);
}
}
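Note on the mapping above: (x + 0.5) * sourceWidth / targetWidth - 0.5 converts the centre of a target pixel into continuous source coordinates, so the resized maps stay aligned with the source instead of drifting by half a pixel; wrapping the 0.5f literals in T(...) presumably keeps the arithmetic in the template type for non-float instantiations. A minimal standalone sketch of the same mapping (the helper name pixelCentreCoordinate is illustrative, not part of the OpenPose sources):

template <typename T>
inline T pixelCentreCoordinate(const int target, const int targetSize, const int sourceSize)
{
    // Continuous source coordinate of the centre of target pixel `target`.
    // E.g. upscaling 4 px -> 8 px: target pixel 0 maps to source -0.25 and pixel 7 to 3.25,
    // so the sampled range stays centred on the source image.
    return (target + T(0.5)) * sourceSize / T(targetSize) - T(0.5);
}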
template <typename T>
__global__ void resizeKernelAndMerge(T* targetPtr, const T* const sourcePtr, const T scaleWidth,
const T scaleHeight, const int sourceWidth, const int sourceHeight,
const int targetWidth, const int targetHeight, const int averageCounter)
__global__ void resizeKernelAndAdd(T* targetPtr, const T* const sourcePtr, const T scaleWidth,
const T scaleHeight, const int sourceWidth, const int sourceHeight,
const int targetWidth, const int targetHeight)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x < targetWidth && y < targetHeight)
{
const T xSource = (x + 0.5f) / scaleWidth - 0.5f;
const T ySource = (y + 0.5f) / scaleHeight - 0.5f;
const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
targetPtr[y*targetWidth+x] += bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight,
sourceWidth);
}
}
template <typename T>
__global__ void resizeKernelAndAverage(T* targetPtr, const T* const sourcePtr, const T scaleWidth,
const T scaleHeight, const int sourceWidth, const int sourceHeight,
const int targetWidth, const int targetHeight, const int counter)
{
const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x < targetWidth && y < targetHeight)
{
const T xSource = (x + T(0.5f)) / scaleWidth - T(0.5f);
const T ySource = (y + T(0.5f)) / scaleHeight - T(0.5f);
const auto interpolated = bicubicInterpolate(sourcePtr, xSource, ySource, sourceWidth, sourceHeight,
sourceWidth);
auto& targetPixel = targetPtr[y*targetWidth+x];
targetPixel = ((averageCounter * targetPixel) + interpolated) / T(averageCounter + 1);
// targetPixel = fastMax(targetPixel, interpolated);
targetPixel = (targetPixel + interpolated) / T(counter);
}
}
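The replaced resizeKernelAndMerge kernel kept a running average, re-reading and re-scaling the already accumulated target at every scale. The new pair splits the work: resizeKernelAndAdd accumulates plain sums for every scale but the last, and resizeKernelAndAverage adds the final sample and divides once by the number of scales, which yields the same arithmetic mean with a single division per pixel. A host-side sketch with made-up sample values, only to illustrate the equivalence:

#include <cstdio>

int main()
{
    // Hypothetical bicubic samples of one target pixel at 4 scales.
    const float samples[] = {0.20f, 0.35f, 0.50f, 0.65f};
    const int counter = 4;

    // Old scheme: incremental running average updated after every scale.
    float runningAverage = 0.f;
    for (int k = 0; k < counter; k++)
        runningAverage = (k * runningAverage + samples[k]) / float(k + 1);

    // New scheme: plain adds for scales 0..counter-2 (resizeKernelAndAdd),
    // then one add plus one divide on the last scale (resizeKernelAndAverage).
    float accumulated = 0.f;
    for (int k = 0; k < counter - 1; k++)
        accumulated += samples[k];
    accumulated = (accumulated + samples[counter - 1]) / float(counter);

    // Both print the same mean, 0.425.
    std::printf("running average: %f, add-then-average: %f\n", runningAverage, accumulated);
    return 0;
}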
@@ -91,14 +107,13 @@ namespace op
}
// Old inefficient multi-scale merging
else
error("It should never reaches this point. Notify us.", __LINE__, __FUNCTION__, __FILE__);
error("It should never reache this point. Notify us otherwise.", __LINE__, __FUNCTION__, __FILE__);
}
// Multi-scale merging
else
{
const auto targetChannelOffset = targetWidth * targetHeight;
cudaMemset(targetPtr, 0.f, channels*targetChannelOffset * sizeof(T));
auto averageCounter = -1;
const auto scaleToMainScaleWidth = targetWidth / T(sourceWidth);
const auto scaleToMainScaleHeight = targetHeight / T(sourceHeight);
@@ -111,14 +126,29 @@ namespace op
const auto scaleInputToNet = scaleInputToNetInputs[i] / scaleInputToNetInputs[0];
const auto scaleWidth = scaleToMainScaleWidth / scaleInputToNet;
const auto scaleHeight = scaleToMainScaleHeight / scaleInputToNet;
averageCounter++;
for (auto c = 0 ; c < channels ; c++)
// All but last image --> add
if (i < sourceSizes.size() - 1)
{
resizeKernelAndMerge<<<numBlocks, threadsPerBlock>>>(
targetPtr + c * targetChannelOffset, sourcePtrs[i] + c * sourceChannelOffset,
scaleWidth, scaleHeight, currentWidth, currentHeight, targetWidth,
targetHeight, averageCounter
);
for (auto c = 0 ; c < channels ; c++)
{
resizeKernelAndAdd<<<numBlocks, threadsPerBlock>>>(
targetPtr + c * targetChannelOffset, sourcePtrs[i] + c * sourceChannelOffset,
scaleWidth, scaleHeight, currentWidth, currentHeight, targetWidth,
targetHeight
);
}
}
// Last image --> average all
else
{
for (auto c = 0 ; c < channels ; c++)
{
resizeKernelAndAverage<<<numBlocks, threadsPerBlock>>>(
targetPtr + c * targetChannelOffset, sourcePtrs[i] + c * sourceChannelOffset,
scaleWidth, scaleHeight, currentWidth, currentHeight, targetWidth,
targetHeight, sourceSizes.size()
);
}
}
}
}
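For context, numBlocks and threadsPerBlock in the launches above form a standard 2-D CUDA grid covering targetWidth x targetHeight; they are computed elsewhere in the library, so the concrete block size below is only an assumed, illustrative choice:

#include <cuda_runtime.h>

int main()
{
    // Hypothetical output size; in OpenPose it follows from the net output resolution.
    const unsigned int targetWidth = 656u;
    const unsigned int targetHeight = 368u;
    // 16x16 threads per block is a common default; the real helper may choose differently.
    const dim3 threadsPerBlock{16u, 16u, 1u};
    const dim3 numBlocks{(targetWidth + threadsPerBlock.x - 1u) / threadsPerBlock.x,
                         (targetHeight + threadsPerBlock.y - 1u) / threadsPerBlock.y,
                         1u};
    // Each kernel is then launched once per channel with these dimensions, e.g.
    // resizeKernelAndAdd<<<numBlocks, threadsPerBlock>>>(targetPtr + c * targetChannelOffset, ...);
    return 0;
}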