renderPose.cu 46.0 KB
Newer Older
1
#include <openpose/pose/renderPose.hpp>
G
gineshidalgo99 已提交
2
#include <openpose/gpu/cuda.hpp>
G
Gines Hidalgo 已提交
3
#include <openpose/pose/poseParameters.hpp>
4 5
#include <openpose_private/gpu/cuda.hu>
#include <openpose_private/utilities/render.hu>
G
gineshidalgo99 已提交
6 7 8

namespace op
{
G
gineshidalgo99 已提交
9 10
    // Pi as a single-precision constant. A float keeps roughly 7 significant decimal digits, so this shorter
    // literal rounds to exactly the same stored value as a longer expansion would.
    __constant__ const float PI = 3.14159265358979323846f;

    // ---- Keypoint pair tables ----
    // One flat unsigned-int array per pose model; the contents come from the POSE_*_PAIRS_RENDER_GPU macros
    // (presumably provided by the pose-parameters header -- confirm).
    __constant__ const unsigned int BODY_25_PAIRS_GPU[] = {POSE_BODY_25_PAIRS_RENDER_GPU};
    __constant__ const unsigned int COCO_PAIRS_GPU[] = {POSE_COCO_PAIRS_RENDER_GPU};
    __constant__ const unsigned int BODY_19_PAIRS_GPU[] = {POSE_BODY_19_PAIRS_RENDER_GPU};
    __constant__ const unsigned int BODY_23_PAIRS_GPU[] = {POSE_BODY_23_PAIRS_RENDER_GPU};
    __constant__ const unsigned int BODY_25B_PAIRS_GPU[] = {POSE_BODY_25B_PAIRS_RENDER_GPU};
    __constant__ const unsigned int BODY_135_PAIRS_GPU[] = {POSE_BODY_135_PAIRS_RENDER_GPU};
    __constant__ const unsigned int MPI_PAIRS_GPU[] = {POSE_MPI_PAIRS_RENDER_GPU};
    __constant__ const unsigned int CAR_12_PAIRS_GPU[] = {POSE_CAR_12_PAIRS_RENDER_GPU};
    __constant__ const unsigned int CAR_22_PAIRS_GPU[] = {POSE_CAR_22_PAIRS_RENDER_GPU};

    // ---- Keypoint scale tables ----
    // One float array per pose model, filled from the POSE_*_SCALES_RENDER_GPU macros.
    __constant__ const float BODY_25_SCALES[] = {POSE_BODY_25_SCALES_RENDER_GPU};
    __constant__ const float COCO_SCALES[] = {POSE_COCO_SCALES_RENDER_GPU};
    __constant__ const float BODY_19_SCALES[] = {POSE_BODY_19_SCALES_RENDER_GPU};
    __constant__ const float BODY_23_SCALES[] = {POSE_BODY_23_SCALES_RENDER_GPU};
    __constant__ const float BODY_25B_SCALES[] = {POSE_BODY_25B_SCALES_RENDER_GPU};
    __constant__ const float BODY_135_SCALES[] = {POSE_BODY_135_SCALES_RENDER_GPU};
    __constant__ const float MPI_SCALES[] = {POSE_MPI_SCALES_RENDER_GPU};
    __constant__ const float CAR_12_SCALES[] = {POSE_CAR_12_SCALES_RENDER_GPU};
    __constant__ const float CAR_22_SCALES[] = {POSE_CAR_22_SCALES_RENDER_GPU};

    // ---- RGB color tables ----
    // One float array per pose model, filled from the POSE_*_COLORS_RENDER_GPU macros. The kernels in this file
    // divide the element count by 3 to obtain the number of colors, i.e. three floats per color.
    __constant__ const float BODY_25_COLORS[] = {POSE_BODY_25_COLORS_RENDER_GPU};
    __constant__ const float COCO_COLORS[] = {POSE_COCO_COLORS_RENDER_GPU};
    __constant__ const float BODY_19_COLORS[] = {POSE_BODY_19_COLORS_RENDER_GPU};
    __constant__ const float BODY_23_COLORS[] = {POSE_BODY_23_COLORS_RENDER_GPU};
    __constant__ const float BODY_25B_COLORS[] = {POSE_BODY_25B_COLORS_RENDER_GPU};
    __constant__ const float BODY_135_COLORS[] = {POSE_BODY_135_COLORS_RENDER_GPU};
    __constant__ const float MPI_COLORS[] = {POSE_MPI_COLORS_RENDER_GPU};
    __constant__ const float CAR_12_COLORS[] = {POSE_CAR_12_COLORS_RENDER_GPU};
    __constant__ const float CAR_22_COLORS[] = {POSE_CAR_22_COLORS_RENDER_GPU};
G
gineshidalgo99 已提交
41 42 43



G
Gines Hidalgo 已提交
44
    // Maps a scalar onto a five-segment color ramp and writes three channel values into colorPtr[0..2].
    // Following the per-line channel notes below, index 0 is treated as B, index 1 as G and index 2 as R.
    //   colorPtr   - output, must have room for 3 floats
    //   v          - value to colorize; it is first limited to [vmin, vmax] via fastTruncateCuda
    //   vmin/vmax  - value range that spans the full ramp
    // The limited value is rescaled to t in [0, 1] before the ramp is evaluated, so the segment boundaries and
    // the per-segment formulas both use the same normalized position. For the range [0, 1] this rescaling is
    // the identity. If the range is empty or inverted (vmax <= vmin), t is taken as 0 (start of the ramp).
    inline __device__ void getColorHeatMap(float* colorPtr, const float v, const float vmin, const float vmax)
    {
        const auto dv = vmax - vmin;
        // Normalized position inside the range; guarded so an empty range cannot divide by zero
        const auto t = (dv > 0.f ? (fastTruncateCuda(v, vmin, vmax) - vmin) / dv : 0.f);

        if (t < 0.125f)
        {
            colorPtr[0] = 256.f * (0.5f + (t * 4.f)); //B: 0.5 ~ 1
            colorPtr[1] = 0.f;
            colorPtr[2] = 0.f;
        }
        else if (t < 0.375f)
        {
            colorPtr[0] = 255.f;
            colorPtr[1] = 256.f * (t - 0.125f) * 4.f; //G: 0 ~ 1
            colorPtr[2] = 0.f;
        }
        else if (t < 0.625f)
        {
            colorPtr[0] = 256.f * (-4.f * t + 2.5f); //B: 1 ~ 0
            colorPtr[1] = 255.f;
            colorPtr[2] = 256.f * (4.f * (t - 0.375f)); // R: 0 ~ 1
        }
        else if (t < 0.875f)
        {
            colorPtr[0] = 0.f;
            colorPtr[1] = 256.f * (-4.f * t + 3.5f); //G: 1 ~ 0
            colorPtr[2] = 255.f;
        }
        else
        {
            colorPtr[0] = 0.f;
            colorPtr[1] = 0.f;
            colorPtr[2] = 256.f * (-4.f * t + 4.5f); //R: 1 ~ 0.5
        }
    }

G
Gines Hidalgo 已提交
81
    // Maps a scalar onto a six-segment color wheel and stores the result in colorPtr (x, y, z components).
    //   colorPtr   - output color
    //   v          - value to colorize; it is first limited to [vmin, vmax] via fastTruncateCuda
    //   vmin/vmax  - value range that spans one full turn of the wheel
    // The limited value is rescaled to [0, 1] and then stretched over the 55 wheel steps, so ranges other than
    // [0, 1] also cover the whole wheel (for [0, 1] the rescaling is the identity). An empty or inverted range
    // (vmax <= vmin) is treated as position 0. Values that fail every comparison (e.g. NaN) fall through to the
    // final branch and produce {255, 0, 0}.
    inline __device__ void getColorAffinity(float3& colorPtr, const float v, const float vmin, const float vmax)
    {
        // Lengths of the six wheel segments (names suggest Red-Yellow, Yellow-Green, ... Magenta-Red)
        const auto RY = 15;
        const auto YG =  6;
        const auto GC =  4;
        const auto CB = 11;
        const auto BM = 13;
        const auto MR =  6;
        const auto summed = RY+YG+GC+CB+BM+MR;       // 55
        const auto dv = vmax - vmin;
        // Normalized position inside the range; guarded so an empty range cannot divide by zero
        const auto vUnit = (dv > 0.f ? (fastTruncateCuda(v, vmin, vmax) - vmin) / dv : 0.f);
        const auto vTrunc = vUnit * summed;

        if (vTrunc < RY)
            colorPtr = {255.f,                              255.f*(vTrunc/(RY)),                0.f};
        else if (vTrunc < RY+YG)
            colorPtr = {255.f*(1-((vTrunc-RY)/(YG))),       255.f,                              0.f};
        else if (vTrunc < RY+YG+GC)
            // First component is constant zero on this segment
            colorPtr = {0.f,                                255.f,                              255.f*((vTrunc-RY-YG)/(GC))};
        else if (vTrunc < RY+YG+GC+CB)
            colorPtr = {0.f,                                255.f*(1-((vTrunc-RY-YG-GC)/(CB))), 255.f};
        else if (vTrunc < summed-MR)
            colorPtr = {255.f*((vTrunc-RY-YG-GC-CB)/(BM)),  0.f,                                255.f};
        else if (vTrunc < summed)
            colorPtr = {255.f,                              0.f,                                255.f*(1-((vTrunc-RY-YG-GC-CB-BM)/(MR)))};
        else
            colorPtr = {255.f,                              0.f,                                0.f};
    }

    // Colorizes a 2-D vector (x, y): the direction selects a color through getColorAffinity and the vector
    // length, capped at 1, scales all three components of that color.
    inline __device__ void getColorXYAffinity(float3& colorPtr, const float x, const float y)
    {
        // Angle of (-x, -y) expressed in units of pi, then shifted and halved into the 0-to-1 range
        const float angleOverPi = atan2(-y,-x)/PI;
        auto wheelPosition = (angleOverPi+1.f)/2.f;
        // A NaN angle is mapped to the start of the wheel
        if (::isnan(wheelPosition))
            wheelPosition = 0.f;
        getColorAffinity(colorPtr, wheelPosition, 0.f, 1.f);

        // Vector length, never allowed to brighten the color (upper limit 1)
        const auto intensity = fastMinCuda(1.f, sqrt( x*x + y*y ) );
        colorPtr.x *= intensity;
        colorPtr.y *= intensity;
        colorPtr.z *= intensity;
    }

121 122 123 124 125 126 127 128
    // Kernel entry point that forwards every argument, unchanged and in the same order, to the
    // getBoundingBoxPerPerson helper. Note that this signature lists maxPtr before minPtr.
    __global__ void getBoundingBoxPerPersonPose(
        float* maxPtr, float* minPtr, float* scalePtr,const int targetWidth, const int targetHeight,
        const float* const keypointsPtr, const int numberPeople, const int numberParts, const float threshold)
    {
        getBoundingBoxPerPerson(
            maxPtr,
            minPtr,
            scalePtr,
            targetWidth,
            targetHeight,
            keypointsPtr,
            numberPeople,
            numberParts,
            threshold);
    }

G
gineshidalgo99 已提交
129
    // Kernel for the COCO pose model: collects the COCO lookup tables and drawing sizes and hands them to
    // renderKeypointsOld together with 18 as the number of parts.
    // - minPtr, maxPtr and scalePtr are accepted but not referenced in this body; block-shared buffers are
    //   passed to renderKeypointsOld instead.
    // - With googlyEyes set, part indices 14 and 15 are forwarded (presumably the two eyes -- confirm against
    //   renderKeypointsOld); otherwise -1 is forwarded for both.
    __global__ void renderPoseCoco(
        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
        const bool blendOriginalFrame, const float alphaColorToAdd)
    {
        // Block-shared scratch buffers, one entry per person up to POSE_MAX_PEOPLE
        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];

        // 2-D global coordinates of this thread and its flat index inside its own block
        const auto pixelX = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto pixelY = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto indexInBlock = threadIdx.y * blockDim.x + threadIdx.x;

        // Table sizes: pairs hold 2 entries each, colors hold 3 entries each
        const auto pairCount = sizeof(COCO_PAIRS_GPU) / (2*sizeof(COCO_PAIRS_GPU[0]));
        const auto scaleCount = sizeof(COCO_SCALES) / sizeof(COCO_SCALES[0]);
        const auto colorCount = sizeof(COCO_COLORS) / (3*sizeof(COCO_COLORS[0]));

        // Drawing sizes are fractions of the shorter image side
        const auto shortestSide = fastMinCuda(targetWidth, targetHeight);
        const auto radius = shortestSide / 100.f;
        const auto lineWidth = shortestSide / 120.f;

        // Note: renderKeypoints is not working for videos with many people, so renderKeypointsOld is used
        renderKeypointsOld(
            targetPtr, sharedMaxs, sharedMins, sharedScaleF, indexInBlock, pixelX, pixelY, targetWidth,
            targetHeight, posePtr, COCO_PAIRS_GPU, numberPeople, 18, pairCount, COCO_COLORS, colorCount, radius,
            lineWidth, COCO_SCALES, scaleCount, threshold, alphaColorToAdd, blendOriginalFrame,
            (googlyEyes ? 14 : -1), (googlyEyes ? 15 : -1));
    }

G
gineshidalgo99 已提交
161
    // Kernel for the BODY_19 pose model: collects the BODY_19 lookup tables and drawing sizes and hands them to
    // renderKeypointsOld together with 19 as the number of parts.
    // - minPtr, maxPtr and scalePtr are accepted but not referenced in this body; block-shared buffers are
    //   passed to renderKeypointsOld instead.
    // - With googlyEyes set, part indices 15 and 16 are forwarded (presumably the two eyes -- confirm against
    //   renderKeypointsOld); otherwise -1 is forwarded for both.
    __global__ void renderPoseBody19(
        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
        const bool blendOriginalFrame, const float alphaColorToAdd)
    {
        // Block-shared scratch buffers, one entry per person up to POSE_MAX_PEOPLE
        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];

        // 2-D global coordinates of this thread and its flat index inside its own block
        const auto pixelX = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto pixelY = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto indexInBlock = threadIdx.y * blockDim.x + threadIdx.x;

        // Table sizes: pairs hold 2 entries each, colors hold 3 entries each
        const auto pairCount = sizeof(BODY_19_PAIRS_GPU) / (2*sizeof(BODY_19_PAIRS_GPU[0]));
        const auto scaleCount = sizeof(BODY_19_SCALES) / sizeof(BODY_19_SCALES[0]);
        const auto colorCount = sizeof(BODY_19_COLORS) / (3*sizeof(BODY_19_COLORS[0]));

        // Drawing sizes are fractions of the shorter image side
        const auto shortestSide = fastMinCuda(targetWidth, targetHeight);
        const auto radius = shortestSide / 100.f;
        const auto lineWidth = shortestSide / 120.f;

        // Note: renderKeypoints is not working for videos with many people, so renderKeypointsOld is used
        renderKeypointsOld(
            targetPtr, sharedMaxs, sharedMins, sharedScaleF, indexInBlock, pixelX, pixelY, targetWidth,
            targetHeight, posePtr, BODY_19_PAIRS_GPU, numberPeople, 19, pairCount, BODY_19_COLORS, colorCount,
            radius, lineWidth, BODY_19_SCALES, scaleCount, threshold, alphaColorToAdd, blendOriginalFrame,
            (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1));
    }

G
gineshidalgo99 已提交
194
    // Kernel for the BODY_23 pose model: collects the BODY_23 lookup tables and drawing sizes and hands them to
    // renderKeypointsOld together with 23 as the number of parts.
    // - minPtr, maxPtr and scalePtr are accepted but not referenced in this body; block-shared buffers are
    //   passed to renderKeypointsOld instead.
    // - With googlyEyes set, part indices 13 and 14 are forwarded (presumably the two eyes -- confirm against
    //   renderKeypointsOld); otherwise -1 is forwarded for both.
    __global__ void renderPoseBody23(
        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
        const bool blendOriginalFrame, const float alphaColorToAdd)
    {
        // Block-shared scratch buffers, one entry per person up to POSE_MAX_PEOPLE
        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];

        // 2-D global coordinates of this thread and its flat index inside its own block
        const auto pixelX = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto pixelY = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto indexInBlock = threadIdx.y * blockDim.x + threadIdx.x;

        // Table sizes: pairs hold 2 entries each, colors hold 3 entries each
        const auto pairCount = sizeof(BODY_23_PAIRS_GPU) / (2*sizeof(BODY_23_PAIRS_GPU[0]));
        const auto scaleCount = sizeof(BODY_23_SCALES) / sizeof(BODY_23_SCALES[0]);
        const auto colorCount = sizeof(BODY_23_COLORS) / (3*sizeof(BODY_23_COLORS[0]));

        // Drawing sizes are fractions of the shorter image side
        const auto shortestSide = fastMinCuda(targetWidth, targetHeight);
        const auto radius = shortestSide / 100.f;
        const auto lineWidth = shortestSide / 120.f;

        // Note: renderKeypoints is not working for videos with many people, so renderKeypointsOld is used
        renderKeypointsOld(
            targetPtr, sharedMaxs, sharedMins, sharedScaleF, indexInBlock, pixelX, pixelY, targetWidth,
            targetHeight, posePtr, BODY_23_PAIRS_GPU, numberPeople, 23, pairCount, BODY_23_COLORS, colorCount,
            radius, lineWidth, BODY_23_SCALES, scaleCount, threshold, alphaColorToAdd, blendOriginalFrame,
            (googlyEyes ? 13 : -1), (googlyEyes ? 14 : -1));
    }

G
gineshidalgo99 已提交
226
    // Kernel for the BODY_25 pose model: collects the BODY_25 lookup tables and drawing sizes and hands them to
    // renderKeypointsOld together with 25 as the number of parts.
    // - minPtr, maxPtr and scalePtr are accepted but not referenced in this body; block-shared buffers are
    //   passed to renderKeypointsOld instead.
    // - With googlyEyes set, part indices 15 and 16 are forwarded (presumably the two eyes -- confirm against
    //   renderKeypointsOld); otherwise -1 is forwarded for both.
    __global__ void renderPoseBody25(
        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth,
        const int targetHeight, const float* const posePtr, const int numberPeople, const float threshold,
        const bool googlyEyes, const bool blendOriginalFrame, const float alphaColorToAdd)
    {
        // Block-shared scratch buffers, one entry per person up to POSE_MAX_PEOPLE
        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];

        // 2-D global coordinates of this thread and its flat index inside its own block
        const auto pixelX = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto pixelY = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto indexInBlock = threadIdx.y * blockDim.x + threadIdx.x;

        // Table sizes: pairs hold 2 entries each, colors hold 3 entries each
        const auto pairCount = sizeof(BODY_25_PAIRS_GPU) / (2*sizeof(BODY_25_PAIRS_GPU[0]));
        const auto scaleCount = sizeof(BODY_25_SCALES) / sizeof(BODY_25_SCALES[0]);
        const auto colorCount = sizeof(BODY_25_COLORS) / (3*sizeof(BODY_25_COLORS[0]));

        // Drawing sizes are fractions of the shorter image side
        const auto shortestSide = fastMinCuda(targetWidth, targetHeight);
        const auto radius = shortestSide / 100.f;
        const auto lineWidth = shortestSide / 120.f;

        // Note: renderKeypoints is not working for videos with many people, so renderKeypointsOld is used
        renderKeypointsOld(
            targetPtr, sharedMaxs, sharedMins, sharedScaleF, indexInBlock, pixelX, pixelY, targetWidth,
            targetHeight, posePtr, BODY_25_PAIRS_GPU, numberPeople, 25, pairCount, BODY_25_COLORS, colorCount,
            radius, lineWidth, BODY_25_SCALES, scaleCount, threshold, alphaColorToAdd, blendOriginalFrame,
            (googlyEyes ? 15 : -1), (googlyEyes ? 16 : -1));
    }

G
gineshidalgo99 已提交
259
    // Kernel for the BODY_25B pose model: collects the BODY_25B lookup tables and drawing sizes and hands them
    // to renderKeypointsOld together with 25 as the number of parts.
    // - minPtr, maxPtr and scalePtr are accepted but not referenced in this body; block-shared buffers are
    //   passed to renderKeypointsOld instead.
    // - With googlyEyes set, part indices 1 and 2 are forwarded (presumably the two eyes -- confirm against
    //   renderKeypointsOld); otherwise -1 is forwarded for both.
    __global__ void renderPoseBody25b(
        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
        const bool blendOriginalFrame, const float alphaColorToAdd)
    {
        // Block-shared scratch buffers, one entry per person up to POSE_MAX_PEOPLE
        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];

        // 2-D global coordinates of this thread and its flat index inside its own block
        const auto pixelX = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto pixelY = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto indexInBlock = threadIdx.y * blockDim.x + threadIdx.x;

        // Table sizes: pairs hold 2 entries each, colors hold 3 entries each
        const auto pairCount = sizeof(BODY_25B_PAIRS_GPU) / (2*sizeof(BODY_25B_PAIRS_GPU[0]));
        const auto scaleCount = sizeof(BODY_25B_SCALES) / sizeof(BODY_25B_SCALES[0]);
        const auto colorCount = sizeof(BODY_25B_COLORS) / (3*sizeof(BODY_25B_COLORS[0]));

        // Drawing sizes are fractions of the shorter image side
        const auto shortestSide = fastMinCuda(targetWidth, targetHeight);
        const auto radius = shortestSide / 100.f;
        const auto lineWidth = shortestSide / 120.f;

        // Note: renderKeypoints is not working for videos with many people, so renderKeypointsOld is used
        renderKeypointsOld(
            targetPtr, sharedMaxs, sharedMins, sharedScaleF, indexInBlock, pixelX, pixelY, targetWidth,
            targetHeight, posePtr, BODY_25B_PAIRS_GPU, numberPeople, 25, pairCount, BODY_25B_COLORS, colorCount,
            radius, lineWidth, BODY_25B_SCALES, scaleCount, threshold, alphaColorToAdd, blendOriginalFrame,
            (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1));
    }

G
gineshidalgo99 已提交
291
    // Kernel for the BODY_135 pose model: collects the BODY_135 lookup tables and drawing sizes and hands them
    // to renderKeypointsOld together with 135 as the number of parts.
    // - minPtr, maxPtr and scalePtr are accepted but not referenced in this body; block-shared buffers are
    //   passed to renderKeypointsOld instead.
    // - With googlyEyes set, part indices 1 and 2 are forwarded (presumably the two eyes -- confirm against
    //   renderKeypointsOld); otherwise -1 is forwarded for both.
    __global__ void renderPoseBody135(
        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
        const bool blendOriginalFrame, const float alphaColorToAdd)
    {
        // Block-shared scratch buffers, one entry per person up to POSE_MAX_PEOPLE
        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];

        // 2-D global coordinates of this thread and its flat index inside its own block
        const auto pixelX = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto pixelY = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto indexInBlock = threadIdx.y * blockDim.x + threadIdx.x;

        // Table sizes: pairs hold 2 entries each, colors hold 3 entries each
        const auto pairCount = sizeof(BODY_135_PAIRS_GPU) / (2*sizeof(BODY_135_PAIRS_GPU[0]));
        const auto scaleCount = sizeof(BODY_135_SCALES) / sizeof(BODY_135_SCALES[0]);
        const auto colorCount = sizeof(BODY_135_COLORS) / (3*sizeof(BODY_135_COLORS[0]));

        // Drawing sizes are fractions of the shorter image side
        const auto shortestSide = fastMinCuda(targetWidth, targetHeight);
        const auto radius = shortestSide / 100.f;
        const auto lineWidth = shortestSide / 120.f;

        // Note: renderKeypoints is not working for videos with many people, so renderKeypointsOld is used
        renderKeypointsOld(
            targetPtr, sharedMaxs, sharedMins, sharedScaleF, indexInBlock, pixelX, pixelY, targetWidth,
            targetHeight, posePtr, BODY_135_PAIRS_GPU, numberPeople, 135, pairCount, BODY_135_COLORS, colorCount,
            radius, lineWidth, BODY_135_SCALES, scaleCount, threshold, alphaColorToAdd, blendOriginalFrame,
            (googlyEyes ? 1 : -1), (googlyEyes ? 2 : -1));
    }

    // Kernel for the MPI pose model: collects the MPI lookup tables and drawing sizes and hands them to
    // renderKeypointsOld together with 15 as the number of parts.
    // - minPtr, maxPtr and scalePtr are accepted but not referenced in this body; block-shared buffers are
    //   passed to renderKeypointsOld instead.
    // - Unlike the sibling kernels there is no googlyEyes flag, and the two trailing part-index arguments are
    //   not passed to renderKeypointsOld.
    // - The scale table passed is MPI_SCALES, so it always agrees with numberScales, which is computed from
    //   that same array.
    __global__ void renderPoseMpi29Parts(
        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
        const float* const posePtr, const int numberPeople, const float threshold, const bool blendOriginalFrame,
        const float alphaColorToAdd)
    {
        // 2-D global coordinates of this thread and its flat index inside its own block
        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto globalIdx = threadIdx.y * blockDim.x + threadIdx.x;

        // Shared parameters: block-shared scratch buffers, one entry per person up to POSE_MAX_PEOPLE
        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];

        // Other parameters: table sizes (pairs hold 2 entries each, colors 3) and drawing sizes
        const auto numberPartPairs = sizeof(MPI_PAIRS_GPU) / (2*sizeof(MPI_PAIRS_GPU[0]));
        const auto numberScales = sizeof(MPI_SCALES) / sizeof(MPI_SCALES[0]);
        const auto numberColors = sizeof(MPI_COLORS) / (3*sizeof(MPI_COLORS[0]));
        const auto radius = fastMinCuda(targetWidth, targetHeight) / 100.f;
        const auto lineWidth = fastMinCuda(targetWidth, targetHeight) / 120.f;

        // Render key points
        // Note: renderKeypoints is not working for videos with many people, renderKeypointsOld speed was slightly improved instead
        renderKeypointsOld( // renderKeypoints(
            targetPtr, sharedMaxs, sharedMins, sharedScaleF, // maxPtr, minPtr, scalePtr,
            globalIdx, x, y, targetWidth, targetHeight, posePtr, MPI_PAIRS_GPU, numberPeople, 15, numberPartPairs,
            MPI_COLORS, numberColors, radius, lineWidth, MPI_SCALES, numberScales, threshold, alphaColorToAdd,
            blendOriginalFrame);
    }

G
gineshidalgo99 已提交
355
    // Kernel for the CAR_12 model: collects the CAR_12 lookup tables and drawing sizes and hands them to
    // renderKeypointsOld together with 12 as the number of parts.
    // - minPtr, maxPtr and scalePtr are accepted but not referenced in this body; block-shared buffers are
    //   passed to renderKeypointsOld instead.
    // - With googlyEyes set, part indices 4 and 5 are forwarded in the two trailing arguments; otherwise -1 is
    //   forwarded for both. What those two parts represent for this model is not visible here.
    __global__ void renderPoseCar12(
        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
        const bool blendOriginalFrame, const float alphaColorToAdd)
    {
        // Block-shared scratch buffers, one entry per person up to POSE_MAX_PEOPLE
        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];

        // 2-D global coordinates of this thread and its flat index inside its own block
        const auto pixelX = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto pixelY = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto indexInBlock = threadIdx.y * blockDim.x + threadIdx.x;

        // Table sizes: pairs hold 2 entries each, colors hold 3 entries each
        const auto pairCount = sizeof(CAR_12_PAIRS_GPU) / (2*sizeof(CAR_12_PAIRS_GPU[0]));
        const auto scaleCount = sizeof(CAR_12_SCALES) / sizeof(CAR_12_SCALES[0]);
        const auto colorCount = sizeof(CAR_12_COLORS) / (3*sizeof(CAR_12_COLORS[0]));

        // Drawing sizes are fractions of the shorter image side
        const auto shortestSide = fastMinCuda(targetWidth, targetHeight);
        const auto radius = shortestSide / 100.f;
        const auto lineWidth = shortestSide / 120.f;

        // Note: renderKeypoints is not working for videos with many people, so renderKeypointsOld is used
        renderKeypointsOld(
            targetPtr, sharedMaxs, sharedMins, sharedScaleF, indexInBlock, pixelX, pixelY, targetWidth,
            targetHeight, posePtr, CAR_12_PAIRS_GPU, numberPeople, 12, pairCount, CAR_12_COLORS, colorCount,
            radius, lineWidth, CAR_12_SCALES, scaleCount, threshold, alphaColorToAdd, blendOriginalFrame,
            (googlyEyes ? 4 : -1), (googlyEyes ? 5 : -1));
    }

G
gineshidalgo99 已提交
387
    // Kernel for the CAR_22 model: collects the CAR_22 lookup tables and drawing sizes and hands them to
    // renderKeypointsOld together with 22 as the number of parts.
    // - minPtr, maxPtr and scalePtr are accepted but not referenced in this body; block-shared buffers are
    //   passed to renderKeypointsOld instead.
    // - With googlyEyes set, part indices 6 and 7 are forwarded in the two trailing arguments; otherwise -1 is
    //   forwarded for both. What those two parts represent for this model is not visible here.
    __global__ void renderPoseCar22(
        float* targetPtr, float* minPtr, float* maxPtr, float* scalePtr, const int targetWidth, const int targetHeight,
        const float* const posePtr, const int numberPeople, const float threshold, const bool googlyEyes,
        const bool blendOriginalFrame, const float alphaColorToAdd)
    {
        // Block-shared scratch buffers, one entry per person up to POSE_MAX_PEOPLE
        __shared__ float2 sharedMins[POSE_MAX_PEOPLE];
        __shared__ float2 sharedMaxs[POSE_MAX_PEOPLE];
        __shared__ float sharedScaleF[POSE_MAX_PEOPLE];

        // 2-D global coordinates of this thread and its flat index inside its own block
        const auto pixelX = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto pixelY = (blockIdx.y * blockDim.y) + threadIdx.y;
        const auto indexInBlock = threadIdx.y * blockDim.x + threadIdx.x;

        // Table sizes: pairs hold 2 entries each, colors hold 3 entries each
        const auto pairCount = sizeof(CAR_22_PAIRS_GPU) / (2*sizeof(CAR_22_PAIRS_GPU[0]));
        const auto scaleCount = sizeof(CAR_22_SCALES) / sizeof(CAR_22_SCALES[0]);
        const auto colorCount = sizeof(CAR_22_COLORS) / (3*sizeof(CAR_22_COLORS[0]));

        // Drawing sizes are fractions of the shorter image side
        const auto shortestSide = fastMinCuda(targetWidth, targetHeight);
        const auto radius = shortestSide / 100.f;
        const auto lineWidth = shortestSide / 120.f;

        // Note: renderKeypoints is not working for videos with many people, so renderKeypointsOld is used
        renderKeypointsOld(
            targetPtr, sharedMaxs, sharedMins, sharedScaleF, indexInBlock, pixelX, pixelY, targetWidth,
            targetHeight, posePtr, CAR_22_PAIRS_GPU, numberPeople, 22, pairCount, CAR_22_COLORS, colorCount,
            radius, lineWidth, CAR_22_SCALES, scaleCount, threshold, alphaColorToAdd, blendOriginalFrame,
            (googlyEyes ? 6 : -1), (googlyEyes ? 7 : -1));
    }

419 420 421 422
    // Kernel that overlays several heat-map channels at once. One thread handles one target pixel (x, y):
    // it picks the heat-map cell that the pixel maps to (no interpolation), accumulates that cell's value from
    // each of the first numberBodyParts channels weighted by a COCO_COLORS entry, and blends the accumulated
    // color into the target through addColorWeighted.
    //   targetPtr                   - image to draw on; accessed at 3*(y*targetWidth + x) + {0, 1, 2}
    //   heatMapPtr                  - channels of widthHeatMap*heightHeatMap floats stored back to back
    //   scaleToKeepRatio            - target pixels per heat-map cell
    //   alphaColorToAdd             - blending weight forwarded to addColorWeighted
    __global__ void renderBodyPartHeatMaps(float* targetPtr, const int targetWidth, const int targetHeight,
                                           const float* const heatMapPtr, const int widthHeatMap,
                                           const int heightHeatMap, const float scaleToKeepRatio,
                                           const int numberBodyParts, const float alphaColorToAdd)
    {
        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;

        // Colors are stored as 3 consecutive floats each
        const auto numberColors = sizeof(COCO_COLORS)/(3*sizeof(COCO_COLORS[0]));

        // Threads that fall outside the target image do nothing
        if (x < targetWidth && y < targetHeight)
        {
            float rgbColor [3] = {0.f,0.f,0.f};
            // Pixel centre mapped into heat-map coordinates
            const auto xSource = (x + 0.5f) / scaleToKeepRatio - 0.5f;
            const auto ySource = (y + 0.5f) / scaleToKeepRatio - 0.5f;
            // Truncate to a cell index and keep it on the last valid column/row at most. The upper limit must
            // be size-1: an index equal to the width/height would read the next row or the next channel (or
            // past the end of the buffer for the last channel).
            const auto xHeatMap = fastTruncateCuda(int(xSource + 1e-5), 0, widthHeatMap - 1);
            const auto yHeatMap = fastTruncateCuda(int(ySource + 1e-5), 0, heightHeatMap - 1);
            const auto heatMapArea = widthHeatMap * heightHeatMap;
            for (auto part = 0u ; part < numberBodyParts ; part++)
            {
                const auto offsetOrigin = part * heatMapArea;
                // __saturatef = truncate to [0,1]
                const auto value = __saturatef(heatMapPtr[offsetOrigin + yHeatMap*widthHeatMap + xHeatMap]);
                // Colors are reused cyclically when there are more channels than colors
                const auto rgbColorIndex = (part%numberColors)*3;
                rgbColor[0] += value*COCO_COLORS[rgbColorIndex];
                rgbColor[1] += value*COCO_COLORS[rgbColorIndex+1];
                rgbColor[2] += value*COCO_COLORS[rgbColorIndex+2];
            }

            const auto blueIndex = 3*(y * targetWidth + x);
            addColorWeighted(targetPtr[blueIndex+2], targetPtr[blueIndex+1], targetPtr[blueIndex], rgbColor,
                             alphaColorToAdd);
        }
    }

454 455
    // Kernel that overlays a single heat-map channel. One thread handles one target pixel: it samples channel
    // `part` with bicubicInterpolate at the matching heat-map position, converts the sample (or its absolute
    // value when absValue is set) to a color with getColorHeatMap over the range [0, 1], and blends that color
    // into the target through addColorWeighted.
    __global__ void renderBodyPartHeatMap(float* targetPtr, const int targetWidth, const int targetHeight,
                                          const float* const heatMapPtr, const int widthHeatMap,
                                          const int heightHeatMap, const float scaleToKeepRatio, const unsigned int part,
                                          const float alphaColorToAdd, const bool absValue = false)
    {
        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;

        // Threads that fall outside the target image have nothing to draw
        if (x >= targetWidth || y >= targetHeight)
            return;

        // Start of the requested channel; channels are widthHeatMap*heightHeatMap floats each
        const auto* const channelPtr = heatMapPtr + part * widthHeatMap * heightHeatMap;

        // Pixel centre mapped into heat-map coordinates, then sampled
        const auto xSource = (x + 0.5f) / scaleToKeepRatio - 0.5f;
        const auto ySource = (y + 0.5f) / scaleToKeepRatio - 0.5f;
        const auto sample = bicubicInterpolate(channelPtr, xSource, ySource, widthHeatMap, heightHeatMap,
                                               widthHeatMap);

        // Sample -> color; the sign is discarded first when absValue is requested
        float rgbColor[3];
        getColorHeatMap(rgbColor, (absValue ? fabsf(sample) : sample), 0.f, 1.f);

        // Blend into the 3 consecutive floats that belong to this pixel
        const auto blueIndex = 3*(y * targetWidth + x);
        addColorWeighted(targetPtr[blueIndex+2], targetPtr[blueIndex+1], targetPtr[blueIndex], rgbColor,
                         alphaColorToAdd);
    }

482 483 484 485
    // Kernel that overlays part affinity fields (PAFs) onto the target image, one thread per target pixel
    // (2-D launch: blockIdx/threadIdx .x index columns and .y index rows).
    // For every rendered PAF, the thread reads 2 consecutive heat map channels (X at `part`, Y at `part+1`),
    // converts the (X, Y) pair into a color with getColorXYAffinity, accumulates the colors of all rendered
    // PAFs, and finally blends the accumulated color into the pixel with addColorWeighted.
    // Parameters:
    //     targetPtr: Image to draw on, 3 floats per pixel; pixel (x, y) starts at 3*(y*targetWidth + x).
    //         The local name `blueIndex` suggests BGR ordering -- TODO confirm against addColorWeighted.
    //     targetWidth, targetHeight: Target image size; threads outside of it do nothing.
    //     heatMapPtr: Heat maps stored channel after channel; channel c starts at
    //         c * widthHeatMap * heightHeatMap.
    //     widthHeatMap, heightHeatMap: Size of each heat map channel.
    //     scaleToKeepRatio: Target-to-heat-map scale; source = (target + 0.5) / scale - 0.5.
    //     partsToRender: Number of PAFs (i.e., channel pairs) to accumulate.
    //     initPart: Heat map channel index of the X channel of the first PAF to render.
    //     alphaColorToAdd: Blending weight forwarded to addColorWeighted.
    __global__ void renderPartAffinities(float* targetPtr, const int targetWidth, const int targetHeight,
                                         const float* const heatMapPtr, const int widthHeatMap,
                                         const int heightHeatMap, const float scaleToKeepRatio,
                                         const int partsToRender, const int initPart, const float alphaColorToAdd)
    {
        const auto x = (blockIdx.x * blockDim.x) + threadIdx.x;
        const auto y = (blockIdx.y * blockDim.y) + threadIdx.y;

        if (x < targetWidth && y < targetHeight)
        {
            const auto xSource = (x + 0.5f) / scaleToKeepRatio - 0.5f;
            const auto ySource = (y + 0.5f) / scaleToKeepRatio - 0.5f;
            const auto heatMapArea = widthHeatMap * heightHeatMap;

            // The sampling positions only depend on the pixel (not on the PAF channel), so they are computed
            // once here instead of once per loop iteration
            int xIntArray[4];
            int yIntArray[4];
            float dx;
            float dy;
            cubicSequentialData(xIntArray, yIntArray, dx, dy, xSource, ySource, widthHeatMap, heightHeatMap);
            // Offsets (inside 1 channel) of the base sample (A) and its right (B), bottom (C), and
            // bottom-right (D) neighbors
            const auto indexA = yIntArray[1]*widthHeatMap + xIntArray[1];
            const auto indexB = yIntArray[1]*widthHeatMap + xIntArray[2];
            const auto indexC = yIntArray[2]*widthHeatMap + xIntArray[1];
            const auto indexD = yIntArray[2]*widthHeatMap + xIntArray[2];
            // Bilinear weights, only applied when a single PAF is rendered
            const auto weightA = (1-dx)*(1-dy);
            const auto weightB = dx*(1-dy);
            const auto weightC = (1-dx)*dy;
            const auto weightD = dx*dy;
            // With several PAFs, only the base sample is read (no interpolation)
            const auto interpolate = (partsToRender == 1);

            float rgbColor[3] = {0.f, 0.f, 0.f};
            for (auto part = initPart ; part < initPart + partsToRender*2 ; part += 2)
            {
                // X and Y components of this PAF are stored in consecutive channels
                const auto* const heatMapX = heatMapPtr + part * heatMapArea;
                const auto* const heatMapY = heatMapPtr + (part+1) * heatMapArea;
                auto valueX = heatMapX[indexA];
                auto valueY = heatMapY[indexA];
                if (interpolate)
                {
                    valueX = weightA*valueX
                           + weightB*heatMapX[indexB]
                           + weightC*heatMapX[indexC]
                           + weightD*heatMapX[indexD];
                    valueY = weightA*valueY
                           + weightB*heatMapY[indexB]
                           + weightC*heatMapY[indexC]
                           + weightD*heatMapY[indexD];
                }

                float3 rgbColor2;
                // if (forceNorm1)
                // {
                //     const auto norm = std::sqrt(valueX*valueX + valueY*valueY);
                //     if (norm > 0.05f)
                //         getColorXYAffinity(rgbColor2, valueX/norm, valueY/norm);
                //     else
                //         getColorXYAffinity(rgbColor2, valueX, valueY);
                // }
                // else
                getColorXYAffinity(rgbColor2, valueX, valueY);
                rgbColor[0] += rgbColor2.x;
                rgbColor[1] += rgbColor2.y;
                rgbColor[2] += rgbColor2.z;
            }

            // Blend the accumulated color into the 3 channels of the target pixel
            const auto blueIndex = 3*(y * targetWidth + x);
            addColorWeighted(targetPtr[blueIndex+2], targetPtr[blueIndex+1], targetPtr[blueIndex], rgbColor,
                             alphaColorToAdd);
        }
    }

G
gineshidalgo99 已提交
549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576
    // Kernel that overlays one "distance" heat map channel onto the target image, one thread per target
    // pixel (2-D launch: .x indexes columns, .y indexes rows).
    // The rendered channel is located `part` channels after the first
    // (numberBodyParts + 1 + numberBodyPAFChannels) channels of heatMapPtr. Its bicubically interpolated
    // value v is remapped to 0.5 + 0.5 * v, converted to a color with getColorHeatMap (range [0, 1]), and
    // blended into the pixel with addColorWeighted using alphaColorToAdd.
    // targetPtr stores 3 floats per pixel; pixel (x, y) starts at 3*(y*targetWidth + x).
    __global__ void renderDistance(float* targetPtr, const int targetWidth, const int targetHeight,
                                   const float* const heatMapPtr, const int widthHeatMap, const int heightHeatMap,
                                   const float scaleToKeepRatio, const int part, const int numberBodyParts,
                                   const int numberBodyPAFChannels, const float alphaColorToAdd)
    {
        const auto column = threadIdx.x + (blockDim.x * blockIdx.x);
        const auto row = threadIdx.y + (blockDim.y * blockIdx.y);

        // Threads that fall outside of the target image have nothing to draw
        if (column >= targetWidth || row >= targetHeight)
            return;

        // Position of this pixel in heat map coordinates (pixel centers are aligned)
        const auto sourceX = (column + 0.5f) / scaleToKeepRatio - 0.5f;
        const auto sourceY = (row + 0.5f) / scaleToKeepRatio - 0.5f;

        // Skip body part channels, the extra (+1) channel, and the PAF channels to reach the requested one
        const auto channelArea = widthHeatMap * heightHeatMap;
        const auto channel = part + numberBodyParts + 1 + numberBodyPAFChannels;
        const auto* const channelPtr = heatMapPtr + channel * channelArea;

        // Remap the interpolated value: 0.5 + 0.5 * value
        const auto rawValue = bicubicInterpolate(channelPtr, sourceX, sourceY, widthHeatMap, heightHeatMap,
                                                 widthHeatMap);
        const auto remappedValue = 0.5f + 0.5f * rawValue;

        float heatColor[3];
        getColorHeatMap(heatColor, remappedValue, 0.f, 1.f);

        // Blend into the 3 channels of the target pixel
        const auto pixelIndex = 3*(row * targetWidth + column);
        addColorWeighted(targetPtr[pixelIndex+2], targetPtr[pixelIndex+1], targetPtr[pixelIndex], heatColor,
                         alphaColorToAdd);
    }

G
gineshidalgo99 已提交
577 578 579 580 581 582
    // Validates a blending weight: calls error() when alphaColorToAdd is smaller than 0 or bigger than 1.
    // Both checks are plain comparisons, so a NaN value is not reported.
    inline void checkAlpha(const float alphaColorToAdd)
    {
        const auto belowRange = (alphaColorToAdd < 0.f);
        const auto aboveRange = (alphaColorToAdd > 1.f);
        if (belowRange || aboveRange)
            error("Alpha must be in the range [0, 1].", __LINE__, __FUNCTION__, __FILE__);
    }

G
gineshidalgo99 已提交
583
    // Host helper used by renderPosePAFGpu and renderPosePAFsGpu: validates alphaBlending, obtains the
    // launch configuration for frameSize, and launches the renderPartAffinities kernel.
    // Parameters:
    //     framePtr: Image to draw on (width * height * 3), forwarded to the kernel as its target.
    //     poseModel: Not used by this helper; kept so that existing callers do not change.
    //     frameSize: Size of framePtr in pixels.
    //     heatMapPtr, heatMapSize: Heat maps and the size of each of their channels.
    //     scaleToKeepRatio: Scale forwarded to the kernel.
    //     part: Heat map channel index of the first PAF to render (kernel argument `initPart`).
    //     partsToRender: Number of PAFs to render.
    //     alphaBlending: Blending weight, checked by checkAlpha.
    // Any std::exception is caught and reported through error().
    inline void renderPosePAFGpuAux(float* framePtr, const PoseModel poseModel, const Point<int>& frameSize,
                                    const float* const heatMapPtr, const Point<int>& heatMapSize,
                                    const float scaleToKeepRatio, const int part, const int partsToRender,
                                    const float alphaBlending)
    {
        try
        {
            // framePtr      =   width * height * 3
            // heatMapPtr    =   heatMapSize.x * heatMapSize.y * #body parts
            // The kernel receives the channel index directly, so the pose model is not required here
            (void)poseModel;
            checkAlpha(alphaBlending);
            dim3 threadsPerBlock;
            dim3 numBlocks;
            getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
            // NOTE(review): the first launch argument is the grid dimension in CUDA, and it receives the
            // variable named `threadsPerBlock`. Kept as in the rest of this file -- confirm against
            // getNumberCudaThreadsAndBlocks before changing the order.
            renderPartAffinities<<<threadsPerBlock, numBlocks>>>(framePtr, frameSize.x, frameSize.y, heatMapPtr,
                                                                 heatMapSize.x, heatMapSize.y, scaleToKeepRatio,
                                                                 partsToRender, part, alphaBlending);
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        }
        catch (const std::exception& e)
        {
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }

608 609 610 611
    // Renders the pose keypoints of every detected person on framePtr by launching the render kernel that
    // corresponds to poseModel.
    // Nothing is launched when there are no people and blendOriginalFrame is true.
    // Parameters:
    //     framePtr: Image to draw on (width * height * 3).
    //     maxPtr, minPtr, scalePtr: Auxiliary buffers forwarded to the kernels (note that the kernels take
    //         them in min, max, scale order).
    //     poseModel: Selects the kernel; error() is called for models without a kernel.
    //     numberPeople: Number of people in posePtr; must not exceed POSE_MAX_PEOPLE.
    //     frameSize: Size of framePtr in pixels, also used to get the launch configuration.
    //     posePtr: Keypoints, 3 (x,y,score) * #body parts * numberPeople.
    //     renderThreshold, blendOriginalFrame, alphaBlending: Forwarded to the kernels.
    //     googlyEyes: Forwarded to the kernels; not compatible with the MPI models.
    // Any std::exception is caught and reported through error().
    void renderPoseKeypointsGpu(
        float* framePtr, float* maxPtr, float* minPtr, float* scalePtr, const PoseModel poseModel,
        const int numberPeople, const Point<int>& frameSize, const float* const posePtr,
        const float renderThreshold, const bool googlyEyes, const bool blendOriginalFrame, const float alphaBlending)
    {
        try
        {
            if (numberPeople > 0 || !blendOriginalFrame)
            {
                // framePtr      =   width * height * 3
                // heatMapPtr    =   heatMapSize.x * heatMapSize.y * #body parts
                // posePtr       =   3 (x,y,score) * #Body parts * numberPeople
                if (googlyEyes && (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4))
                    error("Bool googlyEyes not compatible with MPI models.",
                          __LINE__, __FUNCTION__, __FILE__);
                if (numberPeople > POSE_MAX_PEOPLE)
                    error("Rendering assumes that numberPeople <= POSE_MAX_PEOPLE = " + std::to_string(POSE_MAX_PEOPLE)
                          + ".", __LINE__, __FUNCTION__, __FILE__);

                //// Get bounding box per person
                //const dim3 threadsPerBlockBoundBox = {1, 1, 1};
                //const dim3 numBlocksBox{getNumberCudaBlocks(POSE_MAX_PEOPLE, threadsPerBlockBoundBox.x)};
                //getBoundingBoxPerPersonPose<<<threadsPerBlockBoundBox, numBlocksBox>>>(
                //    maxPtr, minPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                //    getPoseNumberBodyParts(poseModel), renderThreshold);

                // Launch configuration, shared by every kernel below
                // NOTE(review): the first launch argument is the grid dimension in CUDA, and it receives the
                // variable named `threadsPerBlock`. Kept as in the rest of this file -- confirm against
                // getNumberCudaThreadsAndBlocks before changing the order.
                dim3 threadsPerBlock;
                dim3 numBlocks;
                getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);

                // Body pose
                if (poseModel == PoseModel::BODY_25 || poseModel == PoseModel::BODY_25D
                    || poseModel == PoseModel::BODY_25E)
                    renderPoseBody25<<<threadsPerBlock, numBlocks>>>(
                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                    );
                else if (poseModel == PoseModel::COCO_18)
                    renderPoseCoco<<<threadsPerBlock, numBlocks>>>(
                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                    );
                else if (poseModel == PoseModel::BODY_19 || poseModel == PoseModel::BODY_19E
                         || poseModel == PoseModel::BODY_19N || poseModel == PoseModel::BODY_19_X2)
                    renderPoseBody19<<<threadsPerBlock, numBlocks>>>(
                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                    );
                else if (poseModel == PoseModel::BODY_23)
                    renderPoseBody23<<<threadsPerBlock, numBlocks>>>(
                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                    );
                else if (poseModel == PoseModel::BODY_25B)
                    renderPoseBody25b<<<threadsPerBlock, numBlocks>>>(
                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                    );
                else if (poseModel == PoseModel::BODY_135)
                    renderPoseBody135<<<threadsPerBlock, numBlocks>>>(
                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                    );
                // The MPI kernel does not take the googlyEyes argument
                else if (poseModel == PoseModel::MPI_15 || poseModel == PoseModel::MPI_15_4)
                    renderPoseMpi29Parts<<<threadsPerBlock, numBlocks>>>(
                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                        renderThreshold, blendOriginalFrame, alphaBlending
                    );
                // Car pose
                else if (poseModel == PoseModel::CAR_12)
                    renderPoseCar12<<<threadsPerBlock, numBlocks>>>(
                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                    );
                else if (poseModel == PoseModel::CAR_22)
                    renderPoseCar22<<<threadsPerBlock, numBlocks>>>(
                        framePtr, minPtr, maxPtr, scalePtr, frameSize.x, frameSize.y, posePtr, numberPeople,
                        renderThreshold, googlyEyes, blendOriginalFrame, alphaBlending
                    );
                // Unknown
                else
                    error("Invalid Model.", __LINE__, __FUNCTION__, __FILE__);
                // Report any error raised by the kernel launch
                cudaCheck(__LINE__, __FUNCTION__, __FILE__);
            }
        }
        catch (const std::exception& e)
        {
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }

G
Gines Hidalgo 已提交
750 751 752
    // Renders a single body part heat map channel on framePtr by launching the renderBodyPartHeatMap
    // kernel.
    // Parameters:
    //     framePtr, frameSize: Image to draw on (width * height * 3) and its size in pixels.
    //     heatMapPtr, heatMapSize: Heat maps and the size of each of their channels.
    //     scaleToKeepRatio: Scale forwarded to the kernel.
    //     part: Index of the heat map channel to render.
    //     alphaBlending: Blending weight, checked by checkAlpha.
    // Any std::exception is caught and reported through error().
    void renderPoseHeatMapGpu(float* framePtr, const Point<int>& frameSize, const float* const heatMapPtr,
                              const Point<int>& heatMapSize, const float scaleToKeepRatio, const unsigned int part,
                              const float alphaBlending)
    {
        try
        {
            // framePtr      =   width * height * 3
            // heatMapPtr    =   heatMapSize.x * heatMapSize.y * #body parts
            checkAlpha(alphaBlending);
            dim3 threadsPerBlock;
            dim3 numBlocks;
            getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);

            // NOTE(review): the first launch argument is the grid dimension in CUDA, and it receives the
            // variable named `threadsPerBlock`. Kept as in the rest of this file -- confirm against
            // getNumberCudaThreadsAndBlocks before changing the order.
            renderBodyPartHeatMap<<<threadsPerBlock, numBlocks>>>(
                framePtr, frameSize.x, frameSize.y, heatMapPtr, heatMapSize.x, heatMapSize.y, scaleToKeepRatio,
                part, alphaBlending
            );
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        }
        catch (const std::exception& e)
        {
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }

775 776 777
    // Renders the body part heat maps of poseModel on framePtr by launching the renderBodyPartHeatMaps
    // kernel with the number of body parts of that model.
    // Parameters:
    //     framePtr, frameSize: Image to draw on (width * height * 3) and its size in pixels.
    //     poseModel: Model whose number of body parts is passed to the kernel.
    //     heatMapPtr, heatMapSize: Heat maps and the size of each of their channels.
    //     scaleToKeepRatio: Scale forwarded to the kernel.
    //     alphaBlending: Blending weight, checked by checkAlpha.
    // Any std::exception is caught and reported through error().
    void renderPoseHeatMapsGpu(float* framePtr, const PoseModel poseModel, const Point<int>& frameSize,
                               const float* const heatMapPtr, const Point<int>& heatMapSize,
                               const float scaleToKeepRatio, const float alphaBlending)
    {
        try
        {
            // framePtr      =   width * height * 3
            // heatMapPtr    =   heatMapSize.x * heatMapSize.y * #body parts
            checkAlpha(alphaBlending);
            dim3 threadsPerBlock;
            dim3 numBlocks;
            getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);
            const auto numberBodyParts = getPoseNumberBodyParts(poseModel);

            // NOTE(review): the first launch argument is the grid dimension in CUDA, and it receives the
            // variable named `threadsPerBlock`. Kept as in the rest of this file -- confirm against
            // getNumberCudaThreadsAndBlocks before changing the order.
            renderBodyPartHeatMaps<<<threadsPerBlock, numBlocks>>>(
                framePtr, frameSize.x, frameSize.y, heatMapPtr, heatMapSize.x, heatMapSize.y, scaleToKeepRatio,
                numberBodyParts, alphaBlending
            );
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        }
        catch (const std::exception& e)
        {
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }

802 803 804
    // Renders a single part affinity field (PAF) on framePtr: forwards every argument to
    // renderPosePAFGpuAux with partsToRender = 1, so `part` is the heat map channel index at which the PAF
    // to be rendered starts.
    // Any std::exception is caught and reported through error().
    void renderPosePAFGpu(
        float* framePtr, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
        const Point<int>& heatMapSize, const float scaleToKeepRatio, const int part, const float alphaBlending)
    {
        try
        {
            const auto partsToRender = 1;
            renderPosePAFGpuAux(framePtr, poseModel, frameSize, heatMapPtr, heatMapSize, scaleToKeepRatio, part,
                                partsToRender, alphaBlending);
        }
        catch (const std::exception& e)
        {
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }

817 818 819
    // Renders all the part affinity fields (PAFs) of poseModel on framePtr through renderPosePAFGpuAux.
    // The number of PAFs is half the size of getPosePartPairs(poseModel), and the first PAF channel is
    // located right after the body part channels (plus 1 extra channel if addBkgChannel(poseModel) is true).
    // Any std::exception is caught and reported through error().
    void renderPosePAFsGpu(
        float* framePtr, const PoseModel poseModel, const Point<int>& frameSize, const float* const heatMapPtr,
        const Point<int>& heatMapSize, const float scaleToKeepRatio, const float alphaBlending)
    {
        try
        {
            // Each PAF is defined by a pair of body parts
            const auto numberBodyPartPairs = static_cast<int>(getPosePartPairs(poseModel).size())/2;
            // Heat map channel at which the PAF channels start
            const auto firstPafChannel = getPoseNumberBodyParts(poseModel) + (addBkgChannel(poseModel) ? 1 : 0);
            renderPosePAFGpuAux(
                framePtr, poseModel, frameSize, heatMapPtr, heatMapSize, scaleToKeepRatio, firstPafChannel,
                numberBodyPartPairs, alphaBlending);
        }
        catch (const std::exception& e)
        {
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }
G
gineshidalgo99 已提交
834

835 836 837
    // Renders a single "distance" heat map channel on framePtr. It is drawn as a body part heat map: the
    // renderBodyPartHeatMap kernel is launched with absValue = true, so the absolute value of the channel
    // is what gets colored.
    // Parameters:
    //     framePtr, frameSize: Image to draw on (width * height * 3) and its size in pixels.
    //     heatMapPtr, heatMapSize: Heat maps and the size of each of their channels.
    //     scaleToKeepRatio: Scale forwarded to the kernel.
    //     part: Index of the heat map channel to render.
    //     alphaBlending: Blending weight, checked by checkAlpha.
    // Any std::exception is caught and reported through error().
    void renderPoseDistanceGpu(
        float* framePtr, const Point<int>& frameSize, const float* const heatMapPtr, const Point<int>& heatMapSize,
        const float scaleToKeepRatio, const unsigned int part, const float alphaBlending)
    {
        try
        {
            // // As PAF
            // const bool forceNorm1 = true;
            // renderPosePAFGpuAux(framePtr, poseModel, frameSize, heatMapPtr, heatMapSize, scaleToKeepRatio, part, 1,
            //                     alphaBlending, forceNorm1);

            // As body part
            // framePtr      =   width * height * 3
            // heatMapPtr    =   heatMapSize.x * heatMapSize.y * #body parts
            checkAlpha(alphaBlending);
            dim3 threadsPerBlock;
            dim3 numBlocks;
            getNumberCudaThreadsAndBlocks(threadsPerBlock, numBlocks, frameSize);

            // NOTE(review): the first launch argument is the grid dimension in CUDA, and it receives the
            // variable named `threadsPerBlock`. Kept as in the rest of this file -- confirm against
            // getNumberCudaThreadsAndBlocks before changing the order.
            const auto absValue = true;
            renderBodyPartHeatMap<<<threadsPerBlock, numBlocks>>>(
                framePtr, frameSize.x, frameSize.y, heatMapPtr, heatMapSize.x, heatMapSize.y, scaleToKeepRatio,
                part, alphaBlending, absValue);
            cudaCheck(__LINE__, __FUNCTION__, __FILE__);
        }
        catch (const std::exception& e)
        {
            error(e.what(), __LINE__, __FUNCTION__, __FILE__);
        }
    }
G
gineshidalgo99 已提交
865
}