// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include "paddle/phi/kernels/yolov3_loss_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/yolov3_loss_functor.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { template static inline bool LessEqualZero(T x) { return x < 1e-6; } template static T SigmoidCrossEntropy(T x, T label) { return (x > 0 ? x : 0.0) - x * label + std::log(1.0 + std::exp(-std::abs(x))); } template static T L1Loss(T x, T y) { return std::abs(y - x); } static int GetMaskIndex(std::vector mask, int val) { for (size_t i = 0; i < mask.size(); i++) { if (mask[i] == val) { return i; } } return -1; } template static inline Box GetYoloBox(const T* x, std::vector anchors, int i, int j, int an_idx, int grid_size, int input_size, int index, int stride, float scale, float bias) { Box b; b.x = (i + sigmoid(x[index]) * scale + bias) / grid_size; b.y = (j + sigmoid(x[index + stride]) * scale + bias) / grid_size; b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] / input_size; b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] / input_size; return b; } template static inline T BoxOverlap(T c1, T w1, T c2, T w2) { T l1 = c1 - w1 / 2.0; T l2 = c2 - w2 / 2.0; T left = l1 > l2 ? l1 : l2; T r1 = c1 + w1 / 2.0; T r2 = c2 + w2 / 2.0; T right = r1 < r2 ? r1 : r2; return right - left; } template static inline T CalcBoxIoU(Box b1, Box b2) { T w = BoxOverlap(b1.x, b1.w, b2.x, b2.w); T h = BoxOverlap(b1.y, b1.h, b2.y, b2.h); T inter_area = (w < 0 || h < 0) ? 0.0 : w * h; T union_area = b1.w * b1.h + b2.w * b2.h - inter_area; return inter_area / union_area; } template static void CalcBoxLocationLoss(T* loss, const T* input, Box gt, std::vector anchors, int an_idx, int box_idx, int gi, int gj, int grid_size, int input_size, int stride, T score) { T tx = gt.x * grid_size - gi; T ty = gt.y * grid_size - gj; T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); T scale = (2.0 - gt.w * gt.h) * score; loss[0] += SigmoidCrossEntropy(input[box_idx], tx) * scale; loss[0] += SigmoidCrossEntropy(input[box_idx + stride], ty) * scale; loss[0] += L1Loss(input[box_idx + 2 * stride], tw) * scale; loss[0] += L1Loss(input[box_idx + 3 * stride], th) * scale; } template static inline void CalcLabelLoss(T* loss, const T* input, const int index, const int label, const int class_num, const int stride, const T pos, const T neg, T score) { for (int i = 0; i < class_num; i++) { T pred = input[index + i * stride]; loss[0] += SigmoidCrossEntropy(pred, (i == label) ? pos : neg) * score; } } template static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness, const int n, const int an_num, const int h, const int w, const int stride, const int an_stride) { for (int i = 0; i < n; i++) { for (int j = 0; j < an_num; j++) { for (int k = 0; k < h; k++) { for (int l = 0; l < w; l++) { T obj = objness[k * w + l]; if (obj > 1e-5) { // positive sample: obj = mixup score loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0) * obj; } else if (obj > -0.5) { // negetive sample: obj = 0 loss[i] += SigmoidCrossEntropy(input[k * w + l], 0.0); } } } objness += stride; input += an_stride; } } } template static void inline GtValid(bool* valid, const T* gtbox, const int n, const int b) { for (int i = 0; i < n; i++) { for (int j = 0; j < b; j++) { if (LessEqualZero(gtbox[j * 4 + 2]) || LessEqualZero(gtbox[j * 4 + 3])) { valid[j] = false; } else { valid[j] = true; } } valid += b; gtbox += b * 4; } } template void Yolov3LossKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& gt_box, const DenseTensor& gt_label, paddle::optional gt_score, const std::vector& anchors, const std::vector& anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth, float scale_x_y, DenseTensor* loss, DenseTensor* objectness_mask, DenseTensor* gt_match_mask) { auto* input = &x; auto objness_mask = objectness_mask; float scale = scale_x_y; float bias = -0.5 * (scale - 1.); const int n = input->dims()[0]; const int h = input->dims()[2]; const int w = input->dims()[3]; const int an_num = anchors.size() / 2; const int mask_num = anchor_mask.size(); const int b = gt_box.dims()[1]; int input_size = downsample_ratio * h; const int stride = h * w; const int an_stride = (class_num + 5) * stride; T label_pos = 1.0; T label_neg = 0.0; if (use_label_smooth) { T smooth_weight = std::min(1.0 / static_cast(class_num), 1.0 / 40); label_pos = 1.0 - smooth_weight; label_neg = smooth_weight; } const T* input_data = input->data(); const T* gt_box_data = gt_box.data(); const int* gt_label_data = gt_label.data(); loss->Resize({n}); T* loss_data = dev_ctx.template Alloc(loss); memset(loss_data, 0, loss->numel() * sizeof(T)); objness_mask->Resize({n, mask_num, h, w}); T* obj_mask_data = dev_ctx.template Alloc(objness_mask); memset(obj_mask_data, 0, objness_mask->numel() * sizeof(T)); gt_match_mask->Resize({n, b}); int* gt_match_mask_data = dev_ctx.template Alloc(gt_match_mask); const T* gt_score_data; DenseTensor gtscore; if (!(gt_score.is_initialized())) { gtscore.Resize({n, b}); dev_ctx.template Alloc(>score); phi::funcs::SetConstant()( dev_ctx, >score, static_cast(1.0)); gt_score_data = gtscore.data(); } else { gt_score_data = gt_score.get_ptr()->data(); } // calc valid gt box mask, avoid calc duplicately in following code DenseTensor gt_valid_mask; gt_valid_mask.Resize({n, b}); bool* gt_valid_mask_data = dev_ctx.template Alloc(>_valid_mask); GtValid(gt_valid_mask_data, gt_box_data, n, b); for (int i = 0; i < n; i++) { for (int j = 0; j < mask_num; j++) { for (int k = 0; k < h; k++) { for (int l = 0; l < w; l++) { // each predict box find a best match gt box, if overlap is bigger // then ignore_thresh, ignore the objectness loss. int box_idx = GetEntryIndex(i, j, k * w + l, mask_num, an_stride, stride, 0); Box pred = GetYoloBox(input_data, anchors, l, k, anchor_mask[j], h, input_size, box_idx, stride, scale, bias); T best_iou = 0; for (int t = 0; t < b; t++) { if (!gt_valid_mask_data[i * b + t]) { continue; } Box gt = GetGtBox(gt_box_data, i, b, t); T iou = CalcBoxIoU(pred, gt); if (iou > best_iou) { best_iou = iou; } } // If best IoU is bigger then ignore_thresh, // ignore the objectness loss. if (best_iou > ignore_thresh) { int obj_idx = (i * mask_num + j) * stride + k * w + l; obj_mask_data[obj_idx] = static_cast(-1); } // all losses should be calculated if best IoU // is bigger then truth thresh, but currently, // truth thresh is an unreachable value as 1.0. } } } for (int t = 0; t < b; t++) { if (!gt_valid_mask_data[i * b + t]) { gt_match_mask_data[i * b + t] = -1; continue; } Box gt = GetGtBox(gt_box_data, i, b, t); int gi = static_cast(gt.x * w); int gj = static_cast(gt.y * h); Box gt_shift = gt; gt_shift.x = 0.0; gt_shift.y = 0.0; T best_iou = 0.0; int best_n = 0; // each gt box find a best match anchor box as positive sample, // for positive sample, all losses should be calculated, and for // other samples, only objectness loss is required. for (int an_idx = 0; an_idx < an_num; an_idx++) { Box an_box; an_box.x = 0.0; an_box.y = 0.0; an_box.w = anchors[2 * an_idx] / static_cast(input_size); an_box.h = anchors[2 * an_idx + 1] / static_cast(input_size); float iou = CalcBoxIoU(an_box, gt_shift); if (iou > best_iou) { best_iou = iou; best_n = an_idx; } } int mask_idx = GetMaskIndex(anchor_mask, best_n); gt_match_mask_data[i * b + t] = mask_idx; if (mask_idx >= 0) { T score = gt_score_data[i * b + t]; int box_idx = GetEntryIndex( i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 0); CalcBoxLocationLoss(loss_data + i, input_data, gt, anchors, best_n, box_idx, gi, gj, h, input_size, stride, score); int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; obj_mask_data[obj_idx] = score; int label = gt_label_data[i * b + t]; int label_idx = GetEntryIndex( i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 5); CalcLabelLoss(loss_data + i, input_data, label_idx, label, class_num, stride, label_pos, label_neg, score); } } } CalcObjnessLoss(loss_data, input_data + 4 * stride, obj_mask_data, n, mask_num, h, w, stride, an_stride); } } // namespace phi PD_REGISTER_KERNEL( yolov3_loss, CPU, ALL_LAYOUT, phi::Yolov3LossKernel, float, double) {}