Unverified commit c410d7a9, authored by rogday, committed by GitHub

Merge pull request #20671 from rogday:yolov4x-mish

Add support for YOLOv4x-mish

* backport to 3.4 to support yolov4x-mish

* add YOLOv4x-mish test

* address review comments
Co-authored-by: Guo Xu <guoxu@1school.com.cn>
Parent 6fa63dcc
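For context, and not part of the patch itself, a minimal sketch of loading the new model through the public DNN API. The cfg/weights file names follow the test added below; the sample image name and the 416x416 input size are assumptions and may need to match the network's actual width/height.

#include <opencv2/dnn.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

int main()
{
    // Load the Darknet model whose [yolo] sections now carry the new_coords key.
    cv::dnn::Net net = cv::dnn::readNetFromDarknet("yolov4x-mish.cfg", "yolov4x-mish.weights");
    // Per the supportBackend change below, nGraph is excluded when new_coords != 0,
    // so the Region layer runs on the OpenCV backend.
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_OPENCV);

    cv::Mat img = cv::imread("dog416.png");  // assumed sample image
    cv::Mat blob = cv::dnn::blobFromImage(img, 1.0 / 255, cv::Size(416, 416),
                                          cv::Scalar(), /*swapRB=*/true, /*crop=*/false);
    net.setInput(blob);

    std::vector<cv::Mat> outs;
    net.forward(outs, net.getUnconnectedOutLayersNames());  // one output per [yolo] head
    return 0;
}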
@@ -470,7 +470,7 @@ namespace cv {
fused_layer_names.push_back(last_layer);
}
void setYolo(int classes, const std::vector<int>& mask, const std::vector<float>& anchors, float thresh, float nms_threshold, float scale_x_y)
void setYolo(int classes, const std::vector<int>& mask, const std::vector<float>& anchors, float thresh, float nms_threshold, float scale_x_y, int new_coords)
{
cv::dnn::LayerParams region_param;
region_param.name = "Region-name";
@@ -484,6 +484,7 @@ namespace cv {
region_param.set<float>("thresh", thresh);
region_param.set<float>("nms_threshold", nms_threshold);
region_param.set<float>("scale_x_y", scale_x_y);
region_param.set<int>("new_coords", new_coords);
std::vector<float> usedAnchors(numAnchors * 2);
for (int i = 0; i < numAnchors; ++i)
@@ -882,6 +883,7 @@ namespace cv {
float thresh = getParam<float>(layer_params, "thresh", 0.2);
float nms_threshold = getParam<float>(layer_params, "nms_threshold", 0.0);
float scale_x_y = getParam<float>(layer_params, "scale_x_y", 1.0);
int new_coords = getParam<int>(layer_params, "new_coords", 0);
std::string anchors_values = getParam<std::string>(layer_params, "anchors", std::string());
CV_Assert(!anchors_values.empty());
@@ -894,7 +896,7 @@ namespace cv {
CV_Assert(classes > 0 && num_of_anchors > 0 && (num_of_anchors * 2) == anchors_vec.size());
setParams.setPermute(false);
setParams.setYolo(classes, mask_vec, anchors_vec, thresh, nms_threshold, scale_x_y);
setParams.setYolo(classes, mask_vec, anchors_vec, thresh, nms_threshold, scale_x_y, new_coords);
}
else {
CV_Error(cv::Error::StsParseError, "Unknown layer type: " + layer_type);
......
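To make the parameter flow above concrete: a hypothetical sketch (not part of the patch) of the LayerParams that setYolo() builds for a [yolo] section, with the new new_coords flag forwarded from the cfg parser. The numeric values are placeholders, not values from yolov4x-mish.cfg.

#include <opencv2/dnn/all_layers.hpp>

// Illustrative helper mirroring what setYolo() does with the parsed cfg values.
cv::Ptr<cv::dnn::RegionLayer> makeRegionForIllustration()
{
    cv::dnn::LayerParams params;
    params.name = "Region-name";
    params.type = "Region";
    params.set<int>("classes", 80);        // placeholder values
    params.set<float>("thresh", 0.2f);
    params.set<float>("nms_threshold", 0.0f);
    params.set<float>("scale_x_y", 1.05f);
    params.set<int>("new_coords", 1);      // the key added by this change
    // anchors, mask and the bias blob are omitted; setYolo() fills them from the cfg.
    return cv::dnn::RegionLayer::create(params);
}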
@@ -64,6 +64,7 @@ class RegionLayerImpl CV_FINAL : public RegionLayer
public:
int coords, classes, anchors, classfix;
float thresh, nmsThreshold, scale_x_y;
int new_coords;
bool useSoftmax, useLogistic;
#ifdef HAVE_OPENCL
UMat blob_umat;
@@ -83,6 +84,7 @@ public:
useLogistic = params.get<bool>("logistic", false);
nmsThreshold = params.get<float>("nms_threshold", 0.4);
scale_x_y = params.get<float>("scale_x_y", 1.0); // Yolov4
new_coords = params.get<int>("new_coords", 0); // Yolov4x-mish
CV_Assert(nmsThreshold >= 0.);
CV_Assert(coords == 4);
@@ -113,7 +115,7 @@ public:
{
#ifdef HAVE_DNN_NGRAPH
if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_2) && preferableTarget != DNN_TARGET_MYRIAD;
return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2020_2) && preferableTarget != DNN_TARGET_MYRIAD && new_coords == 0;
#endif
return backendId == DNN_BACKEND_OPENCV;
}
@@ -259,26 +261,28 @@ public:
const float *srcData = inpBlob.ptr<float>();
float *dstData = outBlob.ptr<float>();
// logistic activation for t0, for each grid cell (X x Y x Anchor-index)
for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
int index = cell_size*i;
float x = srcData[index + 4];
dstData[index + 4] = logistic_activate(x); // logistic activation
}
if (useSoftmax) { // Yolo v2
if (new_coords == 0) {
// logistic activation for t0, for each grid cell (X x Y x Anchor-index)
for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
int index = cell_size*i;
softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5);
float x = srcData[index + 4];
dstData[index + 4] = logistic_activate(x); // logistic activation
}
}
else if (useLogistic) { // Yolo v3
for (int i = 0; i < batch_size*rows*cols*anchors; ++i){
int index = cell_size*i;
const float* input = srcData + index + 5;
float* output = dstData + index + 5;
for (int c = 0; c < classes; ++c)
output[c] = logistic_activate(input[c]);
if (useSoftmax) { // Yolo v2
for (int i = 0; i < batch_size*rows*cols*anchors; ++i) {
int index = cell_size*i;
softmax_activate(srcData + index + 5, classes, 1, dstData + index + 5);
}
}
else if (useLogistic) { // Yolo v3
for (int i = 0; i < batch_size*rows*cols*anchors; ++i){
int index = cell_size*i;
const float* input = srcData + index + 5;
float* output = dstData + index + 5;
for (int c = 0; c < classes; ++c)
output[c] = logistic_activate(input[c]);
}
}
}
for (int b = 0; b < batch_size; ++b)
@@ -290,20 +294,46 @@ public:
int index = (y*cols + x)*anchors + a; // index for each grid-cell & anchor
int p_index = index_sample_offset + index * cell_size + 4;
float scale = dstData[p_index];
if (classfix == -1 && scale < .5) scale = 0; // if(t0 < 0.5) t0 = 0;
if (classfix == -1 && scale < .5)
{
scale = 0; // if(t0 < 0.5) t0 = 0;
}
int box_index = index_sample_offset + index * cell_size;
float x_tmp = (logistic_activate(srcData[box_index + 0]) - 0.5f) * scale_x_y + 0.5f;
float y_tmp = (logistic_activate(srcData[box_index + 1]) - 0.5f) * scale_x_y + 0.5f;
dstData[box_index + 0] = (x + x_tmp) / cols;
dstData[box_index + 1] = (y + y_tmp) / rows;
dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / wNorm;
dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / hNorm;
int class_index = index_sample_offset + index * cell_size + 5;
for (int j = 0; j < classes; ++j) {
float prob = scale*dstData[class_index + j]; // prob = IoU(box, object) = t0 * class-probability
dstData[class_index + j] = (prob > thresh) ? prob : 0; // if (IoU < threshold) IoU = 0;
if (new_coords == 1) {
float x_tmp = (srcData[box_index + 0] - 0.5f) * scale_x_y + 0.5f;
float y_tmp = (srcData[box_index + 1] - 0.5f) * scale_x_y + 0.5f;
dstData[box_index + 0] = (x + x_tmp) / cols;
dstData[box_index + 1] = (y + y_tmp) / rows;
dstData[box_index + 2] = (srcData[box_index + 2]) * (srcData[box_index + 2]) * 4 * biasData[2 * a] / wNorm;
dstData[box_index + 3] = (srcData[box_index + 3]) * (srcData[box_index + 3]) * 4 * biasData[2 * a + 1] / hNorm;
scale = srcData[p_index];
if (classfix == -1 && scale < thresh)
{
scale = 0; // if (t0 < thresh) t0 = 0;
}
int class_index = index_sample_offset + index * cell_size + 5;
for (int j = 0; j < classes; ++j) {
float prob = scale*srcData[class_index + j]; // prob = IoU(box, object) = t0 * class-probability
dstData[class_index + j] = (prob > thresh) ? prob : 0; // if (IoU < threshold) IoU = 0;
}
}
else
{
float x_tmp = (logistic_activate(srcData[box_index + 0]) - 0.5f) * scale_x_y + 0.5f;
float y_tmp = (logistic_activate(srcData[box_index + 1]) - 0.5f) * scale_x_y + 0.5f;
dstData[box_index + 0] = (x + x_tmp) / cols;
dstData[box_index + 1] = (y + y_tmp) / rows;
dstData[box_index + 2] = exp(srcData[box_index + 2]) * biasData[2 * a] / wNorm;
dstData[box_index + 3] = exp(srcData[box_index + 3]) * biasData[2 * a + 1] / hNorm;
int class_index = index_sample_offset + index * cell_size + 5;
for (int j = 0; j < classes; ++j) {
float prob = scale*dstData[class_index + j]; // prob = IoU(box, object) = t0 * class-probability
dstData[class_index + j] = (prob > thresh) ? prob : 0; // if (IoU < threshold) IoU = 0;
}
}
}
if (nmsThreshold > 0) {
......
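For reference, a condensed standalone sketch (illustrative names, not library API) of the two decoding paths implemented in the Region layer above: with new_coords == 1 the network already applies the activations, so x/y skip logistic_activate() and w/h use (2*t)^2 * anchor instead of exp(t) * anchor.

#include <cmath>

struct Box { float x, y, w, h; };

static inline float logistic(float x) { return 1.f / (1.f + std::exp(-x)); }

// Decodes one anchor's raw outputs t[0..3] for the grid cell (col, row).
Box decodeCell(const float* t, int col, int row, int cols, int rows,
               float anchorW, float anchorH, float wNorm, float hNorm,
               float scale_x_y, int new_coords)
{
    Box b;
    if (new_coords == 1)
    {
        b.x = (col + (t[0] - 0.5f) * scale_x_y + 0.5f) / cols;
        b.y = (row + (t[1] - 0.5f) * scale_x_y + 0.5f) / rows;
        b.w = t[2] * t[2] * 4.f * anchorW / wNorm;   // (2*t)^2 * anchor
        b.h = t[3] * t[3] * 4.f * anchorH / hNorm;
    }
    else
    {
        b.x = (col + (logistic(t[0]) - 0.5f) * scale_x_y + 0.5f) / cols;
        b.y = (row + (logistic(t[1]) - 0.5f) * scale_x_y + 0.5f) / rows;
        b.w = std::exp(t[2]) * anchorW / wNorm;
        b.h = std::exp(t[3]) * anchorH / hNorm;
    }
    return b;
}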
@@ -681,6 +681,78 @@ TEST_P(Test_Darknet_nets, YOLOv4_tiny)
#endif
}
TEST_P(Test_Darknet_nets, YOLOv4x_mish)
{
applyTestTag(CV_TEST_TAG_LONG, (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB));
#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000) // nGraph compilation failure
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target == DNN_TARGET_OPENCL_FP16)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#endif
#if defined(INF_ENGINE_RELEASE)
if (target == DNN_TARGET_MYRIAD) // NC_OUT_OF_MEMORY
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
#endif
// batchId, classId, confidence, left, top, right, bottom
const int N0 = 3;
const int N1 = 5;
static const float ref_[/* (N0 + N1) * 7 */] = {
0, 16, 0.925536f, 0.17188f, 0.386832f, 0.406138f, 0.941696f,
0, 1, 0.912028f, 0.162125f, 0.208863f, 0.741316f, 0.729332f,
0, 7, 0.841018f, 0.608953f, 0.128653f, 0.900692f, 0.295657f,
1, 2, 0.925697f, 0.650438f, 0.458118f, 0.813927f, 0.661775f,
1, 0, 0.882156f, 0.203644f, 0.365763f, 0.265473f, 0.632195f,
1, 2, 0.848857f, 0.451044f, 0.462997f, 0.496629f, 0.522719f,
1, 9, 0.736015f, 0.374503f, 0.316029f, 0.399358f, 0.392883f,
1, 9, 0.727129f, 0.662469f, 0.373687f, 0.687877f, 0.441335f,
};
Mat ref(N0 + N1, 7, CV_32FC1, (void*)ref_);
double scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.006 : 8e-5;
double iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 0.042 : 3e-4;
std::string config_file = "yolov4x-mish.cfg";
std::string weights_file = "yolov4x-mish.weights";
#if defined(INF_ENGINE_RELEASE)
if ((backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 ||
backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && target == DNN_TARGET_MYRIAD &&
getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
{
scoreDiff = 0.04;
iouDiff = 0.2;
}
#endif
{
SCOPED_TRACE("batch size 1");
testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff);
}
{
SCOPED_TRACE("batch size 2");
#if defined(INF_ENGINE_RELEASE)
if (backend == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019)
{
if (target == DNN_TARGET_OPENCL)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
else if (target == DNN_TARGET_OPENCL_FP16 && INF_ENGINE_VER_MAJOR_LE(202010000))
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16, CV_TEST_TAG_DNN_SKIP_IE_VERSION);
else if (target == DNN_TARGET_MYRIAD &&
getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X)
applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X);
}
#endif
testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
}
}
INSTANTIATE_TEST_CASE_P(/**/, Test_Darknet_nets, dnnBackendsAndTargets());
......