improve c++ normalize and argmax performance

f10169ae · sjtubinlong · 5b80d74e · f10169ae · f10169ae · f10169ae
5 changed files
--- a/inference/CMakeLists.txt
+++ b/inference/CMakeLists.txt
@@ -82,7 +82,7 @@ if (WIN32)
        add_definitions(-DSTATIC_LIB)
    endif()
 else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -o2 -std=c++11")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fopenmp -std=c++11")  # -O2 (capital O) optimizes; lowercase -o only names an output file
    set(CMAKE_STATIC_LIBRARY_PREFIX "")
 endif()


--- a/inference/predictor/seg_predictor.cpp
+++ b/inference/predictor/seg_predictor.cpp
 #include "seg_predictor.h"
+#include <unsupported/Eigen/CXX11/Tensor>

 namespace PaddleSolution {

@@ -78,26 +79,8 @@ namespace PaddleSolution {
            //post process
            _mask.clear();
            _scoremap.clear();
-            int out_img_len = eval_height * eval_width;
-            for (int i = 0; i < out_img_len; ++i) {
-                float max_value = -1;
-                int label = 0;
-                for (int j = 0; j < eval_num_class; ++j) {
-                    int index = i + j * out_img_len;
-                    if (index >= blob_out_len) {
-                        break;
-                    }
-                    float value = p_out[index];
-                    if (value > max_value) {
-                        max_value = value;
-                        label = j;
-                    }
-                }
-                if (label == 0) max_value = 0;
-                _mask[i] = uchar(label);
-                _scoremap[i] = uchar(max_value * 255);
-            }
-
+            std::vector<int> out_shape{eval_num_class, eval_height, eval_width};
+            utils::argmax(p_out, out_shape, _mask, _scoremap);
            cv::Mat mask_png = cv::Mat(eval_height, eval_width, CV_8UC1);
            mask_png.data = _mask.data();
            std::string nname(fname);
@@ -251,6 +234,7 @@ namespace PaddleSolution {
                    int idx = u * default_batch_size + i;
                    imgs_batch.push_back(imgs[idx]);
                }
+
                if (!_preprocessor->batch_process(imgs_batch, input_buffer.data(), org_height.data(), org_width.data())) {
                    return -1;
                }

--- a/inference/preprocessor/preprocessor_seg.cpp
+++ b/inference/preprocessor/preprocessor_seg.cpp
@@ -32,21 +32,7 @@ namespace PaddleSolution {
        if (*ori_h != rh || *ori_w != rw) {
            cv::resize(im, im, resize_size, 0, 0, cv::INTER_LINEAR);
        }
-
-        float* pmean = _config->_mean.data();
-        float* pscale = _config->_std.data();
-        for (int h = 0; h < rh; ++h) {
-            const uchar* ptr = im.ptr<uchar>(h);
-            int im_index = 0;
-            for (int w = 0; w < rw; ++w) {
-                for (int c = 0; c < channels; ++c) {
-                    int top_index = (c * rh + h) * rw + w;
-                    float pixel = static_cast<float>(ptr[im_index++]);
-                    pixel = (pixel / 255 - pmean[c]) / pscale[c];
-                    data[top_index] = pixel;
-                }
-            }
-        }
+        utils::normalize(im, data, _config->_mean, _config->_std);
        return true;
    }


--- a/inference/preprocessor/preprocessor_seg.h
+++ b/inference/preprocessor/preprocessor_seg.h
 #pragma once

 #include "preprocessor.h"
+#include "utils/utils.h"

 namespace PaddleSolution {


--- a/inference/utils/utils.h
+++ b/inference/utils/utils.h
@@ -4,6 +4,10 @@
 #include <vector>
 #include <string>

+#include <opencv2/core/core.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/highgui/highgui.hpp>
+
 #ifdef _WIN32
 #include <filesystem>
 #else
@@ -59,5 +63,58 @@ namespace PaddleSolution {
            return imgs;
        }
        #endif
+
+        // Scale uchar pixels by 1/255, apply (x - mean[c]) / std[c], and repack interleaved HWC into planar CHW.
+        // NOTE: channel order is copied unchanged (no BGR<->RGB swap happens here); `data` needs rows*cols*channels floats.
+        inline void normalize(cv::Mat& im, float* data, std::vector<float>& fmean, std::vector<float>& fstd) {
+            const int height = im.rows;
+            const int width = im.cols;
+            const int depth = im.channels();
+            const double inv255 = 1.0 / 255.0;
+            // Every row writes a disjoint set of output slots, so rows are split across threads.
+            #pragma omp parallel for
+            for (int row = 0; row < height; ++row) {
+                const uchar* src = im.ptr<uchar>(row);
+                for (int col = 0; col < width; ++col) {
+                    for (int ch = 0; ch < depth; ++ch) {
+                        // interleaved source element -> planar destination slot
+                        const float raw = static_cast<float>(src[col * depth + ch]);
+                        data[(ch * height + row) * width + col] = (raw * inv255 - fmean[ch]) / fstd[ch];
+                    }
+                }
+            }
+        }
+
+        // argmax over the class axis of a class-major (CHW) score blob.
+        // out: shape[0] * shape[1] * shape[2] floats (classes x height x width).
+        // mask[i] receives the winning class of pixel i; scoremap[i] receives its score * 255,
+        // forced to 0 when class 0 wins. Scores <= -1 never beat the initial -1 sentinel.
+        // Both output vectors are grown to height * width when shorter, keeping writes in bounds.
+        inline void argmax(float* out, std::vector<int>& shape, std::vector<uchar>& mask, std::vector<uchar>& scoremap) {
+            if (out == nullptr || shape.size() < 3) return;
+            const int num_class = shape[0];
+            const int out_img_len = shape[1] * shape[2];
+            if (out_img_len <= 0) return;
+            // Callers may pass cleared vectors; size them before indexing.
+            if (static_cast<int>(mask.size()) < out_img_len) mask.resize(out_img_len);
+            if (static_cast<int>(scoremap.size()) < out_img_len) scoremap.resize(out_img_len);
+            // Pixels are independent, so the loop is split across threads by pixel; the running
+            // best lives in per-iteration locals and no mutable state is shared between threads.
+            #pragma omp parallel for
+            for (int i = 0; i < out_img_len; ++i) {
+                float max_value = -1;
+                int label = 0;
+                for (int j = 0; j < num_class; ++j) {
+                    const float value = out[i + j * out_img_len];
+                    if (value > max_value) {
+                        max_value = value;
+                        label = j;
+                    }
+                }
+                if (label == 0) max_value = 0;
+                mask[i] = static_cast<uchar>(label);
+                scoremap[i] = static_cast<uchar>(max_value * 255);
+            }
+        }
    }
 }