diff --git a/inference/CMakeLists.txt b/inference/CMakeLists.txt
index 994befc87be458ecab679c637d17cfd6239019fb..86b378b2ed0c7cac4f2269f4f94a2165c1db81e8 100644
--- a/inference/CMakeLists.txt
+++ b/inference/CMakeLists.txt
@@ -82,7 +82,7 @@ if (WIN32)
         add_definitions(-DSTATIC_LIB)
     endif()
 else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -o2 -std=c++11")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -fopenmp -std=c++11")  # capital -O2: lowercase -o2 is parsed as an output-file name, not an optimization level
     set(CMAKE_STATIC_LIBRARY_PREFIX "")
 endif()
 
diff --git a/inference/predictor/seg_predictor.cpp b/inference/predictor/seg_predictor.cpp
index ee32d75561e5d93fa11c7013d1a4a9f845dc9919..d70084f67f2f3c38624e15fc6a454aca22482572 100644
--- a/inference/predictor/seg_predictor.cpp
+++ b/inference/predictor/seg_predictor.cpp
@@ -1,4 +1,5 @@
 #include "seg_predictor.h"
+#include <unsupported/Eigen/CXX11/Tensor>
 
 namespace PaddleSolution {
 
@@ -78,26 +79,8 @@ namespace PaddleSolution {
             //post process
             _mask.clear();
             _scoremap.clear();
-            int out_img_len = eval_height * eval_width;
-            for (int i = 0; i < out_img_len; ++i) {
-                float max_value = -1;
-                int label = 0;
-                for (int j = 0; j < eval_num_class; ++j) {
-                    int index = i + j * out_img_len;
-                    if (index >= blob_out_len) {
-                        break;
-                    }
-                    float value = p_out[index];
-                    if (value > max_value) {
-                        max_value = value;
-                        label = j;
-                    }
-                }
-                if (label == 0) max_value = 0;
-                _mask[i] = uchar(label);
-                _scoremap[i] = uchar(max_value * 255);
-            }
-
+            std::vector<int> out_shape{eval_num_class, eval_height, eval_width};
+            utils::argmax(p_out, out_shape, _mask, _scoremap);
             cv::Mat mask_png = cv::Mat(eval_height, eval_width, CV_8UC1);
             mask_png.data = _mask.data();
             std::string nname(fname);
@@ -251,6 +234,7 @@ namespace PaddleSolution {
                     int idx = u * default_batch_size + i;
                     imgs_batch.push_back(imgs[idx]);
                 }
+
                 if (!_preprocessor->batch_process(imgs_batch, input_buffer.data(), org_height.data(), org_width.data())) {
                     return -1;
                 }
diff --git a/inference/preprocessor/preprocessor_seg.cpp b/inference/preprocessor/preprocessor_seg.cpp
index a3177da5cbb907c27a05d8c5e9290fc70ef9ab02..c2d056bfd2706ad441b96d76165804c0d81cdfaf 100644
--- a/inference/preprocessor/preprocessor_seg.cpp
+++ b/inference/preprocessor/preprocessor_seg.cpp
@@ -32,21 +32,7 @@ namespace PaddleSolution {
         if (*ori_h != rh || *ori_w != rw) {
             cv::resize(im, im, resize_size, 0, 0, cv::INTER_LINEAR);
         }
-
-        float* pmean = _config->_mean.data();
-        float* pscale = _config->_std.data();
-        for (int h = 0; h < rh; ++h) {
-            const uchar* ptr = im.ptr<uchar>(h);
-            int im_index = 0;
-            for (int w = 0; w < rw; ++w) {
-                for (int c = 0; c < channels; ++c) {
-                    int top_index = (c * rh + h) * rw + w;
-                    float pixel = static_cast<float>(ptr[im_index++]);
-                    pixel = (pixel / 255 - pmean[c]) / pscale[c];
-                    data[top_index] = pixel;
-                }
-            }
-        }
+        utils::normalize(im, data, _config->_mean, _config->_std);
         return true;
     }
 
diff --git a/inference/preprocessor/preprocessor_seg.h b/inference/preprocessor/preprocessor_seg.h
index 8c280ab1d9a4e972de55e9afd2935a3a28e6bd90..eba904b8949b3c000799ee84541699989fea425a 100644
--- a/inference/preprocessor/preprocessor_seg.h
+++ b/inference/preprocessor/preprocessor_seg.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "preprocessor.h"
+#include "utils/utils.h"
 
 namespace PaddleSolution {
 
diff --git a/inference/utils/utils.h b/inference/utils/utils.h
index e349618a28282257b01ac44d661f292850cc19b9..894636499bb55b9018cd40072455ae5cedd8a63f 100644
--- a/inference/utils/utils.h
+++ b/inference/utils/utils.h
@@ -4,6 +4,10 @@
 #include <vector>
 #include <string>
 
+#include <opencv2/core/core.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/highgui/highgui.hpp>
+
 #ifdef _WIN32
 #include <filesystem>
 #else
@@ -59,5 +63,58 @@ namespace PaddleSolution {
             return imgs;
         }
         #endif
+
+        // Convert an interleaved (HWC) image, read row by row as uchar, into planar (CHW) floats:
+        // data[(c*H + h)*W + w] = (pixel/255 - fmean[c]) / fstd[c]. Channel order is kept; no BGR/RGB swap.
+        inline void normalize(cv::Mat& im, float* data, std::vector<float>& fmean, std::vector<float>& fstd) {
+            const int height = im.rows;
+            const int width = im.cols;
+            const int depth = im.channels();
+            const double inv255 = (double)1.0 / 255.0;
+            // Each row writes a disjoint set of output slots, so rows are split across OpenMP threads.
+            #pragma omp parallel for
+            for (int row = 0; row < height; ++row) {
+                const uchar* src = im.ptr<uchar>(row);
+                for (int col = 0; col < width; ++col) {
+                    for (int ch = 0; ch < depth; ++ch) {
+                        const float raw = static_cast<float>(src[col * depth + ch]);
+                        const float scaled = (raw * inv255 - fmean[ch]) / fstd[ch];
+                        data[(ch * height + row) * width + col] = scaled;
+                    }
+                }
+            }
+        }
+
+        // Per-pixel argmax over a CHW score blob: `out` holds shape[0] class planes of shape[1] x shape[2]
+        // floats. mask[i] receives the winning class index and scoremap[i] its score scaled by 255;
+        // pixels won by class 0 get score 0. Both vectors are grown to shape[1]*shape[2] if smaller.
+        inline void argmax(float* out, std::vector<int>& shape, std::vector<uchar>& mask, std::vector<uchar>& scoremap) {
+            if (out == nullptr || shape.size() < 3) {
+                return;
+            }
+            const int num_class = shape[0];
+            const int out_img_len = shape[1] * shape[2];
+            // Callers may pass cleared vectors; indexing past size() would be undefined behaviour.
+            const std::size_t needed = static_cast<std::size_t>(out_img_len > 0 ? out_img_len : 0);
+            if (mask.size() < needed) mask.resize(needed);
+            if (scoremap.size() < needed) scoremap.resize(needed);
+            // Parallelise over pixels only: each thread scans every class for its own pixel using
+            // thread-local max/label, so the label always matches the maximum and no state is shared.
+            #pragma omp parallel for
+            for (int i = 0; i < out_img_len; ++i) {
+                float max_value = -1;
+                int label = 0;
+                for (int j = 0; j < num_class; ++j) {
+                    const float value = out[i + j * out_img_len];
+                    if (value > max_value) {
+                        max_value = value;
+                        label = j;
+                    }
+                }
+                if (label == 0) max_value = 0;
+                mask[i] = uchar(label);
+                scoremap[i] = uchar(max_value * 255);
+            }
+        }
     }
 }