add cpu affinity option (#491)

d8a27de9 · BUG1989 · GitHub · 0620c3b0 · d8a27de9 · d8a27de9
25 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,7 +62,7 @@ option(TENGINE_OPENMP "openmp support" ON)
 option(TENGINE_BUILD_BENCHMARK "build benchmark" ON)
 option(TENGINE_BUILD_EXAMPLES "build examples" ON)
 option(TENGINE_BUILD_TESTS "build tests" OFF)
-option(TENGINE_BUILD_CPP_API "build C++ API" ON)
+option(TENGINE_BUILD_CPP_API "build C++ API" OFF)
 option(TENGINE_DEBUG_DATA "extract data for every layer" OFF)
 option(TENGINE_DEBUG_TIME "print time information for every layer" OFF)
 option(TENGINE_DEBUG_MEM_STAT "print memory status for library" OFF)

--- a/benchmark/tm_benchmark.c
+++ b/benchmark/tm_benchmark.c
@@ -33,6 +33,7 @@
 #define DEFAULT_LOOP_COUNT      1
 #define DEFAULT_THREAD_COUNT    1
 #define DEFAULT_CLUSTER         TENGINE_CLUSTER_ALL
+#define DEFAULT_CPU_AFFINITY    255
 int loop_counts = DEFAULT_LOOP_COUNT;
@@ -142,9 +143,10 @@ int main(int argc, char* argv[])
    int select_num  = -1;
    int num_threads = DEFAULT_THREAD_COUNT;
    int power       = DEFAULT_CLUSTER;
+    int affinity    = DEFAULT_CPU_AFFINITY;
    int res;
-    while ((res = getopt(argc, argv, "r:t:p:s:h")) != -1)
+    while ((res = getopt(argc, argv, "r:t:p:s:a:h")) != -1)
    {
        switch (res)
        {
@@ -160,6 +162,9 @@ int main(int argc, char* argv[])
            case 's':
                select_num = atoi(optarg);
                break;
+            case 'a':
+                affinity = atoi(optarg);
+                break;                
            case 'h':
                show_usage();
                return 0;
@@ -171,6 +176,7 @@ int main(int argc, char* argv[])
    fprintf(stderr, "loop_counts = %d\n", loop_counts);
    fprintf(stderr, "num_threads = %d\n", num_threads);
    fprintf(stderr, "power       = %d\n", power);
+    fprintf(stderr, "affinity    = %d\n", affinity);
    /* inital tengine */
    if (init_tengine() != 0)
@@ -183,6 +189,7 @@ int main(int argc, char* argv[])
    struct options opt;
    opt.num_thread = num_threads;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = affinity;
    switch (power)
    {

--- a/examples/tm_alphapose.cpp
+++ b/examples/tm_alphapose.cpp
@@ -355,6 +355,7 @@ bool tengine_predict(float * input_data, graph_t graph, const int input_dims[4],
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
    if (input_tensor == NULL)

--- a/examples/tm_classification.c
+++ b/examples/tm_classification.c
@@ -40,15 +40,17 @@
 #define DEFAULT_MEAN3 122.679
 #define DEFAULT_LOOP_COUNT 1
 #define DEFAULT_THREAD_COUNT 1
+#define DEFAULT_CPU_AFFINITY 255
 int tengine_classify(const char* model_file, const char* image_file, int img_h, int img_w, const float* mean,
-                     const float* scale, int loop_count, int num_thread)
+                     const float* scale, int loop_count, int num_thread, int affinity)
 {
    /* set runtime options */
    struct options opt;
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = affinity;
    /* inital tengine */
    if (init_tengine() != 0)
@@ -151,7 +153,7 @@ void show_usage()
    fprintf(
        stderr,
        "[Usage]:  [-h]\n    [-m model_file] [-i image_file]\n [-g img_h,img_w] [-s scale[0],scale[1],scale[2]] [-w "
-        "mean[0],mean[1],mean[2]] [-r loop_count] [-t thread_count]\n");
+        "mean[0],mean[1],mean[2]] [-r loop_count] [-t thread_count] [-a cpu_affinity]\n");
    fprintf(
        stderr,
        "\nmobilenet example: \n    ./classification -m /path/to/mobilenet.tmfile -i /path/to/img.jpg -g 224,224 -s "
@@ -162,6 +164,7 @@ int main(int argc, char* argv[])
 {
    int loop_count = DEFAULT_LOOP_COUNT;
    int num_thread = DEFAULT_THREAD_COUNT;
+    int cpu_affinity = DEFAULT_CPU_AFFINITY;
    char* model_file = NULL;
    char* image_file = NULL;
    float img_hw[2] = {0.f};
@@ -171,7 +174,7 @@ int main(int argc, char* argv[])
    float scale[3] = {0.f, 0.f, 0.f};
    int res;
-    while ((res = getopt(argc, argv, "m:i:l:g:s:w:r:t:h")) != -1)
+    while ((res = getopt(argc, argv, "m:i:l:g:s:w:r:t:a:h")) != -1)
    {
        switch (res)
        {
@@ -198,6 +201,9 @@ int main(int argc, char* argv[])
            case 't':
                num_thread = atoi(optarg);
                break;
+            case 'a':
+                cpu_affinity = atoi(optarg);
+                break;
            case 'h':
                show_usage();
                return 0;
@@ -252,7 +258,7 @@ int main(int argc, char* argv[])
        fprintf(stderr, "Mean value not specified, use default   %.1f, %.1f, %.1f\n", mean[0], mean[1], mean[2]);
    }
-    if (tengine_classify(model_file, image_file, img_h, img_w, mean, scale, loop_count, num_thread) < 0)
+    if (tengine_classify(model_file, image_file, img_h, img_w, mean, scale, loop_count, num_thread, cpu_affinity) < 0)
        return -1;
    return 0;

--- a/examples/tm_classification_fp16.c
+++ b/examples/tm_classification_fp16.c
@@ -62,6 +62,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP16;
+    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)

--- a/examples/tm_classification_int8.c
+++ b/examples/tm_classification_int8.c
@@ -70,6 +70,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_INT8;
+    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)

--- a/examples/tm_classification_uint8.c
+++ b/examples/tm_classification_uint8.c
@@ -70,6 +70,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_UINT8;
+    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)

--- a/examples/tm_classification_vulkan.c
+++ b/examples/tm_classification_vulkan.c
@@ -49,6 +49,7 @@ int tengine_classify(const char* model_file, const char* image_file, int img_h,
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)

--- a/examples/tm_crnn.cpp
+++ b/examples/tm_crnn.cpp
@@ -216,6 +216,7 @@ int main(int argc, char* argv[])
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)

--- a/examples/tm_landmark.cpp
+++ b/examples/tm_landmark.cpp
@@ -110,6 +110,7 @@ int main(int argc, char* argv[])
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
    /* inital tengine */
    init_tengine();

--- a/examples/tm_landmark_uint8.cpp
+++ b/examples/tm_landmark_uint8.cpp
@@ -119,6 +119,7 @@ int main(int argc, char* argv[])
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_UINT8;
+    opt.affinity = 0;
    /* inital tengine */
    init_tengine();

--- a/examples/tm_mobilenet_ssd.c
+++ b/examples/tm_mobilenet_ssd.c
@@ -157,6 +157,7 @@ int main(int argc, char* argv[])
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
    /* inital tengine */
    init_tengine();

--- a/examples/tm_mobilenet_ssd_acl.c
+++ b/examples/tm_mobilenet_ssd_acl.c
@@ -156,7 +156,8 @@ int main(int argc, char* argv[])
    struct options opt;
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
-    opt.precision = TENGINE_MODE_FP32;        
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;    
    /* inital tengine */
    init_tengine();

--- a/examples/tm_mobilenet_ssd_uint8.cpp
+++ b/examples/tm_mobilenet_ssd_uint8.cpp
@@ -173,6 +173,7 @@ int main(int argc, char* argv[])
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_UINT8;
+    opt.affinity = 0;
    // init tengine
    if (init_tengine() < 0)

--- a/examples/tm_openpose.cpp
+++ b/examples/tm_openpose.cpp
@@ -176,6 +176,7 @@ int main(int argc, char* argv[])
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
    /* inital tengine */
    init_tengine();

--- a/examples/tm_retinaface.cpp
+++ b/examples/tm_retinaface.cpp
@@ -452,7 +452,8 @@ int main(int argc, char* argv[])
    struct options opt;
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
-    opt.precision = TENGINE_MODE_FP32;        
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;       
    /* inital tengine */
    int ret = init_tengine();

--- a/examples/tm_yolact.cpp
+++ b/examples/tm_yolact.cpp
@@ -224,6 +224,7 @@ static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects, const
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)

--- a/examples/tm_yolov3_tiny.cpp
+++ b/examples/tm_yolov3_tiny.cpp
@@ -699,6 +699,7 @@ int main(int argc, char* argv[])
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)

--- a/examples/tm_yolov3_uint8.cpp
+++ b/examples/tm_yolov3_uint8.cpp
@@ -713,6 +713,7 @@ int main(int argc, char* argv[])
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_UINT8;
+    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)

--- a/examples/tm_yolov4.cpp
+++ b/examples/tm_yolov4.cpp
@@ -380,7 +380,8 @@ int main(int argc, char* argv[])
    struct options opt;
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
-    opt.precision = TENGINE_MODE_FP32;        
+    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;     
    /* inital tengine */
    if (init_tengine() != 0)

--- a/examples/tm_yolov4_tiny.cpp
+++ b/examples/tm_yolov4_tiny.cpp
@@ -700,6 +700,7 @@ int main(int argc, char* argv[])
    opt.num_thread = num_thread;
    opt.cluster = TENGINE_CLUSTER_ALL;
    opt.precision = TENGINE_MODE_FP32;
+    opt.affinity = 0;
    /* inital tengine */
    if (init_tengine() != 0)

--- a/include/tengine_c_api.h
+++ b/include/tengine_c_api.h
@@ -136,6 +136,7 @@ struct options
    int num_thread;
    int cluster;
    int precision;
+    uint64_t affinity;
 };
 /* performance profiling records */

--- a/src/lib/cpu.c
+++ b/src/lib/cpu.c
@@ -37,18 +37,20 @@
 * Author: lswang@openailab.com
 */
+#include "cpu.h"
 #include <stdio.h>
 #include <string.h>
 #include <limits.h>
 #include "tengine_c_api.h"
-//#ifndef __ANDROID__
+#ifndef _MSC_VER
+#include <pthread.h>
 #include <sys/syscall.h>
 #include <sched.h>
 #include <unistd.h>
 #include <stdint.h>
-//#endif
+#endif
 #if __APPLE__
 #include "TargetConditionals.h"
@@ -120,6 +122,7 @@ int init_cpu_count()
    return core_count;
 }
+#ifndef _MSC_VER
 static int get_max_freq_khz(int cpuid)
 {
    // first try, for all possible cpu
@@ -215,24 +218,36 @@ static int set_sched_affinity(size_t thread_affinity_mask)
 #define CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t))
    // set affinity for thread
-#if defined(__GLIBC__) || defined(__OHOS__)
+#if (defined __GLIBC__) || (defined _OHOS_)
    pid_t pid = syscall(SYS_gettid);
 #else
 #ifdef PI3
    pid_t pid = getpid();
 #else
+#ifdef MACOS
+    uint64_t tid64;
+    pthread_threadid_np(NULL, &tid64);
+    pid_t pid = (pid_t)tid64;
+#else
    pid_t pid = gettid();
 #endif
+#endif
 #endif
    cpu_set_t mask;
    CPU_ZERO(&mask);
-    for (int i = 0; i < ( int )sizeof(size_t) * 8; i++)
+//    for (int i = 0; i < ( int )sizeof(size_t) * 8; i++)
+    for (int i = 0; i < core_count; i++)
    {
        if (thread_affinity_mask & (1 << i))
            CPU_SET(i, &mask);
    }
+#if MACOS
+    int syscallret = syscall(set_sched_affinity, pid, sizeof(mask), &mask);
+#else
    int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
+#endif
    if (syscallret)
    {
        fprintf(stderr, "syscall error %d\n", syscallret);
@@ -241,6 +256,7 @@ static int set_sched_affinity(size_t thread_affinity_mask)
    return 0;
 }
+#endif
 int init_cluster_mask()
 {
@@ -249,7 +265,7 @@ int init_cluster_mask()
    affinity_mask_all_cluster = ((size_t)(1) << core_count) - 1;
-    //#ifdef __ANDROID__
+#ifndef _MSC_VER
    int max_freq_min_val = INT_MAX;
    int max_freq_max_val = 0;
@@ -285,10 +301,10 @@ int init_cluster_mask()
                affinity_mask_medium_cluster |= (1 << i);
        }
    }
-    //#else
+#else
-    //    // TODO implement me for other platforms
+    // TODO implement me for other platforms
-    //    affinity_mask_big_cluster = affinity_mask_all_cluster;
+    affinity_mask_big_cluster = affinity_mask_all_cluster;
-    //#endif
+#endif
    return 0;
 }
@@ -305,7 +321,7 @@ int get_mask_count(size_t mask)
 {
    int count = 0;
-    for (int i = 0; i < sizeof(size_t) * 8; i++)
+    for (int i = 0; i < core_count; i++)
        if (mask & (1 << i))
            count++;
@@ -314,7 +330,7 @@ int get_mask_count(size_t mask)
 int set_cpu_affine(size_t mask)
 {
-#ifdef __ANDROID__
+#if defined __ANDROID__ || defined __linux__
    int count = get_mask_count(mask);
 #ifdef _OPENMP
@@ -322,7 +338,7 @@ int set_cpu_affine(size_t mask)
    omp_set_num_threads(count);
    int status[sizeof(size_t) * 8] = {0};
+    #pragma omp parallel for num_threads(count)
    for (int i = 0; i < count; i++)
    {
        status[i] = set_sched_affinity(mask);
@@ -339,15 +355,15 @@ int set_cpu_affine(size_t mask)
        return -1;
 #endif
-    return 0;
+#elif __APPLE_IOS__ || _MSC_VER
-#elif __APPLE_IOS__
    // thread affinity not supported on ios
    ( void )mask;
    return -1;
 #else
    int status = set_sched_affinity(mask);
-    if (0 != status)
+    if (0 != status) return -1;
-        return -1;
+	return 0;
 #endif
 }

--- a/src/lib/tengine_c_api.c
+++ b/src/lib/tengine_c_api.c
@@ -1041,7 +1041,15 @@ int DLLEXPORT prerun_graph_multithread(graph_t graph, struct options opt)
    }
    ir_graph->status = GRAPH_STAT_READY;
-    set_cpu_affine(mask);
+    if (0 != opt.affinity && 0 != (opt.affinity & mask))
+    {
+        set_cpu_affine(opt.affinity);
+    }
+    else
+    {
+        set_cpu_affine(mask);
+    }
    return 0;
 }

--- a/src/lib/tengine_cpp_api.cpp
+++ b/src/lib/tengine_cpp_api.cpp
@@ -42,6 +42,7 @@ Net::Net()
    opt.num_thread = 1;
    opt.precision = TENGINE_MODE_FP32;
    opt.cluster = TENGINE_CLUSTER_ALL;
+    opt.affinity = 0;
 }
 Net::~Net()