// latency-test.c (committed by William Wang)
#include "maprobe.h"

3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
/*
 * Return a pseudo-random address drawn from [base_addr, end_addr) and rounded
 * down to a multiple of align.
 *
 * rand() is called exactly once per invocation. An empty range
 * (end_addr == base_addr) yields base_addr rounded down to align, and an
 * align of 0 means "no alignment"; both cases previously divided by zero.
 *
 * Because the rounding is applied to the absolute address, the result can be
 * lower than base_addr when base_addr itself is not a multiple of align.
 *
 * Defined without `inline`: a plain `inline` definition does not emit an
 * external symbol in C99 and later, which can leave callers unresolved.
 */
uint64_t generate_rand_address(uint64_t base_addr, uint64_t end_addr, uint64_t align) {
    uint64_t span = end_addr - base_addr;
    uint64_t draw = (uint64_t)rand();
    uint64_t addr = base_addr + (span != 0 ? draw % span : 0);

    if (align == 0) {
        return addr;
    }
    return addr / align * align;
}

/*
 * Fill the first `number` slots of dest with addresses produced by
 * generate_rand_address(base_addr, end_addr, align).
 * Nothing is written when number is zero or negative.
 */
void generate_rand_address_array(uint64_t* dest, uint64_t base_addr, uint64_t end_addr, uint64_t align, int number) {
    uint64_t *slot = dest;
    int remaining = number;

    while (remaining > 0) {
        *slot++ = generate_rand_address(base_addr, end_addr, align);
        remaining--;
    }
}

/*
 * Alias for setup_pointer_tracing_linklist(): forwards all three arguments
 * and returns its result unchanged.
 */
uint64_t generate_pointer_tracing_address(uint64_t base_addr, uint64_t end_addr, uint64_t step) {
    uint64_t node_count = setup_pointer_tracing_linklist(base_addr, end_addr, step);
    return node_count;
}

/*
 * Lay out a forward-linked chain in memory: starting at base_addr, every
 * `step` bytes a 64-bit word is written that holds the address of the next
 * node (current address + step). Writing stops once the cursor reaches or
 * passes end_addr, so the last node stores an address >= end_addr.
 *
 * step must be a multiple of 8 and at least 8 (checked with assert).
 * Returns the number of nodes written.
 */
uint64_t setup_pointer_tracing_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
{
    assert(step % 8 == 0);
    assert(step >= 8);

    uint64_t node_count = 0;
    uint64_t cursor = base_addr;
    while (cursor < end_addr) {
        uint64_t *slot = (uint64_t *)cursor;
        cursor += step;
        *slot = cursor;   // each node points at its successor
        ++node_count;
    }
    return node_count;
}

31
/*
 * Follow a chain of 64-bit "next" pointers starting at base_addr for
 * num_valid_node hops and return the address reached after the last hop.
 * With num_valid_node == 0 nothing is dereferenced and base_addr is returned.
 *
 * The hop counter has the same type as num_valid_node; the previous `int`
 * counter was compared against an unsigned 64-bit bound and would overflow
 * for counts above INT_MAX.
 */
uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_valid_node)
{
    uint64_t cur_addr = base_addr;
    for (uint64_t hop = 0; hop < num_valid_node; hop++) {
        cur_addr = *(uint64_t *)cur_addr;   // each load depends on the previous one
    }
    return cur_addr;
}

void latency_test_warmup(uint64_t base_addr, uint64_t end_addr)
{
42
    setup_pointer_tracing_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
W
William Wang 已提交
43 44
}

45
float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv)
W
William Wang 已提交
46
{
47 48
    // printf("pointer tracing latency test\n");
    // printf("range (B), read latency, iters, samples, cycles\n");
49
    register uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
W
William Wang 已提交
50
    _perf_start_timer();
51
    uint64_t nnode = setup_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, step);
W
William Wang 已提交
52 53 54 55 56 57
    _perf_end_timer();
    uint64_t total_node = nnode * iter;
    // _perf_print_timer();

    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
58
        result += read_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, nnode);
W
William Wang 已提交
59 60 61
    }
    _perf_end_timer();
    // _perf_print_timer();
62
    float acpa = (float)perf.cycle / total_node; // average cycle per access
63
    if (to_csv) {
64
        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_node, perf.cycle);
65 66
    } else {
        printf("range %ldKB (%d iters) pointer tracing read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n",
67
            size/KB, iter, acpa, total_node * 8 * BYTE / (float)perf.cycle, total_node, perf.cycle
68 69 70
        );
    }
    _perf_g_total_samples += total_node;
71
    _perf_blackhole(result);
72
    return acpa;
73 74
}

75
float test_same_address_load_latency(int iter, int to_csv)
76 77 78
{
    // printf("same address load latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
79
    register uint64_t result = 0; 
80 81 82 83 84
    // _perf_print_timer();

    _perf_start_timer();
    uint64_t address = _PERF_TEST_ADDR_BASE;
    for (int i = 0; i < iter; i++) {
85
        result += *((volatile uint64_t*) (address));
86 87 88 89
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = iter;
90
    float acpa = (float)perf.cycle / total_access; // average cycle per access
91
    if (to_csv) {
92
        printf("%ld, %f, %d, %ld, %ld\n", 0, acpa, iter, total_access, perf.cycle);
93 94
    } else {
        printf("same address read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n", 
95
            acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
96 97 98
        );
    }
    _perf_g_total_samples += total_access;
99
    _perf_blackhole(result);
100
    return acpa;
101 102
}

103
float test_read_after_write_latency(int iter, int to_csv)
104 105 106 107 108 109 110 111
{
    // printf("same address store-load latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    volatile uint64_t result = 0; // make sure compiler will store data to memory
    // _perf_print_timer();

    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
112
        uint64_t address = _PERF_TEST_ADDR_BASE;
113 114 115 116 117 118
        result += *((uint64_t*) (address));
        address += sizeof(uint64_t);
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = iter;
119
    float acpa = (float)perf.cycle / total_access; // average cycle per access
120
    if (to_csv) {
121
        printf("%ld, %f, %d, %ld, %ld\n", 0, acpa, iter, total_access, perf.cycle);
122 123
    } else {
        printf("read after write latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n", 
124
            acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
125 126 127
        );
    }
    _perf_g_total_samples += total_access;
128
    _perf_blackhole(result);
129
    return acpa;
130 131
}

132
float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv)
133 134 135
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
136
    register uint64_t result = 0; 
137 138 139 140 141
    uint64_t num_access = size / step;
    // _perf_print_timer();

    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
142
        uint64_t address = _PERF_TEST_ADDR_BASE;
143
        for (int j = 0; j < num_access; j++) {
144
            result += *((volatile uint64_t*) (address));
145 146 147 148 149 150
            address += step;
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = num_access * iter;
151
    float acpa = (float)perf.cycle / total_access; // average cycle per access
152
    if (to_csv) {
153
        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle);
154
    } else {
155
        printf("range %ldKB (%d iters) simple linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", 
156
            size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
157 158 159
        );
    }
    _perf_g_total_samples += total_access;
160
    _perf_blackhole(result);
161
    return acpa;
162 163
}

164
float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv)
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    uint64_t num_access = size / step;
    num_access += num_access % 8 ? 8 - num_access % 8 : 0;
    assert(num_access >= 8);
    // prepare access offset
    uint64_t address_offset_0 = 0;
    register uint64_t address_offset_1 = step * 1;
    register uint64_t address_offset_2 = step * 2;
    register uint64_t address_offset_3 = step * 3;
    register uint64_t address_offset_4 = step * 4;
    register uint64_t address_offset_5 = step * 5;
    register uint64_t address_offset_6 = step * 6;
    register uint64_t address_offset_7 = step * 7;
    register uint64_t address_offset_8 = step * 8;

    // _perf_print_timer();
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
185
        uint64_t address = _PERF_TEST_ADDR_BASE;
186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
        for (int j = 0; j < num_access; j += 8) {
            register uint64_t access_addr_0 = address + address_offset_0;
            register uint64_t access_addr_1 = address + address_offset_1;
            register uint64_t access_addr_2 = address + address_offset_2;
            register uint64_t access_addr_3 = address + address_offset_3;
            register uint64_t access_addr_4 = address + address_offset_4;
            register uint64_t access_addr_5 = address + address_offset_5;
            register uint64_t access_addr_6 = address + address_offset_6;
            register uint64_t access_addr_7 = address + address_offset_7;
            address += address_offset_8;
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = num_access * iter;
209
    float acpa = (float)perf.cycle / total_access; // average cycle per access
210
    if (to_csv) {
211
        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle);
212 213
    } else {
        printf("range %ldKB (%d iters) batch(8) linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", 
214
            size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
215 216
        );
    }
W
William Wang 已提交
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
    _perf_g_total_samples += total_access;
    return acpa;
}

float test_linear_write_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv)
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    uint64_t num_access = size / step;
    num_access += num_access % 8 ? 8 - num_access % 8 : 0;
    assert(num_access >= 8);
    // prepare access offset
    uint64_t address_offset_0 = 0;
    register uint64_t address_offset_1 = step * 1;
    register uint64_t address_offset_2 = step * 2;
    register uint64_t address_offset_3 = step * 3;
    register uint64_t address_offset_4 = step * 4;
    register uint64_t address_offset_5 = step * 5;
    register uint64_t address_offset_6 = step * 6;
    register uint64_t address_offset_7 = step * 7;
    register uint64_t address_offset_8 = step * 8;
238

W
William Wang 已提交
239 240 241
    // _perf_print_timer();
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
242
        uint64_t address = _PERF_TEST_ADDR_BASE;
W
William Wang 已提交
243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273
        for (int j = 0; j < num_access; j += 8) {
            register uint64_t access_addr_0 = address + address_offset_0;
            register uint64_t access_addr_1 = address + address_offset_1;
            register uint64_t access_addr_2 = address + address_offset_2;
            register uint64_t access_addr_3 = address + address_offset_3;
            register uint64_t access_addr_4 = address + address_offset_4;
            register uint64_t access_addr_5 = address + address_offset_5;
            register uint64_t access_addr_6 = address + address_offset_6;
            register uint64_t access_addr_7 = address + address_offset_7;
            address += address_offset_8;
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = num_access * iter;
    float acpa = (float)perf.cycle / total_access; // average cycle per access
    if (to_csv) {
        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle);
    } else {
        printf("range %ldKB (%d iters) batch(8) linear write latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", 
            size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
        );
    }
274
    _perf_g_total_samples += total_access;
275
    return acpa;
276 277
}

278
/*
 * Default linear read latency test: forwards to
 * test_linear_access_latency_batch8() and returns its result.
 */
float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv)
{
    float cycles_per_access = test_linear_access_latency_batch8(size, step, iter, to_csv);
    return cycles_per_access;
}

W
William Wang 已提交
283 284 285 286 287
/*
 * Default linear write latency test: forwards to
 * test_linear_write_latency_batch8() and returns its result.
 */
float test_linear_write_latency(uint64_t size, uint64_t step, int iter, int to_csv)
{
    float cycles_per_access = test_linear_write_latency_batch8(size, step, iter, to_csv);
    return cycles_per_access;
}

288
float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv)
289 290 291 292 293
{
    // printf("align %d random access (cache line) latency test, %s\n",
    //     test_align, pregen_addr ? "use pregen addr array" : "gen rand addr at run time"
    // );
    // printf("range (B), read latency, iters, samples, cycles\n");
294
    register uint64_t result = 0; 
295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322
    // _perf_print_timer();

    // alloc memory for random access addr array and data
    assert(test_align >= 8 * BYTE);
    // assert(size >= test_align);
    // uint64_t num_access = size / test_align;
    if (pregen_addr) {
        uint64_t test_array_base_addr = _PERF_TEST_ADDR_BASE + num_access * sizeof(uint64_t*);
        uint64_t address_array_base_addr = _PERF_TEST_ADDR_BASE;
        generate_rand_address_array((uint64_t*)address_array_base_addr, test_array_base_addr, test_array_base_addr + test_range, test_align, num_access);
        _perf_start_timer();
        for (int i = 0; i < iter; i++) {
            for (int j = 0; j < num_access; j++) {
                result += *((uint64_t*) (address_array_base_addr + j * sizeof(uint64_t*)));
            }
        }
        _perf_end_timer();
    } else {
        _perf_start_timer();
        for (int i = 0; i < iter; i++) {
            for (int j = 0; j < num_access; j++) {
                result += *((uint64_t*) (generate_rand_address(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + test_range, test_align)));
            }
        }
        _perf_end_timer();
    }
    // _perf_print_timer();
    uint64_t total_access = num_access * iter;
323
    float acpa = (float)perf.cycle / total_access; // average cycle per access
324
    if (to_csv) {
325
        printf("%ld, %f, %d, %ld, %ld\n", test_range, acpa, iter, total_access, perf.cycle);
326 327
    } else {
        printf("range %ldKB, access cover %ldKB (%d iters) random read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), align %ldB, %s\n", 
328
            test_range/KB, total_access*8*_PERF_CACHELINE_SIZE_BYTE/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, test_align,
329 330 331 332
            pregen_addr ? "pregen addr" : "runtime addr"
        );
    }
    _perf_g_total_samples += total_access;
333
    _perf_blackhole(result);
334
    return acpa;
W
William Wang 已提交
335 336
}

337
void legacy_test_mem_throughput(uint64_t iter)
W
William Wang 已提交
338 339 340 341 342 343 344 345 346 347 348 349 350 351
{
    uint64_t remain = iter;
    uint64_t result = 0;
    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
    _perf_start_timer();
    while (remain--) {
        result += *(uint64_t*) access_addr;
        access_addr += _PERF_CACHELINE_SIZE_BYTE;
    }
    _perf_end_timer();
    *(uint64_t*) _PERF_BLACKHOLE = result;
    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
}

352
void legacy_test_mem_throughput_same_set(uint64_t iter)
W
William Wang 已提交
353 354 355 356 357 358 359
{
    uint64_t remain = iter;
    uint64_t result = 0;
    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
    _perf_start_timer();
    while (remain--) {
        result += *(uint64_t*) access_addr;
360
        access_addr += _PERF_ADDR_STRIDE_L1_SAME_SET;
W
William Wang 已提交
361 362 363 364 365
    }
    _perf_end_timer();
    *(uint64_t*) _PERF_BLACKHOLE = result;
    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
}
W
William Wang 已提交
366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402

/*
 * Run the linear access latency test for sizes (1 << row) * KB, row = 0..13,
 * three times per size (one warm-up run followed by two measured runs), store
 * the three returned values in the result matrix and print it.
 */
void generate_linear_access_latency_matrix()
{
#define LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14
    // 14 rows in total, covering 1KB to 8MB; 3 columns, one per run
    DEFINE_FLOAT_RESULT_MATRIX(linear_access_latency,size_kb_pow2,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB,iter,3);
    FOR(x,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB) { linear_access_latency_row_array[x] = x; }
    FOR(x,3) { linear_access_latency_column_array[x] = x; }

    for (int row = 0; row < LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB; row++) {
        // The first six sizes get more iterations per run than the rest.
        int warm_up_iter;
        int test_iter;
        if (row < 6) {
            warm_up_iter = 4;
            test_iter = 4;
        } else {
            warm_up_iter = 1;
            test_iter = 2;
        }

        uint64_t range_bytes = (1<<row)*KB;
        linear_access_latency_result_array[row][0] = test_linear_access_latency(range_bytes, _PERF_CACHELINE_SIZE_BYTE, warm_up_iter, 0); // warmup
        linear_access_latency_result_array[row][1] = test_linear_access_latency(range_bytes, _PERF_CACHELINE_SIZE_BYTE, test_iter, 0);    // test
        linear_access_latency_result_array[row][2] = test_linear_access_latency(range_bytes, _PERF_CACHELINE_SIZE_BYTE, test_iter, 0);    // test
    }
    print_float_result_matrix(&linear_access_latency_matrix_meta);
}

/*
 * Run the pointer tracing latency test for sizes (1 << row) * KB,
 * row = 0..13, three times per size (one warm-up run followed by two
 * measured runs), store the three returned values in the result matrix and
 * print it.
 */
void generate_pointer_tracing_latency_matrix()
{
#define POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB 14
    // 14 rows in total, covering 1KB to 8MB; 3 columns, one per run
    DEFINE_FLOAT_RESULT_MATRIX(pointer_tracing_latency,size_kb_pow2,POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB,iter,3);
    FOR(x,POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB) { pointer_tracing_latency_row_array[x] = x; }
    FOR(x,3) { pointer_tracing_latency_column_array[x] = x; }

    for (int row = 0; row < POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB; row++) {
        // The first six sizes get more iterations per run than the rest.
        int warm_up_iter;
        int test_iter;
        if (row < 6) {
            warm_up_iter = 4;
            test_iter = 4;
        } else {
            warm_up_iter = 1;
            test_iter = 2;
        }

        uint64_t range_bytes = (1<<row)*KB;
        pointer_tracing_latency_result_array[row][0] = test_pointer_tracing_latency(range_bytes, _PERF_CACHELINE_SIZE_BYTE, warm_up_iter, 0); // warmup
        pointer_tracing_latency_result_array[row][1] = test_pointer_tracing_latency(range_bytes, _PERF_CACHELINE_SIZE_BYTE, test_iter, 0);    // test
        pointer_tracing_latency_result_array[row][2] = test_pointer_tracing_latency(range_bytes, _PERF_CACHELINE_SIZE_BYTE, test_iter, 0);    // test
    }
    print_float_result_matrix(&pointer_tracing_latency_matrix_meta);
}

void generate_random_access_latency_matrix()
{
403
#define RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB 6
W
William Wang 已提交
404
    // RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB 10: from 1KB to 512KB
405
#define RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB 6
W
William Wang 已提交
406 407 408 409 410 411 412 413 414 415 416 417 418 419 420
    // RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB 10: from 1KB to 512KB
    DEFINE_FLOAT_RESULT_MATRIX(random_access_latency,test_range_size_kb_pow2,RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB,access_size_kb_pow2,RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB);
    FOR(x,RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB) { random_access_latency_row_array[x] = x; }
    FOR(x,RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB) { random_access_latency_column_array[x] = x; }
    for (int i = 0; i < RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB; i++) {
        for (int j = 0; j < RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB; j++) {
            uint64_t access_size = (1<<j)*KB;
            uint64_t num_access = access_size / sizeof(uint64_t);
            uint64_t test_range = (1<<i)*KB;
            test_random_access_latency(num_access, test_range, sizeof(uint64_t), 1, 1, 0); //warmup
            random_access_latency_result_array[i][j] = test_random_access_latency(num_access, test_range, sizeof(uint64_t), 1, 1, 0); //test
        }
    }
    print_float_result_matrix(&random_access_latency_matrix_meta);
}