// latency-test.c
#include "maprobe.h"

/**
 * Pick a pseudo-random address inside [base_addr, end_addr) and round it
 * down to a multiple of align.
 *
 * Defined as a plain external function: a bare `inline` definition is only
 * an "inline definition" in C99/C11 and may leave callers without a symbol
 * to link against when the call is not actually inlined.
 *
 * Degenerate inputs no longer divide by zero: an empty range
 * (end_addr == base_addr) behaves like a one-byte range, and align == 0 is
 * treated as 1.
 *
 * NOTE: the result is rounded DOWN, so it can be lower than base_addr when
 * base_addr is not itself a multiple of align. rand() limits the reachable
 * offsets to RAND_MAX.
 *
 * @param base_addr lowest address of the range
 * @param end_addr  one past the highest address of the range
 * @param align     alignment of the returned address, in bytes
 * @return the aligned pseudo-random address
 */
uint64_t generate_rand_address(uint64_t base_addr, uint64_t end_addr, uint64_t align) {
    uint64_t span = end_addr - base_addr;
    if (span == 0) {
        span = 1; // avoid "% 0"
    }
    uint64_t granule = align ? align : 1; // avoid "/ 0"
    uint64_t addr = (uint64_t)rand() % span + base_addr;
    return addr / granule * granule;
}

/*
 * Fill dest[0 .. number-1] with addresses obtained from
 * generate_rand_address(), using the same base_addr / end_addr / align for
 * every entry. Entries are produced in ascending index order; a
 * non-positive number writes nothing.
 */
void generate_rand_address_array(uint64_t* dest, uint64_t base_addr, uint64_t end_addr, uint64_t align, int number) {
    uint64_t *slot = dest;
    int remaining = number;
    while (remaining > 0) {
        *slot = generate_rand_address(base_addr, end_addr, align);
        slot++;
        remaining--;
    }
}

/*
 * Alias for setup_pointer_tracing_linklist(): forwards the three arguments
 * unchanged and hands back whatever that call returns.
 */
uint64_t generate_pointer_tracing_address(uint64_t base_addr, uint64_t end_addr, uint64_t step) {
    uint64_t node_count = setup_pointer_tracing_linklist(base_addr, end_addr, step);
    return node_count;
}

/**
 * Build a forward pointer chain inside [base_addr, end_addr).
 *
 * Starting at base_addr, each node is one uint64_t slot holding the address
 * of the slot `step` bytes further on. The last node therefore holds an
 * address at or beyond the last placed node + step; walking exactly the
 * returned number of nodes with read_pointer_tracing_linklist() loads that
 * value but never dereferences it.
 *
 * A node is only placed where a complete 8-byte slot still fits before
 * end_addr, so nothing is written at or past end_addr even when the range
 * length is not a multiple of 8.
 *
 * @param base_addr address of the first node
 * @param end_addr  one past the last usable byte
 * @param step      distance between nodes in bytes; must be a non-zero
 *                  multiple of 8 (asserted)
 * @return number of nodes written
 */
uint64_t setup_pointer_tracing_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
{
    uint64_t num_valid_node = 0;
    assert(step % 8 == 0);
    assert(step >= 8);
    for (uint64_t cur_addr = base_addr;
         cur_addr < end_addr && end_addr - cur_addr >= sizeof(uint64_t);) {
        uint64_t next_addr = cur_addr + step;
        *((uint64_t*)cur_addr) = next_addr;
        cur_addr = next_addr;
        num_valid_node++;
    }
    return num_valid_node;
}

/**
 * Walk a pointer chain: starting from base_addr, load the uint64_t stored at
 * the current address and use it as the next address, num_valid_node times.
 *
 * The counter is 64-bit so that node counts above INT_MAX neither overflow
 * nor trigger a signed/unsigned comparison.
 *
 * @param base_addr      address of the first node
 * @param num_valid_node number of loads to perform
 * @return the value produced by the last load (base_addr if no load is done)
 */
uint64_t read_pointer_tracing_linklist(uint64_t base_addr, uint64_t num_valid_node)
{
    uint64_t cur_addr = base_addr;
    for (uint64_t i = 0; i < num_valid_node; i++) {
        cur_addr = (*(uint64_t*)cur_addr);
    }
    return cur_addr;
}

void latency_test_warmup(uint64_t base_addr, uint64_t end_addr)
{
42
    setup_pointer_tracing_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
W
William Wang 已提交
43 44
}

float test_pointer_tracing_latency(uint64_t size, int step, int iter, int to_csv)
W
William Wang 已提交
46
{
47 48
    // printf("pointer tracing latency test\n");
    // printf("range (B), read latency, iters, samples, cycles\n");
49
    register uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
W
William Wang 已提交
50
    _perf_start_timer();
51
    uint64_t nnode = setup_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, step);
W
William Wang 已提交
52 53 54 55 56 57
    _perf_end_timer();
    uint64_t total_node = nnode * iter;
    // _perf_print_timer();

    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
58
        result += read_pointer_tracing_linklist(_PERF_TEST_ADDR_BASE, nnode);
W
William Wang 已提交
59 60 61
    }
    _perf_end_timer();
    // _perf_print_timer();
62
    float acpa = (float)perf.cycle / total_node; // average cycle per access
63
    if (to_csv) {
64
        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_node, perf.cycle);
65 66
    } else {
        printf("range %ldKB (%d iters) pointer tracing read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n",
67
            size/KB, iter, acpa, total_node * 8 * BYTE / (float)perf.cycle, total_node, perf.cycle
68 69 70
        );
    }
    _perf_g_total_samples += total_node;
71
    return acpa;
72 73
}

float test_same_address_load_latency(int iter, int to_csv)
75 76 77
{
    // printf("same address load latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
78
    register uint64_t result = 0; 
79 80 81 82 83
    // _perf_print_timer();

    _perf_start_timer();
    uint64_t address = _PERF_TEST_ADDR_BASE;
    for (int i = 0; i < iter; i++) {
84
        result += *((volatile uint64_t*) (address));
85 86 87 88
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = iter;
89
    float acpa = (float)perf.cycle / total_access; // average cycle per access
90
    if (to_csv) {
91
        printf("%ld, %f, %d, %ld, %ld\n", 0, acpa, iter, total_access, perf.cycle);
92 93
    } else {
        printf("same address read latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n", 
94
            acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
95 96 97
        );
    }
    _perf_g_total_samples += total_access;
98
    return acpa;
99 100
}

float test_read_after_write_latency(int iter, int to_csv)
102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
{
    // printf("same address store-load latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    volatile uint64_t result = 0; // make sure compiler will store data to memory
    // _perf_print_timer();

    _perf_start_timer();
    uint64_t address = _PERF_TEST_ADDR_BASE;
    for (int i = 0; i < iter; i++) {
        result += *((uint64_t*) (address));
        address += sizeof(uint64_t);
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = iter;
117
    float acpa = (float)perf.cycle / total_access; // average cycle per access
118
    if (to_csv) {
119
        printf("%ld, %f, %d, %ld, %ld\n", 0, acpa, iter, total_access, perf.cycle);
120 121
    } else {
        printf("read after write latency %f, throughput %f B/cycle (%ld samples, %ld cycles)\n", 
122
            acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle
123 124 125
        );
    }
    _perf_g_total_samples += total_access;
126
    return acpa;
127 128
}

float test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv)
130 131 132
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
133
    register uint64_t result = 0; 
134 135 136 137 138 139 140 141 142 143 144 145 146 147
    uint64_t num_access = size / step;
    // _perf_print_timer();

    _perf_start_timer();
    uint64_t address = _PERF_TEST_ADDR_BASE;
    for (int i = 0; i < iter; i++) {
        for (int j = 0; j < num_access; j++) {
            result += *((uint64_t*) (address));
            address += step;
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = num_access * iter;
148
    float acpa = (float)perf.cycle / total_access; // average cycle per access
149
    if (to_csv) {
150
        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle);
151
    } else {
152
        printf("range %ldKB (%d iters) simple linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", 
153
            size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
154 155 156
        );
    }
    _perf_g_total_samples += total_access;
157
    return acpa;
158 159
}

float test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv)
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    uint64_t num_access = size / step;
    num_access += num_access % 8 ? 8 - num_access % 8 : 0;
    assert(num_access >= 8);
    // prepare access offset
    uint64_t address_offset_0 = 0;
    register uint64_t address_offset_1 = step * 1;
    register uint64_t address_offset_2 = step * 2;
    register uint64_t address_offset_3 = step * 3;
    register uint64_t address_offset_4 = step * 4;
    register uint64_t address_offset_5 = step * 5;
    register uint64_t address_offset_6 = step * 6;
    register uint64_t address_offset_7 = step * 7;
    register uint64_t address_offset_8 = step * 8;

    // _perf_print_timer();
    _perf_start_timer();
    uint64_t address = _PERF_TEST_ADDR_BASE;
    for (int i = 0; i < iter; i++) {
        for (int j = 0; j < num_access; j += 8) {
            register uint64_t access_addr_0 = address + address_offset_0;
            register uint64_t access_addr_1 = address + address_offset_1;
            register uint64_t access_addr_2 = address + address_offset_2;
            register uint64_t access_addr_3 = address + address_offset_3;
            register uint64_t access_addr_4 = address + address_offset_4;
            register uint64_t access_addr_5 = address + address_offset_5;
            register uint64_t access_addr_6 = address + address_offset_6;
            register uint64_t access_addr_7 = address + address_offset_7;
            address += address_offset_8;
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = num_access * iter;
205
    float acpa = (float)perf.cycle / total_access; // average cycle per access
206
    if (to_csv) {
207
        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle);
208 209
    } else {
        printf("range %ldKB (%d iters) batch(8) linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", 
210
            size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
211 212
        );
    }
W
William Wang 已提交
213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
    _perf_g_total_samples += total_access;
    return acpa;
}

float test_linear_write_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv)
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    uint64_t num_access = size / step;
    num_access += num_access % 8 ? 8 - num_access % 8 : 0;
    assert(num_access >= 8);
    // prepare access offset
    uint64_t address_offset_0 = 0;
    register uint64_t address_offset_1 = step * 1;
    register uint64_t address_offset_2 = step * 2;
    register uint64_t address_offset_3 = step * 3;
    register uint64_t address_offset_4 = step * 4;
    register uint64_t address_offset_5 = step * 5;
    register uint64_t address_offset_6 = step * 6;
    register uint64_t address_offset_7 = step * 7;
    register uint64_t address_offset_8 = step * 8;
234

W
William Wang 已提交
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
    // _perf_print_timer();
    _perf_start_timer();
    uint64_t address = _PERF_TEST_ADDR_BASE;
    for (int i = 0; i < iter; i++) {
        for (int j = 0; j < num_access; j += 8) {
            register uint64_t access_addr_0 = address + address_offset_0;
            register uint64_t access_addr_1 = address + address_offset_1;
            register uint64_t access_addr_2 = address + address_offset_2;
            register uint64_t access_addr_3 = address + address_offset_3;
            register uint64_t access_addr_4 = address + address_offset_4;
            register uint64_t access_addr_5 = address + address_offset_5;
            register uint64_t access_addr_6 = address + address_offset_6;
            register uint64_t access_addr_7 = address + address_offset_7;
            address += address_offset_8;
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0");
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = num_access * iter;
    float acpa = (float)perf.cycle / total_access; // average cycle per access
    if (to_csv) {
        printf("%ld, %f, %d, %ld, %ld\n", size, acpa, iter, total_access, perf.cycle);
    } else {
        printf("range %ldKB (%d iters) batch(8) linear write latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", 
            size/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
        );
    }
270
    _perf_g_total_samples += total_access;
271
    return acpa;
272 273
}

/*
 * Default linear read latency test: forwards all arguments to
 * test_linear_access_latency_batch8() and returns its result.
 */
float test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv)
{
    return test_linear_access_latency_batch8(size, step, iter, to_csv);
}

/*
 * Default linear write latency test: hands every argument on to
 * test_linear_write_latency_batch8() and reports the value it produced.
 */
float test_linear_write_latency(uint64_t size, uint64_t step, int iter, int to_csv)
{
    float cycles_per_access = test_linear_write_latency_batch8(size, step, iter, to_csv);
    return cycles_per_access;
}

float test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv)
285 286 287 288 289
{
    // printf("align %d random access (cache line) latency test, %s\n",
    //     test_align, pregen_addr ? "use pregen addr array" : "gen rand addr at run time"
    // );
    // printf("range (B), read latency, iters, samples, cycles\n");
290
    register uint64_t result = 0; 
291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318
    // _perf_print_timer();

    // alloc memory for random access addr array and data
    assert(test_align >= 8 * BYTE);
    // assert(size >= test_align);
    // uint64_t num_access = size / test_align;
    if (pregen_addr) {
        uint64_t test_array_base_addr = _PERF_TEST_ADDR_BASE + num_access * sizeof(uint64_t*);
        uint64_t address_array_base_addr = _PERF_TEST_ADDR_BASE;
        generate_rand_address_array((uint64_t*)address_array_base_addr, test_array_base_addr, test_array_base_addr + test_range, test_align, num_access);
        _perf_start_timer();
        for (int i = 0; i < iter; i++) {
            for (int j = 0; j < num_access; j++) {
                result += *((uint64_t*) (address_array_base_addr + j * sizeof(uint64_t*)));
            }
        }
        _perf_end_timer();
    } else {
        _perf_start_timer();
        for (int i = 0; i < iter; i++) {
            for (int j = 0; j < num_access; j++) {
                result += *((uint64_t*) (generate_rand_address(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + test_range, test_align)));
            }
        }
        _perf_end_timer();
    }
    // _perf_print_timer();
    uint64_t total_access = num_access * iter;
319
    float acpa = (float)perf.cycle / total_access; // average cycle per access
320
    if (to_csv) {
321
        printf("%ld, %f, %d, %ld, %ld\n", test_range, acpa, iter, total_access, perf.cycle);
322 323
    } else {
        printf("range %ldKB, access cover %ldKB (%d iters) random read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), align %ldB, %s\n", 
324
            test_range/KB, total_access*8*_PERF_CACHELINE_SIZE_BYTE/KB, iter, acpa, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, test_align,
325 326 327 328
            pregen_addr ? "pregen addr" : "runtime addr"
        );
    }
    _perf_g_total_samples += total_access;
329
    return acpa;
W
William Wang 已提交
330 331
}

void legacy_test_mem_throughput(uint64_t iter)
W
William Wang 已提交
333 334 335 336 337 338 339 340 341 342 343 344 345 346
{
    uint64_t remain = iter;
    uint64_t result = 0;
    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
    _perf_start_timer();
    while (remain--) {
        result += *(uint64_t*) access_addr;
        access_addr += _PERF_CACHELINE_SIZE_BYTE;
    }
    _perf_end_timer();
    *(uint64_t*) _PERF_BLACKHOLE = result;
    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
}

void legacy_test_mem_throughput_same_set(uint64_t iter)
W
William Wang 已提交
348 349 350 351 352 353 354
{
    uint64_t remain = iter;
    uint64_t result = 0;
    uint64_t access_addr = _PERF_TEST_ADDR_BASE;
    _perf_start_timer();
    while (remain--) {
        result += *(uint64_t*) access_addr;
355
        access_addr += _PERF_ADDR_STRIDE_L1_SAME_SET;
W
William Wang 已提交
356 357 358 359 360
    }
    _perf_end_timer();
    *(uint64_t*) _PERF_BLACKHOLE = result;
    printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
}

/*
 * Run the linear read latency test over a sweep of range sizes and print the
 * results as a matrix. Row r uses a range of (1<<r) KB with a stride of
 * _PERF_CACHELINE_SIZE_BYTE. Column 0 holds the warm-up run, columns 1 and 2
 * hold two measured runs.
 */
void generate_linear_access_latency_matrix()
{
#define LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB 14
    // 14 rows in total: 1KB (2^0 KB) up to 8MB (2^13 KB)
    DEFINE_FLOAT_RESULT_MATRIX(linear_access_latency,size_kb_pow2,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB,iter,3);
    FOR(x,LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB) { linear_access_latency_row_array[x] = x; }
    FOR(x,3) { linear_access_latency_column_array[x] = x; }
    for (int size_pow = 0; size_pow < LINEAR_ACCESS_MATRIX_SIZE_MAX_POW2_KB; size_pow++) {
        for (int pass = 0; pass < 3; pass++) {
            int pass_iter;
            if (pass == 0) {
                pass_iter = size_pow < 6 ? 4 : 1; // warm-up run
            } else {
                pass_iter = size_pow < 6 ? 4 : 2; // measured runs
            }
            linear_access_latency_result_array[size_pow][pass] = test_linear_access_latency((1<<size_pow)*KB,_PERF_CACHELINE_SIZE_BYTE,pass_iter,0);
        }
    }
    print_float_result_matrix(&linear_access_latency_matrix_meta);
}

/*
 * Run the pointer-chasing latency test over a sweep of range sizes and print
 * the results as a matrix. Row r uses a range of (1<<r) KB with one node per
 * _PERF_CACHELINE_SIZE_BYTE bytes. Column 0 holds the warm-up run, columns 1
 * and 2 hold two measured runs.
 */
void generate_pointer_tracing_latency_matrix()
{
#define POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB 14
    // 14 rows in total: 1KB (2^0 KB) up to 8MB (2^13 KB)
    DEFINE_FLOAT_RESULT_MATRIX(pointer_tracing_latency,size_kb_pow2,POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB,iter,3);
    FOR(x,POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB) { pointer_tracing_latency_row_array[x] = x; }
    FOR(x,3) { pointer_tracing_latency_column_array[x] = x; }
    for (int size_pow = 0; size_pow < POINTER_CHASING_MATRIX_SIZE_MAX_POW2_KB; size_pow++) {
        for (int pass = 0; pass < 3; pass++) {
            int pass_iter;
            if (pass == 0) {
                pass_iter = size_pow < 6 ? 4 : 1; // warm-up run
            } else {
                pass_iter = size_pow < 6 ? 4 : 2; // measured runs
            }
            pointer_tracing_latency_result_array[size_pow][pass] = test_pointer_tracing_latency((1<<size_pow)*KB,_PERF_CACHELINE_SIZE_BYTE,pass_iter,0);
        }
    }
    print_float_result_matrix(&pointer_tracing_latency_matrix_meta);
}

/*
 * Run the random read latency test (pre-generated addresses, 8-byte
 * alignment, one pass) for every combination of test range and access
 * volume, and print the results as a matrix. Row i uses a test range of
 * (1<<i) KB; column j performs (1<<j) KB / sizeof(uint64_t) accesses. Every
 * cell is run twice and the second result is the one stored.
 */
void generate_random_access_latency_matrix()
{
#define RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB 10
    // 10 rows: test range from 1KB to 512KB
#define RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB 10
    // 10 columns: access volume from 1KB to 512KB
    DEFINE_FLOAT_RESULT_MATRIX(random_access_latency,test_range_size_kb_pow2,RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB,access_size_kb_pow2,RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB);
    FOR(x,RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB) { random_access_latency_row_array[x] = x; }
    FOR(x,RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB) { random_access_latency_column_array[x] = x; }
    for (int range_pow = 0; range_pow < RANDOM_ACCESS_MATRIX_SIZE_MAX_POW2_KB; range_pow++) {
        uint64_t test_range = (1<<range_pow)*KB;
        for (int volume_pow = 0; volume_pow < RANDOM_ACCESS_MATRIX_ACCESS_MAX_POW2_KB; volume_pow++) {
            uint64_t access_bytes = (1<<volume_pow)*KB;
            uint64_t num_access = access_bytes / sizeof(uint64_t);
            float latency = 0;
            // first run warms up, second run is the one recorded
            for (int run = 0; run < 2; run++) {
                latency = test_random_access_latency(num_access, test_range, sizeof(uint64_t), 1, 1, 0);
            }
            random_access_latency_result_array[range_pow][volume_pow] = latency;
        }
    }
    print_float_result_matrix(&random_access_latency_matrix_meta);
}