// main.c: predefined test sets and program entry point built on the maprobe test routines.
#include <klib.h>
#include "maprobe.h"

void typical_linear_load_test_set()
{
    _perf_calibrate();
    printf("------------- linear load test set -------------\n");
    printf("page size linear double word load:\n");
    test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 1, 0);
    test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 2, 0);
    printf("page size linear cache line load:\n");
    test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
    test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    printf("dcache/2 linear double word load:\n");
    test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 1, 0);
    test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, sizeof(uint64_t), 2, 0);
    printf("dcache/2 linear cache line load:\n");
    test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
    test_linear_access_latency(_PERF_L1_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    printf("dcache linear double word load:\n");
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 1, 0);
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, sizeof(uint64_t), 2, 0);
    printf("dcache linear cache line load:\n");
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    printf("L2 linear cache line load:\n");
    test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
    test_linear_access_latency(_PERF_L2_SIZE_BYTE / 2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    printf("L1 (L1 same set) linear cache line load:\n");
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 10, 0);
31
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 100, 0);
32 33
    printf("L2 (L1 same set) linear cache line load:\n");
    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 2, 0);
34
    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L1_SAME_SET, 4, 0);
35 36 37 38 39 40 41 42
    printf("L1 (L2 same slice) linear cache line load:\n");
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0);
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0);
    printf("L2 (L2 same slice) linear cache line load:\n");
    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 1, 0);
    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_L2_SAME_SLICE, 2, 0);
    printf("L1 (page traverse) linear cache line load:\n");
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 10, 0);
43
    test_linear_access_latency(_PERF_L1_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 100, 0);
44 45
    printf("L2 (page traverse) linear cache line load:\n");
    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 2, 0);
46
    test_linear_access_latency(_PERF_L2_SIZE_BYTE, _PERF_ADDR_STRIDE_NEXT_PAGE, 4, 0);
47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
    printf("total samples: %ld\n", _perf_g_total_samples);
}

// Random-load latency test set.
//
// For each (access count, range, alignment) combination below,
// test_random_access_latency() is invoked twice: first with its 4th argument
// set to 1, then with it set to 0. The 5th argument is always 1 and the 6th
// always 0. A label is printed before each pair and the value of
// _perf_g_total_samples is printed at the end.
//
// NOTE(review): the meaning of the 4th argument is defined by
// test_random_access_latency() in maprobe.h; it is only toggled here.
void typical_random_load_test_set()
{
    printf("------------- random load test set -------------\n");

    // Page-sized range, 1024 accesses.
    printf("from page size random load (word):\n");
    for (int mode = 1; mode >= 0; mode--) {
        test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, 8*BYTE, mode, 1, 0);
    }
    printf("from page size random load (cache line):\n");
    for (int mode = 1; mode >= 0; mode--) {
        test_random_access_latency(1024, _PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, mode, 1, 0);
    }

    // Half-L1-sized range, 1024 accesses.
    printf("from dcache/2 size random load (word):\n");
    for (int mode = 1; mode >= 0; mode--) {
        test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, 8*BYTE, mode, 1, 0);
    }
    printf("from dcache/2 size random load (cache line):\n");
    for (int mode = 1; mode >= 0; mode--) {
        test_random_access_latency(1024, _PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, mode, 1, 0);
    }

    // L1-sized range; access count is twice the number of L1 cache lines.
    printf("from dcache size random load (word):\n");
    for (int mode = 1; mode >= 0; mode--) {
        test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, 8*BYTE, mode, 1, 0);
    }
    printf("from dcache size random load (cache line):\n");
    for (int mode = 1; mode >= 0; mode--) {
        test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, mode, 1, 0);
    }

    // L2-sized range; access count is still derived from the L1 size.
    printf("from l2 size random load (word):\n");
    for (int mode = 1; mode >= 0; mode--) {
        test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, 8*BYTE, mode, 1, 0);
    }
    printf("from l2 size random load (cache line):\n");
    for (int mode = 1; mode >= 0; mode--) {
        test_random_access_latency(_PERF_L1_SIZE_BYTE/_PERF_CACHELINE_SIZE_BYTE*2, _PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, mode, 1, 0);
    }

    printf("total samples: %ld\n", _perf_g_total_samples);
}

// Pointer-tracing latency test set.
//
// Runs test_pointer_tracing_latency() in three groups that differ in the
// 2nd argument: 8*BYTE (double word), _PERF_CACHELINE_SIZE_BYTE and
// _PERF_PAGE_SIZE_BYTE. Within each group the 1st argument grows from a page
// up to L2/L3-derived sizes. A label is printed before each group and the
// value of _perf_g_total_samples is printed at the end.
//
// NOTE(review): the 3rd argument looks like an iteration count and the 4th
// like an output-format flag (always 0 here); confirm against maprobe.h.
void typical_pointer_tracing_load_test_set()
{
    printf("------------- pointer tracing load test set -------------\n");

    // Label spelling corrected ("dobule" -> "double") to match the
    // "double word" labels printed by the linear load test set.
    printf("double word by double word tracing:\n");
    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, 8*BYTE, 10, 0);
    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, 8*BYTE, 2, 0);
    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, 8*BYTE, 2, 0);
    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, 8*BYTE, 2, 0);

    printf("cacheline by cacheline tracing:\n");
    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 10, 0);
    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 1, 0);
    test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 1, 0);

    printf("page by page, tracing:\n");
    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE*2, _PERF_PAGE_SIZE_BYTE, 10, 0);
    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0);
    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0);
    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_PAGE_SIZE_BYTE, 10, 0);
    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_PAGE_SIZE_BYTE, 10, 0);

    printf("total samples: %ld\n", _perf_g_total_samples);
}

// Memory disambiguation test set.
//
// Runs test_same_address_load_latency(1024, 0) three times, then
// test_read_after_write_latency(1024, 0) three times, printing a label
// before each group.
//
// The spelling "disambiuation" is kept in both the function name and the
// banner because the name is part of this file's external interface.
//
// NOTE(review): the second label says "load then store" while the routine
// it introduces is named read-after-write; confirm which order is intended.
void typical_memory_disambiuation_test_set()
{
    printf("------------- memory disambiuation test set -------------\n");

    printf("load from the same address:\n");
    for (int run = 0; run < 3; run++) {
        test_same_address_load_latency(1024, 0);
    }

    printf("load then store to the same address:\n");
    for (int run = 0; run < 3; run++) {
        test_read_after_write_latency(1024, 0);
    }

    // more to be added
}

// Typical dcache access-pattern test set.
//
// Runs the L1 load, store and write-combine-buffer bandwidth tests over
// _PERF_L1_SIZE_BYTE, followed by two identical linear same-set runs printed
// under the label "replacement error penalty". This function does not call
// _perf_calibrate() itself.
//
// NOTE(review): the 2nd argument of the bandwidth tests looks like an
// iteration count and the last like an output-format flag; confirm against
// maprobe.h.
void typical_l1_access_test_set()
{
    printf("------------- typical dcache access pattern test set -------------\n");

    printf("ideal load bandwidth:\n");
    test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
    test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);

    printf("ideal store bandwidth:\n");
    test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
    test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);

    printf("ideal write combine buffer bandwidth:\n");
    test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
    test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 5, 0);

    // Range spans 32 strides of _PERF_ADDR_STRIDE_L1_SAME_SET; run twice.
    printf("replacement error penalty:\n");
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
}

// Typical latency test for fast regression.
//
// Resets _perf_g_total_samples to 0 and then runs the L1 access, linear
// load, random load, pointer tracing and memory disambiguation test sets,
// in that order.
//
// NOTE(review): typical_l1_access_test_set() runs before
// typical_linear_load_test_set(), which is the first callee here that calls
// _perf_calibrate(). Confirm the L1 set does not need calibration first when
// this function is used without a prior calibrating call.
void typical_latency_test()
{
    _perf_g_total_samples = 0;
    typical_l1_access_test_set();
    typical_linear_load_test_set();
    typical_random_load_test_set();
    typical_pointer_tracing_load_test_set();
    typical_memory_disambiuation_test_set();
}

// Emits the data series for a pointer-tracing latency graph.
//
// Resets _perf_g_total_samples, calls _perf_calibrate(), prints a two-line
// header and then sweeps the 1st argument of test_pointer_tracing_latency()
// through four stages, always with _PERF_CACHELINE_SIZE_BYTE as the 2nd
// argument and 1 as the last argument. The value of _perf_g_total_samples is
// printed at the end.
//
// NOTE(review): if 1024*KB equals 1*MB, that range is measured twice (once
// by the stand-alone call and once by the first pass of the final sweep);
// confirm this is intended.
void pointer_tracing_graph()
{
    int range;

    _perf_g_total_samples = 0;
    _perf_calibrate();
    printf("data for pointer tracing latency graph:\n");
    printf("range (B), read latency, iters, samples\n");

    // Stage 1: from 1*KB up to (not including) 64*KB, step 1*KB, 3rd argument 2.
    range = 1*KB;
    while (range < 64*KB) {
        test_pointer_tracing_latency(range, _PERF_CACHELINE_SIZE_BYTE, 2, 1);
        range = range + 1*KB;
    }

    // Stage 2: from 64*KB up to (not including) 1024*KB, step 64*KB.
    range = 64*KB;
    while (range < 1024*KB) {
        test_pointer_tracing_latency(range, _PERF_CACHELINE_SIZE_BYTE, 1, 1);
        range = range + 64*KB;
    }

    // Stage 3: the single 1024*KB point.
    test_pointer_tracing_latency(1024*KB, _PERF_CACHELINE_SIZE_BYTE, 1, 1);

    // Stage 4: from 1*MB up to (not including) 8*MB, step 1*MB.
    range = 1*MB;
    while (range < 8*MB) {
        test_pointer_tracing_latency(range, _PERF_CACHELINE_SIZE_BYTE, 1, 1);
        range = range + 1*MB;
    }

    printf("total samples: %ld\n", _perf_g_total_samples);
}

// a simple test set used to check if test is working correctly
void latency_test_example()
{
    _perf_calibrate();
    printf("latency test example:\n");
170 171 172 173 174 175
    test_l1_load_bandwidth(4*KB, 5, 0);
    test_l1_load_bandwidth(4*KB, 5, 0);
    test_l1_store_bandwidth(4*KB, 5, 0);
    test_l1_store_bandwidth(4*KB, 5, 0);
    test_l1_store_wcb_bandwidth(8*KB, 5, 0);
    test_l1_store_wcb_bandwidth(8*KB, 5, 0);
176
    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
177
    test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0);
178
    test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0);
179
    test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
180
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*4,_PERF_ADDR_STRIDE_L1_SAME_SET,8,0);
181 182 183
    test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0);
    test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0);
    test_same_address_load_latency(1024, 0);
184
    test_read_after_write_latency(1024, 0);
185 186 187
    printf("total samples: %ld\n", _perf_g_total_samples);
}

void l2_l3_pressure_test()
{
190 191
    _perf_calibrate();
    printf("L2 and L3 same set pressure test:\n");
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
    for (int i = 1; i < 16; i++) {
        printf("ways accessed: %d\n", i);
        test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0);
        test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0);
    }
    for (int i = 16; i <= 512; i*=2) {
        printf("ways accessed: %d\n", i);
        // jump at i = 32
        test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L2_SAME_SET*i,_PERF_ADDR_STRIDE_L2_SAME_SET,64,0);
    }

    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*32,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*64,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    // jump at i = 128
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*128,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*256,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
    test_linear_access_latency_simple(_PERF_ADDR_STRIDE_L1_SAME_SET*512,_PERF_ADDR_STRIDE_L1_SAME_SET,64,0);
}

void legacy_latency_throughput_test()
217 218 219
{
    _perf_calibrate();
    printf("Memory throughput:\n");
220
    legacy_test_mem_throughput(1024);
221
    printf("L1 latency:\n");
222 223 224 225
    test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
    test_pointer_tracing_latency(_PERF_L1_NOALIAS_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    test_pointer_tracing_latency(_PERF_L1_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
226
    printf("L2 latency:\n");
227 228
    test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    // test_pointer_tracing_latency(_PERF_L2_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
229
    printf("L3 latency:\n");
230 231
    test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE/2, _PERF_CACHELINE_SIZE_BYTE, 2, 0);
    // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE,2, 0);
232
    // printf("MEM:\n");
233
    // test_pointer_tracing_latency(_PERF_L3_SIZE_BYTE*2, _PERF_CACHELINE_SIZE_BYTE,2, 0);
234
    printf("total sampl8es: %ld\n", _perf_g_total_samples);
235
}
// Program entry point.
//
// Runs, in order: the sanity-check example set, the linear-access and
// pointer-tracing latency matrix generators (each with 8*BYTE and with
// _PERF_CACHELINE_SIZE_BYTE), the replacement test matrix generator, the
// typical latency regression set and the L2/L3 pressure test.
// The commented-out calls are optional runs that are currently disabled.
//
// Always returns 0.
int main()
{
    latency_test_example();

    generate_linear_access_latency_matrix(8*BYTE);
    generate_linear_access_latency_matrix(_PERF_CACHELINE_SIZE_BYTE);
    generate_pointer_tracing_latency_matrix(8*BYTE);
    generate_pointer_tracing_latency_matrix(_PERF_CACHELINE_SIZE_BYTE);
    // generate_random_access_latency_matrix();
    generate_replacement_test_matrix();

    // matrix_print_example();
    typical_latency_test();
    // pointer_tracing_graph();
    // latency_test();
    // legacy_latency_throughput_test();
    l2_l3_pressure_test();

    return 0;
}