提交 ac66935e 编写于 作者: W William Wang

maprobe: add basic store test and batch load test

上级 47fe2bc9
......@@ -72,6 +72,9 @@ extern void test_linear_access_latency(uint64_t size, uint64_t step, int iter, i
extern void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
extern void test_same_address_load_latency(int iter, int to_csv);
extern void test_read_after_write_latency(int iter, int to_csv);
extern void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv);
extern void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
extern void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);
extern void legacy_test_mem_throughput(uint64_t iter);
extern void legacy_test_mem_throughput_same_set(uint64_t iter);
......
......@@ -79,7 +79,7 @@ void test_same_address_load_latency(int iter, int to_csv)
{
// printf("same address load latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
register uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
register uint64_t result = 0;
// _perf_print_timer();
_perf_start_timer();
......@@ -130,11 +130,11 @@ void test_read_after_write_latency(int iter, int to_csv)
_perf_g_total_samples += total_access;
}
void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv)
void test_linear_access_latency_simple(uint64_t size, uint64_t step, int iter, int to_csv)
{
// printf("stride %d linear access latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
register uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
register uint64_t result = 0;
uint64_t num_access = size / step;
// _perf_print_timer();
......@@ -152,7 +152,7 @@ void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_c
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
printf("range %ldKB (%d iters) linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
printf("range %ldKB (%d iters) simple linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
);
}
......@@ -161,13 +161,168 @@ void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_c
_perf_g_total_samples += total_access;
}
void test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv)
{
// printf("stride %d linear access latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
register uint64_t result = 0;
uint64_t num_access = size / step;
num_access += num_access % 8 ? 8 - num_access % 8 : 0;
assert(num_access >= 8);
// prepare access offset
uint64_t address_offset_0 = 0;
register uint64_t address_offset_1 = step * 1;
register uint64_t address_offset_2 = step * 2;
register uint64_t address_offset_3 = step * 3;
register uint64_t address_offset_4 = step * 4;
register uint64_t address_offset_5 = step * 5;
register uint64_t address_offset_6 = step * 6;
register uint64_t address_offset_7 = step * 7;
register uint64_t address_offset_8 = step * 8;
// _perf_print_timer();
_perf_start_timer();
uint64_t address = _PERF_TEST_ADDR_BASE;
for (int i = 0; i < iter; i++) {
for (int j = 0; j < num_access; j += 8) {
register uint64_t access_addr_0 = address + address_offset_0;
register uint64_t access_addr_1 = address + address_offset_1;
register uint64_t access_addr_2 = address + address_offset_2;
register uint64_t access_addr_3 = address + address_offset_3;
register uint64_t access_addr_4 = address + address_offset_4;
register uint64_t access_addr_5 = address + address_offset_5;
register uint64_t access_addr_6 = address + address_offset_6;
register uint64_t access_addr_7 = address + address_offset_7;
address += address_offset_8;
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0");
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0");
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0");
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0");
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0");
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0");
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0");
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
uint64_t total_access = num_access * iter;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
printf("range %ldKB (%d iters) batch(8) linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step
);
}
_perf_blackhole(result);
_perf_g_total_samples += total_access;
}
void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv)
{
test_linear_access_latency_batch8(size, step, iter, to_csv);
}
void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv)
{
// printf("stride %d linear access latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
__asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
);
}
_perf_g_total_samples += total_access;
}
void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv)
{
// printf("stride %d linear access latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
__asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8
);
}
_perf_g_total_samples += total_access;
}
void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
{
// printf("stride %d linear access latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
assert(size >= _PERF_CACHELINE_SIZE_BYTE);
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
__asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
}
}
_perf_end_timer();
// _perf_print_timer();
uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter;
if (to_csv) {
printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
} else {
printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
size/KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle, total_access, perf.cycle, 8
);
}
_perf_g_total_samples += total_access;
}
void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv)
{
// printf("align %d random access (cache line) latency test, %s\n",
// test_align, pregen_addr ? "use pregen addr array" : "gen rand addr at run time"
// );
// printf("range (B), read latency, iters, samples, cycles\n");
register uint64_t result = 0; // make sure compiler will not opt read_pointer_tracing_linklist
register uint64_t result = 0;
// _perf_print_timer();
// alloc memory for random access addr array and data
......
......@@ -110,10 +110,25 @@ void typical_memory_disambiuation_test_set()
// more to be added
}
void typical_l1_access_test_set()
{
printf("------------- typical dcache access pattern test set -------------\n");
printf("ideal load bandwidth:\n");
test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);
printf("ideal store bandwidth:\n");
test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, 10, 0);
printf("ideal write combine buffer bandwidth:\n");
test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 2, 0);
test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, 5, 0);
}
// typical latency test for fast regression
void typical_latency_test()
{
_perf_g_total_samples = 0;
typical_l1_access_test_set();
typical_linear_load_test_set();
typical_random_load_test_set();
typical_pointer_tracing_load_test_set();
......@@ -144,8 +159,15 @@ void latency_test_example()
{
_perf_calibrate();
printf("latency test example:\n");
test_l1_load_bandwidth(4*KB, 5, 0);
test_l1_load_bandwidth(4*KB, 5, 0);
test_l1_store_bandwidth(4*KB, 5, 0);
test_l1_store_bandwidth(4*KB, 5, 0);
test_l1_store_wcb_bandwidth(8*KB, 5, 0);
test_l1_store_wcb_bandwidth(8*KB, 5, 0);
test_pointer_tracing_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, sizeof(uint64_t), 5, 0);
test_linear_access_latency(_PERF_PAGE_SIZE_BYTE, _PERF_CACHELINE_SIZE_BYTE, 5, 0);
test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 0, 1, 0);
test_random_access_latency(4096, 1024*MB, _PERF_CACHELINE_SIZE_BYTE, 1, 1, 0);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册