提交 c8d9202c 编写于 作者: W William Wang

apps: add basic latency and throughput test

上级 16f3b2bd
NAME = maprobe
SRCS = maprobe.c
include $(AM_HOME)/Makefile.app
// basic microarchtectural probe
#ifndef PROBE_H
#define PROBE_H
#include <klib.h>
#include <csr.h>
// perf const
#define BYTE (1)
#define KB (1024*BYTE)
#define MB (1024*KB)
#define GB (1024*MB)
// platform dependent const
// #define _PERF_TEST_ADDR_BASE 0x80400000
#define _PERF_TEST_ADDR_BASE 0x2000400000
#define _PERF_CACHELINE_SIZE_BYTE (64 * BYTE)
#define _PERF_L1_NOALIAS_SIZE_BYTE (32 * KB)
#define _PERF_L1_SIZE_BYTE (128 * KB)
#define _PERF_L2_SIZE_BYTE (512 * KB)
#define _PERF_L3_SIZE_BYTE (2 * MB)
// probe const
#define _PERF_BLACKHOLE _PERF_TEST_ADDR_BASE
struct perf
{
// const to be calibrated at run time
uint64_t csr_read_cycle; //# of cycles to read mcycle
uint64_t csr_read_ninst; // # of inst needed to read minstret
// timer
uint64_t cycle;
uint64_t instrcnt;
} perf;
void _perf_start_timer()
{
perf.cycle = csr_read(CSR_MCYCLE);
perf.instrcnt = csr_read(CSR_MINSTRET);
}
void _perf_end_timer()
{
perf.cycle = csr_read(CSR_MCYCLE) - perf.cycle;
perf.instrcnt = csr_read(CSR_MINSTRET) - perf.instrcnt;
}
void _perf_print_timer()
{
printf("cycle %d inst %d ipc %lf\n", perf.cycle, perf.instrcnt, (float)perf.instrcnt/perf.cycle);
}
void _perf_calibrate()
{
// csr read delay
uint64_t cycle_1 = csr_read(CSR_MCYCLE);
uint64_t cycle_2 = csr_read(CSR_MCYCLE);
perf.csr_read_cycle = cycle_2-cycle_1;
printf("perf_calibrate: csr_read_cycle %d\n", perf.csr_read_cycle);
// csr read inst cost
uint64_t inst_1 = csr_read(CSR_MINSTRET);
uint64_t inst_2 = csr_read(CSR_MINSTRET);
perf.csr_read_ninst = inst_2-inst_1;
printf("perf_calibrate: csr_read_ninst %d\n", perf.csr_read_ninst);
}
void _perf_blackhole(uint64_t value)
{
*(uint64_t*) _PERF_BLACKHOLE = value;
}
uint64_t setup_latency_test_linklist(uint64_t base_addr, uint64_t end_addr, uint64_t step)
{
uint64_t num_valid_node = 0;
assert(step % 8 == 0);
assert(step >= 8);
for (uint64_t cur_addr = base_addr; cur_addr < end_addr;) {
uint64_t next_addr = cur_addr + step;
*((uint64_t*)cur_addr) = next_addr;
cur_addr = next_addr;
num_valid_node++;
}
return num_valid_node;
}
uint64_t read_latency_test_linklist(uint64_t base_addr, uint64_t num_valid_node)
{
uint64_t cur_addr = base_addr;
for (int i = 0; i < num_valid_node; i++) {
cur_addr = (*(uint64_t*)cur_addr);
}
return cur_addr;
}
void warmup(uint64_t base_addr, uint64_t end_addr)
{
setup_latency_test_linklist(base_addr, end_addr, _PERF_CACHELINE_SIZE_BYTE);
}
void test_latency(uint64_t size, int iter)
{
volatile uint64_t result = 0; // make sure compiler will not opt read_latency_test_linklist
printf("range 0x%xB (%d iters) latency test\n", size, iter);
_perf_start_timer();
uint64_t nnode = setup_latency_test_linklist(_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size, _PERF_CACHELINE_SIZE_BYTE);
_perf_end_timer();
uint64_t total_node = nnode * iter;
// _perf_print_timer();
_perf_start_timer();
for (int i = 0; i < iter; i++) {
result += read_latency_test_linklist(_PERF_TEST_ADDR_BASE, nnode);
}
_perf_end_timer();
// _perf_print_timer();
printf("range 0x%xB (%d intrs) read latency %f (%d samples)\n", size, iter, (float)perf.cycle / total_node, total_node);
_perf_blackhole(result);
}
void test_mem_throughput(uint64_t iter)
{
uint64_t remain = iter;
uint64_t result = 0;
uint64_t access_addr = _PERF_TEST_ADDR_BASE;
_perf_start_timer();
while (remain--) {
result += *(uint64_t*) access_addr;
access_addr += _PERF_CACHELINE_SIZE_BYTE;
}
_perf_end_timer();
*(uint64_t*) _PERF_BLACKHOLE = result;
printf("mem band width %f B/cycle (%d samples)\n", (float)iter * _PERF_CACHELINE_SIZE_BYTE / perf.cycle, iter);
}
#endif
\ No newline at end of file
#include <klib.h>
#include "maprobe.h"
int main()
{
_perf_calibrate();
printf("Memory throughput:\n");
test_mem_throughput(512);
printf("L1 latency:\n");
test_latency(4 * KB, 5);
test_latency(_PERF_L1_NOALIAS_SIZE_BYTE, 2);
test_latency(_PERF_L1_SIZE_BYTE/2, 2);
test_latency(_PERF_L1_SIZE_BYTE, 2);
printf("L2 latency:\n");
test_latency(_PERF_L2_SIZE_BYTE/2, 2);
// test_latency(_PERF_L2_SIZE_BYTE, 2);
printf("L3 latency:\n");
test_latency(_PERF_L3_SIZE_BYTE/2, 2);
// test_latency(_PERF_L3_SIZE_BYTE,2);
// printf("MEM:\n");
// test_latency(_PERF_L3_SIZE_BYTE*2,2);
return 0;
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册