Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenXiangShan
nexus-am
提交
ac66935e
N
nexus-am
项目概览
OpenXiangShan
/
nexus-am
9 个月 前同步成功
通知
0
Star
21
Fork
25
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
N
nexus-am
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
ac66935e
编写于
3月 07, 2023
作者:
W
William Wang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
maprobe: add basic store test and batch load test
上级
47fe2bc9
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
185 additions
and
5 deletions
+185
-5
apps/maprobe/include/maprobe.h
apps/maprobe/include/maprobe.h
+3
-0
apps/maprobe/latency-test.c
apps/maprobe/latency-test.c
+160
-5
apps/maprobe/main.c
apps/maprobe/main.c
+22
-0
未找到文件。
apps/maprobe/include/maprobe.h
浏览文件 @
ac66935e
...
...
@@ -72,6 +72,9 @@ extern void test_linear_access_latency(uint64_t size, uint64_t step, int iter, i
// Random (cache-line granular) load latency: performs `num_access` accesses
// within `test_range`, each aligned to `test_align`. `pregen_addr` != 0
// pre-generates the address array instead of generating addresses at run time
// (presumably to take address generation off the measured path — confirm in
// latency-test.c). `iter` repeats the measurement; `to_csv` selects CSV output.
extern void test_random_access_latency(uint64_t num_access, uint64_t test_range, uint64_t test_align, int pregen_addr, int iter, int to_csv);
// Repeatedly loads the same address to measure best-case (cache-hit) load latency.
extern void test_same_address_load_latency(int iter, int to_csv);
// Measures load latency when the load follows a store to the same location
// (store-to-load forwarding path).
extern void test_read_after_write_latency(int iter, int to_csv);
// Ideal L1 dcache load bandwidth over a `size`-byte range (8-byte loads).
extern void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv);
// Ideal L1 dcache store bandwidth over a `size`-byte range (8-byte stores).
extern void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv);
// Store bandwidth with one store per cacheline, exercising the
// write-combine buffer / L1-L2 writeback path.
extern void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv);
// Legacy throughput tests kept for comparison with older measurements.
extern void legacy_test_mem_throughput(uint64_t iter);
extern void legacy_test_mem_throughput_same_set(uint64_t iter);
...
...
apps/maprobe/latency-test.c
浏览文件 @
ac66935e
...
...
@@ -79,7 +79,7 @@ void test_same_address_load_latency(int iter, int to_csv)
{
// printf("same address load latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
register
uint64_t
result
=
0
;
// make sure compiler will not opt read_pointer_tracing_linklist
register
uint64_t
result
=
0
;
// _perf_print_timer();
_perf_start_timer
();
...
...
@@ -130,11 +130,11 @@ void test_read_after_write_latency(int iter, int to_csv)
_perf_g_total_samples
+=
total_access
;
}
void
test_linear_access_latency
(
uint64_t
size
,
uint64_t
step
,
int
iter
,
int
to_csv
)
void
test_linear_access_latency
_simple
(
uint64_t
size
,
uint64_t
step
,
int
iter
,
int
to_csv
)
{
// printf("stride %d linear access latency test\n", step);
// printf("range (B), read latency, iters, samples, cycles\n");
register
uint64_t
result
=
0
;
// make sure compiler will not opt read_pointer_tracing_linklist
register
uint64_t
result
=
0
;
uint64_t
num_access
=
size
/
step
;
// _perf_print_timer();
...
...
@@ -152,7 +152,7 @@ void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_c
if
(
to_csv
)
{
printf
(
"%ld, %f, %d, %ld, %ld
\n
"
,
size
,
(
float
)
perf
.
cycle
/
total_access
,
iter
,
total_access
,
perf
.
cycle
);
}
else
{
printf
(
"range %ldKB (%d iters) linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB
\n
"
,
printf
(
"range %ldKB (%d iters)
simple
linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB
\n
"
,
size
/
KB
,
iter
,
(
float
)
perf
.
cycle
/
total_access
,
total_access
*
8
*
BYTE
/
(
float
)
perf
.
cycle
,
total_access
,
perf
.
cycle
,
step
);
}
...
...
@@ -161,13 +161,168 @@ void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_c
_perf_g_total_samples
+=
total_access
;
}
// Linear load latency test, 8-way unrolled: each inner-loop iteration issues
// 8 independent loads so the core can overlap them, measuring latency under
// memory-level parallelism rather than fully serialized load latency.
//
//   size:   total range to walk, in bytes
//   step:   stride between consecutive loads, in bytes
//   iter:   number of passes over the range
//   to_csv: nonzero emits a CSV row instead of human-readable text
//
// Cycle counts come from the global `perf` timer; the sample count is
// accumulated into `_perf_g_total_samples`.
void test_linear_access_latency_batch8(uint64_t size, uint64_t step, int iter, int to_csv)
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    register uint64_t result = 0; // make sure compiler will not opt the test away
    uint64_t num_access = size / step;
    // round the access count up to a multiple of 8 so the unrolled loop divides evenly
    num_access += num_access % 8 ? 8 - num_access % 8 : 0;
    assert(num_access >= 8);
    // prepare access offsets; `register` hints keep address generation cheap
    uint64_t address_offset_0 = 0;
    register uint64_t address_offset_1 = step * 1;
    register uint64_t address_offset_2 = step * 2;
    register uint64_t address_offset_3 = step * 3;
    register uint64_t address_offset_4 = step * 4;
    register uint64_t address_offset_5 = step * 5;
    register uint64_t address_offset_6 = step * 6;
    register uint64_t address_offset_7 = step * 7;
    register uint64_t address_offset_8 = step * 8;
    // _perf_print_timer();
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
        // BUGFIX: restart at the base address on every pass. Previously the
        // pointer was initialized once before the iter loop, so with iter > 1
        // the walk ran past [_PERF_TEST_ADDR_BASE, _PERF_TEST_ADDR_BASE + size)
        // and the reported "range" no longer matched the memory touched.
        uint64_t address = _PERF_TEST_ADDR_BASE;
        // BUGFIX: j is uint64_t to match num_access (was a signed/unsigned mix)
        for (uint64_t j = 0; j < num_access; j += 8) {
            register uint64_t access_addr_0 = address + address_offset_0;
            register uint64_t access_addr_1 = address + address_offset_1;
            register uint64_t access_addr_2 = address + address_offset_2;
            register uint64_t access_addr_3 = address + address_offset_3;
            register uint64_t access_addr_4 = address + address_offset_4;
            register uint64_t access_addr_5 = address + address_offset_5;
            register uint64_t access_addr_6 = address + address_offset_6;
            register uint64_t access_addr_7 = address + address_offset_7;
            address += address_offset_8;
            // 8 independent loads; a0 is clobbered, results are intentionally discarded
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_0) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_1) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_2) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_3) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_4) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_5) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_6) : "a0");
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(access_addr_7) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    uint64_t total_access = num_access * iter;
    if (to_csv) {
        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
    } else {
        // BUGFIX: step is uint64_t, so print it with %ld (was %d, a format mismatch)
        printf("range %ldKB (%d iters) batch(8) linear read latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %ldB\n",
            size / KB, iter, (float)perf.cycle / total_access,
            total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, step);
    }
    _perf_blackhole(result);
    _perf_g_total_samples += total_access;
}
// Default linear access latency test: forwards to the 8-way batched
// implementation (8 independent loads per inner-loop iteration).
// Signature is kept so existing callers and the maprobe.h prototype still work.
void test_linear_access_latency(uint64_t size, uint64_t step, int iter, int to_csv)
{
    test_linear_access_latency_batch8(size, step, iter, to_csv);
}
// Ideal L1 dcache load bandwidth: walks a `size`-byte range one cacheline at
// a time, issuing 8 back-to-back 8-byte loads per line (hand-unrolled so the
// measured instruction pattern is fixed regardless of compiler optimization).
//
//   size:   range to read, in bytes (must cover at least one cacheline)
//   iter:   number of passes over the range
//   to_csv: nonzero emits a CSV row instead of human-readable text
void test_l1_load_bandwidth(uint64_t size, int iter, int to_csv)
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
    // _perf_print_timer();
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
        // walk cacheline by cacheline; each line is fully read with 8 x 8-byte loads
        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
            // a0 is clobbered; loaded values are intentionally discarded
            __asm__ volatile ("ld a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("ld a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("ld a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("ld a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("ld a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("ld a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("ld a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("ld a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    // 8 loads per cacheline per pass
    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
    if (to_csv) {
        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
    } else {
        printf("range %ldKB (%d iters) dcache linear (8Byte) read, latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", size / KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8);
    }
    _perf_g_total_samples += total_access;
}
// Ideal L1 dcache store bandwidth: walks a `size`-byte range one cacheline at
// a time, issuing 8 back-to-back 8-byte stores per line (hand-unrolled so the
// measured instruction pattern is fixed regardless of compiler optimization).
// Note: the stored value is whatever happens to be in a0 — only bandwidth
// matters here, not data content.
//
//   size:   range to write, in bytes (must cover at least one cacheline)
//   iter:   number of passes over the range
//   to_csv: nonzero emits a CSV row instead of human-readable text
void test_l1_store_bandwidth(uint64_t size, int iter, int to_csv)
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
    // _perf_print_timer();
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
        // walk cacheline by cacheline; each line is fully written with 8 x 8-byte stores
        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("sd a0, 8(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("sd a0, 16(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("sd a0, 24(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("sd a0, 32(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("sd a0, 40(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("sd a0, 48(%[addr])\n" :: [addr] "r"(address) : "a0");
            __asm__ volatile ("sd a0, 56(%[addr])\n" :: [addr] "r"(address) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    // 8 stores per cacheline per pass
    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * 8 * iter;
    if (to_csv) {
        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
    } else {
        printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (%ld samples, %ld cycles), stride %dB\n", size / KB, iter, (float)perf.cycle / total_access, total_access * 8 * BYTE / (float)perf.cycle, total_access, perf.cycle, 8);
    }
    _perf_g_total_samples += total_access;
}
// Write-combine buffer bandwidth test: one 8-byte store per cacheline, so each
// line is touched exactly once per pass. Every line still has to be
// transferred between L1 and L2, which is why the report also includes an
// L1-L2 figure scaled by the cacheline size.
//
//   size:   range to write, in bytes (must cover at least one cacheline)
//   iter:   number of passes over the range
//   to_csv: nonzero emits a CSV row instead of human-readable text
void test_l1_store_wcb_bandwidth(uint64_t size, int iter, int to_csv)
{
    // printf("stride %d linear access latency test\n", step);
    // printf("range (B), read latency, iters, samples, cycles\n");
    assert(size >= _PERF_CACHELINE_SIZE_BYTE);
    // _perf_print_timer();
    _perf_start_timer();
    for (int i = 0; i < iter; i++) {
        // a single 8-byte store per cacheline; stored value (a0) is don't-care
        for (uint64_t address = _PERF_TEST_ADDR_BASE; address < _PERF_TEST_ADDR_BASE + size; address += _PERF_CACHELINE_SIZE_BYTE) {
            __asm__ volatile ("sd a0, 0(%[addr])\n" :: [addr] "r"(address) : "a0");
        }
    }
    _perf_end_timer();
    // _perf_print_timer();
    // one store per cacheline per pass
    uint64_t total_access = size / _PERF_CACHELINE_SIZE_BYTE * iter;
    if (to_csv) {
        printf("%ld, %f, %d, %ld, %ld\n", size, (float)perf.cycle / total_access, iter, total_access, perf.cycle);
    } else {
        // BUGFIX: report the actual stride — stores are one per cacheline, so the
        // stride is _PERF_CACHELINE_SIZE_BYTE, not the 8B previously printed
        // (copy-paste from the dense store test).
        printf("range %ldKB (%d iters) dcache linear (8Byte) store latency %f, throughput %f B/cycle (L1-L2 %f B/cycle) (%ld samples, %ld cycles), stride %dB\n",
            size / KB, iter, (float)perf.cycle / total_access,
            total_access * 8 * BYTE / (float)perf.cycle,
            total_access * _PERF_CACHELINE_SIZE_BYTE / (float)perf.cycle,
            total_access, perf.cycle, (int)_PERF_CACHELINE_SIZE_BYTE);
    }
    _perf_g_total_samples += total_access;
}
void
test_random_access_latency
(
uint64_t
num_access
,
uint64_t
test_range
,
uint64_t
test_align
,
int
pregen_addr
,
int
iter
,
int
to_csv
)
{
// printf("align %d random access (cache line) latency test, %s\n",
// test_align, pregen_addr ? "use pregen addr array" : "gen rand addr at run time"
// );
// printf("range (B), read latency, iters, samples, cycles\n");
register
uint64_t
result
=
0
;
// make sure compiler will not opt read_pointer_tracing_linklist
register
uint64_t
result
=
0
;
// _perf_print_timer();
// alloc memory for random access addr array and data
...
...
apps/maprobe/main.c
浏览文件 @
ac66935e
...
...
@@ -110,10 +110,25 @@ void typical_memory_disambiuation_test_set()
// more to be added
}
/* Quick L1 dcache regression set: ideal load, store, and write-combine
 * buffer bandwidth over the full L1 range, each at two iteration counts
 * (a short warm run and a longer steady-state run). */
void typical_l1_access_test_set()
{
    printf("------------- typical dcache access pattern test set -------------\n");

    printf("ideal load bandwidth:\n");
    const int load_iters[2] = {2, 10};
    for (int k = 0; k < 2; k++) {
        test_l1_load_bandwidth(_PERF_L1_SIZE_BYTE, load_iters[k], 0);
    }

    printf("ideal store bandwidth:\n");
    const int store_iters[2] = {2, 10};
    for (int k = 0; k < 2; k++) {
        test_l1_store_bandwidth(_PERF_L1_SIZE_BYTE, store_iters[k], 0);
    }

    printf("ideal write combine buffer bandwidth:\n");
    const int wcb_iters[2] = {2, 5};
    for (int k = 0; k < 2; k++) {
        test_l1_store_wcb_bandwidth(_PERF_L1_SIZE_BYTE, wcb_iters[k], 0);
    }
}
// typical latency test for fast regression
void
typical_latency_test
()
{
_perf_g_total_samples
=
0
;
typical_l1_access_test_set
();
typical_linear_load_test_set
();
typical_random_load_test_set
();
typical_pointer_tracing_load_test_set
();
...
...
@@ -144,8 +159,15 @@ void latency_test_example()
{
_perf_calibrate
();
printf
(
"latency test example:
\n
"
);
test_l1_load_bandwidth
(
4
*
KB
,
5
,
0
);
test_l1_load_bandwidth
(
4
*
KB
,
5
,
0
);
test_l1_store_bandwidth
(
4
*
KB
,
5
,
0
);
test_l1_store_bandwidth
(
4
*
KB
,
5
,
0
);
test_l1_store_wcb_bandwidth
(
8
*
KB
,
5
,
0
);
test_l1_store_wcb_bandwidth
(
8
*
KB
,
5
,
0
);
test_pointer_tracing_latency
(
_PERF_PAGE_SIZE_BYTE
,
_PERF_CACHELINE_SIZE_BYTE
,
5
,
0
);
test_linear_access_latency
(
_PERF_PAGE_SIZE_BYTE
,
sizeof
(
uint64_t
),
5
,
0
);
test_linear_access_latency
(
_PERF_PAGE_SIZE_BYTE
,
sizeof
(
uint64_t
),
5
,
0
);
test_linear_access_latency
(
_PERF_PAGE_SIZE_BYTE
,
_PERF_CACHELINE_SIZE_BYTE
,
5
,
0
);
test_random_access_latency
(
4096
,
1024
*
MB
,
_PERF_CACHELINE_SIZE_BYTE
,
0
,
1
,
0
);
test_random_access_latency
(
4096
,
1024
*
MB
,
_PERF_CACHELINE_SIZE_BYTE
,
1
,
1
,
0
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录