Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
陶辉
geektime_distrib_perf
提交
a87d65d0
G
geektime_distrib_perf
项目概览
陶辉
/
geektime_distrib_perf
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
1
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
G
geektime_distrib_perf
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
1
合并请求
1
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
a87d65d0
编写于
11月 07, 2019
作者:
R
russelltao
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update readme
上级
5f2763dd
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
68 addition
and
83 deletion
+68
-83
1-cpu_cache/branch_predict/README.md
1-cpu_cache/branch_predict/README.md
+1
-0
1-cpu_cache/branch_predict/branch_predict.cpp
1-cpu_cache/branch_predict/branch_predict.cpp
+10
-0
1-cpu_cache/cpu_migrate/README.md
1-cpu_cache/cpu_migrate/README.md
+40
-78
1-cpu_cache/cpu_migrate/cpu_migrate.cpp
1-cpu_cache/cpu_migrate/cpu_migrate.cpp
+17
-5
未找到文件。
1-cpu_cache/branch_predict/README.md
浏览文件 @
a87d65d0
...
...
@@ -20,6 +20,7 @@
#### 遍历有序数组
`./branch_predict -f`
消耗时间(毫秒):350
### c. 使用perf验证缓存命中率
#### 遍历随机数组
`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./branch_predict -s`
```
...
...
1-cpu_cache/branch_predict/branch_predict.cpp
浏览文件 @
a87d65d0
...
...
@@ -12,6 +12,7 @@ using namespace std;
// Returns the interval between two clock() readings, t1 (earlier) and
// t2 (later), expressed in milliseconds and truncated to a whole number.
// Unlike reading the system time, clock() counts the CPU clock ticks the
// process has consumed since it started.
long timediff(clock_t t1, clock_t t2) {
    // Do the subtraction in floating point so the tick difference keeps its sign.
    const double tick_delta = (double)t2 - t1;
    // Ticks -> seconds -> milliseconds.
    const double millis = tick_delta / CLOCKS_PER_SEC * 1000;
    return (long)millis;
}
...
...
@@ -22,12 +23,15 @@ int main(int argc, char** argv) {
while
((
ch
=
getopt
(
argc
,
argv
,
"fsg"
))
!=
-
1
)
{
switch
(
ch
)
{
//遍历随机数组
case
's'
:
mode
=
1
;
break
;
//遍历有序数组
case
'f'
:
mode
=
2
;
break
;
//生成数组至文件中,不影响遍历过程时的perf统计
case
'g'
:
mode
=
3
;
break
;
...
...
@@ -37,12 +41,16 @@ int main(int argc, char** argv) {
unsigned
char
*
arr
=
new
unsigned
char
[
TESTN
];
if
(
3
==
mode
)
{
//构造随机数组
for
(
long
i
=
0
;
i
<
TESTN
;
i
++
)
arr
[
i
]
=
rand
()
%
256
;
ofstream
ofs
;
//随机数组写入文件
ofs
.
open
(
"rand.array"
,
ios
::
out
|
ios
::
binary
);
ofs
.
write
((
const
char
*
)
arr
,
TESTN
);
ofs
.
close
();
//数组排序
sort
(
arr
,
arr
+
TESTN
);
//有序数组写入文件
ofs
.
open
(
"sort.array"
,
ios
::
out
|
ios
::
binary
);
ofs
.
write
((
const
char
*
)
arr
,
TESTN
);
ofs
.
close
();
...
...
@@ -57,9 +65,11 @@ int main(int argc, char** argv) {
ifs
.
open
(
fname
);
ifs
.
read
((
char
*
)
arr
,
TESTN
);
//使用clock比取系统时间能够更准确的看到消耗了多少CPU资源
clock_t
start
,
end
;
start
=
clock
();
for
(
long
i
=
0
;
i
<
TESTN
;
i
++
)
{
//条件分支预测在这里发生作用
if
(
arr
[
i
]
<
128
)
arr
[
i
]
=
0
;
}
end
=
clock
();
...
...
1-cpu_cache/cpu_migrate/README.md
浏览文件 @
a87d65d0
...
...
@@ -9,95 +9,57 @@
#### 安装编译依赖的软件
如Linux中需要安装gcc-c++,CentOS中可用
`yum install gcc-c++`
安装,Ubuntu中可用
`apt-get install g++`
#### 编译程序
`g++ traverse_2d_array.cpp -o traverse_2d_array`
`g++ cpu_migrate.cpp -o cpu_migrate -lpthread`
*
注意,多线程依赖pthread库,编译时需要链接
### b. 运行验证
#### 使用
array[i][j]遍历数组
`./
traverse_2d_array -f
`
消耗时间(毫秒):10
#### 使用
array[j][i]遍历数组
`./
traverse_2d_array -s
`
消耗时间(毫秒):70
#### 使用
14个(共28个CPU核心)并发线程测试,不绑定CPU
`./
cpu_migrate -t 14 -s
`
平均每线程消耗时间(毫秒):1083
#### 使用
14个(共28个CPU核心)并发线程测试,绑定CPU
`./
cpu_migrate -t 14 -f
`
平均每线程消耗时间(毫秒):926
### c. 使用perf验证缓存命中率
#### 使用
array[i][j]遍历数组
`perf stat -e c
ache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -f
`
#### 使用
14个(共28个CPU核心)并发线程测试,不绑定CPU
`perf stat -e c
pu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -s
`
*
输出结果:
```
Performance counter stats for './
traverse_2d_array -f
':
Performance counter stats for './
cpu_migrate -t 14 -s
':
147,927 cache-references (80.14%)
13,215 cache-misses # 8.933 % of all cache refs (65.49%)
54,454,827 instructions # 1.43 insn per cycle (85.11%)
38,197,267 cycles (85.09%)
161,503 L1-dcache-load-misses # 0.90% of all L1-dcache hits (85.09%)
18,035,307 L1-dcache-loads (84.19%)
10 cpu-migrations
8,193,825 cache-references (44.40%)
175,792 cache-misses # 2.145 % of all cache refs (44.34%)
45,480,238,906 instructions # 1.30 insn per cycle (55.47%)
35,111,144,560 cycles (55.47%)
11,997,428 L1-dcache-load-misses # 0.05% of all L1-dcache hits (55.57%)
26,407,960,253 L1-dcache-loads (55.60%)
2,459,766 L1-icache-load-misses (55.66%)
2,136,304 branch-load-misses (44.53%)
3,825,848,726 branch-loads (44.43%)
0.020651344
seconds time elapsed
1.251076337
seconds time elapsed
0.018625
000 seconds user
0.
002069
000 seconds sys
14.630618
000 seconds user
0.
459616
000 seconds sys
```
#### 使用
array[j][i]遍历数组
`perf stat -e c
ache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -s
`
#### 使用
14个(共28个CPU核心)并发线程测试,绑定CPU
`perf stat -e c
pu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -f
`
*
输出结果:
```
Performance counter stats for './
traverse_2d_array -s
':
Performance counter stats for './
cpu_migrate -t 14 -f
':
4,341,186 cache-references (83.01%)
13,974 cache-misses # 0.322 % of all cache refs (66.03%)
55,245,646 instructions # 0.25 insn per cycle (83.01%)
218,787,967 cycles (83.00%)
4,308,394 L1-dcache-load-misses # 23.79% of all L1-dcache hits (83.86%)
18,112,753 L1-dcache-loads (84.10%)
14 cpu-migrations
4,983,541 cache-references (44.42%)
1,611,627 cache-misses # 32.339 % of all cache refs (44.34%)
45,523,818,723 instructions # 1.52 insn per cycle (55.43%)
29,972,627,158 cycles (55.46%)
5,812,831 L1-dcache-load-misses # 0.02% of all L1-dcache hits (55.53%)
26,388,005,477 L1-dcache-loads (55.58%)
1,262,533 L1-icache-load-misses (55.66%)
1,363,376 branch-load-misses (44.54%)
3,828,570,015 branch-loads (44.47%)
0.
082950118
seconds time elapsed
0.
948650967
seconds time elapsed
0.079066
000 seconds user
0.
0039
53000 seconds sys
12.489932
000 seconds user
0.
4562
53000 seconds sys
```
## 3. Java程序
### a. 编译程序
`javac traverse_2d_array.java`
### b.运行验证
#### 使用array[i][j]遍历数组
`java traverse_2d_array -f`
消耗时间(毫秒):20
#### 使用array[j][i]遍历数组
`java traverse_2d_array -s`
消耗时间(毫秒):100
### c. 使用perf验证缓存命中率
#### 使用array[i][j]遍历数组
`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads java traverse_2d_array -f`
*
输出结果:
```
Performance counter stats for 'java traverse_2d_array -f':
6,379,138 cache-references (80.62%)
866,578 cache-misses # 13.585 % of all cache refs (68.93%)
459,726,039 instructions # 1.51 insn per cycle (85.22%)
303,673,757 cycles (85.69%)
5,270,707 L1-dcache-load-misses # 3.96% of all L1-dcache hits (81.64%)
133,211,743 L1-dcache-loads (83.13%)
0.126089887 seconds time elapsed
0.122353000 seconds user
0.047877000 seconds sys
```
#### 使用array[j][i]遍历数组
`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads java traverse_2d_array -s`
*
输出结果:
```
Performance counter stats for 'java traverse_2d_array -s':
42,441,956 cache-references (80.21%)
872,336 cache-misses # 2.055 % of all cache refs (66.61%)
386,326,280 instructions # 0.71 insn per cycle (84.29%)
544,411,061 cycles (85.01%)
38,884,991 L1-dcache-load-misses # 32.48% of all L1-dcache hits (85.24%)
119,711,464 L1-dcache-loads (82.94%)
0.192838747 seconds time elapsed
0.200693000 seconds user
0.052919000 seconds sys
```
\ No newline at end of file
1-cpu_cache/cpu_migrate/cpu_migrate.cpp
浏览文件 @
a87d65d0
...
...
@@ -18,26 +18,31 @@ void* loopcalc(void* args) {
if
(
setaffinity
)
{
cpu_set_t
mask
;
//CPU核的集合
cpu_set_t
get
;
//获取在集合中的CPU
//获取线程的序列号
int
*
thread_num
=
(
int
*
)
args
;
CPU_ZERO
(
&
mask
);
//置空
CPU_SET
(
*
thread_num
,
&
mask
);
//设置亲和力值
if
(
sched_setaffinity
(
0
,
sizeof
(
mask
),
&
mask
)
==
-
1
)
//设置线程CPU亲和力
//将当前线程绑定至特定CPU
CPU_ZERO
(
&
mask
);
CPU_SET
(
*
thread_num
,
&
mask
);
if
(
sched_setaffinity
(
0
,
sizeof
(
mask
),
&
mask
)
==
-
1
)
{
cout
<<
"warning: could not set CPU affinity, continuing...
\n
"
;
}
}
timeval
tStart
,
tEnd
;
//这里不再使用clock,因为clock表示的进程所占用过的CPU周期,它将所有CPU都计入了,不适合示例中的统计
gettimeofday
(
&
tStart
,
0
);
//这个循环中由于反复访问有限的数组,CPU缓存命中率非常高
unsigned
char
*
arr
=
new
unsigned
char
[
TESTN
];
for
(
long
i
=
0
;
i
<
TESTN
;
i
++
)
arr
[
i
]
=
rand
()
%
256
;
for
(
int
j
=
1
;
j
<
16
*
1024
;
j
++
)
{
for
(
int
j
=
1
;
j
<
TESTN
;
j
++
)
{
for
(
long
i
=
0
;
i
<
TESTN
;
i
++
)
arr
[
i
]
+=
1
;
}
gettimeofday
(
&
tEnd
,
0
);
//将消耗时间传出到timecost数组中对应的元素上
*
(
long
*
)
args
=
(
1000000LL
*
(
tEnd
.
tv_sec
-
tStart
.
tv_sec
)
+
(
tEnd
.
tv_usec
-
tStart
.
tv_usec
))
/
1000
;
}
...
...
@@ -47,12 +52,15 @@ int main(int argc, char** argv) {
while
((
ch
=
getopt
(
argc
,
argv
,
"t:fs"
))
!=
-
1
)
{
switch
(
ch
)
{
//设置测试的并发线程数,注意不要超过机器上的CPU核数
case
't'
:
threadnum
=
atoi
(
optarg
);
break
;
//将线程绑定至特定CPU上
case
'f'
:
setaffinity
=
true
;
break
;
//不绑定CPU
case
's'
:
setaffinity
=
false
;
break
;
...
...
@@ -60,8 +68,10 @@ int main(int argc, char** argv) {
}
pthread_t
*
id
=
new
pthread_t
[
threadnum
];
//统计每个线程计算所需要的时间
long
*
timecost
=
new
long
[
threadnum
];
for
(
int
i
=
0
;
i
<
threadnum
;
i
++
)
{
//最初timecost用于传递线程号,用于绑定CPU
timecost
[
i
]
=
i
;
int
ret
=
pthread_create
(
&
id
[
i
],
NULL
,
loopcalc
,
&
timecost
[
i
]);
...
...
@@ -72,10 +82,12 @@ int main(int argc, char** argv) {
}
long
costsum
=
0
;
//等待所有线程结束
for
(
int
i
=
0
;
i
<
threadnum
;
i
++
)
{
pthread_join
(
id
[
i
],
NULL
);
costsum
+=
timecost
[
i
];
}
//比较平均每线程所用时间
cout
<<
"costsum: "
<<
costsum
<<
", avg: "
<<
costsum
/
threadnum
<<
endl
;
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录