diff --git a/1-cpu_cache/branch_predict/README.md b/1-cpu_cache/branch_predict/README.md index be498a4a404e1a842daa01654686cbe90ba997ae..fd3b3fcdf749b217058b273c55de02a6039135f7 100644 --- a/1-cpu_cache/branch_predict/README.md +++ b/1-cpu_cache/branch_predict/README.md @@ -20,6 +20,7 @@ #### 遍历有序数组 `./branch_predict -f` 消耗时间(毫秒):350 +### c. 使用perf验证缓存命中率 #### 遍历随机数组 `perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./branch_predict -` ``` diff --git a/1-cpu_cache/branch_predict/branch_predict.cpp b/1-cpu_cache/branch_predict/branch_predict.cpp index ea61e98a0405d26967b78979155a9e9e13edb131..29400d0a19b5d61049baecca1194a38fc2d698a0 100644 --- a/1-cpu_cache/branch_predict/branch_predict.cpp +++ b/1-cpu_cache/branch_predict/branch_predict.cpp @@ -12,6 +12,7 @@ using namespace std; long timediff(clock_t t1, clock_t t2) { long elapsed; + //使用clock统计与取系统时间不同,它表示从进程启动到当下所消耗的CPU计时单元数 elapsed = ((double)t2 - t1) / CLOCKS_PER_SEC * 1000; return elapsed; } @@ -22,12 +23,15 @@ int main(int argc, char** argv) { while((ch = getopt(argc, argv, "fsg")) != -1) { switch(ch) { + //遍历随机数组 case 's': mode = 1; break; + //遍历有序数组 case 'f': mode = 2; break; + //生成数组至文件中,不影响遍历过程时的perf统计 case 'g': mode = 3; break; @@ -37,12 +41,16 @@ int main(int argc, char** argv) { unsigned char* arr = new unsigned char[TESTN]; if (3 == mode) { + //构造随机数组 for (long i = 0; i < TESTN; i++) arr[i] = rand() % 256; ofstream ofs; + //随机数组写入文件 ofs.open("rand.array", ios::out | ios::binary); ofs.write((const char*)arr, TESTN); ofs.close(); + //数组排序 sort(arr,arr+TESTN); + //有序数组写入文件 ofs.open("sort.array", ios::out | ios::binary); ofs.write((const char*)arr, TESTN); ofs.close(); @@ -57,9 +65,11 @@ int main(int argc, char** argv) { ifs.open(fname); ifs.read((char *)arr, TESTN); + //使用clock比取系统时间能够更准确的看到消耗了多少CPU资源 clock_t start,end; start =clock(); for(long i = 0; i < TESTN; i++) { + //条件分支预测在这里发生作用 if (arr[i] < 128) arr[i] = 0; } end =clock(); diff --git 
a/1-cpu_cache/cpu_migrate/README.md b/1-cpu_cache/cpu_migrate/README.md index 516a6a30c5a781e45ae7861b1d34f9191cb6ef87..436732daa896ea53a9abd8f940e0e4c07c84ce33 100644 --- a/1-cpu_cache/cpu_migrate/README.md +++ b/1-cpu_cache/cpu_migrate/README.md @@ -9,95 +9,57 @@ #### 安装编译依赖的软件 如Linux中需要安装gcc-c++,CentOS中可用`yum install gcc-c++`安装,Ubuntu中可用`apt-get install gcc-c++` #### 编译程序 -`g++ traverse_2d_array.cpp -o traverse_2d_array` +`g++ cpu_migrate.cpp -o cpu_migrate -lpthread` +* 注意:程序使用多线程,依赖pthread库,编译时需要加上`-lpthread`进行链接 ### b. 运行验证 -#### 使用array[i][j]遍历数组 -`./traverse_2d_array -f` -消耗时间(毫秒):10 -#### 使用array[j][i]遍历数组 -`./traverse_2d_array -s` -消耗时间(毫秒):70 +#### 使用14个(共28个CPU核心)并发线程测试,不绑定CPU +`./cpu_migrate -t 14 -s` +平均每线程消耗时间(毫秒):1083 +#### 使用14个(共28个CPU核心)并发线程测试,绑定CPU +`./cpu_migrate -t 14 -f` +平均每线程消耗时间(毫秒):926 ### c. 使用perf验证缓存命中率 -#### 使用array[i][j]遍历数组 -`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -f` +#### 使用14个(共28个CPU核心)并发线程测试,不绑定CPU +`perf stat -e cpu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -s` * 输出结果: ``` - Performance counter stats for './traverse_2d_array -f': + Performance counter stats for './cpu_migrate -t 14 -s': - 147,927 cache-references (80.14%) - 13,215 cache-misses # 8.933 % of all cache refs (65.49%) - 54,454,827 instructions # 1.43 insn per cycle (85.11%) - 38,197,267 cycles (85.09%) - 161,503 L1-dcache-load-misses # 0.90% of all L1-dcache hits (85.09%) - 18,035,307 L1-dcache-loads (84.19%) + 10 cpu-migrations + 8,193,825 cache-references (44.40%) + 175,792 cache-misses # 2.145 % of all cache refs (44.34%) + 45,480,238,906 instructions # 1.30 insn per cycle (55.47%) + 35,111,144,560 cycles (55.47%) + 11,997,428 L1-dcache-load-misses # 0.05% of all L1-dcache hits (55.57%) + 26,407,960,253 L1-dcache-loads (55.60%) + 2,459,766 L1-icache-load-misses (55.66%) + 
2,136,304 branch-load-misses (44.53%) + 3,825,848,726 branch-loads (44.43%) - 0.020651344 seconds time elapsed + 1.251076337 seconds time elapsed - 0.018625000 seconds user - 0.002069000 seconds sys + 14.630618000 seconds user + 0.459616000 seconds sys ``` -#### 使用array[j][i]遍历数组 -`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -s` +#### 使用14个(共28个CPU核心)并发线程测试,绑定CPU +`perf stat -e cpu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -f` * 输出结果: ``` - Performance counter stats for './traverse_2d_array -s': + Performance counter stats for './cpu_migrate -t 14 -f': - 4,341,186 cache-references (83.01%) - 13,974 cache-misses # 0.322 % of all cache refs (66.03%) - 55,245,646 instructions # 0.25 insn per cycle (83.01%) - 218,787,967 cycles (83.00%) - 4,308,394 L1-dcache-load-misses # 23.79% of all L1-dcache hits (83.86%) - 18,112,753 L1-dcache-loads (84.10%) + 14 cpu-migrations + 4,983,541 cache-references (44.42%) + 1,611,627 cache-misses # 32.339 % of all cache refs (44.34%) + 45,523,818,723 instructions # 1.52 insn per cycle (55.43%) + 29,972,627,158 cycles (55.46%) + 5,812,831 L1-dcache-load-misses # 0.02% of all L1-dcache hits (55.53%) + 26,388,005,477 L1-dcache-loads (55.58%) + 1,262,533 L1-icache-load-misses (55.66%) + 1,363,376 branch-load-misses (44.54%) + 3,828,570,015 branch-loads (44.47%) - 0.082950118 seconds time elapsed + 0.948650967 seconds time elapsed - 0.079066000 seconds user - 0.003953000 seconds sys + 12.489932000 seconds user + 0.456253000 seconds sys ``` -## 3. Java程序 -### a. 编译程序 -`javac traverse_2d_array.java` -### b.运行验证 -#### 使用array[i][j]遍历数组 -`java traverse_2d_array -f` -消耗时间(毫秒):20 -#### 使用array[j][i]遍历数组 -`java traverse_2d_array -s` -消耗时间(毫秒):100 -### c. 
使用perf验证缓存命中率 -#### 使用array[i][j]遍历数组 -`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -f` -* 输出结果: -``` - Performance counter stats for 'java traverse_2d_array -f': - - 6,379,138 cache-references (80.62%) - 866,578 cache-misses # 13.585 % of all cache refs (68.93%) - 459,726,039 instructions # 1.51 insn per cycle (85.22%) - 303,673,757 cycles (85.69%) - 5,270,707 L1-dcache-load-misses # 3.96% of all L1-dcache hits (81.64%) - 133,211,743 L1-dcache-loads (83.13%) - - 0.126089887 seconds time elapsed - - 0.122353000 seconds user - 0.047877000 seconds sys -``` -#### 使用array[j][i]遍历数组 -`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -s` -* 输出结果: -``` - Performance counter stats for 'java traverse_2d_array -s': - - 42,441,956 cache-references (80.21%) - 872,336 cache-misses # 2.055 % of all cache refs (66.61%) - 386,326,280 instructions # 0.71 insn per cycle (84.29%) - 544,411,061 cycles (85.01%) - 38,884,991 L1-dcache-load-misses # 32.48% of all L1-dcache hits (85.24%) - 119,711,464 L1-dcache-loads (82.94%) - - 0.192838747 seconds time elapsed - - 0.200693000 seconds user - 0.052919000 seconds sys -``` \ No newline at end of file diff --git a/1-cpu_cache/cpu_migrate/cpu_migrate.cpp b/1-cpu_cache/cpu_migrate/cpu_migrate.cpp index 24a1a9d9c3154b1200ad4a60d675f30137703f58..487464112012f6319949159bd69deb725a2d7f34 100644 --- a/1-cpu_cache/cpu_migrate/cpu_migrate.cpp +++ b/1-cpu_cache/cpu_migrate/cpu_migrate.cpp @@ -18,26 +18,31 @@ void* loopcalc(void* args) { if (setaffinity) { cpu_set_t mask; //CPU核的集合 cpu_set_t get; //获取在集合中的CPU + //获取线程的序列号 int *thread_num = (int *)args; - CPU_ZERO(&mask); //置空 - CPU_SET(*thread_num,&mask); //设置亲和力值 - if (sched_setaffinity(0, sizeof(mask), &mask) == -1)//设置线程CPU亲和力 + //将当前线程绑定至特定CPU + CPU_ZERO(&mask); + CPU_SET(*thread_num,&mask); + if (sched_setaffinity(0, sizeof(mask), &mask) == -1) { 
cout<<"warning: could not set CPU affinity, continuing...\n"; } } timeval tStart,tEnd; + //这里不再使用clock,因为clock表示的进程所占用过的CPU周期,它将所有CPU都计入了,不适合示例中的统计 gettimeofday(&tStart, 0); + //这个循环中由于反复访问有限的数组,CPU缓存命中率非常高 unsigned char* arr = new unsigned char[TESTN]; for (long i = 0; i < TESTN; i++) arr[i] = rand() % 256; - for (int j = 1; j < 16*1024; j++) { + for (int j = 1; j < TESTN; j++) { for (long i = 0; i < TESTN; i++) arr[i] += 1; } gettimeofday(&tEnd, 0); - + + //将消耗时间传出到timecost数组中对应的元素上 *(long*)args = (1000000LL * (tEnd.tv_sec-tStart.tv_sec) + (tEnd.tv_usec-tStart.tv_usec))/1000; } @@ -47,12 +52,15 @@ int main(int argc, char** argv) { while((ch = getopt(argc, argv, "t:fs")) != -1) { switch(ch) { + //设置测试的并发线程数,注意不要超过机器上的CPU核数 case 't': threadnum = atoi(optarg); break; + //将线程绑定至特定CPU上 case 'f': setaffinity = true; break; + //不绑定CPU case 's': setaffinity = false; break; @@ -60,8 +68,10 @@ int main(int argc, char** argv) { } pthread_t* id = new pthread_t[threadnum]; + //统计每个线程计算所需要的时间 long* timecost = new long[threadnum]; for(int i = 0; i < threadnum; i++) { + //最初timecost用于传递线程号,用于绑定CPU timecost[i] = i; int ret=pthread_create(&id[i],NULL,loopcalc,&timecost[i]); @@ -72,10 +82,12 @@ int main(int argc, char** argv) { } long costsum = 0; + //等待所有线程结束 for(int i = 0; i < threadnum; i++) { pthread_join(id[i],NULL); costsum += timecost[i]; } + //比较平均每线程所用时间 cout<<"costsum: "<