update readme

a87d65d0 · russelltao · 5f2763dd · a87d65d0 · a87d65d0 · a87d65d0
4 changed files
--- a/1-cpu_cache/branch_predict/README.md
+++ b/1-cpu_cache/branch_predict/README.md
@@ -20,6 +20,7 @@
 #### 遍历有序数组
 `./branch_predict -f`
 消耗时间（毫秒）：350
+### c. 使用perf验证缓存命中率
 #### 遍历随机数组
 `perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./branch_predict -s`
 ```

--- a/1-cpu_cache/branch_predict/branch_predict.cpp
+++ b/1-cpu_cache/branch_predict/branch_predict.cpp
@@ -12,6 +12,7 @@ using namespace std;

 long timediff(clock_t t1, clock_t t2) {
    long elapsed;
+	// clock() is not wall-clock time: it counts the CPU clock ticks the process has consumed since it started; the tick difference t2 - t1 is converted to milliseconds below
    elapsed = ((double)t2 - t1) / CLOCKS_PER_SEC * 1000;
    return elapsed;
 }
@@ -22,12 +23,15 @@ int main(int argc, char** argv) {
 	while((ch = getopt(argc, argv, "fsg")) != -1) {
 		switch(ch)
 		{
+			//遍历随机数组
 		   case 's':
 			  mode = 1;
 			  break;
+		   //遍历有序数组
 		   case 'f':
 			  mode = 2;
 			  break;
+		   //生成数组至文件中，不影响遍历过程时的perf统计
 		   case 'g':
 			  mode = 3;
 			  break;
@@ -37,12 +41,16 @@ int main(int argc, char** argv) {
 	unsigned char* arr = new unsigned char[TESTN];

 	if (3 == mode) {
+		//构造随机数组
 		for (long i = 0; i < TESTN; i++) arr[i] = rand() % 256;
 		ofstream ofs;
+		//随机数组写入文件
 		ofs.open("rand.array", ios::out | ios::binary);
 		ofs.write((const char*)arr, TESTN);
 		ofs.close();
+		//数组排序
 		sort(arr,arr+TESTN);
+		//有序数组写入文件
 		ofs.open("sort.array", ios::out | ios::binary);
 		ofs.write((const char*)arr, TESTN);
 		ofs.close();
@@ -57,9 +65,11 @@ int main(int argc, char** argv) {
 		ifs.open(fname);
 		ifs.read((char *)arr, TESTN);

+		//使用clock比取系统时间能够更准确地看到消耗了多少CPU资源
 		clock_t start,end;
 		start =clock();
 		for(long i = 0; i < TESTN; i++) {
+			//条件分支预测在这里发生作用
 			if (arr[i] < 128) arr[i] = 0;
 		}
 		end =clock();

--- a/1-cpu_cache/cpu_migrate/README.md
+++ b/1-cpu_cache/cpu_migrate/README.md
@@ -9,95 +9,57 @@
 #### 安装编译依赖的软件
 如Linux中需要安装C++编译器，CentOS中可用`yum install gcc-c++`安装，Ubuntu中可用`apt-get install g++`安装
 #### 编译程序
-`g++ traverse_2d_array.cpp -o traverse_2d_array`
+`g++ cpu_migrate.cpp -o cpu_migrate -lpthread`
+* 注意，多线程依赖pthread库，编译时需要链接
 ### b. 运行验证
-#### 使用array[i][j]遍历数组
-`./traverse_2d_array -f`
-消耗时间（毫秒）：10
-#### 使用array[j][i]遍历数组
-`./traverse_2d_array -s`
-消耗时间（毫秒）：70
+#### 使用14个（共28个CPU核心）并发线程测试，不绑定CPU
+`./cpu_migrate -t 14 -s`
+平均每线程消耗时间（毫秒）：1083
+#### 使用14个（共28个CPU核心）并发线程测试，绑定CPU
+`./cpu_migrate -t 14 -f`
+平均每线程消耗时间（毫秒）：926
 ### c. 使用perf验证缓存命中率
-#### 使用array[i][j]遍历数组
-`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -f`
+#### 使用14个（共28个CPU核心）并发线程测试，不绑定CPU
+`perf stat -e cpu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -s`
 * 输出结果：
 ```
- Performance counter stats for './traverse_2d_array -f':
+ Performance counter stats for './cpu_migrate -t 14 -s':

-           147,927      cache-references                                              (80.14%)
-            13,215      cache-misses              #    8.933 % of all cache refs      (65.49%)
-        54,454,827      instructions              #    1.43  insn per cycle           (85.11%)
-        38,197,267      cycles                                                        (85.09%)
-           161,503      L1-dcache-load-misses     #    0.90% of all L1-dcache hits    (85.09%)
-        18,035,307      L1-dcache-loads                                               (84.19%)
+                10      cpu-migrations
+         8,193,825      cache-references                                              (44.40%)
+           175,792      cache-misses              #    2.145 % of all cache refs      (44.34%)
+    45,480,238,906      instructions              #    1.30  insn per cycle           (55.47%)
+    35,111,144,560      cycles                                                        (55.47%)
+        11,997,428      L1-dcache-load-misses     #    0.05% of all L1-dcache hits    (55.57%)
+    26,407,960,253      L1-dcache-loads                                               (55.60%)
+         2,459,766      L1-icache-load-misses                                         (55.66%)
+         2,136,304      branch-load-misses                                            (44.53%)
+     3,825,848,726      branch-loads                                                  (44.43%)

-       0.020651344 seconds time elapsed
+       1.251076337 seconds time elapsed

-       0.018625000 seconds user
-       0.002069000 seconds sys
+      14.630618000 seconds user
+       0.459616000 seconds sys
 ```
-#### 使用array[j][i]遍历数组
-`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -s`
+#### 使用14个（共28个CPU核心）并发线程测试，绑定CPU
+`perf stat -e cpu-migrations,cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads,L1-icache-load-misses,branch-load-misses,branch-loads ./cpu_migrate -t 14 -f`
 * 输出结果：
 ```
- Performance counter stats for './traverse_2d_array -s':
+ Performance counter stats for './cpu_migrate -t 14 -f':

-         4,341,186      cache-references                                              (83.01%)
-            13,974      cache-misses              #    0.322 % of all cache refs      (66.03%)
-        55,245,646      instructions              #    0.25  insn per cycle           (83.01%)
-       218,787,967      cycles                                                        (83.00%)
-         4,308,394      L1-dcache-load-misses     #   23.79% of all L1-dcache hits    (83.86%)
-        18,112,753      L1-dcache-loads                                               (84.10%)
+                14      cpu-migrations
+         4,983,541      cache-references                                              (44.42%)
+         1,611,627      cache-misses              #   32.339 % of all cache refs      (44.34%)
+    45,523,818,723      instructions              #    1.52  insn per cycle           (55.43%)
+    29,972,627,158      cycles                                                        (55.46%)
+         5,812,831      L1-dcache-load-misses     #    0.02% of all L1-dcache hits    (55.53%)
+    26,388,005,477      L1-dcache-loads                                               (55.58%)
+         1,262,533      L1-icache-load-misses                                         (55.66%)
+         1,363,376      branch-load-misses                                            (44.54%)
+     3,828,570,015      branch-loads                                                  (44.47%)

-       0.082950118 seconds time elapsed
+       0.948650967 seconds time elapsed

-       0.079066000 seconds user
-       0.003953000 seconds sys
+      12.489932000 seconds user
+       0.456253000 seconds sys
 ```
-## 3. Java程序
-### a. 编译程序
-`javac traverse_2d_array.java`
-### b.运行验证
-#### 使用array[i][j]遍历数组
-`java traverse_2d_array -f`
-消耗时间（毫秒）：20
-#### 使用array[j][i]遍历数组
-`java traverse_2d_array -s`
-消耗时间（毫秒）：100
-### c. 使用perf验证缓存命中率
-#### 使用array[i][j]遍历数组
-`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -f`
-* 输出结果：
-```
- Performance counter stats for 'java traverse_2d_array -f':
-
-         6,379,138      cache-references                                              (80.62%)
-           866,578      cache-misses              #   13.585 % of all cache refs      (68.93%)
-       459,726,039      instructions              #    1.51  insn per cycle           (85.22%)
-       303,673,757      cycles                                                        (85.69%)
-         5,270,707      L1-dcache-load-misses     #    3.96% of all L1-dcache hits    (81.64%)
-       133,211,743      L1-dcache-loads                                               (83.13%)
-
-       0.126089887 seconds time elapsed
-
-       0.122353000 seconds user
-       0.047877000 seconds sys
-```
-#### 使用array[j][i]遍历数组
-`perf stat -e cache-references,cache-misses,instructions,cycles,L1-dcache-load-misses,L1-dcache-loads ./traverse_2d_array -s`
-* 输出结果：
-```
- Performance counter stats for 'java traverse_2d_array -s':
-
-        42,441,956      cache-references                                              (80.21%)
-           872,336      cache-misses              #    2.055 % of all cache refs      (66.61%)
-       386,326,280      instructions              #    0.71  insn per cycle           (84.29%)
-       544,411,061      cycles                                                        (85.01%)
-        38,884,991      L1-dcache-load-misses     #   32.48% of all L1-dcache hits    (85.24%)
-       119,711,464      L1-dcache-loads                                               (82.94%)
-
-       0.192838747 seconds time elapsed
-
-       0.200693000 seconds user
-       0.052919000 seconds sys
-```
\ No newline at end of file
--- a/1-cpu_cache/cpu_migrate/cpu_migrate.cpp
+++ b/1-cpu_cache/cpu_migrate/cpu_migrate.cpp
@@ -18,26 +18,31 @@ void* loopcalc(void* args) {
 	if (setaffinity) {
 		cpu_set_t mask;  //CPU核的集合
 		cpu_set_t get;	 //获取在集合中的CPU
+		//获取线程的序列号
 		int *thread_num = (int *)args; 

-		CPU_ZERO(&mask);    //置空
-	    CPU_SET(*thread_num,&mask);   //设置亲和力值
-		if (sched_setaffinity(0, sizeof(mask), &mask) == -1)//设置线程CPU亲和力
+		//将当前线程绑定至特定CPU
+		CPU_ZERO(&mask);  
+	    CPU_SET(*thread_num,&mask); 
+		if (sched_setaffinity(0, sizeof(mask), &mask) == -1)
 		{
 			cout<<"warning: could not set CPU affinity, continuing...\n";
 		}
 	}
 	timeval tStart,tEnd;
+	//这里不再使用clock，因为clock表示的是整个进程所占用过的CPU时间，它将所有线程在各个CPU上的消耗都计入了，不适合示例中按线程的统计
 	gettimeofday(&tStart, 0);

+	//这个循环中由于反复访问有限的数组，CPU缓存命中率非常高
 	unsigned char* arr = new unsigned char[TESTN];
 	for (long i = 0; i < TESTN; i++) arr[i] = rand() % 256;
-	for (int j = 1; j < 16*1024; j++) {
+	for (int j = 1; j < TESTN; j++) {
 		for (long i = 0; i < TESTN; i++) arr[i] += 1;
 	}

 	gettimeofday(&tEnd, 0);
-	
+
+	//将消耗时间传出到timecost数组中对应的元素上
 	*(long*)args = (1000000LL * (tEnd.tv_sec-tStart.tv_sec) + (tEnd.tv_usec-tStart.tv_usec))/1000;
 }

@@ -47,12 +52,15 @@ int main(int argc, char** argv) {
 	while((ch = getopt(argc, argv, "t:fs")) != -1) {
 		switch(ch)
 		{
+			//设置测试的并发线程数，注意不要超过机器上的CPU核数
 		   case 't':
 			  threadnum = atoi(optarg);
 			  break;
+		   //将线程绑定至特定CPU上
 		   case 'f':
 		      setaffinity = true;
 			  break;
+		   //不绑定CPU
 		   case 's':
 		   	  setaffinity = false;
 			  break;
@@ -60,8 +68,10 @@ int main(int argc, char** argv) {
 	}

 	pthread_t* id = new pthread_t[threadnum];
+	//统计每个线程计算所需要的时间
 	long* timecost = new long[threadnum];
 	for(int i = 0; i < threadnum; i++) {
+		//最初timecost用于传递线程号，用于绑定CPU
 		timecost[i] = i;
 		int ret=pthread_create(&id[i],NULL,loopcalc,&timecost[i]); 
 		
@@ -72,10 +82,12 @@ int main(int argc, char** argv) {
 	}

 	long costsum = 0;
+	//等待所有线程结束 
 	for(int i = 0; i < threadnum; i++) {
 		pthread_join(id[i],NULL);
 		costsum += timecost[i];
 	}
+	//比较平均每线程所用时间
 	cout<<"costsum: "<<costsum<<", avg: "<<costsum/threadnum<<endl;
 }