Segmentation fault in GEMM at very large matrix sizes
Created by: ysh329
特殊超大规模下的矩阵乘法有segfault问题(rows=32768 cols=32768),定位到问题出现在内存不足(OOM)时,alloc函数没有返回nullptr,导致后续计算出错。
改为使用calloc后该问题解决。从功能上来说:calloc = malloc + memset。为此比较了在安卓手机上calloc与malloc(为公平起见不算memset)的性能差异,发现malloc与calloc性能持平,部分log如下所示。此外也可以看到,有些规模下按理说内存不足,但是malloc并没有返回nullptr(请看oom_flag;avg为平均耗时,单位为毫秒ms):
malloc与calloc的性能差异
MeizuS6:/data/local/tmp/bin $ ./test-gemm-accuracy [635/1838]
calloc rows=2 cols=2 avg=0.000683594 oom_flag=0
malloc rows=2 cols=2 avg=0.000610352 oom_flag=0
calloc rows=2 cols=4 avg=0.000610352 oom_flag=0
malloc rows=2 cols=4 avg=0.000512695 oom_flag=0
calloc rows=2 cols=8 avg=0.000708008 oom_flag=0
malloc rows=2 cols=8 avg=0.000512695 oom_flag=0
calloc rows=2 cols=16 avg=0.0013916 oom_flag=0
malloc rows=2 cols=16 avg=0.000488281 oom_flag=0
calloc rows=2 cols=32 avg=0.00141602 oom_flag=0
malloc rows=2 cols=32 avg=0.000708008 oom_flag=0
calloc rows=2 cols=64 avg=0.00180664 oom_flag=0
malloc rows=2 cols=64 avg=0.000708008 oom_flag=0
calloc rows=2 cols=128 avg=0.00209961 oom_flag=0
malloc rows=2 cols=128 avg=0.000805664 oom_flag=0
calloc rows=2 cols=256 avg=0.00380859 oom_flag=0
malloc rows=2 cols=256 avg=0.00109863 oom_flag=0
calloc rows=2 cols=512 avg=0.00600586 oom_flag=0
malloc rows=2 cols=512 avg=0.00209961 oom_flag=0
calloc rows=2 cols=1024 avg=0.0105957 oom_flag=0
malloc rows=2 cols=1024 avg=0.00908203 oom_flag=0
calloc rows=2 cols=2048 avg=0.00200195 oom_flag=0
malloc rows=2 cols=2048 avg=0.00170898 oom_flag=0
calloc rows=2 cols=4096 avg=0.00668945 oom_flag=0
malloc rows=2 cols=4096 avg=0.00422363 oom_flag=0
calloc rows=2 cols=8192 avg=0.00458984 oom_flag=0
malloc rows=2 cols=8192 avg=0.00610352 oom_flag=0
calloc rows=2 cols=16384 avg=0.0079834 oom_flag=0
malloc rows=2 cols=16384 avg=0.0079834 oom_flag=0
calloc rows=16384 cols=64 avg=0.00288086 oom_flag=1
malloc rows=16384 cols=64 avg=0.00290527 oom_flag=1
calloc rows=16384 cols=128 avg=0.00300293 oom_flag=1
malloc rows=16384 cols=128 avg=0.00300293 oom_flag=1
calloc rows=16384 cols=256 avg=0.00300293 oom_flag=1
malloc rows=16384 cols=256 avg=0.00290527 oom_flag=1
calloc rows=16384 cols=512 avg=0.00290527 oom_flag=1
malloc rows=16384 cols=512 avg=0.00290527 oom_flag=1
calloc rows=16384 cols=1024 avg=0.0027832 oom_flag=1
malloc rows=16384 cols=1024 avg=0.00290527 oom_flag=1
calloc rows=16384 cols=2048 avg=0.00288086 oom_flag=1
malloc rows=16384 cols=2048 avg=0.00288086 oom_flag=1
calloc rows=16384 cols=4096 avg=0.00290527 oom_flag=1
malloc rows=16384 cols=4096 avg=0.00288086 oom_flag=1
calloc rows=16384 cols=8192 avg=0.00290527 oom_flag=1
malloc rows=16384 cols=8192 avg=0.00290527 oom_flag=1
calloc rows=16384 cols=16384 avg=0.00290527 oom_flag=1
malloc rows=16384 cols=16384 avg=0.00290527 oom_flag=1
calloc rows=16384 cols=32768 avg=0.000292969 oom_flag=1
malloc rows=16384 cols=32768 avg=0.000219727 oom_flag=1
calloc rows=32768 cols=2 avg=0.0026123 oom_flag=1
malloc rows=32768 cols=2 avg=0.00251465 oom_flag=1
calloc rows=32768 cols=4 avg=0.0032959 oom_flag=1
malloc rows=32768 cols=4 avg=0.00319824 oom_flag=1
calloc rows=32768 cols=8 avg=0.00290527 oom_flag=1
malloc rows=32768 cols=8 avg=0.00288086 oom_flag=1
calloc rows=32768 cols=16 avg=0.00280762 oom_flag=1
malloc rows=32768 cols=16 avg=0.00290527 oom_flag=1
calloc rows=32768 cols=8 avg=0.00290527 oom_flag=1
malloc rows=32768 cols=8 avg=0.00288086 oom_flag=1
calloc rows=32768 cols=16 avg=0.00280762 oom_flag=1
malloc rows=32768 cols=16 avg=0.00290527 oom_flag=1
calloc rows=32768 cols=32 avg=0.00290527 oom_flag=1
malloc rows=32768 cols=32 avg=0.00300293 oom_flag=1
calloc rows=32768 cols=64 avg=0.00288086 oom_flag=1
malloc rows=32768 cols=64 avg=0.00288086 oom_flag=1
calloc rows=32768 cols=128 avg=0.00290527 oom_flag=1
malloc rows=32768 cols=128 avg=0.00280762 oom_flag=1
calloc rows=32768 cols=256 avg=0.00280762 oom_flag=1
malloc rows=32768 cols=256 avg=0.00290527 oom_flag=1
calloc rows=32768 cols=512 avg=0.00290527 oom_flag=1
malloc rows=32768 cols=512 avg=0.00290527 oom_flag=1
calloc rows=32768 cols=1024 avg=0.00288086 oom_flag=1
malloc rows=32768 cols=1024 avg=0.00290527 oom_flag=1
calloc rows=32768 cols=2048 avg=0.00288086 oom_flag=1
malloc rows=32768 cols=2048 avg=0.00290527 oom_flag=1
calloc rows=32768 cols=4096 avg=0.00290527 oom_flag=1
malloc rows=32768 cols=4096 avg=0.00288086 oom_flag=1
calloc rows=32768 cols=8192 avg=0.00310059 oom_flag=1
malloc rows=32768 cols=8192 avg=0.00297852 oom_flag=1
calloc rows=32768 cols=16384 avg=0.000219727 oom_flag=1
malloc rows=32768 cols=16384 avg=0.000195313 oom_flag=1
calloc rows=32768 cols=32768 avg=0.00012207 oom_flag=1
malloc rows=32768 cols=32768 avg=0.000488281 oom_flag=0
此外,我又比较了malloc+memset与calloc:小规模上memset的耗时可忽略不计,但是大规模上memset占用了很多时间(因而可以看到calloc的优势:不仅性能上和malloc一样,calloc还完成了初始化),具体如下:
malloc+memset与calloc的性能比较
MeizuS6:/data/local/tmp/bin $ ./test-gemm-accuracy [385/1898]
calloc rows=2 cols=2 avg=0.000805664 oom_flag=0
malloc rows=2 cols=2 avg=0.000708008 oom_flag=0
calloc rows=2 cols=4 avg=0.000805664 oom_flag=0
malloc rows=2 cols=4 avg=0.000708008 oom_flag=0
calloc rows=2 cols=8 avg=0.00078125 oom_flag=0
malloc rows=2 cols=8 avg=0.000585937 oom_flag=0
calloc rows=2 cols=16 avg=0.00170898 oom_flag=0
malloc rows=2 cols=16 avg=0.000610352 oom_flag=0
calloc rows=2 cols=32 avg=0.0013916 oom_flag=0
malloc rows=2 cols=32 avg=0.00129395 oom_flag=0
calloc rows=2 cols=64 avg=0.00241699 oom_flag=0
malloc rows=2 cols=64 avg=0.0013916 oom_flag=0
calloc rows=2 cols=128 avg=0.00219727 oom_flag=0
malloc rows=2 cols=128 avg=0.0026123 oom_flag=0
calloc rows=2 cols=16384 avg=0.0124023 oom_flag=0
malloc rows=2 cols=16384 avg=0.143091 oom_flag=0
calloc rows=2 cols=32768 avg=0.0236084 oom_flag=0
malloc rows=2 cols=32768 avg=0.307397 oom_flag=0
calloc rows=4 cols=2 avg=0.00090332 oom_flag=0
malloc rows=4 cols=2 avg=0.0393066 oom_flag=0
calloc rows=4 cols=4 avg=0.00180664 oom_flag=0
malloc rows=4 cols=4 avg=0.000488281 oom_flag=0
calloc rows=4 cols=8 avg=0.000683594 oom_flag=0
malloc rows=4 cols=8 avg=0.00168457 oom_flag=0
calloc rows=4 cols=16 avg=0.000708008 oom_flag=0
malloc rows=4 cols=16 avg=0.00109863 oom_flag=0
calloc rows=512 cols=512 avg=0.00539551 oom_flag=0
malloc rows=512 cols=512 avg=1.09768 oom_flag=0
calloc rows=512 cols=1024 avg=0.00820313 oom_flag=0
malloc rows=512 cols=1024 avg=3.72122 oom_flag=0
calloc rows=512 cols=2048 avg=0.00588379 oom_flag=0
malloc rows=512 cols=2048 avg=5.4019 oom_flag=0
calloc rows=512 cols=4096 avg=0.00910645 oom_flag=0
malloc rows=512 cols=4096 avg=5.23579 oom_flag=0
calloc rows=512 cols=8192 avg=0.00678711 oom_flag=0
malloc rows=512 cols=8192 avg=9.497 oom_flag=0
calloc rows=512 cols=16384 avg=0.0065918 oom_flag=0
malloc rows=512 cols=16384 avg=29.3813 oom_flag=0
calloc rows=512 cols=32768 avg=0.010083 oom_flag=1
Segmentation fault
用于比较的具体代码:
// Allocates a zero-initialised buffer holding rows * cols floats.
//
// Returns the buffer on success. Returns nullptr when either dimension is
// negative, when the total byte count does not fit in size_t, or when calloc
// itself fails. A zero-sized request is forwarded to calloc unchanged, so its
// result is whatever the C library returns for a zero-element allocation.
// The caller owns the returned buffer and must release it with free().
void* callocc(int rows, int cols) {
  if (rows < 0 || cols < 0) {
    return nullptr;
  }

  // Multiply in size_t: the int product rows * cols overflows for large
  // shapes, which is undefined behaviour and can wrap to a small count that
  // would then be reported as a successful allocation.
  const size_t n_rows = static_cast<size_t>(rows);
  const size_t n_cols = static_cast<size_t>(cols);
  const size_t max_elems = static_cast<size_t>(-1) / sizeof(float);
  if (n_cols != 0 && n_rows > max_elems / n_cols) {
    return nullptr;  // byte count is not representable (e.g. 32-bit size_t)
  }

  return calloc(n_rows * n_cols, sizeof(float));
}
// Allocates a buffer holding rows * cols floats with malloc and zero-fills it
// with memset (the malloc + memset counterpart of a calloc call).
//
// Returns the buffer on success. Returns nullptr when either dimension is
// negative, when the total byte count does not fit in size_t, or when malloc
// fails. The caller owns the returned buffer and must release it with free().
void* mallocc(int rows, int cols) {
  if (rows < 0 || cols < 0) {
    return nullptr;
  }

  // Multiply in size_t: the int product rows * cols overflows for large
  // shapes, which is undefined behaviour.
  const size_t n_rows = static_cast<size_t>(rows);
  const size_t n_cols = static_cast<size_t>(cols);
  const size_t max_elems = static_cast<size_t>(-1) / sizeof(float);
  if (n_cols != 0 && n_rows > max_elems / n_cols) {
    return nullptr;  // byte count is not representable (e.g. 32-bit size_t)
  }

  const size_t bytes = n_rows * n_cols * sizeof(float);
  void* buffer = malloc(bytes);
  if (buffer == nullptr) {
    // Check before touching the memory: memset on a failed allocation
    // dereferences a null pointer and crashes.
    return nullptr;
  }

  memset(buffer, 0, bytes);
  return buffer;
}
// Benchmarks callocc() against mallocc() over a grid of matrix shapes and
// prints one result line per allocator and shape to std::cout.
//
// Shapes are rows = pow_base^r and cols = pow_base^c for every r and c in
// [1, max_pow_idx). For each shape every allocator is called max_run_times
// times. "avg" is the mean of the get_current_time() deltas taken around the
// allocation call alone, in whatever unit get_current_time() returns.
// "oom_flag" is 1 when at least one of those calls returned nullptr.
//
// Each buffer is freed as soon as it has been timed. Without that, every
// allocation of every shape stays live until the process exits, so the
// benchmark itself exhausts memory and the reported OOMs say nothing about
// the shape being measured.
void do_cmp(int max_pow_idx=16, const int pow_base=2, const int max_run_times=5) {
  // Exact integer power; keeps the sizes out of floating point entirely.
  const auto int_pow = [](int base, int exponent) {
    int result = 1;
    for (int i = 0; i < exponent; ++i) {
      result *= base;
    }
    return result;
  };

  // Runs one allocator max_run_times times on a single shape and returns the
  // mean allocation time; sets oom when any call yields nullptr.
  const auto measure = [max_run_times](void* (*allocate)(int, int), int rows, int cols, bool& oom) {
    double total = 0.0;
    oom = false;
    for (int ridx = 0; ridx < max_run_times; ++ridx) {
      const double start = get_current_time();
      void* ptr = allocate(rows, cols);
      const double end = get_current_time();
      total += end - start;
      if (ptr == nullptr) {
        oom = true;
      }
      free(ptr);  // free(nullptr) is a no-op, so no guard is needed
    }
    return total / max_run_times;
  };

  for (int r = 1; r < max_pow_idx; ++r) {
    const int rows = int_pow(pow_base, r);
    for (int c = 1; c < max_pow_idx; ++c) {
      const int cols = int_pow(pow_base, c);
      bool oom_flag = false;
      double avg = 0.0;

      // calloc
      avg = measure(callocc, rows, cols, oom_flag);
      std::cout << "calloc rows=" << rows << " cols=" << cols << " avg=" << avg << " oom_flag=" << oom_flag << std::endl;

      // malloc
      avg = measure(mallocc, rows, cols, oom_flag);
      std::cout << "malloc rows=" << rows << " cols=" << cols << " avg=" << avg << " oom_flag=" << oom_flag << "\n" << std::endl;
    }
  }
}