提交 b1b602a9 编写于 作者: A antirez

On-crash memory test rewritten so that it actually works.

1) We no longer test location by location, otherwise the CPU write cache
makes the whole test useless.
2) We still need a memory test that operates in steps from the first to
the last location in order to never hit the cache, but that is still
able to retain the memory content.

This was tested using a Linux box containing a bad memory module with a
single bit error (always zero).

So the final solution has an error propagation step, which is:

1) Invert bits at every location.
2) Swap adjacent locations.
3) Swap adjacent locations again.
4) Invert bits at every location.
5) Swap adjacent locations.
6) Swap adjacent locations again.

Before and after these steps, and after step 4, a CRC64 checksum is computed.
If the three CRC64 checksums don't match, a memory error was detected.
上级 7383c3b1
......@@ -667,16 +667,22 @@ void logCurrentClient(void) {
}
#if defined(HAVE_PROC_MAPS)
int memtest_non_destructive(void *addr, size_t size); /* memtest.c */
uint64_t crc64(uint64_t crc, const unsigned char *s, uint64_t l);
void memtest_non_destructive_invert(void *addr, size_t size);
void memtest_non_destructive_swap(void *addr, size_t size);
#define MEMTEST_MAX_REGIONS 128
int memtest_test_linux_anonymous_maps(void) {
FILE *fp = fopen("/proc/self/maps","r");
char line[1024];
size_t start_addr, end_addr, size;
size_t start_vect[MEMTEST_MAX_REGIONS];
size_t size_vect[MEMTEST_MAX_REGIONS];
int regions = 0, j;
uint64_t crc1 = 0, crc2 = 0, crc3 = 0;
while(fgets(line,sizeof(line),fp) != NULL) {
char *start, *end, *p = line;
int j;
start = p;
p = strchr(p,'-');
......@@ -695,17 +701,51 @@ int memtest_test_linux_anonymous_maps(void) {
start_addr = strtoul(start,NULL,16);
end_addr = strtoul(end,NULL,16);
size = end_addr-start_addr;
redisLog(REDIS_WARNING,
"Testing memory at %lx (%lu bytes)", start_addr, size);
for (j = 0; j < 3; j++) {
if (memtest_non_destructive((void*)start_addr,size) != 0) {
fclose(fp);
return 1;
}
}
start_vect[regions] = start_addr;
size_vect[regions] = size;
printf("Testing %lx %lu\n", start_vect[regions], size_vect[regions]);
regions++;
}
/* Test all the regions as an unique sequential region.
* 1) Take the CRC64 of the memory region. */
for (j = 0; j < regions; j++) {
crc1 = crc64(crc1,(void*)start_vect[j],size_vect[j]);
}
/* 2) Invert bits, swap adjacent words, swap again, invert bits.
* This is the error amplification step. */
for (j = 0; j < regions; j++)
memtest_non_destructive_invert((void*)start_vect[j],size_vect[j]);
for (j = 0; j < regions; j++)
memtest_non_destructive_swap((void*)start_vect[j],size_vect[j]);
for (j = 0; j < regions; j++)
memtest_non_destructive_swap((void*)start_vect[j],size_vect[j]);
for (j = 0; j < regions; j++)
memtest_non_destructive_invert((void*)start_vect[j],size_vect[j]);
/* 3) Take the CRC64 sum again. */
for (j = 0; j < regions; j++)
crc2 = crc64(crc2,(void*)start_vect[j],size_vect[j]);
/* 4) Swap + Swap again */
for (j = 0; j < regions; j++)
memtest_non_destructive_swap((void*)start_vect[j],size_vect[j]);
for (j = 0; j < regions; j++)
memtest_non_destructive_swap((void*)start_vect[j],size_vect[j]);
/* 5) Take the CRC64 sum again. */
for (j = 0; j < regions; j++)
crc3 = crc64(crc3,(void*)start_vect[j],size_vect[j]);
/* NOTE: It is very important to close the file descriptor only now
* because closing it before may result into unmapping of some memory
* region that we are testing. */
fclose(fp);
return 0;
/* If the three CRCs are not all the same, we trapped a memory error. */
return crc1 != crc2 || crc2 != crc3;
}
#endif
......
......@@ -241,34 +241,30 @@ void memtest_test(size_t megabytes, int passes) {
}
}
/* This is a fast O(N) best effort memory test, only ZERO-ONE tests and
* checkerboard tests are performed, without pauses between setting and
* reading the value, so this can only detect a subclass of permanent errors.
*
* However the function does not destroy the content of the memory tested that
* is left unmodified.
*
* If a memory error is detected, 1 is returned. Otherwise 0 is returned. */
int memtest_non_destructive(void *addr, size_t size) {
/* Invert every bit of each word in the region starting at 'addr'.
 *
 * 'size' is the region length in bytes. The region is processed in units of
 * unsigned long; trailing bytes that do not fill a whole word are left
 * untouched. Calling the function twice on the same region restores the
 * original content, which is what makes the test non destructive.
 *
 * All the accesses go through a volatile pointer, so every word is really
 * read from and written back to memory. */
void memtest_non_destructive_invert(void *addr, size_t size) {
    volatile unsigned long *p = addr;
    size_t words = size / sizeof(unsigned long);
    size_t j;

    /* Invert */
    for (j = 0; j < words; j++)
        p[j] = ~p[j];
}
p[j] = 0; if (p[j] != 0) goto err;
p[j] = (unsigned long)-1; if (p[j] != (unsigned long)-1) goto err;
p[j] = ULONG_ONEZERO; if (p[j] != ULONG_ONEZERO) goto err;
p[j] = ULONG_ZEROONE; if (p[j] != ULONG_ZEROONE) goto err;
p[j] = val; /* restore the original value. */
}
return 0;
/* Swap every pair of adjacent words in the region starting at 'addr'.
 *
 * 'size' is the region length in bytes. The region is processed in units of
 * unsigned long; trailing bytes that do not fill a whole word are ignored.
 * If the region holds an odd number of words the last one has no partner and
 * is left untouched: accessing p[j+1] there would go past the end of the
 * region.
 *
 * Calling the function twice on the same region restores the original
 * content. All the accesses go through a volatile pointer, so every word is
 * really read from and written back to memory. */
void memtest_non_destructive_swap(void *addr, size_t size) {
    volatile unsigned long *p = addr;
    size_t words = size / sizeof(unsigned long);
    size_t j;

    /* Swap. The 'j + 1 < words' bound guarantees both p[j] and p[j+1]
     * are inside the region. */
    for (j = 0; j + 1 < words; j += 2) {
        unsigned long a, b;

        a = p[j];
        b = p[j+1];
        p[j] = b;
        p[j+1] = a;
    }
}
void memtest(size_t megabytes, int passes) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册