• K
    mm/mempolicy.c: refix mbind_range() vma issue · e26a5114
    KOSAKI Motohiro 提交于
    commit 8aacc9f5 ("mm/mempolicy.c: fix pgoff in mbind vma merge") is a
    slightly incorrect fix.
    
    Why? Consider the following case.
    
    1. map 4 pages of a file at offset 0
    
       [0123]
    
    2. map 2 pages just after the first mapping of the same file but with
       page offset 2
    
       [0123][23]
    
    3. mbind() 2 pages from the first mapping at offset 2.
       mbind_range() should treat the new vma as
    
       [0123][23]
         |23|
         mbind vma
    
       but it does
    
       [0123][23]
         |01|
         mbind vma
    
       Oops. This then causes a wrong vma merge and split ([01][0123] or similar).
    
    This patch fixes it.
    
    [testcase]
      test result - before the patch
    
    	case4: 126: test failed. expect '2,4', actual '2,2,2'
           	case5: passed
    	case6: passed
    	case7: passed
    	case8: passed
    	case_n: 246: test failed. expect '4,2', actual '1,4'
    
    	------------[ cut here ]------------
    	kernel BUG at mm/filemap.c:135!
    	invalid opcode: 0000 [#4] SMP DEBUG_PAGEALLOC
    
    	(snip long bug on messages)
    
      test result - after the patch
    
    	case4: passed
           	case5: passed
    	case6: passed
    	case7: passed
    	case8: passed
    	case_n: passed
    
      source:  mbind_vma_test.c
    ============================================================
     #include <numaif.h>
     #include <numa.h>
     #include <sys/mman.h>
     #include <stdio.h>
     #include <unistd.h>
     #include <stdlib.h>
     #include <string.h>
    
    /* Page size in bytes; set by init() from getpagesize(). */
    static unsigned long pagesize;
    /* Start of the 6-page read/write window that init() sets up. */
    void* mmap_addr;
    /* Node mask handed to mbind(); init() allocates it and sets bit 0. */
    struct bitmask *nmask;
    /* Command line built from rubysrc by each test case. */
    char buf[1024];
    /* popen() stream for the command held in buf. */
    FILE *file;
    /* Output read back from that command; compared by Assert(). */
    char retbuf[10240] = "";
    /* Descriptor of the temporary file that init() maps. */
    int mapped_fd;
    
    /*
     * printf-style template for a ruby one-liner.  It takes three arguments:
     * a pid (%d) and two addresses (%llx, %llx) naming the range
     * [vstart, vend).
     *
     * The script runs `pmap -q` on the pid and, for every output line whose
     * first field (parsed as hex) lies inside the range, collects the second
     * field's integer value divided by 4.  The collected values are printed
     * joined by commas, e.g. "2,4".
     */
    char *rubysrc = "ruby -e '\
      pid = %d; \
      vstart = 0x%llx; \
      vend = 0x%llx; \
      s = `pmap -q #{pid}`; \
      rary = []; \
      s.each_line {|line|; \
        ary=line.split(\" \"); \
        addr = ary[0].to_i(16); \
        if(vstart <= addr && addr < vend) then \
          rary.push(ary[1].to_i()/4); \
        end; \
      }; \
      print rary.join(\",\"); \
    '";
    
    /*
     * init - build the fixture used by every test case.
     *
     * Allocates the node mask (bit 0 set), creates and unlinks a temporary
     * file extended past 8 pages, maps 8 pages of it PROT_NONE, then makes
     * the middle 6 pages read/write and touches them.  mmap_addr is left
     * pointing at the first of those 6 pages.  Any failure is fatal.
     */
    void init(void)
    {
    	char tmpl[128] = "mbind_vma_XXXXXX";
    	void *base;
    	void *window;

    	nmask = numa_allocate_nodemask();
    	numa_bitmask_setbit(nmask, 0);

    	pagesize = getpagesize();

    	mapped_fd = mkstemp(tmpl);
    	if (mapped_fd == -1) {
    		perror("mkstemp ");
    		exit(1);
    	}
    	/* The file only needs to live as long as the descriptor. */
    	unlink(tmpl);

    	/* Writing one byte at the 8-page mark extends the file that far. */
    	if (lseek(mapped_fd, pagesize*8, SEEK_SET) < 0) {
    		perror("lseek ");
    		exit(1);
    	}
    	if (write(mapped_fd, "\0", 1) < 0) {
    		perror("write ");
    		exit(1);
    	}

    	base = mmap(NULL, pagesize*8, PROT_NONE,
    		    MAP_SHARED, mapped_fd, 0);
    	if (base == MAP_FAILED) {
    		perror("mmap ");
    		exit(1);
    	}

    	/* First and last page stay PROT_NONE; the 6 in between are usable. */
    	window = (char *)base + pagesize;
    	if (mprotect(window, pagesize*6, PROT_READ|PROT_WRITE) < 0) {
    		perror("mprotect ");
    		exit(1);
    	}

    	mmap_addr = window;

    	/* Touch every page so it is populated. */
    	memset(mmap_addr, 0, pagesize*6);
    }
    
    void fin(void)
    {
    	void* addr = mmap_addr - pagesize;
    	munmap(addr, pagesize*8);
    
    	memset(buf, 0, sizeof(buf));
    	memset(retbuf, 0, sizeof(retbuf));
    }
    
    /*
     * mem_bind - apply MPOL_BIND with nmask to part of the test window.
     * @index: first page, counted from mmap_addr
     * @len:   number of pages
     *
     * Exits with mbind()'s return value on failure.
     */
    void mem_bind(int index, int len)
    {
    	void *start = mmap_addr+pagesize*index;
    	unsigned long bytes = pagesize*len;
    	int rc = mbind(start, bytes, MPOL_BIND,
    		       nmask->maskp, nmask->size, 0);

    	if (rc) {
    		perror("mbind ");
    		exit(rc);
    	}
    }
    
    /*
     * mem_interleave - apply MPOL_INTERLEAVE with nmask to part of the window.
     * @index: first page, counted from mmap_addr
     * @len:   number of pages
     *
     * Exits with mbind()'s return value on failure.
     */
    void mem_interleave(int index, int len)
    {
    	void *start = mmap_addr+pagesize*index;
    	unsigned long bytes = pagesize*len;
    	int rc = mbind(start, bytes, MPOL_INTERLEAVE,
    		       nmask->maskp, nmask->size, 0);

    	if (rc) {
    		perror("mbind ");
    		exit(rc);
    	}
    }
    
    /*
     * mem_unbind - reset part of the test window to MPOL_DEFAULT.
     * @index: first page, counted from mmap_addr
     * @len:   number of pages
     *
     * Exits with mbind()'s return value on failure.
     */
    void mem_unbind(int index, int len)
    {
    	void *start = mmap_addr+pagesize*index;
    	unsigned long bytes = pagesize*len;
    	int rc = mbind(start, bytes, MPOL_DEFAULT, NULL, 0, 0);

    	if (rc) {
    		perror("mbind ");
    		exit(rc);
    	}
    }
    
    /*
     * Assert - compare two strings and report the outcome on stderr.
     * @expected: the string the test wants
     * @value:    the string the test got
     * @name:     test case name used in the message
     * @line:     source line printed in the failure message
     *
     * A mismatch is only reported; the program is not terminated, so the
     * remaining cases still run.
     */
    void Assert(char *expected, char *value, char *name, int line)
    {
    	int differs = strcmp(expected, value) != 0;

    	if (differs) {
    		fprintf(stderr, "%s: %d: test failed. expect '%s', actual '%s'\n",
    			name, line, expected, value);
    		return;
    	}

    	fprintf(stderr, "%s: passed\n", name);
    }
    
    /*
     * case4: bind pages 0-3, then reset pages 2-3 to the default policy.
     *
     *       AAAA
     *     PPPPPPNNNNNN
     *     might become
     *     PPNNNNNNNNNN
     *
     * The helper script is expected to report "2,4".
     */
    void case4(void)
    {
    	size_t nread;

    	init();
    	/*
    	 * rubysrc formats both addresses with %llx, so convert the pointers
    	 * to unsigned long long explicitly instead of passing void *.
    	 */
    	snprintf(buf, sizeof(buf), rubysrc, getpid(),
    		 (unsigned long long)(unsigned long)mmap_addr,
    		 (unsigned long long)((unsigned long)mmap_addr + pagesize*6));

    	mem_bind(0, 4);
    	mem_unbind(2, 2);

    	file = popen(buf, "r");
    	if (file == NULL)
    		perror("popen "), exit(1);
    	/* Read at most size-1 bytes so retbuf is always NUL-terminated. */
    	nread = fread(retbuf, 1, sizeof(retbuf) - 1, file);
    	retbuf[nread] = '\0';
    	pclose(file);
    	Assert("2,4", retbuf, "case4", __LINE__);

    	fin();
    }
    
    /*
     * case5: bind pages 0-1, then bind the adjacent pages 2-3.
     *
     *        AAAA
     *  PPPPPPNNNNNN
     *  might become
     *  PPPPPPPPPPNN
     *
     * The helper script is expected to report "4,2".
     */
    void case5(void)
    {
    	size_t nread;

    	init();
    	/*
    	 * rubysrc formats both addresses with %llx, so convert the pointers
    	 * to unsigned long long explicitly instead of passing void *.
    	 */
    	snprintf(buf, sizeof(buf), rubysrc, getpid(),
    		 (unsigned long long)(unsigned long)mmap_addr,
    		 (unsigned long long)((unsigned long)mmap_addr + pagesize*6));

    	mem_bind(0, 2);
    	mem_bind(2, 2);

    	file = popen(buf, "r");
    	if (file == NULL)
    		perror("popen "), exit(1);
    	/* Read at most size-1 bytes so retbuf is always NUL-terminated. */
    	nread = fread(retbuf, 1, sizeof(retbuf) - 1, file);
    	retbuf[nread] = '\0';
    	pclose(file);
    	Assert("4,2", retbuf, "case5", __LINE__);

    	fin();
    }
    
    /*
     * case6: bind pages 0-1 and 4-5, then bind the gap (pages 2-3) with the
     * same policy.
     *
     *	    AAAA
     *	PPPPNNNNXXXX
     *	might become
     *	PPPPPPPPPPPP 6
     *
     * The helper script is expected to report "6".
     */
    void case6(void)
    {
    	size_t nread;

    	init();
    	/*
    	 * rubysrc formats both addresses with %llx, so convert the pointers
    	 * to unsigned long long explicitly instead of passing void *.
    	 */
    	snprintf(buf, sizeof(buf), rubysrc, getpid(),
    		 (unsigned long long)(unsigned long)mmap_addr,
    		 (unsigned long long)((unsigned long)mmap_addr + pagesize*6));

    	mem_bind(0, 2);
    	mem_bind(4, 2);
    	mem_bind(2, 2);

    	file = popen(buf, "r");
    	if (file == NULL)
    		perror("popen "), exit(1);
    	/* Read at most size-1 bytes so retbuf is always NUL-terminated. */
    	nread = fread(retbuf, 1, sizeof(retbuf) - 1, file);
    	retbuf[nread] = '\0';
    	pclose(file);
    	Assert("6", retbuf, "case6", __LINE__);

    	fin();
    }
    
    /*
     * case7: bind pages 0-1, interleave pages 4-5, then bind the gap
     * (pages 2-3).
     *
     *     AAAA
     * PPPPNNNNXXXX
     * might become
     * PPPPPPPPXXXX 7
     *
     * The helper script is expected to report "4,2".
     */
    void case7(void)
    {
    	size_t nread;

    	init();
    	/*
    	 * rubysrc formats both addresses with %llx, so convert the pointers
    	 * to unsigned long long explicitly instead of passing void *.
    	 */
    	snprintf(buf, sizeof(buf), rubysrc, getpid(),
    		 (unsigned long long)(unsigned long)mmap_addr,
    		 (unsigned long long)((unsigned long)mmap_addr + pagesize*6));

    	mem_bind(0, 2);
    	mem_interleave(4, 2);
    	mem_bind(2, 2);

    	file = popen(buf, "r");
    	if (file == NULL)
    		perror("popen "), exit(1);
    	/* Read at most size-1 bytes so retbuf is always NUL-terminated. */
    	nread = fread(retbuf, 1, sizeof(retbuf) - 1, file);
    	retbuf[nread] = '\0';
    	pclose(file);
    	Assert("4,2", retbuf, "case7", __LINE__);

    	fin();
    }
    
    /*
     * case8: bind pages 0-1, interleave pages 4-5, then interleave the gap
     * (pages 2-3).
     *
     *     AAAA
     * PPPPNNNNXXXX
     * might become
     * PPPPNNNNNNNN 8
     *
     * The helper script is expected to report "2,4".
     */
    void case8(void)
    {
    	size_t nread;

    	init();
    	/*
    	 * rubysrc formats both addresses with %llx, so convert the pointers
    	 * to unsigned long long explicitly instead of passing void *.
    	 */
    	snprintf(buf, sizeof(buf), rubysrc, getpid(),
    		 (unsigned long long)(unsigned long)mmap_addr,
    		 (unsigned long long)((unsigned long)mmap_addr + pagesize*6));

    	mem_bind(0, 2);
    	mem_interleave(4, 2);
    	mem_interleave(2, 2);

    	file = popen(buf, "r");
    	if (file == NULL)
    		perror("popen "), exit(1);
    	/* Read at most size-1 bytes so retbuf is always NUL-terminated. */
    	nread = fread(retbuf, 1, sizeof(retbuf) - 1, file);
    	retbuf[nread] = '\0';
    	pclose(file);
    	Assert("2,4", retbuf, "case8", __LINE__);

    	fin();
    }
    
    /*
     * case_n: remap the last two pages of the window to file offset 3 pages,
     * then apply the default policy to pages 2-3.  The mbind() call is
     * expected to change nothing, so the helper script should report "4,2".
     */
    void case_n(void)
    {
    	size_t nread;
    	void *remap;

    	init();
    	/*
    	 * rubysrc formats both addresses with %llx, so convert the pointers
    	 * to unsigned long long explicitly instead of passing void *.
    	 */
    	snprintf(buf, sizeof(buf), rubysrc, getpid(),
    		 (unsigned long long)(unsigned long)mmap_addr,
    		 (unsigned long long)((unsigned long)mmap_addr + pagesize*6));

    	/* make redundant mappings [0][1234][34][7] */
    	remap = mmap((char *)mmap_addr + pagesize*4, pagesize*2,
    		     PROT_READ|PROT_WRITE,
    		     MAP_FIXED|MAP_SHARED, mapped_fd, pagesize*3);
    	/* Without this mapping the layout under test does not exist. */
    	if (remap == MAP_FAILED)
    		perror("mmap "), exit(1);

    	/* Expect to do nothing. */
    	mem_unbind(2, 2);

    	file = popen(buf, "r");
    	if (file == NULL)
    		perror("popen "), exit(1);
    	/* Read at most size-1 bytes so retbuf is always NUL-terminated. */
    	nread = fread(retbuf, 1, sizeof(retbuf) - 1, file);
    	retbuf[nread] = '\0';
    	pclose(file);
    	Assert("4,2", retbuf, "case_n", __LINE__);

    	fin();
    }
    
    int main(int argc, char** argv)
    {
    	case4();
    	case5();
    	case6();
    	case7();
    	case8();
    	case_n();
    
    	return 0;
    }
    =============================================================
    Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
    Acked-by: Johannes Weiner <hannes@cmpxchg.org>
    Cc: Minchan Kim <minchan.kim@gmail.com>
    Cc: Caspar Zhang <caspar@casparzhang.com>
    Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
    Cc: Christoph Lameter <cl@linux.com>
    Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
    Cc: Mel Gorman <mel@csn.ul.ie>
    Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
    Cc: <stable@vger.kernel.org>		[3.1.x]
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    e26a5114
mempolicy.c 64.5 KB