Commit 44df75e6 authored by Matt Tolentino, committed by Linus Torvalds

[PATCH] x86_64: add x86-64 support for memory hot-add

Add x86-64 specific memory hot-add functions, Kconfig options,
and runtime kernel page table update functions to make
hot-add usable on x86-64 machines.  Also, fixup the nefarious
conditional locking and exports pointed out by Andi.

Tested on Intel and IBM x86-64 memory hot-add capable systems.
Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Parent 8817210d
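For context, a minimal user-space sketch (not part of the patch) of how the new hooks are typically exercised once CONFIG_MEMORY_HOTPLUG and the new ARCH_MEMORY_PROBE option are enabled: writing the start address of newly added memory to the sysfs probe file leads the kernel into the arch add_memory() added below, and the resulting memory blocks are then onlined, which frees their pages through online_page(). The physical address used here is a hypothetical example value.

/* Hypothetical example: probe a newly hot-added DIMM at 4 GB. */
#include <stdio.h>

int main(void)
{
	unsigned long long start = 0x100000000ULL;	/* example physical base; use the real DIMM address */
	FILE *probe = fopen("/sys/devices/system/memory/probe", "w");

	if (!probe) {
		perror("open probe");
		return 1;
	}
	/* The kernel parses this address and eventually calls the arch add_memory(). */
	fprintf(probe, "0x%llx\n", start);
	fclose(probe);

	/*
	 * Each resulting memory block still has to be onlined, e.g.
	 *   echo online > /sys/devices/system/memory/memoryN/state
	 * which hands the pages to the new online_page() in this patch.
	 */
	return 0;
}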
@@ -305,7 +305,11 @@ config ARCH_DISCONTIGMEM_DEFAULT
 
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
-	depends on NUMA
+	depends on (NUMA || EXPERIMENTAL)
+
+config ARCH_MEMORY_PROBE
+	def_bool y
+	depends on MEMORY_HOTPLUG
 
 config ARCH_FLATMEM_ENABLE
 	def_bool y
@@ -315,6 +319,7 @@ source "mm/Kconfig"
 
 config HAVE_ARCH_EARLY_PFN_TO_NID
 	def_bool y
+	depends on NUMA
 
 config NR_CPUS
 	int "Maximum number of CPUs (2-256)"
@@ -24,6 +24,8 @@
 #include <linux/proc_fs.h>
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
+#include <linux/module.h>
+#include <linux/memory_hotplug.h>
 
 #include <asm/processor.h>
 #include <asm/system.h>
@@ -180,13 +182,19 @@ static struct temp_map {
 	{}
 };
 
-static __init void *alloc_low_page(int *index, unsigned long *phys)
+static __meminit void *alloc_low_page(int *index, unsigned long *phys)
 {
 	struct temp_map *ti;
 	int i;
 	unsigned long pfn = table_end++, paddr;
 	void *adr;
 
+	if (after_bootmem) {
+		adr = (void *)get_zeroed_page(GFP_ATOMIC);
+		*phys = __pa(adr);
+		return adr;
+	}
+
 	if (pfn >= end_pfn)
 		panic("alloc_low_page: ran out of memory");
 	for (i = 0; temp_mappings[i].allocated; i++) {
@@ -199,55 +207,86 @@ static __init void *alloc_low_page(int *index, unsigned long *phys)
 	ti->allocated = 1;
 	__flush_tlb();
 	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
+	memset(adr, 0, PAGE_SIZE);
 	*index = i;
 	*phys = pfn * PAGE_SIZE;
 	return adr;
 }
 
-static __init void unmap_low_page(int i)
+static __meminit void unmap_low_page(int i)
 {
-	struct temp_map *ti = &temp_mappings[i];
+	struct temp_map *ti;
+
+	if (after_bootmem)
+		return;
+
+	ti = &temp_mappings[i];
 	set_pmd(ti->pmd, __pmd(0));
 	ti->allocated = 0;
 }
 
-static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static void __meminit
+phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
+{
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
+		unsigned long entry;
+
+		if (address > end) {
+			for (; i < PTRS_PER_PMD; i++, pmd++)
+				set_pmd(pmd, __pmd(0));
+			break;
+		}
+		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
+		entry &= __supported_pte_mask;
+		set_pmd(pmd, __pmd(entry));
+	}
+}
+
+static void __meminit
+phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
+{
+	pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));
+
+	if (pmd_none(*pmd)) {
+		spin_lock(&init_mm.page_table_lock);
+		phys_pmd_init(pmd, address, end);
+		spin_unlock(&init_mm.page_table_lock);
+		__flush_tlb_all();
+	}
+}
+
+static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
 {
-	long i, j;
+	long i = pud_index(address);
 
-	i = pud_index(address);
 	pud = pud + i;
+
+	if (after_bootmem && pud_val(*pud)) {
+		phys_pmd_update(pud, address, end);
+		return;
+	}
+
 	for (; i < PTRS_PER_PUD; pud++, i++) {
 		int map;
 		unsigned long paddr, pmd_phys;
 		pmd_t *pmd;
 
-		paddr = address + i*PUD_SIZE;
-		if (paddr >= end) {
-			for (; i < PTRS_PER_PUD; i++, pud++)
-				set_pud(pud, __pud(0));
+		paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
+		if (paddr >= end)
 			break;
-		}
 
-		if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
+		if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
 			set_pud(pud, __pud(0));
 			continue;
 		}
 
 		pmd = alloc_low_page(&map, &pmd_phys);
+		spin_lock(&init_mm.page_table_lock);
 		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
-		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
-			unsigned long pe;
-
-			if (paddr >= end) {
-				for (; j < PTRS_PER_PMD; j++, pmd++)
-					set_pmd(pmd, __pmd(0));
-				break;
-			}
-			pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
-			pe &= __supported_pte_mask;
-			set_pmd(pmd, __pmd(pe));
-		}
+		phys_pmd_init(pmd, paddr, end);
+		spin_unlock(&init_mm.page_table_lock);
 		unmap_low_page(map);
 	}
 	__flush_tlb();
@@ -272,12 +311,15 @@ static void __init find_early_table_space(unsigned long end)
 	table_start >>= PAGE_SHIFT;
 	table_end = table_start;
+
+	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
+		end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
 }
 
 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
    This runs before bootmem is initialized and gets pages directly from the
    physical memory. To access them they are temporarily mapped. */
-void __init init_memory_mapping(unsigned long start, unsigned long end)
+void __meminit init_memory_mapping(unsigned long start, unsigned long end)
 {
 	unsigned long next;
@@ -289,7 +331,8 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
 	 * mapped. Unfortunately this is done currently before the nodes are
 	 * discovered.
 	 */
-	find_early_table_space(end);
+	if (!after_bootmem)
+		find_early_table_space(end);
 
 	start = (unsigned long)__va(start);
 	end = (unsigned long)__va(end);
@@ -297,20 +340,26 @@ void __init init_memory_mapping(unsigned long start, unsigned long end)
 	for (; start < end; start = next) {
 		int map;
 		unsigned long pud_phys;
-		pud_t *pud = alloc_low_page(&map, &pud_phys);
+		pgd_t *pgd = pgd_offset_k(start);
+		pud_t *pud;
+
+		if (after_bootmem)
+			pud = pud_offset_k(pgd, __PAGE_OFFSET);
+		else
+			pud = alloc_low_page(&map, &pud_phys);
+
 		next = start + PGDIR_SIZE;
 		if (next > end)
 			next = end;
 		phys_pud_init(pud, __pa(start), __pa(next));
-		set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
+		if (!after_bootmem)
+			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
 		unmap_low_page(map);
 	}
 
-	asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
+	if (!after_bootmem)
+		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
 	__flush_tlb_all();
-
-	early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
-	       table_start<<PAGE_SHIFT,
-	       table_end<<PAGE_SHIFT);
 }
 
 void __cpuinit zap_low_mappings(int cpu)
@@ -385,6 +434,9 @@ size_zones(unsigned long *z, unsigned long *h,
 void __init paging_init(void)
 {
 	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
+
+	memory_present(0, 0, end_pfn);
+	sparse_init();
 	size_zones(zones, holes, 0, end_pfn);
 	free_area_init_node(0, NODE_DATA(0), zones,
 			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
@@ -425,6 +477,50 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 	__flush_tlb_all();
 }
 
+/*
+ * Memory hotplug specific functions
+ * These are only for non-NUMA machines right now.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+void online_page(struct page *page)
+{
+	ClearPageReserved(page);
+	set_page_count(page, 1);
+	__free_page(page);
+	totalram_pages++;
+	num_physpages++;
+}
+
+int add_memory(u64 start, u64 size)
+{
+	struct pglist_data *pgdat = NODE_DATA(0);
+	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	int ret;
+
+	ret = __add_pages(zone, start_pfn, nr_pages);
+	if (ret)
+		goto error;
+
+	init_memory_mapping(start, (start + size -1));
+
+	return ret;
+error:
+	printk("%s: Problem encountered in __add_pages!\n", __func__);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(add_memory);
+
+int remove_memory(u64 start, u64 size)
+{
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(remove_memory);
+
+#endif
+
 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
 			 kcore_vsyscall;