提交 e33c0197 编写于 作者: L Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (36 commits)
  x86, mm: Correct the implementation of is_untracked_pat_range()
  x86/pat: Trivial: don't create debugfs for memtype if pat is disabled
  x86, mtrr: Fix sorting of mtrr after subtracting
  x86: Move find_smp_config() earlier and avoid bootmem usage
  x86, platform: Change is_untracked_pat_range() to bool; cleanup init
  x86: Change is_ISA_range() into an inline function
  x86, mm: is_untracked_pat_range() takes a normal semiclosed range
  x86, mm: Call is_untracked_pat_range() rather than is_ISA_range()
  x86: UV SGI: Don't track GRU space in PAT
  x86: SGI UV: Fix BAU initialization
  x86, numa: Use near(er) online node instead of roundrobin for NUMA
  x86, numa, bootmem: Only free bootmem on NUMA failure path
  x86: Change crash kernel to reserve via reserve_early()
  x86: Eliminate redundant/contradicting cache line size config options
  x86: When cleaning MTRRs, do not fold WP into UC
  x86: remove "extern" from function prototypes in <asm/proto.h>
  x86, mm: Report state of NX protections during boot
  x86, mm: Clean up and simplify NX enablement
  x86, pageattr: Make set_memory_(x|nx) aware of NX support
  x86, sleep: Always save the value of EFER
  ...

Fix up conflicts (added both iommu_shutdown and is_untracked_pat_range)
to 'struct x86_platform_ops') in
	arch/x86/include/asm/x86_init.h
	arch/x86/kernel/x86_init.c
......@@ -301,15 +301,11 @@ config X86_CPU
#
# Define implied options from the CPU selection here
config X86_L1_CACHE_BYTES
config X86_INTERNODE_CACHE_SHIFT
int
default "128" if MPSC
default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32
config X86_INTERNODE_CACHE_BYTES
int
default "4096" if X86_VSMP
default X86_L1_CACHE_BYTES if !X86_VSMP
default "12" if X86_VSMP
default "7" if NUMA
default X86_L1_CACHE_SHIFT
config X86_CMPXCHG
def_bool X86_64 || (X86_32 && !M386)
......@@ -317,9 +313,9 @@ config X86_CMPXCHG
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
config X86_XADD
def_bool y
......
......@@ -107,8 +107,7 @@ ENTRY(startup_32)
lgdt gdt(%ebp)
/* Enable PAE mode */
xorl %eax, %eax
orl $(X86_CR4_PAE), %eax
movl $(X86_CR4_PAE), %eax
movl %eax, %cr4
/*
......
......@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
#undef i386
#include <asm/cache.h>
#include <asm/page_types.h>
#ifdef CONFIG_X86_64
......@@ -46,7 +47,7 @@ SECTIONS
*(.data.*)
_edata = . ;
}
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
. = ALIGN(L1_CACHE_BYTES);
.bss : {
_bss = . ;
*(.bss)
......
......@@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void);
extern unsigned long acpi_wakeup_address;
/* early initialization routine */
extern void acpi_reserve_bootmem(void);
extern void acpi_reserve_wakeup_memory(void);
/*
* Check if the CPU can handle C2 and deeper
......@@ -158,6 +158,7 @@ struct bootnode;
#ifdef CONFIG_ACPI_NUMA
extern int acpi_numa;
extern int acpi_get_nodes(struct bootnode *physnodes);
extern int acpi_scan_nodes(unsigned long start, unsigned long end);
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
......
......@@ -9,12 +9,13 @@
#define __read_mostly __attribute__((__section__(".data.read_mostly")))
#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
#ifdef CONFIG_X86_VSMP
/* vSMP Internode cacheline shift */
#define INTERNODE_CACHE_SHIFT (12)
#ifdef CONFIG_SMP
#define __cacheline_aligned_in_smp \
__attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
__attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
__page_aligned_data
#endif
#endif
......
......@@ -177,6 +177,7 @@ void clflush_cache_range(void *addr, unsigned int size);
#ifdef CONFIG_DEBUG_RODATA
void mark_rodata_ro(void);
extern const int rodata_test_data;
extern int kernel_set_to_readonly;
void set_kernel_text_rw(void);
void set_kernel_text_ro(void);
#else
......
......@@ -61,6 +61,12 @@ struct e820map {
struct e820entry map[E820_X_MAX];
};
#define ISA_START_ADDRESS 0xa0000
#define ISA_END_ADDRESS 0x100000
#define BIOS_BEGIN 0x000a0000
#define BIOS_END 0x00100000
#ifdef __KERNEL__
/* see comment in arch/x86/kernel/e820.c */
extern struct e820map e820;
......@@ -126,15 +132,18 @@ extern void e820_reserve_resources(void);
extern void e820_reserve_resources_late(void);
extern void setup_memory_map(void);
extern char *default_machine_specific_memory_setup(void);
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
#define ISA_START_ADDRESS 0xa0000
#define ISA_END_ADDRESS 0x100000
#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
/*
* Returns true iff the specified range [s,e) is completely contained inside
* the ISA region.
*/
static inline bool is_ISA_range(u64 s, u64 e)
{
return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS;
}
#define BIOS_BEGIN 0x000a0000
#define BIOS_END 0x00100000
#endif /* __KERNEL__ */
#endif /* __ASSEMBLY__ */
#ifdef __KERNEL__
#include <linux/ioport.h>
......
......@@ -4,13 +4,16 @@
#include <linux/pci.h>
extern struct pci_device_id k8_nb_ids[];
struct bootnode;
extern int early_is_k8_nb(u32 value);
extern struct pci_dev **k8_northbridges;
extern int num_k8_northbridges;
extern int cache_k8_northbridges(void);
extern void k8_flush_garts(void);
extern int k8_scan_nodes(unsigned long start, unsigned long end);
extern int k8_get_nodes(struct bootnode *nodes);
extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
extern int k8_scan_nodes(void);
#ifdef CONFIG_K8_NB
static inline struct pci_dev *node_to_k8_nb_misc(int node)
......
......@@ -71,12 +71,7 @@ static inline void early_get_smp_config(void)
static inline void find_smp_config(void)
{
x86_init.mpparse.find_smp_config(1);
}
static inline void early_find_smp_config(void)
{
x86_init.mpparse.find_smp_config(0);
x86_init.mpparse.find_smp_config();
}
#ifdef CONFIG_X86_MPPARSE
......@@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
# else
# define default_mpc_oem_bus_info NULL
# endif
extern void default_find_smp_config(unsigned int reserve);
extern void default_find_smp_config(void);
extern void default_get_smp_config(unsigned int early);
#else
static inline void early_reserve_e820_mpc_new(void) { }
......@@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
#define default_mpc_apic_id NULL
#define default_smp_read_mpc_oem NULL
#define default_mpc_oem_bus_info NULL
#define default_find_smp_config x86_init_uint_noop
#define default_find_smp_config x86_init_noop
#define default_get_smp_config x86_init_uint_noop
#endif
......
......@@ -49,7 +49,8 @@ extern unsigned long max_pfn_mapped;
extern unsigned long init_memory_mapping(unsigned long start,
unsigned long end);
extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
int acpi, int k8);
extern void free_initmem(void);
#endif /* !__ASSEMBLY__ */
......
......@@ -16,6 +16,8 @@
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
/*
* ZERO_PAGE is a global shared page that is always zero: used
* for zero-mapped memory areas etc..
......@@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
unsigned long new_flags)
{
/*
* PAT type is always WB for ISA. So no need to check.
* PAT type is always WB for untracked ranges, so no need to check.
*/
if (is_ISA_range(paddr, paddr + size - 1))
if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
return 1;
/*
......
......@@ -5,18 +5,19 @@
/* misc architecture specific prototypes */
extern void early_idt_handler(void);
void early_idt_handler(void);
extern void system_call(void);
extern void syscall_init(void);
void system_call(void);
void syscall_init(void);
extern void ia32_syscall(void);
extern void ia32_cstar_target(void);
extern void ia32_sysenter_target(void);
void ia32_syscall(void);
void ia32_cstar_target(void);
void ia32_sysenter_target(void);
extern void syscall32_cpu_init(void);
void syscall32_cpu_init(void);
extern void check_efer(void);
void x86_configure_nx(void);
void x86_report_nx(void);
extern int reboot_force;
......
......@@ -2,7 +2,13 @@
#define _ASM_X86_SECTIONS_H
#include <asm-generic/sections.h>
#include <asm/uaccess.h>
extern char __brk_base[], __brk_limit[];
extern struct exception_table_entry __stop___ex_table[];
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
extern char __end_rodata_hpage_align[];
#endif
#endif /* _ASM_X86_SECTIONS_H */
......@@ -26,7 +26,7 @@ struct x86_init_mpparse {
void (*smp_read_mpc_oem)(struct mpc_table *mpc);
void (*mpc_oem_pci_bus)(struct mpc_bus *m);
void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
void (*find_smp_config)(unsigned int reserve);
void (*find_smp_config)(void);
void (*get_smp_config)(unsigned int early);
};
......@@ -125,12 +125,14 @@ struct x86_cpuinit_ops {
* @calibrate_tsc: calibrate TSC
* @get_wallclock: get time from HW clock like RTC etc.
* @set_wallclock: set time back to HW clock
* @is_untracked_pat_range exclude from PAT logic
*/
struct x86_platform_ops {
unsigned long (*calibrate_tsc)(void);
unsigned long (*get_wallclock)(void);
int (*set_wallclock)(unsigned long nowtime);
void (*iommu_shutdown)(void);
bool (*is_untracked_pat_range)(u64 start, u64 end);
};
extern struct x86_init_ops x86_init;
......
......@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
#ifndef CONFIG_64BIT
store_gdt((struct desc_ptr *)&header->pmode_gdt);
header->pmode_efer_low = nx_enabled;
if (header->pmode_efer_low & 1) {
/* This is strange, why not save efer, always? */
rdmsr(MSR_EFER, header->pmode_efer_low,
header->pmode_efer_high);
}
if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
&header->pmode_efer_high))
header->pmode_efer_low = header->pmode_efer_high = 0;
#endif /* !CONFIG_64BIT */
header->pmode_cr0 = read_cr0();
......@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
/**
* acpi_reserve_bootmem - do _very_ early ACPI initialisation
* acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
*
* We allocate a page from the first 1MB of memory for the wakeup
* routine for when we come back from a sleep state. The
* runtime allocator allows specification of <16MB pages, but not
* <1MB pages.
*/
void __init acpi_reserve_bootmem(void)
void __init acpi_reserve_wakeup_memory(void)
{
unsigned long mem;
if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
printk(KERN_ERR
"ACPI: Wakeup code way too big, S3 disabled.\n");
return;
}
acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
if (!acpi_realmode) {
if (mem == -1L) {
printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
return;
}
acpi_wakeup_address = virt_to_phys((void *)acpi_realmode);
acpi_realmode = (unsigned long) phys_to_virt(mem);
acpi_wakeup_address = mem;
reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
}
......
......@@ -263,11 +263,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
static __init void early_check_numaq(void)
{
/*
* Find possible boot-time SMP configuration:
*/
early_find_smp_config();
/*
* get boot-time SMP configuration:
*/
......
......@@ -30,10 +30,22 @@
#include <asm/apic.h>
#include <asm/ipi.h>
#include <asm/smp.h>
#include <asm/x86_init.h>
DEFINE_PER_CPU(int, x2apic_extra_bits);
static enum uv_system_type uv_system_type;
static u64 gru_start_paddr, gru_end_paddr;
static inline bool is_GRU_range(u64 start, u64 end)
{
return start >= gru_start_paddr && end <= gru_end_paddr;
}
static bool uv_is_untracked_pat_range(u64 start, u64 end)
{
return is_ISA_range(start, end) || is_GRU_range(start, end);
}
static int early_get_nodeid(void)
{
......@@ -49,6 +61,7 @@ static int early_get_nodeid(void)
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
if (!strcmp(oem_id, "SGI")) {
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
if (!strcmp(oem_table_id, "UVL"))
uv_system_type = UV_LEGACY_APIC;
else if (!strcmp(oem_table_id, "UVX"))
......@@ -385,8 +398,12 @@ static __init void map_gru_high(int max_pnode)
int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
if (gru.s.enable)
if (gru.s.enable) {
map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
gru_start_paddr = ((u64)gru.s.base << shift);
gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
}
}
static __init void map_mmr_high(int max_pnode)
......
......@@ -1136,7 +1136,7 @@ void __cpuinit cpu_init(void)
wrmsrl(MSR_KERNEL_GS_BASE, 0);
barrier();
check_efer();
x86_configure_nx();
if (cpu != 0)
enable_x2apic();
......
......@@ -263,8 +263,12 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
/* Don't do the funky fallback heuristics the AMD version employs
for now. */
node = apicid_to_node[apicid];
if (node == NUMA_NO_NODE || !node_online(node))
if (node == NUMA_NO_NODE)
node = first_node(node_online_map);
else if (!node_online(node)) {
/* reuse the value from init_cpu_to_node() */
node = cpu_to_node(cpu);
}
numa_set_node(cpu, node);
printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
......
......@@ -170,6 +170,41 @@ static int __init cmp_range(const void *x1, const void *x2)
return start1 - start2;
}
static int __init clean_sort_range(struct res_range *range, int az)
{
int i, j, k = az - 1, nr_range = 0;
for (i = 0; i < k; i++) {
if (range[i].end)
continue;
for (j = k; j > i; j--) {
if (range[j].end) {
k = j;
break;
}
}
if (j == i)
break;
range[i].start = range[k].start;
range[i].end = range[k].end;
range[k].start = 0;
range[k].end = 0;
k--;
}
/* count it */
for (i = 0; i < az; i++) {
if (!range[i].end) {
nr_range = i;
break;
}
}
/* sort them */
sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
return nr_range;
}
#define BIOS_BUG_MSG KERN_WARNING \
"WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
......@@ -223,22 +258,18 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
subtract_range(range, extra_remove_base,
extra_remove_base + extra_remove_size - 1);
/* get new range num */
nr_range = 0;
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
nr_range++;
}
if (debug_print) {
printk(KERN_DEBUG "After UC checking\n");
for (i = 0; i < nr_range; i++)
for (i = 0; i < RANGE_NUM; i++) {
if (!range[i].end)
continue;
printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n",
range[i].start, range[i].end + 1);
}
}
/* sort the ranges */
sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
nr_range = clean_sort_range(range, RANGE_NUM);
if (debug_print) {
printk(KERN_DEBUG "After sorting\n");
for (i = 0; i < nr_range; i++)
......@@ -689,8 +720,6 @@ static int __init mtrr_need_cleanup(void)
continue;
if (!size)
type = MTRR_NUM_TYPES;
if (type == MTRR_TYPE_WRPROT)
type = MTRR_TYPE_UNCACHABLE;
num[type]++;
}
......
......@@ -189,9 +189,26 @@ static void wait_for_nmi(void)
nmi_wait_count++;
}
static inline int
within(unsigned long addr, unsigned long start, unsigned long end)
{
return addr >= start && addr < end;
}
static int
do_ftrace_mod_code(unsigned long ip, void *new_code)
{
/*
* On x86_64, kernel text mappings are mapped read-only with
* CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
* of the kernel text mapping to modify the kernel text.
*
* For 32bit kernels, these mappings are same and we can use
* kernel identity mapping to modify code.
*/
if (within(ip, (unsigned long)_text, (unsigned long)_etext))
ip = (unsigned long)__va(__pa(ip));
mod_code_ip = (void *)ip;
mod_code_newcode = new_code;
......
......@@ -18,6 +18,8 @@
#include <asm/asm-offsets.h>
#include <asm/setup.h>
#include <asm/processor-flags.h>
#include <asm/msr-index.h>
#include <asm/cpufeature.h>
#include <asm/percpu.h>
/* Physical address */
......@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
orl %edx,%eax
movl %eax,%cr4
btl $5, %eax # check if PAE is enabled
jnc 6f
testb $X86_CR4_PAE, %al # check if PAE is enabled
jz 6f
/* Check if extended functions are implemented */
movl $0x80000000, %eax
cpuid
cmpl $0x80000000, %eax
jbe 6f
/* Value must be in the range 0x80000001 to 0x8000ffff */
subl $0x80000001, %eax
cmpl $(0x8000ffff-0x80000001), %eax
ja 6f
mov $0x80000001, %eax
cpuid
/* Execute Disable bit supported? */
btl $20, %edx
btl $(X86_FEATURE_NX & 31), %edx
jnc 6f
/* Setup EFER (Extended Feature Enable Register) */
movl $0xc0000080, %ecx
movl $MSR_EFER, %ecx
rdmsr
btsl $11, %eax
btsl $_EFER_NX, %eax
/* Make changes effective */
wrmsr
......
......@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
.quad x86_64_start_kernel
ENTRY(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)
__FINITDATA
ENTRY(stack_start)
.quad init_thread_union+THREAD_SIZE-8
.word 0
__FINITDATA
bad_address:
jmp bad_address
......@@ -340,6 +340,7 @@ ENTRY(name)
i = i + 1 ; \
.endr
.data
/*
* This default setting generates an ident mapping at address 0x100000
* and a mapping for the kernel that precisely maps virtual address
......
......@@ -158,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
{
int error;
if (nx_enabled)
set_pages_x(image->control_code_page, 1);
set_pages_x(image->control_code_page, 1);
error = machine_kexec_alloc_page_tables(image);
if (error)
return error;
......@@ -173,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
*/
void machine_kexec_cleanup(struct kimage *image)
{
if (nx_enabled)
set_pages_nx(image->control_code_page, 1);
set_pages_nx(image->control_code_page, 1);
machine_kexec_free_page_tables(image);
}
......
......@@ -667,36 +667,18 @@ void __init default_get_smp_config(unsigned int early)
*/
}
static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
static void __init smp_reserve_memory(struct mpf_intel *mpf)
{
unsigned long size = get_mpc_size(mpf->physptr);
#ifdef CONFIG_X86_32
/*
* We cannot access to MPC table to compute table size yet,
* as only few megabytes from the bottom is mapped now.
* PC-9800's MPC table places on the very last of physical
* memory; so that simply reserving PAGE_SIZE from mpf->physptr
* yields BUG() in reserve_bootmem.
* also need to make sure physptr is below than max_low_pfn
* we don't need reserve the area above max_low_pfn
*/
unsigned long end = max_low_pfn * PAGE_SIZE;
if (mpf->physptr < end) {
if (mpf->physptr + size > end)
size = end - mpf->physptr;
reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
}
#else
reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
#endif
reserve_early(mpf->physptr, mpf->physptr+size, "MP-table mpc");
}
static int __init smp_scan_config(unsigned long base, unsigned long length,
unsigned reserve)
static int __init smp_scan_config(unsigned long base, unsigned long length)
{
unsigned int *bp = phys_to_virt(base);
struct mpf_intel *mpf;
unsigned long mem;
apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
bp, length);
......@@ -717,12 +699,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
mpf, (u64)virt_to_phys(mpf));
if (!reserve)
return 1;
reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
BOOTMEM_DEFAULT);
mem = virt_to_phys(mpf);
reserve_early(mem, mem + sizeof(*mpf), "MP-table mpf");
if (mpf->physptr)
smp_reserve_bootmem(mpf);
smp_reserve_memory(mpf);
return 1;
}
......@@ -732,7 +712,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
return 0;
}
void __init default_find_smp_config(unsigned int reserve)
void __init default_find_smp_config(void)
{
unsigned int address;
......@@ -744,9 +724,9 @@ void __init default_find_smp_config(unsigned int reserve)
* 2) Scan the top 1K of base RAM
* 3) Scan the 64K of bios
*/
if (smp_scan_config(0x0, 0x400, reserve) ||
smp_scan_config(639 * 0x400, 0x400, reserve) ||
smp_scan_config(0xF0000, 0x10000, reserve))
if (smp_scan_config(0x0, 0x400) ||
smp_scan_config(639 * 0x400, 0x400) ||
smp_scan_config(0xF0000, 0x10000))
return;
/*
* If it is an SMP machine we should know now, unless the
......@@ -767,7 +747,7 @@ void __init default_find_smp_config(unsigned int reserve)
address = get_bios_ebda();
if (address)
smp_scan_config(address, 0x400, reserve);
smp_scan_config(address, 0x400);
}
#ifdef CONFIG_X86_IO_APIC
......
......@@ -106,6 +106,7 @@
#include <asm/percpu.h>
#include <asm/topology.h>
#include <asm/apicdef.h>
#include <asm/k8.h>
#ifdef CONFIG_X86_64
#include <asm/numa_64.h>
#endif
......@@ -487,42 +488,11 @@ static void __init reserve_early_setup_data(void)
#ifdef CONFIG_KEXEC
/**
* Reserve @size bytes of crashkernel memory at any suitable offset.
*
* @size: Size of the crashkernel memory to reserve.
* Returns the base address on success, and -1ULL on failure.
*/
static
unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
{
const unsigned long long alignment = 16<<20; /* 16M */
unsigned long long start = 0LL;
while (1) {
int ret;
start = find_e820_area(start, ULONG_MAX, size, alignment);
if (start == -1ULL)
return start;
/* try to reserve it */
ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
if (ret >= 0)
return start;
start += alignment;
}
}
static inline unsigned long long get_total_mem(void)
{
unsigned long long total;
total = max_low_pfn - min_low_pfn;
#ifdef CONFIG_HIGHMEM
total += highend_pfn - highstart_pfn;
#endif
total = max_pfn - min_low_pfn;
return total << PAGE_SHIFT;
}
......@@ -542,21 +512,25 @@ static void __init reserve_crashkernel(void)
/* 0 means: find the address automatically */
if (crash_base <= 0) {
crash_base = find_and_reserve_crashkernel(crash_size);
const unsigned long long alignment = 16<<20; /* 16M */
crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
alignment);
if (crash_base == -1ULL) {
pr_info("crashkernel reservation failed. "
"No suitable area found.\n");
pr_info("crashkernel reservation failed - No suitable area found.\n");
return;
}
} else {
ret = reserve_bootmem_generic(crash_base, crash_size,
BOOTMEM_EXCLUSIVE);
if (ret < 0) {
pr_info("crashkernel reservation failed - "
"memory is in use\n");
unsigned long long start;
start = find_e820_area(crash_base, ULONG_MAX, crash_size,
1<<20);
if (start != crash_base) {
pr_info("crashkernel reservation failed - memory is in use.\n");
return;
}
}
reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
"for crashkernel (System RAM: %ldMB)\n",
......@@ -699,6 +673,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
void __init setup_arch(char **cmdline_p)
{
int acpi = 0;
int k8 = 0;
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
......@@ -791,21 +768,18 @@ void __init setup_arch(char **cmdline_p)
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
#ifdef CONFIG_X86_64
/*
* Must call this twice: Once just to detect whether hardware doesn't
* support NX (so that the early EHCI debug console setup can safely
* call set_fixmap(), and then again after parsing early parameters to
* honor the respective command line option.
* x86_configure_nx() is called before parse_early_param() to detect
* whether hardware doesn't support NX (so that the early EHCI debug
* console setup can safely call set_fixmap()). It may then be called
* again from within noexec_setup() during parsing early parameters
* to honor the respective command line option.
*/
check_efer();
#endif
x86_configure_nx();
parse_early_param();
#ifdef CONFIG_X86_64
check_efer();
#endif
x86_report_nx();
/* Must be before kernel pagetables are setup */
vmi_activate();
......@@ -901,6 +875,13 @@ void __init setup_arch(char **cmdline_p)
reserve_brk();
#ifdef CONFIG_ACPI_SLEEP
/*
* Reserve low memory region for sleep support.
* even before init_memory_mapping
*/
acpi_reserve_wakeup_memory();
#endif
init_gbpages();
/* max_pfn_mapped is updated here */
......@@ -927,6 +908,8 @@ void __init setup_arch(char **cmdline_p)
reserve_initrd();
reserve_crashkernel();
vsmp_init();
io_delay_init();
......@@ -938,27 +921,24 @@ void __init setup_arch(char **cmdline_p)
early_acpi_boot_init();
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
#ifdef CONFIG_ACPI_NUMA
/*
* Parse SRAT to discover nodes.
*/
acpi_numa_init();
acpi = acpi_numa_init();
#endif
initmem_init(0, max_pfn);
#ifdef CONFIG_ACPI_SLEEP
/*
* Reserve low memory region for sleep support.
*/
acpi_reserve_bootmem();
#ifdef CONFIG_K8_NUMA
if (!acpi)
k8 = !k8_numa_init(0, max_pfn);
#endif
/*
* Find and reserve possible boot-time SMP configuration:
*/
find_smp_config();
reserve_crashkernel();
initmem_init(0, max_pfn, acpi, k8);
#ifdef CONFIG_X86_64
/*
......
......@@ -817,10 +817,8 @@ static int __init uv_init_blade(int blade)
*/
apicid = blade_to_first_apicid(blade);
pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
if ((pa & 0xff) != UV_BAU_MESSAGE) {
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
((apicid << 32) | UV_BAU_MESSAGE));
}
return 0;
}
......
......@@ -197,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
apic_version[m->apicid] = ver;
}
static void __init visws_find_smp_config(unsigned int reserve)
static void __init visws_find_smp_config(void)
{
struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
......
......@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
jiffies_64 = jiffies;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
/*
* On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
* we retain large page mappings for boundaries spanning kernel text, rodata
* and data sections.
*
* However, kernel identity mappings will have different RWX permissions
* to the pages mapping to text and to the pages padding (which are freed) the
* text section. Hence kernel identity mappings will be broken to smaller
* pages. For 64-bit, kernel text and kernel identity mappings are different,
* so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
* as well as retain 2MB large page mappings for kernel text.
*/
#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
#define X64_ALIGN_DEBUG_RODATA_END \
. = ALIGN(HPAGE_SIZE); \
__end_rodata_hpage_align = .;
#else
#define X64_ALIGN_DEBUG_RODATA_BEGIN
#define X64_ALIGN_DEBUG_RODATA_END
#endif
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
data PT_LOAD FLAGS(7); /* RWE */
......@@ -90,7 +116,9 @@ SECTIONS
EXCEPTION_TABLE(16) :text = 0x9090
X64_ALIGN_DEBUG_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
X64_ALIGN_DEBUG_RODATA_END
/* Data */
.data : AT(ADDR(.data) - LOAD_OFFSET) {
......@@ -107,13 +135,13 @@ SECTIONS
PAGE_ALIGNED_DATA(PAGE_SIZE)
CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
DATA_DATA
CONSTRUCTORS
/* rarely changed data like cpu maps */
READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
/* End of data section */
_edata = .;
......@@ -137,12 +165,12 @@ SECTIONS
*(.vsyscall_0)
} :user
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
. = ALIGN(L1_CACHE_BYTES);
.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
*(.vsyscall_fn)
}
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
. = ALIGN(L1_CACHE_BYTES);
.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
*(.vsyscall_gtod_data)
}
......@@ -166,7 +194,7 @@ SECTIONS
}
vgetcpu_mode = VVIRT(.vgetcpu_mode);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
. = ALIGN(L1_CACHE_BYTES);
.jiffies : AT(VLOAD(.jiffies)) {
*(.jiffies)
}
......
......@@ -13,6 +13,7 @@
#include <asm/e820.h>
#include <asm/time.h>
#include <asm/irq.h>
#include <asm/pat.h>
#include <asm/tsc.h>
#include <asm/iommu.h>
......@@ -80,4 +81,5 @@ struct x86_platform_ops x86_platform = {
.get_wallclock = mach_get_cmos_time,
.set_wallclock = mach_set_rtc_mmss,
.iommu_shutdown = iommu_shutdown_noop,
.is_untracked_pat_range = is_ISA_range,
};
......@@ -146,10 +146,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
use_gbpages = direct_gbpages;
#endif
set_nx();
if (nx_enabled)
printk(KERN_INFO "NX (Execute Disable) protection: active\n");
/* Enable PSE if available */
if (cpu_has_pse)
set_in_cr4(X86_CR4_PSE);
......
......@@ -412,7 +412,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pkmap_page_table = pte;
}
static void __init add_one_highpage_init(struct page *page, int pfn)
static void __init add_one_highpage_init(struct page *page)
{
ClearPageReserved(page);
init_page_count(page);
......@@ -445,7 +445,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
if (!pfn_valid(node_pfn))
continue;
page = pfn_to_page(node_pfn);
add_one_highpage_init(page, node_pfn);
add_one_highpage_init(page);
}
return 0;
......@@ -703,8 +703,8 @@ void __init find_low_pfn_range(void)
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
void __init initmem_init(unsigned long start_pfn,
unsigned long end_pfn)
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
int acpi, int k8)
{
#ifdef CONFIG_HIGHMEM
highstart_pfn = highend_pfn = max_pfn;
......@@ -997,7 +997,7 @@ static noinline int do_test_wp_bit(void)
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
static int kernel_set_to_readonly;
int kernel_set_to_readonly __read_mostly;
void set_kernel_text_rw(void)
{
......
......@@ -568,7 +568,8 @@ kernel_physical_mapping_init(unsigned long start,
}
#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
int acpi, int k8)
{
unsigned long bootmap_size, bootmap;
......@@ -694,12 +695,12 @@ void __init mem_init(void)
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);
static int kernel_set_to_readonly;
int kernel_set_to_readonly;
void set_kernel_text_rw(void)
{
unsigned long start = PFN_ALIGN(_stext);
unsigned long end = PFN_ALIGN(__start_rodata);
unsigned long start = PFN_ALIGN(_text);
unsigned long end = PFN_ALIGN(__stop___ex_table);
if (!kernel_set_to_readonly)
return;
......@@ -707,13 +708,18 @@ void set_kernel_text_rw(void)
pr_debug("Set kernel text: %lx - %lx for read write\n",
start, end);
/*
* Make the kernel identity mapping for text RW. Kernel text
* mapping will always be RO. Refer to the comment in
* static_protections() in pageattr.c
*/
set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}
void set_kernel_text_ro(void)
{
unsigned long start = PFN_ALIGN(_stext);
unsigned long end = PFN_ALIGN(__start_rodata);
unsigned long start = PFN_ALIGN(_text);
unsigned long end = PFN_ALIGN(__stop___ex_table);
if (!kernel_set_to_readonly)
return;
......@@ -721,14 +727,21 @@ void set_kernel_text_ro(void)
pr_debug("Set kernel text: %lx - %lx for read only\n",
start, end);
/*
* Set the kernel identity mapping for text RO.
*/
set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
unsigned long start = PFN_ALIGN(_text);
unsigned long rodata_start =
((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
unsigned long end = (unsigned long) &__end_rodata_hpage_align;
unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
unsigned long data_start = (unsigned long) &_sdata;
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
......@@ -751,6 +764,14 @@ void mark_rodata_ro(void)
printk(KERN_INFO "Testing CPA: again\n");
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
free_init_pages("unused kernel memory",
(unsigned long) page_address(virt_to_page(text_end)),
(unsigned long)
page_address(virt_to_page(rodata_start)));
free_init_pages("unused kernel memory",
(unsigned long) page_address(virt_to_page(rodata_end)),
(unsigned long) page_address(virt_to_page(data_start)));
}
#endif
......
......@@ -24,6 +24,9 @@
#include <asm/apic.h>
#include <asm/k8.h>
static struct bootnode __initdata nodes[8];
static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
static __init int find_northbridge(void)
{
int num;
......@@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void)
* need to get boot_cpu_id so can use that to create apicid_to_node
* in k8_scan_nodes()
*/
/*
* Find possible boot-time SMP configuration:
*/
#ifdef CONFIG_X86_MPPARSE
early_find_smp_config();
#endif
#ifdef CONFIG_ACPI
/*
* Read APIC information from ACPI tables.
*/
early_acpi_boot_init();
#endif
#ifdef CONFIG_X86_MPPARSE
/*
* get boot-time SMP configuration:
......@@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void)
early_init_lapic_mapping();
}
int __init k8_scan_nodes(unsigned long start, unsigned long end)
int __init k8_get_nodes(struct bootnode *physnodes)
{
unsigned numnodes, cores, bits, apicid_base;
int i;
int ret = 0;
for_each_node_mask(i, nodes_parsed) {
physnodes[ret].start = nodes[i].start;
physnodes[ret].end = nodes[i].end;
ret++;
}
return ret;
}
int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long start = PFN_PHYS(start_pfn);
unsigned long end = PFN_PHYS(end_pfn);
unsigned numnodes;
unsigned long prevbase;
struct bootnode nodes[8];
int i, j, nb, found = 0;
int i, nb, found = 0;
u32 nodeid, reg;
if (!early_pci_allowed())
......@@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
if (nb < 0)
return nb;
printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb);
pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
reg = read_pci_config(0, nb, 0, 0x60);
numnodes = ((reg >> 4) & 0xF) + 1;
if (numnodes <= 1)
return -1;
printk(KERN_INFO "Number of nodes %d\n", numnodes);
pr_info("Number of physical nodes %d\n", numnodes);
memset(&nodes, 0, sizeof(nodes));
prevbase = 0;
for (i = 0; i < 8; i++) {
unsigned long base, limit;
......@@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
nodeid = limit & 7;
if ((base & 3) == 0) {
if (i < numnodes)
printk("Skipping disabled node %d\n", i);
pr_info("Skipping disabled node %d\n", i);
continue;
}
if (nodeid >= numnodes) {
printk("Ignoring excess node %d (%lx:%lx)\n", nodeid,
base, limit);
pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
base, limit);
continue;
}
if (!limit) {
printk(KERN_INFO "Skipping node entry %d (base %lx)\n",
i, base);
pr_info("Skipping node entry %d (base %lx)\n",
i, base);
continue;
}
if ((base >> 8) & 3 || (limit >> 8) & 3) {
printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n",
nodeid, (base>>8)&3, (limit>>8) & 3);
pr_err("Node %d using interleaving mode %lx/%lx\n",
nodeid, (base >> 8) & 3, (limit >> 8) & 3);
return -1;
}
if (node_isset(nodeid, node_possible_map)) {
printk(KERN_INFO "Node %d already present. Skipping\n",
nodeid);
if (node_isset(nodeid, nodes_parsed)) {
pr_info("Node %d already present, skipping\n",
nodeid);
continue;
}
......@@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
limit |= (1<<24)-1;
limit++;
if (limit > max_pfn << PAGE_SHIFT)
limit = max_pfn << PAGE_SHIFT;
if (limit > end)
limit = end;
if (limit <= base)
continue;
......@@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
if (limit > end)
limit = end;
if (limit == base) {
printk(KERN_ERR "Empty node %d\n", nodeid);
pr_err("Empty node %d\n", nodeid);
continue;
}
if (limit < base) {
printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n",
pr_err("Node %d bogus settings %lx-%lx.\n",
nodeid, base, limit);
continue;
}
/* Could sort here, but pun for now. Should not happen anyroads. */
if (prevbase > base) {
printk(KERN_ERR "Node map not sorted %lx,%lx\n",
pr_err("Node map not sorted %lx,%lx\n",
prevbase, base);
return -1;
}
printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n",
nodeid, base, limit);
pr_info("Node %d MemBase %016lx Limit %016lx\n",
nodeid, base, limit);
found++;
......@@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
prevbase = base;
node_set(nodeid, node_possible_map);
node_set(nodeid, nodes_parsed);
}
if (!found)
return -1;
return 0;
}
int __init k8_scan_nodes(void)
{
unsigned int bits;
unsigned int cores;
unsigned int apicid_base;
int i;
BUG_ON(nodes_empty(nodes_parsed));
node_possible_map = nodes_parsed;
memnode_shift = compute_hash_shift(nodes, 8, NULL);
if (memnode_shift < 0) {
printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
pr_err("No NUMA node hash function found. Contact maintainer\n");
return -1;
}
printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift);
pr_info("Using node hash shift of %d\n", memnode_shift);
/* use the coreid bits from early_identify_cpu */
bits = boot_cpu_data.x86_coreid_bits;
......@@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
/* need to get boot_cpu_id early for system with apicid lifting */
early_get_boot_cpu_id();
if (boot_cpu_physical_apicid > 0) {
printk(KERN_INFO "BSP APIC ID: %02x\n",
boot_cpu_physical_apicid);
pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
apicid_base = boot_cpu_physical_apicid;
}
for (i = 0; i < 8; i++) {
if (nodes[i].start == nodes[i].end)
continue;
for_each_node_mask(i, node_possible_map) {
int j;
e820_register_active_regions(i,
nodes[i].start >> PAGE_SHIFT,
......
......@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
(ulong) node_remap_end_vaddr[nid]);
}
void __init initmem_init(unsigned long start_pfn,
unsigned long end_pfn)
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
int acpi, int k8)
{
int nid;
long kva_target_pfn;
......
......@@ -239,8 +239,14 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
bootmap = early_node_mem(nodeid, bootmap_start, end,
bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
if (bootmap == NULL) {
if (nodedata_phys < start || nodedata_phys >= end)
free_bootmem(nodedata_phys, pgdat_size);
if (nodedata_phys < start || nodedata_phys >= end) {
/*
* only need to free it if it is from other node
* bootmem
*/
if (nid != nodeid)
free_bootmem(nodedata_phys, pgdat_size);
}
node_data[nodeid] = NULL;
return;
}
......@@ -306,8 +312,71 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode physnodes[MAX_NUMNODES] __initdata;
static char *cmdline __initdata;
static int __init setup_physnodes(unsigned long start, unsigned long end,
int acpi, int k8)
{
int nr_nodes = 0;
int ret = 0;
int i;
#ifdef CONFIG_ACPI_NUMA
if (acpi)
nr_nodes = acpi_get_nodes(physnodes);
#endif
#ifdef CONFIG_K8_NUMA
if (k8)
nr_nodes = k8_get_nodes(physnodes);
#endif
/*
* Basic sanity checking on the physical node map: there may be errors
* if the SRAT or K8 incorrectly reported the topology or the mem=
* kernel parameter is used.
*/
for (i = 0; i < nr_nodes; i++) {
if (physnodes[i].start == physnodes[i].end)
continue;
if (physnodes[i].start > end) {
physnodes[i].end = physnodes[i].start;
continue;
}
if (physnodes[i].end < start) {
physnodes[i].start = physnodes[i].end;
continue;
}
if (physnodes[i].start < start)
physnodes[i].start = start;
if (physnodes[i].end > end)
physnodes[i].end = end;
}
/*
* Remove all nodes that have no memory or were truncated because of the
* limited address range.
*/
for (i = 0; i < nr_nodes; i++) {
if (physnodes[i].start == physnodes[i].end)
continue;
physnodes[ret].start = physnodes[i].start;
physnodes[ret].end = physnodes[i].end;
ret++;
}
/*
* If no physical topology was detected, a single node is faked to cover
* the entire address space.
*/
if (!ret) {
physnodes[ret].start = start;
physnodes[ret].end = end;
ret = 1;
}
return ret;
}
/*
* Setups up nid to range from addr to addr + size. If the end
* boundary is greater than max_addr, then max_addr is used instead.
......@@ -315,11 +384,9 @@ static char *cmdline __initdata;
* allocation past addr and -1 otherwise. addr is adjusted to be at
* the end of the node.
*/
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
u64 size, u64 max_addr)
static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
{
int ret = 0;
nodes[nid].start = *addr;
*addr += size;
if (*addr >= max_addr) {
......@@ -334,13 +401,112 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
return ret;
}
/*
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
* to max_addr. The return value is the number of nodes allocated.
*/
static int __init split_nodes_interleave(u64 addr, u64 max_addr,
int nr_phys_nodes, int nr_nodes)
{
nodemask_t physnode_mask = NODE_MASK_NONE;
u64 size;
int big;
int ret = 0;
int i;
if (nr_nodes <= 0)
return -1;
if (nr_nodes > MAX_NUMNODES) {
pr_info("numa=fake=%d too large, reducing to %d\n",
nr_nodes, MAX_NUMNODES);
nr_nodes = MAX_NUMNODES;
}
size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
/*
* Calculate the number of big nodes that can be allocated as a result
* of consolidating the remainder.
*/
big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) /
FAKE_NODE_MIN_SIZE;
size &= FAKE_NODE_MIN_HASH_MASK;
if (!size) {
pr_err("Not enough memory for each node. "
"NUMA emulation disabled.\n");
return -1;
}
for (i = 0; i < nr_phys_nodes; i++)
if (physnodes[i].start != physnodes[i].end)
node_set(i, physnode_mask);
/*
* Continue to fill physical nodes with fake nodes until there is no
* memory left on any of them.
*/
while (nodes_weight(physnode_mask)) {
for_each_node_mask(i, physnode_mask) {
u64 end = physnodes[i].start + size;
u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
if (ret < big)
end += FAKE_NODE_MIN_SIZE;
/*
* Continue to add memory to this fake node if its
* non-reserved memory is less than the per-node size.
*/
while (end - physnodes[i].start -
e820_hole_size(physnodes[i].start, end) < size) {
end += FAKE_NODE_MIN_SIZE;
if (end > physnodes[i].end) {
end = physnodes[i].end;
break;
}
}
/*
* If there won't be at least FAKE_NODE_MIN_SIZE of
* non-reserved memory in ZONE_DMA32 for the next node,
* this one must extend to the boundary.
*/
if (end < dma32_end && dma32_end - end -
e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
end = dma32_end;
/*
* If there won't be enough non-reserved memory for the
* next node, this one must extend to the end of the
* physical node.
*/
if (physnodes[i].end - end -
e820_hole_size(end, physnodes[i].end) < size)
end = physnodes[i].end;
/*
* Avoid allocating more nodes than requested, which can
* happen as a result of rounding down each node's size
* to FAKE_NODE_MIN_SIZE.
*/
if (nodes_weight(physnode_mask) + ret >= nr_nodes)
end = physnodes[i].end;
if (setup_node_range(ret++, &physnodes[i].start,
end - physnodes[i].start,
physnodes[i].end) < 0)
node_clear(i, physnode_mask);
}
}
return ret;
}
/*
* Splits num_nodes nodes up equally starting at node_start. The return value
* is the number of nodes split up and addr is adjusted to be at the end of the
* last node allocated.
*/
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
u64 max_addr, int node_start,
static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start,
int num_nodes)
{
unsigned int big;
......@@ -388,7 +554,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
break;
}
}
if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
if (setup_node_range(i, addr, end - *addr, max_addr) < 0)
break;
}
return i - node_start + 1;
......@@ -399,12 +565,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
* always assigned to a final node and can be asymmetric. Returns the number of
* nodes split.
*/
static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
u64 max_addr, int node_start, u64 size)
static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,
u64 size)
{
int i = node_start;
size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
while (!setup_node_range(i++, nodes, addr, size, max_addr))
while (!setup_node_range(i++, addr, size, max_addr))
;
return i - node_start;
}
......@@ -413,15 +579,15 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
* Sets up the system RAM area from start_pfn to last_pfn according to the
* numa=fake command-line option.
*/
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
static int __init numa_emulation(unsigned long start_pfn,
unsigned long last_pfn, int acpi, int k8)
{
u64 size, addr = start_pfn << PAGE_SHIFT;
u64 max_addr = last_pfn << PAGE_SHIFT;
int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
int num_phys_nodes;
memset(&nodes, 0, sizeof(nodes));
num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
/*
* If the numa=fake command-line is just a single number N, split the
* system RAM into N fake nodes.
......@@ -429,7 +595,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
long n = simple_strtol(cmdline, NULL, 0);
num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n);
num_nodes = split_nodes_interleave(addr, max_addr,
num_phys_nodes, n);
if (num_nodes < 0)
return num_nodes;
goto out;
......@@ -456,8 +623,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
if (size)
for (i = 0; i < coeff; i++, num_nodes++)
if (setup_node_range(num_nodes, nodes,
&addr, size, max_addr) < 0)
if (setup_node_range(num_nodes, &addr,
size, max_addr) < 0)
goto done;
if (!*cmdline)
break;
......@@ -473,7 +640,7 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
if (addr < max_addr) {
if (coeff_flag && coeff < 0) {
/* Split remaining nodes into num-sized chunks */
num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
num_nodes += split_nodes_by_size(&addr, max_addr,
num_nodes, num);
goto out;
}
......@@ -482,7 +649,7 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
/* Split remaining nodes into coeff chunks */
if (coeff <= 0)
break;
num_nodes += split_nodes_equally(nodes, &addr, max_addr,
num_nodes += split_nodes_equally(&addr, max_addr,
num_nodes, coeff);
break;
case ',':
......@@ -490,8 +657,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
break;
default:
/* Give one final node */
setup_node_range(num_nodes, nodes, &addr,
max_addr - addr, max_addr);
setup_node_range(num_nodes, &addr, max_addr - addr,
max_addr);
num_nodes++;
}
}
......@@ -505,14 +672,10 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
}
/*
* We need to vacate all active ranges that may have been registered by
* SRAT and set acpi_numa to -1 so that srat_disabled() always returns
* true. NUMA emulation has succeeded so we will not scan ACPI nodes.
* We need to vacate all active ranges that may have been registered for
* the e820 memory map.
*/
remove_all_active_ranges();
#ifdef CONFIG_ACPI_NUMA
acpi_numa = -1;
#endif
for_each_node_mask(i, node_possible_map) {
e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
......@@ -524,7 +687,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn
}
#endif /* CONFIG_NUMA_EMU */
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
int acpi, int k8)
{
int i;
......@@ -532,23 +696,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
if (cmdline && !numa_emulation(start_pfn, last_pfn))
if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
#ifdef CONFIG_ACPI_NUMA
if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
last_pfn << PAGE_SHIFT))
if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
last_pfn << PAGE_SHIFT))
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
#ifdef CONFIG_K8_NUMA
if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
last_pfn<<PAGE_SHIFT))
if (!numa_off && k8 && !k8_scan_nodes())
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
......@@ -601,6 +764,25 @@ static __init int numa_setup(char *opt)
early_param("numa", numa_setup);
#ifdef CONFIG_NUMA
static __init int find_near_online_node(int node)
{
int n, val;
int min_val = INT_MAX;
int best_node = -1;
for_each_online_node(n) {
val = node_distance(node, n);
if (val < min_val) {
min_val = val;
best_node = n;
}
}
return best_node;
}
/*
* Setup early cpu_to_node.
*
......@@ -632,7 +814,7 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;
if (!node_online(node))
continue;
node = find_near_online_node(node);
numa_set_node(cpu, node);
}
}
......
......@@ -279,6 +279,22 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
__pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
/*
* Once the kernel maps the text as RO (kernel_set_to_readonly is set),
* kernel text mappings for the large page aligned text, rodata sections
* will be always read-only. For the kernel identity mappings covering
* the holes caused by this alignment can be anything that user asks.
*
* This will preserve the large page mappings for kernel text/data
* at no extra cost.
*/
if (kernel_set_to_readonly &&
within(address, (unsigned long)_text,
(unsigned long)__end_rodata_hpage_align))
pgprot_val(forbidden) |= _PAGE_RW;
#endif
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
return prot;
......@@ -1069,12 +1085,18 @@ EXPORT_SYMBOL(set_memory_array_wb);
int set_memory_x(unsigned long addr, int numpages)
{
if (!(__supported_pte_mask & _PAGE_NX))
return 0;
return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_x);
int set_memory_nx(unsigned long addr, int numpages)
{
if (!(__supported_pte_mask & _PAGE_NX))
return 0;
return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
}
EXPORT_SYMBOL(set_memory_nx);
......
......@@ -20,6 +20,7 @@
#include <asm/cacheflush.h>
#include <asm/processor.h>
#include <asm/tlbflush.h>
#include <asm/x86_init.h>
#include <asm/pgtable.h>
#include <asm/fcntl.h>
#include <asm/e820.h>
......@@ -388,7 +389,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
}
/* Low ISA region is always mapped WB in page table. No need to track */
if (is_ISA_range(start, end - 1)) {
if (x86_platform.is_untracked_pat_range(start, end)) {
if (new_type)
*new_type = _PAGE_CACHE_WB;
return 0;
......@@ -499,7 +500,7 @@ int free_memtype(u64 start, u64 end)
return 0;
/* Low ISA region is always mapped WB. No need to track */
if (is_ISA_range(start, end - 1))
if (x86_platform.is_untracked_pat_range(start, end))
return 0;
is_range_ram = pat_pagerange_is_ram(start, end);
......@@ -582,7 +583,7 @@ static unsigned long lookup_memtype(u64 paddr)
int rettype = _PAGE_CACHE_WB;
struct memtype *entry;
if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1))
if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
return rettype;
if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
......@@ -1018,8 +1019,10 @@ static const struct file_operations memtype_fops = {
static int __init pat_memtype_list_init(void)
{
debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
NULL, &memtype_fops);
if (pat_enabled) {
debugfs_create_file("pat_memtype_list", S_IRUSR,
arch_debugfs_dir, NULL, &memtype_fops);
}
return 0;
}
......
......@@ -3,10 +3,8 @@
#include <linux/init.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
int nx_enabled;
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
static int disable_nx __cpuinitdata;
/*
......@@ -22,48 +20,41 @@ static int __init noexec_setup(char *str)
if (!str)
return -EINVAL;
if (!strncmp(str, "on", 2)) {
__supported_pte_mask |= _PAGE_NX;
disable_nx = 0;
} else if (!strncmp(str, "off", 3)) {
disable_nx = 1;
__supported_pte_mask &= ~_PAGE_NX;
}
x86_configure_nx();
return 0;
}
early_param("noexec", noexec_setup);
#endif
#ifdef CONFIG_X86_PAE
void __init set_nx(void)
void __cpuinit x86_configure_nx(void)
{
unsigned int v[4], l, h;
if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
if (cpu_has_nx && !disable_nx)
__supported_pte_mask |= _PAGE_NX;
else
__supported_pte_mask &= ~_PAGE_NX;
}
if ((v[3] & (1 << 20)) && !disable_nx) {
rdmsr(MSR_EFER, l, h);
l |= EFER_NX;
wrmsr(MSR_EFER, l, h);
nx_enabled = 1;
__supported_pte_mask |= _PAGE_NX;
void __init x86_report_nx(void)
{
if (!cpu_has_nx) {
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
"missing in CPU or disabled in BIOS!\n");
} else {
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
if (disable_nx) {
printk(KERN_INFO "NX (Execute Disable) protection: "
"disabled by kernel command line option\n");
} else {
printk(KERN_INFO "NX (Execute Disable) protection: "
"active\n");
}
}
}
#else
void set_nx(void)
{
}
/* 32bit non-PAE kernel, NX cannot be used */
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
"cannot be enabled: non-PAE kernel!\n");
#endif
#ifdef CONFIG_X86_64
void __cpuinit check_efer(void)
{
unsigned long efer;
rdmsrl(MSR_EFER, efer);
if (!(efer & EFER_NX) || disable_nx)
__supported_pte_mask &= ~_PAGE_NX;
}
}
#endif
......@@ -290,8 +290,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
start, end);
e820_register_active_regions(node, start >> PAGE_SHIFT,
end >> PAGE_SHIFT);
if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
update_nodes_add(node, start, end);
......@@ -338,6 +336,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
void __init acpi_numa_arch_fixup(void) {}
int __init acpi_get_nodes(struct bootnode *physnodes)
{
int i;
int ret = 0;
for_each_node_mask(i, nodes_parsed) {
physnodes[ret].start = nodes[i].start;
physnodes[ret].end = nodes[i].end;
ret++;
}
return ret;
}
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
......@@ -350,11 +361,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
for (i = 0; i < MAX_NUMNODES; i++)
cutoff_node(i, start, end);
if (!nodes_cover_memory(nodes)) {
bad_srat();
return -1;
}
memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
memblk_nodeid);
if (memnode_shift < 0) {
......@@ -364,6 +370,14 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return -1;
}
for_each_node_mask(i, nodes_parsed)
e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
nodes[i].end >> PAGE_SHIFT);
if (!nodes_cover_memory(nodes)) {
bad_srat();
return -1;
}
/* Account for nodes with cpus and no memory */
nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
......@@ -454,7 +468,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
for (i = 0; i < num_nodes; i++)
if (fake_nodes[i].start != fake_nodes[i].end)
node_set(i, nodes_parsed);
WARN_ON(!nodes_cover_memory(fake_nodes));
}
static int null_slit_node_compare(int a, int b)
......
......@@ -8,6 +8,7 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
......@@ -43,7 +44,7 @@ union smp_flush_state {
spinlock_t tlbstate_lock;
DECLARE_BITMAP(flush_cpumask, NR_CPUS);
};
char pad[CONFIG_X86_INTERNODE_CACHE_BYTES];
char pad[INTERNODE_CACHE_BYTES];
} ____cacheline_internodealigned_in_smp;
/* State is put into the per CPU data section, but padded
......
......@@ -1093,10 +1093,8 @@ asmlinkage void __init xen_start_kernel(void)
__supported_pte_mask |= _PAGE_IOMAP;
#ifdef CONFIG_X86_64
/* Work out if we support NX */
check_efer();
#endif
x86_configure_nx();
xen_setup_features();
......
......@@ -283,22 +283,24 @@ acpi_table_parse_srat(enum acpi_srat_type id,
int __init acpi_numa_init(void)
{
int ret = 0;
/* SRAT: Static Resource Affinity Table */
if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) {
acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY,
acpi_parse_x2apic_affinity, NR_CPUS);
acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY,
acpi_parse_processor_affinity, NR_CPUS);
acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
acpi_parse_memory_affinity,
NR_NODE_MEMBLKS);
ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
acpi_parse_memory_affinity,
NR_NODE_MEMBLKS);
}
/* SLIT: System Locality Information Table */
acpi_table_parse(ACPI_SIG_SLIT, acpi_parse_slit);
acpi_numa_arch_fixup();
return 0;
return ret;
}
int acpi_get_pxm(acpi_handle h)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册