/*
 * machine_kexec.c - handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/reboot.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
17 18

/*
 * Fill a level-2 (PMD) table with entries for the PUD_SIZE bytes that
 * start at @addr, rounded down to a page boundary.
 *
 * Each successive entry is written with the current address or'ed with
 * __PAGE_KERNEL_LARGE_EXEC, advancing PMD_SIZE per entry.
 *
 * @level2p: first entry of the table to populate
 * @addr:    start address of the range to map
 */
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
	unsigned long cur, limit;

	cur = addr & PAGE_MASK;
	limit = cur + PUD_SIZE;

	for (; cur < limit; cur += PMD_SIZE, level2p++)
		set_pmd(level2p, __pmd(cur | __PAGE_KERNEL_LARGE_EXEC));
}

30
/*
 * Populate a level-3 (PUD) table for the PGDIR_SIZE bytes that start at
 * @addr, rounded down to a page boundary.
 *
 * For every PUD_SIZE step that lies below both @last_addr and the end of
 * that range, a control page is allocated for @image, filled in by
 * init_level2_page() and linked into the table.  All remaining entries up
 * to the end of the range are cleared.
 *
 * Returns 0 on success, or -ENOMEM when a control page cannot be
 * allocated; in that case the entries not yet reached are left untouched.
 */
static int init_level3_page(struct kimage *image, pud_t *level3p,
				unsigned long addr, unsigned long last_addr)
{
	unsigned long end_addr;

	addr &= PAGE_MASK;
	end_addr = addr + PGDIR_SIZE;

	for (; addr < last_addr && addr < end_addr; addr += PUD_SIZE) {
		struct page *page = kimage_alloc_control_pages(image, 0);
		pmd_t *level2p;

		if (!page)
			return -ENOMEM;

		level2p = (pmd_t *)page_address(page);
		init_level2_page(level2p, addr);
		set_pud(level3p++, __pud(__pa(level2p) | _KERNPG_TABLE));
	}

	/* clear the unused entries */
	for (; addr < end_addr; addr += PUD_SIZE)
		pud_clear(level3p++);

	return 0;
}


63
/*
 * Populate the top-level (PGD) table for the PTRS_PER_PGD * PGDIR_SIZE
 * bytes that start at @addr, rounded down to a page boundary.
 *
 * For every PGDIR_SIZE step that lies below both @last_addr and the end of
 * that range, a control page is allocated for @image, filled in by
 * init_level3_page() and linked into the table.  All remaining entries up
 * to the end of the range are cleared.
 *
 * Returns 0 on success, -ENOMEM when a control page cannot be allocated,
 * or whatever error init_level3_page() reported; on failure the entries
 * not yet reached are left untouched.
 */
static int init_level4_page(struct kimage *image, pgd_t *level4p,
				unsigned long addr, unsigned long last_addr)
{
	unsigned long end_addr;

	addr &= PAGE_MASK;
	end_addr = addr + (PTRS_PER_PGD * PGDIR_SIZE);

	for (; addr < last_addr && addr < end_addr; addr += PGDIR_SIZE) {
		struct page *page = kimage_alloc_control_pages(image, 0);
		pud_t *level3p;
		int err;

		if (!page)
			return -ENOMEM;

		level3p = (pud_t *)page_address(page);
		err = init_level3_page(image, level3p, addr, last_addr);
		if (err)
			return err;

		set_pgd(level4p++, __pgd(__pa(level3p) | _KERNPG_TABLE));
	}

	/* clear the unused entries */
	for (; addr < end_addr; addr += PGDIR_SIZE)
		pgd_clear(level4p++);

	return 0;
}


static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
101 102
	pgd_t *level4p;
	level4p = (pgd_t *)__va(start_pgtable);
M
Maneesh Soni 已提交
103
 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
104 105 106 107
}

/*
 * Load the interrupt descriptor table register.
 *
 * @newidt: address to store in the descriptor's address field
 * @limit:  value to store in the descriptor's size field
 */
static void set_idt(void *newidt, u16 limit)
{
	struct desc_ptr curidt;

	/* x86-64 supports unaligned loads & stores */
	curidt.size    = limit;
	curidt.address = (unsigned long)newidt;

	__asm__ __volatile__ (
		"lidtq %0\n"
		: : "m" (curidt)
		);
}


/*
 * Load the global descriptor table register.
 *
 * @newgdt: address to store in the descriptor's address field
 * @limit:  value to store in the descriptor's size field
 */
static void set_gdt(void *newgdt, u16 limit)
{
	struct desc_ptr curgdt;

	/* x86-64 supports unaligned loads & stores */
	curgdt.size    = limit;
	curgdt.address = (unsigned long)newgdt;

	__asm__ __volatile__ (
		"lgdtq %0\n"
		: : "m" (curgdt)
		);
}

static void load_segments(void)
{
	__asm__ __volatile__ (
138 139 140 141 142
		"\tmovl %0,%%ds\n"
		"\tmovl %0,%%es\n"
		"\tmovl %0,%%ss\n"
		"\tmovl %0,%%fs\n"
		"\tmovl %0,%%gs\n"
M
Michael Matz 已提交
143
		: : "a" (__KERNEL_DS) : "memory"
144 145 146
		);
}

M
Maneesh Soni 已提交
147 148 149 150
/*
 * Type of the relocation entry point that machine_kexec() calls through.
 * It is declared with NORET_TYPE / ATTRIB_NORET, i.e. as never returning.
 */
typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
					unsigned long control_code_buffer,
					unsigned long start_address,
					unsigned long pgtable) ATTRIB_NORET;
151

T
Tobias Klauser 已提交
152 153
/*
 * Relocation code and its length in bytes, both defined elsewhere;
 * machine_kexec_prepare() copies it into the control code buffer.
 */
extern const unsigned char relocate_new_kernel[];
extern const unsigned long relocate_new_kernel_size;
154 155 156 157 158 159 160

int machine_kexec_prepare(struct kimage *image)
{
	unsigned long start_pgtable, control_code_buffer;
	int result;

	/* Calculate the offsets */
M
Maneesh Soni 已提交
161
	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
162
	control_code_buffer = start_pgtable + PAGE_SIZE;
163 164 165

	/* Setup the identity mapped 64bit page table */
	result = init_pgtable(image, start_pgtable);
M
Maneesh Soni 已提交
166
	if (result)
167 168 169
		return result;

	/* Place the code in the reboot code buffer */
M
Maneesh Soni 已提交
170 171
	memcpy(__va(control_code_buffer), relocate_new_kernel,
						relocate_new_kernel_size);
172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195

	return 0;
}

/* Counterpart of machine_kexec_prepare(); performs no work here. */
void machine_kexec_cleanup(struct kimage *image)
{
	/* Intentionally empty. */
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
NORET_TYPE void machine_kexec(struct kimage *image)
{
	unsigned long page_list;
	unsigned long control_code_buffer;
	unsigned long start_pgtable;
	relocate_new_kernel_t rnk;

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();

	/* Calculate the offsets */
M
Maneesh Soni 已提交
196 197
	page_list = image->head;
	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
198
	control_code_buffer = start_pgtable + PAGE_SIZE;
199 200 201 202 203 204 205 206 207 208 209

	/* Set the low half of the page table to my identity mapped
	 * page table for kexec.  Leave the high half pointing at the
	 * kernel pages.   Don't bother to flush the global pages
	 * as that will happen when I fully switch to my identity mapped
	 * page table anyway.
	 */
	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
	__flush_tlb();


210 211 212 213 214
	/* The segment registers are funny things, they have both a
	 * visible and an invisible part.  Whenever the visible part is
	 * set to a specific selector, the invisible part is loaded
	 * with from a table in memory.  At no other time is the
	 * descriptor table in memory accessed.
215 216 217 218 219 220 221 222 223 224 225 226 227 228
	 *
	 * I take advantage of this here by force loading the
	 * segments, before I zap the gdt with an invalid value.
	 */
	load_segments();
	/* The gdt & idt are now invalid.
	 * If you want to load them you must set up your own idt & gdt.
	 */
	set_gdt(phys_to_virt(0),0);
	set_idt(phys_to_virt(0),0);
	/* now call it */
	rnk = (relocate_new_kernel_t) control_code_buffer;
	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
}
229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256

/*
 * crashkernel=size@addr specifies the location to reserve for
 * a crash kernel.  By reserving this memory we guarantee
 * that Linux never sets it up as a DMA target.
 * Useful for holding code to do something appropriate
 * after a kernel panic.
 *
 * Returns 0 when the option was accepted, or -EINVAL when the argument
 * is missing, the size is absent or zero, the address after '@' is
 * absent, or the resulting range would wrap around.  A size with no
 * "@addr" part is accepted but leaves crashk_res unchanged.
 */
static int __init setup_crashkernel(char *arg)
{
	unsigned long size, base;
	char *p, *base_str;

	if (!arg)
		return -EINVAL;

	size = memparse(arg, &p);
	/* A zero size would yield end == base - 1, i.e. end < start */
	if (arg == p || size == 0)
		return -EINVAL;

	/* No explicit address given: nothing is recorded */
	if (*p != '@')
		return 0;

	base_str = p + 1;
	base = memparse(base_str, &p);
	/* "size@" with nothing parsable after '@' must not mean base 0 */
	if (p == base_str)
		return -EINVAL;
	/* Reject ranges whose inclusive end wraps past the address space */
	if (base + size - 1 < base)
		return -EINVAL;

	/* FIXME: Do I want a sanity check to validate the
	 * memory range?  Yes you do, but it's too early for
	 * e820 -AK */
	crashk_res.start = base;
	crashk_res.end   = base + size - 1;

	return 0;
}
early_param("crashkernel", setup_crashkernel);