/*
 * handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/string.h>
#include <linux/reboot.h>
K
Ken'ichi Ohmichi 已提交
13
#include <linux/numa.h>
I
Ingo Molnar 已提交
14
#include <linux/ftrace.h>
15
#include <linux/io.h>
16
#include <linux/suspend.h>
I
Ingo Molnar 已提交
17

18 19 20
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
21

22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
/*
 * Install a single large-page identity mapping for the PMD-aligned
 * region that contains @addr, allocating any missing intermediate
 * table from the image's control pages.
 *
 * @image: kexec image whose control pages back new tables
 * @pgd:   base of the top-level page table to populate
 * @addr:  address to cover; rounded down with PMD_MASK
 *
 * Returns 0 on success, or -ENOMEM if a control page cannot be
 * allocated.
 */
static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
				unsigned long addr)
{
	struct page *table_page;
	pud_t *pud;
	pmd_t *pmd;

	addr &= PMD_MASK;

	/* Top level: hook in a zeroed pud table if the slot is empty. */
	pgd += pgd_index(addr);
	if (!pgd_present(*pgd)) {
		table_page = kimage_alloc_control_pages(image, 0);
		if (!table_page)
			return -ENOMEM;
		pud = (pud_t *)page_address(table_page);
		memset(pud, 0, PAGE_SIZE);
		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
	}

	/* Next level: hook in a zeroed pmd table if the slot is empty. */
	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud)) {
		table_page = kimage_alloc_control_pages(image, 0);
		if (!table_page)
			return -ENOMEM;
		pmd = (pmd_t *)page_address(table_page);
		memset(pmd, 0, PAGE_SIZE);
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	}

	/* Leave an existing entry alone; otherwise map addr onto itself. */
	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd))
		set_pmd(pmd, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));

	return 0;
}

57
/*
 * Fill a pmd table with large-page identity entries.
 *
 * Starting at the page-aligned @addr, write one entry per PMD_SIZE
 * step into consecutive slots of @level2p until PUD_SIZE bytes of
 * address space have been covered.
 */
static void init_level2_page(pmd_t *level2p, unsigned long addr)
{
	unsigned long limit;

	addr &= PAGE_MASK;
	limit = addr + PUD_SIZE;

	for (; addr < limit; addr += PMD_SIZE, level2p++)
		set_pmd(level2p, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
}

69
/*
 * Populate one pud table.
 *
 * For every PUD_SIZE step from the page-aligned @addr, up to
 * @last_addr or PGDIR_SIZE bytes (whichever comes first), allocate a
 * pmd table from the image's control pages, fill it with
 * init_level2_page() and link it into @level3p.  Slots for the rest
 * of the PGDIR_SIZE range are cleared.
 *
 * Returns 0 on success, or -ENOMEM if a control page cannot be
 * allocated.
 */
static int init_level3_page(struct kimage *image, pud_t *level3p,
				unsigned long addr, unsigned long last_addr)
{
	unsigned long limit;

	addr &= PAGE_MASK;
	limit = addr + PGDIR_SIZE;

	for (; addr < last_addr && addr < limit;
	     addr += PUD_SIZE, level3p++) {
		struct page *table_page;
		pmd_t *pmd_table;

		table_page = kimage_alloc_control_pages(image, 0);
		if (!table_page)
			return -ENOMEM;

		pmd_table = (pmd_t *)page_address(table_page);
		init_level2_page(pmd_table, addr);
		set_pud(level3p, __pud(__pa(pmd_table) | _KERNPG_TABLE));
	}

	/* clear the unused entries */
	for (; addr < limit; addr += PUD_SIZE, level3p++)
		pud_clear(level3p);

	return 0;
}


102
/*
 * Populate the top-level page table.
 *
 * For every PGDIR_SIZE step from the page-aligned @addr, up to
 * @last_addr or PTRS_PER_PGD entries (whichever comes first),
 * allocate a pud table from the image's control pages, fill it with
 * init_level3_page() and link it into @level4p.  The remaining
 * entries are cleared.
 *
 * Returns 0 on success, -ENOMEM if a control page cannot be
 * allocated, or the error returned by init_level3_page().
 */
static int init_level4_page(struct kimage *image, pgd_t *level4p,
				unsigned long addr, unsigned long last_addr)
{
	unsigned long limit;

	addr &= PAGE_MASK;
	limit = addr + (PTRS_PER_PGD * PGDIR_SIZE);

	for (; addr < last_addr && addr < limit;
	     addr += PGDIR_SIZE, level4p++) {
		struct page *table_page;
		pud_t *pud_table;
		int err;

		table_page = kimage_alloc_control_pages(image, 0);
		if (!table_page)
			return -ENOMEM;

		pud_table = (pud_t *)page_address(table_page);
		err = init_level3_page(image, pud_table, addr, last_addr);
		if (err)
			return err;

		set_pgd(level4p, __pgd(__pa(pud_table) | _KERNPG_TABLE));
	}

	/* clear the unused entries */
	for (; addr < limit; addr += PGDIR_SIZE, level4p++)
		pgd_clear(level4p);

	return 0;
}

136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
static void free_transition_pgtable(struct kimage *image)
{
	free_page((unsigned long)image->arch.pud);
	free_page((unsigned long)image->arch.pmd);
	free_page((unsigned long)image->arch.pte);
}

/*
 * Map the virtual address of relocate_kernel onto the second page of
 * the image's control code page (page_address(control_code_page) +
 * PAGE_SIZE) in the page table rooted at @pgd.
 *
 * Missing pud/pmd/pte tables are taken from get_zeroed_page() and
 * recorded in image->arch so that free_transition_pgtable() can
 * release them.
 *
 * Returns 0 on success.  On allocation failure the tables recorded so
 * far are released and -ENOMEM is returned.
 */
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
	unsigned long code_va = (unsigned long)relocate_kernel;
	unsigned long code_pa;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	code_pa = __pa(page_address(image->control_code_page)+PAGE_SIZE);

	pgd += pgd_index(code_va);
	if (!pgd_present(*pgd)) {
		pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
		if (!pud)
			goto fail;
		image->arch.pud = pud;
		set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
	}

	pud = pud_offset(pgd, code_va);
	if (!pud_present(*pud)) {
		pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
		if (!pmd)
			goto fail;
		image->arch.pmd = pmd;
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	}

	pmd = pmd_offset(pud, code_va);
	if (!pmd_present(*pmd)) {
		pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!pte)
			goto fail;
		image->arch.pte = pte;
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	}

	/* Final level: point the entry at the control code page. */
	pte = pte_offset_kernel(pmd, code_va);
	set_pte(pte, pfn_pte(code_pa >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
	return 0;

fail:
	free_transition_pgtable(image);
	return -ENOMEM;
}

185 186 187

static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
188
	pgd_t *level4p;
189
	int result;
190
	level4p = (pgd_t *)__va(start_pgtable);
191
	result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
192 193 194 195 196 197 198
	if (result)
		return result;
	/*
	 * image->start may be outside 0 ~ max_pfn, for example when
	 * jump back to original kernel from kexeced kernel
	 */
	result = init_one_level2_page(image, level4p, image->start);
199 200 201
	if (result)
		return result;
	return init_transition_pgtable(image, level4p);
202 203 204 205
}

/*
 * Load the interrupt descriptor table register.
 *
 * @newidt: address stored in the descriptor's address field
 * @limit:  value stored in the descriptor's size field
 *
 * Builds a struct desc_ptr on the stack and hands it to lidtq.
 */
static void set_idt(void *newidt, u16 limit)
{
	struct desc_ptr curidt;

	/* x86-64 supports unaligned loads & stores */
	curidt.size    = limit;
	curidt.address = (unsigned long)newidt;

	__asm__ __volatile__ (
		"lidtq %0\n"
		: : "m" (curidt)
		);
}


/*
 * Load the global descriptor table register.
 *
 * @newgdt: address stored in the descriptor's address field
 * @limit:  value stored in the descriptor's size field
 *
 * Builds a struct desc_ptr on the stack and hands it to lgdtq.
 */
static void set_gdt(void *newgdt, u16 limit)
{
	struct desc_ptr curgdt;

	/* x86-64 supports unaligned loads & stores */
	curgdt.size    = limit;
	curgdt.address = (unsigned long)newgdt;

	__asm__ __volatile__ (
		"lgdtq %0\n"
		: : "m" (curgdt)
		);
}

static void load_segments(void)
{
	__asm__ __volatile__ (
236 237 238 239 240
		"\tmovl %0,%%ds\n"
		"\tmovl %0,%%es\n"
		"\tmovl %0,%%ss\n"
		"\tmovl %0,%%fs\n"
		"\tmovl %0,%%gs\n"
M
Michael Matz 已提交
241
		: : "a" (__KERNEL_DS) : "memory"
242 243 244 245 246
		);
}

int machine_kexec_prepare(struct kimage *image)
{
247
	unsigned long start_pgtable;
248 249 250
	int result;

	/* Calculate the offsets */
M
Maneesh Soni 已提交
251
	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
252 253 254

	/* Setup the identity mapped 64bit page table */
	result = init_pgtable(image, start_pgtable);
M
Maneesh Soni 已提交
255
	if (result)
256 257 258 259 260 261 262
		return result;

	return 0;
}

/*
 * Undo machine_kexec_prepare()'s non-control-page allocations: release
 * the transition page-table pages recorded in image->arch.
 */
void machine_kexec_cleanup(struct kimage *image)
{
	free_transition_pgtable(image);
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
H
Huang Ying 已提交
270
void machine_kexec(struct kimage *image)
271
{
272 273
	unsigned long page_list[PAGES_NR];
	void *control_page;
274
	int save_ftrace_enabled;
275

276 277 278 279 280 281
#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context)
		save_processor_state();
#endif

	save_ftrace_enabled = __ftrace_enabled_save();
I
Ingo Molnar 已提交
282

283 284 285
	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();

286 287 288 289 290 291 292 293 294 295 296 297 298
	if (image->preserve_context) {
#ifdef CONFIG_X86_IO_APIC
		/*
		 * We need to put APICs in legacy mode so that we can
		 * get timer interrupts in second kernel. kexec/kdump
		 * paths already have calls to disable_IO_APIC() in
		 * one form or other. kexec jump path also need
		 * one.
		 */
		disable_IO_APIC();
#endif
	}

299
	control_page = page_address(image->control_code_page) + PAGE_SIZE;
300
	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
301

302
	page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
303
	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
304 305
	page_list[PA_TABLE_PAGE] =
	  (unsigned long)__pa(page_address(image->control_code_page));
306

307 308 309 310
	if (image->type == KEXEC_TYPE_DEFAULT)
		page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
						<< PAGE_SHIFT);

311 312
	/*
	 * The segment registers are funny things, they have both a
313 314 315 316
	 * visible and an invisible part.  Whenever the visible part is
	 * set to a specific selector, the invisible part is loaded
	 * with from a table in memory.  At no other time is the
	 * descriptor table in memory accessed.
317 318 319 320 321
	 *
	 * I take advantage of this here by force loading the
	 * segments, before I zap the gdt with an invalid value.
	 */
	load_segments();
322 323
	/*
	 * The gdt & idt are now invalid.
324 325
	 * If you want to load them you must set up your own idt & gdt.
	 */
326 327
	set_gdt(phys_to_virt(0), 0);
	set_idt(phys_to_virt(0), 0);
328

329
	/* now call it */
330 331 332 333 334 335 336 337 338 339 340
	image->start = relocate_kernel((unsigned long)image->head,
				       (unsigned long)page_list,
				       image->start,
				       image->preserve_context);

#ifdef CONFIG_KEXEC_JUMP
	if (kexec_image->preserve_context)
		restore_processor_state();
#endif

	__ftrace_enabled_restore(save_ftrace_enabled);
341
}
342

K
Ken'ichi Ohmichi 已提交
343 344
void arch_crash_save_vmcoreinfo(void)
{
345
	VMCOREINFO_SYMBOL(phys_base);
346
	VMCOREINFO_SYMBOL(init_level4_pgt);
347 348 349 350 351

#ifdef CONFIG_NUMA
	VMCOREINFO_SYMBOL(node_data);
	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif
K
Ken'ichi Ohmichi 已提交
352 353
}