machine_kexec.c 6.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
/*
 * machine_kexec.c - handle transition of Linux booting another kernel
 * Copyright (C) 2002-2005 Eric Biederman  <ebiederm@xmission.com>
 *
 * This source code is licensed under the GNU General Public License,
 * Version 2.  See the file COPYING for more details.
 */

#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/string.h>
#include <linux/reboot.h>
#include <asm/pda.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io.h>
#include <asm/apic.h>
#include <asm/cpufeature.h>
#include <asm/hw_irq.h>

/*
 * Address-range sizes used while building the kexec page table.
 * The builders in this file advance the mapped address by LEVEL1_SIZE for
 * each level-2 entry, by LEVEL2_SIZE for each level-3 entry and by
 * LEVEL3_SIZE for each level-4 entry; LEVELn_SIZE is also the total range
 * covered by one fully populated level-n table.
 */
#define LEVEL0_SIZE (1UL << 12UL)
#define LEVEL1_SIZE (1UL << 21UL)
#define LEVEL2_SIZE (1UL << 30UL)
#define LEVEL3_SIZE (1UL << 39UL)
#define LEVEL4_SIZE (1UL << 48UL)

/*
 * Flag sets OR'ed into the page-table entries.
 * L1_ATTR is the only set carrying _PAGE_PSE; it is applied to the entries
 * written by init_level2_page().  L2_ATTR and L3_ATTR are applied to the
 * entries that point at level-2 and level-3 tables respectively.
 */
#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)

M
Maneesh Soni 已提交
35
/*
 * Fill one level-2 table with entries flagged L1_ATTR.
 *
 * @level2p: first entry of the table to fill
 * @addr:    start of the range to map; rounded down with PAGE_MASK
 *
 * One entry is written per LEVEL1_SIZE step until LEVEL2_SIZE bytes past
 * the rounded start have been covered.  Each entry holds the address it
 * maps OR'ed with L1_ATTR.
 */
static void init_level2_page(u64 *level2p, unsigned long addr)
{
	unsigned long limit;

	addr &= PAGE_MASK;
	limit = addr + LEVEL2_SIZE;

	for (; addr < limit; addr += LEVEL1_SIZE)
		*level2p++ = addr | L1_ATTR;
}

M
Maneesh Soni 已提交
47 48
/*
 * Populate one level-3 table.
 *
 * @image:     kexec image the level-2 tables are allocated against
 * @level3p:   first entry of the table to fill
 * @addr:      start of the range to map; rounded down with PAGE_MASK
 * @last_addr: mapping stops once addr reaches this value
 *
 * For every LEVEL2_SIZE step below both last_addr and the end of this
 * table's LEVEL3_SIZE range, a control page is allocated, filled by
 * init_level2_page() and linked in with L2_ATTR.  The entries left over
 * up to the end of the range are written as zero.
 *
 * Returns 0 on success, or -ENOMEM when a control page cannot be allocated
 * (in which case the remaining entries are left untouched).
 */
static int init_level3_page(struct kimage *image, u64 *level3p,
				unsigned long addr, unsigned long last_addr)
{
	unsigned long limit;

	addr &= PAGE_MASK;
	limit = addr + LEVEL3_SIZE;

	for (; addr < last_addr && addr < limit; addr += LEVEL2_SIZE) {
		struct page *pg;
		u64 *table;

		pg = kimage_alloc_control_pages(image, 0);
		if (!pg)
			return -ENOMEM;

		table = (u64 *)page_address(pg);
		init_level2_page(table, addr);
		*level3p++ = __pa(table) | L2_ATTR;
	}

	/* Zero the entries that were not needed. */
	for (; addr < limit; addr += LEVEL2_SIZE)
		*level3p++ = 0;

	return 0;
}


M
Maneesh Soni 已提交
80 81
/*
 * Populate the level-4 (top) table.
 *
 * @image:     kexec image the level-3 tables are allocated against
 * @level4p:   first entry of the table to fill
 * @addr:      start of the range to map; rounded down with PAGE_MASK
 * @last_addr: mapping stops once addr reaches this value
 *
 * For every LEVEL3_SIZE step below both last_addr and the end of the
 * LEVEL4_SIZE range, a control page is allocated, populated through
 * init_level3_page() and linked in with L3_ATTR.  The entries left over
 * up to the end of the range are written as zero.
 *
 * Returns 0 on success, -ENOMEM when a control page cannot be allocated,
 * or the non-zero result of init_level3_page(); on failure the remaining
 * entries are left untouched.
 */
static int init_level4_page(struct kimage *image, u64 *level4p,
				unsigned long addr, unsigned long last_addr)
{
	unsigned long limit;

	addr &= PAGE_MASK;
	limit = addr + LEVEL4_SIZE;

	for (; addr < last_addr && addr < limit; addr += LEVEL3_SIZE) {
		struct page *pg;
		u64 *table;
		int err;

		pg = kimage_alloc_control_pages(image, 0);
		if (!pg)
			return -ENOMEM;

		table = (u64 *)page_address(pg);
		err = init_level3_page(image, table, addr, last_addr);
		if (err)
			return err;

		*level4p++ = __pa(table) | L3_ATTR;
	}

	/* Zero the entries that were not needed. */
	for (; addr < limit; addr += LEVEL3_SIZE)
		*level4p++ = 0;

	return 0;
}


static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
{
	u64 *level4p;
	level4p = (u64 *)__va(start_pgtable);
M
Maneesh Soni 已提交
120
 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
}

/*
 * Load the IDT register.
 *
 * @newidt: address placed in the 64-bit base field of the descriptor
 * @limit:  value placed in the 16-bit limit field of the descriptor
 *
 * The 10-byte operand (16-bit limit followed immediately by the 64-bit
 * base) is built in a packed local structure and handed to lidt.
 */
static void set_idt(void *newidt, u16 limit)
{
	struct {
		u16 limit;
		u64 base;
	} __attribute__((packed)) idt_desc;

	idt_desc.limit = limit;
	idt_desc.base = (unsigned long)newidt;

	/*
	 * lidt only reads its memory operand, so the descriptor must be an
	 * input.  Declaring it as an output would tell the compiler that the
	 * asm overwrites it, allowing the stores above to be discarded.
	 */
	__asm__ __volatile__ (
		"lidt %0\n"
		: /* no outputs */
		: "m" (idt_desc)
		);
}


/*
 * Load the GDT register.
 *
 * @newgdt: address placed in the 64-bit base field of the descriptor
 * @limit:  value placed in the 16-bit limit field of the descriptor
 *
 * The 10-byte operand (16-bit limit followed immediately by the 64-bit
 * base) is built in a packed local structure and handed to lgdt.
 */
static void set_gdt(void *newgdt, u16 limit)
{
	struct {
		u16 limit;
		u64 base;
	} __attribute__((packed)) gdt_desc;

	gdt_desc.limit = limit;
	gdt_desc.base = (unsigned long)newgdt;

	/*
	 * lgdt only reads its memory operand, so the descriptor must be an
	 * input.  Declaring it as an output would tell the compiler that the
	 * asm overwrites it, allowing the stores above to be discarded.
	 */
	__asm__ __volatile__ (
		"lgdt %0\n"
		: /* no outputs */
		: "m" (gdt_desc)
		);
}

static void load_segments(void)
{
	__asm__ __volatile__ (
		"\tmovl $"STR(__KERNEL_DS)",%eax\n"
		"\tmovl %eax,%ds\n"
		"\tmovl %eax,%es\n"
		"\tmovl %eax,%ss\n"
		"\tmovl %eax,%fs\n"
		"\tmovl %eax,%gs\n"
		);
#undef STR
#undef __STR
}

M
Maneesh Soni 已提交
166 167 168 169
/*
 * Signature of the relocation stub.  machine_kexec() calls the stub through
 * a pointer of this type; the call is declared as not returning.
 */
typedef NORET_TYPE void (*relocate_new_kernel_t)(
					unsigned long indirection_page,
					unsigned long control_code_buffer,
					unsigned long start_address,
					unsigned long pgtable) ATTRIB_NORET;

/* Relocation stub bytes and their length, declared here and defined elsewhere. */
extern const unsigned char relocate_new_kernel[];
extern const unsigned long relocate_new_kernel_size;

int machine_kexec_prepare(struct kimage *image)
{
	unsigned long start_pgtable, control_code_buffer;
	int result;

	/* Calculate the offsets */
M
Maneesh Soni 已提交
180
	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
181 182 183 184
	control_code_buffer = start_pgtable + 4096UL;

	/* Setup the identity mapped 64bit page table */
	result = init_pgtable(image, start_pgtable);
M
Maneesh Soni 已提交
185
	if (result)
186 187 188
		return result;

	/* Place the code in the reboot code buffer */
M
Maneesh Soni 已提交
189 190
	memcpy(__va(control_code_buffer), relocate_new_kernel,
						relocate_new_kernel_size);
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214

	return 0;
}

/* Intentionally empty: this function performs no work for the given image. */
void machine_kexec_cleanup(struct kimage *image)
{
}

/*
 * Do not allocate memory (or fail in any way) in machine_kexec().
 * We are past the point of no return, committed to rebooting now.
 */
NORET_TYPE void machine_kexec(struct kimage *image)
{
	unsigned long page_list;
	unsigned long control_code_buffer;
	unsigned long start_pgtable;
	relocate_new_kernel_t rnk;

	/* Interrupts aren't acceptable while we reboot */
	local_irq_disable();

	/* Calculate the offsets */
M
Maneesh Soni 已提交
215 216
	page_list = image->head;
	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250
	control_code_buffer = start_pgtable + 4096UL;

	/* Set the low half of the page table to my identity mapped
	 * page table for kexec.  Leave the high half pointing at the
	 * kernel pages.   Don't bother to flush the global pages
	 * as that will happen when I fully switch to my identity mapped
	 * page table anyway.
	 */
	memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
	__flush_tlb();


	/* The segment registers are funny things, they are
	 * automatically loaded from a table, in memory wherever you
	 * set them to a specific selector, but this table is never
	 * accessed again unless you set the segment to a different selector.
	 *
	 * The more common model are caches where the behide
	 * the scenes work is done, but is also dropped at arbitrary
	 * times.
	 *
	 * I take advantage of this here by force loading the
	 * segments, before I zap the gdt with an invalid value.
	 */
	load_segments();
	/* The gdt & idt are now invalid.
	 * If you want to load them you must set up your own idt & gdt.
	 */
	set_gdt(phys_to_virt(0),0);
	set_idt(phys_to_virt(0),0);
	/* now call it */
	rnk = (relocate_new_kernel_t) control_code_buffer;
	(*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
}