#define pr_fmt(fmt) "SMP alternatives: " fmt

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/stringify.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/memory.h>
#include <linux/stop_machine.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
#include <asm/fixmap.h>

#define MAX_PATCH_LEN (255-1)

static int __initdata_or_module debug_alternative;

static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);

static int noreplace_smp;

static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);

#ifdef CONFIG_PARAVIRT
static int __initdata_or_module noreplace_paravirt = 0;

static int __init setup_noreplace_paravirt(char *str)
{
	noreplace_paravirt = 1;
	return 1;
}
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif

#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
} while (0)

#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG fmt, ##args);				\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", buf[j]);		\
		printk(KERN_CONT "%02hhx\n", buf[j]);			\
	}								\
} while (0)
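
/*
 * Example (see apply_alternatives() below): when booted with
 * "debug-alternative", a call such as
 *
 *	DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
 *
 * prints the patched instruction bytes as one line of hex.
 */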

/*
 * Each GENERIC_NOPX is X bytes long and is defined as an array of bytes
 * encoding that nop. To get from one nop to the next, we add to the array
 * an offset equal to the sum of the sizes of all the nops preceding the
 * one we are after.
 *
 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
 * nice symmetry of sizes of the previous nops.
 */
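
/*
 * For example (illustration only): intel_nops[4] below is defined as
 * intelnops + 1 + 2 + 3, i.e. it points at the GENERIC_NOP4 bytes, because
 * the 1-, 2- and 3-byte nops are stored immediately before them.
 */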
#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char intelnops[] =
{
	GENERIC_NOP1,
	GENERIC_NOP2,
	GENERIC_NOP3,
	GENERIC_NOP4,
	GENERIC_NOP5,
	GENERIC_NOP6,
	GENERIC_NOP7,
	GENERIC_NOP8,
	GENERIC_NOP5_ATOMIC
};
static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
{
	NULL,
	intelnops,
	intelnops + 1,
	intelnops + 1 + 2,
	intelnops + 1 + 2 + 3,
	intelnops + 1 + 2 + 3 + 4,
	intelnops + 1 + 2 + 3 + 4 + 5,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef K8_NOP1
static const unsigned char k8nops[] =
{
	K8_NOP1,
	K8_NOP2,
	K8_NOP3,
	K8_NOP4,
	K8_NOP5,
	K8_NOP6,
	K8_NOP7,
	K8_NOP8,
	K8_NOP5_ATOMIC
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k8nops,
	k8nops + 1,
	k8nops + 1 + 2,
	k8nops + 1 + 2 + 3,
	k8nops + 1 + 2 + 3 + 4,
	k8nops + 1 + 2 + 3 + 4 + 5,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
static const unsigned char k7nops[] =
{
	K7_NOP1,
	K7_NOP2,
	K7_NOP3,
	K7_NOP4,
	K7_NOP5,
	K7_NOP6,
	K7_NOP7,
	K7_NOP8,
	K7_NOP5_ATOMIC
};
static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
{
	NULL,
	k7nops,
	k7nops + 1,
	k7nops + 1 + 2,
	k7nops + 1 + 2 + 3,
	k7nops + 1 + 2 + 3 + 4,
	k7nops + 1 + 2 + 3 + 4 + 5,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] =
{
	P6_NOP1,
	P6_NOP2,
	P6_NOP3,
	P6_NOP4,
	P6_NOP5,
	P6_NOP6,
	P6_NOP7,
	P6_NOP8,
	P6_NOP5_ATOMIC
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
{
	NULL,
	p6nops,
	p6nops + 1,
	p6nops + 1 + 2,
	p6nops + 1 + 2 + 3,
	p6nops + 1 + 2 + 3 + 4,
	p6nops + 1 + 2 + 3 + 4 + 5,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
	p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

/* Initialize these to a safe default */
#ifdef CONFIG_X86_64
const unsigned char * const *ideal_nops = p6_nops;
#else
const unsigned char * const *ideal_nops = intel_nops;
#endif

void __init arch_init_ideal_nops(void)
{
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		/*
		 * Due to a decoder implementation quirk, some
		 * specific Intel CPUs actually perform better with
		 * the "k8_nops" than with the SDM-recommended NOPs.
		 */
		if (boot_cpu_data.x86 == 6 &&
		    boot_cpu_data.x86_model >= 0x0f &&
		    boot_cpu_data.x86_model != 0x1c &&
		    boot_cpu_data.x86_model != 0x26 &&
		    boot_cpu_data.x86_model != 0x27 &&
		    boot_cpu_data.x86_model < 0x30) {
			ideal_nops = k8_nops;
		} else if (boot_cpu_has(X86_FEATURE_NOPL)) {
			   ideal_nops = p6_nops;
		} else {
#ifdef CONFIG_X86_64
			ideal_nops = k8_nops;
#else
			ideal_nops = intel_nops;
#endif
		}
		break;
	default:
#ifdef CONFIG_X86_64
		ideal_nops = k8_nops;
#else
		if (boot_cpu_has(X86_FEATURE_K8))
			ideal_nops = k8_nops;
		else if (boot_cpu_has(X86_FEATURE_K7))
			ideal_nops = k7_nops;
		else
			ideal_nops = intel_nops;
#endif
	}
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
static void __init_or_module add_nops(void *insns, unsigned int len)
{
	while (len > 0) {
		unsigned int noplen = len;
		if (noplen > ASM_NOP_MAX)
			noplen = ASM_NOP_MAX;
		memcpy(insns, ideal_nops[noplen], noplen);
		insns += noplen;
		len -= noplen;
	}
}
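
/*
 * Usage sketch (illustration only): add_nops(buf, 12) copies
 * ideal_nops[ASM_NOP_MAX] (8 bytes) and then ideal_nops[4], so the area is
 * covered with the fewest, longest nops available.
 */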

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len);

/*
 * Are we looking at a near JMP with a 1- or 4-byte displacement?
 */
static inline bool is_jmp(const u8 opcode)
{
	return opcode == 0xeb || opcode == 0xe9;
}

static void __init_or_module
recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
{
	u8 *next_rip, *tgt_rip;
	s32 n_dspl, o_dspl;
	int repl_len;

	if (a->replacementlen != 5)
		return;

	o_dspl = *(s32 *)(insnbuf + 1);

	/* next_rip of the replacement JMP */
	next_rip = repl_insn + a->replacementlen;
	/* target rip of the replacement JMP */
	tgt_rip  = next_rip + o_dspl;
	n_dspl = tgt_rip - orig_insn;

	DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);

	if (tgt_rip - orig_insn >= 0) {
		if (n_dspl - 2 <= 127)
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	/* negative offset */
	} else {
		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
			goto two_byte_jmp;
		else
			goto five_byte_jmp;
	}

two_byte_jmp:
	n_dspl -= 2;

	insnbuf[0] = 0xeb;
	insnbuf[1] = (s8)n_dspl;
	add_nops(insnbuf + 2, 3);

	repl_len = 2;
	goto done;

five_byte_jmp:
	n_dspl -= 5;

	insnbuf[0] = 0xe9;
	*(s32 *)&insnbuf[1] = n_dspl;

	repl_len = 5;

done:

	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}
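
/*
 * Worked example (made-up addresses): with the original instruction at
 * 0x1000 and a replacement JMP targeting 0x1010, n_dspl is 0x10, which fits
 * in a signed byte, so the buffer becomes "eb 0e" plus three nops; executed
 * at 0x1000, the short JMP resolves to 0x1002 + 0x0e = 0x1010 as intended.
 */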

static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
{
	add_nops(instr + (a->instrlen - a->padlen), a->padlen);

	DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
		   instr, a->instrlen - a->padlen, a->padlen);
}

/*
 * Replace instructions with better alternatives for this CPU type. This runs
 * before SMP is initialized to avoid SMP problems with self modifying code.
 * This implies that asymmetric systems where APs have fewer capabilities than
 * the boot processor are not handled. Tough. Make sure you disable such
 * features by hand.
 */
void __init_or_module apply_alternatives(struct alt_instr *start,
					 struct alt_instr *end)
{
	struct alt_instr *a;
	u8 *instr, *replacement;
	u8 insnbuf[MAX_PATCH_LEN];

	DPRINTK("alt table %p -> %p", start, end);
	/*
	 * The scan order should be from start to end. An alternative scanned
	 * later can overwrite alternative code scanned earlier.
	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
	 * patch code.
	 *
	 * So be careful if you want to change the scan order to any other
	 * order.
	 */
	for (a = start; a < end; a++) {
		int insnbuf_sz = 0;

		instr = (u8 *)&a->instr_offset + a->instr_offset;
		replacement = (u8 *)&a->repl_offset + a->repl_offset;
		BUG_ON(a->instrlen > sizeof(insnbuf));
		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
		if (!boot_cpu_has(a->cpuid)) {
			if (a->padlen > 1)
				optimize_nops(a, instr);

			continue;
		}

		DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
			a->cpuid >> 5,
			a->cpuid & 0x1f,
			instr, a->instrlen,
			replacement, a->replacementlen, a->padlen);

		DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
		DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);

		memcpy(insnbuf, replacement, a->replacementlen);
		insnbuf_sz = a->replacementlen;

		/* 0xe8 is a relative CALL; fix the offset. */
		if (*insnbuf == 0xe8 && a->replacementlen == 5) {
			*(s32 *)(insnbuf + 1) += replacement - instr;
			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
				*(s32 *)(insnbuf + 1),
				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
		}

		if (a->replacementlen && is_jmp(replacement[0]))
			recompute_jump(a, instr, replacement, insnbuf);

		if (a->instrlen > a->replacementlen) {
			add_nops(insnbuf + a->replacementlen,
				 a->instrlen - a->replacementlen);
			insnbuf_sz += a->instrlen - a->replacementlen;
		}
		DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);

		text_poke_early(instr, insnbuf, insnbuf_sz);
	}
}
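
/*
 * For reference (a simplified sketch; see <asm/alternative.h> for the real
 * definitions): the alt_instr entries walked above are generated by the
 * ALTERNATIVE() macro, which emits the original instruction in place, the
 * replacement in .altinstr_replacement, and an entry with self-relative
 * offsets to both in the .altinstructions section.
 */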

#ifdef CONFIG_SMP
static void alternatives_smp_lock(const s32 *start, const s32 *end,
				  u8 *text, u8 *text_end)
{
	const s32 *poff;

	mutex_lock(&text_mutex);
	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn DS segment override prefix into lock prefix */
		if (*ptr == 0x3e)
			text_poke(ptr, ((unsigned char []){0xf0}), 1);
	}
	mutex_unlock(&text_mutex);
}
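
/*
 * For reference (a sketch; the authoritative definition is LOCK_PREFIX in
 * <asm/alternative.h>): every LOCK_PREFIX use records a 32-bit self-relative
 * pointer to its lock prefix byte in the .smp_locks section, which is what
 * the __smp_locks[] range walked here and in alternatives_smp_unlock()
 * contains.
 */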

static void alternatives_smp_unlock(const s32 *start, const s32 *end,
				    u8 *text, u8 *text_end)
{
	const s32 *poff;

	mutex_lock(&text_mutex);
	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		if (!*poff || ptr < text || ptr >= text_end)
			continue;
		/* turn lock prefix into DS segment override prefix */
		if (*ptr == 0xf0)
			text_poke(ptr, ((unsigned char []){0x3E}), 1);
	}
	mutex_unlock(&text_mutex);
}

struct smp_alt_module {
	/* owning module, NULL for the core kernel */
	struct module	*mod;
	char		*name;

	/* ptrs to lock prefixes */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
static DEFINE_MUTEX(smp_alt);
static bool uniproc_patched = false;	/* protected by smp_alt */

void __init_or_module alternatives_smp_module_add(struct module *mod,
						  char *name,
						  void *locks, void *locks_end,
						  void *text,  void *text_end)
{
	struct smp_alt_module *smp;

	mutex_lock(&smp_alt);
	if (!uniproc_patched)
		goto unlock;

	if (num_possible_cpus() == 1)
		/* Don't bother remembering, we'll never have to undo it. */
		goto smp_unlock;

	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
	if (NULL == smp)
		/* we'll run the (safe but slow) SMP code then ... */
		goto unlock;

	smp->mod	= mod;
	smp->name	= name;
	smp->locks	= locks;
	smp->locks_end	= locks_end;
	smp->text	= text;
	smp->text_end	= text_end;
	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
		smp->locks, smp->locks_end,
		smp->text, smp->text_end, smp->name);

	list_add_tail(&smp->next, &smp_alt_modules);
smp_unlock:
	alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
	mutex_unlock(&smp_alt);
}

void __init_or_module alternatives_smp_module_del(struct module *mod)
{
	struct smp_alt_module *item;

	mutex_lock(&smp_alt);
	list_for_each_entry(item, &smp_alt_modules, next) {
		if (mod != item->mod)
			continue;
		list_del(&item->next);
		kfree(item);
		break;
	}
	mutex_unlock(&smp_alt);
}

void alternatives_enable_smp(void)
{
	struct smp_alt_module *mod;

	/* Why bother if there are no other CPUs? */
	BUG_ON(num_possible_cpus() == 1);

	mutex_lock(&smp_alt);

	if (uniproc_patched) {
		pr_info("switching to SMP code\n");
		BUG_ON(num_online_cpus() != 1);
		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
		list_for_each_entry(mod, &smp_alt_modules, next)
			alternatives_smp_lock(mod->locks, mod->locks_end,
					      mod->text, mod->text_end);
		uniproc_patched = false;
	}
	mutex_unlock(&smp_alt);
}

/* Return 1 if the address range is reserved for smp-alternatives */
int alternatives_text_reserved(void *start, void *end)
{
	struct smp_alt_module *mod;
	const s32 *poff;
	u8 *text_start = start;
	u8 *text_end = end;

	list_for_each_entry(mod, &smp_alt_modules, next) {
		if (mod->text > text_end || mod->text_end < text_start)
			continue;
		for (poff = mod->locks; poff < mod->locks_end; poff++) {
			const u8 *ptr = (const u8 *)poff + *poff;

			if (text_start <= ptr && text_end > ptr)
				return 1;
		}
	}

	return 0;
}
#endif /* CONFIG_SMP */

#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
				     struct paravirt_patch_site *end)
{
	struct paravirt_patch_site *p;
	char insnbuf[MAX_PATCH_LEN];

	if (noreplace_paravirt)
		return;

	for (p = start; p < end; p++) {
		unsigned int used;

		BUG_ON(p->len > MAX_PATCH_LEN);
		/* prep the buffer with the original instructions */
		memcpy(insnbuf, p->instr, p->len);
		used = pv_init_ops.patch(p->instrtype, p->clobbers, insnbuf,
					 (unsigned long)p->instr, p->len);

		BUG_ON(used > p->len);

		/* Pad the rest with nops */
		add_nops(insnbuf + used, p->len - used);
		text_poke_early(p->instr, insnbuf, p->len);
	}
}
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
#endif	/* CONFIG_PARAVIRT */

void __init alternative_instructions(void)
{
	/* The patching is not fully atomic, so try to avoid local interruptions
	   that might execute the code that is about to be patched.
	   Other CPUs are not running. */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	apply_alternatives(__alt_instructions, __alt_instructions_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	if (!uniproc_patched || num_possible_cpus() == 1)
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
#endif

	apply_paravirt(__parainstructions, __parainstructions_end);

	restart_nmi();
}

/**
 * text_poke_early - Update instructions on a live kernel at boot time
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute this code in parallel.
 * Also no thread must be currently preempted in the middle of these
 * instructions. And on the local CPU you need to be protected against NMI or MCE
 * handlers seeing an inconsistent instruction while you patch.
 */
void *__init_or_module text_poke_early(void *addr, const void *opcode,
					      size_t len)
{
	unsigned long flags;
	local_irq_save(flags);
	memcpy(addr, opcode, len);
	sync_core();
	local_irq_restore(flags);
	/* Could also do a CLFLUSH here to speed up CPU recovery; but
	   that causes hangs on some VIA CPUs. */
	return addr;
}

/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note: Must be called under text_mutex.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	unsigned long flags;
	char *vaddr;
	struct page *pages[2];
	int i;

	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	BUG_ON(!pages[0]);
	local_irq_save(flags);
	set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0]));
	if (pages[1])
		set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1]));
	vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0);
	memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
	clear_fixmap(FIX_TEXT_POKE0);
	if (pages[1])
		clear_fixmap(FIX_TEXT_POKE1);
	local_flush_tlb();
	sync_core();
	/* Could also do a CLFLUSH here to speed up CPU recovery; but
	   that causes hangs on some VIA CPUs. */
	for (i = 0; i < len; i++)
		BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
	local_irq_restore(flags);
	return addr;
}

static void do_sync_core(void *info)
{
	sync_core();
}

static bool bp_patching_in_progress;
static void *bp_int3_handler, *bp_int3_addr;

int poke_int3_handler(struct pt_regs *regs)
{
	/* bp_patching_in_progress */
	smp_rmb();

	if (likely(!bp_patching_in_progress))
		return 0;

	if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
		return 0;

	/* set up the specified breakpoint handler */
	regs->ip = (unsigned long) bp_int3_handler;

	return 1;

}

/**
 * text_poke_bp() -- update instructions on live kernel on SMP
 * @addr:	address to patch
 * @opcode:	opcode of new instruction
 * @len:	length to copy
 * @handler:	address to jump to when the temporary breakpoint is hit
 *
 * Modify multi-byte instruction by using int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using an int3 breakpoint.
 *
 * The way it is done:
 *	- add an int3 trap to the address that will be patched
 *	- sync cores
 *	- update all but the first byte of the patched range
 *	- sync cores
 *	- replace the first byte (int3) with the first byte of the
 *	  replacing opcode
 *	- sync cores
 *
 * Note: must be called under text_mutex.
 */
void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler)
{
	unsigned char int3 = 0xcc;

	bp_int3_handler = handler;
	bp_int3_addr = (u8 *)addr + sizeof(int3);
	bp_patching_in_progress = true;
	/*
	 * Corresponding read barrier in int3 notifier for
	 * making sure the in_progress flag is correctly ordered wrt.
	 * patching
	 */
	smp_wmb();

	text_poke(addr, &int3, sizeof(int3));

	on_each_cpu(do_sync_core, NULL, 1);

	if (len - sizeof(int3) > 0) {
		/* patch all but the first byte */
		text_poke((char *)addr + sizeof(int3),
			  (const char *) opcode + sizeof(int3),
			  len - sizeof(int3));
		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
		 * better safe than sorry (plus there's not only Intel).
		 */
		on_each_cpu(do_sync_core, NULL, 1);
	}

	/* patch the first byte */
	text_poke(addr, opcode, sizeof(int3));

	on_each_cpu(do_sync_core, NULL, 1);

	bp_patching_in_progress = false;
	smp_wmb();

	return addr;
}
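
/*
 * Usage sketch (hypothetical caller, not from this file): turning a 5-byte
 * NOP at 'ip' into "jmp target" without stop_machine():
 *
 *	unsigned char jmp[5] = { 0xe9, 0, 0, 0, 0 };
 *
 *	*(s32 *)&jmp[1] = (s32)(target - (ip + 5));
 *	mutex_lock(&text_mutex);
 *	text_poke_bp(ip, jmp, sizeof(jmp), handler);
 *	mutex_unlock(&text_mutex);
 *
 * Any CPU that hits the transient int3 is redirected to 'handler' by
 * poke_int3_handler() until the final byte is written.
 */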