/*
 * drivers/base/memory.c - basic Memory class support
 *
 * Written by Matt Tolentino <matthew.e.tolentino@intel.com>
 *            Dave Hansen <haveblue@us.ibm.com>
 *
 * This file provides the necessary infrastructure to represent
 * a SPARSEMEM-memory-model system's physical memory in /sysfs.
 * All arch-independent code that assumes MEMORY_HOTPLUG requires
 * SPARSEMEM should be contained here, or in mm/memory_hotplug.c.
 */

#include <linux/sysdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/topology.h>
#include <linux/capability.h>
#include <linux/device.h>
#include <linux/memory.h>
#include <linux/kobject.h>
#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/string.h>

#include <linux/atomic.h>
#include <asm/uaccess.h>

30 31
/* Held while memory block sysfs entries are added or removed. */
static DEFINE_MUTEX(mem_sysfs_mutex);

/* Class name; also the prefix used when looking a block up by name. */
#define MEMORY_CLASS_NAME	"memory"
33 34 35 36 37 38 39

/* Number of sections that make up one memory block. */
static int sections_per_block;

/* Convert a section number into the number of the block containing it. */
static inline int base_memory_block_id(int section_nr)
{
	const int block_id = section_nr / sections_per_block;

	return block_id;
}
40 41

static struct sysdev_class memory_sysdev_class = {
42
	.name = MEMORY_CLASS_NAME,
43 44
};

45
static const char *memory_uevent_name(struct kset *kset, struct kobject *kobj)
46 47 48 49
{
	return MEMORY_CLASS_NAME;
}

50 51
/*
 * Installed as the .uevent hook of memory_uevent_ops.  It adds nothing
 * to @env and always returns 0.
 */
static int memory_uevent(struct kset *kset, struct kobject *obj,
			struct kobj_uevent_env *env)
{
	return 0;
}

58
static const struct kset_uevent_ops memory_uevent_ops = {
59 60
	.name		= memory_uevent_name,
	.uevent		= memory_uevent,
61 62
};

63
static BLOCKING_NOTIFIER_HEAD(memory_chain);
64

65
int register_memory_notifier(struct notifier_block *nb)
66
{
67
        return blocking_notifier_chain_register(&memory_chain, nb);
68
}
69
EXPORT_SYMBOL(register_memory_notifier);
70

71
void unregister_memory_notifier(struct notifier_block *nb)
72
{
73
        blocking_notifier_chain_unregister(&memory_chain, nb);
74
}
75
EXPORT_SYMBOL(unregister_memory_notifier);
76

77 78 79 80 81 82 83 84 85 86 87 88 89 90
/* Atomic notifier chain that memory_isolate_notify() walks. */
static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain);

/*
 * register_memory_isolate_notifier - add @nb to the isolate notifier chain
 *
 * Returns whatever atomic_notifier_chain_register() returns.
 */
int register_memory_isolate_notifier(struct notifier_block *nb)
{
	int rc = atomic_notifier_chain_register(&memory_isolate_chain, nb);

	return rc;
}
EXPORT_SYMBOL(register_memory_isolate_notifier);

/*
 * unregister_memory_isolate_notifier - remove @nb from the isolate chain
 *
 * The result of atomic_notifier_chain_unregister() is not reported.
 */
void unregister_memory_isolate_notifier(struct notifier_block *nb)
{
	struct atomic_notifier_head *chain = &memory_isolate_chain;

	atomic_notifier_chain_unregister(chain, nb);
}
EXPORT_SYMBOL(unregister_memory_isolate_notifier);

91 92 93
/*
 * register_memory - Setup a sysfs device for a memory block
 */
94
static
95
int register_memory(struct memory_block *memory)
96 97 98 99
{
	int error;

	memory->sysdev.cls = &memory_sysdev_class;
100
	memory->sysdev.id = memory->start_section_nr / sections_per_block;
101 102 103 104 105 106

	error = sysdev_register(&memory->sysdev);
	return error;
}

static void
107
unregister_memory(struct memory_block *memory)
108 109 110
{
	BUG_ON(memory->sysdev.cls != &memory_sysdev_class);

111 112
	/* drop the ref. we got in remove_memory_block() */
	kobject_put(&memory->sysdev.kobj);
113 114 115
	sysdev_unregister(&memory->sysdev);
}

116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
/*
 * Default memory block size: MIN_MEMORY_BLOCK_SIZE.  The symbol is
 * declared __weak, so another definition may take its place.
 */
unsigned long __weak memory_block_size_bytes(void)
{
	const unsigned long default_sz = MIN_MEMORY_BLOCK_SIZE;

	return default_sz;
}

/*
 * Return the memory block size reported by memory_block_size_bytes().
 * A value that is not a power of two, or that is smaller than
 * MIN_MEMORY_BLOCK_SIZE, triggers a warning and is replaced by
 * MIN_MEMORY_BLOCK_SIZE.
 */
static unsigned long get_memory_block_size(void)
{
	const unsigned long sz = memory_block_size_bytes();
	const bool pow2 = !(sz & (sz - 1));

	if (pow2 && sz >= MIN_MEMORY_BLOCK_SIZE)
		return sz;

	WARN_ON(1);
	return MIN_MEMORY_BLOCK_SIZE;
}

136 137 138 139 140
/*
 * use this as the physical section index that this memsection
 * uses.
 */

141
static ssize_t show_mem_start_phys_index(struct sys_device *dev,
142
			struct sysdev_attribute *attr, char *buf)
143 144 145
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
	unsigned long phys_index;

	phys_index = mem->start_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

/*
 * sysfs "end_phys_index" show handler: same format as "phys_index", but
 * computed from the block's last section (end_section_nr).
 */
static ssize_t show_mem_end_phys_index(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	unsigned long phys_index;

	phys_index = mem->end_section_nr / sections_per_block;
	return sprintf(buf, "%08lx\n", phys_index);
}

163 164 165
/*
 * Show whether the section of memory is likely to be hot-removable
 */
166 167
static ssize_t show_mem_removable(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
168
{
169 170
	unsigned long i, pfn;
	int ret = 1;
171 172 173
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);

174
	for (i = 0; i < sections_per_block; i++) {
175
		pfn = section_nr_to_pfn(mem->start_section_nr + i);
176 177 178
		ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
	}

179 180 181
	return sprintf(buf, "%d\n", ret);
}

182 183 184
/*
 * online, offline, going offline, etc.
 */
185 186
static ssize_t show_mem_state(struct sys_device *dev,
			struct sysdev_attribute *attr, char *buf)
187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	ssize_t len = 0;

	/*
	 * We can probably put these states in a nice little array
	 * so that they're not open-coded
	 */
	switch (mem->state) {
		case MEM_ONLINE:
			len = sprintf(buf, "online\n");
			break;
		case MEM_OFFLINE:
			len = sprintf(buf, "offline\n");
			break;
		case MEM_GOING_OFFLINE:
			len = sprintf(buf, "going-offline\n");
			break;
		default:
			len = sprintf(buf, "ERROR-UNKNOWN-%ld\n",
					mem->state);
			WARN_ON(1);
			break;
	}

	return len;
}

216
int memory_notify(unsigned long val, void *v)
217
{
218
	return blocking_notifier_call_chain(&memory_chain, val, v);
219 220
}

221 222 223 224 225
/*
 * memory_isolate_notify - run the memory isolate notifier chain
 *
 * Passes event @val and payload @v to every registered notifier and
 * returns the result of atomic_notifier_call_chain().
 */
int memory_isolate_notify(unsigned long val, void *v)
{
	int rc = atomic_notifier_call_chain(&memory_isolate_chain, val, v);

	return rc;
}

226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261
/*
 * The probe routines leave the pages reserved, just as the bootmem code
 * does.  Verify that every page of the block starting at @start_pfn still
 * has PageReserved set.
 *
 * Returns true if all pages are reserved.  Returns false on the first
 * invalid section pfn (with a one-time warning) or on the first page that
 * is not reserved (with a printed warning).  @nr_pages is not consulted;
 * the walk always covers sections_per_block sections.
 */
static bool pages_correctly_reserved(unsigned long start_pfn,
					unsigned long nr_pages)
{
	unsigned long section_pfn = start_pfn;
	struct page *section_map;
	int sec, off;

	/*
	 * memmap between sections is not contiguous except with
	 * SPARSEMEM_VMEMMAP, so the struct page is looked up once per
	 * section and assumed to be contiguous within that section.
	 */
	for (sec = 0; sec < sections_per_block; sec++) {
		if (WARN_ON_ONCE(!pfn_valid(section_pfn)))
			return false;

		section_map = pfn_to_page(section_pfn);
		for (off = 0; off < PAGES_PER_SECTION; off++) {
			if (!PageReserved(section_map + off)) {
				printk(KERN_WARNING "section number %ld page number %d "
					"not reserved, was it already online?\n",
					pfn_to_section_nr(section_pfn), off);
				return false;
			}
		}

		section_pfn += PAGES_PER_SECTION;
	}

	return true;
}

262 263 264 265 266
/*
 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
 * OK to have direct references to sparsemem variables in here.
 */
static int
267
memory_block_action(unsigned long phys_index, unsigned long action)
268 269
{
	unsigned long start_pfn, start_paddr;
270
	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
271
	struct page *first_page;
272 273
	int ret;

274 275
	first_page = pfn_to_page(phys_index << PFN_SECTION_SHIFT);

276 277
	switch (action) {
		case MEM_ONLINE:
278
			start_pfn = page_to_pfn(first_page);
279 280 281 282

			if (!pages_correctly_reserved(start_pfn, nr_pages))
				return -EBUSY;

283
			ret = online_pages(start_pfn, nr_pages);
284 285
			break;
		case MEM_OFFLINE:
286
			start_paddr = page_to_pfn(first_page) << PAGE_SHIFT;
287
			ret = remove_memory(start_paddr,
288
					    nr_pages << PAGE_SHIFT);
289 290
			break;
		default:
291 292
			WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
			     "%ld\n", __func__, phys_index, action, action);
293 294 295 296 297 298 299 300 301
			ret = -EINVAL;
	}

	return ret;
}

static int memory_block_change_state(struct memory_block *mem,
		unsigned long to_state, unsigned long from_state_req)
{
302
	int ret = 0;
303

304
	mutex_lock(&mem->state_mutex);
305 306 307 308 309 310

	if (mem->state != from_state_req) {
		ret = -EINVAL;
		goto out;
	}

311 312 313
	if (to_state == MEM_OFFLINE)
		mem->state = MEM_GOING_OFFLINE;

314
	ret = memory_block_action(mem->start_section_nr, to_state);
315

316
	if (ret)
317
		mem->state = from_state_req;
318
	else
319 320 321
		mem->state = to_state;

out:
322
	mutex_unlock(&mem->state_mutex);
323 324 325 326
	return ret;
}

static ssize_t
327 328
store_mem_state(struct sys_device *dev,
		struct sysdev_attribute *attr, const char *buf, size_t count)
329 330 331 332 333 334 335 336 337 338
{
	struct memory_block *mem;
	int ret = -EINVAL;

	mem = container_of(dev, struct memory_block, sysdev);

	if (!strncmp(buf, "online", min((int)count, 6)))
		ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
	else if(!strncmp(buf, "offline", min((int)count, 7)))
		ret = memory_block_change_state(mem, MEM_OFFLINE, MEM_ONLINE);
339

340 341 342 343 344 345 346 347 348 349 350 351 352 353
	if (ret)
		return ret;
	return count;
}

/*
 * phys_device is a bad name for this.  What I really want
 * is a way to differentiate between memory ranges that
 * are part of physical devices that constitute
 * a complete removable unit or fru.
 * i.e. do these ranges belong to the same physical device,
 * s.t. if I offline all of these sections I can then
 * remove the physical device?
 */
354 355
static ssize_t show_phys_device(struct sys_device *dev,
				struct sysdev_attribute *attr, char *buf)
356 357 358 359 360 361
{
	struct memory_block *mem =
		container_of(dev, struct memory_block, sysdev);
	return sprintf(buf, "%d\n", mem->phys_device);
}

362 363
/* Per-block sysfs attributes; only "state" has a store handler. */
static SYSDEV_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL);
static SYSDEV_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL);
static SYSDEV_ATTR(state, 0644, show_mem_state, store_mem_state);
static SYSDEV_ATTR(phys_device, 0444, show_phys_device, NULL);
static SYSDEV_ATTR(removable, 0444, show_mem_removable, NULL);
367 368 369 370 371 372 373 374 375 376

/*
 * Create/remove the sysfs file for attribute attr_<attr_name> on a memory
 * block.  @mem is parenthesised so that any pointer expression can be
 * passed, not just a plain identifier.
 */
#define mem_create_simple_file(mem, attr_name)	\
	sysdev_create_file(&(mem)->sysdev, &attr_##attr_name)
#define mem_remove_simple_file(mem, attr_name)	\
	sysdev_remove_file(&(mem)->sysdev, &attr_##attr_name)

/*
 * Block size attribute stuff
 */
static ssize_t
377 378
print_block_size(struct sysdev_class *class, struct sysdev_class_attribute *attr,
		 char *buf)
379
{
380
	return sprintf(buf, "%lx\n", get_memory_block_size());
381 382
}

383
static SYSDEV_CLASS_ATTR(block_size_bytes, 0444, print_block_size, NULL);
384 385 386

static int block_size_init(void)
{
387
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
388
				&attr_block_size_bytes.attr);
389 390 391 392 393 394 395 396 397 398
}

/*
 * Some architectures will have custom drivers to do this, and
 * will not need to do it from userspace.  The fake hot-add code
 * as well as ppc64 will do all of their discovery in userspace
 * and will require this interface.
 */
#ifdef CONFIG_ARCH_MEMORY_PROBE
static ssize_t
399 400
memory_probe_store(struct class *class, struct class_attribute *attr,
		   const char *buf, size_t count)
401 402
{
	u64 phys_addr;
403
	int nid;
404
	int i, ret;
405
	unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block;
406 407 408

	phys_addr = simple_strtoull(buf, NULL, 0);

409 410 411
	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
		return -EINVAL;

412 413 414 415 416
	for (i = 0; i < sections_per_block; i++) {
		nid = memory_add_physaddr_to_nid(phys_addr);
		ret = add_memory(nid, phys_addr,
				 PAGES_PER_SECTION << PAGE_SHIFT);
		if (ret)
417
			goto out;
418 419 420

		phys_addr += MIN_MEMORY_BLOCK_SIZE;
	}
421

422 423 424
	ret = count;
out:
	return ret;
425
}
426
static CLASS_ATTR(probe, S_IWUSR, NULL, memory_probe_store);
427 428 429

static int memory_probe_init(void)
{
430
	return sysfs_create_file(&memory_sysdev_class.kset.kobj,
431
				&class_attr_probe.attr);
432 433
}
#else
434 435 436 437
/* Stub for builds without CONFIG_ARCH_MEMORY_PROBE: always succeeds. */
static inline int memory_probe_init(void)
{
	int err = 0;	/* no probe file to create */

	return err;
}
438 439
#endif

440 441 442 443 444 445 446
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Support for offlining pages of memory
 */

/* Soft offline a page */
static ssize_t
447 448 449
store_soft_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	if (!pfn_valid(pfn))
		return -ENXIO;
	ret = soft_offline_page(pfn_to_page(pfn), 0);
	return ret == 0 ? count : ret;
}

/* Forcibly offline a page, including killing processes. */
static ssize_t
466 467 468
store_hard_offline_page(struct class *class,
			struct class_attribute *attr,
			const char *buf, size_t count)
469 470 471 472 473 474 475 476 477 478 479 480
{
	int ret;
	u64 pfn;
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (strict_strtoull(buf, 0, &pfn) < 0)
		return -EINVAL;
	pfn >>= PAGE_SHIFT;
	ret = __memory_failure(pfn, 0, 0);
	return ret ? ret : count;
}

481 482
/*
 * Both files have a store handler only, so they are created write-only
 * (like the "probe" file) rather than advertising read permission that
 * no show handler backs.
 */
static CLASS_ATTR(soft_offline_page, S_IWUSR, NULL, store_soft_offline_page);
static CLASS_ATTR(hard_offline_page, S_IWUSR, NULL, store_hard_offline_page);
483 484 485 486 487 488

static __init int memory_fail_init(void)
{
	int err;

	err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
489
				&class_attr_soft_offline_page.attr);
490 491
	if (!err)
		err = sysfs_create_file(&memory_sysdev_class.kset.kobj,
492
				&class_attr_hard_offline_page.attr);
493 494 495 496 497 498 499 500 501
	return err;
}
#else
/* Stub for builds without CONFIG_MEMORY_FAILURE: always succeeds. */
static inline int memory_fail_init(void)
{
	int err = 0;	/* no offline-page files to create */

	return err;
}
#endif

502 503 504 505 506
/*
 * Note that phys_device is optional.  It is here to allow for
 * differentiation between which *physical* devices each
 * section belongs to...
 */
507 508 509 510
int __weak arch_get_memory_phys_device(unsigned long start_pfn)
{
	return 0;
}
511

512 513
struct memory_block *find_memory_block_hinted(struct mem_section *section,
					      struct memory_block *hint)
514 515 516 517 518
{
	struct kobject *kobj;
	struct sys_device *sysdev;
	struct memory_block *mem;
	char name[sizeof(MEMORY_CLASS_NAME) + 9 + 1];
519
	int block_id = base_memory_block_id(__section_nr(section));
520

521 522
	kobj = hint ? &hint->sysdev.kobj : NULL;

523 524 525 526
	/*
	 * This only works because we know that section == sysdev->id
	 * slightly redundant with sysdev_register()
	 */
527
	sprintf(&name[0], "%s%d", MEMORY_CLASS_NAME, block_id);
528

529
	kobj = kset_find_obj_hinted(&memory_sysdev_class.kset, name, kobj);
530 531 532 533 534 535 536 537 538
	if (!kobj)
		return NULL;

	sysdev = container_of(kobj, struct sys_device, kobj);
	mem = container_of(sysdev, struct memory_block, sysdev);

	return mem;
}

539 540 541 542 543 544 545 546 547 548 549 550 551
/*
 * Look up the memory block that contains @section, without a search
 * hint.  The lookup itself is done by find_memory_block_hinted(), which
 * searches the memory class kset by name; if that ever becomes a real
 * problem, a radix tree or similar could be used instead.
 *
 * This could be made generic for all sysdev classes.
 */
struct memory_block *find_memory_block(struct mem_section *section)
{
	struct memory_block *no_hint = NULL;

	return find_memory_block_hinted(section, no_hint);
}

552 553
static int init_memory_block(struct memory_block **memory,
			     struct mem_section *section, unsigned long state)
554
{
555
	struct memory_block *mem;
556
	unsigned long start_pfn;
557
	int scn_nr;
558 559
	int ret = 0;

560
	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
561 562 563
	if (!mem)
		return -ENOMEM;

564
	scn_nr = __section_nr(section);
565 566 567
	mem->start_section_nr =
			base_memory_block_id(scn_nr) * sections_per_block;
	mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
568
	mem->state = state;
569
	mem->section_count++;
570
	mutex_init(&mem->state_mutex);
571
	start_pfn = section_nr_to_pfn(mem->start_section_nr);
572 573
	mem->phys_device = arch_get_memory_phys_device(start_pfn);

574
	ret = register_memory(mem);
575 576
	if (!ret)
		ret = mem_create_simple_file(mem, phys_index);
577 578
	if (!ret)
		ret = mem_create_simple_file(mem, end_phys_index);
579 580 581 582 583 584
	if (!ret)
		ret = mem_create_simple_file(mem, state);
	if (!ret)
		ret = mem_create_simple_file(mem, phys_device);
	if (!ret)
		ret = mem_create_simple_file(mem, removable);
585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604

	*memory = mem;
	return ret;
}

static int add_memory_section(int nid, struct mem_section *section,
			unsigned long state, enum mem_add_context context)
{
	struct memory_block *mem;
	int ret = 0;

	mutex_lock(&mem_sysfs_mutex);

	mem = find_memory_block(section);
	if (mem) {
		mem->section_count++;
		kobject_put(&mem->sysdev.kobj);
	} else
		ret = init_memory_block(&mem, section, state);

605
	if (!ret) {
606 607
		if (context == HOTPLUG &&
		    mem->section_count == sections_per_block)
608 609 610
			ret = register_mem_sect_under_node(mem, nid);
	}

611
	mutex_unlock(&mem_sysfs_mutex);
612 613 614
	return ret;
}

615 616 617 618 619
int remove_memory_block(unsigned long node_id, struct mem_section *section,
		int phys_device)
{
	struct memory_block *mem;

620
	mutex_lock(&mem_sysfs_mutex);
621
	mem = find_memory_block(section);
622
	unregister_mem_sect_under_nodes(mem, __section_nr(section));
623 624 625 626

	mem->section_count--;
	if (mem->section_count == 0) {
		mem_remove_simple_file(mem, phys_index);
627
		mem_remove_simple_file(mem, end_phys_index);
628 629 630
		mem_remove_simple_file(mem, state);
		mem_remove_simple_file(mem, phys_device);
		mem_remove_simple_file(mem, removable);
631 632 633 634
		unregister_memory(mem);
		kfree(mem);
	} else
		kobject_put(&mem->sysdev.kobj);
635

636
	mutex_unlock(&mem_sysfs_mutex);
637 638 639 640 641 642 643
	return 0;
}

/*
 * need an interface for the VM to add new memory regions,
 * but without onlining it.
 */
644
int register_new_memory(int nid, struct mem_section *section)
645
{
646
	return add_memory_section(nid, section, MEM_OFFLINE, HOTPLUG);
647 648 649 650
}

int unregister_memory_section(struct mem_section *section)
{
651
	if (!present_section(section))
652 653 654 655 656 657 658 659 660 661 662 663
		return -EINVAL;

	return remove_memory_block(0, section, 0);
}

/*
 * Initialize the sysfs support for memory devices...
 *
 * Registers the memory class, derives sections_per_block from the memory
 * block size, adds every present section as MEM_ONLINE, and creates the
 * class-level files (probe, soft/hard offline, block_size_bytes).
 *
 * If class registration fails nothing else is attempted.  After that,
 * every step is still run when an earlier one failed; the first error
 * seen is logged and returned.
 */
int __init memory_dev_init(void)
{
	unsigned int i;
	int ret;
	int err;
	unsigned long block_sz;

	memory_sysdev_class.kset.uevent_ops = &memory_uevent_ops;
	ret = sysdev_class_register(&memory_sysdev_class);
	if (ret)
		goto out;

	block_sz = get_memory_block_size();
	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;

	/*
	 * Create entries for memory sections that were found
	 * during boot and have been initialized
	 */
	for (i = 0; i < NR_MEM_SECTIONS; i++) {
		if (!present_section_nr(i))
			continue;
		err = add_memory_section(0, __nr_to_section(i), MEM_ONLINE,
					 BOOT);
		if (!ret)
			ret = err;
	}

	err = memory_probe_init();
	if (!ret)
		ret = err;
	err = memory_fail_init();
	if (!ret)
		ret = err;
	err = block_size_init();
	if (!ret)
		ret = err;
out:
	if (ret)
		printk(KERN_ERR "%s() failed: %d\n", __func__, ret);
	return ret;
}