// SPDX-License-Identifier: GPL-2.0
/*
 * Basic Node interface support
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/vmstat.h>
#include <linux/notifier.h>
#include <linux/node.h>
#include <linux/hugetlb.h>
#include <linux/compaction.h>
#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/swap.h>
#include <linux/slab.h>
/* Bus for NUMA node devices; shows up as /sys/devices/system/node. */
static struct bus_type node_subsys = {
	.name = "node",
	.dev_name = "node",
};


30
static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf)
L
Linus Torvalds 已提交
31
{
32 33
	ssize_t n;
	cpumask_var_t mask;
L
Linus Torvalds 已提交
34 35
	struct node *node_dev = to_node(dev);

36 37
	/* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */
	BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));
L
Linus Torvalds 已提交
38

39 40 41 42 43 44 45 46
	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return 0;

	cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);
	n = cpumap_print_to_pagebuf(list, buf, mask);
	free_cpumask_var(mask);

	return n;
L
Linus Torvalds 已提交
47 48
}

/* Sysfs show: this node's online CPUs in mask format (list = false). */
static inline ssize_t node_read_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return node_read_cpumap(dev, false, buf);
}
/* Sysfs show: this node's online CPUs in list format (list = true). */
static inline ssize_t node_read_cpulist(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return node_read_cpumap(dev, true, buf);
}

/* Read-only per-node attributes exposing the node's online CPUs. */
static DEVICE_ATTR(cpumap,  S_IRUGO, node_read_cpumask, NULL);
static DEVICE_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL);

/**
 * struct node_access_nodes - Access class device to hold user visible
 * 			      relationships to other nodes.
 * @dev:	Device for this memory access class
 * @list_node:	List element in the node's access list
 * @access:	The access class rank
 */
struct node_access_nodes {
	struct device		dev;
	struct list_head	list_node;
	unsigned		access;
};
#define to_access_nodes(dev) container_of(dev, struct node_access_nodes, dev)

/*
 * The attribute lists start empty; the "initiators"/"targets" groups are
 * populated with symlinks at runtime (see
 * register_memory_node_under_compute_node()).
 */
static struct attribute *node_init_access_node_attrs[] = {
	NULL,
};

static struct attribute *node_targ_access_node_attrs[] = {
	NULL,
};

static const struct attribute_group initiators = {
	.name	= "initiators",
	.attrs	= node_init_access_node_attrs,
};

static const struct attribute_group targets = {
	.name	= "targets",
	.attrs	= node_targ_access_node_attrs,
};

static const struct attribute_group *node_access_node_groups[] = {
	&initiators,
	&targets,
	NULL,
};

/* Tear down and free every access-class child device of @node. */
static void node_remove_accesses(struct node *node)
{
	struct node_access_nodes *entry, *tmp;

	list_for_each_entry_safe(entry, tmp, &node->access_list, list_node) {
		list_del(&entry->list_node);
		device_unregister(&entry->dev);
	}
}

/* Device release callback: free the containing node_access_nodes. */
static void node_access_release(struct device *dev)
{
	struct node_access_nodes *access = to_access_nodes(dev);

	kfree(access);
}

/*
 * Find or create the "access%u" child device of @node for class @access.
 * Returns an existing entry if one is already on the node's access_list;
 * otherwise allocates, registers and list-links a new one.  Returns NULL
 * on allocation or registration failure.
 */
static struct node_access_nodes *node_init_node_access(struct node *node,
						       unsigned access)
{
	struct node_access_nodes *access_node;
	struct device *dev;

	list_for_each_entry(access_node, &node->access_list, list_node)
		if (access_node->access == access)
			return access_node;

	access_node = kzalloc(sizeof(*access_node), GFP_KERNEL);
	if (!access_node)
		return NULL;

	access_node->access = access;
	dev = &access_node->dev;
	dev->parent = &node->dev;
	dev->release = node_access_release;
	dev->groups = node_access_node_groups;
	if (dev_set_name(dev, "access%u", access))
		goto free;

	if (device_register(dev))
		goto free_name;

	pm_runtime_no_callbacks(dev);
	list_add_tail(&access_node->list_node, &node->access_list);
	return access_node;
free_name:
	/* name was allocated by dev_set_name(); device_register() failed
	 * before taking ownership, so free it here. */
	kfree_const(dev->kobj.name);
free:
	kfree(access_node);
	return NULL;
}

/* Convert a page count to kilobytes. */
#define K(x) ((x) << (PAGE_SHIFT - 10))
/* Show /sys/devices/system/node/nodeX/meminfo: per-node memory accounting. */
static ssize_t node_read_meminfo(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	int n;
	int nid = dev->id;
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct sysinfo i;

	si_meminfo_node(&i, nid);
	n = sprintf(buf,
		       "Node %d MemTotal:       %8lu kB\n"
		       "Node %d MemFree:        %8lu kB\n"
		       "Node %d MemUsed:        %8lu kB\n"
		       "Node %d Active:         %8lu kB\n"
		       "Node %d Inactive:       %8lu kB\n"
		       "Node %d Active(anon):   %8lu kB\n"
		       "Node %d Inactive(anon): %8lu kB\n"
		       "Node %d Active(file):   %8lu kB\n"
		       "Node %d Inactive(file): %8lu kB\n"
		       "Node %d Unevictable:    %8lu kB\n"
		       "Node %d Mlocked:        %8lu kB\n",
		       nid, K(i.totalram),
		       nid, K(i.freeram),
		       nid, K(i.totalram - i.freeram),
		       nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
				node_page_state(pgdat, NR_ACTIVE_FILE)),
		       nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
				node_page_state(pgdat, NR_INACTIVE_FILE)),
		       nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
		       nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
		       nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
		       nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
		       nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
		       nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));

#ifdef CONFIG_HIGHMEM
	n += sprintf(buf + n,
		       "Node %d HighTotal:      %8lu kB\n"
		       "Node %d HighFree:       %8lu kB\n"
		       "Node %d LowTotal:       %8lu kB\n"
		       "Node %d LowFree:        %8lu kB\n",
		       nid, K(i.totalhigh),
		       nid, K(i.freehigh),
		       nid, K(i.totalram - i.totalhigh),
		       nid, K(i.freeram - i.freehigh));
#endif
	n += sprintf(buf + n,
		       "Node %d Dirty:          %8lu kB\n"
		       "Node %d Writeback:      %8lu kB\n"
		       "Node %d FilePages:      %8lu kB\n"
		       "Node %d Mapped:         %8lu kB\n"
		       "Node %d AnonPages:      %8lu kB\n"
		       "Node %d Shmem:          %8lu kB\n"
		       "Node %d KernelStack:    %8lu kB\n"
		       "Node %d PageTables:     %8lu kB\n"
		       "Node %d NFS_Unstable:   %8lu kB\n"
		       "Node %d Bounce:         %8lu kB\n"
		       "Node %d WritebackTmp:   %8lu kB\n"
		       "Node %d Slab:           %8lu kB\n"
		       "Node %d SReclaimable:   %8lu kB\n"
		       "Node %d SUnreclaim:     %8lu kB\n"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		       "Node %d AnonHugePages:  %8lu kB\n"
		       "Node %d ShmemHugePages: %8lu kB\n"
		       "Node %d ShmemPmdMapped: %8lu kB\n"
#endif
			,
		       nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
		       nid, K(node_page_state(pgdat, NR_WRITEBACK)),
		       nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
		       nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
		       nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
		       nid, K(i.sharedram),
		       /* not scaled by K(): presumably already in kB
			* (counter name suffix) -- unlike the others */
		       nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
		       nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
		       nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
		       nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
		       nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
		       nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE) +
			      node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)),
		       nid, K(node_page_state(pgdat, NR_SLAB_RECLAIMABLE)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		       nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)),
		       nid, K(node_page_state(pgdat, NR_ANON_THPS) *
				       HPAGE_PMD_NR),
		       nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
				       HPAGE_PMD_NR),
		       nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
				       HPAGE_PMD_NR));
#else
		       nid, K(node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE)));
#endif
	n += hugetlb_report_node_meminfo(nid, buf + n);
	return n;
}

#undef K
static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL);

/* Show /sys/devices/system/node/nodeX/numastat: NUMA allocation counters. */
static ssize_t node_read_numastat(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return sprintf(buf,
		       "numa_hit %lu\n"
		       "numa_miss %lu\n"
		       "numa_foreign %lu\n"
		       "interleave_hit %lu\n"
		       "local_node %lu\n"
		       "other_node %lu\n",
		       sum_zone_numa_state(dev->id, NUMA_HIT),
		       sum_zone_numa_state(dev->id, NUMA_MISS),
		       sum_zone_numa_state(dev->id, NUMA_FOREIGN),
		       sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
		       sum_zone_numa_state(dev->id, NUMA_LOCAL),
		       sum_zone_numa_state(dev->id, NUMA_OTHER));
}
static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);

/*
 * Show /sys/devices/system/node/nodeX/vmstat: zone, NUMA-event and node
 * counters for this node, one "name value" pair per line.  vmstat_text[]
 * is laid out as zone items, then NUMA items, then node items, so each
 * loop indexes it with the cumulative offset.
 */
static ssize_t node_read_vmstat(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	int nid = dev->id;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int i;
	int n = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
			     sum_zone_node_page_state(nid, i));

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		n += sprintf(buf+n, "%s %lu\n",
			     vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
			     sum_zone_numa_state(nid, i));
#endif

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
		/* Skip hidden vmstat items (empty name string). */
		if (*vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
				 NR_VM_NUMA_STAT_ITEMS] == '\0')
			continue;
		n += sprintf(buf+n, "%s %lu\n",
			     vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
			     NR_VM_NUMA_STAT_ITEMS],
			     node_page_state(pgdat, i));
	}

	return n;
}
static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL);
/*
 * Show /sys/devices/system/node/nodeX/distance: node_distance() from this
 * node to every online node, space-separated on one line.
 */
static ssize_t node_read_distance(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	int nid = dev->id;
	int len = 0;
	int i;

	/*
	 * buf is currently PAGE_SIZE in length and each node needs 4 chars
	 * at the most (distance + space or newline).
	 */
	BUILD_BUG_ON(MAX_NUMNODES * 4 > PAGE_SIZE);

	for_each_online_node(i)
		len += sprintf(buf + len, "%s%d", i ? " " : "", node_distance(nid, i));

	len += sprintf(buf + len, "\n");
	return len;
}
static DEVICE_ATTR(distance, S_IRUGO, node_read_distance, NULL);
/* Default attributes created for every node device at registration. */
static struct attribute *node_dev_attrs[] = {
	&dev_attr_cpumap.attr,
	&dev_attr_cpulist.attr,
	&dev_attr_meminfo.attr,
	&dev_attr_numastat.attr,
	&dev_attr_distance.attr,
	&dev_attr_vmstat.attr,
	NULL
};
ATTRIBUTE_GROUPS(node_dev);
#ifdef CONFIG_HUGETLBFS
/*
 * hugetlbfs per node attributes registration interface:
 * When/if hugetlb[fs] subsystem initializes [sometime after this module],
 * it will register its per node attributes for all online nodes with
 * memory.  It will also call register_hugetlbfs_with_node(), below, to
 * register its attribute registration functions with this node driver.
 * Once these hooks have been initialized, the node driver will call into
 * the hugetlb module to [un]register attributes for hot-plugged nodes.
 */
static node_registration_func_t __hugetlb_register_node;
static node_registration_func_t __hugetlb_unregister_node;

/*
 * Register hugetlb attributes for @node via the hook, but only if the
 * hook is set and the node currently has memory.  Returns true if the
 * registration hook was invoked.
 */
static inline bool hugetlb_register_node(struct node *node)
{
	if (__hugetlb_register_node &&
			node_state(node->dev.id, N_MEMORY)) {
		__hugetlb_register_node(node);
		return true;
	}
	return false;
}

static inline void hugetlb_unregister_node(struct node *node)
{
	if (__hugetlb_unregister_node)
		__hugetlb_unregister_node(node);
}

void register_hugetlbfs_with_node(node_registration_func_t doregister,
				  node_registration_func_t unregister)
{
	__hugetlb_register_node   = doregister;
	__hugetlb_unregister_node = unregister;
}
#else
/* No hugetlbfs: the hooks collapse to no-ops. */
static inline void hugetlb_register_node(struct node *node) {}

static inline void hugetlb_unregister_node(struct node *node) {}
#endif

/* Device release callback: free the node once its last reference drops. */
static void node_device_release(struct device *dev)
{
	struct node *node = to_node(dev);

#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
	/*
	 * We schedule the work only when a memory section is
	 * onlined/offlined on this node. When we come here,
	 * all the memory on this node has been offlined,
	 * so we won't enqueue new work to this work.
	 *
	 * The work is using node->node_work, so we should
	 * flush work before freeing the memory.
	 */
	flush_work(&node->node_work);
#endif
	kfree(node);
}
/*
 * register_node - Setup a sysfs device for a node.
 * @num - Node number to use when creating the device.
 *
 * Initialize and register the node device.  On failure the embedded
 * device's reference is dropped, which frees @node via
 * node_device_release().
 */
static int register_node(struct node *node, int num)
{
	int error;

	node->dev.id = num;
	node->dev.bus = &node_subsys;
	node->dev.release = node_device_release;
	node->dev.groups = node_dev_groups;
	error = device_register(&node->dev);

	if (error)
		put_device(&node->dev);
	else {
		hugetlb_register_node(node);

		compaction_register_node(node);
	}
	return error;
}

/**
 * unregister_node - unregister a node device
 * @node: node going away
 *
 * Unregisters a node device @node.  All the devices on the node must be
 * unregistered before calling this function.
 */
void unregister_node(struct node *node)
{
	hugetlb_unregister_node(node);		/* no-op, if memoryless node */
	node_remove_accesses(node);
	device_unregister(&node->dev);
}
/*
 * Per-node device table indexed by node id; entries are allocated in
 * __register_one_node() and cleared in unregister_one_node().
 */
struct node *node_devices[MAX_NUMNODES];

/*
 * register cpu under node: create the bidirectional sysfs symlinks
 * between the CPU device and its node device.
 */
int register_cpu_under_node(unsigned int cpu, unsigned int nid)
{
	int ret;
	struct device *obj;

	if (!node_online(nid))
		return 0;

	obj = get_cpu_device(cpu);
	if (!obj)
		return 0;

	ret = sysfs_create_link(&node_devices[nid]->dev.kobj,
				&obj->kobj,
				kobject_name(&obj->kobj));
	if (ret)
		return ret;

	/*
	 * NOTE(review): if this second link creation fails, the first
	 * node->cpu link above is left behind -- confirm whether callers
	 * tolerate that.
	 */
	return sysfs_create_link(&obj->kobj,
				 &node_devices[nid]->dev.kobj,
				 kobject_name(&node_devices[nid]->dev.kobj));
}

/**
 * register_memory_node_under_compute_node - link memory node to its compute
 *					     node for a given access class.
 * @mem_nid:	Memory node number
 * @cpu_nid:	Cpu  node number
 * @access:	Access class to register
 *
 * Description:
 * 	For use with platforms that may have separate memory and compute nodes.
 * 	This function will export node relationships linking which memory
 * 	initiator nodes can access memory targets at a given ranked access
 * 	class.
 */
int register_memory_node_under_compute_node(unsigned int mem_nid,
					    unsigned int cpu_nid,
					    unsigned access)
{
	struct node *init_node, *targ_node;
	struct node_access_nodes *initiator, *target;
	int ret;

	if (!node_online(cpu_nid) || !node_online(mem_nid))
		return -ENODEV;

	init_node = node_devices[cpu_nid];
	targ_node = node_devices[mem_nid];
	initiator = node_init_node_access(init_node, access);
	target = node_init_node_access(targ_node, access);
	if (!initiator || !target)
		return -ENOMEM;

	/* initiator's "targets" group gets a link to the memory node ... */
	ret = sysfs_add_link_to_group(&initiator->dev.kobj, "targets",
				      &targ_node->dev.kobj,
				      dev_name(&targ_node->dev));
	if (ret)
		return ret;

	/* ... and the target's "initiators" group links back to the cpu node */
	ret = sysfs_add_link_to_group(&target->dev.kobj, "initiators",
				      &init_node->dev.kobj,
				      dev_name(&init_node->dev));
	if (ret)
		goto err;

	return 0;
 err:
	sysfs_remove_link_from_group(&initiator->dev.kobj, "targets",
				     dev_name(&targ_node->dev));
	return ret;
}

514 515
int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
{
516
	struct device *obj;
517 518 519 520

	if (!node_online(nid))
		return 0;

521
	obj = get_cpu_device(cpu);
522 523 524
	if (!obj)
		return 0;

525
	sysfs_remove_link(&node_devices[nid]->dev.kobj,
526
			  kobject_name(&obj->kobj));
527
	sysfs_remove_link(&obj->kobj,
528
			  kobject_name(&node_devices[nid]->dev.kobj));
529

530 531 532
	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
/*
 * Resolve which node id owns @pfn; returns -1 for an invalid pfn.
 * Before the system is fully running, early_pfn_to_nid() is used
 * instead of pfn_to_nid() (hence the __ref annotation -- presumably
 * early_pfn_to_nid() lives in an init section; confirm).
 */
static int __ref get_nid_for_pfn(unsigned long pfn)
{
	if (!pfn_valid_within(pfn))
		return -1;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
	if (system_state < SYSTEM_RUNNING)
		return early_pfn_to_nid(pfn);
#endif
	return pfn_to_nid(pfn);
}

/* register memory section under specified node if it spans that node */
int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
{
	int ret, nid = *(int *)arg;
	unsigned long pfn, sect_start_pfn, sect_end_pfn;

	mem_blk->nid = nid;

	sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
	sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
	sect_end_pfn += PAGES_PER_SECTION - 1;
	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
		int page_nid;

		/*
		 * memory block could have several absent sections from start.
		 * skip pfn range from absent section
		 */
		if (!pfn_present(pfn)) {
			/* jump to the last pfn of this section */
			pfn = round_down(pfn + PAGES_PER_SECTION,
					 PAGES_PER_SECTION) - 1;
			continue;
		}

		/*
		 * We need to check if page belongs to nid only for the boot
		 * case, during hotplug we know that all pages in the memory
		 * block belong to the same node.
		 */
		if (system_state == SYSTEM_BOOTING) {
			page_nid = get_nid_for_pfn(pfn);
			if (page_nid < 0)
				continue;
			if (page_nid != nid)
				continue;
		}
		/* first matching pfn is enough: link once and return */
		ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
					&mem_blk->dev.kobj,
					kobject_name(&mem_blk->dev.kobj));
		if (ret)
			return ret;

		return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
				&node_devices[nid]->dev.kobj,
				kobject_name(&node_devices[nid]->dev.kobj));
	}
	/* mem section does not span the specified node */
	return 0;
}

/* unregister memory section under all nodes that it spans */
int unregister_mem_sect_under_nodes(struct memory_block *mem_blk,
				    unsigned long phys_index)
{
	NODEMASK_ALLOC(nodemask_t, unlinked_nodes, GFP_KERNEL);
	unsigned long pfn, sect_start_pfn, sect_end_pfn;

	if (!mem_blk) {
		NODEMASK_FREE(unlinked_nodes);
		return -EFAULT;
	}
	if (!unlinked_nodes)
		return -ENOMEM;
	nodes_clear(*unlinked_nodes);

	sect_start_pfn = section_nr_to_pfn(phys_index);
	sect_end_pfn = sect_start_pfn + PAGES_PER_SECTION - 1;
	for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
		int nid;

		nid = get_nid_for_pfn(pfn);
		if (nid < 0)
			continue;
		if (!node_online(nid))
			continue;
		/* unlink each spanned node only once */
		if (node_test_and_set(nid, *unlinked_nodes))
			continue;
		sysfs_remove_link(&node_devices[nid]->dev.kobj,
			 kobject_name(&mem_blk->dev.kobj));
		sysfs_remove_link(&mem_blk->dev.kobj,
			 kobject_name(&node_devices[nid]->dev.kobj));
	}
	NODEMASK_FREE(unlinked_nodes);
	return 0;
}

/* Link every memory block in [start_pfn, end_pfn) under node @nid. */
int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
{
	return walk_memory_range(start_pfn, end_pfn, (void *)&nid,
					register_mem_sect_under_node);
}

#ifdef CONFIG_HUGETLBFS
/*
 * Handle per node hstate attribute [un]registration on transitions
 * to/from memoryless state.
 */
static void node_hugetlb_work(struct work_struct *work)
{
	struct node *node = container_of(work, struct node, node_work);

	/*
	 * We only get here when a node transitions to/from memoryless state.
	 * We can detect which transition occurred by examining whether the
	 * node has memory now.  hugetlb_register_node() already checks this
	 * so we try to register the attributes.  If that fails, then the
	 * node has transitioned to memoryless, try to unregister the
	 * attributes.
	 */
	if (!hugetlb_register_node(node))
		hugetlb_unregister_node(node);
}

/* Prepare the node's deferred-work item used by the hotplug callback. */
static void init_node_hugetlb_work(int nid)
{
	INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work);
}
/*
 * Memory hotplug notifier: on online/offline of memory on a node,
 * schedule the node's work item to fix up hugetlb attributes.
 */
static int node_memory_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	struct memory_notify *mnb = arg;
	int nid = mnb->status_change_nid;

	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		/*
		 * offload per node hstate [un]registration to a work thread
		 * when transitioning to/from memoryless state.
		 */
		if (nid != NUMA_NO_NODE)
			schedule_work(&node_devices[nid]->node_work);
		break;

	case MEM_GOING_ONLINE:
	case MEM_GOING_OFFLINE:
	case MEM_CANCEL_ONLINE:
	case MEM_CANCEL_OFFLINE:
	default:
		break;
	}

	return NOTIFY_OK;
}
#endif	/* CONFIG_HUGETLBFS */
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

#if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \
    !defined(CONFIG_HUGETLBFS)
/* Stubs when memory hotplug and/or hugetlbfs support is compiled out. */
static inline int node_memory_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	return NOTIFY_OK;
}

static void init_node_hugetlb_work(int nid) { }

#endif
704

705
int __register_one_node(int nid)
706
{
707
	int error;
708
	int cpu;
709

710 711 712
	node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL);
	if (!node_devices[nid])
		return -ENOMEM;
713

714
	error = register_node(node_devices[nid], nid);
715

716 717 718 719
	/* link cpu under this node */
	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == nid)
			register_cpu_under_node(cpu, nid);
720 721
	}

722
	INIT_LIST_HEAD(&node_devices[nid]->access_list);
723 724
	/* initialize work queue for memory hot plug */
	init_node_hugetlb_work(nid);
725

726
	return error;
727 728 729 730
}

/* Tear down node @nid's device, if it was ever registered. */
void unregister_one_node(int nid)
{
	struct node *node = node_devices[nid];

	if (!node)
		return;

	node_devices[nid] = NULL;
	unregister_node(node);
}

/*
 * node states attributes
 */

/* Print the nodemask for @state as a ranged list, newline-terminated. */
static ssize_t print_nodes_state(enum node_states state, char *buf)
{
	int n;

	n = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
		      nodemask_pr_args(&node_states[state]));
	buf[n++] = '\n';
	buf[n] = '\0';
	return n;
}

/* Pairs a device attribute with the node state it reports. */
struct node_attr {
	struct device_attribute attr;
	enum node_states state;
};

/* Common show routine for all node-state attributes. */
static ssize_t show_node_state(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct node_attr *na = container_of(attr, struct node_attr, attr);
	return print_nodes_state(na->state, buf);
}

#define _NODE_ATTR(name, state) \
	{ __ATTR(name, 0444, show_node_state, NULL), state }

/* One read-only attribute per node state, indexed by the state itself. */
static struct node_attr node_state_attr[] = {
	[N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE),
	[N_ONLINE] = _NODE_ATTR(online, N_ONLINE),
	[N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
#endif
	[N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
	[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
};

/* Root-level attribute list; must stay in sync with node_state_attr[]. */
static struct attribute *node_state_attrs[] = {
	&node_state_attr[N_POSSIBLE].attr.attr,
	&node_state_attr[N_ONLINE].attr.attr,
	&node_state_attr[N_NORMAL_MEMORY].attr.attr,
#ifdef CONFIG_HIGHMEM
	&node_state_attr[N_HIGH_MEMORY].attr.attr,
#endif
	&node_state_attr[N_MEMORY].attr.attr,
	&node_state_attr[N_CPU].attr.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = node_state_attrs,
};

static const struct attribute_group *cpu_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

800
#define NODE_CALLBACK_PRI	2	/* lower than SLAB */
801
static int __init register_node_type(void)
L
Linus Torvalds 已提交
802
{
803 804
	int ret;

805 806 807
 	BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES);
 	BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs)-1 != NR_NODE_STATES);

808
	ret = subsys_system_register(&node_subsys, cpu_root_attr_groups);
809
	if (!ret) {
810 811 812 813 814
		static struct notifier_block node_memory_callback_nb = {
			.notifier_call = node_memory_callback,
			.priority = NODE_CALLBACK_PRI,
		};
		register_hotmemory_notifier(&node_memory_callback_nb);
815
	}
816 817 818 819 820 821

	/*
	 * Note:  we're not going to unregister the node class if we fail
	 * to register the node state class attribute files.
	 */
	return ret;
L
Linus Torvalds 已提交
822 823
}
postcore_initcall(register_node_type);