taskstats.c 13.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
/*
 * taskstats.c - Export per-task statistics to userland
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 *           (C) Balbir Singh,   IBM Corp. 2006
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 */

#include <linux/kernel.h>
#include <linux/taskstats_kern.h>
21
#include <linux/tsacct_kern.h>
22
#include <linux/delayacct.h>
23 24
#include <linux/cpumask.h>
#include <linux/percpu.h>
B
Balbir Singh 已提交
25 26 27 28
#include <linux/cgroupstats.h>
#include <linux/cgroup.h>
#include <linux/fs.h>
#include <linux/file.h>
29 30 31
#include <net/genetlink.h>
#include <asm/atomic.h>

32 33 34 35 36 37
/*
 * Maximum length of a cpumask that can be specified in
 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute.
 * parse() rejects any attribute longer than this with -E2BIG.
 */
#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)

38 39
/*
 * Per-cpu counter; prepare_reply() uses its next value as the sequence
 * number of messages that are not replies to a request.
 */
static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };

/* Set to 1 by taskstats_init() once the family and both ops are registered */
static int family_registered;

/* Cache for struct taskstats objects, created in taskstats_init_early() */
struct kmem_cache *taskstats_cache;
41 42 43 44 45 46 47 48 49 50 51 52

/*
 * Generic netlink family descriptor for taskstats. It is registered by
 * taskstats_init() and passed to genlmsg_put()/genlmsg_put_reply() in
 * prepare_reply() when message headers are built.
 */
static struct genl_family family = {
	.id		= GENL_ID_GENERATE,
	.name		= TASKSTATS_GENL_NAME,
	.version	= TASKSTATS_GENL_VERSION,
	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
};

/*
 * Attribute policy attached to taskstats_ops (TASKSTATS_CMD_GET):
 * PID and TGID are u32 values, the two cpumask attributes are strings.
 */
static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
__read_mostly = {
	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },
};

B
Balbir Singh 已提交
56 57 58 59 60
/*
 * Attribute policy attached to cgroupstats_ops (CGROUPSTATS_CMD_GET):
 * the FD attribute is a u32, read with nla_get_u32() in
 * cgroupstats_user_cmd().
 */
static struct nla_policy
cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = {
	[CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 },
};

61 62 63
/* One registered listener, linked on a per-cpu listener_list */
struct listener {
	struct list_head list;	/* link in listener_list.list */
	pid_t pid;		/* netlink pid that exit data is unicast to */
	char valid;		/* 1 when added; cleared by send_cpu_listeners()
				 * when a unicast returns -ECONNREFUSED */
};

67 68 69 70 71 72 73 74 75 76 77
/*
 * Per-cpu list of listeners. @sem is taken for read while
 * send_cpu_listeners() walks @list and for write whenever entries are
 * added or removed. Both members are initialized for every possible cpu
 * in taskstats_init_early().
 */
struct listener_list {
	struct rw_semaphore sem;
	struct list_head list;
};
static DEFINE_PER_CPU(struct listener_list, listener_array);

/*
 * Values for the @isadd argument of add_del_listener(). REGISTER and
 * DEREGISTER are passed by taskstats_user_cmd(); CPU_DONT_CARE is not
 * referenced elsewhere in this file.
 */
enum actions {
	REGISTER,
	DEREGISTER,
	CPU_DONT_CARE
};
78 79

static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
80
				size_t size)
81 82 83 84 85 86 87
{
	struct sk_buff *skb;
	void *reply;

	/*
	 * If new attributes are added, please revisit this allocation
	 */
88
	skb = genlmsg_new(size, GFP_KERNEL);
89 90 91 92 93 94 95
	if (!skb)
		return -ENOMEM;

	if (!info) {
		int seq = get_cpu_var(taskstats_seqnum)++;
		put_cpu_var(taskstats_seqnum);

96
		reply = genlmsg_put(skb, 0, seq, &family, 0, cmd);
97
	} else
98
		reply = genlmsg_put_reply(skb, info, &family, 0, cmd);
99 100 101 102 103 104 105 106 107
	if (reply == NULL) {
		nlmsg_free(skb);
		return -EINVAL;
	}

	*skbp = skb;
	return 0;
}

108 109 110 111
/*
 * Finalize the taskstats message in @skb and unicast it to the listener
 * with nl_pid @pid.
 *
 * If genlmsg_end() fails, @skb is freed here and that error is returned;
 * otherwise the result of genlmsg_unicast() is returned.
 */
static int send_reply(struct sk_buff *skb, pid_t pid)
{
	struct genlmsghdr *hdr = nlmsg_data(nlmsg_hdr(skb));
	void *payload = genlmsg_data(hdr);
	int err;

	err = genlmsg_end(skb, payload);
	if (err < 0) {
		nlmsg_free(skb);
		return err;
	}

	return genlmsg_unicast(skb, pid);
}

126 127 128
/*
 * Send taskstats data in @skb to listeners registered for @cpu's exit data
 */
129 130
static void send_cpu_listeners(struct sk_buff *skb,
					struct listener_list *listeners)
131
{
132
	struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
133 134 135
	struct listener *s, *tmp;
	struct sk_buff *skb_next, *skb_cur = skb;
	void *reply = genlmsg_data(genlhdr);
136
	int rc, delcount = 0;
137 138 139 140

	rc = genlmsg_end(skb, reply);
	if (rc < 0) {
		nlmsg_free(skb);
141
		return;
142 143 144
	}

	rc = 0;
145
	down_read(&listeners->sem);
146
	list_for_each_entry(s, &listeners->list, list) {
147 148 149
		skb_next = NULL;
		if (!list_is_last(&s->list, &listeners->list)) {
			skb_next = skb_clone(skb_cur, GFP_KERNEL);
150
			if (!skb_next)
151 152
				break;
		}
153 154
		rc = genlmsg_unicast(skb_cur, s->pid);
		if (rc == -ECONNREFUSED) {
155 156
			s->valid = 0;
			delcount++;
157 158 159
		}
		skb_cur = skb_next;
	}
160
	up_read(&listeners->sem);
161

162 163 164
	if (skb_cur)
		nlmsg_free(skb_cur);

165
	if (!delcount)
166
		return;
167 168 169 170 171 172 173 174 175 176

	/* Delete invalidated entries */
	down_write(&listeners->sem);
	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
		if (!s->valid) {
			list_del(&s->list);
			kfree(s);
		}
	}
	up_write(&listeners->sem);
177 178
}

179
/*
 * Fill @stats with the statistics of a single task.
 *
 * @pid:   pid to look up when @tsk is NULL.
 * @tsk:   task to report on, or NULL to look it up by @pid.
 * @stats: output buffer; zeroed before being filled.
 *
 * A reference on the task is held while its fields are read and dropped
 * before returning. Returns 0 on success or -ESRCH if no task with @pid
 * is found.
 */
static int fill_pid(pid_t pid, struct task_struct *tsk,
		struct taskstats *stats)
{
	if (tsk) {
		get_task_struct(tsk);
	} else {
		rcu_read_lock();
		tsk = find_task_by_pid(pid);
		if (tsk)
			get_task_struct(tsk);
		rcu_read_unlock();
		if (!tsk)
			return -ESRCH;
	}

	memset(stats, 0, sizeof(*stats));

	/*
	 * Each accounting subsystem adds calls to its functions to
	 * fill in relevant parts of struct taskstats as follows
	 *
	 *	per-task-foo(stats, tsk);
	 */
	delayacct_add_tsk(stats, tsk);

	/* fill in basic acct fields */
	stats->version = TASKSTATS_VERSION;
	stats->nvcsw = tsk->nvcsw;
	stats->nivcsw = tsk->nivcsw;
	bacct_add_tsk(stats, tsk);

	/* fill in extended acct fields */
	xacct_add_tsk(stats, tsk);

	put_task_struct(tsk);
	return 0;
}

220
/*
 * Fill @stats with statistics aggregated over a thread group.
 *
 * @tgid:  id to look up when @first is NULL.
 * @first: a task of the group, or NULL to look it up by @tgid.
 * @stats: output buffer.
 *
 * Starts from first->signal->stats when present (zeroes otherwise) and
 * adds every thread whose exit_state is zero. Returns 0 on success or
 * -ESRCH if the task is not found or its sighand cannot be locked;
 * stats->version is set in either case.
 */
static int fill_tgid(pid_t tgid, struct task_struct *first,
		struct taskstats *stats)
{
	struct task_struct *t;
	unsigned long flags;
	int rc = -ESRCH;

	/*
	 * Add additional stats from live tasks except zombie thread group
	 * leaders who are already counted with the dead tasks
	 */
	rcu_read_lock();
	if (first == NULL)
		first = find_task_by_pid(tgid);

	if (first == NULL || !lock_task_sighand(first, &flags))
		goto out;

	if (first->signal->stats == NULL)
		memset(stats, 0, sizeof(*stats));
	else
		memcpy(stats, first->signal->stats, sizeof(*stats));

	t = first;
	do {
		if (!t->exit_state) {
			/*
			 * Accounting subsystem can call its functions here to
			 * fill in relevant parts of struct taskstats as follows
			 *
			 *	per-task-foo(stats, tsk);
			 */
			delayacct_add_tsk(stats, t);

			stats->nvcsw += t->nvcsw;
			stats->nivcsw += t->nivcsw;
		}
	} while_each_thread(first, t);

	unlock_task_sighand(first, &flags);
	rc = 0;
out:
	rcu_read_unlock();

	stats->version = TASKSTATS_VERSION;
	/*
	 * Accounting subsystems can also add calls here to modify
	 * fields of taskstats.
	 */
	return rc;
}


/*
 * Add @tsk's per-task statistics to its thread group's
 * tsk->signal->stats, under the sighand siglock. Does nothing when the
 * group has no stats structure.
 */
static void fill_tgid_exit(struct task_struct *tsk)
{
	unsigned long flags;

	spin_lock_irqsave(&tsk->sighand->siglock, flags);
	if (tsk->signal->stats) {
		/*
		 * Each accounting subsystem calls its functions here to
		 * accumulate its per-task stats for tsk, into the per-tgid
		 * structure
		 *
		 *	per-task-foo(tsk->signal->stats, tsk);
		 */
		delayacct_add_tsk(tsk->signal->stats, tsk);
	}
	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
}

293 294 295 296 297 298
/*
 * Register or deregister listener @pid on the per-cpu listener lists of
 * every cpu set in *@maskp.
 *
 * @pid:   netlink pid of the listener.
 * @maskp: cpus to act on; must be a subset of cpu_possible_map.
 * @isadd: REGISTER to add a listener entry per cpu, anything else to
 *         remove one.
 *
 * Returns 0 on success, -EINVAL if the mask contains an impossible cpu,
 * or -ENOMEM if a listener entry cannot be allocated. On -ENOMEM the
 * deregister path is run over the whole mask, so entries for @pid are
 * removed again (one per cpu) before the error is returned.
 */
static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
{
	struct listener_list *listeners;
	struct listener *s, *tmp;
	unsigned int cpu;
	cpumask_t mask = *maskp;
	int ret = 0;

	if (!cpus_subset(mask, cpu_possible_map))
		return -EINVAL;

	if (isadd == REGISTER) {
		for_each_cpu_mask(cpu, mask) {
			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
					 cpu_to_node(cpu));
			if (!s) {
				/* Report the failure instead of success */
				ret = -ENOMEM;
				goto cleanup;
			}
			s->pid = pid;
			INIT_LIST_HEAD(&s->list);
			s->valid = 1;

			listeners = &per_cpu(listener_array, cpu);
			down_write(&listeners->sem);
			list_add(&s->list, &listeners->list);
			up_write(&listeners->sem);
		}
		return 0;
	}

	/* Deregister or cleanup */
cleanup:
	for_each_cpu_mask(cpu, mask) {
		listeners = &per_cpu(listener_array, cpu);
		down_write(&listeners->sem);
		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
			if (s->pid == pid) {
				list_del(&s->list);
				kfree(s);
				break;
			}
		}
		up_write(&listeners->sem);
	}
	return ret;
}

/*
 * Parse the cpu list string carried by attribute @na into *@mask.
 *
 * Returns 1 if @na is NULL (attribute not supplied), -E2BIG if the
 * attribute is longer than TASKSTATS_CPUMASK_MAXLEN, -EINVAL if it is
 * empty, -ENOMEM if the temporary buffer cannot be allocated, and
 * otherwise the result of cpulist_parse() (callers treat 0 as success).
 */
static int parse(struct nlattr *na, cpumask_t *mask)
{
	char *buf;
	int len, ret;

	if (!na)
		return 1;

	len = nla_len(na);
	if (len > TASKSTATS_CPUMASK_MAXLEN)
		return -E2BIG;
	if (len < 1)
		return -EINVAL;

	buf = kmalloc(len, GFP_KERNEL);
	if (buf == NULL)
		return -ENOMEM;

	nla_strlcpy(buf, na, len);
	ret = cpulist_parse(buf, *mask);
	kfree(buf);

	return ret;
}

360
/*
 * Append one nested aggregate attribute to @skb: the id attribute
 * (@type carrying @pid) followed by reserved space for a struct taskstats.
 *
 * @type: TASKSTATS_TYPE_PID selects the AGGR_PID nest, any other value
 *        selects AGGR_TGID.
 *
 * Returns a pointer to the reserved struct taskstats for the caller to
 * fill, or NULL if any attribute could not be added. Nothing is undone
 * on failure; callers free @skb.
 */
static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
{
	struct nlattr *nest, *stats_attr;
	int aggr;

	if (type == TASKSTATS_TYPE_PID)
		aggr = TASKSTATS_TYPE_AGGR_PID;
	else
		aggr = TASKSTATS_TYPE_AGGR_TGID;

	nest = nla_nest_start(skb, aggr);
	if (nest == NULL)
		return NULL;

	if (nla_put(skb, type, sizeof(pid), &pid) < 0)
		return NULL;

	stats_attr = nla_reserve(skb, TASKSTATS_TYPE_STATS,
				 sizeof(struct taskstats));
	if (stats_attr == NULL)
		return NULL;

	nla_nest_end(skb, nest);

	return nla_data(stats_attr);
}

B
Balbir Singh 已提交
384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
{
	int rc = 0;
	struct sk_buff *rep_skb;
	struct cgroupstats *stats;
	struct nlattr *na;
	size_t size;
	u32 fd;
	struct file *file;
	int fput_needed;

	na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
	if (!na)
		return -EINVAL;

	fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
	file = fget_light(fd, &fput_needed);
	if (file) {
		size = nla_total_size(sizeof(struct cgroupstats));

		rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
					size);
		if (rc < 0)
			goto err;

		na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
					sizeof(struct cgroupstats));
		stats = nla_data(na);
		memset(stats, 0, sizeof(*stats));

		rc = cgroupstats_build(stats, file->f_dentry);
		if (rc < 0)
			goto err;

		fput_light(file, fput_needed);
		return send_reply(rep_skb, info->snd_pid);
	}

err:
	if (file)
		fput_light(file, fput_needed);
	nlmsg_free(rep_skb);
	return rc;
}

429
static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
430 431 432
{
	int rc = 0;
	struct sk_buff *rep_skb;
433
	struct taskstats *stats;
434
	size_t size;
435 436 437 438 439 440 441 442 443 444 445 446 447
	cpumask_t mask;

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, REGISTER);

	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
	if (rc < 0)
		return rc;
	if (rc == 0)
		return add_del_listener(info->snd_pid, &mask, DEREGISTER);
448 449 450 451 452 453 454

	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

455
	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
456 457 458
	if (rc < 0)
		return rc;

459
	rc = -EINVAL;
460 461
	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
462 463
		stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
		if (!stats)
464
			goto err;
465

466 467
		rc = fill_pid(pid, NULL, stats);
		if (rc < 0)
468
			goto err;
469 470
	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
471 472
		stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
		if (!stats)
473
			goto err;
474

475 476
		rc = fill_tgid(tgid, NULL, stats);
		if (rc < 0)
477
			goto err;
478
	} else
479 480
		goto err;

481
	return send_reply(rep_skb, info->snd_pid);
482 483 484 485 486
err:
	nlmsg_free(rep_skb);
	return rc;
}

487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510
/*
 * Make sure @tsk's thread group has a tsk->signal->stats structure,
 * unless thread_group_empty() reports true for @tsk.
 *
 * The allocation happens outside the siglock; if sig->stats is already
 * set by the time the lock is held, the new object is freed again.
 * Returns sig->stats, which may be NULL.
 */
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
	struct signal_struct *sig = tsk->signal;
	struct taskstats *fresh;

	if (sig->stats || thread_group_empty(tsk))
		return sig->stats;

	/* No problem if kmem_cache_zalloc() fails */
	fresh = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL);

	spin_lock_irq(&tsk->sighand->siglock);
	if (sig->stats == NULL) {
		sig->stats = fresh;
		fresh = NULL;
	}
	spin_unlock_irq(&tsk->sighand->siglock);

	if (fresh != NULL)
		kmem_cache_free(taskstats_cache, fresh);

	return sig->stats;
}

511
/* Send pid data out on exit */
512
void taskstats_exit(struct task_struct *tsk, int group_dead)
513 514
{
	int rc;
515
	struct listener_list *listeners;
516
	struct taskstats *stats;
517 518 519 520
	struct sk_buff *rep_skb;
	size_t size;
	int is_thread_group;

521
	if (!family_registered)
522 523 524 525 526 527 528 529
		return;

	/*
	 * Size includes space for nested attributes
	 */
	size = nla_total_size(sizeof(u32)) +
		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);

530
	is_thread_group = !!taskstats_tgid_alloc(tsk);
531 532 533 534 535 536 537
	if (is_thread_group) {
		/* PID + STATS + TGID + STATS */
		size = 2 * size;
		/* fill the tsk->signal->stats structure */
		fill_tgid_exit(tsk);
	}

538 539 540 541
	listeners = &__raw_get_cpu_var(listener_array);
	if (list_empty(&listeners->list))
		return;

542
	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size);
543
	if (rc < 0)
544
		return;
545

546 547
	stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid);
	if (!stats)
548
		goto err;
549

550 551
	rc = fill_pid(tsk->pid, tsk, stats);
	if (rc < 0)
552
		goto err;
553 554

	/*
555
	 * Doesn't matter if tsk is the leader or the last group member leaving
556
	 */
557
	if (!is_thread_group || !group_dead)
558
		goto send;
559

560 561
	stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid);
	if (!stats)
562
		goto err;
563 564

	memcpy(stats, tsk->signal->stats, sizeof(*stats));
565

566
send:
567
	send_cpu_listeners(rep_skb, listeners);
568
	return;
569
err:
570 571 572 573 574
	nlmsg_free(rep_skb);
}

static struct genl_ops taskstats_ops = {
	.cmd		= TASKSTATS_CMD_GET,
575
	.doit		= taskstats_user_cmd,
576 577 578
	.policy		= taskstats_cmd_get_policy,
};

B
Balbir Singh 已提交
579 580 581 582 583 584
/*
 * Operation descriptor for CGROUPSTATS_CMD_GET: handled by
 * cgroupstats_user_cmd() with cgroupstats_cmd_get_policy. Registered in
 * taskstats_init().
 */
static struct genl_ops cgroupstats_ops = {
	.cmd		= CGROUPSTATS_CMD_GET,
	.doit		= cgroupstats_user_cmd,
	.policy		= cgroupstats_cmd_get_policy,
};

585 586 587
/*
 * Needed early in initialization: creates the taskstats slab cache and
 * initializes the listener list and its semaphore for every possible cpu.
 */
void __init taskstats_init_early(void)
{
	unsigned int cpu;

	taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
	for_each_possible_cpu(cpu) {
		struct listener_list *ll = &per_cpu(listener_array, cpu);

		INIT_LIST_HEAD(&ll->list);
		init_rwsem(&ll->sem);
	}
}

/*
 * Register the taskstats generic netlink family together with the
 * taskstats and cgroupstats operations, then set family_registered.
 *
 * Returns 0 on success. On failure, whatever was registered so far is
 * unregistered again and the error is returned.
 */
static int __init taskstats_init(void)
{
	int rc;

	rc = genl_register_family(&family);
	if (rc)
		return rc;

	rc = genl_register_ops(&family, &taskstats_ops);
	if (rc < 0) {
		genl_unregister_family(&family);
		return rc;
	}

	rc = genl_register_ops(&family, &cgroupstats_ops);
	if (rc < 0) {
		genl_unregister_ops(&family, &taskstats_ops);
		genl_unregister_family(&family);
		return rc;
	}

	family_registered = 1;
	printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
	return 0;
}

/*
 * late initcall ensures initialization of statistics collection
 * mechanisms precedes initialization of the taskstats interface.
 * Until taskstats_init() has set family_registered, taskstats_exit()
 * returns without doing anything.
 */
late_initcall(taskstats_init);