// SPDX-License-Identifier: GPL-2.0+
/*
 * User interface for ARM v8 MPAM
 *
 * Copyright (C) 2018-2019 Huawei Technologies Co., Ltd
 *
 * Author: Xie XiuQi <xiexiuqi@huawei.com>
 *
 * Code was partially borrowed from arch/x86/kernel/cpu/intel_rdt*.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * More information about MPAM can be found in the Arm Architecture Reference
 * Manual.
 *
 * https://static.docs.arm.com/ddi0598/a/DDI0598_MPAM_supp_armv8a.pdf
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/cpu.h>
#include <linux/fs.h>
#include <linux/fs_parser.h>
#include <linux/sysfs.h>
#include <linux/kernfs.h>
#include <linux/seq_buf.h>
#include <linux/seq_file.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
#include <linux/resctrlfs.h>

#include <uapi/linux/magic.h>

#include <asm/resctrl.h>
44
#include <asm/mpam.h>
X
Xie XiuQi 已提交
45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81

/*
 * Static keys for the resctrl feature. resctrl_enable_key is tested in
 * resctrl_get_tree() to refuse a second mount; all three keys are disabled
 * again in resctrl_kill_sb().
 */
DEFINE_STATIC_KEY_FALSE(resctrl_enable_key);
DEFINE_STATIC_KEY_FALSE(resctrl_mon_enable_key);
DEFINE_STATIC_KEY_FALSE(resctrl_alloc_enable_key);
/* kernfs root of the resctrl hierarchy, created in resctrl_group_setup_root() */
static struct kernfs_root *resctrl_root;
/* The default group; it is the private data of the root directory */
struct resctrl_group resctrl_group_default;
/* Groups linked through their resctrl_group_list member */
LIST_HEAD(resctrl_all_groups);

/* Kernel fs node for "info" directory under root */
static struct kernfs_node *kn_info;

/* Kernel fs node for "mon_groups" directory under root */
static struct kernfs_node *kn_mongrp;

/* Kernel fs node for "mon_data" directory under root */
static struct kernfs_node *kn_mondata;

/* set uid and gid of resctrl_group dirs and files to that of the creator */
static int resctrl_group_kn_set_ugid(struct kernfs_node *kn)
{
	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
				.ia_uid = current_fsuid(),
				.ia_gid = current_fsgid(), };

	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
		return 0;

	return kernfs_setattr(kn, &iattr);
}

static int resctrl_group_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
{
	struct kernfs_node *kn;
	int ret;

	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
82
				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
X
Xie XiuQi 已提交
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
				  0, rft->kf_ops, rft, NULL, NULL);
	if (IS_ERR(kn))
		return PTR_ERR(kn);

	ret = resctrl_group_kn_set_ugid(kn);
	if (ret) {
		kernfs_remove(kn);
		return ret;
	}

	return 0;
}

/* File table handed over by register_resctrl_specific_files(), and its length */
static struct rftype *res_common_files;
static size_t res_common_files_len;

/*
 * Remember the table of files that resctrl_group_add_files() will create.
 *
 * @files: array of file descriptions, must not be NULL
 * @len:   number of entries in @files
 *
 * Only the first registration is accepted. Returns 0 on success, -EINVAL
 * when a table is already registered or @files is NULL.
 */
int register_resctrl_specific_files(struct rftype *files, size_t len)
{
	int ret = -EINVAL;

	if (res_common_files) {
		pr_err("Only allowed register specific files once\n");
	} else if (!files) {
		pr_err("Invalid input files\n");
	} else {
		res_common_files = files;
		res_common_files_len = len;
		ret = 0;
	}

	return ret;
}

static int __resctrl_group_add_files(struct kernfs_node *kn, unsigned long fflags,
				     struct rftype *rfts, int len)
{
	struct rftype *rft;
121
	int ret = 0;
X
Xie XiuQi 已提交
122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145

	lockdep_assert_held(&resctrl_group_mutex);

	for (rft = rfts; rft < rfts + len; rft++) {
		if (rft->enable && !rft->enable(NULL))
			continue;

		if ((fflags & rft->fflags) == rft->fflags) {
			ret = resctrl_group_add_file(kn, rft);
			if (ret)
				goto error;
		}
	}

	return 0;
error:
	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
	while (--rft >= rfts) {
		if ((fflags & rft->fflags) == rft->fflags)
			kernfs_remove_by_name(kn, rft->name);
	}
	return ret;
}

146
int resctrl_group_add_files(struct kernfs_node *kn, unsigned long fflags)
X
Xie XiuQi 已提交
147
{
148
	int ret = 0;
X
Xie XiuQi 已提交
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222

	if (res_common_files)
		ret = __resctrl_group_add_files(kn, fflags, res_common_files,
						res_common_files_len);

	return ret;
}

/*
 * Map a kernfs node to the resctrl_group it belongs to.
 *
 * resctrl_group directories can only be created in the root directory, so
 * there are just two cases: a directory carries its resctrl_group in
 * "priv", and a file finds it in the "priv" of its parent directory.
 * "info" and its subdirectories have no resctrl_group, so NULL is returned
 * for them.
 */
static struct resctrl_group *kernfs_to_resctrl_group(struct kernfs_node *kn)
{
	/* A file: the group hangs off the containing directory. */
	if (kernfs_type(kn) != KERNFS_DIR)
		return kn->parent->priv;

	/* "info" and the directories directly below it have no group. */
	if (kn == kn_info || kn->parent == kn_info)
		return NULL;

	return kn->priv;
}

/*
 * Look up the resctrl_group behind @kn, pin it and take resctrl_group_mutex.
 *
 * If @kn has no group, NULL is returned and nothing is taken. Otherwise the
 * group's waitcount is raised, kernfs active protection on @kn is broken and
 * the mutex is acquired. NULL is also returned when the group is flagged
 * RDT_DELETED, but in that case the mutex is still held and the waitcount is
 * still raised, so the caller has to call resctrl_group_kn_unlock(@kn) on
 * both the NULL and the non-NULL path.
 */
struct resctrl_group *resctrl_group_kn_lock_live(struct kernfs_node *kn)
{
	struct resctrl_group *rdtgrp = kernfs_to_resctrl_group(kn);

	if (!rdtgrp)
		return NULL;

	atomic_inc(&rdtgrp->waitcount);
	kernfs_break_active_protection(kn);

	mutex_lock(&resctrl_group_mutex);

	/* Was this group deleted while we waited? */
	if (rdtgrp->flags & RDT_DELETED)
		return NULL;

	return rdtgrp;
}

void resctrl_group_kn_unlock(struct kernfs_node *kn)
{
	struct resctrl_group *rdtgrp = kernfs_to_resctrl_group(kn);

	if (!rdtgrp)
		return;

	mutex_unlock(&resctrl_group_mutex);

	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
	    (rdtgrp->flags & RDT_DELETED)) {
		kernfs_unbreak_active_protection(kn);
		kernfs_put(rdtgrp->kn);
		kfree(rdtgrp);
	} else {
		kernfs_unbreak_active_protection(kn);
	}
}

/*
 * Apply the mount options collected in @ctx.
 *
 * The controls are first put into their basic state (extended controls
 * disabled, basic controls enabled, CDP disabled). Then each feature
 * requested in @ctx is enabled in a fixed order, stopping at the first
 * failure.
 *
 * Returns 0 on success or the error of the first *_enable() call that
 * failed.
 */
static int resctrl_enable_ctx(struct resctrl_fs_context *ctx)
{
	int ret = 0;

	extend_ctrl_disable();
	basic_ctrl_enable();
	disable_cdp();

	if (ctx->enable_cdpl3)
		ret = cdpl3_enable();

	if (!ret && ctx->enable_cdpl2)
		ret = cdpl2_enable();

	if (!ret && ctx->enable_mbMax)
		ret = mbMax_enable();

	if (!ret && ctx->enable_mbMin)
		ret = mbMin_enable();

	if (!ret && ctx->enable_mbHdl)
		ret = mbHdl_enable();

	if (!ret && ctx->enable_mbPrio)
		ret = mbPrio_enable();

	if (!ret && ctx->enable_caPbm)
		ret = caPbm_enable();

	if (!ret && ctx->enable_caMax)
		ret = caMax_enable();

	if (!ret && ctx->enable_caPrio)
		ret = caPrio_enable();

	return ret;
}

259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
/*
 * Create and activate the directory @name under @parent_kn, with @prgrp as
 * its private data and the mode of the parent directory.
 *
 * @dest_kn: optional; receives the new node, but only on success.
 *
 * Returns 0 on success or a negative errno. On failure the directory is
 * removed and *@dest_kn is left untouched.
 */
static int
mongroup_create_dir(struct kernfs_node *parent_kn, struct resctrl_group *prgrp,
		    char *name, struct kernfs_node **dest_kn)
{
	struct kernfs_node *kn;
	int ret;

	/* create the directory */
	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
	if (IS_ERR(kn))
		return PTR_ERR(kn);

	/*
	 * Set ownership before taking the extra reference below, so the
	 * failure path has no extra reference to drop.
	 */
	ret = resctrl_group_kn_set_ugid(kn);
	if (ret) {
		kernfs_remove(kn);
		return ret;
	}

	/*
	 * Extra reference so that the node stays accessible through the
	 * pointer handed back to the caller.
	 */
	kernfs_get(kn);

	if (dest_kn)
		*dest_kn = kn;

	kernfs_activate(kn);

	return 0;
}

294 295 296
/*
 * Release the ids held by @prgrp: the intpartid closid when @prgrp is a
 * control group with a non-zero intpartid, and always the rmid.
 */
static void mkdir_mondata_all_prepare_clean(struct resctrl_group *prgrp)
{
	if (prgrp->type == RDTCTRL_GROUP && prgrp->closid.intpartid)
		closid_free(prgrp->closid.intpartid);
	rmid_free(prgrp->mon.rmid);
}

static int mkdir_mondata_all_prepare(struct resctrl_group *rdtgrp)
{
303
	struct resctrl_group *prgrp;
304 305

	if (rdtgrp->type == RDTMON_GROUP) {
306
		prgrp = rdtgrp->mon.parent;
307
		rdtgrp->closid.intpartid = prgrp->closid.intpartid;
308 309
	}

310
	return 0;
311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
}

/*
 * This creates a directory mon_data which contains the monitored data.
 *
 * mon_data has one directory for each domain, which is named
 * in the format mon_<domain_name>_<domain_id>. For example, a mon_data
 * with L3 domain looks as below:
 * ./mon_data:
 * mon_L3_00
 * mon_L3_01
 * mon_L3_02
 * ...
 *
 * Each domain directory has one file per event:
 * ./mon_L3_00/:
 * llc_occupancy
 *
 */
static int mkdir_mondata_all(struct kernfs_node *parent_kn,
			     struct resctrl_group *prgrp,
			     struct kernfs_node **dest_kn)
{
	struct kernfs_node *mon_data_kn;
	int err;

	/* The "mon_data" directory itself comes first. */
	err = mongroup_create_dir(parent_kn, prgrp, "mon_data", &mon_data_kn);
	if (err)
		return err;

	if (dest_kn)
		*dest_kn = mon_data_kn;

	/* Then fill in its subdirectories; undo the directory on failure. */
	err = resctrl_mkdir_mondata_all_subdir(mon_data_kn, prgrp);
	if (err) {
		kernfs_remove(mon_data_kn);
		return err;
	}

	kernfs_activate(mon_data_kn);

	return 0;
}

360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381
/*
 * Force every CPU in @r's cpu_mask to reprogram its closid/rmid state:
 * the per-cpu default closid is set to ~0 and update_closid_rmid() is then
 * run on those CPUs.
 */
static void resctrl_cdp_update_cpus_state(struct resctrl_group *r)
{
	int cpu;

	/*
	 * Rationale as given by the original author: with CDP on, tasks in
	 * the default group (closid 0, rmid 0) would not get the proper
	 * partid_i/pmg_i and partid_d/pmg_d written into the MPAMx_ELx
	 * sysregs by mpam_sched_in(), called from __switch_to(), because the
	 * CPU's default closid and rmid are also 0. Setting the per-cpu
	 * default closid to a non-zero value and calling
	 * update_closid_rmid() makes each CPU write its MPAMx_ELx sysregs
	 * when the resctrl filesystem is mounted.
	 */
	for_each_cpu(cpu, &r->cpu_mask)
		per_cpu(pqr_state.default_closid, cpu) = ~0;

	update_closid_rmid(&r->cpu_mask, NULL);
}

X
Xie XiuQi 已提交
382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400
static int resctrl_get_tree(struct fs_context *fc)
{
	int ret;
	struct resctrl_fs_context *ctx = resctrl_fc2context(fc);

	cpus_read_lock();
	mutex_lock(&resctrl_group_mutex);
	/*
	 * resctrl file system can only be mounted once.
	 */
	if (static_branch_unlikely(&resctrl_enable_key)) {
		ret = -EBUSY;
		goto out;
	}

	ret = resctrl_enable_ctx(ctx);
	if (ret)
		goto out;

401 402 403
	ret = schemata_list_init();
	if (ret)
		goto out;
404

405 406
	ret = resctrl_id_init();
	if (ret)
407
		goto out_schema;
X
Xie XiuQi 已提交
408

409
	ret = resctrl_group_create_info_dir(resctrl_group_default.kn, &kn_info);
X
Xie XiuQi 已提交
410
	if (ret)
411
		goto out_schema;
X
Xie XiuQi 已提交
412 413 414 415 416 417 418 419 420 421

	if (resctrl_mon_capable) {
		ret = mongroup_create_dir(resctrl_group_default.kn,
					  NULL, "mon_groups",
					  &kn_mongrp);
		if (ret)
			goto out_info;

		kernfs_get(kn_mongrp);

422 423 424 425
		ret = mkdir_mondata_all_prepare(&resctrl_group_default);
		if (ret < 0)
			goto out_mongrp;

X
Xie XiuQi 已提交
426 427 428 429 430 431 432 433 434 435 436 437 438
		ret = mkdir_mondata_all(resctrl_group_default.kn,
					&resctrl_group_default, &kn_mondata);
		if (ret)
			goto out_mongrp;

		kernfs_get(kn_mondata);
		resctrl_group_default.mon.mon_data_kn = kn_mondata;
	}

	ret = kernfs_get_tree(fc);
	if (ret < 0)
		goto out_mondata;

439 440
	resctrl_cdp_update_cpus_state(&resctrl_group_default);

X
Xie XiuQi 已提交
441 442 443 444 445 446 447 448 449 450 451 452
	post_resctrl_mount();

	goto out;

out_mondata:
	if (resctrl_mon_capable)
		kernfs_remove(kn_mondata);
out_mongrp:
	if (resctrl_mon_capable)
		kernfs_remove(kn_mongrp);
out_info:
	kernfs_remove(kn_info);
453 454
out_schema:
	schemata_list_destroy();
X
Xie XiuQi 已提交
455 456 457 458 459 460 461 462
out:
	rdt_last_cmd_clear();
	mutex_unlock(&resctrl_group_mutex);
	cpus_read_unlock();

	return ret;
}

463 464
static inline bool
is_task_match_resctrl_group(struct task_struct *t, struct resctrl_group *r)
X
Xie XiuQi 已提交
465
{
466
	return (t->closid == r->closid.intpartid);
X
Xie XiuQi 已提交
467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483
}

/*
 * Move tasks from one to the other group. If @from is NULL, then all tasks
 * in the systems are moved unconditionally (used for teardown).
 *
 * If @mask is not NULL the cpus on which moved tasks are running are set
 * in that mask so the update smp function call is restricted to affected
 * cpus.
 *
 * The task list is walked under tasklist_lock (read side).
 */
static void resctrl_move_group_tasks(struct resctrl_group *from, struct resctrl_group *to,
				 struct cpumask *mask)
{
	struct task_struct *p, *t;

	read_lock(&tasklist_lock);
	for_each_process_thread(p, t) {
		if (!from || is_task_match_resctrl_group(t, from)) {
			t->closid = resctrl_navie_closid(to->closid);
			t->rmid = resctrl_navie_rmid(to->mon.rmid);

#ifdef CONFIG_SMP
			/*
			 * This is safe on x86 w/o barriers as the ordering
			 * of writing to task_cpu() and t->on_cpu is
			 * reverse to the reading here. The detection is
			 * inaccurate as tasks might move or schedule
			 * before the smp function call takes place. In
			 * such a case the function call is pointless, but
			 * there is no other side effect.
			 */
			if (mask && t->on_cpu)
				cpumask_set_cpu(task_cpu(t), mask);
#endif
		}
	}
	read_unlock(&tasklist_lock);
}

/*
 * Release every child monitor group on @rdtgrp's crdtgrp_list: free its
 * rmid, unlink it from the list and free the group structure.
 */
static void free_all_child_rdtgrp(struct resctrl_group *rdtgrp)
{
	struct resctrl_group *sentry, *stmp;
	struct list_head *head;

	head = &rdtgrp->mon.crdtgrp_list;
	list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
		/* rmid may not be used */
		rmid_free(sentry->mon.rmid);
		list_del(&sentry->mon.crdtgrp_list);
		kfree(sentry);
	}
}

/*
 * Forcibly remove all of subdirectories under root.
 *
 * All tasks are moved to the default group first. The child monitor groups
 * of every group on resctrl_all_groups are then released. Each non-default
 * group gives its CPUs back to the default group and is removed and freed.
 * Finally the root "info", "mon_groups" and "mon_data" directories are
 * removed.
 */
static void rmdir_all_sub(void)
{
	struct resctrl_group *rdtgrp, *tmp;

	/* Move all tasks to the default resource group */
	resctrl_move_group_tasks(NULL, &resctrl_group_default, NULL);

	list_for_each_entry_safe(rdtgrp, tmp, &resctrl_all_groups, resctrl_group_list) {
		/* Free any child rmids */
		free_all_child_rdtgrp(rdtgrp);

		/* Remove each resctrl_group other than root */
		if (rdtgrp == &resctrl_group_default)
			continue;

		/*
		 * Give any CPUs back to the default group. We cannot copy
		 * cpu_online_mask because a CPU might have executed the
		 * offline callback already, but is still marked online.
		 */
		cpumask_or(&resctrl_group_default.cpu_mask,
			   &resctrl_group_default.cpu_mask, &rdtgrp->cpu_mask);

		rmid_free(rdtgrp->mon.rmid);

		kernfs_remove(rdtgrp->kn);
		list_del(&rdtgrp->resctrl_group_list);
		kfree(rdtgrp);
	}
	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
	update_closid_rmid(cpu_online_mask, &resctrl_group_default);

	kernfs_remove(kn_info);
	kernfs_remove(kn_mongrp);
	kernfs_remove(kn_mondata);
}

/*
 * ->kill_sb() handler: tear the resctrl filesystem down on unmount.
 *
 * With the CPU hotplug read lock and resctrl_group_mutex held, the
 * resources are reset, the schemata list is destroyed, all groups below
 * root are removed, the three resctrl static keys are disabled and the
 * kernfs superblock is killed.
 */
static void resctrl_kill_sb(struct super_block *sb)
{
	cpus_read_lock();
	mutex_lock(&resctrl_group_mutex);

	resctrl_resource_reset();

	schemata_list_destroy();

	rmdir_all_sub();
	static_branch_disable_cpuslocked(&resctrl_alloc_enable_key);
	static_branch_disable_cpuslocked(&resctrl_mon_enable_key);
	static_branch_disable_cpuslocked(&resctrl_enable_key);
	kernfs_kill_sb(sb);
	mutex_unlock(&resctrl_group_mutex);
	cpus_read_unlock();
}

579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602
/*
 * Identifiers of the mount options accepted by the resctrl filesystem.
 * Each value is bound to its option string in resctrl_fs_parameters[] and
 * handled in resctrl_parse_param().
 */
enum resctrl_param {
	Opt_cdpl3,
	Opt_cdpl2,
	Opt_mbMax,
	Opt_mbMin,
	Opt_mbHdl,
	Opt_mbPrio,
	Opt_caPbm,
	Opt_caMax,
	Opt_caPrio,
	nr__resctrl_params
};

/*
 * Mount option table: every option is a plain flag (no value), mapped to
 * its enum resctrl_param identifier. Terminated by an empty entry.
 */
static const struct fs_parameter_spec resctrl_fs_parameters[] = {
	fsparam_flag("cdpl3",        Opt_cdpl3),
	fsparam_flag("cdpl2",        Opt_cdpl2),
	fsparam_flag("mbMax",        Opt_mbMax),
	fsparam_flag("mbMin",        Opt_mbMin),
	fsparam_flag("mbHdl",        Opt_mbHdl),
	fsparam_flag("mbPrio",       Opt_mbPrio),
	fsparam_flag("caPbm",        Opt_caPbm),
	fsparam_flag("caMax",        Opt_caMax),
	fsparam_flag("caPrio",       Opt_caPrio),
	{}
};

static int resctrl_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643
	struct resctrl_fs_context *ctx = resctrl_fc2context(fc);
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, resctrl_fs_parameters, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_cdpl3:
		ctx->enable_cdpl3 = true;
		return 0;
	case Opt_cdpl2:
		ctx->enable_cdpl2 = true;
		return 0;
	case Opt_mbMax:
		ctx->enable_mbMax = true;
		return 0;
	case Opt_mbMin:
		ctx->enable_mbMin = true;
		return 0;
	case Opt_mbHdl:
		ctx->enable_mbHdl = true;
		return 0;
	case Opt_mbPrio:
		ctx->enable_mbPrio = true;
		return 0;
	case Opt_caPbm:
		ctx->enable_caPbm = true;
		return 0;
	case Opt_caMax:
		ctx->enable_caMax = true;
		return 0;
	case Opt_caPrio:
		ctx->enable_caPrio = true;
		return 0;

X
Xie XiuQi 已提交
644 645 646
	return 0;
}

647 648 649 650
return -EINVAL;

}

X
Xie XiuQi 已提交
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686
/*
 * fs_context ->free() handler: let kernfs release its part of the context,
 * then free the resctrl_fs_context allocated in resctrl_init_fs_context().
 */
static void resctrl_fs_context_free(struct fs_context *fc)
{
	struct resctrl_fs_context *ctx = resctrl_fc2context(fc);

	kernfs_free_fs_context(fc);
	kfree(ctx);
}

/* fs_context callbacks installed by resctrl_init_fs_context() */
static const struct fs_context_operations resctrl_fs_context_ops = {
	.free           = resctrl_fs_context_free,
	.parse_param    = resctrl_parse_param,
	.get_tree       = resctrl_get_tree,
};

static int resctrl_init_fs_context(struct fs_context *fc)
{
	struct resctrl_fs_context *ctx;

	ctx = kzalloc(sizeof(struct resctrl_fs_context), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;

	ctx->kfc.root = resctrl_root;
	ctx->kfc.magic = RDTGROUP_SUPER_MAGIC;
	fc->fs_private = &ctx->kfc;
	fc->ops = &resctrl_fs_context_ops;
	if (fc->user_ns)
		put_user_ns(fc->user_ns);
	fc->user_ns = get_user_ns(&init_user_ns);
	fc->global = true;
	return 0;
}

static struct file_system_type resctrl_fs_type = {
	.name                   = "resctrl",
	.init_fs_context        = resctrl_init_fs_context,
687
	.parameters             = resctrl_fs_parameters,
X
Xie XiuQi 已提交
688 689 690
	.kill_sb                = resctrl_kill_sb,
};

691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730
/*
 * Allocate an rmid for @rdtgrp and record it, together with the reqpartid
 * it maps to, in the group.
 *
 * For a parent other than the default group the allocation is first tried
 * with the parent's reqpartid, then with the reqpartid of each sibling on
 * the parent's child list. If none of that yields an rmid (or the parent is
 * the default group), rmid_alloc(-1) is used.
 *
 * Returns the allocated rmid (>= 0) on success. On failure nothing stays
 * allocated and the negative value from rmid_alloc(), or the error from
 * mpam_rmid_to_partid_pmg(), is returned.
 */
static int find_rdtgrp_allocable_rmid(struct resctrl_group *rdtgrp)
{
	struct resctrl_group *prgrp = rdtgrp->mon.parent;
	struct resctrl_group *entry;
	int ret, rmid = -1, reqpartid;

	if (prgrp != &resctrl_group_default) {
		rmid = rmid_alloc(prgrp->closid.reqpartid);
		if (rmid < 0) {
			list_for_each_entry(entry, &prgrp->mon.crdtgrp_list,
					    mon.crdtgrp_list) {
				if (entry == rdtgrp)
					continue;
				rmid = rmid_alloc(entry->closid.reqpartid);
				if (rmid >= 0)
					break;
			}
		}
	}

	if (rmid < 0)
		rmid = rmid_alloc(-1);

	/* Never hand an invalid rmid to the translation below. */
	if (rmid < 0)
		return rmid;

	ret = mpam_rmid_to_partid_pmg(rmid, &reqpartid, NULL);
	if (ret) {
		/* The rmid is not recorded anywhere yet, give it back. */
		rmid_free(rmid);
		return ret;
	}
	rdtgrp->mon.rmid = rmid;
	rdtgrp->closid.reqpartid = reqpartid;

	return rmid;
}

X
Xie XiuQi 已提交
731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758
/*
 * Common part of creating a control or monitor group directory.
 *
 * @parent_kn: directory in which the new directory is created
 * @prgrp_kn:  node of the parent group; locked via
 *             resctrl_group_kn_lock_live()
 * @name:      name of the new directory
 * @mode:      mode of the new directory
 * @rtype:     RDTCTRL_GROUP or RDTMON_GROUP
 * @r:         receives the new group, on success only
 *
 * Allocates the group, its closid (control groups only) and rmid, creates
 * the kernfs directory with its files and, when monitoring is available,
 * its mon_data directory.
 *
 * Returns 0 on success; @prgrp_kn is then still locked and the caller must
 * unlock it. On failure everything allocated here is released, @prgrp_kn is
 * unlocked, *@r is not written and a negative errno is returned.
 */
static int mkdir_resctrl_prepare(struct kernfs_node *parent_kn,
			     struct kernfs_node *prgrp_kn,
			     const char *name, umode_t mode,
			     enum rdt_group_type rtype, struct resctrl_group **r)
{
	struct resctrl_group *prdtgrp, *rdtgrp;
	struct kernfs_node *kn;
	uint files = 0;
	int ret;

	prdtgrp = resctrl_group_kn_lock_live(prgrp_kn);
	rdt_last_cmd_clear();
	if (!prdtgrp) {
		ret = -ENODEV;
		rdt_last_cmd_puts("directory was removed\n");
		goto out_unlock;
	}

	/* allocate the resctrl_group. */
	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
	if (!rdtgrp) {
		ret = -ENOSPC;
		rdt_last_cmd_puts("kernel out of memory\n");
		goto out_unlock;
	}
	rdtgrp->mon.parent = prdtgrp;
	rdtgrp->type = rtype;

	/*
	 * for ctrlmon group, intpartid is used for
	 * applying configuration, reqpartid is
	 * used for following this configuration and
	 * getting monitoring for child mon groups.
	 */
	if (rdtgrp->type == RDTCTRL_GROUP) {
		ret = closid_alloc();
		if (ret < 0) {
			rdt_last_cmd_puts("out of CLOSIDs\n");
			goto out_free_rdtgrp;
		}
		rdtgrp->closid.intpartid = ret;
	}

	ret = find_rdtgrp_allocable_rmid(rdtgrp);
	if (ret < 0) {
		rdt_last_cmd_puts("out of RMIDs\n");
		goto out_free_closid;
	}
	rdtgrp->mon.rmid = ret;

	INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);

	/* kernfs creates the directory for rdtgrp */
	kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
	if (IS_ERR(kn)) {
		ret = PTR_ERR(kn);
		rdt_last_cmd_puts("kernfs create error\n");
		goto out_free_rmid;
	}
	rdtgrp->kn = kn;

	/*
	 * Take one extra reference on "kn" so that it stays around for as
	 * long as rdtgrp points to it. On success it is dropped in
	 * resctrl_group_kn_unlock() when the deleted group is freed; the
	 * error paths below drop it themselves.
	 */
	kernfs_get(kn);

	ret = resctrl_group_kn_set_ugid(kn);
	if (ret) {
		rdt_last_cmd_puts("kernfs perm error\n");
		goto out_destroy;
	}

	files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
	ret = resctrl_group_add_files(kn, files);
	if (ret) {
		rdt_last_cmd_puts("kernfs fill error\n");
		goto out_destroy;
	}

	if (resctrl_mon_capable) {
		ret = mkdir_mondata_all_prepare(rdtgrp);
		if (ret < 0)
			goto out_destroy;

		ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
		if (ret) {
			rdt_last_cmd_puts("kernfs subdir error\n");
			goto out_prepare_clean;
		}
	}

	kernfs_activate(kn);

	/* Publish the group only once it is fully set up. */
	*r = rdtgrp;

	/*
	 * The caller unlocks the prgrp_kn upon success.
	 */
	return 0;

out_prepare_clean:
	/*
	 * This helper already gives back the closid and the rmid, so the
	 * id-freeing labels below are skipped to avoid freeing them twice.
	 */
	mkdir_mondata_all_prepare_clean(rdtgrp);
	kernfs_put(rdtgrp->kn);
	kernfs_remove(rdtgrp->kn);
	goto out_free_rdtgrp;
out_destroy:
	/* Drop the extra reference taken above, then remove the node. */
	kernfs_put(rdtgrp->kn);
	kernfs_remove(rdtgrp->kn);
out_free_rmid:
	rmid_free(rdtgrp->mon.rmid);
out_free_closid:
	if (rdtgrp->type == RDTCTRL_GROUP)
		closid_free(rdtgrp->closid.intpartid);
out_free_rdtgrp:
	/* Freed last: the labels above still read fields of rdtgrp. */
	kfree(rdtgrp);
out_unlock:
	resctrl_group_kn_unlock(prgrp_kn);
	return ret;
}

static void mkdir_resctrl_prepare_clean(struct resctrl_group *rgrp)
{
	kernfs_remove(rgrp->kn);
	kfree(rgrp);
}

/*
 * Create a monitor group under "mon_groups" directory of a control
 * and monitor group(ctrl_mon). This is a resource group
 * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
 *
 * Returns the error from mkdir_resctrl_prepare() if that fails, otherwise
 * the result of resctrl_update_groups_config() for the parent group.
 */
static int resctrl_group_mkdir_mon(struct kernfs_node *parent_kn,
			      struct kernfs_node *prgrp_kn,
			      const char *name,
			      umode_t mode)
{
	struct resctrl_group *rdtgrp, *prgrp;
	int ret;

	ret = mkdir_resctrl_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
				&rdtgrp);
	if (ret)
		return ret;

	prgrp = rdtgrp->mon.parent;
	/*
	 * Add the rdtgrp to the list of rdtgrps the parent
	 * ctrl_mon group has to track.
	 */
	list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);

	/*
	 * update all mon group's configuration under this parent group
	 * for master-slave model.
	 */
	ret = resctrl_update_groups_config(prgrp);

	resctrl_group_kn_unlock(prgrp_kn);
	return ret;
}

/*
 * These are resctrl_groups created under the root directory. Can be used
 * to allocate and monitor resources.
 *
 * After the common preparation the group's allocation is initialised and
 * the group is added to resctrl_all_groups. When monitoring is available an
 * empty "mon_groups" directory is created inside it.
 *
 * Returns 0 on success or a negative errno. On failure after the
 * preparation step the group is unlinked and cleaned up again.
 */
static int resctrl_group_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
				   struct kernfs_node *prgrp_kn,
				   const char *name, umode_t mode)
{
	struct resctrl_group *rdtgrp;
	struct kernfs_node *kn;
	int ret;

	ret = mkdir_resctrl_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
				&rdtgrp);
	if (ret)
		return ret;

	kn = rdtgrp->kn;

	ret = resctrl_group_init_alloc(rdtgrp);
	if (ret < 0)
		goto out_common_fail;

	list_add(&rdtgrp->resctrl_group_list, &resctrl_all_groups);

	if (resctrl_mon_capable) {
		/*
		 * Create an empty mon_groups directory to hold the subset
		 * of tasks and cpus to monitor.
		 */
		ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
		if (ret) {
			rdt_last_cmd_puts("kernfs subdir error\n");
			goto out_list_del;
		}
	}

	goto out_unlock;

out_list_del:
	list_del(&rdtgrp->resctrl_group_list);
out_common_fail:
	mkdir_resctrl_prepare_clean(rdtgrp);
out_unlock:
	resctrl_group_kn_unlock(prgrp_kn);
	return ret;
}

/*
 * We allow creating mon groups only with in a directory called "mon_groups"
 * which is present in every ctrl_mon group. Check if this is a valid
 * "mon_groups" directory.
 *
 * 1. The directory should be named "mon_groups".
 * 2. The mon group itself should "not" be named "mon_groups".
 *   This makes sure "mon_groups" directory always has a ctrl_mon group
 *   as parent.
 */
static bool is_mon_groups(struct kernfs_node *kn, const char *name)
{
	return (!strcmp(kn->name, "mon_groups") &&
		strcmp(name, "mon_groups"));
}

/*
 * kernfs ->mkdir() handler.
 *
 * Returns -EINVAL for a name containing '\n', -EPERM when @parent_kn is not
 * a place where groups may be created, otherwise the result of creating the
 * ctrl_mon or mon group.
 */
static int resctrl_group_mkdir(struct kernfs_node *parent_kn, const char *name,
			  umode_t mode)
{
	bool in_root = (parent_kn == resctrl_group_default.kn);

	/* Do not accept '\n' to avoid unparsable situation. */
	if (strchr(name, '\n') != NULL)
		return -EINVAL;

	/*
	 * Directly below root, with RDT allocation supported: create a
	 * control and monitoring group.
	 */
	if (resctrl_alloc_capable && in_root)
		return resctrl_group_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);

	/*
	 * Inside a valid "mon_groups" directory, with RDT monitoring
	 * supported: create a monitoring group below the owning group.
	 */
	if (resctrl_mon_capable && is_mon_groups(parent_kn, name))
		return resctrl_group_mkdir_mon(parent_kn, parent_kn->parent, name, mode);

	return -EPERM;
}

/*
 * Detach monitor group @rdtgrp from its parent: move its tasks and CPUs
 * back to the parent, flag the group RDT_DELETED, free its rmid and unlink
 * it from the parent's child list.
 *
 * @tmpmask is scratch space; it collects the CPUs that need a
 * update_closid_rmid() call.
 */
static void resctrl_group_rm_mon(struct resctrl_group *rdtgrp,
			      cpumask_var_t tmpmask)
{
	struct resctrl_group *prdtgrp = rdtgrp->mon.parent;
	int cpu;

	/* Give any tasks back to the parent group */
	resctrl_move_group_tasks(rdtgrp, prdtgrp, tmpmask);

	/* Update per cpu closid and rmid of the moved CPUs first */
	for_each_cpu(cpu, &rdtgrp->cpu_mask) {
		per_cpu(pqr_state.default_closid, cpu) = resctrl_navie_closid(prdtgrp->closid);
		per_cpu(pqr_state.default_rmid, cpu) = resctrl_navie_rmid(prdtgrp->mon.rmid);
	}

	/*
	 * Update the MSR on moved CPUs and CPUs which have moved
	 * task running on them.
	 */
	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
	update_closid_rmid(tmpmask, NULL);

	rdtgrp->flags |= RDT_DELETED;

	rmid_free(rdtgrp->mon.rmid);

	/*
	 * Remove the rdtgrp from the parent ctrl_mon group's list
	 */
	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
	list_del(&rdtgrp->mon.crdtgrp_list);
}

/*
 * Remove monitor group @rdtgrp: detach it with resctrl_group_rm_mon(), then
 * remove its kernfs directory. Always returns 0.
 */
static int resctrl_group_rmdir_mon(struct kernfs_node *kn, struct resctrl_group *rdtgrp,
			      cpumask_var_t tmpmask)
{
	resctrl_group_rm_mon(rdtgrp, tmpmask);

	/*
	 * one extra hold on this, will drop when we kfree(rdtgrp)
	 * in resctrl_group_kn_unlock()
	 */
	kernfs_get(kn);
	kernfs_remove(rdtgrp->kn);

	return 0;
}

/*
 * Detach control group @rdtgrp: move its tasks and CPUs back to the default
 * group, flag it RDT_DELETED, free its closid and rmid, release all of its
 * child monitor groups and unlink it from resctrl_all_groups.
 *
 * @tmpmask is scratch space; it collects the CPUs that need a
 * update_closid_rmid() call.
 */
static void resctrl_group_rm_ctrl(struct resctrl_group *rdtgrp, cpumask_var_t tmpmask)
{
	int cpu;

	/* Give any tasks back to the default group */
	resctrl_move_group_tasks(rdtgrp, &resctrl_group_default, tmpmask);

	/* Give any CPUs back to the default group */
	cpumask_or(&resctrl_group_default.cpu_mask,
		   &resctrl_group_default.cpu_mask, &rdtgrp->cpu_mask);

	/* Update per cpu closid and rmid of the moved CPUs first */
	for_each_cpu(cpu, &rdtgrp->cpu_mask) {
		per_cpu(pqr_state.default_closid, cpu) =
			resctrl_navie_closid(resctrl_group_default.closid);
		per_cpu(pqr_state.default_rmid, cpu) =
			resctrl_navie_rmid(resctrl_group_default.mon.rmid);
	}

	/*
	 * Update the MSR on moved CPUs and CPUs which have moved
	 * task running on them.
	 */
	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
	update_closid_rmid(tmpmask, NULL);

	rdtgrp->flags |= RDT_DELETED;
	closid_free(rdtgrp->closid.intpartid);
	rmid_free(rdtgrp->mon.rmid);

	/*
	 * Free all the child monitor group rmids.
	 */
	free_all_child_rdtgrp(rdtgrp);

	list_del(&rdtgrp->resctrl_group_list);
}

/*
 * Remove control group @rdtgrp: detach it with resctrl_group_rm_ctrl(), then
 * remove its kernfs directory. Always returns 0.
 */
static int resctrl_group_rmdir_ctrl(struct kernfs_node *kn, struct resctrl_group *rdtgrp,
			       cpumask_var_t tmpmask)
{
	resctrl_group_rm_ctrl(rdtgrp, tmpmask);

	/*
	 * one extra hold on this, will drop when we kfree(rdtgrp)
	 * in resctrl_group_kn_unlock()
	 */
	kernfs_get(kn);
	kernfs_remove(rdtgrp->kn);

	return 0;
}

/*
 * kernfs ->rmdir() handler.
 *
 * Returns -ENOMEM when the scratch cpumask cannot be allocated and -EPERM
 * when @kn has no live group or is not a removable group directory.
 * Otherwise the result of removing the ctrl_mon or mon group is returned.
 */
static int resctrl_group_rmdir(struct kernfs_node *kn)
{
	struct kernfs_node *parent_kn = kn->parent;
	struct resctrl_group *grp;
	cpumask_var_t scratch;
	int ret = -EPERM;

	if (!zalloc_cpumask_var(&scratch, GFP_KERNEL))
		return -ENOMEM;

	grp = resctrl_group_kn_lock_live(kn);
	if (grp) {
		/*
		 * A ctrl_mon group can only be removed from the root
		 * directory, a mon group only from a valid "mon_groups"
		 * directory. Anything else keeps the -EPERM default.
		 */
		if (grp->type == RDTCTRL_GROUP &&
		    parent_kn == resctrl_group_default.kn)
			ret = resctrl_group_rmdir_ctrl(kn, grp, scratch);
		else if (grp->type == RDTMON_GROUP &&
			 is_mon_groups(parent_kn, kn->name))
			ret = resctrl_group_rmdir_mon(kn, grp, scratch);
	}

	/* Unlock also when no live group was returned. */
	resctrl_group_kn_unlock(kn);
	free_cpumask_var(scratch);

	return ret;
}

/*
 * kernfs ->show_options() handler: forwards to
 * __resctrl_group_show_options(); @kf is not used.
 */
static int resctrl_group_show_options(struct seq_file *seq, struct kernfs_root *kf)
{
	return __resctrl_group_show_options(seq);
}

/* kernfs syscall callbacks of the resctrl root, see resctrl_group_setup_root() */
static struct kernfs_syscall_ops resctrl_group_kf_syscall_ops = {
	.mkdir		= resctrl_group_mkdir,
	.rmdir		= resctrl_group_rmdir,
	.show_options	= resctrl_group_show_options,
};

/*
 * Initialise @r as the default group: a control group whose intpartid,
 * reqpartid and rmid are all 0.
 */
static void resctrl_group_default_init(struct resctrl_group *r)
{
	r->closid.intpartid = 0;
	r->closid.reqpartid = 0;
	r->mon.rmid = 0;
	r->type = RDTCTRL_GROUP;
}

/*
 * Create the kernfs root for resctrl, initialise the default group, put it
 * on resctrl_all_groups and populate the root directory with the
 * RF_CTRL_BASE files.
 *
 * Returns 0 on success or a negative errno. If adding the files fails, the
 * freshly created root is destroyed again.
 */
static int __init resctrl_group_setup_root(void)
{
	int err;

	resctrl_root = kernfs_create_root(&resctrl_group_kf_syscall_ops,
				      KERNFS_ROOT_CREATE_DEACTIVATED,
				      &resctrl_group_default);
	if (IS_ERR(resctrl_root))
		return PTR_ERR(resctrl_root);

	mutex_lock(&resctrl_group_mutex);

	resctrl_group_default_init(&resctrl_group_default);
	INIT_LIST_HEAD(&resctrl_group_default.mon.crdtgrp_list);

	list_add(&resctrl_group_default.resctrl_group_list, &resctrl_all_groups);

	err = resctrl_group_add_files(resctrl_root->kn, RF_CTRL_BASE);
	if (err) {
		kernfs_destroy_root(resctrl_root);
	} else {
		/* The root was created deactivated; make it visible now. */
		resctrl_group_default.kn = resctrl_root->kn;
		kernfs_activate(resctrl_group_default.kn);
	}

	mutex_unlock(&resctrl_group_mutex);

	return err;
}

/*
 * resctrl_group_init - resctrl_group initialization
 *
 * Sets up the resctrl root with its files, creates the "resctrl" mount
 * point under fs_kobj and registers the resctrl filesystem type. When a
 * later step fails, the earlier ones are undone.
 *
 * Return: 0 on success or -errno
 */
int __init resctrl_group_init(void)
{
	int ret;

	ret = resctrl_group_setup_root();
	if (ret)
		return ret;

	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
	if (!ret) {
		ret = register_filesystem(&resctrl_fs_type);
		if (!ret)
			return 0;

		/* Registration failed: take the mount point away again. */
		sysfs_remove_mount_point(fs_kobj, "resctrl");
	}

	kernfs_destroy_root(resctrl_root);

	return ret;
}