OpenHarmony / kernel_linux
Commit 936c663a
Authored on Mar 27, 2015 by Ingo Molnar

Merge branch 'perf/x86' into perf/core, because it's ready

Signed-off-by: Ingo Molnar <mingo@kernel.org>

Parents: 072e5a1c, 50f16a8b

Showing 11 changed files with 1514 additions and 63 deletions (+1514 −63)
Changed files:

arch/arm/kernel/hw_breakpoint.c                 +1     -1
arch/arm64/kernel/hw_breakpoint.c               +1     -1
arch/x86/include/asm/cpufeature.h               +8     -1
arch/x86/include/asm/processor.h                +3     -0
arch/x86/kernel/cpu/Makefile                    +1     -1
arch/x86/kernel/cpu/common.c                    +39    -0
arch/x86/kernel/cpu/perf_event_intel_cqm.c      +1379  -0
include/linux/perf_event.h                      +48    -2
kernel/events/core.c                            +25    -48
kernel/events/hw_breakpoint.c                   +4     -4
kernel/trace/trace_uprobe.c                     +5     -5
arch/arm/kernel/hw_breakpoint.c

@@ -648,7 +648,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 	 * Per-cpu breakpoints are not supported by our stepping
 	 * mechanism.
 	 */
-	if (!bp->hw.bp_target)
+	if (!bp->hw.target)
 		return -EINVAL;

 	/*
arch/arm64/kernel/hw_breakpoint.c

@@ -527,7 +527,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 	 * Disallow per-task kernel breakpoints since these would
 	 * complicate the stepping code.
 	 */
-	if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target)
+	if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target)
 		return -EINVAL;

 	return 0;
arch/x86/include/asm/cpufeature.h

@@ -12,7 +12,7 @@
 #include <asm/disabled-features.h>
 #endif

-#define NCAPINTS	11	/* N 32-bit words worth of info */
+#define NCAPINTS	13	/* N 32-bit words worth of info */
 #define NBUGINTS	1	/* N 32-bit bug flags */

 /*
@@ -226,6 +226,7 @@
 #define X86_FEATURE_ERMS	( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 #define X86_FEATURE_INVPCID	( 9*32+10) /* Invalidate Processor Context ID */
 #define X86_FEATURE_RTM		( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM		( 9*32+12) /* Cache QoS Monitoring */
 #define X86_FEATURE_MPX		( 9*32+14) /* Memory Protection Extension */
 #define X86_FEATURE_AVX512F	( 9*32+16) /* AVX-512 Foundation */
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
@@ -242,6 +243,12 @@
 #define X86_FEATURE_XGETBV1	(10*32+ 2) /* XGETBV with ECX = 1 */
 #define X86_FEATURE_XSAVES	(10*32+ 3) /* XSAVES/XRSTORS */

+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
+#define X86_FEATURE_CQM_LLC	(11*32+ 1) /* LLC QoS if 1 */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+
 /*
  * BUG word(s)
  */
arch/x86/include/asm/processor.h

@@ -109,6 +109,9 @@ struct cpuinfo_x86 {
 	/* in KB - valid for CPUS which support this call: */
 	int			x86_cache_size;
 	int			x86_cache_alignment;	/* In bytes */
+	/* Cache QoS architectural values: */
+	int			x86_cache_max_rmid;	/* max index */
+	int			x86_cache_occ_scale;	/* scale to bytes */
 	int			x86_power;
 	unsigned long		loops_per_jiffy;
 	/* cpuid returned max cores value: */
arch/x86/kernel/cpu/Makefile

@@ -39,7 +39,7 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o perf_event_intel_cqm.o

 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
arch/x86/kernel/cpu/common.c

@@ -646,6 +646,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[10] = eax;
 	}

+	/* Additional Intel-defined flags: level 0x0000000F */
+	if (c->cpuid_level >= 0x0000000F) {
+		u32 eax, ebx, ecx, edx;
+
+		/* QoS sub-leaf, EAX=0Fh, ECX=0 */
+		cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+		c->x86_capability[11] = edx;
+		if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+			/* will be overridden if occupancy monitoring exists */
+			c->x86_cache_max_rmid = ebx;
+
+			/* QoS sub-leaf, EAX=0Fh, ECX=1 */
+			cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+			c->x86_capability[12] = edx;
+			if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+				c->x86_cache_max_rmid = ecx;
+				c->x86_cache_occ_scale = ebx;
+			}
+		} else {
+			c->x86_cache_max_rmid = -1;
+			c->x86_cache_occ_scale = -1;
+		}
+	}
+
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
@@ -834,6 +858,20 @@ static void generic_identify(struct cpuinfo_x86 *c)
 	detect_nopl(c);
 }

+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+	/*
+	 * The heavy lifting of max_rmid and cache_occ_scale are handled
+	 * in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu
+	 * in case CQM bits really aren't there in this CPU.
+	 */
+	if (c != &boot_cpu_data) {
+		boot_cpu_data.x86_cache_max_rmid =
+			min(boot_cpu_data.x86_cache_max_rmid,
+			    c->x86_cache_max_rmid);
+	}
+}
+
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
@@ -923,6 +961,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	init_hypervisor(c);
 	x86_init_rdrand(c);
+	x86_init_cache_qos(c);

 	/*
 	 * Clear/Set all flags overriden by options, need do it
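For reference, the same CPUID leaf 0xF enumeration that get_cpu_cap() performs above can be reproduced from user space. The following stand-alone sketch is not part of this commit; it assumes a GCC/Clang toolchain providing <cpuid.h> and simply prints the values the kernel caches in x86_cache_max_rmid and x86_cache_occ_scale.

/* User-space sketch (not from this commit): read CPUID leaf 0xF,
 * sub-leaves 0 and 1, the same data get_cpu_cap() caches above.
 * Assumes GCC/Clang <cpuid.h>. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Sub-leaf 0: EBX = max RMID (all resources), EDX bit 1 = L3 monitoring */
	__cpuid_count(0x0000000F, 0, eax, ebx, ecx, edx);
	printf("L3 monitoring: %s, max RMID (all resources): %u\n",
	       (edx & (1 << 1)) ? "yes" : "no", ebx);

	/* Sub-leaf 1: ECX = max RMID for L3, EBX = upscale factor in bytes,
	 * EDX bit 0 = LLC occupancy monitoring */
	__cpuid_count(0x0000000F, 1, eax, ebx, ecx, edx);
	printf("LLC occupancy: %s, L3 max RMID: %u, scale: %u bytes\n",
	       (edx & (1 << 0)) ? "yes" : "no", ecx, ebx);

	return 0;
}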
arch/x86/kernel/cpu/perf_event_intel_cqm.c (new file, mode 100644, +1379)

/*
 * Intel Cache Quality-of-Service Monitoring (CQM) support.
 *
 * Based very, very heavily on work by Peter Zijlstra.
 */

#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
#include "perf_event.h"

#define MSR_IA32_PQR_ASSOC	0x0c8f
#define MSR_IA32_QM_CTR		0x0c8e
#define MSR_IA32_QM_EVTSEL	0x0c8d

static unsigned int cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */

struct intel_cqm_state {
	raw_spinlock_t		lock;
	int			rmid;
	int			cnt;
};

static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);

/*
 * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
 * Also protects event->hw.cqm_rmid
 *
 * Hold either for stability, both for modification of ->hw.cqm_rmid.
 */
static DEFINE_MUTEX(cache_mutex);
static DEFINE_RAW_SPINLOCK(cache_lock);

/*
 * Groups of events that have the same target(s), one RMID per group.
 */
static LIST_HEAD(cache_groups);

/*
 * Mask of CPUs for reading CQM values. We only need one per-socket.
 */
static cpumask_t cqm_cpumask;

#define RMID_VAL_ERROR		(1ULL << 63)
#define RMID_VAL_UNAVAIL	(1ULL << 62)

#define QOS_L3_OCCUP_EVENT_ID	(1 << 0)

#define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID

/*
 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
 *
 * This rmid is always free and is guaranteed to have an associated
 * near-zero occupancy value, i.e. no cachelines are tagged with this
 * RMID, once __intel_cqm_rmid_rotate() returns.
 */
static unsigned int intel_cqm_rotation_rmid;

#define INVALID_RMID		(-1)

/*
 * Is @rmid valid for programming the hardware?
 *
 * rmid 0 is reserved by the hardware for all non-monitored tasks, which
 * means that we should never come across an rmid with that value.
 * Likewise, an rmid value of -1 is used to indicate "no rmid currently
 * assigned" and is used as part of the rotation code.
 */
static inline bool __rmid_valid(unsigned int rmid)
{
	if (!rmid || rmid == INVALID_RMID)
		return false;

	return true;
}

static u64 __rmid_read(unsigned int rmid)
{
	u64 val;

	/*
	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
	 * it just says that to increase confusion.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
	rdmsrl(MSR_IA32_QM_CTR, val);

	/*
	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
	 * the number of cachelines tagged with @rmid.
	 */
	return val;
}

enum rmid_recycle_state {
	RMID_YOUNG = 0,
	RMID_AVAILABLE,
	RMID_DIRTY,
};

struct cqm_rmid_entry {
	unsigned int rmid;
	enum rmid_recycle_state state;
	struct list_head list;
	unsigned long queue_time;
};

/*
 * cqm_rmid_free_lru - A least recently used list of RMIDs.
 *
 * Oldest entry at the head, newest (most recently used) entry at the
 * tail. This list is never traversed, it's only used to keep track of
 * the lru order. That is, we only pick entries of the head or insert
 * them on the tail.
 *
 * All entries on the list are 'free', and their RMIDs are not currently
 * in use. To mark an RMID as in use, remove its entry from the lru
 * list.
 *
 *
 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
 *
 * This list is contains RMIDs that no one is currently using but that
 * may have a non-zero occupancy value associated with them. The
 * rotation worker moves RMIDs from the limbo list to the free list once
 * the occupancy value drops below __intel_cqm_threshold.
 *
 * Both lists are protected by cache_mutex.
 */
static LIST_HEAD(cqm_rmid_free_lru);
static LIST_HEAD(cqm_rmid_limbo_lru);

/*
 * We use a simple array of pointers so that we can lookup a struct
 * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
 * and __put_rmid() from having to worry about dealing with struct
 * cqm_rmid_entry - they just deal with rmids, i.e. integers.
 *
 * Once this array is initialized it is read-only. No locks are required
 * to access it.
 *
 * All entries for all RMIDs can be looked up in the this array at all
 * times.
 */
static struct cqm_rmid_entry **cqm_rmid_ptrs;

static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
{
	struct cqm_rmid_entry *entry;

	entry = cqm_rmid_ptrs[rmid];
	WARN_ON(entry->rmid != rmid);

	return entry;
}

/*
 * Returns < 0 on fail.
 *
 * We expect to be called with cache_mutex held.
 */
static int __get_rmid(void)
{
	struct cqm_rmid_entry *entry;

	lockdep_assert_held(&cache_mutex);

	if (list_empty(&cqm_rmid_free_lru))
		return INVALID_RMID;

	entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
	list_del(&entry->list);

	return entry->rmid;
}

static void __put_rmid(unsigned int rmid)
{
	struct cqm_rmid_entry *entry;

	lockdep_assert_held(&cache_mutex);

	WARN_ON(!__rmid_valid(rmid));
	entry = __rmid_entry(rmid);

	entry->queue_time = jiffies;
	entry->state = RMID_YOUNG;

	list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}

static int intel_cqm_setup_rmid_cache(void)
{
	struct cqm_rmid_entry *entry;
	unsigned int nr_rmids;
	int r = 0;

	nr_rmids = cqm_max_rmid + 1;
	cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
				nr_rmids, GFP_KERNEL);
	if (!cqm_rmid_ptrs)
		return -ENOMEM;

	for (; r <= cqm_max_rmid; r++) {
		struct cqm_rmid_entry *entry;

		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry)
			goto fail;

		INIT_LIST_HEAD(&entry->list);
		entry->rmid = r;
		cqm_rmid_ptrs[r] = entry;

		list_add_tail(&entry->list, &cqm_rmid_free_lru);
	}

	/*
	 * RMID 0 is special and is always allocated. It's used for all
	 * tasks that are not monitored.
	 */
	entry = __rmid_entry(0);
	list_del(&entry->list);

	mutex_lock(&cache_mutex);
	intel_cqm_rotation_rmid = __get_rmid();
	mutex_unlock(&cache_mutex);

	return 0;
fail:
	while (r--)
		kfree(cqm_rmid_ptrs[r]);

	kfree(cqm_rmid_ptrs);
	return -ENOMEM;
}

/*
 * Determine if @a and @b measure the same set of tasks.
 *
 * If @a and @b measure the same set of tasks then we want to share a
 * single RMID.
 */
static bool __match_event(struct perf_event *a, struct perf_event *b)
{
	/* Per-cpu and task events don't mix */
	if ((a->attach_state & PERF_ATTACH_TASK) !=
	    (b->attach_state & PERF_ATTACH_TASK))
		return false;

#ifdef CONFIG_CGROUP_PERF
	if (a->cgrp != b->cgrp)
		return false;
#endif

	/* If not task event, we're machine wide */
	if (!(b->attach_state & PERF_ATTACH_TASK))
		return true;

	/*
	 * Events that target same task are placed into the same cache group.
	 */
	if (a->hw.target == b->hw.target)
		return true;

	/*
	 * Are we an inherited event?
	 */
	if (b->parent == a)
		return true;

	return false;
}

#ifdef CONFIG_CGROUP_PERF
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
	if (event->attach_state & PERF_ATTACH_TASK)
		return perf_cgroup_from_task(event->hw.target);

	return event->cgrp;
}
#endif

/*
 * Determine if @a's tasks intersect with @b's tasks
 *
 * There are combinations of events that we explicitly prohibit,
 *
 *		   PROHIBITS
 *     system-wide    ->	cgroup and task
 *     cgroup	      ->	system-wide
 *		      ->	task in cgroup
 *     task	      ->	system-wide
 *		      ->	task in cgroup
 *
 * Call this function before allocating an RMID.
 */
static bool __conflict_event(struct perf_event *a, struct perf_event *b)
{
#ifdef CONFIG_CGROUP_PERF
	/*
	 * We can have any number of cgroups but only one system-wide
	 * event at a time.
	 */
	if (a->cgrp && b->cgrp) {
		struct perf_cgroup *ac = a->cgrp;
		struct perf_cgroup *bc = b->cgrp;

		/*
		 * This condition should have been caught in
		 * __match_event() and we should be sharing an RMID.
		 */
		WARN_ON_ONCE(ac == bc);

		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			return true;

		return false;
	}

	if (a->cgrp || b->cgrp) {
		struct perf_cgroup *ac, *bc;

		/*
		 * cgroup and system-wide events are mutually exclusive
		 */
		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
			return true;

		/*
		 * Ensure neither event is part of the other's cgroup
		 */
		ac = event_to_cgroup(a);
		bc = event_to_cgroup(b);
		if (ac == bc)
			return true;

		/*
		 * Must have cgroup and non-intersecting task events.
		 */
		if (!ac || !bc)
			return false;

		/*
		 * We have cgroup and task events, and the task belongs
		 * to a cgroup. Check for for overlap.
		 */
		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			return true;

		return false;
	}
#endif
	/*
	 * If one of them is not a task, same story as above with cgroups.
	 */
	if (!(a->attach_state & PERF_ATTACH_TASK) ||
	    !(b->attach_state & PERF_ATTACH_TASK))
		return true;

	/*
	 * Must be non-overlapping.
	 */
	return false;
}

struct rmid_read {
	unsigned int rmid;
	atomic64_t value;
};

static void __intel_cqm_event_count(void *info);

/*
 * Exchange the RMID of a group of events.
 */
static unsigned int
intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
{
	struct perf_event *event;
	unsigned int old_rmid = group->hw.cqm_rmid;
	struct list_head *head = &group->hw.cqm_group_entry;

	lockdep_assert_held(&cache_mutex);

	/*
	 * If our RMID is being deallocated, perform a read now.
	 */
	if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
		struct rmid_read rr = {
			.value = ATOMIC64_INIT(0),
			.rmid = old_rmid,
		};

		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
				 &rr, 1);
		local64_set(&group->count, atomic64_read(&rr.value));
	}

	raw_spin_lock_irq(&cache_lock);

	group->hw.cqm_rmid = rmid;
	list_for_each_entry(event, head, hw.cqm_group_entry)
		event->hw.cqm_rmid = rmid;

	raw_spin_unlock_irq(&cache_lock);

	return old_rmid;
}

/*
 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
 * cachelines are still tagged with RMIDs in limbo, we progressively
 * increment the threshold until we find an RMID in limbo with <=
 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
 * problem where cachelines tagged with an RMID are not steadily being
 * evicted.
 *
 * On successful rotations we decrease the threshold back towards zero.
 *
 * __intel_cqm_max_threshold provides an upper bound on the threshold,
 * and is measured in bytes because it's exposed to userland.
 */
static unsigned int __intel_cqm_threshold;
static unsigned int __intel_cqm_max_threshold;

/*
 * Test whether an RMID has a zero occupancy value on this cpu.
 */
static void intel_cqm_stable(void *arg)
{
	struct cqm_rmid_entry *entry;

	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
		if (entry->state != RMID_AVAILABLE)
			break;

		if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
			entry->state = RMID_DIRTY;
	}
}

/*
 * If we have group events waiting for an RMID that don't conflict with
 * events already running, assign @rmid.
 */
static bool intel_cqm_sched_in_event(unsigned int rmid)
{
	struct perf_event *leader, *event;

	lockdep_assert_held(&cache_mutex);

	leader = list_first_entry(&cache_groups, struct perf_event,
				  hw.cqm_groups_entry);
	event = leader;

	list_for_each_entry_continue(event, &cache_groups,
				     hw.cqm_groups_entry) {
		if (__rmid_valid(event->hw.cqm_rmid))
			continue;

		if (__conflict_event(event, leader))
			continue;

		intel_cqm_xchg_rmid(event, rmid);
		return true;
	}

	return false;
}

/*
 * Initially use this constant for both the limbo queue time and the
 * rotation timer interval, pmu::hrtimer_interval_ms.
 *
 * They don't need to be the same, but the two are related since if you
 * rotate faster than you recycle RMIDs, you may run out of available
 * RMIDs.
 */
#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */

static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;

/*
 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
 * @nr_available: number of freeable RMIDs on the limbo list
 *
 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
 * cachelines are tagged with those RMIDs. After this we can reuse them
 * and know that the current set of active RMIDs is stable.
 *
 * Return %true or %false depending on whether stabilization needs to be
 * reattempted.
 *
 * If we return %true then @nr_available is updated to indicate the
 * number of RMIDs on the limbo list that have been queued for the
 * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
 * are above __intel_cqm_threshold.
 */
static bool intel_cqm_rmid_stabilize(unsigned int *available)
{
	struct cqm_rmid_entry *entry, *tmp;

	lockdep_assert_held(&cache_mutex);

	*available = 0;
	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
		unsigned long min_queue_time;
		unsigned long now = jiffies;

		/*
		 * We hold RMIDs placed into limbo for a minimum queue
		 * time. Before the minimum queue time has elapsed we do
		 * not recycle RMIDs.
		 *
		 * The reasoning is that until a sufficient time has
		 * passed since we stopped using an RMID, any RMID
		 * placed onto the limbo list will likely still have
		 * data tagged in the cache, which means we'll probably
		 * fail to recycle it anyway.
		 *
		 * We can save ourselves an expensive IPI by skipping
		 * any RMIDs that have not been queued for the minimum
		 * time.
		 */
		min_queue_time = entry->queue_time +
			msecs_to_jiffies(__rmid_queue_time_ms);

		if (time_after(min_queue_time, now))
			break;

		entry->state = RMID_AVAILABLE;
		(*available)++;
	}

	/*
	 * Fast return if none of the RMIDs on the limbo list have been
	 * sitting on the queue for the minimum queue time.
	 */
	if (!*available)
		return false;

	/*
	 * Test whether an RMID is free for each package.
	 */
	on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);

	list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
		/*
		 * Exhausted all RMIDs that have waited min queue time.
		 */
		if (entry->state == RMID_YOUNG)
			break;

		if (entry->state == RMID_DIRTY)
			continue;

		list_del(&entry->list);	/* remove from limbo */

		/*
		 * The rotation RMID gets priority if it's
		 * currently invalid. In which case, skip adding
		 * the RMID to the the free lru.
		 */
		if (!__rmid_valid(intel_cqm_rotation_rmid)) {
			intel_cqm_rotation_rmid = entry->rmid;
			continue;
		}

		/*
		 * If we have groups waiting for RMIDs, hand
		 * them one now provided they don't conflict.
		 */
		if (intel_cqm_sched_in_event(entry->rmid))
			continue;

		/*
		 * Otherwise place it onto the free list.
		 */
		list_add_tail(&entry->list, &cqm_rmid_free_lru);
	}

	return __rmid_valid(intel_cqm_rotation_rmid);
}

/*
 * Pick a victim group and move it to the tail of the group list.
 * @next: The first group without an RMID
 */
static void __intel_cqm_pick_and_rotate(struct perf_event *next)
{
	struct perf_event *rotor;
	unsigned int rmid;

	lockdep_assert_held(&cache_mutex);

	rotor = list_first_entry(&cache_groups, struct perf_event,
				 hw.cqm_groups_entry);

	/*
	 * The group at the front of the list should always have a valid
	 * RMID. If it doesn't then no groups have RMIDs assigned and we
	 * don't need to rotate the list.
	 */
	if (next == rotor)
		return;

	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
	__put_rmid(rmid);

	list_rotate_left(&cache_groups);
}

/*
 * Deallocate the RMIDs from any events that conflict with @event, and
 * place them on the back of the group list.
 */
static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
{
	struct perf_event *group, *g;
	unsigned int rmid;

	lockdep_assert_held(&cache_mutex);

	list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
		if (group == event)
			continue;

		rmid = group->hw.cqm_rmid;

		/*
		 * Skip events that don't have a valid RMID.
		 */
		if (!__rmid_valid(rmid))
			continue;

		/*
		 * No conflict? No problem! Leave the event alone.
		 */
		if (!__conflict_event(group, event))
			continue;

		intel_cqm_xchg_rmid(group, INVALID_RMID);
		__put_rmid(rmid);
	}
}

/*
 * Attempt to rotate the groups and assign new RMIDs.
 *
 * We rotate for two reasons,
 *   1. To handle the scheduling of conflicting events
 *   2. To recycle RMIDs
 *
 * Rotating RMIDs is complicated because the hardware doesn't give us
 * any clues.
 *
 * There's problems with the hardware interface; when you change the
 * task:RMID map cachelines retain their 'old' tags, giving a skewed
 * picture. In order to work around this, we must always keep one free
 * RMID - intel_cqm_rotation_rmid.
 *
 * Rotation works by taking away an RMID from a group (the old RMID),
 * and assigning the free RMID to another group (the new RMID). We must
 * then wait for the old RMID to not be used (no cachelines tagged).
 * This ensure that all cachelines are tagged with 'active' RMIDs. At
 * this point we can start reading values for the new RMID and treat the
 * old RMID as the free RMID for the next rotation.
 *
 * Return %true or %false depending on whether we did any rotating.
 */
static bool __intel_cqm_rmid_rotate(void)
{
	struct perf_event *group, *start = NULL;
	unsigned int threshold_limit;
	unsigned int nr_needed = 0;
	unsigned int nr_available;
	bool rotated = false;

	mutex_lock(&cache_mutex);

again:
	/*
	 * Fast path through this function if there are no groups and no
	 * RMIDs that need cleaning.
	 */
	if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
		goto out;

	list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
		if (!__rmid_valid(group->hw.cqm_rmid)) {
			if (!start)
				start = group;
			nr_needed++;
		}
	}

	/*
	 * We have some event groups, but they all have RMIDs assigned
	 * and no RMIDs need cleaning.
	 */
	if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
		goto out;

	if (!nr_needed)
		goto stabilize;

	/*
	 * We have more event groups without RMIDs than available RMIDs,
	 * or we have event groups that conflict with the ones currently
	 * scheduled.
	 *
	 * We force deallocate the rmid of the group at the head of
	 * cache_groups. The first event group without an RMID then gets
	 * assigned intel_cqm_rotation_rmid. This ensures we always make
	 * forward progress.
	 *
	 * Rotate the cache_groups list so the previous head is now the
	 * tail.
	 */
	__intel_cqm_pick_and_rotate(start);

	/*
	 * If the rotation is going to succeed, reduce the threshold so
	 * that we don't needlessly reuse dirty RMIDs.
	 */
	if (__rmid_valid(intel_cqm_rotation_rmid)) {
		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
		intel_cqm_rotation_rmid = __get_rmid();

		intel_cqm_sched_out_conflicting_events(start);

		if (__intel_cqm_threshold)
			__intel_cqm_threshold--;
	}

	rotated = true;

stabilize:
	/*
	 * We now need to stablize the RMID we freed above (if any) to
	 * ensure that the next time we rotate we have an RMID with zero
	 * occupancy value.
	 *
	 * Alternatively, if we didn't need to perform any rotation,
	 * we'll have a bunch of RMIDs in limbo that need stabilizing.
	 */
	threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;

	while (intel_cqm_rmid_stabilize(&nr_available) &&
	       __intel_cqm_threshold < threshold_limit) {
		unsigned int steal_limit;

		/*
		 * Don't spin if nobody is actively waiting for an RMID,
		 * the rotation worker will be kicked as soon as an
		 * event needs an RMID anyway.
		 */
		if (!nr_needed)
			break;

		/* Allow max 25% of RMIDs to be in limbo. */
		steal_limit = (cqm_max_rmid + 1) / 4;

		/*
		 * We failed to stabilize any RMIDs so our rotation
		 * logic is now stuck. In order to make forward progress
		 * we have a few options:
		 *
		 *   1. rotate ("steal") another RMID
		 *   2. increase the threshold
		 *   3. do nothing
		 *
		 * We do both of 1. and 2. until we hit the steal limit.
		 *
		 * The steal limit prevents all RMIDs ending up on the
		 * limbo list. This can happen if every RMID has a
		 * non-zero occupancy above threshold_limit, and the
		 * occupancy values aren't dropping fast enough.
		 *
		 * Note that there is prioritisation at work here - we'd
		 * rather increase the number of RMIDs on the limbo list
		 * than increase the threshold, because increasing the
		 * threshold skews the event data (because we reuse
		 * dirty RMIDs) - threshold bumps are a last resort.
		 */
		if (nr_available < steal_limit)
			goto again;

		__intel_cqm_threshold++;
	}

out:
	mutex_unlock(&cache_mutex);
	return rotated;
}

static void intel_cqm_rmid_rotate(struct work_struct *work);

static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);

static struct pmu intel_cqm_pmu;

static void intel_cqm_rmid_rotate(struct work_struct *work)
{
	unsigned long delay;

	__intel_cqm_rmid_rotate();

	delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
	schedule_delayed_work(&intel_cqm_rmid_work, delay);
}

/*
 * Find a group and setup RMID.
 *
 * If we're part of a group, we use the group's RMID.
 */
static void intel_cqm_setup_event(struct perf_event *event,
				  struct perf_event **group)
{
	struct perf_event *iter;
	unsigned int rmid;
	bool conflict = false;

	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
		rmid = iter->hw.cqm_rmid;

		if (__match_event(iter, event)) {
			/* All tasks in a group share an RMID */
			event->hw.cqm_rmid = rmid;
			*group = iter;
			return;
		}

		/*
		 * We only care about conflicts for events that are
		 * actually scheduled in (and hence have a valid RMID).
		 */
		if (__conflict_event(iter, event) && __rmid_valid(rmid))
			conflict = true;
	}

	if (conflict)
		rmid = INVALID_RMID;
	else
		rmid = __get_rmid();

	event->hw.cqm_rmid = rmid;
}

static void intel_cqm_event_read(struct perf_event *event)
{
	unsigned long flags;
	unsigned int rmid;
	u64 val;

	/*
	 * Task events are handled by intel_cqm_event_count().
	 */
	if (event->cpu == -1)
		return;

	raw_spin_lock_irqsave(&cache_lock, flags);
	rmid = event->hw.cqm_rmid;

	if (!__rmid_valid(rmid))
		goto out;

	val = __rmid_read(rmid);

	/*
	 * Ignore this reading on error states and do not update the value.
	 */
	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		goto out;

	local64_set(&event->count, val);
out:
	raw_spin_unlock_irqrestore(&cache_lock, flags);
}

static void __intel_cqm_event_count(void *info)
{
	struct rmid_read *rr = info;
	u64 val;

	val = __rmid_read(rr->rmid);

	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return;

	atomic64_add(val, &rr->value);
}

static inline bool cqm_group_leader(struct perf_event *event)
{
	return !list_empty(&event->hw.cqm_groups_entry);
}

static u64 intel_cqm_event_count(struct perf_event *event)
{
	unsigned long flags;
	struct rmid_read rr = {
		.value = ATOMIC64_INIT(0),
	};

	/*
	 * We only need to worry about task events. System-wide events
	 * are handled like usual, i.e. entirely with
	 * intel_cqm_event_read().
	 */
	if (event->cpu != -1)
		return __perf_event_count(event);

	/*
	 * Only the group leader gets to report values. This stops us
	 * reporting duplicate values to userspace, and gives us a clear
	 * rule for which task gets to report the values.
	 *
	 * Note that it is impossible to attribute these values to
	 * specific packages - we forfeit that ability when we create
	 * task events.
	 */
	if (!cqm_group_leader(event))
		return 0;

	/*
	 * Notice that we don't perform the reading of an RMID
	 * atomically, because we can't hold a spin lock across the
	 * IPIs.
	 *
	 * Speculatively perform the read, since @event might be
	 * assigned a different (possibly invalid) RMID while we're
	 * busying performing the IPI calls. It's therefore necessary to
	 * check @event's RMID afterwards, and if it has changed,
	 * discard the result of the read.
	 */
	rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);

	if (!__rmid_valid(rr.rmid))
		goto out;

	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);

	raw_spin_lock_irqsave(&cache_lock, flags);
	if (event->hw.cqm_rmid == rr.rmid)
		local64_set(&event->count, atomic64_read(&rr.value));
	raw_spin_unlock_irqrestore(&cache_lock, flags);
out:
	return __perf_event_count(event);
}

static void intel_cqm_event_start(struct perf_event *event, int mode)
{
	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
	unsigned int rmid = event->hw.cqm_rmid;
	unsigned long flags;

	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
		return;

	event->hw.cqm_state &= ~PERF_HES_STOPPED;

	raw_spin_lock_irqsave(&state->lock, flags);

	if (state->cnt++)
		WARN_ON_ONCE(state->rmid != rmid);
	else
		WARN_ON_ONCE(state->rmid);

	state->rmid = rmid;
	wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);

	raw_spin_unlock_irqrestore(&state->lock, flags);
}

static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
	unsigned long flags;

	if (event->hw.cqm_state & PERF_HES_STOPPED)
		return;

	event->hw.cqm_state |= PERF_HES_STOPPED;

	raw_spin_lock_irqsave(&state->lock, flags);
	intel_cqm_event_read(event);

	if (!--state->cnt) {
		state->rmid = 0;
		wrmsrl(MSR_IA32_PQR_ASSOC, 0);
	} else {
		WARN_ON_ONCE(!state->rmid);
	}

	raw_spin_unlock_irqrestore(&state->lock, flags);
}

static int intel_cqm_event_add(struct perf_event *event, int mode)
{
	unsigned long flags;
	unsigned int rmid;

	raw_spin_lock_irqsave(&cache_lock, flags);

	event->hw.cqm_state = PERF_HES_STOPPED;
	rmid = event->hw.cqm_rmid;

	if (__rmid_valid(rmid) && (mode & PERF_EF_START))
		intel_cqm_event_start(event, mode);

	raw_spin_unlock_irqrestore(&cache_lock, flags);

	return 0;
}

static void intel_cqm_event_del(struct perf_event *event, int mode)
{
	intel_cqm_event_stop(event, mode);
}

static void intel_cqm_event_destroy(struct perf_event *event)
{
	struct perf_event *group_other = NULL;

	mutex_lock(&cache_mutex);

	/*
	 * If there's another event in this group...
	 */
	if (!list_empty(&event->hw.cqm_group_entry)) {
		group_other = list_first_entry(&event->hw.cqm_group_entry,
					       struct perf_event,
					       hw.cqm_group_entry);
		list_del(&event->hw.cqm_group_entry);
	}

	/*
	 * And we're the group leader..
	 */
	if (cqm_group_leader(event)) {
		/*
		 * If there was a group_other, make that leader, otherwise
		 * destroy the group and return the RMID.
		 */
		if (group_other) {
			list_replace(&event->hw.cqm_groups_entry,
				     &group_other->hw.cqm_groups_entry);
		} else {
			unsigned int rmid = event->hw.cqm_rmid;

			if (__rmid_valid(rmid))
				__put_rmid(rmid);
			list_del(&event->hw.cqm_groups_entry);
		}
	}

	mutex_unlock(&cache_mutex);
}

static int intel_cqm_event_init(struct perf_event *event)
{
	struct perf_event *group = NULL;
	bool rotate = false;

	if (event->attr.type != intel_cqm_pmu.type)
		return -ENOENT;

	if (event->attr.config & ~QOS_EVENT_MASK)
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	INIT_LIST_HEAD(&event->hw.cqm_group_entry);
	INIT_LIST_HEAD(&event->hw.cqm_groups_entry);

	event->destroy = intel_cqm_event_destroy;

	mutex_lock(&cache_mutex);

	/* Will also set rmid */
	intel_cqm_setup_event(event, &group);
	if (group) {
		list_add_tail(&event->hw.cqm_group_entry,
			      &group->hw.cqm_group_entry);
	} else {
		list_add_tail(&event->hw.cqm_groups_entry,
			      &cache_groups);

		/*
		 * All RMIDs are either in use or have recently been
		 * used. Kick the rotation worker to clean/free some.
		 *
		 * We only do this for the group leader, rather than for
		 * every event in a group to save on needless work.
		 */
		if (!__rmid_valid(event->hw.cqm_rmid))
			rotate = true;
	}

	mutex_unlock(&cache_mutex);

	if (rotate)
		schedule_delayed_work(&intel_cqm_rmid_work, 0);

	return 0;
}

EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");

static struct attribute *intel_cqm_events_attr[] = {
	EVENT_PTR(intel_cqm_llc),
	EVENT_PTR(intel_cqm_llc_pkg),
	EVENT_PTR(intel_cqm_llc_unit),
	EVENT_PTR(intel_cqm_llc_scale),
	EVENT_PTR(intel_cqm_llc_snapshot),
	NULL,
};

static struct attribute_group intel_cqm_events_group = {
	.name = "events",
	.attrs = intel_cqm_events_attr,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *intel_cqm_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group intel_cqm_format_group = {
	.name = "format",
	.attrs = intel_cqm_formats_attr,
};

static ssize_t
max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
			   char *page)
{
	ssize_t rv;

	mutex_lock(&cache_mutex);
	rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
	mutex_unlock(&cache_mutex);

	return rv;
}

static ssize_t
max_recycle_threshold_store(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	unsigned int bytes, cachelines;
	int ret;

	ret = kstrtouint(buf, 0, &bytes);
	if (ret)
		return ret;

	mutex_lock(&cache_mutex);

	__intel_cqm_max_threshold = bytes;
	cachelines = bytes / cqm_l3_scale;

	/*
	 * The new maximum takes effect immediately.
	 */
	if (__intel_cqm_threshold > cachelines)
		__intel_cqm_threshold = cachelines;

	mutex_unlock(&cache_mutex);

	return count;
}

static DEVICE_ATTR_RW(max_recycle_threshold);

static struct attribute *intel_cqm_attrs[] = {
	&dev_attr_max_recycle_threshold.attr,
	NULL,
};

static const struct attribute_group intel_cqm_group = {
	.attrs = intel_cqm_attrs,
};

static const struct attribute_group *intel_cqm_attr_groups[] = {
	&intel_cqm_events_group,
	&intel_cqm_format_group,
	&intel_cqm_group,
	NULL,
};

static struct pmu intel_cqm_pmu = {
	.hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
	.attr_groups	     = intel_cqm_attr_groups,
	.task_ctx_nr	     = perf_sw_context,
	.event_init	     = intel_cqm_event_init,
	.add		     = intel_cqm_event_add,
	.del		     = intel_cqm_event_del,
	.start		     = intel_cqm_event_start,
	.stop		     = intel_cqm_event_stop,
	.read		     = intel_cqm_event_read,
	.count		     = intel_cqm_event_count,
};

static inline void cqm_pick_event_reader(int cpu)
{
	int phys_id = topology_physical_package_id(cpu);
	int i;

	for_each_cpu(i, &cqm_cpumask) {
		if (phys_id == topology_physical_package_id(i))
			return;	/* already got reader for this socket */
	}

	cpumask_set_cpu(cpu, &cqm_cpumask);
}

static void intel_cqm_cpu_prepare(unsigned int cpu)
{
	struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	raw_spin_lock_init(&state->lock);
	state->rmid = 0;
	state->cnt  = 0;

	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
}

static void intel_cqm_cpu_exit(unsigned int cpu)
{
	int phys_id = topology_physical_package_id(cpu);
	int i;

	/*
	 * Is @cpu a designated cqm reader?
	 */
	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
		return;

	for_each_online_cpu(i) {
		if (i == cpu)
			continue;

		if (phys_id == topology_physical_package_id(i)) {
			cpumask_set_cpu(i, &cqm_cpumask);
			break;
		}
	}
}

static int intel_cqm_cpu_notifier(struct notifier_block *nb,
				  unsigned long action, void *hcpu)
{
	unsigned int cpu  = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		intel_cqm_cpu_prepare(cpu);
		break;
	case CPU_DOWN_PREPARE:
		intel_cqm_cpu_exit(cpu);
		break;
	case CPU_STARTING:
		cqm_pick_event_reader(cpu);
		break;
	}

	return NOTIFY_OK;
}

static const struct x86_cpu_id intel_cqm_match[] = {
	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
	{}
};

static int __init intel_cqm_init(void)
{
	char *str, scale[20];
	int i, cpu, ret;

	if (!x86_match_cpu(intel_cqm_match))
		return -ENODEV;

	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;

	/*
	 * It's possible that not all resources support the same number
	 * of RMIDs. Instead of making scheduling much more complicated
	 * (where we have to match a task's RMID to a cpu that supports
	 * that many RMIDs) just find the minimum RMIDs supported across
	 * all cpus.
	 *
	 * Also, check that the scales match on all cpus.
	 */
	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		struct cpuinfo_x86 *c = &cpu_data(cpu);

		if (c->x86_cache_max_rmid < cqm_max_rmid)
			cqm_max_rmid = c->x86_cache_max_rmid;

		if (c->x86_cache_occ_scale != cqm_l3_scale) {
			pr_err("Multiple LLC scale values, disabling\n");
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	__intel_cqm_max_threshold =
		boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);

	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
	str = kstrdup(scale, GFP_KERNEL);
	if (!str) {
		ret = -ENOMEM;
		goto out;
	}

	event_attr_intel_cqm_llc_scale.event_str = str;

	ret = intel_cqm_setup_rmid_cache();
	if (ret)
		goto out;

	for_each_online_cpu(i) {
		intel_cqm_cpu_prepare(i);
		cqm_pick_event_reader(i);
	}

	__perf_cpu_notifier(intel_cqm_cpu_notifier);

	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
	if (ret)
		pr_err("Intel CQM perf registration failed: %d\n", ret);
	else
		pr_info("Intel CQM monitoring enabled\n");

out:
	cpu_notifier_register_done();
	return ret;
}

device_initcall(intel_cqm_init);
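As a usage illustration (not part of this commit), the PMU registered above is exposed to user space as a dynamic perf event source. The sketch below assumes it appears under /sys/bus/event_source/devices/intel_cqm/ (the name passed to perf_pmu_register()), reads the dynamic type id from the "type" file, and uses config=0x01 as advertised by the events/llc_occupancy attribute. Error handling is kept minimal.

/* User-space sketch (not from this commit): open llc_occupancy for the
 * current task and read the raw occupancy count. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct perf_event_attr attr;
	unsigned int type;
	uint64_t count;
	FILE *f;
	int fd;

	/* Dynamic PMU type id, assigned at perf_pmu_register() time. */
	f = fopen("/sys/bus/event_source/devices/intel_cqm/type", "r");
	if (!f || fscanf(f, "%u", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size   = sizeof(attr);
	attr.type   = type;
	attr.config = 0x01;	/* llc_occupancy, per events/llc_occupancy above */

	/* pid = 0 (current task), cpu = -1: a task event, counted via pmu->count */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("llc_occupancy: %llu (raw; multiply by events/llc_occupancy.scale for bytes)\n",
		       (unsigned long long)count);
	return 0;
}

Tools such as perf stat apply the unit and scale attributes automatically, so the same event can simply be counted as intel_cqm/llc_occupancy/.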
include/linux/perf_event.h

@@ -53,6 +53,7 @@ struct perf_guest_info_callbacks {
 #include <linux/sysfs.h>
 #include <linux/perf_regs.h>
 #include <linux/workqueue.h>
+#include <linux/cgroup.h>
 #include <asm/local.h>

 struct perf_callchain_entry {
@@ -118,10 +119,16 @@ struct hw_perf_event {
 			struct hrtimer	hrtimer;
 		};
 		struct { /* tracepoint */
-			struct task_struct	*tp_target;
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
+		struct { /* intel_cqm */
+			int			cqm_state;
+			int			cqm_rmid;
+			struct list_head	cqm_events_entry;
+			struct list_head	cqm_groups_entry;
+			struct list_head	cqm_group_entry;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
@@ -129,12 +136,12 @@ struct hw_perf_event {
 			 * problem hw_breakpoint has with context
 			 * creation and event initalization.
 			 */
-			struct task_struct		*bp_target;
 			struct arch_hw_breakpoint	info;
 			struct list_head		bp_list;
 		};
 #endif
 	};
+	struct task_struct		*target;
 	int				state;
 	local64_t			prev_count;
 	u64				sample_period;
@@ -271,6 +278,11 @@ struct pmu {
 	 */
 	size_t				task_ctx_size;

+
+	/*
+	 * Return the count value for a counter.
+	 */
+	u64 (*count)			(struct perf_event *event); /*optional*/
 };

 /**
@@ -547,6 +559,35 @@ struct perf_output_handle {
 	int				page;
 };

+#ifdef CONFIG_CGROUP_PERF
+
+/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+	u64				time;
+	u64				timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state	css;
+	struct perf_cgroup_info	__percpu *info;
+};
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+	return container_of(task_css(task, perf_event_cgrp_id),
+			    struct perf_cgroup, css);
+}
+#endif /* CONFIG_CGROUP_PERF */
+
 #ifdef CONFIG_PERF_EVENTS

 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
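The new optional pmu::count callback added above lets a PMU compute a counter's value on demand instead of relying on event->count; perf_event_count() (see the kernel/events/core.c hunk below) dispatches to it when present. A hypothetical driver-side sketch of wiring it up follows; my_pmu and my_pmu_read_value are invented names for illustration, not part of this commit.

/* Hypothetical sketch (names invented): supply a count callback. */
static u64 my_pmu_event_count(struct perf_event *event)
{
	/* Derive the value from driver state rather than event->count,
	 * much like intel_cqm_event_count() reads RMID occupancy. */
	return my_pmu_read_value(event);	/* made-up helper */
}

static struct pmu my_pmu = {
	/* ...the usual event_init/add/del/start/stop/read callbacks... */
	.count = my_pmu_event_count,	/* optional; core falls back to __perf_event_count() */
};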
kernel/events/core.c

@@ -34,11 +34,11 @@
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
+#include <linux/cgroup.h>
 #include <linux/perf_event.h>
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
-#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/mman.h>
 #include <linux/compat.h>
@@ -351,32 +351,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,

 #ifdef CONFIG_CGROUP_PERF

-/*
- * perf_cgroup_info keeps track of time_enabled for a cgroup.
- * This is a per-cpu dynamically allocated data structure.
- */
-struct perf_cgroup_info {
-	u64				time;
-	u64				timestamp;
-};
-
-struct perf_cgroup {
-	struct cgroup_subsys_state	css;
-	struct perf_cgroup_info	__percpu *info;
-};
-
-/*
- * Must ensure cgroup is pinned (css_get) before calling
- * this function. In other words, we cannot call this function
- * if there is no cgroup event for the current CPU context.
- */
-static inline struct perf_cgroup *
-perf_cgroup_from_task(struct task_struct *task)
-{
-	return container_of(task_css(task, perf_event_cgrp_id),
-			    struct perf_cgroup, css);
-}
-
 static inline bool
 perf_cgroup_match(struct perf_event *event)
 {
@@ -3220,7 +3194,10 @@ static void __perf_event_read(void *info)

 static inline u64 perf_event_count(struct perf_event *event)
 {
-	return local64_read(&event->count) + atomic64_read(&event->child_count);
+	if (event->pmu->count)
+		return event->pmu->count(event);
+
+	return __perf_event_count(event);
 }

 static u64 perf_event_read(struct perf_event *event)
@@ -7149,7 +7126,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		 struct perf_event *group_leader,
 		 struct perf_event *parent_event,
 		 perf_overflow_handler_t overflow_handler,
-		 void *context)
+		 void *context, int cgroup_fd)
 {
 	struct pmu *pmu;
 	struct perf_event *event;
@@ -7204,16 +7181,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,

 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
-
-		if (attr->type == PERF_TYPE_TRACEPOINT)
-			event->hw.tp_target = task;
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
 		/*
 		 * hw_breakpoint is a bit difficult here..
+		 * XXX pmu::event_init needs to know what task to account to
+		 * and we cannot use the ctx information because we need the
+		 * pmu before we get a ctx.
 		 */
-		else if (attr->type == PERF_TYPE_BREAKPOINT)
-			event->hw.bp_target = task;
-#endif
+		event->hw.target = task;
 	}

 	if (!overflow_handler && parent_event) {
@@ -7245,6 +7218,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (!has_branch_stack(event))
 		event->attr.branch_sample_type = 0;

+	if (cgroup_fd != -1) {
+		err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
+		if (err)
+			goto err_ns;
+	}
+
 	pmu = perf_init_event(event);
 	if (!pmu)
 		goto err_ns;
@@ -7268,6 +7247,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 			event->destroy(event);
 		module_put(pmu->module);
 	err_ns:
+		if (is_cgroup_event(event))
+			perf_detach_cgroup(event);
 		if (event->ns)
 			put_pid_ns(event->ns);
 		kfree(event);
@@ -7486,6 +7467,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	int move_group = 0;
 	int err;
 	int f_flags = O_RDWR;
+	int cgroup_fd = -1;

 	/* for future expandability... */
 	if (flags & ~PERF_FLAG_ALL)
@@ -7551,21 +7533,16 @@ SYSCALL_DEFINE5(perf_event_open,

 	get_online_cpus();

+	if (flags & PERF_FLAG_PID_CGROUP)
+		cgroup_fd = pid;
+
 	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
-				 NULL, NULL);
+				 NULL, NULL, cgroup_fd);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err_cpus;
 	}

-	if (flags & PERF_FLAG_PID_CGROUP) {
-		err = perf_cgroup_connect(pid, event, &attr, group_leader);
-		if (err) {
-			__free_event(event);
-			goto err_cpus;
-		}
-	}
-
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
 			err = -ENOTSUPP;
@@ -7802,7 +7779,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	 */
 	event = perf_event_alloc(attr, cpu, task, NULL, NULL,
-				 overflow_handler, context);
+				 overflow_handler, context, -1);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err;
@@ -8163,7 +8140,7 @@ inherit_event(struct perf_event *parent_event,
 					   parent_event->cpu,
 					   child,
 					   group_leader, parent_event,
-					   NULL, NULL);
+					   NULL, NULL, -1);
 	if (IS_ERR(child_event))
 		return child_event;
kernel/events/hw_breakpoint.c

@@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 */
 static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 {
-	struct task_struct *tsk = bp->hw.bp_target;
+	struct task_struct *tsk = bp->hw.target;
 	struct perf_event *iter;
 	int count = 0;

 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->hw.bp_target == tsk &&
+		if (iter->hw.target == tsk &&
 		    find_slot_idx(iter) == type &&
 		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
@@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		int nr;

 		nr = info->cpu_pinned;
-		if (!bp->hw.bp_target)
+		if (!bp->hw.target)
 			nr += max_task_bp_pinned(cpu, type);
 		else
 			nr += task_bp_pinned(cpu, bp, type);
@@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 		weight = -weight;

 	/* Pinned counter cpu profiling */
-	if (!bp->hw.bp_target) {
+	if (!bp->hw.target) {
 		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
 		return;
 	}
kernel/trace/trace_uprobe.c

@@ -1005,7 +1005,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
 		return true;

 	list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
-		if (event->hw.tp_target->mm == mm)
+		if (event->hw.target->mm == mm)
 			return true;
 	}
@@ -1015,7 +1015,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
 static inline bool
 uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
 {
-	return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+	return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
 }

 static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
@@ -1023,10 +1023,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
 	bool done;

 	write_lock(&tu->filter.rwlock);
-	if (event->hw.tp_target) {
+	if (event->hw.target) {
 		list_del(&event->hw.tp_list);
 		done = tu->filter.nr_systemwide ||
-			(event->hw.tp_target->flags & PF_EXITING) ||
+			(event->hw.target->flags & PF_EXITING) ||
 			uprobe_filter_event(tu, event);
 	} else {
 		tu->filter.nr_systemwide--;
@@ -1046,7 +1046,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
 	int err;

 	write_lock(&tu->filter.rwlock);
-	if (event->hw.tp_target) {
+	if (event->hw.target) {
 		/*
 		 * event->parent != NULL means copy_process(), we can avoid
 		 * uprobe_apply(). current->mm must be probed and we can rely