OpenHarmony / kernel_linux

Commit e13e75b8

    Merge branch 'for-4.17/dax' into libnvdimm-for-next

Author:  Dan Williams
Date:    April 9, 2018
Parents: 1ed41b56, 976431b0

Showing 60 changed files with 1637 additions and 1298 deletions (+1637, -1298).
Documentation/admin-guide/kernel-parameters.txt   +11   -0
drivers/dax/Kconfig                               +4    -1
drivers/dax/super.c                               +12   -3
drivers/md/Kconfig                                +1    -1
drivers/md/dm-linear.c                            +6    -0
drivers/md/dm-log-writes.c                        +50   -45
drivers/md/dm-stripe.c                            +6    -0
drivers/md/dm.c                                   +6    -4
drivers/nvdimm/Kconfig                            +1    -1
drivers/s390/block/Kconfig                        +1    -1
fs/block_dev.c                                    +0    -5
fs/dax.c                                          +94   -52
fs/ext2/ext2.h                                    +1    -0
fs/ext2/inode.c                                   +27   -19
fs/ext2/namei.c                                   +2    -16
fs/ext4/inode.c                                   +31   -11
fs/libfs.c                                        +39   -0
fs/xfs/xfs_aops.c                                 +18   -16
fs/xfs/xfs_aops.h                                 +1    -0
fs/xfs/xfs_iops.c                                 +4    -1
include/linux/dax.h                               +33   -9
include/linux/fs.h                                +4    -0
include/linux/sched/deadline.h                    +0    -6
include/linux/sched/isolation.h                   +1    -0
include/linux/sched/nohz.h                        +0    -4
include/linux/tick.h                              +3    -1
include/linux/wait_bit.h                          +70   -0
kernel/sched/Makefile                             +3    -2
kernel/sched/autogroup.c                          +9    -12
kernel/sched/autogroup.h                          +3    -9
kernel/sched/clock.c                              +12   -24
kernel/sched/completion.c                         +1    -4
kernel/sched/core.c                               +108  -57
kernel/sched/cpuacct.c                            +11   -22
kernel/sched/cpudeadline.c                        +11   -12
kernel/sched/cpudeadline.h                        +10   -19
kernel/sched/cpufreq.c                            +0    -1
kernel/sched/cpufreq_schedutil.c                  +67   -70
kernel/sched/cpupri.c                             +5    -10
kernel/sched/cpupri.h                             +9    -16
kernel/sched/cputime.c                            +27   -31
kernel/sched/deadline.c                           +43   -35
kernel/sched/debug.c                              +35   -64
kernel/sched/fair.c                               +180  -119
kernel/sched/idle.c                               +125  -17
kernel/sched/idle_task.c                          +0    -110
kernel/sched/isolation.c                          +6    -8
kernel/sched/loadavg.c                            +15   -19
kernel/sched/membarrier.c                         +12   -15
kernel/sched/rt.c                                 +29   -22
kernel/sched/sched.h                              +325  -298
kernel/sched/stats.c                              +11   -9
kernel/sched/stats.h                              +40   -46
kernel/sched/stop_task.c                          +9    -2
kernel/sched/swait.c                              +4    -2
kernel/sched/topology.c                           +21   -25
kernel/sched/wait.c                               +5    -8
kernel/sched/wait_bit.c                           +65   -6
kernel/time/tick-sched.c                          +8    -7
kernel/workqueue.c                                +2    -1
Documentation/admin-guide/kernel-parameters.txt

@@ -1766,6 +1766,17 @@
 	nohz
 		Disable the tick when a single task runs.
+
+		A residual 1Hz tick is offloaded to workqueues, which you
+		need to affine to housekeeping through the global
+		workqueue's affinity configured via the
+		/sys/devices/virtual/workqueue/cpumask sysfs file, or
+		by using the 'domain' flag described below.
+
+		NOTE: by default the global workqueue runs on all CPUs,
+		so to protect individual CPUs the 'cpumask' file has to
+		be configured manually after bootup.
+
 	domain
 		Isolate from the general SMP balancing and scheduling
 		algorithms. Note that performing domain isolation this way
drivers/dax/Kconfig

+config DAX_DRIVER
+	select DAX
+	bool
+
 menuconfig DAX
 	tristate "DAX: direct access to differentiated memory"
 	select SRCU

@@ -16,7 +20,6 @@ config DEV_DAX
 	  baseline memory pool.  Mappings of a /dev/daxX.Y device impose
 	  restrictions that make the mapping behavior deterministic.
 
 config DEV_DAX_PMEM
 	tristate "PMEM DAX: direct access to persistent memory"
 	depends on LIBNVDIMM && NVDIMM_DAX && DEV_DAX
drivers/dax/super.c

@@ -124,10 +124,19 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 		return len < 0 ? len : -EIO;
 	}
 
-	if ((IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn))
-			|| pfn_t_devmap(pfn))
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
+		/*
+		 * An arch that has enabled the pmem api should also
+		 * have its drivers support pfn_t_devmap()
+		 *
+		 * This is a developer warning and should not trigger in
+		 * production. dax_flush() will crash since it depends
+		 * on being able to do (page_address(pfn_to_page())).
+		 */
+		WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
+	} else if (pfn_t_devmap(pfn)) {
 		/* pass */;
-	else {
+	} else {
 		pr_debug("VFS (%s): error: dax support not enabled\n",
 				sb->s_id);
 		return -EOPNOTSUPP;
drivers/md/Kconfig

@@ -201,7 +201,7 @@ config BLK_DEV_DM_BUILTIN
 config BLK_DEV_DM
 	tristate "Device mapper support"
 	select BLK_DEV_DM_BUILTIN
-	select DAX
+	depends on DAX || DAX=n
 	---help---
 	  Device-mapper is a low level volume manager.  It works by allowing
 	  people to specify mappings for ranges of logical sectors. Various
drivers/md/dm-linear.c

@@ -154,6 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti,
 	return fn(ti, lc->dev, lc->start, ti->len, data);
 }
 
+#if IS_ENABLED(CONFIG_DAX_DRIVER)
 static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
 {

@@ -184,6 +185,11 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
 	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+#else
+#define linear_dax_direct_access NULL
+#define linear_dax_copy_from_iter NULL
+#endif
+
 static struct target_type linear_target = {
 	.name   = "linear",
 	.version = {1, 4, 0},
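The pattern above compiles the DAX callbacks only when CONFIG_DAX_DRIVER is enabled and otherwise defines the method names to NULL, so the target_type table itself stays unconditional. A minimal sketch of the same pattern for a hypothetical device-mapper target follows; the "example_*" names are illustrative and not part of this commit, and the callback body is elided.

/* Hedged sketch: conditional DAX ops for a hypothetical dm target. */
#if IS_ENABLED(CONFIG_DAX_DRIVER)
static long example_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
		long nr_pages, void **kaddr, pfn_t *pfn)
{
	/* would forward to the underlying dax_device, as dm-linear does */
	return -EIO;
}
#else
/* with DAX compiled out, the method pointer simply stays NULL */
#define example_dax_direct_access NULL
#endif

static struct target_type example_target = {
	.name		= "example",
	.direct_access	= example_dax_direct_access,
};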
drivers/md/dm-log-writes.c

@@ -610,51 +610,6 @@ static int log_mark(struct log_writes_c *lc, char *data)
 	return 0;
 }
 
-static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
-		   struct iov_iter *i)
-{
-	/* ... function body moved below, under #if IS_ENABLED(CONFIG_DAX_DRIVER) ... */
-}
-
 static void log_writes_dtr(struct dm_target *ti)
 {
 	struct log_writes_c *lc = ti->private;

@@ -920,6 +875,52 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit
 	limits->io_min = limits->physical_block_size;
 }
 
+#if IS_ENABLED(CONFIG_DAX_DRIVER)
+static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
+		   struct iov_iter *i)
+{
+	struct pending_block *block;
+
+	if (!bytes)
+		return 0;
+
+	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
+	if (!block) {
+		DMERR("Error allocating dax pending block");
+		return -ENOMEM;
+	}
+
+	block->data = kzalloc(bytes, GFP_KERNEL);
+	if (!block->data) {
+		DMERR("Error allocating dax data space");
+		kfree(block);
+		return -ENOMEM;
+	}
+
+	/* write data provided via the iterator */
+	if (!copy_from_iter(block->data, bytes, i)) {
+		DMERR("Error copying dax data");
+		kfree(block->data);
+		kfree(block);
+		return -EIO;
+	}
+
+	/* rewind the iterator so that the block driver can use it */
+	iov_iter_revert(i, bytes);
+
+	block->datalen = bytes;
+	block->sector = bio_to_dev_sectors(lc, sector);
+	block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;
+
+	atomic_inc(&lc->pending_blocks);
+	spin_lock_irq(&lc->blocks_lock);
+	list_add_tail(&block->list, &lc->unflushed_blocks);
+	spin_unlock_irq(&lc->blocks_lock);
+	wake_up_process(lc->log_kthread);
+
+	return 0;
+}
+
 static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 					 long nr_pages, void **kaddr, pfn_t *pfn)
 {

@@ -956,6 +957,10 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
 dax_copy:
 	return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
 }
+#else
+#define log_writes_dax_direct_access NULL
+#define log_writes_dax_copy_from_iter NULL
+#endif
 
 static struct target_type log_writes_target = {
 	.name   = "log-writes",
drivers/md/dm-stripe.c

@@ -311,6 +311,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_REMAPPED;
 }
 
+#if IS_ENABLED(CONFIG_DAX_DRIVER)
 static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
 {

@@ -351,6 +352,11 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
 	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+#else
+#define stripe_dax_direct_access NULL
+#define stripe_dax_copy_from_iter NULL
+#endif
+
 /*
  * Stripe status:
  *
drivers/md/dm.c

@@ -1805,7 +1805,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
 static struct mapped_device *alloc_dev(int minor)
 {
 	int r, numa_node_id = dm_get_numa_node();
-	struct dax_device *dax_dev;
+	struct dax_device *dax_dev = NULL;
 	struct mapped_device *md;
 	void *old_md;

@@ -1871,9 +1871,11 @@ static struct mapped_device *alloc_dev(int minor)
 	md->disk->private_data = md;
 	sprintf(md->disk->disk_name, "dm-%d", minor);
 
-	dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
-	if (!dax_dev)
-		goto bad;
+	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
+		dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+		if (!dax_dev)
+			goto bad;
+	}
 	md->dax_dev = dax_dev;
 
 	add_disk_no_queue_reg(md->disk);
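With CONFIG_DAX disabled, the alloc_dax() stub returns NULL, so callers only treat NULL as an error when DAX support is compiled in, exactly as dm.c does above. A hedged sketch of the typical dax_device lifecycle for a hypothetical block driver; "my_dev" and "my_dax_ops" are illustrative names only.

/* Hedged sketch, assuming a driver-private struct my_dev with a dax_dev field. */
static int my_dax_setup(struct my_dev *dev)
{
	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
		/* NULL is only an error when DAX is compiled in */
		dev->dax_dev = alloc_dax(dev, dev->disk_name, &my_dax_ops);
		if (!dev->dax_dev)
			return -ENOMEM;
	}
	return 0;
}

static void my_dax_teardown(struct my_dev *dev)
{
	if (dev->dax_dev) {
		kill_dax(dev->dax_dev);	/* fence off in-flight dax_direct_access() users */
		put_dax(dev->dax_dev);	/* drop the allocation reference */
	}
}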
drivers/nvdimm/Kconfig

@@ -20,7 +20,7 @@ if LIBNVDIMM
 config BLK_DEV_PMEM
 	tristate "PMEM: Persistent memory block device support"
 	default LIBNVDIMM
-	select DAX
+	select DAX_DRIVER
 	select ND_BTT if BTT
 	select ND_PFN if NVDIMM_PFN
 	help
drivers/s390/block/Kconfig

@@ -15,8 +15,8 @@ config BLK_DEV_XPRAM
 config DCSSBLK
 	def_tristate m
-	select DAX
 	select FS_DAX_LIMITED
+	select DAX_DRIVER
 	prompt "DCSSBLK support"
 	depends on S390 && BLOCK
 	help
fs/block_dev.c

@@ -1946,11 +1946,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 static int blkdev_writepages(struct address_space *mapping,
 			     struct writeback_control *wbc)
 {
-	if (dax_mapping(mapping)) {
-		struct block_device *bdev = I_BDEV(mapping->host);
-
-		return dax_writeback_mapping_range(mapping, bdev, wbc);
-	}
 	return generic_writepages(mapping, wbc);
 }
fs/dax.c

@@ -73,16 +73,15 @@ fs_initcall(init_dax_wait_table);
 #define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
 #define RADIX_DAX_EMPTY	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 
-static unsigned long dax_radix_sector(void *entry)
+static unsigned long dax_radix_pfn(void *entry)
 {
 	return (unsigned long)entry >> RADIX_DAX_SHIFT;
 }
 
-static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
 {
 	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
-			((unsigned long)sector << RADIX_DAX_SHIFT) |
-			RADIX_DAX_ENTRY_LOCK);
+			(pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
 }
 
 static unsigned int dax_radix_order(void *entry)

@@ -299,6 +298,63 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
 }
 
+static unsigned long dax_entry_size(void *entry)
+{
+	if (dax_is_zero_entry(entry))
+		return 0;
+	else if (dax_is_empty_entry(entry))
+		return 0;
+	else if (dax_is_pmd_entry(entry))
+		return PMD_SIZE;
+	else
+		return PAGE_SIZE;
+}
+
+static unsigned long dax_radix_end_pfn(void *entry)
+{
+	return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+}
+
+/*
+ * Iterate through all mapped pfns represented by an entry, i.e. skip
+ * 'empty' and 'zero' entries.
+ */
+#define for_each_mapped_pfn(entry, pfn) \
+	for (pfn = dax_radix_pfn(entry); \
+			pfn < dax_radix_end_pfn(entry); pfn++)
+
+static void dax_associate_entry(void *entry, struct address_space *mapping)
+{
+	unsigned long pfn;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		WARN_ON_ONCE(page->mapping);
+		page->mapping = mapping;
+	}
+}
+
+static void dax_disassociate_entry(void *entry, struct address_space *mapping,
+		bool trunc)
+{
+	unsigned long pfn;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+		page->mapping = NULL;
+	}
+}
+
 /*
  * Find radix tree entry at given index. If it points to an exceptional entry,
  * return it with the radix tree entry locked. If the radix tree doesn't

@@ -405,6 +461,7 @@ static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index,
 		}
 
 		if (pmd_downgrade) {
+			dax_disassociate_entry(entry, mapping, false);
 			radix_tree_delete(&mapping->page_tree, index);
 			mapping->nrexceptional--;
 			dax_wake_mapping_entry_waiter(mapping, index, entry,

@@ -454,6 +511,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
 	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
 		goto out;
+	dax_disassociate_entry(entry, mapping, trunc);
 	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
 	ret = 1;

@@ -526,12 +584,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
  */
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
-				      void *entry, sector_t sector,
+				      void *entry, pfn_t pfn_t,
 				      unsigned long flags, bool dirty)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	void *new_entry;
+	unsigned long pfn = pfn_t_to_pfn(pfn_t);
 	pgoff_t index = vmf->pgoff;
+	void *new_entry;
 
 	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

@@ -546,7 +605,11 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
-	new_entry = dax_radix_locked_entry(sector, flags);
+	new_entry = dax_radix_locked_entry(pfn, flags);
+	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
+		dax_disassociate_entry(entry, mapping, false);
+		dax_associate_entry(new_entry, mapping);
+	}
 
 	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		/*

@@ -657,17 +720,14 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 	i_mmap_unlock_read(mapping);
 }
 
-static int dax_writeback_one(struct block_device *bdev,
-		struct dax_device *dax_dev, struct address_space *mapping,
-		pgoff_t index, void *entry)
+static int dax_writeback_one(struct dax_device *dax_dev,
+		struct address_space *mapping, pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	void *entry2, **slot, *kaddr;
-	long ret = 0, id;
-	sector_t sector;
-	pgoff_t pgoff;
+	void *entry2, **slot;
+	unsigned long pfn;
+	long ret = 0;
 	size_t size;
-	pfn_t pfn;
 
 	/*
 	 * A page got tagged dirty in DAX mapping? Something is seriously

@@ -683,10 +743,10 @@ static int dax_writeback_one(struct block_device *bdev,
 		goto put_unlocked;
 	/*
 	 * Entry got reallocated elsewhere? No need to writeback. We have to
-	 * compare sectors as we must not bail out due to difference in lockbit
+	 * compare pfns as we must not bail out due to difference in lockbit
 	 * or entry type.
 	 */
-	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+	if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
 		goto put_unlocked;
 	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
 				dax_is_zero_entry(entry))) {

@@ -712,33 +772,15 @@ static int dax_writeback_one(struct block_device *bdev,
 	/*
 	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
 	 * in the middle of a PMD, the 'index' we are given will be aligned to
-	 * the start index of the PMD, as will the sector we pull from
-	 * 'entry'. This allows us to flush for PMD_SIZE and not have to
-	 * worry about partial PMD writebacks.
+	 * the start index of the PMD, as will the pfn we pull from 'entry'.
+	 * This allows us to flush for PMD_SIZE and not have to worry about
+	 * partial PMD writebacks.
 	 */
-	sector = dax_radix_sector(entry);
+	pfn = dax_radix_pfn(entry);
 	size = PAGE_SIZE << dax_radix_order(entry);
 
-	id = dax_read_lock();
-	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
-	if (ret)
-		goto dax_unlock;
-
-	/*
-	 * dax_direct_access() may sleep, so cannot hold tree_lock over
-	 * its invocation.
-	 */
-	ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
-	if (ret < 0)
-		goto dax_unlock;
-
-	if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
-		ret = -EIO;
-		goto dax_unlock;
-	}
-
-	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
-	dax_flush(dax_dev, kaddr, size);
+	dax_mapping_entry_mkclean(mapping, index, pfn);
+	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
 	/*
 	 * After we have flushed the cache, we can clear the dirty tag. There
 	 * cannot be new dirty data in the pfn after the flush has completed as

@@ -749,8 +791,6 @@ static int dax_writeback_one(struct block_device *bdev,
 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
 	spin_unlock_irq(&mapping->tree_lock);
 	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
- dax_unlock:
-	dax_read_unlock(id);
 	put_locked_mapping_entry(mapping, index);
 	return ret;

@@ -808,8 +848,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 			break;
 		}
 
-		ret = dax_writeback_one(bdev, dax_dev, mapping, indices[i],
-				pvec.pages[i]);
+		ret = dax_writeback_one(dax_dev, mapping, indices[i],
+				pvec.pages[i]);
 		if (ret < 0) {
 			mapping_set_error(mapping, ret);
 			goto out;

@@ -877,6 +917,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	int ret = VM_FAULT_NOPAGE;
 	struct page *zero_page;
 	void *entry2;
+	pfn_t pfn;
 
 	zero_page = ZERO_PAGE(0);
 	if (unlikely(!zero_page)) {

@@ -884,14 +925,15 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 		goto out;
 	}
 
-	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+	pfn = page_to_pfn_t(zero_page);
+	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 			RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(entry2)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
-	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
+	vm_insert_mixed(vmf->vma, vaddr, pfn);
 out:
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;

@@ -1200,8 +1242,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto error_finish_iomap;
 
-		entry = dax_insert_mapping_entry(mapping, vmf, entry,
-						 dax_iomap_sector(&iomap, pos),
+		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 						 0, write && !sync);
 		if (IS_ERR(entry)) {
 			error = PTR_ERR(entry);

@@ -1280,13 +1321,15 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
+	pfn_t pfn;
 
 	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
 
 	if (unlikely(!zero_page))
 		goto fallback;
 
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+	pfn = page_to_pfn_t(zero_page);
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(ret))
 		goto fallback;

@@ -1409,8 +1452,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto finish_iomap;
 
-		entry = dax_insert_mapping_entry(mapping, vmf, entry,
-						dax_iomap_sector(&iomap, pos),
+		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 						RADIX_DAX_PMD, write && !sync);
 		if (IS_ERR(entry))
 			goto finish_iomap;
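The fs/dax.c changes above switch the radix tree entries from encoding a sector to encoding a pfn, shifted above a few low bits that carry the lock and type flags, so that for_each_mapped_pfn() can walk every page of a PTE or PMD entry. A hedged, userspace-only C demo of that encode/decode round trip; the shift and flag values are illustrative (the kernel derives them from RADIX_TREE_EXCEPTIONAL_SHIFT), only the bit layout idea is taken from the patch.

/* Standalone sketch of the pfn-in-entry encoding; compile with any C compiler. */
#include <assert.h>
#include <stdio.h>

#define DAX_SHIFT	6		/* low bits reserved for lock/type flags (illustrative) */
#define DAX_ENTRY_LOCK	(1UL << 0)
#define DAX_PMD		(1UL << 1)

static void *dax_locked_entry(unsigned long pfn, unsigned long flags)
{
	return (void *)(flags | (pfn << DAX_SHIFT) | DAX_ENTRY_LOCK);
}

static unsigned long dax_entry_pfn(void *entry)
{
	return (unsigned long)entry >> DAX_SHIFT;
}

int main(void)
{
	unsigned long pfn = 0x12345;
	void *entry = dax_locked_entry(pfn, DAX_PMD);

	/* the pfn survives the flag bits kept in the low part of the word */
	assert(dax_entry_pfn(entry) == pfn);
	printf("entry=%p pfn=%#lx\n", entry, dax_entry_pfn(entry));
	return 0;
}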
fs/ext2/ext2.h

@@ -814,6 +814,7 @@ extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 
 /* inode.c */
+extern void ext2_set_file_ops(struct inode *inode);
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
 extern const struct iomap_ops ext2_iomap_ops;
fs/ext2/inode.c

@@ -940,9 +940,6 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	if (WARN_ON_ONCE(IS_DAX(inode)))
-		return -EIO;
-
 	ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);

@@ -952,17 +949,16 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 static int
 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-#ifdef CONFIG_FS_DAX
-	if (dax_mapping(mapping)) {
-		return dax_writeback_mapping_range(mapping,
-						   mapping->host->i_sb->s_bdev,
-						   wbc);
-	}
-#endif
-
 	return mpage_writepages(mapping, wbc, ext2_get_block);
 }
 
+static int
+ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	return dax_writeback_mapping_range(mapping,
+					   mapping->host->i_sb->s_bdev, wbc);
+}
+
 const struct address_space_operations ext2_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,

@@ -990,6 +986,13 @@ const struct address_space_operations ext2_nobh_aops = {
 	.error_remove_page	= generic_error_remove_page,
 };
 
+static const struct address_space_operations ext2_dax_aops = {
+	.writepages		= ext2_dax_writepages,
+	.direct_IO		= noop_direct_IO,
+	.set_page_dirty		= noop_set_page_dirty,
+	.invalidatepage		= noop_invalidatepage,
+};
+
 /*
  * Probably it should be a library function... search for first non-zero word
  * or memcmp with zero_page, whatever is better for particular architecture.

@@ -1388,6 +1391,18 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DAX;
 }
 
+void ext2_set_file_ops(struct inode *inode)
+{
+	inode->i_op = &ext2_file_inode_operations;
+	inode->i_fop = &ext2_file_operations;
+	if (IS_DAX(inode))
+		inode->i_mapping->a_ops = &ext2_dax_aops;
+	else if (test_opt(inode->i_sb, NOBH))
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+	else
+		inode->i_mapping->a_ops = &ext2_aops;
+}
+
 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 {
 	struct ext2_inode_info *ei;

@@ -1480,14 +1495,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 		ei->i_data[n] = raw_inode->i_block[n];
 
 	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &ext2_file_inode_operations;
-		if (test_opt(inode->i_sb, NOBH)) {
-			inode->i_mapping->a_ops = &ext2_nobh_aops;
-			inode->i_fop = &ext2_file_operations;
-		} else {
-			inode->i_mapping->a_ops = &ext2_aops;
-			inode->i_fop = &ext2_file_operations;
-		}
+		ext2_set_file_ops(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext2_dir_inode_operations;
 		inode->i_fop = &ext2_dir_operations;
fs/ext2/namei.c

@@ -107,14 +107,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	inode->i_op = &ext2_file_inode_operations;
-	if (test_opt(inode->i_sb, NOBH)) {
-		inode->i_mapping->a_ops = &ext2_nobh_aops;
-		inode->i_fop = &ext2_file_operations;
-	} else {
-		inode->i_mapping->a_ops = &ext2_aops;
-		inode->i_fop = &ext2_file_operations;
-	}
+	ext2_set_file_ops(inode);
 	mark_inode_dirty(inode);
 	return ext2_add_nondir(dentry, inode);
 }

@@ -125,14 +118,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	inode->i_op = &ext2_file_inode_operations;
-	if (test_opt(inode->i_sb, NOBH)) {
-		inode->i_mapping->a_ops = &ext2_nobh_aops;
-		inode->i_fop = &ext2_file_operations;
-	} else {
-		inode->i_mapping->a_ops = &ext2_aops;
-		inode->i_fop = &ext2_file_operations;
-	}
+	ext2_set_file_ops(inode);
 	mark_inode_dirty(inode);
 	d_tmpfile(dentry, inode);
 	unlock_new_inode(inode);
fs/ext4/inode.c

@@ -2725,12 +2725,6 @@ static int ext4_writepages(struct address_space *mapping,
 	percpu_down_read(&sbi->s_journal_flag_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
-	if (dax_mapping(mapping)) {
-		ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-						  wbc);
-		goto out_writepages;
-	}
-
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()

@@ -2955,6 +2949,27 @@ static int ext4_writepages(struct address_space *mapping,
 	return ret;
 }
 
+static int ext4_dax_writepages(struct address_space *mapping,
+			       struct writeback_control *wbc)
+{
+	int ret;
+	long nr_to_write = wbc->nr_to_write;
+	struct inode *inode = mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
+	percpu_down_read(&sbi->s_journal_flag_rwsem);
+	trace_ext4_writepages(inode, wbc);
+
+	ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+	trace_ext4_writepages_result(inode, wbc, ret,
+				     nr_to_write - wbc->nr_to_write);
+	percpu_up_read(&sbi->s_journal_flag_rwsem);
+	return ret;
+}
+
 static int ext4_nonda_switch(struct super_block *sb)
 {
 	s64 free_clusters, dirty_clusters;

@@ -3857,10 +3872,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (ext4_has_inline_data(inode))
 		return 0;
 
-	/* DAX uses iomap path now */
-	if (WARN_ON_ONCE(IS_DAX(inode)))
-		return 0;
-
 	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 	if (iov_iter_rw(iter) == READ)
 		ret = ext4_direct_IO_read(iocb, iter);

@@ -3946,6 +3957,13 @@ static const struct address_space_operations ext4_da_aops = {
 	.error_remove_page	= generic_error_remove_page,
 };
 
+static const struct address_space_operations ext4_dax_aops = {
+	.writepages		= ext4_dax_writepages,
+	.direct_IO		= noop_direct_IO,
+	.set_page_dirty		= noop_set_page_dirty,
+	.invalidatepage		= noop_invalidatepage,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
 	switch (ext4_inode_journal_mode(inode)) {

@@ -3958,7 +3976,9 @@ void ext4_set_aops(struct inode *inode)
 	default:
 		BUG();
 	}
-	if (test_opt(inode->i_sb, DELALLOC))
+	if (IS_DAX(inode))
+		inode->i_mapping->a_ops = &ext4_dax_aops;
+	else if (test_opt(inode->i_sb, DELALLOC))
 		inode->i_mapping->a_ops = &ext4_da_aops;
 	else
 		inode->i_mapping->a_ops = &ext4_aops;
fs/libfs.c

@@ -1060,6 +1060,45 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 }
 EXPORT_SYMBOL(noop_fsync);
 
+int noop_set_page_dirty(struct page *page)
+{
+	/*
+	 * Unlike __set_page_dirty_no_writeback that handles dirty page
+	 * tracking in the page object, dax does all dirty tracking in
+	 * the inode address_space in response to mkwrite faults. In the
+	 * dax case we only need to worry about potentially dirty CPU
+	 * caches, not dirty page cache pages to write back.
+	 *
+	 * This callback is defined to prevent fallback to
+	 * __set_page_dirty_buffers() in set_page_dirty().
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(noop_set_page_dirty);
+
+void noop_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int length)
+{
+	/*
+	 * There is no page cache to invalidate in the dax case, however
+	 * we need this callback defined to prevent falling back to
+	 * block_invalidatepage() in do_invalidatepage().
+	 */
+}
+EXPORT_SYMBOL_GPL(noop_invalidatepage);
+
+ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+	/*
+	 * iomap based filesystems support direct I/O without need for
+	 * this callback. However, it still needs to be set in
+	 * inode->a_ops so that open/fcntl know that direct I/O is
+	 * generally supported.
+	 */
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(noop_direct_IO);
+
 /* Because kfree isn't assignment-compatible with void(void*) ;-/ */
 void kfree_link(void *p)
 {
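The three noop helpers above exist so that a DAX-only address_space_operations table can be built without pulling in any page-cache machinery. A hedged sketch of what such a table looks like for a hypothetical filesystem; "examplefs" is an illustrative name, the only method that does real work is ->writepages.

/* Hedged sketch of a DAX aops built from the noop helpers added above. */
static int examplefs_dax_writepages(struct address_space *mapping,
				    struct writeback_control *wbc)
{
	/* flush CPU caches for all dirty DAX entries of this mapping */
	return dax_writeback_mapping_range(mapping,
			mapping->host->i_sb->s_bdev, wbc);
}

static const struct address_space_operations examplefs_dax_aops = {
	.writepages	= examplefs_dax_writepages,
	.direct_IO	= noop_direct_IO,	/* tells open/fcntl that O_DIRECT is fine */
	.set_page_dirty	= noop_set_page_dirty,	/* dirty state lives in the radix tree */
	.invalidatepage	= noop_invalidatepage,	/* no page cache pages to drop */
};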
fs/xfs/xfs_aops.c

@@ -1194,16 +1194,22 @@ xfs_vm_writepages(
 	int			ret;
 
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
-	if (dax_mapping(mapping))
-		return dax_writeback_mapping_range(mapping,
-				xfs_find_bdev_for_inode(mapping->host), wbc);
-
 	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
 	if (wpc.ioend)
 		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
 	return ret;
 }
 
+STATIC int
+xfs_dax_writepages(
+	struct address_space	*mapping,
+	struct writeback_control *wbc)
+{
+	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+	return dax_writeback_mapping_range(mapping,
+			xfs_find_bdev_for_inode(mapping->host), wbc);
+}
+
 /*
  * Called to move a page into cleanable state - and from there
  * to be released. The page should already be clean.  We always

@@ -1367,17 +1373,6 @@ xfs_get_blocks(
 	return error;
 }
 
-STATIC ssize_t
-xfs_vm_direct_IO(
-	struct kiocb		*iocb,
-	struct iov_iter		*iter)
-{
-	/*
-	 * We just need the method present so that open/fcntl allow direct I/O.
-	 */
-	return -EINVAL;
-}
-
 STATIC sector_t
 xfs_vm_bmap(
 	struct address_space	*mapping,

@@ -1500,8 +1495,15 @@ const struct address_space_operations xfs_address_space_operations = {
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.bmap			= xfs_vm_bmap,
-	.direct_IO		= xfs_vm_direct_IO,
+	.direct_IO		= noop_direct_IO,
 	.migratepage		= buffer_migrate_page,
 	.is_partially_uptodate  = block_is_partially_uptodate,
 	.error_remove_page	= generic_error_remove_page,
 };
+
+const struct address_space_operations xfs_dax_aops = {
+	.writepages		= xfs_dax_writepages,
+	.direct_IO		= noop_direct_IO,
+	.set_page_dirty		= noop_set_page_dirty,
+	.invalidatepage		= noop_invalidatepage,
+};
fs/xfs/xfs_aops.h

@@ -54,6 +54,7 @@ struct xfs_ioend {
 };
 
 extern const struct address_space_operations xfs_address_space_operations;
+extern const struct address_space_operations xfs_dax_aops;
 
 int	xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
fs/xfs/xfs_iops.c

@@ -1272,7 +1272,10 @@ xfs_setup_iops(
 	case S_IFREG:
 		inode->i_op = &xfs_inode_operations;
 		inode->i_fop = &xfs_file_operations;
-		inode->i_mapping->a_ops = &xfs_address_space_operations;
+		if (IS_DAX(inode))
+			inode->i_mapping->a_ops = &xfs_dax_aops;
+		else
+			inode->i_mapping->a_ops = &xfs_address_space_operations;
 		break;
 	case S_IFDIR:
 		if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
include/linux/dax.h

@@ -26,18 +26,42 @@ extern struct attribute_group dax_attribute_group;
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *dax_get_by_host(const char *host);
+struct dax_device *alloc_dax(void *private, const char *host,
+		const struct dax_operations *ops);
 void put_dax(struct dax_device *dax_dev);
+void kill_dax(struct dax_device *dax_dev);
+void dax_write_cache(struct dax_device *dax_dev, bool wc);
+bool dax_write_cache_enabled(struct dax_device *dax_dev);
 #else
 static inline struct dax_device *dax_get_by_host(const char *host)
 {
 	return NULL;
 }
+static inline struct dax_device *alloc_dax(void *private, const char *host,
+		const struct dax_operations *ops)
+{
+	/*
+	 * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
+	 * NULL is an error or expected.
+	 */
+	return NULL;
+}
 static inline void put_dax(struct dax_device *dax_dev)
 {
 }
+static inline void kill_dax(struct dax_device *dax_dev)
+{
+}
+static inline void dax_write_cache(struct dax_device *dax_dev, bool wc)
+{
+}
+static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
+{
+	return false;
+}
 #endif
 
+struct writeback_control;
 int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #if IS_ENABLED(CONFIG_FS_DAX)
 int __bdev_dax_supported(struct super_block *sb, int blocksize);

@@ -57,6 +81,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
 }
 
 struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
+int dax_writeback_mapping_range(struct address_space *mapping,
+		struct block_device *bdev, struct writeback_control *wbc);
 #else
 static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
 {

@@ -76,22 +102,23 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
 {
 	return NULL;
 }
+
+static inline int dax_writeback_mapping_range(struct address_space *mapping,
+		struct block_device *bdev, struct writeback_control *wbc)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 int dax_read_lock(void);
 void dax_read_unlock(int id);
-struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops);
 bool dax_alive(struct dax_device *dax_dev);
-void kill_dax(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 		void **kaddr, pfn_t *pfn);
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
-void dax_write_cache(struct dax_device *dax_dev, bool wc);
-bool dax_write_cache_enabled(struct dax_device *dax_dev);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);

@@ -121,7 +148,4 @@ static inline bool dax_mapping(struct address_space *mapping)
 	return mapping->host && IS_DAX(mapping->host);
 }
 
-struct writeback_control;
-int dax_writeback_mapping_range(struct address_space *mapping,
-		struct block_device *bdev, struct writeback_control *wbc);
 #endif
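For context on how the declarations above are consumed: dax_direct_access() translates a device page offset into a kernel virtual address and pfn, and must run under dax_read_lock() so the dax_device cannot be killed underneath the caller. A hedged sketch, with error handling abbreviated; "example_peek" is an illustrative name.

/* Hedged sketch: using dax_direct_access() under the dax read lock. */
static int example_peek(struct dax_device *dax_dev, pgoff_t pgoff)
{
	void *kaddr;
	pfn_t pfn;
	long nr;
	int id;

	id = dax_read_lock();		/* pins the dax_device against kill_dax() */
	nr = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
	if (nr < 1) {
		dax_read_unlock(id);
		return nr < 0 ? nr : -EIO;
	}
	/* kaddr now addresses one page of the device's memory directly */
	dax_read_unlock(id);
	return 0;
}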
include/linux/fs.h

@@ -3130,6 +3130,10 @@ extern int simple_rmdir(struct inode *, struct dentry *);
 extern int simple_rename(struct inode *, struct dentry *,
 			 struct inode *, struct dentry *, unsigned int);
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
+extern int noop_set_page_dirty(struct page *page);
+extern void noop_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int length);
+extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int simple_empty(struct dentry *);
 extern int simple_readpage(struct file *file, struct page *page);
 extern int simple_write_begin(struct file *file, struct address_space *mapping,
include/linux/sched/deadline.h

 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SCHED_DEADLINE_H
-#define _LINUX_SCHED_DEADLINE_H
-
-#include <linux/sched.h>
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting

@@ -28,5 +24,3 @@
 {
 	return (s64)(a - b) < 0;
 }
-
-#endif /* _LINUX_SCHED_DEADLINE_H */
include/linux/sched/isolation.h

@@ -12,6 +12,7 @@ enum hk_flags {
 	HK_FLAG_SCHED		= (1 << 3),
 	HK_FLAG_TICK		= (1 << 4),
 	HK_FLAG_DOMAIN		= (1 << 5),
+	HK_FLAG_WQ		= (1 << 6),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
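HK_FLAG_WQ lets subsystems ask the housekeeping layer which CPUs are allowed to run unbound workqueue work when isolation options such as nohz_full= are in use. A hedged sketch of how a subsystem might restrict a cpumask with that API; the helper name "example_restrict_to_housekeeping" is illustrative, only housekeeping_cpumask() itself is from the kernel.

/* Hedged sketch: keep a subsystem's work off isolated CPUs. */
#include <linux/cpumask.h>
#include <linux/sched/isolation.h>

static void example_restrict_to_housekeeping(struct cpumask *mask)
{
	/* housekeeping_cpumask() falls back to all possible CPUs when
	 * no isolation flags are active on the command line */
	cpumask_and(mask, cpu_possible_mask,
		    housekeeping_cpumask(HK_FLAG_WQ | HK_FLAG_DOMAIN));
}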
include/linux/sched/nohz.h

@@ -37,8 +37,4 @@ extern void wake_up_nohz_cpu(int cpu);
 static inline void wake_up_nohz_cpu(int cpu) { }
 #endif
 
-#ifdef CONFIG_NO_HZ_FULL
-extern u64 scheduler_tick_max_deferment(void);
-#endif
-
 #endif /* _LINUX_SCHED_NOHZ_H */
include/linux/tick.h

@@ -113,7 +113,8 @@ enum tick_dep_bits {
 #ifdef CONFIG_NO_HZ_COMMON
 extern bool tick_nohz_enabled;
-extern int tick_nohz_tick_stopped(void);
+extern bool tick_nohz_tick_stopped(void);
+extern bool tick_nohz_tick_stopped_cpu(int cpu);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);

@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
+static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
include/linux/wait_bit.h

@@ -262,4 +262,74 @@ int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode
 	return out_of_line_wait_on_atomic_t(val, action, mode);
 }
 
+extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
+extern void wake_up_var(void *var);
+extern wait_queue_head_t *__var_waitqueue(void *p);
+
+#define ___wait_var_event(var, condition, state, exclusive, ret, cmd)	\
+({									\
+	__label__ __out;						\
+	struct wait_queue_head *__wq_head = __var_waitqueue(var);	\
+	struct wait_bit_queue_entry __wbq_entry;			\
+	long __ret = ret; /* explicit shadow */				\
+									\
+	init_wait_var_entry(&__wbq_entry, var,				\
+			    exclusive ? WQ_FLAG_EXCLUSIVE : 0);		\
+	for (;;) {							\
+		long __int = prepare_to_wait_event(__wq_head,		\
+						   &__wbq_entry.wq_entry, \
+						   state);		\
+		if (condition)						\
+			break;						\
+									\
+		if (___wait_is_interruptible(state) && __int) {		\
+			__ret = __int;					\
+			goto __out;					\
+		}							\
+									\
+		cmd;							\
+	}								\
+	finish_wait(__wq_head, &__wbq_entry.wq_entry);			\
+__out:	__ret;								\
+})
+
+#define __wait_var_event(var, condition)				\
+	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
+			  schedule())
+
+#define wait_var_event(var, condition)					\
+do {									\
+	might_sleep();							\
+	if (condition)							\
+		break;							\
+	__wait_var_event(var, condition);				\
+} while (0)
+
+#define __wait_var_event_killable(var, condition)			\
+	___wait_var_event(var, condition, TASK_KILLABLE, 0, 0,		\
+			  schedule())
+
+#define wait_var_event_killable(var, condition)				\
+({									\
+	int __ret = 0;							\
+	might_sleep();							\
+	if (!(condition))						\
+		__ret = __wait_var_event_killable(var, condition);	\
+	__ret;								\
+})
+
+#define __wait_var_event_timeout(var, condition, timeout)		\
+	___wait_var_event(var, ___wait_cond_timeout(condition),		\
+			  TASK_UNINTERRUPTIBLE, 0, timeout,		\
+			  __ret = schedule_timeout(__ret))
+
+#define wait_var_event_timeout(var, condition, timeout)			\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __wait_var_event_timeout(var, condition, timeout); \
+	__ret;								\
+})
+
 #endif /* _LINUX_WAIT_BIT_H */
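The new wait_var_event()/wake_up_var() pair sleeps on the address of an arbitrary variable rather than on a dedicated waitqueue. A hedged sketch of the intended usage pattern around a reference count; "foo" is an illustrative structure, not from this commit.

/* Hedged sketch: waiting for a plain variable with the new helpers. */
struct foo {
	atomic_t users;
};

static void foo_put(struct foo *foo)
{
	if (atomic_dec_and_test(&foo->users))
		wake_up_var(&foo->users);	/* wake waiters keyed on this address */
}

static void foo_wait_idle(struct foo *foo)
{
	/* sleeps until the condition holds; re-evaluated after every wakeup */
	wait_var_event(&foo->users, atomic_read(&foo->users) == 0);
}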
kernel/sched/Makefile

@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
+obj-y += idle.o fair.o rt.o deadline.o
+obj-y += wait.o wait_bit.o swait.o completion.o
+
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
kernel/sched/autogroup.c

 // SPDX-License-Identifier: GPL-2.0
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/utsname.h>
-#include <linux/security.h>
-#include <linux/export.h>
-
+/*
+ * Auto-group scheduling implementation:
+ */
 #include "sched.h"
 
 unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;

@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
 	autogroup_kref_put(prev);
 }
 
-/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
 void sched_autogroup_create_attach(struct task_struct *p)
 {
 	struct autogroup *ag = autogroup_create();
 
 	autogroup_move_group(p, ag);
-	/* drop extra reference added by autogroup_create() */
+
+	/* Drop extra reference added by autogroup_create(): */
 	autogroup_kref_put(ag);
 }
 EXPORT_SYMBOL(sched_autogroup_create_attach);
 
-/* Cannot be called under siglock. Currently has no users */
+/* Cannot be called under siglock.  Currently has no users: */
 void sched_autogroup_detach(struct task_struct *p)
 {
 	autogroup_move_group(p, &autogroup_default);

@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)
 	return 1;
 }
-
 __setup("noautogroup", setup_autogroup);
 
 #ifdef CONFIG_PROC_FS

@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
 	if (nice < 0 && !can_nice(current, nice))
 		return -EPERM;
 
-	/* this is a heavy operation taking global locks.. */
+	/* This is a heavy operation, taking global locks.. */
 	if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
 		return -EAGAIN;

@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
 	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
-#endif /* CONFIG_SCHED_DEBUG */
+#endif
kernel/sched/autogroup.h

 /* SPDX-License-Identifier: GPL-2.0 */
 #ifdef CONFIG_SCHED_AUTOGROUP
 
-#include <linux/kref.h>
-#include <linux/rwsem.h>
-#include <linux/sched/autogroup.h>
-
 struct autogroup {
 	/*
-	 * reference doesn't mean how many thread attach to this
-	 * autogroup now. It just stands for the number of task
-	 * could use this autogroup.
+	 * Reference doesn't mean how many threads attach to this
+	 * autogroup now. It just stands for the number of tasks
+	 * which could use this autogroup.
 	 */
 	struct kref		kref;
 	struct task_group	*tg;

@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
 	return tg;
 }
 
-#ifdef CONFIG_SCHED_DEBUG
 static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
 	return 0;
 }
-#endif
 
 #endif /* CONFIG_SCHED_AUTOGROUP */
kernel/sched/clock.c

 /*
- * sched_clock for unstable cpu clocks
+ * sched_clock() for unstable CPU clocks
  *
  *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
  *

@@ -11,7 +11,7 @@
  *   Guillaume Chazarain <guichaz@gmail.com>
  *
  *
- * What:
+ * What this file implements:
  *
  * cpu_clock(i) provides a fast (execution time) high resolution
  * clock with bounded drift between CPUs. The value of cpu_clock(i)

@@ -26,11 +26,11 @@
  * at 0 on boot (but people really shouldn't rely on that).
  *
  * cpu_clock(i)       -- can be used from any context, including NMI.
- * local_clock()      -- is cpu_clock() on the current cpu.
+ * local_clock()      -- is cpu_clock() on the current CPU.
  *
  * sched_clock_cpu(i)
  *
- * How:
+ * How it is implemented:
  *
  * The implementation either uses sched_clock() when
  * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the

@@ -52,19 +52,7 @@
  * that is otherwise invisible (TSC gets stopped).
  *
  */
-#include <linux/spinlock.h>
-#include <linux/hardirq.h>
-#include <linux/export.h>
-#include <linux/percpu.h>
-#include <linux/ktime.h>
-#include <linux/sched.h>
-#include <linux/nmi.h>
-#include <linux/sched/clock.h>
-#include <linux/static_key.h>
-#include <linux/workqueue.h>
-#include <linux/compiler.h>
-#include <linux/tick.h>
-#include <linux/init.h>
+#include "sched.h"
 
 /*
  * Scheduler clock - returns current time in nanosec units.

@@ -302,21 +290,21 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
 	 * cmpxchg64 below only protects one readout.
 	 *
 	 * We must reread via sched_clock_local() in the retry case on
-	 * 32bit as an NMI could use sched_clock_local() via the
+	 * 32-bit kernels as an NMI could use sched_clock_local() via the
 	 * tracer and hit between the readout of
-	 * the low 32bit and the high 32bit portion.
+	 * the low 32-bit and the high 32-bit portion.
 	 */
 	this_clock = sched_clock_local(my_scd);
 	/*
-	 * We must enforce atomic readout on 32bit, otherwise the
-	 * update on the remote cpu can hit inbetween the readout of
-	 * the low 32bit and the high 32bit portion.
+	 * We must enforce atomic readout on 32-bit, otherwise the
+	 * update on the remote CPU can hit inbetween the readout of
+	 * the low 32-bit and the high 32-bit portion.
 	 */
 	remote_clock = cmpxchg64(&scd->clock, 0, 0);
 #else
 	/*
-	 * On 64bit the read of [my]scd->clock is atomic versus the
-	 * update, so we can avoid the above 32bit dance.
+	 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
+	 * update, so we can avoid the above 32-bit dance.
 	 */
 	sched_clock_local(my_scd);
 again:
kernel/sched/completion.c

@@ -11,10 +11,7 @@
  * typically be used for exclusion which gives rise to priority inversion.
  * Waiting for completion is a typically sync point, but not an exclusion point.
  */
-
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/completion.h>
+#include "sched.h"
 
 /**
  * complete: - signals a single thread waiting on this completion
kernel/sched/core.c

@@ -5,37 +5,11 @@
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */
-#include <linux/sched.h>
-#include <linux/sched/clock.h>
-#include <uapi/linux/sched/types.h>
-#include <linux/sched/loadavg.h>
-#include <linux/sched/hotplug.h>
-#include <linux/wait_bit.h>
-#include <linux/cpuset.h>
-#include <linux/delayacct.h>
-#include <linux/init_task.h>
-#include <linux/context_tracking.h>
-#include <linux/rcupdate_wait.h>
-#include <linux/compat.h>
-#include <linux/blkdev.h>
-#include <linux/kprobes.h>
-#include <linux/mmu_context.h>
-#include <linux/module.h>
-#include <linux/nmi.h>
-#include <linux/prefetch.h>
-#include <linux/profile.h>
-#include <linux/security.h>
-#include <linux/syscalls.h>
-#include <linux/sched/isolation.h>
+#include "sched.h"
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#endif
 
-#include "sched.h"
 #include "../workqueue_internal.h"
 #include "../smpboot.h"

@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 *					[L] ->on_rq
 *	RELEASE (rq->lock)
 *
- * If we observe the old cpu in task_rq_lock, the acquire of
+ * If we observe the old CPU in task_rq_lock, the acquire of
 * the old rq->lock will fully serialize against the stores.
 *
 * If we observe the new CPU in task_rq_lock, the acquire will

@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay)
 }
 #endif /* CONFIG_SMP */
 
-static void init_rq_hrtick(struct rq *rq)
+static void hrtick_rq_init(struct rq *rq)
 {
 #ifdef CONFIG_SMP
 	rq->hrtick_csd_pending = 0;

@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq)
 {
 }
 
-static inline void init_rq_hrtick(struct rq *rq)
+static inline void hrtick_rq_init(struct rq *rq)
 {
 }
 #endif	/* CONFIG_SCHED_HRTICK */

@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process);
 *
 *  - cpu_active must be a subset of cpu_online
 *
- *  - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
 *    see __set_cpus_allowed_ptr(). At this point the newly online
 *    CPU isn't yet part of the sched domains, and balancing will not
 *    see it.

@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
 	raw_spin_unlock_irq(&rq->lock);
 }
 
+/*
+ * NOP if the arch has not defined these:
+ */
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch()	do { } while (0)
+#endif
+
 /**
 * prepare_task_switch - prepare to switch tasks
 * @rq: the runqueue preparing to switch

@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
 	/*
-	 * 64-bit doesn't need locks to atomically read a 64bit value.
+	 * 64-bit doesn't need locks to atomically read a 64-bit value.
 	 * So we have a optimization chance when the task's delta_exec is 0.
 	 * Reading ->on_cpu is racy, but this is ok.
 	 *

@@ -3096,35 +3082,99 @@ void scheduler_tick(void)
 	rq->idle_balance = idle_cpu(cpu);
 	trigger_load_balance(rq);
 #endif
-	rq_last_tick_reset(rq);
 }
 
 #ifdef CONFIG_NO_HZ_FULL
-/**
- * scheduler_tick_max_deferment
- *
- * Keep at least one tick per second when a single
- * active task is running because the scheduler doesn't
- * yet completely support full dynticks environment.
- *
- * This makes sure that uptime, CFS vruntime, load
- * balancing, etc... continue to move forward, even
- * with a very low granularity.
- *
- * Return: Maximum deferment in nanoseconds.
- */
-u64 scheduler_tick_max_deferment(void)
+
+struct tick_work {
+	int			cpu;
+	struct delayed_work	work;
+};
+
+static struct tick_work __percpu *tick_work_cpu;
+
+static void sched_tick_remote(struct work_struct *work)
 {
-	struct rq *rq = this_rq();
-	unsigned long next, now = READ_ONCE(jiffies);
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct tick_work *twork = container_of(dwork, struct tick_work, work);
+	int cpu = twork->cpu;
+	struct rq *rq = cpu_rq(cpu);
+	struct rq_flags rf;
 
-	next = rq->last_sched_tick + HZ;
+	/*
+	 * Handle the tick only if it appears the remote CPU is running in full
+	 * dynticks mode. The check is racy by nature, but missing a tick or
+	 * having one too much is no big deal because the scheduler tick updates
+	 * statistics and checks timeslices in a time-independent way, regardless
+	 * of when exactly it is running.
+	 */
+	if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
+		struct task_struct *curr;
+		u64 delta;
 
-	if (time_before_eq(next, now))
-		return 0;
+		rq_lock_irq(rq, &rf);
+		update_rq_clock(rq);
+		curr = rq->curr;
+		delta = rq_clock_task(rq) - curr->se.exec_start;
 
-	return jiffies_to_nsecs(next - now);
+		/*
+		 * Make sure the next tick runs within a reasonable
+		 * amount of time.
+		 */
+		WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+		curr->sched_class->task_tick(rq, curr, 0);
+		rq_unlock_irq(rq, &rf);
+	}
+
+	/*
+	 * Run the remote tick once per second (1Hz). This arbitrary
+	 * frequency is large enough to avoid overload but short enough
+	 * to keep scheduler internal stats reasonably up to date.
+	 */
+	queue_delayed_work(system_unbound_wq, dwork, HZ);
+}
+
+static void sched_tick_start(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	twork->cpu = cpu;
+	INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
+	queue_delayed_work(system_unbound_wq, &twork->work, HZ);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void sched_tick_stop(int cpu)
+{
+	struct tick_work *twork;
+
+	if (housekeeping_cpu(cpu, HK_FLAG_TICK))
+		return;
+
+	WARN_ON_ONCE(!tick_work_cpu);
+
+	twork = per_cpu_ptr(tick_work_cpu, cpu);
+	cancel_delayed_work_sync(&twork->work);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __init sched_tick_offload_init(void)
+{
+	tick_work_cpu = alloc_percpu(struct tick_work);
+	BUG_ON(!tick_work_cpu);
+
+	return 0;
 }
-#endif
+
+#else /* !CONFIG_NO_HZ_FULL */
+static inline void sched_tick_start(int cpu) { }
+static inline void sched_tick_stop(int cpu) { }
+#endif
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \

@@ -5786,6 +5836,7 @@ int sched_cpu_starting(unsigned int cpu)
 {
 	set_cpu_rq_start_time(cpu);
 	sched_rq_cpu_starting(cpu);
+	sched_tick_start(cpu);
 	return 0;
 }

@@ -5797,6 +5848,7 @@ int sched_cpu_dying(unsigned int cpu)
 	/* Handle pending wakeups and then migrate everything off */
 	sched_ttwu_pending();
+	sched_tick_stop(cpu);
 
 	rq_lock_irqsave(rq, &rf);
 	if (rq->rd) {

@@ -6024,11 +6076,8 @@ void __init sched_init(void)
 		rq->last_load_update_tick = jiffies;
 		rq->nohz_flags = 0;
 #endif
-#ifdef CONFIG_NO_HZ_FULL
-		rq->last_sched_tick = 0;
-#endif
 #endif /* CONFIG_SMP */
-		init_rq_hrtick(rq);
+		hrtick_rq_init(rq);
 		atomic_set(&rq->nr_iowait, 0);
 	}

@@ -7027,3 +7076,5 @@ const u32 sched_prio_to_wmult[40] = {
 /* 10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
+
+#undef CREATE_TRACE_POINTS
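sched_tick_remote() above replaces the old "maximum tick deferment" logic with a self-rearming delayed work per CPU, queued on the unbound workqueue so the residual 1Hz tick can run on housekeeping CPUs. A hedged sketch of that per-CPU delayed-work pattern in isolation; the "example_*" names are illustrative, only the workqueue and percpu APIs are from the kernel.

/* Hedged sketch: one self-rearming delayed work per CPU at 1Hz. */
struct example_work {
	int			cpu;
	struct delayed_work	work;
};

static struct example_work __percpu *example_works;

static void example_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct example_work *ew = container_of(dwork, struct example_work, work);

	/* ... per-CPU bookkeeping for ew->cpu would go here ... */

	queue_delayed_work(system_unbound_wq, dwork, HZ);	/* re-arm at 1Hz */
}

static void example_start(int cpu)
{
	struct example_work *ew = per_cpu_ptr(example_works, cpu);

	ew->cpu = cpu;
	INIT_DELAYED_WORK(&ew->work, example_fn);
	queue_delayed_work(system_unbound_wq, &ew->work, HZ);
}

static int __init example_init(void)
{
	example_works = alloc_percpu(struct example_work);
	return example_works ? 0 : -ENOMEM;
}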
kernel/sched/cpuacct.c

 // SPDX-License-Identifier: GPL-2.0
-#include <linux/cgroup.h>
-#include <linux/slab.h>
-#include <linux/percpu.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/seq_file.h>
-#include <linux/rcupdate.h>
-#include <linux/kernel_stat.h>
-#include <linux/err.h>
-
-#include "sched.h"
-
 /*
  * CPU accounting code for task groups.
  *
  * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
  * (balbir@in.ibm.com).
  */
+#include "sched.h"
 
-/* Time spent by the tasks of the cpu accounting group executing in ... */
+/* Time spent by the tasks of the CPU accounting group executing in ... */
 enum cpuacct_stat_index {
 	CPUACCT_STAT_USER,	/* ... user mode */
 	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */

@@ -35,12 +24,12 @@ struct cpuacct_usage {
 	u64	usages[CPUACCT_STAT_NSTATS];
 };
 
-/* track cpu usage of a group of tasks and its child groups */
+/* track CPU usage of a group of tasks and its child groups */
 struct cpuacct {
-	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
-	struct cpuacct_usage __percpu *cpuusage;
-	struct kernel_cpustat __percpu *cpustat;
+	struct cgroup_subsys_state	css;
+	/* cpuusage holds pointer to a u64-type object on every CPU */
+	struct cpuacct_usage __percpu	*cpuusage;
+	struct kernel_cpustat __percpu	*cpustat;
 };
 
 static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)

@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
 	return css ? container_of(css, struct cpuacct, css) : NULL;
 }
 
-/* return cpu accounting group to which this task belongs */
+/* Return CPU accounting group to which this task belongs */
 static inline struct cpuacct *task_ca(struct task_struct *tsk)
 {
 	return css_ca(task_css(tsk, cpuacct_cgrp_id));

@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = {
 	.cpuusage	= &root_cpuacct_cpuusage,
 };
 
-/* create a new cpu accounting group */
+/* Create a new CPU accounting group */
 static struct cgroup_subsys_state *
 cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
 {

@@ -96,7 +85,7 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
 	return ERR_PTR(-ENOMEM);
 }
 
-/* destroy an existing cpu accounting group */
+/* Destroy an existing CPU accounting group */
 static void cpuacct_css_free(struct cgroup_subsys_state *css)
 {
 	struct cpuacct *ca = css_ca(css);

@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 #endif
 }
 
-/* return total cpu usage (in nanoseconds) of a group */
+/* Return total CPU usage (in nanoseconds) of a group */
 static u64 __cpuusage_read(struct cgroup_subsys_state *css,
 			   enum cpuacct_stat_index index)
 {
kernel/sched/cpudeadline.c
浏览文件 @
e13e75b8
...
@@ -10,11 +10,7 @@
 * as published by the Free Software Foundation; version 2
 * of the License.
 */
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include "cpudeadline.h"
+#include "sched.h"

static inline int parent(int i)
{
...
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
		return;

	/* adapted from lib/prio_heap.c */
-	while(1) {
+	while (1) {
		u64 largest_dl;
+
		l = left_child(idx);
		r = right_child(idx);
		largest = idx;
...
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
		return 1;
	} else {
		int best_cpu = cpudl_maximum(cp);
+
		WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));

		if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
...
@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
}

/*
- * cpudl_clear - remove a cpu from the cpudl max-heap
+ * cpudl_clear - remove a CPU from the cpudl max-heap
 * @cp: the cpudl max-heap context
- * @cpu: the target cpu
+ * @cpu: the target CPU
 *
 * Notes: assumes cpu_rq(cpu)->lock is locked
 *
...
@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
/*
 * cpudl_set - update the cpudl max-heap
 * @cp: the cpudl max-heap context
- * @cpu: the target cpu
- * @dl: the new earliest deadline for this cpu
+ * @cpu: the target CPU
+ * @dl: the new earliest deadline for this CPU
 *
 * Notes: assumes cpu_rq(cpu)->lock is locked
 *
...
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
	old_idx = cp->elements[cpu].idx;
	if (old_idx == IDX_INVALID) {
		int new_idx = cp->size++;
+
		cp->elements[new_idx].dl = dl;
		cp->elements[new_idx].cpu = cpu;
		cp->elements[cpu].idx = new_idx;
...
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
/*
 * cpudl_set_freecpu - Set the cpudl.free_cpus
 * @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
+ * @cpu: rd attached CPU
 */
void cpudl_set_freecpu(struct cpudl *cp, int cpu)
{
...
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
/*
 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
 * @cp: the cpudl max-heap context
- * @cpu: rd attached cpu
+ * @cpu: rd attached CPU
 */
void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
{
...
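For readers unfamiliar with cpudl: it is an array-based max-heap ordered by absolute deadline, so the root is the CPU whose current deadline is latest (the best push target). A rough user-space sketch of the heapify-down step under that ordering, assuming illustrative names; the kernel version additionally maintains per-CPU index back-pointers:

#include <stdint.h>
#include <stddef.h>

struct item {
	uint64_t dl;	/* absolute deadline; larger == later */
	int	 cpu;
};

static inline size_t left_child(size_t i)  { return (i << 1) + 1; }
static inline size_t right_child(size_t i) { return (i << 1) + 2; }

/* Push elements[idx] down until the max-heap property holds again. */
static void heapify_down(struct item *elements, size_t size, size_t idx)
{
	while (1) {
		size_t l = left_child(idx);
		size_t r = right_child(idx);
		size_t largest = idx;

		if (l < size && elements[l].dl > elements[largest].dl)
			largest = l;
		if (r < size && elements[r].dl > elements[largest].dl)
			largest = r;
		if (largest == idx)
			break;

		struct item tmp = elements[idx];
		elements[idx] = elements[largest];
		elements[largest] = tmp;
		idx = largest;
	}
}

The root element then plays the role that cpudl_maximum(cp) plays in cpudl_find() above.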
kernel/sched/cpudeadline.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUDL_H
#define _LINUX_CPUDL_H

#include <linux/sched.h>
#include <linux/sched/deadline.h>

#define IDX_INVALID		-1

struct cpudl_item {
	u64			dl;
	int			cpu;
	int			idx;
};

struct cpudl {
	raw_spinlock_t		lock;
	int			size;
	cpumask_var_t		free_cpus;
	struct cpudl_item	*elements;
};

#ifdef CONFIG_SMP
int  cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
void cpudl_clear(struct cpudl *cp, int cpu);
int  cpudl_init(struct cpudl *cp);
void cpudl_set_freecpu(struct cpudl *cp, int cpu);
void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
void cpudl_cleanup(struct cpudl *cp);
#endif /* CONFIG_SMP */

#endif /* _LINUX_CPUDL_H */
kernel/sched/cpufreq.c
...
@@ -8,7 +8,6 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include "sched.h"

DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
...
kernel/sched/cpufreq_schedutil.c
...
@@ -11,61 +11,57 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/cpufreq.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/slab.h>
-#include <trace/events/power.h>

#include "sched.h"
+#include <trace/events/power.h>

struct sugov_tunables {
	struct gov_attr_set	attr_set;
	unsigned int		rate_limit_us;
};

struct sugov_policy {
	struct cpufreq_policy	*policy;
	struct sugov_tunables	*tunables;
	struct list_head	tunables_hook;
	raw_spinlock_t		update_lock;	/* For shared policies */
	u64			last_freq_update_time;
	s64			freq_update_delay_ns;
	unsigned int		next_freq;
	unsigned int		cached_raw_freq;

-	/* The next fields are only needed if fast switch cannot be used. */
+	/* The next fields are only needed if fast switch cannot be used: */
	struct irq_work		irq_work;
	struct kthread_work	work;
	struct mutex		work_lock;
	struct kthread_worker	worker;
	struct task_struct	*thread;
	bool			work_in_progress;
	bool			need_freq_update;
};

struct sugov_cpu {
	struct update_util_data	update_util;
	struct sugov_policy	*sg_policy;
	unsigned int		cpu;

	bool			iowait_boost_pending;
	unsigned int		iowait_boost;
	unsigned int		iowait_boost_max;
	u64			last_update;

-	/* The fields below are only needed when sharing a policy. */
+	/* The fields below are only needed when sharing a policy: */
	unsigned long		util_cfs;
	unsigned long		util_dl;
	unsigned long		max;
	unsigned int		flags;

-	/* The field below is for single-CPU policies only. */
+	/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
	unsigned long		saved_idle_calls;
#endif
};
...
@@ -79,9 +75,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
-	 * the @target_cpu, our per-cpu data is fully serialized.
+	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
-	 * However, drivers cannot in general deal with cross-cpu
+	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * sugov_update_commit() call may not for the fast switching platforms.
	 *
...
@@ -111,6 +107,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
	}

	delta_ns = time - sg_policy->last_freq_update_time;
+
	return delta_ns >= sg_policy->freq_update_delay_ns;
}
...
@@ -345,8 +342,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
	return get_next_freq(sg_policy, util, max);
}

static void sugov_update_shared(struct update_util_data *hook, u64 time,
				unsigned int flags)
{
	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
...
@@ -423,8 +420,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
	return sprintf(buf, "%u\n", tunables->rate_limit_us);
}

static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set,
				   const char *buf, size_t count)
{
	struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
	struct sugov_policy *sg_policy;
...
@@ -479,11 +476,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
{
	struct task_struct *thread;
	struct sched_attr attr = {
		.size		= sizeof(struct sched_attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_flags	= SCHED_FLAG_SUGOV,
		.sched_nice	= 0,
		.sched_priority	= 0,
		/*
		 * Fake (unused) bandwidth; workaround to "fix"
		 * priority inheritance.
...
@@ -663,21 +660,21 @@ static int sugov_start(struct cpufreq_policy *policy)
	struct sugov_policy *sg_policy = policy->governor_data;
	unsigned int cpu;

	sg_policy->freq_update_delay_ns	= sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
	sg_policy->last_freq_update_time = 0;
	sg_policy->next_freq		= UINT_MAX;
	sg_policy->work_in_progress	= false;
	sg_policy->need_freq_update	= false;
	sg_policy->cached_raw_freq	= 0;

	for_each_cpu(cpu, policy->cpus) {
		struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

		memset(sg_cpu, 0, sizeof(*sg_cpu));
		sg_cpu->cpu			= cpu;
		sg_cpu->sg_policy		= sg_policy;
		sg_cpu->flags			= 0;
		sg_cpu->iowait_boost_max	= policy->cpuinfo.max_freq;
	}

	for_each_cpu(cpu, policy->cpus) {
...
@@ -721,14 +718,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
}

static struct cpufreq_governor schedutil_gov = {
	.name			= "schedutil",
	.owner			= THIS_MODULE,
	.dynamic_switching	= true,
	.init			= sugov_init,
	.exit			= sugov_exit,
	.start			= sugov_start,
	.stop			= sugov_stop,
	.limits			= sugov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
...
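The rate limiting shown in sugov_should_update_freq() above boils down to comparing the time since the last frequency update against rate_limit_us converted once to nanoseconds. A small standalone sketch of just that test (illustrative struct; the real governor also checks need_freq_update and whether the CPU is going offline):

#include <stdbool.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000ULL

struct policy_state {
	uint64_t last_freq_update_time;	/* ns */
	int64_t	 freq_update_delay_ns;	/* rate_limit_us * NSEC_PER_USEC */
};

/* Allow a new frequency request only once the rate limit has expired. */
static bool should_update_freq(const struct policy_state *sg, uint64_t now)
{
	int64_t delta_ns = (int64_t)(now - sg->last_freq_update_time);

	return delta_ns >= sg->freq_update_delay_ns;
}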
kernel/sched/cpupri.c
...
@@ -14,7 +14,7 @@
 *
 * going from the lowest priority to the highest. CPUs in the INVALID state
 * are not eligible for routing. The system maintains this state with
- * a 2 dimensional bitmap (the first for priority class, the second for cpus
+ * a 2 dimensional bitmap (the first for priority class, the second for CPUs
 * in that class). Therefore a typical application without affinity
 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
 * searches). For tasks with affinity restrictions, the algorithm has a
...
@@ -26,12 +26,7 @@
 * as published by the Free Software Foundation; version 2
 * of the License.
 */
-#include <linux/gfp.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/slab.h>
-#include "cpupri.h"
+#include "sched.h"

/* Convert between a 140 based task->prio, and our 102 based cpupri */
static int convert_prio(int prio)
...
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
}

/**
- * cpupri_set - update the cpu priority setting
+ * cpupri_set - update the CPU priority setting
 * @cp: The cpupri context
- * @cpu: The target cpu
+ * @cpu: The target CPU
 * @newpri: The priority (INVALID-RT99) to assign to this CPU
 *
 * Note: Assumes cpu_rq(cpu)->lock is locked
...
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
		return;

	/*
-	 * If the cpu was currently mapped to a different value, we
+	 * If the CPU was currently mapped to a different value, we
	 * need to map it to the new value then remove the old value.
	 * Note, we must add the new value first, otherwise we risk the
	 * cpu being missed by the priority loop in cpupri_find.
...
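The comment above mentions converting the 140-based task->prio into the 102-slot cpupri space (CPUPRI_INVALID, CPUPRI_IDLE, CPUPRI_NORMAL, then one slot per RT priority). A hedged sketch of that mapping, assuming the usual MAX_RT_PRIO = 100 / MAX_PRIO = 140 split; this mirrors the idea of convert_prio(), it is not quoted from the file:

#define MAX_RT_PRIO	100
#define MAX_PRIO	140

#define CPUPRI_INVALID	-1
#define CPUPRI_IDLE	0
#define CPUPRI_NORMAL	1
/* slots 2..101 map RT priorities; a higher RT priority gets a higher slot */

static int convert_prio(int prio)
{
	if (prio == CPUPRI_INVALID)
		return CPUPRI_INVALID;
	if (prio == MAX_PRIO)		/* idle task */
		return CPUPRI_IDLE;
	if (prio >= MAX_RT_PRIO)	/* SCHED_NORMAL range */
		return CPUPRI_NORMAL;
	return MAX_RT_PRIO - prio + 1;	/* RT: prio 0 (highest) -> slot 101 */
}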
kernel/sched/cpupri.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUPRI_H
#define _LINUX_CPUPRI_H

#include <linux/sched.h>

#define CPUPRI_NR_PRIORITIES	(MAX_RT_PRIO + 2)

#define CPUPRI_INVALID		-1
#define CPUPRI_IDLE		0
#define CPUPRI_NORMAL		1
/* values 2-101 are RT priorities 0-99 */

struct cpupri_vec {
	atomic_t		count;
	cpumask_var_t		mask;
};

struct cpupri {
	struct cpupri_vec	pri_to_cpu[CPUPRI_NR_PRIORITIES];
	int			*cpu_to_pri;
};

#ifdef CONFIG_SMP
int  cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int  cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
#endif

#endif /* _LINUX_CPUPRI_H */
kernel/sched/cputime.c
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/tsacct_kern.h>
-#include <linux/kernel_stat.h>
-#include <linux/static_key.h>
-#include <linux/context_tracking.h>
-#include <linux/sched/cputime.h>
+/*
+ * Simple CPU accounting cgroup controller
+ */
#include "sched.h"

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
...
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
}

/*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
+ * Account user CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in user space since the last update
 */
void account_user_time(struct task_struct *p, u64 cputime)
{
...
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
}

/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
+ * Account guest CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in virtual machine since the last update
 */
void account_guest_time(struct task_struct *p, u64 cputime)
{
...
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
}

/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
+ * Account system CPU time to a process and desired cpustat field
+ * @p: the process that the CPU time gets accounted to
+ * @cputime: the CPU time spent in kernel space since the last update
 * @index: pointer to cpustat field that has to be updated
 */
void account_system_index_time(struct task_struct *p,
...
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p,
}

/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
+ * Account system CPU time to a process.
+ * @p: the process that the CPU time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime: the CPU time spent in kernel space since the last update
 */
void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
{
...
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
/*
 * Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
+ * @cputime: the CPU time spent in involuntary wait
 */
void account_steal_time(u64 cputime)
{
...
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime)
/*
 * Account for idle time.
- * @cputime: the cpu time spent in idle wait
+ * @cputime: the CPU time spent in idle wait
 */
void account_idle_time(u64 cputime)
{
...
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
+ * @p: the process that the CPU time gets accounted to
 * @user_tick: is the tick from userspace
 * @rq: the pointer to rq
 *
...
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks)
	irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
-						struct rq *rq, int nr_ticks) {}
+						struct rq *rq, int nr_ticks) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Use precise platform statistics if available:
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
{
	if (is_idle_task(prev))
...
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev)
	vtime_flush(prev);
	arch_vtime_task_switch(prev);
}
-#endif
+# endif
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
...
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
	*ut = cputime.utime;
	*st = cputime.stime;
}
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
+
/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
+ * Account a single tick of CPU time.
+ * @p: the process that the CPU time gets accounted to
 * @user_tick: indicates if the tick is a user or a system tick
 */
void account_process_tick(struct task_struct *p, int user_tick)
...
kernel/sched/deadline.c
...
@@ -17,9 +17,6 @@
 */
#include "sched.h"

-#include <linux/slab.h>
-#include <uapi/linux/sched/types.h>
-
struct dl_bandwidth def_dl_bandwidth;

static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
...
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
static void push_dl_tasks(struct rq *);
static void pull_dl_task(struct rq *);

-static inline void queue_push_tasks(struct rq *rq)
+static inline void deadline_queue_push_tasks(struct rq *rq)
{
	if (!has_pushable_dl_tasks(rq))
		return;
...
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
}

-static inline void queue_pull_task(struct rq *rq)
+static inline void deadline_queue_pull_task(struct rq *rq)
{
	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
}
...
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
		/*
		 * If we cannot preempt any rq, fall back to pick any
-		 * online cpu.
+		 * online CPU:
		 */
		cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
		if (cpu >= nr_cpu_ids) {
			/*
-			 * Fail to find any suitable cpu.
+			 * Failed to find any suitable CPU.
			 * The task will never come back!
			 */
			BUG_ON(dl_bandwidth_enabled());
...
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq)
{
}

-static inline void queue_push_tasks(struct rq *rq)
+static inline void deadline_queue_push_tasks(struct rq *rq)
{
}

-static inline void queue_pull_task(struct rq *rq)
+static inline void deadline_queue_pull_task(struct rq *rq)
{
}
#endif /* CONFIG_SMP */

static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
-static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
-				  int flags);
+static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);

/*
 * We are being explicitly informed that a new instance is starting,
...
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	if (hrtick_enabled(rq))
		start_hrtick_dl(rq, p);

-	queue_push_tasks(rq);
+	deadline_queue_push_tasks(rq);

	return p;
}
...
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
		enqueue_pushable_dl_task(rq, p);
}

+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
	update_curr_dl(rq);
...
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task)
	/*
	 * We have to consider system topology and task affinity
-	 * first, then we can look for a suitable cpu.
+	 * first, then we can look for a suitable CPU.
	 */
	if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
		return -1;
...
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task)
	 * Now we check how well this matches with task's
	 * affinity and system topology.
	 *
-	 * The last cpu where the task run is our first
+	 * The last CPU where the task run is our first
	 * guess, since it is most likely cache-hot there.
	 */
	if (cpumask_test_cpu(cpu, later_mask))
...
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task)
			best_cpu = cpumask_first_and(later_mask, sched_domain_span(sd));
			/*
-			 * Last chance: if a cpu being in both later_mask
+			 * Last chance: if a CPU being in both later_mask
			 * and current sd span is valid, that becomes our
-			 * choice. Of course, the latest possible cpu is
+			 * choice. Of course, the latest possible CPU is
			 * already under consideration through later_mask.
			 */
			if (best_cpu < nr_cpu_ids) {
...
@@ -2067,7 +2071,7 @@ static int push_dl_task(struct rq *rq)
		if (task == next_task) {
			/*
			 * The task is still there. We don't try
-			 * again, some other cpu will pull it when ready.
+			 * again, some other CPU will pull it when ready.
			 */
			goto out;
		}
...
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
	/*
	 * Since this might be the only -deadline task on the rq,
	 * this is the right place to try to pull some other one
-	 * from an overloaded cpu, if any.
+	 * from an overloaded CPU, if any.
	 */
	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
		return;

-	queue_pull_task(rq);
+	deadline_queue_pull_task(rq);
}

/*
...
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
	if (rq->curr != p) {
#ifdef CONFIG_SMP
		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
-			queue_push_tasks(rq);
+			deadline_queue_push_tasks(rq);
#endif
		if (dl_task(rq->curr))
			check_preempt_curr_dl(rq, p, 0);
...
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
		 * or lowering its prio, so...
		 */
		if (!rq->dl.overloaded)
-			queue_pull_task(rq);
+			deadline_queue_pull_task(rq);

		/*
		 * If we now have a earlier deadline task than p,
...
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p)
{
	struct sched_dl_entity *dl_se = &p->dl;

	dl_se->dl_runtime	= 0;
	dl_se->dl_deadline	= 0;
	dl_se->dl_period	= 0;
	dl_se->flags		= 0;
	dl_se->dl_bw		= 0;
	dl_se->dl_density	= 0;

	dl_se->dl_throttled	= 0;
	dl_se->dl_yielded	= 0;
	dl_se->dl_non_contending = 0;
	dl_se->dl_overrun	= 0;
}

bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
...
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
#ifdef CONFIG_SMP
int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
{
-	unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+	unsigned int dest_cpu;
	struct dl_bw *dl_b;
	bool overflow;
	int cpus, ret;
	unsigned long flags;

+	dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
+
	rcu_read_lock_sched();
	dl_b = dl_bw_of(dest_cpu);
	raw_spin_lock_irqsave(&dl_b->lock, flags);
	cpus = dl_bw_cpus(dest_cpu);
	overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
-	if (overflow)
+	if (overflow) {
		ret = -EBUSY;
-	else {
+	} else {
		/*
		 * We reserve space for this task in the destination
		 * root_domain, as we can't fail after this point.
...
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
	}
	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
	rcu_read_unlock_sched();
+
	return ret;
}
...
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
		ret = 0;
	raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
	rcu_read_unlock_sched();
+
	return ret;
}
...
@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu)
	overflow = __dl_overflow(dl_b, cpus, 0, 0);
	raw_spin_unlock_irqrestore(&dl_b->lock, flags);
	rcu_read_unlock_sched();
+
	return overflow;
}
#endif
...
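Both dl_task_can_attach() and dl_cpu_busy() above reduce to a bandwidth-admission test: would admitting new_bw (while releasing old_bw) push the root domain past its per-CPU deadline budget? A hedged sketch of that test, with illustrative types in place of the kernel's struct dl_bw:

#include <stdbool.h>
#include <stdint.h>

struct dl_budget {
	int64_t  bw;		/* allowed bandwidth per CPU; -1 means admission control is off */
	uint64_t total_bw;	/* bandwidth already admitted in this root domain */
};

/* True if replacing old_bw with new_bw would exceed cpus * per-CPU budget. */
static bool dl_overflow(const struct dl_budget *b, int cpus,
			uint64_t old_bw, uint64_t new_bw)
{
	if (b->bw == -1)	/* admission control disabled */
		return false;

	return (uint64_t)b->bw * cpus < b->total_bw - old_bw + new_bw;
}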
kernel/sched/debug.c
/*
 * kernel/sched/debug.c
 *
- * Print the CFS rbtree
+ * Print the CFS rbtree and other debugging details
 *
 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
 *
...
@@ -9,16 +9,6 @@
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
-#include <linux/proc_fs.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/task.h>
-#include <linux/seq_file.h>
-#include <linux/kallsyms.h>
-#include <linux/utsname.h>
-#include <linux/mempolicy.h>
-#include <linux/debugfs.h>
-
#include "sched.h"

static DEFINE_SPINLOCK(sched_debug_lock);
...
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
	if (table == NULL)
		return NULL;

	set_table_entry(&table[0],  "min_interval",	    &sd->min_interval,	      sizeof(long), 0644, proc_doulongvec_minmax, false);
	set_table_entry(&table[1],  "max_interval",	    &sd->max_interval,	      sizeof(long), 0644, proc_doulongvec_minmax, false);
	set_table_entry(&table[2],  "busy_idx",		    &sd->busy_idx,	      sizeof(int),  0644, proc_dointvec_minmax,   true);
	set_table_entry(&table[3],  "idle_idx",		    &sd->idle_idx,	      sizeof(int),  0644, proc_dointvec_minmax,   true);
	set_table_entry(&table[4],  "newidle_idx",	    &sd->newidle_idx,	      sizeof(int),  0644, proc_dointvec_minmax,   true);
	set_table_entry(&table[5],  "wake_idx",		    &sd->wake_idx,	      sizeof(int),  0644, proc_dointvec_minmax,   true);
	set_table_entry(&table[6],  "forkexec_idx",	    &sd->forkexec_idx,	      sizeof(int),  0644, proc_dointvec_minmax,   true);
	set_table_entry(&table[7],  "busy_factor",	    &sd->busy_factor,	      sizeof(int),  0644, proc_dointvec_minmax,   false);
	set_table_entry(&table[8],  "imbalance_pct",	    &sd->imbalance_pct,	      sizeof(int),  0644, proc_dointvec_minmax,   false);
	set_table_entry(&table[9],  "cache_nice_tries",	    &sd->cache_nice_tries,    sizeof(int),  0644, proc_dointvec_minmax,   false);
	set_table_entry(&table[10], "flags",		    &sd->flags,		      sizeof(int),  0644, proc_dointvec_minmax,   false);
	set_table_entry(&table[11], "max_newidle_lb_cost",  &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
	set_table_entry(&table[12], "name",		    sd->name,		      CORENAME_MAX_SIZE, 0444, proc_dostring,	  false);
	/* &table[13] is terminator */

	return table;
...
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
	return table;
}

static cpumask_var_t		sd_sysctl_cpus;
static struct ctl_table_header	*sd_sysctl_header;

void register_sched_domain_sysctl(void)
{
...
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
{
	struct sched_entity *se = tg->se[cpu];

-#define P(F) \
-	SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
-#define P_SCHEDSTAT(F) \
-	SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
-#define PN(F) \
-	SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN_SCHEDSTAT(F) \
-	SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
+#define P(F)		SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
+#define P_SCHEDSTAT(F)	SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
+#define PN(F)		SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN_SCHEDSTAT(F)	SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))

	if (!se)
		return;
...
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
	PN(se->exec_start);
	PN(se->vruntime);
	PN(se->sum_exec_runtime);
+
	if (schedstat_enabled()) {
		PN_SCHEDSTAT(se->statistics.wait_start);
		PN_SCHEDSTAT(se->statistics.sleep_start);
...
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
		PN_SCHEDSTAT(se->statistics.wait_sum);
		P_SCHEDSTAT(se->statistics.wait_count);
	}
+
	P(se->load.weight);
	P(se->runnable_weight);
#ifdef CONFIG_SMP
...
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg)
		return group_path;

	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
+
	return group_path;
}
#endif
...
@@ -804,9 +778,9 @@ void sysrq_sched_debug_show(void)
/*
 * This itererator needs some explanation.
 * It returns 1 for the header position.
- * This means 2 is cpu 0.
- * In a hotplugged system some cpus, including cpu 0, may be missing so we have
- * to use cpumask_* to iterate over the cpus.
+ * This means 2 is CPU 0.
+ * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
 */
static void *sched_debug_start(struct seq_file *file, loff_t *offset)
{
...
@@ -826,6 +800,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
	if (n < nr_cpu_ids)
		return (void *)(unsigned long)(n + 2);
+
	return NULL;
}
...
@@ -840,10 +815,10 @@ static void sched_debug_stop(struct seq_file *file, void *data)
}

static const struct seq_operations sched_debug_sops = {
	.start		= sched_debug_start,
	.next		= sched_debug_next,
	.stop		= sched_debug_stop,
	.show		= sched_debug_show,
};

static int sched_debug_release(struct inode *inode, struct file *file)
...
@@ -881,14 +856,10 @@ static int __init init_sched_debug_procfs(void)
__initcall(init_sched_debug_procfs);

-#define __P(F) \
-	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
-#define P(F) \
-	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
-#define __PN(F) \
-	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
-#define PN(F) \
-	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
+#define __P(F)	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
+#define P(F)	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
+#define __PN(F)	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
+#define PN(F)	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))

#ifdef CONFIG_NUMA_BALANCING
...
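The iterator comment above ("1 is the header, 2 is CPU 0, holes are skipped") can be made concrete with a tiny user-space sketch that maps a seq position onto the n-th present CPU; the names and fixed-size present array are illustrative, not the kernel's cpumask API:

#include <stdio.h>

#define NR_CPUS 8

/* Position 1 is the header; position n + 2 is the n-th *present* CPU. */
static int position_to_cpu(const int *cpu_present, long pos)
{
	long n = pos - 2;

	if (pos <= 1)
		return -1;			/* header */
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!cpu_present[cpu])
			continue;
		if (n-- == 0)
			return cpu;
	}
	return -1;				/* past the last CPU */
}

int main(void)
{
	int present[NR_CPUS] = { 1, 0, 1, 1, 0, 1, 1, 1 };	/* CPUs 1 and 4 offline */

	printf("position 3 -> CPU %d\n", position_to_cpu(present, 3));
	return 0;
}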
kernel/sched/fair.c
...
@@ -20,25 +20,10 @@
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 */
-#include <linux/sched/mm.h>
-#include <linux/sched/topology.h>
-#include <linux/latencytop.h>
-#include <linux/cpumask.h>
-#include <linux/cpuidle.h>
-#include <linux/slab.h>
-#include <linux/profile.h>
-#include <linux/interrupt.h>
-#include <linux/mempolicy.h>
-#include <linux/migrate.h>
-#include <linux/task_work.h>
-#include <linux/sched/isolation.h>
+#include "sched.h"

 #include <trace/events/sched.h>
-
-#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 *
...
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
#ifdef CONFIG_SMP
/*
- * For asym packing, by default the lower numbered cpu has higher priority.
+ * For asym packing, by default the lower numbered CPU has higher priority.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
...
@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p)
}

/*
- * The averaged statistics, shared & private, memory & cpu,
+ * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
...
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env,
	 * be incurred if the tasks were swapped.
	 */
	if (cur) {
-		/* Skip this swap candidate if cannot move to the source cpu */
+		/* Skip this swap candidate if cannot move to the source CPU: */
		if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
			goto unlock;
...
@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env,
		goto balance;
	}

-	/* Balance doesn't matter much if we're running a task per cpu */
+	/* Balance doesn't matter much if we're running a task per CPU: */
	if (imp > env->best_imp && src_rq->nr_running == 1 &&
	    dst_rq->nr_running == 1)
		goto assign;
...
@@ -1676,7 +1661,7 @@ static void task_numa_compare(struct task_numa_env *env,
	 */
	if (!cur) {
		/*
-		 * select_idle_siblings() uses an per-cpu cpumask that
+		 * select_idle_siblings() uses an per-CPU cpumask that
		 * can be used from IRQ context.
		 */
		local_irq_disable();
...
@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p)
static void numa_migrate_preferred(struct task_struct *p)
{
	unsigned long interval = HZ;
+	unsigned long numa_migrate_retry;

	/* This task has no NUMA fault statistics yet */
	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
...
@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p)
	/* Periodically retry migrating the task to the preferred node */
	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
-	p->numa_migrate_retry = jiffies + interval;
+	numa_migrate_retry = jiffies + interval;
+
+	/*
+	 * Check that the new retry threshold is after the current one. If
+	 * the retry is in the future, it implies that wake_affine has
+	 * temporarily asked NUMA balancing to backoff from placement.
+	 */
+	if (numa_migrate_retry > p->numa_migrate_retry)
+		return;
+
+	/* Safe to try placing the task on the preferred node */
+	p->numa_migrate_retry = numa_migrate_retry;

	/* Success if task is already running on preferred CPU */
	if (task_node(p) == p->numa_preferred_nid)
...
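The added check in numa_migrate_preferred() skips the periodic retry whenever the freshly computed stamp would land after the one already stored (a later stored stamp meaning wake_affine pushed the retry into the future as a backoff). A compact sketch of just that comparison, with plain counters standing in for jiffies:

#include <stdbool.h>

/* Decide whether the periodic NUMA retry may be re-armed at @new_retry. */
static bool retry_placement_now(unsigned long stored_retry, unsigned long new_retry)
{
	/*
	 * A stored stamp earlier than (or equal to) the one we are about to
	 * program has not been pushed out by a wake_affine() backoff, so it
	 * is safe to re-arm and attempt placement now.
	 */
	return new_retry <= stored_retry;
}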
@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio)
}

#ifdef CONFIG_FAIR_GROUP_SCHED
-# ifdef CONFIG_SMP
+#ifdef CONFIG_SMP
/*
 * All this does is approximate the hierarchical proportion which includes that
 * global sum we all love to hate.
...
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
	return clamp_t(long, runnable, MIN_SHARES, shares);
}
-# endif /* CONFIG_SMP */
+#endif /* CONFIG_SMP */

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
...
@@ -3350,7 +3347,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
}

/*
- * Called within set_task_rq() right before setting a task's cpu. The
+ * Called within set_task_rq() right before setting a task's CPU. The
 * caller only guarantees p->pi_lock is held; no other assumptions,
 * including the state of rq->lock, should be made.
 */
...
@@ -3529,7 +3526,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
	/*
	 * runnable_sum can't be lower than running_sum
-	 * As running sum is scale with cpu capacity wehreas the runnable sum
+	 * As running sum is scale with CPU capacity wehreas the runnable sum
	 * is not we rescale running_sum 1st
	 */
	running_sum = se->avg.util_sum /
...
@@ -4676,7 +4673,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
	if (!se)
		add_nr_running(rq, task_delta);

-	/* determine whether we need to wake up potentially idle cpu */
+	/* Determine whether we need to wake up potentially idle CPU: */
	if (rq->curr == rq->idle && rq->cfs.nr_running)
		resched_curr(rq);
}
...
@@ -5041,7 +5038,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
}

/*
- * Both these cpu hotplug callbacks race against unregister_fair_sched_group()
+ * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
 *
 * The race is harmless, since modifying bandwidth settings of unhooked group
 * bits doesn't do much.
...
@@ -5086,7 +5083,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
		 */
		cfs_rq->runtime_remaining = 1;
		/*
-		 * Offline rq is schedulable till cpu is completely disabled
+		 * Offline rq is schedulable till CPU is completely disabled
		 * in take_cpu_down(), so we prevent new cfs throttling here.
		 */
		cfs_rq->runtime_enabled = 0;
...
@@ -5323,8 +5320,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
 *
 *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
 *
- * If a cpu misses updates for n ticks (as it was idle) and update gets
- * called on the n+1-th tick when cpu may be busy, then we have:
+ * If a CPU misses updates for n ticks (as it was idle) and update gets
+ * called on the n+1-th tick when CPU may be busy, then we have:
 *
 *   load_n   = (1 - 1/2^i)^n * load_0
 *   load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
...
@@ -5468,7 +5465,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
#ifdef CONFIG_NO_HZ_COMMON
/*
 * There is no sane way to deal with nohz on smp when using jiffies because the
- * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
 *
 * Therefore we need to avoid the delta approach from the regular tick when
...
@@ -5579,7 +5576,7 @@ void cpu_load_update_active(struct rq *this_rq)
}

/*
- * Return a low guess at the load of a migration-source cpu weighted
+ * Return a low guess at the load of a migration-source CPU weighted
 * according to the scheduling class and "nice" value.
 *
 * We want to under-estimate the load of migration sources, to
...
@@ -5597,7 +5594,7 @@ static unsigned long source_load(int cpu, int type)
}

/*
- * Return a high guess at the load of a migration-target cpu weighted
+ * Return a high guess at the load of a migration-target CPU weighted
 * according to the scheduling class and "nice" value.
 */
static unsigned long target_load(int cpu, int type)
...
@@ -5724,7 +5721,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
	unsigned long task_load;

	this_eff_load = target_load(this_cpu, sd->wake_idx);
-	prev_eff_load = source_load(prev_cpu, sd->wake_idx);

	if (sync) {
		unsigned long current_load = task_h_load(current);
...
@@ -5742,18 +5738,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
	this_eff_load *= 100;
	this_eff_load *= capacity_of(prev_cpu);

+	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
	prev_eff_load -= task_load;
	if (sched_feat(WA_BIAS))
		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
	prev_eff_load *= capacity_of(this_cpu);

-	return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits;
+	/*
+	 * If sync, adjust the weight of prev_eff_load such that if
+	 * prev_eff == this_eff that select_idle_sibling() will consider
+	 * stacking the wakee on top of the waker if no other CPU is
+	 * idle.
+	 */
+	if (sync)
+		prev_eff_load += 1;
+
+	return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}

#ifdef CONFIG_NUMA_BALANCING
+static void
+update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
+{
+	unsigned long interval;
+
+	if (!static_branch_likely(&sched_numa_balancing))
+		return;
+
+	/* If balancing has no preference then continue gathering data */
+	if (p->numa_preferred_nid == -1)
+		return;
+
+	/*
+	 * If the wakeup is not affecting locality then it is neutral from
+	 * the perspective of NUMA balacing so continue gathering data.
+	 */
+	if (cpu_to_node(prev_cpu) == cpu_to_node(target))
+		return;
+
+	/*
+	 * Temporarily prevent NUMA balancing trying to place waker/wakee after
+	 * wakee has been moved by wake_affine. This will potentially allow
+	 * related tasks to converge and update their data placement. The
+	 * 4 * numa_scan_period is to allow the two-pass filter to migrate
+	 * hot data to the wakers node.
+	 */
+	interval = max(sysctl_numa_balancing_scan_delay,
+		       p->numa_scan_period << 2);
+	p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
+
+	interval = max(sysctl_numa_balancing_scan_delay,
+		       current->numa_scan_period << 2);
+	current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
+}
+#else
+static void
+update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
+{
+}
+#endif
+
-static int wake_affine(struct sched_domain *sd, struct task_struct *p,
-		       int prev_cpu, int sync)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+		       int this_cpu, int prev_cpu, int sync)
{
-	int this_cpu = smp_processor_id();
	int target = nr_cpumask_bits;

	if (sched_feat(WA_IDLE))
...
@@ -5766,6 +5813,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
	if (target == nr_cpumask_bits)
		return prev_cpu;

+	update_wa_numa_placement(p, prev_cpu, target);
	schedstat_inc(sd->ttwu_move_affine);
	schedstat_inc(p->se.statistics.nr_wakeups_affine);
	return target;
...
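The wake_affine_weight() change above makes the load comparison strict and, on a sync wakeup, nudges prev_eff_load up by one so that an exact tie favours stacking the wakee on the waker's CPU. A standalone sketch of just that decision, with loads as plain integers and -1 standing in for "no preference" (nr_cpumask_bits in the kernel):

#include <stdbool.h>

#define NO_PREFERENCE (-1)

/*
 * Pick this_cpu only if its effective load is strictly lower than the
 * previous CPU's; on a sync wakeup, bias the comparison by one so that a
 * tie also favours this_cpu (the waker is about to sleep).
 */
static int pick_wake_cpu(long this_eff_load, long prev_eff_load,
			 bool sync, int this_cpu)
{
	if (sync)
		prev_eff_load += 1;

	return this_eff_load < prev_eff_load ? this_cpu : NO_PREFERENCE;
}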
@@ -5826,7 +5874,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
		max_spare_cap = 0;

		for_each_cpu(i, sched_group_span(group)) {
-			/* Bias balancing toward cpus of our domain */
+			/* Bias balancing toward CPUs of our domain */
			if (local_group)
				load = source_load(i, load_idx);
			else
...
@@ -5856,7 +5904,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
			if (min_runnable_load > (runnable_load + imbalance)) {
				/*
				 * The runnable load is significantly smaller
-				 * so we can pick this new cpu
+				 * so we can pick this new CPU:
				 */
				min_runnable_load = runnable_load;
				min_avg_load = avg_load;
...
@@ -5865,7 +5913,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
				   (100*min_avg_load > imbalance_scale*avg_load)) {
				/*
				 * The runnable loads are close so take the
-				 * blocked load into account through avg_load.
+				 * blocked load into account through avg_load:
				 */
				min_avg_load = avg_load;
				idlest = group;
...
@@ -5903,6 +5951,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
	if (!idlest)
		return NULL;

+	/*
+	 * When comparing groups across NUMA domains, it's possible for the
+	 * local domain to be very lightly loaded relative to the remote
+	 * domains but "imbalance" skews the comparison making remote CPUs
+	 * look much more favourable. When considering cross-domain, add
+	 * imbalance to the runnable load on the remote node and consider
+	 * staying local.
+	 */
+	if ((sd->flags & SD_NUMA) &&
+	    min_runnable_load + imbalance >= this_runnable_load)
+		return NULL;
+
	if (min_runnable_load > (this_runnable_load + imbalance))
		return NULL;
...
@@ -5914,7 +5974,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}

/*
- * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
 */
static int
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
...
@@ -5992,12 +6052,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
		new_cpu = find_idlest_group_cpu(group, p, cpu);
		if (new_cpu == cpu) {
-			/* Now try balancing at a lower domain level of cpu */
+			/* Now try balancing at a lower domain level of 'cpu': */
			sd = sd->child;
			continue;
		}

-		/* Now try balancing at a lower domain level of new_cpu */
+		/* Now try balancing at a lower domain level of 'new_cpu': */
		cpu = new_cpu;
		weight = sd->span_weight;
		sd = NULL;
...
@@ -6007,7 +6067,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
			if (tmp->flags & sd_flag)
				sd = tmp;
		}
-		/* while loop will break here if sd == NULL */
	}

	return new_cpu;
...
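The extra check added to find_idlest_group() above says: when the candidate group sits on another NUMA node, add "imbalance" to its runnable load before comparing with the local load, and stay local if the remote group no longer wins. A hedged sketch of that comparison (argument names are illustrative):

#include <stdbool.h>

/*
 * Return true if the task should stay in the local group rather than be
 * placed on the remote candidate. "imbalance" is the slack a remote group
 * must beat before a migration is considered worthwhile.
 */
static bool prefer_local_group(bool cross_numa,
			       unsigned long remote_runnable_load,
			       unsigned long local_runnable_load,
			       unsigned long imbalance)
{
	/* Cross-NUMA: charge the imbalance to the remote side as well. */
	if (cross_numa &&
	    remote_runnable_load + imbalance >= local_runnable_load)
		return true;

	return remote_runnable_load > local_runnable_load + imbalance;
}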
@@ -6203,12 +6262,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
		return target;

	/*
-	 * If the previous cpu is cache affine and idle, don't be stupid.
+	 * If the previous CPU is cache affine and idle, don't be stupid:
	 */
	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
		return prev;

-	/* Check a recently used CPU as a potential idle candidate */
+	/* Check a recently used CPU as a potential idle candidate: */
	recent_used_cpu = p->recent_used_cpu;
	if (recent_used_cpu != prev &&
	    recent_used_cpu != target &&
...
@@ -6217,7 +6276,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
	    cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
		/*
		 * Replace recent_used_cpu with prev as it is a potential
-		 * candidate for the next wake.
+		 * candidate for the next wake:
		 */
		p->recent_used_cpu = prev;
		return recent_used_cpu;
...
@@ -6282,7 +6341,7 @@ static inline unsigned long task_util(struct task_struct *p)
}

/*
- * cpu_util_wake: Compute cpu utilization with any contributions from
+ * cpu_util_wake: Compute CPU utilization with any contributions from
 * the waking task p removed.
 */
static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
...
@@ -6328,10 +6387,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 *
- * Balances load by selecting the idlest cpu in the idlest group, or under
- * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
+ * Balances load by selecting the idlest CPU in the idlest group, or under
+ * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
 *
- * Returns the target cpu number.
+ * Returns the target CPU number.
 *
 * preempt must be disabled.
 */
...
@@ -6342,7 +6401,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
	int cpu = smp_processor_id();
	int new_cpu = prev_cpu;
	int want_affine = 0;
-	int sync = wake_flags & WF_SYNC;
+	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);

	if (sd_flag & SD_BALANCE_WAKE) {
		record_wakee(p);
...
@@ -6356,7 +6415,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
			break;

		/*
-		 * If both cpu and prev_cpu are part of this domain,
+		 * If both 'cpu' and 'prev_cpu' are part of this domain,
		 * cpu is a valid SD_WAKE_AFFINE target.
		 */
		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
...
@@ -6376,7 +6435,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
		if (cpu == prev_cpu)
			goto pick_cpu;

-		new_cpu = wake_affine(affine_sd, p, prev_cpu, sync);
+		new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
	}

	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
...
@@ -6407,9 +6466,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
static void detach_entity_cfs_rq(struct sched_entity *se);

/*
- * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
+ * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
 * cfs_rq_of(p) references at time of call are still valid and identify the
- * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
+ * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
 */
static void migrate_task_rq_fair(struct task_struct *p)
{
...
@@ -6843,17 +6902,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 * BASICS
 *
 * The purpose of load-balancing is to achieve the same basic fairness the
- * per-cpu scheduler provides, namely provide a proportional amount of compute
+ * per-CPU scheduler provides, namely provide a proportional amount of compute
 * time to each task. This is expressed in the following equation:
 *
 *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
 *
- * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
+ * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
 * W_i,0 is defined as:
 *
 *   W_i,0 = \Sum_j w_i,j                                             (2)
 *
- * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
+ * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
 * is derived from the nice value as per sched_prio_to_weight[].
 *
 * The weight average is an exponential decay average of the instantaneous
...
@@ -6861,7 +6920,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 *
 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
 *
- * C_i is the compute capacity of cpu i, typically it is the
+ * C_i is the compute capacity of CPU i, typically it is the
 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
 * can also include other factors [XXX].
 *
...
@@ -6882,11 +6941,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 * SCHED DOMAINS
 *
 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
- * for all i,j solution, we create a tree of cpus that follows the hardware
+ * for all i,j solution, we create a tree of CPUs that follows the hardware
 * topology where each level pairs two lower groups (or better). This results
- * in O(log n) layers. Furthermore we reduce the number of cpus going up the
+ * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
 * tree to only the first of the previous level and we decrease the frequency
- * of load-balance at each level inv. proportional to the number of cpus in
+ * of load-balance at each level inv. proportional to the number of CPUs in
 * the groups.
 *
 * This yields:
...
@@ -6895,7 +6954,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 *     \Sum       { --- * --- * 2^i } = O(n)                          (5)
 *     i = 0      2^i   2^i
 *                               `- size of each group
- *                        |         |     `- number of cpus doing load-balance
+ *                        |         |     `- number of CPUs doing load-balance
 *                        |         `- freq
 *                        `- sum over all levels
 *
...
@@ -6903,7 +6962,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 * this makes (5) the runtime complexity of the balancer.
 *
 * An important property here is that each CPU is still (indirectly) connected
- * to every other cpu in at most O(log n) steps:
+ * to every other CPU in at most O(log n) steps:
 *
 * The adjacency matrix of the resulting graph is given by:
 *
...
@@ -6915,7 +6974,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 *
 *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
 *
- * Showing there's indeed a path between every cpu in at most O(log n) steps.
+ * Showing there's indeed a path between every CPU in at most O(log n) steps.
 * The task movement gives a factor of O(m), giving a convergence complexity
 * of:
 *
...
@@ -6925,7 +6984,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 * WORK CONSERVING
 *
 * In order to avoid CPUs going idle while there's still work to do, new idle
- * balancing is more aggressive and has the newly idle cpu iterate up the domain
+ * balancing is more aggressive and has the newly idle CPU iterate up the domain
 * tree itself instead of relying on other CPUs to bring it work.
 *
 * This adds some complexity to both (5) and (8) but it reduces the total idle
...
@@ -6946,7 +7005,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 *
 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                  (10)
 *
- * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
+ * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
 *
 * The big problem is S_k, its a global sum needed to compute a local (W_i)
 * property.
...
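Equation (3) in the comment block above is a geometric decay of the per-CPU weight average: each step keeps (2^n - 1)/2^n of the old average and mixes in 1/2^n of the instantaneous weight. A tiny numeric sketch of that update rule (n is the decay shift; plain integer arithmetic, rounding ignored):

#include <stdio.h>

/* One step of W'_{i,n} = (2^n - 1)/2^n * W_{i,n} + 1/2^n * W_{i,0}. */
static unsigned long decay_weight_avg(unsigned long w_avg,
				      unsigned long w_now, unsigned int n)
{
	return ((((1UL << n) - 1) * w_avg) >> n) + (w_now >> n);
}

int main(void)
{
	unsigned long w = 0;

	/* Converges toward the instantaneous weight (1024) over time. */
	for (int i = 0; i < 5; i++) {
		w = decay_weight_avg(w, 1024, 3);
		printf("step %d: %lu\n", i, w);
	}
	return 0;
}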
@@ -7110,7 +7169,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
env
->
flags
|=
LBF_SOME_PINNED
;
/*
* Remember if this task can be migrated to any other
cpu
in
* Remember if this task can be migrated to any other
CPU
in
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
...
...
@@ -7120,7 +7179,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if
(
env
->
idle
==
CPU_NEWLY_IDLE
||
(
env
->
flags
&
LBF_DST_PINNED
))
return
0
;
/* Prevent to re-select dst_cpu via env's
cpus
*/
/* Prevent to re-select dst_cpu via env's
CPUs:
*/
for_each_cpu_and
(
cpu
,
env
->
dst_grpmask
,
env
->
cpus
)
{
if
(
cpumask_test_cpu
(
cpu
,
&
p
->
cpus_allowed
))
{
env
->
flags
|=
LBF_DST_PINNED
;
...
...
@@ -7694,8 +7753,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
* Group imbalance indicates (and tries to solve) the problem where balancing
* groups is inadequate due to ->cpus_allowed constraints.
*
* Imagine a situation of two groups of 4
cpu
s each and 4 tasks each with a
* cpumask covering 1
cpu of the first group and 3 cpu
s of the second group.
* Imagine a situation of two groups of 4
CPU
s each and 4 tasks each with a
* cpumask covering 1
CPU of the first group and 3 CPU
s of the second group.
* Something like:
*
* { 0 1 2 3 } { 4 5 6 7 }
...
...
@@ -7703,7 +7762,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
*
* If we were to balance group-wise we'd place two tasks in the first group and
* two tasks in the second group. Clearly this is undesired as it will overload
* cpu 3 and leave one of the
cpu
s in the second group unused.
* cpu 3 and leave one of the
CPU
s in the second group unused.
*
* The current solution to this issue is detecting the skew in the first group
* by noticing the lower domain failed to reach balance and had difficulty
...
...
@@ -7816,7 +7875,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
-		/* Bias balancing toward cpus of our domain */
+		/* Bias balancing toward CPUs of our domain: */
 		if (local_group)
 			load = target_load(i, load_idx);
 		else
...
@@ -7902,7 +7961,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
 
-	/* No ASYM_PACKING if target cpu is already busy */
+	/* No ASYM_PACKING if target CPU is already busy */
 	if (env->idle == CPU_NOT_IDLE)
 		return true;
 	/*
...
@@ -7915,7 +7974,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (!sds->busiest)
 		return true;
 
-	/* Prefer to move from lowest priority cpu's work */
+	/* Prefer to move from lowest priority CPU's work */
 	if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu))
 		return true;
...
@@ -8168,7 +8227,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	if (busiest->group_type == group_imbalanced) {
 		/*
 		 * In the group_imb case we cannot rely on group-wide averages
-		 * to ensure cpu-load equilibrium, look at wider averages. XXX
+		 * to ensure CPU-load equilibrium, look at wider averages. XXX
 		 */
 		busiest->load_per_task = min(busiest->load_per_task, sds->avg_load);
...
@@ -8187,7 +8246,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	}
 
 	/*
-	 * If there aren't any idle cpus, avoid creating some.
+	 * If there aren't any idle CPUs, avoid creating some.
 	 */
 	if (busiest->group_type == group_overloaded &&
 	    local->group_type   == group_overloaded) {
...
@@ -8201,9 +8260,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	}
 
 	/*
-	 * We're trying to get all the cpus to the average_load, so we don't
+	 * We're trying to get all the CPUs to the average_load, so we don't
 	 * want to push ourselves above the average load, nor do we wish to
-	 * reduce the max loaded cpu below the average load. At the same time,
+	 * reduce the max loaded CPU below the average load. At the same time,
 	 * we also don't want to reduce the group load below the group
 	 * capacity. Thus we look for the minimum possible imbalance.
 	 */
...
@@ -8297,9 +8356,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (env->idle == CPU_IDLE) {
 		/*
-		 * This cpu is idle. If the busiest group is not overloaded
+		 * This CPU is idle. If the busiest group is not overloaded
 		 * and there is no imbalance between this and busiest group
-		 * wrt idle cpus, it is balanced. The imbalance becomes
+		 * wrt idle CPUs, it is balanced. The imbalance becomes
 		 * significant if the diff is greater than 1 otherwise we
 		 * might end up to just move the imbalance on another group
 		 */
...
@@ -8327,7 +8386,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 }
 
 /*
- * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
 */
static struct rq *find_busiest_queue(struct lb_env *env,
				     struct sched_group *group)
...
@@ -8371,7 +8430,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		/*
 		 * When comparing with imbalance, use weighted_cpuload()
-		 * which is not scaled with the cpu capacity.
+		 * which is not scaled with the CPU capacity.
 		 */
 		if (rq->nr_running == 1 && wl > env->imbalance &&
...
@@ -8379,9 +8438,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 			continue;
 
 		/*
-		 * For the load comparisons with the other cpu's, consider
-		 * the weighted_cpuload() scaled with the cpu capacity, so
-		 * that the load can be moved away from the cpu that is
+		 * For the load comparisons with the other CPU's, consider
+		 * the weighted_cpuload() scaled with the CPU capacity, so
+		 * that the load can be moved away from the CPU that is
 		 * potentially running at a lower capacity.
 		 *
 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
...
@@ -8452,13 +8511,13 @@ static int should_we_balance(struct lb_env *env)
 		return 0;
 
 	/*
-	 * In the newly idle case, we will allow all the cpu's
+	 * In the newly idle case, we will allow all the CPUs
 	 * to do the newly idle load balance.
 	 */
 	if (env->idle == CPU_NEWLY_IDLE)
 		return 1;
 
-	/* Try to find first idle cpu */
+	/* Try to find first idle CPU */
 	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
 		if (!idle_cpu(cpu))
 			continue;
...
@@ -8471,7 +8530,7 @@ static int should_we_balance(struct lb_env *env)
 	balance_cpu = group_balance_cpu(sg);
 
 	/*
-	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * First idle CPU or the first CPU (busiest) in this sched group
 	 * is eligible for doing load balancing at this and above domains.
 	 */
 	return balance_cpu == env->dst_cpu;
...
@@ -8580,7 +8639,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
 		 * us and move them to an alternate dst_cpu in our sched_group
 		 * where they can run. The upper limit on how many times we
-		 * iterate on same src_cpu is dependent on number of cpus in our
+		 * iterate on same src_cpu is dependent on number of CPUs in our
 		 * sched_group.
 		 *
 		 * This changes load balance semantics a bit on who can move
...
@@ -8597,7 +8656,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 		 */
 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
 
-			/* Prevent to re-select dst_cpu via env's cpus */
+			/* Prevent to re-select dst_cpu via env's CPUs */
 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
 
 			env.dst_rq = cpu_rq(env.new_dst_cpu);
...
@@ -8659,9 +8718,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 			raw_spin_lock_irqsave(&busiest->lock, flags);
 
-			/* don't kick the active_load_balance_cpu_stop,
-			 * if the curr task on busiest cpu can't be
-			 * moved to this_cpu
+			/*
+			 * Don't kick the active_load_balance_cpu_stop,
+			 * if the curr task on busiest CPU can't be
+			 * moved to this_cpu:
 			 */
 			if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
 				raw_spin_unlock_irqrestore(&busiest->lock,
...
@@ -8887,7 +8947,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
 }
 
 /*
- * active_load_balance_cpu_stop is run by cpu stopper. It pushes
+ * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
...
@@ -8911,7 +8971,7 @@ static int active_load_balance_cpu_stop(void *data)
 	if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
 		goto out_unlock;
 
-	/* make sure the requested cpu hasn't gone down in the meantime */
+	/* Make sure the requested CPU hasn't gone down in the meantime: */
 	if (unlikely(busiest_cpu != smp_processor_id() ||
 		     !busiest_rq->active_balance))
 		goto out_unlock;
...
@@ -8923,7 +8983,7 @@ static int active_load_balance_cpu_stop(void *data)
 	/*
 	 * This condition is "impossible", if it occurs
 	 * we need to fix it. Originally reported by
-	 * Bjorn Helgaas on a 128-cpu setup.
+	 * Bjorn Helgaas on a 128-CPU setup.
 	 */
 	BUG_ON(busiest_rq == target_rq);
...
@@ -9025,7 +9085,7 @@ static void nohz_balancer_kick(void)
 		return;
 	/*
 	 * Use smp_send_reschedule() instead of resched_cpu().
-	 * This way we generate a sched IPI on the target cpu which
+	 * This way we generate a sched IPI on the target CPU which
 	 * is idle. And the softirq performing nohz idle load balance
 	 * will be run before returning from the IPI.
 	 */
...
@@ -9082,14 +9142,12 @@ void set_cpu_sd_state_idle(void)
 }
 
 /*
- * This routine will record that the cpu is going idle with tick stopped.
+ * This routine will record that the CPU is going idle with tick stopped.
 * This info will be used in performing idle load balancing in the future.
 */
void nohz_balance_enter_idle(int cpu)
{
-	/*
-	 * If this cpu is going down, then nothing needs to be done.
-	 */
+	/* If this CPU is going down, then nothing needs to be done: */
	if (!cpu_active(cpu))
		return;
...
@@ -9100,9 +9158,7 @@ void nohz_balance_enter_idle(int cpu)
	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
		return;

-	/*
-	 * If we're a completely isolated CPU, we don't play.
-	 */
+	/* If we're a completely isolated CPU, we don't play: */
	if (on_null_domain(cpu_rq(cpu)))
		return;
...
@@ -9211,7 +9267,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
	/*
	 * next_balance will be updated only when there is a need.
-	 * When the cpu is attached to null domain for ex, it will not be
+	 * When the CPU is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance)) {
...
@@ -9235,7 +9291,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
- * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ * rebalancing for all the CPUs for whom scheduler ticks are stopped.
 */
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
...
@@ -9255,8 +9311,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
			continue;

		/*
-		 * If this cpu gets work to do, stop the load balancing
-		 * work being done for other cpus. Next load
+		 * If this CPU gets work to do, stop the load balancing
+		 * work being done for other CPUs. Next load
		 * balancing owner will pick it up.
		 */
		if (need_resched())
...
@@ -9298,13 +9354,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
/*
 * Current heuristic for kicking the idle load balancer in the presence
- * of an idle cpu in the system.
+ * of an idle CPU in the system.
 *   - This rq has more than one task.
 *   - This rq has at least one CFS task and the capacity of the CPU is
 *     significantly reduced because of RT tasks or IRQs.
- *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
- *     multiple busy cpu.
- *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ *   - At parent of LLC scheduler domain level, this CPU's scheduler group has
+ *     multiple busy CPUs.
+ *   - For SD_ASYM_PACKING, if the lower numbered CPU's in the scheduler
 *     domain span are idle.
 */
static inline bool nohz_kick_needed(struct rq *rq)
...
@@ -9394,10 +9450,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
						CPU_IDLE : CPU_NOT_IDLE;

	/*
-	 * If this cpu has a pending nohz_balance_kick, then do the
-	 * balancing on behalf of the other idle cpus whose ticks are
+	 * If this CPU has a pending nohz_balance_kick, then do the
+	 * balancing on behalf of the other idle CPUs whose ticks are
	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
-	 * give the idle cpus a chance to load balance. Else we may
+	 * give the idle CPUs a chance to load balance. Else we may
	 * load balance only within the local sched_domain hierarchy
	 * and abort nohz_idle_balance altogether if we pull some load.
	 */
...
@@ -9440,7 +9496,12 @@ static void rq_offline_fair(struct rq *rq)
#endif /* CONFIG_SMP */

/*
- * scheduler tick hitting a task of our scheduling class:
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
...
kernel/sched/idle.c
/*
- * Generic entry point for the idle threads
+ * Generic entry points for the idle threads and
+ * implementation of the idle task scheduling class.
+ *
+ * (NOTE: these are not related to SCHED_IDLE batch scheduled
+ *        tasks which are handled in sched/fair.c )
 */
-#include <linux/sched.h>
-#include <linux/sched/idle.h>
-#include <linux/cpu.h>
-#include <linux/cpuidle.h>
-#include <linux/cpuhotplug.h>
-#include <linux/tick.h>
-#include <linux/mm.h>
-#include <linux/stackprotector.h>
-#include <linux/suspend.h>
-#include <linux/livepatch.h>
-#include <asm/tlb.h>
+#include "sched.h"

 #include <trace/events/power.h>

-#include "sched.h"

 /* Linker adds these: start and end of __cpuidle functions */
 extern char __cpuidle_text_start[], __cpuidle_text_end[];
...
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
static int __init cpu_idle_poll_setup(char *__unused)
{
	cpu_idle_force_poll = 1;
+
	return 1;
}
__setup("nohlt", cpu_idle_poll_setup);
...
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
static int __init cpu_idle_nopoll_setup(char *__unused)
{
	cpu_idle_force_poll = 0;
+
	return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
...
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
	trace_cpu_idle_rcuidle(0, smp_processor_id());
	local_irq_enable();
	stop_critical_timings();

	while (!tif_need_resched() &&
	       (cpu_idle_force_poll || tick_check_broadcast_expired()))
		cpu_relax();

	start_critical_timings();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	rcu_idle_exit();

	return 1;
}
...
@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state)
{
	/*
	 * This #ifdef needs to die, but it's too late in the cycle to
-	 * make this generic (arm and sh have never invoked the canary
-	 * init for the non boot cpus!). Will be fixed in 3.11
+	 * make this generic (ARM and SH have never invoked the canary
+	 * init for the non boot CPUs!). Will be fixed in 3.11
	 */
#ifdef CONFIG_X86
	/*
...
@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state)
	while (1)
		do_idle();
}

/*
 * idle-task scheduling class.
 */

#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	return task_cpu(p); /* IDLE tasks as never migrated */
}
#endif

/*
 * Idle tasks are unconditionally rescheduled:
 */
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
	resched_curr(rq);
}

static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	put_prev_task(rq, prev);
	update_idle_core(rq);
	schedstat_inc(rq->sched_goidle);

	return rq->idle;
}

/*
 * It is not legal to sleep in the idle task - print a warning
 * message if some code attempts to do it:
 */
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
	raw_spin_unlock_irq(&rq->lock);
	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
	dump_stack();
	raw_spin_lock_irq(&rq->lock);
}

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}

/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 */
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}

static void set_curr_task_idle(struct rq *rq)
{
}

static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
	BUG();
}

static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
	BUG();
}

static unsigned int
get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
	return 0;
}

static void update_curr_idle(struct rq *rq)
{
}

/*
 * Simple, special scheduling class for the per-CPU idle tasks:
 */
const struct sched_class idle_sched_class = {
	/* .next is NULL */
	/* no enqueue/yield_task for idle tasks */

	/* dequeue is not valid, we print a debug message there: */
	.dequeue_task		= dequeue_task_idle,

	.check_preempt_curr	= check_preempt_curr_idle,

	.pick_next_task		= pick_next_task_idle,
	.put_prev_task		= put_prev_task_idle,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_idle,
	.set_cpus_allowed	= set_cpus_allowed_common,
#endif

	.set_curr_task		= set_curr_task_idle,
	.task_tick		= task_tick_idle,
	.get_rr_interval	= get_rr_interval_idle,

	.prio_changed		= prio_changed_idle,
	.switched_to		= switched_to_idle,
	.update_curr		= update_curr_idle,
};
kernel/sched/idle_task.c  (deleted, mode 100644 → 0)
// SPDX-License-Identifier: GPL-2.0
#include "sched.h"

/*
 * idle-task scheduling class.
 *
 * (NOTE: these are not related to SCHED_IDLE tasks which are
 *  handled in sched/fair.c)
 */

#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
	return task_cpu(p); /* IDLE tasks as never migrated */
}
#endif /* CONFIG_SMP */

/*
 * Idle tasks are unconditionally rescheduled:
 */
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
	resched_curr(rq);
}

static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	put_prev_task(rq, prev);
	update_idle_core(rq);
	schedstat_inc(rq->sched_goidle);
	return rq->idle;
}

/*
 * It is not legal to sleep in the idle task - print a warning
 * message if some code attempts to do it:
 */
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
	raw_spin_unlock_irq(&rq->lock);
	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
	dump_stack();
	raw_spin_lock_irq(&rq->lock);
}

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
	rq_last_tick_reset(rq);
}

static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}

static void set_curr_task_idle(struct rq *rq)
{
}

static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
	BUG();
}

static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
	BUG();
}

static unsigned int
get_rr_interval_idle(struct rq *rq, struct task_struct *task)
{
	return 0;
}

static void update_curr_idle(struct rq *rq)
{
}

/*
 * Simple, special scheduling class for the per-CPU idle tasks:
 */
const struct sched_class idle_sched_class = {
	/* .next is NULL */
	/* no enqueue/yield_task for idle tasks */

	/* dequeue is not valid, we print a debug message there: */
	.dequeue_task		= dequeue_task_idle,

	.check_preempt_curr	= check_preempt_curr_idle,

	.pick_next_task		= pick_next_task_idle,
	.put_prev_task		= put_prev_task_idle,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_idle,
	.set_cpus_allowed	= set_cpus_allowed_common,
#endif

	.set_curr_task		= set_curr_task_idle,
	.task_tick		= task_tick_idle,
	.get_rr_interval	= get_rr_interval_idle,

	.prio_changed		= prio_changed_idle,
	.switched_to		= switched_to_idle,
	.update_curr		= update_curr_idle,
};
kernel/sched/isolation.c
...
@@ -3,15 +3,10 @@
 *  any CPU: unbound workqueues, timers, kthreads and any offloadable work.
 *
 *  Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
+ *  Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
 *
 */
-#include <linux/sched/isolation.h>
-#include <linux/tick.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/static_key.h>
-#include <linux/ctype.h>
+#include "sched.h"

 DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
 EXPORT_SYMBOL_GPL(housekeeping_overriden);
...
@@ -60,6 +55,9 @@ void __init housekeeping_init(void)

	static_branch_enable(&housekeeping_overriden);

+	if (housekeeping_flags & HK_FLAG_TICK)
+		sched_tick_offload_init();
+
	/* We need at least one CPU to handle housekeeping work */
	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
...
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
	unsigned int flags;

-	flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
+	flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;

	return housekeeping_setup(str, flags);
}
...
kernel/sched/loadavg.c
...
@@ -6,10 +6,6 @@
 * figure. Its a silly number but people think its important. We go through
 * great pains to make it work on big machines and tickless kernels.
 */
-#include <linux/export.h>
-#include <linux/sched/loadavg.h>
-
 #include "sched.h"

 /*
...
@@ -32,29 +28,29 @@
 * Due to a number of reasons the above turns in the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with
-*    serious number of cpus, therefore we need to take a distributed approach
+*    serious number of CPUs, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true per definition, we
-*    can simply take per-cpu deltas and fold those into a global accumulate
+*    can simply take per-CPU deltas and fold those into a global accumulate
 *    to obtain the same result. See calc_load_fold_active().
 *
-*    Furthermore, in order to avoid synchronizing all per-cpu delta folding
+*    Furthermore, in order to avoid synchronizing all per-CPU delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
-*    cpu to have completed this task.
+*    CPU to have completed this task.
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't loose the delta, just wrecks the sample.
 *
-*  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
-*    this would add another cross-cpu cacheline miss and atomic operation
-*    to the wakeup path. Instead we increment on whatever cpu the task ran
-*    when it went into uninterruptible state and decrement on whatever cpu
+*  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
+*    this would add another cross-CPU cacheline miss and atomic operation
+*    to the wakeup path. Instead we increment on whatever CPU the task ran
+*    when it went into uninterruptible state and decrement on whatever CPU
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
-*    all cpus yields the correct result.
+*    all CPUs yields the correct result.
 *
 *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
 */
...
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
-* load-average relies on per-cpu sampling from the tick, it is affected by
+* load-average relies on per-CPU sampling from the tick, it is affected by
 * NO_HZ.
 *
 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
-* entering NO_HZ state such that we can include this as an 'extra' cpu delta
+* entering NO_HZ state such that we can include this as an 'extra' CPU delta
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
...
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
-*    sample, for this cpu (effectively using the NO_HZ-delta for this cpu which
+*    sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
 *    was in effect at the time the window opened). This also solves the issue
-*    of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ
+*    of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
 *    intervals.
 *
 *    When making the ILB scale, we should try to pull this in as well.
...
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp,
 }

 /*
- * NO_HZ can leave us missing all per-cpu ticks calling
+ * NO_HZ can leave us missing all per-CPU ticks calling
 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
...
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks)
		return;

	/*
-	 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus.
+	 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
	 */
	delta = calc_load_nohz_fold();
	if (delta)
...
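The hunk headers above quote the signature calc_load(load, exp, active). As a rough sketch of the kind of fixed-point exponential average those comments keep referring to (the shift constant and the rounding detail below are assumptions for illustration, not taken from this diff):

#include <linux/types.h>

#define LOADAVG_FSHIFT	11			/* assumed fixed-point shift */
#define LOADAVG_FIXED_1	(1 << LOADAVG_FSHIFT)	/* 1.0 in fixed point */

/*
 * One step of the decaying average: new = old*exp + active*(1 - exp),
 * with exp expressed in LOADAVG_FIXED_1 units (e.g. roughly
 * 0.92 * LOADAVG_FIXED_1 for a 1-minute average sampled every 5 s).
 */
static unsigned long calc_load_sketch(unsigned long load,
				      unsigned long exp,
				      unsigned long active)
{
	unsigned long newload;

	newload = load * exp + active * (LOADAVG_FIXED_1 - exp);
	if (active >= load)
		newload += LOADAVG_FIXED_1 - 1;	/* round up when load is rising */

	return newload / LOADAVG_FIXED_1;
}

The per-CPU folding described above only changes where `active` comes from; the averaging step itself stays this simple.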
kernel/sched/membarrier.c
...
@@ -13,32 +13,25 @@
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-#include <linux/cpumask.h>
-#include <linux/atomic.h>
-
-#include "sched.h"	/* for cpu_rq(). */
+#include "sched.h"

 /*
 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
 * except MEMBARRIER_CMD_QUERY.
 */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK			\
	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE			\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
#else
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK	0
#endif

#define MEMBARRIER_CMD_BITMASK						\
	(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED	\
	| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED			\
	| MEMBARRIER_CMD_PRIVATE_EXPEDITED				\
	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED			\
	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)

static void ipi_mb(void *info)
...
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void)
		 */
		if (cpu == raw_smp_processor_id())
			continue;
+
		rcu_read_lock();
		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
...
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags)
	 * rq->curr modification in scheduler.
	 */
	smp_mb();	/* exit from system call is not a mb */
+
	return 0;
}
...
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void)
	}
	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, &mm->membarrier_state);
+
	return 0;
}
...
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags)
		synchronize_sched();
	}
	atomic_or(state, &mm->membarrier_state);
+
	return 0;
}
...
kernel/sched/rt.c
...
@@ -3,12 +3,8 @@
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */
 #include "sched.h"

-#include <linux/slab.h>
-#include <linux/irq_work.h>
-
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
...
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 static void push_rt_tasks(struct rq *);
 static void pull_rt_task(struct rq *);

-static inline void queue_push_tasks(struct rq *rq)
+static inline void rt_queue_push_tasks(struct rq *rq)
 {
	if (!has_pushable_tasks(rq))
		return;
...
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 }

-static inline void queue_pull_task(struct rq *rq)
+static inline void rt_queue_pull_task(struct rq *rq)
 {
	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
...
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
 {
 }

-static inline void queue_push_tasks(struct rq *rq)
+static inline void rt_queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
...
@@ -1453,9 +1449,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
		return;

	/*
-	 * There appears to be other cpus that can accept
-	 * current and none to run 'p', so lets reschedule
-	 * to try and push current away:
+	 * There appear to be other CPUs that can accept
+	 * the current task but none can run 'p', so lets reschedule
+	 * to try and push the current task away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_curr(rq);
...
@@ -1569,7 +1565,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);

-	queue_push_tasks(rq);
+	rt_queue_push_tasks(rq);

	return p;
 }
...
@@ -1596,12 +1592,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
	if (!task_running(rq, p) &&
	    cpumask_test_cpu(cpu, &p->cpus_allowed))
		return 1;
+
	return 0;
 }

 /*
 * Return the highest pushable rq's task, which is suitable to be executed
- * on the cpu, NULL otherwise
+ * on the CPU, NULL otherwise
 */
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
...
@@ -1639,11 +1636,11 @@ static int find_lowest_rq(struct task_struct *task)
		return -1; /* No targets found */

	/*
-	 * At this point we have built a mask of cpus representing the
+	 * At this point we have built a mask of CPUs representing the
	 * lowest priority tasks in the system.  Now we want to elect
	 * the best one based on our affinity and topology.
	 *
-	 * We prioritize the last cpu that the task executed on since
+	 * We prioritize the last CPU that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
...
@@ -1651,7 +1648,7 @@ static int find_lowest_rq(struct task_struct *task)

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
-	 * out which cpu is logically closest to our hot cache data.
+	 * out which CPU is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
...
@@ -1692,6 +1689,7 @@ static int find_lowest_rq(struct task_struct *task)
	cpu = cpumask_any(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;
+
	return -1;
 }
...
@@ -1827,7 +1825,7 @@ static int push_rt_task(struct rq *rq)
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to.  Do not retry in this case, since
-			 * other cpus will pull from us when ready.
+			 * other CPUs will pull from us when ready.
			 */
			goto out;
		}
...
@@ -1919,7 +1917,7 @@ static int rto_next_cpu(struct root_domain *rd)
	 * rt_next_cpu() will simply return the first CPU found in
	 * the rto_mask.
	 *
-	 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
+	 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
	 * will return the next CPU found in the rto_mask.
	 *
	 * If there are no more CPUs left in the rto_mask, then a check is made
...
@@ -1980,7 +1978,7 @@ static void tell_cpu_to_push(struct rq *rq)
	raw_spin_lock(&rq->rd->rto_lock);

	/*
-	 * The rto_cpu is updated under the lock, if it has a valid cpu
+	 * The rto_cpu is updated under the lock, if it has a valid CPU
	 * then the IPI is still running and will continue due to the
	 * update to loop_next, and nothing needs to be done here.
	 * Otherwise it is finishing up and an ipi needs to be sent.
...
@@ -2105,7 +2103,7 @@ static void pull_rt_task(struct rq *this_rq)
		/*
		 * There's a chance that p is higher in priority
-		 * than what's currently running on its cpu.
+		 * than what's currently running on its CPU.
		 * This is just that p is wakeing up and hasn't
		 * had a chance to schedule. We only pull
		 * p if it is lower in priority than the
...
@@ -2187,7 +2185,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
		return;

-	queue_pull_task(rq);
+	rt_queue_pull_task(rq);
 }

 void __init init_sched_rt_class(void)
...
@@ -2218,7 +2216,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
	if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
-			queue_push_tasks(rq);
+			rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
		if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
			resched_curr(rq);
...
@@ -2242,7 +2240,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
-			queue_pull_task(rq);
+			rt_queue_pull_task(rq);

		/*
		 * If there's a higher priority task waiting to run
...
@@ -2292,6 +2290,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif

+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;
...
@@ -2685,6 +2691,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
			msecs_to_jiffies(sysctl_sched_rr_timeslice);
	}
	mutex_unlock(&mutex);
+
	return ret;
}
...
kernel/sched/sched.h
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Scheduler internal types and methods:
 */
#include <linux/sched.h>

#include <linux/sched/autogroup.h>
#include <linux/sched/clock.h>
#include <linux/sched/coredump.h>
#include <linux/sched/cpufreq.h>
#include <linux/sched/cputime.h>
#include <linux/sched/deadline.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/idle.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
#include <linux/sched/nohz.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/prio.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/sched/stat.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/topology.h>
#include <linux/sched/user.h>
#include <linux/sched/wake_q.h>
#include <linux/sched/xacct.h>

#include <uapi/linux/sched/types.h>

#include <linux/u64_stats_sync.h>
#include <linux/kernel_stat.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/compat.h>
#include <linux/context_tracking.h>
#include <linux/cpufreq.h>
#include <linux/cpuidle.h>
#include <linux/cpuset.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/delayacct.h>
#include <linux/init_task.h>
#include <linux/kprobes.h>
#include <linux/kthread.h>
#include <linux/membarrier.h>
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/nmi.h>
#include <linux/proc_fs.h>
#include <linux/prefetch.h>
#include <linux/profile.h>
#include <linux/rcupdate_wait.h>
#include <linux/security.h>
#include <linux/stackprotector.h>
#include <linux/stop_machine.h>
#include <linux/irq_work.h>
#include <linux/tick.h>
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/suspend.h>
#include <linux/swait.h>
#include <linux/syscalls.h>
#include <linux/task_work.h>
#include <linux/tsacct_kern.h>

#include <asm/tlb.h>

#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
#endif

#include "cpupri.h"
...
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
 * and does not change the user-interface for setting shares/weights.
 *
 * We increase resolution only if we have enough bits to allow this increased
-* resolution (i.e. 64bit). The costs for increasing resolution when 32bit are
-* pretty high and the returns do not justify the increased costs.
+* resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
+* are pretty high and the returns do not justify the increased costs.
 *
-* Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to
-* increase coverage and consistency always enable it on 64bit platforms.
+* Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
+* increase coverage and consistency always enable it on 64-bit platforms.
 */
#ifdef CONFIG_64BIT
# define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
...
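A hedged sketch of what the extra shift buys: on 64-bit, user-visible nice weights can be scaled up by SCHED_FIXEDPOINT_SHIFT internally and scaled back down at the edges. The helper names below are illustrative only and are not taken from this diff:

/* Illustration only: map user-visible weight to high-resolution load and back. */
#ifdef CONFIG_64BIT
# define scale_load_sketch(w)		((w) << SCHED_FIXEDPOINT_SHIFT)
# define scale_load_down_sketch(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)
#else
# define scale_load_sketch(w)		(w)
# define scale_load_down_sketch(w)	(w)
#endif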
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
 *  10 -> just above 1us
 *   9 -> just above 0.5us
 */
-#define DL_SCALE (10)
+#define DL_SCALE		10

 /*
- * These are the 'tuning knobs' of the scheduler:
- */
-
-/*
- * single value that denotes runtime == period, ie unlimited time.
+ * Single value that denotes runtime == period, ie unlimited time.
 */
-#define RUNTIME_INF ((u64)~0ULL)
+#define RUNTIME_INF		((u64)~0ULL)

static inline int idle_policy(int policy)
{
...
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p);
 * control.
 */
struct dl_bandwidth {
	raw_spinlock_t		dl_runtime_lock;
	u64			dl_runtime;
	u64			dl_period;
};

static inline int dl_bandwidth_enabled(void)
...
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void)
 }

 struct dl_bw {
-	raw_spinlock_t	lock;
-	u64		bw, total_bw;
+	raw_spinlock_t		lock;
+	u64			bw;
+	u64			total_bw;
 };

 static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
...
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
 }

-void dl_change_utilization(struct task_struct *p, u64 new_bw);
+extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
 extern void init_dl_bw(struct dl_bw *dl_b);
 extern int  sched_dl_global_validate(void);
 extern void sched_dl_do_global(void);
 extern int  sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
 extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
 extern bool __checkparam_dl(const struct sched_attr *attr);
 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
 extern int  dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
 extern int  dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
 extern bool dl_cpu_busy(unsigned int cpu);

#ifdef CONFIG_CGROUP_SCHED
...
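The overflow test quoted in the __dl_overflow() hunk above can be read as the usual deadline-bandwidth admission check; stated as an equation using the same quantities as the code (nothing new introduced, only rearranged):

% Overflow is reported when the updated total no longer fits on the 'cpus' CPUs,
% so a change from old_bw to new_bw is admissible iff
\mathrm{total\_bw} - \mathrm{old\_bw} + \mathrm{new\_bw} \;\le\; \mathrm{cpus} \cdot \mathrm{bw}

Here bw is the per-CPU bandwidth cap stored in the dl_bw structure shown above and total_bw is the bandwidth already admitted.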
@@ -300,32 +328,36 @@ extern struct list_head task_groups;

struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
	raw_spinlock_t		lock;
	ktime_t			period;
	u64			quota;
	u64			runtime;
	s64			hierarchical_quota;
	u64			runtime_expires;

	int			idle;
	int			period_active;
	struct hrtimer		period_timer;
	struct hrtimer		slack_timer;
	struct list_head	throttled_cfs_rq;

	/* Statistics: */
	int			nr_periods;
	int			nr_throttled;
	u64			throttled_time;
#endif
};

/* Task group related information */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each CPU */
	struct sched_entity	**se;
	/* runqueue "owned" by this group on each CPU */
	struct cfs_rq		**cfs_rq;
	unsigned long		shares;

#ifdef	CONFIG_SMP
	/*
...
@@ -333,29 +365,29 @@ struct task_group {
	 * it in its own cacheline separated from the fields above which
	 * will also be accessed at each tick.
	 */
	atomic_long_t		load_avg ____cacheline_aligned;
#endif
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity	**rt_se;
	struct rt_rq		**rt_rq;

	struct rt_bandwidth	rt_bandwidth;
#endif

	struct rcu_head		rcu;
	struct list_head	list;

	struct task_group	*parent;
	struct list_head	siblings;
	struct list_head	children;

#ifdef CONFIG_SCHED_AUTOGROUP
	struct autogroup	*autogroup;
#endif

	struct cfs_bandwidth	cfs_bandwidth;
};

#ifdef CONFIG_FAIR_GROUP_SCHED
...
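For the quota/period/runtime trio in cfs_bandwidth above, the intended reading (summarised here, not text carried in the diff itself) is that a group may consume at most quota worth of CPU time per period, with runtime holding the budget left in the current period:

\text{allowed CPU share} \;=\; \frac{\text{quota}}{\text{period}},
\qquad 0 \le \text{runtime} \le \text{quota}

period_timer refills the budget each period and slack_timer returns unused budget, which is why both hrtimers sit next to these fields.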
@@ -369,8 +401,8 @@ struct task_group {
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
#define MIN_SHARES (1UL << 1)
#define MAX_SHARES (1UL << 18)
#define MIN_SHARES
(1UL << 1)
#define MAX_SHARES
(1UL << 18)
#endif
typedef
int
(
*
tg_visitor
)(
struct
task_group
*
,
void
*
);
...
...
@@ -443,35 +475,39 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct
cfs_rq
{
struct
load_weight
load
;
unsigned
long
runnable_weight
;
unsigned
int
nr_running
,
h_nr_running
;
struct
load_weight
load
;
unsigned
long
runnable_weight
;
unsigned
int
nr_running
;
unsigned
int
h_nr_running
;
u64
exec_clock
;
u64
min_vruntime
;
u64
exec_clock
;
u64
min_vruntime
;
#ifndef CONFIG_64BIT
u64
min_vruntime_copy
;
u64
min_vruntime_copy
;
#endif
struct
rb_root_cached
tasks_timeline
;
struct
rb_root_cached
tasks_timeline
;
/*
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct
sched_entity
*
curr
,
*
next
,
*
last
,
*
skip
;
struct
sched_entity
*
curr
;
struct
sched_entity
*
next
;
struct
sched_entity
*
last
;
struct
sched_entity
*
skip
;
#ifdef CONFIG_SCHED_DEBUG
unsigned
int
nr_spread_over
;
unsigned
int
nr_spread_over
;
#endif
#ifdef CONFIG_SMP
/*
* CFS load tracking
*/
struct
sched_avg
avg
;
struct
sched_avg
avg
;
#ifndef CONFIG_64BIT
u64
load_last_update_time_copy
;
u64
load_last_update_time_copy
;
#endif
struct
{
raw_spinlock_t
lock
____cacheline_aligned
;
...
...
@@ -482,9 +518,9 @@ struct cfs_rq {
}
removed
;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned
long
tg_load_avg_contrib
;
long
propagate
;
long
prop_runnable_sum
;
unsigned
long
tg_load_avg_contrib
;
long
propagate
;
long
prop_runnable_sum
;
/*
* h_load = weight * f(tg)
...
...
@@ -492,36 +528,38 @@ struct cfs_rq {
* Where f(tg) is the recursive weight fraction assigned to
* this group.
*/
unsigned
long
h_load
;
u64
last_h_load_update
;
struct
sched_entity
*
h_load_next
;
unsigned
long
h_load
;
u64
last_h_load_update
;
struct
sched_entity
*
h_load_next
;
#endif
/* CONFIG_FAIR_GROUP_SCHED */
#endif
/* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
struct
rq
*
rq
;
/* cpu
runqueue to which this cfs_rq is attached */
struct
rq
*
rq
;
/* CPU
runqueue to which this cfs_rq is attached */
/*
* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* a hierarchy). Non-leaf lrqs hold other higher schedulable entities
* (like users, containers etc.)
*
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a
cpu. This
* list is used during load balance.
* leaf_cfs_rq_list ties together list of leaf cfs_rq's in a
CPU.
*
This
list is used during load balance.
*/
int
on_list
;
struct
list_head
leaf_cfs_rq_list
;
struct
task_group
*
tg
;
/* group that "owns" this runqueue */
int
on_list
;
struct
list_head
leaf_cfs_rq_list
;
struct
task_group
*
tg
;
/* group that "owns" this runqueue */
#ifdef CONFIG_CFS_BANDWIDTH
int
runtime_enabled
;
u64
runtime_expires
;
s64
runtime_remaining
;
u64
throttled_clock
,
throttled_clock_task
;
u64
throttled_clock_task_time
;
int
throttled
,
throttle_count
;
struct
list_head
throttled_list
;
int
runtime_enabled
;
u64
runtime_expires
;
s64
runtime_remaining
;
u64
throttled_clock
;
u64
throttled_clock_task
;
u64
throttled_clock_task_time
;
int
throttled
;
int
throttle_count
;
struct
list_head
throttled_list
;
#endif
/* CONFIG_CFS_BANDWIDTH */
#endif
/* CONFIG_FAIR_GROUP_SCHED */
};
...
...
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void)
/* Real-Time classes' related field in a runqueue: */
struct
rt_rq
{
struct
rt_prio_array
active
;
unsigned
int
rt_nr_running
;
unsigned
int
rr_nr_running
;
struct
rt_prio_array
active
;
unsigned
int
rt_nr_running
;
unsigned
int
rr_nr_running
;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct
{
int
curr
;
/* highest queued rt task prio */
int
curr
;
/* highest queued rt task prio */
#ifdef CONFIG_SMP
int
next
;
/* next highest */
int
next
;
/* next highest */
#endif
}
highest_prio
;
#endif
#ifdef CONFIG_SMP
unsigned
long
rt_nr_migratory
;
unsigned
long
rt_nr_total
;
int
overloaded
;
struct
plist_head
pushable_tasks
;
unsigned
long
rt_nr_migratory
;
unsigned
long
rt_nr_total
;
int
overloaded
;
struct
plist_head
pushable_tasks
;
#endif
/* CONFIG_SMP */
int
rt_queued
;
int
rt_queued
;
int
rt_throttled
;
u64
rt_time
;
u64
rt_runtime
;
int
rt_throttled
;
u64
rt_time
;
u64
rt_runtime
;
/* Nests inside the rq lock: */
raw_spinlock_t
rt_runtime_lock
;
raw_spinlock_t
rt_runtime_lock
;
#ifdef CONFIG_RT_GROUP_SCHED
unsigned
long
rt_nr_boosted
;
unsigned
long
rt_nr_boosted
;
struct
rq
*
rq
;
struct
task_group
*
tg
;
struct
rq
*
rq
;
struct
task_group
*
tg
;
#endif
};
/* Deadline class' related fields in a runqueue */
struct
dl_rq
{
/* runqueue is an rbtree, ordered by deadline */
struct
rb_root_cached
root
;
struct
rb_root_cached
root
;
unsigned
long
dl_nr_running
;
unsigned
long
dl_nr_running
;
#ifdef CONFIG_SMP
/*
...
...
@@ -586,28 +624,28 @@ struct dl_rq {
* should migrate somewhere else.
*/
struct
{
u64
curr
;
u64
next
;
u64
curr
;
u64
next
;
}
earliest_dl
;
unsigned
long
dl_nr_migratory
;
int
overloaded
;
unsigned
long
dl_nr_migratory
;
int
overloaded
;
/*
* Tasks on this rq that can be pushed away. They are kept in
* an rb-tree, ordered by tasks' deadlines, with caching
* of the leftmost (earliest deadline) element.
*/
struct
rb_root_cached
pushable_dl_tasks_root
;
struct
rb_root_cached
pushable_dl_tasks_root
;
#else
struct
dl_bw
dl_bw
;
struct
dl_bw
dl_bw
;
#endif
/*
* "Active utilization" for this runqueue: increased when a
* task wakes up (becomes TASK_RUNNING) and decreased when a
* task blocks
*/
u64
running_bw
;
u64
running_bw
;
/*
* Utilization of the tasks "assigned" to this runqueue (including
...
...
@@ -618,14 +656,14 @@ struct dl_rq {
* This is needed to compute the "inactive utilization" for the
* runqueue (inactive utilization = this_bw - running_bw).
*/
u64
this_bw
;
u64
extra_bw
;
u64
this_bw
;
u64
extra_bw
;
/*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
*/
u64
bw_ratio
;
u64
bw_ratio
;
};
#ifdef CONFIG_SMP
...
...
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b)
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
* fully partitioning the member
cpu
s from any other cpuset. Whenever a new
* fully partitioning the member
CPU
s from any other cpuset. Whenever a new
* exclusive cpuset is created, we also create and attach a new root-domain
* object.
*
*/
struct
root_domain
{
atomic_t
refcount
;
atomic_t
rto_count
;
struct
rcu_head
rcu
;
cpumask_var_t
span
;
cpumask_var_t
online
;
atomic_t
refcount
;
atomic_t
rto_count
;
struct
rcu_head
rcu
;
cpumask_var_t
span
;
cpumask_var_t
online
;
/* Indicate more than one runnable task for any CPU */
bool
overload
;
bool
overload
;
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
*/
cpumask_var_t
dlo_mask
;
atomic_t
dlo_count
;
struct
dl_bw
dl_bw
;
struct
cpudl
cpudl
;
cpumask_var_t
dlo_mask
;
atomic_t
dlo_count
;
struct
dl_bw
dl_bw
;
struct
cpudl
cpudl
;
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
*/
struct
irq_work
rto_push_work
;
raw_spinlock_t
rto_lock
;
struct
irq_work
rto_push_work
;
raw_spinlock_t
rto_lock
;
/* These are only updated and read within rto_lock */
int
rto_loop
;
int
rto_cpu
;
int
rto_loop
;
int
rto_cpu
;
/* These atomics are updated outside of a lock */
atomic_t
rto_loop_next
;
atomic_t
rto_loop_start
;
atomic_t
rto_loop_next
;
atomic_t
rto_loop_start
;
#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
*/
cpumask_var_t
rto_mask
;
struct
cpupri
cpupri
;
cpumask_var_t
rto_mask
;
struct
cpupri
cpupri
;
unsigned
long
max_cpu_capacity
;
unsigned
long
max_cpu_capacity
;
};
extern
struct
root_domain
def_root_domain
;
...
...
@@ -708,41 +746,39 @@ extern void rto_push_irq_work_func(struct irq_work *work);
*/
struct
rq
{
/* runqueue lock: */
raw_spinlock_t
lock
;
raw_spinlock_t
lock
;
/*
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
unsigned
int
nr_running
;
unsigned
int
nr_running
;
#ifdef CONFIG_NUMA_BALANCING
unsigned
int
nr_numa_running
;
unsigned
int
nr_preferred_running
;
unsigned
int
nr_numa_running
;
unsigned
int
nr_preferred_running
;
#endif
#define CPU_LOAD_IDX_MAX 5
unsigned
long
cpu_load
[
CPU_LOAD_IDX_MAX
];
unsigned
long
cpu_load
[
CPU_LOAD_IDX_MAX
];
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMP
unsigned
long
last_load_update_tick
;
unsigned
long
last_load_update_tick
;
#endif
/* CONFIG_SMP */
unsigned
long
nohz_flags
;
unsigned
long
nohz_flags
;
#endif
/* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
unsigned
long
last_sched_tick
;
#endif
/* capture load from *all* tasks on this cpu: */
struct
load_weight
load
;
unsigned
long
nr_load_updates
;
u64
nr_switches
;
struct
cfs_rq
cfs
;
struct
rt_rq
rt
;
struct
dl_rq
dl
;
/* capture load from *all* tasks on this CPU: */
struct
load_weight
load
;
unsigned
long
nr_load_updates
;
u64
nr_switches
;
struct
cfs_rq
cfs
;
struct
rt_rq
rt
;
struct
dl_rq
dl
;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this
cpu
: */
struct
list_head
leaf_cfs_rq_list
;
struct
list_head
*
tmp_alone_branch
;
/* list of leaf cfs_rq on this
CPU
: */
struct
list_head
leaf_cfs_rq_list
;
struct
list_head
*
tmp_alone_branch
;
#endif
/* CONFIG_FAIR_GROUP_SCHED */
/*
...
...
@@ -751,94 +787,98 @@ struct rq {
* one CPU and if it got migrated afterwards it may decrease
* it on another CPU. Always updated under the runqueue lock:
*/
unsigned
long
nr_uninterruptible
;
unsigned
long
nr_uninterruptible
;
struct
task_struct
*
curr
,
*
idle
,
*
stop
;
unsigned
long
next_balance
;
struct
mm_struct
*
prev_mm
;
struct
task_struct
*
curr
;
struct
task_struct
*
idle
;
struct
task_struct
*
stop
;
unsigned
long
next_balance
;
struct
mm_struct
*
prev_mm
;
unsigned
int
clock_update_flags
;
u64
clock
;
u64
clock_task
;
unsigned
int
clock_update_flags
;
u64
clock
;
u64
clock_task
;
atomic_t
nr_iowait
;
atomic_t
nr_iowait
;
#ifdef CONFIG_SMP
struct
root_domain
*
rd
;
struct
sched_domain
*
sd
;
struct
root_domain
*
rd
;
struct
sched_domain
*
sd
;
unsigned
long
cpu_capacity
;
unsigned
long
cpu_capacity_orig
;
unsigned
long
cpu_capacity
;
unsigned
long
cpu_capacity_orig
;
struct
callback_head
*
balance_callback
;
struct
callback_head
*
balance_callback
;
unsigned
char
idle_balance
;
unsigned
char
idle_balance
;
/* For active balancing */
int
active_balance
;
int
push_cpu
;
struct
cpu_stop_work
active_balance_work
;
/* cpu of this runqueue: */
int
cpu
;
int
online
;
int
active_balance
;
int
push_cpu
;
struct
cpu_stop_work
active_balance_work
;
/* CPU of this runqueue: */
int
cpu
;
int
online
;
struct
list_head
cfs_tasks
;
u64
rt_avg
;
u64
age_stamp
;
u64
idle_stamp
;
u64
avg_idle
;
u64
rt_avg
;
u64
age_stamp
;
u64
idle_stamp
;
u64
avg_idle
;
/* This is used to determine avg_idle's max value */
u64
max_idle_balance_cost
;
u64
max_idle_balance_cost
;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64
prev_irq_time
;
u64
prev_irq_time
;
#endif
#ifdef CONFIG_PARAVIRT
u64
prev_steal_time
;
u64
prev_steal_time
;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
u64
prev_steal_time_rq
;
u64
prev_steal_time_rq
;
#endif
/* calc_load related fields */
unsigned
long
calc_load_update
;
long
calc_load_active
;
unsigned
long
calc_load_update
;
long
calc_load_active
;
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
int
hrtick_csd_pending
;
call_single_data_t
hrtick_csd
;
int
hrtick_csd_pending
;
call_single_data_t
hrtick_csd
;
#endif
struct
hrtimer
hrtick_timer
;
struct
hrtimer
hrtick_timer
;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct
sched_info
rq_sched_info
;
unsigned
long
long
rq_cpu_time
;
struct
sched_info
rq_sched_info
;
unsigned
long
long
rq_cpu_time
;
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
unsigned
int
yld_count
;
unsigned
int
yld_count
;
/* schedule() stats */
unsigned
int
sched_count
;
unsigned
int
sched_goidle
;
unsigned
int
sched_count
;
unsigned
int
sched_goidle
;
/* try_to_wake_up() stats */
unsigned
int
ttwu_count
;
unsigned
int
ttwu_local
;
unsigned
int
ttwu_count
;
unsigned
int
ttwu_local
;
#endif
#ifdef CONFIG_SMP
struct
llist_head
wake_list
;
struct
llist_head
wake_list
;
#endif
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct
cpuidle_state
*
idle_state
;
struct
cpuidle_state
*
idle_state
;
#endif
};
...
...
@@ -904,9 +944,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
* one position though, because the next rq_unpin_lock() will shift it
* back.
*/
#define RQCF_REQ_SKIP 0x01
#define RQCF_ACT_SKIP 0x02
#define RQCF_UPDATED 0x04
#define RQCF_REQ_SKIP
0x01
#define RQCF_ACT_SKIP
0x02
#define RQCF_UPDATED
0x04
static
inline
void
assert_clock_updated
(
struct
rq
*
rq
)
{
...
...
@@ -1059,12 +1099,12 @@ extern void sched_ttwu_pending(void);
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The
cpu
whose highest level of sched domain is to
* @cpu: The
CPU
whose highest level of sched domain is to
* be returned.
* @flag: The flag to check for the highest sched_domain
* for the given
cpu
.
* for the given
CPU
.
*
* Returns the highest sched_domain of a
cpu
which contains the given flag.
* Returns the highest sched_domain of a
CPU
which contains the given flag.
*/
static
inline
struct
sched_domain
*
highest_flag_domain
(
int
cpu
,
int
flag
)
{
...
...
@@ -1099,30 +1139,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU
(
struct
sched_domain
*
,
sd_asym
);
struct
sched_group_capacity
{
atomic_t
ref
;
atomic_t
ref
;
/*
* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
* for a single CPU.
*/
unsigned
long
capacity
;
unsigned
long
min_capacity
;
/* Min per-CPU capacity in group */
unsigned
long
next_update
;
int
imbalance
;
/* XXX unrelated to capacity but shared group state */
unsigned
long
capacity
;
unsigned
long
min_capacity
;
/* Min per-CPU capacity in group */
unsigned
long
next_update
;
int
imbalance
;
/* XXX unrelated to capacity but shared group state */
#ifdef CONFIG_SCHED_DEBUG
int
id
;
int
id
;
#endif
unsigned
long
cpumask
[
0
];
/* b
alance mask */
unsigned
long
cpumask
[
0
];
/* B
alance mask */
};
struct
sched_group
{
struct
sched_group
*
next
;
/* Must be a circular list */
atomic_t
ref
;
struct
sched_group
*
next
;
/* Must be a circular list */
atomic_t
ref
;
unsigned
int
group_weight
;
unsigned
int
group_weight
;
struct
sched_group_capacity
*
sgc
;
int
asym_prefer_cpu
;
/* cpu
of highest priority in group */
int
asym_prefer_cpu
;
/* CPU
of highest priority in group */
/*
* The CPUs this group covers.
...
...
@@ -1131,7 +1171,7 @@ struct sched_group {
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
unsigned
long
cpumask
[
0
];
unsigned
long
cpumask
[
0
];
};
static
inline
struct
cpumask
*
sched_group_span
(
struct
sched_group
*
sg
)
...
...
@@ -1148,8 +1188,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
}
/**
* group_first_cpu - Returns the first
cpu
in the cpumask of a sched_group.
* @group: The group whose first
cpu
is to be returned.
* group_first_cpu - Returns the first
CPU
in the cpumask of a sched_group.
* @group: The group whose first
CPU
is to be returned.
*/
static
inline
unsigned
int
group_first_cpu
(
struct
sched_group
*
group
)
{
...
...
@@ -1349,19 +1389,12 @@ static inline int task_on_rq_migrating(struct task_struct *p)
	return p->on_rq == TASK_ON_RQ_MIGRATING;
}

-#ifndef prepare_arch_switch
-# define prepare_arch_switch(next)	do { } while (0)
-#endif
-#ifndef finish_arch_post_lock_switch
-# define finish_arch_post_lock_switch()	do { } while (0)
-#endif
/*
 * wake flags
 */
-#define WF_SYNC		0x01		/* waker goes to sleep after wakeup */
-#define WF_FORK		0x02		/* child wakeup after fork */
-#define WF_MIGRATED	0x4		/* internal use, task got migrated */
+#define WF_SYNC			0x01		/* Waker goes to sleep after wakeup */
+#define WF_FORK			0x02		/* Child wakeup after fork */
+#define WF_MIGRATED		0x4		/* Internal use, task got migrated */
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
...
...
@@ -1372,11 +1405,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
* slice expiry etc.
*/
-#define WEIGHT_IDLEPRIO		3
-#define WMULT_IDLEPRIO		1431655765
+#define WEIGHT_IDLEPRIO		3
+#define WMULT_IDLEPRIO		1431655765

-extern const int sched_prio_to_weight[40];
-extern const u32 sched_prio_to_wmult[40];
+extern const int		sched_prio_to_weight[40];
+extern const u32		sched_prio_to_wmult[40];
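The comment above describes how nice levels map to load weights. As a hedged illustration (the index arithmetic mirrors what set_load_weight() does elsewhere in the scheduler; p here is assumed to be a task_struct pointer):

	/* Hypothetical lookup: nice 0 (static_prio == 120) maps to index 20. */
	int prio = p->static_prio - MAX_RT_PRIO;		/* 0..39 */
	unsigned long weight = sched_prio_to_weight[prio];	/* nice 0 -> 1024 */
	u32 inv_weight = sched_prio_to_wmult[prio];		/* roughly 2^32 / weight */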
/*
* {de,en}queue flags:
...
...
@@ -1398,9 +1431,9 @@ extern const u32 sched_prio_to_wmult[40];
*/
#define DEQUEUE_SLEEP 0x01
-#define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */
-#define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */
-#define DEQUEUE_NOCLOCK		0x08 /* matches ENQUEUE_NOCLOCK */
+#define DEQUEUE_SAVE		0x02 /* Matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE		0x04 /* Matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK		0x08 /* Matches ENQUEUE_NOCLOCK */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
...
...
@@ -1422,10 +1455,10 @@ struct sched_class {
	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
-	void (*yield_task) (struct rq *rq);
-	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+	void (*yield_task)   (struct rq *rq);
+	bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);

-	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+	void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
* It is the responsibility of the pick_next_task() method that will
...
...
@@ -1435,16 +1468,16 @@ struct sched_class {
* May return RETRY_TASK when it finds a higher prio class has runnable
* tasks.
*/
-	struct task_struct * (*pick_next_task) (struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+	struct task_struct * (*pick_next_task)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
+	void (*put_prev_task)(struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
	void (*migrate_task_rq)(struct task_struct *p);

-	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
+	void (*task_woken)(struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p, const struct cpumask *newmask);
...
...
@@ -1453,31 +1486,31 @@ struct sched_class {
	void (*rq_offline)(struct rq *rq);
#endif

-	void (*set_curr_task) (struct rq *rq);
-	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
-	void (*task_fork) (struct task_struct *p);
-	void (*task_dead) (struct task_struct *p);
+	void (*set_curr_task)(struct rq *rq);
+	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
+	void (*task_fork)(struct task_struct *p);
+	void (*task_dead)(struct task_struct *p);

	/*
	 * The switched_from() call is allowed to drop rq->lock, therefore we
	 * cannot assume the switched_from/switched_to pair is serliazed by
	 * rq->lock. They are however serialized by p->pi_lock.
	 */
-	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
-	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
+	void (*switched_to)  (struct rq *this_rq, struct task_struct *task);
	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
-			     int oldprio);
+			      int oldprio);

-	unsigned int (*get_rr_interval) (struct rq *rq, struct task_struct *task);
+	unsigned int (*get_rr_interval)(struct rq *rq, struct task_struct *task);

-	void (*update_curr) (struct rq *rq);
+	void (*update_curr)(struct rq *rq);

-#define TASK_SET_GROUP  0
-#define TASK_MOVE_GROUP	1
+#define TASK_SET_GROUP		0
+#define TASK_MOVE_GROUP		1

#ifdef CONFIG_FAIR_GROUP_SCHED
-	void (*task_change_group) (struct task_struct *p, int type);
+	void (*task_change_group)(struct task_struct *p, int type);
#endif
};
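For orientation, here is a minimal, hypothetical sketch of how a scheduling class fills in a few of the hooks listed above. All demo_ names are invented for illustration; a real class implements the remaining callbacks and is linked into the class hierarchy, so this is not a registrable implementation:

/* Hypothetical illustration only -- not a complete scheduling class. */
static void enqueue_task_demo(struct rq *rq, struct task_struct *p, int flags)
{
	/* add @p to this class's own runqueue structure */
}

static void dequeue_task_demo(struct rq *rq, struct task_struct *p, int flags)
{
	/* remove @p again */
}

static struct task_struct *
pick_next_task_demo(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	return NULL;	/* nothing runnable in this class */
}

static const struct sched_class demo_sched_class = {
	.enqueue_task	= enqueue_task_demo,
	.dequeue_task	= dequeue_task_demo,
	.pick_next_task	= pick_next_task_demo,
};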
...
...
@@ -1526,6 +1559,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
	SCHED_WARN_ON(!rcu_read_lock_held());

	return rq->idle_state;
}
#else
...
...
@@ -1564,9 +1598,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);

-#define BW_SHIFT	20
-#define BW_UNIT		(1 << BW_SHIFT)
-#define RATIO_SHIFT	8
+#define BW_SHIFT		20
+#define BW_UNIT			(1 << BW_SHIFT)
+#define RATIO_SHIFT		8
unsigned long to_ratio(u64 period, u64 runtime);

extern void init_entity_runnable_average(struct sched_entity *se);
...
...
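to_ratio() converts a deadline reservation into a fixed-point fraction of a CPU on the BW_SHIFT scale above. A hedged sketch of the arithmetic (the real helper also handles RUNTIME_INF and a zero period; the numbers are just an example):

	/* runtime = 10 ms, period = 100 ms  =>  10% of one CPU */
	u64 runtime = 10 * NSEC_PER_MSEC;
	u64 period  = 100 * NSEC_PER_MSEC;

	/* ratio = runtime / period, expressed in BW_UNIT (1 << 20) fixed point */
	unsigned long ratio = div64_u64(runtime << BW_SHIFT, period);
	/* ratio == BW_UNIT / 10 == 104857, i.e. roughly 0.1 * 2^20 */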
@@ -1574,6 +1608,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
+extern int __init sched_tick_offload_init(void);
/*
* Tick may be needed by tasks in the runqueue depending on their policy and
...
...
@@ -1598,6 +1633,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
		tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
#else
+static inline int sched_tick_offload_init(void) { return 0; }
static inline void sched_update_tick_dependency(struct rq *rq) { }
#endif
...
...
@@ -1624,13 +1660,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
	sched_update_tick_dependency(rq);
}

-static inline void rq_last_tick_reset(struct rq *rq)
-{
-#ifdef CONFIG_NO_HZ_FULL
-	rq->last_sched_tick = jiffies;
-#endif
-}
-
extern void update_rq_clock(struct rq *rq);

extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
...
...
@@ -1821,8 +1850,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
/*
* Unfair double_lock_balance: Optimizes throughput at the expense of
* latency by eliminating extra atomic operations when the locks are
- * already in proper order on entry. This favors lower cpu-ids and will
- * grant the double lock to lower cpus over higher ids under contention,
+ * already in proper order on entry. This favors lower CPU-ids and will
+ * grant the double lock to lower CPUs over higher ids under contention,
 * regardless of entry order into the function.
 */
static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
...
...
@@ -1854,7 +1883,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
{
	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
+		/* printk() doesn't work well under rq->lock */
		raw_spin_unlock(&this_rq->lock);
		BUG_ON(1);
	}
}
...
...
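The comment above describes the lock-ordering trick. A hedged sketch of the idea (this mirrors the usual pattern for taking two rq locks in a fixed order; this_rq, busiest and ret are assumed to come from the surrounding elided function, and the exact body may differ):

	/* Always take the lower-ordered lock first to avoid ABBA deadlock. */
	if (busiest < this_rq) {
		raw_spin_unlock(&this_rq->lock);
		raw_spin_lock(&busiest->lock);
		raw_spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
		ret = 1;	/* caller must recheck: this_rq was briefly unlocked */
	} else {
		raw_spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
	}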
@@ -2113,15 +2142,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */

#ifdef arch_scale_freq_capacity
-#ifndef arch_scale_freq_invariant
-#define arch_scale_freq_invariant()	(true)
-#endif
-#else /* arch_scale_freq_capacity */
-#define arch_scale_freq_invariant()	(false)
+# ifndef arch_scale_freq_invariant
+#  define arch_scale_freq_invariant()	true
+# endif
+#else
+# define arch_scale_freq_invariant()	false
#endif

#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
static inline unsigned long cpu_util_dl(struct rq *rq)
{
	return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
...
...
@@ -2131,5 +2159,4 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
{
	return rq->cfs.avg.util_avg;
}
#endif
kernel/sched/stats.c
// SPDX-License-Identifier: GPL-2.0
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
+/*
+ * /proc/schedstat implementation
+ */
#include "sched.h"

/*
- * bump this up when changing the output format or the meaning of an existing
+ * Current schedstat API version.
+ *
+ * Bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
#define SCHEDSTAT_VERSION 15
...
...
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
* This itererator needs some explanation.
* It returns 1 for the header position.
* This means 2 is cpu 0.
- * In a hotplugged system some cpus, including cpu 0, may be missing so we have
- * to use cpumask_* to iterate over the cpus.
+ * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the CPUs.
 */
static void *schedstat_start(struct seq_file *file, loff_t *offset)
{
...
...
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
	if (n < nr_cpu_ids)
		return (void *)(unsigned long)(n + 2);

	return NULL;
}

static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
{
	(*offset)++;

	return schedstat_start(file, offset);
}
...
...
@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = {
static int __init proc_schedstat_init(void)
{
	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);

	return 0;
}
subsys_initcall(proc_schedstat_init);
kernel/sched/stats.h
...
...
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
	if (rq)
		rq->rq_sched_info.run_delay += delta;
}

-#define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
+#define   schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
#define __schedstat_inc(var)		do { var++; } while (0)
-#define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
+#define   schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
#define __schedstat_add(var, amt)	do { var += (amt); } while (0)
-#define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0)
-#define __schedstat_set(var, val)	do { var = (val); } while (0)
-#define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)
-#define schedstat_val(var)		(var)
-#define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
-
-#else /* !CONFIG_SCHEDSTATS */
-static inline void
-rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
-{}
-static inline void
-rq_sched_info_depart(struct rq *rq, unsigned long long delta)
-{}
-#define schedstat_enabled()		0
-#define __schedstat_inc(var)		do { } while (0)
-#define schedstat_inc(var)		do { } while (0)
-#define __schedstat_add(var, amt)	do { } while (0)
-#define schedstat_add(var, amt)		do { } while (0)
-#define __schedstat_set(var, val)	do { } while (0)
-#define schedstat_set(var, val)		do { } while (0)
-#define schedstat_val(var)		0
-#define schedstat_val_or_zero(var)	0
+#define   schedstat_add(var, amt)	do { if (schedstat_enabled()) { var += (amt); } } while (0)
+#define __schedstat_set(var, val)	do { var = (val); } while (0)
+#define   schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
+#define   schedstat_val(var)		(var)
+#define   schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
+
+#else /* !CONFIG_SCHEDSTATS: */
+static inline void rq_sched_info_arrive  (struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
+static inline void rq_sched_info_depart  (struct rq *rq, unsigned long long delta) { }
+# define   schedstat_enabled()		0
+# define __schedstat_inc(var)		do { } while (0)
+# define   schedstat_inc(var)		do { } while (0)
+# define __schedstat_add(var, amt)	do { } while (0)
+# define   schedstat_add(var, amt)	do { } while (0)
+# define __schedstat_set(var, val)	do { } while (0)
+# define   schedstat_set(var, val)	do { } while (0)
+# define   schedstat_val(var)		0
+# define   schedstat_val_or_zero(var)	0
#endif
/* CONFIG_SCHEDSTATS */
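As a hedged usage illustration (rq and p stand for the usual runqueue and task pointers, and the fields named are existing schedstat counters; this is not taken verbatim from any one call site): the plain schedstat_*() variants do their own enabled-check, while the __schedstat_*() variants assume the caller has already checked once.

	/* Cheap when schedstats are off: schedstat_enabled() is a static branch. */
	schedstat_inc(rq->yld_count);

	if (schedstat_enabled()) {
		/* Several updates under a single check: use the __ variants. */
		__schedstat_inc(rq->sched_count);
		__schedstat_set(p->se.statistics.wait_start, rq_clock(rq));
	}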
#ifdef CONFIG_SCHED_INFO
...
...
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
/*
* We are interested in knowing how long it was from the *first* time a
- * task was queued to the time that it finally hit a cpu, we call this routine
- * from dequeue_task() to account for possible rq->clock skew across cpus. The
- * delta taken on each cpu would annul the skew.
+ * task was queued to the time that it finally hit a CPU, we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across CPUs. The
+ * delta taken on each CPU would annul the skew.
 */
static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
{
...
...
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
}
/*
- * Called when a task finally hits the cpu. We can now calculate how
+ * Called when a task finally hits the CPU. We can now calculate how
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
...
...
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
-	if (unlikely(sched_info_on()))
+	if (unlikely(sched_info_on())) {
		if (!t->sched_info.last_queued)
			t->sched_info.last_queued = rq_clock(rq);
+	}
}
/*
...
...
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
*/
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
-	unsigned long long delta = rq_clock(rq) -
-		t->sched_info.last_arrival;
+	unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;

	rq_sched_info_depart(rq, delta);
...
...
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
* the idle task.) We are only called when prev != next.
*/
static inline void
-__sched_info_switch(struct rq *rq,
-		    struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
	/*
-	 * prev now departs the cpu. It's not interesting to record
+	 * prev now departs the CPU. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
...
...
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
	if (next != rq->idle)
		sched_info_arrive(rq, next);
}

static inline void
-sched_info_switch(struct rq *rq,
-		  struct task_struct *prev, struct task_struct *next)
+sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
	if (unlikely(sched_info_on()))
		__sched_info_switch(rq, prev, next);
}
-#else
-#define sched_info_queued(rq, t)		do { } while (0)
-#define sched_info_reset_dequeued(t)		do { } while (0)
-#define sched_info_dequeued(rq, t)		do { } while (0)
-#define sched_info_depart(rq, t)		do { } while (0)
-#define sched_info_arrive(rq, next)		do { } while (0)
-#define sched_info_switch(rq, t, next)		do { } while (0)
+#else /* !CONFIG_SCHED_INFO: */
+# define sched_info_queued(rq, t)	do { } while (0)
+# define sched_info_reset_dequeued(t)	do { } while (0)
+# define sched_info_dequeued(rq, t)	do { } while (0)
+# define sched_info_depart(rq, t)	do { } while (0)
+# define sched_info_arrive(rq, next)	do { } while (0)
+# define sched_info_switch(rq, t, next)	do { } while (0)
#endif /* CONFIG_SCHED_INFO */
kernel/sched/stop_task.c
// SPDX-License-Identifier: GPL-2.0
#include "sched.h"
/*
* stop-task scheduling class.
*
...
...
@@ -9,6 +7,7 @@
*
* See kernel/stop_machine.c
*/
#include "sched.h"
#ifdef CONFIG_SMP
static int
...
...
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
	cgroup_account_cputime(curr, delta_exec);
}

+/*
+ * scheduler tick hitting a task of our scheduling class.
+ *
+ * NOTE: This function can be called remotely by the tick offload that
+ * goes along full dynticks. Therefore no local assumption can be made
+ * and everything must be accessed through the @rq and @curr passed in
+ * parameters.
+ */
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
{
}
...
...
kernel/sched/swait.c
// SPDX-License-Identifier: GPL-2.0
-#include <linux/sched/signal.h>
-#include <linux/swait.h>
+/*
+ * <linux/swait.h> (simple wait queues ) implementation:
+ */
+#include "sched.h"

void __init_swait_queue_head(struct swait_queue_head *q, const char *name, struct lock_class_key *key)
...
...
kernel/sched/topology.c
...
...
@@ -2,10 +2,6 @@
/*
* Scheduler topology setup/handling methods
*/
-#include <linux/sched.h>
-#include <linux/mutex.h>
-#include <linux/sched/isolation.h>
-
#include "sched.h"

DEFINE_MUTEX(sched_domains_mutex);
...
...
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
	if (!(sd->flags & SD_LOAD_BALANCE)) {
		printk("does not load-balance\n");
		if (sd->parent)
-			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
-					" has parent");
+			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
		return -1;
}
...
...
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
		cpumask_pr_args(sched_domain_span(sd)), sd->name);

	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-		printk(KERN_ERR "ERROR: domain->span does not contain "
-				"CPU%d\n", cpu);
+		printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
	}
	if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
-		printk(KERN_ERR "ERROR: domain->groups does not contain"
-				" CPU%d\n", cpu);
+		printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
	}

	printk(KERN_DEBUG "%*s groups:", level + 1, "");
...
...
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
	if (sd->parent && !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
-		printk(KERN_ERR "ERROR: parent span is not a superset "
-		       "of domain->span\n");
+		printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");

	return 0;
}
...
...
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg)
* are not.
*
* This leads to a few particularly weird cases where the sched_domain's are
- * not of the same number for each cpu. Consider:
+ * not of the same number for each CPU. Consider:
*
* NUMA-2 0-3 0-3
* groups: {0-2},{1-3} {1-3},{0-2}
...
...
@@ -780,7 +772,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
* ^ ^ ^ ^
* `-' `-'
*
- * The sched_domains are per-cpu and have a two way link (parent & child) and
+ * The sched_domains are per-CPU and have a two way link (parent & child) and
* denote the ever growing mask of CPUs belonging to that level of topology.
*
* Each sched_domain has a circular (double) linked list of sched_group's, each
...
...
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
	d->rd = alloc_rootdomain();
	if (!d->rd)
		return sa_sd;

	return sa_rootdomain;
}
...
...
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
}
#ifdef CONFIG_NUMA
enum numa_topology_type sched_numa_topology_type;
-static int sched_domains_numa_levels;
-static int *sched_domains_numa_distance;
-int sched_max_numa_distance;
-static struct cpumask ***sched_domains_numa_masks;
-static int sched_domains_curr_level;
+static int			sched_domains_numa_levels;
+static int			sched_domains_curr_level;
+int				sched_max_numa_distance;
+static int			*sched_domains_numa_distance;
+static struct cpumask		***sched_domains_numa_masks;
#endif
/*
...
...
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level;
* SD_ASYM_PACKING - describes SMT quirks
*/
#define TOPOLOGY_SD_FLAGS \
-	(SD_SHARE_CPUCAPACITY |		\
+	(SD_SHARE_CPUCAPACITY	|	\
	 SD_SHARE_PKG_RESOURCES |	\
-	 SD_NUMA |			\
-	 SD_ASYM_PACKING |		\
-	 SD_ASYM_CPUCAPACITY |		\
+	 SD_NUMA		|	\
+	 SD_ASYM_PACKING	|	\
+	 SD_ASYM_CPUCAPACITY	|	\
	 SD_SHARE_POWERDOMAIN)

static struct sched_domain *
...
...
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
		pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name);
#endif
-		/* Fixup, ensure @sd has at least @child cpus. */
+		/* Fixup, ensure @sd has at least @child CPUs. */
		cpumask_or(sched_domain_span(sd), sched_domain_span(sd), sched_domain_span(child));
...
...
@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
	ret = 0;
error:
	__free_domain_allocs(&d, alloc_state, cpu_map);

	return ret;
}
...
...
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
		return 1;

	tmp = SD_ATTR_INIT;

	return !memcmp(cur ? (cur + idx_cur) : &tmp,
		       new ? (new + idx_new) : &tmp,
		       sizeof(struct sched_domain_attr));
...
...
@@ -1929,4 +1926,3 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
	mutex_unlock(&sched_domains_mutex);
}
kernel/sched/wait.c
...
...
@@ -3,14 +3,7 @@
*
* (C) 2004 Nadia Yvette Chambers, Oracle
*/
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/mm.h>
-#include <linux/wait.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
+#include "sched.h"

void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
{
...
...
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
			break;
		}
	}

	return nr_exclusive;
}
...
...
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
	spin_unlock(&wq->lock);
	schedule();
	spin_lock(&wq->lock);

	return 0;
}
EXPORT_SYMBOL(do_wait_intr);
...
...
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
	spin_unlock_irq(&wq->lock);
	schedule();
	spin_lock_irq(&wq->lock);

	return 0;
}
EXPORT_SYMBOL(do_wait_intr_irq);
...
...
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
	if (ret)
		list_del_init(&wq_entry->entry);

	return ret;
}
EXPORT_SYMBOL(autoremove_wake_function);
...
...
kernel/sched/wait_bit.c
-#include <linux/wait_bit.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/hash.h>
+/*
+ * The implementation of the wait_bit*() and related waiting APIs:
+ */
+#include "sched.h"
#define WAIT_TABLE_BITS 8
#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
...
...
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
	    wait_bit->key.bit_nr != key->bit_nr ||
	    test_bit(key->bit_nr, key->flags))
		return 0;
-	else
-		return autoremove_wake_function(wq_entry, mode, sync, key);
+
+	return autoremove_wake_function(wq_entry, mode, sync, key);
}
EXPORT_SYMBOL(wake_bit_function);
...
...
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
		if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
			ret = (*action)(&wbq_entry->key, mode);
	} while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);

	finish_wait(wq_head, &wbq_entry->wq_entry);

	return ret;
}
EXPORT_SYMBOL(__wait_on_bit);
...
...
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout(
	DEFINE_WAIT_BIT(wq_entry, word, bit);

	wq_entry.key.timeout = jiffies + timeout;

	return __wait_on_bit(wq_head, &wq_entry, action, mode);
}
EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
...
...
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
{
	struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);

	if (waitqueue_active(wq_head))
		__wake_up(wq_head, TASK_NORMAL, 1, &key);
}
...
...
@@ -148,6 +149,54 @@ void wake_up_bit(void *word, int bit)
}
EXPORT_SYMBOL(wake_up_bit);

+wait_queue_head_t *__var_waitqueue(void *p)
+{
+	if (BITS_PER_LONG == 64) {
+		unsigned long q = (unsigned long)p;
+
+		return bit_waitqueue((void *)(q & ~1), q & 1);
+	}
+	return bit_waitqueue(p, 0);
+}
+EXPORT_SYMBOL(__var_waitqueue);
+
+static int
+var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
+		  int sync, void *arg)
+{
+	struct wait_bit_key *key = arg;
+	struct wait_bit_queue_entry *wbq_entry =
+		container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
+
+	if (wbq_entry->key.flags != key->flags ||
+	    wbq_entry->key.bit_nr != key->bit_nr)
+		return 0;
+
+	return autoremove_wake_function(wq_entry, mode, sync, key);
+}
+
+void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags)
+{
+	*wbq_entry = (struct wait_bit_queue_entry){
+		.key = {
+			.flags	= (var),
+			.bit_nr	= -1,
+		},
+		.wq_entry = {
+			.private = current,
+			.func	 = var_wake_function,
+			.entry	 = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
+		},
+	};
+}
+EXPORT_SYMBOL(init_wait_var_entry);
+
+void wake_up_var(void *var)
+{
+	__wake_up_bit(__var_waitqueue(var), var, -1);
+}
+EXPORT_SYMBOL(wake_up_var);
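The functions added above back the new wait_var_event()/wake_up_var() helpers declared in include/linux/wait_bit.h, which this merge also touches. A hedged usage sketch, where my_count is an invented atomic_t that both sides agree to key the waitqueue on:

	/* Waiter: sleep until the counter drops to zero. */
	wait_var_event(&my_count, atomic_read(&my_count) == 0);

	/* Waker: after updating the variable, kick anyone waiting on its address. */
	if (atomic_dec_and_test(&my_count))
		wake_up_var(&my_count);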
/*
* Manipulate the atomic_t address to produce a better bit waitqueue table hash
* index (we're keying off bit -1, but that would produce a horrible hash
...
...
@@ -157,6 +206,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
{
	if (BITS_PER_LONG == 64) {
		unsigned long q = (unsigned long)p;

		return bit_waitqueue((void *)(q & ~1), q & 1);
	}
	return bit_waitqueue(p, 0);
...
...
@@ -173,6 +223,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
	    wait_bit->key.bit_nr != key->bit_nr ||
	    atomic_read(val) != 0)
		return 0;

	return autoremove_wake_function(wq_entry, mode, sync, key);
}
...
...
@@ -196,6 +247,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
			ret = (*action)(val, mode);
	} while (!ret && atomic_read(val) != 0);

	finish_wait(wq_head, &wbq_entry->wq_entry);

	return ret;
}
...
...
@@ -226,6 +278,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
	schedule();
	if (signal_pending_state(mode, current))
		return -EINTR;

	return 0;
}
EXPORT_SYMBOL(atomic_t_wait);
...
...
@@ -250,6 +303,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode)
	schedule();
	if (signal_pending_state(mode, current))
		return -EINTR;

	return 0;
}
EXPORT_SYMBOL(bit_wait);
...
...
@@ -259,6 +313,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
	io_schedule();
	if (signal_pending_state(mode, current))
		return -EINTR;

	return 0;
}
EXPORT_SYMBOL(bit_wait_io);
...
...
@@ -266,11 +321,13 @@ EXPORT_SYMBOL(bit_wait_io);
__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
{
	unsigned long now = READ_ONCE(jiffies);

	if (time_after_eq(now, word->timeout))
		return -EAGAIN;
	schedule_timeout(word->timeout - now);
	if (signal_pending_state(mode, current))
		return -EINTR;

	return 0;
}
EXPORT_SYMBOL_GPL(bit_wait_timeout);
...
...
@@ -278,11 +335,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
{
	unsigned long now = READ_ONCE(jiffies);

	if (time_after_eq(now, word->timeout))
		return -EAGAIN;
	io_schedule_timeout(word->timeout - now);
	if (signal_pending_state(mode, current))
		return -EINTR;

	return 0;
}
EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
...
...
kernel/time/tick-sched.c
...
...
@@ -481,11 +481,18 @@ static int __init setup_tick_nohz(char *str)
__setup("nohz=", setup_tick_nohz);

-int tick_nohz_tick_stopped(void)
+bool tick_nohz_tick_stopped(void)
{
	return __this_cpu_read(tick_cpu_sched.tick_stopped);
}

+bool tick_nohz_tick_stopped_cpu(int cpu)
+{
+	struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
+
+	return ts->tick_stopped;
+}
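tick_nohz_tick_stopped_cpu() lets one CPU query whether another CPU currently has its tick stopped. A hedged sketch of the kind of caller this enables (illustrative only; target_cpu is an assumed variable and the reschedule IPI is just one possible way to poke the remote CPU):

	/* Only disturb a nohz CPU if its tick really is stopped. */
	if (tick_nohz_tick_stopped_cpu(target_cpu))
		smp_send_reschedule(target_cpu);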
/**
* tick_nohz_update_jiffies - update jiffies when idle was interrupted
*
...
...
@@ -741,12 +748,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
		delta = KTIME_MAX;
	}

-#ifdef CONFIG_NO_HZ_FULL
-	/* Limit the tick delta to the maximum scheduler deferment */
-	if (!ts->inidle)
-		delta = min(delta, scheduler_tick_max_deferment());
-#endif
-
	/* Calculate the next expiry time */
	if (delta < (KTIME_MAX - basemono))
		expires = basemono + delta;
...
...
kernel/workqueue.c
...
...
@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void)
int __init workqueue_init_early(void)
{
	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+	int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
	int i, cpu;

	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN));
+	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));

	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
...
...
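With this change, unbound workqueues default to the housekeeping CPUs, i.e. CPUs not carved out by CPU isolation. As a hedged illustration of code that simply inherits that default mask (example_wq and some_work are invented names; some_work stands for a struct work_struct initialized elsewhere):

	struct workqueue_struct *example_wq;

	/* Work queued here runs on housekeeping CPUs by default. */
	example_wq = alloc_workqueue("example_wq", WQ_UNBOUND, 0);
	if (!example_wq)
		return -ENOMEM;

	queue_work(example_wq, &some_work);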