OpenHarmony / kernel_linux

Commit 20737738
Authored on Dec 13, 2016 by Shaohua Li

Merge branch 'md-next' into md-linus

Parents: b78b499a, 2953079c
Showing 16 changed files with 3429 additions and 1262 deletions.
drivers/md/bitmap.c                  +98    -68
drivers/md/dm-raid.c                  +2     -2
drivers/md/linear.c                  +17    -14
drivers/md/md.c                     +364   -337
drivers/md/md.h                      +67    -41
drivers/md/multipath.c               +34    -58
drivers/md/raid0.c                   +59    -48
drivers/md/raid1.c                  +165    -82
drivers/md/raid1.h                   +11     -8
drivers/md/raid10.c                 +189   -106
drivers/md/raid10.h                   +2     -0
drivers/md/raid5-cache.c           +1645   -240
drivers/md/raid5.c                  +381   -242
drivers/md/raid5.h                  +160    -12
include/uapi/linux/raid/md_p.h        +6     -1
lib/raid6/avx2.c                    +229     -3
drivers/md/bitmap.c

@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/buffer_head.h>
 #include <linux/seq_file.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "bitmap.h"
@@ -208,11 +209,13 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	struct md_rdev *rdev = NULL;
+	struct md_rdev *rdev;
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;

+restart:
+	rdev = NULL;
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
 		loff_t offset = mddev->bitmap_info.offset;
@@ -268,8 +271,8 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 			       page);
 	}

-	if (wait)
-		md_super_wait(mddev);
+	if (wait && md_super_wait(mddev) < 0)
+		goto restart;
 	return 0;

  bad_alignment:
@@ -405,7 +408,7 @@ static int read_page(struct file *file, unsigned long index,
 	ret = -EIO;
 out:
 	if (ret)
-		printk(KERN_ALERT "md: bitmap read error: (%dB @ %llu): %d\n",
+		pr_err("md: bitmap read error: (%dB @ %llu): %d\n",
 			(int)PAGE_SIZE,
 			(unsigned long long)index << PAGE_SHIFT,
 			ret);
@@ -416,6 +419,28 @@ static int read_page(struct file *file, unsigned long index,
  * bitmap file superblock operations
  */

+/*
+ * bitmap_wait_writes() should be called before writing any bitmap
+ * blocks, to ensure previous writes, particularly from
+ * bitmap_daemon_work(), have completed.
+ */
+static void bitmap_wait_writes(struct bitmap *bitmap)
+{
+	if (bitmap->storage.file)
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes) == 0);
+	else
+		/* Note that we ignore the return value.  The writes
+		 * might have failed, but that would just mean that
+		 * some bits which should be cleared haven't been,
+		 * which is safe.  The relevant bitmap blocks will
+		 * probably get written again, but there is no great
+		 * loss if they aren't.
+		 */
+		md_super_wait(bitmap->mddev);
+}
+
+
 /* update the event counter and sync the superblock to disk */
 void bitmap_update_sb(struct bitmap *bitmap)
 {
@@ -455,24 +480,24 @@ void bitmap_print_sb(struct bitmap *bitmap)
 	if (!bitmap || !bitmap->storage.sb_page)
 		return;
 	sb = kmap_atomic(bitmap->storage.sb_page);
-	printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
-	printk(KERN_DEBUG "         magic: %08x\n", le32_to_cpu(sb->magic));
-	printk(KERN_DEBUG "       version: %d\n", le32_to_cpu(sb->version));
-	printk(KERN_DEBUG "          uuid: %08x.%08x.%08x.%08x\n",
-					*(__u32 *)(sb->uuid+0),
-					*(__u32 *)(sb->uuid+4),
-					*(__u32 *)(sb->uuid+8),
-					*(__u32 *)(sb->uuid+12));
-	printk(KERN_DEBUG "        events: %llu\n",
-			(unsigned long long) le64_to_cpu(sb->events));
-	printk(KERN_DEBUG "events cleared: %llu\n",
-			(unsigned long long) le64_to_cpu(sb->events_cleared));
-	printk(KERN_DEBUG "         state: %08x\n", le32_to_cpu(sb->state));
-	printk(KERN_DEBUG "     chunksize: %d B\n", le32_to_cpu(sb->chunksize));
-	printk(KERN_DEBUG "  daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
-	printk(KERN_DEBUG "     sync size: %llu KB\n",
-			(unsigned long long)le64_to_cpu(sb->sync_size)/2);
-	printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
+	pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
+	pr_debug("         magic: %08x\n", le32_to_cpu(sb->magic));
+	pr_debug("       version: %d\n", le32_to_cpu(sb->version));
+	pr_debug("          uuid: %08x.%08x.%08x.%08x\n",
+		 *(__u32 *)(sb->uuid+0),
+		 *(__u32 *)(sb->uuid+4),
+		 *(__u32 *)(sb->uuid+8),
+		 *(__u32 *)(sb->uuid+12));
+	pr_debug("        events: %llu\n",
+		 (unsigned long long) le64_to_cpu(sb->events));
+	pr_debug("events cleared: %llu\n",
+		 (unsigned long long) le64_to_cpu(sb->events_cleared));
+	pr_debug("         state: %08x\n", le32_to_cpu(sb->state));
+	pr_debug("     chunksize: %d B\n", le32_to_cpu(sb->chunksize));
+	pr_debug("  daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
+	pr_debug("     sync size: %llu KB\n",
+		 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
+	pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
 	kunmap_atomic(sb);
 }
@@ -506,14 +531,14 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
 	BUG_ON(!chunksize);
 	if (!is_power_of_2(chunksize)) {
 		kunmap_atomic(sb);
-		printk(KERN_ERR "bitmap chunksize not a power of 2\n");
+		pr_warn("bitmap chunksize not a power of 2\n");
 		return -EINVAL;
 	}
 	sb->chunksize = cpu_to_le32(chunksize);

 	daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep;
 	if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) {
-		printk(KERN_INFO "Choosing daemon_sleep default (5 sec)\n");
+		pr_debug("Choosing daemon_sleep default (5 sec)\n");
 		daemon_sleep = 5 * HZ;
 	}
 	sb->daemon_sleep = cpu_to_le32(daemon_sleep);
@@ -584,7 +609,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		/* to 4k blocks */
 		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
 		offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
-		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+		pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
 			bitmap->cluster_slot, offset);
 	}
@@ -634,7 +659,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	else if (write_behind > COUNTER_MAX)
 		reason = "write-behind limit out of range (0 - 16383)";
 	if (reason) {
-		printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n",
+		pr_warn("%s: invalid bitmap file superblock: %s\n",
 			bmname(bitmap), reason);
 		goto out;
 	}
@@ -648,16 +673,13 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	 * bitmap's UUID and event counter to the mddev's
 	 */
 	if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
-		printk(KERN_INFO
-		       "%s: bitmap superblock UUID mismatch\n",
-		       bmname(bitmap));
+		pr_warn("%s: bitmap superblock UUID mismatch\n",
+			bmname(bitmap));
 		goto out;
 	}
 	events = le64_to_cpu(sb->events);
 	if (!nodes && (events < bitmap->mddev->events)) {
-		printk(KERN_INFO
-		       "%s: bitmap file is out of date (%llu < %llu) "
-		       "-- forcing full recovery\n",
-		       bmname(bitmap), events,
-		       (unsigned long long) bitmap->mddev->events);
+		pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n",
+			bmname(bitmap), events,
+			(unsigned long long) bitmap->mddev->events);
 		set_bit(BITMAP_STALE, &bitmap->flags);
@@ -679,7 +701,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (err == 0 && nodes && (bitmap->cluster_slot < 0)) {
 		err = md_setup_cluster(bitmap->mddev, nodes);
 		if (err) {
-			pr_err("%s: Could not setup cluster service (%d)\n",
+			pr_warn("%s: Could not setup cluster service (%d)\n",
 			       bmname(bitmap), err);
 			goto out_no_sb;
 		}
@@ -847,14 +869,12 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 			ptr = file_path(bitmap->storage.file,
 				     path, PAGE_SIZE);

-			printk(KERN_ALERT
-			      "%s: kicking failed bitmap file %s from array!\n",
-			      bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
+			pr_warn("%s: kicking failed bitmap file %s from array!\n",
+				bmname(bitmap), IS_ERR(ptr) ? "" : ptr);

 			kfree(path);
 		} else
-			printk(KERN_ALERT
-			       "%s: disabling internal bitmap due to errors\n",
-			       bmname(bitmap));
+			pr_warn("%s: disabling internal bitmap due to errors\n",
+				bmname(bitmap));
 	}
 }
@@ -983,6 +1003,7 @@ void bitmap_unplug(struct bitmap *bitmap)
 {
 	unsigned long i;
 	int dirty, need_write;
+	int writing = 0;

 	if (!bitmap || !bitmap->storage.filemap ||
 	    test_bit(BITMAP_STALE, &bitmap->flags))
@@ -997,15 +1018,19 @@ void bitmap_unplug(struct bitmap *bitmap)
 		need_write = test_and_clear_page_attr(bitmap, i,
 						      BITMAP_PAGE_NEEDWRITE);
 		if (dirty || need_write) {
+			if (!writing) {
+				bitmap_wait_writes(bitmap);
+				if (bitmap->mddev->queue)
+					blk_add_trace_msg(bitmap->mddev->queue,
+							  "md bitmap_unplug");
+			}
 			clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
 			write_page(bitmap, bitmap->storage.filemap[i], 0);
+			writing = 1;
 		}
 	}
-	if (bitmap->storage.file)
-		wait_event(bitmap->write_wait,
-			   atomic_read(&bitmap->pending_writes) == 0);
-	else
-		md_super_wait(bitmap->mddev);
+	if (writing)
+		bitmap_wait_writes(bitmap);

 	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
 		bitmap_file_kick(bitmap);
@@ -1056,11 +1081,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
 	if (outofdate)
-		printk(KERN_INFO "%s: bitmap file is out of date, doing full "
-			"recovery\n", bmname(bitmap));
+		pr_warn("%s: bitmap file is out of date, doing full recovery\n",
+			bmname(bitmap));

 	if (file && i_size_read(file->f_mapping->host) < store->bytes) {
-		printk(KERN_INFO "%s: bitmap file too short %lu < %lu\n",
+		pr_warn("%s: bitmap file too short %lu < %lu\n",
 			bmname(bitmap),
 			(unsigned long) i_size_read(file->f_mapping->host),
 			store->bytes);
@@ -1137,15 +1161,14 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 		offset = 0;
 	}

-	printk(KERN_INFO "%s: bitmap initialized from disk: "
-	       "read %lu pages, set %lu of %lu bits\n",
-	       bmname(bitmap), store->file_pages,
-	       bit_cnt, chunks);
+	pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
+		 bmname(bitmap), store->file_pages,
+		 bit_cnt, chunks);

 	return 0;

  err:
-	printk(KERN_INFO "%s: bitmap initialisation failed: %d\n",
-	       bmname(bitmap), ret);
+	pr_warn("%s: bitmap initialisation failed: %d\n",
+		bmname(bitmap), ret);
 	return ret;
 }
@@ -1225,6 +1248,10 @@ void bitmap_daemon_work(struct mddev *mddev)
 	}
 	bitmap->allclean = 1;

+	if (bitmap->mddev->queue)
+		blk_add_trace_msg(bitmap->mddev->queue,
+				  "md bitmap_daemon_work");
+
 	/* Any file-page which is PENDING now needs to be written.
 	 * So set NEEDWRITE now, then after we make any last-minute changes
 	 * we will write it.
@@ -1289,6 +1316,7 @@ void bitmap_daemon_work(struct mddev *mddev)
 	}
 	spin_unlock_irq(&counts->lock);

+	bitmap_wait_writes(bitmap);
 	/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
 	 * DIRTY pages need to be written by bitmap_unplug so it can wait
 	 * for them.
@@ -1595,7 +1623,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
 		   atomic_read(&bitmap->mddev->recovery_active) == 0);

 	bitmap->mddev->curr_resync_completed = sector;
-	set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
+	set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags);
 	sector &= ~((1ULL << bitmap->counts.chunkshift) - 1);
 	s = 0;
 	while (s < sector && s < bitmap->mddev->resync_max_sectors) {
@@ -1825,7 +1853,7 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 	if (err)
 		goto error;

-	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
-	       bitmap->counts.pages, bmname(bitmap));
+	pr_debug("created bitmap (%lu pages) for device %s\n",
+		 bitmap->counts.pages, bmname(bitmap));

 	err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
@@ -2029,8 +2057,10 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 			   !bitmap->mddev->bitmap_info.external,
 			   mddev_is_clustered(bitmap->mddev)
 			   ? bitmap->cluster_slot : 0);
-	if (ret)
+	if (ret) {
+		bitmap_file_unmap(&store);
 		goto err;
+	}

 	pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO);
@@ -2089,7 +2119,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 				bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
 									     BITMAP_BLOCK_SHIFT);
 				blocks = old_counts.chunks << old_counts.chunkshift;
-				pr_err("Could not pre-allocate in-memory bitmap for cluster raid\n");
+				pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
 				break;
 			} else
 				bitmap->counts.bp[page].count += 1;
@@ -2266,7 +2296,7 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 				/* Ensure new bitmap info is stored in
 				 * metadata promptly.
 				 */
-				set_bit(MD_CHANGE_DEVS, &mddev->flags);
+				set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 				md_wakeup_thread(mddev->thread);
 			}
 			rv = 0;
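Taken together, the write_sb_page() hunks above and the new int return of md_super_wait() (see the md.c section below) give the bitmap writer a retry loop: if a failfast superblock write was rejected, the whole pass over the active rdevs is repeated. Condensed to its control flow, as a sketch rather than a standalone build:

	restart:
		rdev = NULL;
		while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
			/* ... queue this rdev's bitmap superblock page for write ... */
		}
		if (wait && md_super_wait(mddev) < 0)
			goto restart;	/* a failfast write failed; redo the pass */
		return 0;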
drivers/md/dm-raid.c

@@ -2011,7 +2011,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 		sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);

 		/* Force writing of superblocks to disk */
-		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags);

 		/* Any superblock is better than none, choose that if given */
 		return refdev ? 0 : 1;
@@ -3497,7 +3497,7 @@ static void rs_update_sbs(struct raid_set *rs)
 	struct mddev *mddev = &rs->md;
 	int ro = mddev->ro;

-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	mddev->ro = 0;
 	md_update_sb(mddev, 1);
 	mddev->ro = ro;
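Both dm-raid.c hunks follow the rename that runs through this series: the superblock-change bits move from the shared mddev->flags word into a dedicated mddev->sb_flags. The md.h hunk is not expanded on this page; as a reference sketch, the flag set as the merged series defines it (comments abridged from md.h):

	enum mddev_sb_flags {
		MD_SB_CHANGE_DEVS,	/* some device status has changed */
		MD_SB_CHANGE_CLEAN,	/* transition to or from 'clean' */
		MD_SB_CHANGE_PENDING,	/* switch from 'clean' to 'active' in progress */
		MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
	};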
drivers/md/linear.c

@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "linear.h"
@@ -101,7 +102,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 		sector_t sectors;

 		if (j < 0 || j >= raid_disks || disk->rdev) {
-			printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n",
+			pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
 			       mdname(mddev));
 			goto out;
 		}
@@ -123,7 +124,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 			discard_supported = true;
 	}
 	if (cnt != raid_disks) {
-		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
+		pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
 		       mdname(mddev));
 		goto out;
 	}
@@ -227,22 +228,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 	}

 	do {
-		tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
+		sector_t bio_sector = bio->bi_iter.bi_sector;
+		tmp_dev = which_dev(mddev, bio_sector);
 		start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
 		end_sector = tmp_dev->end_sector;
 		data_offset = tmp_dev->rdev->data_offset;
 		bio->bi_bdev = tmp_dev->rdev->bdev;

-		if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
-			     bio->bi_iter.bi_sector < start_sector))
+		if (unlikely(bio_sector >= end_sector ||
+			     bio_sector < start_sector))
 			goto out_of_bounds;

 		if (unlikely(bio_end_sector(bio) > end_sector)) {
 			/* This bio crosses a device boundary, so we have to
 			 * split it.
 			 */
-			split = bio_split(bio, end_sector -
-					  bio->bi_iter.bi_sector,
+			split = bio_split(bio, end_sector - bio_sector,
 					  GFP_NOIO, fs_bio_set);
 			bio_chain(split, bio);
 		} else {
@@ -256,15 +257,18 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 			     !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
 			/* Just ignore it */
 			bio_endio(split);
-		} else
+		} else {
+			if (mddev->gendisk)
+				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
+						      split, disk_devt(mddev->gendisk),
+						      bio_sector);
 			generic_make_request(split);
+		}
 	} while (split != bio);
 	return;

 out_of_bounds:
-	printk(KERN_ERR
-	       "md/linear:%s: make_request: Sector %llu out of bounds on "
-	       "dev %s: %llu sectors, offset %llu\n",
+	pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
 	       mdname(mddev),
 	       (unsigned long long)bio->bi_iter.bi_sector,
 	       bdevname(tmp_dev->rdev->bdev, b),
@@ -275,7 +279,6 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)

 static void linear_status (struct seq_file *seq, struct mddev *mddev)
 {
-
 	seq_printf(seq, "  %dk rounding", mddev->chunk_sectors / 2);
 }
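The linear_make_request() changes cache bi_sector in bio_sector before bio_split() can consume the iterator, and report each remapped fragment to blktrace. Stripped of the md specifics, the split-and-chain shape is roughly the following fragment (a sketch, assuming the surrounding driver context; not a complete function):

	sector_t bio_sector = bio->bi_iter.bi_sector;	/* sample before splitting */

	if (bio_end_sector(bio) > end_sector) {
		/* carve off the prefix that fits this device */
		split = bio_split(bio, end_sector - bio_sector,
				  GFP_NOIO, fs_bio_set);
		bio_chain(split, bio);	/* parent completes after the child */
	} else {
		split = bio;
	}
	if (mddev->gendisk)	/* attribute the remap for blktrace */
		trace_block_bio_remap(bdev_get_queue(split->bi_bdev), split,
				      disk_devt(mddev->gendisk), bio_sector);
	generic_make_request(split);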
drivers/md/md.c

@@ -30,6 +30,18 @@
    You should have received a copy of the GNU General Public License
    (for example /usr/src/linux/COPYING); if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+   Errors, Warnings, etc.
+   Please use:
+     pr_crit() for error conditions that risk data loss
+     pr_err() for error conditions that are unexpected, like an IO error
+         or internal inconsistency
+     pr_warn() for error conditions that could have been predicated, like
+         adding a device to an array when it has incompatible metadata
+     pr_info() for every interesting, very rare events, like an array starting
+         or stopping, or resync starting or stopping
+     pr_debug() for everything else.
 */

 #include <linux/kthread.h>
@@ -52,6 +64,7 @@
 #include <linux/raid/md_p.h>
 #include <linux/raid/md_u.h>
 #include <linux/slab.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "bitmap.h"
 #include "md-cluster.h"
@@ -684,11 +697,8 @@ static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
 static int alloc_disk_sb(struct md_rdev *rdev)
 {
 	rdev->sb_page = alloc_page(GFP_KERNEL);
-	if (!rdev->sb_page) {
-		printk(KERN_ALERT "md: out of memory.\n");
+	if (!rdev->sb_page)
 		return -ENOMEM;
-	}

 	return 0;
 }
@@ -715,9 +725,15 @@ static void super_written(struct bio *bio)
 	struct mddev *mddev = rdev->mddev;

 	if (bio->bi_error) {
-		printk("md: super_written gets error=%d\n", bio->bi_error);
+		pr_err("md: super_written gets error=%d\n", bio->bi_error);
 		md_error(mddev, rdev);
-	}
+		if (!test_bit(Faulty, &rdev->flags)
+		    && (bio->bi_opf & MD_FAILFAST)) {
+			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
+			set_bit(LastDev, &rdev->flags);
+		}
+	} else
+		clear_bit(LastDev, &rdev->flags);

 	if (atomic_dec_and_test(&mddev->pending_writes))
 		wake_up(&mddev->sb_wait);
@@ -734,7 +750,13 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	 * if zero is reached.
 	 * If an error occurred, call md_error
 	 */
-	struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
+	struct bio *bio;
+	int ff = 0;
+
+	if (test_bit(Faulty, &rdev->flags))
+		return;
+
+	bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);

 	atomic_inc(&rdev->nr_pending);
@@ -743,16 +765,24 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA;
+
+	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
+	    test_bit(FailFast, &rdev->flags) &&
+	    !test_bit(LastDev, &rdev->flags))
+		ff = MD_FAILFAST;
+	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff;

 	atomic_inc(&mddev->pending_writes);
 	submit_bio(bio);
 }

-void md_super_wait(struct mddev *mddev)
+int md_super_wait(struct mddev *mddev)
 {
 	/* wait for all superblock writes that were scheduled to complete */
 	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
+	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
+		return -EAGAIN;
+	return 0;
 }

 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
@@ -795,7 +825,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
 	return 0;

 fail:
-	printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
+	pr_err("md: disabled device %s, could not read superblock.\n",
 	       bdevname(rdev->bdev, b));
 	return -EINVAL;
 }
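super_written() now records a rejected failfast write in MD_SB_NEED_REWRITE, and md_super_wait() converts that flag into -EAGAIN. Callers that must guarantee the write therefore loop, as the rdev_size_change hunks below do; the idiom in isolation:

	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);	/* retry without failfast */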
@@ -818,7 +848,6 @@ static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
 	if (!tmp1 || !tmp2) {
 		ret = 0;
-		printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
 		goto abort;
 	}
@@ -932,7 +961,7 @@ int md_check_no_bitmap(struct mddev *mddev)
 {
 	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
 		return 0;
-	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
-	       mdname(mddev), mddev->pers->name);
+	pr_warn("%s: bitmaps are not supported for %s\n",
+		mdname(mddev), mddev->pers->name);
 	return 1;
 }
@@ -956,7 +985,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 	rdev->sb_start = calc_dev_sboffset(rdev);

 	ret = read_disk_sb(rdev, MD_SB_BYTES);
-	if (ret) return ret;
+	if (ret)
+		return ret;

 	ret = -EINVAL;
@@ -964,17 +994,15 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 	sb = page_address(rdev->sb_page);

 	if (sb->md_magic != MD_SB_MAGIC) {
-		printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
-		       b);
+		pr_warn("md: invalid raid superblock magic on %s\n", b);
 		goto abort;
 	}

 	if (sb->major_version != 0 ||
 	    sb->minor_version < 90 ||
 	    sb->minor_version > 91) {
-		printk(KERN_WARNING "Bad version number %d.%d on %s\n",
-			sb->major_version, sb->minor_version,
-			b);
+		pr_warn("Bad version number %d.%d on %s\n",
+			sb->major_version, sb->minor_version, b);
 		goto abort;
 	}
@@ -982,8 +1010,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 		goto abort;

 	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
-		printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
-			b);
+		pr_warn("md: invalid superblock checksum on %s\n", b);
 		goto abort;
 	}
@@ -1004,13 +1031,12 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
 		__u64 ev1, ev2;
 		mdp_super_t *refsb = page_address(refdev->sb_page);

 		if (!uuid_equal(refsb, sb)) {
-			printk(KERN_WARNING "md: %s has different UUID to %s\n",
+			pr_warn("md: %s has different UUID to %s\n",
 				b, bdevname(refdev->bdev, b2));
 			goto abort;
 		}
 		if (!sb_equal(refsb, sb)) {
-			printk(KERN_WARNING "md: %s has same UUID"
-			       " but different superblock to %s\n",
+			pr_warn("md: %s has same UUID but different superblock to %s\n",
 			       b, bdevname(refdev->bdev, b2));
 			goto abort;
 		}
@@ -1158,6 +1184,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
 		}
 		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
+		if (desc->state & (1<<MD_DISK_FAILFAST))
+			set_bit(FailFast, &rdev->flags);
 	} else /* MULTIPATH are always insync */
 		set_bit(In_sync, &rdev->flags);
 	return 0;
@@ -1283,6 +1311,8 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
 		}
 		if (test_bit(WriteMostly, &rdev2->flags))
 			d->state |= (1<<MD_DISK_WRITEMOSTLY);
+		if (test_bit(FailFast, &rdev2->flags))
+			d->state |= (1<<MD_DISK_FAILFAST);
 	}
 	/* now set the "removed" and "faulty" bits on any missing devices */
 	for (i=0 ; i < mddev->raid_disks ; i++) {
@@ -1324,9 +1354,10 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
 	    rdev->mddev->level >= 1)
 		num_sectors = (sector_t)(2ULL << 32) - 2;
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
-		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	do {
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+			       rdev->sb_page);
+	} while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 }
@@ -1413,12 +1444,12 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 		return -EINVAL;

 	if (calc_sb_1_csum(sb) != sb->sb_csum) {
-		printk("md: invalid superblock checksum on %s\n",
+		pr_warn("md: invalid superblock checksum on %s\n",
 			bdevname(rdev->bdev,b));
 		return -EINVAL;
 	}
 	if (le64_to_cpu(sb->data_size) < 10) {
-		printk("md: data_size too small on %s\n",
-		       bdevname(rdev->bdev,b));
+		pr_warn("md: data_size too small on %s\n",
+			bdevname(rdev->bdev,b));
 		return -EINVAL;
 	}
@@ -1503,8 +1534,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
 		    sb->level != refsb->level ||
 		    sb->layout != refsb->layout ||
 		    sb->chunksize != refsb->chunksize) {
-			printk(KERN_WARNING "md: %s has strangely different"
-				" superblock to %s\n",
+			pr_warn("md: %s has strangely different superblock to %s\n",
 				bdevname(rdev->bdev,b),
 				bdevname(refdev->bdev,b2));
 			return -EINVAL;
@@ -1646,8 +1676,7 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		case MD_DISK_ROLE_JOURNAL: /* journal device */
 			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
 				/* journal device without journal feature */
-				printk(KERN_WARNING
-				  "md: journal device provided without journal feature, ignoring the device\n");
+				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
 				return -EINVAL;
 			}
 			set_bit(Journal, &rdev->flags);
@@ -1669,6 +1698,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
 		}
 		if (sb->devflags & WriteMostly1)
 			set_bit(WriteMostly, &rdev->flags);
+		if (sb->devflags & FailFast1)
+			set_bit(FailFast, &rdev->flags);
 		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
 			set_bit(Replacement, &rdev->flags);
 	} else /* MULTIPATH are always insync */
@@ -1707,6 +1738,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
 	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
+	if (test_bit(FailFast, &rdev->flags))
+		sb->devflags |= FailFast1;
+	else
+		sb->devflags &= ~FailFast1;

 	if (test_bit(WriteMostly, &rdev->flags))
 		sb->devflags |= WriteMostly1;
@@ -1863,9 +1898,10 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
 	sb->data_size = cpu_to_le64(num_sectors);
 	sb->super_offset = rdev->sb_start;
 	sb->sb_csum = calc_sb_1_csum(sb);
-	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
-		       rdev->sb_page);
-	md_super_wait(rdev->mddev);
+	do {
+		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
+			       rdev->sb_page);
+	} while (md_super_wait(rdev->mddev) < 0);
 	return num_sectors;
 }
@@ -2004,9 +2040,9 @@ int md_integrity_register(struct mddev *mddev)
 	blk_integrity_register(mddev->gendisk,
 			       bdev_get_integrity(reference->bdev));
-	printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
+	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
 	if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
-		printk(KERN_ERR "md: failed to create integrity pool for %s\n",
+		pr_err("md: failed to create integrity pool for %s\n",
 		       mdname(mddev));
 		return -EINVAL;
 	}
@@ -2034,7 +2070,7 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
 		return 0;

 	if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
-		printk(KERN_NOTICE "%s: incompatible integrity profile for %s\n",
+		pr_err("%s: incompatible integrity profile for %s\n",
 		       mdname(mddev), bdevname(rdev->bdev, name));
 		return -ENXIO;
 	}
@@ -2089,7 +2125,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	rcu_read_unlock();
 	if (!test_bit(Journal, &rdev->flags) &&
 	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
-		printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
-		       mdname(mddev), mddev->max_disks);
+		pr_warn("md: %s: array is limited to %d devices\n",
+			mdname(mddev), mddev->max_disks);
 		return -EBUSY;
 	}
@@ -2097,7 +2133,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	strreplace(b, '/', '!');

 	rdev->mddev = mddev;
-	printk(KERN_INFO "md: bind<%s>\n", b);
+	pr_debug("md: bind<%s>\n", b);

 	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
 		goto fail;
@@ -2116,7 +2152,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	return 0;

  fail:
-	printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
-	       b, mdname(mddev));
+	pr_warn("md: failed to register dev-%s for %s\n",
+		b, mdname(mddev));
 	return err;
 }
@@ -2134,7 +2170,7 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
 	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
 	list_del_rcu(&rdev->same_set);
-	printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
+	pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
 	rdev->mddev = NULL;
 	sysfs_remove_link(&rdev->kobj, "block");
 	sysfs_put(rdev->sysfs_state);
@@ -2164,8 +2200,7 @@ static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
 	bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
 				 shared ? (struct md_rdev *)lock_rdev : rdev);
 	if (IS_ERR(bdev)) {
-		printk(KERN_ERR "md: could not open %s.\n",
-			__bdevname(dev, b));
+		pr_warn("md: could not open %s.\n", __bdevname(dev, b));
 		return PTR_ERR(bdev);
 	}
 	rdev->bdev = bdev;
@@ -2185,8 +2220,7 @@ static void export_rdev(struct md_rdev *rdev)
 {
 	char b[BDEVNAME_SIZE];

-	printk(KERN_INFO "md: export_rdev(%s)\n",
-		bdevname(rdev->bdev,b));
+	pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
 	md_rdev_clear(rdev);
 #ifndef MODULE
 	if (test_bit(AutoDetected, &rdev->flags))
@@ -2288,24 +2322,24 @@ void md_update_sb(struct mddev *mddev, int force_change)

 	if (mddev->ro) {
 		if (force_change)
-			set_bit(MD_CHANGE_DEVS, &mddev->flags);
+			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		return;
 	}

 repeat:
 	if (mddev_is_clustered(mddev)) {
-		if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
 			force_change = 1;
-		if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
 			nospares = 1;
 		ret = md_cluster_ops->metadata_update_start(mddev);
 		/* Has someone else has updated the sb */
 		if (!does_sb_need_changing(mddev)) {
 			if (ret == 0)
 				md_cluster_ops->metadata_update_cancel(mddev);
-			bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
-							 BIT(MD_CHANGE_DEVS) |
-							 BIT(MD_CHANGE_CLEAN));
+			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
+							 BIT(MD_SB_CHANGE_DEVS) |
+							 BIT(MD_SB_CHANGE_CLEAN));
 			return;
 		}
 	}
@@ -2321,10 +2355,10 @@ void md_update_sb(struct mddev *mddev, int force_change)
 	}
 	if (!mddev->persistent) {
-		clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
-		clear_bit(MD_CHANGE_DEVS, &mddev->flags);
+		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		if (!mddev->external) {
-			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
 			rdev_for_each(rdev, mddev) {
 				if (rdev->badblocks.changed) {
 					rdev->badblocks.changed = 0;
@@ -2344,9 +2378,9 @@ void md_update_sb(struct mddev *mddev, int force_change)
 	mddev->utime = ktime_get_real_seconds();

-	if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
 		force_change = 1;
-	if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
+	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
 		/* just a clean<-> dirty transition, possibly leave spares alone,
 		 * though if events isn't the right even/odd, we will have to do
 		 * spares after all
@@ -2402,6 +2436,9 @@ void md_update_sb(struct mddev *mddev, int force_change)
 	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
 		 mdname(mddev), mddev->in_sync);

+	if (mddev->queue)
+		blk_add_trace_msg(mddev->queue, "md md_update_sb");
+rewrite:
 	bitmap_update_sb(mddev->bitmap);
 	rdev_for_each(rdev, mddev) {
 		char b[BDEVNAME_SIZE];
@@ -2433,15 +2470,16 @@ void md_update_sb(struct mddev *mddev, int force_change)
 			/* only need to write one superblock... */
 			break;
 	}
-	md_super_wait(mddev);
-	/* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
+	if (md_super_wait(mddev) < 0)
+		goto rewrite;
+	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */

 	if (mddev_is_clustered(mddev) && ret == 0)
 		md_cluster_ops->metadata_update_finish(mddev);

 	if (mddev->in_sync != sync_req ||
-	    !bit_clear_unless(&mddev->flags, BIT(MD_CHANGE_PENDING),
-			       BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_CLEAN)))
+	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
+			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
 		/* have to write it out again */
 		goto repeat;
 	wake_up(&mddev->sb_wait);
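md_update_sb() leans on bit_clear_unless() for the final handshake: MD_SB_CHANGE_PENDING may only drop if no new DEVS or CLEAN change arrived while the superblocks were being written; otherwise the function loops back to repeat. An illustrative restatement of that helper's semantics (the real definition is a macro in include/linux/bitops.h; this named function is a sketch):

	/* Atomically clear the 'clear' bits, but only while none of the
	 * 'unless' bits are set; return true if the clear took effect.
	 */
	static bool demo_bit_clear_unless(unsigned long *ptr, unsigned long clear,
					  unsigned long unless)
	{
		unsigned long old, new;

		do {
			old = READ_ONCE(*ptr);
			new = old & ~clear;
		} while (!(old & unless) && cmpxchg(ptr, old, new) != old);

		return !(old & unless);
	}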
@@ -2485,7 +2523,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
 	}
 	sysfs_notify_dirent_safe(rdev->sysfs_state);

-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	if (mddev->degraded)
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -2523,51 +2561,41 @@ struct rdev_sysfs_entry {
 static ssize_t
 state_show(struct md_rdev *rdev, char *page)
 {
-	char *sep = "";
+	char *sep = ",";
 	size_t len = 0;
 	unsigned long flags = ACCESS_ONCE(rdev->flags);

-	if (test_bit(Faulty, &flags) ||
-	    rdev->badblocks.unacked_exist) {
-		len += sprintf(page+len, "%sfaulty", sep);
-		sep = ",";
-	}
-	if (test_bit(In_sync, &flags)) {
-		len += sprintf(page+len, "%sin_sync", sep);
-		sep = ",";
-	}
-	if (test_bit(Journal, &flags)) {
-		len += sprintf(page+len, "%sjournal", sep);
-		sep = ",";
-	}
-	if (test_bit(WriteMostly, &flags)) {
-		len += sprintf(page+len, "%swrite_mostly", sep);
-		sep = ",";
-	}
+	if (test_bit(Faulty, &flags) ||
+	    (!test_bit(ExternalBbl, &flags) &&
+	    rdev->badblocks.unacked_exist))
+		len += sprintf(page+len, "faulty%s", sep);
+	if (test_bit(In_sync, &flags))
+		len += sprintf(page+len, "in_sync%s", sep);
+	if (test_bit(Journal, &flags))
+		len += sprintf(page+len, "journal%s", sep);
+	if (test_bit(WriteMostly, &flags))
+		len += sprintf(page+len, "write_mostly%s", sep);
 	if (test_bit(Blocked, &flags) ||
 	    (rdev->badblocks.unacked_exist
-	     && !test_bit(Faulty, &flags))) {
-		len += sprintf(page+len, "%sblocked", sep);
-		sep = ",";
-	}
+	     && !test_bit(Faulty, &flags)))
+		len += sprintf(page+len, "blocked%s", sep);
 	if (!test_bit(Faulty, &flags) &&
 	    !test_bit(Journal, &flags) &&
-	    !test_bit(In_sync, &flags)) {
-		len += sprintf(page+len, "%sspare", sep);
-		sep = ",";
-	}
-	if (test_bit(WriteErrorSeen, &flags)) {
-		len += sprintf(page+len, "%swrite_error", sep);
-		sep = ",";
-	}
-	if (test_bit(WantReplacement, &flags)) {
-		len += sprintf(page+len, "%swant_replacement", sep);
-		sep = ",";
-	}
-	if (test_bit(Replacement, &flags)) {
-		len += sprintf(page+len, "%sreplacement", sep);
-		sep = ",";
-	}
+	    !test_bit(In_sync, &flags))
+		len += sprintf(page+len, "spare%s", sep);
+	if (test_bit(WriteErrorSeen, &flags))
+		len += sprintf(page+len, "write_error%s", sep);
+	if (test_bit(WantReplacement, &flags))
+		len += sprintf(page+len, "want_replacement%s", sep);
+	if (test_bit(Replacement, &flags))
+		len += sprintf(page+len, "replacement%s", sep);
+	if (test_bit(ExternalBbl, &flags))
+		len += sprintf(page+len, "external_bbl%s", sep);
+	if (test_bit(FailFast, &flags))
+		len += sprintf(page+len, "failfast%s", sep);
+
+	if (len)
+		len -= strlen(sep);

 	return len + sprintf(page+len, "\n");
 }
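The rewritten state_show() drops the per-flag `sep = ","` bookkeeping: every flag name is printed with a trailing separator, and one `len -= strlen(sep)` strips the last comma. The idiom in isolation, as a fragment with placeholder flag names:

	const char *sep = ",";
	size_t len = 0;

	if (flag_a)
		len += sprintf(page + len, "flag_a%s", sep);
	if (flag_b)
		len += sprintf(page + len, "flag_b%s", sep);
	if (len)
		len -= strlen(sep);	/* drop the trailing comma */
	return len + sprintf(page + len, "\n");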
@@ -2587,6 +2615,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
...
@@ -2587,6 +2615,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* so that it gets rebuilt based on bitmap
* so that it gets rebuilt based on bitmap
* write_error - sets WriteErrorSeen
* write_error - sets WriteErrorSeen
* -write_error - clears WriteErrorSeen
* -write_error - clears WriteErrorSeen
* {,-}failfast - set/clear FailFast
*/
*/
int
err
=
-
EINVAL
;
int
err
=
-
EINVAL
;
if
(
cmd_match
(
buf
,
"faulty"
)
&&
rdev
->
mddev
->
pers
)
{
if
(
cmd_match
(
buf
,
"faulty"
)
&&
rdev
->
mddev
->
pers
)
{
...
@@ -2610,8 +2639,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
...
@@ -2610,8 +2639,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
if
(
err
==
0
)
{
if
(
err
==
0
)
{
md_kick_rdev_from_array
(
rdev
);
md_kick_rdev_from_array
(
rdev
);
if
(
mddev
->
pers
)
if
(
mddev
->
pers
)
{
md_update_sb
(
mddev
,
1
);
set_bit
(
MD_SB_CHANGE_DEVS
,
&
mddev
->
sb_flags
);
md_wakeup_thread
(
mddev
->
thread
);
}
md_new_event
(
mddev
);
md_new_event
(
mddev
);
}
}
}
}
...
@@ -2626,6 +2657,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
...
@@ -2626,6 +2657,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err
=
0
;
err
=
0
;
}
else
if
(
cmd_match
(
buf
,
"-blocked"
))
{
}
else
if
(
cmd_match
(
buf
,
"-blocked"
))
{
if
(
!
test_bit
(
Faulty
,
&
rdev
->
flags
)
&&
if
(
!
test_bit
(
Faulty
,
&
rdev
->
flags
)
&&
!
test_bit
(
ExternalBbl
,
&
rdev
->
flags
)
&&
rdev
->
badblocks
.
unacked_exist
)
{
rdev
->
badblocks
.
unacked_exist
)
{
/* metadata handler doesn't understand badblocks,
/* metadata handler doesn't understand badblocks,
* so we need to fail the device
* so we need to fail the device
...
@@ -2642,6 +2674,12 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
...
@@ -2642,6 +2674,12 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
}
else
if
(
cmd_match
(
buf
,
"insync"
)
&&
rdev
->
raid_disk
==
-
1
)
{
}
else
if
(
cmd_match
(
buf
,
"insync"
)
&&
rdev
->
raid_disk
==
-
1
)
{
set_bit
(
In_sync
,
&
rdev
->
flags
);
set_bit
(
In_sync
,
&
rdev
->
flags
);
err
=
0
;
err
=
0
;
}
else
if
(
cmd_match
(
buf
,
"failfast"
))
{
set_bit
(
FailFast
,
&
rdev
->
flags
);
err
=
0
;
}
else
if
(
cmd_match
(
buf
,
"-failfast"
))
{
clear_bit
(
FailFast
,
&
rdev
->
flags
);
err
=
0
;
}
else
if
(
cmd_match
(
buf
,
"-insync"
)
&&
rdev
->
raid_disk
>=
0
&&
}
else
if
(
cmd_match
(
buf
,
"-insync"
)
&&
rdev
->
raid_disk
>=
0
&&
!
test_bit
(
Journal
,
&
rdev
->
flags
))
{
!
test_bit
(
Journal
,
&
rdev
->
flags
))
{
if
(
rdev
->
mddev
->
pers
==
NULL
)
{
if
(
rdev
->
mddev
->
pers
==
NULL
)
{
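With the new handlers, failfast can be toggled per member device through the same sysfs state file whose keywords are documented above. A hedged userspace sketch; the array and component names in the path are examples, not a fixed layout:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Toggle FailFast on one array member by writing the keyword to its
 * sysfs state file; writing "-failfast" clears the bit again. */
int main(void)
{
	const char *path = "/sys/block/md0/md/dev-sda1/state";
	const char *cmd = "failfast";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) < 0)
		perror("write");
	close(fd);
	return 0;
}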
...
@@ -2708,6 +2746,13 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			}
 		} else
 			err = -EBUSY;
+	} else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
+		set_bit(ExternalBbl, &rdev->flags);
+		rdev->badblocks.shift = 0;
+		err = 0;
+	} else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
+		clear_bit(ExternalBbl, &rdev->flags);
+		err = 0;
 	}
 	if (!err)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
...
@@ -3211,10 +3256,8 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
 	sector_t size;

 	rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
-	if (!rdev) {
-		printk(KERN_ERR "md: could not alloc mem for new device!\n");
+	if (!rdev)
 		return ERR_PTR(-ENOMEM);
-	}
 	err = md_rdev_init(rdev);
 	if (err)
...
@@ -3231,8 +3274,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
 	size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
 	if (!size) {
-		printk(KERN_WARNING
-		       "md: %s has zero or unknown size, marking faulty!\n",
+		pr_warn("md: %s has zero or unknown size, marking faulty!\n",
 			bdevname(rdev->bdev,b));
 		err = -EINVAL;
 		goto abort_free;
...
@@ -3242,16 +3284,13 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
 		err = super_types[super_format].
 			load_super(rdev, NULL, super_minor);
 		if (err == -EINVAL) {
-			printk(KERN_WARNING
-				"md: %s does not have a valid v%d.%d "
-				"superblock, not importing!\n",
+			pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
 				bdevname(rdev->bdev,b),
 				super_format, super_minor);
 			goto abort_free;
 		}
 		if (err < 0) {
-			printk(KERN_WARNING
-				"md: could not read %s's sb, not importing!\n",
+			pr_warn("md: could not read %s's sb, not importing!\n",
 				bdevname(rdev->bdev,b));
 			goto abort_free;
 		}
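The bulk of this file's churn is this mechanical printk(KERN_WARNING ...) to pr_warn(...) conversion, which also rejoins format strings that had been split across source lines so the messages can be grepped. A module-style sketch of the idiom; the pr_fmt prefix shown is illustrative (md.c spells out its own "md: " prefixes in each string):

// SPDX-License-Identifier: GPL-2.0
/* pr_warn("...") expands to printk(KERN_WARNING pr_fmt("...")), so a
 * file-local pr_fmt gives every message a consistent prefix. */
#define pr_fmt(fmt) "demo: " fmt

#include <linux/module.h>
#include <linux/printk.h>

static int __init demo_init(void)
{
	pr_warn("%s has zero or unknown size, marking faulty!\n", "sdX");
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");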
...
@@ -3287,9 +3326,7 @@ static void analyze_sbs(struct mddev *mddev)
 		case 0:
 			break;
 		default:
-			printk(KERN_ERR \
-				"md: fatal superblock inconsistency in %s"
-				" -- removing from array\n",
+			pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
 				bdevname(rdev->bdev,b));
 			md_kick_rdev_from_array(rdev);
 		}
...
@@ -3302,8 +3339,7 @@ static void analyze_sbs(struct mddev *mddev)
 		if (mddev->max_disks &&
 		    (rdev->desc_nr >= mddev->max_disks ||
 		     i > mddev->max_disks)) {
-			printk(KERN_WARNING
-			       "md: %s: %s: only %d devices permitted\n",
+			pr_warn("md: %s: %s: only %d devices permitted\n",
 			       mdname(mddev), bdevname(rdev->bdev, b),
 			       mddev->max_disks);
 			md_kick_rdev_from_array(rdev);
...
@@ -3312,8 +3348,7 @@ static void analyze_sbs(struct mddev *mddev)
 		if (rdev != freshest) {
 			if (super_types[mddev->major_version].
 			    validate_super(mddev, rdev)) {
-				printk(KERN_WARNING "md: kicking non-fresh %s"
-					" from array!\n",
+				pr_warn("md: kicking non-fresh %s from array!\n",
 					bdevname(rdev->bdev,b));
 				md_kick_rdev_from_array(rdev);
 				continue;
...
@@ -3384,7 +3419,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
 	unsigned long msec;

 	if (mddev_is_clustered(mddev)) {
-		pr_info("md: Safemode is disabled for clustered mode\n");
+		pr_warn("md: Safemode is disabled for clustered mode\n");
 		return -EINVAL;
 	}
...
@@ -3472,7 +3507,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 	rv = -EINVAL;
 	if (!mddev->pers->quiesce) {
-		printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
+		pr_warn("md: %s: %s does not support online personality change\n",
 		       mdname(mddev), mddev->pers->name);
 		goto out_unlock;
 	}
...
@@ -3491,7 +3526,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 	pers = find_pers(level, clevel);
 	if (!pers || !try_module_get(pers->owner)) {
 		spin_unlock(&pers_lock);
-		printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
+		pr_warn("md: personality %s not loaded\n", clevel);
 		rv = -EINVAL;
 		goto out_unlock;
 	}
...
@@ -3505,7 +3540,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 	}
 	if (!pers->takeover) {
 		module_put(pers->owner);
-		printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
+		pr_warn("md: %s: %s does not support personality takeover\n",
 		       mdname(mddev), clevel);
 		rv = -EINVAL;
 		goto out_unlock;
...
@@ -3526,7 +3561,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 		mddev->delta_disks = 0;
 		mddev->reshape_backwards = 0;
 		module_put(pers->owner);
-		printk(KERN_WARNING "md: %s: %s would not accept array\n",
+		pr_warn("md: %s: %s would not accept array\n",
 		       mdname(mddev), clevel);
 		rv = PTR_ERR(priv);
 		goto out_unlock;
...
@@ -3570,8 +3605,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 	    pers->sync_request != NULL) {
 		/* need to add the md_redundancy_group */
 		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
-			printk(KERN_WARNING
-			       "md: cannot register extra attributes for %s\n",
+			pr_warn("md: cannot register extra attributes for %s\n",
 				mdname(mddev));
 		mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
 	}
...
@@ -3603,8 +3637,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 			clear_bit(In_sync, &rdev->flags);
 		else {
 			if (sysfs_link_rdev(mddev, rdev))
-				printk(KERN_WARNING "md: cannot register rd%d"
-				       " for %s after level change\n",
+				pr_warn("md: cannot register rd%d for %s after level change\n",
 					rdev->raid_disk, mdname(mddev));
 		}
 	}
...
@@ -3618,7 +3651,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
 	}
 	blk_set_stacking_limits(&mddev->queue->limits);
 	pers->run(mddev);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	mddev_resume(mddev);
 	if (!mddev->thread)
 		md_update_sb(mddev, 1);
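From this hunk on, MD_CHANGE_{DEVS,CLEAN,PENDING} become MD_SB_CHANGE_* and move from the shared mddev->flags word into a dedicated mddev->sb_flags word, so superblock-update state no longer shares a bitmap with unrelated flags, and "any superblock work pending?" becomes a plain non-zero test (visible in the md_run() hunk for @@ -5350 below). A reduced standalone model of the split, with the struct cut down to the two words:

#include <stdio.h>

/* Reduced model: generic run-time flags keep their own word, while the
 * "superblock needs rewriting" bits get a word of their own. */
enum mddev_sb_flags {
	MD_SB_CHANGE_DEVS,	/* some device status changed */
	MD_SB_CHANGE_CLEAN,	/* transition to/from 'clean' */
	MD_SB_CHANGE_PENDING,	/* switch from 'clean' to 'active' in progress */
};

struct mddev_model {
	unsigned long flags;	/* MD_CLOSING and friends live here */
	unsigned long sb_flags;	/* MD_SB_CHANGE_* only */
};

int main(void)
{
	struct mddev_model m = { 0, 0 };

	m.sb_flags |= 1UL << MD_SB_CHANGE_DEVS;
	/* the old "flags & MD_UPDATE_SB_FLAGS" mask test becomes: */
	if (m.sb_flags)
		printf("superblock update needed\n");
	return 0;
}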
...
@@ -3813,7 +3846,7 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len)
 	if (!err) {
 		mddev->recovery_cp = n;
 		if (mddev->pers)
-			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 	}
 	mddev_unlock(mddev);
 	return err ?: len;
...
@@ -3887,7 +3920,7 @@ array_state_show(struct mddev *mddev, char *page)
 			st = read_auto;
 		break;
 	case 0:
-		if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
+		if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 			st = write_pending;
 		else if (mddev->in_sync)
 			st = clean;
...
@@ -3925,7 +3958,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 		spin_lock(&mddev->lock);
 		if (st == active) {
 			restart_array(mddev);
-			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
+			md_wakeup_thread(mddev->thread);
 			wake_up(&mddev->sb_wait);
 			err = 0;
 		} else /* st == clean */ {
...
@@ -3935,7 +3969,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 				mddev->in_sync = 1;
 				if (mddev->safemode == 1)
 					mddev->safemode = 0;
-				set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+				set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 			}
 			err = 0;
 		} else
...
@@ -4001,7 +4035,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 			mddev->in_sync = 1;
 			if (mddev->safemode == 1)
 				mddev->safemode = 0;
-			set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 		}
 		err = 0;
 	} else
...
@@ -4015,7 +4049,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 		err = restart_array(mddev);
 		if (err)
 			break;
-		clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+		clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
 		wake_up(&mddev->sb_wait);
 		err = 0;
 	} else {
...
@@ -5071,13 +5105,13 @@ static int md_alloc(dev_t dev, char *name)
 		/* This isn't possible, but as kobject_init_and_add is marked
 		 * __must_check, we must do something with the result
 		 */
-		printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
-		       disk->disk_name);
+		pr_debug("md: cannot register %s/md - name in use\n",
+			 disk->disk_name);
 		error = 0;
 	}
 	if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
-		printk(KERN_DEBUG "pointless warning\n");
+		pr_debug("pointless warning\n");
 	mutex_unlock(&mddev->open_mutex);
 abort:
 	mutex_unlock(&disks_mutex);
...
@@ -5179,14 +5213,14 @@ int md_run(struct mddev *mddev)
 			if (mddev->dev_sectors &&
 			    rdev->data_offset + mddev->dev_sectors
 			    > rdev->sb_start) {
-				printk("md: %s: data overlaps metadata\n",
-				       mdname(mddev));
+				pr_warn("md: %s: data overlaps metadata\n",
+					mdname(mddev));
 				return -EINVAL;
 			}
 		} else {
 			if (rdev->sb_start + rdev->sb_size/512
 			    > rdev->data_offset) {
-				printk("md: %s: metadata overlaps data\n",
-				       mdname(mddev));
+				pr_warn("md: %s: metadata overlaps data\n",
+					mdname(mddev));
 				return -EINVAL;
 			}
...
@@ -5202,10 +5236,10 @@ int md_run(struct mddev *mddev)
 	if (!pers || !try_module_get(pers->owner)) {
 		spin_unlock(&pers_lock);
 		if (mddev->level != LEVEL_NONE)
-			printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
-			       mddev->level);
+			pr_warn("md: personality for level %d is not loaded!\n",
+				mddev->level);
 		else
-			printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
-			       mddev->clevel);
+			pr_warn("md: personality for level %s is not loaded!\n",
+				mddev->clevel);
 		return -EINVAL;
 	}
...
@@ -5236,10 +5270,7 @@ int md_run(struct mddev *mddev)
 				if (rdev < rdev2 &&
 				    rdev->bdev->bd_contains ==
 				    rdev2->bdev->bd_contains) {
-					printk(KERN_WARNING
-					       "%s: WARNING: %s appears to be"
-					       " on the same physical disk as"
-					       " %s.\n",
+					pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
 						mdname(mddev),
 						bdevname(rdev->bdev,b),
 						bdevname(rdev2->bdev,b2));
...
@@ -5248,9 +5279,7 @@ int md_run(struct mddev *mddev)
 		}
 		if (warned)
-			printk(KERN_WARNING
-			       "True protection against single-disk"
-			       " failure might be compromised.\n");
+			pr_warn("True protection against single-disk failure might be compromised.\n");
 	}

 	mddev->recovery = 0;
...
@@ -5264,12 +5293,12 @@ int md_run(struct mddev *mddev)
 	err = pers->run(mddev);
 	if (err)
-		printk(KERN_ERR "md: pers->run() failed ...\n");
+		pr_warn("md: pers->run() failed ...\n");
 	else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
-		WARN_ONCE(!mddev->external_size, "%s: default size too small,"
-			  " but 'external_size' not in effect?\n", __func__);
-		printk(KERN_ERR
-		       "md: invalid array_size %llu > default size %llu\n",
+		WARN_ONCE(!mddev->external_size,
+			  "%s: default size too small, but 'external_size' not in effect?\n",
+			  __func__);
+		pr_warn("md: invalid array_size %llu > default size %llu\n",
 			(unsigned long long)mddev->array_sectors / 2,
 			(unsigned long long)pers->size(mddev, 0, 0) / 2);
 		err = -EINVAL;
...
@@ -5281,7 +5310,7 @@ int md_run(struct mddev *mddev)
 		bitmap = bitmap_create(mddev, -1);
 		if (IS_ERR(bitmap)) {
 			err = PTR_ERR(bitmap);
-			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
-			       mdname(mddev), err);
+			pr_warn("%s: failed to create bitmap (%d)\n",
+				mdname(mddev), err);
 		} else
 			mddev->bitmap = bitmap;
...
@@ -5318,8 +5347,7 @@ int md_run(struct mddev *mddev)
 	if (pers->sync_request) {
 		if (mddev->kobj.sd &&
 		    sysfs_create_group(&mddev->kobj, &md_redundancy_group))
-			printk(KERN_WARNING
-			       "md: cannot register extra attributes for %s\n",
+			pr_warn("md: cannot register extra attributes for %s\n",
 				mdname(mddev));
 		mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
 	} else if (mddev->ro == 2) /* auto-readonly not meaningful */
...
@@ -5350,7 +5378,7 @@ int md_run(struct mddev *mddev)
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);

-	if (mddev->flags & MD_UPDATE_SB_FLAGS)
+	if (mddev->sb_flags)
 		md_update_sb(mddev, 0);

 	md_new_event(mddev);
...
@@ -5421,8 +5449,7 @@ static int restart_array(struct mddev *mddev)
 	mddev->safemode = 0;
 	mddev->ro = 0;
 	set_disk_ro(disk, 0);
-	printk(KERN_INFO "md: %s switched to read-write mode.\n",
-	       mdname(mddev));
+	pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
 	/* Kick recovery or resync if necessary */
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
...
@@ -5446,6 +5473,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->level = LEVEL_NONE;
 	mddev->clevel[0] = 0;
 	mddev->flags = 0;
+	mddev->sb_flags = 0;
 	mddev->ro = 0;
 	mddev->metadata_type[0] = 0;
 	mddev->chunk_sectors = 0;
...
@@ -5490,12 +5518,15 @@ static void __md_stop_writes(struct mddev *mddev)
 	del_timer_sync(&mddev->safemode_timer);

+	if (mddev->pers && mddev->pers->quiesce) {
+		mddev->pers->quiesce(mddev, 1);
+		mddev->pers->quiesce(mddev, 0);
+	}
 	bitmap_flush(mddev);
 	md_super_wait(mddev);

 	if (mddev->ro == 0 &&
 	    ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
-	     (mddev->flags & MD_UPDATE_SB_FLAGS))) {
+	     mddev->sb_flags)) {
 		/* mark array as shutdown cleanly */
 		if (!mddev_is_clustered(mddev))
 			mddev->in_sync = 1;
...
@@ -5516,7 +5547,7 @@ static void mddev_detach(struct mddev *mddev)
 	struct bitmap *bitmap = mddev->bitmap;
 	/* wait for behind writes to complete */
 	if (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
-		printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n",
-		       mdname(mddev));
+		pr_debug("md:%s: behind writes in progress - waiting to stop.\n",
+			 mdname(mddev));
 		/* need to kick something here to make sure I/O goes? */
 		wait_event(bitmap->behind_wait,
...
@@ -5578,20 +5609,20 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
 		 * which will now never happen */
 		wake_up_process(mddev->sync_thread->tsk);

-	if (mddev->external && test_bit(MD_CHANGE_PENDING, &mddev->flags))
+	if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 		return -EBUSY;
 	mddev_unlock(mddev);
 	wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
 					  &mddev->recovery));
 	wait_event(mddev->sb_wait,
-		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 	mddev_lock_nointr(mddev);

 	mutex_lock(&mddev->open_mutex);
 	if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
 	    mddev->sync_thread ||
 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
-		printk("md: %s still in use.\n",mdname(mddev));
+		pr_warn("md: %s still in use.\n",mdname(mddev));
 		if (did_freeze) {
 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
...
@@ -5653,7 +5684,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
 	    mddev->sysfs_active ||
 	    mddev->sync_thread ||
 	    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
-		printk("md: %s still in use.\n",mdname(mddev));
+		pr_warn("md: %s still in use.\n",mdname(mddev));
 		mutex_unlock(&mddev->open_mutex);
 		if (did_freeze) {
 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
...
@@ -5690,7 +5721,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
 	 * Free resources if final stop
 	 */
 	if (mode == 0) {
-		printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
+		pr_info("md: %s stopped.\n", mdname(mddev));

 		bitmap_destroy(mddev);
 		if (mddev->bitmap_info.file) {
@@ -5722,17 +5753,17 @@ static void autorun_array(struct mddev *mddev)
...
@@ -5722,17 +5753,17 @@ static void autorun_array(struct mddev *mddev)
if
(
list_empty
(
&
mddev
->
disks
))
if
(
list_empty
(
&
mddev
->
disks
))
return
;
return
;
pr
intk
(
KERN_INFO
"md: running: "
);
pr
_info
(
"md: running: "
);
rdev_for_each
(
rdev
,
mddev
)
{
rdev_for_each
(
rdev
,
mddev
)
{
char
b
[
BDEVNAME_SIZE
];
char
b
[
BDEVNAME_SIZE
];
pr
intk
(
"<%s>"
,
bdevname
(
rdev
->
bdev
,
b
));
pr
_cont
(
"<%s>"
,
bdevname
(
rdev
->
bdev
,
b
));
}
}
pr
intk
(
"
\n
"
);
pr
_cont
(
"
\n
"
);
err
=
do_md_run
(
mddev
);
err
=
do_md_run
(
mddev
);
if
(
err
)
{
if
(
err
)
{
pr
intk
(
KERN_WARNING
"md: do_md_run() returned %d
\n
"
,
err
);
pr
_warn
(
"md: do_md_run() returned %d
\n
"
,
err
);
do_md_stop
(
mddev
,
0
,
NULL
);
do_md_stop
(
mddev
,
0
,
NULL
);
}
}
}
}
...
@@ -5755,7 +5786,7 @@ static void autorun_devices(int part)
 	struct mddev *mddev;
 	char b[BDEVNAME_SIZE];

-	printk(KERN_INFO "md: autorun ...\n");
+	pr_info("md: autorun ...\n");
 	while (!list_empty(&pending_raid_disks)) {
 		int unit;
 		dev_t dev;
...
@@ -5763,12 +5794,11 @@ static void autorun_devices(int part)
 		rdev0 = list_entry(pending_raid_disks.next,
 					 struct md_rdev, same_set);

-		printk(KERN_INFO "md: considering %s ...\n",
-		       bdevname(rdev0->bdev,b));
+		pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
 		INIT_LIST_HEAD(&candidates);
 		rdev_for_each_list(rdev, tmp, &pending_raid_disks)
 			if (super_90_load(rdev, rdev0, 0) >= 0) {
-				printk(KERN_INFO "md: adding %s ...\n",
-				       bdevname(rdev->bdev,b));
+				pr_debug("md: adding %s ...\n",
+					 bdevname(rdev->bdev,b));
 				list_move(&rdev->same_set, &candidates);
 			}
...
@@ -5786,7 +5816,7 @@ static void autorun_devices(int part)
 			unit = MINOR(dev);
 		}
 		if (rdev0->preferred_minor != unit) {
-			printk(KERN_INFO "md: unit number in %s is bad: %d\n",
-			       bdevname(rdev0->bdev, b), rdev0->preferred_minor);
+			pr_warn("md: unit number in %s is bad: %d\n",
+				bdevname(rdev0->bdev, b), rdev0->preferred_minor);
 			break;
 		}
...
@@ -5796,21 +5826,17 @@ static void autorun_devices(int part)
 		if (!mddev || !mddev->gendisk) {
 			if (mddev)
 				mddev_put(mddev);
-			printk(KERN_ERR
-				"md: cannot allocate memory for md drive.\n");
 			break;
 		}
 		if (mddev_lock(mddev))
-			printk(KERN_WARNING "md: %s locked, cannot run\n",
-			       mdname(mddev));
+			pr_warn("md: %s locked, cannot run\n", mdname(mddev));
 		else if (mddev->raid_disks || mddev->major_version
 			 || !list_empty(&mddev->disks)) {
-			printk(KERN_WARNING
-				"md: %s already running, cannot run %s\n",
+			pr_warn("md: %s already running, cannot run %s\n",
 				mdname(mddev), bdevname(rdev0->bdev,b));
 			mddev_unlock(mddev);
 		} else {
-			printk(KERN_INFO "md: created %s\n", mdname(mddev));
+			pr_debug("md: created %s\n", mdname(mddev));
 			mddev->persistent = 1;
 			rdev_for_each_list(rdev, tmp, &candidates) {
 				list_del_init(&rdev->same_set);
...
@@ -5829,7 +5855,7 @@ static void autorun_devices(int part)
 		}
 		mddev_put(mddev);
 	}
-	printk(KERN_INFO "md: ... autorun DONE.\n");
+	pr_info("md: ... autorun DONE.\n");
 }
 #endif /* !MODULE */
...
@@ -5964,6 +5990,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
 			info.state |= (1<<MD_DISK_JOURNAL);
 		if (test_bit(WriteMostly, &rdev->flags))
 			info.state |= (1<<MD_DISK_WRITEMOSTLY);
+		if (test_bit(FailFast, &rdev->flags))
+			info.state |= (1<<MD_DISK_FAILFAST);
 	} else {
 		info.major = info.minor = 0;
 		info.raid_disk = -1;
...
@@ -5985,7 +6013,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 	if (mddev_is_clustered(mddev) &&
 	    !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
-		pr_err("%s: Cannot add to clustered mddev.\n",
-		       mdname(mddev));
+		pr_warn("%s: Cannot add to clustered mddev.\n",
+			mdname(mddev));
 		return -EINVAL;
 	}
...
@@ -5998,8 +6026,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		/* expecting a device which has a superblock */
 		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
 		if (IS_ERR(rdev)) {
-			printk(KERN_WARNING
-				"md: md_import_device returned %ld\n",
+			pr_warn("md: md_import_device returned %ld\n",
 				PTR_ERR(rdev));
 			return PTR_ERR(rdev);
 		}
...
@@ -6010,8 +6037,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 			err = super_types[mddev->major_version]
 				.load_super(rdev, rdev0, mddev->minor_version);
 			if (err < 0) {
-				printk(KERN_WARNING
-					"md: %s has different UUID to %s\n",
+				pr_warn("md: %s has different UUID to %s\n",
 					bdevname(rdev->bdev,b),
 					bdevname(rdev0->bdev,b2));
 				export_rdev(rdev);
...
@@ -6032,8 +6058,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 	if (mddev->pers) {
 		int err;
 		if (!mddev->pers->hot_add_disk) {
-			printk(KERN_WARNING
-				"%s: personality does not support diskops!\n",
+			pr_warn("%s: personality does not support diskops!\n",
 				mdname(mddev));
 			return -EINVAL;
 		}
...
@@ -6043,8 +6068,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		else
 			rdev = md_import_device(dev, -1, -1);
 		if (IS_ERR(rdev)) {
-			printk(KERN_WARNING
-				"md: md_import_device returned %ld\n",
+			pr_warn("md: md_import_device returned %ld\n",
 				PTR_ERR(rdev));
 			return PTR_ERR(rdev);
 		}
...
@@ -6075,6 +6099,10 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 			set_bit(WriteMostly, &rdev->flags);
 		else
 			clear_bit(WriteMostly, &rdev->flags);
+		if (info->state & (1 << MD_DISK_FAILFAST))
+			set_bit(FailFast, &rdev->flags);
+		else
+			clear_bit(FailFast, &rdev->flags);

 		if (info->state & (1<<MD_DISK_JOURNAL)) {
 			struct md_rdev *rdev2;
...
@@ -6140,8 +6168,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 	 * for major_version==0 superblocks
 	 */
 	if (mddev->major_version != 0) {
-		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
-		       mdname(mddev));
+		pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
 		return -EINVAL;
 	}
...
@@ -6149,8 +6176,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		int err;
 		rdev = md_import_device(dev, -1, 0);
 		if (IS_ERR(rdev)) {
-			printk(KERN_WARNING
-				"md: error, md_import_device() returned %ld\n",
+			pr_warn("md: error, md_import_device() returned %ld\n",
 				PTR_ERR(rdev));
 			return PTR_ERR(rdev);
 		}
...
@@ -6166,9 +6192,11 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
 			set_bit(WriteMostly, &rdev->flags);
+		if (info->state & (1<<MD_DISK_FAILFAST))
+			set_bit(FailFast, &rdev->flags);

 		if (!mddev->persistent) {
-			printk(KERN_INFO "md: nonpersistent superblock ...\n");
+			pr_debug("md: nonpersistent superblock ...\n");
 			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
 		} else
 			rdev->sb_start = calc_dev_sboffset(rdev);
...
@@ -6207,12 +6235,16 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 		md_cluster_ops->remove_disk(mddev, rdev);

 	md_kick_rdev_from_array(rdev);
-	md_update_sb(mddev, 1);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+	if (mddev->thread)
+		md_wakeup_thread(mddev->thread);
+	else
+		md_update_sb(mddev, 1);
 	md_new_event(mddev);

 	return 0;
 busy:
-	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
-	       bdevname(rdev->bdev,b), mdname(mddev));
+	pr_debug("md: cannot remove active disk %s from %s ...\n",
+		 bdevname(rdev->bdev,b), mdname(mddev));
 	return -EBUSY;
 }
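hot_remove_disk() now defers the superblock write to the array's own thread when one exists instead of writing synchronously from the ioctl path; the same pattern appears in hot_add_disk() below. A userspace model of that hand-off, with a plain mutex and condition variable standing in for the kernel machinery:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wake = PTHREAD_COND_INITIALIZER;
static bool sb_change_devs;
static bool have_thread = true;	/* models mddev->thread != NULL */

static void write_super(void)
{
	printf("superblock written\n");
}

/* Stand-in for the per-array md thread doing the actual write. */
static void *md_thread_model(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!sb_change_devs)
		pthread_cond_wait(&wake, &lock);
	sb_change_devs = false;
	pthread_mutex_unlock(&lock);
	write_super();
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, md_thread_model, NULL);
	pthread_mutex_lock(&lock);
	sb_change_devs = true;		/* set_bit(MD_SB_CHANGE_DEVS, ...) */
	pthread_mutex_unlock(&lock);
	if (have_thread)
		pthread_cond_signal(&wake);	/* md_wakeup_thread() */
	else
		write_super();			/* md_update_sb(mddev, 1) */
	pthread_join(tid, NULL);
	return 0;
}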
...
@@ -6227,22 +6259,19 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 		return -ENODEV;

 	if (mddev->major_version != 0) {
-		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
-			" version-0 superblocks.\n",
+		pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
 			mdname(mddev));
 		return -EINVAL;
 	}
 	if (!mddev->pers->hot_add_disk) {
-		printk(KERN_WARNING
-			"%s: personality does not support diskops!\n",
+		pr_warn("%s: personality does not support diskops!\n",
 			mdname(mddev));
 		return -EINVAL;
 	}

 	rdev = md_import_device(dev, -1, 0);
 	if (IS_ERR(rdev)) {
-		printk(KERN_WARNING
-			"md: error, md_import_device() returned %ld\n",
+		pr_warn("md: error, md_import_device() returned %ld\n",
 			PTR_ERR(rdev));
 		return -EINVAL;
 	}
...
@@ -6255,8 +6284,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 		rdev->sectors = rdev->sb_start;

 	if (test_bit(Faulty, &rdev->flags)) {
-		printk(KERN_WARNING
-			"md: can not hot-add faulty %s disk to %s!\n",
+		pr_warn("md: can not hot-add faulty %s disk to %s!\n",
 			bdevname(rdev->bdev,b), mdname(mddev));
 		err = -EINVAL;
 		goto abort_export;
...
@@ -6276,6 +6304,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	rdev->raid_disk = -1;

-	md_update_sb(mddev, 1);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+	if (!mddev->thread)
+		md_update_sb(mddev, 1);
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
...
@@ -6312,22 +6342,22 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 	f = fget(fd);

 	if (f == NULL) {
-		printk(KERN_ERR "%s: error: failed to get bitmap file\n",
-		       mdname(mddev));
+		pr_warn("%s: error: failed to get bitmap file\n",
+			mdname(mddev));
 		return -EBADF;
 	}

 	inode = f->f_mapping->host;
 	if (!S_ISREG(inode->i_mode)) {
-		printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
-		       mdname(mddev));
+		pr_warn("%s: error: bitmap file must be a regular file\n",
+			mdname(mddev));
 		err = -EBADF;
 	} else if (!(f->f_mode & FMODE_WRITE)) {
-		printk(KERN_ERR "%s: error: bitmap file must open for write\n",
-		       mdname(mddev));
+		pr_warn("%s: error: bitmap file must open for write\n",
+			mdname(mddev));
 		err = -EBADF;
 	} else if (atomic_read(&inode->i_writecount) != 1) {
-		printk(KERN_ERR "%s: error: bitmap file is already in use\n",
-		       mdname(mddev));
+		pr_warn("%s: error: bitmap file is already in use\n",
+			mdname(mddev));
 		err = -EBUSY;
 	}
...
@@ -6393,8 +6423,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		    info->major_version >= ARRAY_SIZE(super_types) ||
 		    super_types[info->major_version].name == NULL) {
 			/* maybe try to auto-load a module? */
-			printk(KERN_INFO
-				"md: superblock version %d not known\n",
+			pr_warn("md: superblock version %d not known\n",
 				info->major_version);
 			return -EINVAL;
 		}
...
@@ -6432,9 +6461,11 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
 	mddev->max_disks = MD_SB_DISKS;

-	if (mddev->persistent)
+	if (mddev->persistent) {
 		mddev->flags = 0;
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		mddev->sb_flags = 0;
+	}
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

 	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
 	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
...
@@ -6660,8 +6691,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		if (mddev->bitmap_info.nodes) {
 			/* hold PW on all the bitmap lock */
 			if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
-				printk("md: can't change bitmap to none since the"
-				       " array is in use by more than one node\n");
+				pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
 				rv = -EPERM;
 				md_cluster_ops->unlock_all_bitmaps(mddev);
 				goto err;
...
@@ -6829,7 +6859,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		/* need to ensure recovery thread has run */
 		wait_event_interruptible_timeout(mddev->sb_wait,
 						 !test_bit(MD_RECOVERY_NEEDED,
-							   &mddev->flags),
+							   &mddev->recovery),
 						 msecs_to_jiffies(5000));
 	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
 		/* Need to flush page cache, and ensure no-one else opens
...
@@ -6847,8 +6877,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 	err = mddev_lock(mddev);
 	if (err) {
-		printk(KERN_INFO
-			"md: ioctl lock interrupted, reason %d, cmd %d\n",
-			err, cmd);
+		pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
+			 err, cmd);
 		goto out;
 	}
...
@@ -6864,30 +6893,24 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		if (mddev->pers) {
 			err = update_array_info(mddev, &info);
 			if (err) {
-				printk(KERN_WARNING "md: couldn't update"
-				       " array info. %d\n", err);
+				pr_warn("md: couldn't update array info. %d\n", err);
 				goto unlock;
 			}
 			goto unlock;
 		}
 		if (!list_empty(&mddev->disks)) {
-			printk(KERN_WARNING
-			       "md: array %s already has disks!\n",
-			       mdname(mddev));
+			pr_warn("md: array %s already has disks!\n", mdname(mddev));
 			err = -EBUSY;
 			goto unlock;
 		}
 		if (mddev->raid_disks) {
-			printk(KERN_WARNING
-			       "md: array %s already initialised!\n",
-			       mdname(mddev));
+			pr_warn("md: array %s already initialised!\n", mdname(mddev));
 			err = -EBUSY;
 			goto unlock;
 		}
 		err = set_array_info(mddev, &info);
 		if (err) {
-			printk(KERN_WARNING "md: couldn't set"
-			       " array info. %d\n", err);
+			pr_warn("md: couldn't set array info. %d\n", err);
 			goto unlock;
 		}
 		goto unlock;
...
@@ -6987,11 +7010,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		/* If a device failed while we were read-only, we
 		 * need to make sure the metadata is updated now.
 		 */
-		if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
+		if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
 			mddev_unlock(mddev);
 			wait_event(mddev->sb_wait,
-				   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
-				   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+				   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
+				   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 			mddev_lock_nointr(mddev);
 		}
 	} else {
...
@@ -7092,7 +7115,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	if (test_bit(MD_CLOSING, &mddev->flags)) {
 		mutex_unlock(&mddev->open_mutex);
-		return -ENODEV;
+		err = -ENODEV;
+		goto out;
 	}

 	err = 0;
...
@@ -7101,6 +7125,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 	check_disk_change(bdev);
  out:
+	if (err)
+		mddev_put(mddev);
 	return err;
 }
...
@@ -7171,10 +7197,12 @@ static int md_thread(void *arg)
 		wait_event_interruptible_timeout
 			(thread->wqueue,
 			 test_bit(THREAD_WAKEUP, &thread->flags)
-			 || kthread_should_stop(),
+			 || kthread_should_stop() || kthread_should_park(),
 			 thread->timeout);

 		clear_bit(THREAD_WAKEUP, &thread->flags);
+		if (kthread_should_park())
+			kthread_parkme();
 		if (!kthread_should_stop())
 			thread->run(thread);
 	}
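md_thread() now participates in kthread parking: a parked thread blocks in kthread_parkme() without exiting, so callers can quiesce it and later resume it where kthread_stop() would tear it down. A standalone module sketch of a parkable worker loop (an assumed demo, not md code):

// SPDX-License-Identifier: GPL-2.0
/* Minimal parkable kthread: kthread_park() from another context makes
 * the loop block in kthread_parkme() until kthread_unpark(). */
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/module.h>

static struct task_struct *worker;

static int worker_fn(void *data)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park())
			kthread_parkme();
		/* ... do one unit of work ... */
		msleep(100);
	}
	return 0;
}

static int __init park_demo_init(void)
{
	worker = kthread_run(worker_fn, NULL, "park-demo");
	return PTR_ERR_OR_ZERO(worker);
}

static void __exit park_demo_exit(void)
{
	kthread_stop(worker);
}

module_init(park_demo_init);
module_exit(park_demo_exit);
MODULE_LICENSE("GPL");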
...
@@ -7588,7 +7616,7 @@ static const struct file_operations md_seq_fops = {
 int register_md_personality(struct md_personality *p)
 {
-	printk(KERN_INFO "md: %s personality registered for level %d\n",
-	       p->name, p->level);
+	pr_debug("md: %s personality registered for level %d\n",
+		 p->name, p->level);
 	spin_lock(&pers_lock);
 	list_add_tail(&p->list, &pers_list);
...
@@ -7599,7 +7627,7 @@ EXPORT_SYMBOL(register_md_personality);
 int unregister_md_personality(struct md_personality *p)
 {
-	printk(KERN_INFO "md: %s personality unregistered\n", p->name);
+	pr_debug("md: %s personality unregistered\n", p->name);
 	spin_lock(&pers_lock);
 	list_del_init(&p->list);
 	spin_unlock(&pers_lock);
...
@@ -7639,7 +7667,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
 	spin_lock(&pers_lock);
 	/* ensure module won't be unloaded */
 	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
-		pr_err("can't find md-cluster module or get it's reference.\n");
+		pr_warn("can't find md-cluster module or get it's reference.\n");
 		spin_unlock(&pers_lock);
 		return -ENOENT;
 	}
...
@@ -7741,8 +7769,8 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 	spin_lock(&mddev->lock);
 	if (mddev->in_sync) {
 		mddev->in_sync = 0;
-		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
 		did_change = 1;
 	}
...
@@ -7751,7 +7779,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi)
 	if (did_change)
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	wait_event(mddev->sb_wait,
-		   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 }
 EXPORT_SYMBOL(md_write_start);
...
@@ -7772,7 +7800,7 @@ EXPORT_SYMBOL(md_write_end);
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 * Must be called with mddev_lock held.
 *
- * In the ->external case MD_CHANGE_PENDING can not be cleared until mddev->lock
+ * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
 * is dropped, so return -EAGAIN after notifying userspace.
 */
int md_allow_write(struct mddev *mddev)
...
@@ -7787,8 +7815,8 @@ int md_allow_write(struct mddev *mddev)
 	spin_lock(&mddev->lock);
 	if (mddev->in_sync) {
 		mddev->in_sync = 0;
-		set_bit(MD_CHANGE_CLEAN, &mddev->flags);
-		set_bit(MD_CHANGE_PENDING, &mddev->flags);
+		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
+		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
 		if (mddev->safemode_delay &&
 		    mddev->safemode == 0)
 			mddev->safemode = 1;
...
@@ -7798,7 +7826,7 @@ int md_allow_write(struct mddev *mddev)
 	} else
 		spin_unlock(&mddev->lock);

-	if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
+	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
 		return -EAGAIN;
 	else
 		return 0;
...
@@ -7914,9 +7942,7 @@ void md_do_sync(struct md_thread *thread)
...
@@ -7914,9 +7942,7 @@ void md_do_sync(struct md_thread *thread)
mddev2
->
curr_resync
>=
mddev
->
curr_resync
)
{
mddev2
->
curr_resync
>=
mddev
->
curr_resync
)
{
if
(
mddev2_minor
!=
mddev2
->
md_minor
)
{
if
(
mddev2_minor
!=
mddev2
->
md_minor
)
{
mddev2_minor
=
mddev2
->
md_minor
;
mddev2_minor
=
mddev2
->
md_minor
;
printk
(
KERN_INFO
"md: delaying %s of %s"
pr_info
(
"md: delaying %s of %s until %s has finished (they share one or more physical units)
\n
"
,
" until %s has finished (they"
" share one or more physical units)
\n
"
,
desc
,
mdname
(
mddev
),
desc
,
mdname
(
mddev
),
mdname
(
mddev2
));
mdname
(
mddev2
));
}
}
...
@@ -7975,11 +8001,9 @@ void md_do_sync(struct md_thread *thread)
...
@@ -7975,11 +8001,9 @@ void md_do_sync(struct md_thread *thread)
}
}
}
}
printk
(
KERN_INFO
"md: %s of RAID array %s
\n
"
,
desc
,
mdname
(
mddev
));
pr_info
(
"md: %s of RAID array %s
\n
"
,
desc
,
mdname
(
mddev
));
printk
(
KERN_INFO
"md: minimum _guaranteed_ speed:"
pr_debug
(
"md: minimum _guaranteed_ speed: %d KB/sec/disk.
\n
"
,
speed_min
(
mddev
));
" %d KB/sec/disk.
\n
"
,
speed_min
(
mddev
));
pr_debug
(
"md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.
\n
"
,
printk
(
KERN_INFO
"md: using maximum available idle IO bandwidth "
"(but not more than %d KB/sec) for %s.
\n
"
,
speed_max
(
mddev
),
desc
);
speed_max
(
mddev
),
desc
);
is_mddev_idle
(
mddev
,
1
);
/* this initializes IO event counters */
is_mddev_idle
(
mddev
,
1
);
/* this initializes IO event counters */
...
@@ -7997,15 +8021,14 @@ void md_do_sync(struct md_thread *thread)
...
@@ -7997,15 +8021,14 @@ void md_do_sync(struct md_thread *thread)
* Tune reconstruction:
* Tune reconstruction:
*/
*/
window
=
32
*
(
PAGE_SIZE
/
512
);
window
=
32
*
(
PAGE_SIZE
/
512
);
pr
intk
(
KERN_INFO
"md: using %dk window, over a total of %lluk.
\n
"
,
pr
_debug
(
"md: using %dk window, over a total of %lluk.
\n
"
,
window
/
2
,
(
unsigned
long
long
)
max_sectors
/
2
);
window
/
2
,
(
unsigned
long
long
)
max_sectors
/
2
);
atomic_set
(
&
mddev
->
recovery_active
,
0
);
atomic_set
(
&
mddev
->
recovery_active
,
0
);
last_check
=
0
;
last_check
=
0
;
if
(
j
>
2
)
{
if
(
j
>
2
)
{
printk
(
KERN_INFO
pr_debug
(
"md: resuming %s of %s from checkpoint.
\n
"
,
"md: resuming %s of %s from checkpoint.
\n
"
,
desc
,
mdname
(
mddev
));
desc
,
mdname
(
mddev
));
mddev
->
curr_resync
=
j
;
mddev
->
curr_resync
=
j
;
}
else
}
else
...
@@ -8038,7 +8061,7 @@ void md_do_sync(struct md_thread *thread)
...
@@ -8038,7 +8061,7 @@ void md_do_sync(struct md_thread *thread)
j
>
mddev
->
recovery_cp
)
j
>
mddev
->
recovery_cp
)
mddev
->
recovery_cp
=
j
;
mddev
->
recovery_cp
=
j
;
update_time
=
jiffies
;
update_time
=
jiffies
;
set_bit
(
MD_
CHANGE_CLEAN
,
&
mddev
->
flags
);
set_bit
(
MD_
SB_CHANGE_CLEAN
,
&
mddev
->
sb_
flags
);
sysfs_notify
(
&
mddev
->
kobj
,
NULL
,
"sync_completed"
);
sysfs_notify
(
&
mddev
->
kobj
,
NULL
,
"sync_completed"
);
}
}
...
@@ -8133,7 +8156,7 @@ void md_do_sync(struct md_thread *thread)
...
@@ -8133,7 +8156,7 @@ void md_do_sync(struct md_thread *thread)
}
}
}
}
}
}
pr
intk
(
KERN_INFO
"md: %s: %s %s.
\n
"
,
mdname
(
mddev
),
desc
,
pr
_info
(
"md: %s: %s %s.
\n
"
,
mdname
(
mddev
),
desc
,
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
)
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
)
?
"interrupted"
:
"done"
);
?
"interrupted"
:
"done"
);
/*
/*
...
@@ -8155,8 +8178,7 @@ void md_do_sync(struct md_thread *thread)
...
@@ -8155,8 +8178,7 @@ void md_do_sync(struct md_thread *thread)
if
(
test_bit
(
MD_RECOVERY_SYNC
,
&
mddev
->
recovery
))
{
if
(
test_bit
(
MD_RECOVERY_SYNC
,
&
mddev
->
recovery
))
{
if
(
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
))
{
if
(
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
))
{
if
(
mddev
->
curr_resync
>=
mddev
->
recovery_cp
)
{
if
(
mddev
->
curr_resync
>=
mddev
->
recovery_cp
)
{
printk
(
KERN_INFO
pr_debug
(
"md: checkpointing %s of %s.
\n
"
,
"md: checkpointing %s of %s.
\n
"
,
desc
,
mdname
(
mddev
));
desc
,
mdname
(
mddev
));
if
(
test_bit
(
MD_RECOVERY_ERROR
,
if
(
test_bit
(
MD_RECOVERY_ERROR
,
&
mddev
->
recovery
))
&
mddev
->
recovery
))
...
@@ -8187,8 +8209,8 @@ void md_do_sync(struct md_thread *thread)
...
@@ -8187,8 +8209,8 @@ void md_do_sync(struct md_thread *thread)
/* set CHANGE_PENDING here since maybe another update is needed,
/* set CHANGE_PENDING here since maybe another update is needed,
* so other nodes are informed. It should be harmless for normal
* so other nodes are informed. It should be harmless for normal
* raid */
* raid */
set_mask_bits
(
&
mddev
->
flags
,
0
,
set_mask_bits
(
&
mddev
->
sb_
flags
,
0
,
BIT
(
MD_
CHANGE_PENDING
)
|
BIT
(
MD
_CHANGE_DEVS
));
BIT
(
MD_
SB_CHANGE_PENDING
)
|
BIT
(
MD_SB
_CHANGE_DEVS
));
spin_lock
(
&
mddev
->
lock
);
spin_lock
(
&
mddev
->
lock
);
if
(
!
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
))
{
if
(
!
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
))
{
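The set_mask_bits() call above folds several superblock-state bits into one atomic update instead of back-to-back set_bit() calls. A minimal userspace sketch of that pattern, using a compare-and-swap loop (illustrative only, not the kernel's implementation; the flag names are local stand-ins):

#include <stdatomic.h>
#include <stdio.h>

/* Atomically clear the bits in `mask` and set the bits in `bits`. */
static void set_mask_bits_model(_Atomic unsigned long *flags,
                                unsigned long mask, unsigned long bits)
{
    unsigned long old = atomic_load(flags), new;
    do {
        new = (old & ~mask) | bits;
    } while (!atomic_compare_exchange_weak(flags, &old, new));
}

int main(void)
{
    _Atomic unsigned long sb_flags = 0;
    enum { SB_CHANGE_DEVS, SB_CHANGE_CLEAN, SB_CHANGE_PENDING };

    /* mirrors: set_mask_bits(&mddev->sb_flags, 0,
     *          BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); */
    set_mask_bits_model(&sb_flags, 0,
                        (1UL << SB_CHANGE_PENDING) | (1UL << SB_CHANGE_DEVS));
    printf("sb_flags = %#lx\n", (unsigned long)sb_flags);
    return 0;
}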
...
@@ -8288,12 +8310,12 @@ static int remove_and_add_spares(struct mddev *mddev,
 			if (!test_bit(Journal, &rdev->flags))
 				spares++;
 			md_new_event(mddev);
-			set_bit(MD_CHANGE_DEVS, &mddev->flags);
+			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		}
 	}
 no_add:
 	if (removed)
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	return spares;
 }
...
@@ -8305,7 +8327,7 @@ static void md_start_sync(struct work_struct *ws)
 						mddev,
 						"resync");
 	if (!mddev->sync_thread) {
-		printk(KERN_ERR "%s: could not start resync thread...\n",
+		pr_warn("%s: could not start resync thread...\n",
 		       mdname(mddev));
 		/* leave the spares where they are, it shouldn't hurt */
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
...
@@ -8356,7 +8378,7 @@ void md_check_recovery(struct mddev *mddev)
 	if (signal_pending(current)) {
 		if (mddev->pers->sync_request && !mddev->external) {
-			printk(KERN_INFO "md: %s in immediate safe mode\n",
-			       mdname(mddev));
+			pr_debug("md: %s in immediate safe mode\n",
+				 mdname(mddev));
 			mddev->safemode = 2;
 		}
...
@@ -8366,7 +8388,7 @@ void md_check_recovery(struct mddev *mddev)
 	if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return;
 	if ( ! (
-		(mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
+		(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
 		test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
 		test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
 		test_bit(MD_RELOAD_SB, &mddev->flags) ||
...
@@ -8404,7 +8426,7 @@ void md_check_recovery(struct mddev *mddev)
 			md_reap_sync_thread(mddev);
 			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
 			goto unlock;
 		}
...
@@ -8432,7 +8454,7 @@ void md_check_recovery(struct mddev *mddev)
 			    mddev->recovery_cp == MaxSector) {
 				mddev->in_sync = 1;
 				did_change = 1;
-				set_bit(MD_CHANGE_CLEAN, &mddev->flags);
+				set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
 			}
 			if (mddev->safemode == 1)
 				mddev->safemode = 0;
...
@@ -8441,7 +8463,7 @@ void md_check_recovery(struct mddev *mddev)
 				sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}

-		if (mddev->flags & MD_UPDATE_SB_FLAGS)
+		if (mddev->sb_flags)
 			md_update_sb(mddev, 0);

 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
...
@@ -8537,7 +8559,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 		if (mddev->pers->spare_active(mddev)) {
 			sysfs_notify(&mddev->kobj, NULL,
 				     "degraded");
-			set_bit(MD_CHANGE_DEVS, &mddev->flags);
+			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		}
 	}
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
...
@@ -8552,7 +8574,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 			rdev->saved_raid_disk = -1;

 	md_update_sb(mddev, 1);
-	/* MD_CHANGE_PENDING should be cleared by md_update_sb, so we can
+	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
 	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
 	 * clustered raid */
 	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
...
@@ -8614,9 +8636,12 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
 	if (rv == 0) {
 		/* Make sure they get written out promptly */
+		if (test_bit(ExternalBbl, &rdev->flags))
+			sysfs_notify(&rdev->kobj, NULL,
+				     "unacknowledged_bad_blocks");
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
-		set_mask_bits(&mddev->flags, 0,
-			      BIT(MD_CHANGE_CLEAN) | BIT(MD_CHANGE_PENDING));
+		set_mask_bits(&mddev->sb_flags, 0,
+			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
 		md_wakeup_thread(rdev->mddev->thread);
 		return 1;
 	} else
...
@@ -8627,12 +8652,15 @@ EXPORT_SYMBOL_GPL(rdev_set_badblocks);
 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 			 int is_new)
 {
+	int rv;
 	if (is_new)
 		s += rdev->new_data_offset;
 	else
 		s += rdev->data_offset;
-	return badblocks_clear(&rdev->badblocks,
-				  s, sectors);
+	rv = badblocks_clear(&rdev->badblocks, s, sectors);
+	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
+		sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
+	return rv;
 }
 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
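rdev_clear_badblocks() now captures the badblocks_clear() result so that arrays whose bad-block list is managed by external metadata (ExternalBbl) get a sysfs notification when the list actually changed. A runnable userspace model of that capture-then-notify shape (the helper names are illustrative stand-ins, not kernel functions):

#include <stdio.h>

static int badblocks_clear_model(void) { return 0; /* 0 == cleared OK */ }

static int clear_and_notify(int external_bbl)
{
    int rv = badblocks_clear_model();
    if ((rv == 0) && external_bbl)
        printf("notify sysfs: bad_blocks changed\n");
    return rv;
}

int main(void)
{
    return clear_and_notify(1);
}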
...
@@ -8785,14 +8813,17 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
 	 * variable in case we err in the future
 	 */
 	rdev->sb_page = NULL;
-	alloc_disk_sb(rdev);
-	ClearPageUptodate(rdev->sb_page);
-	rdev->sb_loaded = 0;
-	err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
-
+	err = alloc_disk_sb(rdev);
+	if (err == 0) {
+		ClearPageUptodate(rdev->sb_page);
+		rdev->sb_loaded = 0;
+		err = super_types[mddev->major_version].
+			load_super(rdev, NULL, mddev->minor_version);
+	}
 	if (err < 0) {
 		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
 				__func__, __LINE__, rdev->desc_nr, err);
-		put_page(rdev->sb_page);
+		if (rdev->sb_page)
+			put_page(rdev->sb_page);
 		rdev->sb_page = swapout;
 		rdev->sb_loaded = 1;
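The read_rdev() fix guards against alloc_disk_sb() failing: the superblock page is only touched when the allocation succeeded, and only released if it exists (put_page(NULL) would crash, unlike free(NULL)). A runnable userspace model of the corrected control flow:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    void *sb_page = malloc(4096);
    int err = sb_page ? 0 : -12;            /* -ENOMEM */

    if (err == 0) {
        /* ...load the superblock into sb_page; may set err < 0... */
    }
    if (err < 0) {
        if (sb_page)                        /* mirrors the new NULL check */
            free(sb_page);
        sb_page = NULL;                     /* restore the old values instead */
    }
    printf("err=%d\n", err);
    free(sb_page);
    return 0;
}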
...
@@ -8871,9 +8902,6 @@ void md_autodetect_dev(dev_t dev)
 		mutex_lock(&detected_devices_mutex);
 		list_add_tail(&node_detected_dev->list, &all_detected_devices);
 		mutex_unlock(&detected_devices_mutex);
-	} else {
-		printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
-			", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
 	}
 }
...
@@ -8887,7 +8915,7 @@ static void autostart_arrays(int part)
 	i_scanned = 0;
 	i_passed = 0;

-	printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
+	pr_info("md: Autodetecting RAID arrays.\n");

 	mutex_lock(&detected_devices_mutex);
 	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
...
@@ -8912,8 +8940,7 @@ static void autostart_arrays(int part)
 	}
 	mutex_unlock(&detected_devices_mutex);

-	printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
-		i_scanned, i_passed);
+	pr_debug("md: Scanned %d and added %d devices.\n",
+		 i_scanned, i_passed);

 	autorun_devices(part);
 }
...
drivers/md/md.h
浏览文件 @ 20737738
...
@@ -29,6 +29,16 @@

 #define MaxSector (~(sector_t)0)

+/*
+ * These flags should really be called "NO_RETRY" rather than
+ * "FAILFAST" because they don't make any promise about time lapse,
+ * only about the number of retries, which will be zero.
+ * REQ_FAILFAST_DRIVER is not included because
+ * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
+ * seems to suggest that the errors it avoids retrying should usually
+ * be retried.
+ */
+#define	MD_FAILFAST	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
 /*
  * MD's 'extended' device
  */
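The new MD_FAILFAST mask is just two block-layer "don't retry" hints OR'd together, to be folded into bio->bi_opf when a device carries the FailFast flag. A small runnable model of that composition (the bit values here are invented for illustration and do not match the kernel's REQ_* encoding):

#include <stdio.h>

#define REQ_FAILFAST_DEV_M        (1u << 0)
#define REQ_FAILFAST_TRANSPORT_M  (1u << 1)
#define REQ_FAILFAST_DRIVER_M     (1u << 2)   /* deliberately left out */
#define MD_FAILFAST_M (REQ_FAILFAST_DEV_M | REQ_FAILFAST_TRANSPORT_M)

int main(void)
{
    unsigned int bi_opf = 0;

    /* as the raid1 hunks below do: read_bio->bi_opf |= MD_FAILFAST; */
    bi_opf |= MD_FAILFAST_M;
    printf("dev/transport fail-fast: %s, driver fail-fast: %s\n",
           (bi_opf & MD_FAILFAST_M) == MD_FAILFAST_M ? "on" : "off",
           (bi_opf & REQ_FAILFAST_DRIVER_M) ? "on" : "off");
    return 0;
}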
...
@@ -168,6 +178,19 @@ enum flag_bits {
 				 * so it is safe to remove without
 				 * another synchronize_rcu() call.
 				 */
+	ExternalBbl,		/* External metadata provides bad
+				 * block management for a disk
+				 */
+	FailFast,		/* Minimal retries should be attempted on
+				 * this device, so use REQ_FAILFAST_DEV.
+				 * Also don't try to repair failed reads.
+				 * It is expected that no bad block log
+				 * is present.
+				 */
+	LastDev,		/* Seems to be the last working dev as
+				 * it didn't fail, so don't use FailFast
+				 * any more for metadata
+				 */
 };

 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
...
@@ -189,6 +212,31 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 				int is_new);
 struct md_cluster_info;

+enum mddev_flags {
+	MD_ARRAY_FIRST_USE,	/* First use of array, needs initialization */
+	MD_CLOSING,		/* If set, we are closing the array, do not open
+				 * it then */
+	MD_JOURNAL_CLEAN,	/* A raid with journal is already clean */
+	MD_HAS_JOURNAL,		/* The raid array has journal feature set */
+	MD_RELOAD_SB,		/* Reload the superblock because another node
+				 * updated it.
+				 */
+	MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
+				   * already took resync lock, need to
+				   * release the lock */
+	MD_FAILFAST_SUPPORTED,	/* Using MD_FAILFAST on metadata writes is
+				 * supported as calls to md_error() will
+				 * never cause the array to become failed.
+				 */
+};
+
+enum mddev_sb_flags {
+	MD_SB_CHANGE_DEVS,	/* Some device status has changed */
+	MD_SB_CHANGE_CLEAN,	/* transition to or from 'clean' */
+	MD_SB_CHANGE_PENDING,	/* switch from 'clean' to 'active' in progress */
+	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
+};
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
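Moving these flag numbers from #defines into enums lets the compiler assign consecutive bit indexes, and splitting sb_flags out of flags means "does the superblock need an update?" becomes a plain non-zero test. A runnable userspace model of that test (the names are local stand-ins for the kernel's):

#include <stdio.h>

enum sb_flags_model {
    SB_CHANGE_DEVS,
    SB_CHANGE_CLEAN,
    SB_CHANGE_PENDING,
    SB_NEED_REWRITE,
};

int main(void)
{
    unsigned long sb_flags = 0;

    sb_flags |= 1UL << SB_CHANGE_PENDING;   /* set_bit(MD_SB_CHANGE_PENDING, ...) */

    /* old code: mddev->flags & MD_UPDATE_SB_FLAGS; new code: just sb_flags */
    printf("update needed: %s\n", sb_flags ? "yes" : "no");
    /* the md_check_recovery() test that masks PENDING out: */
    printf("work besides PENDING: %s\n",
           (sb_flags & ~(1UL << SB_CHANGE_PENDING)) ? "yes" : "no");
    return 0;
}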
...
@@ -196,21 +244,7 @@ struct mddev {
 	int				md_minor;
 	struct list_head		disks;
 	unsigned long			flags;
-#define MD_CHANGE_DEVS	0	/* Some device status has changed */
-#define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
-#define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
-#define MD_UPDATE_SB_FLAGS (1 | 2 | 4)	/* If these are set, md_update_sb needed */
-#define MD_ARRAY_FIRST_USE 3	/* First use of array, needs initialization */
-#define MD_CLOSING	4	/* If set, we are closing the array, do not open
-				 * it then */
-#define MD_JOURNAL_CLEAN 5	/* A raid with journal is already clean */
-#define MD_HAS_JOURNAL	6	/* The raid array has journal feature set */
-#define MD_RELOAD_SB	7	/* Reload the superblock because another node
-				 * updated it.
-				 */
-#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node
-				    * already took resync lock, need to
-				    * release the lock */
+	unsigned long			sb_flags;

 	int				suspended;
 	atomic_t			active_io;
...
@@ -304,31 +338,6 @@ struct mddev {
 	int				parallel_resync;

 	int				ok_start_degraded;
-	/* recovery/resync flags
-	 * NEEDED:   we might need to start a resync/recover
-	 * RUNNING:  a thread is running, or about to be started
-	 * SYNC:     actually doing a resync, not a recovery
-	 * RECOVER:  doing recovery, or need to try it.
-	 * INTR:     resync needs to be aborted for some reason
-	 * DONE:     thread is done and is waiting to be reaped
-	 * REQUEST:  user-space has requested a sync (used with SYNC)
-	 * CHECK:    user-space request for check-only, no repair
-	 * RESHAPE:  A reshape is happening
-	 * ERROR:    sync-action interrupted because io-error
-	 *
-	 * If neither SYNC or RESHAPE are set, then it is a recovery.
-	 */
-#define	MD_RECOVERY_RUNNING	0
-#define	MD_RECOVERY_SYNC	1
-#define	MD_RECOVERY_RECOVER	2
-#define	MD_RECOVERY_INTR	3
-#define	MD_RECOVERY_DONE	4
-#define	MD_RECOVERY_NEEDED	5
-#define	MD_RECOVERY_REQUESTED	6
-#define	MD_RECOVERY_CHECK	7
-#define MD_RECOVERY_RESHAPE	8
-#define	MD_RECOVERY_FROZEN	9
-#define	MD_RECOVERY_ERROR	10

 	unsigned long			recovery;
 	/* If a RAID personality determines that recovery (of a particular
...
@@ -442,6 +451,23 @@ struct mddev {
 	unsigned int			good_device_nr;	/* good device num within cluster raid */
 };

+enum recovery_flags {
+	/*
+	 * If neither SYNC or RESHAPE are set, then it is a recovery.
+	 */
+	MD_RECOVERY_RUNNING,	/* a thread is running, or about to be started */
+	MD_RECOVERY_SYNC,	/* actually doing a resync, not a recovery */
+	MD_RECOVERY_RECOVER,	/* doing recovery, or need to try it. */
+	MD_RECOVERY_INTR,	/* resync needs to be aborted for some reason */
+	MD_RECOVERY_DONE,	/* thread is done and is waiting to be reaped */
+	MD_RECOVERY_NEEDED,	/* we might need to start a resync/recover */
+	MD_RECOVERY_REQUESTED,	/* user-space has requested a sync (used with SYNC) */
+	MD_RECOVERY_CHECK,	/* user-space request for check-only, no repair */
+	MD_RECOVERY_RESHAPE,	/* A reshape is happening */
+	MD_RECOVERY_FROZEN,	/* User request to abort, and not restart, any action */
+	MD_RECOVERY_ERROR,	/* sync-action interrupted because io-error */
+};
+
 static inline int __must_check mddev_lock(struct mddev *mddev)
 {
 	return mutex_lock_interruptible(&mddev->reconfig_mutex);
...
@@ -623,7 +649,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
 			   sector_t sector, int size, struct page *page);
-extern void md_super_wait(struct mddev *mddev);
+extern int md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			struct page *page, int op, int op_flags,
 			bool metadata_op);
...
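md_super_wait() changing from void to int is what lets metadata writers observe MD_SB_NEED_REWRITE and repeat a failed superblock pass. A runnable userspace model of the resulting retry loop (do_metadata_writes() and wait_for_writes() are hypothetical stand-ins, not kernel functions):

#include <stdio.h>

static int attempts;

static void do_metadata_writes(void) { attempts++; }

static int wait_for_writes(void)
{
    /* pretend the first pass raced with a device failure */
    return (attempts < 2) ? -1 : 0;
}

int main(void)
{
restart:
    do_metadata_writes();
    if (wait_for_writes() < 0)
        goto restart;   /* a rewrite was requested; run the whole pass again */
    printf("metadata stable after %d attempt(s)\n", attempts);
    return 0;
}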
drivers/md/multipath.c
浏览文件 @ 20737738
...
@@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf)
 	}
 	rcu_read_unlock();

-	printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
+	pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
 	return (-1);
 }
...
@@ -97,7 +97,7 @@ static void multipath_end_request(struct bio *bio)
 		 */
 		char b[BDEVNAME_SIZE];
 		md_error (mp_bh->mddev, rdev);
-		printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
+		pr_info("multipath: %s: rescheduling sector %llu\n",
 		       bdevname(rdev->bdev, b),
 		       (unsigned long long)bio->bi_iter.bi_sector);
 		multipath_reschedule_retry(mp_bh);
...
@@ -194,8 +194,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
 		 * first check if this is a queued request for a device
 		 * which has just failed.
 		 */
-		printk(KERN_ALERT
-		       "multipath: only one IO path left and IO error.\n");
+		pr_warn("multipath: only one IO path left and IO error.\n");
 		/* leave it active... it's all we have */
 		return;
 	}
...
@@ -209,11 +208,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	set_bit(Faulty, &rdev->flags);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
-	printk(KERN_ALERT "multipath: IO failure on %s,"
-	       " disabling IO path.\n"
-	       "multipath: Operation continuing"
-	       " on %d IO paths.\n",
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
+	pr_err("multipath: IO failure on %s, disabling IO path.\n"
+	       "multipath: Operation continuing on %d IO paths.\n",
 	       bdevname(rdev->bdev, b),
 	       conf->raid_disks - mddev->degraded);
 }
...
@@ -223,19 +220,19 @@ static void print_multipath_conf (struct mpconf *conf)
 	int i;
 	struct multipath_info *tmp;

-	printk("MULTIPATH conf printout:\n");
+	pr_debug("MULTIPATH conf printout:\n");
 	if (!conf) {
-		printk("(conf==NULL)\n");
+		pr_debug("(conf==NULL)\n");
 		return;
 	}
-	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
-	       conf->raid_disks);
+	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+		 conf->raid_disks);

 	for (i = 0; i < conf->raid_disks; i++) {
 		char b[BDEVNAME_SIZE];
 		tmp = conf->multipaths + i;
 		if (tmp->rdev)
-			printk(" disk%d, o:%d, dev:%s\n",
-			       i, !test_bit(Faulty, &tmp->rdev->flags),
-			       bdevname(tmp->rdev->bdev, b));
+			pr_debug(" disk%d, o:%d, dev:%s\n",
+				 i, !test_bit(Faulty, &tmp->rdev->flags),
+				 bdevname(tmp->rdev->bdev, b));
 	}
...
@@ -292,8 +289,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	if (rdev == p->rdev) {
 		if (test_bit(In_sync, &rdev->flags) ||
 		    atomic_read(&rdev->nr_pending)) {
-			printk(KERN_ERR "hot-remove-disk, slot %d is identified"
-			       " but is still operational!\n", number);
+			pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
 			err = -EBUSY;
 			goto abort;
 		}
...
@@ -346,14 +342,12 @@ static void multipathd(struct md_thread *thread)
 		bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;

 		if ((mp_bh->path = multipath_map (conf)) < 0) {
-			printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
-			       " error for block %llu\n",
+			pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
 			       bdevname(bio->bi_bdev, b),
 			       (unsigned long long)bio->bi_iter.bi_sector);
 			multipath_end_bh_io(mp_bh, -EIO);
 		} else {
-			printk(KERN_ERR "multipath: %s: redirecting sector %llu"
-			       " to another IO path\n",
+			pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
 			       bdevname(bio->bi_bdev, b),
 			       (unsigned long long)bio->bi_iter.bi_sector);
 			*bio = *(mp_bh->master_bio);
...
@@ -389,7 +383,7 @@ static int multipath_run (struct mddev *mddev)
 		return -EINVAL;

 	if (mddev->level != LEVEL_MULTIPATH) {
-		printk("multipath: %s: raid level not set to multipath IO (%d)\n",
+		pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
 		       mdname(mddev), mddev->level);
 		goto out;
 	}
...
@@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev)
 	conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
 	mddev->private = conf;
-	if (!conf) {
-		printk(KERN_ERR
-			"multipath: couldn't allocate memory for %s\n",
-			mdname(mddev));
+	if (!conf)
 		goto out;
-	}

 	conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
 				   GFP_KERNEL);
-	if (!conf->multipaths) {
-		printk(KERN_ERR
-			"multipath: couldn't allocate memory for %s\n",
-			mdname(mddev));
+	if (!conf->multipaths)
 		goto out_free_conf;
-	}

 	working_disks = 0;
 	rdev_for_each(rdev, mddev) {
...
@@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->retry_list);

 	if (!working_disks) {
-		printk(KERN_ERR "multipath: no operational IO paths for %s\n",
+		pr_warn("multipath: no operational IO paths for %s\n",
 		       mdname(mddev));
 		goto out_free_conf;
 	}
...
@@ -447,25 +433,15 @@ static int multipath_run (struct mddev *mddev)
 	conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
 						 sizeof(struct multipath_bh));
-	if (conf->pool == NULL) {
-		printk(KERN_ERR
-			"multipath: couldn't allocate memory for %s\n",
-			mdname(mddev));
+	if (conf->pool == NULL)
 		goto out_free_conf;
-	}

 	{
 		mddev->thread = md_register_thread(multipathd, mddev,
 						   "multipath");
-		if (!mddev->thread) {
-			printk(KERN_ERR "multipath: couldn't allocate thread"
-				" for %s\n", mdname(mddev));
+		if (!mddev->thread)
 			goto out_free_conf;
-		}
 	}

-	printk(KERN_INFO
-		"multipath: array %s active with %d out of %d IO paths\n",
+	pr_info("multipath: array %s active with %d out of %d IO paths\n",
 		mdname(mddev), conf->raid_disks - mddev->degraded,
 		mddev->raid_disks);
 	/*
...
drivers/md/raid0.c
浏览文件 @ 20737738
...
@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "raid0.h"
 #include "raid5.h"
...
@@ -51,20 +52,21 @@ static void dump_zones(struct mddev *mddev)
 	char b[BDEVNAME_SIZE];
 	struct r0conf *conf = mddev->private;
 	int raid_disks = conf->strip_zone[0].nb_dev;
-	printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n",
-	       mdname(mddev),
-	       conf->nr_strip_zones, conf->nr_strip_zones==1 ? "" : "s");
+	pr_debug("md: RAID0 configuration for %s - %d zone%s\n",
+		 mdname(mddev),
+		 conf->nr_strip_zones, conf->nr_strip_zones==1 ? "" : "s");
 	for (j = 0; j < conf->nr_strip_zones; j++) {
-		printk(KERN_INFO "md: zone%d=[", j);
+		char line[200];
+		int len = 0;
+
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
-			printk(KERN_CONT "%s%s", k?"/":"",
-			bdevname(conf->devlist[j*raid_disks
-						+ k]->bdev, b));
-		printk(KERN_CONT "]\n");
+			len += snprintf(line+len, 200-len, "%s%s", k?"/":"",
+					bdevname(conf->devlist[j*raid_disks
+							       + k]->bdev, b));
+		pr_debug("md: zone%d=[%s]\n", j, line);

 		zone_size  = conf->strip_zone[j].zone_end - zone_start;
-		printk(KERN_INFO "      zone-offset=%10lluKB, "
-				"device-offset=%10lluKB, size=%10lluKB\n",
+		pr_debug("      zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n",
 			(unsigned long long)zone_start>>1,
 			(unsigned long long)conf->strip_zone[j].dev_start>>1,
 			(unsigned long long)zone_size>>1);
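The dump_zones() rework replaces a chain of KERN_CONT continuation printks with a local buffer built up via snprintf() and emitted as a single pr_debug() line. A runnable sketch of that buffer-building idiom:

#include <stdio.h>

int main(void)
{
    const char *devs[] = { "sda", "sdb", "sdc" };
    char line[200];
    int len = 0, k;

    for (k = 0; k < 3; k++)
        len += snprintf(line + len, sizeof(line) - len, "%s%s",
                        k ? "/" : "", devs[k]);
    printf("md: zone0=[%s]\n", line);
    return 0;
}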
...
@@ -142,7 +144,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	 * chunk size is a multiple of that sector size
 	 */
 	if ((mddev->chunk_sectors << 9) % blksize) {
-		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
-		       mdname(mddev),
-		       mddev->chunk_sectors << 9, blksize);
+		pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+			mdname(mddev),
+			mddev->chunk_sectors << 9, blksize);
 		err = -EINVAL;
...
@@ -186,19 +188,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		}
 		if (j < 0) {
-			printk(KERN_ERR
-			       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
-			       mdname(mddev));
+			pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n",
+				mdname(mddev));
 			goto abort;
 		}
 		if (j >= mddev->raid_disks) {
-			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
-			       "aborting!\n", mdname(mddev), j);
+			pr_warn("md/raid0:%s: bad disk number %d - aborting!\n",
+				mdname(mddev), j);
 			goto abort;
 		}
 		if (dev[j]) {
-			printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
-			       "aborting!\n", mdname(mddev), j);
+			pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n",
+				mdname(mddev), j);
 			goto abort;
 		}
 		dev[j] = rdev1;
...
@@ -208,8 +209,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		cnt++;
 	}
 	if (cnt != mddev->raid_disks) {
-		printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
-		       "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
+		pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n",
+			mdname(mddev), cnt, mddev->raid_disks);
 		goto abort;
 	}
 	zone->nb_dev = cnt;
...
@@ -357,8 +358,7 @@ static int raid0_run(struct mddev *mddev)
 	int ret;

 	if (mddev->chunk_sectors == 0) {
-		printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n",
-		       mdname(mddev));
+		pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev));
 		return -EINVAL;
 	}
 	if (md_check_no_bitmap(mddev))
...
@@ -399,7 +399,7 @@ static int raid0_run(struct mddev *mddev)
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));

-	printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
-	       mdname(mddev),
-	       (unsigned long long)mddev->array_sectors);
+	pr_debug("md/raid0:%s: md_size is %llu sectors.\n",
+		 mdname(mddev),
+		 (unsigned long long)mddev->array_sectors);
...
@@ -464,7 +464,8 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	}

 	do {
-		sector_t sector = bio->bi_iter.bi_sector;
+		sector_t bio_sector = bio->bi_iter.bi_sector;
+		sector_t sector = bio_sector;
 		unsigned chunk_sects = mddev->chunk_sectors;

 		unsigned sectors = chunk_sects -
...
@@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 			 : sector_div(sector, chunk_sects));

 		/* Restore due to sector_div */
-		sector = bio->bi_iter.bi_sector;
+		sector = bio_sector;

 		if (sectors < bio_sectors(bio)) {
 			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
...
@@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 			     !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
 			/* Just ignore it */
 			bio_endio(split);
-		} else
+		} else {
+			if (mddev->gendisk)
+				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
+						      split, disk_devt(mddev->gendisk),
+						      bio_sector);
 			generic_make_request(split);
+		}
 	} while (split != bio);
 }
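The new else-branch calls trace_block_bio_remap() before generic_make_request(), so blktrace can attribute each split chunk back to the md device sector it came from (bio_sector was saved before sector_div() consumed the working copy). A runnable userspace model of the remap bookkeeping (the struct and names are illustrative, not the block layer's):

#include <stdio.h>

struct remap_ev { const char *dev; unsigned long long from_sector; };

static void trace_remap_model(struct remap_ev ev)
{
    printf("remap: %s from sector %llu\n", ev.dev, ev.from_sector);
}

int main(void)
{
    struct remap_ev ev = { .dev = "md0", .from_sector = 2048 };
    trace_remap_model(ev);
    /* ...then the split bio is submitted to the member disk */
    return 0;
}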
...
@@ -509,7 +515,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 	struct r0conf *priv_conf;

 	if (mddev->degraded != 1) {
-		printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
+		pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
 			mdname(mddev),
 			mddev->degraded);
 		return ERR_PTR(-EINVAL);
...
@@ -518,7 +524,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 	rdev_for_each(rdev, mddev) {
 		/* check slot number for a disk */
 		if (rdev->raid_disk == mddev->raid_disks-1) {
-			printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
+			pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n",
 				mdname(mddev));
 			return ERR_PTR(-EINVAL);
 		}
...
@@ -533,8 +539,11 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
 	mddev->delta_disks = -1;
 	/* make sure it will be not marked as dirty */
 	mddev->recovery_cp = MaxSector;
+	clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+	clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);

 	create_strip_zones(mddev, &priv_conf);
+
 	return priv_conf;
 }
...
@@ -549,18 +558,18 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
 	 *  - all mirrors must be already degraded
 	 */
 	if (mddev->layout != ((1 << 8) + 2)) {
-		printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
-		       mdname(mddev),
-		       mddev->layout);
+		pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
+			mdname(mddev),
+			mddev->layout);
 		return ERR_PTR(-EINVAL);
 	}
 	if (mddev->raid_disks & 1) {
-		printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
-		       mdname(mddev));
+		pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
+			mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 	if (mddev->degraded != (mddev->raid_disks>>1)) {
-		printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n",
-		       mdname(mddev));
+		pr_warn("md/raid0:%s: All mirrors must be already degraded!\n",
+			mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
...
@@ -574,6 +583,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
 	mddev->degraded = 0;
 	/* make sure it will be not marked as dirty */
 	mddev->recovery_cp = MaxSector;
+	clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

 	create_strip_zones(mddev, &priv_conf);
 	return priv_conf;
...
@@ -588,7 +598,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
 	 *  - (N - 1) mirror drives must be already faulty
 	 */
 	if ((mddev->raid_disks - 1) != mddev->degraded) {
-		printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
+		pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
...
@@ -616,6 +626,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
 	mddev->raid_disks = 1;
 	/* make sure it will be not marked as dirty */
 	mddev->recovery_cp = MaxSector;
+	clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

 	create_strip_zones(mddev, &priv_conf);
 	return priv_conf;
...
@@ -631,7 +642,7 @@ static void *raid0_takeover(struct mddev *mddev)
 	 */
 	if (mddev->bitmap) {
-		printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n",
-		       mdname(mddev));
+		pr_warn("md/raid0: %s: cannot takeover array with bitmap\n",
+			mdname(mddev));
 		return ERR_PTR(-EBUSY);
 	}
...
@@ -642,7 +653,7 @@ static void *raid0_takeover(struct mddev *mddev)
 		if (mddev->layout == ALGORITHM_PARITY_N)
 			return raid0_takeover_raid45(mddev);

-		printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
+		pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
 			mdname(mddev), ALGORITHM_PARITY_N);
 	}
...
@@ -652,7 +663,7 @@ static void *raid0_takeover(struct mddev *mddev)
 	if (mddev->level == 1)
 		return raid0_takeover_raid1(mddev);

-	printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n",
+	pr_warn("Takeover from raid%i to raid0 not supported\n",
 		mddev->level);

 	return ERR_PTR(-EINVAL);
...
drivers/md/raid1.c
浏览文件 @ 20737738
...
@@ -37,6 +37,7 @@
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "raid1.h"
 #include "bitmap.h"
...
@@ -70,6 +71,9 @@ static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
 			  sector_t bi_sector);
 static void lower_barrier(struct r1conf *conf);

+#define raid1_log(md, fmt, args...)				\
+	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
+
 static void *r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
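raid1_log() only emits when the mddev has a request queue, so the trace points added below cost nothing on arrays without one. A runnable userspace model of the same guard-and-format macro shape (fprintf stands in for blk_add_trace_msg):

#include <stdio.h>

struct md_model { FILE *queue; };

#define raid1_log_model(md, fmt, args...) \
    do { if ((md)->queue) fprintf((md)->queue, "raid1 " fmt "\n", ##args); } while (0)

int main(void)
{
    struct md_model md = { .queue = stdout };
    raid1_log_model(&md, "wait barrier");
    raid1_log_model(&md, "wait rdev %d blocked", 3);
    md.queue = NULL;
    raid1_log_model(&md, "this one is suppressed");
    return 0;
}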
...
@@ -325,6 +329,11 @@ static void raid1_end_read_request(struct bio *bio)

 	if (uptodate)
 		set_bit(R1BIO_Uptodate, &r1_bio->state);
+	else if (test_bit(FailFast, &rdev->flags) &&
+		 test_bit(R1BIO_FailFast, &r1_bio->state))
+		/* This was a fail-fast read so we definitely
+		 * want to retry */
+		;
 	else {
 		/* If all other devices have failed, we want to return
 		 * the error upwards rather than fail the last device.
...
@@ -347,12 +356,9 @@ static void raid1_end_read_request(struct bio *bio)
 		 * oops, read error:
 		 */
 		char b[BDEVNAME_SIZE];
-		printk_ratelimited(
-			KERN_ERR "md/raid1:%s: %s: "
-			"rescheduling sector %llu\n",
-			mdname(conf->mddev),
-			bdevname(rdev->bdev,
-				 b),
-			(unsigned long long)r1_bio->sector);
+		pr_err_ratelimited("md/raid1:%s: %s: rescheduling sector %llu\n",
+				   mdname(conf->mddev),
+				   bdevname(rdev->bdev, b),
+				   (unsigned long long)r1_bio->sector);
 		set_bit(R1BIO_ReadError, &r1_bio->state);
 		reschedule_retry(r1_bio);
...
@@ -416,6 +422,23 @@ static void raid1_end_write_request(struct bio *bio)
 			set_bit(MD_RECOVERY_NEEDED, &
 				conf->mddev->recovery);

-		set_bit(R1BIO_WriteError, &r1_bio->state);
+		if (test_bit(FailFast, &rdev->flags) &&
+		    (bio->bi_opf & MD_FAILFAST) &&
+		    /* We never try FailFast to WriteMostly devices */
+		    !test_bit(WriteMostly, &rdev->flags)) {
+			md_error(r1_bio->mddev, rdev);
+			if (!test_bit(Faulty, &rdev->flags))
+				/* This is the only remaining device,
+				 * We need to retry the write without
+				 * FailFast
+				 */
+				set_bit(R1BIO_WriteError, &r1_bio->state);
+			else {
+				/* Finished with this branch */
+				r1_bio->bios[mirror] = NULL;
+				to_put = bio;
+			}
+		} else
+			set_bit(R1BIO_WriteError, &r1_bio->state);
 	} else {
 		/*
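In words: a fail-fast write error first tells md_error() about the device; if the device really went Faulty the mirror leg is simply dropped, but if it survived (it was the last device, which md_error() refuses to fail) the write must be retried without MD_FAILFAST. A runnable model of that decision:

#include <stdio.h>

static void on_failfast_write_error(int device_became_faulty)
{
    if (!device_became_faulty)
        printf("set WriteError: retry the write without fail-fast\n");
    else
        printf("drop this branch: the mirror leg is gone\n");
}

int main(void)
{
    on_failfast_write_error(1);   /* redundancy remained; leg failed */
    on_failfast_write_error(0);   /* last device; must not be failed */
    return 0;
}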
...
@@ -534,6 +557,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	best_good_sectors = 0;
 	has_nonrot_disk = 0;
 	choose_next_idle = 0;
+	clear_bit(R1BIO_FailFast, &r1_bio->state);

 	if ((conf->mddev->recovery_cp < this_sector + sectors) ||
 	    (mddev_is_clustered(conf->mddev) &&
...
@@ -607,6 +631,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 		} else
 			best_good_sectors = sectors;

+		if (best_disk >= 0)
+			/* At least two disks to choose from so failfast is OK */
+			set_bit(R1BIO_FailFast, &r1_bio->state);
+
 		nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
 		has_nonrot_disk |= nonrot;
 		pending = atomic_read(&rdev->nr_pending);
...
@@ -645,11 +673,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 			}
 			break;
 		}
-		/* If device is idle, use it */
-		if (pending == 0) {
-			best_disk = disk;
-			break;
-		}
 		if (choose_next_idle)
 			continue;
...
@@ -672,7 +695,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	 * mixed rotational/non-rotational disks depending on workload.
 	 */
 	if (best_disk == -1) {
-		if (has_nonrot_disk)
+		if (has_nonrot_disk || min_pending == 0)
 			best_disk = best_pending_disk;
 		else
 			best_disk = best_dist_disk;
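With the "first idle disk wins" shortcut removed from the loop, an idle disk is now considered together with non-rotational media in the fallback: if any disk has no pending I/O (min_pending == 0), the least-loaded disk is preferred over the closest-head disk. A runnable model of just that decision:

#include <stdio.h>

static int pick_fallback(int has_nonrot_disk, int min_pending,
                         int best_pending_disk, int best_dist_disk)
{
    if (has_nonrot_disk || min_pending == 0)
        return best_pending_disk;
    return best_dist_disk;
}

int main(void)
{
    printf("rotational disks, one idle -> disk %d\n", pick_fallback(0, 0, 2, 1));
    printf("rotational disks, all busy -> disk %d\n", pick_fallback(0, 3, 2, 1));
    return 0;
}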
...
@@ -745,8 +768,13 @@ static void flush_pending_writes(struct r1conf *conf)

 		while (bio) { /* submit pending writes */
 			struct bio *next = bio->bi_next;
+			struct md_rdev *rdev = (void*)bio->bi_bdev;
 			bio->bi_next = NULL;
-			if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
+			bio->bi_bdev = rdev->bdev;
+			if (test_bit(Faulty, &rdev->flags)) {
+				bio->bi_error = -EIO;
+				bio_endio(bio);
+			} else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
 			    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 				/* Just ignore it */
 				bio_endio(bio);
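The plumbing trick above: while a write sits on the pending list, bi_bdev temporarily holds the md_rdev pointer (stashed by the submission path, as the raid1_make_request hunk below shows), so the flusher can re-check device state; it restores the real bdev and fails the bio with -EIO if the device went Faulty in the meantime. A runnable model of that late re-check:

#include <stdio.h>

struct qwrite { int dev_faulty; };

static void flush_one(struct qwrite *w)
{
    if (w->dev_faulty)
        printf("end bio with -EIO (device failed while queued)\n");
    else
        printf("submit bio to member disk\n");
}

int main(void)
{
    struct qwrite ok = { .dev_faulty = 0 }, bad = { .dev_faulty = 1 };
    flush_one(&ok);
    flush_one(&bad);
    return 0;
}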
...
@@ -832,7 +860,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
...
@@ -832,7 +860,7 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
else
if
(
conf
->
barrier
&&
bio_data_dir
(
bio
)
==
WRITE
)
{
else
if
(
conf
->
barrier
&&
bio_data_dir
(
bio
)
==
WRITE
)
{
if
((
conf
->
mddev
->
curr_resync_completed
if
((
conf
->
mddev
->
curr_resync_completed
>=
bio_end_sector
(
bio
))
||
>=
bio_end_sector
(
bio
))
||
(
conf
->
next_resync
+
NEXT_NORMALIO_DISTANCE
(
conf
->
start_next_window
+
NEXT_NORMALIO_DISTANCE
<=
bio
->
bi_iter
.
bi_sector
))
<=
bio
->
bi_iter
.
bi_sector
))
wait
=
false
;
wait
=
false
;
else
else
...
@@ -858,6 +886,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
...
@@ -858,6 +886,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
* that queue to allow conf->start_next_window
* that queue to allow conf->start_next_window
* to increase.
* to increase.
*/
*/
raid1_log
(
conf
->
mddev
,
"wait barrier"
);
wait_event_lock_irq
(
conf
->
wait_barrier
,
wait_event_lock_irq
(
conf
->
wait_barrier
,
!
conf
->
array_frozen
&&
!
conf
->
array_frozen
&&
(
!
conf
->
barrier
||
(
!
conf
->
barrier
||
...
@@ -937,6 +966,7 @@ static void freeze_array(struct r1conf *conf, int extra)
...
@@ -937,6 +966,7 @@ static void freeze_array(struct r1conf *conf, int extra)
*/
*/
spin_lock_irq
(
&
conf
->
resync_lock
);
spin_lock_irq
(
&
conf
->
resync_lock
);
conf
->
array_frozen
=
1
;
conf
->
array_frozen
=
1
;
raid1_log
(
conf
->
mddev
,
"wait freeze"
);
wait_event_lock_irq_cmd
(
conf
->
wait_barrier
,
wait_event_lock_irq_cmd
(
conf
->
wait_barrier
,
conf
->
nr_pending
==
conf
->
nr_queued
+
extra
,
conf
->
nr_pending
==
conf
->
nr_queued
+
extra
,
conf
->
resync_lock
,
conf
->
resync_lock
,
...
@@ -1019,8 +1049,13 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
...
@@ -1019,8 +1049,13 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
while
(
bio
)
{
/* submit pending writes */
while
(
bio
)
{
/* submit pending writes */
struct
bio
*
next
=
bio
->
bi_next
;
struct
bio
*
next
=
bio
->
bi_next
;
struct
md_rdev
*
rdev
=
(
void
*
)
bio
->
bi_bdev
;
bio
->
bi_next
=
NULL
;
bio
->
bi_next
=
NULL
;
if
(
unlikely
((
bio_op
(
bio
)
==
REQ_OP_DISCARD
)
&&
bio
->
bi_bdev
=
rdev
->
bdev
;
if
(
test_bit
(
Faulty
,
&
rdev
->
flags
))
{
bio
->
bi_error
=
-
EIO
;
bio_endio
(
bio
);
}
else
if
(
unlikely
((
bio_op
(
bio
)
==
REQ_OP_DISCARD
)
&&
!
blk_queue_discard
(
bdev_get_queue
(
bio
->
bi_bdev
))))
!
blk_queue_discard
(
bdev_get_queue
(
bio
->
bi_bdev
))))
/* Just ignore it */
/* Just ignore it */
bio_endio
(
bio
);
bio_endio
(
bio
);
...
@@ -1136,6 +1171,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
...
@@ -1136,6 +1171,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
* take care not to over-take any writes
* take care not to over-take any writes
* that are 'behind'
* that are 'behind'
*/
*/
raid1_log
(
mddev
,
"wait behind writes"
);
wait_event
(
bitmap
->
behind_wait
,
wait_event
(
bitmap
->
behind_wait
,
atomic_read
(
&
bitmap
->
behind_writes
)
==
0
);
atomic_read
(
&
bitmap
->
behind_writes
)
==
0
);
}
}
...
@@ -1153,8 +1189,16 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
...
@@ -1153,8 +1189,16 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
read_bio
->
bi_bdev
=
mirror
->
rdev
->
bdev
;
read_bio
->
bi_bdev
=
mirror
->
rdev
->
bdev
;
read_bio
->
bi_end_io
=
raid1_end_read_request
;
read_bio
->
bi_end_io
=
raid1_end_read_request
;
bio_set_op_attrs
(
read_bio
,
op
,
do_sync
);
bio_set_op_attrs
(
read_bio
,
op
,
do_sync
);
if
(
test_bit
(
FailFast
,
&
mirror
->
rdev
->
flags
)
&&
test_bit
(
R1BIO_FailFast
,
&
r1_bio
->
state
))
read_bio
->
bi_opf
|=
MD_FAILFAST
;
read_bio
->
bi_private
=
r1_bio
;
read_bio
->
bi_private
=
r1_bio
;
if
(
mddev
->
gendisk
)
trace_block_bio_remap
(
bdev_get_queue
(
read_bio
->
bi_bdev
),
read_bio
,
disk_devt
(
mddev
->
gendisk
),
r1_bio
->
sector
);
if
(
max_sectors
<
r1_bio
->
sectors
)
{
if
(
max_sectors
<
r1_bio
->
sectors
)
{
/* could not read all from this device, so we will
/* could not read all from this device, so we will
* need another r1_bio.
* need another r1_bio.
...
@@ -1195,6 +1239,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
...
@@ -1195,6 +1239,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
*/
*/
if
(
conf
->
pending_count
>=
max_queued_requests
)
{
if
(
conf
->
pending_count
>=
max_queued_requests
)
{
md_wakeup_thread
(
mddev
->
thread
);
md_wakeup_thread
(
mddev
->
thread
);
raid1_log
(
mddev
,
"wait queued"
);
wait_event
(
conf
->
wait_barrier
,
wait_event
(
conf
->
wait_barrier
,
conf
->
pending_count
<
max_queued_requests
);
conf
->
pending_count
<
max_queued_requests
);
}
}
...
@@ -1286,6 +1331,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
...
@@ -1286,6 +1331,7 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
rdev_dec_pending
(
conf
->
mirrors
[
j
].
rdev
,
mddev
);
rdev_dec_pending
(
conf
->
mirrors
[
j
].
rdev
,
mddev
);
r1_bio
->
state
=
0
;
r1_bio
->
state
=
0
;
allow_barrier
(
conf
,
start_next_window
,
bio
->
bi_iter
.
bi_sector
);
allow_barrier
(
conf
,
start_next_window
,
bio
->
bi_iter
.
bi_sector
);
raid1_log
(
mddev
,
"wait rdev %d blocked"
,
blocked_rdev
->
raid_disk
);
md_wait_for_blocked_rdev
(
blocked_rdev
,
mddev
);
md_wait_for_blocked_rdev
(
blocked_rdev
,
mddev
);
start_next_window
=
wait_barrier
(
conf
,
bio
);
start_next_window
=
wait_barrier
(
conf
,
bio
);
/*
/*
...
@@ -1363,10 +1409,21 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
...
@@ -1363,10 +1409,21 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
mbio
->
bi_bdev
=
conf
->
mirrors
[
i
].
rdev
->
bdev
;
mbio
->
bi_bdev
=
conf
->
mirrors
[
i
].
rdev
->
bdev
;
mbio
->
bi_end_io
=
raid1_end_write_request
;
mbio
->
bi_end_io
=
raid1_end_write_request
;
bio_set_op_attrs
(
mbio
,
op
,
do_flush_fua
|
do_sync
);
bio_set_op_attrs
(
mbio
,
op
,
do_flush_fua
|
do_sync
);
if
(
test_bit
(
FailFast
,
&
conf
->
mirrors
[
i
].
rdev
->
flags
)
&&
!
test_bit
(
WriteMostly
,
&
conf
->
mirrors
[
i
].
rdev
->
flags
)
&&
conf
->
raid_disks
-
mddev
->
degraded
>
1
)
mbio
->
bi_opf
|=
MD_FAILFAST
;
mbio
->
bi_private
=
r1_bio
;
mbio
->
bi_private
=
r1_bio
;
atomic_inc
(
&
r1_bio
->
remaining
);
atomic_inc
(
&
r1_bio
->
remaining
);
if
(
mddev
->
gendisk
)
trace_block_bio_remap
(
bdev_get_queue
(
mbio
->
bi_bdev
),
mbio
,
disk_devt
(
mddev
->
gendisk
),
r1_bio
->
sector
);
/* flush_pending_writes() needs access to the rdev so...*/
mbio
->
bi_bdev
=
(
void
*
)
conf
->
mirrors
[
i
].
rdev
;
cb
=
blk_check_plugged
(
raid1_unplug
,
mddev
,
sizeof
(
*
plug
));
cb
=
blk_check_plugged
(
raid1_unplug
,
mddev
,
sizeof
(
*
plug
));
if
(
cb
)
if
(
cb
)
plug
=
container_of
(
cb
,
struct
raid1_plug_cb
,
cb
);
plug
=
container_of
(
cb
,
struct
raid1_plug_cb
,
cb
);
...
@@ -1436,6 +1493,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
...
@@ -1436,6 +1493,7 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
* next level up know.
* next level up know.
* else mark the drive as failed
* else mark the drive as failed
*/
*/
spin_lock_irqsave
(
&
conf
->
device_lock
,
flags
);
if
(
test_bit
(
In_sync
,
&
rdev
->
flags
)
if
(
test_bit
(
In_sync
,
&
rdev
->
flags
)
&&
(
conf
->
raid_disks
-
mddev
->
degraded
)
==
1
)
{
&&
(
conf
->
raid_disks
-
mddev
->
degraded
)
==
1
)
{
/*
/*
...
@@ -1445,10 +1503,10 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
 		 * it is very likely to fail.
 		 */
 		conf->recovery_disabled = mddev->recovery_disabled;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		return;
 	}
 	set_bit(Blocked, &rdev->flags);
-	spin_lock_irqsave(&conf->device_lock, flags);
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
 		mddev->degraded++;
 		set_bit(Faulty, &rdev->flags);
...
@@ -1459,10 +1517,9 @@ static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
 	 * if recovery is running, make sure it aborts.
 	 */
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-	set_mask_bits(&mddev->flags, 0,
-		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
-	printk(KERN_ALERT
-	       "md/raid1:%s: Disk failure on %s, disabling device.\n"
-	       "md/raid1:%s: Operation continuing on %d devices.\n",
-	       mdname(mddev), bdevname(rdev->bdev, b),
-	       mdname(mddev), conf->raid_disks - mddev->degraded);
+	set_mask_bits(&mddev->sb_flags, 0,
+		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
+	pr_crit("md/raid1:%s: Disk failure on %s, disabling device.\n"
+		"md/raid1:%s: Operation continuing on %d devices.\n",
+		mdname(mddev), bdevname(rdev->bdev, b),
+		mdname(mddev), conf->raid_disks - mddev->degraded);
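Two things change in this error path at once: the superblock-change bits move to the new sb_flags word (see the note after the raid1d hunks below), and they are still applied with set_mask_bits(), which updates several bits in one atomic compare-and-swap loop so "device list changed" and "update pending" become visible together. Here it is called with mask 0, i.e. an atomic OR of the two bits. A userspace model of the macro's shape (the real one lives in include/linux/bitops.h; return-value details omitted):

#include <stdatomic.h>

/* Atomically replace the bits selected by 'mask' with 'bits':
 * new = (old & ~mask) | bits, retrying until the CAS wins. */
static void set_mask_bits_model(_Atomic unsigned long *p,
                                unsigned long mask, unsigned long bits)
{
	unsigned long old = atomic_load(p);

	while (!atomic_compare_exchange_weak(p, &old, (old & ~mask) | bits))
		;	/* 'old' is refreshed by the failed exchange */
}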
...
@@ -1472,12 +1529,12 @@ static void print_conf(struct r1conf *conf)
 {
 	int i;

-	printk(KERN_DEBUG "RAID1 conf printout:\n");
+	pr_debug("RAID1 conf printout:\n");
 	if (!conf) {
-		printk(KERN_DEBUG "(!conf)\n");
+		pr_debug("(!conf)\n");
 		return;
 	}
-	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
-	       conf->raid_disks);
+	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+		 conf->raid_disks);

 	rcu_read_lock();
...
@@ -1485,7 +1542,7 @@ static void print_conf(struct r1conf *conf)
 		char b[BDEVNAME_SIZE];
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev)
-			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
+			pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
 			       i, !test_bit(In_sync, &rdev->flags),
 			       !test_bit(Faulty, &rdev->flags),
 			       bdevname(rdev->bdev, b));
...
@@ -1788,12 +1845,24 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 	sector_t sect = r1_bio->sector;
 	int sectors = r1_bio->sectors;
 	int idx = 0;
+	struct md_rdev *rdev;
+
+	rdev = conf->mirrors[r1_bio->read_disk].rdev;
+	if (test_bit(FailFast, &rdev->flags)) {
+		/* Don't try recovering from here - just fail it
+		 * ... unless it is the last working device of course */
+		md_error(mddev, rdev);
+		if (test_bit(Faulty, &rdev->flags))
+			/* Don't try to read from here, but make sure
+			 * put_buf does it's thing
+			 */
+			bio->bi_end_io = end_sync_write;
+	}

 	while(sectors) {
 		int s = sectors;
 		int d = r1_bio->read_disk;
 		int success = 0;
-		struct md_rdev *rdev;
 		int start;

 		if (s > (PAGE_SIZE>>9))
...
@@ -1825,8 +1894,7 @@ static int fix_sync_read_error(struct r1bio *r1_bio)
 			 * work just disable and interrupt the recovery.
 			 * Don't fail devices as that won't really help.
 			 */
-			printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
-			       " for block %llu\n",
-			       mdname(mddev),
-			       bdevname(bio->bi_bdev, b),
-			       (unsigned long long)r1_bio->sector);
+			pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
+					    mdname(mddev),
+					    bdevname(bio->bi_bdev, b),
+					    (unsigned long long)r1_bio->sector);
...
@@ -2013,6 +2081,9 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 			continue;

 		bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
+		if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
+			wbio->bi_opf |= MD_FAILFAST;
+
 		wbio->bi_end_io = end_sync_write;
 		atomic_inc(&r1_bio->remaining);
 		md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
...
@@ -2122,9 +2193,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
 			if (r1_sync_page_io(rdev, sect, s,
 					    conf->tmppage, READ)) {
 				atomic_add(s, &rdev->corrected_errors);
-				printk(KERN_INFO
-				       "md/raid1:%s: read error corrected "
-				       "(%d sectors at %llu on %s)\n",
-				       mdname(mddev), s,
-				       (unsigned long long)(sect +
-					   rdev->data_offset),
+				pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
+					mdname(mddev), s,
+					(unsigned long long)(sect +
+							     rdev->data_offset),
...
@@ -2287,6 +2356,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	struct bio *bio;
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev;
+	dev_t bio_dev;
+	sector_t bio_sector;

 	clear_bit(R1BIO_ReadError, &r1_bio->state);
 	/* we got a read error. Maybe the drive is bad.  Maybe just
...
@@ -2300,10 +2371,14 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 	 */
 	bio = r1_bio->bios[r1_bio->read_disk];
 	bdevname(bio->bi_bdev, b);
+	bio_dev = bio->bi_bdev->bd_dev;
+	bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
 	bio_put(bio);
 	r1_bio->bios[r1_bio->read_disk] = NULL;

-	if (mddev->ro == 0) {
+	rdev = conf->mirrors[r1_bio->read_disk].rdev;
+	if (mddev->ro == 0
+	    && !test_bit(FailFast, &rdev->flags)) {
 		freeze_array(conf, 1);
 		fix_read_error(conf, r1_bio->read_disk,
 			       r1_bio->sector, r1_bio->sectors);
...
@@ -2312,13 +2387,12 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 		r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
 	}

-	rdev_dec_pending(conf->mirrors[r1_bio->read_disk].rdev, conf->mddev);
+	rdev_dec_pending(rdev, conf->mddev);

 read_more:
 	disk = read_balance(conf, r1_bio, &max_sectors);
 	if (disk == -1) {
-		printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
-		       " read error for block %llu\n",
-		       mdname(mddev), b, (unsigned long long)r1_bio->sector);
+		pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
+				    mdname(mddev), b, (unsigned long long)r1_bio->sector);
 		raid_end_bio_io(r1_bio);
 	} else {
...
@@ -2330,9 +2404,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 					 max_sectors);
 		r1_bio->bios[r1_bio->read_disk] = bio;
 		rdev = conf->mirrors[disk].rdev;
-		printk_ratelimited(KERN_ERR
-				   "md/raid1:%s: redirecting sector %llu"
-				   " to other mirror: %s\n",
-				   mdname(mddev),
-				   (unsigned long long)r1_bio->sector,
-				   bdevname(rdev->bdev, b));
+		pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
+				    mdname(mddev),
+				    (unsigned long long)r1_bio->sector,
+				    bdevname(rdev->bdev, b));
...
@@ -2340,6 +2412,9 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 		bio->bi_bdev = rdev->bdev;
 		bio->bi_end_io = raid1_end_read_request;
 		bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
+		if (test_bit(FailFast, &rdev->flags) &&
+		    test_bit(R1BIO_FailFast, &r1_bio->state))
+			bio->bi_opf |= MD_FAILFAST;
 		bio->bi_private = r1_bio;
 		if (max_sectors < r1_bio->sectors) {
 			/* Drat - have to split this up more */
...
@@ -2353,6 +2428,8 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 			else
 				mbio->bi_phys_segments++;
 			spin_unlock_irq(&conf->device_lock);
+			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+					      bio, bio_dev, bio_sector);
 			generic_make_request(bio);
 			bio = NULL;
...
@@ -2367,9 +2444,12 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
 				sectors_handled;

 			goto read_more;
-		} else
+		} else {
+			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+					      bio, bio_dev, bio_sector);
 			generic_make_request(bio);
+		}
 	}
 }

 static void raid1d(struct md_thread *thread)
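Both branches of this retry path now emit a block-trace remap event right before generic_make_request(), using the bio_dev/bio_sector snapshot taken earlier, before the failed bio was put. That is what lets blktrace on the md device follow a redirected read down to the mirror it actually lands on. A small model of why the origin is captured ahead of time (plain C; names invented for the demo):

#include <stdio.h>

/* By the time the read is re-issued the bio already points at the
 * replacement mirror, so the "where it used to be" pair must have been
 * saved before the original bio was released. */
struct remap_model {
	int orig_dev;                   /* captured before bio_put() */
	unsigned long long orig_sector; /* captured before bio_put() */
};

static void trace_remap_model(const struct remap_model *r, int new_dev)
{
	printf("remap: dev %d sector %llu -> dev %d\n",
	       r->orig_dev, r->orig_sector, new_dev);
}

int main(void)
{
	struct remap_model r = { .orig_dev = 8, .orig_sector = 4096 };

	trace_remap_model(&r, 9);
	return 0;
}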
...
@@ -2384,10 +2464,10 @@ static void raid1d(struct md_thread *thread)
 	md_check_recovery(mddev);

 	if (!list_empty_careful(&conf->bio_end_io_list) &&
-	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 		LIST_HEAD(tmp);
 		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 			while (!list_empty(&conf->bio_end_io_list)) {
 				list_move(conf->bio_end_io_list.prev, &tmp);
 				conf->nr_queued--;
...
@@ -2441,7 +2521,7 @@ static void raid1d(struct md_thread *thread)
 			generic_make_request(r1_bio->bios[r1_bio->read_disk]);

 		cond_resched();
-		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
 			md_check_recovery(mddev);
 	}
 	blk_finish_plug(&plug);
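The MD_CHANGE_* renames in these raid1d hunks are part of a tree-wide change in this merge: the "superblock needs updating" request bits move out of the multi-purpose mddev->flags word into a dedicated mddev->sb_flags word as MD_SB_CHANGE_DEVS/CLEAN/PENDING. A test like the one above ("anything besides the in-flight PENDING marker set?") can then no longer be confused by unrelated state bits sharing the word. A compacted model of the after state (plain C, invented short names, not the kernel definitions):

enum sb_change { SB_CHANGE_DEVS, SB_CHANGE_CLEAN, SB_CHANGE_PENDING };

struct mddev_model {
	unsigned long flags;    /* array state bits only */
	unsigned long sb_flags; /* queued superblock updates only */
};

/* The raid1d test above after the rename: is any superblock update
 * queued beyond the PENDING marker of the write already in flight? */
static int sb_update_queued(const struct mddev_model *md)
{
	return (md->sb_flags & ~(1UL << SB_CHANGE_PENDING)) != 0;
}

int main(void)
{
	struct mddev_model md = { 0, 1UL << SB_CHANGE_DEVS };

	return !sb_update_queued(&md);
}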
...
@@ -2623,6 +2703,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
 			bio->bi_bdev = rdev->bdev;
 			bio->bi_private = r1_bio;
+			if (test_bit(FailFast, &rdev->flags))
+				bio->bi_opf |= MD_FAILFAST;
 		}
 	}
 	rcu_read_unlock();
...
@@ -2642,7 +2724,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 						min_bad, 0
 					) && ok;
 		}
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		*skipped = 1;
 		put_buf(r1_bio);
...
@@ -2753,6 +2835,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 			if (bio->bi_end_io == end_sync_read) {
 				read_targets--;
 				md_sync_acct(bio->bi_bdev, nr_sectors);
+				if (read_targets == 1)
+					bio->bi_opf &= ~MD_FAILFAST;
 				generic_make_request(bio);
 			}
 		}
...
@@ -2760,6 +2844,8 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 		atomic_set(&r1_bio->remaining, 1);
 		bio = r1_bio->bios[r1_bio->read_disk];
 		md_sync_acct(bio->bi_bdev, nr_sectors);
+		if (read_targets == 1)
+			bio->bi_opf &= ~MD_FAILFAST;
 		generic_make_request(bio);
...
@@ -2875,12 +2961,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)

 	err = -ENOMEM;
 	conf->thread = md_register_thread(raid1d, mddev, "raid1");
-	if (!conf->thread) {
-		printk(KERN_ERR
-		       "md/raid1:%s: couldn't allocate thread\n",
-		       mdname(mddev));
+	if (!conf->thread)
 		goto abort;
-	}

 	return conf;
...
@@ -2905,12 +2987,12 @@ static int raid1_run(struct mddev *mddev)
 	bool discard_supported = false;

 	if (mddev->level != 1) {
-		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
-		       mdname(mddev), mddev->level);
+		pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
+			mdname(mddev), mddev->level);
 		return -EIO;
 	}
 	if (mddev->reshape_position != MaxSector) {
-		printk(KERN_ERR "md/raid1:%s: reshape_position set but not supported\n",
-		       mdname(mddev));
+		pr_warn("md/raid1:%s: reshape_position set but not supported\n",
+			mdname(mddev));
 		return -EIO;
 	}
...
@@ -2950,11 +3032,9 @@ static int raid1_run(struct mddev *mddev)
 		mddev->recovery_cp = MaxSector;

 	if (mddev->recovery_cp != MaxSector)
-		printk(KERN_NOTICE "md/raid1:%s: not clean"
-		       " -- starting background reconstruction\n",
-		       mdname(mddev));
-	printk(KERN_INFO
-		"md/raid1:%s: active with %d out of %d mirrors\n",
+		pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
+			mdname(mddev));
+	pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
 		mdname(mddev), mddev->raid_disks - mddev->degraded,
 		mddev->raid_disks);
...
@@ -2964,6 +3044,7 @@ static int raid1_run(struct mddev *mddev)
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 	mddev->private = conf;
+	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

 	md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
...
@@ -3107,8 +3188,7 @@ static int raid1_reshape(struct mddev *mddev)
 			rdev->raid_disk = d2;
 			sysfs_unlink_rdev(mddev, rdev);
 			if (sysfs_link_rdev(mddev, rdev))
-				printk(KERN_WARNING
-				       "md/raid1:%s: cannot register rd%d\n",
-				       mdname(mddev), rdev->raid_disk);
+				pr_warn("md/raid1:%s: cannot register rd%d\n",
+					mdname(mddev), rdev->raid_disk);
 		}
 		if (rdev)
...
@@ -3163,9 +3243,12 @@ static void *raid1_takeover(struct mddev *mddev)
 		mddev->new_layout = 0;
 		mddev->new_chunk_sectors = 0;
 		conf = setup_conf(mddev);
-		if (!IS_ERR(conf))
+		if (!IS_ERR(conf)) {
 			/* Array must appear to be quiesced */
 			conf->array_frozen = 1;
+			clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+			clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);
+		}
 		return conf;
 	}
 	return ERR_PTR(-EINVAL);
...
drivers/md/raid1.h
...
@@ -161,14 +161,15 @@ struct r1bio {
 };

 /* bits for r1bio.state */
-#define R1BIO_Uptodate	0
-#define R1BIO_IsSync	1
-#define R1BIO_Degraded	2
-#define R1BIO_BehindIO	3
+enum r1bio_state {
+	R1BIO_Uptodate,
+	R1BIO_IsSync,
+	R1BIO_Degraded,
+	R1BIO_BehindIO,
 /* Set ReadError on bios that experience a readerror so that
  * raid1d knows what to do with them.
  */
-#define R1BIO_ReadError 4
+	R1BIO_ReadError,
 /* For write-behind requests, we call bi_end_io when
  * the last non-write-behind device completes, providing
  * any write was successful.  Otherwise we call when
...
@@ -176,10 +177,12 @@ struct r1bio {
  * with failure when last write completes (and all failed).
  * Record that bi_end_io was called with this flag...
  */
-#define R1BIO_Returned 6
+	R1BIO_Returned,
 /* If a write for this request means we can clear some
  * known-bad-block records, we set this flag
  */
-#define R1BIO_MadeGood 7
-#define R1BIO_WriteError 8
+	R1BIO_MadeGood,
+	R1BIO_WriteError,
+	R1BIO_FailFast,
+};
 #endif
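Note what the raid1.h conversion quietly does: the old #define list skipped value 5 (R1BIO_Returned was 6), while the enum renumbers everything consecutively and appends the new R1BIO_FailFast. That is safe here because these values are only ever used as bit numbers in the in-memory r1_bio->state word and are never written to disk. A tiny standalone illustration of the same pattern (names invented for the demo):

#include <stdio.h>

/* Enumerators as bit positions in a transient state word: the compiler
 * assigns 0, 1, 2, ... and adding a member never needs manual numbers. */
enum demo_state { DEMO_Uptodate, DEMO_IsSync, DEMO_ReadError, DEMO_FailFast };

int main(void)
{
	unsigned long state = 0;

	state |= 1UL << DEMO_FailFast;
	printf("failfast set: %d\n", !!(state & (1UL << DEMO_FailFast)));
	return 0;
}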
drivers/md/raid10.c
...
@@ -25,6 +25,7 @@
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "raid10.h"
 #include "raid0.h"
...
@@ -99,12 +100,16 @@ static int max_queued_requests = 1024;
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
 static int _enough(struct r10conf *conf, int previous, int ignore);
+static int enough(struct r10conf *conf, int ignore);
 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 				int *skipped);
 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
 static void end_reshape_write(struct bio *bio);
 static void end_reshape(struct r10conf *conf);

+#define raid10_log(md, fmt, args...) \
+	do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
+
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
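The new raid10_log() macro (and its raid1_log() twin used in the raid1.c hunks above) is how the "wait barrier"/"wait queued" annotations in this series reach userspace: each call becomes a blk_add_trace_msg() message in the block trace stream, guarded on (md)->queue so arrays without a request queue skip it. A userspace model of the same guarded printf-style wrapper (GNU C for the ##args paste; stderr stands in for the trace stream):

#include <stdio.h>

struct md_model { int has_queue; };

#define raid10_log_model(md, fmt, args...) \
	do { \
		if ((md)->has_queue) \
			fprintf(stderr, "raid10 " fmt "\n", ##args); \
	} while (0)

int main(void)
{
	struct md_model md = { .has_queue = 1 };

	raid10_log_model(&md, "wait rdev %d blocked", 3);
	return 0;
}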
...
@@ -404,8 +409,7 @@ static void raid10_end_read_request(struct bio *bio)
 		 * oops, read error - keep the refcount on the rdev
 		 */
 		char b[BDEVNAME_SIZE];
-		printk_ratelimited(KERN_ERR
-				   "md/raid10:%s: %s: rescheduling sector %llu\n",
+		pr_err_ratelimited("md/raid10:%s: %s: rescheduling sector %llu\n",
 				   mdname(conf->mddev),
 				   bdevname(rdev->bdev, b),
 				   (unsigned long long)r10_bio->sector);
...
@@ -447,6 +451,7 @@ static void raid10_end_write_request(struct bio *bio)
 	struct r10conf *conf = r10_bio->mddev->private;
 	int slot, repl;
 	struct md_rdev *rdev = NULL;
+	struct bio *to_put = NULL;
 	bool discard_error;

 	discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
...
@@ -474,8 +479,24 @@ static void raid10_end_write_request(struct bio *bio)
 			if (!test_and_set_bit(WantReplacement, &rdev->flags))
 				set_bit(MD_RECOVERY_NEEDED,
 					&rdev->mddev->recovery);

-			set_bit(R10BIO_WriteError, &r10_bio->state);
 			dec_rdev = 0;
+			if (test_bit(FailFast, &rdev->flags) &&
+			    (bio->bi_opf & MD_FAILFAST)) {
+				md_error(rdev->mddev, rdev);
+				if (!test_bit(Faulty, &rdev->flags))
+					/* This is the only remaining device,
+					 * We need to retry the write without
+					 * FailFast
+					 */
+					set_bit(R10BIO_WriteError, &r10_bio->state);
+				else {
+					r10_bio->devs[slot].bio = NULL;
+					to_put = bio;
+					dec_rdev = 1;
+				}
+			} else
+				set_bit(R10BIO_WriteError, &r10_bio->state);
 		}
 	} else {
 		/*
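The completion-side policy added here is the subtle half of write failfast: when a fast-failed write comes back with an error, md_error() is called first; if the device actually went Faulty the bio is simply dropped, because a surviving mirror holds the data, but if md_error() refused to fail it (meaning this was the last device) the write must be retried the slow way, so R10BIO_WriteError is set instead. A compact model of that decision (plain C, heavily simplified; a sketch, not the kernel code):

#include <assert.h>
#include <stdbool.h>

enum wfail_action { DROP_BIO, RETRY_WITHOUT_FAILFAST };

/* 'went_faulty' models test_bit(Faulty, ...) after md_error(): md
 * refuses to fail the last working device, so the bit stays clear. */
static enum wfail_action on_failfast_write_error(bool went_faulty)
{
	return went_faulty ? DROP_BIO : RETRY_WITHOUT_FAILFAST;
}

int main(void)
{
	assert(on_failfast_write_error(true) == DROP_BIO);
	assert(on_failfast_write_error(false) == RETRY_WITHOUT_FAILFAST);
	return 0;
}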
...
@@ -525,6 +546,8 @@ static void raid10_end_write_request(struct bio *bio)
 	one_write_done(r10_bio);
 	if (dec_rdev)
 		rdev_dec_pending(rdev, conf->mddev);
+	if (to_put)
+		bio_put(to_put);
 }

 /*
...
@@ -716,6 +739,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
+	clear_bit(R10BIO_FailFast, &r10_bio->state);
 	/*
 	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on (recovery is ok), or below
...
@@ -780,15 +804,18 @@ static struct md_rdev *read_balance(struct r10conf *conf,
 		if (!do_balance)
 			break;

+		if (best_slot >= 0)
+			/* At least 2 disks to choose from so failfast is OK */
+			set_bit(R10BIO_FailFast, &r10_bio->state);
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays.  So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
 		if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
-			break;
+			new_distance = 0;

 		/* for far > 1 always use the lowest address */
-		if (geo->far_copies > 1)
+		else if (geo->far_copies > 1)
 			new_distance = r10_bio->devs[slot].addr;
 		else
 			new_distance = abs(r10_bio->devs[slot].addr -
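Two distinct behaviours land in this read_balance() hunk. First, R10BIO_FailFast is set as soon as a second usable candidate is seen (best_slot >= 0 on a later iteration), since fast-failing a read is only acceptable when another copy can serve the retry. Second, the idle-disk shortcut for near-copies layouts no longer breaks out of the loop early: the idle disk now competes as distance 0, so the remaining slots are still examined. A sketch of the scoring idea (plain C, heavily simplified relative to the real selection loop):

/* Distance-based candidate scoring, as after this hunk: an idle disk on
 * a near-copies layout is the best possible candidate (distance 0) but
 * no longer terminates the scan, so later slots are still visited. */
struct slot_model { int idle; long long addr; };

static long long score(const struct slot_model *s, long long head_pos,
                       int near_copies)
{
	if (near_copies > 1 && s->idle)
		return 0;                       /* was: break */
	return s->addr > head_pos ? s->addr - head_pos
				  : head_pos - s->addr;
}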
...
@@ -859,8 +886,13 @@ static void flush_pending_writes(struct r10conf *conf)
 		while (bio) { /* submit pending writes */
 			struct bio *next = bio->bi_next;
+			struct md_rdev *rdev = (void *)bio->bi_bdev;
 			bio->bi_next = NULL;
-			if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
+			bio->bi_bdev = rdev->bdev;
+			if (test_bit(Faulty, &rdev->flags)) {
+				bio->bi_error = -EIO;
+				bio_endio(bio);
+			} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
 			    !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 				/* Just ignore it */
 				bio_endio(bio);
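The Faulty check here depends on a trick set up at submission time (visible in the __make_request() hunks further down): while a write bio sits on the plug or pending lists, its bi_bdev field carries the md_rdev pointer instead of a block device, and the flush path restores the real bdev before submitting, or, as added here, fails the bio outright if the device went Faulty while queued. A minimal model of the stash-and-restore pattern (plain C; the kernel casts through void *):

struct bdev_model { int dummy; };
struct rdev_model { int faulty; struct bdev_model *bdev; };
struct bio_model  { void *bi_bdev; int bi_error; };

static void queue_for_flush(struct bio_model *bio, struct rdev_model *rdev)
{
	bio->bi_bdev = rdev;            /* smuggle the rdev, not the bdev */
}

static int flush_one(struct bio_model *bio)
{
	struct rdev_model *rdev = bio->bi_bdev;

	bio->bi_bdev = rdev->bdev;      /* restore the real device */
	if (rdev->faulty) {
		bio->bi_error = -5;     /* -EIO: complete with error */
		return -5;
	}
	return 0;                       /* would be submitted normally */
}

int main(void)
{
	struct bdev_model bdev;
	struct rdev_model rdev = { .faulty = 1, .bdev = &bdev };
	struct bio_model bio = { 0 };

	queue_for_flush(&bio, &rdev);
	return flush_one(&bio) == -5 ? 0 : 1;
}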
...
@@ -937,6 +969,7 @@ static void wait_barrier(struct r10conf *conf)
 		 * that queue to get the nr_pending
 		 * count down.
 		 */
+		raid10_log(conf->mddev, "wait barrier");
 		wait_event_lock_irq(conf->wait_barrier,
 				    !conf->barrier ||
 				    (atomic_read(&conf->nr_pending) &&
...
@@ -1037,8 +1070,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;
+		struct md_rdev *rdev = (void *)bio->bi_bdev;
 		bio->bi_next = NULL;
-		if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
+		bio->bi_bdev = rdev->bdev;
+		if (test_bit(Faulty, &rdev->flags)) {
+			bio->bi_error = -EIO;
+			bio_endio(bio);
+		} else if (unlikely((bio_op(bio) ==  REQ_OP_DISCARD) &&
 			     !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
 			/* Just ignore it */
 			bio_endio(bio);
...
@@ -1083,6 +1121,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 		/* IO spans the reshape position.  Need to wait for
 		 * reshape to pass
 		 */
+		raid10_log(conf->mddev, "wait reshape");
 		allow_barrier(conf);
 		wait_event(conf->wait_barrier,
 			   conf->reshape_progress <= bio->bi_iter.bi_sector ||
...
@@ -1099,11 +1138,12 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 		     bio->bi_iter.bi_sector < conf->reshape_progress))) {
 		/* Need to update reshape_position in metadata */
 		mddev->reshape_position = conf->reshape_progress;
-		set_mask_bits(&mddev->flags, 0,
-			      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
+		set_mask_bits(&mddev->sb_flags, 0,
+			      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
 		md_wakeup_thread(mddev->thread);
+		raid10_log(conf->mddev, "wait reshape metadata");
 		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
+			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));

 		conf->reshape_safe = mddev->reshape_position;
 	}
...
@@ -1154,8 +1194,15 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 		read_bio->bi_bdev = rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
 		bio_set_op_attrs(read_bio, op, do_sync);
+		if (test_bit(FailFast, &rdev->flags) &&
+		    test_bit(R10BIO_FailFast, &r10_bio->state))
+			read_bio->bi_opf |= MD_FAILFAST;
 		read_bio->bi_private = r10_bio;

+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
+					      read_bio, disk_devt(mddev->gendisk),
+					      r10_bio->sector);
 		if (max_sectors < r10_bio->sectors) {
 			/* Could not read all from this device, so we will
 			 * need another r10_bio.
@@ -1195,6 +1242,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
...
@@ -1195,6 +1242,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
*/
*/
if
(
conf
->
pending_count
>=
max_queued_requests
)
{
if
(
conf
->
pending_count
>=
max_queued_requests
)
{
md_wakeup_thread
(
mddev
->
thread
);
md_wakeup_thread
(
mddev
->
thread
);
raid10_log
(
mddev
,
"wait queued"
);
wait_event
(
conf
->
wait_barrier
,
wait_event
(
conf
->
wait_barrier
,
conf
->
pending_count
<
max_queued_requests
);
conf
->
pending_count
<
max_queued_requests
);
}
}
...
@@ -1322,6 +1370,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
...
@@ -1322,6 +1370,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
}
}
}
}
allow_barrier
(
conf
);
allow_barrier
(
conf
);
raid10_log
(
conf
->
mddev
,
"wait rdev %d blocked"
,
blocked_rdev
->
raid_disk
);
md_wait_for_blocked_rdev
(
blocked_rdev
,
mddev
);
md_wait_for_blocked_rdev
(
blocked_rdev
,
mddev
);
wait_barrier
(
conf
);
wait_barrier
(
conf
);
goto
retry_write
;
goto
retry_write
;
...
@@ -1361,8 +1410,18 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
 			mbio->bi_bdev = rdev->bdev;
 			mbio->bi_end_io	= raid10_end_write_request;
 			bio_set_op_attrs(mbio, op, do_sync | do_fua);
+			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags) &&
+			    enough(conf, d))
+				mbio->bi_opf |= MD_FAILFAST;
 			mbio->bi_private = r10_bio;

+			if (conf->mddev->gendisk)
+				trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
+						      mbio, disk_devt(conf->mddev->gendisk),
+						      r10_bio->sector);
+			/* flush_pending_writes() needs access to the rdev so...*/
+			mbio->bi_bdev = (void *)rdev;
+
 			atomic_inc(&r10_bio->remaining);

 			cb = blk_check_plugged(raid10_unplug, mddev,
@@ -1405,6 +1464,13 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
...
@@ -1405,6 +1464,13 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
bio_set_op_attrs
(
mbio
,
op
,
do_sync
|
do_fua
);
bio_set_op_attrs
(
mbio
,
op
,
do_sync
|
do_fua
);
mbio
->
bi_private
=
r10_bio
;
mbio
->
bi_private
=
r10_bio
;
if
(
conf
->
mddev
->
gendisk
)
trace_block_bio_remap
(
bdev_get_queue
(
mbio
->
bi_bdev
),
mbio
,
disk_devt
(
conf
->
mddev
->
gendisk
),
r10_bio
->
sector
);
/* flush_pending_writes() needs access to the rdev so...*/
mbio
->
bi_bdev
=
(
void
*
)
rdev
;
atomic_inc
(
&
r10_bio
->
remaining
);
atomic_inc
(
&
r10_bio
->
remaining
);
spin_lock_irqsave
(
&
conf
->
device_lock
,
flags
);
spin_lock_irqsave
(
&
conf
->
device_lock
,
flags
);
bio_list_add
(
&
conf
->
pending_bio_list
,
mbio
);
bio_list_add
(
&
conf
->
pending_bio_list
,
mbio
);
...
@@ -1586,11 +1652,10 @@ static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_mask_bits(&mddev->flags, 0,
-		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
+	set_mask_bits(&mddev->sb_flags, 0,
+		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
 	spin_unlock_irqrestore(&conf->device_lock, flags);
-	printk(KERN_ALERT
-	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
-	       "md/raid10:%s: Operation continuing on %d devices.\n",
-	       mdname(mddev), bdevname(rdev->bdev, b),
-	       mdname(mddev), conf->geo.raid_disks - mddev->degraded);
+	pr_crit("md/raid10:%s: Disk failure on %s, disabling device.\n"
+		"md/raid10:%s: Operation continuing on %d devices.\n",
+		mdname(mddev), bdevname(rdev->bdev, b),
+		mdname(mddev), conf->geo.raid_disks - mddev->degraded);
...
@@ -1601,12 +1666,12 @@ static void print_conf(struct r10conf *conf)
 	int i;
 	struct md_rdev *rdev;

-	printk(KERN_DEBUG "RAID10 conf printout:\n");
+	pr_debug("RAID10 conf printout:\n");
 	if (!conf) {
-		printk(KERN_DEBUG "(!conf)\n");
+		pr_debug("(!conf)\n");
 		return;
 	}
-	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
-	       conf->geo.raid_disks);
+	pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
+		 conf->geo.raid_disks);

 	/* This is only called with ->reconfix_mutex held, so
...
@@ -1615,7 +1680,7 @@ static void print_conf(struct r10conf *conf)
 		char b[BDEVNAME_SIZE];
 		rdev = conf->mirrors[i].rdev;
 		if (rdev)
-			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
+			pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
 			       i, !test_bit(In_sync, &rdev->flags),
 			       !test_bit(Faulty, &rdev->flags),
 			       bdevname(rdev->bdev, b));
...
@@ -1953,6 +2018,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	/* now find blocks with errors */
 	for (i=0 ; i < conf->copies ; i++) {
 		int  j, d;
+		struct md_rdev *rdev;

 		tbio = r10_bio->devs[i].bio;
...
@@ -1960,6 +2026,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			continue;
 		if (i == first)
 			continue;
+		d = r10_bio->devs[i].devnum;
+		rdev = conf->mirrors[d].rdev;
 		if (!r10_bio->devs[i].bio->bi_error) {
 			/* We know that the bi_io_vec layout is the same for
 			 * both 'first' and 'i', so we just compare them.
...
@@ -1982,6 +2050,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
 				/* Don't fix anything. */
 				continue;
+		} else if (test_bit(FailFast, &rdev->flags)) {
+			/* Just give up on this device */
+			md_error(rdev->mddev, rdev);
+			continue;
 		}
 		/* Ok, we need to write this bio, either to correct an
 		 * inconsistency or to correct an unreadable block.
...
@@ -1999,11 +2071,12 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		bio_copy_data(tbio, fbio);

-		d = r10_bio->devs[i].devnum;
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		atomic_inc(&r10_bio->remaining);
 		md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));

+		if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
+			tbio->bi_opf |= MD_FAILFAST;
 		tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset;
 		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		generic_make_request(tbio);
...
@@ -2109,9 +2182,7 @@ static void fix_recovery_read_error(struct r10bio *r10_bio)
 			ok = rdev_set_badblocks(rdev2, addr, s, 0);
 			if (!ok) {
 				/* just abort the recovery */
-				printk(KERN_NOTICE
-				       "md/raid10:%s: recovery aborted"
-				       " due to read error\n",
-				       mdname(mddev));
+				pr_notice("md/raid10:%s: recovery aborted due to read error\n",
+					  mdname(mddev));

 				conf->mirrors[dw].recovery_disabled
...
@@ -2259,13 +2330,10 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 		char b[BDEVNAME_SIZE];
 		bdevname(rdev->bdev, b);

-		printk(KERN_NOTICE
-		       "md/raid10:%s: %s: Raid device exceeded "
-		       "read_error threshold [cur %d:max %d]\n",
-		       mdname(mddev), b,
-		       atomic_read(&rdev->read_errors), max_read_errors);
-		printk(KERN_NOTICE
-		       "md/raid10:%s: %s: Failing raid device\n",
-		       mdname(mddev), b);
+		pr_notice("md/raid10:%s: %s: Raid device exceeded read_error threshold [cur %d:max %d]\n",
+			  mdname(mddev), b,
+			  atomic_read(&rdev->read_errors), max_read_errors);
+		pr_notice("md/raid10:%s: %s: Failing raid device\n",
+			  mdname(mddev), b);
 		md_error(mddev, rdev);
 		r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
...
@@ -2356,18 +2424,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 					     s, conf->tmppage, WRITE)
 			    == 0) {
 				/* Well, this device is dead */
-				printk(KERN_NOTICE
-				       "md/raid10:%s: read correction "
-				       "write failed"
-				       " (%d sectors at %llu on %s)\n",
-				       mdname(mddev), s,
-				       (unsigned long long)(
-					       sect +
-					       choose_data_offset(r10_bio,
-								  rdev)),
-				       bdevname(rdev->bdev, b));
-				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
-				       "drive\n",
-				       mdname(mddev),
-				       bdevname(rdev->bdev, b));
+				pr_notice("md/raid10:%s: read correction write failed (%d sectors at %llu on %s)\n",
+					  mdname(mddev), s,
+					  (unsigned long long)(
+						  sect +
+						  choose_data_offset(r10_bio, rdev)),
+					  bdevname(rdev->bdev, b));
+				pr_notice("md/raid10:%s: %s: failing drive\n",
+					  mdname(mddev),
+					  bdevname(rdev->bdev, b));
 			}
...
@@ -2397,24 +2461,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
 				 READ)) {
 		case 0:
 			/* Well, this device is dead */
-			printk(KERN_NOTICE
-			       "md/raid10:%s: unable to read back "
-			       "corrected sectors"
-			       " (%d sectors at %llu on %s)\n",
-			       mdname(mddev), s,
-			       (unsigned long long)(
-				       sect +
-				       choose_data_offset(r10_bio, rdev)),
-			       bdevname(rdev->bdev, b));
-			printk(KERN_NOTICE "md/raid10:%s: %s: failing "
-			       "drive\n",
-			       mdname(mddev),
-			       bdevname(rdev->bdev, b));
+			pr_notice("md/raid10:%s: unable to read back corrected sectors (%d sectors at %llu on %s)\n",
+				  mdname(mddev), s,
+				  (unsigned long long)(
+					  sect +
+					  choose_data_offset(r10_bio, rdev)),
+				  bdevname(rdev->bdev, b));
+			pr_notice("md/raid10:%s: %s: failing drive\n",
+				  mdname(mddev),
+				  bdevname(rdev->bdev, b));
 			break;
 		case 1:
-			printk(KERN_INFO
-			       "md/raid10:%s: read error corrected"
-			       " (%d sectors at %llu on %s)\n",
-			       mdname(mddev), s,
-			       (unsigned long long)(
-				       sect +
+			pr_info("md/raid10:%s: read error corrected (%d sectors at %llu on %s)\n",
+				mdname(mddev), s,
+				(unsigned long long)(
+					sect +
@@ -2503,6 +2561,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
...
@@ -2503,6 +2561,8 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
char
b
[
BDEVNAME_SIZE
];
char
b
[
BDEVNAME_SIZE
];
unsigned
long
do_sync
;
unsigned
long
do_sync
;
int
max_sectors
;
int
max_sectors
;
dev_t
bio_dev
;
sector_t
bio_last_sector
;
/* we got a read error. Maybe the drive is bad. Maybe just
/* we got a read error. Maybe the drive is bad. Maybe just
* the block and we can fix it.
* the block and we can fix it.
...
@@ -2514,23 +2574,26 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
...
@@ -2514,23 +2574,26 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
*/
*/
bio
=
r10_bio
->
devs
[
slot
].
bio
;
bio
=
r10_bio
->
devs
[
slot
].
bio
;
bdevname
(
bio
->
bi_bdev
,
b
);
bdevname
(
bio
->
bi_bdev
,
b
);
bio_dev
=
bio
->
bi_bdev
->
bd_dev
;
bio_last_sector
=
r10_bio
->
devs
[
slot
].
addr
+
rdev
->
data_offset
+
r10_bio
->
sectors
;
bio_put
(
bio
);
bio_put
(
bio
);
r10_bio
->
devs
[
slot
].
bio
=
NULL
;
r10_bio
->
devs
[
slot
].
bio
=
NULL
;
if
(
mddev
->
ro
==
0
)
{
if
(
mddev
->
ro
)
r10_bio
->
devs
[
slot
].
bio
=
IO_BLOCKED
;
else
if
(
!
test_bit
(
FailFast
,
&
rdev
->
flags
))
{
freeze_array
(
conf
,
1
);
freeze_array
(
conf
,
1
);
fix_read_error
(
conf
,
mddev
,
r10_bio
);
fix_read_error
(
conf
,
mddev
,
r10_bio
);
unfreeze_array
(
conf
);
unfreeze_array
(
conf
);
}
else
}
else
r10_bio
->
devs
[
slot
].
bio
=
IO_BLOCKED
;
md_error
(
mddev
,
rdev
)
;
rdev_dec_pending
(
rdev
,
mddev
);
rdev_dec_pending
(
rdev
,
mddev
);
read_more:
read_more:
rdev
=
read_balance
(
conf
,
r10_bio
,
&
max_sectors
);
rdev
=
read_balance
(
conf
,
r10_bio
,
&
max_sectors
);
if
(
rdev
==
NULL
)
{
if
(
rdev
==
NULL
)
{
printk
(
KERN_ALERT
"md/raid10:%s: %s: unrecoverable I/O"
pr_crit_ratelimited
(
"md/raid10:%s: %s: unrecoverable I/O read error for block %llu
\n
"
,
" read error for block %llu
\n
"
,
mdname
(
mddev
),
b
,
mdname
(
mddev
),
b
,
(
unsigned
long
long
)
r10_bio
->
sector
);
(
unsigned
long
long
)
r10_bio
->
sector
);
raid_end_bio_io
(
r10_bio
);
raid_end_bio_io
(
r10_bio
);
...
@@ -2539,10 +2602,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 	do_sync = (r10_bio->master_bio->bi_opf & REQ_SYNC);
 	slot = r10_bio->read_slot;
-	printk_ratelimited(
-		KERN_ERR
-		"md/raid10:%s: %s: redirecting "
-		"sector %llu to another mirror\n",
-		mdname(mddev),
-		bdevname(rdev->bdev, b),
-		(unsigned long long)r10_bio->sector);
+	pr_err_ratelimited("md/raid10:%s: %s: redirecting sector %llu to another mirror\n",
+			   mdname(mddev),
+			   bdevname(rdev->bdev, b),
+			   (unsigned long long)r10_bio->sector);
...
@@ -2555,8 +2615,15 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 		+ choose_data_offset(r10_bio, rdev);
 	bio->bi_bdev = rdev->bdev;
 	bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
+	if (test_bit(FailFast, &rdev->flags) &&
+	    test_bit(R10BIO_FailFast, &r10_bio->state))
+		bio->bi_opf |= MD_FAILFAST;
 	bio->bi_private = r10_bio;
 	bio->bi_end_io = raid10_end_read_request;
+	trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+			      bio, bio_dev,
+			      bio_last_sector - r10_bio->sectors);
 	if (max_sectors < r10_bio->sectors) {
 		/* Drat - have to split this up more */
 		struct bio *mbio = r10_bio->master_bio;
...
@@ -2694,10 +2761,10 @@ static void raid10d(struct md_thread *thread)
 	md_check_recovery(mddev);

 	if (!list_empty_careful(&conf->bio_end_io_list) &&
-	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 		LIST_HEAD(tmp);
 		spin_lock_irqsave(&conf->device_lock, flags);
-		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 			while (!list_empty(&conf->bio_end_io_list)) {
 				list_move(conf->bio_end_io_list.prev, &tmp);
 				conf->nr_queued--;
...
@@ -2755,7 +2822,7 @@ static void raid10d(struct md_thread *thread)
 		}

 		cond_resched();
-		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
+		if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
 			md_check_recovery(mddev);
 	}
 	blk_finish_plug(&plug);
...
@@ -3072,6 +3139,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_read;
 				bio_set_op_attrs(bio, REQ_OP_READ, 0);
+				if (test_bit(FailFast, &rdev->flags))
+					bio->bi_opf |= MD_FAILFAST;
 				from_addr = r10_bio->devs[j].addr;
 				bio->bi_iter.bi_sector = from_addr +
 					rdev->data_offset;
...
@@ -3160,8 +3229,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 				if (!any_working)  {
 					if (!test_and_set_bit(MD_RECOVERY_INTR,
 							      &mddev->recovery))
-						printk(KERN_INFO "md/raid10:%s: insufficient "
-						       "working devices for recovery.\n",
-						       mdname(mddev));
+						pr_warn("md/raid10:%s: insufficient working devices for recovery.\n",
+							mdname(mddev));
 					mirror->recovery_disabled
 						= mddev->recovery_disabled;
...
@@ -3178,6 +3246,23 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		rdev_dec_pending(mrdev, mddev);
 		if (mreplace)
 			rdev_dec_pending(mreplace, mddev);
+		if (r10_bio->devs[0].bio->bi_opf & MD_FAILFAST) {
+			/* Only want this if there is elsewhere to
+			 * read from. 'j' is currently the first
+			 * readable copy.
+			 */
+			int targets = 1;
+			for (; j < conf->copies; j++) {
+				int d = r10_bio->devs[j].devnum;
+				if (conf->mirrors[d].rdev &&
+				    test_bit(In_sync,
+					     &conf->mirrors[d].rdev->flags))
+					targets++;
+			}
+			if (targets == 1)
+				r10_bio->devs[0].bio->bi_opf
+					&= ~MD_FAILFAST;
+		}
 	}
 	if (biolist == NULL) {
 		while (r10_bio) {
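This block closes a hole in the recovery path: the read side of a recovery was marked MD_FAILFAST when the source device advertised it, but if 'j' turns out to be the only remaining in-sync copy there is nowhere else to retry from, so the flag is stripped again. The same "never failfast the last source" rule appears in the raid1 sync hunks above (the read_targets == 1 checks). A distilled model of the count-then-strip logic (plain C, simplified away from the r10_bio layout):

/* Keep failfast on a recovery read only if >1 in-sync source exists. */
static unsigned int maybe_strip_failfast(unsigned int bi_opf,
                                         const int *in_sync, int copies,
                                         unsigned int failfast_bit)
{
	int targets = 0, i;

	for (i = 0; i < copies; i++)
		if (in_sync[i])
			targets++;
	return targets == 1 ? bi_opf & ~failfast_bit : bi_opf;
}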
...
@@ -3256,6 +3341,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_read;
 			bio_set_op_attrs(bio, REQ_OP_READ, 0);
+			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
+				bio->bi_opf |= MD_FAILFAST;
 			bio->bi_iter.bi_sector = sector + rdev->data_offset;
 			bio->bi_bdev = rdev->bdev;
 			count++;
...
@@ -3279,6 +3366,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_write;
 			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+			if (test_bit(FailFast, &conf->mirrors[d].rdev->flags))
+				bio->bi_opf |= MD_FAILFAST;
 			bio->bi_iter.bi_sector = sector + rdev->data_offset;
 			bio->bi_bdev = rdev->bdev;
 			count++;
...
@@ -3489,14 +3578,13 @@ static struct r10conf *setup_conf(struct mddev *mddev)

 	copies = setup_geo(&geo, mddev, geo_new);

 	if (copies == -2) {
-		printk(KERN_ERR "md/raid10:%s: chunk size must be "
-		       "at least PAGE_SIZE(%ld) and be a power of 2.\n",
-		       mdname(mddev), PAGE_SIZE);
+		pr_warn("md/raid10:%s: chunk size must be at least PAGE_SIZE(%ld) and be a power of 2.\n",
+			mdname(mddev), PAGE_SIZE);
 		goto out;
 	}

 	if (copies < 2 || copies > mddev->raid_disks) {
-		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
-		       mdname(mddev), mddev->new_layout);
+		pr_warn("md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
+			mdname(mddev), mddev->new_layout);
 		goto out;
 	}
...
@@ -3557,9 +3645,6 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	return conf;

 out:
-	if (err == -ENOMEM)
-		printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
-		       mdname(mddev));
 	if (conf) {
 		mempool_destroy(conf->r10bio_pool);
 		kfree(conf->mirrors);
...
@@ -3656,7 +3741,7 @@ static int raid10_run(struct mddev *mddev)
 	}
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf, -1)) {
-		printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
-		       mdname(mddev));
+		pr_err("md/raid10:%s: not enough operational mirrors.\n",
+		       mdname(mddev));
 		goto out_free_conf;
 	}
...
@@ -3698,11 +3783,9 @@ static int raid10_run(struct mddev *mddev)
 	}

 	if (mddev->recovery_cp != MaxSector)
-		printk(KERN_NOTICE "md/raid10:%s: not clean"
-		       " -- starting background reconstruction\n",
-		       mdname(mddev));
-	printk(KERN_INFO
-		"md/raid10:%s: active with %d out of %d devices\n",
+		pr_notice("md/raid10:%s: not clean -- starting background reconstruction\n",
+			  mdname(mddev));
+	pr_info("md/raid10:%s: active with %d out of %d devices\n",
 		mdname(mddev), conf->geo.raid_disks - mddev->degraded,
 		conf->geo.raid_disks);
 	/*
...
@@ -3712,6 +3795,7 @@ static int raid10_run(struct mddev *mddev)
 	size = raid10_size(mddev, 0, 0);
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
+	set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);

 	if (mddev->queue) {
 		int stripe = conf->geo.raid_disks *
...
@@ -3739,7 +3823,7 @@ static int raid10_run(struct mddev *mddev)
 		if (max(before_length, after_length) > min_offset_diff) {
 			/* This cannot work */
-			printk("md/raid10: offset difference not enough to continue reshape\n");
+			pr_warn("md/raid10: offset difference not enough to continue reshape\n");
 			goto out_free_conf;
 		}
 		conf->offset_diff = min_offset_diff;
...
@@ -3846,7 +3930,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 	struct r10conf *conf;

 	if (mddev->degraded > 0) {
-		printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
+		pr_warn("md/raid10:%s: Error: degraded raid0!\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
...
@@ -3887,8 +3971,7 @@ static void *raid10_takeover(struct mddev *mddev)
 		/* for raid0 takeover only one zone is supported */
 		raid0_conf = mddev->private;
 		if (raid0_conf->nr_strip_zones > 1) {
-			printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
-			       " with more than one zone.\n",
-			       mdname(mddev));
+			pr_warn("md/raid10:%s: cannot takeover raid 0 with more than one zone.\n",
+				mdname(mddev));
 			return ERR_PTR(-EINVAL);
 		}
...
@@ -4078,7 +4161,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 		sector_t size = raid10_size(mddev, 0, 0);
 		if (size < mddev->array_sectors) {
 			spin_unlock_irq(&conf->device_lock);
-			printk(KERN_ERR "md/raid10:%s: array size must be reduce before number of disks\n",
+			pr_warn("md/raid10:%s: array size must be reduce before number of disks\n",
 			       mdname(mddev));
 			return -EINVAL;
 		}
...
@@ -4126,7 +4209,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 	spin_unlock_irq(&conf->device_lock);
 	mddev->raid_disks = conf->geo.raid_disks;
 	mddev->reshape_position = conf->reshape_progress;
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -4321,9 +4404,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
...
@@ -4321,9 +4404,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
else
else
mddev
->
curr_resync_completed
=
conf
->
reshape_progress
;
mddev
->
curr_resync_completed
=
conf
->
reshape_progress
;
conf
->
reshape_checkpoint
=
jiffies
;
conf
->
reshape_checkpoint
=
jiffies
;
set_bit
(
MD_
CHANGE_DEVS
,
&
mddev
->
flags
);
set_bit
(
MD_
SB_CHANGE_DEVS
,
&
mddev
->
sb_
flags
);
md_wakeup_thread
(
mddev
->
thread
);
md_wakeup_thread
(
mddev
->
thread
);
wait_event
(
mddev
->
sb_wait
,
mddev
->
flags
==
0
||
wait_event
(
mddev
->
sb_wait
,
mddev
->
sb_
flags
==
0
||
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
));
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
));
if
(
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
))
{
if
(
test_bit
(
MD_RECOVERY_INTR
,
&
mddev
->
recovery
))
{
allow_barrier
(
conf
);
allow_barrier
(
conf
);
...
drivers/md/raid10.h
Browse file @ 20737738
...
@@ -156,5 +156,7 @@ enum r10bio_state {
 				 * flag is set
 				 */
 	R10BIO_Previous,
+/* failfast devices did receive failfast requests. */
+	R10BIO_FailFast,
 };
 #endif
drivers/md/raid5-cache.c
Browse file @ 20737738
 /*
  * Copyright (C) 2015 Shaohua Li <shli@fb.com>
+ * Copyright (C) 2016 Song Liu <songliubraving@fb.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
...
@@ -18,8 +19,10 @@
 #include <linux/raid/md_p.h>
 #include <linux/crc32c.h>
 #include <linux/random.h>
+#include <linux/kthread.h>
 #include "md.h"
 #include "raid5.h"
+#include "bitmap.h"

 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
...
@@ -28,18 +31,70 @@
 #define BLOCK_SECTORS (8)

 /*
- * reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent
- * recovery scans a very long log
+ * log->max_free_space is min(1/4 disk size, 10G reclaimable space).
+ *
+ * In write through mode, the reclaim runs every log->max_free_space.
+ * This can prevent the recovery scans for too long
  */
 #define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
 #define RECLAIM_MAX_FREE_SPACE_SHIFT (2)

+/* wake up reclaim thread periodically */
+#define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
+/* start flush with these full stripes */
+#define R5C_FULL_STRIPE_FLUSH_BATCH 256
+/* reclaim stripes in groups */
+#define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
+
 /*
  * We only need 2 bios per I/O unit to make progress, but ensure we
  * have a few more available to not get too tight.
  */
 #define R5L_POOL_SIZE 4
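The reworked comment above says the reclaim trigger, log->max_free_space, is min(1/4 of the log device, 10GiB). As a quick standalone illustration of that arithmetic (plain userspace C, not kernel code; the 64GiB device size is a made-up example):

#include <stdint.h>
#include <stdio.h>

#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2)	/* 10GiB in 512B sectors */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)		/* i.e. device size / 4 */

int main(void)
{
	uint64_t device_size = 64ULL * 1024 * 1024 * 2;	/* example: 64GiB log device */
	uint64_t max_free_space = device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;

	if (max_free_space > RECLAIM_MAX_FREE_SPACE)
		max_free_space = RECLAIM_MAX_FREE_SPACE;
	/* reclaim is triggered once this many sectors are reclaimable */
	printf("max_free_space = %llu sectors\n",
	       (unsigned long long)max_free_space);
	return 0;
}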
+/*
+ * r5c journal modes of the array: write-back or write-through.
+ * write-through mode has identical behavior as existing log only
+ * implementation.
+ */
+enum r5c_journal_mode {
+	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
+	R5C_JOURNAL_MODE_WRITE_BACK = 1,
+};
+
+static char *r5c_journal_mode_str[] = {"write-through",
+				       "write-back"};
+/*
+ * raid5 cache state machine
+ *
+ * With the RAID cache, each stripe works in two phases:
+ *	- caching phase
+ *	- writing-out phase
+ *
+ * These two phases are controlled by bit STRIPE_R5C_CACHING:
+ *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
+ *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
+ *
+ * When there is no journal, or the journal is in write-through mode,
+ * the stripe is always in writing-out phase.
+ *
+ * For write-back journal, the stripe is sent to caching phase on write
+ * (r5c_try_caching_write). r5c_make_stripe_write_out() kicks off
+ * the write-out phase by clearing STRIPE_R5C_CACHING.
+ *
+ * Stripes in caching phase do not write the raid disks. Instead, all
+ * writes are committed from the log device. Therefore, a stripe in
+ * caching phase handles writes as:
+ *	- write to log device
+ *	- return IO
+ *
+ * Stripes in writing-out phase handle writes as:
+ *	- calculate parity
+ *	- write pending data and parity to journal
+ *	- write data and parity to raid disks
+ *	- return IO for pending writes
+ */
 struct r5l_log {
 	struct md_rdev *rdev;
...
@@ -58,7 +113,6 @@ struct r5l_log {
 	u64 seq;		/* log head sequence */

 	sector_t next_checkpoint;
-	u64 next_cp_seq;

 	struct mutex io_mutex;
 	struct r5l_io_unit *current_io;	/* current io_unit accepting new data */
...
@@ -96,6 +150,18 @@ struct r5l_log {
 	spinlock_t no_space_stripes_lock;

 	bool need_cache_flush;
+
+	/* for r5c_cache */
+	enum r5c_journal_mode r5c_journal_mode;
+
+	/* all stripes in r5cache, in the order of seq at sh->log_start */
+	struct list_head stripe_in_journal_list;
+
+	spinlock_t stripe_in_journal_lock;
+	atomic_t stripe_in_journal_count;
+
+	/* to submit async io_units, to fulfill ordering of flush */
+	struct work_struct deferred_io_work;
 };
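The r5c_journal_mode field added to struct r5l_log selects between the two phases described in the state-machine comment earlier in this file. A tiny standalone model of that decision (not kernel code; the helper name phase() is invented for illustration):

#include <stdbool.h>
#include <stdio.h>

enum r5c_journal_mode {
	R5C_JOURNAL_MODE_WRITE_THROUGH = 0,
	R5C_JOURNAL_MODE_WRITE_BACK = 1,
};

/* model: with no journal or in write-through, a stripe always writes out;
 * in write-back, a stripe with STRIPE_R5C_CACHING set stays in the caching
 * phase and only touches the log device. */
static const char *phase(bool has_journal, enum r5c_journal_mode mode,
			 bool stripe_r5c_caching)
{
	if (!has_journal || mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return "writing-out";
	return stripe_r5c_caching ? "caching" : "writing-out";
}

int main(void)
{
	printf("%s\n", phase(true, R5C_JOURNAL_MODE_WRITE_BACK, true));
	printf("%s\n", phase(true, R5C_JOURNAL_MODE_WRITE_THROUGH, true));
	return 0;
}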
/*
/*
...
@@ -122,6 +188,18 @@ struct r5l_io_unit {
...
@@ -122,6 +188,18 @@ struct r5l_io_unit {
int
state
;
int
state
;
bool
need_split_bio
;
bool
need_split_bio
;
struct
bio
*
split_bio
;
unsigned
int
has_flush
:
1
;
/* include flush request */
unsigned
int
has_fua
:
1
;
/* include fua request */
unsigned
int
has_null_flush
:
1
;
/* include empty flush request */
/*
* io isn't sent yet, flush/fua request can only be submitted till it's
* the first IO in running_ios list
*/
unsigned
int
io_deferred
:
1
;
struct
bio_list
flush_barriers
;
/* size == 0 flush bios */
};
};
/* r5l_io_unit state */
/* r5l_io_unit state */
...
@@ -133,6 +211,12 @@ enum r5l_io_unit_state {
 	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
 };

+bool r5c_is_writeback(struct r5l_log *log)
+{
+	return (log != NULL &&
+		log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK);
+}
+
 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
 {
 	start += inc;
...
@@ -168,12 +252,235 @@ static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
 	io->state = state;
 }
+static void r5c_return_dev_pending_writes(struct r5conf *conf,
+					  struct r5dev *dev,
+					  struct bio_list *return_bi)
+{
+	struct bio *wbi, *wbi2;
+
+	wbi = dev->written;
+	dev->written = NULL;
+	while (wbi && wbi->bi_iter.bi_sector <
+	       dev->sector + STRIPE_SECTORS) {
+		wbi2 = r5_next_bio(wbi, dev->sector);
+		if (!raid5_dec_bi_active_stripes(wbi)) {
+			md_write_end(conf->mddev);
+			bio_list_add(return_bi, wbi);
+		}
+		wbi = wbi2;
+	}
+}
+
+void r5c_handle_cached_data_endio(struct r5conf *conf,
+	  struct stripe_head *sh, int disks, struct bio_list *return_bi)
+{
+	int i;
+
+	for (i = sh->disks; i--; ) {
+		if (sh->dev[i].written) {
+			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+			r5c_return_dev_pending_writes(conf, &sh->dev[i],
+						      return_bi);
+			bitmap_endwrite(conf->mddev->bitmap, sh->sector,
+					STRIPE_SECTORS,
+					!test_bit(STRIPE_DEGRADED, &sh->state),
+					0);
+		}
+	}
+}
+
+/* Check whether we should flush some stripes to free up stripe cache */
+void r5c_check_stripe_cache_usage(struct r5conf *conf)
+{
+	int total_cached;
+
+	if (!r5c_is_writeback(conf->log))
+		return;
+
+	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
+		atomic_read(&conf->r5c_cached_full_stripes);
+
+	/*
+	 * The following condition is true for either of the following:
+	 *   - stripe cache pressure high:
+	 *          total_cached > 3/4 min_nr_stripes ||
+	 *          empty_inactive_list_nr > 0
+	 *   - stripe cache pressure moderate:
+	 *          total_cached > 1/2 min_nr_stripes
+	 */
+	if (total_cached > conf->min_nr_stripes * 1 / 2 ||
+	    atomic_read(&conf->empty_inactive_list_nr) > 0)
+		r5l_wake_reclaim(conf->log, 0);
+}
+
+/*
+ * flush cache when there are R5C_FULL_STRIPE_FLUSH_BATCH or more full
+ * stripes in the cache
+ */
+void r5c_check_cached_full_stripe(struct r5conf *conf)
+{
+	if (!r5c_is_writeback(conf->log))
+		return;
+
+	/*
+	 * wake up reclaim for R5C_FULL_STRIPE_FLUSH_BATCH cached stripes
+	 * or a full stripe (chunk size / 4k stripes).
+	 */
+	if (atomic_read(&conf->r5c_cached_full_stripes) >=
+	    min(R5C_FULL_STRIPE_FLUSH_BATCH,
+		conf->chunk_sectors >> STRIPE_SHIFT))
+		r5l_wake_reclaim(conf->log, 0);
+}
+
+/*
+ * Total log space (in sectors) needed to flush all data in cache
+ *
+ * Currently, writing-out phase automatically includes all pending writes
+ * to the same sector. So the reclaim of each stripe takes up to
+ * (conf->raid_disks + 1) pages of log space.
+ *
+ * To totally avoid deadlock due to log space, the code reserves
+ * (conf->raid_disks + 1) pages for each stripe in cache, which is not
+ * necessary in most cases.
+ *
+ * To improve this, we will need writing-out phase to be able to NOT include
+ * pending writes, which will reduce the requirement to
+ * (conf->max_degraded + 1) pages per stripe in cache.
+ */
+static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
+{
+	struct r5l_log *log = conf->log;
+
+	if (!r5c_is_writeback(log))
+		return 0;
+
+	return BLOCK_SECTORS * (conf->raid_disks + 1) *
+		atomic_read(&log->stripe_in_journal_count);
+}
+
+/*
+ * evaluate log space usage and update R5C_LOG_TIGHT and R5C_LOG_CRITICAL
+ *
+ * R5C_LOG_TIGHT is set when free space on the log device is less than 3x of
+ * reclaim_required_space. R5C_LOG_CRITICAL is set when free space on the log
+ * device is less than 2x of reclaim_required_space.
+ */
+static inline void r5c_update_log_state(struct r5l_log *log)
+{
+	struct r5conf *conf = log->rdev->mddev->private;
+	sector_t free_space;
+	sector_t reclaim_space;
+	bool wake_reclaim = false;
+
+	if (!r5c_is_writeback(log))
+		return;
+
+	free_space = r5l_ring_distance(log, log->log_start,
+				       log->last_checkpoint);
+	reclaim_space = r5c_log_required_to_flush_cache(conf);
+	if (free_space < 2 * reclaim_space)
+		set_bit(R5C_LOG_CRITICAL, &conf->cache_state);
+	else {
+		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
+			wake_reclaim = true;
+		clear_bit(R5C_LOG_CRITICAL, &conf->cache_state);
+	}
+	if (free_space < 3 * reclaim_space)
+		set_bit(R5C_LOG_TIGHT, &conf->cache_state);
+	else
+		clear_bit(R5C_LOG_TIGHT, &conf->cache_state);
+
+	if (wake_reclaim)
+		r5l_wake_reclaim(log, 0);
+}
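Putting the two functions above together: the reservation is (raid_disks + 1) log blocks per cached stripe, and the TIGHT/CRITICAL bits follow from comparing free log space against 3x and 2x of that reservation. A standalone numeric sketch (all values invented for illustration):

#include <stdint.h>
#include <stdio.h>

#define BLOCK_SECTORS 8	/* one 4k block = 8 sectors */

int main(void)
{
	uint64_t raid_disks = 6, stripes_in_journal = 100;
	uint64_t free_space = 12000;	/* sectors free on the log, example */

	/* worst case: (raid_disks + 1) blocks of log space per cached stripe */
	uint64_t reclaim_space = (uint64_t)BLOCK_SECTORS *
				 (raid_disks + 1) * stripes_in_journal;

	int log_critical = free_space < 2 * reclaim_space;
	int log_tight = free_space < 3 * reclaim_space;

	/* here: reclaim_space = 5600, so TIGHT is set but CRITICAL is not */
	printf("reclaim_space=%llu tight=%d critical=%d\n",
	       (unsigned long long)reclaim_space, log_tight, log_critical);
	return 0;
}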
+/*
+ * Put the stripe into writing-out phase by clearing STRIPE_R5C_CACHING.
+ * This function should only be called in write-back mode.
+ */
+void r5c_make_stripe_write_out(struct stripe_head *sh)
+{
+	struct r5conf *conf = sh->raid_conf;
+	struct r5l_log *log = conf->log;
+
+	BUG_ON(!r5c_is_writeback(log));
+
+	WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+	clear_bit(STRIPE_R5C_CACHING, &sh->state);
+
+	if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		atomic_inc(&conf->preread_active_stripes);
+
+	if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_partial_stripes);
+	}
+
+	if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) {
+		BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0);
+		atomic_dec(&conf->r5c_cached_full_stripes);
+	}
+}
+
+static void r5c_handle_data_cached(struct stripe_head *sh)
+{
+	int i;
+
+	for (i = sh->disks; i--; )
+		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
+			set_bit(R5_InJournal, &sh->dev[i].flags);
+			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+		}
+	clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
+}
+
+/*
+ * this journal write must contain full parity,
+ * it may also contain some data pages
+ */
+static void r5c_handle_parity_cached(struct stripe_head *sh)
+{
+	int i;
+
+	for (i = sh->disks; i--; )
+		if (test_bit(R5_InJournal, &sh->dev[i].flags))
+			set_bit(R5_Wantwrite, &sh->dev[i].flags);
+}
+
+/*
+ * Setting proper flags after writing (or flushing) data and/or parity to the
+ * log device. This is called from r5l_log_endio() or r5l_log_flush_endio().
+ */
+static void r5c_finish_cache_stripe(struct stripe_head *sh)
+{
+	struct r5l_log *log = sh->raid_conf->log;
+
+	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
+		BUG_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
+		/*
+		 * Set R5_InJournal for parity dev[pd_idx]. This means
+		 * all data AND parity in the journal. For RAID 6, it is
+		 * NOT necessary to set the flag for dev[qd_idx], as the
+		 * two parities are written out together.
+		 */
+		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
+	} else if (test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+		r5c_handle_data_cached(sh);
+	} else {
+		r5c_handle_parity_cached(sh);
+		set_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);
+	}
+}
 static void r5l_io_run_stripes(struct r5l_io_unit *io)
 {
 	struct stripe_head *sh, *next;

 	list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
 		list_del_init(&sh->log_list);
+
+		r5c_finish_cache_stripe(sh);
+
 		set_bit(STRIPE_HANDLE, &sh->state);
 		raid5_release_stripe(sh);
 	}
...
@@ -209,9 +516,11 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
 	}
 }

+static void __r5l_stripe_write_finished(struct r5l_io_unit *io);
 static void r5l_log_endio(struct bio *bio)
 {
 	struct r5l_io_unit *io = bio->bi_private;
+	struct r5l_io_unit *io_deferred;
 	struct r5l_log *log = io->log;
 	unsigned long flags;
...
@@ -227,18 +536,89 @@ static void r5l_log_endio(struct bio *bio)
 		r5l_move_to_end_ios(log);
 	else
 		r5l_log_run_stripes(log);
+	if (!list_empty(&log->running_ios)) {
+		/*
+		 * FLUSH/FUA io_unit is deferred because of ordering, now we
+		 * can dispatch it
+		 */
+		io_deferred = list_first_entry(&log->running_ios,
+					       struct r5l_io_unit, log_sibling);
+		if (io_deferred->io_deferred)
+			schedule_work(&log->deferred_io_work);
+	}
+
 	spin_unlock_irqrestore(&log->io_list_lock, flags);

 	if (log->need_cache_flush)
 		md_wakeup_thread(log->rdev->mddev->thread);
+
+	if (io->has_null_flush) {
+		struct bio *bi;
+
+		WARN_ON(bio_list_empty(&io->flush_barriers));
+		while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
+			bio_endio(bi);
+			atomic_dec(&io->pending_stripe);
+		}
+		if (atomic_read(&io->pending_stripe) == 0)
+			__r5l_stripe_write_finished(io);
+	}
+}
+
+static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+	if (io->has_flush)
+		io->current_bio->bi_opf |= REQ_PREFLUSH;
+	if (io->has_fua)
+		io->current_bio->bi_opf |= REQ_FUA;
+	submit_bio(io->current_bio);
+
+	if (!io->split_bio)
+		return;
+
+	if (io->has_flush)
+		io->split_bio->bi_opf |= REQ_PREFLUSH;
+	if (io->has_fua)
+		io->split_bio->bi_opf |= REQ_FUA;
+	submit_bio(io->split_bio);
+}
+
+/* deferred io_unit will be dispatched here */
+static void r5l_submit_io_async(struct work_struct *work)
+{
+	struct r5l_log *log = container_of(work, struct r5l_log,
+					   deferred_io_work);
+	struct r5l_io_unit *io = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&log->io_list_lock, flags);
+	if (!list_empty(&log->running_ios)) {
+		io = list_first_entry(&log->running_ios, struct r5l_io_unit,
+				      log_sibling);
+		if (!io->io_deferred)
+			io = NULL;
+		else
+			io->io_deferred = 0;
+	}
+	spin_unlock_irqrestore(&log->io_list_lock, flags);
+	if (io)
+		r5l_do_submit_io(log, io);
 }

 static void r5l_submit_current_io(struct r5l_log *log)
 {
 	struct r5l_io_unit *io = log->current_io;
+	struct bio *bio;
 	struct r5l_meta_block *block;
 	unsigned long flags;
 	u32 crc;
+	bool do_submit = true;

 	if (!io)
 		return;
...
@@ -247,13 +627,20 @@ static void r5l_submit_current_io(struct r5l_log *log)
 	block->meta_size = cpu_to_le32(io->meta_offset);
 	crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
 	block->checksum = cpu_to_le32(crc);
+	bio = io->current_bio;

 	log->current_io = NULL;
 	spin_lock_irqsave(&log->io_list_lock, flags);
-	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+	if (io->has_flush || io->has_fua) {
+		if (io != list_first_entry(&log->running_ios,
+					   struct r5l_io_unit, log_sibling)) {
+			io->io_deferred = 1;
+			do_submit = false;
+		}
+	}
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
-	submit_bio(io->current_bio);
+	if (do_submit)
+		r5l_do_submit_io(log, io);
 }

 static struct bio *r5l_bio_alloc(struct r5l_log *log)
...
@@ -271,6 +658,7 @@ static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
 {
 	log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
+	r5c_update_log_state(log);
 	/*
 	 * If we filled up the log device start from the beginning again,
 	 * which will require a new bio.
...
@@ -297,6 +685,7 @@ static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
 	io->log = log;
 	INIT_LIST_HEAD(&io->log_sibling);
 	INIT_LIST_HEAD(&io->stripe_list);
+	bio_list_init(&io->flush_barriers);
 	io->state = IO_UNIT_RUNNING;

 	io->meta_page = mempool_alloc(log->meta_pool, GFP_NOIO);
...
@@ -367,12 +756,11 @@ static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
 	struct r5l_io_unit *io = log->current_io;

 	if (io->need_split_bio) {
-		struct bio *prev = io->current_bio;
-
+		BUG_ON(io->split_bio);
+		io->split_bio = io->current_bio;
 		io->current_bio = r5l_bio_alloc(log);
-		bio_chain(io->current_bio, prev);
-
-		submit_bio(prev);
+		bio_chain(io->current_bio, io->split_bio);
+		io->need_split_bio = false;
 	}

 	if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
...
@@ -401,50 +789,85 @@ static int r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
 	io = log->current_io;

+	if (test_and_clear_bit(STRIPE_R5C_PREFLUSH, &sh->state))
+		io->has_flush = 1;
+
 	for (i = 0; i < sh->disks; i++) {
-		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
+		    test_bit(R5_InJournal, &sh->dev[i].flags))
 			continue;
 		if (i == sh->pd_idx || i == sh->qd_idx)
 			continue;
+		if (test_bit(R5_WantFUA, &sh->dev[i].flags) &&
+		    log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) {
+			io->has_fua = 1;
+			/*
+			 * we need to flush journal to make sure recovery can
+			 * reach the data with fua flag
+			 */
+			io->has_flush = 1;
+		}
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
 					raid5_compute_blocknr(sh, i, 0),
 					sh->dev[i].log_checksum, 0, false);
 		r5l_append_payload_page(log, sh->dev[i].page);
 	}

-	if (sh->qd_idx >= 0) {
+	if (parity_pages == 2) {
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
 					sh->dev[sh->qd_idx].log_checksum, true);
 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
 		r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
-	} else {
+	} else if (parity_pages == 1) {
 		r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
 					sh->sector, sh->dev[sh->pd_idx].log_checksum,
 					0, false);
 		r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
-	}
+	} else  /* Just writing data, not parity, in caching phase */
+		BUG_ON(parity_pages != 0);

 	list_add_tail(&sh->log_list, &io->stripe_list);
 	atomic_inc(&io->pending_stripe);
 	sh->log_io = io;

+	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+		return 0;
+
+	if (sh->log_start == MaxSector) {
+		BUG_ON(!list_empty(&sh->r5c));
+		sh->log_start = io->log_start;
+		spin_lock_irq(&log->stripe_in_journal_lock);
+		list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
+		spin_unlock_irq(&log->stripe_in_journal_lock);
+		atomic_inc(&log->stripe_in_journal_count);
+	}
+
 	return 0;
 }

+static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+
+/* add stripe to no_space_stripes, and then wake up reclaim */
+static inline void r5l_add_no_space_stripe(struct r5l_log *log,
+					   struct stripe_head *sh)
+{
+	spin_lock(&log->no_space_stripes_lock);
+	list_add_tail(&sh->log_list, &log->no_space_stripes);
+	spin_unlock(&log->no_space_stripes_lock);
+}
+
 /*
  * running in raid5d, where reclaim could wait for raid5d too (when it flushes
  * data from log to raid disks), so we shouldn't wait for reclaim here
  */
 int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 {
+	struct r5conf *conf = sh->raid_conf;
 	int write_disks = 0;
 	int data_pages, parity_pages;
-	int meta_size;
 	int reserve;
 	int i;
 	int ret = 0;
+	bool wake_reclaim = false;

 	if (!log)
 		return -EAGAIN;
...
@@ -456,11 +879,15 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 			return -EAGAIN;
 	}

+	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
+
 	for (i = 0; i < sh->disks; i++) {
 		void *addr;

-		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
+		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags) ||
+		    test_bit(R5_InJournal, &sh->dev[i].flags))
 			continue;
+
 		write_disks++;
 		/* checksum is already calculated in last run */
 		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
...
@@ -473,15 +900,6 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	parity_pages = 1 + !!(sh->qd_idx >= 0);
 	data_pages = write_disks - parity_pages;

-	meta_size = ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
-		     * data_pages) +
-		sizeof(struct r5l_payload_data_parity) +
-		sizeof(__le32) * parity_pages;
-
-	/* Doesn't work with very big raid array */
-	if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
-		return -EINVAL;
-
 	set_bit(STRIPE_LOG_TRAPPED, &sh->state);
 	/*
 	 * The stripe must enter state machine again to finish the write, so
...
@@ -493,22 +911,49 @@ int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
 	mutex_lock(&log->io_mutex);
 	/* meta + data */
 	reserve = (1 + write_disks) << (PAGE_SHIFT - 9);

-	if (!r5l_has_free_space(log, reserve)) {
-		spin_lock(&log->no_space_stripes_lock);
-		list_add_tail(&sh->log_list, &log->no_space_stripes);
-		spin_unlock(&log->no_space_stripes_lock);
-
-		r5l_wake_reclaim(log, reserve);
-	} else {
-		ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
-		if (ret) {
-			spin_lock_irq(&log->io_list_lock);
-			list_add_tail(&sh->log_list, &log->no_mem_stripes);
-			spin_unlock_irq(&log->io_list_lock);
+	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
+		if (!r5l_has_free_space(log, reserve)) {
+			r5l_add_no_space_stripe(log, sh);
+			wake_reclaim = true;
+		} else {
+			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
+			if (ret) {
+				spin_lock_irq(&log->io_list_lock);
+				list_add_tail(&sh->log_list, &log->no_mem_stripes);
+				spin_unlock_irq(&log->io_list_lock);
+			}
+		}
+	} else {  /* R5C_JOURNAL_MODE_WRITE_BACK */
+		/*
+		 * log space critical, do not process stripes that are
+		 * not in cache yet (sh->log_start == MaxSector).
+		 */
+		if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
+		    sh->log_start == MaxSector) {
+			r5l_add_no_space_stripe(log, sh);
+			wake_reclaim = true;
+			reserve = 0;
+		} else if (!r5l_has_free_space(log, reserve)) {
+			if (sh->log_start == log->last_checkpoint)
+				BUG();
+			else
+				r5l_add_no_space_stripe(log, sh);
+		} else {
+			ret = r5l_log_stripe(log, sh, data_pages, parity_pages);
+			if (ret) {
+				spin_lock_irq(&log->io_list_lock);
+				list_add_tail(&sh->log_list, &log->no_mem_stripes);
+				spin_unlock_irq(&log->io_list_lock);
+			}
 		}
 	}
 	mutex_unlock(&log->io_mutex);
+	if (wake_reclaim)
+		r5l_wake_reclaim(log, reserve);
 	return 0;
 }
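The reserve computed above is (1 + write_disks) << (PAGE_SHIFT - 9): one page for the meta block plus one page per written disk, converted to 512-byte sectors. A minimal standalone check of that arithmetic, assuming 4k pages:

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4k pages */

int main(void)
{
	int write_disks = 5;	/* example: 4 data pages + 1 parity page */
	int reserve = (1 + write_disks) << (PAGE_SHIFT - 9);

	/* 6 pages * 8 sectors/page = 48 sectors of log space */
	printf("reserve = %d sectors\n", reserve);
	return 0;
}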
...
@@ -525,17 +970,34 @@ int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
 {
 	if (!log)
 		return -ENODEV;
-	/*
-	 * we flush log disk cache first, then write stripe data to raid disks.
-	 * So if bio is finished, the log disk cache is flushed already. The
-	 * recovery guarantees we can recovery the bio from log disk, so we
-	 * don't need to flush again
-	 */
-	if (bio->bi_iter.bi_size == 0) {
-		bio_endio(bio);
-		return 0;
+
+	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
+		/*
+		 * in write through (journal only)
+		 * we flush log disk cache first, then write stripe data to
+		 * raid disks. So if bio is finished, the log disk cache is
+		 * flushed already. The recovery guarantees we can recovery
+		 * the bio from log disk, so we don't need to flush again
+		 */
+		if (bio->bi_iter.bi_size == 0) {
+			bio_endio(bio);
+			return 0;
+		}
+		bio->bi_opf &= ~REQ_PREFLUSH;
+	} else {
+		/* write back (with cache) */
+		if (bio->bi_iter.bi_size == 0) {
+			mutex_lock(&log->io_mutex);
+			r5l_get_meta(log, 0);
+			bio_list_add(&log->current_io->flush_barriers, bio);
+			log->current_io->has_flush = 1;
+			log->current_io->has_null_flush = 1;
+			atomic_inc(&log->current_io->pending_stripe);
+			r5l_submit_current_io(log);
+			mutex_unlock(&log->io_mutex);
+			return 0;
+		}
 	}
-	bio->bi_opf &= ~REQ_PREFLUSH;
 	return -EAGAIN;
 }
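A toy restatement of the two branches above for a zero-size flush bio (not kernel code): write-through can complete it immediately because the log cache was already flushed ahead of the data, while write-back must turn it into a null-flush io_unit of its own:

#include <stdbool.h>
#include <stdio.h>

/* model: what the log layer does with a zero-size REQ_PREFLUSH bio */
static const char *handle_empty_flush(bool write_back)
{
	if (!write_back)
		return "complete immediately (log cache already flushed)";
	return "queue as null-flush io_unit and submit with PREFLUSH";
}

int main(void)
{
	printf("write-through: %s\n", handle_empty_flush(false));
	printf("write-back:    %s\n", handle_empty_flush(true));
	return 0;
}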
...
@@ -555,10 +1017,40 @@ static void r5l_run_no_space_stripes(struct r5l_log *log)
 	spin_unlock(&log->no_space_stripes_lock);
 }

+/*
+ * calculate new last_checkpoint
+ * for write through mode, returns log->next_checkpoint
+ * for write back, returns log_start of first sh in stripe_in_journal_list
+ */
+static sector_t r5c_calculate_new_cp(struct r5conf *conf)
+{
+	struct stripe_head *sh;
+	struct r5l_log *log = conf->log;
+	sector_t new_cp;
+	unsigned long flags;
+
+	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
+		return log->next_checkpoint;
+
+	spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
+	if (list_empty(&conf->log->stripe_in_journal_list)) {
+		/* all stripes flushed */
+		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
+		return log->next_checkpoint;
+	}
+	sh = list_first_entry(&conf->log->stripe_in_journal_list,
+			      struct stripe_head, r5c);
+	new_cp = sh->log_start;
+	spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
+	return new_cp;
+}
+
 static sector_t r5l_reclaimable_space(struct r5l_log *log)
 {
+	struct r5conf *conf = log->rdev->mddev->private;
+
 	return r5l_ring_distance(log, log->last_checkpoint,
-				 log->next_checkpoint);
+				 r5c_calculate_new_cp(conf));
 }
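r5l_reclaimable_space() measures, via r5l_ring_distance() (defined earlier in this file, outside this excerpt), how far the new checkpoint is ahead of the last one on the circular log. A standalone sketch of that wrap-around arithmetic (a model, not the kernel function):

#include <stdint.h>
#include <stdio.h>

/* distance from 'start' forward to 'end' on a ring of 'dev_size' sectors;
 * mirrors the ring arithmetic used by the log (a sketch, not kernel code) */
static uint64_t ring_distance(uint64_t dev_size, uint64_t start, uint64_t end)
{
	return end >= start ? end - start : end + dev_size - start;
}

int main(void)
{
	uint64_t dev_size = 1000;

	printf("%llu\n", (unsigned long long)ring_distance(dev_size, 900, 100));
	/* prints 200: the checkpoint wrapped around the end of the device */
	return 0;
}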
 static void r5l_run_no_mem_stripe(struct r5l_log *log)
...
@@ -589,7 +1081,6 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 			break;

 		log->next_checkpoint = io->log_start;
-		log->next_cp_seq = io->seq;

 		list_del(&io->log_sibling);
 		mempool_free(io, log->io_pool);
...
@@ -604,6 +1095,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
 static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
 {
 	struct r5l_log *log = io->log;
+	struct r5conf *conf = log->rdev->mddev->private;
 	unsigned long flags;

 	spin_lock_irqsave(&log->io_list_lock, flags);
...
@@ -614,7 +1106,8 @@ static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
 		return;
 	}

-	if (r5l_reclaimable_space(log) > log->max_free_space)
+	if (r5l_reclaimable_space(log) > log->max_free_space ||
+	    test_bit(R5C_LOG_TIGHT, &conf->cache_state))
 		r5l_wake_reclaim(log, 0);

 	spin_unlock_irqrestore(&log->io_list_lock, flags);
...
@@ -713,8 +1206,8 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	 * there is a deadlock. We workaround this issue with a trylock.
 	 * FIXME: we could miss discard if we can't take reconfig mutex
 	 */
-	set_mask_bits(&mddev->flags, 0,
-		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
+	set_mask_bits(&mddev->sb_flags, 0,
+		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
 	if (!mddev_trylock(mddev))
 		return;
 	md_update_sb(mddev, 1);
...
@@ -735,41 +1228,174 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	}
 }

+/*
+ * r5c_flush_stripe moves stripe from cached list to handle_list. When called,
+ * the stripe must be on r5c_cached_full_stripes or r5c_cached_partial_stripes.
+ *
+ * must hold conf->device_lock
+ */
+static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
+{
+	BUG_ON(list_empty(&sh->lru));
+	BUG_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
+
+	/*
+	 * The stripe is not ON_RELEASE_LIST, so it is safe to call
+	 * raid5_release_stripe() while holding conf->device_lock
+	 */
+	BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
+	assert_spin_locked(&conf->device_lock);
+
+	list_del_init(&sh->lru);
+	atomic_inc(&sh->count);
+
+	set_bit(STRIPE_HANDLE, &sh->state);
+	atomic_inc(&conf->active_stripes);
+	r5c_make_stripe_write_out(sh);
+
+	raid5_release_stripe(sh);
+}
+
+/*
+ * if num == 0, flush all full stripes
+ * if num > 0, flush all full stripes. If less than num full stripes are
+ *             flushed, flush some partial stripes until totally num stripes are
+ *             flushed or there is no more cached stripes.
+ */
+void r5c_flush_cache(struct r5conf *conf, int num)
+{
+	int count;
+	struct stripe_head *sh, *next;
+
+	assert_spin_locked(&conf->device_lock);
+	if (!conf->log)
+		return;
+
+	count = 0;
+	list_for_each_entry_safe(sh, next, &conf->r5c_full_stripe_list, lru) {
+		r5c_flush_stripe(conf, sh);
+		count++;
+	}
+
+	if (count >= num)
+		return;
+	list_for_each_entry_safe(sh, next,
+				 &conf->r5c_partial_stripe_list, lru) {
+		r5c_flush_stripe(conf, sh);
+		if (++count >= num)
+			break;
+	}
+}
+
+static void r5c_do_reclaim(struct r5conf *conf)
+{
+	struct r5l_log *log = conf->log;
+	struct stripe_head *sh;
+	int count = 0;
+	unsigned long flags;
+	int total_cached;
+	int stripes_to_flush;
+
+	if (!r5c_is_writeback(log))
+		return;
+
+	total_cached = atomic_read(&conf->r5c_cached_partial_stripes) +
+		atomic_read(&conf->r5c_cached_full_stripes);
+
+	if (total_cached > conf->min_nr_stripes * 3 / 4 ||
+	    atomic_read(&conf->empty_inactive_list_nr) > 0)
+		/*
+		 * if stripe cache pressure high, flush all full stripes and
+		 * some partial stripes
+		 */
+		stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP;
+	else if (total_cached > conf->min_nr_stripes * 1 / 2 ||
+		 atomic_read(&conf->r5c_cached_full_stripes) >
+		 R5C_FULL_STRIPE_FLUSH_BATCH)
+		/*
+		 * if stripe cache pressure moderate, or if there is many full
+		 * stripes,flush all full stripes
+		 */
+		stripes_to_flush = 0;
+	else
+		/* no need to flush */
+		stripes_to_flush = -1;
+
+	if (stripes_to_flush >= 0) {
+		spin_lock_irqsave(&conf->device_lock, flags);
+		r5c_flush_cache(conf, stripes_to_flush);
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+	}
+
+	/* if log space is tight, flush stripes on stripe_in_journal_list */
+	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state)) {
+		spin_lock_irqsave(&log->stripe_in_journal_lock, flags);
+		spin_lock(&conf->device_lock);
+		list_for_each_entry(sh, &log->stripe_in_journal_list, r5c) {
+			/*
+			 * stripes on stripe_in_journal_list could be in any
+			 * state of the stripe_cache state machine. In this
+			 * case, we only want to flush stripe on
+			 * r5c_cached_full/partial_stripes. The following
+			 * condition makes sure the stripe is on one of the
+			 * two lists.
+			 */
+			if (!list_empty(&sh->lru) &&
+			    !test_bit(STRIPE_HANDLE, &sh->state) &&
+			    atomic_read(&sh->count) == 0) {
+				r5c_flush_stripe(conf, sh);
+			}
+			if (count++ >= R5C_RECLAIM_STRIPE_GROUP)
+				break;
+		}
+		spin_unlock(&conf->device_lock);
+		spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags);
+	}
+
+	if (!test_bit(R5C_LOG_CRITICAL, &conf->cache_state))
+		r5l_run_no_space_stripes(log);
+
+	md_wakeup_thread(conf->mddev->thread);
+}
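The cache-pressure policy in r5c_do_reclaim() above reduces to three thresholds. A compact standalone restatement (R5C_RECLAIM_STRIPE_GROUP is given a stand-in value here, since NR_STRIPE_HASH_LOCKS is defined elsewhere in the tree):

#include <stdio.h>

#define R5C_RECLAIM_STRIPE_GROUP 16	/* stand-in example value */
#define R5C_FULL_STRIPE_FLUSH_BATCH 256

static int stripes_to_flush(int total_cached, int min_nr_stripes,
			    int empty_inactive, int full_stripes)
{
	if (total_cached > min_nr_stripes * 3 / 4 || empty_inactive > 0)
		return R5C_RECLAIM_STRIPE_GROUP;	/* pressure high */
	if (total_cached > min_nr_stripes / 2 ||
	    full_stripes > R5C_FULL_STRIPE_FLUSH_BATCH)
		return 0;				/* flush full stripes only */
	return -1;					/* no need to flush */
}

int main(void)
{
	printf("%d\n", stripes_to_flush(200, 256, 0, 10));	/* high: 200 > 192 */
	printf("%d\n", stripes_to_flush(150, 256, 0, 10));	/* moderate: 150 > 128 */
	printf("%d\n", stripes_to_flush(100, 256, 0, 10));	/* idle */
	return 0;
}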
 static void r5l_do_reclaim(struct r5l_log *log)
 {
+	struct r5conf *conf = log->rdev->mddev->private;
 	sector_t reclaim_target = xchg(&log->reclaim_target, 0);
 	sector_t reclaimable;
 	sector_t next_checkpoint;
-	u64 next_cp_seq;
+	bool write_super;

 	spin_lock_irq(&log->io_list_lock);
+	write_super = r5l_reclaimable_space(log) > log->max_free_space ||
+		reclaim_target != 0 || !list_empty(&log->no_space_stripes);
 	/*
 	 * move proper io_unit to reclaim list. We should not change the order.
 	 * reclaimable/unreclaimable io_unit can be mixed in the list, we
 	 * shouldn't reuse space of an unreclaimable io_unit
 	 */
 	while (1) {
 		reclaimable = r5l_reclaimable_space(log);
 		if (reclaimable >= reclaim_target ||
 		    (list_empty(&log->running_ios) &&
 		     list_empty(&log->io_end_ios) &&
 		     list_empty(&log->flushing_ios) &&
 		     list_empty(&log->finished_ios)))
 			break;

 		md_wakeup_thread(log->rdev->mddev->thread);
 		wait_event_lock_irq(log->iounit_wait,
 				    r5l_reclaimable_space(log) > reclaimable,
 				    log->io_list_lock);
 	}

-	next_checkpoint = log->next_checkpoint;
-	next_cp_seq = log->next_cp_seq;
+	next_checkpoint = r5c_calculate_new_cp(conf);
 	spin_unlock_irq(&log->io_list_lock);

 	BUG_ON(reclaimable < 0);
-	if (reclaimable == 0)
+
+	if (reclaimable == 0 || !write_super)
 		return;

 	/*
...
@@ -781,7 +1407,7 @@ static void r5l_do_reclaim(struct r5l_log *log)
 	mutex_lock(&log->io_mutex);
 	log->last_checkpoint = next_checkpoint;
-	log->last_cp_seq = next_cp_seq;
+	r5c_update_log_state(log);
 	mutex_unlock(&log->io_mutex);

 	r5l_run_no_space_stripes(log);
...
@@ -795,14 +1421,17 @@ static void r5l_reclaim_thread(struct md_thread *thread)
 	if (!log)
 		return;
+	r5c_do_reclaim(conf);
 	r5l_do_reclaim(log);
 }

-static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
+void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
 {
 	unsigned long target;
 	unsigned long new = (unsigned long)space; /* overflow in theory */

+	if (!log)
+		return;
+
 	do {
 		target = log->reclaim_target;
 		if (new < target)
...
@@ -816,22 +1445,14 @@ void r5l_quiesce(struct r5l_log *log, int state)
 	struct mddev *mddev;
 	if (!log || state == 2)
 		return;
-	if (state == 0) {
-		/*
-		 * This is a special case for hotadd. In suspend, the array has
-		 * no journal. In resume, journal is initialized as well as the
-		 * reclaim thread.
-		 */
-		if (log->reclaim_thread)
-			return;
-		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
-					log->rdev->mddev, "reclaim");
-	} else if (state == 1) {
+	if (state == 0)
+		kthread_unpark(log->reclaim_thread->tsk);
+	else if (state == 1) {
 		/* make sure r5l_write_super_and_discard_space exits */
 		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);
-		r5l_wake_reclaim(log, -1L);
-		md_unregister_thread(&log->reclaim_thread);
+		kthread_park(log->reclaim_thread->tsk);
+		r5l_wake_reclaim(log, MaxSector);
 		r5l_do_reclaim(log);
 	}
 }
...
@@ -852,265 +1473,1017 @@ bool r5l_log_disk_error(struct r5conf *conf)
 	return ret;
 }

 struct r5l_recovery_ctx {
 	struct page *meta_page;		/* current meta */
 	sector_t meta_total_blocks;	/* total size of current meta and data */
 	sector_t pos;			/* recovery position */
 	u64 seq;			/* recovery position seq */
+	int data_parity_stripes;	/* number of data_parity stripes */
+	int data_only_stripes;		/* number of data_only stripes */
+	struct list_head cached_list;
 };
+static int r5l_recovery_read_meta_block(struct r5l_log *log,
+					struct r5l_recovery_ctx *ctx)
+{
+	struct page *page = ctx->meta_page;
+	struct r5l_meta_block *mb;
+	u32 crc, stored_crc;
+
+	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ, 0,
+			  false))
+		return -EIO;
+
+	mb = page_address(page);
+	stored_crc = le32_to_cpu(mb->checksum);
+	mb->checksum = 0;
+
+	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
+	    le64_to_cpu(mb->seq) != ctx->seq ||
+	    mb->version != R5LOG_VERSION ||
+	    le64_to_cpu(mb->position) != ctx->pos)
+		return -EINVAL;
+
+	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
+	if (stored_crc != crc)
+		return -EINVAL;
+
+	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
+		return -EINVAL;
+
+	ctx->meta_total_blocks = BLOCK_SECTORS;
+
+	return 0;
+}
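The validation order above, fixed fields first and then a CRC over the whole block with the checksum field zeroed, can be modeled in a few lines of userspace C. The crc32c() below is a deliberately fake stand-in for the kernel's crc32c_le(), and the struct layout is simplified:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define R5LOG_VERSION 0x1
#define R5LOG_MAGIC 0x6433c509

struct meta_block {		/* simplified model of r5l_meta_block */
	uint32_t magic;
	uint32_t checksum;
	uint8_t version;
	uint64_t seq;
	uint64_t position;
};

static uint32_t crc32c(uint32_t seed, const void *buf, size_t len)
{
	/* stub: stands in for crc32c_le(); not a real CRC */
	uint32_t crc = seed;
	const uint8_t *p = buf;

	while (len--)
		crc = (crc << 1) ^ *p++;
	return crc;
}

/* returns 0 if the block looks like the one we expected at (pos, seq) */
static int check_meta(struct meta_block *mb, uint64_t pos, uint64_t seq,
		      uint32_t uuid_seed)
{
	uint32_t stored = mb->checksum;

	mb->checksum = 0;	/* the CRC is computed with this field zeroed */
	if (mb->magic != R5LOG_MAGIC || mb->seq != seq ||
	    mb->version != R5LOG_VERSION || mb->position != pos)
		return -1;
	return crc32c(uuid_seed, mb, sizeof(*mb)) == stored ? 0 : -1;
}

int main(void)
{
	struct meta_block mb;

	memset(&mb, 0, sizeof(mb));
	mb.magic = R5LOG_MAGIC;
	mb.version = R5LOG_VERSION;
	mb.seq = 7;
	mb.position = 4096;
	mb.checksum = crc32c(0x1234, &mb, sizeof(mb));
	printf("%d\n", check_meta(&mb, 4096, 7, 0x1234));	/* prints 0 */
	return 0;
}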
+static void
+r5l_recovery_create_empty_meta_block(struct r5l_log *log,
+				     struct page *page,
+				     sector_t pos, u64 seq)
+{
+	struct r5l_meta_block *mb;
+
+	mb = page_address(page);
+	clear_page(mb);
+	mb->magic = cpu_to_le32(R5LOG_MAGIC);
+	mb->version = R5LOG_VERSION;
+	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
+	mb->seq = cpu_to_le64(seq);
+	mb->position = cpu_to_le64(pos);
+}
+
+static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
+					  u64 seq)
+{
+	struct page *page;
+	struct r5l_meta_block *mb;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	r5l_recovery_create_empty_meta_block(log, page, pos, seq);
+	mb = page_address(page);
+	mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
+					     mb, PAGE_SIZE));
+	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
+			  REQ_FUA, false)) {
+		__free_page(page);
+		return -EIO;
+	}
+	__free_page(page);
+	return 0;
+}
+/*
+ * r5l_recovery_load_data and r5l_recovery_load_parity uses flag R5_Wantwrite
+ * to mark valid (potentially not flushed) data in the journal.
+ *
+ * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
+ * so there should not be any mismatch here.
+ */
+static void r5l_recovery_load_data(struct r5l_log *log,
+				   struct stripe_head *sh,
+				   struct r5l_recovery_ctx *ctx,
+				   struct r5l_payload_data_parity *payload,
+				   sector_t log_offset)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	int dd_idx;
+
+	raid5_compute_sector(conf,
+			     le64_to_cpu(payload->location), 0,
+			     &dd_idx, sh);
+	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+		     sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
+	sh->dev[dd_idx].log_checksum =
+		le32_to_cpu(payload->checksum[0]);
+	ctx->meta_total_blocks += BLOCK_SECTORS;
+
+	set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
+	set_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
+static void r5l_recovery_load_parity(struct r5l_log *log,
+				     struct stripe_head *sh,
+				     struct r5l_recovery_ctx *ctx,
+				     struct r5l_payload_data_parity *payload,
+				     sector_t log_offset)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+
+	ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+		     sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
+	sh->dev[sh->pd_idx].log_checksum =
+		le32_to_cpu(payload->checksum[0]);
+	set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
+
+	if (sh->qd_idx >= 0) {
+		sync_page_io(log->rdev,
+			     r5l_ring_add(log, log_offset, BLOCK_SECTORS),
+			     PAGE_SIZE, sh->dev[sh->qd_idx].page,
+			     REQ_OP_READ, 0, false);
+		sh->dev[sh->qd_idx].log_checksum =
+			le32_to_cpu(payload->checksum[1]);
+		set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
+	}
+	clear_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+static void r5l_recovery_reset_stripe(struct stripe_head *sh)
+{
+	int i;
+
+	sh->state = 0;
+	sh->log_start = MaxSector;
+	for (i = sh->disks; i--; )
+		sh->dev[i].flags = 0;
+}
+
+static void
+r5l_recovery_replay_one_stripe(struct r5conf *conf,
+			       struct stripe_head *sh,
+			       struct r5l_recovery_ctx *ctx)
+{
+	struct md_rdev *rdev, *rrdev;
+	int disk_index;
+	int data_count = 0;
+
+	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
+			continue;
+		if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
+			continue;
+		data_count++;
+	}
+
+	/*
+	 * stripes that only have parity must have been flushed
+	 * before the crash that we are now recovering from, so
+	 * there is nothing more to recovery.
+	 */
+	if (data_count == 0)
+		goto out;
+
+	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
+			continue;
+
+		/* in case device is broken */
+		rcu_read_lock();
+		rdev = rcu_dereference(conf->disks[disk_index].rdev);
+		if (rdev) {
+			atomic_inc(&rdev->nr_pending);
+			rcu_read_unlock();
+			sync_page_io(rdev, sh->sector, PAGE_SIZE,
+				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
+				     false);
+			rdev_dec_pending(rdev, rdev->mddev);
+			rcu_read_lock();
+		}
+		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
+		if (rrdev) {
+			atomic_inc(&rrdev->nr_pending);
+			rcu_read_unlock();
+			sync_page_io(rrdev, sh->sector, PAGE_SIZE,
+				     sh->dev[disk_index].page, REQ_OP_WRITE, 0,
+				     false);
+			rdev_dec_pending(rrdev, rrdev->mddev);
+			rcu_read_lock();
+		}
+		rcu_read_unlock();
+	}
+	ctx->data_parity_stripes++;
+out:
+	r5l_recovery_reset_stripe(sh);
+}
+static struct stripe_head *
+r5c_recovery_alloc_stripe(struct r5conf *conf,
+			  sector_t stripe_sect,
+			  sector_t log_start)
+{
+	struct stripe_head *sh;
+
+	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 1, 0);
+	if (!sh)
+		return NULL;  /* no more stripe available */
+
+	r5l_recovery_reset_stripe(sh);
+	sh->log_start = log_start;
+
+	return sh;
+}
+
+static struct stripe_head *
+r5c_recovery_lookup_stripe(struct list_head *list, sector_t sect)
+{
+	struct stripe_head *sh;
+
+	list_for_each_entry(sh, list, lru)
+		if (sh->sector == sect)
+			return sh;
+	return NULL;
+}
+
+static void
+r5c_recovery_drop_stripes(struct list_head *cached_stripe_list,
+			  struct r5l_recovery_ctx *ctx)
+{
+	struct stripe_head *sh, *next;
+
+	list_for_each_entry_safe(sh, next, cached_stripe_list, lru) {
+		r5l_recovery_reset_stripe(sh);
+		list_del_init(&sh->lru);
+		raid5_release_stripe(sh);
+	}
+}
+
+static void
+r5c_recovery_replay_stripes(struct list_head *cached_stripe_list,
+			    struct r5l_recovery_ctx *ctx)
+{
+	struct stripe_head *sh, *next;
+
+	list_for_each_entry_safe(sh, next, cached_stripe_list, lru)
+		if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+			r5l_recovery_replay_one_stripe(sh->raid_conf, sh, ctx);
+			list_del_init(&sh->lru);
+			raid5_release_stripe(sh);
+		}
+}
+/* if matches return 0; otherwise return -EINVAL */
+static int
+r5l_recovery_verify_data_checksum(struct r5l_log *log, struct page *page,
+				  sector_t log_offset, __le32 log_checksum)
+{
+	void *addr;
+	u32 checksum;
+
+	sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+		     page, REQ_OP_READ, 0, false);
+	addr = kmap_atomic(page);
+	checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
+	kunmap_atomic(addr);
+	return (le32_to_cpu(log_checksum) == checksum) ? 0 : -EINVAL;
+}
+
+/*
+ * before loading data to stripe cache, we need verify checksum for all data,
+ * if there is mismatch for any data page, we drop all data in the mata block
+ */
+static int
+r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
+					 struct r5l_recovery_ctx *ctx)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	struct r5l_meta_block *mb = page_address(ctx->meta_page);
+	sector_t mb_offset = sizeof(struct r5l_meta_block);
+	sector_t log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+	struct page *page;
+	struct r5l_payload_data_parity *payload;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	while (mb_offset < le32_to_cpu(mb->meta_size)) {
+		payload = (void *)mb + mb_offset;
+
+		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+			if (r5l_recovery_verify_data_checksum(
+				    log, page, log_offset,
+				    payload->checksum[0]) < 0)
+				goto mismatch;
+		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY) {
+			if (r5l_recovery_verify_data_checksum(
+				    log, page, log_offset,
+				    payload->checksum[0]) < 0)
+				goto mismatch;
+			if (conf->max_degraded == 2 && /* q for RAID 6 */
+			    r5l_recovery_verify_data_checksum(
+				    log, page,
+				    r5l_ring_add(log, log_offset,
+						 BLOCK_SECTORS),
+				    payload->checksum[1]) < 0)
+				goto mismatch;
+		} else /* not R5LOG_PAYLOAD_DATA or R5LOG_PAYLOAD_PARITY */
+			goto mismatch;
+
+		log_offset = r5l_ring_add(log, log_offset,
+					  le32_to_cpu(payload->size));
+
+		mb_offset += sizeof(struct r5l_payload_data_parity) +
+			sizeof(__le32) *
+			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+	}
+
+	put_page(page);
+	return 0;
+
+mismatch:
+	put_page(page);
+	return -EINVAL;
+}
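Both payload walkers in this file advance through the meta block with the same stride: a fixed descriptor plus one __le32 checksum per page, while the data itself advances along the ring by payload->size sectors. A standalone restatement of the stride arithmetic (struct layout simplified, 4k pages assumed):

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4k pages */

struct payload_hdr {	/* simplified model of r5l_payload_data_parity */
	unsigned short type;
	unsigned short flags;
	unsigned int size;		/* payload size in sectors */
	unsigned long long location;
	/* followed by one 32-bit checksum per page */
};

int main(void)
{
	unsigned int size_sectors = 16;	/* e.g. two 4k pages */
	unsigned int npages = size_sectors >> (PAGE_SHIFT - 9);
	unsigned int stride = sizeof(struct payload_hdr) + 4 * npages;

	/* mb_offset advances by header + checksums; log_offset by the data */
	printf("npages=%u, mb stride=%u bytes, log stride=%u sectors\n",
	       npages, stride, size_sectors);
	return 0;
}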
+/*
+ * Analyze all data/parity pages in one meta block
+ * Returns:
+ * 0 for success
+ * -EINVAL for unknown playload type
+ * -EAGAIN for checksum mismatch of data page
+ * -ENOMEM for run out of memory (alloc_page failed or run out of stripes)
+ */
+static int
+r5c_recovery_analyze_meta_block(struct r5l_log *log,
+				struct r5l_recovery_ctx *ctx,
+				struct list_head *cached_stripe_list)
+{
+	struct mddev *mddev = log->rdev->mddev;
+	struct r5conf *conf = mddev->private;
+	struct r5l_meta_block *mb;
+	struct r5l_payload_data_parity *payload;
+	int mb_offset;
+	sector_t log_offset;
+	sector_t stripe_sect;
+	struct stripe_head *sh;
+	int ret;
+
+	/*
+	 * for mismatch in data blocks, we will drop all data in this mb, but
+	 * we will still read next mb for other data with FLUSH flag, as
+	 * io_unit could finish out of order.
+	 */
+	ret = r5l_recovery_verify_data_checksum_for_mb(log, ctx);
+	if (ret == -EINVAL)
+		return -EAGAIN;
+	else if (ret)
+		return ret;   /* -ENOMEM duo to alloc_page() failed */
+
+	mb = page_address(ctx->meta_page);
+	mb_offset = sizeof(struct r5l_meta_block);
+	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+
+	while (mb_offset < le32_to_cpu(mb->meta_size)) {
+		int dd;
+
+		payload = (void *)mb + mb_offset;
+		stripe_sect = (payload->header.type == R5LOG_PAYLOAD_DATA) ?
+			raid5_compute_sector(
+				conf, le64_to_cpu(payload->location), 0, &dd,
+				NULL)
+			: le64_to_cpu(payload->location);
+
+		sh = r5c_recovery_lookup_stripe(cached_stripe_list,
+						stripe_sect);
+
+		if (!sh) {
+			sh = r5c_recovery_alloc_stripe(conf, stripe_sect,
+						       ctx->pos);
+			/*
+			 * cannot get stripe from raid5_get_active_stripe
+			 * try replay some stripes
+			 */
+			if (!sh) {
+				r5c_recovery_replay_stripes(
+					cached_stripe_list, ctx);
+				sh = r5c_recovery_alloc_stripe(
+					conf, stripe_sect, ctx->pos);
+			}
+			if (!sh) {
+				pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",
+					mdname(mddev),
+					conf->min_nr_stripes * 2);
+				raid5_set_cache_size(mddev,
+						     conf->min_nr_stripes * 2);
+				sh = r5c_recovery_alloc_stripe(
+					conf, stripe_sect, ctx->pos);
+			}
+			if (!sh) {
+				pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",
+				       mdname(mddev));
+				return -ENOMEM;
+			}
+			list_add_tail(&sh->lru, cached_stripe_list);
+		}
+
+		if (payload->header.type == R5LOG_PAYLOAD_DATA) {
+			if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
+			    test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
+				r5l_recovery_replay_one_stripe(conf, sh, ctx);
+				sh->log_start = ctx->pos;
+				list_move_tail(&sh->lru, cached_stripe_list);
+			}
+			r5l_recovery_load_data(log, sh, ctx, payload,
+					       log_offset);
+		} else if (payload->header.type == R5LOG_PAYLOAD_PARITY)
+			r5l_recovery_load_parity(log, sh, ctx, payload,
+						 log_offset);
+		else
+			return -EINVAL;
+
+		log_offset = r5l_ring_add(log, log_offset,
+					  le32_to_cpu(payload->size));
+
+		mb_offset += sizeof(struct r5l_payload_data_parity) +
+			sizeof(__le32) *
+			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
+	}
+
+	return 0;
+}
/*
 * Load the stripe into cache. The stripe will be written out later by
 * the stripe cache state machine.
 */
static void r5c_recovery_load_one_stripe(struct r5l_log *log,
					 struct stripe_head *sh)
{
	struct r5dev *dev;
	int i;

	for (i = sh->disks; i--; ) {
		dev = sh->dev + i;
		if (test_and_clear_bit(R5_Wantwrite, &dev->flags)) {
			set_bit(R5_InJournal, &dev->flags);
			set_bit(R5_UPTODATE, &dev->flags);
		}
	}
	list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
	atomic_inc(&log->stripe_in_journal_count);
}
/*
 * Scan through the log for all to-be-flushed data
 *
 * For stripes with data and parity, namely Data-Parity stripe
 * (STRIPE_R5C_CACHING == 0), we simply replay all the writes.
 *
 * For stripes with only data, namely Data-Only stripe
 * (STRIPE_R5C_CACHING == 1), we load them to stripe cache state machine.
 *
 * For a stripe, if we see data after parity, we should discard all previous
 * data and parity for this stripe, as these data are already flushed to
 * the array.
 *
 * At the end of the scan, we return the new journal_tail, which points to
 * first data-only stripe on the journal device, or next invalid meta block.
 */
static int r5c_recovery_flush_log(struct r5l_log *log,
				  struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh;
	int ret = 0;

	/* scan through the log */
	while (1) {
		if (r5l_recovery_read_meta_block(log, ctx))
			break;

		ret = r5c_recovery_analyze_meta_block(log, ctx,
						      &ctx->cached_list);
		/*
		 * -EAGAIN means mismatch in data block, in this case, we
		 * still try to scan the next metablock
		 */
		if (ret && ret != -EAGAIN)
			break;	/* ret == -EINVAL or -ENOMEM */
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}

	if (ret == -ENOMEM) {
		r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
		return ret;
	}

	/* replay data-parity stripes */
	r5c_recovery_replay_stripes(&ctx->cached_list, ctx);

	/* load data-only stripes to stripe cache */
	list_for_each_entry(sh, &ctx->cached_list, lru) {
		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5c_recovery_load_one_stripe(log, sh);
		ctx->data_only_stripes++;
	}

	return 0;
}
/*
 * We did a recovery. Now ctx.pos points to an invalid meta block. The new
 * log will start there, but we can't let the superblock point to the last
 * valid meta block. The log might look like:
 * | meta 1| meta 2| meta 3|
 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the
 * superblock points to meta 1 and we write a new valid meta 2n, and a
 * crash happens again, the new recovery will start from meta 1. Since meta
 * 2n is valid now, recovery will think meta 3 is valid, which is wrong.
 * The solution is to create a new meta in meta2 with its seq == meta
 * 1's seq + 10000 and let the superblock point to meta2. The same recovery
 * will not think meta 3 is a valid meta, because its seq doesn't match.
 */

/*
 * Before recovery, the log looks like the following
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^
 *   |- log->last_checkpoint
 *   |- log->last_cp_seq
 *
 * Now we scan through the log until we see invalid entry
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos
 *   |- log->last_cp_seq          |- ctx->seq
 *
 * From this point, we need to increase seq number by 10 to avoid
 * confusing next recovery.
 *
 *   ---------------------------------------------
 *   |           valid log        | invalid log  |
 *   ---------------------------------------------
 *   ^                            ^
 *   |- log->last_checkpoint      |- ctx->pos+1
 *   |- log->last_cp_seq          |- ctx->seq+10001
 *
 * However, it is not safe to start the state machine yet, because data only
 * parities are not yet secured in RAID. To save these data only parities, we
 * rewrite them from seq+11.
 *
 *   -----------------------------------------------------------------
 *   |           valid log        | data only stripes | invalid log  |
 *   -----------------------------------------------------------------
 *   ^                                                ^
 *   |- log->last_checkpoint                          |- ctx->pos+n
 *   |- log->last_cp_seq                              |- ctx->seq+10000+n
 *
 * If failure happens again during this process, the recovery can safely
 * start again from log->last_checkpoint.
 *
 * Once data only stripes are rewritten to journal, we move log_tail
 *
 *   -----------------------------------------------------------------
 *   |     old log        |    data only stripes    | invalid log  |
 *   -----------------------------------------------------------------
 *                        ^                         ^
 *                        |- log->last_checkpoint   |- ctx->pos+n
 *                        |- log->last_cp_seq       |- ctx->seq+10000+n
 *
 * Then we can safely start the state machine. If failure happens from this
 * point on, the recovery will start from new log->last_checkpoint.
 */
static int
r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct stripe_head *sh, *next;
	struct mddev *mddev = log->rdev->mddev;
	struct page *page;

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
		       mdname(mddev));
		return -ENOMEM;
	}

	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
		struct r5l_meta_block *mb;
		int i;
		int offset;
		sector_t write_pos;

		WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
		r5l_recovery_create_empty_meta_block(log, page,
						     ctx->pos, ctx->seq);
		mb = page_address(page);
		offset = le32_to_cpu(mb->meta_size);
		write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

		for (i = sh->disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			struct r5l_payload_data_parity *payload;
			void *addr;

			if (test_bit(R5_InJournal, &dev->flags)) {
				payload = (void *)mb + offset;
				payload->header.type = cpu_to_le16(
					R5LOG_PAYLOAD_DATA);
				payload->size = BLOCK_SECTORS;
				payload->location = cpu_to_le64(
					raid5_compute_blocknr(sh, i, 0));
				addr = kmap_atomic(dev->page);
				payload->checksum[0] = cpu_to_le32(
					crc32c_le(log->uuid_checksum, addr,
						  PAGE_SIZE));
				kunmap_atomic(addr);
				sync_page_io(log->rdev, write_pos, PAGE_SIZE,
					     dev->page, REQ_OP_WRITE, 0,
					     false);
				write_pos = r5l_ring_add(log, write_pos,
							 BLOCK_SECTORS);
				offset += sizeof(__le32) +
					sizeof(struct r5l_payload_data_parity);
			}
		}
		mb->meta_size = cpu_to_le32(offset);
		mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
						     mb, PAGE_SIZE));
		sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
			     REQ_OP_WRITE, REQ_FUA, false);
		sh->log_start = ctx->pos;
		ctx->pos = write_pos;
		ctx->seq += 1;

		list_del_init(&sh->lru);
		raid5_release_stripe(sh);
	}
	__free_page(page);
	return 0;
}
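Every position update in the recovery path goes through r5l_ring_add(), which wraps around the circular journal device. A self-contained sketch of that wrap-around arithmetic; the field names and layout here are illustrative, not the driver's actual structures:

#include <stdio.h>

typedef unsigned long long sector_t;

struct toy_ring {
	sector_t data_offset;	/* first usable sector of the journal */
	sector_t device_size;	/* usable sectors, a multiple of 8 */
};

static sector_t toy_ring_add(const struct toy_ring *r, sector_t start,
			     sector_t inc)
{
	start += inc;
	if (start >= r->data_offset + r->device_size)
		start -= r->device_size;
	return start;
}

int main(void)
{
	struct toy_ring r = { .data_offset = 8, .device_size = 1024 };

	/* advancing one 4KiB block (8 sectors) near the end wraps around */
	printf("%llu\n", toy_ring_add(&r, 1028, 8));	/* prints 12 */
	return 0;
}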
static int r5l_recovery_log(struct r5l_log *log)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5l_recovery_ctx ctx;
	int ret;
	sector_t pos;
	struct stripe_head *sh;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	ctx.data_only_stripes = 0;
	ctx.data_parity_stripes = 0;
	INIT_LIST_HEAD(&ctx.cached_list);

	if (!ctx.meta_page)
		return -ENOMEM;

	ret = r5c_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	if (ret)
		return ret;

	pos = ctx.pos;
	ctx.seq += 10000;

	if (ctx.data_only_stripes == 0) {
		log->next_checkpoint = ctx.pos;
		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
	} else {
		sh = list_last_entry(&ctx.cached_list, struct stripe_head,
				     lru);
		log->next_checkpoint = sh->log_start;
	}

	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
		pr_debug("md/raid:%s: starting from clean shutdown\n",
			 mdname(mddev));
	else {
		pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n",
			 mdname(mddev), ctx.data_only_stripes,
			 ctx.data_parity_stripes);

		if (ctx.data_only_stripes > 0)
			if (r5c_recovery_rewrite_data_only_stripes(log,
								   &ctx)) {
				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
				       mdname(mddev));
				return -EIO;
			}
	}

	log->log_start = ctx.pos;
	log->seq = ctx.seq;
	log->last_checkpoint = pos;
	r5l_write_super(log, pos);

	return 0;
}
static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
}

/* old helper, removed by this commit (renamed to r5l_recovery_read_meta_block) */
static int r5l_read_meta_block(struct r5l_log *log,
			       struct r5l_recovery_ctx *ctx)
{
	struct page *page = ctx->meta_page;
	struct r5l_meta_block *mb;
	u32 crc, stored_crc;

	if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, REQ_OP_READ,
			  0, false))
		return -EIO;

	mb = page_address(page);
	stored_crc = le32_to_cpu(mb->checksum);
	mb->checksum = 0;

	if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
	    le64_to_cpu(mb->seq) != ctx->seq ||
	    mb->version != R5LOG_VERSION ||
	    le64_to_cpu(mb->position) != ctx->pos)
		return -EINVAL;

	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	if (stored_crc != crc)
		return -EINVAL;

	if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
		return -EINVAL;

	ctx->meta_total_blocks = BLOCK_SECTORS;

	return 0;
}

static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page)
{
	struct r5conf *conf = mddev->private;
	int ret;

	if (!conf->log)
		return 0;

	switch (conf->log->r5c_journal_mode) {
	case R5C_JOURNAL_MODE_WRITE_THROUGH:
		ret = snprintf(
			page, PAGE_SIZE, "[%s] %s\n",
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
		break;
	case R5C_JOURNAL_MODE_WRITE_BACK:
		ret = snprintf(
			page, PAGE_SIZE, "%s [%s]\n",
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_THROUGH],
			r5c_journal_mode_str[R5C_JOURNAL_MODE_WRITE_BACK]);
		break;
	default:
		ret = 0;
	}
	return ret;
}

static ssize_t r5c_journal_mode_store(struct mddev *mddev,
				      const char *page, size_t length)
{
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;
	int val = -1, i;
	int len = length;

	if (!log)
		return -ENODEV;

	if (len && page[len - 1] == '\n')
		len -= 1;
	for (i = 0; i < ARRAY_SIZE(r5c_journal_mode_str); i++)
		if (strlen(r5c_journal_mode_str[i]) == len &&
		    strncmp(page, r5c_journal_mode_str[i], len) == 0) {
			val = i;
			break;
		}
	if (val < R5C_JOURNAL_MODE_WRITE_THROUGH ||
	    val > R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	mddev_suspend(mddev);
	conf->log->r5c_journal_mode = val;
	mddev_resume(mddev);

	pr_debug("md/raid:%s: setting r5c cache mode to %d: %s\n",
		 mdname(mddev), val, r5c_journal_mode_str[val]);
	return length;
}
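The journal_mode attribute declared just below exposes the cache mode with the active entry bracketed (for example "[write-through] write-back") and accepts a mode name on write; the store path deliberately tolerates the trailing newline that echo appends. A userspace sketch of that matching logic; the exact mode strings are an assumption here, taken to be "write-through" and "write-back":

#include <stdio.h>
#include <string.h>

static const char *modes[] = { "write-through", "write-back" };

static int parse_mode(const char *page, size_t length)
{
	size_t len = length;

	if (len && page[len - 1] == '\n')	/* echo appends a newline */
		len--;
	for (int i = 0; i < 2; i++)
		if (strlen(modes[i]) == len && !strncmp(page, modes[i], len))
			return i;
	return -1;
}

int main(void)
{
	printf("%d\n", parse_mode("write-back\n", 11));	/* prints 1 */
	return 0;
}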
static struct md_sysfs_entry
r5c_journal_mode = __ATTR(journal_mode, 0644,
			  r5c_journal_mode_show, r5c_journal_mode_store);

/*
 * Try to handle a write operation in the caching phase. This function
 * should only be called in write-back mode.
 *
 * If all outstanding writes can be handled in the caching phase, return 0.
 * If a write requires the write-out phase, call r5c_make_stripe_write_out()
 * and return -EAGAIN.
 */
int r5c_try_caching_write(struct r5conf *conf,
			  struct stripe_head *sh,
			  struct stripe_head_state *s,
			  int disks)
{
	struct r5l_log *log = conf->log;
	int i;
	struct r5dev *dev;
	int to_cache = 0;

	BUG_ON(!r5c_is_writeback(log));

	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
		/*
		 * There are two different scenarios here:
		 *  1. The stripe has some data cached, and it is sent to
		 *     the write-out phase for reclaim
		 *  2. The stripe is clean, and this is the first write
		 *
		 * For 1, return -EAGAIN, so we continue with
		 * handle_stripe_dirtying().
		 *
		 * For 2, set STRIPE_R5C_CACHING and continue with the
		 * caching write.
		 */

		/* case 1: anything injournal or anything in written */
		if (s->injournal > 0 || s->written > 0)
			return -EAGAIN;
		/* case 2 */
		set_bit(STRIPE_R5C_CACHING, &sh->state);
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		/* if non-overwrite, use the writing-out phase */
		if (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags) &&
		    !test_bit(R5_InJournal, &dev->flags)) {
			r5c_make_stripe_write_out(sh);
			return -EAGAIN;
		}
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->towrite) {
			set_bit(R5_Wantwrite, &dev->flags);
			set_bit(R5_Wantdrain, &dev->flags);
			set_bit(R5_LOCKED, &dev->flags);
			to_cache++;
		}
	}

	if (to_cache) {
		set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
		/*
		 * set STRIPE_LOG_TRAPPED, which triggers r5c_cache_data()
		 * in ops_run_io(). STRIPE_LOG_TRAPPED will be cleared in
		 * r5c_handle_data_cached()
		 */
		set_bit(STRIPE_LOG_TRAPPED, &sh->state);
	}

	return 0;
}

/* old recovery path, removed by this commit */
static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
					 struct r5l_recovery_ctx *ctx,
					 sector_t stripe_sect,
					 int *offset, sector_t *log_offset)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct stripe_head *sh;
	struct r5l_payload_data_parity *payload;
	int disk_index;

	sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
	while (1) {
		payload = page_address(ctx->meta_page) + *offset;

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
			raid5_compute_sector(conf,
					     le64_to_cpu(payload->location),
					     0, &disk_index, sh);

			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
			ctx->meta_total_blocks += BLOCK_SECTORS;
		} else {
			disk_index = sh->pd_idx;
			sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_READ, 0,
				     false);
			sh->dev[disk_index].log_checksum =
				le32_to_cpu(payload->checksum[0]);
			set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);

			if (sh->qd_idx >= 0) {
				disk_index = sh->qd_idx;
				sync_page_io(log->rdev,
					     r5l_ring_add(log, *log_offset,
							  BLOCK_SECTORS),
					     PAGE_SIZE,
					     sh->dev[disk_index].page,
					     REQ_OP_READ, 0, false);
				sh->dev[disk_index].log_checksum =
					le32_to_cpu(payload->checksum[1]);
				set_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags);
			}
			ctx->meta_total_blocks +=
				BLOCK_SECTORS * conf->max_degraded;
		}

		*log_offset = r5l_ring_add(log, *log_offset,
					   le32_to_cpu(payload->size));
		*offset += sizeof(struct r5l_payload_data_parity) +
			sizeof(__le32) *
			(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));

		if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
			break;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		void *addr;
		u32 checksum;

		if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
			continue;
		addr = kmap_atomic(sh->dev[disk_index].page);
		checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
		kunmap_atomic(addr);
		if (checksum != sh->dev[disk_index].log_checksum)
			goto error;
	}

	for (disk_index = 0; disk_index < sh->disks; disk_index++) {
		struct md_rdev *rdev, *rrdev;

		if (!test_and_clear_bit(R5_Wantwrite,
					&sh->dev[disk_index].flags))
			continue;

		/* in case device is broken */
		rdev = rcu_dereference(conf->disks[disk_index].rdev);
		if (rdev)
			sync_page_io(rdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE,
				     0, false);
		rrdev = rcu_dereference(conf->disks[disk_index].replacement);
		if (rrdev)
			sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
				     sh->dev[disk_index].page, REQ_OP_WRITE,
				     0, false);
	}
	raid5_release_stripe(sh);
	return 0;

error:
	for (disk_index = 0; disk_index < sh->disks; disk_index++)
		sh->dev[disk_index].flags = 0;
	raid5_release_stripe(sh);
	return -EINVAL;
}
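A toy model of the decision r5c_try_caching_write() makes, stripped of locking and flag plumbing; the fields below are simplified stand-ins for the kernel structures, not the real layout:

#include <stdbool.h>
#include <stdio.h>

struct toy_stripe {
	bool caching;		/* STRIPE_R5C_CACHING */
	int injournal;		/* data pages already in the journal */
	int written;		/* data pages already written out */
	bool has_partial_write;	/* a non-overwrite (read-modify) write */
};

/* returns 0 to cache the write, -1 to fall through to write-out */
static int toy_try_caching(struct toy_stripe *sh)
{
	if (!sh->caching) {
		if (sh->injournal > 0 || sh->written > 0)
			return -1;	/* mid write-out: keep going */
		sh->caching = true;	/* clean stripe: first cached write */
	}
	if (sh->has_partial_write)
		return -1;		/* non-overwrite needs write-out */
	return 0;
}

int main(void)
{
	struct toy_stripe sh = { .caching = false };

	printf("%d\n", toy_try_caching(&sh));	/* prints 0: cached */
	return 0;
}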
/*
 * free extra pages (orig_page) we allocated for prexor
 */
void r5c_release_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	bool using_disk_info_extra_page;

	using_disk_info_extra_page =
		sh->dev[0].orig_page == conf->disks[0].extra_page;

	for (i = sh->disks; i--; )
		if (sh->dev[i].page != sh->dev[i].orig_page) {
			struct page *p = sh->dev[i].orig_page;

			sh->dev[i].orig_page = sh->dev[i].page;
			if (!using_disk_info_extra_page)
				put_page(p);
		}

	if (using_disk_info_extra_page) {
		clear_bit(R5C_EXTRA_PAGE_IN_USE, &conf->cache_state);
		md_wakeup_thread(conf->mddev->thread);
	}
}

void r5c_use_extra_page(struct stripe_head *sh)
{
	struct r5conf *conf = sh->raid_conf;
	int i;
	struct r5dev *dev;

	for (i = sh->disks; i--; ) {
		dev = &sh->dev[i];
		if (dev->orig_page != dev->page)
			put_page(dev->orig_page);
		dev->orig_page = conf->disks[i].extra_page;
	}
}

/* old recovery path, removed by this commit */
static int r5l_recovery_flush_one_meta(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
	struct r5conf *conf = log->rdev->mddev->private;
	struct r5l_payload_data_parity *payload;
	struct r5l_meta_block *mb;
	int offset;
	sector_t log_offset;
	sector_t stripe_sector;

	mb = page_address(ctx->meta_page);
	offset = sizeof(struct r5l_meta_block);
	log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);

	while (offset < le32_to_cpu(mb->meta_size)) {
		int dd;

		payload = (void *)mb + offset;
		stripe_sector = raid5_compute_sector(conf,
						     le64_to_cpu(payload->location),
						     0, &dd, NULL);
		if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
						  &offset, &log_offset))
			return -EINVAL;
	}
	return 0;
}

/* copy data/parity from log to raid disks (old recovery loop, removed) */
static void r5l_recovery_flush_log(struct r5l_log *log,
				   struct r5l_recovery_ctx *ctx)
{
	while (1) {
		if (r5l_read_meta_block(log, ctx))
			return;
		if (r5l_recovery_flush_one_meta(log, ctx))
			return;
		ctx->seq++;
		ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
	}
}
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
					  u64 seq)
{
	struct page *page;
	struct r5l_meta_block *mb;
	u32 crc;

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page)
		return -ENOMEM;

	mb = page_address(page);
	mb->magic = cpu_to_le32(R5LOG_MAGIC);
	mb->version = R5LOG_VERSION;
	mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
	mb->seq = cpu_to_le64(seq);
	mb->position = cpu_to_le64(pos);
	crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
	mb->checksum = cpu_to_le32(crc);

	if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
			  REQ_FUA, false)) {
		__free_page(page);
		return -EIO;
	}
	__free_page(page);
	return 0;
}

/*
 * clean up the stripe (clear R5_InJournal for dev[pd_idx] etc.) after the
 * stripe is committed to RAID disks.
 */
void r5c_finish_stripe_write_out(struct r5conf *conf,
				 struct stripe_head *sh,
				 struct stripe_head_state *s)
{
	int i;
	int do_wakeup = 0;

	if (!conf->log ||
	    !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags))
		return;

	WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state));
	clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags);

	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	for (i = sh->disks; i--; ) {
		clear_bit(R5_InJournal, &sh->dev[i].flags);
		if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
			do_wakeup = 1;
	}

	/*
	 * analyse_stripe() runs before r5c_finish_stripe_write_out(),
	 * We updated R5_InJournal, so we also update s->injournal.
	 */
	s->injournal = 0;

	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
		if (atomic_dec_and_test(&conf->pending_full_writes))
			md_wakeup_thread(conf->mddev->thread);

	if (do_wakeup)
		wake_up(&conf->wait_for_overlap);

	if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;

	spin_lock_irq(&conf->log->stripe_in_journal_lock);
	list_del_init(&sh->r5c);
	spin_unlock_irq(&conf->log->stripe_in_journal_lock);
	sh->log_start = MaxSector;
	atomic_dec(&conf->log->stripe_in_journal_count);
	r5c_update_log_state(conf->log);
}
int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
		   struct stripe_head_state *s)
{
	struct r5conf *conf = sh->raid_conf;
	int pages = 0;
	int reserve;
	int i;
	int ret = 0;

	BUG_ON(!log);

	for (i = 0; i < sh->disks; i++) {
		void *addr;

		if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
			continue;
		addr = kmap_atomic(sh->dev[i].page);
		sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
						    addr, PAGE_SIZE);
		kunmap_atomic(addr);
		pages++;
	}
	WARN_ON(pages == 0);

	/*
	 * The stripe must enter state machine again to call endio, so
	 * don't delay.
	 */
	clear_bit(STRIPE_DELAYED, &sh->state);
	atomic_inc(&sh->count);

	mutex_lock(&log->io_mutex);
	/* meta + data */
	reserve = (1 + pages) << (PAGE_SHIFT - 9);

	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    sh->log_start == MaxSector)
		r5l_add_no_space_stripe(log, sh);
	else if (!r5l_has_free_space(log, reserve)) {
		if (sh->log_start == log->last_checkpoint)
			BUG();
		else
			r5l_add_no_space_stripe(log, sh);
	} else {
		ret = r5l_log_stripe(log, sh, pages, 0);
		if (ret) {
			spin_lock_irq(&log->io_list_lock);
			list_add_tail(&sh->log_list, &log->no_mem_stripes);
			spin_unlock_irq(&log->io_list_lock);
		}
	}

	mutex_unlock(&log->io_mutex);
	return 0;
}

/* old recovery entry point, replaced above by the r5c-aware version */
static int r5l_recovery_log(struct r5l_log *log)
{
	struct r5l_recovery_ctx ctx;

	ctx.pos = log->last_checkpoint;
	ctx.seq = log->last_cp_seq;
	ctx.meta_page = alloc_page(GFP_KERNEL);
	if (!ctx.meta_page)
		return -ENOMEM;

	r5l_recovery_flush_log(log, &ctx);
	__free_page(ctx.meta_page);

	/*
	 * we did a recovery. Now ctx.pos points to an invalid meta block.
	 * New log will start here, but we can't let the superblock point to
	 * the last valid meta block. The log might look like:
	 * | meta 1| meta 2| meta 3|
	 * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
	 * superblock points to meta 1, we write a new valid meta 2n. if
	 * crash happens again, new recovery will start from meta 1. Since
	 * meta 2n is valid now, recovery will think meta 3 is valid, which
	 * is wrong. The solution is we create a new meta in meta2 with its
	 * seq == meta 1's seq + 10 and let the superblock point to meta2.
	 * The same recovery will not think meta 3 is a valid meta, because
	 * its seq doesn't match.
	 */
	if (ctx.seq > log->last_cp_seq) {
		int ret;

		ret = r5l_log_write_empty_meta_block(log, ctx.pos,
						     ctx.seq + 10);
		if (ret)
			return ret;
		log->seq = ctx.seq + 11;
		log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
		r5l_write_super(log, ctx.pos);
		log->last_checkpoint = ctx.pos;
		log->next_checkpoint = ctx.pos;
	} else {
		log->log_start = ctx.pos;
		log->seq = ctx.seq;
	}
	return 0;
}

/* old version: MD_CHANGE_DEVS on mddev->flags, renamed by this commit */
static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
	struct mddev *mddev = log->rdev->mddev;

	log->rdev->journal_tail = cp;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
static int r5l_load_log(struct r5l_log *log)
...
@@ -1121,7 +2494,7 @@ static int r5l_load_log(struct r5l_log *log)
 	sector_t cp = log->rdev->journal_tail;
 	u32 stored_crc, expected_crc;
 	bool create_super = false;
-	int ret;
+	int ret = 0;
 
 	/* Make sure it's valid */
 	if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
...
@@ -1171,11 +2544,18 @@ static int r5l_load_log(struct r5l_log *log)
 	if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
 		log->max_free_space = RECLAIM_MAX_FREE_SPACE;
 	log->last_checkpoint = cp;
+	log->next_checkpoint = cp;
 
 	__free_page(page);
 
-	return r5l_recovery_log(log);
+	if (create_super) {
+		log->log_start = r5l_ring_add(log, cp, BLOCK_SECTORS);
+		log->seq = log->last_cp_seq + 1;
+		log->next_checkpoint = cp;
+	} else
+		ret = r5l_recovery_log(log);
+
+	r5c_update_log_state(log);
+	return ret;
 ioerr:
 	__free_page(page);
 	return ret;
...
@@ -1188,6 +2568,22 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	if (PAGE_SIZE != 4096)
 		return -EINVAL;
+
+	/*
+	 * The PAGE_SIZE must be big enough to hold 1 r5l_meta_block and
+	 * raid_disks r5l_payload_data_parity.
+	 *
+	 * Write journal and cache does not work for very big array
+	 * (raid_disks > 203)
+	 */
+	if (sizeof(struct r5l_meta_block) +
+	    ((sizeof(struct r5l_payload_data_parity) + sizeof(__le32)) *
+	     conf->raid_disks) > PAGE_SIZE) {
+		pr_err("md/raid:%s: write journal/cache doesn't work for array with %d disks\n",
+		       mdname(conf->mddev), conf->raid_disks);
+		return -EINVAL;
+	}
+
 	log = kzalloc(sizeof(*log), GFP_KERNEL);
 	if (!log)
 		return -ENOMEM;
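The size check above caps raid_disks so one 4 KiB page can hold the meta block plus one payload descriptor and checksum per disk. Assuming the md_p.h layout gives a 32-byte meta block, a 16-byte payload descriptor and 4-byte checksums (these sizes are assumptions, not restated in this hunk), the arithmetic lands exactly on the 203-disk bound mentioned in the comment:

#include <stdio.h>

int main(void)
{
	const unsigned int meta = 32, payload = 16, csum = 4, page = 4096;
	unsigned int disks = 0;

	while (meta + (payload + csum) * (disks + 1) <= page)
		disks++;
	printf("max raid_disks = %u\n", disks);	/* prints 203 */
	return 0;
}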
...
@@ -1227,6 +2623,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 						 log->rdev->mddev, "reclaim");
 	if (!log->reclaim_thread)
 		goto reclaim_thread;
+
+	log->reclaim_thread->timeout = R5C_RECLAIM_WAKEUP_INTERVAL;
+
 	init_waitqueue_head(&log->iounit_wait);
 
 	INIT_LIST_HEAD(&log->no_mem_stripes);
...
@@ -1234,6 +2632,13 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
+	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
+
+	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
+	INIT_LIST_HEAD(&log->stripe_in_journal_list);
+	spin_lock_init(&log->stripe_in_journal_lock);
+	atomic_set(&log->stripe_in_journal_count, 0);
+
 	if (r5l_load_log(log))
 		goto error;
...
drivers/md/raid5.c
View file @ 20737738
...
@@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
 static struct workqueue_struct *raid5_wq;
-/*
- * Stripe cache
- */
-
-#define NR_STRIPES		256
-#define STRIPE_SIZE		PAGE_SIZE
-#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
-#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
-#define IO_THRESHOLD		1
-#define BYPASS_THRESHOLD	1
-#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
-#define HASH_MASK		(NR_HASH - 1)
-#define MAX_STRIPE_BATCH	8
 
 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
...
@@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 	local_irq_enable();
 }
 
-/* bio's attached to a stripe+device for I/O are linked together in bi_sector
- * order without overlap. There may be several bio's per stripe+device, and
- * a bio could span several devices.
- * When walking this list for a particular stripe+device, we must never proceed
- * beyond a bio that extends past this device, as the next bio might no longer
- * be valid.
- * This function is used to determine the 'next' bio in the list, given the
- * sector of the current stripe+device
- */
-static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
-{
-	int sectors = bio_sectors(bio);
-
-	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
-		return bio->bi_next;
-	else
-		return NULL;
-}
-
-/*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	atomic_inc(segments);
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
-	unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	int old, new;
-
-	do {
-		old = atomic_read(segments);
-		new = (old & 0xffff) | (cnt << 16);
-	} while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-
-	atomic_set(segments, cnt);
-}
-
 /* Find first data disk in a raid6 stripe */
 static inline int raid6_d0(struct stripe_head *sh)
 {
...
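The helpers removed above (they move to raid5.h in this commit) pack two 16-bit counters into bio->bi_phys_segments: active stripes in the low half, processed stripes in the high half, with a cmpxchg loop guarding updates to the high half. A standalone sketch of the same packing trick using C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

static int toy_processed(atomic_uint *seg)
{
	return (atomic_load(seg) >> 16) & 0xffff;
}

static int toy_dec_active(atomic_uint *seg)
{
	return (atomic_fetch_sub(seg, 1) - 1) & 0xffff;
}

static void toy_set_processed(atomic_uint *seg, unsigned int cnt)
{
	unsigned int old = atomic_load(seg), newv;

	do {	/* keep the low half, replace the high half */
		newv = (old & 0xffff) | (cnt << 16);
	} while (!atomic_compare_exchange_weak(seg, &old, newv));
}

int main(void)
{
	atomic_uint seg = 2;	/* two active stripes, none processed */

	toy_set_processed(&seg, 3);
	printf("active %d, processed %d\n",
	       toy_dec_active(&seg), toy_processed(&seg));	/* 1, 3 */
	return 0;
}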
@@ -289,8 +218,27 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			      struct list_head *temp_inactive_list)
 {
+	int i;
+	int injournal = 0;	/* number of data pages with R5_InJournal */
+
 	BUG_ON(!list_empty(&sh->lru));
 	BUG_ON(atomic_read(&conf->active_stripes) == 0);
+
+	if (r5c_is_writeback(conf->log))
+		for (i = sh->disks; i--; )
+			if (test_bit(R5_InJournal, &sh->dev[i].flags))
+				injournal++;
+	/*
+	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
+	 * data in journal, so they are not released to cached lists
+	 */
+	if (conf->quiesce && r5c_is_writeback(conf->log) &&
+	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+			r5c_make_stripe_write_out(sh);
+		set_bit(STRIPE_HANDLE, &sh->state);
+	}
+
 	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 		if (test_bit(STRIPE_DELAYED, &sh->state) &&
		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
...
@@ -316,8 +264,30 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
			    < IO_THRESHOLD)
				md_wakeup_thread(conf->mddev->thread);
 	atomic_dec(&conf->active_stripes);
-	if (!test_bit(STRIPE_EXPANDING, &sh->state))
-		list_add_tail(&sh->lru, temp_inactive_list);
+	if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+		if (!r5c_is_writeback(conf->log))
+			list_add_tail(&sh->lru, temp_inactive_list);
+		else {
+			WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
+			if (injournal == 0)
+				list_add_tail(&sh->lru, temp_inactive_list);
+			else if (injournal == conf->raid_disks - conf->max_degraded) {
+				/* full stripe */
+				if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
+					atomic_inc(&conf->r5c_cached_full_stripes);
+				if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+					atomic_dec(&conf->r5c_cached_partial_stripes);
+				list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+				r5c_check_cached_full_stripe(conf);
+			} else {
+				/* partial stripe */
+				if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
+						      &sh->state))
+					atomic_inc(&conf->r5c_cached_partial_stripes);
+				list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
+			}
+		}
+	}
 }
...
@@ -541,7 +511,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 		if (dev->toread || dev->read || dev->towrite || dev->written ||
		    test_bit(R5_LOCKED, &dev->flags)) {
-			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
+			pr_err("sector=%llx i=%d %p %p %p %p %d\n",
			       (unsigned long long)sh->sector, i, dev->toread,
			       dev->read, dev->towrite, dev->written,
			       test_bit(R5_LOCKED, &dev->flags));
...
@@ -680,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 		}
 		if (noblock && sh == NULL)
 			break;
+		r5c_check_stripe_cache_usage(conf);
 		if (!sh) {
 			set_bit(R5_INACTIVE_BLOCKED,
				&conf->cache_state);
+			r5l_wake_reclaim(conf->log, 0);
 			wait_event_lock_irq(
				conf->wait_for_stripe,
				!list_empty(conf->inactive_list + hash) &&
...
@@ -901,8 +874,19 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
+	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+		/* writing out phase */
+		if (s->waiting_extra_page)
+			return;
+		if (r5l_write_stripe(conf->log, sh) == 0)
+			return;
+	} else {
+		/* caching phase */
+		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+			r5c_cache_data(conf->log, sh, s);
+			return;
+		}
+	}
-	if (r5l_write_stripe(conf->log, sh) == 0)
-		return;
 	for (i = disks; i--; ) {
 		int op, op_flags = 0;
 		int replace_only = 0;
...
@@ -977,7 +961,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			if (bad < 0) {
 				set_bit(BlockedBadBlocks, &rdev->flags);
 				if (!conf->mddev->external &&
-				    conf->mddev->flags) {
+				    conf->mddev->sb_flags) {
					/* It is very unlikely, but we might
					 * still need to write out the
					 * bad block log - better give it
...
@@ -1115,7 +1099,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 static struct dma_async_tx_descriptor *
 async_copy_data(int frombio, struct bio *bio, struct page **page,
	sector_t sector, struct dma_async_tx_descriptor *tx,
-	struct stripe_head *sh)
+	struct stripe_head *sh, int no_skipcopy)
 {
 	struct bio_vec bvl;
 	struct bvec_iter iter;
...
@@ -1155,7 +1139,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 			if (frombio) {
 				if (sh->raid_conf->skip_copy &&
				    b_offset == 0 && page_offset == 0 &&
-				    clen == STRIPE_SIZE)
+				    clen == STRIPE_SIZE &&
+				    !no_skipcopy)
					*page = bio_page;
				else
					tx = async_memcpy(*page, bio_page, page_offset,
@@ -1237,7 +1222,7 @@ static void ops_run_biofill(struct stripe_head *sh)
...
@@ -1237,7 +1222,7 @@ static void ops_run_biofill(struct stripe_head *sh)
while
(
rbi
&&
rbi
->
bi_iter
.
bi_sector
<
while
(
rbi
&&
rbi
->
bi_iter
.
bi_sector
<
dev
->
sector
+
STRIPE_SECTORS
)
{
dev
->
sector
+
STRIPE_SECTORS
)
{
tx
=
async_copy_data
(
0
,
rbi
,
&
dev
->
page
,
tx
=
async_copy_data
(
0
,
rbi
,
&
dev
->
page
,
dev
->
sector
,
tx
,
sh
);
dev
->
sector
,
tx
,
sh
,
0
);
rbi
=
r5_next_bio
(
rbi
,
dev
->
sector
);
rbi
=
r5_next_bio
(
rbi
,
dev
->
sector
);
}
}
}
}
...
@@ -1364,10 +1349,15 @@ static int set_syndrome_sources(struct page **srcs,
 		if (i == sh->qd_idx || i == sh->pd_idx ||
		    (srctype == SYNDROME_SRC_ALL) ||
		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
-		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		     (test_bit(R5_Wantdrain, &dev->flags) ||
+		      test_bit(R5_InJournal, &dev->flags))) ||
		    (srctype == SYNDROME_SRC_WRITTEN &&
-		     dev->written))
-			srcs[slot] = sh->dev[i].page;
+		     dev->written)) {
+			if (test_bit(R5_InJournal, &dev->flags))
+				srcs[slot] = sh->dev[i].orig_page;
+			else
+				srcs[slot] = sh->dev[i].page;
+		}
 		i = raid6_next_disk(i, disks);
 	} while (i != d0_idx);
@@ -1546,6 +1536,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
...
@@ -1546,6 +1536,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
pr_debug
(
"%s: stripe %llu
\n
"
,
__func__
,
pr_debug
(
"%s: stripe %llu
\n
"
,
__func__
,
(
unsigned
long
long
)
sh
->
sector
);
(
unsigned
long
long
)
sh
->
sector
);
if
(
r5c_is_writeback
(
sh
->
raid_conf
->
log
))
/*
* raid5-cache write back uses orig_page during prexor.
* After prexor, it is time to free orig_page
*/
r5c_release_extra_page
(
sh
);
}
}
static
struct
dma_async_tx_descriptor
*
static
struct
dma_async_tx_descriptor
*
...
@@ -1567,7 +1564,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_Wantdrain, &dev->flags))
+		if (test_bit(R5_InJournal, &dev->flags))
+			xor_srcs[count++] = dev->orig_page;
+		else if (test_bit(R5_Wantdrain, &dev->flags))
			xor_srcs[count++] = dev->page;
 	}
...
@@ -1601,6 +1600,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
+	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks;
 	int i;
 	struct stripe_head *head_sh = sh;
...
@@ -1618,6 +1618,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 again:
 			dev = &sh->dev[i];
+			/*
+			 * clear R5_InJournal, so when rewriting a page in
+			 * journal, it is not skipped by r5l_log_stripe()
+			 */
+			clear_bit(R5_InJournal, &dev->flags);
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
...
@@ -1637,8 +1642,10 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
					set_bit(R5_Discard, &dev->flags);
				else {
					tx = async_copy_data(1, wbi, &dev->page,
-						dev->sector, tx, sh);
-					if (dev->page != dev->orig_page) {
+							     dev->sector, tx, sh,
+							     r5c_is_writeback(conf->log));
+					if (dev->page != dev->orig_page &&
+					    !r5c_is_writeback(conf->log)) {
						set_bit(R5_SkipCopy, &dev->flags);
						clear_bit(R5_UPTODATE, &dev->flags);
						clear_bit(R5_OVERWRITE, &dev->flags);
...
@@ -1746,7 +1753,8 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (head_sh->dev[i].written)
+			if (head_sh->dev[i].written ||
+			    test_bit(R5_InJournal, &head_sh->dev[i].flags))
				xor_srcs[count++] = dev->page;
 		}
 	} else {
...
@@ -2000,7 +2008,10 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		spin_lock_init(&sh->batch_lock);
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
+		INIT_LIST_HEAD(&sh->r5c);
+		INIT_LIST_HEAD(&sh->log_list);
 		atomic_set(&sh->count, 1);
+		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
...
@@ -2240,10 +2251,24 @@ static int resize_stripes(struct r5conf *conf, int newsize)
	 */
 	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
 	if (ndisks) {
-		for (i = 0; i < conf->raid_disks; i++)
+		for (i = 0; i < conf->pool_size; i++)
			ndisks[i] = conf->disks[i];
-		kfree(conf->disks);
-		conf->disks = ndisks;
+
+		for (i = conf->pool_size; i < newsize; i++) {
+			ndisks[i].extra_page = alloc_page(GFP_NOIO);
+			if (!ndisks[i].extra_page)
+				err = -ENOMEM;
+		}
+
+		if (err) {
+			for (i = conf->pool_size; i < newsize; i++)
+				if (ndisks[i].extra_page)
+					put_page(ndisks[i].extra_page);
+			kfree(ndisks);
+		} else {
+			kfree(conf->disks);
+			conf->disks = ndisks;
+		}
 	} else
		err = -ENOMEM;
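The resize path above allocates every new extra_page before committing: on any failure it rolls back all of them and keeps the old disks array. A toy userspace model of that allocate-then-roll-back pattern (names are illustrative, not the driver's API):

#include <stdlib.h>

struct toy_disk { void *extra_page; };

static struct toy_disk *grow_disks(struct toy_disk *old, int old_n, int new_n)
{
	struct toy_disk *nd = calloc(new_n, sizeof(*nd));
	int i, err = 0;

	if (!nd)
		return NULL;
	for (i = 0; i < old_n; i++)
		nd[i] = old[i];
	for (i = old_n; i < new_n; i++) {
		nd[i].extra_page = malloc(4096);
		if (!nd[i].extra_page)
			err = 1;
	}
	if (err) {	/* roll back: free everything new, keep the old array */
		for (i = old_n; i < new_n; i++)
			free(nd[i].extra_page);
		free(nd);
		return NULL;
	}
	free(old);	/* commit: old array replaced */
	return nd;
}

int main(void)
{
	struct toy_disk *d = grow_disks(NULL, 0, 4);

	if (d) {
		for (int i = 0; i < 4; i++)
			free(d[i].extra_page);
		free(d);
	}
	return 0;
}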
...
@@ -2342,10 +2367,8 @@ static void raid5_end_read_request(struct bio * bi)
			 * replacement device.  We just fail those on
			 * any error
			 */
-			printk_ratelimited(
-				KERN_INFO
-				"md/raid:%s: read error corrected"
-				" (%lu sectors at %llu on %s)\n",
+			pr_info_ratelimited(
+				"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
				mdname(conf->mddev), STRIPE_SECTORS,
				(unsigned long long)s,
				bdevname(rdev->bdev, b));
...
@@ -2365,36 +2388,29 @@ static void raid5_end_read_request(struct bio * bi)
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
 		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
-			printk_ratelimited(
-				KERN_WARNING
-				"md/raid:%s: read error on replacement device "
-				"(sector %llu on %s).\n",
+			pr_warn_ratelimited(
+				"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
 		else if (conf->mddev->degraded >= conf->max_degraded) {
 			set_bad = 1;
-			printk_ratelimited(
-				KERN_WARNING
-				"md/raid:%s: read error not correctable "
-				"(sector %llu on %s).\n",
+			pr_warn_ratelimited(
+				"md/raid:%s: read error not correctable (sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
 		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
 			/* Oh, no!!! */
 			set_bad = 1;
-			printk_ratelimited(
-				KERN_WARNING
-				"md/raid:%s: read error NOT corrected!! "
-				"(sector %llu on %s).\n",
+			pr_warn_ratelimited(
+				"md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
				mdname(conf->mddev),
				(unsigned long long)s,
				bdn);
 		} else if (atomic_read(&rdev->read_errors)
			 > conf->max_nr_stripes)
-			printk(KERN_WARNING
-			       "md/raid:%s: Too many read errors, failing device %s.\n",
+			pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
			       mdname(conf->mddev), bdn);
 		else
 			retry = 1;
...
@@ -2526,10 +2542,9 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_mask_bits(&mddev->flags, 0,
-		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
-	printk(KERN_ALERT
-	       "md/raid:%s: Disk failure on %s, disabling device.\n"
-	       "md/raid:%s: Operation continuing on %d devices.\n",
+	set_mask_bits(&mddev->sb_flags, 0,
+		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
+	pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
+		"md/raid:%s: Operation continuing on %d devices.\n",
	       mdname(mddev),
	       bdevname(rdev->bdev, b),
...
@@ -2856,7 +2871,7 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
				     previous, &dummy1, &sh2);
 	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
		|| sh2.qd_idx != sh->qd_idx) {
-		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
+		pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
		       mdname(conf->mddev));
 		return 0;
 	}
...
@@ -2872,6 +2887,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 	int level = conf->level;
 
 	if (rcw) {
+		/*
+		 * In some cases, handle_stripe_dirtying initially decided to
+		 * run rmw and allocates extra page for prexor. However, rcw is
+		 * cheaper later on. We need to free the extra page now,
+		 * because we won't be able to do that in ops_complete_prexor().
+		 */
+		r5c_release_extra_page(sh);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
...
@@ -2882,6 +2904,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
				if (!expand)
					clear_bit(R5_UPTODATE, &dev->flags);
				s->locked++;
+			} else if (test_bit(R5_InJournal, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
			}
		}
		/* if we are not expanding this is a proper write request, and
...
@@ -2921,6 +2946,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
				set_bit(R5_LOCKED, &dev->flags);
				clear_bit(R5_UPTODATE, &dev->flags);
				s->locked++;
+			} else if (test_bit(R5_InJournal, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
			}
		}
	if (!s->locked)
...
@@ -3564,7 +3592,7 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 	break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
-static void handle_stripe_dirtying(struct r5conf *conf,
+static int handle_stripe_dirtying(struct r5conf *conf,
				   struct stripe_head *sh,
				   struct stripe_head_state *s,
				   int disks)
...
@@ -3592,9 +3620,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		     test_bit(R5_InJournal, &dev->flags)) &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		    !((test_bit(R5_UPTODATE, &dev->flags) &&
+		       (!test_bit(R5_InJournal, &dev->flags) ||
+			dev->page != dev->orig_page)) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
			if (test_bit(R5_Insync, &dev->flags))
				rmw++;
...
@@ -3606,6 +3637,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
		    i != sh->pd_idx && i != sh->qd_idx &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		      test_bit(R5_InJournal, &dev->flags) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
			if (test_bit(R5_Insync, &dev->flags))
				rcw++;
...
@@ -3613,6 +3645,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
...
@@ -3613,6 +3645,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
rcw
+=
2
*
disks
;
rcw
+=
2
*
disks
;
}
}
}
}
pr_debug
(
"for sector %llu, rmw=%d rcw=%d
\n
"
,
pr_debug
(
"for sector %llu, rmw=%d rcw=%d
\n
"
,
(
unsigned
long
long
)
sh
->
sector
,
rmw
,
rcw
);
(
unsigned
long
long
)
sh
->
sector
,
rmw
,
rcw
);
set_bit
(
STRIPE_HANDLE
,
&
sh
->
state
);
set_bit
(
STRIPE_HANDLE
,
&
sh
->
state
);
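The effect of the new R5_InJournal tests above is easiest to see in isolation: a block whose latest data already sits in the journal can feed a read-modify-write without a disk read, so it now participates in the rmw tally, and on the rcw side it counts as data that does not need to be fetched. The following is a minimal standalone model of that counting (plain C; every name here is a hypothetical stand-in, not kernel API), which prints the two tallies for a toy four-device stripe:

#include <stdbool.h>
#include <stdio.h>

struct dev_model {           /* hypothetical stand-in for struct r5dev */
	bool towrite, locked, uptodate, wantcompute, insync, in_journal;
	bool page_is_orig;   /* models dev->page == dev->orig_page */
};

/* Mirror the changed conditions: a device whose current data lives only
 * in the journal now also participates in the rmw count, and journaled
 * data no longer has to be read for rcw. */
static void count_rmw_rcw(struct dev_model *dev, int disks,
			  int pd, int qd, int *rmw, int *rcw)
{
	*rmw = *rcw = 0;
	for (int i = 0; i < disks; i++) {
		struct dev_model *d = &dev[i];
		bool have_old = d->uptodate &&
				(!d->in_journal || !d->page_is_orig);

		if ((d->towrite || i == pd || i == qd || d->in_journal) &&
		    !d->locked && !(have_old || d->wantcompute))
			*rmw += d->insync ? 1 : 100;	/* 100 ~ "cannot" */
		if (!d->towrite && i != pd && i != qd && !d->locked &&
		    !(d->uptodate || d->in_journal || d->wantcompute))
			*rcw += d->insync ? 1 : 100;
	}
}

int main(void)
{
	struct dev_model devs[4] = {
		{ .towrite = true, .insync = true },      /* dirty block */
		{ .insync = true },                       /* untouched data */
		{ .in_journal = true, .uptodate = true,
		  .page_is_orig = true, .insync = true }, /* cached in journal */
		{ .insync = true },                       /* parity */
	};
	int rmw, rcw;

	count_rmw_rcw(devs, 4, 3, -1, &rmw, &rcw);
	printf("rmw=%d rcw=%d\n", rmw, rcw);
	return 0;
}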
...
@@ -3624,9 +3657,43 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			 (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+			if (test_bit(R5_InJournal, &dev->flags) &&
+			    dev->page == dev->orig_page &&
+			    !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
+				/* alloc page for prexor */
+				struct page *p = alloc_page(GFP_NOIO);
+
+				if (p) {
+					dev->orig_page = p;
+					continue;
+				}
+
+				/*
+				 * alloc_page() failed, try use
+				 * disk_info->extra_page
+				 */
+				if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
+						      &conf->cache_state)) {
+					r5c_use_extra_page(sh);
+					break;
+				}
+
+				/* extra_page in use, add to delayed_list */
+				set_bit(STRIPE_DELAYED, &sh->state);
+				s->waiting_extra_page = 1;
+				return -EAGAIN;
+			}
+		}
+
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
+			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+			     test_bit(R5_InJournal, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			    !((test_bit(R5_UPTODATE, &dev->flags) &&
+			       (!test_bit(R5_InJournal, &dev->flags) ||
+				dev->page != dev->orig_page)) ||
 			      test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (test_bit(STRIPE_PREREAD_ACTIVE,
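The alloc_page()/extra_page/delay ladder in this hunk is a bounded-memory fallback: prexor needs a scratch page per journaled block, the per-array extra_page can serve only one stripe at a time, and when both are unavailable the stripe is parked (and -EAGAIN returned) rather than blocking inside the stripe handler. A rough userspace sketch of the same ladder, with all names hypothetical:

#include <stdbool.h>
#include <stdlib.h>

static bool extra_page_in_use;   /* models the R5C_EXTRA_PAGE_IN_USE bit */

enum prexor_result { PREXOR_GOT_PAGE, PREXOR_USED_EXTRA, PREXOR_EAGAIN };

/* Three-step fallback: fresh page, then the single shared extra page,
 * then back off and let the caller re-queue the stripe. */
static enum prexor_result get_prexor_page(void **out)
{
	void *p = malloc(4096);          /* stands in for alloc_page(GFP_NOIO) */

	if (p) {
		*out = p;
		return PREXOR_GOT_PAGE;
	}
	if (!extra_page_in_use) {        /* test_and_set_bit, minus atomicity */
		extra_page_in_use = true;
		return PREXOR_USED_EXTRA;/* r5c_use_extra_page() equivalent */
	}
	return PREXOR_EAGAIN;            /* set STRIPE_DELAYED, try again later */
}

int main(void)
{
	void *page = NULL;
	enum prexor_result r = get_prexor_page(&page);

	if (r == PREXOR_GOT_PAGE)
		free(page);
	return r == PREXOR_EAGAIN;
}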
...
@@ -3653,6 +3720,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    i != sh->pd_idx && i != sh->qd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			      test_bit(R5_InJournal, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
 				if (test_bit(R5_Insync, &dev->flags) &&
...
@@ -3694,6 +3762,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
 	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		schedule_reconstruction(sh, s, rcw == 0, 0);
+	return 0;
 }

 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
...
@@ -3777,7 +3846,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 	case check_state_compute_run:
 		break;
 	default:
-		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+		pr_err("%s: unknown check_state: %d sector: %llu\n",
 		       __func__, sh->check_state,
 		       (unsigned long long) sh->sector);
 		BUG();
...
@@ -3941,7 +4010,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
 	case check_state_compute_run:
 		break;
 	default:
-		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+		pr_warn("%s: unknown check_state: %d sector: %llu\n",
 		       __func__, sh->check_state,
 		       (unsigned long long) sh->sector);
 		BUG();
...
@@ -4183,6 +4252,11 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
 		}
+
+		if (test_bit(R5_InJournal, &dev->flags))
+			s->injournal++;
+		if (test_bit(R5_InJournal, &dev->flags) && dev->written)
+			s->just_cached++;
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
...
@@ -4411,7 +4485,8 @@ static void handle_stripe(struct stripe_head *sh)
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
 				(i == sh->pd_idx || i == sh->qd_idx ||
-				 dev->written)) {
+				 dev->written ||
+				 test_bit(R5_InJournal, &dev->flags))) {
 				pr_debug("Writing block %d\n", i);
 				set_bit(R5_Wantwrite, &dev->flags);
 				if (prexor)
...
@@ -4451,6 +4526,10 @@ static void handle_stripe(struct stripe_head *sh)
 			 test_bit(R5_Discard, &qdev->flags))))))
 		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);

+	if (s.just_cached)
+		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+	r5l_stripe_write_finished(sh);
+
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
...
@@ -4462,14 +4541,51 @@ static void handle_stripe(struct stripe_head *sh)
 	    || s.expanding)
 		handle_stripe_fill(sh, &s, disks);

-	/* Now to consider new write requests and what else, if anything
-	 * should be read.  We do not handle new writes when:
+	/*
+	 * When the stripe finishes full journal write cycle (write to journal
+	 * and raid disk), this is the clean up procedure so it is ready for
+	 * next operation.
+	 */
+	r5c_finish_stripe_write_out(conf, sh, &s);
+
+	/*
+	 * Now to consider new write requests, cache write back and what else,
+	 * if anything should be read.  We do not handle new writes when:
 	 * 1/ A 'write' operation (copy+xor) is already in flight.
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
+	 * 3/ A r5c cache log write is in flight.
 	 */
-	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying(conf, sh, &s, disks);
+	if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
+		if (!r5c_is_writeback(conf->log)) {
+			if (s.to_write)
+				handle_stripe_dirtying(conf, sh, &s, disks);
+		} else { /* write back cache */
+			int ret = 0;
+
+			/* First, try handle writes in caching phase */
+			if (s.to_write)
+				ret = r5c_try_caching_write(conf, sh, &s,
+							    disks);
+			/*
+			 * If caching phase failed: ret == -EAGAIN
+			 *    OR
+			 * stripe under reclaim: !caching && injournal
+			 *
+			 * fall back to handle_stripe_dirtying()
+			 */
+			if (ret == -EAGAIN ||
+			    /* stripe under reclaim: !caching && injournal */
+			    (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
+			     s.injournal > 0)) {
+				ret = handle_stripe_dirtying(conf, sh, &s,
+							     disks);
+				if (ret == -EAGAIN)
+					goto finish;
+			}
+		}
+	}

 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
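The restructured branch above is the heart of write-back caching: with a write-through journal the old path (handle_stripe_dirtying()) still runs directly, while a write-back journal first tries the caching phase and only falls back to write-out on -EAGAIN or when the stripe is under reclaim. A deliberately condensed, standalone model of that decision, with assumed names that merely echo the kernel ones:

#include <stdbool.h>
#include <stdio.h>

#define MODEL_EAGAIN (-11)

struct stripe_model {
	bool reconstruct, check, log_io; /* in-flight work that blocks writes */
	bool caching;                    /* models STRIPE_R5C_CACHING */
	int to_write, injournal;
};

static int try_caching_write(struct stripe_model *sh)
{
	return sh->injournal > 2 ? MODEL_EAGAIN : 0;  /* arbitrary stand-in */
}

static const char *decide(struct stripe_model *sh, bool writeback)
{
	int ret = 0;

	if (sh->reconstruct || sh->check || sh->log_io)
		return "busy, skip";
	if (!writeback)
		return sh->to_write ? "write-out (dirtying)" : "nothing to do";
	if (sh->to_write)
		ret = try_caching_write(sh);          /* caching phase first */
	if (ret == MODEL_EAGAIN ||
	    (!sh->caching && sh->injournal > 0))      /* under reclaim */
		return "fall back to write-out";
	return "handled in caching phase";
}

int main(void)
{
	struct stripe_model sh = { .to_write = 1, .caching = true };

	printf("%s\n", decide(&sh, true));
	return 0;
}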
...
@@ -4640,9 +4756,7 @@ static void handle_stripe(struct stripe_head *sh)
 	}

 	if (!bio_list_empty(&s.return_bi)) {
-		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) &&
-		    (s.failed <= conf->max_degraded ||
-		     conf->mddev->external == 0)) {
+		if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
 			spin_lock_irq(&conf->device_lock);
 			bio_list_merge(&conf->return_bi, &s.return_bi);
 			spin_unlock_irq(&conf->device_lock);
...
@@ -4698,6 +4812,10 @@ static int raid5_congested(struct mddev *mddev, int bits)
 	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 		return 1;

+	/* Also checks whether there is pressure on r5cache log space */
+	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
+		return 1;
 	if (conf->quiesce)
 		return 1;
 	if (atomic_read(&conf->empty_inactive_list_nr))
...
@@ -5167,6 +5285,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	int remaining;
 	DEFINE_WAIT(w);
 	bool do_prepare;
+	bool do_flush = false;

 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = r5l_handle_flush_request(conf->log, bi);
...
@@ -5178,6 +5297,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 			return;
 		}
 		/* ret == -EAGAIN, fallback */
+		/*
+		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
+		 * we need to flush journal device
+		 */
+		do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}

 	md_write_start(mddev, bi);
...
@@ -5188,6 +5312,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	 * data on failed drives.
 	 */
 	if (rw == READ && mddev->degraded == 0 &&
+	    !r5c_is_writeback(conf->log) &&
 	    mddev->reshape_position == MaxSector) {
 		bi = chunk_aligned_read(mddev, bi);
 		if (!bi)
...
@@ -5316,6 +5441,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 				do_prepare = true;
 				goto retry;
 			}
+			if (do_flush) {
+				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+				/* we only need flush for one stripe */
+				do_flush = false;
+			}
+
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if ((!sh->batch_head || sh == sh->batch_head) &&
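STRIPE_R5C_PREFLUSH is deliberately one-shot: a single PREFLUSH bio may map onto several stripes, but flushing the journal device once is enough, so the flag is consumed by the first stripe it is attached to. A trivial standalone illustration of that consume-once pattern:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	bool do_flush = true;              /* bio arrived with REQ_PREFLUSH */

	for (int stripe = 0; stripe < 4; stripe++) {
		if (do_flush) {
			printf("stripe %d: set STRIPE_R5C_PREFLUSH\n", stripe);
			do_flush = false;  /* we only need flush for one stripe */
		} else {
			printf("stripe %d: no flush needed\n", stripe);
		}
	}
	return 0;
}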
...
@@ -5481,9 +5612,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
-		wait_event(mddev->sb_wait, mddev->flags == 0 ||
+		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
 			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			return 0;
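The renames in this and the following hunks are the visible edge of splitting superblock-update bits out of mddev->flags into a dedicated sb_flags word, so array state and "superblock needs writing" state no longer share one bitmap. The checkpoint handshake keeps its familiar shape, modeled sequentially below (a hypothetical condensation; the real code sleeps on sb_wait and the write happens in the md thread):

#include <stdio.h>

enum { SB_CHANGE_DEVS = 1 << 1 };

static unsigned long sb_flags;

static void md_thread_writes_super(void) { sb_flags = 0; }

int main(void)
{
	sb_flags |= SB_CHANGE_DEVS;	/* set_bit(MD_SB_CHANGE_DEVS, ...) */
	md_thread_writes_super();	/* md_wakeup_thread() -> sb write */
	if (sb_flags == 0)		/* wait_event(mddev->sb_wait, ...) */
		printf("checkpoint recorded, reshape continues\n");
	return 0;
}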
...
@@ -5579,10 +5710,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+			   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
 			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			goto ret;
...
@@ -5857,10 +5988,10 @@ static void raid5d(struct md_thread *thread)
 	md_check_recovery(mddev);

 	if (!bio_list_empty(&conf->return_bi) &&
-	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 		struct bio_list tmp = BIO_EMPTY_LIST;
 		spin_lock_irq(&conf->device_lock);
-		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 			bio_list_merge(&tmp, &conf->return_bi);
 			bio_list_init(&conf->return_bi);
 		}
...
@@ -5907,7 +6038,7 @@ static void raid5d(struct md_thread *thread)
 			break;
 		handled += batch_size;
-		if (mddev->flags & ~(1 << MD_CHANGE_PENDING)) {
+		if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
 			spin_unlock_irq(&conf->device_lock);
 			md_check_recovery(mddev);
 			spin_lock_irq(&conf->device_lock);
...
@@ -6237,6 +6368,7 @@ static struct attribute *raid5_attrs[] = {
 	&raid5_group_thread_cnt.attr,
 	&raid5_skip_copy.attr,
 	&raid5_rmw_level.attr,
+	&r5c_journal_mode.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
...
@@ -6363,6 +6495,8 @@ static void raid5_free_percpu(struct r5conf *conf)

 static void free_conf(struct r5conf *conf)
 {
+	int i;
+
 	if (conf->log)
 		r5l_exit_log(conf->log);
 	if (conf->shrinker.nr_deferred)
...
@@ -6371,6 +6505,9 @@ static void free_conf(struct r5conf *conf)
 	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
+	for (i = 0; i < conf->pool_size; i++)
+		if (conf->disks[i].extra_page)
+			put_page(conf->disks[i].extra_page);
 	kfree(conf->disks);
 	kfree(conf->stripe_hashtbl);
 	kfree(conf);
...
@@ -6382,7 +6519,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
 	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);

 	if (alloc_scratch_buffer(conf, percpu)) {
-		pr_err("%s: failed memory allocation for cpu%u\n",
+		pr_warn("%s: failed memory allocation for cpu%u\n",
 		       __func__, cpu);
 		return -ENOMEM;
 	}
...
@@ -6453,7 +6590,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	if (mddev->new_level != 5
 	    && mddev->new_level != 4
 	    && mddev->new_level != 6) {
-		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
+		pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
 		       mdname(mddev), mddev->new_level);
 		return ERR_PTR(-EIO);
 	}
...
@@ -6461,12 +6598,12 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	     && !algorithm_valid_raid5(mddev->new_layout)) ||
 	    (mddev->new_level == 6
 	     && !algorithm_valid_raid6(mddev->new_layout))) {
-		printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
+		pr_warn("md/raid:%s: layout %d not supported\n",
 		       mdname(mddev), mddev->new_layout);
 		return ERR_PTR(-EIO);
 	}
 	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
-		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
+		pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
 		       mdname(mddev), mddev->raid_disks);
 		return ERR_PTR(-EINVAL);
 	}
...
@@ -6474,7 +6611,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	if (!mddev->new_chunk_sectors ||
 	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
 	    !is_power_of_2(mddev->new_chunk_sectors)) {
-		printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
+		pr_warn("md/raid:%s: invalid chunk size %d\n",
 		       mdname(mddev), mddev->new_chunk_sectors << 9);
 		return ERR_PTR(-EINVAL);
 	}
...
@@ -6517,9 +6654,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
 			      GFP_KERNEL);
 	if (!conf->disks)
 		goto abort;

+	for (i = 0; i < max_disks; i++) {
+		conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
+		if (!conf->disks[i].extra_page)
+			goto abort;
+	}
+
 	conf->mddev = mddev;

 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
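This setup_conf() hunk pairs with the free_conf() hunk earlier in the patch: one extra_page per disk slot is allocated up front and must be put back on every teardown path. A self-contained sketch of that pairing, using malloc/free in place of alloc_page()/put_page() and local unwinding instead of the kernel's goto abort (an assumed simplification, since free_conf() handles the partial case there):

#include <stdlib.h>

struct disk_slot { void *extra_page; };

static struct disk_slot *setup_slots(int max_disks)
{
	struct disk_slot *disks = calloc(max_disks, sizeof(*disks));

	if (!disks)
		return NULL;
	for (int i = 0; i < max_disks; i++) {
		disks[i].extra_page = malloc(4096); /* alloc_page(GFP_KERNEL) */
		if (!disks[i].extra_page) {         /* unwind on failure */
			while (i--)
				free(disks[i].extra_page);
			free(disks);
			return NULL;
		}
	}
	return disks;
}

static void free_slots(struct disk_slot *disks, int pool_size)
{
	for (int i = 0; i < pool_size; i++)
		free(disks[i].extra_page);          /* put_page() */
	free(disks);
}

int main(void)
{
	struct disk_slot *disks = setup_slots(4);

	if (disks)
		free_slots(disks, 4);
	return 0;
}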
...
@@ -6540,6 +6684,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
 		INIT_LIST_HEAD(conf->temp_inactive_list + i);

+	atomic_set(&conf->r5c_cached_full_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
+	atomic_set(&conf->r5c_cached_partial_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+
 	conf->level = mddev->new_level;
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	if (raid5_alloc_percpu(conf) != 0)
...
@@ -6566,8 +6715,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 		if (test_bit(In_sync, &rdev->flags)) {
 			char b[BDEVNAME_SIZE];
-			printk(KERN_INFO "md/raid:%s: device %s operational as raid"
-			       " disk %d\n",
-			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
+			pr_info("md/raid:%s: device %s operational as raid disk %d\n",
+				mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
 		} else if (rdev->saved_raid_disk != raid_disk)
 			/* Cannot rely on bitmap to complete recovery */
...
@@ -6602,21 +6750,18 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 			((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
 		conf->min_nr_stripes = max(NR_STRIPES, stripes);
 		if (conf->min_nr_stripes != NR_STRIPES)
-			printk(KERN_INFO
-				"md/raid:%s: force stripe size %d for reshape\n",
+			pr_info("md/raid:%s: force stripe size %d for reshape\n",
 				mdname(mddev), conf->min_nr_stripes);
 	}
 	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
 	if (grow_stripes(conf, conf->min_nr_stripes)) {
-		printk(KERN_ERR
-			"md/raid:%s: couldn't allocate %dkB for buffers\n",
+		pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
 			mdname(mddev), memory);
 		goto abort;
 	} else
-		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
-		       mdname(mddev), memory);
+		pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
 	/*
	 * Losing a stripe head costs more than the time to refill it,
 	 * it reduces the queue depth and so can hurt throughput.
...
@@ -6628,8 +6773,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf->shrinker.batch = 128;
 	conf->shrinker.flags = 0;
 	if (register_shrinker(&conf->shrinker)) {
-		printk(KERN_ERR
-		       "md/raid:%s: couldn't register shrinker.\n",
+		pr_warn("md/raid:%s: couldn't register shrinker.\n",
 		       mdname(mddev));
 		goto abort;
 	}
...
@@ -6637,8 +6781,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	sprintf(pers_name, "raid%d", mddev->new_level);
 	conf->thread = md_register_thread(raid5d, mddev, pers_name);
 	if (!conf->thread) {
-		printk(KERN_ERR
-		       "md/raid:%s: couldn't allocate thread.\n",
+		pr_warn("md/raid:%s: couldn't allocate thread.\n",
 		       mdname(mddev));
 		goto abort;
 	}
...
@@ -6692,8 +6835,7 @@ static int raid5_run(struct mddev *mddev)
 	int first = 1;

 	if (mddev->recovery_cp != MaxSector)
-		printk(KERN_NOTICE "md/raid:%s: not clean"
-		       " -- starting background reconstruction\n",
+		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
 		       mdname(mddev));

 	rdev_for_each(rdev, mddev) {
...
@@ -6737,14 +6879,13 @@ static int raid5_run(struct mddev *mddev)
 		int new_data_disks;

 		if (journal_dev) {
-			printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
+			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
 			       mdname(mddev));
 			return -EINVAL;
 		}

 		if (mddev->new_level != mddev->level) {
-			printk(KERN_ERR "md/raid:%s: unsupported reshape "
-			       "required - aborting.\n",
+			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
 			       mdname(mddev));
 			return -EINVAL;
 		}
...
@@ -6760,8 +6901,8 @@ static int raid5_run(struct mddev *mddev)
 		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
 		new_data_disks = mddev->raid_disks - max_degraded;
 		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
-			printk(KERN_ERR "md/raid:%s: reshape_position not "
-			       "on a stripe boundary\n", mdname(mddev));
+			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
+				mdname(mddev));
 			return -EINVAL;
 		}
 		reshape_offset = here_new * chunk_sectors;
...
@@ -6782,9 +6923,7 @@ static int raid5_run(struct mddev *mddev)
 		    abs(min_offset_diff) >= mddev->new_chunk_sectors)
 			/* not really in-place - so OK */;
 		else if (mddev->ro == 0) {
-			printk(KERN_ERR "md/raid:%s: in-place reshape "
-			       "must be started in read-only mode "
-			       "- aborting\n",
+			pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
 			       mdname(mddev));
 			return -EINVAL;
 		}
...
@@ -6794,13 +6933,11 @@ static int raid5_run(struct mddev *mddev)
 			 : (here_new * chunk_sectors >=
 			    here_old * chunk_sectors + (-min_offset_diff))) {
 			/* Reading from the same stripe as writing to - bad */
-			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
-			       "auto-recovery - aborting.\n",
+			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
 			       mdname(mddev));
 			return -EINVAL;
 		}
-		printk(KERN_INFO "md/raid:%s: reshape will continue\n",
-		       mdname(mddev));
+		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
 		/* OK, we should be able to continue; */
 	} else {
 		BUG_ON(mddev->level != mddev->new_level);
...
@@ -6819,7 +6956,7 @@ static int raid5_run(struct mddev *mddev)
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
 		if (!journal_dev) {
-			pr_err("md/raid:%s: journal disk is missing, force array readonly\n",
+			pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
 			       mdname(mddev));
 			mddev->ro = 1;
 			set_disk_ro(mddev->gendisk, 1);
...
@@ -6847,8 +6984,7 @@ static int raid5_run(struct mddev *mddev)
 		if (conf->disks[i].replacement &&
 		    conf->reshape_progress != MaxSector) {
 			/* replacements and reshape simply do not mix. */
-			printk(KERN_ERR "md: cannot handle concurrent "
-			       "replacement and reshape.\n");
+			pr_warn("md: cannot handle concurrent replacement and reshape.\n");
 			goto abort;
 		}
 		if (test_bit(In_sync, &rdev->flags)) {
...
@@ -6890,8 +7026,7 @@ static int raid5_run(struct mddev *mddev)
 	mddev->degraded = calc_degraded(conf);

 	if (has_failed(conf)) {
-		printk(KERN_ERR "md/raid:%s: not enough operational devices"
-		       " (%d/%d failed)\n",
+		pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
 		       mdname(mddev), mddev->degraded, conf->raid_disks);
 		goto abort;
 	}
...
@@ -6903,29 +7038,19 @@ static int raid5_run(struct mddev *mddev)
 	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
 		if (mddev->ok_start_degraded)
-			printk(KERN_WARNING
-			       "md/raid:%s: starting dirty degraded array"
-			       " - data corruption possible.\n",
+			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
 			       mdname(mddev));
 		else {
-			printk(KERN_ERR
-			       "md/raid:%s: cannot start dirty degraded array.\n",
+			pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
 			       mdname(mddev));
 			goto abort;
 		}
 	}

-	if (mddev->degraded == 0)
-		printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
-		       " devices, algorithm %d\n", mdname(mddev), conf->level,
-		       mddev->raid_disks - mddev->degraded, mddev->raid_disks,
-		       mddev->new_layout);
-	else
-		printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
-		       " out of %d devices, algorithm %d\n",
-		       mdname(mddev), conf->level,
-		       mddev->raid_disks - mddev->degraded,
-		       mddev->raid_disks, mddev->new_layout);
+	pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
+		mdname(mddev), conf->level,
+		mddev->raid_disks - mddev->degraded, mddev->raid_disks,
+		mddev->new_layout);

 	print_raid5_conf(conf);
...
@@ -6945,8 +7070,7 @@ static int raid5_run(struct mddev *mddev)
 		mddev->to_remove = NULL;
 	else if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
-		printk(KERN_WARNING
-		       "raid5: failed to create sysfs attributes for %s\n",
+		pr_warn("raid5: failed to create sysfs attributes for %s\n",
 		       mdname(mddev));
 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
...
@@ -6979,6 +7103,15 @@ static int raid5_run(struct mddev *mddev)
 		stripe = (stripe | (stripe - 1)) + 1;
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
+
+		/*
+		 * We use 16-bit counter of active stripes in bi_phys_segments
+		 * (minus one for over-loaded initialization)
+		 */
+		blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
+		blk_queue_max_discard_sectors(mddev->queue,
+					      0xfffe * STRIPE_SECTORS);
+
 		/*
 		 * unaligned part of discard request will be ignored, so can't
 		 * guarantee discard_zeroes_data
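The 0xfffe cap above comes straight from the bi_phys_segments trick documented in raid5.h: the low 16 bits count active stripes and start biased at 1, so one bio can span at most 0xffff - 1 stripes without overflowing the counter. A standalone check of the arithmetic, assuming 4K pages so STRIPE_SECTORS is 8:

#include <stdio.h>

int main(void)
{
	unsigned int stripe_sectors = 8;          /* PAGE_SIZE >> 9 */
	unsigned int max_stripes = 0xffff - 1;    /* counter starts biased at 1 */
	unsigned long max_sectors =
		(unsigned long)max_stripes * stripe_sectors; /* 0xfffe * 8 */

	printf("max hw sectors = %lu (~%lu MiB per request)\n",
	       max_sectors, max_sectors * 512 / (1024 * 1024));
	return 0;
}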
...
@@ -7035,9 +7168,10 @@ static int raid5_run(struct mddev *mddev)
 	if (journal_dev) {
 		char b[BDEVNAME_SIZE];
-		printk(KERN_INFO "md/raid:%s: using device %s as journal\n",
-		       mdname(mddev), bdevname(journal_dev->bdev, b));
-		r5l_init_log(conf, journal_dev);
+		pr_debug("md/raid:%s: using device %s as journal\n",
+			 mdname(mddev), bdevname(journal_dev->bdev, b));
+		if (r5l_init_log(conf, journal_dev))
+			goto abort;
 	}

 	return 0;
...
@@ -7046,7 +7180,7 @@ static int raid5_run(struct mddev *mddev)
 	print_raid5_conf(conf);
 	free_conf(conf);
 	mddev->private = NULL;
-	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
+	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
 	return -EIO;
 }
...
@@ -7080,12 +7214,12 @@ static void print_raid5_conf (struct r5conf *conf)
 	int i;
 	struct disk_info *tmp;

-	printk(KERN_DEBUG "RAID conf printout:\n");
+	pr_debug("RAID conf printout:\n");
 	if (!conf) {
-		printk("(conf==NULL)\n");
+		pr_debug("(conf==NULL)\n");
 		return;
 	}
-	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
+	pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
 	       conf->raid_disks,
 	       conf->raid_disks - conf->mddev->degraded);
...
@@ -7093,7 +7227,7 @@ static void print_raid5_conf (struct r5conf *conf)
 		char b[BDEVNAME_SIZE];
 		tmp = conf->disks + i;
 		if (tmp->rdev)
-			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
+			pr_debug(" disk %d, o:%d, dev:%s\n",
 			       i, !test_bit(Faulty, &tmp->rdev->flags),
 			       bdevname(tmp->rdev->bdev, b));
 	}
...
@@ -7241,7 +7375,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * write requests running. We should be safe
 		 */
 		r5l_init_log(conf, rdev);
-		printk(KERN_INFO "md/raid:%s: using device %s as journal\n",
+		pr_debug("md/raid:%s: using device %s as journal\n",
 		       mdname(mddev), bdevname(rdev->bdev, b));
 		return 0;
 	}
...
@@ -7346,7 +7480,7 @@ static int check_stripe_cache(struct mddev *mddev)
 	    > conf->min_nr_stripes ||
 	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
 	    > conf->min_nr_stripes) {
-		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes. Needed %lu\n",
+		pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n",
 		       mdname(mddev),
 		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
 			/ STRIPE_SIZE) * 4);
...
@@ -7430,8 +7564,8 @@ static int raid5_start_reshape(struct mddev *mddev)
 	 */
 	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
 	    < mddev->array_sectors) {
-		printk(KERN_ERR "md/raid:%s: array size must be reduced "
-		       "before number of disks\n", mdname(mddev));
+		pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
+			mdname(mddev));
 		return -EINVAL;
 	}
...
@@ -7501,7 +7635,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 	}
 	mddev->raid_disks = conf->raid_disks;
 	mddev->reshape_position = conf->reshape_progress;
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);

 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
...
@@ -7619,6 +7753,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
 		 */
+		r5c_flush_cache(conf, INT_MAX);
 		conf->quiesce = 2;
 		wait_event_cmd(conf->wait_for_quiescent,
 			       atomic_read(&conf->active_stripes) == 0 &&
...
@@ -7649,7 +7784,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 	/* for raid0 takeover only one zone is supported */
 	if (raid0_conf->nr_strip_zones > 1) {
-		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
+		pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
...
@@ -7671,6 +7806,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 static void *raid5_takeover_raid1(struct mddev *mddev)
 {
 	int chunksect;
+	void *ret;

 	if (mddev->raid_disks != 2 ||
 	    mddev->degraded > 1)
...
@@ -7692,7 +7828,10 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
 	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
 	mddev->new_chunk_sectors = chunksect;

-	return setup_conf(mddev);
+	ret = setup_conf(mddev);
+	if (!IS_ERR_VALUE(ret))
+		clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+	return ret;
 }

 static void *raid5_takeover_raid6(struct mddev *mddev)
...
@@ -7762,7 +7901,7 @@ static int raid5_check_reshape(struct mddev *mddev)
 			conf->chunk_sectors = new_chunk;
 			mddev->chunk_sectors = new_chunk;
 		}
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
 	}
 	return check_reshape(mddev);
...
drivers/md/raid5.h
View file @ 20737738
...
@@ -226,6 +226,8 @@ struct stripe_head {
 	struct r5l_io_unit	*log_io;
 	struct list_head	log_list;
+	sector_t		log_start; /* first meta block on the journal */
+	struct list_head	r5c; /* for r5c_cache->stripe_in_journal */
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
...
@@ -264,6 +266,7 @@ struct stripe_head_state {
 	int syncing, expanding, expanded, replacing;
 	int locked, uptodate, to_read, to_write, failed, written;
 	int to_fill, compute, req_compute, non_overwrite;
+	int injournal, just_cached;
 	int failed_num[2];
 	int p_failed, q_failed;
 	int dec_preread_active;
...
@@ -273,6 +276,7 @@ struct stripe_head_state {
 	struct md_rdev *blocked_rdev;
 	int handle_bad_blocks;
 	int log_failed;
+	int waiting_extra_page;
 };

 /* Flags for struct r5dev.flags */
...
@@ -313,6 +317,11 @@ enum r5dev_flags {
 			 */
 	R5_Discard,	/* Discard the stripe */
 	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
+	R5_InJournal,	/* data being written is in the journal device.
+			 * if R5_InJournal is set for parity pd_idx, all the
+			 * data and parity being written are in the journal
+			 * device
+			 */
 };

 /*
...
@@ -345,7 +354,30 @@ enum {
 	STRIPE_BITMAP_PENDING,	/* Being added to bitmap, don't add
 				 * to batch yet.
 				 */
-	STRIPE_LOG_TRAPPED,	/* trapped into log */
+	STRIPE_LOG_TRAPPED,	/* trapped into log (see raid5-cache.c)
+				 * this bit is used in two scenarios:
+				 *
+				 * 1. write-out phase
+				 *  set in first entry of r5l_write_stripe
+				 *  clear in second entry of r5l_write_stripe
+				 *  used to bypass logic in handle_stripe
+				 *
+				 * 2. caching phase
+				 *  set in r5c_try_caching_write()
+				 *  clear when journal write is done
+				 *  used to initiate r5c_cache_data()
+				 *  also used to bypass logic in handle_stripe
+				 */
+	STRIPE_R5C_CACHING,	/* the stripe is in caching phase
+				 * see more detail in the raid5-cache.c
+				 */
+	STRIPE_R5C_PARTIAL_STRIPE,	/* in r5c cache (to-be/being handled or
+					 * in conf->r5c_partial_stripe_list)
+					 */
+	STRIPE_R5C_FULL_STRIPE,	/* in r5c cache (to-be/being handled or
+				 * in conf->r5c_full_stripe_list)
+				 */
+	STRIPE_R5C_PREFLUSH,	/* need to flush journal device */
 };

 #define STRIPE_EXPAND_SYNC_FLAGS \
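Taken together, these new flags describe a two-phase stripe life cycle: data parked in the journal (caching phase) until reclaim promotes the stripe to write-out, after which the full-cycle cleanup returns it to caching. A deliberately rough standalone condensation of that state machine (the real transitions live in raid5-cache.c; this only sketches the shape):

#include <stdio.h>

enum phase { CACHING, WRITE_OUT };

static enum phase next_phase(enum phase p, int reclaim, int write_done)
{
	if (p == CACHING && reclaim)
		return WRITE_OUT;	/* ~ r5c_make_stripe_write_out() */
	if (p == WRITE_OUT && write_done)
		return CACHING;		/* ~ r5c_finish_stripe_write_out() */
	return p;
}

int main(void)
{
	enum phase p = CACHING;

	p = next_phase(p, 1, 0);	/* reclaim kicks in */
	p = next_phase(p, 0, 1);	/* data and parity hit the raid disks */
	printf("back to caching: %s\n", p == CACHING ? "yes" : "no");
	return 0;
}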
...
@@ -408,8 +440,86 @@ enum {
 struct disk_info {
 	struct md_rdev	*rdev, *replacement;
+	struct page	*extra_page; /* extra page to use in prexor */
 };

+/*
+ * Stripe cache
+ */
+
+#define NR_STRIPES		256
+#define STRIPE_SIZE		PAGE_SIZE
+#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
+#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
+#define	IO_THRESHOLD		1
+#define BYPASS_THRESHOLD	1
+#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
+#define HASH_MASK		(NR_HASH - 1)
+#define MAX_STRIPE_BATCH	8
+
+/* bio's attached to a stripe+device for I/O are linked together in bi_sector
+ * order without overlap.  There may be several bio's per stripe+device, and
+ * a bio could span several devices.
+ * When walking this list for a particular stripe+device, we must never proceed
+ * beyond a bio that extends past this device, as the next bio might no longer
+ * be valid.
+ * This function is used to determine the 'next' bio in the list, given the
+ * sector of the current stripe+device
+ */
+static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
+{
+	int sectors = bio_sectors(bio);
+
+	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
+		return bio->bi_next;
+	else
+		return NULL;
+}
+
+/*
+ * We maintain a biased count of active stripes in the bottom 16 bits of
+ * bi_phys_segments, and a count of processed stripes in the upper 16 bits
+ */
+static inline int raid5_bi_processed_stripes(struct bio *bio)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+	return (atomic_read(segments) >> 16) & 0xffff;
+}
+
+static inline int raid5_dec_bi_active_stripes(struct bio *bio)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+	return atomic_sub_return(1, segments) & 0xffff;
+}
+
+static inline void raid5_inc_bi_active_stripes(struct bio *bio)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+	atomic_inc(segments);
+}
+
+static inline void raid5_set_bi_processed_stripes(struct bio *bio,
+						  unsigned int cnt)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+	int old, new;
+
+	do {
+		old = atomic_read(segments);
+		new = (old & 0xffff) | (cnt << 16);
+	} while (atomic_cmpxchg(segments, old, new) != old);
+}
+
+static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
+{
+	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
+
+	atomic_set(segments, cnt);
+}
+
 /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64.
  * This is because we sometimes take all the spinlocks
  * and creating that much locking depth can cause
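The helpers above pack two counters into one 32-bit field. The same packing re-implemented in plain C with ordinary integers instead of atomics (for clarity only; the kernel version must stay atomic because multiple stripes complete concurrently):

#include <stdio.h>

static unsigned int segments;	/* stands in for bio->bi_phys_segments */

static unsigned int processed(void)  { return (segments >> 16) & 0xffff; }
static unsigned int dec_active(void) { return --segments & 0xffff; }
static void inc_active(void)         { segments++; }
static void set_processed(unsigned int cnt)
{
	segments = (segments & 0xffff) | (cnt << 16);
}

int main(void)
{
	segments = 1;			/* the biased initialization */
	inc_active();			/* bio covers one stripe */
	set_processed(5);
	printf("processed=%u, active after dec=%u\n",
	       processed(), dec_active());
	return 0;
}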
...
@@ -432,6 +542,30 @@ struct r5worker_group {
 	int stripes_cnt;
 };

+enum r5_cache_state {
+	R5_INACTIVE_BLOCKED,	/* release of inactive stripes blocked,
+				 * waiting for 25% to be free
+				 */
+	R5_ALLOC_MORE,		/* It might help to allocate another
+				 * stripe.
+				 */
+	R5_DID_ALLOC,		/* A stripe was allocated, don't allocate
+				 * more until at least one has been
+				 * released.  This avoids flooding
+				 * the cache.
+				 */
+	R5C_LOG_TIGHT,		/* log device space tight, need to
+				 * prioritize stripes at last_checkpoint
+				 */
+	R5C_LOG_CRITICAL,	/* log device is running out of space,
+				 * only process stripes that are already
+				 * occupying the log
+				 */
+	R5C_EXTRA_PAGE_IN_USE,	/* a stripe is using disk_info.extra_page
+				 * for prexor
+				 */
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
 	/* only protect corresponding hash list and inactive_list */
...
@@ -519,23 +653,18 @@ struct r5conf {
 					  */
 	atomic_t		active_stripes;
 	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS];
+
+	atomic_t		r5c_cached_full_stripes;
+	struct list_head	r5c_full_stripe_list;
+	atomic_t		r5c_cached_partial_stripes;
+	struct list_head	r5c_partial_stripe_list;
+
 	atomic_t		empty_inactive_list_nr;
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_quiescent;
 	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
 	unsigned long		cache_state;
-#define R5_INACTIVE_BLOCKED	1	/* release of inactive stripes blocked,
-					 * waiting for 25% to be free
-					 */
-#define R5_ALLOC_MORE		2	/* It might help to allocate another
-					 * stripe.
-					 */
-#define R5_DID_ALLOC		4	/* A stripe was allocated, don't allocate
-					 * more until at least one has been
-					 * released.  This avoids flooding
-					 * the cache.
-					 */
 	struct shrinker		shrinker;
 	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
...
@@ -633,4 +762,23 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
 extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
 extern void r5l_quiesce(struct r5l_log *log, int state);
 extern bool r5l_log_disk_error(struct r5conf *conf);
+extern bool r5c_is_writeback(struct r5l_log *log);
+extern int
+r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh,
+		      struct stripe_head_state *s, int disks);
+extern void
+r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh,
+			    struct stripe_head_state *s);
+extern void r5c_release_extra_page(struct stripe_head *sh);
+extern void r5c_use_extra_page(struct stripe_head *sh);
+extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
+extern void r5c_handle_cached_data_endio(struct r5conf *conf,
+	struct stripe_head *sh, int disks, struct bio_list *return_bi);
+extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh,
+			  struct stripe_head_state *s);
+extern void r5c_make_stripe_write_out(struct stripe_head *sh);
+extern void r5c_flush_cache(struct r5conf *conf, int num);
+extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
+extern void r5c_check_cached_full_stripe(struct r5conf *conf);
+extern struct md_sysfs_entry r5c_journal_mode;
 #endif
include/uapi/linux/raid/md_p.h
View file @ 20737738
...
@@ -84,6 +84,10 @@
 #define	MD_DISK_CANDIDATE	5 /* disk is added as spare (local) until confirmed
 				   * For clustered enviroments only.
 				   */
+#define	MD_DISK_FAILFAST	10 /* Send REQ_FAILFAST if there are multiple
+				    * devices available - and don't try to
+				    * correct read errors.
+				    */

 #define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" is RAID1 config.
 				   * read requests will only be sent here in
...
@@ -265,8 +269,9 @@ struct mdp_superblock_1 {
 	__le32	dev_number;	/* permanent identifier of this  device - not role in raid */
 	__le32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */
 	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
-	__u8	devflags;	/* per-device flags.  Only one defined...*/
+	__u8	devflags;	/* per-device flags.  Only two defined...*/
 #define	WriteMostly1	1	/* mask for writemostly flag in above */
+#define	FailFast1	2	/* Should avoid retries and fixups and just fail */

 	/* Bad block log. If there are any bad blocks the feature flag is set.
 	 * If offset and size are non-zero, that space is reserved and available
...
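On the consumer side, the new FailFast1 bit is just a mask over the existing devflags byte of the v1.x superblock. A minimal standalone check (the real parsing belongs to md's v1 superblock loader; this is only a sketch):

#include <stdint.h>
#include <stdio.h>

#define WriteMostly1	1	/* mask for writemostly flag */
#define FailFast1	2	/* avoid retries and fixups, just fail */

int main(void)
{
	uint8_t devflags = WriteMostly1 | FailFast1; /* as read from disk */

	if (devflags & FailFast1)
		printf("device prefers fast failure over retries\n");
	if (devflags & WriteMostly1)
		printf("device is write-mostly\n");
	return 0;
}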
lib/raid6/avx2.c
View file @ 20737738
...
@@ -87,9 +87,57 @@ static void raid6_avx21_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	kernel_fpu_end();
 }

+static void raid6_avx21_xor_syndrome(int disks, int start, int stop,
+				     size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u8 *p, *q;
+	int d, z, z0;
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	kernel_fpu_begin();
+
+	asm volatile("vmovdqa %0,%%ymm0" : : "m" (raid6_avx2_constants.x1d[0]));
+
+	for (d = 0 ; d < bytes ; d += 32) {
+		asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+		asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+		asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+		/* P/Q data pages */
+		for (z = z0-1 ; z >= start ; z--) {
+			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+			asm volatile("vpand %ymm0,%ymm5,%ymm5");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+			asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+			asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+		}
+		/* P/Q left side optimization */
+		for (z = start-1 ; z >= 0 ; z--) {
+			asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+			asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+			asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+			asm volatile("vpand %ymm0,%ymm5,%ymm5");
+			asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+		}
+		asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+		/* Don't use movntdq for r/w memory area < cache line */
+		asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+		asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+	}
+
+	asm volatile("sfence" : : : "memory");
+	kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx2x1 = {
 	raid6_avx21_gen_syndrome,
-	NULL,			/* XOR not yet implemented */
+	raid6_avx21_xor_syndrome,
 	raid6_have_avx2,
 	"avx2x1",
 	1			/* Has cache hints */
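For readers who do not think in ymm registers: the vpcmpgtb/vpaddb/vpand/vpxor sequence is a 32-wide multiply-by-x in GF(2^8) with the 0x1d reduction polynomial. A portable scalar rendering of the same partial P/Q update is sketched below; it loosely mirrors the asm one byte at a time and is not the kernel's generic implementation:

#include <stddef.h>
#include <stdint.h>

/* Multiply one GF(2^8) element by x, reducing with 0x1d on overflow;
 * this is exactly what vpcmpgtb+vpaddb+vpand+vpxor compute per byte. */
static inline uint8_t gf_mul2(uint8_t v)
{
	return (uint8_t)((v << 1) ^ ((v & 0x80) ? 0x1d : 0));
}

static void xor_syndrome_scalar(int disks, int start, int stop,
				size_t bytes, uint8_t **dptr)
{
	uint8_t *p = dptr[disks - 2];	/* XOR parity */
	uint8_t *q = dptr[disks - 1];	/* RS syndrome */

	for (size_t d = 0; d < bytes; d++) {
		uint8_t wq = dptr[stop][d];
		uint8_t wp = p[d] ^ dptr[stop][d];

		/* changed data disks: fold into both P and Q */
		for (int z = stop - 1; z >= start; z--) {
			wq = gf_mul2(wq);
			wp ^= dptr[z][d];
			wq ^= dptr[z][d];
		}
		/* untouched left side: only shift Q's weight */
		for (int z = start - 1; z >= 0; z--)
			wq = gf_mul2(wq);

		p[d] = wp;
		q[d] ^= wq;
	}
}

int main(void)
{
	static uint8_t d0[16], d1[16], p[16], q[16];
	uint8_t *ptrs[4] = { d0, d1, p, q };

	d1[0] = 0x80;			/* dirty one byte on disk 1 */
	xor_syndrome_scalar(4, 1, 1, 16, ptrs);
	return q[0] != 0x1d;		/* x * 0x80 = 0x1d in GF(2^8) */
}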
...
@@ -149,9 +197,77 @@ static void raid6_avx22_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	kernel_fpu_end();
 }
 
+static void raid6_avx22_xor_syndrome(int disks, int start, int stop,
+                                     size_t bytes, void **ptrs)
+{
+        u8 **dptr = (u8 **)ptrs;
+        u8 *p, *q;
+        int d, z, z0;
+
+        z0 = stop;              /* P/Q right side optimization */
+        p = dptr[disks-2];      /* XOR parity */
+        q = dptr[disks-1];      /* RS syndrome */
+
+        kernel_fpu_begin();
+
+        asm volatile("vmovdqa %0,%%ymm0"
+                     : : "m" (raid6_avx2_constants.x1d[0]));
+
+        for (d = 0 ; d < bytes ; d += 64) {
+                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+                asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+                asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+                asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+                /* P/Q data pages */
+                for (z = z0-1 ; z >= start ; z--) {
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+                        asm volatile("vmovdqa %0,%%ymm7" :: "m" (dptr[z][d+32]));
+                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                }
+                /* P/Q left side optimization */
+                for (z = start-1 ; z >= 0 ; z--) {
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                }
+                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+                asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+                /* Don't use movntdq for r/w memory area < cache line */
+                asm volatile("vmovdqa %%ymm4,%0" : "=m" (q[d]));
+                asm volatile("vmovdqa %%ymm6,%0" : "=m" (q[d+32]));
+                asm volatile("vmovdqa %%ymm2,%0" : "=m" (p[d]));
+                asm volatile("vmovdqa %%ymm3,%0" : "=m" (p[d+32]));
+        }
+
+        asm volatile("sfence" : : : "memory");
+        kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx2x2 = {
 	raid6_avx22_gen_syndrome,
-	NULL,			/* XOR not yet implemented */
+	raid6_avx22_xor_syndrome,
 	raid6_have_avx2,
 	"avx2x2",
 	1			/* Has cache hints */
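All of these xor_syndrome variants implement the same contract and differ only in how many 32-byte lanes they process per iteration: fold the delta of data disks start..stop into the existing P and Q pages, weighting the Q delta by one multiply-by-2 per disk position, including the positions below start that carry no data (the "left side optimization" loop). A scalar sketch of that contract, reusing gf256_mul2() from the sketch above (illustrative only, not the kernel's generic implementation):

#include <stdint.h>

static void xor_syndrome_scalar(int disks, int start, int stop,
                                size_t bytes, void **ptrs)
{
        uint8_t **dptr = (uint8_t **)ptrs;
        uint8_t *p = dptr[disks-2];     /* XOR parity */
        uint8_t *q = dptr[disks-1];     /* RS syndrome */
        size_t d;
        int z;

        for (d = 0; d < bytes; d++) {
                uint8_t wp = dptr[stop][d];     /* running P delta */
                uint8_t wq = dptr[stop][d];     /* running Q delta */

                for (z = stop - 1; z >= start; z--) {
                        wq = gf256_mul2(wq);
                        wp ^= dptr[z][d];
                        wq ^= dptr[z][d];
                }
                /* left side: no data deltas, but Q still needs weighting */
                for (z = start - 1; z >= 0; z--)
                        wq = gf256_mul2(wq);

                p[d] ^= wp;
                q[d] ^= wq;
        }
}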
...
@@ -242,9 +358,119 @@ static void raid6_avx24_gen_syndrome(int disks, size_t bytes, void **ptrs)
 	kernel_fpu_end();
 }
 
+static void raid6_avx24_xor_syndrome(int disks, int start, int stop,
+                                     size_t bytes, void **ptrs)
+{
+        u8 **dptr = (u8 **)ptrs;
+        u8 *p, *q;
+        int d, z, z0;
+
+        z0 = stop;              /* P/Q right side optimization */
+        p = dptr[disks-2];      /* XOR parity */
+        q = dptr[disks-1];      /* RS syndrome */
+
+        kernel_fpu_begin();
+
+        asm volatile("vmovdqa %0,%%ymm0" :: "m" (raid6_avx2_constants.x1d[0]));
+
+        for (d = 0 ; d < bytes ; d += 128) {
+                asm volatile("vmovdqa %0,%%ymm4" :: "m" (dptr[z0][d]));
+                asm volatile("vmovdqa %0,%%ymm6" :: "m" (dptr[z0][d+32]));
+                asm volatile("vmovdqa %0,%%ymm12" :: "m" (dptr[z0][d+64]));
+                asm volatile("vmovdqa %0,%%ymm14" :: "m" (dptr[z0][d+96]));
+                asm volatile("vmovdqa %0,%%ymm2" : : "m" (p[d]));
+                asm volatile("vmovdqa %0,%%ymm3" : : "m" (p[d+32]));
+                asm volatile("vmovdqa %0,%%ymm10" : : "m" (p[d+64]));
+                asm volatile("vmovdqa %0,%%ymm11" : : "m" (p[d+96]));
+                asm volatile("vpxor %ymm4,%ymm2,%ymm2");
+                asm volatile("vpxor %ymm6,%ymm3,%ymm3");
+                asm volatile("vpxor %ymm12,%ymm10,%ymm10");
+                asm volatile("vpxor %ymm14,%ymm11,%ymm11");
+                /* P/Q data pages */
+                for (z = z0-1 ; z >= start ; z--) {
+                        asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
+                        asm volatile("prefetchnta %0" :: "m" (dptr[z][d+64]));
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+                        asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+                        asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+                        asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+                        asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
+                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
+                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+                        asm volatile("vmovdqa %0,%%ymm5" :: "m" (dptr[z][d]));
+                        asm volatile("vmovdqa %0,%%ymm7" :: "m" (dptr[z][d+32]));
+                        asm volatile("vmovdqa %0,%%ymm13" :: "m" (dptr[z][d+64]));
+                        asm volatile("vmovdqa %0,%%ymm15" :: "m" (dptr[z][d+96]));
+                        asm volatile("vpxor %ymm5,%ymm2,%ymm2");
+                        asm volatile("vpxor %ymm7,%ymm3,%ymm3");
+                        asm volatile("vpxor %ymm13,%ymm10,%ymm10");
+                        asm volatile("vpxor %ymm15,%ymm11,%ymm11");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+                }
+                asm volatile("prefetchnta %0" :: "m" (q[d]));
+                asm volatile("prefetchnta %0" :: "m" (q[d+64]));
+                /* P/Q left side optimization */
+                for (z = start-1 ; z >= 0 ; z--) {
+                        asm volatile("vpxor %ymm5,%ymm5,%ymm5");
+                        asm volatile("vpxor %ymm7,%ymm7,%ymm7");
+                        asm volatile("vpxor %ymm13,%ymm13,%ymm13");
+                        asm volatile("vpxor %ymm15,%ymm15,%ymm15");
+                        asm volatile("vpcmpgtb %ymm4,%ymm5,%ymm5");
+                        asm volatile("vpcmpgtb %ymm6,%ymm7,%ymm7");
+                        asm volatile("vpcmpgtb %ymm12,%ymm13,%ymm13");
+                        asm volatile("vpcmpgtb %ymm14,%ymm15,%ymm15");
+                        asm volatile("vpaddb %ymm4,%ymm4,%ymm4");
+                        asm volatile("vpaddb %ymm6,%ymm6,%ymm6");
+                        asm volatile("vpaddb %ymm12,%ymm12,%ymm12");
+                        asm volatile("vpaddb %ymm14,%ymm14,%ymm14");
+                        asm volatile("vpand %ymm0,%ymm5,%ymm5");
+                        asm volatile("vpand %ymm0,%ymm7,%ymm7");
+                        asm volatile("vpand %ymm0,%ymm13,%ymm13");
+                        asm volatile("vpand %ymm0,%ymm15,%ymm15");
+                        asm volatile("vpxor %ymm5,%ymm4,%ymm4");
+                        asm volatile("vpxor %ymm7,%ymm6,%ymm6");
+                        asm volatile("vpxor %ymm13,%ymm12,%ymm12");
+                        asm volatile("vpxor %ymm15,%ymm14,%ymm14");
+                }
+                asm volatile("vmovntdq %%ymm2,%0" : "=m" (p[d]));
+                asm volatile("vmovntdq %%ymm3,%0" : "=m" (p[d+32]));
+                asm volatile("vmovntdq %%ymm10,%0" : "=m" (p[d+64]));
+                asm volatile("vmovntdq %%ymm11,%0" : "=m" (p[d+96]));
+                asm volatile("vpxor %0,%%ymm4,%%ymm4" : : "m" (q[d]));
+                asm volatile("vpxor %0,%%ymm6,%%ymm6" : : "m" (q[d+32]));
+                asm volatile("vpxor %0,%%ymm12,%%ymm12" : : "m" (q[d+64]));
+                asm volatile("vpxor %0,%%ymm14,%%ymm14" : : "m" (q[d+96]));
+                asm volatile("vmovntdq %%ymm4,%0" : "=m" (q[d]));
+                asm volatile("vmovntdq %%ymm6,%0" : "=m" (q[d+32]));
+                asm volatile("vmovntdq %%ymm12,%0" : "=m" (q[d+64]));
+                asm volatile("vmovntdq %%ymm14,%0" : "=m" (q[d+96]));
+        }
+
+        asm volatile("sfence" : : : "memory");
+        kernel_fpu_end();
+}
+
 const struct raid6_calls raid6_avx2x4 = {
 	raid6_avx24_gen_syndrome,
-	NULL,			/* XOR not yet implemented */
+	raid6_avx24_xor_syndrome,
 	raid6_have_avx2,
 	"avx2x4",
 	1			/* Has cache hints */
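A design note on the stores: the x1 and x2 variants keep cached vmovdqa stores, per the "Don't use movntdq for r/w memory area < cache line" comment, while this x4 variant writes two full 64-byte cache lines to each of P and Q per iteration, so it streams its results with vmovntdq, hints the reads with prefetchnta, and orders the non-temporal stores with sfence. A minimal user-space sketch of that store pattern using compiler intrinsics (illustrative only, not kernel code; assumes AVX2 support and 32-byte-aligned buffers):

#include <immintrin.h>
#include <stdint.h>

/* Stream one 64-byte chunk past the cache the way the x4 variant's
 * vmovntdq stores do, then fence so the NT stores are globally visible
 * before anything that depends on them. */
static void stream_store_64(uint8_t *dst, const uint8_t *src)
{
        __m256i lo = _mm256_load_si256((const __m256i *)src);
        __m256i hi = _mm256_load_si256((const __m256i *)(src + 32));

        _mm256_stream_si256((__m256i *)dst, lo);        /* vmovntdq */
        _mm256_stream_si256((__m256i *)(dst + 32), hi);
        _mm_sfence();                                   /* like the sfence above */
}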
...