md/raid1: fix a race between removing rdev and access conf->mirrors[i].rdev
hulk inclusion
category: bugfix
bugzilla: 18683
CVE: NA
---------------------------
We hit a NULL pointer dereference oops when testing raid1 as follows:
mdadm -CR /dev/md1 -l 1 -n 2 /dev/sd[ab]
mdadm /dev/md1 -f /dev/sda
mdadm /dev/md1 -r /dev/sda
mdadm /dev/md1 -a /dev/sda
sleep 5
mdadm /dev/md1 -f /dev/sdb
mdadm /dev/md1 -r /dev/sdb
mdadm /dev/md1 -a /dev/sdb
After a disk (/dev/sda) has been removed, we add the disk to the
raid array again, which triggers a recovery action.
Since the rdev's current state is 'spare', read/write bios can
still be issued to the disk.
Then we set the other disk (/dev/sdb) faulty. Since the raid
array is now degraded and /dev/sdb is the only 'In_sync' disk,
raid1_error() just records conf->recovery_disabled and returns
without actually marking the disk Faulty.
However, that interrupts the recovery action, so
md_check_recovery() calls remove_and_add_spares() to remove the
spare disk. The following race between remove_and_add_spares()
and raid1_write_request() can then dereference a NULL
conf->mirrors[i].rdev:
raid1_write_request()               md_check_recovery()        raid1_error()
rcu_read_lock()
rdev != NULL
!test_bit(Faulty, &rdev->flags)
                                                               conf->recovery_disabled =
                                                                   mddev->recovery_disabled;
                                                               /* return busy */
                                    remove_and_add_spares()
                                      raid1_remove_disk()
                                        rdev->nr_pending == 0
atomic_inc(&rdev->nr_pending);
rcu_read_unlock()
                                        p->rdev = NULL
conf->mirrors[i].rdev->data_offset
  /* NULL pointer deref!!! */
                                        if (!test_bit(RemoveSynchronized,
                                                      &rdev->flags))
                                            synchronize_rcu();
                                        p->rdev = rdev
To fix the race condition, add a new rdev flag 'WantRemove'.
Before accessing conf->mirrors[i].rdev, we must check that the
rdev does not have the 'WantRemove' bit set.
Link: https://marc.info/?l=linux-raid&m=156412052717709&w=2
Reported-by: Zou Wei <zou_wei@huawei.com>
Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>