libceph: fix double __remove_osd() problem

It turns out it's possible to get __remove_osd() called twice on the same OSD. That doesn't sit well with rb_erase() - depending on the shape of the tree we can get a NULL dereference, a soft lockup or a random crash at some point in the future as we end up touching freed memory. One scenario that I was able to reproduce is as follows: <osd3 is idle, on the osd lru list> <con reset - osd3> con_fault_finish() osd_reset() <osdmap - osd3 down> ceph_osdc_handle_map() <takes map_sem> kick_requests() <takes request_mutex> reset_changed_osds() __reset_osd() __remove_osd() <releases request_mutex> <releases map_sem> <takes map_sem> <takes request_mutex> __kick_osd_requests() __reset_osd() __remove_osd() <-- !!! A case can be made that osd refcounting is imperfect and reworking it would be a proper resolution, but for now Sage and I decided to fix this by adding a safe guard around __remove_osd(). Fixes: http://tracker.ceph.com/issues/8087 Cc: Sage Weil <sage@redhat.com> Cc: stable@vger.kernel.org # 3.9+: 7c6e6fc5: libceph: assert both regular and lingering lists in __remove_osd() Cc: stable@vger.kernel.org # 3.9+: cc9f1f51: libceph: change from BUG to WARN for __remove_osd() asserts Cc: stable@vger.kernel.org # 3.9+ Signed-off-by: N Ilya Dryomov <idryomov@gmail.com> Reviewed-by: N Sage Weil <sage@redhat.com> Reviewed-by: N Alex Elder <elder@linaro.org>

libceph: fix double __remove_osd() problem
It turns out it's possible to get __remove_osd() called twice on the same OSD. That doesn't sit well with rb_erase() - depending on the shape of the tree we can get a NULL dereference, a soft lockup or a random crash at some point in the future as we end up touching freed memory. One scenario that I was able to reproduce is as follows: <osd3 is idle, on the osd lru list> <con reset - osd3> con_fault_finish() osd_reset() <osdmap - osd3 down> ceph_osdc_handle_map() <takes map_sem> kick_requests() <takes request_mutex> reset_changed_osds() __reset_osd() __remove_osd() <releases request_mutex> <releases map_sem> <takes map_sem> <takes request_mutex> __kick_osd_requests() __reset_osd() __remove_osd() <-- !!! A case can be made that osd refcounting is imperfect and reworking it would be a proper resolution, but for now Sage and I decided to fix this by adding a safe guard around __remove_osd(). Fixes: http://tracker.ceph.com/issues/8087 Cc: Sage Weil <sage@redhat.com> Cc: stable@vger.kernel.org # 3.9+: 7c6e6fc5: libceph: assert both regular and lingering lists in __remove_osd() Cc: stable@vger.kernel.org # 3.9+: cc9f1f51: libceph: change from BUG to WARN for __remove_osd() asserts Cc: stable@vger.kernel.org # 3.9+ Signed-off-by: N Ilya Dryomov <idryomov@gmail.com> Reviewed-by: N Sage Weil <sage@redhat.com> Reviewed-by: N Alex Elder <elder@linaro.org>
7eb71e03 · Ilya Dryomov · 7ad18afa · 7eb71e03
隐藏空白更改
内联并排

Showing with 18 addition and 8 deletion

net/ceph/osd_client.c net/ceph/osd_client.c +18 -8

未找到文件。
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1048,14 +1048,24 @@ static void put_osd(struct ceph_osd *osd)
 */
 static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
-	dout("__remove_osd %p\n", osd);
+	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
 	WARN_ON(!list_empty(&osd->o_requests));
 	WARN_ON(!list_empty(&osd->o_linger_requests));

-	rb_erase(&osd->o_node, &osdc->osds);
 	list_del_init(&osd->o_osd_lru);
-	ceph_con_close(&osd->o_con);
-	put_osd(osd);
+	rb_erase(&osd->o_node, &osdc->osds);
+	RB_CLEAR_NODE(&osd->o_node);
+}
+
+static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+	dout("%s %p osd%d\n", __func__, osd, osd->o_osd);
+
+	if (!RB_EMPTY_NODE(&osd->o_node)) {
+		ceph_con_close(&osd->o_con);
+		__remove_osd(osdc, osd);
+		put_osd(osd);
+	}
 }

 static void remove_all_osds(struct ceph_osd_client *osdc)
@@ -1065,7 +1075,7 @@ static void remove_all_osds(struct ceph_osd_client *osdc)
 	while (!RB_EMPTY_ROOT(&osdc->osds)) {
 		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
 						struct ceph_osd, o_node);
-		__remove_osd(osdc, osd);
+		remove_osd(osdc, osd);
 	}
 	mutex_unlock(&osdc->request_mutex);
 }
@@ -1106,7 +1116,7 @@ static void remove_old_osds(struct ceph_osd_client *osdc)
 	list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
 		if (time_before(jiffies, osd->lru_ttl))
 			break;
-		__remove_osd(osdc, osd);
+		remove_osd(osdc, osd);
 	}
 	mutex_unlock(&osdc->request_mutex);
 }
@@ -1121,8 +1131,7 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
 	if (list_empty(&osd->o_requests) &&
 	    list_empty(&osd->o_linger_requests)) {
-		__remove_osd(osdc, osd);
-
+		remove_osd(osdc, osd);
 		return -ENODEV;
 	}

@@ -1926,6 +1935,7 @@ static void reset_changed_osds(struct ceph_osd_client *osdc)
 {
 	struct rb_node *p, *n;

+	dout("%s %p\n", __func__, osdc);
 	for (p = rb_first(&osdc->osds); p; p = n) {
 		struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);