提交 d29adb34 编写于 作者: J Josh Durgin 提交者: Sage Weil

libceph: block I/O when PAUSE or FULL osd map flags are set

The PAUSEWR and PAUSERD flags are meant to stop the cluster from
processing writes and reads, respectively. The FULL flag is set when
the cluster determines that it is out of space, and will no longer
process writes.  PAUSEWR and PAUSERD are purely client-side settings
already implemented in userspace clients. The osd does nothing special
with these flags.

When the FULL flag is set, however, the osd responds to all writes
with -ENOSPC. For cephfs, this makes sense, but for rbd the block
layer translates this into EIO.  If a cluster goes from full to
non-full quickly, a filesystem on top of rbd will not behave well,
since some writes succeed while others get EIO.

Fix this by blocking any writes in the osd client when the FULL flag
is set. This is the same strategy used by userspace, so apply it by
default.  A follow-on patch makes this configurable.

__map_request() is called to re-target osd requests in case the
available osds changed.  Add a paused field to a ceph_osd_request, and
set it whenever an appropriate osd map flag is set.  Avoid queueing
paused requests in __map_request(), but force them to be resent if
they become unpaused.

Also subscribe to the next osd map from the monitor if any of these
flags are set, so paused requests can be unblocked as soon as
possible.

Fixes: http://tracker.ceph.com/issues/6079
Reviewed-by: Sage Weil <sage@inktank.com>
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
上级 aa8b60e0
...@@ -138,6 +138,7 @@ struct ceph_osd_request { ...@@ -138,6 +138,7 @@ struct ceph_osd_request {
__le64 *r_request_pool; __le64 *r_request_pool;
void *r_request_pgid; void *r_request_pgid;
__le32 *r_request_attempts; __le32 *r_request_attempts;
bool r_paused;
struct ceph_eversion *r_request_reassert_version; struct ceph_eversion *r_request_reassert_version;
int r_result; int r_result;
......
...@@ -1231,6 +1231,22 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, ...@@ -1231,6 +1231,22 @@ void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
} }
EXPORT_SYMBOL(ceph_osdc_set_request_linger); EXPORT_SYMBOL(ceph_osdc_set_request_linger);
/*
 * Decide whether @req must be held back instead of being sent, judging
 * by the flags carried on the osd client's current osdmap.
 *
 * A request flagged as a read is held while PAUSERD is set; a request
 * flagged as a write is held while either PAUSEWR or FULL is set.
 * Returns true if the request should be paused, false otherwise.
 *
 * Caller should hold map_sem for read.
 */
static bool __req_should_be_paused(struct ceph_osd_client *osdc,
				   struct ceph_osd_request *req)
{
	bool reads_blocked;
	bool writes_blocked;

	reads_blocked = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD);

	/* FULL is treated exactly like PAUSEWR: both stop writes. */
	writes_blocked = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) ||
			 ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);

	if ((req->r_flags & CEPH_OSD_FLAG_READ) && reads_blocked)
		return true;

	if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && writes_blocked)
		return true;

	return false;
}
/* /*
* Pick an osd (the first 'up' osd in the pg), allocate the osd struct * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
* (as needed), and set the request r_osd appropriately. If there is * (as needed), and set the request r_osd appropriately. If there is
...@@ -1248,6 +1264,7 @@ static int __map_request(struct ceph_osd_client *osdc, ...@@ -1248,6 +1264,7 @@ static int __map_request(struct ceph_osd_client *osdc,
int acting[CEPH_PG_MAX_SIZE]; int acting[CEPH_PG_MAX_SIZE];
int o = -1, num = 0; int o = -1, num = 0;
int err; int err;
bool was_paused;
dout("map_request %p tid %lld\n", req, req->r_tid); dout("map_request %p tid %lld\n", req, req->r_tid);
err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap, err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap,
...@@ -1264,12 +1281,18 @@ static int __map_request(struct ceph_osd_client *osdc, ...@@ -1264,12 +1281,18 @@ static int __map_request(struct ceph_osd_client *osdc,
num = err; num = err;
} }
was_paused = req->r_paused;
req->r_paused = __req_should_be_paused(osdc, req);
if (was_paused && !req->r_paused)
force_resend = 1;
if ((!force_resend && if ((!force_resend &&
req->r_osd && req->r_osd->o_osd == o && req->r_osd && req->r_osd->o_osd == o &&
req->r_sent >= req->r_osd->o_incarnation && req->r_sent >= req->r_osd->o_incarnation &&
req->r_num_pg_osds == num && req->r_num_pg_osds == num &&
memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
(req->r_osd == NULL && o == -1)) (req->r_osd == NULL && o == -1) ||
req->r_paused)
return 0; /* no change */ return 0; /* no change */
dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n",
...@@ -1811,7 +1834,9 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1811,7 +1834,9 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
* we find out when we are no longer full and stop returning * we find out when we are no longer full and stop returning
* ENOSPC. * ENOSPC.
*/ */
if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) ||
ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) ||
ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR))
ceph_monc_request_next_osdmap(&osdc->client->monc); ceph_monc_request_next_osdmap(&osdc->client->monc);
mutex_lock(&osdc->request_mutex); mutex_lock(&osdc->request_mutex);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册