提交 edc9f5eb 编写于 作者: Lars Ellenberg 提交者: Philipp Reisner

drbd: always write bitmap on detach

If we detach due to a local read error (which sets a bit in the bitmap),
stay Primary, and then re-attach (which re-reads the bitmap from disk),
we potentially lose the "out-of-sync" (or, "bad block") information in
the bitmap.

Always (try to) write out the changed bitmap pages before going diskless.

That way, we don't lose the bit for the bad block,
the next resync will fetch it from the peer, and rewrite
it locally, which may result in block reallocation in some
lower layer (or the hardware), and thereby "heal" the bad blocks.

If the bitmap writeout errors out as well, we will (again: try to)
mark the "we need a full sync" bit in our super block,
if it was a READ error; writes are covered by the activity log already.

If that superblock does not make it to disk either, we are sorry.

Maybe we just lost an entire disk or controller (or iSCSI connection),
and there actually are no bad blocks at all, so we don't need to
re-fetch from the peer, there is no "auto-healing" necessary.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
上级 e34b677d
...@@ -682,7 +682,8 @@ enum { ...@@ -682,7 +682,8 @@ enum {
once no more io in flight, start bitmap io */ once no more io in flight, start bitmap io */
BITMAP_IO_QUEUED, /* Started bitmap IO */ BITMAP_IO_QUEUED, /* Started bitmap IO */
GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */
WAS_IO_ERROR, /* Local disk failed returned IO error */ WAS_IO_ERROR, /* Local disk failed, returned IO error */
WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
RESIZE_PENDING, /* Size change detected locally, waiting for the response from RESIZE_PENDING, /* Size change detected locally, waiting for the response from
...@@ -1142,6 +1143,9 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, ...@@ -1142,6 +1143,9 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
extern int drbd_bitmap_io(struct drbd_conf *mdev, extern int drbd_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *), int (*io_fn)(struct drbd_conf *),
char *why, enum bm_flag flags); char *why, enum bm_flag flags);
extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
char *why, enum bm_flag flags);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern void drbd_go_diskless(struct drbd_conf *mdev); extern void drbd_go_diskless(struct drbd_conf *mdev);
...@@ -1661,14 +1665,15 @@ static inline union drbd_state drbd_read_state(struct drbd_conf *mdev) ...@@ -1661,14 +1665,15 @@ static inline union drbd_state drbd_read_state(struct drbd_conf *mdev)
} }
enum drbd_force_detach_flags { enum drbd_force_detach_flags {
DRBD_IO_ERROR, DRBD_READ_ERROR,
DRBD_WRITE_ERROR,
DRBD_META_IO_ERROR, DRBD_META_IO_ERROR,
DRBD_FORCE_DETACH, DRBD_FORCE_DETACH,
}; };
#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
enum drbd_force_detach_flags forcedetach, enum drbd_force_detach_flags df,
const char *where) const char *where)
{ {
enum drbd_io_error_p ep; enum drbd_io_error_p ep;
...@@ -1678,18 +1683,40 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, ...@@ -1678,18 +1683,40 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev,
rcu_read_unlock(); rcu_read_unlock();
switch (ep) { switch (ep) {
case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */ case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
if (forcedetach == DRBD_IO_ERROR) { if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
if (__ratelimit(&drbd_ratelimit_state)) if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Local IO failed in %s.\n", where); dev_err(DEV, "Local IO failed in %s.\n", where);
if (mdev->state.disk > D_INCONSISTENT) if (mdev->state.disk > D_INCONSISTENT)
_drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL);
break; break;
} }
/* NOTE fall through to detach case if forcedetach set */ /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */
case EP_DETACH: case EP_DETACH:
case EP_CALL_HELPER: case EP_CALL_HELPER:
/* Remember whether we saw a READ or WRITE error.
*
* Recovery of the affected area for WRITE failure is covered
* by the activity log.
* READ errors may fall outside that area though. Certain READ
* errors can be "healed" by writing good data to the affected
* blocks, which triggers block re-allocation in lower layers.
*
* If we can not write the bitmap after a READ error,
* we may need to trigger a full sync (see w_go_diskless()).
*
* Force-detach is not really an IO error, but rather a
* desperate measure to try to deal with a completely
* unresponsive lower level IO stack.
* Still it should be treated as a WRITE error.
*
* Meta IO error is always WRITE error:
* we read meta data only once during attach,
* which will fail in case of errors.
*/
set_bit(WAS_IO_ERROR, &mdev->flags); set_bit(WAS_IO_ERROR, &mdev->flags);
if (forcedetach == DRBD_FORCE_DETACH) if (df == DRBD_READ_ERROR)
set_bit(WAS_READ_ERROR, &mdev->flags);
if (df == DRBD_FORCE_DETACH)
set_bit(FORCE_DETACH, &mdev->flags); set_bit(FORCE_DETACH, &mdev->flags);
if (mdev->state.disk > D_FAILED) { if (mdev->state.disk > D_FAILED) {
_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
......
...@@ -3226,6 +3226,26 @@ static int w_go_diskless(struct drbd_work *w, int unused) ...@@ -3226,6 +3226,26 @@ static int w_go_diskless(struct drbd_work *w, int unused)
* inc/dec it frequently. Once we are D_DISKLESS, no one will touch * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
* the protected members anymore, though, so once put_ldev reaches zero * the protected members anymore, though, so once put_ldev reaches zero
* again, it will be safe to free them. */ * again, it will be safe to free them. */
/* Try to write changed bitmap pages, read errors may have just
* set some bits outside the area covered by the activity log.
*
* If we have an IO error during the bitmap writeout,
* we will want a full sync next time, just in case.
* (Do we want a specific meta data flag for this?)
*
* If that does not make it to stable storage either,
* we cannot do anything about that anymore. */
if (mdev->bitmap) {
if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
"detach", BM_LOCKED_MASK)) {
if (test_bit(WAS_READ_ERROR, &mdev->flags)) {
drbd_md_set_flag(mdev, MDF_FULL_SYNC);
drbd_md_sync(mdev);
}
}
}
drbd_force_state(mdev, NS(disk, D_DISKLESS)); drbd_force_state(mdev, NS(disk, D_DISKLESS));
return 0; return 0;
} }
......
...@@ -1294,6 +1294,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) ...@@ -1294,6 +1294,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
/* make sure there is no leftover from previous force-detach attempts */ /* make sure there is no leftover from previous force-detach attempts */
clear_bit(FORCE_DETACH, &mdev->flags); clear_bit(FORCE_DETACH, &mdev->flags);
clear_bit(WAS_IO_ERROR, &mdev->flags);
clear_bit(WAS_READ_ERROR, &mdev->flags);
/* and no leftover from previously aborted resync or verify, either */ /* and no leftover from previously aborted resync or verify, either */
mdev->rs_total = 0; mdev->rs_total = 0;
......
...@@ -492,11 +492,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, ...@@ -492,11 +492,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED); mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
break; break;
case WRITE_COMPLETED_WITH_ERROR:
__drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
break;
case READ_COMPLETED_WITH_ERROR: case READ_COMPLETED_WITH_ERROR:
drbd_set_out_of_sync(mdev, req->i.sector, req->i.size); drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);
/* fall through. */ __drbd_chk_io_error(mdev, DRBD_READ_ERROR);
case WRITE_COMPLETED_WITH_ERROR:
__drbd_chk_io_error(mdev, DRBD_IO_ERROR);
/* fall through. */ /* fall through. */
case READ_AHEAD_COMPLETED_WITH_ERROR: case READ_AHEAD_COMPLETED_WITH_ERROR:
/* it is legal to fail READA, no __drbd_chk_io_error in that case. */ /* it is legal to fail READA, no __drbd_chk_io_error in that case. */
......
...@@ -106,7 +106,7 @@ void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(lo ...@@ -106,7 +106,7 @@ void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(lo
if (list_empty(&mdev->read_ee)) if (list_empty(&mdev->read_ee))
wake_up(&mdev->ee_wait); wake_up(&mdev->ee_wait);
if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
__drbd_chk_io_error(mdev, DRBD_IO_ERROR); __drbd_chk_io_error(mdev, DRBD_READ_ERROR);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w); drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w);
...@@ -147,7 +147,7 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel ...@@ -147,7 +147,7 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel
do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee); do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee);
if (test_bit(__EE_WAS_ERROR, &peer_req->flags)) if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
__drbd_chk_io_error(mdev, DRBD_IO_ERROR); __drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
spin_unlock_irqrestore(&mdev->tconn->req_lock, flags); spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
if (block_id == ID_SYNCER) if (block_id == ID_SYNCER)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册