/*
   drbd_worker.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>

#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"

/* Defined further down; needed early because w_resync_timer() calls it. */
static int w_make_ov_request(struct drbd_work *w, int cancel);


/* endio handlers:
 *   drbd_md_io_complete (defined here)
 *   drbd_request_endio (defined here)
 *   drbd_peer_request_endio (defined here)
 *   bm_async_io_complete (defined in drbd_bitmap.c)
 *
 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 *
 */


/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the resync-after dependencies, we grab a write lock, because
   we need stable states on all devices for that.  */
rwlock_t global_state_lock;

/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_io_complete(struct bio *bio, int error)
{
	struct drbd_md_io *md_io;
71
	struct drbd_device *mdev;
P
Philipp Reisner 已提交
72 73

	md_io = (struct drbd_md_io *)bio->bi_private;
74
	mdev = container_of(md_io, struct drbd_device, md_io);
75

P
Philipp Reisner 已提交
76 77
	md_io->error = error;

78 79 80 81 82 83 84 85 86 87 88 89
	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
	 * to timeout on the lower level device, and eventually detach from it.
	 * If this io completion runs after that timeout expired, this
	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
	 * During normal operation, this only puts that extra reference
	 * down to 1 again.
	 * Make sure we first drop the reference, and only then signal
	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
	 * next drbd_md_sync_page_io(), that we trigger the
	 * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there.
	 */
	drbd_md_put_buffer(mdev);
90 91 92
	md_io->done = 1;
	wake_up(&mdev->misc_wait);
	bio_put(bio);
93 94
	if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
		put_ldev(mdev);
P
Philipp Reisner 已提交
95 96 97 98 99
}

/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
100
static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
P
Philipp Reisner 已提交
101 102
{
	unsigned long flags = 0;
103
	struct drbd_device *mdev = peer_req->w.mdev;
P
Philipp Reisner 已提交
104

105
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
106 107
	mdev->read_cnt += peer_req->i.size >> 9;
	list_del(&peer_req->w.list);
P
Philipp Reisner 已提交
108 109
	if (list_empty(&mdev->read_ee))
		wake_up(&mdev->ee_wait);
110
	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
111
		__drbd_chk_io_error(mdev, DRBD_READ_ERROR);
112
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
P
Philipp Reisner 已提交
113

114
	drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w);
P
Philipp Reisner 已提交
115 116 117 118
	put_ldev(mdev);
}

/* writes on behalf of the partner, or resync writes,
119
 * "submitted" by the receiver, final stage.  */
120
static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
P
Philipp Reisner 已提交
121 122
{
	unsigned long flags = 0;
123
	struct drbd_device *mdev = peer_req->w.mdev;
124
	struct drbd_interval i;
P
Philipp Reisner 已提交
125
	int do_wake;
126
	u64 block_id;
P
Philipp Reisner 已提交
127 128
	int do_al_complete_io;

129
	/* after we moved peer_req to done_ee,
P
Philipp Reisner 已提交
130 131 132
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
133
	i = peer_req->i;
134 135
	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
	block_id = peer_req->block_id;
P
Philipp Reisner 已提交
136

137
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
138
	mdev->writ_cnt += peer_req->i.size >> 9;
139
	list_move_tail(&peer_req->w.list, &mdev->done_ee);
P
Philipp Reisner 已提交
140

141
	/*
142
	 * Do not remove from the write_requests tree here: we did not send the
143 144 145 146 147
	 * Ack yet and did not wake possibly waiting conflicting requests.
	 * Removed from the tree from "drbd_process_done_ee" within the
	 * appropriate w.cb (e_end_block/e_end_resync_block) or from
	 * _drbd_clear_done_ee.
	 */
P
Philipp Reisner 已提交
148

149
	do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee);
P
Philipp Reisner 已提交
150

151
	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
152
		__drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
153
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
P
Philipp Reisner 已提交
154

155
	if (block_id == ID_SYNCER)
156
		drbd_rs_complete_io(mdev, i.sector);
P
Philipp Reisner 已提交
157 158 159 160 161

	if (do_wake)
		wake_up(&mdev->ee_wait);

	if (do_al_complete_io)
162
		drbd_al_complete_io(mdev, &i);
P
Philipp Reisner 已提交
163

164
	wake_asender(mdev->tconn);
P
Philipp Reisner 已提交
165
	put_ldev(mdev);
166
}
P
Philipp Reisner 已提交
167

168 169 170
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
171
void drbd_peer_request_endio(struct bio *bio, int error)
172
{
173
	struct drbd_peer_request *peer_req = bio->bi_private;
174
	struct drbd_device *mdev = peer_req->w.mdev;
175 176 177
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_write = bio_data_dir(bio) == WRITE;

178
	if (error && __ratelimit(&drbd_ratelimit_state))
179 180
		dev_warn(DEV, "%s: error=%d s=%llus\n",
				is_write ? "write" : "read", error,
181
				(unsigned long long)peer_req->i.sector);
182
	if (!error && !uptodate) {
183 184 185
		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
					is_write ? "write" : "read",
186
					(unsigned long long)peer_req->i.sector);
187 188 189 190 191 192 193
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	if (error)
194
		set_bit(__EE_WAS_ERROR, &peer_req->flags);
195 196

	bio_put(bio); /* no need for the bio anymore */
197
	if (atomic_dec_and_test(&peer_req->pending_bios)) {
198
		if (is_write)
199
			drbd_endio_write_sec_final(peer_req);
200
		else
201
			drbd_endio_read_sec_final(peer_req);
202
	}
P
Philipp Reisner 已提交
203 204 205 206
}

/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 */
207
void drbd_request_endio(struct bio *bio, int error)
P
Philipp Reisner 已提交
208
{
209
	unsigned long flags;
P
Philipp Reisner 已提交
210
	struct drbd_request *req = bio->bi_private;
211
	struct drbd_device *mdev = req->w.mdev;
212
	struct bio_and_error m;
P
Philipp Reisner 已提交
213 214 215 216 217 218 219 220 221 222 223 224
	enum drbd_req_event what;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	if (!error && !uptodate) {
		dev_warn(DEV, "p %s: setting error to -EIO\n",
			 bio_data_dir(bio) == WRITE ? "write" : "read");
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261

	/* If this request was aborted locally before,
	 * but now was completed "successfully",
	 * chances are that this caused arbitrary data corruption.
	 *
	 * "aborting" requests, or force-detaching the disk, is intended for
	 * completely blocked/hung local backing devices which do no longer
	 * complete requests at all, not even do error completions.  In this
	 * situation, usually a hard-reset and failover is the only way out.
	 *
	 * By "aborting", basically faking a local error-completion,
	 * we allow for a more graceful swichover by cleanly migrating services.
	 * Still the affected node has to be rebooted "soon".
	 *
	 * By completing these requests, we allow the upper layers to re-use
	 * the associated data pages.
	 *
	 * If later the local backing device "recovers", and now DMAs some data
	 * from disk into the original request pages, in the best case it will
	 * just put random data into unused pages; but typically it will corrupt
	 * meanwhile completely unrelated data, causing all sorts of damage.
	 *
	 * Which means delayed successful completion,
	 * especially for READ requests,
	 * is a reason to panic().
	 *
	 * We assume that a delayed *error* completion is OK,
	 * though we still will complain noisily about it.
	 */
	if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_emerg(DEV, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");

		if (!error)
			panic("possible random memory corruption caused by delayed completion of aborted local request\n");
	}

P
Philipp Reisner 已提交
262 263 264
	/* to avoid recursion in __req_mod */
	if (unlikely(error)) {
		what = (bio_data_dir(bio) == WRITE)
265
			? WRITE_COMPLETED_WITH_ERROR
266
			: (bio_rw(bio) == READ)
267 268
			  ? READ_COMPLETED_WITH_ERROR
			  : READ_AHEAD_COMPLETED_WITH_ERROR;
P
Philipp Reisner 已提交
269
	} else
270
		what = COMPLETED_OK;
P
Philipp Reisner 已提交
271 272 273 274

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(error);

275
	/* not req_mod(), we need irqsave here! */
276
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
277
	__req_mod(req, what, &m);
278
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
279
	put_ldev(mdev);
280 281 282

	if (m.bio)
		complete_master_bio(mdev, &m);
P
Philipp Reisner 已提交
283 284
}

285
void drbd_csum_ee(struct drbd_device *mdev, struct crypto_hash *tfm,
286
		  struct drbd_peer_request *peer_req, void *digest)
287 288 289
{
	struct hash_desc desc;
	struct scatterlist sg;
290
	struct page *page = peer_req->pages;
291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
307
	len = peer_req->i.size & (PAGE_SIZE - 1);
308 309 310 311 312
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}

313
void drbd_csum_bio(struct drbd_device *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
P
Philipp Reisner 已提交
314 315 316
{
	struct hash_desc desc;
	struct scatterlist sg;
317 318
	struct bio_vec bvec;
	struct bvec_iter iter;
P
Philipp Reisner 已提交
319 320 321 322 323 324 325

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

326 327
	bio_for_each_segment(bvec, bio, iter) {
		sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
P
Philipp Reisner 已提交
328 329 330 331 332
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}

333
/* MAYBE merge common code with w_e_end_ov_req */
334
static int w_e_send_csum(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
335
{
336
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
337
	struct drbd_device *mdev = w->mdev;
P
Philipp Reisner 已提交
338 339
	int digest_size;
	void *digest;
340
	int err = 0;
P
Philipp Reisner 已提交
341

342 343
	if (unlikely(cancel))
		goto out;
P
Philipp Reisner 已提交
344

345
	if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
346
		goto out;
P
Philipp Reisner 已提交
347

348
	digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
349 350
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
351 352
		sector_t sector = peer_req->i.sector;
		unsigned int size = peer_req->i.size;
353
		drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
354
		/* Free peer_req and pages before send.
355 356 357
		 * In case we block on congestion, we could otherwise run into
		 * some distributed deadlock, if the other side blocks on
		 * congestion as well, because our receiver blocks in
358
		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
359
		drbd_free_peer_req(mdev, peer_req);
360
		peer_req = NULL;
361
		inc_rs_pending(mdev);
362
		err = drbd_send_drequest_csum(mdev, sector, size,
363 364
					      digest, digest_size,
					      P_CSUM_RS_REQUEST);
365 366 367
		kfree(digest);
	} else {
		dev_err(DEV, "kmalloc() of digest failed.\n");
368
		err = -ENOMEM;
369
	}
P
Philipp Reisner 已提交
370

371
out:
372
	if (peer_req)
373
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
374

375
	if (unlikely(err))
P
Philipp Reisner 已提交
376
		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
377
	return err;
P
Philipp Reisner 已提交
378 379 380 381
}

#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

382
static int read_for_csum(struct drbd_device *mdev, sector_t sector, int size)
P
Philipp Reisner 已提交
383
{
384
	struct drbd_peer_request *peer_req;
P
Philipp Reisner 已提交
385 386

	if (!get_ldev(mdev))
387
		return -EIO;
P
Philipp Reisner 已提交
388

389
	if (drbd_rs_should_slow_down(mdev, sector))
390 391
		goto defer;

P
Philipp Reisner 已提交
392 393
	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
394 395
	peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector,
				       size, GFP_TRY);
396
	if (!peer_req)
397
		goto defer;
P
Philipp Reisner 已提交
398

399
	peer_req->w.cb = w_e_send_csum;
400
	spin_lock_irq(&mdev->tconn->req_lock);
401
	list_add(&peer_req->w.list, &mdev->read_ee);
402
	spin_unlock_irq(&mdev->tconn->req_lock);
P
Philipp Reisner 已提交
403

404
	atomic_add(size >> 9, &mdev->rs_sect_ev);
405
	if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
406
		return 0;
P
Philipp Reisner 已提交
407

408 409 410 411
	/* If it failed because of ENOMEM, retry should help.  If it failed
	 * because bio_add_page failed (probably broken lower level driver),
	 * retry may or may not help.
	 * If it does not, you may need to force disconnect. */
412
	spin_lock_irq(&mdev->tconn->req_lock);
413
	list_del(&peer_req->w.list);
414
	spin_unlock_irq(&mdev->tconn->req_lock);
415

416
	drbd_free_peer_req(mdev, peer_req);
417
defer:
418
	put_ldev(mdev);
419
	return -EAGAIN;
P
Philipp Reisner 已提交
420 421
}

422
int w_resync_timer(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
423
{
424
	struct drbd_device *mdev = w->mdev;
425 426
	switch (mdev->state.conn) {
	case C_VERIFY_S:
427
		w_make_ov_request(w, cancel);
428 429
		break;
	case C_SYNC_TARGET:
430
		w_make_resync_request(w, cancel);
431
		break;
P
Philipp Reisner 已提交
432 433
	}

434
	return 0;
435 436 437 438
}

void resync_timer_fn(unsigned long data)
{
439
	struct drbd_device *mdev = (struct drbd_device *) data;
440 441

	if (list_empty(&mdev->resync_work.list))
442
		drbd_queue_work(&mdev->tconn->sender_work, &mdev->resync_work);
P
Philipp Reisner 已提交
443 444
}

445 446 447 448 449
/* Overwrite every slot of the fifo with @value.
 * head_index and total are left untouched. */
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int i;

	for (i = 0; i < fb->size; i++)
		fb->values[i] = value;
}

/* Store @value in the slot at head_index and advance head_index by one,
 * wrapping to 0 at fb->size.  Returns the value that slot held before. */
static int fifo_push(struct fifo_buffer *fb, int value)
{
	int *slot = &fb->values[fb->head_index];
	int evicted = *slot;

	*slot = value;

	fb->head_index++;
	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return evicted;
}

/* Add @value to each of the fb->size slots of the fifo.
 * fb->total is not adjusted here; that is left to the caller. */
static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int idx = 0;

	while (idx < fb->size) {
		fb->values[idx] += value;
		idx++;
	}
}

474 475 476 477
struct fifo_buffer *fifo_alloc(int fifo_size)
{
	struct fifo_buffer *fb;

478
	fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
479 480 481 482 483 484 485 486 487 488
	if (!fb)
		return NULL;

	fb->head_index = 0;
	fb->size = fifo_size;
	fb->total = 0;

	return fb;
}

489
static int drbd_rs_controller(struct drbd_device *mdev)
490
{
P
Philipp Reisner 已提交
491
	struct disk_conf *dc;
492 493 494 495 496 497 498 499
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	unsigned int want;     /* The number of sectors we want in the proxy */
	int req_sect; /* Number of sectors to request in this turn */
	int correction; /* Number of sectors more we need in the proxy*/
	int cps; /* correction per invocation of drbd_rs_controller() */
	int steps; /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;
P
Philipp Reisner 已提交
500
	struct fifo_buffer *plan;
501 502 503 504

	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
	mdev->rs_in_flight -= sect_in;

P
Philipp Reisner 已提交
505
	dc = rcu_dereference(mdev->ldev->disk_conf);
P
Philipp Reisner 已提交
506
	plan = rcu_dereference(mdev->rs_plan_s);
507

P
Philipp Reisner 已提交
508
	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
509 510

	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
P
Philipp Reisner 已提交
511
		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
512
	} else { /* normal path */
P
Philipp Reisner 已提交
513 514
		want = dc->c_fill_target ? dc->c_fill_target :
			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
515 516
	}

P
Philipp Reisner 已提交
517
	correction = want - mdev->rs_in_flight - plan->total;
518 519 520

	/* Plan ahead */
	cps = correction / steps;
P
Philipp Reisner 已提交
521 522
	fifo_add_val(plan, cps);
	plan->total += cps * steps;
523 524

	/* What we do in this step */
P
Philipp Reisner 已提交
525 526
	curr_corr = fifo_push(plan, 0);
	plan->total -= curr_corr;
527 528 529 530 531

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

P
Philipp Reisner 已提交
532
	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
533 534 535 536 537 538 539 540 541 542 543 544
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, mdev->rs_in_flight, want, correction,
		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}

545
static int drbd_rs_number_requests(struct drbd_device *mdev)
546 547
{
	int number;
P
Philipp Reisner 已提交
548 549 550

	rcu_read_lock();
	if (rcu_dereference(mdev->rs_plan_s)->size) {
551 552 553
		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
P
Philipp Reisner 已提交
554
		mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate;
555 556
		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
	}
P
Philipp Reisner 已提交
557
	rcu_read_unlock();
558 559 560 561 562 563

	/* ignore the amount of pending requests, the resync controller should
	 * throttle down to incoming reply rate soon enough anyways. */
	return number;
}

564
int w_make_resync_request(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
565
{
566
	struct drbd_device *mdev = w->mdev;
P
Philipp Reisner 已提交
567 568 569
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
570
	int max_bio_size;
571
	int number, rollback_i, size;
P
Philipp Reisner 已提交
572
	int align, queued, sndbuf;
573
	int i = 0;
P
Philipp Reisner 已提交
574 575

	if (unlikely(cancel))
576
		return 0;
P
Philipp Reisner 已提交
577

578 579 580
	if (mdev->rs_total == 0) {
		/* empty resync? */
		drbd_resync_finished(mdev);
581
		return 0;
582 583
	}

P
Philipp Reisner 已提交
584 585 586 587 588 589
	if (!get_ldev(mdev)) {
		/* Since we only need to access mdev->rsync a
		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		dev_err(DEV, "Disk broke down during resync!\n");
590
		return 0;
P
Philipp Reisner 已提交
591 592
	}

593
	max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
594 595
	number = drbd_rs_number_requests(mdev);
	if (number == 0)
596
		goto requeue;
P
Philipp Reisner 已提交
597 598 599

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests, when half of the send buffer is filled */
600 601 602 603
		mutex_lock(&mdev->tconn->data.mutex);
		if (mdev->tconn->data.socket) {
			queued = mdev->tconn->data.socket->sk->sk_wmem_queued;
			sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf;
P
Philipp Reisner 已提交
604 605 606 607
		} else {
			queued = 1;
			sndbuf = 0;
		}
608
		mutex_unlock(&mdev->tconn->data.mutex);
P
Philipp Reisner 已提交
609 610 611 612 613 614 615
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);

616
		if (bit == DRBD_END_OF_BITMAP) {
P
Philipp Reisner 已提交
617 618
			mdev->bm_resync_fo = drbd_bm_bits(mdev);
			put_ldev(mdev);
619
			return 0;
P
Philipp Reisner 已提交
620 621 622 623
		}

		sector = BM_BIT_TO_SECT(bit);

624 625
		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
P
Philipp Reisner 已提交
626 627 628 629 630 631 632 633 634 635
			mdev->bm_resync_fo = bit;
			goto requeue;
		}
		mdev->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
			drbd_rs_complete_io(mdev, sector);
			goto next_sector;
		}

636
#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
P
Philipp Reisner 已提交
637 638 639 640 641 642 643
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
644
		rollback_i = i;
P
Philipp Reisner 已提交
645
		for (;;) {
646
			if (size + BM_BLOCK_SIZE > max_bio_size)
P
Philipp Reisner 已提交
647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(mdev, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			mdev->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
678
		if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) {
P
Philipp Reisner 已提交
679
			switch (read_for_csum(mdev, sector, size)) {
680
			case -EIO: /* Disk failure */
P
Philipp Reisner 已提交
681
				put_ldev(mdev);
682
				return -EIO;
683
			case -EAGAIN: /* allocation failed, or ldev busy */
P
Philipp Reisner 已提交
684 685
				drbd_rs_complete_io(mdev, sector);
				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
686
				i = rollback_i;
P
Philipp Reisner 已提交
687
				goto requeue;
688 689 690 691 692
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
P
Philipp Reisner 已提交
693 694
			}
		} else {
695 696
			int err;

P
Philipp Reisner 已提交
697
			inc_rs_pending(mdev);
698 699 700
			err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
						 sector, size, ID_SYNCER);
			if (err) {
P
Philipp Reisner 已提交
701 702 703
				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(mdev);
				put_ldev(mdev);
704
				return err;
P
Philipp Reisner 已提交
705 706 707 708 709 710 711 712 713 714 715 716
			}
		}
	}

	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		put_ldev(mdev);
717
		return 0;
P
Philipp Reisner 已提交
718 719 720
	}

 requeue:
721
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
P
Philipp Reisner 已提交
722 723
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(mdev);
724
	return 0;
P
Philipp Reisner 已提交
725 726
}

727
static int w_make_ov_request(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
728
{
729
	struct drbd_device *mdev = w->mdev;
P
Philipp Reisner 已提交
730 731 732
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
733
	bool stop_sector_reached = false;
P
Philipp Reisner 已提交
734 735 736 737

	if (unlikely(cancel))
		return 1;

738
	number = drbd_rs_number_requests(mdev);
P
Philipp Reisner 已提交
739 740 741

	sector = mdev->ov_position;
	for (i = 0; i < number; i++) {
742
		if (sector >= capacity)
P
Philipp Reisner 已提交
743
			return 1;
744 745 746 747 748 749 750 751 752

		/* We check for "finished" only in the reply path:
		 * w_e_end_ov_reply().
		 * We need to send at least one request out. */
		stop_sector_reached = i > 0
			&& verify_can_do_stop_sector(mdev)
			&& sector >= mdev->ov_stop_sector;
		if (stop_sector_reached)
			break;
P
Philipp Reisner 已提交
753 754 755

		size = BM_BLOCK_SIZE;

756 757
		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
P
Philipp Reisner 已提交
758 759 760 761 762 763 764 765
			mdev->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(mdev);
766
		if (drbd_send_ov_request(mdev, sector, size)) {
P
Philipp Reisner 已提交
767 768 769 770 771 772 773 774
			dec_rs_pending(mdev);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	mdev->ov_position = sector;

 requeue:
775
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
776 777
	if (i == 0 || !stop_sector_reached)
		mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
P
Philipp Reisner 已提交
778 779 780
	return 1;
}

781
int w_ov_finished(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
782
{
783
	struct drbd_device *mdev = w->mdev;
P
Philipp Reisner 已提交
784
	kfree(w);
785
	ov_out_of_sync_print(mdev);
P
Philipp Reisner 已提交
786 787
	drbd_resync_finished(mdev);

788
	return 0;
P
Philipp Reisner 已提交
789 790
}

791
static int w_resync_finished(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
792
{
793
	struct drbd_device *mdev = w->mdev;
P
Philipp Reisner 已提交
794 795 796 797
	kfree(w);

	drbd_resync_finished(mdev);

798
	return 0;
P
Philipp Reisner 已提交
799 800
}

801
static void ping_peer(struct drbd_device *mdev)
802
{
803 804 805 806 807 808
	struct drbd_tconn *tconn = mdev->tconn;

	clear_bit(GOT_PING_ACK, &tconn->flags);
	request_ping(tconn);
	wait_event(tconn->ping_wait,
		   test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED);
809 810
}

811
int drbd_resync_finished(struct drbd_device *mdev)
P
Philipp Reisner 已提交
812 813 814 815 816 817
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_work *w;
	char *khelper_cmd = NULL;
818
	int verify_done = 0;
P
Philipp Reisner 已提交
819 820 821 822 823 824 825 826 827 828

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(mdev)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * is not finished by now).   Retry in 100ms. */

829
		schedule_timeout_interruptible(HZ / 10);
P
Philipp Reisner 已提交
830 831 832
		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
		if (w) {
			w->cb = w_resync_finished;
833
			w->mdev = mdev;
834
			drbd_queue_work(&mdev->tconn->sender_work, w);
P
Philipp Reisner 已提交
835 836 837 838 839 840 841 842
			return 1;
		}
		dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
	}

	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;
843
	
P
Philipp Reisner 已提交
844
	db = mdev->rs_total;
845 846 847 848
	/* adjust for verify start and stop sectors, respective reached position */
	if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
		db -= mdev->ov_left;

P
Philipp Reisner 已提交
849 850 851 852 853 854
	dbdt = Bit2KB(db/dt);
	mdev->rs_paused /= HZ;

	if (!get_ldev(mdev))
		goto out;

855 856
	ping_peer(mdev);

857
	spin_lock_irq(&mdev->tconn->req_lock);
858
	os = drbd_read_state(mdev);
P
Philipp Reisner 已提交
859

860 861
	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);

P
Philipp Reisner 已提交
862 863 864 865 866 867 868 869 870
	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
871
	     verify_done ? "Online verify" : "Resync",
P
Philipp Reisner 已提交
872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887
	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(mdev);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT((n_oos - mdev->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

888
		if (mdev->tconn->csums_tfm && mdev->rs_total) {
P
Philipp Reisner 已提交
889 890 891 892 893
			const unsigned long s = mdev->rs_same_csum;
			const unsigned long t = mdev->rs_total;
			const int ratio =
				(t == 0)     ? 0 :
			(t < 100000) ? ((s*100)/t) : (s/(t/100));
B
Bart Van Assche 已提交
894
			dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; "
P
Philipp Reisner 已提交
895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total));
		}
	}

	if (mdev->rs_failed) {
		dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (mdev->p_uuid) {
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
			} else {
				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
			}
		}

929 930 931 932 933 934 935 936 937 938 939 940
		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
			/* for verify runs, we don't update uuids here,
			 * so there would be nothing to report. */
			drbd_uuid_set_bm(mdev, 0UL);
			drbd_print_uuids(mdev, "updated UUIDs");
			if (mdev->p_uuid) {
				/* Now the two UUID sets are equal, update what we
				 * know of the peer. */
				int i;
				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
					mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
			}
P
Philipp Reisner 已提交
941 942 943 944 945
		}
	}

	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
946
	spin_unlock_irq(&mdev->tconn->req_lock);
P
Philipp Reisner 已提交
947 948 949 950 951
	put_ldev(mdev);
out:
	mdev->rs_total  = 0;
	mdev->rs_failed = 0;
	mdev->rs_paused = 0;
952 953 954

	/* reset start sector, if we reached end of device */
	if (verify_done && mdev->ov_left == 0)
955
		mdev->ov_start_sector = 0;
P
Philipp Reisner 已提交
956

957 958
	drbd_md_sync(mdev);

P
Philipp Reisner 已提交
959 960 961 962 963 964 965
	if (khelper_cmd)
		drbd_khelper(mdev, khelper_cmd);

	return 1;
}

/* helper */
966
static void move_to_net_ee_or_free(struct drbd_device *mdev, struct drbd_peer_request *peer_req)
P
Philipp Reisner 已提交
967
{
968
	if (drbd_peer_req_has_active_page(peer_req)) {
P
Philipp Reisner 已提交
969
		/* This might happen if sendpage() has not finished */
970
		int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
971 972
		atomic_add(i, &mdev->pp_in_use_by_net);
		atomic_sub(i, &mdev->pp_in_use);
973
		spin_lock_irq(&mdev->tconn->req_lock);
974
		list_add_tail(&peer_req->w.list, &mdev->net_ee);
975
		spin_unlock_irq(&mdev->tconn->req_lock);
976
		wake_up(&drbd_pp_wait);
P
Philipp Reisner 已提交
977
	} else
978
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
979 980 981 982 983 984 985 986
}

/**
 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
987
int w_e_end_data_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
988
{
989
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
990
	struct drbd_device *mdev = w->mdev;
991
	int err;
P
Philipp Reisner 已提交
992 993

	if (unlikely(cancel)) {
994
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
995
		dec_unacked(mdev);
996
		return 0;
P
Philipp Reisner 已提交
997 998
	}

999
	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1000
		err = drbd_send_block(mdev, P_DATA_REPLY, peer_req);
P
Philipp Reisner 已提交
1001 1002 1003
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
1004
			    (unsigned long long)peer_req->i.sector);
P
Philipp Reisner 已提交
1005

1006
		err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req);
P
Philipp Reisner 已提交
1007 1008 1009 1010
	}

	dec_unacked(mdev);

1011
	move_to_net_ee_or_free(mdev, peer_req);
P
Philipp Reisner 已提交
1012

1013
	if (unlikely(err))
P
Philipp Reisner 已提交
1014
		dev_err(DEV, "drbd_send_block() failed\n");
1015
	return err;
P
Philipp Reisner 已提交
1016 1017 1018
}

/**
1019
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
P
Philipp Reisner 已提交
1020 1021 1022 1023
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
1024
int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1025
{
1026
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1027
	struct drbd_device *mdev = w->mdev;
1028
	int err;
P
Philipp Reisner 已提交
1029 1030

	if (unlikely(cancel)) {
1031
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1032
		dec_unacked(mdev);
1033
		return 0;
P
Philipp Reisner 已提交
1034 1035 1036
	}

	if (get_ldev_if_state(mdev, D_FAILED)) {
1037
		drbd_rs_complete_io(mdev, peer_req->i.sector);
P
Philipp Reisner 已提交
1038 1039 1040
		put_ldev(mdev);
	}

1041
	if (mdev->state.conn == C_AHEAD) {
1042
		err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req);
1043
	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
P
Philipp Reisner 已提交
1044 1045
		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(mdev);
1046
			err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
P
Philipp Reisner 已提交
1047 1048 1049 1050
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				dev_err(DEV, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
1051
			err = 0;
P
Philipp Reisner 已提交
1052 1053 1054 1055
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
1056
			    (unsigned long long)peer_req->i.sector);
P
Philipp Reisner 已提交
1057

1058
		err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
P
Philipp Reisner 已提交
1059 1060

		/* update resync data with failure */
1061
		drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size);
P
Philipp Reisner 已提交
1062 1063 1064 1065
	}

	dec_unacked(mdev);

1066
	move_to_net_ee_or_free(mdev, peer_req);
P
Philipp Reisner 已提交
1067

1068
	if (unlikely(err))
P
Philipp Reisner 已提交
1069
		dev_err(DEV, "drbd_send_block() failed\n");
1070
	return err;
P
Philipp Reisner 已提交
1071 1072
}

1073
int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1074
{
1075
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1076
	struct drbd_device *mdev = w->mdev;
P
Philipp Reisner 已提交
1077 1078 1079
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
1080
	int err, eq = 0;
P
Philipp Reisner 已提交
1081 1082

	if (unlikely(cancel)) {
1083
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1084
		dec_unacked(mdev);
1085
		return 0;
P
Philipp Reisner 已提交
1086 1087
	}

1088
	if (get_ldev(mdev)) {
1089
		drbd_rs_complete_io(mdev, peer_req->i.sector);
1090 1091
		put_ldev(mdev);
	}
P
Philipp Reisner 已提交
1092

1093
	di = peer_req->digest;
P
Philipp Reisner 已提交
1094

1095
	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
P
Philipp Reisner 已提交
1096 1097 1098
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
1099 1100
		if (mdev->tconn->csums_tfm) {
			digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
P
Philipp Reisner 已提交
1101 1102 1103 1104
			D_ASSERT(digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
1105
			drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
P
Philipp Reisner 已提交
1106 1107 1108 1109 1110
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
1111
			drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size);
1112
			/* rs_same_csums unit is BM_BLOCK_SIZE */
1113
			mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
1114
			err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req);
P
Philipp Reisner 已提交
1115 1116
		} else {
			inc_rs_pending(mdev);
1117 1118
			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
1119
			kfree(di);
1120
			err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
P
Philipp Reisner 已提交
1121 1122
		}
	} else {
1123
		err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
P
Philipp Reisner 已提交
1124 1125 1126 1127 1128
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(mdev);
1129
	move_to_net_ee_or_free(mdev, peer_req);
P
Philipp Reisner 已提交
1130

1131
	if (unlikely(err))
P
Philipp Reisner 已提交
1132
		dev_err(DEV, "drbd_send_block/ack() failed\n");
1133
	return err;
P
Philipp Reisner 已提交
1134 1135
}

1136
int w_e_end_ov_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1137
{
1138
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1139
	struct drbd_device *mdev = w->mdev;
1140 1141
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
P
Philipp Reisner 已提交
1142 1143
	int digest_size;
	void *digest;
1144
	int err = 0;
P
Philipp Reisner 已提交
1145 1146 1147 1148

	if (unlikely(cancel))
		goto out;

1149
	digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
P
Philipp Reisner 已提交
1150
	digest = kmalloc(digest_size, GFP_NOIO);
1151
	if (!digest) {
1152
		err = 1;	/* terminate the connection in case the allocation failed */
1153
		goto out;
P
Philipp Reisner 已提交
1154 1155
	}

1156
	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
1157
		drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
1158 1159 1160
	else
		memset(digest, 0, digest_size);

1161 1162 1163 1164
	/* Free e and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
1165
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
1166
	drbd_free_peer_req(mdev, peer_req);
1167
	peer_req = NULL;
1168
	inc_rs_pending(mdev);
1169 1170
	err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY);
	if (err)
1171 1172 1173
		dec_rs_pending(mdev);
	kfree(digest);

P
Philipp Reisner 已提交
1174
out:
1175
	if (peer_req)
1176
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1177
	dec_unacked(mdev);
1178
	return err;
P
Philipp Reisner 已提交
1179 1180
}

1181
/*
 * Record an out-of-sync block found during online verify.
 *
 * If the block starts exactly where the previously recorded range
 * (ov_last_oos_start, ov_last_oos_size) ends, that range is extended;
 * otherwise a new range is started.  The area is then marked out of sync.
 * @size is shifted right by 9 before it is added to the range.
 */
void drbd_ov_out_of_sync_found(struct drbd_device *mdev, sector_t sector, int size)
{
	bool extends_last_range =
		(mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector);

	if (!extends_last_range) {
		mdev->ov_last_oos_start = sector;
		mdev->ov_last_oos_size = size >> 9;
	} else {
		mdev->ov_last_oos_size += size >> 9;
	}

	drbd_set_out_of_sync(mdev, sector, size);
}

1192
int w_e_end_ov_reply(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1193
{
1194
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1195
	struct drbd_device *mdev = w->mdev;
P
Philipp Reisner 已提交
1196 1197
	struct digest_info *di;
	void *digest;
1198 1199
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
1200
	int digest_size;
1201
	int err, eq = 0;
1202
	bool stop_sector_reached = false;
P
Philipp Reisner 已提交
1203 1204

	if (unlikely(cancel)) {
1205
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1206
		dec_unacked(mdev);
1207
		return 0;
P
Philipp Reisner 已提交
1208 1209 1210 1211
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
1212
	if (get_ldev(mdev)) {
1213
		drbd_rs_complete_io(mdev, peer_req->i.sector);
1214 1215
		put_ldev(mdev);
	}
P
Philipp Reisner 已提交
1216

1217
	di = peer_req->digest;
P
Philipp Reisner 已提交
1218

1219
	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1220
		digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
P
Philipp Reisner 已提交
1221 1222
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
1223
			drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
P
Philipp Reisner 已提交
1224 1225 1226 1227 1228 1229 1230

			D_ASSERT(digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	}

1231 1232 1233 1234
	/* Free peer_req and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
1235
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
1236
	drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1237
	if (!eq)
1238
		drbd_ov_out_of_sync_found(mdev, sector, size);
P
Philipp Reisner 已提交
1239
	else
1240
		ov_out_of_sync_print(mdev);
P
Philipp Reisner 已提交
1241

1242
	err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
1243
			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
P
Philipp Reisner 已提交
1244

1245
	dec_unacked(mdev);
P
Philipp Reisner 已提交
1246

1247 1248 1249 1250 1251 1252
	--mdev->ov_left;

	/* let's advance progress step marks only for every other megabyte */
	if ((mdev->ov_left & 0x200) == 0x200)
		drbd_advance_rs_marks(mdev, mdev->ov_left);

1253 1254 1255 1256
	stop_sector_reached = verify_can_do_stop_sector(mdev) &&
		(sector + (size>>9)) >= mdev->ov_stop_sector;

	if (mdev->ov_left == 0 || stop_sector_reached) {
1257
		ov_out_of_sync_print(mdev);
P
Philipp Reisner 已提交
1258 1259 1260
		drbd_resync_finished(mdev);
	}

1261
	return err;
P
Philipp Reisner 已提交
1262 1263
}

1264
int w_prev_work_done(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1265 1266
{
	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
1267

P
Philipp Reisner 已提交
1268
	complete(&b->done);
1269
	return 0;
P
Philipp Reisner 已提交
1270 1271
}

1272 1273 1274 1275 1276
/* FIXME
 * We need to track the number of pending barrier acks,
 * and to be able to wait for them.
 * See also comment in drbd_adm_attach before drbd_suspend_io.
 */
1277
static int drbd_send_barrier(struct drbd_tconn *tconn)
P
Philipp Reisner 已提交
1278
{
1279
	struct p_barrier *p;
1280
	struct drbd_socket *sock;
P
Philipp Reisner 已提交
1281

1282 1283
	sock = &tconn->data;
	p = conn_prepare_command(tconn, sock);
1284 1285
	if (!p)
		return -EIO;
1286 1287 1288 1289 1290
	p->barrier = tconn->send.current_epoch_nr;
	p->pad = 0;
	tconn->send.current_epoch_writes = 0;

	return conn_send_command(tconn, sock, P_BARRIER, sizeof(*p), NULL, 0);
P
Philipp Reisner 已提交
1291 1292
}

1293
int w_send_write_hint(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1294
{
1295
	struct drbd_device *mdev = w->mdev;
1296 1297
	struct drbd_socket *sock;

P
Philipp Reisner 已提交
1298
	if (cancel)
1299
		return 0;
1300 1301 1302
	sock = &mdev->tconn->data;
	if (!drbd_prepare_command(mdev, sock))
		return -EIO;
1303
	return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
P
Philipp Reisner 已提交
1304 1305
}

1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326
/*
 * On the first write seen on this connection, start the sender side epoch
 * tracking at @epoch with zero writes.  Does nothing on later calls.
 */
static void re_init_if_first_write(struct drbd_tconn *tconn, unsigned int epoch)
{
	if (tconn->send.seen_any_write_yet)
		return;

	tconn->send.seen_any_write_yet = true;
	tconn->send.current_epoch_nr = epoch;
	tconn->send.current_epoch_writes = 0;
}

/*
 * If @epoch differs from the epoch currently tracked for sending, close
 * the tracked epoch: send a barrier if it contained any writes, then
 * switch to @epoch.  The result of drbd_send_barrier() is not checked.
 */
static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch)
{
	/* no write seen on this connection yet: there is no epoch to close */
	if (!tconn->send.seen_any_write_yet)
		return;

	if (tconn->send.current_epoch_nr == epoch)
		return;

	if (tconn->send.current_epoch_writes)
		drbd_send_barrier(tconn);
	tconn->send.current_epoch_nr = epoch;
}

1327
int w_send_out_of_sync(struct drbd_work *w, int cancel)
1328 1329
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
1330
	struct drbd_device *mdev = w->mdev;
1331
	struct drbd_tconn *tconn = mdev->tconn;
1332
	int err;
1333 1334

	if (unlikely(cancel)) {
1335
		req_mod(req, SEND_CANCELED);
1336
		return 0;
1337 1338
	}

1339 1340 1341 1342
	/* this time, no tconn->send.current_epoch_writes++;
	 * If it was sent, it was the closing barrier for the last
	 * replicated epoch, before we went into AHEAD mode.
	 * No more barriers will be sent, until we leave AHEAD mode again. */
1343
	maybe_send_barrier(tconn, req->epoch);
1344

1345
	err = drbd_send_out_of_sync(mdev, req);
1346
	req_mod(req, OOS_HANDED_TO_NETWORK);
1347

1348
	return err;
1349 1350
}

P
Philipp Reisner 已提交
1351 1352 1353 1354 1355 1356
/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
1357
int w_send_dblock(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1358 1359
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
1360
	struct drbd_device *mdev = w->mdev;
1361
	struct drbd_tconn *tconn = mdev->tconn;
1362
	int err;
P
Philipp Reisner 已提交
1363 1364

	if (unlikely(cancel)) {
1365
		req_mod(req, SEND_CANCELED);
1366
		return 0;
P
Philipp Reisner 已提交
1367 1368
	}

1369 1370
	re_init_if_first_write(tconn, req->epoch);
	maybe_send_barrier(tconn, req->epoch);
1371 1372
	tconn->send.current_epoch_writes++;

1373 1374
	err = drbd_send_dblock(mdev, req);
	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
P
Philipp Reisner 已提交
1375

1376
	return err;
P
Philipp Reisner 已提交
1377 1378 1379 1380 1381 1382 1383 1384
}

/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
1385
int w_send_read_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1386 1387
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
1388
	struct drbd_device *mdev = w->mdev;
1389
	struct drbd_tconn *tconn = mdev->tconn;
1390
	int err;
P
Philipp Reisner 已提交
1391 1392

	if (unlikely(cancel)) {
1393
		req_mod(req, SEND_CANCELED);
1394
		return 0;
P
Philipp Reisner 已提交
1395 1396
	}

1397 1398
	/* Even read requests may close a write epoch,
	 * if there was any yet. */
1399
	maybe_send_barrier(tconn, req->epoch);
1400

1401
	err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size,
1402
				 (unsigned long)req);
P
Philipp Reisner 已提交
1403

1404
	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
P
Philipp Reisner 已提交
1405

1406
	return err;
P
Philipp Reisner 已提交
1407 1408
}

1409
int w_restart_disk_io(struct drbd_work *w, int cancel)
1410 1411
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
1412
	struct drbd_device *mdev = w->mdev;
1413

1414
	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1415
		drbd_al_begin_io(mdev, &req->i, false);
1416 1417 1418 1419 1420

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
	generic_make_request(req->private_bio);

1421
	return 0;
1422 1423
}

1424
static int _drbd_may_sync_now(struct drbd_device *mdev)
P
Philipp Reisner 已提交
1425
{
1426
	struct drbd_device *odev = mdev;
1427
	int resync_after;
P
Philipp Reisner 已提交
1428 1429

	while (1) {
1430
		if (!odev->ldev || odev->state.disk == D_DISKLESS)
1431
			return 1;
P
Philipp Reisner 已提交
1432
		rcu_read_lock();
1433
		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
P
Philipp Reisner 已提交
1434
		rcu_read_unlock();
1435
		if (resync_after == -1)
P
Philipp Reisner 已提交
1436
			return 1;
1437
		odev = minor_to_mdev(resync_after);
1438
		if (!odev)
1439
			return 1;
P
Philipp Reisner 已提交
1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453
		if ((odev->state.conn >= C_SYNC_SOURCE &&
		     odev->state.conn <= C_PAUSED_SYNC_T) ||
		    odev->state.aftr_isp || odev->state.peer_isp ||
		    odev->state.user_isp)
			return 0;
	}
}

/**
 * _drbd_pause_after() - Pause resync on all devices that may not resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and after_state_ch).
 */
1454
static int _drbd_pause_after(struct drbd_device *mdev)
P
Philipp Reisner 已提交
1455
{
1456
	struct drbd_device *odev;
P
Philipp Reisner 已提交
1457 1458
	int i, rv = 0;

1459
	rcu_read_lock();
1460
	idr_for_each_entry(&minors, odev, i) {
P
Philipp Reisner 已提交
1461 1462 1463 1464 1465 1466
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (!_drbd_may_sync_now(odev))
			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
			       != SS_NOTHING_TO_DO);
	}
1467
	rcu_read_unlock();
P
Philipp Reisner 已提交
1468 1469 1470 1471 1472 1473 1474 1475 1476 1477

	return rv;
}

/**
 * _drbd_resume_next() - Resume resync on all devices that may resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and worker).
 */
1478
static int _drbd_resume_next(struct drbd_device *mdev)
P
Philipp Reisner 已提交
1479
{
1480
	struct drbd_device *odev;
P
Philipp Reisner 已提交
1481 1482
	int i, rv = 0;

1483
	rcu_read_lock();
1484
	idr_for_each_entry(&minors, odev, i) {
P
Philipp Reisner 已提交
1485 1486 1487 1488 1489 1490 1491 1492 1493
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (odev->state.aftr_isp) {
			if (_drbd_may_sync_now(odev))
				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
							CS_HARD, NULL)
				       != SS_NOTHING_TO_DO) ;
		}
	}
1494
	rcu_read_unlock();
P
Philipp Reisner 已提交
1495 1496 1497
	return rv;
}

1498
void resume_next_sg(struct drbd_device *mdev)
P
Philipp Reisner 已提交
1499 1500 1501 1502 1503 1504
{
	write_lock_irq(&global_state_lock);
	_drbd_resume_next(mdev);
	write_unlock_irq(&global_state_lock);
}

1505
void suspend_other_sg(struct drbd_device *mdev)
P
Philipp Reisner 已提交
1506 1507 1508 1509 1510 1511
{
	write_lock_irq(&global_state_lock);
	_drbd_pause_after(mdev);
	write_unlock_irq(&global_state_lock);
}

1512
/* caller must hold global_state_lock */
1513
enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *mdev, int o_minor)
P
Philipp Reisner 已提交
1514
{
1515
	struct drbd_device *odev;
1516
	int resync_after;
P
Philipp Reisner 已提交
1517 1518 1519

	if (o_minor == -1)
		return NO_ERROR;
1520
	if (o_minor < -1 || o_minor > MINORMASK)
1521
		return ERR_RESYNC_AFTER;
P
Philipp Reisner 已提交
1522 1523 1524 1525 1526

	/* check for loops */
	odev = minor_to_mdev(o_minor);
	while (1) {
		if (odev == mdev)
1527
			return ERR_RESYNC_AFTER_CYCLE;
P
Philipp Reisner 已提交
1528

1529 1530 1531 1532 1533 1534 1535 1536 1537
		/* You are free to depend on diskless, non-existing,
		 * or not yet/no longer existing minors.
		 * We only reject dependency loops.
		 * We cannot follow the dependency chain beyond a detached or
		 * missing minor.
		 */
		if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
			return NO_ERROR;

P
Philipp Reisner 已提交
1538
		rcu_read_lock();
1539
		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
P
Philipp Reisner 已提交
1540
		rcu_read_unlock();
P
Philipp Reisner 已提交
1541
		/* dependency chain ends here, no cycles. */
1542
		if (resync_after == -1)
P
Philipp Reisner 已提交
1543 1544 1545
			return NO_ERROR;

		/* follow the dependency chain */
1546
		odev = minor_to_mdev(resync_after);
P
Philipp Reisner 已提交
1547 1548 1549
	}
}

1550
/*
 * Re-evaluate which devices have to pause and which may resume their
 * resync.  Pausing and resuming is repeated until a full pass changes
 * nothing anymore.
 *
 * caller must hold global_state_lock
 */
void drbd_resync_after_changed(struct drbd_device *mdev)
{
	for (;;) {
		int paused = _drbd_pause_after(mdev);
		int resumed = _drbd_resume_next(mdev);

		if (!(paused | resumed))
			break;
	}
}

1561
void drbd_rs_controller_reset(struct drbd_device *mdev)
1562
{
P
Philipp Reisner 已提交
1563 1564
	struct fifo_buffer *plan;

1565 1566 1567
	atomic_set(&mdev->rs_sect_in, 0);
	atomic_set(&mdev->rs_sect_ev, 0);
	mdev->rs_in_flight = 0;
P
Philipp Reisner 已提交
1568 1569 1570 1571 1572 1573 1574 1575 1576 1577

	/* Updating the RCU protected object in place is necessary since
	   this function gets called from atomic context.
	   It is valid since all other updates also lead to an completely
	   empty fifo */
	rcu_read_lock();
	plan = rcu_dereference(mdev->rs_plan_s);
	plan->total = 0;
	fifo_set(plan, 0);
	rcu_read_unlock();
1578 1579
}

P
Philipp Reisner 已提交
1580 1581
void start_resync_timer_fn(unsigned long data)
{
1582
	struct drbd_device *mdev = (struct drbd_device *) data;
P
Philipp Reisner 已提交
1583

1584
	drbd_queue_work(&mdev->tconn->sender_work, &mdev->start_resync_work);
P
Philipp Reisner 已提交
1585 1586
}

1587
int w_start_resync(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1588
{
1589
	struct drbd_device *mdev = w->mdev;
1590

P
Philipp Reisner 已提交
1591 1592 1593 1594
	if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
		dev_warn(DEV, "w_start_resync later...\n");
		mdev->start_resync_timer.expires = jiffies + HZ/10;
		add_timer(&mdev->start_resync_timer);
1595
		return 0;
P
Philipp Reisner 已提交
1596 1597 1598
	}

	drbd_start_resync(mdev, C_SYNC_SOURCE);
1599
	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
1600
	return 0;
P
Philipp Reisner 已提交
1601 1602
}

P
Philipp Reisner 已提交
1603 1604 1605 1606 1607 1608 1609 1610
/**
 * drbd_start_resync() - Start the resync process
 * @mdev:	DRBD device.
 * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
 *
 * This function might bring you directly into one of the
 * C_PAUSED_SYNC_* states.
 */
1611
void drbd_start_resync(struct drbd_device *mdev, enum drbd_conns side)
P
Philipp Reisner 已提交
1612 1613 1614 1615
{
	union drbd_state ns;
	int r;

1616
	if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) {
P
Philipp Reisner 已提交
1617 1618 1619 1620
		dev_err(DEV, "Resync already running!\n");
		return;
	}

1621 1622 1623 1624 1625 1626 1627 1628 1629
	if (!test_bit(B_RS_H_DONE, &mdev->flags)) {
		if (side == C_SYNC_TARGET) {
			/* Since application IO was locked out during C_WF_BITMAP_T and
			   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
			   we check that we might make the data inconsistent. */
			r = drbd_khelper(mdev, "before-resync-target");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				dev_info(DEV, "before-resync-target handler returned %d, "
1630
					 "dropping connection.\n", r);
1631
				conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
1632 1633
				return;
			}
1634 1635 1636 1637 1638 1639 1640 1641 1642 1643
		} else /* C_SYNC_SOURCE */ {
			r = drbd_khelper(mdev, "before-resync-source");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				if (r == 3) {
					dev_info(DEV, "before-resync-source handler returned %d, "
						 "ignoring. Old userland tools?", r);
				} else {
					dev_info(DEV, "before-resync-source handler returned %d, "
						 "dropping connection.\n", r);
1644
					conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
1645 1646 1647
					return;
				}
			}
1648
		}
P
Philipp Reisner 已提交
1649 1650
	}

1651
	if (current == mdev->tconn->worker.task) {
1652
		/* The worker should not sleep waiting for state_mutex,
1653
		   that can take long */
1654
		if (!mutex_trylock(mdev->state_mutex)) {
1655 1656 1657 1658 1659 1660
			set_bit(B_RS_H_DONE, &mdev->flags);
			mdev->start_resync_timer.expires = jiffies + HZ/5;
			add_timer(&mdev->start_resync_timer);
			return;
		}
	} else {
1661
		mutex_lock(mdev->state_mutex);
1662 1663
	}
	clear_bit(B_RS_H_DONE, &mdev->flags);
P
Philipp Reisner 已提交
1664

1665
	write_lock_irq(&global_state_lock);
1666 1667 1668
	/* Did some connection breakage or IO error race with us? */
	if (mdev->state.conn < C_CONNECTED
	|| !get_ldev_if_state(mdev, D_NEGOTIATING)) {
1669
		write_unlock_irq(&global_state_lock);
1670
		mutex_unlock(mdev->state_mutex);
P
Philipp Reisner 已提交
1671 1672 1673
		return;
	}

1674
	ns = drbd_read_state(mdev);
P
Philipp Reisner 已提交
1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685

	ns.aftr_isp = !_drbd_may_sync_now(mdev);

	ns.conn = side;

	if (side == C_SYNC_TARGET)
		ns.disk = D_INCONSISTENT;
	else /* side == C_SYNC_SOURCE */
		ns.pdsk = D_INCONSISTENT;

	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1686
	ns = drbd_read_state(mdev);
P
Philipp Reisner 已提交
1687 1688 1689 1690 1691

	if (ns.conn < C_CONNECTED)
		r = SS_UNKNOWN_ERROR;

	if (r == SS_SUCCESS) {
1692 1693 1694 1695
		unsigned long tw = drbd_bm_total_weight(mdev);
		unsigned long now = jiffies;
		int i;

P
Philipp Reisner 已提交
1696 1697 1698
		mdev->rs_failed    = 0;
		mdev->rs_paused    = 0;
		mdev->rs_same_csum = 0;
1699 1700
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
1701 1702 1703 1704 1705 1706
		mdev->rs_total     = tw;
		mdev->rs_start     = now;
		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = tw;
			mdev->rs_mark_time[i] = now;
		}
P
Philipp Reisner 已提交
1707 1708 1709
		_drbd_pause_after(mdev);
	}
	write_unlock_irq(&global_state_lock);
1710

P
Philipp Reisner 已提交
1711
	if (r == SS_SUCCESS) {
1712 1713 1714 1715
		/* reset rs_last_bcast when a resync or verify is started,
		 * to deal with potential jiffies wrap. */
		mdev->rs_last_bcast = jiffies - HZ;

P
Philipp Reisner 已提交
1716 1717 1718 1719
		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) mdev->rs_total);
1720 1721 1722 1723 1724 1725 1726 1727 1728 1729
		if (side == C_SYNC_TARGET)
			mdev->bm_resync_fo = 0;

		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
		 * with w_send_oos, or the sync target will get confused as to
		 * how much bits to resync.  We cannot do that always, because for an
		 * empty resync and protocol < 95, we need to do it here, as we call
		 * drbd_resync_finished from here in that case.
		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
		 * and from after_state_ch otherwise. */
1730
		if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96)
1731
			drbd_gen_and_send_sync_uuid(mdev);
P
Philipp Reisner 已提交
1732

1733
		if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) {
1734 1735 1736 1737 1738 1739 1740 1741 1742 1743
			/* This still has a race (about when exactly the peers
			 * detect connection loss) that can lead to a full sync
			 * on next handshake. In 8.3.9 we fixed this with explicit
			 * resync-finished notifications, but the fix
			 * introduces a protocol change.  Sleeping for some
			 * time longer than the ping interval + timeout on the
			 * SyncSource, to give the SyncTarget the chance to
			 * detect connection loss, then waiting for a ping
			 * response (implicit in drbd_resync_finished) reduces
			 * the race considerably, but does not solve it. */
1744 1745 1746 1747 1748 1749 1750 1751 1752 1753
			if (side == C_SYNC_SOURCE) {
				struct net_conf *nc;
				int timeo;

				rcu_read_lock();
				nc = rcu_dereference(mdev->tconn->net_conf);
				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
				rcu_read_unlock();
				schedule_timeout_interruptible(timeo);
			}
P
Philipp Reisner 已提交
1754 1755 1756
			drbd_resync_finished(mdev);
		}

1757
		drbd_rs_controller_reset(mdev);
P
Philipp Reisner 已提交
1758 1759 1760 1761 1762 1763 1764 1765 1766
		/* ns.conn may already be != mdev->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);

		drbd_md_sync(mdev);
	}
1767
	put_ldev(mdev);
1768
	mutex_unlock(mdev->state_mutex);
P
Philipp Reisner 已提交
1769 1770
}

1771 1772 1773 1774
/* If the resource already closed the current epoch, but we did not
 * (because we have not yet seen new requests), we should send the
 * corresponding barrier now.  Must be checked within the same spinlock
 * that is used to check for new requests. */
1775
static bool need_to_send_barrier(struct drbd_tconn *connection)
1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798
{
	if (!connection->send.seen_any_write_yet)
		return false;

	/* Skip barriers that do not contain any writes.
	 * This may happen during AHEAD mode. */
	if (!connection->send.current_epoch_writes)
		return false;

	/* ->req_lock is held when requests are queued on
	 * connection->sender_work, and put into ->transfer_log.
	 * It is also held when ->current_tle_nr is increased.
	 * So either there are already new requests queued,
	 * and corresponding barriers will be send there.
	 * Or nothing new is queued yet, so the difference will be 1.
	 */
	if (atomic_read(&connection->current_tle_nr) !=
	    connection->send.current_epoch_nr + 1)
		return false;

	return true;
}

1799
static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
1800 1801 1802 1803 1804 1805 1806
{
	spin_lock_irq(&queue->q_lock);
	list_splice_init(&queue->q, work_list);
	spin_unlock_irq(&queue->q_lock);
	return !list_empty(work_list);
}

1807
static bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
1808 1809 1810 1811 1812 1813 1814 1815
{
	spin_lock_irq(&queue->q_lock);
	if (!list_empty(&queue->q))
		list_move(queue->q.next, work_list);
	spin_unlock_irq(&queue->q_lock);
	return !list_empty(work_list);
}

1816
static void wait_for_work(struct drbd_tconn *connection, struct list_head *work_list)
1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847
{
	DEFINE_WAIT(wait);
	struct net_conf *nc;
	int uncork, cork;

	dequeue_work_item(&connection->sender_work, work_list);
	if (!list_empty(work_list))
		return;

	/* Still nothing to do?
	 * Maybe we still need to close the current epoch,
	 * even if no new requests are queued yet.
	 *
	 * Also, poke TCP, just in case.
	 * Then wait for new work (or signal). */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	uncork = nc ? nc->tcp_cork : 0;
	rcu_read_unlock();
	if (uncork) {
		mutex_lock(&connection->data.mutex);
		if (connection->data.socket)
			drbd_tcp_uncork(connection->data.socket);
		mutex_unlock(&connection->data.mutex);
	}

	for (;;) {
		int send_barrier;
		prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
		spin_lock_irq(&connection->req_lock);
		spin_lock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
1848 1849 1850 1851
		/* dequeue single item only,
		 * we still use drbd_queue_work_front() in some places */
		if (!list_empty(&connection->sender_work.q))
			list_move(connection->sender_work.q.next, work_list);
1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884
		spin_unlock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
		if (!list_empty(work_list) || signal_pending(current)) {
			spin_unlock_irq(&connection->req_lock);
			break;
		}
		send_barrier = need_to_send_barrier(connection);
		spin_unlock_irq(&connection->req_lock);
		if (send_barrier) {
			drbd_send_barrier(connection);
			connection->send.current_epoch_nr++;
		}
		schedule();
		/* may be woken up for other things but new work, too,
		 * e.g. if the current epoch got closed.
		 * In which case we send the barrier above. */
	}
	finish_wait(&connection->sender_work.q_wait, &wait);

	/* someone may have changed the config while we have been waiting above. */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	cork = nc ? nc->tcp_cork : 0;
	rcu_read_unlock();
	mutex_lock(&connection->data.mutex);
	if (connection->data.socket) {
		if (cork)
			drbd_tcp_cork(connection->data.socket);
		else if (!uncork)
			drbd_tcp_uncork(connection->data.socket);
	}
	mutex_unlock(&connection->data.mutex);
}

P
Philipp Reisner 已提交
1885 1886
int drbd_worker(struct drbd_thread *thi)
{
1887
	struct drbd_tconn *tconn = thi->tconn;
P
Philipp Reisner 已提交
1888
	struct drbd_work *w = NULL;
1889
	struct drbd_device *mdev;
P
Philipp Reisner 已提交
1890
	LIST_HEAD(work_list);
1891
	int vnr;
P
Philipp Reisner 已提交
1892

1893
	while (get_t_state(thi) == RUNNING) {
1894
		drbd_thread_current_set_cpu(thi);
P
Philipp Reisner 已提交
1895

1896 1897 1898
		/* as long as we use drbd_queue_work_front(),
		 * we may only dequeue single work items here, not batches. */
		if (list_empty(&work_list))
1899
			wait_for_work(tconn, &work_list);
P
Philipp Reisner 已提交
1900

1901
		if (signal_pending(current)) {
P
Philipp Reisner 已提交
1902
			flush_signals(current);
1903 1904
			if (get_t_state(thi) == RUNNING) {
				conn_warn(tconn, "Worker got an unexpected signal\n");
P
Philipp Reisner 已提交
1905
				continue;
1906
			}
P
Philipp Reisner 已提交
1907 1908 1909
			break;
		}

1910
		if (get_t_state(thi) != RUNNING)
P
Philipp Reisner 已提交
1911 1912
			break;

1913 1914 1915 1916 1917
		while (!list_empty(&work_list)) {
			w = list_first_entry(&work_list, struct drbd_work, list);
			list_del_init(&w->list);
			if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS) == 0)
				continue;
1918 1919
			if (tconn->cstate >= C_WF_REPORT_PARAMS)
				conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
P
Philipp Reisner 已提交
1920 1921 1922
		}
	}

1923
	do {
P
Philipp Reisner 已提交
1924
		while (!list_empty(&work_list)) {
1925
			w = list_first_entry(&work_list, struct drbd_work, list);
P
Philipp Reisner 已提交
1926
			list_del_init(&w->list);
1927
			w->cb(w, 1);
P
Philipp Reisner 已提交
1928
		}
1929
		dequeue_work_batch(&tconn->sender_work, &work_list);
1930
	} while (!list_empty(&work_list));
P
Philipp Reisner 已提交
1931

P
Philipp Reisner 已提交
1932
	rcu_read_lock();
1933
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1934
		D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
P
Philipp Reisner 已提交
1935 1936
		kref_get(&mdev->kref);
		rcu_read_unlock();
1937
		drbd_mdev_cleanup(mdev);
P
Philipp Reisner 已提交
1938 1939
		kref_put(&mdev->kref, &drbd_minor_destroy);
		rcu_read_lock();
1940
	}
P
Philipp Reisner 已提交
1941
	rcu_read_unlock();
P
Philipp Reisner 已提交
1942 1943 1944

	return 0;
}