/*
   drbd_worker.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/scatterlist.h>

#include "drbd_int.h"
#include "drbd_req.h"

41
static int w_make_ov_request(struct drbd_work *w, int cancel);
P
Philipp Reisner 已提交
42 43


/* endio handlers:
 *   drbd_md_io_complete (defined here)
 *   drbd_request_endio (defined here)
 *   drbd_peer_request_endio (defined here)
 *   bm_async_io_complete (defined in drbd_bitmap.c)
 *
 * For all these callbacks, note the following:
 * The callbacks will be called in irq context by the IDE drivers,
 * and in Softirqs/Tasklets/BH context by the SCSI drivers.
 * Try to get the locking right :)
 *
 */


/* About the global_state_lock
   Each state transition on a device holds a read lock. In case we have
   to evaluate the resync-after dependencies, we grab a write lock, because
   we need stable states on all devices for that.  */
rwlock_t global_state_lock;

/* used for synchronous meta data and bitmap IO
 * submitted by drbd_md_sync_page_io()
 */
void drbd_md_io_complete(struct bio *bio, int error)
{
	struct drbd_md_io *md_io;
70
	struct drbd_conf *mdev;
P
Philipp Reisner 已提交
71 72

	md_io = (struct drbd_md_io *)bio->bi_private;
73 74
	mdev = container_of(md_io, struct drbd_conf, md_io);

P
Philipp Reisner 已提交
75 76
	md_io->error = error;

77 78 79 80 81 82 83 84 85 86 87 88
	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
	 * to timeout on the lower level device, and eventually detach from it.
	 * If this io completion runs after that timeout expired, this
	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
	 * During normal operation, this only puts that extra reference
	 * down to 1 again.
	 * Make sure we first drop the reference, and only then signal
	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
	 * next drbd_md_sync_page_io(), that we trigger the
	 * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there.
	 */
	drbd_md_put_buffer(mdev);
89 90 91
	md_io->done = 1;
	wake_up(&mdev->misc_wait);
	bio_put(bio);
92 93
	if (mdev->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
		put_ldev(mdev);
P
Philipp Reisner 已提交
94 95 96 97 98
}

/* reads on behalf of the partner,
 * "submitted" by the receiver
 */
99
static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
P
Philipp Reisner 已提交
100 101
{
	unsigned long flags = 0;
102
	struct drbd_conf *mdev = peer_req->w.mdev;
P
Philipp Reisner 已提交
103

104
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
105 106
	mdev->read_cnt += peer_req->i.size >> 9;
	list_del(&peer_req->w.list);
P
Philipp Reisner 已提交
107 108
	if (list_empty(&mdev->read_ee))
		wake_up(&mdev->ee_wait);
109
	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
110
		__drbd_chk_io_error(mdev, DRBD_READ_ERROR);
111
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
P
Philipp Reisner 已提交
112

113
	drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w);
P
Philipp Reisner 已提交
114 115 116 117
	put_ldev(mdev);
}

/* writes on behalf of the partner, or resync writes,
118
 * "submitted" by the receiver, final stage.  */
119
static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
P
Philipp Reisner 已提交
120 121
{
	unsigned long flags = 0;
122
	struct drbd_conf *mdev = peer_req->w.mdev;
123
	struct drbd_interval i;
P
Philipp Reisner 已提交
124
	int do_wake;
125
	u64 block_id;
P
Philipp Reisner 已提交
126 127
	int do_al_complete_io;

128
	/* after we moved peer_req to done_ee,
P
Philipp Reisner 已提交
129 130 131
	 * we may no longer access it,
	 * it may be freed/reused already!
	 * (as soon as we release the req_lock) */
132
	i = peer_req->i;
133 134
	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
	block_id = peer_req->block_id;
P
Philipp Reisner 已提交
135

136
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
137
	mdev->writ_cnt += peer_req->i.size >> 9;
138
	list_move_tail(&peer_req->w.list, &mdev->done_ee);
P
Philipp Reisner 已提交
139

140
	/*
141
	 * Do not remove from the write_requests tree here: we did not send the
142 143 144 145 146
	 * Ack yet and did not wake possibly waiting conflicting requests.
	 * Removed from the tree from "drbd_process_done_ee" within the
	 * appropriate w.cb (e_end_block/e_end_resync_block) or from
	 * _drbd_clear_done_ee.
	 */
P
Philipp Reisner 已提交
147

148
	do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee);
P
Philipp Reisner 已提交
149

150
	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
151
		__drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
152
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
P
Philipp Reisner 已提交
153

154
	if (block_id == ID_SYNCER)
155
		drbd_rs_complete_io(mdev, i.sector);
P
Philipp Reisner 已提交
156 157 158 159 160

	if (do_wake)
		wake_up(&mdev->ee_wait);

	if (do_al_complete_io)
161
		drbd_al_complete_io(mdev, &i);
P
Philipp Reisner 已提交
162

163
	wake_asender(mdev->tconn);
P
Philipp Reisner 已提交
164
	put_ldev(mdev);
165
}
P
Philipp Reisner 已提交
166

167 168 169
/* writes on behalf of the partner, or resync writes,
 * "submitted" by the receiver.
 */
170
void drbd_peer_request_endio(struct bio *bio, int error)
171
{
172
	struct drbd_peer_request *peer_req = bio->bi_private;
173
	struct drbd_conf *mdev = peer_req->w.mdev;
174 175 176
	int uptodate = bio_flagged(bio, BIO_UPTODATE);
	int is_write = bio_data_dir(bio) == WRITE;

177
	if (error && __ratelimit(&drbd_ratelimit_state))
178 179
		dev_warn(DEV, "%s: error=%d s=%llus\n",
				is_write ? "write" : "read", error,
180
				(unsigned long long)peer_req->i.sector);
181
	if (!error && !uptodate) {
182 183 184
		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
					is_write ? "write" : "read",
185
					(unsigned long long)peer_req->i.sector);
186 187 188 189 190 191 192
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

	if (error)
193
		set_bit(__EE_WAS_ERROR, &peer_req->flags);
194 195

	bio_put(bio); /* no need for the bio anymore */
196
	if (atomic_dec_and_test(&peer_req->pending_bios)) {
197
		if (is_write)
198
			drbd_endio_write_sec_final(peer_req);
199
		else
200
			drbd_endio_read_sec_final(peer_req);
201
	}
P
Philipp Reisner 已提交
202 203 204 205
}

/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
 */
206
void drbd_request_endio(struct bio *bio, int error)
P
Philipp Reisner 已提交
207
{
208
	unsigned long flags;
P
Philipp Reisner 已提交
209
	struct drbd_request *req = bio->bi_private;
210
	struct drbd_conf *mdev = req->w.mdev;
211
	struct bio_and_error m;
P
Philipp Reisner 已提交
212 213 214 215 216 217 218 219 220 221 222 223
	enum drbd_req_event what;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	if (!error && !uptodate) {
		dev_warn(DEV, "p %s: setting error to -EIO\n",
			 bio_data_dir(bio) == WRITE ? "write" : "read");
		/* strange behavior of some lower level drivers...
		 * fail the request by clearing the uptodate flag,
		 * but do not return any error?! */
		error = -EIO;
	}

224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260

	/* If this request was aborted locally before,
	 * but now was completed "successfully",
	 * chances are that this caused arbitrary data corruption.
	 *
	 * "aborting" requests, or force-detaching the disk, is intended for
	 * completely blocked/hung local backing devices which do no longer
	 * complete requests at all, not even do error completions.  In this
	 * situation, usually a hard-reset and failover is the only way out.
	 *
	 * By "aborting", basically faking a local error-completion,
	 * we allow for a more graceful swichover by cleanly migrating services.
	 * Still the affected node has to be rebooted "soon".
	 *
	 * By completing these requests, we allow the upper layers to re-use
	 * the associated data pages.
	 *
	 * If later the local backing device "recovers", and now DMAs some data
	 * from disk into the original request pages, in the best case it will
	 * just put random data into unused pages; but typically it will corrupt
	 * meanwhile completely unrelated data, causing all sorts of damage.
	 *
	 * Which means delayed successful completion,
	 * especially for READ requests,
	 * is a reason to panic().
	 *
	 * We assume that a delayed *error* completion is OK,
	 * though we still will complain noisily about it.
	 */
	if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_emerg(DEV, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");

		if (!error)
			panic("possible random memory corruption caused by delayed completion of aborted local request\n");
	}

P
Philipp Reisner 已提交
261 262 263
	/* to avoid recursion in __req_mod */
	if (unlikely(error)) {
		what = (bio_data_dir(bio) == WRITE)
264
			? WRITE_COMPLETED_WITH_ERROR
265
			: (bio_rw(bio) == READ)
266 267
			  ? READ_COMPLETED_WITH_ERROR
			  : READ_AHEAD_COMPLETED_WITH_ERROR;
P
Philipp Reisner 已提交
268
	} else
269
		what = COMPLETED_OK;
P
Philipp Reisner 已提交
270 271 272 273

	bio_put(req->private_bio);
	req->private_bio = ERR_PTR(error);

274
	/* not req_mod(), we need irqsave here! */
275
	spin_lock_irqsave(&mdev->tconn->req_lock, flags);
276
	__req_mod(req, what, &m);
277
	spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
278
	put_ldev(mdev);
279 280 281

	if (m.bio)
		complete_master_bio(mdev, &m);
P
Philipp Reisner 已提交
282 283
}

284
void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm,
285
		  struct drbd_peer_request *peer_req, void *digest)
286 287 288
{
	struct hash_desc desc;
	struct scatterlist sg;
289
	struct page *page = peer_req->pages;
290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
	struct page *tmp;
	unsigned len;

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

	while ((tmp = page_chain_next(page))) {
		/* all but the last page will be fully used */
		sg_set_page(&sg, page, PAGE_SIZE, 0);
		crypto_hash_update(&desc, &sg, sg.length);
		page = tmp;
	}
	/* and now the last, possibly only partially used page */
306
	len = peer_req->i.size & (PAGE_SIZE - 1);
307 308 309 310 311 312
	sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
	crypto_hash_update(&desc, &sg, sg.length);
	crypto_hash_final(&desc, digest);
}

void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
P
Philipp Reisner 已提交
313 314 315
{
	struct hash_desc desc;
	struct scatterlist sg;
316 317
	struct bio_vec bvec;
	struct bvec_iter iter;
P
Philipp Reisner 已提交
318 319 320 321 322 323 324

	desc.tfm = tfm;
	desc.flags = 0;

	sg_init_table(&sg, 1);
	crypto_hash_init(&desc);

325 326
	bio_for_each_segment(bvec, bio, iter) {
		sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
P
Philipp Reisner 已提交
327 328 329 330 331
		crypto_hash_update(&desc, &sg, sg.length);
	}
	crypto_hash_final(&desc, digest);
}

332
/* MAYBE merge common code with w_e_end_ov_req */
333
static int w_e_send_csum(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
334
{
335 336
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
	struct drbd_conf *mdev = w->mdev;
P
Philipp Reisner 已提交
337 338
	int digest_size;
	void *digest;
339
	int err = 0;
P
Philipp Reisner 已提交
340

341 342
	if (unlikely(cancel))
		goto out;
P
Philipp Reisner 已提交
343

344
	if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
345
		goto out;
P
Philipp Reisner 已提交
346

347
	digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
348 349
	digest = kmalloc(digest_size, GFP_NOIO);
	if (digest) {
350 351
		sector_t sector = peer_req->i.sector;
		unsigned int size = peer_req->i.size;
352
		drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
353
		/* Free peer_req and pages before send.
354 355 356
		 * In case we block on congestion, we could otherwise run into
		 * some distributed deadlock, if the other side blocks on
		 * congestion as well, because our receiver blocks in
357
		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
358
		drbd_free_peer_req(mdev, peer_req);
359
		peer_req = NULL;
360
		inc_rs_pending(mdev);
361
		err = drbd_send_drequest_csum(mdev, sector, size,
362 363
					      digest, digest_size,
					      P_CSUM_RS_REQUEST);
364 365 366
		kfree(digest);
	} else {
		dev_err(DEV, "kmalloc() of digest failed.\n");
367
		err = -ENOMEM;
368
	}
P
Philipp Reisner 已提交
369

370
out:
371
	if (peer_req)
372
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
373

374
	if (unlikely(err))
P
Philipp Reisner 已提交
375
		dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
376
	return err;
P
Philipp Reisner 已提交
377 378 379 380 381 382
}

#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)

/* Submit a local read of [sector, sector + size) whose completion will
 * checksum the data and send it to the peer (via w_e_send_csum).
 *
 * Returns 0 if the read was submitted (the local-disk reference taken
 * here is not dropped on this path), -EIO if no local disk reference
 * could be obtained, or -EAGAIN if the request should be retried later.
 */
static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
	spinlock_t *req_lock = &mdev->tconn->req_lock;
	struct drbd_peer_request *peer_req;

	if (!get_ldev(mdev))
		return -EIO;

	if (drbd_rs_should_slow_down(mdev, sector))
		goto defer;

	/* GFP_TRY, because if there is no memory available right now, this may
	 * be rescheduled for later. It is "only" background resync, after all. */
	peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector,
				       size, GFP_TRY);
	if (!peer_req)
		goto defer;

	peer_req->w.cb = w_e_send_csum;

	spin_lock_irq(req_lock);
	list_add(&peer_req->w.list, &mdev->read_ee);
	spin_unlock_irq(req_lock);

	atomic_add(size >> 9, &mdev->rs_sect_ev);
	if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
		return 0;

	/* If it failed because of ENOMEM, retry should help.  If it failed
	 * because bio_add_page failed (probably broken lower level driver),
	 * retry may or may not help.
	 * If it does not, you may need to force disconnect. */
	spin_lock_irq(req_lock);
	list_del(&peer_req->w.list);
	spin_unlock_irq(req_lock);

	drbd_free_peer_req(mdev, peer_req);

defer:
	put_ldev(mdev);
	return -EAGAIN;
}

421
int w_resync_timer(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
422
{
423
	struct drbd_conf *mdev = w->mdev;
424 425
	switch (mdev->state.conn) {
	case C_VERIFY_S:
426
		w_make_ov_request(w, cancel);
427 428
		break;
	case C_SYNC_TARGET:
429
		w_make_resync_request(w, cancel);
430
		break;
P
Philipp Reisner 已提交
431 432
	}

433
	return 0;
434 435 436 437 438 439 440
}

/* Timer function: queue the device's resync work on the sender work
 * queue, unless its list head shows it is already linked somewhere.
 *
 * @data: the struct drbd_conf pointer, passed as unsigned long.
 */
void resync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	if (!list_empty(&mdev->resync_work.list))
		return;

	drbd_queue_work(&mdev->tconn->sender_work, &mdev->resync_work);
}

444 445 446 447 448
/* Overwrite every slot of the fifo with @value. */
static void fifo_set(struct fifo_buffer *fb, int value)
{
	int slot = 0;

	while (slot < fb->size) {
		fb->values[slot] = value;
		slot++;
	}
}

/* Store @value at the head position, advance the head (wrapping to 0 at
 * fb->size), and return the value that was stored there before.
 */
static int fifo_push(struct fifo_buffer *fb, int value)
{
	int *slot = &fb->values[fb->head_index];
	int previous = *slot;

	*slot = value;

	fb->head_index++;
	if (fb->head_index >= fb->size)
		fb->head_index = 0;

	return previous;
}

/* Add @value to every slot of the fifo. */
static void fifo_add_val(struct fifo_buffer *fb, int value)
{
	int slot = 0;

	while (slot < fb->size) {
		fb->values[slot] += value;
		slot++;
	}
}

473 474 475 476
struct fifo_buffer *fifo_alloc(int fifo_size)
{
	struct fifo_buffer *fb;

477
	fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
478 479 480 481 482 483 484 485 486 487
	if (!fb)
		return NULL;

	fb->head_index = 0;
	fb->size = fifo_size;
	fb->total = 0;

	return fb;
}

488
static int drbd_rs_controller(struct drbd_conf *mdev)
489
{
P
Philipp Reisner 已提交
490
	struct disk_conf *dc;
491 492 493 494 495 496 497 498
	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
	unsigned int want;     /* The number of sectors we want in the proxy */
	int req_sect; /* Number of sectors to request in this turn */
	int correction; /* Number of sectors more we need in the proxy*/
	int cps; /* correction per invocation of drbd_rs_controller() */
	int steps; /* Number of time steps to plan ahead */
	int curr_corr;
	int max_sect;
P
Philipp Reisner 已提交
499
	struct fifo_buffer *plan;
500 501 502 503

	sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
	mdev->rs_in_flight -= sect_in;

P
Philipp Reisner 已提交
504
	dc = rcu_dereference(mdev->ldev->disk_conf);
P
Philipp Reisner 已提交
505
	plan = rcu_dereference(mdev->rs_plan_s);
506

P
Philipp Reisner 已提交
507
	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
508 509

	if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
P
Philipp Reisner 已提交
510
		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
511
	} else { /* normal path */
P
Philipp Reisner 已提交
512 513
		want = dc->c_fill_target ? dc->c_fill_target :
			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
514 515
	}

P
Philipp Reisner 已提交
516
	correction = want - mdev->rs_in_flight - plan->total;
517 518 519

	/* Plan ahead */
	cps = correction / steps;
P
Philipp Reisner 已提交
520 521
	fifo_add_val(plan, cps);
	plan->total += cps * steps;
522 523

	/* What we do in this step */
P
Philipp Reisner 已提交
524 525
	curr_corr = fifo_push(plan, 0);
	plan->total -= curr_corr;
526 527 528 529 530

	req_sect = sect_in + curr_corr;
	if (req_sect < 0)
		req_sect = 0;

P
Philipp Reisner 已提交
531
	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
532 533 534 535 536 537 538 539 540 541 542 543
	if (req_sect > max_sect)
		req_sect = max_sect;

	/*
	dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
		 sect_in, mdev->rs_in_flight, want, correction,
		 steps, cps, mdev->rs_planed, curr_corr, req_sect);
	*/

	return req_sect;
}

544
static int drbd_rs_number_requests(struct drbd_conf *mdev)
545 546
{
	int number;
P
Philipp Reisner 已提交
547 548 549

	rcu_read_lock();
	if (rcu_dereference(mdev->rs_plan_s)->size) {
550 551 552
		number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
		mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
	} else {
P
Philipp Reisner 已提交
553
		mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate;
554 555
		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
	}
P
Philipp Reisner 已提交
556
	rcu_read_unlock();
557 558 559 560 561 562

	/* ignore the amount of pending requests, the resync controller should
	 * throttle down to incoming reply rate soon enough anyways. */
	return number;
}

563
int w_make_resync_request(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
564
{
565
	struct drbd_conf *mdev = w->mdev;
P
Philipp Reisner 已提交
566 567 568
	unsigned long bit;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
569
	int max_bio_size;
570
	int number, rollback_i, size;
P
Philipp Reisner 已提交
571
	int align, queued, sndbuf;
572
	int i = 0;
P
Philipp Reisner 已提交
573 574

	if (unlikely(cancel))
575
		return 0;
P
Philipp Reisner 已提交
576

577 578 579
	if (mdev->rs_total == 0) {
		/* empty resync? */
		drbd_resync_finished(mdev);
580
		return 0;
581 582
	}

P
Philipp Reisner 已提交
583 584 585 586 587 588
	if (!get_ldev(mdev)) {
		/* Since we only need to access mdev->rsync a
		   get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
		   to continue resync with a broken disk makes no sense at
		   all */
		dev_err(DEV, "Disk broke down during resync!\n");
589
		return 0;
P
Philipp Reisner 已提交
590 591
	}

592
	max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
593 594
	number = drbd_rs_number_requests(mdev);
	if (number == 0)
595
		goto requeue;
P
Philipp Reisner 已提交
596 597 598

	for (i = 0; i < number; i++) {
		/* Stop generating RS requests, when half of the send buffer is filled */
599 600 601 602
		mutex_lock(&mdev->tconn->data.mutex);
		if (mdev->tconn->data.socket) {
			queued = mdev->tconn->data.socket->sk->sk_wmem_queued;
			sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf;
P
Philipp Reisner 已提交
603 604 605 606
		} else {
			queued = 1;
			sndbuf = 0;
		}
607
		mutex_unlock(&mdev->tconn->data.mutex);
P
Philipp Reisner 已提交
608 609 610 611 612 613 614
		if (queued > sndbuf / 2)
			goto requeue;

next_sector:
		size = BM_BLOCK_SIZE;
		bit  = drbd_bm_find_next(mdev, mdev->bm_resync_fo);

615
		if (bit == DRBD_END_OF_BITMAP) {
P
Philipp Reisner 已提交
616 617
			mdev->bm_resync_fo = drbd_bm_bits(mdev);
			put_ldev(mdev);
618
			return 0;
P
Philipp Reisner 已提交
619 620 621 622
		}

		sector = BM_BIT_TO_SECT(bit);

623 624
		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
P
Philipp Reisner 已提交
625 626 627 628 629 630 631 632 633 634
			mdev->bm_resync_fo = bit;
			goto requeue;
		}
		mdev->bm_resync_fo = bit + 1;

		if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
			drbd_rs_complete_io(mdev, sector);
			goto next_sector;
		}

635
#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
P
Philipp Reisner 已提交
636 637 638 639 640 641 642
		/* try to find some adjacent bits.
		 * we stop if we have already the maximum req size.
		 *
		 * Additionally always align bigger requests, in order to
		 * be prepared for all stripe sizes of software RAIDs.
		 */
		align = 1;
643
		rollback_i = i;
P
Philipp Reisner 已提交
644
		for (;;) {
645
			if (size + BM_BLOCK_SIZE > max_bio_size)
P
Philipp Reisner 已提交
646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676
				break;

			/* Be always aligned */
			if (sector & ((1<<(align+3))-1))
				break;

			/* do not cross extent boundaries */
			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
				break;
			/* now, is it actually dirty, after all?
			 * caution, drbd_bm_test_bit is tri-state for some
			 * obscure reason; ( b == 0 ) would get the out-of-band
			 * only accidentally right because of the "oddly sized"
			 * adjustment below */
			if (drbd_bm_test_bit(mdev, bit+1) != 1)
				break;
			bit++;
			size += BM_BLOCK_SIZE;
			if ((BM_BLOCK_SIZE << align) <= size)
				align++;
			i++;
		}
		/* if we merged some,
		 * reset the offset to start the next drbd_bm_find_next from */
		if (size > BM_BLOCK_SIZE)
			mdev->bm_resync_fo = bit + 1;
#endif

		/* adjust very last sectors, in case we are oddly sized */
		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;
677
		if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) {
P
Philipp Reisner 已提交
678
			switch (read_for_csum(mdev, sector, size)) {
679
			case -EIO: /* Disk failure */
P
Philipp Reisner 已提交
680
				put_ldev(mdev);
681
				return -EIO;
682
			case -EAGAIN: /* allocation failed, or ldev busy */
P
Philipp Reisner 已提交
683 684
				drbd_rs_complete_io(mdev, sector);
				mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
685
				i = rollback_i;
P
Philipp Reisner 已提交
686
				goto requeue;
687 688 689 690 691
			case 0:
				/* everything ok */
				break;
			default:
				BUG();
P
Philipp Reisner 已提交
692 693
			}
		} else {
694 695
			int err;

P
Philipp Reisner 已提交
696
			inc_rs_pending(mdev);
697 698 699
			err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
						 sector, size, ID_SYNCER);
			if (err) {
P
Philipp Reisner 已提交
700 701 702
				dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
				dec_rs_pending(mdev);
				put_ldev(mdev);
703
				return err;
P
Philipp Reisner 已提交
704 705 706 707 708 709 710 711 712 713 714 715
			}
		}
	}

	if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
		/* last syncer _request_ was sent,
		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
		 * next sync group will resume), as soon as we receive the last
		 * resync data block, and the last bit is cleared.
		 * until then resync "work" is "inactive" ...
		 */
		put_ldev(mdev);
716
		return 0;
P
Philipp Reisner 已提交
717 718 719
	}

 requeue:
720
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
P
Philipp Reisner 已提交
721 722
	mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
	put_ldev(mdev);
723
	return 0;
P
Philipp Reisner 已提交
724 725
}

726
static int w_make_ov_request(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
727
{
728
	struct drbd_conf *mdev = w->mdev;
P
Philipp Reisner 已提交
729 730 731
	int number, i, size;
	sector_t sector;
	const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
732
	bool stop_sector_reached = false;
P
Philipp Reisner 已提交
733 734 735 736

	if (unlikely(cancel))
		return 1;

737
	number = drbd_rs_number_requests(mdev);
P
Philipp Reisner 已提交
738 739 740

	sector = mdev->ov_position;
	for (i = 0; i < number; i++) {
741
		if (sector >= capacity)
P
Philipp Reisner 已提交
742
			return 1;
743 744 745 746 747 748 749 750 751

		/* We check for "finished" only in the reply path:
		 * w_e_end_ov_reply().
		 * We need to send at least one request out. */
		stop_sector_reached = i > 0
			&& verify_can_do_stop_sector(mdev)
			&& sector >= mdev->ov_stop_sector;
		if (stop_sector_reached)
			break;
P
Philipp Reisner 已提交
752 753 754

		size = BM_BLOCK_SIZE;

755 756
		if (drbd_rs_should_slow_down(mdev, sector) ||
		    drbd_try_rs_begin_io(mdev, sector)) {
P
Philipp Reisner 已提交
757 758 759 760 761 762 763 764
			mdev->ov_position = sector;
			goto requeue;
		}

		if (sector + (size>>9) > capacity)
			size = (capacity-sector)<<9;

		inc_rs_pending(mdev);
765
		if (drbd_send_ov_request(mdev, sector, size)) {
P
Philipp Reisner 已提交
766 767 768 769 770 771 772 773
			dec_rs_pending(mdev);
			return 0;
		}
		sector += BM_SECT_PER_BIT;
	}
	mdev->ov_position = sector;

 requeue:
774
	mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
775 776
	if (i == 0 || !stop_sector_reached)
		mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
P
Philipp Reisner 已提交
777 778 779
	return 1;
}

780
int w_ov_finished(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
781
{
782
	struct drbd_conf *mdev = w->mdev;
P
Philipp Reisner 已提交
783
	kfree(w);
784
	ov_out_of_sync_print(mdev);
P
Philipp Reisner 已提交
785 786
	drbd_resync_finished(mdev);

787
	return 0;
P
Philipp Reisner 已提交
788 789
}

790
static int w_resync_finished(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
791
{
792
	struct drbd_conf *mdev = w->mdev;
P
Philipp Reisner 已提交
793 794 795 796
	kfree(w);

	drbd_resync_finished(mdev);

797
	return 0;
P
Philipp Reisner 已提交
798 799
}

800 801
static void ping_peer(struct drbd_conf *mdev)
{
802 803 804 805 806 807
	struct drbd_tconn *tconn = mdev->tconn;

	clear_bit(GOT_PING_ACK, &tconn->flags);
	request_ping(tconn);
	wait_event(tconn->ping_wait,
		   test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED);
808 809
}

P
Philipp Reisner 已提交
810 811 812 813 814 815 816
int drbd_resync_finished(struct drbd_conf *mdev)
{
	unsigned long db, dt, dbdt;
	unsigned long n_oos;
	union drbd_state os, ns;
	struct drbd_work *w;
	char *khelper_cmd = NULL;
817
	int verify_done = 0;
P
Philipp Reisner 已提交
818 819 820 821 822 823 824 825 826 827

	/* Remove all elements from the resync LRU. Since future actions
	 * might set bits in the (main) bitmap, then the entries in the
	 * resync LRU would be wrong. */
	if (drbd_rs_del_all(mdev)) {
		/* In case this is not possible now, most probably because
		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
		 * queue (or even the read operations for those packets
		 * is not finished by now).   Retry in 100ms. */

828
		schedule_timeout_interruptible(HZ / 10);
P
Philipp Reisner 已提交
829 830 831
		w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
		if (w) {
			w->cb = w_resync_finished;
832
			w->mdev = mdev;
833
			drbd_queue_work(&mdev->tconn->sender_work, w);
P
Philipp Reisner 已提交
834 835 836 837 838 839 840 841
			return 1;
		}
		dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
	}

	dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
	if (dt <= 0)
		dt = 1;
842
	
P
Philipp Reisner 已提交
843
	db = mdev->rs_total;
844 845 846 847
	/* adjust for verify start and stop sectors, respective reached position */
	if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
		db -= mdev->ov_left;

P
Philipp Reisner 已提交
848 849 850 851 852 853
	dbdt = Bit2KB(db/dt);
	mdev->rs_paused /= HZ;

	if (!get_ldev(mdev))
		goto out;

854 855
	ping_peer(mdev);

856
	spin_lock_irq(&mdev->tconn->req_lock);
857
	os = drbd_read_state(mdev);
P
Philipp Reisner 已提交
858

859 860
	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);

P
Philipp Reisner 已提交
861 862 863 864 865 866 867 868 869
	/* This protects us against multiple calls (that can happen in the presence
	   of application IO), and against connectivity loss just before we arrive here. */
	if (os.conn <= C_CONNECTED)
		goto out_unlock;

	ns = os;
	ns.conn = C_CONNECTED;

	dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
870
	     verify_done ? "Online verify" : "Resync",
P
Philipp Reisner 已提交
871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886
	     dt + mdev->rs_paused, mdev->rs_paused, dbdt);

	n_oos = drbd_bm_total_weight(mdev);

	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
		if (n_oos) {
			dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
			      n_oos, Bit2KB(1));
			khelper_cmd = "out-of-sync";
		}
	} else {
		D_ASSERT((n_oos - mdev->rs_failed) == 0);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
			khelper_cmd = "after-resync-target";

887
		if (mdev->tconn->csums_tfm && mdev->rs_total) {
P
Philipp Reisner 已提交
888 889 890 891 892
			const unsigned long s = mdev->rs_same_csum;
			const unsigned long t = mdev->rs_total;
			const int ratio =
				(t == 0)     ? 0 :
			(t < 100000) ? ((s*100)/t) : (s/(t/100));
B
Bart Van Assche 已提交
893
			dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; "
P
Philipp Reisner 已提交
894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927
			     "transferred %luK total %luK\n",
			     ratio,
			     Bit2KB(mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total - mdev->rs_same_csum),
			     Bit2KB(mdev->rs_total));
		}
	}

	if (mdev->rs_failed) {
		dev_info(DEV, "            %lu failed blocks\n", mdev->rs_failed);

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			ns.disk = D_INCONSISTENT;
			ns.pdsk = D_UP_TO_DATE;
		} else {
			ns.disk = D_UP_TO_DATE;
			ns.pdsk = D_INCONSISTENT;
		}
	} else {
		ns.disk = D_UP_TO_DATE;
		ns.pdsk = D_UP_TO_DATE;

		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
			if (mdev->p_uuid) {
				int i;
				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
					_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
				drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
				_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
			} else {
				dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
			}
		}

928 929 930 931 932 933 934 935 936 937 938 939
		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
			/* for verify runs, we don't update uuids here,
			 * so there would be nothing to report. */
			drbd_uuid_set_bm(mdev, 0UL);
			drbd_print_uuids(mdev, "updated UUIDs");
			if (mdev->p_uuid) {
				/* Now the two UUID sets are equal, update what we
				 * know of the peer. */
				int i;
				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
					mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
			}
P
Philipp Reisner 已提交
940 941 942 943 944
		}
	}

	_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
945
	spin_unlock_irq(&mdev->tconn->req_lock);
P
Philipp Reisner 已提交
946 947 948 949 950
	put_ldev(mdev);
out:
	mdev->rs_total  = 0;
	mdev->rs_failed = 0;
	mdev->rs_paused = 0;
951 952 953

	/* reset start sector, if we reached end of device */
	if (verify_done && mdev->ov_left == 0)
954
		mdev->ov_start_sector = 0;
P
Philipp Reisner 已提交
955

956 957
	drbd_md_sync(mdev);

P
Philipp Reisner 已提交
958 959 960 961 962 963 964
	if (khelper_cmd)
		drbd_khelper(mdev, khelper_cmd);

	return 1;
}

/* helper */
965
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
P
Philipp Reisner 已提交
966
{
967
	if (drbd_peer_req_has_active_page(peer_req)) {
P
Philipp Reisner 已提交
968
		/* This might happen if sendpage() has not finished */
969
		int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
970 971
		atomic_add(i, &mdev->pp_in_use_by_net);
		atomic_sub(i, &mdev->pp_in_use);
972
		spin_lock_irq(&mdev->tconn->req_lock);
973
		list_add_tail(&peer_req->w.list, &mdev->net_ee);
974
		spin_unlock_irq(&mdev->tconn->req_lock);
975
		wake_up(&drbd_pp_wait);
P
Philipp Reisner 已提交
976
	} else
977
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
978 979 980 981 982 983 984 985
}

/**
 * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
986
int w_e_end_data_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
987
{
988
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
989
	struct drbd_conf *mdev = w->mdev;
990
	int err;
P
Philipp Reisner 已提交
991 992

	if (unlikely(cancel)) {
993
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
994
		dec_unacked(mdev);
995
		return 0;
P
Philipp Reisner 已提交
996 997
	}

998
	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
999
		err = drbd_send_block(mdev, P_DATA_REPLY, peer_req);
P
Philipp Reisner 已提交
1000 1001 1002
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
1003
			    (unsigned long long)peer_req->i.sector);
P
Philipp Reisner 已提交
1004

1005
		err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req);
P
Philipp Reisner 已提交
1006 1007 1008 1009
	}

	dec_unacked(mdev);

1010
	move_to_net_ee_or_free(mdev, peer_req);
P
Philipp Reisner 已提交
1011

1012
	if (unlikely(err))
P
Philipp Reisner 已提交
1013
		dev_err(DEV, "drbd_send_block() failed\n");
1014
	return err;
P
Philipp Reisner 已提交
1015 1016 1017
}

/**
1018
 * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
P
Philipp Reisner 已提交
1019 1020 1021 1022
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
1023
int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1024
{
1025
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1026
	struct drbd_conf *mdev = w->mdev;
1027
	int err;
P
Philipp Reisner 已提交
1028 1029

	if (unlikely(cancel)) {
1030
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1031
		dec_unacked(mdev);
1032
		return 0;
P
Philipp Reisner 已提交
1033 1034 1035
	}

	if (get_ldev_if_state(mdev, D_FAILED)) {
1036
		drbd_rs_complete_io(mdev, peer_req->i.sector);
P
Philipp Reisner 已提交
1037 1038 1039
		put_ldev(mdev);
	}

1040
	if (mdev->state.conn == C_AHEAD) {
1041
		err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req);
1042
	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
P
Philipp Reisner 已提交
1043 1044
		if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
			inc_rs_pending(mdev);
1045
			err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
P
Philipp Reisner 已提交
1046 1047 1048 1049
		} else {
			if (__ratelimit(&drbd_ratelimit_state))
				dev_err(DEV, "Not sending RSDataReply, "
				    "partner DISKLESS!\n");
1050
			err = 0;
P
Philipp Reisner 已提交
1051 1052 1053 1054
		}
	} else {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
1055
			    (unsigned long long)peer_req->i.sector);
P
Philipp Reisner 已提交
1056

1057
		err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
P
Philipp Reisner 已提交
1058 1059

		/* update resync data with failure */
1060
		drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size);
P
Philipp Reisner 已提交
1061 1062 1063 1064
	}

	dec_unacked(mdev);

1065
	move_to_net_ee_or_free(mdev, peer_req);
P
Philipp Reisner 已提交
1066

1067
	if (unlikely(err))
P
Philipp Reisner 已提交
1068
		dev_err(DEV, "drbd_send_block() failed\n");
1069
	return err;
P
Philipp Reisner 已提交
1070 1071
}

1072
int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1073
{
1074
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1075
	struct drbd_conf *mdev = w->mdev;
P
Philipp Reisner 已提交
1076 1077 1078
	struct digest_info *di;
	int digest_size;
	void *digest = NULL;
1079
	int err, eq = 0;
P
Philipp Reisner 已提交
1080 1081

	if (unlikely(cancel)) {
1082
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1083
		dec_unacked(mdev);
1084
		return 0;
P
Philipp Reisner 已提交
1085 1086
	}

1087
	if (get_ldev(mdev)) {
1088
		drbd_rs_complete_io(mdev, peer_req->i.sector);
1089 1090
		put_ldev(mdev);
	}
P
Philipp Reisner 已提交
1091

1092
	di = peer_req->digest;
P
Philipp Reisner 已提交
1093

1094
	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
P
Philipp Reisner 已提交
1095 1096 1097
		/* quick hack to try to avoid a race against reconfiguration.
		 * a real fix would be much more involved,
		 * introducing more locking mechanisms */
1098 1099
		if (mdev->tconn->csums_tfm) {
			digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
P
Philipp Reisner 已提交
1100 1101 1102 1103
			D_ASSERT(digest_size == di->digest_size);
			digest = kmalloc(digest_size, GFP_NOIO);
		}
		if (digest) {
1104
			drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
P
Philipp Reisner 已提交
1105 1106 1107 1108 1109
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}

		if (eq) {
1110
			drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size);
1111
			/* rs_same_csums unit is BM_BLOCK_SIZE */
1112
			mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
1113
			err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req);
P
Philipp Reisner 已提交
1114 1115
		} else {
			inc_rs_pending(mdev);
1116 1117
			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
1118
			kfree(di);
1119
			err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
P
Philipp Reisner 已提交
1120 1121
		}
	} else {
1122
		err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
P
Philipp Reisner 已提交
1123 1124 1125 1126 1127
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
	}

	dec_unacked(mdev);
1128
	move_to_net_ee_or_free(mdev, peer_req);
P
Philipp Reisner 已提交
1129

1130
	if (unlikely(err))
P
Philipp Reisner 已提交
1131
		dev_err(DEV, "drbd_send_block/ack() failed\n");
1132
	return err;
P
Philipp Reisner 已提交
1133 1134
}

1135
int w_e_end_ov_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1136
{
1137
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1138
	struct drbd_conf *mdev = w->mdev;
1139 1140
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
P
Philipp Reisner 已提交
1141 1142
	int digest_size;
	void *digest;
1143
	int err = 0;
P
Philipp Reisner 已提交
1144 1145 1146 1147

	if (unlikely(cancel))
		goto out;

1148
	digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
P
Philipp Reisner 已提交
1149
	digest = kmalloc(digest_size, GFP_NOIO);
1150
	if (!digest) {
1151
		err = 1;	/* terminate the connection in case the allocation failed */
1152
		goto out;
P
Philipp Reisner 已提交
1153 1154
	}

1155
	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
1156
		drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
1157 1158 1159
	else
		memset(digest, 0, digest_size);

1160 1161 1162 1163
	/* Free e and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
1164
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
1165
	drbd_free_peer_req(mdev, peer_req);
1166
	peer_req = NULL;
1167
	inc_rs_pending(mdev);
1168 1169
	err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY);
	if (err)
1170 1171 1172
		dec_rs_pending(mdev);
	kfree(digest);

P
Philipp Reisner 已提交
1173
out:
1174
	if (peer_req)
1175
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1176
	dec_unacked(mdev);
1177
	return err;
P
Philipp Reisner 已提交
1178 1179
}

1180
/*
 * Record an out-of-sync area found by online verify.
 *
 * If @sector directly follows the range tracked in ov_last_oos_start /
 * ov_last_oos_size, that range is extended; otherwise a new range is
 * started.  @size is in bytes (converted to 512 byte sectors via >>9).
 * The area is then marked out of sync.
 */
void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size)
{
	if (sector != mdev->ov_last_oos_start + mdev->ov_last_oos_size) {
		/* not contiguous: begin a new range */
		mdev->ov_last_oos_start = sector;
		mdev->ov_last_oos_size = size>>9;
	} else {
		/* contiguous: grow the current range */
		mdev->ov_last_oos_size += size>>9;
	}

	drbd_set_out_of_sync(mdev, sector, size);
}

1191
int w_e_end_ov_reply(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1192
{
1193
	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
1194
	struct drbd_conf *mdev = w->mdev;
P
Philipp Reisner 已提交
1195 1196
	struct digest_info *di;
	void *digest;
1197 1198
	sector_t sector = peer_req->i.sector;
	unsigned int size = peer_req->i.size;
1199
	int digest_size;
1200
	int err, eq = 0;
1201
	bool stop_sector_reached = false;
P
Philipp Reisner 已提交
1202 1203

	if (unlikely(cancel)) {
1204
		drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1205
		dec_unacked(mdev);
1206
		return 0;
P
Philipp Reisner 已提交
1207 1208 1209 1210
	}

	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
	 * the resync lru has been cleaned up already */
1211
	if (get_ldev(mdev)) {
1212
		drbd_rs_complete_io(mdev, peer_req->i.sector);
1213 1214
		put_ldev(mdev);
	}
P
Philipp Reisner 已提交
1215

1216
	di = peer_req->digest;
P
Philipp Reisner 已提交
1217

1218
	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
1219
		digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
P
Philipp Reisner 已提交
1220 1221
		digest = kmalloc(digest_size, GFP_NOIO);
		if (digest) {
1222
			drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
P
Philipp Reisner 已提交
1223 1224 1225 1226 1227 1228 1229

			D_ASSERT(digest_size == di->digest_size);
			eq = !memcmp(digest, di->digest, digest_size);
			kfree(digest);
		}
	}

1230 1231 1232 1233
	/* Free peer_req and pages before send.
	 * In case we block on congestion, we could otherwise run into
	 * some distributed deadlock, if the other side blocks on
	 * congestion as well, because our receiver blocks in
1234
	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
1235
	drbd_free_peer_req(mdev, peer_req);
P
Philipp Reisner 已提交
1236
	if (!eq)
1237
		drbd_ov_out_of_sync_found(mdev, sector, size);
P
Philipp Reisner 已提交
1238
	else
1239
		ov_out_of_sync_print(mdev);
P
Philipp Reisner 已提交
1240

1241
	err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
1242
			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
P
Philipp Reisner 已提交
1243

1244
	dec_unacked(mdev);
P
Philipp Reisner 已提交
1245

1246 1247 1248 1249 1250 1251
	--mdev->ov_left;

	/* let's advance progress step marks only for every other megabyte */
	if ((mdev->ov_left & 0x200) == 0x200)
		drbd_advance_rs_marks(mdev, mdev->ov_left);

1252 1253 1254 1255
	stop_sector_reached = verify_can_do_stop_sector(mdev) &&
		(sector + (size>>9)) >= mdev->ov_stop_sector;

	if (mdev->ov_left == 0 || stop_sector_reached) {
1256
		ov_out_of_sync_print(mdev);
P
Philipp Reisner 已提交
1257 1258 1259
		drbd_resync_finished(mdev);
	}

1260
	return err;
P
Philipp Reisner 已提交
1261 1262
}

1263
int w_prev_work_done(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1264 1265
{
	struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
1266

P
Philipp Reisner 已提交
1267
	complete(&b->done);
1268
	return 0;
P
Philipp Reisner 已提交
1269 1270
}

1271 1272 1273 1274 1275
/* FIXME
 * We need to track the number of pending barrier acks,
 * and to be able to wait for them.
 * See also comment in drbd_adm_attach before drbd_suspend_io.
 */
1276
static int drbd_send_barrier(struct drbd_tconn *tconn)
P
Philipp Reisner 已提交
1277
{
1278
	struct p_barrier *p;
1279
	struct drbd_socket *sock;
P
Philipp Reisner 已提交
1280

1281 1282
	sock = &tconn->data;
	p = conn_prepare_command(tconn, sock);
1283 1284
	if (!p)
		return -EIO;
1285 1286 1287 1288 1289
	p->barrier = tconn->send.current_epoch_nr;
	p->pad = 0;
	tconn->send.current_epoch_writes = 0;

	return conn_send_command(tconn, sock, P_BARRIER, sizeof(*p), NULL, 0);
P
Philipp Reisner 已提交
1290 1291
}

1292
int w_send_write_hint(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1293
{
1294
	struct drbd_conf *mdev = w->mdev;
1295 1296
	struct drbd_socket *sock;

P
Philipp Reisner 已提交
1297
	if (cancel)
1298
		return 0;
1299 1300 1301
	sock = &mdev->tconn->data;
	if (!drbd_prepare_command(mdev, sock))
		return -EIO;
1302
	return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
P
Philipp Reisner 已提交
1303 1304
}

1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325
/* On the first write seen on this connection, initialize the sender's epoch
 * tracking to @epoch with an empty write count.  Later calls are no-ops. */
static void re_init_if_first_write(struct drbd_tconn *tconn, unsigned int epoch)
{
	if (tconn->send.seen_any_write_yet)
		return;

	tconn->send.seen_any_write_yet = true;
	tconn->send.current_epoch_nr = epoch;
	tconn->send.current_epoch_writes = 0;
}

/* If @epoch differs from the epoch the sender is currently in, close the
 * current one: send a barrier if it contained any writes, then switch the
 * sender to @epoch. */
static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch)
{
	/* no write seen on this connection yet: nothing to close */
	if (!tconn->send.seen_any_write_yet)
		return;

	/* still within the same epoch */
	if (tconn->send.current_epoch_nr == epoch)
		return;

	if (tconn->send.current_epoch_writes)
		drbd_send_barrier(tconn);
	tconn->send.current_epoch_nr = epoch;
}

1326
int w_send_out_of_sync(struct drbd_work *w, int cancel)
1327 1328
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
1329
	struct drbd_conf *mdev = w->mdev;
1330
	struct drbd_tconn *tconn = mdev->tconn;
1331
	int err;
1332 1333

	if (unlikely(cancel)) {
1334
		req_mod(req, SEND_CANCELED);
1335
		return 0;
1336 1337
	}

1338 1339 1340 1341
	/* this time, no tconn->send.current_epoch_writes++;
	 * If it was sent, it was the closing barrier for the last
	 * replicated epoch, before we went into AHEAD mode.
	 * No more barriers will be sent, until we leave AHEAD mode again. */
1342
	maybe_send_barrier(tconn, req->epoch);
1343

1344
	err = drbd_send_out_of_sync(mdev, req);
1345
	req_mod(req, OOS_HANDED_TO_NETWORK);
1346

1347
	return err;
1348 1349
}

P
Philipp Reisner 已提交
1350 1351 1352 1353 1354 1355
/**
 * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
1356
int w_send_dblock(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1357 1358
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
1359
	struct drbd_conf *mdev = w->mdev;
1360
	struct drbd_tconn *tconn = mdev->tconn;
1361
	int err;
P
Philipp Reisner 已提交
1362 1363

	if (unlikely(cancel)) {
1364
		req_mod(req, SEND_CANCELED);
1365
		return 0;
P
Philipp Reisner 已提交
1366 1367
	}

1368 1369
	re_init_if_first_write(tconn, req->epoch);
	maybe_send_barrier(tconn, req->epoch);
1370 1371
	tconn->send.current_epoch_writes++;

1372 1373
	err = drbd_send_dblock(mdev, req);
	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
P
Philipp Reisner 已提交
1374

1375
	return err;
P
Philipp Reisner 已提交
1376 1377 1378 1379 1380 1381 1382 1383
}

/**
 * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
 * @mdev:	DRBD device.
 * @w:		work object.
 * @cancel:	The connection will be closed anyways
 */
1384
int w_send_read_req(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1385 1386
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
1387
	struct drbd_conf *mdev = w->mdev;
1388
	struct drbd_tconn *tconn = mdev->tconn;
1389
	int err;
P
Philipp Reisner 已提交
1390 1391

	if (unlikely(cancel)) {
1392
		req_mod(req, SEND_CANCELED);
1393
		return 0;
P
Philipp Reisner 已提交
1394 1395
	}

1396 1397
	/* Even read requests may close a write epoch,
	 * if there was any yet. */
1398
	maybe_send_barrier(tconn, req->epoch);
1399

1400
	err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size,
1401
				 (unsigned long)req);
P
Philipp Reisner 已提交
1402

1403
	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
P
Philipp Reisner 已提交
1404

1405
	return err;
P
Philipp Reisner 已提交
1406 1407
}

1408
int w_restart_disk_io(struct drbd_work *w, int cancel)
1409 1410
{
	struct drbd_request *req = container_of(w, struct drbd_request, w);
1411
	struct drbd_conf *mdev = w->mdev;
1412

1413
	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
1414
		drbd_al_begin_io(mdev, &req->i, false);
1415 1416 1417 1418 1419

	drbd_req_make_private_bio(req, req->master_bio);
	req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
	generic_make_request(req->private_bio);

1420
	return 0;
1421 1422
}

P
Philipp Reisner 已提交
1423 1424 1425
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
	struct drbd_conf *odev = mdev;
1426
	int resync_after;
P
Philipp Reisner 已提交
1427 1428

	while (1) {
1429
		if (!odev->ldev || odev->state.disk == D_DISKLESS)
1430
			return 1;
P
Philipp Reisner 已提交
1431
		rcu_read_lock();
1432
		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
P
Philipp Reisner 已提交
1433
		rcu_read_unlock();
1434
		if (resync_after == -1)
P
Philipp Reisner 已提交
1435
			return 1;
1436
		odev = minor_to_mdev(resync_after);
1437
		if (!odev)
1438
			return 1;
P
Philipp Reisner 已提交
1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457
		if ((odev->state.conn >= C_SYNC_SOURCE &&
		     odev->state.conn <= C_PAUSED_SYNC_T) ||
		    odev->state.aftr_isp || odev->state.peer_isp ||
		    odev->state.user_isp)
			return 0;
	}
}

/**
 * _drbd_pause_after() - Pause resync on all devices that may not resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and after_state_ch).
 */
static int _drbd_pause_after(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

1458
	rcu_read_lock();
1459
	idr_for_each_entry(&minors, odev, i) {
P
Philipp Reisner 已提交
1460 1461 1462 1463 1464 1465
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (!_drbd_may_sync_now(odev))
			rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
			       != SS_NOTHING_TO_DO);
	}
1466
	rcu_read_unlock();
P
Philipp Reisner 已提交
1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481

	return rv;
}

/**
 * _drbd_resume_next() - Resume resync on all devices that may resync now
 * @mdev:	DRBD device.
 *
 * Called from process context only (admin command and worker).
 */
static int _drbd_resume_next(struct drbd_conf *mdev)
{
	struct drbd_conf *odev;
	int i, rv = 0;

1482
	rcu_read_lock();
1483
	idr_for_each_entry(&minors, odev, i) {
P
Philipp Reisner 已提交
1484 1485 1486 1487 1488 1489 1490 1491 1492
		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
			continue;
		if (odev->state.aftr_isp) {
			if (_drbd_may_sync_now(odev))
				rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
							CS_HARD, NULL)
				       != SS_NOTHING_TO_DO) ;
		}
	}
1493
	rcu_read_unlock();
P
Philipp Reisner 已提交
1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510
	return rv;
}

/* Run _drbd_resume_next() with global_state_lock write-locked
 * (interrupts disabled for the duration). */
void resume_next_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_resume_next(mdev);
	write_unlock_irq(&global_state_lock);
}

/* Run _drbd_pause_after() with global_state_lock write-locked
 * (interrupts disabled for the duration). */
void suspend_other_sg(struct drbd_conf *mdev)
{
	write_lock_irq(&global_state_lock);
	_drbd_pause_after(mdev);
	write_unlock_irq(&global_state_lock);
}

1511
/* caller must hold global_state_lock */
1512
enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
P
Philipp Reisner 已提交
1513 1514
{
	struct drbd_conf *odev;
1515
	int resync_after;
P
Philipp Reisner 已提交
1516 1517 1518

	if (o_minor == -1)
		return NO_ERROR;
1519
	if (o_minor < -1 || o_minor > MINORMASK)
1520
		return ERR_RESYNC_AFTER;
P
Philipp Reisner 已提交
1521 1522 1523 1524 1525

	/* check for loops */
	odev = minor_to_mdev(o_minor);
	while (1) {
		if (odev == mdev)
1526
			return ERR_RESYNC_AFTER_CYCLE;
P
Philipp Reisner 已提交
1527

1528 1529 1530 1531 1532 1533 1534 1535 1536
		/* You are free to depend on diskless, non-existing,
		 * or not yet/no longer existing minors.
		 * We only reject dependency loops.
		 * We cannot follow the dependency chain beyond a detached or
		 * missing minor.
		 */
		if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
			return NO_ERROR;

P
Philipp Reisner 已提交
1537
		rcu_read_lock();
1538
		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
P
Philipp Reisner 已提交
1539
		rcu_read_unlock();
P
Philipp Reisner 已提交
1540
		/* dependency chain ends here, no cycles. */
1541
		if (resync_after == -1)
P
Philipp Reisner 已提交
1542 1543 1544
			return NO_ERROR;

		/* follow the dependency chain */
1545
		odev = minor_to_mdev(resync_after);
P
Philipp Reisner 已提交
1546 1547 1548
	}
}

1549
/* caller must hold global_state_lock
 *
 * Re-evaluate which devices have to pause and which may resume their resync;
 * repeat until a full pass changes nothing. */
void drbd_resync_after_changed(struct drbd_conf *mdev)
{
	for (;;) {
		int paused = _drbd_pause_after(mdev);
		int resumed = _drbd_resume_next(mdev);

		if (!(paused | resumed))
			break;
	}
}

1560 1561
void drbd_rs_controller_reset(struct drbd_conf *mdev)
{
P
Philipp Reisner 已提交
1562 1563
	struct fifo_buffer *plan;

1564 1565 1566
	atomic_set(&mdev->rs_sect_in, 0);
	atomic_set(&mdev->rs_sect_ev, 0);
	mdev->rs_in_flight = 0;
P
Philipp Reisner 已提交
1567 1568 1569 1570 1571 1572 1573 1574 1575 1576

	/* Updating the RCU protected object in place is necessary since
	   this function gets called from atomic context.
	   It is valid since all other updates also lead to an completely
	   empty fifo */
	rcu_read_lock();
	plan = rcu_dereference(mdev->rs_plan_s);
	plan->total = 0;
	fifo_set(plan, 0);
	rcu_read_unlock();
1577 1578
}

P
Philipp Reisner 已提交
1579 1580 1581 1582
void start_resync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

1583
	drbd_queue_work(&mdev->tconn->sender_work, &mdev->start_resync_work);
P
Philipp Reisner 已提交
1584 1585
}

1586
int w_start_resync(struct drbd_work *w, int cancel)
P
Philipp Reisner 已提交
1587
{
1588 1589
	struct drbd_conf *mdev = w->mdev;

P
Philipp Reisner 已提交
1590 1591 1592 1593
	if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
		dev_warn(DEV, "w_start_resync later...\n");
		mdev->start_resync_timer.expires = jiffies + HZ/10;
		add_timer(&mdev->start_resync_timer);
1594
		return 0;
P
Philipp Reisner 已提交
1595 1596 1597
	}

	drbd_start_resync(mdev, C_SYNC_SOURCE);
1598
	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
1599
	return 0;
P
Philipp Reisner 已提交
1600 1601
}

P
Philipp Reisner 已提交
1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614
/**
 * drbd_start_resync() - Start the resync process
 * @mdev:	DRBD device.
 * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
 *
 * This function might bring you directly into one of the
 * C_PAUSED_SYNC_* states.
 */
void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
{
	union drbd_state ns;
	int r;

1615
	if (mdev->state.conn >= C_SYNC_SOURCE && mdev->state.conn < C_AHEAD) {
P
Philipp Reisner 已提交
1616 1617 1618 1619
		dev_err(DEV, "Resync already running!\n");
		return;
	}

1620 1621 1622 1623 1624 1625 1626 1627 1628
	if (!test_bit(B_RS_H_DONE, &mdev->flags)) {
		if (side == C_SYNC_TARGET) {
			/* Since application IO was locked out during C_WF_BITMAP_T and
			   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
			   we check that we might make the data inconsistent. */
			r = drbd_khelper(mdev, "before-resync-target");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				dev_info(DEV, "before-resync-target handler returned %d, "
1629
					 "dropping connection.\n", r);
1630
				conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
1631 1632
				return;
			}
1633 1634 1635 1636 1637 1638 1639 1640 1641 1642
		} else /* C_SYNC_SOURCE */ {
			r = drbd_khelper(mdev, "before-resync-source");
			r = (r >> 8) & 0xff;
			if (r > 0) {
				if (r == 3) {
					dev_info(DEV, "before-resync-source handler returned %d, "
						 "ignoring. Old userland tools?", r);
				} else {
					dev_info(DEV, "before-resync-source handler returned %d, "
						 "dropping connection.\n", r);
1643
					conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
1644 1645 1646
					return;
				}
			}
1647
		}
P
Philipp Reisner 已提交
1648 1649
	}

1650
	if (current == mdev->tconn->worker.task) {
1651
		/* The worker should not sleep waiting for state_mutex,
1652
		   that can take long */
1653
		if (!mutex_trylock(mdev->state_mutex)) {
1654 1655 1656 1657 1658 1659
			set_bit(B_RS_H_DONE, &mdev->flags);
			mdev->start_resync_timer.expires = jiffies + HZ/5;
			add_timer(&mdev->start_resync_timer);
			return;
		}
	} else {
1660
		mutex_lock(mdev->state_mutex);
1661 1662
	}
	clear_bit(B_RS_H_DONE, &mdev->flags);
P
Philipp Reisner 已提交
1663

1664
	write_lock_irq(&global_state_lock);
1665 1666 1667
	/* Did some connection breakage or IO error race with us? */
	if (mdev->state.conn < C_CONNECTED
	|| !get_ldev_if_state(mdev, D_NEGOTIATING)) {
1668
		write_unlock_irq(&global_state_lock);
1669
		mutex_unlock(mdev->state_mutex);
P
Philipp Reisner 已提交
1670 1671 1672
		return;
	}

1673
	ns = drbd_read_state(mdev);
P
Philipp Reisner 已提交
1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684

	ns.aftr_isp = !_drbd_may_sync_now(mdev);

	ns.conn = side;

	if (side == C_SYNC_TARGET)
		ns.disk = D_INCONSISTENT;
	else /* side == C_SYNC_SOURCE */
		ns.pdsk = D_INCONSISTENT;

	r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
1685
	ns = drbd_read_state(mdev);
P
Philipp Reisner 已提交
1686 1687 1688 1689 1690

	if (ns.conn < C_CONNECTED)
		r = SS_UNKNOWN_ERROR;

	if (r == SS_SUCCESS) {
1691 1692 1693 1694
		unsigned long tw = drbd_bm_total_weight(mdev);
		unsigned long now = jiffies;
		int i;

P
Philipp Reisner 已提交
1695 1696 1697
		mdev->rs_failed    = 0;
		mdev->rs_paused    = 0;
		mdev->rs_same_csum = 0;
1698 1699
		mdev->rs_last_events = 0;
		mdev->rs_last_sect_ev = 0;
1700 1701 1702 1703 1704 1705
		mdev->rs_total     = tw;
		mdev->rs_start     = now;
		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
			mdev->rs_mark_left[i] = tw;
			mdev->rs_mark_time[i] = now;
		}
P
Philipp Reisner 已提交
1706 1707 1708
		_drbd_pause_after(mdev);
	}
	write_unlock_irq(&global_state_lock);
1709

P
Philipp Reisner 已提交
1710
	if (r == SS_SUCCESS) {
1711 1712 1713 1714
		/* reset rs_last_bcast when a resync or verify is started,
		 * to deal with potential jiffies wrap. */
		mdev->rs_last_bcast = jiffies - HZ;

P
Philipp Reisner 已提交
1715 1716 1717 1718
		dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
		     drbd_conn_str(ns.conn),
		     (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
		     (unsigned long) mdev->rs_total);
1719 1720 1721 1722 1723 1724 1725 1726 1727 1728
		if (side == C_SYNC_TARGET)
			mdev->bm_resync_fo = 0;

		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
		 * with w_send_oos, or the sync target will get confused as to
		 * how much bits to resync.  We cannot do that always, because for an
		 * empty resync and protocol < 95, we need to do it here, as we call
		 * drbd_resync_finished from here in that case.
		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
		 * and from after_state_ch otherwise. */
1729
		if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96)
1730
			drbd_gen_and_send_sync_uuid(mdev);
P
Philipp Reisner 已提交
1731

1732
		if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) {
1733 1734 1735 1736 1737 1738 1739 1740 1741 1742
			/* This still has a race (about when exactly the peers
			 * detect connection loss) that can lead to a full sync
			 * on next handshake. In 8.3.9 we fixed this with explicit
			 * resync-finished notifications, but the fix
			 * introduces a protocol change.  Sleeping for some
			 * time longer than the ping interval + timeout on the
			 * SyncSource, to give the SyncTarget the chance to
			 * detect connection loss, then waiting for a ping
			 * response (implicit in drbd_resync_finished) reduces
			 * the race considerably, but does not solve it. */
1743 1744 1745 1746 1747 1748 1749 1750 1751 1752
			if (side == C_SYNC_SOURCE) {
				struct net_conf *nc;
				int timeo;

				rcu_read_lock();
				nc = rcu_dereference(mdev->tconn->net_conf);
				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
				rcu_read_unlock();
				schedule_timeout_interruptible(timeo);
			}
P
Philipp Reisner 已提交
1753 1754 1755
			drbd_resync_finished(mdev);
		}

1756
		drbd_rs_controller_reset(mdev);
P
Philipp Reisner 已提交
1757 1758 1759 1760 1761 1762 1763 1764 1765
		/* ns.conn may already be != mdev->state.conn,
		 * we may have been paused in between, or become paused until
		 * the timer triggers.
		 * No matter, that is handled in resync_timer_fn() */
		if (ns.conn == C_SYNC_TARGET)
			mod_timer(&mdev->resync_timer, jiffies);

		drbd_md_sync(mdev);
	}
1766
	put_ldev(mdev);
1767
	mutex_unlock(mdev->state_mutex);
P
Philipp Reisner 已提交
1768 1769
}

1770 1771 1772 1773
/* If the resource already closed the current epoch, but we did not
 * (because we have not yet seen new requests), we should send the
 * corresponding barrier now.  Must be checked within the same spinlock
 * that is used to check for new requests. */
1774
static bool need_to_send_barrier(struct drbd_tconn *connection)
1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797
{
	if (!connection->send.seen_any_write_yet)
		return false;

	/* Skip barriers that do not contain any writes.
	 * This may happen during AHEAD mode. */
	if (!connection->send.current_epoch_writes)
		return false;

	/* ->req_lock is held when requests are queued on
	 * connection->sender_work, and put into ->transfer_log.
	 * It is also held when ->current_tle_nr is increased.
	 * So either there are already new requests queued,
	 * and corresponding barriers will be send there.
	 * Or nothing new is queued yet, so the difference will be 1.
	 */
	if (atomic_read(&connection->current_tle_nr) !=
	    connection->send.current_epoch_nr + 1)
		return false;

	return true;
}

1798
static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
1799 1800 1801 1802 1803 1804 1805
{
	spin_lock_irq(&queue->q_lock);
	list_splice_init(&queue->q, work_list);
	spin_unlock_irq(&queue->q_lock);
	return !list_empty(work_list);
}

1806
static bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
1807 1808 1809 1810 1811 1812 1813 1814
{
	spin_lock_irq(&queue->q_lock);
	if (!list_empty(&queue->q))
		list_move(queue->q.next, work_list);
	spin_unlock_irq(&queue->q_lock);
	return !list_empty(work_list);
}

1815
static void wait_for_work(struct drbd_tconn *connection, struct list_head *work_list)
1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846
{
	DEFINE_WAIT(wait);
	struct net_conf *nc;
	int uncork, cork;

	dequeue_work_item(&connection->sender_work, work_list);
	if (!list_empty(work_list))
		return;

	/* Still nothing to do?
	 * Maybe we still need to close the current epoch,
	 * even if no new requests are queued yet.
	 *
	 * Also, poke TCP, just in case.
	 * Then wait for new work (or signal). */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	uncork = nc ? nc->tcp_cork : 0;
	rcu_read_unlock();
	if (uncork) {
		mutex_lock(&connection->data.mutex);
		if (connection->data.socket)
			drbd_tcp_uncork(connection->data.socket);
		mutex_unlock(&connection->data.mutex);
	}

	for (;;) {
		int send_barrier;
		prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
		spin_lock_irq(&connection->req_lock);
		spin_lock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
1847 1848 1849 1850
		/* dequeue single item only,
		 * we still use drbd_queue_work_front() in some places */
		if (!list_empty(&connection->sender_work.q))
			list_move(connection->sender_work.q.next, work_list);
1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883
		spin_unlock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
		if (!list_empty(work_list) || signal_pending(current)) {
			spin_unlock_irq(&connection->req_lock);
			break;
		}
		send_barrier = need_to_send_barrier(connection);
		spin_unlock_irq(&connection->req_lock);
		if (send_barrier) {
			drbd_send_barrier(connection);
			connection->send.current_epoch_nr++;
		}
		schedule();
		/* may be woken up for other things but new work, too,
		 * e.g. if the current epoch got closed.
		 * In which case we send the barrier above. */
	}
	finish_wait(&connection->sender_work.q_wait, &wait);

	/* someone may have changed the config while we have been waiting above. */
	rcu_read_lock();
	nc = rcu_dereference(connection->net_conf);
	cork = nc ? nc->tcp_cork : 0;
	rcu_read_unlock();
	mutex_lock(&connection->data.mutex);
	if (connection->data.socket) {
		if (cork)
			drbd_tcp_cork(connection->data.socket);
		else if (!uncork)
			drbd_tcp_uncork(connection->data.socket);
	}
	mutex_unlock(&connection->data.mutex);
}

P
Philipp Reisner 已提交
1884 1885
int drbd_worker(struct drbd_thread *thi)
{
1886
	struct drbd_tconn *tconn = thi->tconn;
P
Philipp Reisner 已提交
1887
	struct drbd_work *w = NULL;
1888
	struct drbd_conf *mdev;
P
Philipp Reisner 已提交
1889
	LIST_HEAD(work_list);
1890
	int vnr;
P
Philipp Reisner 已提交
1891

1892
	while (get_t_state(thi) == RUNNING) {
1893
		drbd_thread_current_set_cpu(thi);
P
Philipp Reisner 已提交
1894

1895 1896 1897
		/* as long as we use drbd_queue_work_front(),
		 * we may only dequeue single work items here, not batches. */
		if (list_empty(&work_list))
1898
			wait_for_work(tconn, &work_list);
P
Philipp Reisner 已提交
1899

1900
		if (signal_pending(current)) {
P
Philipp Reisner 已提交
1901
			flush_signals(current);
1902 1903
			if (get_t_state(thi) == RUNNING) {
				conn_warn(tconn, "Worker got an unexpected signal\n");
P
Philipp Reisner 已提交
1904
				continue;
1905
			}
P
Philipp Reisner 已提交
1906 1907 1908
			break;
		}

1909
		if (get_t_state(thi) != RUNNING)
P
Philipp Reisner 已提交
1910 1911
			break;

1912 1913 1914 1915 1916
		while (!list_empty(&work_list)) {
			w = list_first_entry(&work_list, struct drbd_work, list);
			list_del_init(&w->list);
			if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS) == 0)
				continue;
1917 1918
			if (tconn->cstate >= C_WF_REPORT_PARAMS)
				conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
P
Philipp Reisner 已提交
1919 1920 1921
		}
	}

1922
	do {
P
Philipp Reisner 已提交
1923
		while (!list_empty(&work_list)) {
1924
			w = list_first_entry(&work_list, struct drbd_work, list);
P
Philipp Reisner 已提交
1925
			list_del_init(&w->list);
1926
			w->cb(w, 1);
P
Philipp Reisner 已提交
1927
		}
1928
		dequeue_work_batch(&tconn->sender_work, &work_list);
1929
	} while (!list_empty(&work_list));
P
Philipp Reisner 已提交
1930

P
Philipp Reisner 已提交
1931
	rcu_read_lock();
1932
	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
1933
		D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
P
Philipp Reisner 已提交
1934 1935
		kref_get(&mdev->kref);
		rcu_read_unlock();
1936
		drbd_mdev_cleanup(mdev);
P
Philipp Reisner 已提交
1937 1938
		kref_put(&mdev->kref, &drbd_minor_destroy);
		rcu_read_lock();
1939
	}
P
Philipp Reisner 已提交
1940
	rcu_read_unlock();
P
Philipp Reisner 已提交
1941 1942 1943

	return 0;
}