/*
 *  Device operations for the pnfs nfs4 file layout driver.
 *
 *  Copyright (c) 2002
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *  Garth Goodson   <Garth.Goodson@netapp.com>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/vmalloc.h>
33
#include <linux/module.h>
34
#include <linux/sunrpc/addr.h>
35

36 37 38
#include "../internal.h"
#include "../nfs4session.h"
#include "filelayout.h"
39 40 41

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

42 43 44
static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;

45 46 47 48 49 50 51 52 53
/*
 * Data server cache
 *
 * Data servers can be mapped to different device ids.
 * nfs4_pnfs_ds reference counting
 *   - set to 1 on allocation
 *   - incremented when a device id maps a data server already in the cache.
 *   - decremented when deviceid is removed from the cache.
 */
54
static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
55 56 57 58 59 60 61 62 63 64
static LIST_HEAD(nfs4_data_server_cache);

/* Debug routines */
void
print_ds(struct nfs4_pnfs_ds *ds)
{
	if (ds == NULL) {
		printk("%s NULL device\n", __func__);
		return;
	}
W
Weston Andros Adamson 已提交
65
	printk("        ds %s\n"
66 67 68
		"        ref count %d\n"
		"        client %p\n"
		"        cl_exchange_flags %x\n",
W
Weston Andros Adamson 已提交
69
		ds->ds_remotestr,
70 71 72 73
		atomic_read(&ds->ds_count), ds->ds_clp,
		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
}

74 75
static bool
same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76
{
W
Weston Andros Adamson 已提交
77 78
	struct sockaddr_in *a, *b;
	struct sockaddr_in6 *a6, *b6;
79

80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
	if (addr1->sa_family != addr2->sa_family)
		return false;

	switch (addr1->sa_family) {
	case AF_INET:
		a = (struct sockaddr_in *)addr1;
		b = (struct sockaddr_in *)addr2;

		if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
		    a->sin_port == b->sin_port)
			return true;
		break;

	case AF_INET6:
		a6 = (struct sockaddr_in6 *)addr1;
		b6 = (struct sockaddr_in6 *)addr2;

		/* LINKLOCAL addresses must have matching scope_id */
98
		if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
		    IPV6_ADDR_SCOPE_LINKLOCAL &&
		    a6->sin6_scope_id != b6->sin6_scope_id)
			return false;

		if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
		    a6->sin6_port == b6->sin6_port)
			return true;
		break;

	default:
		dprintk("%s: unhandled address family: %u\n",
			__func__, addr1->sa_family);
		return false;
	}

	return false;
}

117
static bool
118 119
_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
			       const struct list_head *dsaddrs2)
120 121 122
{
	struct nfs4_pnfs_ds_addr *da1, *da2;

123 124 125 126 127 128 129 130 131
	/* step through both lists, comparing as we go */
	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
	     da1 != NULL && da2 != NULL;
	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
				   (struct sockaddr *)&da2->da_addr))
			return false;
132
	}
133 134 135 136
	if (da1 == NULL && da2 == NULL)
		return true;

	return false;
137 138
}

139
/*
140
 * Lookup DS by addresses.  nfs4_ds_cache_lock is held
141
 */
142 143
static struct nfs4_pnfs_ds *
_data_server_lookup_locked(const struct list_head *dsaddrs)
144
{
145
	struct nfs4_pnfs_ds *ds;
146

147 148 149 150
	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
			return ds;
	return NULL;
151 152
}

A
Andy Adamson 已提交
153 154
/*
 * Create an rpc connection to the nfs4_pnfs_ds data server
W
Weston Andros Adamson 已提交
155
 * Currently only supports IPv4 and IPv6 addresses
A
Andy Adamson 已提交
156 157 158 159
 */
static int
nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
{
160
	struct nfs_client *clp = ERR_PTR(-EIO);
161
	struct nfs4_pnfs_ds_addr *da;
A
Andy Adamson 已提交
162 163
	int status = 0;

164
	dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
A
Andy Adamson 已提交
165 166
		mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);

167 168 169
	list_for_each_entry(da, &ds->ds_addrs, da_node) {
		dprintk("%s: DS %s: trying address %s\n",
			__func__, ds->ds_remotestr, da->da_remotestr);
170

171
		clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 173 174
					(struct sockaddr *)&da->da_addr,
					da->da_addrlen, IPPROTO_TCP,
					dataserver_timeo, dataserver_retrans);
175 176 177 178
		if (!IS_ERR(clp))
			break;
	}

A
Andy Adamson 已提交
179 180 181 182 183
	if (IS_ERR(clp)) {
		status = PTR_ERR(clp);
		goto out;
	}

184
	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
A
Andy Adamson 已提交
185 186 187
	if (status)
		goto out_put;

188
	smp_wmb();
A
Andy Adamson 已提交
189
	ds->ds_clp = clp;
W
Weston Andros Adamson 已提交
190
	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
A
Andy Adamson 已提交
191 192 193 194 195 196 197
out:
	return status;
out_put:
	nfs_put_client(clp);
	goto out;
}

198 199 200
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
201 202
	struct nfs4_pnfs_ds_addr *da;

203 204 205 206 207 208
	dprintk("--> %s\n", __func__);
	ifdebug(FACILITY)
		print_ds(ds);

	if (ds->ds_clp)
		nfs_put_client(ds->ds_clp);
209 210 211 212 213 214 215 216 217 218

	while (!list_empty(&ds->ds_addrs)) {
		da = list_first_entry(&ds->ds_addrs,
				      struct nfs4_pnfs_ds_addr,
				      da_node);
		list_del_init(&da->da_node);
		kfree(da->da_remotestr);
		kfree(da);
	}

W
Weston Andros Adamson 已提交
219
	kfree(ds->ds_remotestr);
220 221 222
	kfree(ds);
}

223
void
224 225 226 227 228
nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
	struct nfs4_pnfs_ds *ds;
	int i;

229
	nfs4_print_deviceid(&dsaddr->id_node.deviceid);
230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245

	for (i = 0; i < dsaddr->ds_num; i++) {
		ds = dsaddr->ds_list[i];
		if (ds != NULL) {
			if (atomic_dec_and_lock(&ds->ds_count,
						&nfs4_ds_cache_lock)) {
				list_del_init(&ds->ds_node);
				spin_unlock(&nfs4_ds_cache_lock);
				destroy_ds(ds);
			}
		}
	}
	kfree(dsaddr->stripe_indices);
	kfree(dsaddr);
}

W
Weston Andros Adamson 已提交
246 247 248 249 250
/*
 * Create a string with a human readable address and port to avoid
 * complicated setup around many dprinks.
 */
static char *
251
nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
W
Weston Andros Adamson 已提交
252
{
253
	struct nfs4_pnfs_ds_addr *da;
W
Weston Andros Adamson 已提交
254 255
	char *remotestr;
	size_t len;
256
	char *p;
W
Weston Andros Adamson 已提交
257

258 259 260
	len = 3;        /* '{', '}' and eol */
	list_for_each_entry(da, dsaddrs, da_node) {
		len += strlen(da->da_remotestr) + 1;    /* string plus comma */
W
Weston Andros Adamson 已提交
261 262
	}

263 264
	remotestr = kzalloc(len, gfp_flags);
	if (!remotestr)
W
Weston Andros Adamson 已提交
265 266
		return NULL;

267 268 269 270 271
	p = remotestr;
	*(p++) = '{';
	len--;
	list_for_each_entry(da, dsaddrs, da_node) {
		size_t ll = strlen(da->da_remotestr);
W
Weston Andros Adamson 已提交
272

273 274
		if (ll > len)
			goto out_err;
W
Weston Andros Adamson 已提交
275

276 277 278
		memcpy(p, da->da_remotestr, ll);
		p += ll;
		len -= ll;
W
Weston Andros Adamson 已提交
279

280 281 282 283 284 285 286 287 288
		if (len < 1)
			goto out_err;
		(*p++) = ',';
		len--;
	}
	if (len < 2)
		goto out_err;
	*(p++) = '}';
	*p = '\0';
W
Weston Andros Adamson 已提交
289
	return remotestr;
290 291 292
out_err:
	kfree(remotestr);
	return NULL;
W
Weston Andros Adamson 已提交
293 294
}

295
static struct nfs4_pnfs_ds *
296
nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
297
{
W
Weston Andros Adamson 已提交
298 299
	struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
	char *remotestr;
300

301 302 303 304 305 306
	if (list_empty(dsaddrs)) {
		dprintk("%s: no addresses defined\n", __func__);
		goto out;
	}

	ds = kzalloc(sizeof(*ds), gfp_flags);
307 308 309
	if (!ds)
		goto out;

W
Weston Andros Adamson 已提交
310
	/* this is only used for debugging, so it's ok if its NULL */
311
	remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
W
Weston Andros Adamson 已提交
312

313
	spin_lock(&nfs4_ds_cache_lock);
314
	tmp_ds = _data_server_lookup_locked(dsaddrs);
315
	if (tmp_ds == NULL) {
316 317
		INIT_LIST_HEAD(&ds->ds_addrs);
		list_splice_init(dsaddrs, &ds->ds_addrs);
W
Weston Andros Adamson 已提交
318
		ds->ds_remotestr = remotestr;
319 320 321 322
		atomic_set(&ds->ds_count, 1);
		INIT_LIST_HEAD(&ds->ds_node);
		ds->ds_clp = NULL;
		list_add(&ds->ds_node, &nfs4_data_server_cache);
W
Weston Andros Adamson 已提交
323 324
		dprintk("%s add new data server %s\n", __func__,
			ds->ds_remotestr);
325
	} else {
W
Weston Andros Adamson 已提交
326
		kfree(remotestr);
327 328
		kfree(ds);
		atomic_inc(&tmp_ds->ds_count);
W
Weston Andros Adamson 已提交
329 330
		dprintk("%s data server %s found, inc'ed ds_count to %d\n",
			__func__, tmp_ds->ds_remotestr,
331 332 333 334 335 336 337 338 339
			atomic_read(&tmp_ds->ds_count));
		ds = tmp_ds;
	}
	spin_unlock(&nfs4_ds_cache_lock);
out:
	return ds;
}

/*
W
Weston Andros Adamson 已提交
340
 * Currently only supports ipv4, ipv6 and one multi-path address.
341
 */
342
static struct nfs4_pnfs_ds_addr *
343
decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
344
{
345
	struct nfs4_pnfs_ds_addr *da = NULL;
W
Weston Andros Adamson 已提交
346
	char *buf, *portstr;
347
	__be16 port;
W
Weston Andros Adamson 已提交
348
	int nlen, rlen;
349
	int tmp[2];
350
	__be32 *p;
W
Weston Andros Adamson 已提交
351
	char *netid, *match_netid;
352 353 354 355
	size_t len, match_netid_len;
	char *startsep = "";
	char *endsep = "";

356 357

	/* r_netid */
358 359 360
	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
		goto out_err;
361 362
	nlen = be32_to_cpup(p++);

363 364 365
	p = xdr_inline_decode(streamp, nlen);
	if (unlikely(!p))
		goto out_err;
366

W
Weston Andros Adamson 已提交
367 368
	netid = kmalloc(nlen+1, gfp_flags);
	if (unlikely(!netid))
369 370
		goto out_err;

W
Weston Andros Adamson 已提交
371 372 373 374
	netid[nlen] = '\0';
	memcpy(netid, p, nlen);

	/* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
375 376
	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
W
Weston Andros Adamson 已提交
377
		goto out_free_netid;
378 379 380 381
	rlen = be32_to_cpup(p);

	p = xdr_inline_decode(streamp, rlen);
	if (unlikely(!p))
W
Weston Andros Adamson 已提交
382
		goto out_free_netid;
383

W
Weston Andros Adamson 已提交
384 385
	/* port is ".ABC.DEF", 8 chars max */
	if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
386
		dprintk("%s: Invalid address, length %d\n", __func__,
387
			rlen);
W
Weston Andros Adamson 已提交
388
		goto out_free_netid;
389
	}
390
	buf = kmalloc(rlen + 1, gfp_flags);
391 392
	if (!buf) {
		dprintk("%s: Not enough memory\n", __func__);
W
Weston Andros Adamson 已提交
393
		goto out_free_netid;
394
	}
395
	buf[rlen] = '\0';
396
	memcpy(buf, p, rlen);
397

W
Weston Andros Adamson 已提交
398 399 400 401 402 403 404 405 406 407 408 409 410 411 412
	/* replace port '.' with '-' */
	portstr = strrchr(buf, '.');
	if (!portstr) {
		dprintk("%s: Failed finding expected dot in port\n",
			__func__);
		goto out_free_buf;
	}
	*portstr = '-';

	/* find '.' between address and port */
	portstr = strrchr(buf, '.');
	if (!portstr) {
		dprintk("%s: Failed finding expected dot between address and "
			"port\n", __func__);
		goto out_free_buf;
413
	}
W
Weston Andros Adamson 已提交
414
	*portstr = '\0';
415

416 417
	da = kzalloc(sizeof(*da), gfp_flags);
	if (unlikely(!da))
W
Weston Andros Adamson 已提交
418
		goto out_free_buf;
419 420 421

	INIT_LIST_HEAD(&da->da_node);

422
	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
423 424 425
		      sizeof(da->da_addr))) {
		dprintk("%s: error parsing address %s\n", __func__, buf);
		goto out_free_da;
426 427
	}

W
Weston Andros Adamson 已提交
428 429
	portstr++;
	sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
430 431
	port = htons((tmp[0] << 8) | (tmp[1]));

432
	switch (da->da_addr.ss_family) {
W
Weston Andros Adamson 已提交
433
	case AF_INET:
434 435
		((struct sockaddr_in *)&da->da_addr)->sin_port = port;
		da->da_addrlen = sizeof(struct sockaddr_in);
W
Weston Andros Adamson 已提交
436 437 438 439 440
		match_netid = "tcp";
		match_netid_len = 3;
		break;

	case AF_INET6:
441 442
		((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
		da->da_addrlen = sizeof(struct sockaddr_in6);
W
Weston Andros Adamson 已提交
443 444
		match_netid = "tcp6";
		match_netid_len = 4;
445 446
		startsep = "[";
		endsep = "]";
W
Weston Andros Adamson 已提交
447 448 449 450
		break;

	default:
		dprintk("%s: unsupported address family: %u\n",
451 452
			__func__, da->da_addr.ss_family);
		goto out_free_da;
W
Weston Andros Adamson 已提交
453 454 455 456 457
	}

	if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
		dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
			__func__, netid, match_netid);
458
		goto out_free_da;
W
Weston Andros Adamson 已提交
459 460
	}

461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
	/* save human readable address */
	len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
	da->da_remotestr = kzalloc(len, gfp_flags);

	/* NULL is ok, only used for dprintk */
	if (da->da_remotestr)
		snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
			 buf, endsep, ntohs(port));

	dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
	kfree(buf);
	kfree(netid);
	return da;

out_free_da:
	kfree(da);
W
Weston Andros Adamson 已提交
477
out_free_buf:
478
	dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
479
	kfree(buf);
W
Weston Andros Adamson 已提交
480 481
out_free_netid:
	kfree(netid);
482
out_err:
483
	return NULL;
484 485 486
}

/* Decode opaque device data and return the result */
487 488 489
struct nfs4_file_layout_dsaddr *
nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
		gfp_t gfp_flags)
490
{
491
	int i;
492 493
	u32 cnt, num;
	u8 *indexp;
494 495 496 497 498
	__be32 *p;
	u8 *stripe_indices;
	u8 max_stripe_index;
	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
	struct xdr_stream stream;
499
	struct xdr_buf buf;
500
	struct page *scratch;
501 502
	struct list_head dsaddrs;
	struct nfs4_pnfs_ds_addr *da;
503 504

	/* set up xdr stream */
505
	scratch = alloc_page(gfp_flags);
506 507 508
	if (!scratch)
		goto out_err;

509
	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
510
	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
511 512

	/* Get the stripe count (number of stripe index) */
513 514 515 516 517
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_scratch;

	cnt = be32_to_cpup(p);
518 519
	dprintk("%s stripe count  %d\n", __func__, cnt);
	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
520
		printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
521 522
		       "supported maximum %d\n", __func__,
			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
523 524 525 526
		goto out_err_free_scratch;
	}

	/* read stripe indices */
527
	stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
528 529 530 531 532 533 534 535 536 537 538 539 540
	if (!stripe_indices)
		goto out_err_free_scratch;

	p = xdr_inline_decode(&stream, cnt << 2);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	indexp = &stripe_indices[0];
	max_stripe_index = 0;
	for (i = 0; i < cnt; i++) {
		*indexp = be32_to_cpup(p++);
		max_stripe_index = max(max_stripe_index, *indexp);
		indexp++;
541 542 543
	}

	/* Check the multipath list count */
544 545 546 547 548
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	num = be32_to_cpup(p);
549 550
	dprintk("%s ds_num %u\n", __func__, num);
	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
551
		printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
552 553
			"supported maximum %d\n", __func__,
			num, NFS4_PNFS_MAX_MULTI_CNT);
554
		goto out_err_free_stripe_indices;
555
	}
556 557 558

	/* validate stripe indices are all < num */
	if (max_stripe_index >= num) {
559
		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
560 561 562 563
			__func__, max_stripe_index, num);
		goto out_err_free_stripe_indices;
	}

564 565
	dsaddr = kzalloc(sizeof(*dsaddr) +
			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
566
			gfp_flags);
567
	if (!dsaddr)
568
		goto out_err_free_stripe_indices;
569 570

	dsaddr->stripe_count = cnt;
571 572
	dsaddr->stripe_indices = stripe_indices;
	stripe_indices = NULL;
573
	dsaddr->ds_num = num;
574
	nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
575

576 577
	INIT_LIST_HEAD(&dsaddrs);

578 579
	for (i = 0; i < dsaddr->ds_num; i++) {
		int j;
580 581 582 583 584
		u32 mp_count;

		p = xdr_inline_decode(&stream, 4);
		if (unlikely(!p))
			goto out_err_free_deviceid;
585

586 587
		mp_count = be32_to_cpup(p); /* multipath count */
		for (j = 0; j < mp_count; j++) {
588
			da = decode_ds_addr(server->nfs_client->cl_net,
589
					    &stream, gfp_flags);
590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610
			if (da)
				list_add_tail(&da->da_node, &dsaddrs);
		}
		if (list_empty(&dsaddrs)) {
			dprintk("%s: no suitable DS addresses found\n",
				__func__);
			goto out_err_free_deviceid;
		}

		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
		if (!dsaddr->ds_list[i])
			goto out_err_drain_dsaddrs;

		/* If DS was already in cache, free ds addrs */
		while (!list_empty(&dsaddrs)) {
			da = list_first_entry(&dsaddrs,
					      struct nfs4_pnfs_ds_addr,
					      da_node);
			list_del_init(&da->da_node);
			kfree(da->da_remotestr);
			kfree(da);
611 612
		}
	}
613 614

	__free_page(scratch);
615 616
	return dsaddr;

617 618 619 620 621 622 623 624
out_err_drain_dsaddrs:
	while (!list_empty(&dsaddrs)) {
		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
				      da_node);
		list_del_init(&da->da_node);
		kfree(da->da_remotestr);
		kfree(da);
	}
625
out_err_free_deviceid:
626
	nfs4_fl_free_deviceid(dsaddr);
627 628 629 630 631 632
	/* stripe_indicies was part of dsaddr */
	goto out_err_free_scratch;
out_err_free_stripe_indices:
	kfree(stripe_indices);
out_err_free_scratch:
	__free_page(scratch);
633 634 635 636 637
out_err:
	dprintk("%s ERROR: returning NULL\n", __func__);
	return NULL;
}

638 639
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
640
{
641
	nfs4_put_deviceid_node(&dsaddr->id_node);
642
}
F
Fred Isaman 已提交
643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684

/*
 * Map a file offset to its position j in the stripe index array:
 *
 *   stripe number = (offset - pattern_offset) / stripe_unit
 *   j             = (stripe number + first_stripe_index) % stripe_count
 */
u32
nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
	u64 stripe_no = offset - flseg->pattern_offset;
	u32 j;

	/* do_div() leaves the quotient in its first argument ... */
	do_div(stripe_no, flseg->stripe_unit);
	stripe_no += flseg->first_stripe_index;

	/* ... and returns the remainder, which is what we want here */
	j = do_div(stripe_no, flseg->dsaddr->stripe_count);
	return j;
}

/*
 * Translate stripe position @j into an index into the device's data server
 * list by looking it up in the stripe index array.
 */
u32
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
{
	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;

	return dsaddr->stripe_indices[j];
}

/*
 * Pick the file handle to use for stripe position @j.
 *
 * For any stripe type other than STRIPE_SPARSE the handle array is indexed
 * by @j directly.  For STRIPE_SPARSE the choice depends on how many handles
 * the layout carries: none returns NULL, exactly one always selects that
 * handle, otherwise the array is indexed by the data server index for @j.
 */
struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
	u32 fh_idx = j;

	if (flseg->stripe_type == STRIPE_SPARSE) {
		switch (flseg->num_fh) {
		case 0:
			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
			return NULL;
		case 1:
			fh_idx = 0;
			break;
		default:
			fh_idx = nfs4_fl_calc_ds_index(lseg, j);
			break;
		}
	}

	return flseg->fh_array[fh_idx];
}

685 686 687
static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
{
	might_sleep();
688 689
	wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
			   nfs_wait_bit_killable, TASK_KILLABLE);
690 691 692 693
}

static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
{
694
	smp_mb__before_atomic();
695
	clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
696
	smp_mb__after_atomic();
697 698 699 700
	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
}


F
Fred Isaman 已提交
701 702 703 704 705
/*
 * Return the data server at @ds_idx of the layout segment's device,
 * connecting to it first if it has no client yet.
 *
 * Only one task performs the connect: whoever sets NFS4DS_CONNECTING first.
 * Every other task waits for that bit to clear.  A failed connect marks the
 * device id unavailable.
 *
 * Returns NULL when the slot is empty (the device id is then marked
 * invalid) or when the device id tests as unavailable; otherwise the data
 * server.
 */
struct nfs4_pnfs_ds *
nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
{
	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
	struct nfs4_pnfs_ds *ret = ds;

	if (ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
			__func__, ds_idx);
		filelayout_mark_devid_invalid(devid);
		goto out;
	}
	/* counterpart of the smp_wmb() in nfs4_ds_connect() */
	smp_rmb();
	if (ds->ds_clp)
		goto out_test_devid;

	if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
		int err;

		err = nfs4_ds_connect(s, ds);
		if (err)
			nfs4_mark_deviceid_unavailable(devid);
		nfs4_clear_ds_conn_bit(ds);
	} else {
		/* Either ds is connected, or ds is NULL */
		nfs4_wait_ds_connect(ds);
	}
out_test_devid:
	if (filelayout_test_devid_unavailable(devid))
		ret = NULL;
out:
	return ret;
}

/*
 * Module parameters for the data server connection settings used by
 * nfs4_ds_connect().  Both are unsigned integers registered with
 * permission mode 0644.
 */
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
			"retries a request before it attempts further "
			" recovery  action.");
module_param(dataserver_timeo, uint, 0644);
MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
			"NFSv4.1  client  waits for a response from a "
			" data server before it retries an NFS request.");