/*
 *  Device operations for the pnfs nfs4 file layout driver.
 *
 *  Copyright (c) 2002
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *  Garth Goodson   <Garth.Goodson@netapp.com>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/vmalloc.h>
33
#include <linux/module.h>
34
#include <linux/sunrpc/addr.h>
35

36 37 38
#include "../internal.h"
#include "../nfs4session.h"
#include "filelayout.h"
39 40 41

/*
 * Debug facility selected for this file.
 * NOTE(review): presumably consumed by the dprintk()/ifdebug() helpers,
 * which are defined outside this file - confirm.
 */
#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

42 43 44
/*
 * Timeout and retransmit settings handed to nfs4_set_ds_client() when a
 * data server connection is set up.  Both are exported as module
 * parameters at the bottom of this file; per the parameter description
 * the timeout is expressed in tenths of a second.
 */
static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;

45 46 47 48 49 50 51 52 53
/*
 * Data server cache
 *
 * Data servers can be mapped to different device ids.
 * nfs4_pnfs_ds reference counting
 *   - set to 1 on allocation
 *   - incremented when a device id maps a data server already in the cache.
 *   - decremented when deviceid is removed from the cache.
 */
54
/* Serialises lookups, insertions and removals on nfs4_data_server_cache. */
static DEFINE_SPINLOCK(nfs4_ds_cache_lock);

/* Every known data server, linked through nfs4_pnfs_ds.ds_node. */
static LIST_HEAD(nfs4_data_server_cache);

/* Debug routines */
void
print_ds(struct nfs4_pnfs_ds *ds)
{
	if (ds == NULL) {
		printk("%s NULL device\n", __func__);
		return;
	}
W
Weston Andros Adamson 已提交
65
	printk("        ds %s\n"
66 67 68
		"        ref count %d\n"
		"        client %p\n"
		"        cl_exchange_flags %x\n",
W
Weston Andros Adamson 已提交
69
		ds->ds_remotestr,
70 71 72 73
		atomic_read(&ds->ds_count), ds->ds_clp,
		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
}

74 75
static bool
same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76
{
W
Weston Andros Adamson 已提交
77 78
	struct sockaddr_in *a, *b;
	struct sockaddr_in6 *a6, *b6;
79

80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
	if (addr1->sa_family != addr2->sa_family)
		return false;

	switch (addr1->sa_family) {
	case AF_INET:
		a = (struct sockaddr_in *)addr1;
		b = (struct sockaddr_in *)addr2;

		if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
		    a->sin_port == b->sin_port)
			return true;
		break;

	case AF_INET6:
		a6 = (struct sockaddr_in6 *)addr1;
		b6 = (struct sockaddr_in6 *)addr2;

		/* LINKLOCAL addresses must have matching scope_id */
98
		if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
		    IPV6_ADDR_SCOPE_LINKLOCAL &&
		    a6->sin6_scope_id != b6->sin6_scope_id)
			return false;

		if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
		    a6->sin6_port == b6->sin6_port)
			return true;
		break;

	default:
		dprintk("%s: unhandled address family: %u\n",
			__func__, addr1->sa_family);
		return false;
	}

	return false;
}

117
static bool
118 119
_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
			       const struct list_head *dsaddrs2)
120 121 122
{
	struct nfs4_pnfs_ds_addr *da1, *da2;

123 124 125 126 127 128 129 130 131
	/* step through both lists, comparing as we go */
	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
	     da1 != NULL && da2 != NULL;
	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
				   (struct sockaddr *)&da2->da_addr))
			return false;
132
	}
133 134 135 136
	if (da1 == NULL && da2 == NULL)
		return true;

	return false;
137 138
}

139
/*
140
 * Lookup DS by addresses.  nfs4_ds_cache_lock is held
141
 */
142 143
static struct nfs4_pnfs_ds *
_data_server_lookup_locked(const struct list_head *dsaddrs)
144
{
145
	struct nfs4_pnfs_ds *ds;
146

147 148 149 150
	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
			return ds;
	return NULL;
151 152
}

A
Andy Adamson 已提交
153 154
/*
 * Create an rpc connection to the nfs4_pnfs_ds data server
W
Weston Andros Adamson 已提交
155
 * Currently only supports IPv4 and IPv6 addresses
A
Andy Adamson 已提交
156 157 158 159
 */
static int
nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
{
160
	struct nfs_client *clp = ERR_PTR(-EIO);
161
	struct nfs4_pnfs_ds_addr *da;
A
Andy Adamson 已提交
162 163
	int status = 0;

164
	dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
A
Andy Adamson 已提交
165 166
		mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);

167 168 169
	list_for_each_entry(da, &ds->ds_addrs, da_node) {
		dprintk("%s: DS %s: trying address %s\n",
			__func__, ds->ds_remotestr, da->da_remotestr);
170

171
		clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 173 174
					(struct sockaddr *)&da->da_addr,
					da->da_addrlen, IPPROTO_TCP,
					dataserver_timeo, dataserver_retrans);
175 176 177 178
		if (!IS_ERR(clp))
			break;
	}

A
Andy Adamson 已提交
179 180 181 182 183
	if (IS_ERR(clp)) {
		status = PTR_ERR(clp);
		goto out;
	}

184
	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
A
Andy Adamson 已提交
185 186 187
	if (status)
		goto out_put;

188
	smp_wmb();
A
Andy Adamson 已提交
189
	ds->ds_clp = clp;
W
Weston Andros Adamson 已提交
190
	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
A
Andy Adamson 已提交
191 192 193 194 195 196 197
out:
	return status;
out_put:
	nfs_put_client(clp);
	goto out;
}

198 199 200
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
201 202
	struct nfs4_pnfs_ds_addr *da;

203 204 205 206
	dprintk("--> %s\n", __func__);
	ifdebug(FACILITY)
		print_ds(ds);

207
	nfs_put_client(ds->ds_clp);
208 209 210 211 212 213 214 215 216 217

	while (!list_empty(&ds->ds_addrs)) {
		da = list_first_entry(&ds->ds_addrs,
				      struct nfs4_pnfs_ds_addr,
				      da_node);
		list_del_init(&da->da_node);
		kfree(da->da_remotestr);
		kfree(da);
	}

W
Weston Andros Adamson 已提交
218
	kfree(ds->ds_remotestr);
219 220 221
	kfree(ds);
}

222
void
223 224 225 226 227
nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
	struct nfs4_pnfs_ds *ds;
	int i;

228
	nfs4_print_deviceid(&dsaddr->id_node.deviceid);
229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244

	for (i = 0; i < dsaddr->ds_num; i++) {
		ds = dsaddr->ds_list[i];
		if (ds != NULL) {
			if (atomic_dec_and_lock(&ds->ds_count,
						&nfs4_ds_cache_lock)) {
				list_del_init(&ds->ds_node);
				spin_unlock(&nfs4_ds_cache_lock);
				destroy_ds(ds);
			}
		}
	}
	kfree(dsaddr->stripe_indices);
	kfree(dsaddr);
}

W
Weston Andros Adamson 已提交
245 246 247 248 249
/*
 * Create a string with a human readable address and port to avoid
 * complicated setup around many dprinks.
 */
static char *
250
nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
W
Weston Andros Adamson 已提交
251
{
252
	struct nfs4_pnfs_ds_addr *da;
W
Weston Andros Adamson 已提交
253 254
	char *remotestr;
	size_t len;
255
	char *p;
W
Weston Andros Adamson 已提交
256

257 258 259
	len = 3;        /* '{', '}' and eol */
	list_for_each_entry(da, dsaddrs, da_node) {
		len += strlen(da->da_remotestr) + 1;    /* string plus comma */
W
Weston Andros Adamson 已提交
260 261
	}

262 263
	remotestr = kzalloc(len, gfp_flags);
	if (!remotestr)
W
Weston Andros Adamson 已提交
264 265
		return NULL;

266 267 268 269 270
	p = remotestr;
	*(p++) = '{';
	len--;
	list_for_each_entry(da, dsaddrs, da_node) {
		size_t ll = strlen(da->da_remotestr);
W
Weston Andros Adamson 已提交
271

272 273
		if (ll > len)
			goto out_err;
W
Weston Andros Adamson 已提交
274

275 276 277
		memcpy(p, da->da_remotestr, ll);
		p += ll;
		len -= ll;
W
Weston Andros Adamson 已提交
278

279 280 281 282 283 284 285 286 287
		if (len < 1)
			goto out_err;
		(*p++) = ',';
		len--;
	}
	if (len < 2)
		goto out_err;
	*(p++) = '}';
	*p = '\0';
W
Weston Andros Adamson 已提交
288
	return remotestr;
289 290 291
out_err:
	kfree(remotestr);
	return NULL;
W
Weston Andros Adamson 已提交
292 293
}

294
static struct nfs4_pnfs_ds *
295
nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
296
{
W
Weston Andros Adamson 已提交
297 298
	struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
	char *remotestr;
299

300 301 302 303 304 305
	if (list_empty(dsaddrs)) {
		dprintk("%s: no addresses defined\n", __func__);
		goto out;
	}

	ds = kzalloc(sizeof(*ds), gfp_flags);
306 307 308
	if (!ds)
		goto out;

W
Weston Andros Adamson 已提交
309
	/* this is only used for debugging, so it's ok if its NULL */
310
	remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
W
Weston Andros Adamson 已提交
311

312
	spin_lock(&nfs4_ds_cache_lock);
313
	tmp_ds = _data_server_lookup_locked(dsaddrs);
314
	if (tmp_ds == NULL) {
315 316
		INIT_LIST_HEAD(&ds->ds_addrs);
		list_splice_init(dsaddrs, &ds->ds_addrs);
W
Weston Andros Adamson 已提交
317
		ds->ds_remotestr = remotestr;
318 319 320 321
		atomic_set(&ds->ds_count, 1);
		INIT_LIST_HEAD(&ds->ds_node);
		ds->ds_clp = NULL;
		list_add(&ds->ds_node, &nfs4_data_server_cache);
W
Weston Andros Adamson 已提交
322 323
		dprintk("%s add new data server %s\n", __func__,
			ds->ds_remotestr);
324
	} else {
W
Weston Andros Adamson 已提交
325
		kfree(remotestr);
326 327
		kfree(ds);
		atomic_inc(&tmp_ds->ds_count);
W
Weston Andros Adamson 已提交
328 329
		dprintk("%s data server %s found, inc'ed ds_count to %d\n",
			__func__, tmp_ds->ds_remotestr,
330 331 332 333 334 335 336 337 338
			atomic_read(&tmp_ds->ds_count));
		ds = tmp_ds;
	}
	spin_unlock(&nfs4_ds_cache_lock);
out:
	return ds;
}

/*
W
Weston Andros Adamson 已提交
339
 * Currently only supports ipv4, ipv6 and one multi-path address.
340
 */
341
static struct nfs4_pnfs_ds_addr *
342
decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
343
{
344
	struct nfs4_pnfs_ds_addr *da = NULL;
W
Weston Andros Adamson 已提交
345
	char *buf, *portstr;
346
	__be16 port;
W
Weston Andros Adamson 已提交
347
	int nlen, rlen;
348
	int tmp[2];
349
	__be32 *p;
W
Weston Andros Adamson 已提交
350
	char *netid, *match_netid;
351 352 353 354
	size_t len, match_netid_len;
	char *startsep = "";
	char *endsep = "";

355 356

	/* r_netid */
357 358 359
	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
		goto out_err;
360 361
	nlen = be32_to_cpup(p++);

362 363 364
	p = xdr_inline_decode(streamp, nlen);
	if (unlikely(!p))
		goto out_err;
365

W
Weston Andros Adamson 已提交
366 367
	netid = kmalloc(nlen+1, gfp_flags);
	if (unlikely(!netid))
368 369
		goto out_err;

W
Weston Andros Adamson 已提交
370 371 372 373
	netid[nlen] = '\0';
	memcpy(netid, p, nlen);

	/* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
374 375
	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
W
Weston Andros Adamson 已提交
376
		goto out_free_netid;
377 378 379 380
	rlen = be32_to_cpup(p);

	p = xdr_inline_decode(streamp, rlen);
	if (unlikely(!p))
W
Weston Andros Adamson 已提交
381
		goto out_free_netid;
382

W
Weston Andros Adamson 已提交
383 384
	/* port is ".ABC.DEF", 8 chars max */
	if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
385
		dprintk("%s: Invalid address, length %d\n", __func__,
386
			rlen);
W
Weston Andros Adamson 已提交
387
		goto out_free_netid;
388
	}
389
	buf = kmalloc(rlen + 1, gfp_flags);
390 391
	if (!buf) {
		dprintk("%s: Not enough memory\n", __func__);
W
Weston Andros Adamson 已提交
392
		goto out_free_netid;
393
	}
394
	buf[rlen] = '\0';
395
	memcpy(buf, p, rlen);
396

W
Weston Andros Adamson 已提交
397 398 399 400 401 402 403 404 405 406 407 408 409 410 411
	/* replace port '.' with '-' */
	portstr = strrchr(buf, '.');
	if (!portstr) {
		dprintk("%s: Failed finding expected dot in port\n",
			__func__);
		goto out_free_buf;
	}
	*portstr = '-';

	/* find '.' between address and port */
	portstr = strrchr(buf, '.');
	if (!portstr) {
		dprintk("%s: Failed finding expected dot between address and "
			"port\n", __func__);
		goto out_free_buf;
412
	}
W
Weston Andros Adamson 已提交
413
	*portstr = '\0';
414

415 416
	da = kzalloc(sizeof(*da), gfp_flags);
	if (unlikely(!da))
W
Weston Andros Adamson 已提交
417
		goto out_free_buf;
418 419 420

	INIT_LIST_HEAD(&da->da_node);

421
	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
422 423 424
		      sizeof(da->da_addr))) {
		dprintk("%s: error parsing address %s\n", __func__, buf);
		goto out_free_da;
425 426
	}

W
Weston Andros Adamson 已提交
427 428
	portstr++;
	sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
429 430
	port = htons((tmp[0] << 8) | (tmp[1]));

431
	switch (da->da_addr.ss_family) {
W
Weston Andros Adamson 已提交
432
	case AF_INET:
433 434
		((struct sockaddr_in *)&da->da_addr)->sin_port = port;
		da->da_addrlen = sizeof(struct sockaddr_in);
W
Weston Andros Adamson 已提交
435 436 437 438 439
		match_netid = "tcp";
		match_netid_len = 3;
		break;

	case AF_INET6:
440 441
		((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
		da->da_addrlen = sizeof(struct sockaddr_in6);
W
Weston Andros Adamson 已提交
442 443
		match_netid = "tcp6";
		match_netid_len = 4;
444 445
		startsep = "[";
		endsep = "]";
W
Weston Andros Adamson 已提交
446 447 448 449
		break;

	default:
		dprintk("%s: unsupported address family: %u\n",
450 451
			__func__, da->da_addr.ss_family);
		goto out_free_da;
W
Weston Andros Adamson 已提交
452 453 454 455 456
	}

	if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
		dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
			__func__, netid, match_netid);
457
		goto out_free_da;
W
Weston Andros Adamson 已提交
458 459
	}

460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475
	/* save human readable address */
	len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
	da->da_remotestr = kzalloc(len, gfp_flags);

	/* NULL is ok, only used for dprintk */
	if (da->da_remotestr)
		snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
			 buf, endsep, ntohs(port));

	dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
	kfree(buf);
	kfree(netid);
	return da;

out_free_da:
	kfree(da);
W
Weston Andros Adamson 已提交
476
out_free_buf:
477
	dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
478
	kfree(buf);
W
Weston Andros Adamson 已提交
479 480
out_free_netid:
	kfree(netid);
481
out_err:
482
	return NULL;
483 484 485
}

/* Decode opaque device data and return the result */
486 487 488
struct nfs4_file_layout_dsaddr *
nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
		gfp_t gfp_flags)
489
{
490
	int i;
491 492
	u32 cnt, num;
	u8 *indexp;
493 494 495 496 497
	__be32 *p;
	u8 *stripe_indices;
	u8 max_stripe_index;
	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
	struct xdr_stream stream;
498
	struct xdr_buf buf;
499
	struct page *scratch;
500 501
	struct list_head dsaddrs;
	struct nfs4_pnfs_ds_addr *da;
502 503

	/* set up xdr stream */
504
	scratch = alloc_page(gfp_flags);
505 506 507
	if (!scratch)
		goto out_err;

508
	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
509
	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
510 511

	/* Get the stripe count (number of stripe index) */
512 513 514 515 516
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_scratch;

	cnt = be32_to_cpup(p);
517 518
	dprintk("%s stripe count  %d\n", __func__, cnt);
	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
519
		printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
520 521
		       "supported maximum %d\n", __func__,
			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
522 523 524 525
		goto out_err_free_scratch;
	}

	/* read stripe indices */
526
	stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
527 528 529 530 531 532 533 534 535 536 537 538 539
	if (!stripe_indices)
		goto out_err_free_scratch;

	p = xdr_inline_decode(&stream, cnt << 2);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	indexp = &stripe_indices[0];
	max_stripe_index = 0;
	for (i = 0; i < cnt; i++) {
		*indexp = be32_to_cpup(p++);
		max_stripe_index = max(max_stripe_index, *indexp);
		indexp++;
540 541 542
	}

	/* Check the multipath list count */
543 544 545 546 547
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	num = be32_to_cpup(p);
548 549
	dprintk("%s ds_num %u\n", __func__, num);
	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
550
		printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
551 552
			"supported maximum %d\n", __func__,
			num, NFS4_PNFS_MAX_MULTI_CNT);
553
		goto out_err_free_stripe_indices;
554
	}
555 556 557

	/* validate stripe indices are all < num */
	if (max_stripe_index >= num) {
558
		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
559 560 561 562
			__func__, max_stripe_index, num);
		goto out_err_free_stripe_indices;
	}

563 564
	dsaddr = kzalloc(sizeof(*dsaddr) +
			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
565
			gfp_flags);
566
	if (!dsaddr)
567
		goto out_err_free_stripe_indices;
568 569

	dsaddr->stripe_count = cnt;
570 571
	dsaddr->stripe_indices = stripe_indices;
	stripe_indices = NULL;
572
	dsaddr->ds_num = num;
573
	nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
574

575 576
	INIT_LIST_HEAD(&dsaddrs);

577 578
	for (i = 0; i < dsaddr->ds_num; i++) {
		int j;
579 580 581 582 583
		u32 mp_count;

		p = xdr_inline_decode(&stream, 4);
		if (unlikely(!p))
			goto out_err_free_deviceid;
584

585 586
		mp_count = be32_to_cpup(p); /* multipath count */
		for (j = 0; j < mp_count; j++) {
587
			da = decode_ds_addr(server->nfs_client->cl_net,
588
					    &stream, gfp_flags);
589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609
			if (da)
				list_add_tail(&da->da_node, &dsaddrs);
		}
		if (list_empty(&dsaddrs)) {
			dprintk("%s: no suitable DS addresses found\n",
				__func__);
			goto out_err_free_deviceid;
		}

		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
		if (!dsaddr->ds_list[i])
			goto out_err_drain_dsaddrs;

		/* If DS was already in cache, free ds addrs */
		while (!list_empty(&dsaddrs)) {
			da = list_first_entry(&dsaddrs,
					      struct nfs4_pnfs_ds_addr,
					      da_node);
			list_del_init(&da->da_node);
			kfree(da->da_remotestr);
			kfree(da);
610 611
		}
	}
612 613

	__free_page(scratch);
614 615
	return dsaddr;

616 617 618 619 620 621 622 623
out_err_drain_dsaddrs:
	while (!list_empty(&dsaddrs)) {
		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
				      da_node);
		list_del_init(&da->da_node);
		kfree(da->da_remotestr);
		kfree(da);
	}
624
out_err_free_deviceid:
625
	nfs4_fl_free_deviceid(dsaddr);
626 627 628 629 630 631
	/* stripe_indicies was part of dsaddr */
	goto out_err_free_scratch;
out_err_free_stripe_indices:
	kfree(stripe_indices);
out_err_free_scratch:
	__free_page(scratch);
632 633 634 635 636
out_err:
	dprintk("%s ERROR: returning NULL\n", __func__);
	return NULL;
}

637 638
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
639
{
640
	nfs4_put_deviceid_node(&dsaddr->id_node);
641
}
F
Fred Isaman 已提交
642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683

/*
 * Map a file offset to a position in the stripe index table:
 *
 *   stripe = (offset - pattern_offset) / stripe_unit
 *   j      = (stripe + first_stripe_index) % stripe_count
 *
 * do_div() divides its first argument in place and returns the
 * remainder, which is why the last step is returned directly.
 */
u32
nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
	u64 stripe_no = offset - flseg->pattern_offset;

	do_div(stripe_no, flseg->stripe_unit);
	stripe_no += flseg->first_stripe_index;

	return do_div(stripe_no, flseg->dsaddr->stripe_count);
}

/* Look up entry @j of the segment's stripe index table. */
u32
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);

	return flseg->dsaddr->stripe_indices[j];
}

/*
 * Pick the file handle to use for stripe position @j.
 *
 * For layouts that are not STRIPE_SPARSE the handle array is indexed by
 * @j directly.  For sparse layouts: with no handles NULL is returned,
 * with exactly one handle that handle is used, otherwise the array is
 * indexed through the stripe index table.
 */
struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);

	if (flseg->stripe_type != STRIPE_SPARSE)
		return flseg->fh_array[j];

	/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
	if (flseg->num_fh == 0)
		return NULL;

	if (flseg->num_fh == 1)
		return flseg->fh_array[0];

	return flseg->fh_array[nfs4_fl_calc_ds_index(lseg, j)];
}

684 685 686
static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
{
	might_sleep();
687 688
	wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
			   nfs_wait_bit_killable, TASK_KILLABLE);
689 690 691 692
}

static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
{
693
	smp_mb__before_atomic();
694
	clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
695
	smp_mb__after_atomic();
696 697 698 699
	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
}


F
Fred Isaman 已提交
700 701 702 703 704
/*
 * Return the data server at @ds_idx of the segment's device, connecting
 * to it first if it has no client yet.
 *
 * Only the caller that wins the NFS4DS_CONNECTING bit performs the
 * connect; everyone else waits for that bit to clear.  A failed connect
 * marks the device id unavailable.
 *
 * Returns NULL when there is no data server at @ds_idx (the device id
 * is then marked invalid) or when the device id tests as unavailable.
 */
struct nfs4_pnfs_ds *
nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
{
	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);

	if (!ds) {
		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
			__func__, ds_idx);
		filelayout_mark_devid_invalid(devid);
		return NULL;
	}

	/* read barrier ahead of looking at ds_clp */
	smp_rmb();
	if (!ds->ds_clp) {
		if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0) {
			/* Either ds is connected, or ds is NULL */
			nfs4_wait_ds_connect(ds);
		} else {
			struct nfs_server *s;
			int err;

			s = NFS_SERVER(lseg->pls_layout->plh_inode);
			err = nfs4_ds_connect(s, ds);
			if (err)
				nfs4_mark_deviceid_unavailable(devid);
			nfs4_clear_ds_conn_bit(ds);
		}
	}

	if (filelayout_test_devid_unavailable(devid))
		return NULL;

	return ds;
}
736 737 738 739 740 741 742 743 744

/*
 * Expose the data server retransmit count and timeout as module
 * parameters with mode 0644.
 */
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
			"retries a request before it attempts further "
			" recovery  action.");
module_param(dataserver_timeo, uint, 0644);
MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
			"NFSv4.1  client  waits for a response from a "
			" data server before it retries an NFS request.");