filelayoutdev.c 20.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
/*
 *  Device operations for the pnfs nfs4 file layout driver.
 *
 *  Copyright (c) 2002
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *  Garth Goodson   <Garth.Goodson@netapp.com>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/vmalloc.h>
33
#include <linux/module.h>
34
#include <linux/sunrpc/addr.h>
35

36 37 38
#include "../internal.h"
#include "../nfs4session.h"
#include "filelayout.h"
39 40 41

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

42 43 44
/*
 * Tunables handed to nfs4_set_ds_client() when connecting to a data server.
 * Both are exposed as module parameters at the bottom of this file.
 */
static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;

/*
 * Data server cache
 *
 * Data servers can be mapped to different device ids.
 * nfs4_pnfs_ds reference counting
 *   - set to 1 on allocation
 *   - incremented when a device id maps a data server already in the cache.
 *   - decremented when deviceid is removed from the cache.
 */
/* Protects nfs4_data_server_cache. */
static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
/* List of every known nfs4_pnfs_ds, linked through ds_node. */
static LIST_HEAD(nfs4_data_server_cache);

/* Debug routines */
void
print_ds(struct nfs4_pnfs_ds *ds)
{
	if (ds == NULL) {
		printk("%s NULL device\n", __func__);
		return;
	}
W
Weston Andros Adamson 已提交
65
	printk("        ds %s\n"
66 67 68
		"        ref count %d\n"
		"        client %p\n"
		"        cl_exchange_flags %x\n",
W
Weston Andros Adamson 已提交
69
		ds->ds_remotestr,
70 71 72 73
		atomic_read(&ds->ds_count), ds->ds_clp,
		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
}

74 75
static bool
same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76
{
W
Weston Andros Adamson 已提交
77 78
	struct sockaddr_in *a, *b;
	struct sockaddr_in6 *a6, *b6;
79

80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97
	if (addr1->sa_family != addr2->sa_family)
		return false;

	switch (addr1->sa_family) {
	case AF_INET:
		a = (struct sockaddr_in *)addr1;
		b = (struct sockaddr_in *)addr2;

		if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
		    a->sin_port == b->sin_port)
			return true;
		break;

	case AF_INET6:
		a6 = (struct sockaddr_in6 *)addr1;
		b6 = (struct sockaddr_in6 *)addr2;

		/* LINKLOCAL addresses must have matching scope_id */
98
		if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
		    IPV6_ADDR_SCOPE_LINKLOCAL &&
		    a6->sin6_scope_id != b6->sin6_scope_id)
			return false;

		if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
		    a6->sin6_port == b6->sin6_port)
			return true;
		break;

	default:
		dprintk("%s: unhandled address family: %u\n",
			__func__, addr1->sa_family);
		return false;
	}

	return false;
}

117
static bool
118 119
_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
			       const struct list_head *dsaddrs2)
120 121 122
{
	struct nfs4_pnfs_ds_addr *da1, *da2;

123 124 125 126 127 128 129 130 131
	/* step through both lists, comparing as we go */
	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
	     da1 != NULL && da2 != NULL;
	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
				   (struct sockaddr *)&da2->da_addr))
			return false;
132
	}
133 134 135 136
	if (da1 == NULL && da2 == NULL)
		return true;

	return false;
137 138
}

139
/*
140
 * Lookup DS by addresses.  nfs4_ds_cache_lock is held
141
 */
142 143
static struct nfs4_pnfs_ds *
_data_server_lookup_locked(const struct list_head *dsaddrs)
144
{
145
	struct nfs4_pnfs_ds *ds;
146

147 148 149 150
	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
			return ds;
	return NULL;
151 152
}

A
Andy Adamson 已提交
153 154
/*
 * Create an rpc connection to the nfs4_pnfs_ds data server
W
Weston Andros Adamson 已提交
155
 * Currently only supports IPv4 and IPv6 addresses
A
Andy Adamson 已提交
156 157 158 159
 */
static int
nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
{
160
	struct nfs_client *clp = ERR_PTR(-EIO);
161
	struct nfs4_pnfs_ds_addr *da;
A
Andy Adamson 已提交
162 163
	int status = 0;

164
	dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
A
Andy Adamson 已提交
165 166
		mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);

167 168 169
	list_for_each_entry(da, &ds->ds_addrs, da_node) {
		dprintk("%s: DS %s: trying address %s\n",
			__func__, ds->ds_remotestr, da->da_remotestr);
170

171
		clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 173 174
					(struct sockaddr *)&da->da_addr,
					da->da_addrlen, IPPROTO_TCP,
					dataserver_timeo, dataserver_retrans);
175 176 177 178
		if (!IS_ERR(clp))
			break;
	}

A
Andy Adamson 已提交
179 180 181 182 183
	if (IS_ERR(clp)) {
		status = PTR_ERR(clp);
		goto out;
	}

184
	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
A
Andy Adamson 已提交
185 186 187
	if (status)
		goto out_put;

188
	smp_wmb();
A
Andy Adamson 已提交
189
	ds->ds_clp = clp;
W
Weston Andros Adamson 已提交
190
	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
A
Andy Adamson 已提交
191 192 193 194 195 196 197
out:
	return status;
out_put:
	nfs_put_client(clp);
	goto out;
}

198 199 200
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
201 202
	struct nfs4_pnfs_ds_addr *da;

203 204 205 206 207 208
	dprintk("--> %s\n", __func__);
	ifdebug(FACILITY)
		print_ds(ds);

	if (ds->ds_clp)
		nfs_put_client(ds->ds_clp);
209 210 211 212 213 214 215 216 217 218

	while (!list_empty(&ds->ds_addrs)) {
		da = list_first_entry(&ds->ds_addrs,
				      struct nfs4_pnfs_ds_addr,
				      da_node);
		list_del_init(&da->da_node);
		kfree(da->da_remotestr);
		kfree(da);
	}

W
Weston Andros Adamson 已提交
219
	kfree(ds->ds_remotestr);
220 221 222
	kfree(ds);
}

223
void
224 225 226 227 228
nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
	struct nfs4_pnfs_ds *ds;
	int i;

229
	nfs4_print_deviceid(&dsaddr->id_node.deviceid);
230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245

	for (i = 0; i < dsaddr->ds_num; i++) {
		ds = dsaddr->ds_list[i];
		if (ds != NULL) {
			if (atomic_dec_and_lock(&ds->ds_count,
						&nfs4_ds_cache_lock)) {
				list_del_init(&ds->ds_node);
				spin_unlock(&nfs4_ds_cache_lock);
				destroy_ds(ds);
			}
		}
	}
	kfree(dsaddr->stripe_indices);
	kfree(dsaddr);
}

W
Weston Andros Adamson 已提交
246 247 248 249 250
/*
 * Create a string with a human readable address and port to avoid
 * complicated setup around many dprinks.
 */
static char *
251
nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
W
Weston Andros Adamson 已提交
252
{
253
	struct nfs4_pnfs_ds_addr *da;
W
Weston Andros Adamson 已提交
254 255
	char *remotestr;
	size_t len;
256
	char *p;
W
Weston Andros Adamson 已提交
257

258 259 260
	len = 3;        /* '{', '}' and eol */
	list_for_each_entry(da, dsaddrs, da_node) {
		len += strlen(da->da_remotestr) + 1;    /* string plus comma */
W
Weston Andros Adamson 已提交
261 262
	}

263 264
	remotestr = kzalloc(len, gfp_flags);
	if (!remotestr)
W
Weston Andros Adamson 已提交
265 266
		return NULL;

267 268 269 270 271
	p = remotestr;
	*(p++) = '{';
	len--;
	list_for_each_entry(da, dsaddrs, da_node) {
		size_t ll = strlen(da->da_remotestr);
W
Weston Andros Adamson 已提交
272

273 274
		if (ll > len)
			goto out_err;
W
Weston Andros Adamson 已提交
275

276 277 278
		memcpy(p, da->da_remotestr, ll);
		p += ll;
		len -= ll;
W
Weston Andros Adamson 已提交
279

280 281 282 283 284 285 286 287 288
		if (len < 1)
			goto out_err;
		(*p++) = ',';
		len--;
	}
	if (len < 2)
		goto out_err;
	*(p++) = '}';
	*p = '\0';
W
Weston Andros Adamson 已提交
289
	return remotestr;
290 291 292
out_err:
	kfree(remotestr);
	return NULL;
W
Weston Andros Adamson 已提交
293 294
}

295
static struct nfs4_pnfs_ds *
296
nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
297
{
W
Weston Andros Adamson 已提交
298 299
	struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
	char *remotestr;
300

301 302 303 304 305 306
	if (list_empty(dsaddrs)) {
		dprintk("%s: no addresses defined\n", __func__);
		goto out;
	}

	ds = kzalloc(sizeof(*ds), gfp_flags);
307 308 309
	if (!ds)
		goto out;

W
Weston Andros Adamson 已提交
310
	/* this is only used for debugging, so it's ok if its NULL */
311
	remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
W
Weston Andros Adamson 已提交
312

313
	spin_lock(&nfs4_ds_cache_lock);
314
	tmp_ds = _data_server_lookup_locked(dsaddrs);
315
	if (tmp_ds == NULL) {
316 317
		INIT_LIST_HEAD(&ds->ds_addrs);
		list_splice_init(dsaddrs, &ds->ds_addrs);
W
Weston Andros Adamson 已提交
318
		ds->ds_remotestr = remotestr;
319 320 321 322
		atomic_set(&ds->ds_count, 1);
		INIT_LIST_HEAD(&ds->ds_node);
		ds->ds_clp = NULL;
		list_add(&ds->ds_node, &nfs4_data_server_cache);
W
Weston Andros Adamson 已提交
323 324
		dprintk("%s add new data server %s\n", __func__,
			ds->ds_remotestr);
325
	} else {
W
Weston Andros Adamson 已提交
326
		kfree(remotestr);
327 328
		kfree(ds);
		atomic_inc(&tmp_ds->ds_count);
W
Weston Andros Adamson 已提交
329 330
		dprintk("%s data server %s found, inc'ed ds_count to %d\n",
			__func__, tmp_ds->ds_remotestr,
331 332 333 334 335 336 337 338 339
			atomic_read(&tmp_ds->ds_count));
		ds = tmp_ds;
	}
	spin_unlock(&nfs4_ds_cache_lock);
out:
	return ds;
}

/*
W
Weston Andros Adamson 已提交
340
 * Currently only supports ipv4, ipv6 and one multi-path address.
341
 */
342
static struct nfs4_pnfs_ds_addr *
343
decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
344
{
345
	struct nfs4_pnfs_ds_addr *da = NULL;
W
Weston Andros Adamson 已提交
346
	char *buf, *portstr;
347
	__be16 port;
W
Weston Andros Adamson 已提交
348
	int nlen, rlen;
349
	int tmp[2];
350
	__be32 *p;
W
Weston Andros Adamson 已提交
351
	char *netid, *match_netid;
352 353 354 355
	size_t len, match_netid_len;
	char *startsep = "";
	char *endsep = "";

356 357

	/* r_netid */
358 359 360
	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
		goto out_err;
361 362
	nlen = be32_to_cpup(p++);

363 364 365
	p = xdr_inline_decode(streamp, nlen);
	if (unlikely(!p))
		goto out_err;
366

W
Weston Andros Adamson 已提交
367 368
	netid = kmalloc(nlen+1, gfp_flags);
	if (unlikely(!netid))
369 370
		goto out_err;

W
Weston Andros Adamson 已提交
371 372 373 374
	netid[nlen] = '\0';
	memcpy(netid, p, nlen);

	/* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
375 376
	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
W
Weston Andros Adamson 已提交
377
		goto out_free_netid;
378 379 380 381
	rlen = be32_to_cpup(p);

	p = xdr_inline_decode(streamp, rlen);
	if (unlikely(!p))
W
Weston Andros Adamson 已提交
382
		goto out_free_netid;
383

W
Weston Andros Adamson 已提交
384 385
	/* port is ".ABC.DEF", 8 chars max */
	if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
386
		dprintk("%s: Invalid address, length %d\n", __func__,
387
			rlen);
W
Weston Andros Adamson 已提交
388
		goto out_free_netid;
389
	}
390
	buf = kmalloc(rlen + 1, gfp_flags);
391 392
	if (!buf) {
		dprintk("%s: Not enough memory\n", __func__);
W
Weston Andros Adamson 已提交
393
		goto out_free_netid;
394
	}
395
	buf[rlen] = '\0';
396
	memcpy(buf, p, rlen);
397

W
Weston Andros Adamson 已提交
398 399 400 401 402 403 404 405 406 407 408 409 410 411 412
	/* replace port '.' with '-' */
	portstr = strrchr(buf, '.');
	if (!portstr) {
		dprintk("%s: Failed finding expected dot in port\n",
			__func__);
		goto out_free_buf;
	}
	*portstr = '-';

	/* find '.' between address and port */
	portstr = strrchr(buf, '.');
	if (!portstr) {
		dprintk("%s: Failed finding expected dot between address and "
			"port\n", __func__);
		goto out_free_buf;
413
	}
W
Weston Andros Adamson 已提交
414
	*portstr = '\0';
415

416 417
	da = kzalloc(sizeof(*da), gfp_flags);
	if (unlikely(!da))
W
Weston Andros Adamson 已提交
418
		goto out_free_buf;
419 420 421

	INIT_LIST_HEAD(&da->da_node);

422
	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
423 424 425
		      sizeof(da->da_addr))) {
		dprintk("%s: error parsing address %s\n", __func__, buf);
		goto out_free_da;
426 427
	}

W
Weston Andros Adamson 已提交
428 429
	portstr++;
	sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
430 431
	port = htons((tmp[0] << 8) | (tmp[1]));

432
	switch (da->da_addr.ss_family) {
W
Weston Andros Adamson 已提交
433
	case AF_INET:
434 435
		((struct sockaddr_in *)&da->da_addr)->sin_port = port;
		da->da_addrlen = sizeof(struct sockaddr_in);
W
Weston Andros Adamson 已提交
436 437 438 439 440
		match_netid = "tcp";
		match_netid_len = 3;
		break;

	case AF_INET6:
441 442
		((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
		da->da_addrlen = sizeof(struct sockaddr_in6);
W
Weston Andros Adamson 已提交
443 444
		match_netid = "tcp6";
		match_netid_len = 4;
445 446
		startsep = "[";
		endsep = "]";
W
Weston Andros Adamson 已提交
447 448 449 450
		break;

	default:
		dprintk("%s: unsupported address family: %u\n",
451 452
			__func__, da->da_addr.ss_family);
		goto out_free_da;
W
Weston Andros Adamson 已提交
453 454 455 456 457
	}

	if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
		dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
			__func__, netid, match_netid);
458
		goto out_free_da;
W
Weston Andros Adamson 已提交
459 460
	}

461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476
	/* save human readable address */
	len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
	da->da_remotestr = kzalloc(len, gfp_flags);

	/* NULL is ok, only used for dprintk */
	if (da->da_remotestr)
		snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
			 buf, endsep, ntohs(port));

	dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
	kfree(buf);
	kfree(netid);
	return da;

out_free_da:
	kfree(da);
W
Weston Andros Adamson 已提交
477
out_free_buf:
478
	dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
479
	kfree(buf);
W
Weston Andros Adamson 已提交
480 481
out_free_netid:
	kfree(netid);
482
out_err:
483
	return NULL;
484 485 486 487
}

/* Decode opaque device data and return the result */
static struct nfs4_file_layout_dsaddr*
488
decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
489
{
490
	int i;
491 492
	u32 cnt, num;
	u8 *indexp;
493 494 495 496 497
	__be32 *p;
	u8 *stripe_indices;
	u8 max_stripe_index;
	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
	struct xdr_stream stream;
498
	struct xdr_buf buf;
499
	struct page *scratch;
500 501
	struct list_head dsaddrs;
	struct nfs4_pnfs_ds_addr *da;
502 503

	/* set up xdr stream */
504
	scratch = alloc_page(gfp_flags);
505 506 507
	if (!scratch)
		goto out_err;

508
	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
509
	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
510 511

	/* Get the stripe count (number of stripe index) */
512 513 514 515 516
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_scratch;

	cnt = be32_to_cpup(p);
517 518
	dprintk("%s stripe count  %d\n", __func__, cnt);
	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
519
		printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
520 521
		       "supported maximum %d\n", __func__,
			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
522 523 524 525
		goto out_err_free_scratch;
	}

	/* read stripe indices */
526
	stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
527 528 529 530 531 532 533 534 535 536 537 538 539
	if (!stripe_indices)
		goto out_err_free_scratch;

	p = xdr_inline_decode(&stream, cnt << 2);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	indexp = &stripe_indices[0];
	max_stripe_index = 0;
	for (i = 0; i < cnt; i++) {
		*indexp = be32_to_cpup(p++);
		max_stripe_index = max(max_stripe_index, *indexp);
		indexp++;
540 541 542
	}

	/* Check the multipath list count */
543 544 545 546 547
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	num = be32_to_cpup(p);
548 549
	dprintk("%s ds_num %u\n", __func__, num);
	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
550
		printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
551 552
			"supported maximum %d\n", __func__,
			num, NFS4_PNFS_MAX_MULTI_CNT);
553
		goto out_err_free_stripe_indices;
554
	}
555 556 557

	/* validate stripe indices are all < num */
	if (max_stripe_index >= num) {
558
		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
559 560 561 562
			__func__, max_stripe_index, num);
		goto out_err_free_stripe_indices;
	}

563 564
	dsaddr = kzalloc(sizeof(*dsaddr) +
			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
565
			gfp_flags);
566
	if (!dsaddr)
567
		goto out_err_free_stripe_indices;
568 569

	dsaddr->stripe_count = cnt;
570 571
	dsaddr->stripe_indices = stripe_indices;
	stripe_indices = NULL;
572
	dsaddr->ds_num = num;
573 574 575
	nfs4_init_deviceid_node(&dsaddr->id_node,
				NFS_SERVER(ino)->pnfs_curr_ld,
				NFS_SERVER(ino)->nfs_client,
576
				&pdev->dev_id);
577

578 579
	INIT_LIST_HEAD(&dsaddrs);

580 581
	for (i = 0; i < dsaddr->ds_num; i++) {
		int j;
582 583 584 585 586
		u32 mp_count;

		p = xdr_inline_decode(&stream, 4);
		if (unlikely(!p))
			goto out_err_free_deviceid;
587

588 589
		mp_count = be32_to_cpup(p); /* multipath count */
		for (j = 0; j < mp_count; j++) {
590
			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
591
					    &stream, gfp_flags);
592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
			if (da)
				list_add_tail(&da->da_node, &dsaddrs);
		}
		if (list_empty(&dsaddrs)) {
			dprintk("%s: no suitable DS addresses found\n",
				__func__);
			goto out_err_free_deviceid;
		}

		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
		if (!dsaddr->ds_list[i])
			goto out_err_drain_dsaddrs;

		/* If DS was already in cache, free ds addrs */
		while (!list_empty(&dsaddrs)) {
			da = list_first_entry(&dsaddrs,
					      struct nfs4_pnfs_ds_addr,
					      da_node);
			list_del_init(&da->da_node);
			kfree(da->da_remotestr);
			kfree(da);
613 614
		}
	}
615 616

	__free_page(scratch);
617 618
	return dsaddr;

619 620 621 622 623 624 625 626
out_err_drain_dsaddrs:
	while (!list_empty(&dsaddrs)) {
		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
				      da_node);
		list_del_init(&da->da_node);
		kfree(da->da_remotestr);
		kfree(da);
	}
627
out_err_free_deviceid:
628
	nfs4_fl_free_deviceid(dsaddr);
629 630 631 632 633 634
	/* stripe_indicies was part of dsaddr */
	goto out_err_free_scratch;
out_err_free_stripe_indices:
	kfree(stripe_indices);
out_err_free_scratch:
	__free_page(scratch);
635 636 637 638 639 640
out_err:
	dprintk("%s ERROR: returning NULL\n", __func__);
	return NULL;
}

/*
641 642
 * Decode the opaque device specified in 'dev' and add it to the cache of
 * available devices.
643
 */
644
static struct nfs4_file_layout_dsaddr *
645
decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
646
{
647 648
	struct nfs4_deviceid_node *d;
	struct nfs4_file_layout_dsaddr *n, *new;
649

650
	new = decode_device(inode, dev, gfp_flags);
651
	if (!new) {
652
		printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
653 654 655 656
			__func__);
		return NULL;
	}

657 658 659
	d = nfs4_insert_deviceid_node(&new->id_node);
	n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
	if (n != new) {
660
		nfs4_fl_free_deviceid(new);
661
		return n;
662 663 664
	}

	return new;
665 666 667 668 669 670 671
}

/*
 * Retrieve the information for dev_id, add it to the list
 * of available devices, and return it.
 */
struct nfs4_file_layout_dsaddr *
672 673 674 675
filelayout_get_device_info(struct inode *inode,
		struct nfs4_deviceid *dev_id,
		struct rpc_cred *cred,
		gfp_t gfp_flags)
676 677 678 679 680 681 682 683 684 685 686 687 688 689
{
	struct pnfs_device *pdev = NULL;
	u32 max_resp_sz;
	int max_pages;
	struct page **pages = NULL;
	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
	int rc, i;
	struct nfs_server *server = NFS_SERVER(inode);

	/*
	 * Use the session max response size as the basis for setting
	 * GETDEVICEINFO's maxcount
	 */
	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
690
	max_pages = nfs_page_array_len(0, max_resp_sz);
691 692 693
	dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
		__func__, inode, max_resp_sz, max_pages);

694
	pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
695 696 697
	if (pdev == NULL)
		return NULL;

698
	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
699 700 701 702 703
	if (pages == NULL) {
		kfree(pdev);
		return NULL;
	}
	for (i = 0; i < max_pages; i++) {
704
		pages[i] = alloc_page(gfp_flags);
705 706 707 708 709 710 711 712
		if (!pages[i])
			goto out_free;
	}

	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
	pdev->layout_type = LAYOUT_NFSV4_1_FILES;
	pdev->pages = pages;
	pdev->pgbase = 0;
713
	pdev->pglen = max_resp_sz;
714
	pdev->mincount = 0;
715
	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
716

717
	rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
718 719 720 721 722 723 724 725
	dprintk("%s getdevice info returns %d\n", __func__, rc);
	if (rc)
		goto out_free;

	/*
	 * Found new device, need to decode it and then add it to the
	 * list of known devices for this mountpoint.
	 */
726
	dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
727 728 729 730 731 732 733 734 735
out_free:
	for (i = 0; i < max_pages; i++)
		__free_page(pages[i]);
	kfree(pages);
	kfree(pdev);
	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
	return dsaddr;
}

736 737
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
738
{
739
	nfs4_put_deviceid_node(&dsaddr->id_node);
740
}
F
Fred Isaman 已提交
741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782

/*
 * Map a file offset to its index into the stripe index table:
 *
 *   res = (offset - pattern_offset) / stripe_unit
 *   j   = (res + first_stripe_index) % dsaddr->stripe_count
 */
u32
nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
	u64 stripe_no = offset - flseg->pattern_offset;
	u32 j;

	/* do_div() divides its first argument in place ... */
	do_div(stripe_no, flseg->stripe_unit);
	stripe_no += flseg->first_stripe_index;
	/* ... and evaluates to the remainder, which is the index we want. */
	j = do_div(stripe_no, flseg->dsaddr->stripe_count);

	return j;
}

/* Look up entry @j of the layout's stripe index table. */
u32
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);

	return flseg->dsaddr->stripe_indices[j];
}

/*
 * Pick the file handle to use for stripe index @j.
 *
 * For STRIPE_SPARSE layouts: no handles means NULL (the caller falls back
 * to the MDS OPEN fh), a single handle is always used, otherwise the handle
 * is chosen by data server index.  For other stripe types the handle is
 * chosen by @j directly.
 */
struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
{
	struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
	u32 fh_idx = j;

	if (flseg->stripe_type == STRIPE_SPARSE) {
		switch (flseg->num_fh) {
		case 0:
			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
			return NULL;
		case 1:
			fh_idx = 0;
			break;
		default:
			fh_idx = nfs4_fl_calc_ds_index(lseg, j);
			break;
		}
	}

	return flseg->fh_array[fh_idx];
}

783 784 785
static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
{
	might_sleep();
786 787
	wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
			   nfs_wait_bit_killable, TASK_KILLABLE);
788 789 790 791
}

static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
{
792
	smp_mb__before_atomic();
793
	clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
794
	smp_mb__after_atomic();
795 796 797 798
	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
}


F
Fred Isaman 已提交
799 800 801 802 803
/*
 * Return the data server at @ds_idx for @lseg, connecting to it first if it
 * has no client yet.
 *
 * Only one caller performs the connect (the one that sets
 * NFS4DS_CONNECTING); the others wait for that bit to clear.  Returns NULL
 * if there is no data server at @ds_idx (the device id is then marked
 * invalid) or if the device id tests as unavailable afterwards.
 */
struct nfs4_pnfs_ds *
nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
{
	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);

	if (ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
			__func__, ds_idx);
		filelayout_mark_devid_invalid(devid);
		return NULL;
	}

	/* Presumably pairs with the smp_wmb() in nfs4_ds_connect(). */
	smp_rmb();
	if (!ds->ds_clp) {
		if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0) {
			/* Either ds is connected, or ds is NULL */
			nfs4_wait_ds_connect(ds);
		} else {
			struct nfs_server *server =
				NFS_SERVER(lseg->pls_layout->plh_inode);
			int err = nfs4_ds_connect(server, ds);

			if (err)
				nfs4_mark_deviceid_unavailable(devid);
			nfs4_clear_ds_conn_bit(ds);
		}
	}

	return filelayout_test_devid_unavailable(devid) ? NULL : ds;
}
835 836 837 838 839 840 841 842 843

/*
 * Module parameters (mode 0644).  Both values are passed to
 * nfs4_set_ds_client() by nfs4_ds_connect() each time a data server
 * connection is attempted.
 */

/* Retransmission count used for data server connections. */
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
			"retries a request before it attempts further "
			" recovery  action.");
/* Timeout used for data server connections, in tenths of a second. */
module_param(dataserver_timeo, uint, 0644);
MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
			"NFSv4.1  client  waits for a response from a "
			" data server before it retries an NFS request.");