/*
 *  Device operations for the pnfs nfs4 file layout driver.
 *
 *  Copyright (c) 2002
 *  The Regents of the University of Michigan
 *  All Rights Reserved
 *
 *  Dean Hildebrand <dhildebz@umich.edu>
 *  Garth Goodson   <Garth.Goodson@netapp.com>
 *
 *  Permission is granted to use, copy, create derivative works, and
 *  redistribute this software and such derivative works for any purpose,
 *  so long as the name of the University of Michigan is not used in
 *  any advertising or publicity pertaining to the use or distribution
 *  of this software without specific, written prior authorization. If
 *  the above copyright notice or any other identification of the
 *  University of Michigan is included in any copy of any portion of
 *  this software, then the disclaimer below must also be included.
 *
 *  This software is provided as is, without representation or warranty
 *  of any kind either express or implied, including without limitation
 *  the implied warranties of merchantability, fitness for a particular
 *  purpose, or noninfringement.  The Regents of the University of
 *  Michigan shall not be liable for any damages, including special,
 *  indirect, incidental, or consequential damages, with respect to any
 *  claim arising out of or in connection with the use of the software,
 *  even if it has been or is hereafter advised of the possibility of
 *  such damages.
 */

#include <linux/nfs_fs.h>
#include <linux/vmalloc.h>
33
#include <linux/module.h>
34
#include <linux/sunrpc/addr.h>
35 36

#include "internal.h"
37
#include "nfs4session.h"
38 39 40 41
#include "nfs4filelayout.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

42 43 44
static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;

45 46 47 48 49 50 51 52 53
/*
 * Data server cache
 *
 * Data servers can be mapped to different device ids.
 * nfs4_pnfs_ds reference counting
 *   - set to 1 on allocation
 *   - incremented when a device id maps a data server already in the cache.
 *   - decremented when deviceid is removed from the cache.
 */
54
static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
55 56 57 58 59 60 61 62 63 64
static LIST_HEAD(nfs4_data_server_cache);

/* Debug routines */
void
print_ds(struct nfs4_pnfs_ds *ds)
{
	if (ds == NULL) {
		printk("%s NULL device\n", __func__);
		return;
	}
W
Weston Andros Adamson 已提交
65
	printk("        ds %s\n"
66 67 68
		"        ref count %d\n"
		"        client %p\n"
		"        cl_exchange_flags %x\n",
W
Weston Andros Adamson 已提交
69
		ds->ds_remotestr,
70 71 72 73
		atomic_read(&ds->ds_count), ds->ds_clp,
		ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
}

74 75
static bool
same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76
{
W
Weston Andros Adamson 已提交
77 78
	struct sockaddr_in *a, *b;
	struct sockaddr_in6 *a6, *b6;
79

80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
	if (addr1->sa_family != addr2->sa_family)
		return false;

	switch (addr1->sa_family) {
	case AF_INET:
		a = (struct sockaddr_in *)addr1;
		b = (struct sockaddr_in *)addr2;

		if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
		    a->sin_port == b->sin_port)
			return true;
		break;

	case AF_INET6:
		a6 = (struct sockaddr_in6 *)addr1;
		b6 = (struct sockaddr_in6 *)addr2;

		/* LINKLOCAL addresses must have matching scope_id */
		if (ipv6_addr_scope(&a6->sin6_addr) ==
		    IPV6_ADDR_SCOPE_LINKLOCAL &&
		    a6->sin6_scope_id != b6->sin6_scope_id)
			return false;

		if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
		    a6->sin6_port == b6->sin6_port)
			return true;
		break;

	default:
		dprintk("%s: unhandled address family: %u\n",
			__func__, addr1->sa_family);
		return false;
	}

	return false;
}

117
static bool
118 119
_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
			       const struct list_head *dsaddrs2)
120 121 122
{
	struct nfs4_pnfs_ds_addr *da1, *da2;

123 124 125 126 127 128 129 130 131
	/* step through both lists, comparing as we go */
	for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
	     da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
	     da1 != NULL && da2 != NULL;
	     da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
	     da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
		if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
				   (struct sockaddr *)&da2->da_addr))
			return false;
132
	}
133 134 135 136
	if (da1 == NULL && da2 == NULL)
		return true;

	return false;
137 138
}

139
/*
140
 * Lookup DS by addresses.  nfs4_ds_cache_lock is held
141
 */
142 143
static struct nfs4_pnfs_ds *
_data_server_lookup_locked(const struct list_head *dsaddrs)
144
{
145
	struct nfs4_pnfs_ds *ds;
146

147 148 149 150
	list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
		if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
			return ds;
	return NULL;
151 152
}

A
Andy Adamson 已提交
153 154
/*
 * Create an rpc connection to the nfs4_pnfs_ds data server
W
Weston Andros Adamson 已提交
155
 * Currently only supports IPv4 and IPv6 addresses
A
Andy Adamson 已提交
156 157 158 159
 */
static int
nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
{
160
	struct nfs_client *clp = ERR_PTR(-EIO);
161
	struct nfs4_pnfs_ds_addr *da;
A
Andy Adamson 已提交
162 163
	int status = 0;

164
	dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
A
Andy Adamson 已提交
165 166
		mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);

167 168 169
	list_for_each_entry(da, &ds->ds_addrs, da_node) {
		dprintk("%s: DS %s: trying address %s\n",
			__func__, ds->ds_remotestr, da->da_remotestr);
170

171
		clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 173 174
					(struct sockaddr *)&da->da_addr,
					da->da_addrlen, IPPROTO_TCP,
					dataserver_timeo, dataserver_retrans);
175 176 177 178
		if (!IS_ERR(clp))
			break;
	}

A
Andy Adamson 已提交
179 180 181 182 183
	if (IS_ERR(clp)) {
		status = PTR_ERR(clp);
		goto out;
	}

184
	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
A
Andy Adamson 已提交
185 186 187 188
	if (status)
		goto out_put;

	ds->ds_clp = clp;
W
Weston Andros Adamson 已提交
189
	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
A
Andy Adamson 已提交
190 191 192 193 194 195 196
out:
	return status;
out_put:
	nfs_put_client(clp);
	goto out;
}

197 198 199
static void
destroy_ds(struct nfs4_pnfs_ds *ds)
{
200 201
	struct nfs4_pnfs_ds_addr *da;

202 203 204 205 206 207
	dprintk("--> %s\n", __func__);
	ifdebug(FACILITY)
		print_ds(ds);

	if (ds->ds_clp)
		nfs_put_client(ds->ds_clp);
208 209 210 211 212 213 214 215 216 217

	while (!list_empty(&ds->ds_addrs)) {
		da = list_first_entry(&ds->ds_addrs,
				      struct nfs4_pnfs_ds_addr,
				      da_node);
		list_del_init(&da->da_node);
		kfree(da->da_remotestr);
		kfree(da);
	}

W
Weston Andros Adamson 已提交
218
	kfree(ds->ds_remotestr);
219 220 221
	kfree(ds);
}

222
void
223 224 225 226 227
nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
	struct nfs4_pnfs_ds *ds;
	int i;

228
	nfs4_print_deviceid(&dsaddr->id_node.deviceid);
229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244

	for (i = 0; i < dsaddr->ds_num; i++) {
		ds = dsaddr->ds_list[i];
		if (ds != NULL) {
			if (atomic_dec_and_lock(&ds->ds_count,
						&nfs4_ds_cache_lock)) {
				list_del_init(&ds->ds_node);
				spin_unlock(&nfs4_ds_cache_lock);
				destroy_ds(ds);
			}
		}
	}
	kfree(dsaddr->stripe_indices);
	kfree(dsaddr);
}

W
Weston Andros Adamson 已提交
245 246 247 248 249
/*
 * Create a string with a human readable address and port to avoid
 * complicated setup around many dprinks.
 */
static char *
250
nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
W
Weston Andros Adamson 已提交
251
{
252
	struct nfs4_pnfs_ds_addr *da;
W
Weston Andros Adamson 已提交
253 254
	char *remotestr;
	size_t len;
255
	char *p;
W
Weston Andros Adamson 已提交
256

257 258 259
	len = 3;        /* '{', '}' and eol */
	list_for_each_entry(da, dsaddrs, da_node) {
		len += strlen(da->da_remotestr) + 1;    /* string plus comma */
W
Weston Andros Adamson 已提交
260 261
	}

262 263
	remotestr = kzalloc(len, gfp_flags);
	if (!remotestr)
W
Weston Andros Adamson 已提交
264 265
		return NULL;

266 267 268 269 270
	p = remotestr;
	*(p++) = '{';
	len--;
	list_for_each_entry(da, dsaddrs, da_node) {
		size_t ll = strlen(da->da_remotestr);
W
Weston Andros Adamson 已提交
271

272 273
		if (ll > len)
			goto out_err;
W
Weston Andros Adamson 已提交
274

275 276 277
		memcpy(p, da->da_remotestr, ll);
		p += ll;
		len -= ll;
W
Weston Andros Adamson 已提交
278

279 280 281 282 283 284 285 286 287
		if (len < 1)
			goto out_err;
		(*p++) = ',';
		len--;
	}
	if (len < 2)
		goto out_err;
	*(p++) = '}';
	*p = '\0';
W
Weston Andros Adamson 已提交
288
	return remotestr;
289 290 291
out_err:
	kfree(remotestr);
	return NULL;
W
Weston Andros Adamson 已提交
292 293
}

294
static struct nfs4_pnfs_ds *
295
nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
296
{
W
Weston Andros Adamson 已提交
297 298
	struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
	char *remotestr;
299

300 301 302 303 304 305
	if (list_empty(dsaddrs)) {
		dprintk("%s: no addresses defined\n", __func__);
		goto out;
	}

	ds = kzalloc(sizeof(*ds), gfp_flags);
306 307 308
	if (!ds)
		goto out;

W
Weston Andros Adamson 已提交
309
	/* this is only used for debugging, so it's ok if its NULL */
310
	remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
W
Weston Andros Adamson 已提交
311

312
	spin_lock(&nfs4_ds_cache_lock);
313
	tmp_ds = _data_server_lookup_locked(dsaddrs);
314
	if (tmp_ds == NULL) {
315 316
		INIT_LIST_HEAD(&ds->ds_addrs);
		list_splice_init(dsaddrs, &ds->ds_addrs);
W
Weston Andros Adamson 已提交
317
		ds->ds_remotestr = remotestr;
318 319 320 321
		atomic_set(&ds->ds_count, 1);
		INIT_LIST_HEAD(&ds->ds_node);
		ds->ds_clp = NULL;
		list_add(&ds->ds_node, &nfs4_data_server_cache);
W
Weston Andros Adamson 已提交
322 323
		dprintk("%s add new data server %s\n", __func__,
			ds->ds_remotestr);
324
	} else {
W
Weston Andros Adamson 已提交
325
		kfree(remotestr);
326 327
		kfree(ds);
		atomic_inc(&tmp_ds->ds_count);
W
Weston Andros Adamson 已提交
328 329
		dprintk("%s data server %s found, inc'ed ds_count to %d\n",
			__func__, tmp_ds->ds_remotestr,
330 331 332 333 334 335 336 337 338
			atomic_read(&tmp_ds->ds_count));
		ds = tmp_ds;
	}
	spin_unlock(&nfs4_ds_cache_lock);
out:
	return ds;
}

/*
W
Weston Andros Adamson 已提交
339
 * Currently only supports ipv4, ipv6 and one multi-path address.
340
 */
341
static struct nfs4_pnfs_ds_addr *
342
decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
343
{
344
	struct nfs4_pnfs_ds_addr *da = NULL;
W
Weston Andros Adamson 已提交
345
	char *buf, *portstr;
346
	__be16 port;
W
Weston Andros Adamson 已提交
347
	int nlen, rlen;
348
	int tmp[2];
349
	__be32 *p;
W
Weston Andros Adamson 已提交
350
	char *netid, *match_netid;
351 352 353 354
	size_t len, match_netid_len;
	char *startsep = "";
	char *endsep = "";

355 356

	/* r_netid */
357 358 359
	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
		goto out_err;
360 361
	nlen = be32_to_cpup(p++);

362 363 364
	p = xdr_inline_decode(streamp, nlen);
	if (unlikely(!p))
		goto out_err;
365

W
Weston Andros Adamson 已提交
366 367
	netid = kmalloc(nlen+1, gfp_flags);
	if (unlikely(!netid))
368 369
		goto out_err;

W
Weston Andros Adamson 已提交
370 371 372 373
	netid[nlen] = '\0';
	memcpy(netid, p, nlen);

	/* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
374 375
	p = xdr_inline_decode(streamp, 4);
	if (unlikely(!p))
W
Weston Andros Adamson 已提交
376
		goto out_free_netid;
377 378 379 380
	rlen = be32_to_cpup(p);

	p = xdr_inline_decode(streamp, rlen);
	if (unlikely(!p))
W
Weston Andros Adamson 已提交
381
		goto out_free_netid;
382

W
Weston Andros Adamson 已提交
383 384
	/* port is ".ABC.DEF", 8 chars max */
	if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
385
		dprintk("%s: Invalid address, length %d\n", __func__,
386
			rlen);
W
Weston Andros Adamson 已提交
387
		goto out_free_netid;
388
	}
389
	buf = kmalloc(rlen + 1, gfp_flags);
390 391
	if (!buf) {
		dprintk("%s: Not enough memory\n", __func__);
W
Weston Andros Adamson 已提交
392
		goto out_free_netid;
393
	}
394
	buf[rlen] = '\0';
395
	memcpy(buf, p, rlen);
396

W
Weston Andros Adamson 已提交
397 398 399 400 401 402 403 404 405 406 407 408 409 410 411
	/* replace port '.' with '-' */
	portstr = strrchr(buf, '.');
	if (!portstr) {
		dprintk("%s: Failed finding expected dot in port\n",
			__func__);
		goto out_free_buf;
	}
	*portstr = '-';

	/* find '.' between address and port */
	portstr = strrchr(buf, '.');
	if (!portstr) {
		dprintk("%s: Failed finding expected dot between address and "
			"port\n", __func__);
		goto out_free_buf;
412
	}
W
Weston Andros Adamson 已提交
413
	*portstr = '\0';
414

415 416
	da = kzalloc(sizeof(*da), gfp_flags);
	if (unlikely(!da))
W
Weston Andros Adamson 已提交
417
		goto out_free_buf;
418 419 420

	INIT_LIST_HEAD(&da->da_node);

421
	if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
422 423 424
		      sizeof(da->da_addr))) {
		dprintk("%s: error parsing address %s\n", __func__, buf);
		goto out_free_da;
425 426
	}

W
Weston Andros Adamson 已提交
427 428
	portstr++;
	sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
429 430
	port = htons((tmp[0] << 8) | (tmp[1]));

431
	switch (da->da_addr.ss_family) {
W
Weston Andros Adamson 已提交
432
	case AF_INET:
433 434
		((struct sockaddr_in *)&da->da_addr)->sin_port = port;
		da->da_addrlen = sizeof(struct sockaddr_in);
W
Weston Andros Adamson 已提交
435 436 437 438 439
		match_netid = "tcp";
		match_netid_len = 3;
		break;

	case AF_INET6:
440 441
		((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
		da->da_addrlen = sizeof(struct sockaddr_in6);
W
Weston Andros Adamson 已提交
442 443
		match_netid = "tcp6";
		match_netid_len = 4;
444 445
		startsep = "[";
		endsep = "]";
W
Weston Andros Adamson 已提交
446 447 448 449
		break;

	default:
		dprintk("%s: unsupported address family: %u\n",
450 451
			__func__, da->da_addr.ss_family);
		goto out_free_da;
W
Weston Andros Adamson 已提交
452 453 454 455 456
	}

	if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
		dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
			__func__, netid, match_netid);
457
		goto out_free_da;
W
Weston Andros Adamson 已提交
458 459
	}

460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475
	/* save human readable address */
	len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
	da->da_remotestr = kzalloc(len, gfp_flags);

	/* NULL is ok, only used for dprintk */
	if (da->da_remotestr)
		snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
			 buf, endsep, ntohs(port));

	dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
	kfree(buf);
	kfree(netid);
	return da;

out_free_da:
	kfree(da);
W
Weston Andros Adamson 已提交
476
out_free_buf:
477
	dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
478
	kfree(buf);
W
Weston Andros Adamson 已提交
479 480
out_free_netid:
	kfree(netid);
481
out_err:
482
	return NULL;
483 484 485 486
}

/* Decode opaque device data and return the result */
static struct nfs4_file_layout_dsaddr*
487
decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
488
{
489
	int i;
490 491
	u32 cnt, num;
	u8 *indexp;
492 493 494 495 496
	__be32 *p;
	u8 *stripe_indices;
	u8 max_stripe_index;
	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
	struct xdr_stream stream;
497
	struct xdr_buf buf;
498
	struct page *scratch;
499 500
	struct list_head dsaddrs;
	struct nfs4_pnfs_ds_addr *da;
501 502

	/* set up xdr stream */
503
	scratch = alloc_page(gfp_flags);
504 505 506
	if (!scratch)
		goto out_err;

507
	xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
508
	xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
509 510

	/* Get the stripe count (number of stripe index) */
511 512 513 514 515
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_scratch;

	cnt = be32_to_cpup(p);
516 517
	dprintk("%s stripe count  %d\n", __func__, cnt);
	if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
518
		printk(KERN_WARNING "NFS: %s: stripe count %d greater than "
519 520
		       "supported maximum %d\n", __func__,
			cnt, NFS4_PNFS_MAX_STRIPE_CNT);
521 522 523 524
		goto out_err_free_scratch;
	}

	/* read stripe indices */
525
	stripe_indices = kcalloc(cnt, sizeof(u8), gfp_flags);
526 527 528 529 530 531 532 533 534 535 536 537 538
	if (!stripe_indices)
		goto out_err_free_scratch;

	p = xdr_inline_decode(&stream, cnt << 2);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	indexp = &stripe_indices[0];
	max_stripe_index = 0;
	for (i = 0; i < cnt; i++) {
		*indexp = be32_to_cpup(p++);
		max_stripe_index = max(max_stripe_index, *indexp);
		indexp++;
539 540 541
	}

	/* Check the multipath list count */
542 543 544 545 546
	p = xdr_inline_decode(&stream, 4);
	if (unlikely(!p))
		goto out_err_free_stripe_indices;

	num = be32_to_cpup(p);
547 548
	dprintk("%s ds_num %u\n", __func__, num);
	if (num > NFS4_PNFS_MAX_MULTI_CNT) {
549
		printk(KERN_WARNING "NFS: %s: multipath count %d greater than "
550 551
			"supported maximum %d\n", __func__,
			num, NFS4_PNFS_MAX_MULTI_CNT);
552
		goto out_err_free_stripe_indices;
553
	}
554 555 556

	/* validate stripe indices are all < num */
	if (max_stripe_index >= num) {
557
		printk(KERN_WARNING "NFS: %s: stripe index %u >= num ds %u\n",
558 559 560 561
			__func__, max_stripe_index, num);
		goto out_err_free_stripe_indices;
	}

562 563
	dsaddr = kzalloc(sizeof(*dsaddr) +
			(sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
564
			gfp_flags);
565
	if (!dsaddr)
566
		goto out_err_free_stripe_indices;
567 568

	dsaddr->stripe_count = cnt;
569 570
	dsaddr->stripe_indices = stripe_indices;
	stripe_indices = NULL;
571
	dsaddr->ds_num = num;
572 573 574
	nfs4_init_deviceid_node(&dsaddr->id_node,
				NFS_SERVER(ino)->pnfs_curr_ld,
				NFS_SERVER(ino)->nfs_client,
575
				&pdev->dev_id);
576

577 578
	INIT_LIST_HEAD(&dsaddrs);

579 580
	for (i = 0; i < dsaddr->ds_num; i++) {
		int j;
581 582 583 584 585
		u32 mp_count;

		p = xdr_inline_decode(&stream, 4);
		if (unlikely(!p))
			goto out_err_free_deviceid;
586

587 588
		mp_count = be32_to_cpup(p); /* multipath count */
		for (j = 0; j < mp_count; j++) {
589
			da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net,
590
					    &stream, gfp_flags);
591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611
			if (da)
				list_add_tail(&da->da_node, &dsaddrs);
		}
		if (list_empty(&dsaddrs)) {
			dprintk("%s: no suitable DS addresses found\n",
				__func__);
			goto out_err_free_deviceid;
		}

		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
		if (!dsaddr->ds_list[i])
			goto out_err_drain_dsaddrs;

		/* If DS was already in cache, free ds addrs */
		while (!list_empty(&dsaddrs)) {
			da = list_first_entry(&dsaddrs,
					      struct nfs4_pnfs_ds_addr,
					      da_node);
			list_del_init(&da->da_node);
			kfree(da->da_remotestr);
			kfree(da);
612 613
		}
	}
614 615

	__free_page(scratch);
616 617
	return dsaddr;

618 619 620 621 622 623 624 625
out_err_drain_dsaddrs:
	while (!list_empty(&dsaddrs)) {
		da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
				      da_node);
		list_del_init(&da->da_node);
		kfree(da->da_remotestr);
		kfree(da);
	}
626
out_err_free_deviceid:
627
	nfs4_fl_free_deviceid(dsaddr);
628 629 630 631 632 633
	/* stripe_indicies was part of dsaddr */
	goto out_err_free_scratch;
out_err_free_stripe_indices:
	kfree(stripe_indices);
out_err_free_scratch:
	__free_page(scratch);
634 635 636 637 638 639
out_err:
	dprintk("%s ERROR: returning NULL\n", __func__);
	return NULL;
}

/*
640 641
 * Decode the opaque device specified in 'dev' and add it to the cache of
 * available devices.
642
 */
643
static struct nfs4_file_layout_dsaddr *
644
decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
645
{
646 647
	struct nfs4_deviceid_node *d;
	struct nfs4_file_layout_dsaddr *n, *new;
648

649
	new = decode_device(inode, dev, gfp_flags);
650
	if (!new) {
651
		printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
652 653 654 655
			__func__);
		return NULL;
	}

656 657 658
	d = nfs4_insert_deviceid_node(&new->id_node);
	n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
	if (n != new) {
659
		nfs4_fl_free_deviceid(new);
660
		return n;
661 662 663
	}

	return new;
664 665 666 667 668 669 670
}

/*
 * Retrieve the information for dev_id, add it to the list
 * of available devices, and return it.
 */
struct nfs4_file_layout_dsaddr *
671
filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, gfp_t gfp_flags)
672 673 674 675 676 677 678 679 680 681 682 683 684 685
{
	struct pnfs_device *pdev = NULL;
	u32 max_resp_sz;
	int max_pages;
	struct page **pages = NULL;
	struct nfs4_file_layout_dsaddr *dsaddr = NULL;
	int rc, i;
	struct nfs_server *server = NFS_SERVER(inode);

	/*
	 * Use the session max response size as the basis for setting
	 * GETDEVICEINFO's maxcount
	 */
	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
686
	max_pages = nfs_page_array_len(0, max_resp_sz);
687 688 689
	dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
		__func__, inode, max_resp_sz, max_pages);

690
	pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
691 692 693
	if (pdev == NULL)
		return NULL;

694
	pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
695 696 697 698 699
	if (pages == NULL) {
		kfree(pdev);
		return NULL;
	}
	for (i = 0; i < max_pages; i++) {
700
		pages[i] = alloc_page(gfp_flags);
701 702 703 704 705 706 707 708
		if (!pages[i])
			goto out_free;
	}

	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
	pdev->layout_type = LAYOUT_NFSV4_1_FILES;
	pdev->pages = pages;
	pdev->pgbase = 0;
709
	pdev->pglen = max_resp_sz;
710 711 712 713 714 715 716 717 718 719 720
	pdev->mincount = 0;

	rc = nfs4_proc_getdeviceinfo(server, pdev);
	dprintk("%s getdevice info returns %d\n", __func__, rc);
	if (rc)
		goto out_free;

	/*
	 * Found new device, need to decode it and then add it to the
	 * list of known devices for this mountpoint.
	 */
721
	dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
722 723 724 725 726 727 728 729 730
out_free:
	for (i = 0; i < max_pages; i++)
		__free_page(pages[i]);
	kfree(pages);
	kfree(pdev);
	dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
	return dsaddr;
}

731 732
void
nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
733
{
734
	nfs4_put_deviceid_node(&dsaddr->id_node);
735
}
F
Fred Isaman 已提交
736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777

/*
 * Map a file offset to its position j in the stripe index table:
 *
 *   stripe = (offset - pattern_offset) / stripe_unit
 *   j      = (stripe + first_stripe_index) % stripe_count
 *
 * do_div() is used for both 64-bit steps; it leaves the quotient in its
 * first argument and evaluates to the remainder.
 */
u32
nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
{
	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
	u64 stripe = offset - fl->pattern_offset;

	do_div(stripe, fl->stripe_unit);
	stripe += fl->first_stripe_index;

	return do_div(stripe, fl->dsaddr->stripe_count);
}

/* Return entry @j of the stripe index table of the segment's device. */
u32
nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
{
	struct nfs4_file_layout_dsaddr *dev = FILELAYOUT_LSEG(lseg)->dsaddr;

	return dev->stripe_indices[j];
}

/*
 * Pick the file handle to use for stripe position @j.
 *
 * For any stripe type other than STRIPE_SPARSE the handle is
 * fh_array[j].  For STRIPE_SPARSE it depends on how many handles the
 * layout carries: none returns NULL, exactly one always uses
 * fh_array[0], and otherwise the handle is indexed by the data server
 * index for @j.
 */
struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
{
	struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
	u32 fh_idx = j;

	if (fl->stripe_type == STRIPE_SPARSE) {
		switch (fl->num_fh) {
		case 0:
			/* Use the MDS OPEN fh set in nfs_read_rpcsetup */
			return NULL;
		case 1:
			fh_idx = 0;
			break;
		default:
			fh_idx = nfs4_fl_calc_ds_index(lseg, j);
			break;
		}
	}

	return fl->fh_array[fh_idx];
}

778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793
/*
 * Wait on the NFS4DS_CONNECTING bit of ds->ds_state, i.e. for another
 * task's connection attempt on this data server.  May sleep.
 *
 * The result of wait_on_bit() is discarded, so the caller cannot tell
 * from this function how the wait ended.
 * NOTE(review): with TASK_KILLABLE the wait presumably can end early on
 * a fatal signal while the bit is still set - confirm.
 */
static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
{
	might_sleep();
	wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
			nfs_wait_bit_killable, TASK_KILLABLE);
}

/*
 * Clear the NFS4DS_CONNECTING bit in ds->ds_state and wake any task
 * waiting on it (see nfs4_wait_ds_connect()).  @ds must not be NULL.
 */
static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
{
	/* barriers on both sides of the clear, before waking waiters */
	smp_mb__before_clear_bit();
	clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
	smp_mb__after_clear_bit();
	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
}


F
Fred Isaman 已提交
794 795 796 797 798
/*
 * Return the data server at @ds_idx for @lseg, connecting to it first if
 * it has no client yet.
 *
 * Only the task that sets NFS4DS_CONNECTING performs the connect; other
 * callers wait for that bit to clear.
 *
 * Returns NULL when the device is unavailable, when the slot holds no
 * data server (the device is then marked invalid), or when the connect
 * fails (the device is then marked unavailable).
 */
struct nfs4_pnfs_ds *
nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
{
	struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
	struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
	struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);

	if (filelayout_test_devid_unavailable(devid))
		return NULL;

	if (ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
			__func__, ds_idx);
		filelayout_mark_devid_invalid(devid);
		return NULL;
	}
	if (ds->ds_clp)
		return ds;

	if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
		struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
		int err;

		err = nfs4_ds_connect(s, ds);
		if (err)
			nfs4_mark_deviceid_unavailable(devid);
		/*
		 * Clear the bit on the real ds before giving up on it: the
		 * helper dereferences its argument, so it must never be
		 * handed NULL, and waiters must be woken on failure too.
		 */
		nfs4_clear_ds_conn_bit(ds);
		if (err)
			return NULL;
	} else {
		nfs4_wait_ds_connect(ds);
		/* the connecting task may have failed and marked the device */
		if (filelayout_test_devid_unavailable(devid))
			return NULL;
	}
	return ds;
}
829 830 831 832 833 834 835 836 837

/*
 * Export the data server retransmit count and timeout as module
 * parameters (permissions 0644).
 */
module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The  number of times the NFSv4.1 client "
			"retries a request before it attempts further "
			" recovery  action.");
module_param(dataserver_timeo, uint, 0644);
MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
			"NFSv4.1  client  waits for a response from a "
			" data server before it retries an NFS request.");