devorangefs-req.c 27.8 KB
Newer Older
M
Mike Marshall 已提交
1 2 3 4 5 6 7 8 9 10
/*
 * (C) 2001 Clemson University and The University of Chicago
 *
 * Changes by Acxiom Corporation to add protocol version to kernel
 * communication, Copyright Acxiom Corporation, 2005.
 *
 * See COPYING in top-level directory.
 */

#include "protocol.h"
11 12 13
#include "orangefs-kernel.h"
#include "orangefs-dev-proto.h"
#include "orangefs-bufmap.h"
M
Mike Marshall 已提交
14 15 16 17 18 19 20 21 22 23 24

#include <linux/debugfs.h>
#include <linux/slab.h>

/* this file implements the /dev/pvfs2-req device node */

/*
 * Number of current opens of the request device. It is incremented in the
 * open handler only when the count is zero and decremented in the release
 * handler, both under devreq_mutex.
 */
static int open_access_count;

/*
 * Log a multi-line explanation that the request device may only be opened
 * once, together with the current value of open_access_count.
 */
#define DUMP_DEVICE_ERROR()                                                   \
do {                                                                          \
	gossip_err("*****************************************************\n");\
	gossip_err("ORANGEFS Device Error:  You cannot open the device file ");  \
	gossip_err("\n/dev/%s more than once.  Please make sure that\nthere " \
		   "are no ", ORANGEFS_REQDEVICE_NAME);                          \
	gossip_err("instances of a program using this device\ncurrently "     \
		   "running. (You must verify this!)\n");                     \
	gossip_err("For example, you can use the lsof program as follows:\n");\
	gossip_err("'lsof | grep %s' (run this as root)\n",                   \
		   ORANGEFS_REQDEVICE_NAME);                                     \
	gossip_err("  open_access_count = %d\n", open_access_count);          \
	gossip_err("*****************************************************\n");\
} while (0)

/*
 * Map an op tag onto a bucket index of the in-progress hash table.
 *
 * do_div() divides its first argument in place and returns the remainder,
 * so the result is tag modulo table_size (tag is a by-value copy, so the
 * caller's tag is untouched). table_size must be non-zero.
 */
static int hash_func(__u64 tag, int table_size)
{
	return do_div(tag, (unsigned int)table_size);
}

42
static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
M
Mike Marshall 已提交
43 44 45 46 47 48 49 50
{
	int index = hash_func(op->tag, hash_table_size);

	spin_lock(&htable_ops_in_progress_lock);
	list_add_tail(&op->list, &htable_ops_in_progress[index]);
	spin_unlock(&htable_ops_in_progress_lock);
}

51
/*
 * Look up the in-progress op carrying the given tag, unlink it from its
 * hash bucket and return it. Returns NULL when no op with that tag is in
 * the table. The search and unlink happen under htable_ops_in_progress_lock.
 */
static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
{
	struct orangefs_kernel_op_s *op, *next;
	int index;

	index = hash_func(tag, hash_table_size);

	spin_lock(&htable_ops_in_progress_lock);
	list_for_each_entry_safe(op,
				 next,
				 &htable_ops_in_progress[index],
				 list) {
		if (op->tag == tag) {
			list_del(&op->list);
			spin_unlock(&htable_ops_in_progress_lock);
			return op;
		}
	}

	spin_unlock(&htable_ops_in_progress_lock);
	return NULL;
}

74
static int orangefs_devreq_open(struct inode *inode, struct file *file)
M
Mike Marshall 已提交
75 76 77 78
{
	int ret = -EINVAL;

	if (!(file->f_flags & O_NONBLOCK)) {
79 80
		gossip_err("%s: device cannot be opened in blocking mode\n",
			   __func__);
M
Mike Marshall 已提交
81 82 83
		goto out;
	}
	ret = -EACCES;
84
	gossip_debug(GOSSIP_DEV_DEBUG, "client-core: opening device\n");
M
Mike Marshall 已提交
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
	mutex_lock(&devreq_mutex);

	if (open_access_count == 0) {
		ret = generic_file_open(inode, file);
		if (ret == 0)
			open_access_count++;
	} else {
		DUMP_DEVICE_ERROR();
	}
	mutex_unlock(&devreq_mutex);

out:

	gossip_debug(GOSSIP_DEV_DEBUG,
		     "pvfs2-client-core: open device complete (ret = %d)\n",
		     ret);
	return ret;
}

104
/* Function for read() callers into the device */
105
static ssize_t orangefs_devreq_read(struct file *file,
M
Mike Marshall 已提交
106 107 108
				 char __user *buf,
				 size_t count, loff_t *offset)
{
109 110 111 112
	struct orangefs_kernel_op_s *op, *temp;
	__s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION;
	static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
	struct orangefs_kernel_op_s *cur_op = NULL;
113
	unsigned long ret;
M
Mike Marshall 已提交
114

115
	/* We do not support blocking IO. */
M
Mike Marshall 已提交
116
	if (!(file->f_flags & O_NONBLOCK)) {
117 118
		gossip_err("%s: blocking read from client-core.\n",
			   __func__);
M
Mike Marshall 已提交
119
		return -EINVAL;
120 121 122 123 124 125 126 127 128 129 130 131
	}

	/*
	 * The client will do an ioctl to find MAX_ALIGNED_DEV_REQ_UPSIZE, then
	 * always read with that size buffer.
	 */
	if (count != MAX_ALIGNED_DEV_REQ_UPSIZE) {
		gossip_err("orangefs: client-core tried to read wrong size\n");
		return -EINVAL;
	}

	/* Get next op (if any) from top of list. */
132 133
	spin_lock(&orangefs_request_list_lock);
	list_for_each_entry_safe(op, temp, &orangefs_request_list, list) {
134 135 136 137 138
		__s32 fsid;
		/* This lock is held past the end of the loop when we break. */
		spin_lock(&op->lock);

		fsid = fsid_of_op(op);
139
		if (fsid != ORANGEFS_FS_ID_NULL) {
140 141 142 143
			int ret;
			/* Skip ops whose filesystem needs to be mounted. */
			ret = fs_mount_pending(fsid);
			if (ret == 1) {
M
Mike Marshall 已提交
144
				gossip_debug(GOSSIP_DEV_DEBUG,
145 146 147 148
				    "orangefs: skipping op tag %llu %s\n",
				    llu(op->tag), get_opname_string(op));
				spin_unlock(&op->lock);
				continue;
149 150 151 152
			/*
			 * Skip ops whose filesystem we don't know about unless
			 * it is being mounted.
			 */
153 154
			/* XXX: is there a better way to detect this? */
			} else if (ret == -1 &&
155 156 157 158
				   !(op->upcall.type ==
					ORANGEFS_VFS_OP_FS_MOUNT ||
				     op->upcall.type ==
					ORANGEFS_VFS_OP_GETATTR)) {
159 160 161 162 163 164 165
				gossip_debug(GOSSIP_DEV_DEBUG,
				    "orangefs: skipping op tag %llu %s\n",
				    llu(op->tag), get_opname_string(op));
				gossip_err(
				    "orangefs: ERROR: fs_mount_pending %d\n",
				    fsid);
				spin_unlock(&op->lock);
M
Mike Marshall 已提交
166 167 168
				continue;
			}
		}
169 170 171 172 173 174 175 176 177 178 179 180 181 182
		/*
		 * Either this op does not pertain to a filesystem, is mounting
		 * a filesystem, or pertains to a mounted filesystem. Let it
		 * through.
		 */
		cur_op = op;
		break;
	}

	/*
	 * At this point we either have a valid op and can continue or have not
	 * found an op and must ask the client to try again later.
	 */
	if (!cur_op) {
183
		spin_unlock(&orangefs_request_list_lock);
184
		return -EAGAIN;
M
Mike Marshall 已提交
185 186
	}

187 188
	gossip_debug(GOSSIP_DEV_DEBUG, "orangefs: reading op tag %llu %s\n",
		     llu(cur_op->tag), get_opname_string(cur_op));
M
Mike Marshall 已提交
189

190 191 192 193 194 195 196
	/*
	 * Such an op should never be on the list in the first place. If so, we
	 * will abort.
	 */
	if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
		gossip_err("orangefs: ERROR: Current op already queued.\n");
		list_del(&cur_op->list);
M
Mike Marshall 已提交
197
		spin_unlock(&cur_op->lock);
198
		spin_unlock(&orangefs_request_list_lock);
199
		return -EAGAIN;
M
Mike Marshall 已提交
200
	}
201 202 203 204 205 206 207 208

	/*
	 * Set the operation to be in progress and move it between lists since
	 * it has been sent to the client.
	 */
	set_op_state_inprogress(cur_op);

	list_del(&cur_op->list);
209 210
	spin_unlock(&orangefs_request_list_lock);
	orangefs_devreq_add_op(cur_op);
211 212 213 214 215 216 217 218 219 220 221 222 223
	spin_unlock(&cur_op->lock);

	/* Push the upcall out. */
	ret = copy_to_user(buf, &proto_ver, sizeof(__s32));
	if (ret != 0)
		goto error;
	ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32));
	if (ret != 0)
		goto error;
	ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64));
	if (ret != 0)
		goto error;
	ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall,
224
			   sizeof(struct orangefs_upcall_s));
225 226 227 228 229 230 231 232 233 234 235 236
	if (ret != 0)
		goto error;

	/* The client only asks to read one size buffer. */
	return MAX_ALIGNED_DEV_REQ_UPSIZE;
error:
	/*
	 * We were unable to copy the op data to the client. Put the op back in
	 * list. If client has crashed, the op will be purged later when the
	 * device is released.
	 */
	gossip_err("orangefs: Failed to copy data to user space\n");
237
	spin_lock(&orangefs_request_list_lock);
238 239
	spin_lock(&cur_op->lock);
	set_op_state_waiting(cur_op);
240 241
	orangefs_devreq_remove_op(cur_op->tag);
	list_add(&cur_op->list, &orangefs_request_list);
242
	spin_unlock(&cur_op->lock);
243
	spin_unlock(&orangefs_request_list_lock);
244
	return -EFAULT;
M
Mike Marshall 已提交
245 246
}

247 248 249 250 251
/*
 * Function for writev() callers into the device. Readdir related
 * operations have an extra iovec containing info about objects
 * contained in directories.
 */
252
static ssize_t orangefs_devreq_writev(struct file *file,
M
Mike Marshall 已提交
253 254 255 256
				   const struct iovec *iov,
				   size_t count,
				   loff_t *offset)
{
257
	struct orangefs_kernel_op_s *op = NULL;
M
Mike Marshall 已提交
258 259 260
	void *buffer = NULL;
	void *ptr = NULL;
	unsigned long i = 0;
261 262 263 264 265 266 267 268 269
	int num_remaining = MAX_ALIGNED_DEV_REQ_DOWNSIZE;
	int ret = 0;
	/* num elements in iovec without trailer */
	int notrailer_count = 4;
	/*
	 * If there's a trailer, its iov index will be equal to
	 * notrailer_count.
	 */
	int trailer_index = notrailer_count;
M
Mike Marshall 已提交
270
	int payload_size = 0;
271
	int returned_downcall_size = 0;
M
Mike Marshall 已提交
272 273 274 275 276
	__s32 magic = 0;
	__s32 proto_ver = 0;
	__u64 tag = 0;
	ssize_t total_returned_size = 0;

277 278 279 280 281
	/*
	 * There will always be at least notrailer_count iovecs, and
	 * when there's a trailer, one more than notrailer_count. Check
	 * count's sanity.
	 */
M
Mike Marshall 已提交
282
	if (count != notrailer_count && count != (notrailer_count + 1)) {
283 284
		gossip_err("%s: count:%zu: notrailer_count :%d:\n",
			__func__,
M
Mike Marshall 已提交
285 286 287 288
			count,
			notrailer_count);
		return -EPROTO;
	}
289 290 291


	/* Copy the non-trailer iovec data into a device request buffer. */
M
Mike Marshall 已提交
292
	buffer = dev_req_alloc();
293 294
	if (!buffer) {
		gossip_err("%s: dev_req_alloc failed.\n", __func__);
M
Mike Marshall 已提交
295
		return -ENOMEM;
296
	}
M
Mike Marshall 已提交
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
	ptr = buffer;
	for (i = 0; i < notrailer_count; i++) {
		if (iov[i].iov_len > num_remaining) {
			gossip_err
			    ("writev error: Freeing buffer and returning\n");
			dev_req_release(buffer);
			return -EMSGSIZE;
		}
		ret = copy_from_user(ptr, iov[i].iov_base, iov[i].iov_len);
		if (ret) {
			gossip_err("Failed to copy data from user space\n");
			dev_req_release(buffer);
			return -EIO;
		}
		num_remaining -= iov[i].iov_len;
		ptr += iov[i].iov_len;
		payload_size += iov[i].iov_len;
	}
	total_returned_size = payload_size;

	/* these elements are currently 8 byte aligned (8 bytes for (version +
	 * magic) 8 bytes for tag).  If you add another element, either
	 * make it 8 bytes big, or use get_unaligned when asigning.
	 */
	ptr = buffer;
322
	proto_ver = *((__s32 *) ptr); /* unused */
M
Mike Marshall 已提交
323 324 325 326 327 328 329 330
	ptr += sizeof(__s32);

	magic = *((__s32 *) ptr);
	ptr += sizeof(__s32);

	tag = *((__u64 *) ptr);
	ptr += sizeof(__u64);

331
	if (magic != ORANGEFS_DEVREQ_MAGIC) {
M
Mike Marshall 已提交
332 333 334 335 336
		gossip_err("Error: Device magic number does not match.\n");
		dev_req_release(buffer);
		return -EPROTO;
	}

337
	op = orangefs_devreq_remove_op(tag);
M
Mike Marshall 已提交
338 339 340
	if (op) {
		/* Increase ref count! */
		get_op(op);
341 342 343 344 345 346 347 348

		/* calculate the size of the returned downcall. */
		returned_downcall_size =
			payload_size - (2 * sizeof(__s32) + sizeof(__u64));

		/* copy the passed in downcall into the op */
		if (returned_downcall_size ==
			sizeof(struct orangefs_downcall_s)) {
M
Mike Marshall 已提交
349 350
			memcpy(&op->downcall,
			       ptr,
351
			       sizeof(struct orangefs_downcall_s));
352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
		} else {
			gossip_err("%s: returned downcall size:%d: \n",
				   __func__,
				   returned_downcall_size);
			dev_req_release(buffer);
			put_op(op);
			return -EMSGSIZE;
		}

		/* Don't tolerate an unexpected trailer iovec. */
		if ((op->downcall.trailer_size == 0) &&
		    (count != notrailer_count)) {
			gossip_err("%s: unexpected trailer iovec.\n",
				   __func__);
			dev_req_release(buffer);
			put_op(op);
			return -EPROTO;
		}

		/* Don't consider the trailer if there's a bad status. */
		if (op->downcall.status != 0)
			goto no_trailer;

		/* get the trailer if there is one. */
		if (op->downcall.trailer_size == 0)
			goto no_trailer;

		gossip_debug(GOSSIP_DEV_DEBUG,
			     "%s: op->downcall.trailer_size %lld\n",
			     __func__,
			     op->downcall.trailer_size);
M
Mike Marshall 已提交
383

384 385 386
		/*
		 * Bail if we think think there should be a trailer, but
		 * there's no iovec for it.
M
Mike Marshall 已提交
387
		 */
388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
		if (count != (notrailer_count + 1)) {
			gossip_err("%s: trailer_size:%lld: count:%zu:\n",
				   __func__,
				   op->downcall.trailer_size,
				   count);
			dev_req_release(buffer);
			put_op(op);
			return -EPROTO;
		}

		/* Verify that trailer_size is accurate. */
		if (op->downcall.trailer_size != iov[trailer_index].iov_len) {
			gossip_err("%s: trailer_size:%lld: != iov_len:%zd:\n",
				   __func__,
				   op->downcall.trailer_size,
				   iov[trailer_index].iov_len);
			dev_req_release(buffer);
			put_op(op);
			return -EMSGSIZE;
		}

		total_returned_size += iov[trailer_index].iov_len;

		/*
		 * Allocate a buffer, copy the trailer bytes into it and
		 * attach it to the downcall.
		 */
		op->downcall.trailer_buf = vmalloc(iov[trailer_index].iov_len);
		if (op->downcall.trailer_buf != NULL) {
			gossip_debug(GOSSIP_DEV_DEBUG, "vmalloc: %p\n",
				     op->downcall.trailer_buf);
			ret = copy_from_user(op->downcall.trailer_buf,
					     iov[trailer_index].iov_base,
					     iov[trailer_index].iov_len);
			if (ret) {
				gossip_err("%s: Failed to copy trailer.\n",
					   __func__);
M
Mike Marshall 已提交
425
				dev_req_release(buffer);
426 427
				gossip_debug(GOSSIP_DEV_DEBUG,
					     "vfree: %p\n",
M
Mike Marshall 已提交
428
					     op->downcall.trailer_buf);
429 430 431 432
				vfree(op->downcall.trailer_buf);
				op->downcall.trailer_buf = NULL;
				put_op(op);
				return -EIO;
M
Mike Marshall 已提交
433
			}
434 435 436 437 438
		} else {
			gossip_err("writev: could not vmalloc for trailer!\n");
			dev_req_release(buffer);
			put_op(op);
			return -ENOMEM;
M
Mike Marshall 已提交
439 440
		}

441 442 443
no_trailer:

		/* if this operation is an I/O operation we need to wait
M
Mike Marshall 已提交
444 445 446 447 448 449 450 451 452
		 * for all data to be copied before we can return to avoid
		 * buffer corruption and races that can pull the buffers
		 * out from under us.
		 *
		 * Essentially we're synchronizing with other parts of the
		 * vfs implicitly by not allowing the user space
		 * application reading/writing this device to return until
		 * the buffers are done being used.
		 */
453
		if (op->upcall.type == ORANGEFS_VFS_OP_FILE_IO) {
M
Mike Marshall 已提交
454
			int timed_out = 0;
455
			DEFINE_WAIT(wait_entry);
M
Mike Marshall 已提交
456

457 458
			/*
			 * tell the vfs op waiting on a waitqueue
M
Mike Marshall 已提交
459 460 461 462 463 464 465 466 467 468
			 * that this op is done
			 */
			spin_lock(&op->lock);
			set_op_state_serviced(op);
			spin_unlock(&op->lock);

			wake_up_interruptible(&op->waitq);

			while (1) {
				spin_lock(&op->lock);
469 470 471 472
				prepare_to_wait_exclusive(
					&op->io_completion_waitq,
					&wait_entry,
					TASK_INTERRUPTIBLE);
M
Mike Marshall 已提交
473 474 475 476 477 478 479 480 481 482 483
				if (op->io_completed) {
					spin_unlock(&op->lock);
					break;
				}
				spin_unlock(&op->lock);

				if (!signal_pending(current)) {
					int timeout =
					    MSECS_TO_JIFFIES(1000 *
							     op_timeout_secs);
					if (!schedule_timeout(timeout)) {
484 485 486
						gossip_debug(GOSSIP_DEV_DEBUG,
							"%s: timed out.\n",
							__func__);
M
Mike Marshall 已提交
487 488 489 490 491 492
						timed_out = 1;
						break;
					}
					continue;
				}

493 494 495
				gossip_debug(GOSSIP_DEV_DEBUG,
					"%s: signal on I/O wait, aborting\n",
					__func__);
M
Mike Marshall 已提交
496 497 498
				break;
			}

499 500 501
			spin_lock(&op->lock);
			finish_wait(&op->io_completion_waitq, &wait_entry);
			spin_unlock(&op->lock);
M
Mike Marshall 已提交
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520

			/* NOTE: for I/O operations we handle releasing the op
			 * object except in the case of timeout.  the reason we
			 * can't free the op in timeout cases is that the op
			 * service logic in the vfs retries operations using
			 * the same op ptr, thus it can't be freed.
			 */
			if (!timed_out)
				op_release(op);
		} else {

			/*
			 * tell the vfs op waiting on a waitqueue that
			 * this op is done
			 */
			spin_lock(&op->lock);
			set_op_state_serviced(op);
			spin_unlock(&op->lock);
			/*
521 522 523
			 * for every other operation (i.e. non-I/O), we need to
			 * wake up the callers for downcall completion
			 * notification
M
Mike Marshall 已提交
524 525 526 527 528 529 530 531 532
			 */
			wake_up_interruptible(&op->waitq);
		}
	} else {
		/* ignore downcalls that we're not interested in */
		gossip_debug(GOSSIP_DEV_DEBUG,
			     "WARNING: No one's waiting for tag %llu\n",
			     llu(tag));
	}
533
	/* put_op? */
M
Mike Marshall 已提交
534 535 536 537 538
	dev_req_release(buffer);

	return total_returned_size;
}

539
static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
M
Mike Marshall 已提交
540 541
				      struct iov_iter *iter)
{
542
	return orangefs_devreq_writev(iocb->ki_filp,
M
Mike Marshall 已提交
543 544 545 546 547 548 549 550 551
				   iter->iov,
				   iter->nr_segs,
				   &iocb->ki_pos);
}

/* Returns whether any FS are still pending remounted */
static int mark_all_pending_mounts(void)
{
	int unmounted = 1;
552
	struct orangefs_sb_info_s *orangefs_sb = NULL;
M
Mike Marshall 已提交
553

554 555
	spin_lock(&orangefs_superblocks_lock);
	list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
M
Mike Marshall 已提交
556
		/* All of these file system require a remount */
557
		orangefs_sb->mount_pending = 1;
M
Mike Marshall 已提交
558 559
		unmounted = 0;
	}
560
	spin_unlock(&orangefs_superblocks_lock);
M
Mike Marshall 已提交
561 562 563 564 565 566 567 568 569 570 571 572
	return unmounted;
}

/*
 * Determine if a given file system needs to be remounted or not
 *  Returns -1 on error (no superblock with this fsid is on the list)
 *           0 if already mounted
 *           1 if needs remount
 *
 * The list is searched under orangefs_superblocks_lock.
 */
int fs_mount_pending(__s32 fsid)
{
	int mount_pending = -1;
	struct orangefs_sb_info_s *orangefs_sb = NULL;

	spin_lock(&orangefs_superblocks_lock);
	list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
		if (orangefs_sb->fs_id == fsid) {
			mount_pending = orangefs_sb->mount_pending;
			break;
		}
	}
	spin_unlock(&orangefs_superblocks_lock);
	return mount_pending;
}

/*
 * NOTE: gets called when the last reference to this device is dropped.
 * Using the open_access_count variable, we enforce a reference count
 * on this file so that it can be opened by only one process at a time.
 * the devreq_mutex is used to make sure all i/o has completed
591
 * before we call orangefs_bufmap_finalize, and similar such tricky
M
Mike Marshall 已提交
592 593
 * situations
 */
594
static int orangefs_devreq_release(struct inode *inode, struct file *file)
M
Mike Marshall 已提交
595 596 597 598 599 600 601 602
{
	int unmounted = 0;

	gossip_debug(GOSSIP_DEV_DEBUG,
		     "%s:pvfs2-client-core: exiting, closing device\n",
		     __func__);

	mutex_lock(&devreq_mutex);
603 604
	if (get_bufmap_init())
		orangefs_bufmap_finalize();
M
Mike Marshall 已提交
605 606 607 608

	open_access_count--;

	unmounted = mark_all_pending_mounts();
609
	gossip_debug(GOSSIP_DEV_DEBUG, "ORANGEFS Device Close: Filesystem(s) %s\n",
M
Mike Marshall 已提交
610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644
		     (unmounted ? "UNMOUNTED" : "MOUNTED"));
	mutex_unlock(&devreq_mutex);

	/*
	 * Walk through the list of ops in the request list, mark them
	 * as purged and wake them up.
	 */
	purge_waiting_ops();
	/*
	 * Walk through the hash table of in progress operations; mark
	 * them as purged and wake them up
	 */
	purge_inprogress_ops();
	gossip_debug(GOSSIP_DEV_DEBUG,
		     "pvfs2-client-core: device close complete\n");
	return 0;
}

/*
 * Report whether client-core currently holds the request device open,
 * judged by the access count kept on the device.
 *
 * Returns 0 when open_access_count is exactly 1, -EIO otherwise. The count
 * is sampled under devreq_mutex.
 */
int is_daemon_in_service(void)
{
	int status = -EIO;

	mutex_lock(&devreq_mutex);
	if (open_access_count == 1)
		status = 0;
	mutex_unlock(&devreq_mutex);

	return status;
}

static inline long check_ioctl_command(unsigned int command)
{
	/* Check for valid ioctl codes */
645
	if (_IOC_TYPE(command) != ORANGEFS_DEV_MAGIC) {
M
Mike Marshall 已提交
646 647 648
		gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n",
			command,
			_IOC_TYPE(command),
649
			ORANGEFS_DEV_MAGIC);
M
Mike Marshall 已提交
650 651 652
		return -EINVAL;
	}
	/* and valid ioctl commands */
653
	if (_IOC_NR(command) >= ORANGEFS_DEV_MAXNR || _IOC_NR(command) <= 0) {
M
Mike Marshall 已提交
654
		gossip_err("Invalid ioctl command number [%d >= %d]\n",
655
			   _IOC_NR(command), ORANGEFS_DEV_MAXNR);
M
Mike Marshall 已提交
656 657 658 659 660 661 662
		return -ENOIOCTLCMD;
	}
	return 0;
}

static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
{
663
	static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
M
Mike Marshall 已提交
664 665
	static __s32 max_up_size = MAX_ALIGNED_DEV_REQ_UPSIZE;
	static __s32 max_down_size = MAX_ALIGNED_DEV_REQ_DOWNSIZE;
666
	struct ORANGEFS_dev_map_desc user_desc;
M
Mike Marshall 已提交
667 668 669 670 671
	int ret = 0;
	struct dev_mask_info_s mask_info = { 0 };
	struct dev_mask2_info_s mask2_info = { 0, 0 };
	int upstream_kmod = 1;
	struct list_head *tmp = NULL;
672
	struct orangefs_sb_info_s *orangefs_sb = NULL;
M
Mike Marshall 已提交
673 674 675 676

	/* mtmoore: add locking here */

	switch (command) {
677
	case ORANGEFS_DEV_GET_MAGIC:
M
Mike Marshall 已提交
678 679 680
		return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ?
			-EIO :
			0);
681
	case ORANGEFS_DEV_GET_MAX_UPSIZE:
M
Mike Marshall 已提交
682 683 684 685
		return ((put_user(max_up_size,
				  (__s32 __user *) arg) == -EFAULT) ?
					-EIO :
					0);
686
	case ORANGEFS_DEV_GET_MAX_DOWNSIZE:
M
Mike Marshall 已提交
687 688 689 690
		return ((put_user(max_down_size,
				  (__s32 __user *) arg) == -EFAULT) ?
					-EIO :
					0);
691
	case ORANGEFS_DEV_MAP:
M
Mike Marshall 已提交
692
		ret = copy_from_user(&user_desc,
693
				     (struct ORANGEFS_dev_map_desc __user *)
M
Mike Marshall 已提交
694
				     arg,
695
				     sizeof(struct ORANGEFS_dev_map_desc));
696 697 698 699 700 701 702
		if (get_bufmap_init()) {
			return -EINVAL;
		} else {
			return ret ?
			       -EIO :
			       orangefs_bufmap_initialize(&user_desc);
		}
703
	case ORANGEFS_DEV_REMOUNT_ALL:
M
Mike Marshall 已提交
704
		gossip_debug(GOSSIP_DEV_DEBUG,
705 706
			     "%s: got ORANGEFS_DEV_REMOUNT_ALL\n",
			     __func__);
M
Mike Marshall 已提交
707 708

		/*
709
		 * remount all mounted orangefs volumes to regain the lost
M
Mike Marshall 已提交
710 711 712 713 714 715 716 717 718 719 720
		 * dynamic mount tables (if any) -- NOTE: this is done
		 * without keeping the superblock list locked due to the
		 * upcall/downcall waiting.  also, the request semaphore is
		 * used to ensure that no operations will be serviced until
		 * all of the remounts are serviced (to avoid ops between
		 * mounts to fail)
		 */
		ret = mutex_lock_interruptible(&request_mutex);
		if (ret < 0)
			return ret;
		gossip_debug(GOSSIP_DEV_DEBUG,
721 722
			     "%s: priority remount in progress\n",
			     __func__);
723 724
		list_for_each(tmp, &orangefs_superblocks) {
			orangefs_sb =
725 726 727
				list_entry(tmp,
					   struct orangefs_sb_info_s,
					   list);
728
			if (orangefs_sb && (orangefs_sb->sb)) {
M
Mike Marshall 已提交
729
				gossip_debug(GOSSIP_DEV_DEBUG,
730 731
					     "%s: Remounting SB %p\n",
					     __func__,
732
					     orangefs_sb);
M
Mike Marshall 已提交
733

734
				ret = orangefs_remount(orangefs_sb->sb);
M
Mike Marshall 已提交
735 736 737
				if (ret) {
					gossip_debug(GOSSIP_DEV_DEBUG,
						     "SB %p remount failed\n",
738
						     orangefs_sb);
739
					break;
M
Mike Marshall 已提交
740 741 742 743
				}
			}
		}
		gossip_debug(GOSSIP_DEV_DEBUG,
744 745
			     "%s: priority remount complete\n",
			     __func__);
M
Mike Marshall 已提交
746 747 748
		mutex_unlock(&request_mutex);
		return ret;

749
	case ORANGEFS_DEV_UPSTREAM:
M
Mike Marshall 已提交
750 751 752 753 754 755 756 757 758
		ret = copy_to_user((void __user *)arg,
				    &upstream_kmod,
				    sizeof(upstream_kmod));

		if (ret != 0)
			return -EIO;
		else
			return ret;

759
	case ORANGEFS_DEV_CLIENT_MASK:
M
Mike Marshall 已提交
760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777
		ret = copy_from_user(&mask2_info,
				     (void __user *)arg,
				     sizeof(struct dev_mask2_info_s));

		if (ret != 0)
			return -EIO;

		client_debug_mask.mask1 = mask2_info.mask1_value;
		client_debug_mask.mask2 = mask2_info.mask2_value;

		pr_info("%s: client debug mask has been been received "
			":%llx: :%llx:\n",
			__func__,
			(unsigned long long)client_debug_mask.mask1,
			(unsigned long long)client_debug_mask.mask2);

		return ret;

778
	case ORANGEFS_DEV_CLIENT_STRING:
M
Mike Marshall 已提交
779 780
		ret = copy_from_user(&client_debug_array_string,
				     (void __user *)arg,
781
				     ORANGEFS_MAX_DEBUG_STRING_LEN);
M
Mike Marshall 已提交
782
		if (ret != 0) {
783
			pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
M
Mike Marshall 已提交
784 785 786 787
				__func__);
			return -EIO;
		}

788
		pr_info("%s: client debug array string has been received.\n",
M
Mike Marshall 已提交
789 790 791 792 793 794 795 796 797
			__func__);

		if (!help_string_initialized) {

			/* Free the "we don't know yet" default string... */
			kfree(debug_help_string);

			/* build a proper debug help string */
			if (orangefs_prepare_debugfs_help_string(0)) {
798
				gossip_err("%s: no debug help string \n",
M
Mike Marshall 已提交
799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826
					   __func__);
				return -EIO;
			}

			/* Replace the boilerplate boot-time debug-help file. */
			debugfs_remove(help_file_dentry);

			help_file_dentry =
				debugfs_create_file(
					ORANGEFS_KMOD_DEBUG_HELP_FILE,
					0444,
					debug_dir,
					debug_help_string,
					&debug_help_fops);

			if (!help_file_dentry) {
				gossip_err("%s: debugfs_create_file failed for"
					   " :%s:!\n",
					   __func__,
					   ORANGEFS_KMOD_DEBUG_HELP_FILE);
				return -EIO;
			}
		}

		debug_mask_to_string(&client_debug_mask, 1);

		debugfs_remove(client_debug_dentry);

827
		orangefs_client_debug_init();
M
Mike Marshall 已提交
828 829 830 831 832

		help_string_initialized++;

		return ret;

833
	case ORANGEFS_DEV_DEBUG:
M
Mike Marshall 已提交
834 835 836 837 838 839 840 841 842 843 844 845 846 847
		ret = copy_from_user(&mask_info,
				     (void __user *)arg,
				     sizeof(mask_info));

		if (ret != 0)
			return -EIO;

		if (mask_info.mask_type == KERNEL_MASK) {
			if ((mask_info.mask_value == 0)
			    && (kernel_mask_set_mod_init)) {
				/*
				 * the kernel debug mask was set when the
				 * kernel module was loaded; don't override
				 * it if the client-core was started without
848
				 * a value for ORANGEFS_KMODMASK.
M
Mike Marshall 已提交
849 850 851 852 853 854
				 */
				return 0;
			}
			debug_mask_to_string(&mask_info.mask_value,
					     mask_info.mask_type);
			gossip_debug_mask = mask_info.mask_value;
855
			pr_info("%s: kernel debug mask has been modified to "
M
Mike Marshall 已提交
856
				":%s: :%llx:\n",
857
				__func__,
M
Mike Marshall 已提交
858 859 860 861 862
				kernel_debug_string,
				(unsigned long long)gossip_debug_mask);
		} else if (mask_info.mask_type == CLIENT_MASK) {
			debug_mask_to_string(&mask_info.mask_value,
					     mask_info.mask_type);
863
			pr_info("%s: client debug mask has been modified to"
M
Mike Marshall 已提交
864
				":%s: :%llx:\n",
865
				__func__,
M
Mike Marshall 已提交
866 867 868 869 870 871 872 873 874 875 876 877 878 879 880
				client_debug_string,
				llu(mask_info.mask_value));
		} else {
			gossip_lerr("Invalid mask type....\n");
			return -EINVAL;
		}

		return ret;

	default:
		return -ENOIOCTLCMD;
	}
	return -ENOIOCTLCMD;
}

881
/*
 * unlocked_ioctl handler: validates the command word with
 * check_ioctl_command() and, if it passes, hands it to
 * dispatch_ioctl_command(). Returns whichever result applies.
 */
static long orangefs_devreq_ioctl(struct file *file,
			       unsigned int command, unsigned long arg)
{
	long ret;

	/* Check for properly constructed commands */
	ret = check_ioctl_command(command);
	if (ret < 0)
		return (int)ret;

	return (int)dispatch_ioctl_command(command, arg);
}

#ifdef CONFIG_COMPAT		/* CONFIG_COMPAT is in .config */

896 897
/*  Compat structure for the ORANGEFS_DEV_MAP ioctl */
struct ORANGEFS_dev_map_desc32 {
M
Mike Marshall 已提交
898 899 900 901 902 903 904 905
	compat_uptr_t ptr;
	__s32 total_size;
	__s32 size;
	__s32 count;
};

static unsigned long translate_dev_map26(unsigned long args, long *error)
{
906
	struct ORANGEFS_dev_map_desc32 __user *p32 = (void __user *)args;
M
Mike Marshall 已提交
907 908 909 910
	/*
	 * Depending on the architecture, allocate some space on the
	 * user-call-stack based on our expected layout.
	 */
911
	struct ORANGEFS_dev_map_desc __user *p =
M
Mike Marshall 已提交
912
	    compat_alloc_user_space(sizeof(*p));
913
	compat_uptr_t addr;
M
Mike Marshall 已提交
914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938

	*error = 0;
	/* get the ptr from the 32 bit user-space */
	if (get_user(addr, &p32->ptr))
		goto err;
	/* try to put that into a 64-bit layout */
	if (put_user(compat_ptr(addr), &p->ptr))
		goto err;
	/* copy the remaining fields */
	if (copy_in_user(&p->total_size, &p32->total_size, sizeof(__s32)))
		goto err;
	if (copy_in_user(&p->size, &p32->size, sizeof(__s32)))
		goto err;
	if (copy_in_user(&p->count, &p32->count, sizeof(__s32)))
		goto err;
	return (unsigned long)p;
err:
	*error = -EFAULT;
	return 0;
}

/*
 * 32 bit user-space apps' ioctl handlers when kernel modules
 * is compiled as a 64 bit one
 */
939
static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd,
M
Mike Marshall 已提交
940 941 942 943 944 945 946 947 948
				      unsigned long args)
{
	long ret;
	unsigned long arg = args;

	/* Check for properly constructed commands */
	ret = check_ioctl_command(cmd);
	if (ret < 0)
		return ret;
949
	if (cmd == ORANGEFS_DEV_MAP) {
M
Mike Marshall 已提交
950 951 952 953 954 955 956 957 958 959 960 961 962 963
		/*
		 * convert the arguments to what we expect internally
		 * in kernel space
		 */
		arg = translate_dev_map26(args, &ret);
		if (ret < 0) {
			gossip_err("Could not translate dev map\n");
			return ret;
		}
	}
	/* no other ioctl requires translation */
	return dispatch_ioctl_command(cmd, arg);
}

M
Mike Marshall 已提交
964 965 966 967 968 969 970 971
#endif /* CONFIG_COMPAT is in .config */

/*
 * The following two ioctl32 functions had been refactored into the above
 * CONFIG_COMPAT ifdef, but that was an over simplification that was
 * not noticed until we tried to compile on power pc...
 */
#if (defined(CONFIG_COMPAT) && !defined(HAVE_REGISTER_IOCTL32_CONVERSION)) || !defined(CONFIG_COMPAT)
/* Nothing to register in this configuration; always returns 0. */
static int orangefs_ioctl32_init(void)
{
	return 0;
}

/* Counterpart of orangefs_ioctl32_init(); there is nothing to undo. */
static void orangefs_ioctl32_cleanup(void)
{
}
#endif
M
Mike Marshall 已提交
982 983

/*
 * the assigned character device major number, as returned by
 * register_chrdev() in orangefs_dev_init()
 */
static int orangefs_dev_major;
M
Mike Marshall 已提交
985 986

/*
987
 * Initialize orangefs device specific state:
M
Mike Marshall 已提交
988 989
 * Must be called at module load time only
 */
990
int orangefs_dev_init(void)
M
Mike Marshall 已提交
991 992 993 994
{
	int ret;

	/* register the ioctl32 sub-system */
995
	ret = orangefs_ioctl32_init();
M
Mike Marshall 已提交
996 997 998
	if (ret < 0)
		return ret;

999 1000 1001 1002 1003
	/* register orangefs-req device  */
	orangefs_dev_major = register_chrdev(0,
					  ORANGEFS_REQDEVICE_NAME,
					  &orangefs_devreq_file_operations);
	if (orangefs_dev_major < 0) {
M
Mike Marshall 已提交
1004 1005
		gossip_debug(GOSSIP_DEV_DEBUG,
			     "Failed to register /dev/%s (error %d)\n",
1006 1007 1008
			     ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
		orangefs_ioctl32_cleanup();
		return orangefs_dev_major;
M
Mike Marshall 已提交
1009 1010 1011 1012
	}

	gossip_debug(GOSSIP_DEV_DEBUG,
		     "*** /dev/%s character device registered ***\n",
1013
		     ORANGEFS_REQDEVICE_NAME);
M
Mike Marshall 已提交
1014
	gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n",
1015
		     ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
M
Mike Marshall 已提交
1016 1017 1018
	return 0;
}

1019
void orangefs_dev_cleanup(void)
M
Mike Marshall 已提交
1020
{
1021
	unregister_chrdev(orangefs_dev_major, ORANGEFS_REQDEVICE_NAME);
M
Mike Marshall 已提交
1022 1023
	gossip_debug(GOSSIP_DEV_DEBUG,
		     "*** /dev/%s character device unregistered ***\n",
1024
		     ORANGEFS_REQDEVICE_NAME);
M
Mike Marshall 已提交
1025
	/* unregister the ioctl32 sub-system */
1026
	orangefs_ioctl32_cleanup();
M
Mike Marshall 已提交
1027 1028
}

1029
static unsigned int orangefs_devreq_poll(struct file *file,
M
Mike Marshall 已提交
1030 1031 1032 1033 1034
				      struct poll_table_struct *poll_table)
{
	int poll_revent_mask = 0;

	if (open_access_count == 1) {
1035
		poll_wait(file, &orangefs_request_list_waitq, poll_table);
M
Mike Marshall 已提交
1036

1037 1038
		spin_lock(&orangefs_request_list_lock);
		if (!list_empty(&orangefs_request_list))
M
Mike Marshall 已提交
1039
			poll_revent_mask |= POLL_IN;
1040
		spin_unlock(&orangefs_request_list_lock);
M
Mike Marshall 已提交
1041 1042 1043 1044
	}
	return poll_revent_mask;
}

1045
const struct file_operations orangefs_devreq_file_operations = {
M
Mike Marshall 已提交
1046
	.owner = THIS_MODULE,
1047 1048 1049 1050 1051
	.read = orangefs_devreq_read,
	.write_iter = orangefs_devreq_write_iter,
	.open = orangefs_devreq_open,
	.release = orangefs_devreq_release,
	.unlocked_ioctl = orangefs_devreq_ioctl,
M
Mike Marshall 已提交
1052 1053

#ifdef CONFIG_COMPAT		/* CONFIG_COMPAT is in .config */
1054
	.compat_ioctl = orangefs_devreq_compat_ioctl,
M
Mike Marshall 已提交
1055
#endif
1056
	.poll = orangefs_devreq_poll
M
Mike Marshall 已提交
1057
};