devorangefs-req.c 27.7 KB
Newer Older
M
Mike Marshall 已提交
1 2 3 4 5 6 7 8 9 10
/*
 * (C) 2001 Clemson University and The University of Chicago
 *
 * Changes by Acxiom Corporation to add protocol version to kernel
 * communication, Copyright Acxiom Corporation, 2005.
 *
 * See COPYING in top-level directory.
 */

#include "protocol.h"
11 12 13
#include "orangefs-kernel.h"
#include "orangefs-dev-proto.h"
#include "orangefs-bufmap.h"
M
Mike Marshall 已提交
14 15 16 17 18 19 20 21 22 23 24

#include <linux/debugfs.h>
#include <linux/slab.h>

/* this file implements the /dev/pvfs2-req device node */

/*
 * Number of current opens of the request device. The open handler only
 * admits an opener while this is 0 and bumps it on success; the release
 * handler drops it again. Both do so while holding devreq_mutex.
 */
static int open_access_count;

/*
 * Emit a multi-line error banner telling the administrator that the request
 * device is already open, how to look for the process holding it, and what
 * open_access_count currently is.
 */
#define DUMP_DEVICE_ERROR()						\
do {									\
	gossip_err(							\
	    "*****************************************************\n");	\
	gossip_err(							\
	    "ORANGEFS Device Error:  You cannot open the device file ");	\
	gossip_err(							\
	    "\n/dev/%s more than once.  Please make sure that\nthere "	\
	    "are no ",							\
	    ORANGEFS_REQDEVICE_NAME);					\
	gossip_err(							\
	    "instances of a program using this device\ncurrently "	\
	    "running. (You must verify this!)\n");			\
	gossip_err(							\
	    "For example, you can use the lsof program as follows:\n");	\
	gossip_err(							\
	    "'lsof | grep %s' (run this as root)\n",			\
	    ORANGEFS_REQDEVICE_NAME);					\
	gossip_err(							\
	    "  open_access_count = %d\n",				\
	    open_access_count);						\
	gossip_err(							\
	    "*****************************************************\n");	\
} while (0)

/*
 * Pick the hash bucket for an op tag.
 *
 * do_div() divides its first argument in place and evaluates to the
 * remainder, so the value handed back is tag modulo table_size. Only the
 * local copy of tag is modified.
 */
static int hash_func(__u64 tag, int table_size)
{
	unsigned int buckets = (unsigned int)table_size;
	int bucket = do_div(tag, buckets);

	return bucket;
}

42
static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op)
M
Mike Marshall 已提交
43 44 45 46 47 48 49 50
{
	int index = hash_func(op->tag, hash_table_size);

	spin_lock(&htable_ops_in_progress_lock);
	list_add_tail(&op->list, &htable_ops_in_progress[index]);
	spin_unlock(&htable_ops_in_progress_lock);
}

51
/*
 * Look up the op carrying the given tag in the in-progress hash table and
 * unlink it from its bucket.
 *
 * Returns the unlinked op, or NULL when no op in the bucket has that tag.
 * The search and the unlink both happen under htable_ops_in_progress_lock.
 */
static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
{
	struct orangefs_kernel_op_s *cursor, *scratch;
	struct orangefs_kernel_op_s *found = NULL;
	const int bucket = hash_func(tag, hash_table_size);

	spin_lock(&htable_ops_in_progress_lock);
	list_for_each_entry_safe(cursor, scratch,
				 &htable_ops_in_progress[bucket], list) {
		if (cursor->tag != tag)
			continue;

		list_del(&cursor->list);
		found = cursor;
		break;
	}
	spin_unlock(&htable_ops_in_progress_lock);

	return found;
}

74
static int orangefs_devreq_open(struct inode *inode, struct file *file)
M
Mike Marshall 已提交
75 76 77 78
{
	int ret = -EINVAL;

	if (!(file->f_flags & O_NONBLOCK)) {
79 80
		gossip_err("%s: device cannot be opened in blocking mode\n",
			   __func__);
M
Mike Marshall 已提交
81 82 83
		goto out;
	}
	ret = -EACCES;
84
	gossip_debug(GOSSIP_DEV_DEBUG, "client-core: opening device\n");
M
Mike Marshall 已提交
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103
	mutex_lock(&devreq_mutex);

	if (open_access_count == 0) {
		ret = generic_file_open(inode, file);
		if (ret == 0)
			open_access_count++;
	} else {
		DUMP_DEVICE_ERROR();
	}
	mutex_unlock(&devreq_mutex);

out:

	gossip_debug(GOSSIP_DEV_DEBUG,
		     "pvfs2-client-core: open device complete (ret = %d)\n",
		     ret);
	return ret;
}

104
/* Function for read() callers into the device */
105
static ssize_t orangefs_devreq_read(struct file *file,
M
Mike Marshall 已提交
106 107 108
				 char __user *buf,
				 size_t count, loff_t *offset)
{
109 110 111 112
	struct orangefs_kernel_op_s *op, *temp;
	__s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION;
	static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
	struct orangefs_kernel_op_s *cur_op = NULL;
113
	unsigned long ret;
M
Mike Marshall 已提交
114

115
	/* We do not support blocking IO. */
M
Mike Marshall 已提交
116
	if (!(file->f_flags & O_NONBLOCK)) {
117 118
		gossip_err("%s: blocking read from client-core.\n",
			   __func__);
M
Mike Marshall 已提交
119
		return -EINVAL;
120 121 122 123 124 125 126 127 128 129 130 131
	}

	/*
	 * The client will do an ioctl to find MAX_ALIGNED_DEV_REQ_UPSIZE, then
	 * always read with that size buffer.
	 */
	if (count != MAX_ALIGNED_DEV_REQ_UPSIZE) {
		gossip_err("orangefs: client-core tried to read wrong size\n");
		return -EINVAL;
	}

	/* Get next op (if any) from top of list. */
132 133
	spin_lock(&orangefs_request_list_lock);
	list_for_each_entry_safe(op, temp, &orangefs_request_list, list) {
134 135 136 137 138
		__s32 fsid;
		/* This lock is held past the end of the loop when we break. */
		spin_lock(&op->lock);

		fsid = fsid_of_op(op);
139
		if (fsid != ORANGEFS_FS_ID_NULL) {
140 141 142 143
			int ret;
			/* Skip ops whose filesystem needs to be mounted. */
			ret = fs_mount_pending(fsid);
			if (ret == 1) {
M
Mike Marshall 已提交
144
				gossip_debug(GOSSIP_DEV_DEBUG,
145 146 147 148
				    "orangefs: skipping op tag %llu %s\n",
				    llu(op->tag), get_opname_string(op));
				spin_unlock(&op->lock);
				continue;
149 150 151 152
			/*
			 * Skip ops whose filesystem we don't know about unless
			 * it is being mounted.
			 */
153 154
			/* XXX: is there a better way to detect this? */
			} else if (ret == -1 &&
155 156 157 158
				   !(op->upcall.type ==
					ORANGEFS_VFS_OP_FS_MOUNT ||
				     op->upcall.type ==
					ORANGEFS_VFS_OP_GETATTR)) {
159 160 161 162 163 164 165
				gossip_debug(GOSSIP_DEV_DEBUG,
				    "orangefs: skipping op tag %llu %s\n",
				    llu(op->tag), get_opname_string(op));
				gossip_err(
				    "orangefs: ERROR: fs_mount_pending %d\n",
				    fsid);
				spin_unlock(&op->lock);
M
Mike Marshall 已提交
166 167 168
				continue;
			}
		}
169 170 171 172 173 174 175 176 177 178 179 180 181 182
		/*
		 * Either this op does not pertain to a filesystem, is mounting
		 * a filesystem, or pertains to a mounted filesystem. Let it
		 * through.
		 */
		cur_op = op;
		break;
	}

	/*
	 * At this point we either have a valid op and can continue or have not
	 * found an op and must ask the client to try again later.
	 */
	if (!cur_op) {
183
		spin_unlock(&orangefs_request_list_lock);
184
		return -EAGAIN;
M
Mike Marshall 已提交
185 186
	}

187 188
	gossip_debug(GOSSIP_DEV_DEBUG, "orangefs: reading op tag %llu %s\n",
		     llu(cur_op->tag), get_opname_string(cur_op));
M
Mike Marshall 已提交
189

190 191 192 193 194 195 196
	/*
	 * Such an op should never be on the list in the first place. If so, we
	 * will abort.
	 */
	if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
		gossip_err("orangefs: ERROR: Current op already queued.\n");
		list_del(&cur_op->list);
M
Mike Marshall 已提交
197
		spin_unlock(&cur_op->lock);
198
		spin_unlock(&orangefs_request_list_lock);
199
		return -EAGAIN;
M
Mike Marshall 已提交
200
	}
201 202 203 204 205 206 207 208

	/*
	 * Set the operation to be in progress and move it between lists since
	 * it has been sent to the client.
	 */
	set_op_state_inprogress(cur_op);

	list_del(&cur_op->list);
209 210
	spin_unlock(&orangefs_request_list_lock);
	orangefs_devreq_add_op(cur_op);
211 212 213 214 215 216 217 218 219 220 221 222 223
	spin_unlock(&cur_op->lock);

	/* Push the upcall out. */
	ret = copy_to_user(buf, &proto_ver, sizeof(__s32));
	if (ret != 0)
		goto error;
	ret = copy_to_user(buf+sizeof(__s32), &magic, sizeof(__s32));
	if (ret != 0)
		goto error;
	ret = copy_to_user(buf+2 * sizeof(__s32), &cur_op->tag, sizeof(__u64));
	if (ret != 0)
		goto error;
	ret = copy_to_user(buf+2*sizeof(__s32)+sizeof(__u64), &cur_op->upcall,
224
			   sizeof(struct orangefs_upcall_s));
225 226 227 228 229 230 231 232 233 234 235 236
	if (ret != 0)
		goto error;

	/* The client only asks to read one size buffer. */
	return MAX_ALIGNED_DEV_REQ_UPSIZE;
error:
	/*
	 * We were unable to copy the op data to the client. Put the op back in
	 * list. If client has crashed, the op will be purged later when the
	 * device is released.
	 */
	gossip_err("orangefs: Failed to copy data to user space\n");
237
	spin_lock(&orangefs_request_list_lock);
238 239
	spin_lock(&cur_op->lock);
	set_op_state_waiting(cur_op);
240 241
	orangefs_devreq_remove_op(cur_op->tag);
	list_add(&cur_op->list, &orangefs_request_list);
242
	spin_unlock(&cur_op->lock);
243
	spin_unlock(&orangefs_request_list_lock);
244
	return -EFAULT;
M
Mike Marshall 已提交
245 246
}

247 248 249 250 251
/*
 * Function for writev() callers into the device. Readdir related
 * operations have an extra iovec containing info about objects
 * contained in directories.
 */
252
static ssize_t orangefs_devreq_writev(struct file *file,
M
Mike Marshall 已提交
253 254 255 256
				   const struct iovec *iov,
				   size_t count,
				   loff_t *offset)
{
257
	struct orangefs_kernel_op_s *op = NULL;
M
Mike Marshall 已提交
258 259 260
	void *buffer = NULL;
	void *ptr = NULL;
	unsigned long i = 0;
261 262 263 264 265 266 267 268 269
	int num_remaining = MAX_ALIGNED_DEV_REQ_DOWNSIZE;
	int ret = 0;
	/* num elements in iovec without trailer */
	int notrailer_count = 4;
	/*
	 * If there's a trailer, its iov index will be equal to
	 * notrailer_count.
	 */
	int trailer_index = notrailer_count;
M
Mike Marshall 已提交
270
	int payload_size = 0;
271
	int returned_downcall_size = 0;
M
Mike Marshall 已提交
272 273 274 275 276
	__s32 magic = 0;
	__s32 proto_ver = 0;
	__u64 tag = 0;
	ssize_t total_returned_size = 0;

277 278 279 280 281
	/*
	 * There will always be at least notrailer_count iovecs, and
	 * when there's a trailer, one more than notrailer_count. Check
	 * count's sanity.
	 */
M
Mike Marshall 已提交
282
	if (count != notrailer_count && count != (notrailer_count + 1)) {
283 284
		gossip_err("%s: count:%zu: notrailer_count :%d:\n",
			__func__,
M
Mike Marshall 已提交
285 286 287 288
			count,
			notrailer_count);
		return -EPROTO;
	}
289 290 291


	/* Copy the non-trailer iovec data into a device request buffer. */
M
Mike Marshall 已提交
292
	buffer = dev_req_alloc();
293 294
	if (!buffer) {
		gossip_err("%s: dev_req_alloc failed.\n", __func__);
M
Mike Marshall 已提交
295
		return -ENOMEM;
296
	}
M
Mike Marshall 已提交
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
	ptr = buffer;
	for (i = 0; i < notrailer_count; i++) {
		if (iov[i].iov_len > num_remaining) {
			gossip_err
			    ("writev error: Freeing buffer and returning\n");
			dev_req_release(buffer);
			return -EMSGSIZE;
		}
		ret = copy_from_user(ptr, iov[i].iov_base, iov[i].iov_len);
		if (ret) {
			gossip_err("Failed to copy data from user space\n");
			dev_req_release(buffer);
			return -EIO;
		}
		num_remaining -= iov[i].iov_len;
		ptr += iov[i].iov_len;
		payload_size += iov[i].iov_len;
	}
	total_returned_size = payload_size;

	/* these elements are currently 8 byte aligned (8 bytes for (version +
	 * magic) 8 bytes for tag).  If you add another element, either
	 * make it 8 bytes big, or use get_unaligned when asigning.
	 */
	ptr = buffer;
322
	proto_ver = *((__s32 *) ptr); /* unused */
M
Mike Marshall 已提交
323 324 325 326 327 328 329 330
	ptr += sizeof(__s32);

	magic = *((__s32 *) ptr);
	ptr += sizeof(__s32);

	tag = *((__u64 *) ptr);
	ptr += sizeof(__u64);

331
	if (magic != ORANGEFS_DEVREQ_MAGIC) {
M
Mike Marshall 已提交
332 333 334 335 336
		gossip_err("Error: Device magic number does not match.\n");
		dev_req_release(buffer);
		return -EPROTO;
	}

337
	op = orangefs_devreq_remove_op(tag);
M
Mike Marshall 已提交
338 339 340
	if (op) {
		/* Increase ref count! */
		get_op(op);
341 342 343 344 345 346 347 348

		/* calculate the size of the returned downcall. */
		returned_downcall_size =
			payload_size - (2 * sizeof(__s32) + sizeof(__u64));

		/* copy the passed in downcall into the op */
		if (returned_downcall_size ==
			sizeof(struct orangefs_downcall_s)) {
M
Mike Marshall 已提交
349 350
			memcpy(&op->downcall,
			       ptr,
351
			       sizeof(struct orangefs_downcall_s));
352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
		} else {
			gossip_err("%s: returned downcall size:%d: \n",
				   __func__,
				   returned_downcall_size);
			dev_req_release(buffer);
			put_op(op);
			return -EMSGSIZE;
		}

		/* Don't tolerate an unexpected trailer iovec. */
		if ((op->downcall.trailer_size == 0) &&
		    (count != notrailer_count)) {
			gossip_err("%s: unexpected trailer iovec.\n",
				   __func__);
			dev_req_release(buffer);
			put_op(op);
			return -EPROTO;
		}

		/* Don't consider the trailer if there's a bad status. */
		if (op->downcall.status != 0)
			goto no_trailer;

		/* get the trailer if there is one. */
		if (op->downcall.trailer_size == 0)
			goto no_trailer;

		gossip_debug(GOSSIP_DEV_DEBUG,
			     "%s: op->downcall.trailer_size %lld\n",
			     __func__,
			     op->downcall.trailer_size);
M
Mike Marshall 已提交
383

384 385 386
		/*
		 * Bail if we think think there should be a trailer, but
		 * there's no iovec for it.
M
Mike Marshall 已提交
387
		 */
388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
		if (count != (notrailer_count + 1)) {
			gossip_err("%s: trailer_size:%lld: count:%zu:\n",
				   __func__,
				   op->downcall.trailer_size,
				   count);
			dev_req_release(buffer);
			put_op(op);
			return -EPROTO;
		}

		/* Verify that trailer_size is accurate. */
		if (op->downcall.trailer_size != iov[trailer_index].iov_len) {
			gossip_err("%s: trailer_size:%lld: != iov_len:%zd:\n",
				   __func__,
				   op->downcall.trailer_size,
				   iov[trailer_index].iov_len);
			dev_req_release(buffer);
			put_op(op);
			return -EMSGSIZE;
		}

		total_returned_size += iov[trailer_index].iov_len;

		/*
		 * Allocate a buffer, copy the trailer bytes into it and
		 * attach it to the downcall.
		 */
		op->downcall.trailer_buf = vmalloc(iov[trailer_index].iov_len);
		if (op->downcall.trailer_buf != NULL) {
			gossip_debug(GOSSIP_DEV_DEBUG, "vmalloc: %p\n",
				     op->downcall.trailer_buf);
			ret = copy_from_user(op->downcall.trailer_buf,
					     iov[trailer_index].iov_base,
					     iov[trailer_index].iov_len);
			if (ret) {
				gossip_err("%s: Failed to copy trailer.\n",
					   __func__);
M
Mike Marshall 已提交
425
				dev_req_release(buffer);
426 427
				gossip_debug(GOSSIP_DEV_DEBUG,
					     "vfree: %p\n",
M
Mike Marshall 已提交
428
					     op->downcall.trailer_buf);
429 430 431 432
				vfree(op->downcall.trailer_buf);
				op->downcall.trailer_buf = NULL;
				put_op(op);
				return -EIO;
M
Mike Marshall 已提交
433
			}
434 435 436 437 438
		} else {
			gossip_err("writev: could not vmalloc for trailer!\n");
			dev_req_release(buffer);
			put_op(op);
			return -ENOMEM;
M
Mike Marshall 已提交
439 440
		}

441 442 443
no_trailer:

		/* if this operation is an I/O operation we need to wait
M
Mike Marshall 已提交
444 445 446 447 448 449 450 451 452
		 * for all data to be copied before we can return to avoid
		 * buffer corruption and races that can pull the buffers
		 * out from under us.
		 *
		 * Essentially we're synchronizing with other parts of the
		 * vfs implicitly by not allowing the user space
		 * application reading/writing this device to return until
		 * the buffers are done being used.
		 */
453
		if (op->upcall.type == ORANGEFS_VFS_OP_FILE_IO) {
M
Mike Marshall 已提交
454
			int timed_out = 0;
455
			DEFINE_WAIT(wait_entry);
M
Mike Marshall 已提交
456

457 458
			/*
			 * tell the vfs op waiting on a waitqueue
M
Mike Marshall 已提交
459 460 461 462 463 464 465 466 467 468
			 * that this op is done
			 */
			spin_lock(&op->lock);
			set_op_state_serviced(op);
			spin_unlock(&op->lock);

			wake_up_interruptible(&op->waitq);

			while (1) {
				spin_lock(&op->lock);
469 470 471 472
				prepare_to_wait_exclusive(
					&op->io_completion_waitq,
					&wait_entry,
					TASK_INTERRUPTIBLE);
M
Mike Marshall 已提交
473 474 475 476 477 478 479 480 481 482 483
				if (op->io_completed) {
					spin_unlock(&op->lock);
					break;
				}
				spin_unlock(&op->lock);

				if (!signal_pending(current)) {
					int timeout =
					    MSECS_TO_JIFFIES(1000 *
							     op_timeout_secs);
					if (!schedule_timeout(timeout)) {
484 485 486
						gossip_debug(GOSSIP_DEV_DEBUG,
							"%s: timed out.\n",
							__func__);
M
Mike Marshall 已提交
487 488 489 490 491 492
						timed_out = 1;
						break;
					}
					continue;
				}

493 494 495
				gossip_debug(GOSSIP_DEV_DEBUG,
					"%s: signal on I/O wait, aborting\n",
					__func__);
M
Mike Marshall 已提交
496 497 498
				break;
			}

499 500 501
			spin_lock(&op->lock);
			finish_wait(&op->io_completion_waitq, &wait_entry);
			spin_unlock(&op->lock);
M
Mike Marshall 已提交
502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520

			/* NOTE: for I/O operations we handle releasing the op
			 * object except in the case of timeout.  the reason we
			 * can't free the op in timeout cases is that the op
			 * service logic in the vfs retries operations using
			 * the same op ptr, thus it can't be freed.
			 */
			if (!timed_out)
				op_release(op);
		} else {

			/*
			 * tell the vfs op waiting on a waitqueue that
			 * this op is done
			 */
			spin_lock(&op->lock);
			set_op_state_serviced(op);
			spin_unlock(&op->lock);
			/*
521 522 523
			 * for every other operation (i.e. non-I/O), we need to
			 * wake up the callers for downcall completion
			 * notification
M
Mike Marshall 已提交
524 525 526 527 528 529 530 531 532
			 */
			wake_up_interruptible(&op->waitq);
		}
	} else {
		/* ignore downcalls that we're not interested in */
		gossip_debug(GOSSIP_DEV_DEBUG,
			     "WARNING: No one's waiting for tag %llu\n",
			     llu(tag));
	}
533
	/* put_op? */
M
Mike Marshall 已提交
534 535 536 537 538
	dev_req_release(buffer);

	return total_returned_size;
}

539
static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
M
Mike Marshall 已提交
540 541
				      struct iov_iter *iter)
{
542
	return orangefs_devreq_writev(iocb->ki_filp,
M
Mike Marshall 已提交
543 544 545 546 547 548 549 550 551
				   iter->iov,
				   iter->nr_segs,
				   &iocb->ki_pos);
}

/* Returns whether any FS are still pending remounted */
static int mark_all_pending_mounts(void)
{
	int unmounted = 1;
552
	struct orangefs_sb_info_s *orangefs_sb = NULL;
M
Mike Marshall 已提交
553

554 555
	spin_lock(&orangefs_superblocks_lock);
	list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) {
M
Mike Marshall 已提交
556
		/* All of these file system require a remount */
557
		orangefs_sb->mount_pending = 1;
M
Mike Marshall 已提交
558 559
		unmounted = 0;
	}
560
	spin_unlock(&orangefs_superblocks_lock);
M
Mike Marshall 已提交
561 562 563 564 565 566 567 568 569 570 571 572
	return unmounted;
}

/*
 * Report whether the file system with the given fsid needs to be remounted.
 *
 * Returns the mount_pending value of the first superblock whose fs_id
 * matches (0 if already mounted, 1 if it needs a remount), or -1 when no
 * superblock on orangefs_superblocks has that fsid.
 */
int fs_mount_pending(__s32 fsid)
{
	struct orangefs_sb_info_s *sb_info;
	int state = -1;

	spin_lock(&orangefs_superblocks_lock);
	list_for_each_entry(sb_info, &orangefs_superblocks, list) {
		if (sb_info->fs_id != fsid)
			continue;

		state = sb_info->mount_pending;
		break;
	}
	spin_unlock(&orangefs_superblocks_lock);

	return state;
}

/*
 * NOTE: gets called when the last reference to this device is dropped.
 * Using the open_access_count variable, we enforce a reference count
 * on this file so that it can be opened by only one process at a time.
 * the devreq_mutex is used to make sure all i/o has completed
591
 * before we call orangefs_bufmap_finalize, and similar such tricky
M
Mike Marshall 已提交
592 593
 * situations
 */
594
static int orangefs_devreq_release(struct inode *inode, struct file *file)
M
Mike Marshall 已提交
595 596 597 598 599 600 601 602
{
	int unmounted = 0;

	gossip_debug(GOSSIP_DEV_DEBUG,
		     "%s:pvfs2-client-core: exiting, closing device\n",
		     __func__);

	mutex_lock(&devreq_mutex);
603
	orangefs_bufmap_finalize();
M
Mike Marshall 已提交
604 605 606 607

	open_access_count--;

	unmounted = mark_all_pending_mounts();
608
	gossip_debug(GOSSIP_DEV_DEBUG, "ORANGEFS Device Close: Filesystem(s) %s\n",
M
Mike Marshall 已提交
609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643
		     (unmounted ? "UNMOUNTED" : "MOUNTED"));
	mutex_unlock(&devreq_mutex);

	/*
	 * Walk through the list of ops in the request list, mark them
	 * as purged and wake them up.
	 */
	purge_waiting_ops();
	/*
	 * Walk through the hash table of in progress operations; mark
	 * them as purged and wake them up
	 */
	purge_inprogress_ops();
	gossip_debug(GOSSIP_DEV_DEBUG,
		     "pvfs2-client-core: device close complete\n");
	return 0;
}

/*
 * Tell whether client-core is alive, judged by the access count kept on
 * the device.
 *
 * Returns 0 when the device is open exactly once and -EIO otherwise. The
 * count is sampled under devreq_mutex.
 */
int is_daemon_in_service(void)
{
	int status = -EIO;

	mutex_lock(&devreq_mutex);
	if (open_access_count == 1)
		status = 0;
	mutex_unlock(&devreq_mutex);

	return status;
}

static inline long check_ioctl_command(unsigned int command)
{
	/* Check for valid ioctl codes */
644
	if (_IOC_TYPE(command) != ORANGEFS_DEV_MAGIC) {
M
Mike Marshall 已提交
645 646 647
		gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n",
			command,
			_IOC_TYPE(command),
648
			ORANGEFS_DEV_MAGIC);
M
Mike Marshall 已提交
649 650 651
		return -EINVAL;
	}
	/* and valid ioctl commands */
652
	if (_IOC_NR(command) >= ORANGEFS_DEV_MAXNR || _IOC_NR(command) <= 0) {
M
Mike Marshall 已提交
653
		gossip_err("Invalid ioctl command number [%d >= %d]\n",
654
			   _IOC_NR(command), ORANGEFS_DEV_MAXNR);
M
Mike Marshall 已提交
655 656 657 658 659 660 661
		return -ENOIOCTLCMD;
	}
	return 0;
}

static long dispatch_ioctl_command(unsigned int command, unsigned long arg)
{
662
	static __s32 magic = ORANGEFS_DEVREQ_MAGIC;
M
Mike Marshall 已提交
663 664
	static __s32 max_up_size = MAX_ALIGNED_DEV_REQ_UPSIZE;
	static __s32 max_down_size = MAX_ALIGNED_DEV_REQ_DOWNSIZE;
665
	struct ORANGEFS_dev_map_desc user_desc;
M
Mike Marshall 已提交
666 667 668 669 670
	int ret = 0;
	struct dev_mask_info_s mask_info = { 0 };
	struct dev_mask2_info_s mask2_info = { 0, 0 };
	int upstream_kmod = 1;
	struct list_head *tmp = NULL;
671
	struct orangefs_sb_info_s *orangefs_sb = NULL;
M
Mike Marshall 已提交
672 673 674 675

	/* mtmoore: add locking here */

	switch (command) {
676
	case ORANGEFS_DEV_GET_MAGIC:
M
Mike Marshall 已提交
677 678 679
		return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ?
			-EIO :
			0);
680
	case ORANGEFS_DEV_GET_MAX_UPSIZE:
M
Mike Marshall 已提交
681 682 683 684
		return ((put_user(max_up_size,
				  (__s32 __user *) arg) == -EFAULT) ?
					-EIO :
					0);
685
	case ORANGEFS_DEV_GET_MAX_DOWNSIZE:
M
Mike Marshall 已提交
686 687 688 689
		return ((put_user(max_down_size,
				  (__s32 __user *) arg) == -EFAULT) ?
					-EIO :
					0);
690
	case ORANGEFS_DEV_MAP:
M
Mike Marshall 已提交
691
		ret = copy_from_user(&user_desc,
692
				     (struct ORANGEFS_dev_map_desc __user *)
M
Mike Marshall 已提交
693
				     arg,
694 695 696
				     sizeof(struct ORANGEFS_dev_map_desc));
		return ret ? -EIO : orangefs_bufmap_initialize(&user_desc);
	case ORANGEFS_DEV_REMOUNT_ALL:
M
Mike Marshall 已提交
697
		gossip_debug(GOSSIP_DEV_DEBUG,
698 699
			     "%s: got ORANGEFS_DEV_REMOUNT_ALL\n",
			     __func__);
M
Mike Marshall 已提交
700 701

		/*
702
		 * remount all mounted orangefs volumes to regain the lost
M
Mike Marshall 已提交
703 704 705 706 707 708 709 710 711 712 713
		 * dynamic mount tables (if any) -- NOTE: this is done
		 * without keeping the superblock list locked due to the
		 * upcall/downcall waiting.  also, the request semaphore is
		 * used to ensure that no operations will be serviced until
		 * all of the remounts are serviced (to avoid ops between
		 * mounts to fail)
		 */
		ret = mutex_lock_interruptible(&request_mutex);
		if (ret < 0)
			return ret;
		gossip_debug(GOSSIP_DEV_DEBUG,
714 715
			     "%s: priority remount in progress\n",
			     __func__);
716 717
		list_for_each(tmp, &orangefs_superblocks) {
			orangefs_sb =
718 719 720
				list_entry(tmp,
					   struct orangefs_sb_info_s,
					   list);
721
			if (orangefs_sb && (orangefs_sb->sb)) {
M
Mike Marshall 已提交
722
				gossip_debug(GOSSIP_DEV_DEBUG,
723 724
					     "%s: Remounting SB %p\n",
					     __func__,
725
					     orangefs_sb);
M
Mike Marshall 已提交
726

727
				ret = orangefs_remount(orangefs_sb->sb);
M
Mike Marshall 已提交
728 729 730
				if (ret) {
					gossip_debug(GOSSIP_DEV_DEBUG,
						     "SB %p remount failed\n",
731
						     orangefs_sb);
732
					break;
M
Mike Marshall 已提交
733 734 735 736
				}
			}
		}
		gossip_debug(GOSSIP_DEV_DEBUG,
737 738
			     "%s: priority remount complete\n",
			     __func__);
M
Mike Marshall 已提交
739 740 741
		mutex_unlock(&request_mutex);
		return ret;

742
	case ORANGEFS_DEV_UPSTREAM:
M
Mike Marshall 已提交
743 744 745 746 747 748 749 750 751
		ret = copy_to_user((void __user *)arg,
				    &upstream_kmod,
				    sizeof(upstream_kmod));

		if (ret != 0)
			return -EIO;
		else
			return ret;

752
	case ORANGEFS_DEV_CLIENT_MASK:
M
Mike Marshall 已提交
753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770
		ret = copy_from_user(&mask2_info,
				     (void __user *)arg,
				     sizeof(struct dev_mask2_info_s));

		if (ret != 0)
			return -EIO;

		client_debug_mask.mask1 = mask2_info.mask1_value;
		client_debug_mask.mask2 = mask2_info.mask2_value;

		pr_info("%s: client debug mask has been been received "
			":%llx: :%llx:\n",
			__func__,
			(unsigned long long)client_debug_mask.mask1,
			(unsigned long long)client_debug_mask.mask2);

		return ret;

771
	case ORANGEFS_DEV_CLIENT_STRING:
M
Mike Marshall 已提交
772 773
		ret = copy_from_user(&client_debug_array_string,
				     (void __user *)arg,
774
				     ORANGEFS_MAX_DEBUG_STRING_LEN);
M
Mike Marshall 已提交
775
		if (ret != 0) {
776
			pr_info("%s: CLIENT_STRING: copy_from_user failed\n",
M
Mike Marshall 已提交
777 778 779 780
				__func__);
			return -EIO;
		}

781
		pr_info("%s: client debug array string has been received.\n",
M
Mike Marshall 已提交
782 783 784 785 786 787 788 789 790
			__func__);

		if (!help_string_initialized) {

			/* Free the "we don't know yet" default string... */
			kfree(debug_help_string);

			/* build a proper debug help string */
			if (orangefs_prepare_debugfs_help_string(0)) {
791
				gossip_err("%s: no debug help string \n",
M
Mike Marshall 已提交
792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819
					   __func__);
				return -EIO;
			}

			/* Replace the boilerplate boot-time debug-help file. */
			debugfs_remove(help_file_dentry);

			help_file_dentry =
				debugfs_create_file(
					ORANGEFS_KMOD_DEBUG_HELP_FILE,
					0444,
					debug_dir,
					debug_help_string,
					&debug_help_fops);

			if (!help_file_dentry) {
				gossip_err("%s: debugfs_create_file failed for"
					   " :%s:!\n",
					   __func__,
					   ORANGEFS_KMOD_DEBUG_HELP_FILE);
				return -EIO;
			}
		}

		debug_mask_to_string(&client_debug_mask, 1);

		debugfs_remove(client_debug_dentry);

820
		orangefs_client_debug_init();
M
Mike Marshall 已提交
821 822 823 824 825

		help_string_initialized++;

		return ret;

826
	case ORANGEFS_DEV_DEBUG:
M
Mike Marshall 已提交
827 828 829 830 831 832 833 834 835 836 837 838 839 840
		ret = copy_from_user(&mask_info,
				     (void __user *)arg,
				     sizeof(mask_info));

		if (ret != 0)
			return -EIO;

		if (mask_info.mask_type == KERNEL_MASK) {
			if ((mask_info.mask_value == 0)
			    && (kernel_mask_set_mod_init)) {
				/*
				 * the kernel debug mask was set when the
				 * kernel module was loaded; don't override
				 * it if the client-core was started without
841
				 * a value for ORANGEFS_KMODMASK.
M
Mike Marshall 已提交
842 843 844 845 846 847
				 */
				return 0;
			}
			debug_mask_to_string(&mask_info.mask_value,
					     mask_info.mask_type);
			gossip_debug_mask = mask_info.mask_value;
848
			pr_info("%s: kernel debug mask has been modified to "
M
Mike Marshall 已提交
849
				":%s: :%llx:\n",
850
				__func__,
M
Mike Marshall 已提交
851 852 853 854 855
				kernel_debug_string,
				(unsigned long long)gossip_debug_mask);
		} else if (mask_info.mask_type == CLIENT_MASK) {
			debug_mask_to_string(&mask_info.mask_value,
					     mask_info.mask_type);
856
			pr_info("%s: client debug mask has been modified to"
M
Mike Marshall 已提交
857
				":%s: :%llx:\n",
858
				__func__,
M
Mike Marshall 已提交
859 860 861 862 863 864 865 866 867 868 869 870 871 872 873
				client_debug_string,
				llu(mask_info.mask_value));
		} else {
			gossip_lerr("Invalid mask type....\n");
			return -EINVAL;
		}

		return ret;

	default:
		return -ENOIOCTLCMD;
	}
	return -ENOIOCTLCMD;
}

874
/*
 * unlocked_ioctl handler: validate the command, then dispatch it.
 *
 * A negative result from the validation is returned as is; otherwise the
 * dispatcher's result is returned. Either value is narrowed to int first.
 */
static long orangefs_devreq_ioctl(struct file *file,
			       unsigned int command, unsigned long arg)
{
	/* Check for properly constructed commands */
	long status = check_ioctl_command(command);

	if (status >= 0)
		status = dispatch_ioctl_command(command, arg);

	return (int)status;
}

#ifdef CONFIG_COMPAT		/* CONFIG_COMPAT is in .config */

889 890
/*  Compat structure for the ORANGEFS_DEV_MAP ioctl */
struct ORANGEFS_dev_map_desc32 {
M
Mike Marshall 已提交
891 892 893 894 895 896 897 898
	compat_uptr_t ptr;
	__s32 total_size;
	__s32 size;
	__s32 count;
};

static unsigned long translate_dev_map26(unsigned long args, long *error)
{
899
	struct ORANGEFS_dev_map_desc32 __user *p32 = (void __user *)args;
M
Mike Marshall 已提交
900 901 902 903
	/*
	 * Depending on the architecture, allocate some space on the
	 * user-call-stack based on our expected layout.
	 */
904
	struct ORANGEFS_dev_map_desc __user *p =
M
Mike Marshall 已提交
905
	    compat_alloc_user_space(sizeof(*p));
906
	compat_uptr_t addr;
M
Mike Marshall 已提交
907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931

	*error = 0;
	/* get the ptr from the 32 bit user-space */
	if (get_user(addr, &p32->ptr))
		goto err;
	/* try to put that into a 64-bit layout */
	if (put_user(compat_ptr(addr), &p->ptr))
		goto err;
	/* copy the remaining fields */
	if (copy_in_user(&p->total_size, &p32->total_size, sizeof(__s32)))
		goto err;
	if (copy_in_user(&p->size, &p32->size, sizeof(__s32)))
		goto err;
	if (copy_in_user(&p->count, &p32->count, sizeof(__s32)))
		goto err;
	return (unsigned long)p;
err:
	*error = -EFAULT;
	return 0;
}

/*
 * 32 bit user-space apps' ioctl handlers when kernel modules
 * is compiled as a 64 bit one
 */
932
static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd,
M
Mike Marshall 已提交
933 934 935 936 937 938 939 940 941
				      unsigned long args)
{
	long ret;
	unsigned long arg = args;

	/* Check for properly constructed commands */
	ret = check_ioctl_command(cmd);
	if (ret < 0)
		return ret;
942
	if (cmd == ORANGEFS_DEV_MAP) {
M
Mike Marshall 已提交
943 944 945 946 947 948 949 950 951 952 953 954 955 956
		/*
		 * convert the arguments to what we expect internally
		 * in kernel space
		 */
		arg = translate_dev_map26(args, &ret);
		if (ret < 0) {
			gossip_err("Could not translate dev map\n");
			return ret;
		}
	}
	/* no other ioctl requires translation */
	return dispatch_ioctl_command(cmd, arg);
}

M
Mike Marshall 已提交
957 958 959 960 961 962 963 964
#endif /* CONFIG_COMPAT is in .config */

/*
 * The following two ioctl32 functions had been refactored into the above
 * CONFIG_COMPAT ifdef, but that was an oversimplification that was
 * not noticed until we tried to compile on PowerPC...
 */
#if (defined(CONFIG_COMPAT) && !defined(HAVE_REGISTER_IOCTL32_CONVERSION)) || !defined(CONFIG_COMPAT)
965
/* Nothing to register in this configuration; always reports success (0). */
static int orangefs_ioctl32_init(void)
{
	const int status = 0;

	return status;
}

970
/* Counterpart of orangefs_ioctl32_init(); nothing to undo here. */
static void orangefs_ioctl32_cleanup(void)
{
}
M
Mike Marshall 已提交
974
#endif
M
Mike Marshall 已提交
975 976

/* the assigned character device major number */
977
static int orangefs_dev_major;
M
Mike Marshall 已提交
978 979

/*
980
 * Initialize orangefs device specific state:
M
Mike Marshall 已提交
981 982
 * Must be called at module load time only
 */
983
int orangefs_dev_init(void)
M
Mike Marshall 已提交
984 985 986 987
{
	int ret;

	/* register the ioctl32 sub-system */
988
	ret = orangefs_ioctl32_init();
M
Mike Marshall 已提交
989 990 991
	if (ret < 0)
		return ret;

992 993 994 995 996
	/* register orangefs-req device  */
	orangefs_dev_major = register_chrdev(0,
					  ORANGEFS_REQDEVICE_NAME,
					  &orangefs_devreq_file_operations);
	if (orangefs_dev_major < 0) {
M
Mike Marshall 已提交
997 998
		gossip_debug(GOSSIP_DEV_DEBUG,
			     "Failed to register /dev/%s (error %d)\n",
999 1000 1001
			     ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
		orangefs_ioctl32_cleanup();
		return orangefs_dev_major;
M
Mike Marshall 已提交
1002 1003 1004 1005
	}

	gossip_debug(GOSSIP_DEV_DEBUG,
		     "*** /dev/%s character device registered ***\n",
1006
		     ORANGEFS_REQDEVICE_NAME);
M
Mike Marshall 已提交
1007
	gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n",
1008
		     ORANGEFS_REQDEVICE_NAME, orangefs_dev_major);
M
Mike Marshall 已提交
1009 1010 1011
	return 0;
}

1012
void orangefs_dev_cleanup(void)
M
Mike Marshall 已提交
1013
{
1014
	unregister_chrdev(orangefs_dev_major, ORANGEFS_REQDEVICE_NAME);
M
Mike Marshall 已提交
1015 1016
	gossip_debug(GOSSIP_DEV_DEBUG,
		     "*** /dev/%s character device unregistered ***\n",
1017
		     ORANGEFS_REQDEVICE_NAME);
M
Mike Marshall 已提交
1018
	/* unregister the ioctl32 sub-system */
1019
	orangefs_ioctl32_cleanup();
M
Mike Marshall 已提交
1020 1021
}

1022
static unsigned int orangefs_devreq_poll(struct file *file,
M
Mike Marshall 已提交
1023 1024 1025 1026 1027
				      struct poll_table_struct *poll_table)
{
	int poll_revent_mask = 0;

	if (open_access_count == 1) {
1028
		poll_wait(file, &orangefs_request_list_waitq, poll_table);
M
Mike Marshall 已提交
1029

1030 1031
		spin_lock(&orangefs_request_list_lock);
		if (!list_empty(&orangefs_request_list))
M
Mike Marshall 已提交
1032
			poll_revent_mask |= POLL_IN;
1033
		spin_unlock(&orangefs_request_list_lock);
M
Mike Marshall 已提交
1034 1035 1036 1037
	}
	return poll_revent_mask;
}

1038
const struct file_operations orangefs_devreq_file_operations = {
M
Mike Marshall 已提交
1039
	.owner = THIS_MODULE,
1040 1041 1042 1043 1044
	.read = orangefs_devreq_read,
	.write_iter = orangefs_devreq_write_iter,
	.open = orangefs_devreq_open,
	.release = orangefs_devreq_release,
	.unlocked_ioctl = orangefs_devreq_ioctl,
M
Mike Marshall 已提交
1045 1046

#ifdef CONFIG_COMPAT		/* CONFIG_COMPAT is in .config */
1047
	.compat_ioctl = orangefs_devreq_compat_ioctl,
M
Mike Marshall 已提交
1048
#endif
1049
	.poll = orangefs_devreq_poll
M
Mike Marshall 已提交
1050
};