orangefs-bufmap.c 14.0 KB
Newer Older
M
Mike Marshall 已提交
1 2 3 4 5 6
/*
 * (C) 2001 Clemson University and The University of Chicago
 *
 * See COPYING in top-level directory.
 */
#include "protocol.h"
7 8
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
M
Mike Marshall 已提交
9

10
/*
 * Wait queue for I/O requests that have to wait until the shared memory
 * system has been (re)initialized; orangefs_bufmap_initialize() wakes it
 * after a new bufmap has been installed.
 */
DECLARE_WAIT_QUEUE_HEAD(orangefs_bufmap_init_waitq);
M
Mike Marshall 已提交
11

12 13 14 15 16 17 18 19
/*
 * Describes one mapped buffer: a window into the bufmap's page array
 * together with the user-space address that corresponds to it.
 */
struct orangefs_bufmap_desc {
	void *uaddr;			/* user space address pointer */
	struct page **page_array;	/* first page of this buffer */
	int array_count;		/* number of entries in page_array */
	struct list_head list_link;	/* not referenced by the code in this file's visible part */
};

20
static struct orangefs_bufmap {
M
Mike Marshall 已提交
21 22 23 24 25 26 27 28 29
	atomic_t refcnt;

	int desc_size;
	int desc_shift;
	int desc_count;
	int total_size;
	int page_count;

	struct page **page_array;
30
	struct orangefs_bufmap_desc *desc_array;
M
Mike Marshall 已提交
31 32 33 34 35 36

	/* array to track usage of buffer descriptors */
	int *buffer_index_array;
	spinlock_t buffer_index_lock;

	/* array to track usage of buffer descriptors for readdir */
37
	int readdir_index_array[ORANGEFS_READDIR_DEFAULT_DESC_COUNT];
M
Mike Marshall 已提交
38
	spinlock_t readdir_index_lock;
39
} *__orangefs_bufmap;
M
Mike Marshall 已提交
40

41
/*
 * Held while __orangefs_bufmap is installed, while a reference is taken
 * through it, and while it is cleared on the final unref.
 */
static DEFINE_SPINLOCK(orangefs_bufmap_lock);
M
Mike Marshall 已提交
42 43

static void
44
orangefs_bufmap_unmap(struct orangefs_bufmap *bufmap)
M
Mike Marshall 已提交
45 46 47 48 49 50 51 52
{
	int i;

	for (i = 0; i < bufmap->page_count; i++)
		page_cache_release(bufmap->page_array[i]);
}

static void
53
orangefs_bufmap_free(struct orangefs_bufmap *bufmap)
M
Mike Marshall 已提交
54 55 56 57 58 59 60
{
	kfree(bufmap->page_array);
	kfree(bufmap->desc_array);
	kfree(bufmap->buffer_index_array);
	kfree(bufmap);
}

61
static struct orangefs_bufmap *orangefs_bufmap_ref(void)
M
Mike Marshall 已提交
62
{
63
	struct orangefs_bufmap *bufmap = NULL;
M
Mike Marshall 已提交
64

65 66 67
	spin_lock(&orangefs_bufmap_lock);
	if (__orangefs_bufmap) {
		bufmap = __orangefs_bufmap;
M
Mike Marshall 已提交
68 69
		atomic_inc(&bufmap->refcnt);
	}
70
	spin_unlock(&orangefs_bufmap_lock);
M
Mike Marshall 已提交
71 72 73
	return bufmap;
}

74
static void orangefs_bufmap_unref(struct orangefs_bufmap *bufmap)
M
Mike Marshall 已提交
75
{
76 77 78
	if (atomic_dec_and_lock(&bufmap->refcnt, &orangefs_bufmap_lock)) {
		__orangefs_bufmap = NULL;
		spin_unlock(&orangefs_bufmap_lock);
M
Mike Marshall 已提交
79

80 81
		orangefs_bufmap_unmap(bufmap);
		orangefs_bufmap_free(bufmap);
M
Mike Marshall 已提交
82 83 84
	}
}

85
inline int orangefs_bufmap_size_query(void)
M
Mike Marshall 已提交
86
{
87
	struct orangefs_bufmap *bufmap = orangefs_bufmap_ref();
M
Mike Marshall 已提交
88 89
	int size = bufmap ? bufmap->desc_size : 0;

90
	orangefs_bufmap_unref(bufmap);
M
Mike Marshall 已提交
91 92 93
	return size;
}

94
inline int orangefs_bufmap_shift_query(void)
M
Mike Marshall 已提交
95
{
96
	struct orangefs_bufmap *bufmap = orangefs_bufmap_ref();
M
Mike Marshall 已提交
97 98
	int shift = bufmap ? bufmap->desc_shift : 0;

99
	orangefs_bufmap_unref(bufmap);
M
Mike Marshall 已提交
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
	return shift;
}

/* tasks waiting for a free I/O buffer slot (orangefs_bufmap_get/put) */
static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq);
/* tasks waiting for a free readdir slot (readdir_index_get/put) */
static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq);

/*
 * get_bufmap_init
 *
 * If bufmap_init is 1, then the shared memory system, including the
 * buffer_index_array, is available.  Otherwise, it is not.
 *
 * returns the value of bufmap_init
 */
int get_bufmap_init(void)
{
116
	return __orangefs_bufmap ? 1 : 0;
M
Mike Marshall 已提交
117 118 119
}


120 121
static struct orangefs_bufmap *
orangefs_bufmap_alloc(struct ORANGEFS_dev_map_desc *user_desc)
M
Mike Marshall 已提交
122
{
123
	struct orangefs_bufmap *bufmap;
M
Mike Marshall 已提交
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138

	bufmap = kzalloc(sizeof(*bufmap), GFP_KERNEL);
	if (!bufmap)
		goto out;

	atomic_set(&bufmap->refcnt, 1);
	bufmap->total_size = user_desc->total_size;
	bufmap->desc_count = user_desc->count;
	bufmap->desc_size = user_desc->size;
	bufmap->desc_shift = ilog2(bufmap->desc_size);

	spin_lock_init(&bufmap->buffer_index_lock);
	bufmap->buffer_index_array =
		kcalloc(bufmap->desc_count, sizeof(int), GFP_KERNEL);
	if (!bufmap->buffer_index_array) {
139
		gossip_err("orangefs: could not allocate %d buffer indices\n",
M
Mike Marshall 已提交
140 141 142 143 144 145
				bufmap->desc_count);
		goto out_free_bufmap;
	}
	spin_lock_init(&bufmap->readdir_index_lock);

	bufmap->desc_array =
146
		kcalloc(bufmap->desc_count, sizeof(struct orangefs_bufmap_desc),
M
Mike Marshall 已提交
147 148
			GFP_KERNEL);
	if (!bufmap->desc_array) {
149
		gossip_err("orangefs: could not allocate %d descriptors\n",
M
Mike Marshall 已提交
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
				bufmap->desc_count);
		goto out_free_index_array;
	}

	bufmap->page_count = bufmap->total_size / PAGE_SIZE;

	/* allocate storage to track our page mappings */
	bufmap->page_array =
		kcalloc(bufmap->page_count, sizeof(struct page *), GFP_KERNEL);
	if (!bufmap->page_array)
		goto out_free_desc_array;

	return bufmap;

out_free_desc_array:
	kfree(bufmap->desc_array);
out_free_index_array:
	kfree(bufmap->buffer_index_array);
out_free_bufmap:
	kfree(bufmap);
out:
	return NULL;
}

static int
175 176
orangefs_bufmap_map(struct orangefs_bufmap *bufmap,
		struct ORANGEFS_dev_map_desc *user_desc)
M
Mike Marshall 已提交
177 178 179 180 181
{
	int pages_per_desc = bufmap->desc_size / PAGE_SIZE;
	int offset = 0, ret, i;

	/* map the pages */
182 183
	ret = get_user_pages_fast((unsigned long)user_desc->ptr,
			     bufmap->page_count, 1, bufmap->page_array);
M
Mike Marshall 已提交
184 185 186 187 188

	if (ret < 0)
		return ret;

	if (ret != bufmap->page_count) {
189
		gossip_err("orangefs error: asked for %d pages, only got %d.\n",
M
Mike Marshall 已提交
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
				bufmap->page_count, ret);

		for (i = 0; i < ret; i++) {
			SetPageError(bufmap->page_array[i]);
			page_cache_release(bufmap->page_array[i]);
		}
		return -ENOMEM;
	}

	/*
	 * ideally we want to get kernel space pointers for each page, but
	 * we can't kmap that many pages at once if highmem is being used.
	 * so instead, we just kmap/kunmap the page address each time the
	 * kaddr is needed.
	 */
	for (i = 0; i < bufmap->page_count; i++)
		flush_dcache_page(bufmap->page_array[i]);

	/* build a list of available descriptors */
	for (offset = 0, i = 0; i < bufmap->desc_count; i++) {
		bufmap->desc_array[i].page_array = &bufmap->page_array[offset];
		bufmap->desc_array[i].array_count = pages_per_desc;
		bufmap->desc_array[i].uaddr =
		    (user_desc->ptr + (i * pages_per_desc * PAGE_SIZE));
		offset += pages_per_desc;
	}

	return 0;
}

/*
221
 * orangefs_bufmap_initialize()
M
Mike Marshall 已提交
222 223 224 225 226
 *
 * initializes the mapped buffer interface
 *
 * returns 0 on success, -errno on failure
 */
227
int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc)
M
Mike Marshall 已提交
228
{
229
	struct orangefs_bufmap *bufmap;
M
Mike Marshall 已提交
230 231 232
	int ret = -EINVAL;

	gossip_debug(GOSSIP_BUFMAP_DEBUG,
233
		     "orangefs_bufmap_initialize: called (ptr ("
M
Mike Marshall 已提交
234 235 236 237 238 239 240 241 242 243 244
		     "%p) sz (%d) cnt(%d).\n",
		     user_desc->ptr,
		     user_desc->size,
		     user_desc->count);

	/*
	 * sanity check alignment and size of buffer that caller wants to
	 * work with
	 */
	if (PAGE_ALIGN((unsigned long)user_desc->ptr) !=
	    (unsigned long)user_desc->ptr) {
245
		gossip_err("orangefs error: memory alignment (front). %p\n",
M
Mike Marshall 已提交
246 247 248 249 250 251
			   user_desc->ptr);
		goto out;
	}

	if (PAGE_ALIGN(((unsigned long)user_desc->ptr + user_desc->total_size))
	    != (unsigned long)(user_desc->ptr + user_desc->total_size)) {
252
		gossip_err("orangefs error: memory alignment (back).(%p + %d)\n",
M
Mike Marshall 已提交
253 254 255 256 257 258
			   user_desc->ptr,
			   user_desc->total_size);
		goto out;
	}

	if (user_desc->total_size != (user_desc->size * user_desc->count)) {
259
		gossip_err("orangefs error: user provided an oddly sized buffer: (%d, %d, %d)\n",
M
Mike Marshall 已提交
260 261 262 263 264 265 266
			   user_desc->total_size,
			   user_desc->size,
			   user_desc->count);
		goto out;
	}

	if ((user_desc->size % PAGE_SIZE) != 0) {
267
		gossip_err("orangefs error: bufmap size not page size divisible (%d).\n",
M
Mike Marshall 已提交
268 269 270 271 272
			   user_desc->size);
		goto out;
	}

	ret = -ENOMEM;
273
	bufmap = orangefs_bufmap_alloc(user_desc);
M
Mike Marshall 已提交
274 275 276
	if (!bufmap)
		goto out;

277
	ret = orangefs_bufmap_map(bufmap, user_desc);
M
Mike Marshall 已提交
278 279 280 281
	if (ret)
		goto out_free_bufmap;


282 283 284 285
	spin_lock(&orangefs_bufmap_lock);
	if (__orangefs_bufmap) {
		spin_unlock(&orangefs_bufmap_lock);
		gossip_err("orangefs: error: bufmap already initialized.\n");
M
Mike Marshall 已提交
286 287 288
		ret = -EALREADY;
		goto out_unmap_bufmap;
	}
289 290
	__orangefs_bufmap = bufmap;
	spin_unlock(&orangefs_bufmap_lock);
M
Mike Marshall 已提交
291 292

	/*
293
	 * If there are operations in orangefs_bufmap_init_waitq, wake them up.
M
Mike Marshall 已提交
294 295 296 297 298 299 300 301
	 * This scenario occurs when the client-core is restarted and I/O
	 * requests in the in-progress or waiting tables are restarted.  I/O
	 * requests cannot be restarted until the shared memory system is
	 * completely re-initialized, so we put the I/O requests in this
	 * waitq until initialization has completed.  NOTE:  the I/O requests
	 * are also on a timer, so they don't wait forever just in case the
	 * client-core doesn't come back up.
	 */
302
	wake_up_interruptible(&orangefs_bufmap_init_waitq);
M
Mike Marshall 已提交
303 304

	gossip_debug(GOSSIP_BUFMAP_DEBUG,
305
		     "orangefs_bufmap_initialize: exiting normally\n");
M
Mike Marshall 已提交
306 307 308
	return 0;

out_unmap_bufmap:
309
	orangefs_bufmap_unmap(bufmap);
M
Mike Marshall 已提交
310
out_free_bufmap:
311
	orangefs_bufmap_free(bufmap);
M
Mike Marshall 已提交
312 313 314 315 316
out:
	return ret;
}

/*
317
 * orangefs_bufmap_finalize()
M
Mike Marshall 已提交
318 319 320 321 322 323
 *
 * shuts down the mapped buffer interface and releases any resources
 * associated with it
 *
 * no return value
 */
324
void orangefs_bufmap_finalize(void)
M
Mike Marshall 已提交
325
{
326 327 328
	gossip_debug(GOSSIP_BUFMAP_DEBUG, "orangefs_bufmap_finalize: called\n");
	BUG_ON(!__orangefs_bufmap);
	orangefs_bufmap_unref(__orangefs_bufmap);
M
Mike Marshall 已提交
329
	gossip_debug(GOSSIP_BUFMAP_DEBUG,
330
		     "orangefs_bufmap_finalize: exiting normally\n");
M
Mike Marshall 已提交
331 332 333 334 335 336 337 338 339 340 341 342 343
}

/*
 * Bundles what wait_for_a_slot() and put_back_slot() need to work on
 * one slot table (either the I/O buffer slots or the readdir slots).
 */
struct slot_args {
	int slot_count;			/* number of entries in slot_array */
	int *slot_array;		/* 0 = free, 1 = in use */
	spinlock_t *slot_lock;		/* held while slot_array is examined or changed */
	wait_queue_head_t *slot_wq;	/* where tasks sleep until a slot is put back */
};

static int wait_for_a_slot(struct slot_args *slargs, int *buffer_index)
{
	int ret = -1;
	int i = 0;
344
	DEFINE_WAIT(wait_entry);
M
Mike Marshall 已提交
345 346 347 348 349 350 351

	while (1) {
		/*
		 * check for available desc, slot_lock is the appropriate
		 * index_lock
		 */
		spin_lock(slargs->slot_lock);
352 353 354
		prepare_to_wait_exclusive(slargs->slot_wq,
					  &wait_entry,
					  TASK_INTERRUPTIBLE);
M
Mike Marshall 已提交
355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385
		for (i = 0; i < slargs->slot_count; i++)
			if (slargs->slot_array[i] == 0) {
				slargs->slot_array[i] = 1;
				*buffer_index = i;
				ret = 0;
				break;
			}
		spin_unlock(slargs->slot_lock);

		/* if we acquired a buffer, then break out of while */
		if (ret == 0)
			break;

		if (!signal_pending(current)) {
			int timeout =
			    MSECS_TO_JIFFIES(1000 * slot_timeout_secs);
			gossip_debug(GOSSIP_BUFMAP_DEBUG,
				     "[BUFMAP]: waiting %d "
				     "seconds for a slot\n",
				     slot_timeout_secs);
			if (!schedule_timeout(timeout)) {
				gossip_debug(GOSSIP_BUFMAP_DEBUG,
					     "*** wait_for_a_slot timed out\n");
				ret = -ETIMEDOUT;
				break;
			}
			gossip_debug(GOSSIP_BUFMAP_DEBUG,
			  "[BUFMAP]: woken up by a slot becoming available.\n");
			continue;
		}

386
		gossip_debug(GOSSIP_BUFMAP_DEBUG, "orangefs: %s interrupted.\n",
M
Mike Marshall 已提交
387 388 389 390 391
			     __func__);
		ret = -EINTR;
		break;
	}

392 393 394
	spin_lock(slargs->slot_lock);
	finish_wait(slargs->slot_wq, &wait_entry);
	spin_unlock(slargs->slot_lock);
M
Mike Marshall 已提交
395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415
	return ret;
}

/*
 * put_back_slot()
 *
 * Marks slot buffer_index of slargs->slot_array as free again and wakes
 * the tasks sleeping on slargs->slot_wq.  An index outside
 * [0, slot_count) is ignored silently and wakes nobody.
 */
static void put_back_slot(struct slot_args *slargs, int buffer_index)
{
	int in_range;

	/* slot_lock is the appropriate index_lock */
	spin_lock(slargs->slot_lock);
	in_range = buffer_index >= 0 && buffer_index < slargs->slot_count;
	if (in_range)
		slargs->slot_array[buffer_index] = 0;
	spin_unlock(slargs->slot_lock);

	/* wake up anyone who may be sleeping on the queue */
	if (in_range)
		wake_up_interruptible(slargs->slot_wq);
}

/*
416
 * orangefs_bufmap_get()
M
Mike Marshall 已提交
417 418 419 420 421 422
 *
 * gets a free mapped buffer descriptor, will sleep until one becomes
 * available if necessary
 *
 * returns 0 on success, -errno on failure
 */
423
int orangefs_bufmap_get(struct orangefs_bufmap **mapp, int *buffer_index)
M
Mike Marshall 已提交
424
{
425
	struct orangefs_bufmap *bufmap = orangefs_bufmap_ref();
M
Mike Marshall 已提交
426 427 428 429
	struct slot_args slargs;
	int ret;

	if (!bufmap) {
430
		gossip_err("orangefs: please confirm that pvfs2-client daemon is running.\n");
M
Mike Marshall 已提交
431 432 433 434 435 436 437 438 439
		return -EIO;
	}

	slargs.slot_count = bufmap->desc_count;
	slargs.slot_array = bufmap->buffer_index_array;
	slargs.slot_lock = &bufmap->buffer_index_lock;
	slargs.slot_wq = &bufmap_waitq;
	ret = wait_for_a_slot(&slargs, buffer_index);
	if (ret)
440
		orangefs_bufmap_unref(bufmap);
M
Mike Marshall 已提交
441 442 443 444 445
	*mapp = bufmap;
	return ret;
}

/*
446
 * orangefs_bufmap_put()
M
Mike Marshall 已提交
447 448 449 450 451
 *
 * returns a mapped buffer descriptor to the collection
 *
 * no return value
 */
452
void orangefs_bufmap_put(struct orangefs_bufmap *bufmap, int buffer_index)
M
Mike Marshall 已提交
453 454 455 456 457 458 459 460
{
	struct slot_args slargs;

	slargs.slot_count = bufmap->desc_count;
	slargs.slot_array = bufmap->buffer_index_array;
	slargs.slot_lock = &bufmap->buffer_index_lock;
	slargs.slot_wq = &bufmap_waitq;
	put_back_slot(&slargs, buffer_index);
461
	orangefs_bufmap_unref(bufmap);
M
Mike Marshall 已提交
462 463 464 465 466 467 468 469 470 471 472 473 474
}

/*
 * readdir_index_get()
 *
 * gets a free descriptor, will sleep until one becomes
 * available if necessary.
 * Although the readdir buffers are not mapped into kernel space
 * we could do that at a later point of time. Regardless, these
 * indices are used by the client-core.
 *
 * returns 0 on success, -errno on failure
 */
475
int readdir_index_get(struct orangefs_bufmap **mapp, int *buffer_index)
M
Mike Marshall 已提交
476
{
477
	struct orangefs_bufmap *bufmap = orangefs_bufmap_ref();
M
Mike Marshall 已提交
478 479 480 481
	struct slot_args slargs;
	int ret;

	if (!bufmap) {
482
		gossip_err("orangefs: please confirm that pvfs2-client daemon is running.\n");
M
Mike Marshall 已提交
483 484 485
		return -EIO;
	}

486
	slargs.slot_count = ORANGEFS_READDIR_DEFAULT_DESC_COUNT;
M
Mike Marshall 已提交
487 488 489 490 491
	slargs.slot_array = bufmap->readdir_index_array;
	slargs.slot_lock = &bufmap->readdir_index_lock;
	slargs.slot_wq = &readdir_waitq;
	ret = wait_for_a_slot(&slargs, buffer_index);
	if (ret)
492
		orangefs_bufmap_unref(bufmap);
M
Mike Marshall 已提交
493 494 495 496
	*mapp = bufmap;
	return ret;
}

497
void readdir_index_put(struct orangefs_bufmap *bufmap, int buffer_index)
M
Mike Marshall 已提交
498 499 500
{
	struct slot_args slargs;

501
	slargs.slot_count = ORANGEFS_READDIR_DEFAULT_DESC_COUNT;
M
Mike Marshall 已提交
502 503 504 505
	slargs.slot_array = bufmap->readdir_index_array;
	slargs.slot_lock = &bufmap->readdir_index_lock;
	slargs.slot_wq = &readdir_waitq;
	put_back_slot(&slargs, buffer_index);
506
	orangefs_bufmap_unref(bufmap);
M
Mike Marshall 已提交
507 508
}

M
Mike Marshall 已提交
509 510 511 512
/*
 * we've been handed an iovec, we need to copy it to 
 * the shared memory descriptor at "buffer_index".
 */
513
int orangefs_bufmap_copy_from_iovec(struct orangefs_bufmap *bufmap,
514 515 516
				struct iov_iter *iter,
				int buffer_index,
				size_t size)
M
Mike Marshall 已提交
517
{
518
	struct orangefs_bufmap_desc *to = &bufmap->desc_array[buffer_index];
M
Mike Marshall 已提交
519
	int i;
M
Mike Marshall 已提交
520 521

	gossip_debug(GOSSIP_BUFMAP_DEBUG,
522
		     "%s: buffer_index:%d: size:%zu:\n",
M
Mike Marshall 已提交
523
		     __func__, buffer_index, size);
M
Mike Marshall 已提交
524 525


M
Mike Marshall 已提交
526
	for (i = 0; size; i++) {
527 528 529 530 531 532 533 534
		struct page *page = to->page_array[i];
		size_t n = size;
		if (n > PAGE_SIZE)
			n = PAGE_SIZE;
		n = copy_page_from_iter(page, 0, n, iter);
		if (!n)
			return -EFAULT;
		size -= n;
M
Mike Marshall 已提交
535
	}
536
	return 0;
M
Mike Marshall 已提交
537 538 539 540

}

/*
M
Mike Marshall 已提交
541 542
 * we've been handed an iovec, we need to fill it from
 * the shared memory descriptor at "buffer_index".
M
Mike Marshall 已提交
543
 */
544
int orangefs_bufmap_copy_to_iovec(struct orangefs_bufmap *bufmap,
M
Mike Marshall 已提交
545
				    struct iov_iter *iter,
546 547
				    int buffer_index,
				    size_t size)
M
Mike Marshall 已提交
548
{
549
	struct orangefs_bufmap_desc *from = &bufmap->desc_array[buffer_index];
M
Mike Marshall 已提交
550
	int i;
M
Mike Marshall 已提交
551 552

	gossip_debug(GOSSIP_BUFMAP_DEBUG,
553 554
		     "%s: buffer_index:%d: size:%zu:\n",
		     __func__, buffer_index, size);
M
Mike Marshall 已提交
555 556


557 558 559 560 561 562 563 564 565
	for (i = 0; size; i++) {
		struct page *page = from->page_array[i];
		size_t n = size;
		if (n > PAGE_SIZE)
			n = PAGE_SIZE;
		n = copy_page_to_iter(page, 0, n, iter);
		if (!n)
			return -EFAULT;
		size -= n;
M
Mike Marshall 已提交
566
	}
567
	return 0;
M
Mike Marshall 已提交
568
}