/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * extent_map.c
 *
 * Block/Cluster mapping functions
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License, version 2,  as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/types.h>

#define MLOG_MASK_PREFIX ML_EXTENT_MAP
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "extent_map.h"
#include "inode.h"
#include "super.h"

#include "buffer_head_io.h"

/*
 * The extent caching implementation is intentionally trivial.
 *
 * We only cache a small number of extents stored directly on the
 * inode, so linear-time operations are acceptable. If we ever want
 * to increase the size of the extent map, then these algorithms must
 * get smarter.
 */
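
/*
 * For illustration: the list is kept in most-recently-used order.
 * __ocfs2_extent_map_lookup() moves a hit to the front with
 * list_move(), so em_list.prev is always the least recently used
 * item and is the one reused when the map is full (see
 * ocfs2_extent_map_insert_rec() below).
 */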

void ocfs2_extent_map_init(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	oi->ip_extent_map.em_num_items = 0;
	INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
}

static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
				      unsigned int cpos,
				      struct ocfs2_extent_map_item **ret_emi)
{
	unsigned int range;
	struct ocfs2_extent_map_item *emi;

	*ret_emi = NULL;

	list_for_each_entry(emi, &em->em_list, ei_list) {
		range = emi->ei_cpos + emi->ei_clusters;

		if (cpos >= emi->ei_cpos && cpos < range) {
			list_move(&emi->ei_list, &em->em_list);

			*ret_emi = emi;
			break;
		}
	}
}

static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
				   unsigned int *phys, unsigned int *len,
				   unsigned int *flags)
{
	unsigned int coff;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_extent_map_item *emi;

	spin_lock(&oi->ip_lock);

	__ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
	if (emi) {
		coff = cpos - emi->ei_cpos;
		*phys = emi->ei_phys + coff;
		if (len)
			*len = emi->ei_clusters - coff;
		if (flags)
			*flags = emi->ei_flags;
	}

	spin_unlock(&oi->ip_lock);

	if (emi == NULL)
		return -ENOENT;

	return 0;
}
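
/*
 * Worked example (hypothetical values): with a cached item of
 * ei_cpos = 10, ei_phys = 200 and ei_clusters = 5, a lookup of
 * cpos = 12 returns *phys = 202 and *len = 3. A lookup of
 * cpos = 15 falls past the item's range [10, 15) and returns
 * -ENOENT.
 */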

/*
 * Forget about all clusters equal to or greater than cpos.
 */
void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
{
	struct list_head *p, *n;
	struct ocfs2_extent_map_item *emi;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_extent_map *em = &oi->ip_extent_map;
	LIST_HEAD(tmp_list);
	unsigned int range;

	spin_lock(&oi->ip_lock);
	list_for_each_safe(p, n, &em->em_list) {
		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);

		if (emi->ei_cpos >= cpos) {
			/* Full truncate of this record. */
			list_move(&emi->ei_list, &tmp_list);
			BUG_ON(em->em_num_items == 0);
			em->em_num_items--;
			continue;
		}

		range = emi->ei_cpos + emi->ei_clusters;
		if (range > cpos) {
			/* Partial truncate */
			emi->ei_clusters = cpos - emi->ei_cpos;
		}
	}
	spin_unlock(&oi->ip_lock);

	list_for_each_safe(p, n, &tmp_list) {
		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
		list_del(&emi->ei_list);
		kfree(emi);
	}
}
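
/*
 * Worked example (hypothetical values): an item with ei_cpos = 10
 * and ei_clusters = 20 covers clusters [10, 30). A trunc at
 * cpos = 15 shrinks it to 5 clusters ([10, 15)); a trunc at
 * cpos = 10 or lower removes it entirely.
 */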

/*
 * Is any part of emi2 contained within emi1?
 */
static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
				 struct ocfs2_extent_map_item *emi2)
{
	unsigned int range1, range2;

	/*
	 * Check if logical start of emi2 is inside emi1
	 */
	range1 = emi1->ei_cpos + emi1->ei_clusters;
	if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
		return 1;

	/*
	 * Check if logical end of emi2 is inside emi1
	 */
	range2 = emi2->ei_cpos + emi2->ei_clusters;
	if (range2 > emi1->ei_cpos && range2 <= range1)
		return 1;

	return 0;
}
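
/*
 * Worked example (hypothetical values): with emi1 covering
 * [10, 30) and emi2 covering [25, 40), emi2's start lies inside
 * emi1, so this returns 1. The check is not symmetric, which is
 * why ocfs2_try_to_merge_extent_map() calls it both ways.
 */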

static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
				  struct ocfs2_extent_map_item *src)
{
	dest->ei_cpos = src->ei_cpos;
	dest->ei_phys = src->ei_phys;
	dest->ei_clusters = src->ei_clusters;
	dest->ei_flags = src->ei_flags;
}

/*
 * Try to merge emi with ins. Returns 1 if merge succeeds, zero
 * otherwise.
 */
static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
					 struct ocfs2_extent_map_item *ins)
{
	/*
	 * Handle contiguousness
	 */
	if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
	    ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
	    ins->ei_flags == emi->ei_flags) {
		emi->ei_clusters += ins->ei_clusters;
		return 1;
	} else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
		   (ins->ei_cpos + ins->ei_clusters) == emi->ei_cpos &&
		   ins->ei_flags == emi->ei_flags) {
		emi->ei_phys = ins->ei_phys;
		emi->ei_cpos = ins->ei_cpos;
		emi->ei_clusters += ins->ei_clusters;
		return 1;
	}

	/*
	 * Overlapping extents - this shouldn't happen unless we've
	 * split an extent to change its flags. That is exceedingly
	 * rare, so there's no sense in trying to optimize it yet.
	 */
	if (ocfs2_ei_is_contained(emi, ins) ||
	    ocfs2_ei_is_contained(ins, emi)) {
		ocfs2_copy_emi_fields(emi, ins);
		return 1;
	}

	/* No merge was possible. */
	return 0;
}
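
/*
 * Worked example (hypothetical values): emi = (ei_cpos 0,
 * ei_phys 100, ei_clusters 8) and ins = (ei_cpos 8, ei_phys 108,
 * ei_clusters 4) are logically and physically contiguous, so the
 * first branch extends emi to 12 clusters and returns 1.
 */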

/*
 * In order to reduce complexity on the caller, this insert function
 * is intentionally liberal in what it will accept.
 *
 * The only rule is that the truncate call *must* be used whenever
 * records have been deleted. This avoids inserting overlapping
 * records with different physical mappings.
 */
void ocfs2_extent_map_insert_rec(struct inode *inode,
				 struct ocfs2_extent_rec *rec)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_extent_map *em = &oi->ip_extent_map;
	struct ocfs2_extent_map_item *emi, *new_emi = NULL;
	struct ocfs2_extent_map_item ins;

	ins.ei_cpos = le32_to_cpu(rec->e_cpos);
	ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
					       le64_to_cpu(rec->e_blkno));
	ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
	ins.ei_flags = rec->e_flags;

search:
	spin_lock(&oi->ip_lock);

	list_for_each_entry(emi, &em->em_list, ei_list) {
		if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
			list_move(&emi->ei_list, &em->em_list);
			spin_unlock(&oi->ip_lock);
			goto out;
		}
	}

	/*
	 * No item could be merged.
	 *
	 * Either allocate and add a new item, or overwrite the least
	 * recently used item.
	 */

	if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
		if (new_emi == NULL) {
			spin_unlock(&oi->ip_lock);

			new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
			if (new_emi == NULL)
				goto out;

			goto search;
		}

		ocfs2_copy_emi_fields(new_emi, &ins);
		list_add(&new_emi->ei_list, &em->em_list);
		em->em_num_items++;
		new_emi = NULL;
	} else {
		BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
		emi = list_entry(em->em_list.prev,
				 struct ocfs2_extent_map_item, ei_list);
		list_move(&emi->ei_list, &em->em_list);
		ocfs2_copy_emi_fields(emi, &ins);
	}

	spin_unlock(&oi->ip_lock);

out:
	kfree(new_emi);
}
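
/*
 * Note on the allocation dance above: a GFP_NOFS allocation cannot
 * happen under ip_lock, so the lock is dropped, the new item is
 * allocated, and the merge scan is redone from the "search" label,
 * because the list may have changed in the meantime.
 */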

/*
 * Return the index of the first extent record within el whose logical
 * start (e_cpos) is larger than v_cluster; if there is no such record,
 * l_next_free_rec is returned.
 */
static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
				       u32 v_cluster)
{
	int i;
	struct ocfs2_extent_rec *rec;

	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];

		if (v_cluster < le32_to_cpu(rec->e_cpos))
			break;
	}

	return i;
}
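
/*
 * Worked example (hypothetical values): with records starting at
 * cpos 0 (4 clusters) and cpos 8 (4 clusters), v_cluster = 5
 * yields index 1 (the record at cpos 8), while v_cluster = 20
 * yields l_next_free_rec (2), i.e. one past the last record.
 */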

/*
 * Figure out the size of a hole which starts at v_cluster within the given
 * extent list.
 *
 * If there is no more allocation past v_cluster, we return the
 * maximum possible cluster count (UINT_MAX) minus v_cluster.
 *
 * If we have in-inode extents, then el points to the dinode list and
 * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
 * containing el.
 */
static int ocfs2_figure_hole_clusters(struct inode *inode,
				      struct ocfs2_extent_list *el,
				      struct buffer_head *eb_bh,
				      u32 v_cluster,
				      u32 *num_clusters)
{
	int ret, i;
	struct buffer_head *next_eb_bh = NULL;
	struct ocfs2_extent_block *eb, *next_eb;

	i = ocfs2_search_for_hole_index(el, v_cluster);

	if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
		eb = (struct ocfs2_extent_block *)eb_bh->b_data;

		/*
		 * Check the next leaf for any extents.
		 */

		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
			goto no_more_extents;

		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				       le64_to_cpu(eb->h_next_leaf_blk),
				       &next_eb_bh, OCFS2_BH_CACHED, inode);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;

		if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
			ret = -EROFS;
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
			goto out;
		}

		el = &next_eb->h_list;

		i = ocfs2_search_for_hole_index(el, v_cluster);
	}

no_more_extents:
	if (i == le16_to_cpu(el->l_next_free_rec)) {
		/*
		 * We're at the end of our existing allocation. Just
		 * return the maximum number of clusters we could
		 * possibly allocate.
		 */
		*num_clusters = UINT_MAX - v_cluster;
	} else {
		*num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
	}

	ret = 0;
out:
	brelse(next_eb_bh);
	return ret;
}
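
/*
 * Continuing the example above: a hole at v_cluster = 5 with the
 * next record starting at cpos 8 gives *num_clusters = 3; with no
 * record past the hole, *num_clusters = UINT_MAX - 5.
 */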

/*
 * Return the index of the extent record which contains cluster #v_cluster.
 * -1 is returned if it was not found.
 *
 * Should work fine on interior and exterior nodes.
 */
static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
				    u32 v_cluster)
{
	int ret = -1;
	int i;
	struct ocfs2_extent_rec *rec;
	u32 rec_end, rec_start, clusters;

	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];

		rec_start = le32_to_cpu(rec->e_cpos);
		clusters = ocfs2_rec_clusters(el, rec);

		rec_end = rec_start + clusters;

		if (v_cluster >= rec_start && v_cluster < rec_end) {
			ret = i;
			break;
		}
	}

	return ret;
}
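
/*
 * Worked example (hypothetical values): a record with e_cpos = 8
 * and 4 clusters covers [8, 12), so v_cluster = 10 returns its
 * index, while v_cluster = 12 falls outside it (the end is
 * exclusive) and returns -1 if no other record covers it.
 */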

int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
		       u32 *p_cluster, u32 *num_clusters,
		       unsigned int *extent_flags)
{
	int ret, i;
	unsigned int flags = 0;
	struct buffer_head *di_bh = NULL;
	struct buffer_head *eb_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;
	u32 coff;

	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
				      num_clusters, extent_flags);
	if (ret == 0)
		goto out;

	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
			       &di_bh, OCFS2_BH_CACHED, inode);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	di = (struct ocfs2_dinode *) di_bh->b_data;
	el = &di->id2.i_list;

	if (el->l_tree_depth) {
		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;

		if (el->l_tree_depth) {
			ocfs2_error(inode->i_sb,
				    "Inode %lu has non zero tree depth in "
				    "leaf block %llu\n", inode->i_ino,
				    (unsigned long long)eb_bh->b_blocknr);
			ret = -EROFS;
			goto out;
		}
	}

	i = ocfs2_search_extent_list(el, v_cluster);
	if (i == -1) {
		/*
		 * A hole was found. Return some canned values that
		 * callers can key on. If asked for, num_clusters will
		 * be populated with the size of the hole.
		 */
		*p_cluster = 0;
		if (num_clusters) {
			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
							 v_cluster,
							 num_clusters);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	} else {
		rec = &el->l_recs[i];

		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));

		if (!rec->e_blkno) {
			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
				    "record (%u, %u, 0)", inode->i_ino,
				    le32_to_cpu(rec->e_cpos),
				    ocfs2_rec_clusters(el, rec));
			ret = -EROFS;
			goto out;
		}

		coff = v_cluster - le32_to_cpu(rec->e_cpos);

		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
						      le64_to_cpu(rec->e_blkno));
		*p_cluster = *p_cluster + coff;

		if (num_clusters)
			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;

		flags = rec->e_flags;

		ocfs2_extent_map_insert_rec(inode, rec);
	}

	if (extent_flags)
		*extent_flags = flags;

out:
	brelse(di_bh);
	brelse(eb_bh);
	return ret;
}
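
/*
 * Sketch of a typical call (hypothetical values):
 *
 *	u32 p_cluster, num_clusters;
 *	unsigned int flags;
 *	int ret;
 *
 *	ret = ocfs2_get_clusters(inode, 12, &p_cluster, &num_clusters,
 *				 &flags);
 *
 * On success, p_cluster == 0 means virtual cluster 12 is a hole and
 * num_clusters holds the hole's size; otherwise the pair describes
 * the physical run, which has also been cached via
 * ocfs2_extent_map_insert_rec().
 */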

/*
 * This expects alloc_sem to be held. The allocation cannot change at
 * all while the map is in the process of being updated.
 */
int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
				u64 *ret_count, unsigned int *extent_flags)
{
	int ret;
	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
	u32 cpos, num_clusters, p_cluster;
	u64 boff = 0;

	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);

	ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
				 extent_flags);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * p_cluster == 0 indicates a hole.
	 */
	if (p_cluster) {
		boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
		boff += (v_blkno & (u64)(bpc - 1));
	}

	*p_blkno = boff;

	if (ret_count) {
		*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
		*ret_count -= v_blkno & (u64)(bpc - 1);
	}

out:
	return ret;
}
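
/*
 * Worked example (hypothetical values): with 4K blocks and 32K
 * clusters, bpc = 8. For v_blkno = 21, cpos = 2 and the offset
 * within the cluster is 21 & 7 = 5. If ocfs2_get_clusters() maps
 * cpos 2 to p_cluster = 50 with num_clusters = 3, then
 * *p_blkno = 50 * 8 + 5 = 405 and *ret_count = 3 * 8 - 5 = 19.
 */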