/* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * Copyright (c) 2010 David Chinner. * Copyright (c) 2011 Christoph Hellwig. * All Rights Reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_extent_busy.h" #include "xfs_trace.h" #include "xfs_trans.h" #include "xfs_log.h" void xfs_extent_busy_insert( struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags) { struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; struct xfs_perag *pag; struct rb_node **rbp; struct rb_node *parent = NULL; new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP); new->agno = agno; new->bno = bno; new->length = len; INIT_LIST_HEAD(&new->list); new->flags = flags; /* trace before insert to be able to see failed inserts */ trace_xfs_extent_busy(tp->t_mountp, agno, bno, len); pag = xfs_perag_get(tp->t_mountp, new->agno); spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; while (*rbp) { parent = *rbp; busyp = rb_entry(parent, struct xfs_extent_busy, rb_node); if (new->bno < busyp->bno) { rbp = &(*rbp)->rb_left; ASSERT(new->bno + new->length <= busyp->bno); } else if (new->bno > busyp->bno) { rbp = &(*rbp)->rb_right; ASSERT(bno >= busyp->bno + busyp->length); } else { ASSERT(0); } } rb_link_node(&new->rb_node, parent, rbp); rb_insert_color(&new->rb_node, &pag->pagb_tree); list_add(&new->list, &tp->t_busy); spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); } /* * Search for a busy extent within the range of the extent we are about to * allocate. You need to be holding the busy extent tree lock when calling * xfs_extent_busy_search(). This function returns 0 for no overlapping busy * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact * match. This is done so that a non-zero return indicates an overlap that * will require a synchronous transaction, but it can still be * used to distinguish between a partial or exact match. */ int xfs_extent_busy_search( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t bno, xfs_extlen_t len) { struct xfs_perag *pag; struct rb_node *rbp; struct xfs_extent_busy *busyp; int match = 0; pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); rbp = pag->pagb_tree.rb_node; /* find closest start bno overlap */ while (rbp) { busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); if (bno < busyp->bno) { /* may overlap, but exact start block is lower */ if (bno + len > busyp->bno) match = -1; rbp = rbp->rb_left; } else if (bno > busyp->bno) { /* may overlap, but exact start block is higher */ if (bno < busyp->bno + busyp->length) match = -1; rbp = rbp->rb_right; } else { /* bno matches busyp, length determines exact match */ match = (busyp->length == len) ? 1 : -1; break; } } spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); return match; } /* * The found free extent [fbno, fend] overlaps part or all of the given busy * extent. If the overlap covers the beginning, the end, or all of the busy * extent, the overlapping portion can be made unbusy and used for the * allocation. We can't split a busy extent because we can't modify a * transaction/CIL context busy list, but we can update an entry's block * number or length. * * Returns true if the extent can safely be reused, or false if the search * needs to be restarted. */ STATIC bool xfs_extent_busy_update_extent( struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_extent_busy *busyp, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata) __releases(&pag->pagb_lock) __acquires(&pag->pagb_lock) { xfs_agblock_t fend = fbno + flen; xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; /* * This extent is currently being discarded. Give the thread * performing the discard a chance to mark the extent unbusy * and retry. */ if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) { spin_unlock(&pag->pagb_lock); delay(1); spin_lock(&pag->pagb_lock); return false; } /* * If there is a busy extent overlapping a user allocation, we have * no choice but to force the log and retry the search. * * Fortunately this does not happen during normal operation, but * only if the filesystem is very low on space and has to dip into * the AGFL for normal allocations. */ if (userdata) goto out_force_log; if (bbno < fbno && bend > fend) { /* * Case 1: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +---------+ * fbno fend */ /* * We would have to split the busy extent to be able to track * it correct, which we cannot do because we would have to * modify the list of busy extents attached to the transaction * or CIL context, which is immutable. * * Force out the log to clear the busy extent and retry the * search. */ goto out_force_log; } else if (bbno >= fbno && bend <= fend) { /* * Case 2: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-----------------+ * fbno fend * * Case 3: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +--------------------------+ * fbno fend * * Case 4: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +--------------------------+ * fbno fend * * Case 5: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-----------------------------------+ * fbno fend * */ /* * The busy extent is fully covered by the extent we are * allocating, and can simply be removed from the rbtree. * However we cannot remove it from the immutable list * tracking busy extents in the transaction or CIL context, * so set the length to zero to mark it invalid. * * We also need to restart the busy extent search from the * tree root, because erasing the node can rearrange the * tree topology. */ rb_erase(&busyp->rb_node, &pag->pagb_tree); busyp->length = 0; return false; } else if (fend < bend) { /* * Case 6: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +---------+ * fbno fend * * Case 7: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +------------------+ * fbno fend * */ busyp->bno = fend; } else if (bbno < fbno) { /* * Case 8: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-------------+ * fbno fend * * Case 9: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +----------------------+ * fbno fend */ busyp->length = fbno - busyp->bno; } else { ASSERT(0); } trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen); return true; out_force_log: spin_unlock(&pag->pagb_lock); xfs_log_force(mp, XFS_LOG_SYNC); trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen); spin_lock(&pag->pagb_lock); return false; } /* * For a given extent [fbno, flen], make sure we can reuse it safely. */ void xfs_extent_busy_reuse( struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata) { struct xfs_perag *pag; struct rb_node *rbp; ASSERT(flen > 0); pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); restart: rbp = pag->pagb_tree.rb_node; while (rbp) { struct xfs_extent_busy *busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; if (fbno + flen <= bbno) { rbp = rbp->rb_left; continue; } else if (fbno >= bend) { rbp = rbp->rb_right; continue; } if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen, userdata)) goto restart; } spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); } /* * For a given extent [fbno, flen], search the busy extent list to find a * subset of the extent that is not busy. If *rlen is smaller than * args->minlen no suitable extent could be found, and the higher level * code needs to force out the log and retry the allocation. * * Return the current busy generation for the AG if the extent is busy. This * value can be used to wait for at least one of the currently busy extents * to be cleared. Note that the busy list is not guaranteed to be empty after * the gen is woken. The state of a specific extent must always be confirmed * with another call to xfs_extent_busy_trim() before it can be used. */ bool xfs_extent_busy_trim( struct xfs_alloc_arg *args, xfs_agblock_t *bno, xfs_extlen_t *len, unsigned *busy_gen) { xfs_agblock_t fbno; xfs_extlen_t flen; struct rb_node *rbp; bool ret = false; ASSERT(*len > 0); spin_lock(&args->pag->pagb_lock); restart: fbno = *bno; flen = *len; rbp = args->pag->pagb_tree.rb_node; while (rbp && flen >= args->minlen) { struct xfs_extent_busy *busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node); xfs_agblock_t fend = fbno + flen; xfs_agblock_t bbno = busyp->bno; xfs_agblock_t bend = bbno + busyp->length; if (fend <= bbno) { rbp = rbp->rb_left; continue; } else if (fbno >= bend) { rbp = rbp->rb_right; continue; } /* * If this is a metadata allocation, try to reuse the busy * extent instead of trimming the allocation. */ if (!xfs_alloc_is_userdata(args->datatype) && !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { if (!xfs_extent_busy_update_extent(args->mp, args->pag, busyp, fbno, flen, false)) goto restart; continue; } if (bbno <= fbno) { /* start overlap */ /* * Case 1: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +---------+ * fbno fend * * Case 2: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-------------+ * fbno fend * * Case 3: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-------------+ * fbno fend * * Case 4: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-----------------+ * fbno fend * * No unbusy region in extent, return failure. */ if (fend <= bend) goto fail; /* * Case 5: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +----------------------+ * fbno fend * * Case 6: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +--------------------------+ * fbno fend * * Needs to be trimmed to: * +-------+ * fbno fend */ fbno = bend; } else if (bend >= fend) { /* end overlap */ /* * Case 7: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +------------------+ * fbno fend * * Case 8: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +--------------------------+ * fbno fend * * Needs to be trimmed to: * +-------+ * fbno fend */ fend = bbno; } else { /* middle overlap */ /* * Case 9: * bbno bend * +BBBBBBBBBBBBBBBBB+ * +-----------------------------------+ * fbno fend * * Can be trimmed to: * +-------+ OR +-------+ * fbno fend fbno fend * * Backward allocation leads to significant * fragmentation of directories, which degrades * directory performance, therefore we always want to * choose the option that produces forward allocation * patterns. * Preferring the lower bno extent will make the next * request use "fend" as the start of the next * allocation; if the segment is no longer busy at * that point, we'll get a contiguous allocation, but * even if it is still busy, we will get a forward * allocation. * We try to avoid choosing the segment at "bend", * because that can lead to the next allocation * taking the segment at "fbno", which would be a * backward allocation. We only use the segment at * "fbno" if it is much larger than the current * requested size, because in that case there's a * good chance subsequent allocations will be * contiguous. */ if (bbno - fbno >= args->maxlen) { /* left candidate fits perfect */ fend = bbno; } else if (fend - bend >= args->maxlen * 4) { /* right candidate has enough free space */ fbno = bend; } else if (bbno - fbno >= args->minlen) { /* left candidate fits minimum requirement */ fend = bbno; } else { goto fail; } } flen = fend - fbno; } out: if (fbno != *bno || flen != *len) { trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len, fbno, flen); *bno = fbno; *len = flen; *busy_gen = args->pag->pagb_gen; ret = true; } spin_unlock(&args->pag->pagb_lock); return ret; fail: /* * Return a zero extent length as failure indications. All callers * re-check if the trimmed extent satisfies the minlen requirement. */ flen = 0; goto out; } STATIC void xfs_extent_busy_clear_one( struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_extent_busy *busyp) { if (busyp->length) { trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno, busyp->length); rb_erase(&busyp->rb_node, &pag->pagb_tree); } list_del_init(&busyp->list); kmem_free(busyp); } static void xfs_extent_busy_put_pag( struct xfs_perag *pag, bool wakeup) __releases(pag->pagb_lock) { if (wakeup) { pag->pagb_gen++; wake_up_all(&pag->pagb_wait); } spin_unlock(&pag->pagb_lock); xfs_perag_put(pag); } /* * Remove all extents on the passed in list from the busy extents tree. * If do_discard is set skip extents that need to be discarded, and mark * these as undergoing a discard operation instead. */ void xfs_extent_busy_clear( struct xfs_mount *mp, struct list_head *list, bool do_discard) { struct xfs_extent_busy *busyp, *n; struct xfs_perag *pag = NULL; xfs_agnumber_t agno = NULLAGNUMBER; bool wakeup = false; list_for_each_entry_safe(busyp, n, list, list) { if (busyp->agno != agno) { if (pag) xfs_extent_busy_put_pag(pag, wakeup); agno = busyp->agno; pag = xfs_perag_get(mp, agno); spin_lock(&pag->pagb_lock); wakeup = false; } if (do_discard && busyp->length && !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) { busyp->flags = XFS_EXTENT_BUSY_DISCARDED; } else { xfs_extent_busy_clear_one(mp, pag, busyp); wakeup = true; } } if (pag) xfs_extent_busy_put_pag(pag, wakeup); } /* * Flush out all busy extents for this AG. */ void xfs_extent_busy_flush( struct xfs_mount *mp, struct xfs_perag *pag, unsigned busy_gen) { DEFINE_WAIT (wait); int error; trace_xfs_log_force(mp, 0, _THIS_IP_); error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); if (error) return; do { prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); if (busy_gen != READ_ONCE(pag->pagb_gen)) break; schedule(); } while (1); finish_wait(&pag->pagb_wait, &wait); } void xfs_extent_busy_wait_all( struct xfs_mount *mp) { DEFINE_WAIT (wait); xfs_agnumber_t agno; for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { struct xfs_perag *pag = xfs_perag_get(mp, agno); do { prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); if (RB_EMPTY_ROOT(&pag->pagb_tree)) break; schedule(); } while (1); finish_wait(&pag->pagb_wait, &wait); xfs_perag_put(pag); } } /* * Callback for list_sort to sort busy extents by the AG they reside in. */ int xfs_extent_busy_ag_cmp( void *priv, struct list_head *l1, struct list_head *l2) { struct xfs_extent_busy *b1 = container_of(l1, struct xfs_extent_busy, list); struct xfs_extent_busy *b2 = container_of(l2, struct xfs_extent_busy, list); s32 diff; diff = b1->agno - b2->agno; if (!diff) diff = b1->bno - b2->bno; return diff; }