xfs: dynamic speculative EOF preallocation

Currently the size of the speculative preallocation during delayed allocation is fixed by either the allocsize mount option of a default size. We are seeing a lot of cases where we need to recommend using the allocsize mount option to prevent fragmentation when buffered writes land in the same AG. Rather than using a fixed preallocation size by default (up to 64k), make it dynamic by basing it on the current inode size. That way the EOF preallocation will increase as the file size increases. Hence for streaming writes we are much more likely to get large preallocations exactly when we need it to reduce fragementation. For default settings, the size of the initial extents is determined by the number of parallel writers and the amount of memory in the machine. For 4GB RAM and 4 concurrent 32GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576 1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576 2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152 3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304 4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608 5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208 6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088 7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088 and for 16 concurrent 16GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144 1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144 2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288 3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576 4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152 5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304 6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608 7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208 Because it is hard to take back specualtive preallocation, cases where there are large slow growing log files on a nearly full filesystem may cause premature ENOSPC. Hence as the filesystem nears full, the maximum dynamic prealloc size іs reduced according to this table (based on 4k block size): freespace max prealloc size >5% full extent (8GB) 4-5% 2GB (8GB >> 2) 3-4% 1GB (8GB >> 3) 2-3% 512MB (8GB >> 4) 1-2% 256MB (8GB >> 5) <1% 128MB (8GB >> 6) This should reduce the amount of space held in speculative preallocation for such cases. The allocsize mount option turns off the dynamic behaviour and fixes the prealloc size to whatever the mount option specifies. i.e. the behaviour is unchanged. Signed-off-by: N Dave Chinner <dchinner@redhat.com>

xfs: dynamic speculative EOF preallocation
Currently the size of the speculative preallocation during delayed allocation is fixed by either the allocsize mount option of a default size. We are seeing a lot of cases where we need to recommend using the allocsize mount option to prevent fragmentation when buffered writes land in the same AG. Rather than using a fixed preallocation size by default (up to 64k), make it dynamic by basing it on the current inode size. That way the EOF preallocation will increase as the file size increases. Hence for streaming writes we are much more likely to get large preallocations exactly when we need it to reduce fragementation. For default settings, the size of the initial extents is determined by the number of parallel writers and the amount of memory in the machine. For 4GB RAM and 4 concurrent 32GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..1048575]: 1048672..2097247 0 (1048672..2097247) 1048576 1: [1048576..2097151]: 5242976..6291551 0 (5242976..6291551) 1048576 2: [2097152..4194303]: 12583008..14680159 0 (12583008..14680159) 2097152 3: [4194304..8388607]: 25165920..29360223 0 (25165920..29360223) 4194304 4: [8388608..16777215]: 58720352..67108959 0 (58720352..67108959) 8388608 5: [16777216..33554423]: 117440584..134217791 0 (117440584..134217791) 16777208 6: [33554424..50331511]: 184549056..201326143 0 (184549056..201326143) 16777088 7: [50331512..67108599]: 251657408..268434495 0 (251657408..268434495) 16777088 and for 16 concurrent 16GB file writes: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL 0: [0..262143]: 2490472..2752615 0 (2490472..2752615) 262144 1: [262144..524287]: 6291560..6553703 0 (6291560..6553703) 262144 2: [524288..1048575]: 13631592..14155879 0 (13631592..14155879) 524288 3: [1048576..2097151]: 30408808..31457383 0 (30408808..31457383) 1048576 4: [2097152..4194303]: 52428904..54526055 0 (52428904..54526055) 2097152 5: [4194304..8388607]: 104857704..109052007 0 (104857704..109052007) 4194304 6: [8388608..16777215]: 209715304..218103911 0 (209715304..218103911) 8388608 7: [16777216..33554423]: 452984848..469762055 0 (452984848..469762055) 16777208 Because it is hard to take back specualtive preallocation, cases where there are large slow growing log files on a nearly full filesystem may cause premature ENOSPC. Hence as the filesystem nears full, the maximum dynamic prealloc size іs reduced according to this table (based on 4k block size): freespace max prealloc size >5% full extent (8GB) 4-5% 2GB (8GB >> 2) 3-4% 1GB (8GB >> 3) 2-3% 512MB (8GB >> 4) 1-2% 256MB (8GB >> 5) <1% 128MB (8GB >> 6) This should reduce the amount of space held in speculative preallocation for such cases. The allocsize mount option turns off the dynamic behaviour and fixes the prealloc size to whatever the mount option specifies. i.e. the behaviour is unchanged. Signed-off-by: N Dave Chinner <dchinner@redhat.com>
055388a3 · Dave Chinner · Dave Chinner · 622d8149 · 055388a3 · 055388a3
Showing with 110 addition and 10 deletion

fs/xfs/xfs_fsops.c fs/xfs/xfs_fsops.c +1 -0

fs/xfs/xfs_iomap.c fs/xfs/xfs_iomap.c +74 -10

fs/xfs/xfs_mount.c fs/xfs/xfs_mount.c +21 -0

fs/xfs/xfs_mount.h fs/xfs/xfs_mount.h +14 -0

未找到文件。
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -374,6 +374,7 @@ xfs_growfs_data_private(
 		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
 	} else
 		mp->m_maxicount = 0;
+	xfs_set_low_space_thresholds(mp);

 	/* update secondary superblocks. */
 	for (agno = 1; agno < nagcount; agno++) {

--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -267,6 +267,9 @@ xfs_iomap_write_direct(
 * If the caller is doing a write at the end of the file, then extend the
 * allocation out to the file system's write iosize.  We clean up any extra
 * space left over when the file is closed in xfs_inactive().
+ *
+ * If we find we already have delalloc preallocation beyond EOF, don't do more
+ * preallocation as it it not needed.
 */
 STATIC int
 xfs_iomap_eof_want_preallocate(
@@ -282,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
 	xfs_filblks_t   count_fsb;
 	xfs_fsblock_t	firstblock;
 	int		n, error, imaps;
+	int		found_delalloc = 0;

 	*prealloc = 0;
 	if ((offset + count) <= ip->i_size)
@@ -306,12 +310,60 @@ xfs_iomap_eof_want_preallocate(
 				return 0;
 			start_fsb += imap[n].br_blockcount;
 			count_fsb -= imap[n].br_blockcount;
+
+			if (imap[n].br_startblock == DELAYSTARTBLOCK)
+				found_delalloc = 1;
 		}
 	}
+	if (!found_delalloc)
 		*prealloc = 1;
 	return 0;
 }

+/*
+ * If we don't have a user specified preallocation size, dynamically increase
+ * the preallocation size as the size of the file grows. Cap the maximum size
+ * at a single extent or less if the filesystem is near full. The closer the
+ * filesystem is to full, the smaller the maximum prealocation.
+ */
+STATIC xfs_fsblock_t
+xfs_iomap_prealloc_size(
+	struct xfs_mount	*mp,
+	struct xfs_inode	*ip)
+{
+	xfs_fsblock_t		alloc_blocks = 0;
+
+	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
+		int shift = 0;
+		int64_t freesp;
+
+		alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);
+		alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
+					rounddown_pow_of_two(alloc_blocks));
+
+		xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+		freesp = mp->m_sb.sb_fdblocks;
+		if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
+			shift = 2;
+			if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
+				shift++;
+			if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
+				shift++;
+			if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
+				shift++;
+			if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
+				shift++;
+		}
+		if (shift)
+			alloc_blocks >>= shift;
+	}
+
+	if (alloc_blocks < mp->m_writeio_blocks)
+		alloc_blocks = mp->m_writeio_blocks;
+
+	return alloc_blocks;
+}
+
 int
 xfs_iomap_write_delay(
 	xfs_inode_t	*ip,
@@ -344,6 +396,7 @@ xfs_iomap_write_delay(
 	extsz = xfs_get_extsz_hint(ip);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);

+
 	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
 				imap, XFS_WRITE_IMAPS, &prealloc);
 	if (error)
@@ -351,9 +404,11 @@ xfs_iomap_write_delay(

 retry:
 	if (prealloc) {
+		xfs_fsblock_t	alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
+
 		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
 		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
-		last_fsb = ioalign + mp->m_writeio_blocks;
+		last_fsb = ioalign + alloc_blocks;
 	} else {
 		last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
 	}
@@ -371,22 +426,31 @@ xfs_iomap_write_delay(
 			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
 			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
 			  &nimaps, NULL);
-	if (error && (error != ENOSPC))
+	switch (error) {
+	case 0:
+	case ENOSPC:
+	case EDQUOT:
+		break;
+	default:
 		return XFS_ERROR(error);
+	}

 	/*
-	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
-	 * then we must have run out of space - flush all other inodes with
-	 * delalloc blocks and retry without EOF preallocation.
+	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
+	 * ENOSPC, * flush all other inodes with delalloc blocks to free up
+	 * some of the excess reserved metadata space. For both cases, retry
+	 * without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		trace_xfs_delalloc_enospc(ip, offset, count);
 		if (flushed)
-			return XFS_ERROR(ENOSPC);
+			return XFS_ERROR(error ? error : ENOSPC);

+		if (error == ENOSPC) {
 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
 			xfs_flush_inodes(ip);
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
+		}

 		flushed = 1;
 		error = 0;

--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -974,6 +974,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
 	mp->m_writeio_blocks = 1 << (mp->m_writeio_log - sbp->sb_blocklog);
 }

+/*
+ * precalculate the low space thresholds for dynamic speculative preallocation.
+ */
+void
+xfs_set_low_space_thresholds(
+	struct xfs_mount	*mp)
+{
+	int i;
+
+	for (i = 0; i < XFS_LOWSP_MAX; i++) {
+		__uint64_t space = mp->m_sb.sb_dblocks;
+
+		do_div(space, 100);
+		mp->m_low_space[i] = space * (i + 1);
+	}
+}
+
+
 /*
 * Set whether we're using inode alignment.
 */
@@ -1196,6 +1214,9 @@ xfs_mountfs(
 	 */
 	xfs_set_rw_sizes(mp);

+	/* set the low space thresholds for dynamic preallocation */
+	xfs_set_low_space_thresholds(mp);
+
 	/*
 	 * Set the inode cluster size.
 	 * This may still be overridden by the file system

--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -103,6 +103,16 @@ extern int	xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t,
 	xfs_mod_incore_sb(mp, field, delta, rsvd)
 #endif

+/* dynamic preallocation free space thresholds, 5% down to 1% */
+enum {
+	XFS_LOWSP_1_PCNT = 0,
+	XFS_LOWSP_2_PCNT,
+	XFS_LOWSP_3_PCNT,
+	XFS_LOWSP_4_PCNT,
+	XFS_LOWSP_5_PCNT,
+	XFS_LOWSP_MAX,
+};
+
 typedef struct xfs_mount {
 	struct super_block	*m_super;
 	xfs_tid_t		m_tid;		/* next unused tid for fs */
@@ -202,6 +212,8 @@ typedef struct xfs_mount {
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
 	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
+	int64_t			m_low_space[XFS_LOWSP_MAX];
+						/* low free space thresholds */
 } xfs_mount_t;

 /*
@@ -379,6 +391,8 @@ extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);

 extern int	xfs_dev_is_read_only(struct xfs_mount *, char *);

+extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
+
 #endif	/* __KERNEL__ */

 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);