提交 d6211330 编写于 作者: C Chandan Babu R 提交者: Darrick J. Wong

xfs: Fix false ENOSPC when performing direct write on a delalloc extent in cow fork

On a highly fragmented filesystem, a Direct IO write can fail with -ENOSPC
even though the filesystem has a sufficient number of free blocks.

This occurs if the file offset range on which the write operation is being
performed has a delalloc extent in the cow fork and this delalloc extent
begins much before the Direct IO range.

In such a scenario, xfs_reflink_allocate_cow() invokes xfs_bmapi_write() to
allocate the blocks mapped by the delalloc extent. The extent thus allocated
may not cover the beginning of file offset range on which the Direct IO write
was issued. Hence xfs_reflink_allocate_cow() ends up returning -ENOSPC.

The following script reliably recreates the bug described above.

  #!/usr/bin/bash

  device=/dev/loop0
  shortdev=$(basename $device)

  mntpnt=/mnt/
  file1=${mntpnt}/file1
  file2=${mntpnt}/file2
  fragmentedfile=${mntpnt}/fragmentedfile
  punchprog=/root/repos/xfstests-dev/src/punch-alternating

  errortag=/sys/fs/xfs/${shortdev}/errortag/bmap_alloc_minlen_extent

  umount $device > /dev/null 2>&1

  echo "Create FS"
  mkfs.xfs -f -m reflink=1 $device > /dev/null 2>&1
  if [[ $? != 0 ]]; then
  	echo "mkfs failed."
  	exit 1
  fi

  echo "Mount FS"
  mount $device $mntpnt > /dev/null 2>&1
  if [[ $? != 0 ]]; then
  	echo "mount failed."
  	exit 1
  fi

  echo "Create source file"
  xfs_io -f -c "pwrite 0 32M" $file1 > /dev/null 2>&1

  sync

  echo "Create Reflinked file"
  xfs_io -f -c "reflink $file1" $file2 &>/dev/null

  echo "Set cowextsize"
  xfs_io -c "cowextsize 16M" $file1 > /dev/null 2>&1

  echo "Fragment FS"
  xfs_io -f -c "pwrite 0 64M" $fragmentedfile > /dev/null 2>&1
  sync
  $punchprog $fragmentedfile

  echo "Allocate block sized extent from now onwards"
  echo -n 1 > $errortag

  echo "Create 16MiB delalloc extent in CoW fork"
  xfs_io -c "pwrite 0 4k" $file1 > /dev/null 2>&1

  sync

  echo "Direct I/O write at offset 12k"
  xfs_io -d -c "pwrite 12k 8k" $file1

This commit fixes the bug by invoking xfs_bmapi_write() in a loop until disk
blocks are allocated for at least the starting file offset of the Direct IO
write range.

Fixes: 3c68d44a ("xfs: allocate direct I/O COW blocks in iomap_begin")
Reported-and-Root-caused-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Chandan Babu R <chandan.babu@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
[djwong: slight editing to make the locking less grody, and fix some style things]
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
上级 f0c2d7d2
......@@ -341,36 +341,55 @@ xfs_find_trim_cow_extent(
return 0;
}
/* Allocate all CoW reservations covering a range of blocks in a file. */
int
xfs_reflink_allocate_cow(
static int
xfs_reflink_convert_unwritten(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap,
bool *shared,
uint *lockmode,
bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
xfs_filblks_t count_fsb = imap->br_blockcount;
struct xfs_trans *tp;
int nimaps, error = 0;
bool found;
xfs_filblks_t resaligned;
xfs_extlen_t resblks = 0;
int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (!ip->i_cowfp) {
ASSERT(!xfs_is_reflink_inode(ip));
xfs_ifork_init_cow(ip);
}
/*
* cmap might be larger than imap due to the cowextsize hint.
*/
xfs_trim_extent(cmap, offset_fsb, count_fsb);
/*
* COW fork extents are supposed to remain unwritten until we're ready
* to initiate a disk write. For direct I/O we are going to write the
* data and need the conversion, but for buffered writes we're done.
*/
if (!convert_now || cmap->br_state == XFS_EXT_NORM)
return 0;
trace_xfs_reflink_convert_cow(ip, cmap);
error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
if (!error)
cmap->br_state = XFS_EXT_NORM;
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
return error;
if (found)
goto convert;
}
static int
xfs_reflink_fill_cow_hole(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap,
bool *shared,
uint *lockmode,
bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
xfs_filblks_t resaligned;
xfs_extlen_t resblks;
int nimaps;
int error;
bool found;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
......@@ -386,17 +405,17 @@ xfs_reflink_allocate_cow(
*lockmode = XFS_ILOCK_EXCL;
/*
* Check for an overlapping extent again now that we dropped the ilock.
*/
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
if (error || !*shared)
goto out_trans_cancel;
if (found) {
xfs_trans_cancel(tp);
goto convert;
}
ASSERT(cmap->br_startoff > imap->br_startoff);
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
......@@ -416,26 +435,135 @@ xfs_reflink_allocate_cow(
*/
if (nimaps == 0)
return -ENOSPC;
convert:
xfs_trim_extent(cmap, offset_fsb, count_fsb);
return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
static int
xfs_reflink_fill_delalloc(
struct xfs_inode *ip,
struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap,
bool *shared,
uint *lockmode,
bool convert_now)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
int nimaps;
int error;
bool found;
do {
xfs_iunlock(ip, *lockmode);
*lockmode = 0;
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
false, &tp);
if (error)
return error;
*lockmode = XFS_ILOCK_EXCL;
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
&found);
if (error || !*shared)
goto out_trans_cancel;
if (found) {
xfs_trans_cancel(tp);
break;
}
ASSERT(isnullstartblock(cmap->br_startblock) ||
cmap->br_startblock == DELAYSTARTBLOCK);
/*
* COW fork extents are supposed to remain unwritten until we're ready
* to initiate a disk write. For direct I/O we are going to write the
* data and need the conversion, but for buffered writes we're done.
* Replace delalloc reservation with an unwritten extent.
*/
if (!convert_now || cmap->br_state == XFS_EXT_NORM)
return 0;
trace_xfs_reflink_convert_cow(ip, cmap);
error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
if (!error)
cmap->br_state = XFS_EXT_NORM;
nimaps = 1;
error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
cmap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
cmap, &nimaps);
if (error)
goto out_trans_cancel;
xfs_inode_set_cowblocks_tag(ip);
error = xfs_trans_commit(tp);
if (error)
return error;
/*
* Allocation succeeded but the requested range was not even
* partially satisfied? Bail out!
*/
if (nimaps == 0)
return -ENOSPC;
} while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
out_trans_cancel:
xfs_trans_cancel(tp);
return error;
}
/*
 * Allocate all CoW reservations covering a range of blocks in a file.
 *
 * The caller must hold the ILOCK exclusively (asserted below).  If the inode
 * has no CoW fork yet, one is initialised first.  The CoW fork is then looked
 * up with xfs_find_trim_cow_extent() and the work is dispatched on the result:
 *
 *  - lookup error, or *shared false after the lookup: return the lookup's
 *    status without doing anything else;
 *  - a real CoW extent was found: hand it to xfs_reflink_convert_unwritten();
 *  - cmap starts beyond the start of imap (a hole in the CoW fork):
 *    xfs_reflink_fill_cow_hole();
 *  - cmap is a delalloc reservation: xfs_reflink_fill_delalloc();
 *  - anything else: -EFSCORRUPTED.
 *
 * imap, cmap, shared, lockmode and convert_now are passed through unchanged
 * to whichever helper is selected.
 */
int
xfs_reflink_allocate_cow(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	struct xfs_bmbt_irec	*cmap,
	bool			*shared,
	uint			*lockmode,
	bool			convert_now)
{
	bool			found;
	int			error;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));

	/* No CoW fork yet: set one up before looking anything up in it. */
	if (!ip->i_cowfp) {
		ASSERT(!xfs_is_reflink_inode(ip));
		xfs_ifork_init_cow(ip);
	}

	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
	if (error)
		return error;
	if (!*shared)
		return 0;

	if (found) {
		/* CoW fork has a real extent */
		return xfs_reflink_convert_unwritten(ip, imap, cmap,
				convert_now);
	}

	if (cmap->br_startoff > imap->br_startoff) {
		/*
		 * CoW fork does not have an extent and data extent is shared.
		 * Allocate a real extent in the CoW fork.
		 */
		return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
				lockmode, convert_now);
	}

	if (!isnullstartblock(cmap->br_startblock) &&
	    cmap->br_startblock != DELAYSTARTBLOCK) {
		/* Shouldn't get here. */
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/*
	 * CoW fork has a delalloc reservation.  Replace it with a real extent.
	 * There may or may not be a data fork mapping.
	 */
	return xfs_reflink_fill_delalloc(ip, imap, cmap, shared, lockmode,
			convert_now);
}
/*
* Cancel CoW reservations for some block range of an inode.
*
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册