Unverified commit 1a82ccca, authored by openeuler-ci-bot and committed by Gitee

!631 Backport CVEs and bugfixes

Merge Pull Request from: @zhangjialin11 
 
Pull in fixes for new CVEs:
CVE-2023-1855
CVE-2023-2006
CVE-2023-30772
CVE-2023-1872

net bugfixes from Ziyang Xuan
mm cleanup from Ma Wupeng
timer bugfix from Yu Liao
xfs bugfixes from Guo Xuenan 
 
Link: https://gitee.com/openeuler/kernel/pulls/631

Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com> 
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com> 
...@@ -768,6 +768,7 @@ static int xgene_hwmon_remove(struct platform_device *pdev) ...@@ -768,6 +768,7 @@ static int xgene_hwmon_remove(struct platform_device *pdev)
{ {
struct xgene_hwmon_dev *ctx = platform_get_drvdata(pdev); struct xgene_hwmon_dev *ctx = platform_get_drvdata(pdev);
cancel_work_sync(&ctx->workq);
hwmon_device_unregister(ctx->hwmon_dev); hwmon_device_unregister(ctx->hwmon_dev);
kfifo_free(&ctx->async_msg_fifo); kfifo_free(&ctx->async_msg_fifo);
if (acpi_disabled) if (acpi_disabled)
......
...@@ -1689,6 +1689,20 @@ void bond_lower_state_changed(struct slave *slave) ...@@ -1689,6 +1689,20 @@ void bond_lower_state_changed(struct slave *slave)
netdev_lower_state_changed(slave->dev, &info); netdev_lower_state_changed(slave->dev, &info);
} }
/* The bonding driver uses ether_setup() to convert a master bond device
* to ARPHRD_ETHER, that resets the target netdevice's flags so we always
* have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP
* if they were set
*/
static void bond_ether_setup(struct net_device *bond_dev)
{
unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP);
ether_setup(bond_dev);
bond_dev->flags |= IFF_MASTER | flags;
bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
}
/* enslave device <slave> to bond device <master> */ /* enslave device <slave> to bond device <master> */
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
struct netlink_ext_ack *extack) struct netlink_ext_ack *extack)
...@@ -1783,10 +1797,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, ...@@ -1783,10 +1797,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
if (slave_dev->type != ARPHRD_ETHER) if (slave_dev->type != ARPHRD_ETHER)
bond_setup_by_slave(bond_dev, slave_dev); bond_setup_by_slave(bond_dev, slave_dev);
else { else
ether_setup(bond_dev); bond_ether_setup(bond_dev);
bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
}
call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
bond_dev); bond_dev);
...@@ -2171,9 +2183,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, ...@@ -2171,9 +2183,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
eth_hw_addr_random(bond_dev); eth_hw_addr_random(bond_dev);
if (bond_dev->type != ARPHRD_ETHER) { if (bond_dev->type != ARPHRD_ETHER) {
dev_close(bond_dev); dev_close(bond_dev);
ether_setup(bond_dev); bond_ether_setup(bond_dev);
bond_dev->flags |= IFF_MASTER;
bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
} }
} }
......
...@@ -662,6 +662,7 @@ static int da9150_charger_remove(struct platform_device *pdev) ...@@ -662,6 +662,7 @@ static int da9150_charger_remove(struct platform_device *pdev)
if (!IS_ERR_OR_NULL(charger->usb_phy)) if (!IS_ERR_OR_NULL(charger->usb_phy))
usb_unregister_notifier(charger->usb_phy, &charger->otg_nb); usb_unregister_notifier(charger->usb_phy, &charger->otg_nb);
cancel_work_sync(&charger->otg_work);
power_supply_unregister(charger->battery); power_supply_unregister(charger->battery);
power_supply_unregister(charger->usb); power_supply_unregister(charger->usb);
......
...@@ -90,7 +90,7 @@ xfs_allocbt_free_block( ...@@ -90,7 +90,7 @@ xfs_allocbt_free_block(
xfs_agblock_t bno; xfs_agblock_t bno;
int error; int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
if (error) if (error)
return error; return error;
......
...@@ -1313,7 +1313,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ...@@ -1313,7 +1313,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) { for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->bp) { if (blk->bp) {
blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->disk_blkno = xfs_buf_daddr(blk->bp);
blk->bp = NULL; blk->bp = NULL;
} else { } else {
blk->disk_blkno = 0; blk->disk_blkno = 0;
...@@ -1328,7 +1328,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ...@@ -1328,7 +1328,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) { for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->bp) { if (blk->bp) {
blk->disk_blkno = XFS_BUF_ADDR(blk->bp); blk->disk_blkno = xfs_buf_daddr(blk->bp);
blk->bp = NULL; blk->bp = NULL;
} else { } else {
blk->disk_blkno = 0; blk->disk_blkno = 0;
......
...@@ -239,7 +239,7 @@ xfs_bmap_get_bp( ...@@ -239,7 +239,7 @@ xfs_bmap_get_bp(
for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) {
if (!cur->bc_bufs[i]) if (!cur->bc_bufs[i])
break; break;
if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) if (xfs_buf_daddr(cur->bc_bufs[i]) == bno)
return cur->bc_bufs[i]; return cur->bc_bufs[i];
} }
...@@ -248,7 +248,7 @@ xfs_bmap_get_bp( ...@@ -248,7 +248,7 @@ xfs_bmap_get_bp(
struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip; struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip;
if (bip->bli_item.li_type == XFS_LI_BUF && if (bip->bli_item.li_type == XFS_LI_BUF &&
XFS_BUF_ADDR(bip->bli_buf) == bno) xfs_buf_daddr(bip->bli_buf) == bno)
return bip->bli_buf; return bip->bli_buf;
} }
......
...@@ -282,7 +282,7 @@ xfs_bmbt_free_block( ...@@ -282,7 +282,7 @@ xfs_bmbt_free_block(
struct xfs_mount *mp = cur->bc_mp; struct xfs_mount *mp = cur->bc_mp;
struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_inode *ip = cur->bc_ino.ip;
struct xfs_trans *tp = cur->bc_tp; struct xfs_trans *tp = cur->bc_tp;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
struct xfs_owner_info oinfo; struct xfs_owner_info oinfo;
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
......
...@@ -50,6 +50,71 @@ xfs_btree_magic( ...@@ -50,6 +50,71 @@ xfs_btree_magic(
return magic; return magic;
} }
/*
* These sibling pointer checks are optimised for null sibling pointers. This
* happens a lot, and we don't need to byte swap at runtime if the sibling
* pointer is NULL.
*
* These are explicitly marked at inline because the cost of calling them as
* functions instead of inlining them is about 36 bytes extra code per call site
* on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
* two sibling check functions reduces the compiled code size by over 300
* bytes.
*/
static inline xfs_failaddr_t
xfs_btree_check_lblock_siblings(
struct xfs_mount *mp,
struct xfs_btree_cur *cur,
int level,
xfs_fsblock_t fsb,
__be64 dsibling)
{
xfs_fsblock_t sibling;
if (dsibling == cpu_to_be64(NULLFSBLOCK))
return NULL;
sibling = be64_to_cpu(dsibling);
if (sibling == fsb)
return __this_address;
if (level >= 0) {
if (!xfs_btree_check_lptr(cur, sibling, level + 1))
return __this_address;
} else {
if (!xfs_verify_fsbno(mp, sibling))
return __this_address;
}
return NULL;
}
static inline xfs_failaddr_t
xfs_btree_check_sblock_siblings(
struct xfs_mount *mp,
struct xfs_btree_cur *cur,
int level,
xfs_agnumber_t agno,
xfs_agblock_t agbno,
__be32 dsibling)
{
xfs_agblock_t sibling;
if (dsibling == cpu_to_be32(NULLAGBLOCK))
return NULL;
sibling = be32_to_cpu(dsibling);
if (sibling == agbno)
return __this_address;
if (level >= 0) {
if (!xfs_btree_check_sptr(cur, sibling, level + 1))
return __this_address;
} else {
if (!xfs_verify_agbno(mp, agno, sibling))
return __this_address;
}
return NULL;
}
/* /*
* Check a long btree block header. Return the address of the failing check, * Check a long btree block header. Return the address of the failing check,
* or NULL if everything is ok. * or NULL if everything is ok.
...@@ -64,6 +129,8 @@ __xfs_btree_check_lblock( ...@@ -64,6 +129,8 @@ __xfs_btree_check_lblock(
struct xfs_mount *mp = cur->bc_mp; struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum; xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_sb_version_hascrc(&mp->m_sb); int crc = xfs_sb_version_hascrc(&mp->m_sb);
xfs_failaddr_t fa;
xfs_fsblock_t fsb = NULLFSBLOCK;
if (crc) { if (crc) {
if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
...@@ -82,16 +149,16 @@ __xfs_btree_check_lblock( ...@@ -82,16 +149,16 @@ __xfs_btree_check_lblock(
if (be16_to_cpu(block->bb_numrecs) > if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level)) cur->bc_ops->get_maxrecs(cur, level))
return __this_address; return __this_address;
if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
!xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib),
level + 1))
return __this_address;
if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
!xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib),
level + 1))
return __this_address;
return NULL; if (bp)
fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
block->bb_u.l.bb_leftsib);
if (!fa)
fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
block->bb_u.l.bb_rightsib);
return fa;
} }
/* Check a long btree block header. */ /* Check a long btree block header. */
...@@ -129,6 +196,9 @@ __xfs_btree_check_sblock( ...@@ -129,6 +196,9 @@ __xfs_btree_check_sblock(
struct xfs_mount *mp = cur->bc_mp; struct xfs_mount *mp = cur->bc_mp;
xfs_btnum_t btnum = cur->bc_btnum; xfs_btnum_t btnum = cur->bc_btnum;
int crc = xfs_sb_version_hascrc(&mp->m_sb); int crc = xfs_sb_version_hascrc(&mp->m_sb);
xfs_failaddr_t fa;
xfs_agblock_t agbno = NULLAGBLOCK;
xfs_agnumber_t agno = NULLAGNUMBER;
if (crc) { if (crc) {
if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
...@@ -145,16 +215,18 @@ __xfs_btree_check_sblock( ...@@ -145,16 +215,18 @@ __xfs_btree_check_sblock(
if (be16_to_cpu(block->bb_numrecs) > if (be16_to_cpu(block->bb_numrecs) >
cur->bc_ops->get_maxrecs(cur, level)) cur->bc_ops->get_maxrecs(cur, level))
return __this_address; return __this_address;
if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) &&
!xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib),
level + 1))
return __this_address;
if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) &&
!xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib),
level + 1))
return __this_address;
return NULL; if (bp) {
agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
}
fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno,
block->bb_u.s.bb_leftsib);
if (!fa)
fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno,
agbno, block->bb_u.s.bb_rightsib);
return fa;
} }
/* Check a short btree block header. */ /* Check a short btree block header. */
...@@ -423,7 +495,7 @@ xfs_btree_dup_cursor( ...@@ -423,7 +495,7 @@ xfs_btree_dup_cursor(
bp = cur->bc_bufs[i]; bp = cur->bc_bufs[i];
if (bp) { if (bp) {
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_BUF_ADDR(bp), mp->m_bsize, xfs_buf_daddr(bp), mp->m_bsize,
0, &bp, 0, &bp,
cur->bc_ops->buf_ops); cur->bc_ops->buf_ops);
if (error) { if (error) {
...@@ -1195,10 +1267,10 @@ xfs_btree_buf_to_ptr( ...@@ -1195,10 +1267,10 @@ xfs_btree_buf_to_ptr(
{ {
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
XFS_BUF_ADDR(bp))); xfs_buf_daddr(bp)));
else { else {
ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
XFS_BUF_ADDR(bp))); xfs_buf_daddr(bp)));
} }
} }
...@@ -1742,7 +1814,7 @@ xfs_btree_lookup_get_block( ...@@ -1742,7 +1814,7 @@ xfs_btree_lookup_get_block(
error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
if (error) if (error)
return error; return error;
if (bp && XFS_BUF_ADDR(bp) == daddr) { if (bp && xfs_buf_daddr(bp) == daddr) {
*blkp = XFS_BUF_TO_BLOCK(bp); *blkp = XFS_BUF_TO_BLOCK(bp);
return 0; return 0;
} }
...@@ -3204,7 +3276,7 @@ xfs_btree_insrec( ...@@ -3204,7 +3276,7 @@ xfs_btree_insrec(
struct xfs_btree_block *block; /* btree block */ struct xfs_btree_block *block; /* btree block */
struct xfs_buf *bp; /* buffer for block */ struct xfs_buf *bp; /* buffer for block */
union xfs_btree_ptr nptr; /* new block ptr */ union xfs_btree_ptr nptr; /* new block ptr */
struct xfs_btree_cur *ncur; /* new btree cursor */ struct xfs_btree_cur *ncur = NULL; /* new btree cursor */
union xfs_btree_key nkey; /* new block key */ union xfs_btree_key nkey; /* new block key */
union xfs_btree_key *lkey; union xfs_btree_key *lkey;
int optr; /* old key/record index */ int optr; /* old key/record index */
...@@ -3284,7 +3356,7 @@ xfs_btree_insrec( ...@@ -3284,7 +3356,7 @@ xfs_btree_insrec(
#ifdef DEBUG #ifdef DEBUG
error = xfs_btree_check_block(cur, block, level, bp); error = xfs_btree_check_block(cur, block, level, bp);
if (error) if (error)
return error; goto error0;
#endif #endif
/* /*
...@@ -3304,7 +3376,7 @@ xfs_btree_insrec( ...@@ -3304,7 +3376,7 @@ xfs_btree_insrec(
for (i = numrecs - ptr; i >= 0; i--) { for (i = numrecs - ptr; i >= 0; i--) {
error = xfs_btree_debug_check_ptr(cur, pp, i, level); error = xfs_btree_debug_check_ptr(cur, pp, i, level);
if (error) if (error)
return error; goto error0;
} }
xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
...@@ -3389,6 +3461,8 @@ xfs_btree_insrec( ...@@ -3389,6 +3461,8 @@ xfs_btree_insrec(
return 0; return 0;
error0: error0:
if (ncur)
xfs_btree_del_cursor(ncur, error);
return error; return error;
} }
...@@ -4281,6 +4355,21 @@ xfs_btree_visit_block( ...@@ -4281,6 +4355,21 @@ xfs_btree_visit_block(
if (xfs_btree_ptr_is_null(cur, &rptr)) if (xfs_btree_ptr_is_null(cur, &rptr))
return -ENOENT; return -ENOENT;
/*
* We only visit blocks once in this walk, so we have to avoid the
* internal xfs_btree_lookup_get_block() optimisation where it will
* return the same block without checking if the right sibling points
* back to us and creates a cyclic reference in the btree.
*/
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
xfs_buf_daddr(bp)))
return -EFSCORRUPTED;
} else {
if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
xfs_buf_daddr(bp)))
return -EFSCORRUPTED;
}
return xfs_btree_lookup_get_block(cur, level, &rptr, &block); return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
} }
...@@ -4455,20 +4544,21 @@ xfs_btree_lblock_verify( ...@@ -4455,20 +4544,21 @@ xfs_btree_lblock_verify(
{ {
struct xfs_mount *mp = bp->b_mount; struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
xfs_fsblock_t fsb;
xfs_failaddr_t fa;
/* numrecs verification */ /* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs) if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address; return __this_address;
/* sibling pointer verification */ /* sibling pointer verification */
if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
!xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
return __this_address; block->bb_u.l.bb_leftsib);
if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && if (!fa)
!xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
return __this_address; block->bb_u.l.bb_rightsib);
return fa;
return NULL;
} }
/** /**
...@@ -4509,22 +4599,23 @@ xfs_btree_sblock_verify( ...@@ -4509,22 +4599,23 @@ xfs_btree_sblock_verify(
{ {
struct xfs_mount *mp = bp->b_mount; struct xfs_mount *mp = bp->b_mount;
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
xfs_agblock_t agno; xfs_agnumber_t agno;
xfs_agblock_t agbno;
xfs_failaddr_t fa;
/* numrecs verification */ /* numrecs verification */
if (be16_to_cpu(block->bb_numrecs) > max_recs) if (be16_to_cpu(block->bb_numrecs) > max_recs)
return __this_address; return __this_address;
/* sibling pointer verification */ /* sibling pointer verification */
agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
!xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
return __this_address; block->bb_u.s.bb_leftsib);
if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && if (!fa)
!xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
return __this_address; block->bb_u.s.bb_rightsib);
return fa;
return NULL;
} }
/* /*
......
...@@ -156,7 +156,7 @@ __xfs_inobt_free_block( ...@@ -156,7 +156,7 @@ __xfs_inobt_free_block(
{ {
xfs_inobt_mod_blockcount(cur, -1); xfs_inobt_mod_blockcount(cur, -1);
return xfs_free_extent(cur->bc_tp, return xfs_free_extent(cur->bc_tp,
XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1,
&XFS_RMAP_OINFO_INOBT, resv); &XFS_RMAP_OINFO_INOBT, resv);
} }
......
...@@ -48,7 +48,7 @@ xfs_inode_buf_verify( ...@@ -48,7 +48,7 @@ xfs_inode_buf_verify(
/* /*
* Validate the magic number and version of every inode in the buffer * Validate the magic number and version of every inode in the buffer
*/ */
agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
for (i = 0; i < ni; i++) { for (i = 0; i < ni; i++) {
int di_ok; int di_ok;
......
...@@ -102,7 +102,7 @@ xfs_refcountbt_free_block( ...@@ -102,7 +102,7 @@ xfs_refcountbt_free_block(
struct xfs_mount *mp = cur->bc_mp; struct xfs_mount *mp = cur->bc_mp;
struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_buf *agbp = cur->bc_ag.agbp;
struct xfs_agf *agf = agbp->b_addr; struct xfs_agf *agf = agbp->b_addr;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
int error; int error;
trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno, trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno,
......
...@@ -124,7 +124,7 @@ xfs_rmapbt_free_block( ...@@ -124,7 +124,7 @@ xfs_rmapbt_free_block(
xfs_agblock_t bno; xfs_agblock_t bno;
int error; int error;
bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno, trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno,
bno, 1); bno, 1);
be32_add_cpu(&agf->agf_rmap_blocks, -1); be32_add_cpu(&agf->agf_rmap_blocks, -1);
......
...@@ -156,7 +156,7 @@ xfs_validate_sb_write( ...@@ -156,7 +156,7 @@ xfs_validate_sb_write(
* secondary superblocks, so allow this usage to continue because * secondary superblocks, so allow this usage to continue because
* we never read counters from such superblocks. * we never read counters from such superblocks.
*/ */
if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress &&
(sbp->sb_fdblocks > sbp->sb_dblocks || (sbp->sb_fdblocks > sbp->sb_dblocks ||
!xfs_verify_icount(mp, sbp->sb_icount) || !xfs_verify_icount(mp, sbp->sb_icount) ||
sbp->sb_ifree > sbp->sb_icount)) { sbp->sb_ifree > sbp->sb_icount)) {
...@@ -382,17 +382,6 @@ xfs_validate_sb_common( ...@@ -382,17 +382,6 @@ xfs_validate_sb_common(
return -EFSCORRUPTED; return -EFSCORRUPTED;
} }
/*
* Until this is fixed only page-sized or smaller data blocks work.
*/
if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
xfs_warn(mp,
"File system with blocksize %d bytes. "
"Only pagesize (%ld) or less will currently work.",
sbp->sb_blocksize, PAGE_SIZE);
return -ENOSYS;
}
/* /*
* Currently only very few inode sizes are supported. * Currently only very few inode sizes are supported.
*/ */
...@@ -408,22 +397,6 @@ xfs_validate_sb_common( ...@@ -408,22 +397,6 @@ xfs_validate_sb_common(
return -ENOSYS; return -ENOSYS;
} }
if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
xfs_warn(mp,
"file system too large to be mounted on this system.");
return -EFBIG;
}
/*
* Don't touch the filesystem if a user tool thinks it owns the primary
* superblock. mkfs doesn't clear the flag from secondary supers, so
* we don't check them at all.
*/
if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && sbp->sb_inprogress) {
xfs_warn(mp, "Offline file system operation in progress!");
return -EFSCORRUPTED;
}
return 0; return 0;
} }
......
...@@ -435,12 +435,12 @@ xchk_btree_check_owner( ...@@ -435,12 +435,12 @@ xchk_btree_check_owner(
if (!co) if (!co)
return -ENOMEM; return -ENOMEM;
co->level = level; co->level = level;
co->daddr = XFS_BUF_ADDR(bp); co->daddr = xfs_buf_daddr(bp);
list_add_tail(&co->list, &bs->to_check); list_add_tail(&co->list, &bs->to_check);
return 0; return 0;
} }
return xchk_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp)); return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
} }
/* /*
......
...@@ -178,7 +178,7 @@ xfs_attr3_node_inactive( ...@@ -178,7 +178,7 @@ xfs_attr3_node_inactive(
return error; return error;
/* save for re-read later */ /* save for re-read later */
child_blkno = XFS_BUF_ADDR(child_bp); child_blkno = xfs_buf_daddr(child_bp);
/* /*
* Invalidate the subtree, however we have to. * Invalidate the subtree, however we have to.
......
...@@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(unsigned int count) ...@@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(unsigned int count)
return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES); return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES);
} }
static void
xfs_flush_bdev_async_endio(
struct bio *bio)
{
complete(bio->bi_private);
}
/*
* Submit a request for an async cache flush to run. If the request queue does
* not require flush operations, just skip it altogether. If the caller needs
* to wait for the flush completion at a later point in time, they must supply a
* valid completion. This will be signalled when the flush completes. The
* caller never sees the bio that is issued here.
*/
void
xfs_flush_bdev_async(
struct bio *bio,
struct block_device *bdev,
struct completion *done)
{
struct request_queue *q = bdev->bd_disk->queue;
if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
complete(done);
return;
}
bio_init(bio, NULL, 0);
bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
bio->bi_private = done;
bio->bi_end_io = xfs_flush_bdev_async_endio;
submit_bio(bio);
}
int int
xfs_rw_bdev( xfs_rw_bdev(
struct block_device *bdev, struct block_device *bdev,
......
...@@ -1402,7 +1402,7 @@ xfs_buf_ioerror_alert( ...@@ -1402,7 +1402,7 @@ xfs_buf_ioerror_alert(
{ {
xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
func, (uint64_t)XFS_BUF_ADDR(bp), func, (uint64_t)xfs_buf_daddr(bp),
bp->b_length, -bp->b_error); bp->b_length, -bp->b_error);
} }
......
...@@ -305,9 +305,13 @@ extern void xfs_buf_terminate(void); ...@@ -305,9 +305,13 @@ extern void xfs_buf_terminate(void);
* In future, uncached buffers will pass the block number directly to the io * In future, uncached buffers will pass the block number directly to the io
* request function and hence these macros will go away at that point. * request function and hence these macros will go away at that point.
*/ */
#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn)
#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno))
static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp)
{
return bp->b_maps[0].bm_bn;
}
void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref);
/* /*
......
...@@ -447,7 +447,7 @@ xfs_inodegc_queue_all( ...@@ -447,7 +447,7 @@ xfs_inodegc_queue_all(
for_each_online_cpu(cpu) { for_each_online_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu); gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list)) if (!llist_empty(&gc->list))
queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
} }
} }
...@@ -1874,8 +1874,8 @@ void ...@@ -1874,8 +1874,8 @@ void
xfs_inodegc_worker( xfs_inodegc_worker(
struct work_struct *work) struct work_struct *work)
{ {
struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, struct xfs_inodegc *gc = container_of(to_delayed_work(work),
work); struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list); struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n; struct xfs_inode *ip, *n;
unsigned int nofs_flag; unsigned int nofs_flag;
...@@ -1905,28 +1905,30 @@ xfs_inodegc_worker( ...@@ -1905,28 +1905,30 @@ xfs_inodegc_worker(
} }
/* /*
* Force all currently queued inode inactivation work to run immediately, and * Expedite all pending inodegc work to run immediately. This does not wait for
* wait for the work to finish. Two pass - queue all the work first pass, wait * completion of the work.
* for it in a second pass.
*/ */
void void
xfs_inodegc_flush( xfs_inodegc_push(
struct xfs_mount *mp) struct xfs_mount *mp)
{ {
struct xfs_inodegc *gc;
int cpu;
if (!xfs_is_inodegc_enabled(mp)) if (!xfs_is_inodegc_enabled(mp))
return; return;
trace_xfs_inodegc_push(mp, __return_address);
trace_xfs_inodegc_flush(mp, __return_address);
xfs_inodegc_queue_all(mp); xfs_inodegc_queue_all(mp);
}
for_each_online_cpu(cpu) { /*
gc = per_cpu_ptr(mp->m_inodegc, cpu); * Force all currently queued inode inactivation work to run immediately and
flush_work(&gc->work); * wait for the work to finish.
} */
void
xfs_inodegc_flush(
struct xfs_mount *mp)
{
xfs_inodegc_push(mp);
trace_xfs_inodegc_flush(mp, __return_address);
flush_workqueue(mp->m_inodegc_wq);
} }
/* /*
...@@ -1937,18 +1939,12 @@ void ...@@ -1937,18 +1939,12 @@ void
xfs_inodegc_stop( xfs_inodegc_stop(
struct xfs_mount *mp) struct xfs_mount *mp)
{ {
struct xfs_inodegc *gc;
int cpu;
if (!xfs_clear_inodegc_enabled(mp)) if (!xfs_clear_inodegc_enabled(mp))
return; return;
xfs_inodegc_queue_all(mp); xfs_inodegc_queue_all(mp);
drain_workqueue(mp->m_inodegc_wq);
for_each_online_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
cancel_work_sync(&gc->work);
}
trace_xfs_inodegc_stop(mp, __return_address); trace_xfs_inodegc_stop(mp, __return_address);
} }
...@@ -2068,6 +2064,7 @@ xfs_inodegc_queue( ...@@ -2068,6 +2064,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc; struct xfs_inodegc *gc;
int items; int items;
unsigned int shrinker_hits; unsigned int shrinker_hits;
unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip); trace_xfs_inode_set_need_inactive(ip);
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
...@@ -2079,19 +2076,26 @@ xfs_inodegc_queue( ...@@ -2079,19 +2076,26 @@ xfs_inodegc_queue(
items = READ_ONCE(gc->items); items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1); WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits); shrinker_hits = READ_ONCE(gc->shrinker_hits);
put_cpu_ptr(gc);
if (!xfs_is_inodegc_enabled(mp)) /*
* We queue the work while holding the current CPU so that the work
* is scheduled to run on this CPU.
*/
if (!xfs_is_inodegc_enabled(mp)) {
put_cpu_ptr(gc);
return; return;
}
if (xfs_inodegc_want_queue_work(ip, items))
queue_delay = 0;
if (xfs_inodegc_want_queue_work(ip, items)) {
trace_xfs_inodegc_queue(mp, __return_address); trace_xfs_inodegc_queue(mp, __return_address);
queue_work(mp->m_inodegc_wq, &gc->work); mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay);
} put_cpu_ptr(gc);
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address); trace_xfs_inodegc_throttle(mp, __return_address);
flush_work(&gc->work); flush_delayed_work(&gc->work);
} }
} }
...@@ -2108,7 +2112,7 @@ xfs_inodegc_cpu_dead( ...@@ -2108,7 +2112,7 @@ xfs_inodegc_cpu_dead(
unsigned int count = 0; unsigned int count = 0;
dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
cancel_work_sync(&dead_gc->work); cancel_delayed_work_sync(&dead_gc->work);
if (llist_empty(&dead_gc->list)) if (llist_empty(&dead_gc->list))
return; return;
...@@ -2127,12 +2131,12 @@ xfs_inodegc_cpu_dead( ...@@ -2127,12 +2131,12 @@ xfs_inodegc_cpu_dead(
llist_add_batch(first, last, &gc->list); llist_add_batch(first, last, &gc->list);
count += READ_ONCE(gc->items); count += READ_ONCE(gc->items);
WRITE_ONCE(gc->items, count); WRITE_ONCE(gc->items, count);
put_cpu_ptr(gc);
if (xfs_is_inodegc_enabled(mp)) { if (xfs_is_inodegc_enabled(mp)) {
trace_xfs_inodegc_queue(mp, __return_address); trace_xfs_inodegc_queue(mp, __return_address);
queue_work(mp->m_inodegc_wq, &gc->work); mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0);
} }
put_cpu_ptr(gc);
} }
/* /*
...@@ -2227,7 +2231,7 @@ xfs_inodegc_shrinker_scan( ...@@ -2227,7 +2231,7 @@ xfs_inodegc_shrinker_scan(
unsigned int h = READ_ONCE(gc->shrinker_hits); unsigned int h = READ_ONCE(gc->shrinker_hits);
WRITE_ONCE(gc->shrinker_hits, h + 1); WRITE_ONCE(gc->shrinker_hits, h + 1);
queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
no_items = false; no_items = false;
} }
} }
......
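
The xfs_icache.c hunks above convert the per-CPU inodegc context from an immediate work_struct to a delayed_work, so queueing can either batch inactivation behind a short delay or expedite it with a delay of zero. Below is a minimal sketch of that workqueue pattern; it is illustrative only and not part of this patch. The gc_state, gc_worker, gc_init, gc_queue and gc_teardown names are hypothetical, and only the workqueue calls themselves (INIT_DELAYED_WORK, mod_delayed_work, to_delayed_work, flush_delayed_work, cancel_delayed_work_sync) are taken from the hunks above.

/* Illustrative sketch only; kernel-style C using the standard workqueue API. */
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct gc_state {
	struct delayed_work	work;	/* was a plain work_struct before the conversion */
	int			items;	/* approximate count of queued objects */
};

static void gc_worker(struct work_struct *work)
{
	/* to_delayed_work() recovers the delayed_work embedding this work_struct */
	struct gc_state *gc = container_of(to_delayed_work(work),
					   struct gc_state, work);

	/* ... drain and process gc->items here ... */
}

static void gc_init(struct gc_state *gc)
{
	INIT_DELAYED_WORK(&gc->work, gc_worker);
}

static void gc_queue(struct gc_state *gc, bool urgent)
{
	/* A delay of 0 runs the worker as soon as possible; a delay of one
	 * jiffy lets more items accumulate first, matching queue_delay above.
	 */
	mod_delayed_work(system_wq, &gc->work, urgent ? 0 : 1);
}

static void gc_teardown(struct gc_state *gc)
{
	cancel_delayed_work_sync(&gc->work);
}

As in xfs_inodegc_queue(), a caller that needs the results promptly can follow gc_queue() with flush_delayed_work(&gc->work), while a caller that merely wants to expedite background processing passes a delay of zero and returns without waiting, which is the distinction between xfs_inodegc_flush() and the new xfs_inodegc_push().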
...@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp); ...@@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp);
void xfs_blockgc_start(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp);
void xfs_inodegc_worker(struct work_struct *work); void xfs_inodegc_worker(struct work_struct *work);
void xfs_inodegc_push(struct xfs_mount *mp);
void xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp);
......
...@@ -196,8 +196,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) ...@@ -196,8 +196,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y)
int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
char *data, unsigned int op); char *data, unsigned int op);
void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev,
struct completion *done);
#define ASSERT_ALWAYS(expr) \ #define ASSERT_ALWAYS(expr) \
(likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
......
...@@ -75,13 +75,12 @@ xlog_verify_iclog( ...@@ -75,13 +75,12 @@ xlog_verify_iclog(
STATIC void STATIC void
xlog_verify_tail_lsn( xlog_verify_tail_lsn(
struct xlog *log, struct xlog *log,
struct xlog_in_core *iclog, struct xlog_in_core *iclog);
xfs_lsn_t tail_lsn);
#else #else
#define xlog_verify_dest_ptr(a,b) #define xlog_verify_dest_ptr(a,b)
#define xlog_verify_grant_tail(a) #define xlog_verify_grant_tail(a)
#define xlog_verify_iclog(a,b,c) #define xlog_verify_iclog(a,b,c)
#define xlog_verify_tail_lsn(a,b,c) #define xlog_verify_tail_lsn(a,b)
#endif #endif
STATIC int STATIC int
...@@ -523,17 +522,28 @@ xlog_state_shutdown_callbacks( ...@@ -523,17 +522,28 @@ xlog_state_shutdown_callbacks(
/* /*
* Flush iclog to disk if this is the last reference to the given iclog and the * Flush iclog to disk if this is the last reference to the given iclog and the
* it is in the WANT_SYNC state. If the caller passes in a non-zero * it is in the WANT_SYNC state.
* @old_tail_lsn and the current log tail does not match, there may be metadata *
* on disk that must be persisted before this iclog is written. To satisfy that * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
* requirement, set the XLOG_ICL_NEED_FLUSH flag as a condition for writing this * log tail is updated correctly. NEED_FUA indicates that the iclog will be
* iclog with the new log tail value. * written to stable storage, and implies that a commit record is contained
* within the iclog. We need to ensure that the log tail does not move beyond
* the tail that the first commit record in the iclog ordered against, otherwise
* correct recovery of that checkpoint becomes dependent on future operations
* performed on this iclog.
*
* Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the
* current tail into iclog. Once the iclog tail is set, future operations must
* not modify it, otherwise they potentially violate ordering constraints for
* the checkpoint commit that wrote the initial tail lsn value. The tail lsn in
* the iclog will get zeroed on activation of the iclog after sync, so we
* always capture the tail lsn on the iclog on the first NEED_FUA release
* regardless of the number of active reference counts on this iclog.
*/ */
int int
xlog_state_release_iclog( xlog_state_release_iclog(
struct xlog *log, struct xlog *log,
struct xlog_in_core *iclog, struct xlog_in_core *iclog)
xfs_lsn_t old_tail_lsn)
{ {
xfs_lsn_t tail_lsn; xfs_lsn_t tail_lsn;
bool last_ref; bool last_ref;
...@@ -544,14 +554,14 @@ xlog_state_release_iclog( ...@@ -544,14 +554,14 @@ xlog_state_release_iclog(
/* /*
* Grabbing the current log tail needs to be atomic w.r.t. the writing * Grabbing the current log tail needs to be atomic w.r.t. the writing
* of the tail LSN into the iclog so we guarantee that the log tail does * of the tail LSN into the iclog so we guarantee that the log tail does
* not move between deciding if a cache flush is required and writing * not move between the first time we know that the iclog needs to be
* the LSN into the iclog below. * made stable and when we eventually submit it.
*/ */
if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) { if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
(iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
!iclog->ic_header.h_tail_lsn) {
tail_lsn = xlog_assign_tail_lsn(log->l_mp); tail_lsn = xlog_assign_tail_lsn(log->l_mp);
iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
if (old_tail_lsn && tail_lsn != old_tail_lsn)
iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
} }
last_ref = atomic_dec_and_test(&iclog->ic_refcnt); last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
...@@ -576,8 +586,7 @@ xlog_state_release_iclog( ...@@ -576,8 +586,7 @@ xlog_state_release_iclog(
} }
iclog->ic_state = XLOG_STATE_SYNCING; iclog->ic_state = XLOG_STATE_SYNCING;
iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); xlog_verify_tail_lsn(log, iclog);
xlog_verify_tail_lsn(log, iclog, tail_lsn);
trace_xlog_iclog_syncing(iclog, _RET_IP_); trace_xlog_iclog_syncing(iclog, _RET_IP_);
spin_unlock(&log->l_icloglock); spin_unlock(&log->l_icloglock);
...@@ -845,7 +854,7 @@ xlog_force_iclog( ...@@ -845,7 +854,7 @@ xlog_force_iclog(
iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
if (iclog->ic_state == XLOG_STATE_ACTIVE) if (iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
return xlog_state_release_iclog(iclog->ic_log, iclog, 0); return xlog_state_release_iclog(iclog->ic_log, iclog);
} }
/* /*
...@@ -2299,7 +2308,7 @@ xlog_write_copy_finish( ...@@ -2299,7 +2308,7 @@ xlog_write_copy_finish(
ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
xlog_is_shutdown(log)); xlog_is_shutdown(log));
release_iclog: release_iclog:
error = xlog_state_release_iclog(log, iclog, 0); error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock); spin_unlock(&log->l_icloglock);
return error; return error;
} }
...@@ -2516,7 +2525,7 @@ xlog_write( ...@@ -2516,7 +2525,7 @@ xlog_write(
spin_lock(&log->l_icloglock); spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
error = xlog_state_release_iclog(log, iclog, 0); error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock); spin_unlock(&log->l_icloglock);
return error; return error;
...@@ -2553,6 +2562,7 @@ xlog_state_activate_iclog( ...@@ -2553,6 +2562,7 @@ xlog_state_activate_iclog(
memset(iclog->ic_header.h_cycle_data, 0, memset(iclog->ic_header.h_cycle_data, 0,
sizeof(iclog->ic_header.h_cycle_data)); sizeof(iclog->ic_header.h_cycle_data));
iclog->ic_header.h_lsn = 0; iclog->ic_header.h_lsn = 0;
iclog->ic_header.h_tail_lsn = 0;
} }
/* /*
...@@ -2939,7 +2949,7 @@ xlog_state_get_iclog_space( ...@@ -2939,7 +2949,7 @@ xlog_state_get_iclog_space(
* reference to the iclog. * reference to the iclog.
*/ */
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
error = xlog_state_release_iclog(log, iclog, 0); error = xlog_state_release_iclog(log, iclog);
spin_unlock(&log->l_icloglock); spin_unlock(&log->l_icloglock);
if (error) if (error)
return error; return error;
...@@ -3581,9 +3591,9 @@ xlog_verify_grant_tail( ...@@ -3581,9 +3591,9 @@ xlog_verify_grant_tail(
STATIC void STATIC void
xlog_verify_tail_lsn( xlog_verify_tail_lsn(
struct xlog *log, struct xlog *log,
struct xlog_in_core *iclog, struct xlog_in_core *iclog)
xfs_lsn_t tail_lsn)
{ {
xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn);
int blocks; int blocks;
if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) {
......
...@@ -705,11 +705,21 @@ xlog_cil_set_ctx_write_state( ...@@ -705,11 +705,21 @@ xlog_cil_set_ctx_write_state(
* The LSN we need to pass to the log items on transaction * The LSN we need to pass to the log items on transaction
* commit is the LSN reported by the first log vector write, not * commit is the LSN reported by the first log vector write, not
* the commit lsn. If we use the commit record lsn then we can * the commit lsn. If we use the commit record lsn then we can
* move the tail beyond the grant write head. * move the grant write head beyond the tail LSN and overwrite
* it.
*/ */
ctx->start_lsn = lsn; ctx->start_lsn = lsn;
wake_up_all(&cil->xc_start_wait); wake_up_all(&cil->xc_start_wait);
spin_unlock(&cil->xc_push_lock); spin_unlock(&cil->xc_push_lock);
/*
* Make sure the metadata we are about to overwrite in the log
* has been flushed to stable storage before this iclog is
* issued.
*/
spin_lock(&cil->xc_log->l_icloglock);
iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
spin_unlock(&cil->xc_log->l_icloglock);
return; return;
} }
...@@ -888,10 +898,7 @@ xlog_cil_push_work( ...@@ -888,10 +898,7 @@ xlog_cil_push_work(
struct xfs_trans_header thdr; struct xfs_trans_header thdr;
struct xfs_log_iovec lhdr; struct xfs_log_iovec lhdr;
struct xfs_log_vec lvhdr = { NULL }; struct xfs_log_vec lvhdr = { NULL };
xfs_lsn_t preflush_tail_lsn;
xfs_csn_t push_seq; xfs_csn_t push_seq;
struct bio bio;
DECLARE_COMPLETION_ONSTACK(bdev_flush);
bool push_commit_stable; bool push_commit_stable;
new_ctx = xlog_cil_ctx_alloc(); new_ctx = xlog_cil_ctx_alloc();
...@@ -961,23 +968,6 @@ xlog_cil_push_work( ...@@ -961,23 +968,6 @@ xlog_cil_push_work(
list_add(&ctx->committing, &cil->xc_committing); list_add(&ctx->committing, &cil->xc_committing);
spin_unlock(&cil->xc_push_lock); spin_unlock(&cil->xc_push_lock);
/*
* The CIL is stable at this point - nothing new will be added to it
* because we hold the flush lock exclusively. Hence we can now issue
* a cache flush to ensure all the completed metadata in the journal we
* are about to overwrite is on stable storage.
*
* Because we are issuing this cache flush before we've written the
* tail lsn to the iclog, we can have metadata IO completions move the
* tail forwards between the completion of this flush and the iclog
* being written. In this case, we need to re-issue the cache flush
* before the iclog write. To detect whether the log tail moves, sample
* the tail LSN *before* we issue the flush.
*/
preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
&bdev_flush);
/* /*
* Pull all the log vectors off the items in the CIL, and remove the * Pull all the log vectors off the items in the CIL, and remove the
* items from the CIL. We don't need the CIL lock here because it's only * items from the CIL. We don't need the CIL lock here because it's only
...@@ -1054,12 +1044,6 @@ xlog_cil_push_work( ...@@ -1054,12 +1044,6 @@ xlog_cil_push_work(
lvhdr.lv_iovecp = &lhdr; lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain; lvhdr.lv_next = ctx->lv_chain;
/*
* Before we format and submit the first iclog, we have to ensure that
* the metadata writeback ordering cache flush is complete.
*/
wait_for_completion(&bdev_flush);
error = xlog_cil_write_chain(ctx, &lvhdr); error = xlog_cil_write_chain(ctx, &lvhdr);
if (error) if (error)
goto out_abort_free_ticket; goto out_abort_free_ticket;
...@@ -1118,7 +1102,7 @@ xlog_cil_push_work( ...@@ -1118,7 +1102,7 @@ xlog_cil_push_work(
if (push_commit_stable && if (push_commit_stable &&
ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn); xlog_state_release_iclog(log, ctx->commit_iclog);
/* Not safe to reference ctx now! */ /* Not safe to reference ctx now! */
...@@ -1139,7 +1123,7 @@ xlog_cil_push_work( ...@@ -1139,7 +1123,7 @@ xlog_cil_push_work(
return; return;
} }
spin_lock(&log->l_icloglock); spin_lock(&log->l_icloglock);
xlog_state_release_iclog(log, ctx->commit_iclog, 0); xlog_state_release_iclog(log, ctx->commit_iclog);
/* Not safe to reference ctx now! */ /* Not safe to reference ctx now! */
spin_unlock(&log->l_icloglock); spin_unlock(&log->l_icloglock);
} }
......
...@@ -48,6 +48,16 @@ enum xlog_iclog_state { ...@@ -48,6 +48,16 @@ enum xlog_iclog_state {
{ XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \ { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \
{ XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" } { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }
/*
* In core log flags
*/
#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */
#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */
#define XLOG_ICL_STRINGS \
{ XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \
{ XLOG_ICL_NEED_FUA, "XLOG_ICL_NEED_FUA" }
/* /*
* Log ticket flags * Log ticket flags
...@@ -132,9 +142,6 @@ enum xlog_iclog_state { ...@@ -132,9 +142,6 @@ enum xlog_iclog_state {
#define XLOG_COVER_OPS 5 #define XLOG_COVER_OPS 5
#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */
#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */
/* Ticket reservation region accounting */ /* Ticket reservation region accounting */
#define XLOG_TIC_LEN_MAX 15 #define XLOG_TIC_LEN_MAX 15
...@@ -511,8 +518,7 @@ void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); ...@@ -511,8 +518,7 @@ void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
int eventual_size); int eventual_size);
int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
xfs_lsn_t log_tail_lsn);
/* /*
* When we crack an atomic LSN, we sample it first so that the value will not * When we crack an atomic LSN, we sample it first so that the value will not
......
...@@ -60,7 +60,7 @@ struct xfs_error_cfg { ...@@ -60,7 +60,7 @@ struct xfs_error_cfg {
*/ */
struct xfs_inodegc { struct xfs_inodegc {
struct llist_head list; struct llist_head list;
struct work_struct work; struct delayed_work work;
/* approximate count of inodes in the list */ /* approximate count of inodes in the list */
unsigned int items; unsigned int items;
......
...@@ -481,9 +481,12 @@ xfs_qm_scall_getquota( ...@@ -481,9 +481,12 @@ xfs_qm_scall_getquota(
struct xfs_dquot *dqp; struct xfs_dquot *dqp;
int error; int error;
/* Flush inodegc work at the start of a quota reporting scan. */ /*
* Expedite pending inodegc work at the start of a quota reporting
* scan but don't block waiting for it to complete.
*/
if (id == 0) if (id == 0)
xfs_inodegc_flush(mp); xfs_inodegc_push(mp);
/* /*
* Try to get the dquot. We don't want it allocated on disk, so don't * Try to get the dquot. We don't want it allocated on disk, so don't
...@@ -525,7 +528,7 @@ xfs_qm_scall_getquota_next( ...@@ -525,7 +528,7 @@ xfs_qm_scall_getquota_next(
/* Flush inodegc work at the start of a quota reporting scan. */ /* Flush inodegc work at the start of a quota reporting scan. */
if (*id == 0) if (*id == 0)
xfs_inodegc_flush(mp); xfs_inodegc_push(mp);
error = xfs_qm_dqget_next(mp, *id, type, &dqp); error = xfs_qm_dqget_next(mp, *id, type, &dqp);
if (error) if (error)
......
...@@ -789,8 +789,11 @@ xfs_fs_statfs( ...@@ -789,8 +789,11 @@ xfs_fs_statfs(
xfs_extlen_t lsize; xfs_extlen_t lsize;
int64_t ffree; int64_t ffree;
/* Wait for whatever inactivations are in progress. */ /*
xfs_inodegc_flush(mp); * Expedite background inodegc but don't wait. We do not want to block
* here waiting hours for a billion extent file to be truncated.
*/
xfs_inodegc_push(mp);
statp->f_type = XFS_SUPER_MAGIC; statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1; statp->f_namelen = MAXNAMELEN - 1;
...@@ -1101,7 +1104,7 @@ xfs_inodegc_init_percpu( ...@@ -1101,7 +1104,7 @@ xfs_inodegc_init_percpu(
gc = per_cpu_ptr(mp->m_inodegc, cpu); gc = per_cpu_ptr(mp->m_inodegc, cpu);
init_llist_head(&gc->list); init_llist_head(&gc->list);
gc->items = 0; gc->items = 0;
INIT_WORK(&gc->work, xfs_inodegc_worker); INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
} }
return 0; return 0;
} }
...@@ -1547,6 +1550,38 @@ xfs_fc_fill_super( ...@@ -1547,6 +1550,38 @@ xfs_fc_fill_super(
#endif #endif
} }
/*
* Don't touch the filesystem if a user tool thinks it owns the primary
* superblock. mkfs doesn't clear the flag from secondary supers, so
* we don't check them at all.
*/
if (mp->m_sb.sb_inprogress) {
xfs_warn(mp, "Offline file system operation in progress!");
error = -EFSCORRUPTED;
goto out_free_sb;
}
/*
* Until this is fixed only page-sized or smaller data blocks work.
*/
if (mp->m_sb.sb_blocksize > PAGE_SIZE) {
xfs_warn(mp,
"File system with blocksize %d bytes. "
"Only pagesize (%ld) or less will currently work.",
mp->m_sb.sb_blocksize, PAGE_SIZE);
error = -ENOSYS;
goto out_free_sb;
}
/* Ensure this filesystem fits in the page cache limits */
if (xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_dblocks) ||
xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_rblocks)) {
xfs_warn(mp,
"file system too large to be mounted on this system.");
error = -EFBIG;
goto out_free_sb;
}
/* /*
* XFS block mappings use 54 bits to store the logical block offset. * XFS block mappings use 54 bits to store the logical block offset.
* This should suffice to handle the maximum file size that the VFS * This should suffice to handle the maximum file size that the VFS
......
...@@ -187,6 +187,7 @@ DEFINE_EVENT(xfs_fs_class, name, \ ...@@ -187,6 +187,7 @@ DEFINE_EVENT(xfs_fs_class, name, \
TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \
TP_ARGS(mp, caller_ip)) TP_ARGS(mp, caller_ip))
DEFINE_FS_EVENT(xfs_inodegc_flush); DEFINE_FS_EVENT(xfs_inodegc_flush);
DEFINE_FS_EVENT(xfs_inodegc_push);
DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_start);
DEFINE_FS_EVENT(xfs_inodegc_stop); DEFINE_FS_EVENT(xfs_inodegc_stop);
DEFINE_FS_EVENT(xfs_inodegc_queue); DEFINE_FS_EVENT(xfs_inodegc_queue);
...@@ -4010,6 +4011,7 @@ DECLARE_EVENT_CLASS(xlog_iclog_class, ...@@ -4010,6 +4011,7 @@ DECLARE_EVENT_CLASS(xlog_iclog_class,
__field(uint32_t, state) __field(uint32_t, state)
__field(int32_t, refcount) __field(int32_t, refcount)
__field(uint32_t, offset) __field(uint32_t, offset)
__field(uint32_t, flags)
__field(unsigned long long, lsn) __field(unsigned long long, lsn)
__field(unsigned long, caller_ip) __field(unsigned long, caller_ip)
), ),
...@@ -4018,15 +4020,17 @@ DECLARE_EVENT_CLASS(xlog_iclog_class, ...@@ -4018,15 +4020,17 @@ DECLARE_EVENT_CLASS(xlog_iclog_class,
__entry->state = iclog->ic_state; __entry->state = iclog->ic_state;
__entry->refcount = atomic_read(&iclog->ic_refcnt); __entry->refcount = atomic_read(&iclog->ic_refcnt);
__entry->offset = iclog->ic_offset; __entry->offset = iclog->ic_offset;
__entry->flags = iclog->ic_flags;
__entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn); __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn);
__entry->caller_ip = caller_ip; __entry->caller_ip = caller_ip;
), ),
TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx caller %pS", TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->dev), MINOR(__entry->dev),
__print_symbolic(__entry->state, XLOG_STATE_STRINGS), __print_symbolic(__entry->state, XLOG_STATE_STRINGS),
__entry->refcount, __entry->refcount,
__entry->offset, __entry->offset,
__entry->lsn, __entry->lsn,
__print_flags(__entry->flags, "|", XLOG_ICL_STRINGS),
(char *)__entry->caller_ip) (char *)__entry->caller_ip)
); );
......
...@@ -38,7 +38,7 @@ xfs_trans_buf_item_match( ...@@ -38,7 +38,7 @@ xfs_trans_buf_item_match(
blip = (struct xfs_buf_log_item *)lip; blip = (struct xfs_buf_log_item *)lip;
if (blip->bli_item.li_type == XFS_LI_BUF && if (blip->bli_item.li_type == XFS_LI_BUF &&
blip->bli_buf->b_target == target && blip->bli_buf->b_target == target &&
XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn &&
blip->bli_buf->b_length == len) { blip->bli_buf->b_length == len) {
ASSERT(blip->bli_buf->b_map_count == nmaps); ASSERT(blip->bli_buf->b_map_count == nmaps);
return blip->bli_buf; return blip->bli_buf;
......
...@@ -1090,7 +1090,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, ...@@ -1090,7 +1090,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type,
unsigned nr_args); unsigned nr_args);
static void io_clean_op(struct io_kiocb *req); static void io_clean_op(struct io_kiocb *req);
static struct file *io_file_get(struct io_ring_ctx *ctx, static struct file *io_file_get(struct io_ring_ctx *ctx,
struct io_kiocb *req, int fd, bool fixed); struct io_kiocb *req, int fd, bool fixed,
unsigned int issue_flags);
static void __io_queue_sqe(struct io_kiocb *req); static void __io_queue_sqe(struct io_kiocb *req);
static void io_rsrc_put_work(struct work_struct *work); static void io_rsrc_put_work(struct work_struct *work);
...@@ -3914,7 +3915,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) ...@@ -3914,7 +3915,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN; return -EAGAIN;
in = io_file_get(req->ctx, req, sp->splice_fd_in, in = io_file_get(req->ctx, req, sp->splice_fd_in,
(sp->flags & SPLICE_F_FD_IN_FIXED)); (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
if (!in) { if (!in) {
ret = -EBADF; ret = -EBADF;
goto done; goto done;
...@@ -3954,7 +3955,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) ...@@ -3954,7 +3955,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags)
return -EAGAIN; return -EAGAIN;
in = io_file_get(req->ctx, req, sp->splice_fd_in, in = io_file_get(req->ctx, req, sp->splice_fd_in,
(sp->flags & SPLICE_F_FD_IN_FIXED)); (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags);
if (!in) { if (!in) {
ret = -EBADF; ret = -EBADF;
goto done; goto done;
...@@ -6742,13 +6743,16 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file ...@@ -6742,13 +6743,16 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file
} }
static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
struct io_kiocb *req, int fd) struct io_kiocb *req, int fd,
unsigned int issue_flags)
{ {
struct file *file; struct file *file = NULL;
unsigned long file_ptr; unsigned long file_ptr;
io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
if (unlikely((unsigned int)fd >= ctx->nr_user_files)) if (unlikely((unsigned int)fd >= ctx->nr_user_files))
return NULL; goto out;
fd = array_index_nospec(fd, ctx->nr_user_files); fd = array_index_nospec(fd, ctx->nr_user_files);
file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr;
file = (struct file *) (file_ptr & FFS_MASK); file = (struct file *) (file_ptr & FFS_MASK);
...@@ -6756,6 +6760,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, ...@@ -6756,6 +6760,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx,
/* mask in overlapping REQ_F and FFS bits */ /* mask in overlapping REQ_F and FFS bits */
req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT);
io_req_set_rsrc_node(req); io_req_set_rsrc_node(req);
out:
io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK));
return file; return file;
} }
...@@ -6773,10 +6779,11 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx, ...@@ -6773,10 +6779,11 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx,
} }
static inline struct file *io_file_get(struct io_ring_ctx *ctx, static inline struct file *io_file_get(struct io_ring_ctx *ctx,
struct io_kiocb *req, int fd, bool fixed) struct io_kiocb *req, int fd, bool fixed,
unsigned int issue_flags)
{ {
if (fixed) if (fixed)
return io_file_get_fixed(ctx, req, fd); return io_file_get_fixed(ctx, req, fd, issue_flags);
else else
return io_file_get_normal(ctx, req, fd); return io_file_get_normal(ctx, req, fd);
} }
...@@ -6998,7 +7005,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, ...@@ -6998,7 +7005,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
if (io_op_defs[req->opcode].needs_file) { if (io_op_defs[req->opcode].needs_file) {
req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd),
(sqe_flags & IOSQE_FIXED_FILE)); (sqe_flags & IOSQE_FIXED_FILE),
IO_URING_F_NONBLOCK);
if (unlikely(!req->file)) if (unlikely(!req->file))
ret = -EBADF; ret = -EBADF;
} }
......
...@@ -350,7 +350,8 @@ asmlinkage __visible void do_softirq(void) ...@@ -350,7 +350,8 @@ asmlinkage __visible void do_softirq(void)
*/ */
void irq_enter_rcu(void) void irq_enter_rcu(void)
{ {
if (is_idle_task(current) && !in_interrupt()) { if (tick_nohz_full_cpu(smp_processor_id()) ||
(is_idle_task(current) && !in_interrupt())) {
/* /*
* Prevent raise_softirq from needlessly waking up ksoftirqd * Prevent raise_softirq from needlessly waking up ksoftirqd
* here, as softirq will be serviced on return from interrupt. * here, as softirq will be serviced on return from interrupt.
......
...@@ -1347,6 +1347,13 @@ static inline void tick_nohz_irq_enter(void) ...@@ -1347,6 +1347,13 @@ static inline void tick_nohz_irq_enter(void)
now = ktime_get(); now = ktime_get();
if (ts->idle_active) if (ts->idle_active)
tick_nohz_stop_idle(ts, now); tick_nohz_stop_idle(ts, now);
/*
* If all CPUs are idle. We may need to update a stale jiffies value.
* Note nohz_full is a special case: a timekeeper is guaranteed to stay
* alive but it might be busy looping with interrupts disabled in some
* rare case (typically stop machine). So we must make sure we have a
* last resort.
*/
if (ts->tick_stopped) if (ts->tick_stopped)
tick_nohz_update_jiffies(now); tick_nohz_update_jiffies(now);
} }
......
...@@ -529,5 +529,5 @@ void reliable_report_usage(struct seq_file *m, struct mm_struct *mm) ...@@ -529,5 +529,5 @@ void reliable_report_usage(struct seq_file *m, struct mm_struct *mm)
return; return;
SEQ_PUT_DEC("Reliable:\t", atomic_long_read(&mm->reliable_nr_page)); SEQ_PUT_DEC("Reliable:\t", atomic_long_read(&mm->reliable_nr_page));
seq_puts(m, "kB\n"); seq_puts(m, " kB\n");
} }
...@@ -392,6 +392,7 @@ enum rxrpc_conn_proto_state { ...@@ -392,6 +392,7 @@ enum rxrpc_conn_proto_state {
struct rxrpc_bundle { struct rxrpc_bundle {
struct rxrpc_conn_parameters params; struct rxrpc_conn_parameters params;
atomic_t usage; atomic_t usage;
atomic_t active; /* Number of active users */
unsigned int debug_id; unsigned int debug_id;
bool try_upgrade; /* True if the bundle is attempting upgrade */ bool try_upgrade; /* True if the bundle is attempting upgrade */
bool alloc_conn; /* True if someone's getting a conn */ bool alloc_conn; /* True if someone's getting a conn */
......
...@@ -40,6 +40,8 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; ...@@ -40,6 +40,8 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ;
DEFINE_IDR(rxrpc_client_conn_ids); DEFINE_IDR(rxrpc_client_conn_ids);
static DEFINE_SPINLOCK(rxrpc_conn_id_lock); static DEFINE_SPINLOCK(rxrpc_conn_id_lock);
static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle);
/* /*
* Get a connection ID and epoch for a client connection from the global pool. * Get a connection ID and epoch for a client connection from the global pool.
* The connection struct pointer is then recorded in the idr radix tree. The * The connection struct pointer is then recorded in the idr radix tree. The
...@@ -123,6 +125,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, ...@@ -123,6 +125,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp,
bundle->params = *cp; bundle->params = *cp;
rxrpc_get_peer(bundle->params.peer); rxrpc_get_peer(bundle->params.peer);
atomic_set(&bundle->usage, 1); atomic_set(&bundle->usage, 1);
atomic_set(&bundle->active, 1);
spin_lock_init(&bundle->channel_lock); spin_lock_init(&bundle->channel_lock);
INIT_LIST_HEAD(&bundle->waiting_calls); INIT_LIST_HEAD(&bundle->waiting_calls);
} }
...@@ -341,6 +344,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c ...@@ -341,6 +344,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c
rxrpc_free_bundle(candidate); rxrpc_free_bundle(candidate);
found_bundle: found_bundle:
rxrpc_get_bundle(bundle); rxrpc_get_bundle(bundle);
atomic_inc(&bundle->active);
spin_unlock(&local->client_bundles_lock); spin_unlock(&local->client_bundles_lock);
_leave(" = %u [found]", bundle->debug_id); _leave(" = %u [found]", bundle->debug_id);
return bundle; return bundle;
...@@ -438,6 +442,7 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) ...@@ -438,6 +442,7 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp)
if (old) if (old)
trace_rxrpc_client(old, -1, rxrpc_client_replace); trace_rxrpc_client(old, -1, rxrpc_client_replace);
candidate->bundle_shift = shift; candidate->bundle_shift = shift;
atomic_inc(&bundle->active);
bundle->conns[i] = candidate; bundle->conns[i] = candidate;
for (j = 0; j < RXRPC_MAXCALLS; j++) for (j = 0; j < RXRPC_MAXCALLS; j++)
set_bit(shift + j, &bundle->avail_chans); set_bit(shift + j, &bundle->avail_chans);
...@@ -728,6 +733,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, ...@@ -728,6 +733,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
smp_rmb(); smp_rmb();
out_put_bundle: out_put_bundle:
rxrpc_deactivate_bundle(bundle);
rxrpc_put_bundle(bundle); rxrpc_put_bundle(bundle);
out: out:
_leave(" = %d", ret); _leave(" = %d", ret);
...@@ -903,9 +909,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call ...@@ -903,9 +909,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call
static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
{ {
struct rxrpc_bundle *bundle = conn->bundle; struct rxrpc_bundle *bundle = conn->bundle;
struct rxrpc_local *local = bundle->params.local;
unsigned int bindex; unsigned int bindex;
bool need_drop = false, need_put = false; bool need_drop = false;
int i; int i;
_enter("C=%x", conn->debug_id); _enter("C=%x", conn->debug_id);
...@@ -924,15 +929,22 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) ...@@ -924,15 +929,22 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
} }
spin_unlock(&bundle->channel_lock); spin_unlock(&bundle->channel_lock);
/* If there are no more connections, remove the bundle */ if (need_drop) {
if (!bundle->avail_chans) { rxrpc_deactivate_bundle(bundle);
_debug("maybe unbundle"); rxrpc_put_connection(conn);
spin_lock(&local->client_bundles_lock); }
}
for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) /*
if (bundle->conns[i]) * Drop the active count on a bundle.
break; */
if (i == ARRAY_SIZE(bundle->conns) && !bundle->params.exclusive) { static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle)
{
struct rxrpc_local *local = bundle->params.local;
bool need_put = false;
if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) {
if (!bundle->params.exclusive) {
_debug("erase bundle"); _debug("erase bundle");
rb_erase(&bundle->local_node, &local->client_bundles); rb_erase(&bundle->local_node, &local->client_bundles);
need_put = true; need_put = true;
...@@ -942,10 +954,6 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) ...@@ -942,10 +954,6 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn)
if (need_put) if (need_put)
rxrpc_put_bundle(bundle); rxrpc_put_bundle(bundle);
} }
if (need_drop)
rxrpc_put_connection(conn);
_leave("");
} }
/* /*
......
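
The rxrpc hunks above split the bundle's single reference count into a lifetime count (usage) and an activity count (active): connection slots and connecting calls hold active, and only the final drop of active, taken while holding client_bundles_lock, unlinks the bundle from the local endpoint's tree. A rough sketch of that two-counter pattern follows; it is illustrative only, the tracked_obj names are hypothetical, and only the atomic and rbtree primitives (atomic_inc, atomic_dec_and_test, atomic_dec_and_lock, rb_erase) come from the hunks above.

/* Illustrative sketch only; not part of this patch. */
#include <linux/atomic.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct tracked_obj {
	atomic_t	usage;		/* keeps the memory alive */
	atomic_t	active;		/* keeps the object reachable in the tree */
	struct rb_node	node;
	struct rb_root	*tree;
	spinlock_t	*tree_lock;
};

static void tracked_obj_get(struct tracked_obj *obj)
{
	atomic_inc(&obj->usage);	/* plain reference */
}

static void tracked_obj_activate(struct tracked_obj *obj)
{
	atomic_inc(&obj->active);	/* e.g. a connection slot was filled */
}

static void tracked_obj_put(struct tracked_obj *obj)
{
	if (atomic_dec_and_test(&obj->usage))
		kfree(obj);
}

static void tracked_obj_deactivate(struct tracked_obj *obj)
{
	/* Only the final active drop, made while holding the tree lock via
	 * atomic_dec_and_lock(), unlinks the object, so a concurrent lookup
	 * either finds a fully usable object or none at all.
	 */
	if (atomic_dec_and_lock(&obj->active, obj->tree_lock)) {
		rb_erase(&obj->node, obj->tree);
		spin_unlock(obj->tree_lock);
		tracked_obj_put(obj);	/* drop the tree's reference */
	}
}

Basing the unlink decision on an explicit active count rather than on the bundle appearing empty is what closes the lookup-versus-removal window addressed above, where rxrpc_unbundle_conn() could erase a bundle that rxrpc_look_up_bundle() had just found and was still activating.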