diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c index f2a5af239c9569b23d3629accf2bae7327fa2678..f5d3cf86753f76cc683895d5521b25af3264fc86 100644 --- a/drivers/hwmon/xgene-hwmon.c +++ b/drivers/hwmon/xgene-hwmon.c @@ -768,6 +768,7 @@ static int xgene_hwmon_remove(struct platform_device *pdev) { struct xgene_hwmon_dev *ctx = platform_get_drvdata(pdev); + cancel_work_sync(&ctx->workq); hwmon_device_unregister(ctx->hwmon_dev); kfifo_free(&ctx->async_msg_fifo); if (acpi_disabled) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 5b0a640eccf673c4178cb437d5f74a4976c2cbae..67b7ef45eb834f1b1df37a4a78b5e9bd8bd3af84 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1690,6 +1690,20 @@ void bond_lower_state_changed(struct slave *slave) netdev_lower_state_changed(slave->dev, &info); } +/* The bonding driver uses ether_setup() to convert a master bond device + * to ARPHRD_ETHER, that resets the target netdevice's flags so we always + * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP + * if they were set + */ +static void bond_ether_setup(struct net_device *bond_dev) +{ + unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP); + + ether_setup(bond_dev); + bond_dev->flags |= IFF_MASTER | flags; + bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; +} + /* enslave device to bond device */ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) @@ -1784,10 +1798,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, if (slave_dev->type != ARPHRD_ETHER) bond_setup_by_slave(bond_dev, slave_dev); - else { - ether_setup(bond_dev); - bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; - } + else + bond_ether_setup(bond_dev); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, bond_dev); @@ -2174,9 +2186,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, eth_hw_addr_random(bond_dev); if (bond_dev->type != ARPHRD_ETHER) { dev_close(bond_dev); - ether_setup(bond_dev); - bond_dev->flags |= IFF_MASTER; - bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; + bond_ether_setup(bond_dev); } } diff --git a/drivers/power/supply/da9150-charger.c b/drivers/power/supply/da9150-charger.c index f9314cc0cd75ff19fcb7cc537116de40fb944414..6b987da586556e0acd38cfbe7aa766da931626d5 100644 --- a/drivers/power/supply/da9150-charger.c +++ b/drivers/power/supply/da9150-charger.c @@ -662,6 +662,7 @@ static int da9150_charger_remove(struct platform_device *pdev) if (!IS_ERR_OR_NULL(charger->usb_phy)) usb_unregister_notifier(charger->usb_phy, &charger->otg_nb); + cancel_work_sync(&charger->otg_work); power_supply_unregister(charger->battery); power_supply_unregister(charger->usb); diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index a43e4c50e69b7ba9bd828e742125c2b75f4334d3..ef2b8ee87b8d3f291e361dbf19023e2ddf212dcb 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -90,7 +90,7 @@ xfs_allocbt_free_block( xfs_agblock_t bno; int error; - bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index cc016776d21e1616cd7809a28d605a1350e85bab..6da91abb129bda152aa639cb16a447e14acddde2 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -1313,7 +1313,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->disk_blkno = xfs_buf_daddr(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1328,7 +1328,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->disk_blkno = xfs_buf_daddr(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ea5a9a93948cd56dca97358619857543d20b4c4c..243099a4f6470e16b8ab4bd52de9bf376e150702 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -239,7 +239,7 @@ xfs_bmap_get_bp( for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { if (!cur->bc_bufs[i]) break; - if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + if (xfs_buf_daddr(cur->bc_bufs[i]) == bno) return cur->bc_bufs[i]; } @@ -248,7 +248,7 @@ xfs_bmap_get_bp( struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip; if (bip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_ADDR(bip->bli_buf) == bno) + xfs_buf_daddr(bip->bli_buf) == bno) return bip->bli_buf; } diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index c71741e2857c85f26bfaeb7d096da1e28fe79d9e..b48b38c78b55dc6d4b9ce93fa65429e8278b1ecf 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -282,7 +282,7 @@ xfs_bmbt_free_block( struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_trans *tp = cur->bc_tp; - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); struct xfs_owner_info oinfo; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 7c4d3fd47950408c2d08c28549d514c4ee34ab2e..748a21f52decef7da4c1296c2d0c0100ce74e970 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -50,6 +50,71 @@ xfs_btree_magic( return magic; } +/* + * These sibling pointer checks are optimised for null sibling pointers. This + * happens a lot, and we don't need to byte swap at runtime if the sibling + * pointer is NULL. + * + * These are explicitly marked at inline because the cost of calling them as + * functions instead of inlining them is about 36 bytes extra code per call site + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these + * two sibling check functions reduces the compiled code size by over 300 + * bytes. + */ +static inline xfs_failaddr_t +xfs_btree_check_lblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_fsblock_t fsb, + __be64 dsibling) +{ + xfs_fsblock_t sibling; + + if (dsibling == cpu_to_be64(NULLFSBLOCK)) + return NULL; + + sibling = be64_to_cpu(dsibling); + if (sibling == fsb) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_lptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_fsbno(mp, sibling)) + return __this_address; + } + + return NULL; +} + +static inline xfs_failaddr_t +xfs_btree_check_sblock_siblings( + struct xfs_mount *mp, + struct xfs_btree_cur *cur, + int level, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + __be32 dsibling) +{ + xfs_agblock_t sibling; + + if (dsibling == cpu_to_be32(NULLAGBLOCK)) + return NULL; + + sibling = be32_to_cpu(dsibling); + if (sibling == agbno) + return __this_address; + if (level >= 0) { + if (!xfs_btree_check_sptr(cur, sibling, level + 1)) + return __this_address; + } else { + if (!xfs_verify_agbno(mp, agno, sibling)) + return __this_address; + } + return NULL; +} + /* * Check a long btree block header. Return the address of the failing check, * or NULL if everything is ok. @@ -64,6 +129,8 @@ __xfs_btree_check_lblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_sb_version_hascrc(&mp->m_sb); + xfs_failaddr_t fa; + xfs_fsblock_t fsb = NULLFSBLOCK; if (crc) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -82,16 +149,16 @@ __xfs_btree_check_lblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_btree_check_lptr(cur, be64_to_cpu(block->bb_u.l.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /* Check a long btree block header. */ @@ -129,6 +196,9 @@ __xfs_btree_check_sblock( struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; int crc = xfs_sb_version_hascrc(&mp->m_sb); + xfs_failaddr_t fa; + xfs_agblock_t agbno = NULLAGBLOCK; + xfs_agnumber_t agno = NULLAGNUMBER; if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) @@ -145,16 +215,18 @@ __xfs_btree_check_sblock( if (be16_to_cpu(block->bb_numrecs) > cur->bc_ops->get_maxrecs(cur, level)) return __this_address; - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_leftsib), - level + 1)) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_btree_check_sptr(cur, be32_to_cpu(block->bb_u.s.bb_rightsib), - level + 1)) - return __this_address; - return NULL; + if (bp) { + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); + } + + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, + agbno, block->bb_u.s.bb_rightsib); + return fa; } /* Check a short btree block header. */ @@ -423,7 +495,7 @@ xfs_btree_dup_cursor( bp = cur->bc_bufs[i]; if (bp) { error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, + xfs_buf_daddr(bp), mp->m_bsize, 0, &bp, cur->bc_ops->buf_ops); if (error) { @@ -1195,10 +1267,10 @@ xfs_btree_buf_to_ptr( { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, - XFS_BUF_ADDR(bp))); + xfs_buf_daddr(bp))); else { ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, - XFS_BUF_ADDR(bp))); + xfs_buf_daddr(bp))); } } @@ -1742,7 +1814,7 @@ xfs_btree_lookup_get_block( error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); if (error) return error; - if (bp && XFS_BUF_ADDR(bp) == daddr) { + if (bp && xfs_buf_daddr(bp) == daddr) { *blkp = XFS_BUF_TO_BLOCK(bp); return 0; } @@ -3204,7 +3276,7 @@ xfs_btree_insrec( struct xfs_btree_block *block; /* btree block */ struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ - struct xfs_btree_cur *ncur; /* new btree cursor */ + struct xfs_btree_cur *ncur = NULL; /* new btree cursor */ union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ @@ -3284,7 +3356,7 @@ xfs_btree_insrec( #ifdef DEBUG error = xfs_btree_check_block(cur, block, level, bp); if (error) - return error; + goto error0; #endif /* @@ -3304,7 +3376,7 @@ xfs_btree_insrec( for (i = numrecs - ptr; i >= 0; i--) { error = xfs_btree_debug_check_ptr(cur, pp, i, level); if (error) - return error; + goto error0; } xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1); @@ -3389,6 +3461,8 @@ xfs_btree_insrec( return 0; error0: + if (ncur) + xfs_btree_del_cursor(ncur, error); return error; } @@ -4281,6 +4355,21 @@ xfs_btree_visit_block( if (xfs_btree_ptr_is_null(cur, &rptr)) return -ENOENT; + /* + * We only visit blocks once in this walk, so we have to avoid the + * internal xfs_btree_lookup_get_block() optimisation where it will + * return the same block without checking if the right sibling points + * back to us and creates a cyclic reference in the btree. + */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } else { + if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp, + xfs_buf_daddr(bp))) + return -EFSCORRUPTED; + } return xfs_btree_lookup_get_block(cur, level, &rptr, &block); } @@ -4455,20 +4544,21 @@ xfs_btree_lblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + xfs_fsblock_t fsb; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))) - return __this_address; - if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) && - !xfs_verify_fsbno(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))) - return __this_address; - - return NULL; + fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_leftsib); + if (!fa) + fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, + block->bb_u.l.bb_rightsib); + return fa; } /** @@ -4509,22 +4599,23 @@ xfs_btree_sblock_verify( { struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - xfs_agblock_t agno; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + xfs_failaddr_t fa; /* numrecs verification */ if (be16_to_cpu(block->bb_numrecs) > max_recs) return __this_address; /* sibling pointer verification */ - agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); - if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) - return __this_address; - if (block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK) && - !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_rightsib))) - return __this_address; - - return NULL; + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); + agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); + fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + block->bb_u.s.bb_leftsib); + if (!fa) + fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, + block->bb_u.s.bb_rightsib); + return fa; } /* diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index cc919a2ee870f09c00e579eeae95bccb98338321..5a0e4d70f0b49ca624b35c48bc52605848178059 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -156,7 +156,7 @@ __xfs_inobt_free_block( { xfs_inobt_mod_blockcount(cur, -1); return xfs_free_extent(cur->bc_tp, - XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, + XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1, &XFS_RMAP_OINFO_INOBT, resv); } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 3e0f5741d2d37f2dca5e79f091efc9b48a05a0b5..4b247577fdc6b1a505141ea8723226c140295cdb 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -48,7 +48,7 @@ xfs_inode_buf_verify( /* * Validate the magic number and version of every inode in the buffer */ - agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { int di_ok; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index a6ac60ae94216589f0afe7a78a86ac24e82863ae..b2e0407c00685b66db33e752b08ba7cae784adba 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -102,7 +102,7 @@ xfs_refcountbt_free_block( struct xfs_mount *mp = cur->bc_mp; struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); int error; trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.agno, diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 9f5bcbd834c392f0abf617e3a50466225cb2cebf..ab06af29d9db2a45a3a88596abb2331e1e26893f 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -124,7 +124,7 @@ xfs_rmapbt_free_block( xfs_agblock_t bno; int error; - bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); trace_xfs_rmapbt_free_block(cur->bc_mp, cur->bc_ag.agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 1296cff23a3b30793773371c9efb89efc7ad8cd2..836bbcdc88bdce10c004b7693671098686ea8a11 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -156,7 +156,7 @@ xfs_validate_sb_write( * secondary superblocks, so allow this usage to continue because * we never read counters from such superblocks. */ - if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && + if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && (sbp->sb_fdblocks > sbp->sb_dblocks || !xfs_verify_icount(mp, sbp->sb_icount) || sbp->sb_ifree > sbp->sb_icount)) { @@ -382,17 +382,6 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } - /* - * Until this is fixed only page-sized or smaller data blocks work. - */ - if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - xfs_warn(mp, - "File system with blocksize %d bytes. " - "Only pagesize (%ld) or less will currently work.", - sbp->sb_blocksize, PAGE_SIZE); - return -ENOSYS; - } - /* * Currently only very few inode sizes are supported. */ @@ -408,22 +397,6 @@ xfs_validate_sb_common( return -ENOSYS; } - if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || - xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { - xfs_warn(mp, - "file system too large to be mounted on this system."); - return -EFBIG; - } - - /* - * Don't touch the filesystem if a user tool thinks it owns the primary - * superblock. mkfs doesn't clear the flag from secondary supers, so - * we don't check them at all. - */ - if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && sbp->sb_inprogress) { - xfs_warn(mp, "Offline file system operation in progress!"); - return -EFSCORRUPTED; - } return 0; } diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 043984245a6d3cee5c22209139f92f7f050af4c3..5c962b9d80e0bc085b9380afd9921919334a93ac 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -435,12 +435,12 @@ xchk_btree_check_owner( if (!co) return -ENOMEM; co->level = level; - co->daddr = XFS_BUF_ADDR(bp); + co->daddr = xfs_buf_daddr(bp); list_add_tail(&co->list, &bs->to_check); return 0; } - return xchk_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp)); + return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp)); } /* diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 0be9a3567c957852d80802f3e6eb2fad57b54126..d9c7ff1469c845de7e8267e9db3571fd67089e00 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -178,7 +178,7 @@ xfs_attr3_node_inactive( return error; /* save for re-read later */ - child_blkno = XFS_BUF_ADDR(child_bp); + child_blkno = xfs_buf_daddr(child_bp); /* * Invalidate the subtree, however we have to. diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index b20a9b639acb35a0d8314735710426701edb75bd..e2148f2d5d6bc048ebc666fc6ada98fb13fe5f06 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(unsigned int count) return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES); } -static void -xfs_flush_bdev_async_endio( - struct bio *bio) -{ - complete(bio->bi_private); -} - -/* - * Submit a request for an async cache flush to run. If the request queue does - * not require flush operations, just skip it altogether. If the caller needs - * to wait for the flush completion at a later point in time, they must supply a - * valid completion. This will be signalled when the flush completes. The - * caller never sees the bio that is issued here. - */ -void -xfs_flush_bdev_async( - struct bio *bio, - struct block_device *bdev, - struct completion *done) -{ - struct request_queue *q = bdev->bd_disk->queue; - - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - complete(done); - return; - } - - bio_init(bio, NULL, 0); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - bio->bi_private = done; - bio->bi_end_io = xfs_flush_bdev_async_endio; - - submit_bio(bio); -} int xfs_rw_bdev( struct block_device *bdev, diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 4cf07ffd9edf54661c8145f68db7be259edc406e..0bdf1811db159d3413944f0a36785996cacce462 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1402,7 +1402,7 @@ xfs_buf_ioerror_alert( { xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", - func, (uint64_t)XFS_BUF_ADDR(bp), + func, (uint64_t)xfs_buf_daddr(bp), bp->b_length, -bp->b_error); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 6ab044cea75dd5a23508af45cf8d35d1abb9ca55..200793275c36405207b6561a0f4592c12d8f41ad 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -305,9 +305,13 @@ extern void xfs_buf_terminate(void); * In future, uncached buffers will pass the block number directly to the io * request function and hence these macros will go away at that point. */ -#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) +static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) +{ + return bp->b_maps[0].bm_bn; +} + void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); /* diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 4f09e6991e4ec6d07862ffcc5e987ea6efb9b1e1..318330432297c39daa7a0206d35a4ff0965e8192 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -447,7 +447,7 @@ xfs_inodegc_queue_all( for_each_online_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); } } @@ -1874,8 +1874,8 @@ void xfs_inodegc_worker( struct work_struct *work) { - struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, - work); + struct xfs_inodegc *gc = container_of(to_delayed_work(work), + struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; unsigned int nofs_flag; @@ -1905,28 +1905,30 @@ xfs_inodegc_worker( } /* - * Force all currently queued inode inactivation work to run immediately, and - * wait for the work to finish. Two pass - queue all the work first pass, wait - * for it in a second pass. + * Expedite all pending inodegc work to run immediately. This does not wait for + * completion of the work. */ void -xfs_inodegc_flush( +xfs_inodegc_push( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_is_inodegc_enabled(mp)) return; - - trace_xfs_inodegc_flush(mp, __return_address); - + trace_xfs_inodegc_push(mp, __return_address); xfs_inodegc_queue_all(mp); +} - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - flush_work(&gc->work); - } +/* + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. + */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + xfs_inodegc_push(mp); + trace_xfs_inodegc_flush(mp, __return_address); + flush_workqueue(mp->m_inodegc_wq); } /* @@ -1937,18 +1939,12 @@ void xfs_inodegc_stop( struct xfs_mount *mp) { - struct xfs_inodegc *gc; - int cpu; - if (!xfs_clear_inodegc_enabled(mp)) return; xfs_inodegc_queue_all(mp); + drain_workqueue(mp->m_inodegc_wq); - for_each_online_cpu(cpu) { - gc = per_cpu_ptr(mp->m_inodegc, cpu); - cancel_work_sync(&gc->work); - } trace_xfs_inodegc_stop(mp, __return_address); } @@ -2068,6 +2064,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); spin_lock(&ip->i_flags_lock); @@ -2079,19 +2076,26 @@ xfs_inodegc_queue( items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); - put_cpu_ptr(gc); - if (!xfs_is_inodegc_enabled(mp)) + /* + * We queue the work while holding the current CPU so that the work + * is scheduled to run on this CPU. + */ + if (!xfs_is_inodegc_enabled(mp)) { + put_cpu_ptr(gc); return; - - if (xfs_inodegc_want_queue_work(ip, items)) { - trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); } + if (xfs_inodegc_want_queue_work(ip, items)) + queue_delay = 0; + + trace_xfs_inodegc_queue(mp, __return_address); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, queue_delay); + put_cpu_ptr(gc); + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); - flush_work(&gc->work); + flush_delayed_work(&gc->work); } } @@ -2108,7 +2112,7 @@ xfs_inodegc_cpu_dead( unsigned int count = 0; dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_work_sync(&dead_gc->work); + cancel_delayed_work_sync(&dead_gc->work); if (llist_empty(&dead_gc->list)) return; @@ -2127,12 +2131,12 @@ xfs_inodegc_cpu_dead( llist_add_batch(first, last, &gc->list); count += READ_ONCE(gc->items); WRITE_ONCE(gc->items, count); - put_cpu_ptr(gc); if (xfs_is_inodegc_enabled(mp)) { trace_xfs_inodegc_queue(mp, __return_address); - queue_work(mp->m_inodegc_wq, &gc->work); + mod_delayed_work(mp->m_inodegc_wq, &gc->work, 0); } + put_cpu_ptr(gc); } /* @@ -2227,7 +2231,7 @@ xfs_inodegc_shrinker_scan( unsigned int h = READ_ONCE(gc->shrinker_hits); WRITE_ONCE(gc->shrinker_hits, h + 1); - queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); no_items = false; } } diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 2e4cfddf8b8ed8faeb49bd5c54e0c501fb2b4b8c..6cd180721659b28f9aa122f8f8bc03296ac92b3e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -76,6 +76,7 @@ void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_push(struct xfs_mount *mp); void xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 953d98bc483258c48ecac91ca94958a0fcf31dab..af6be9b9ccdf81d806b9e6ed4e37e773d1896051 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -196,8 +196,6 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, char *data, unsigned int op); -void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev, - struct completion *done); #define ASSERT_ALWAYS(expr) \ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 7bf94b1d3dbd06abaf1c8ec74a6c2dd6423bdb8c..6e70143176fb2ac0fcc3c8484801f108dfc77bd6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -75,13 +75,12 @@ xlog_verify_iclog( STATIC void xlog_verify_tail_lsn( struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t tail_lsn); + struct xlog_in_core *iclog); #else #define xlog_verify_dest_ptr(a,b) #define xlog_verify_grant_tail(a) #define xlog_verify_iclog(a,b,c) -#define xlog_verify_tail_lsn(a,b,c) +#define xlog_verify_tail_lsn(a,b) #endif STATIC int @@ -523,17 +522,28 @@ xlog_state_shutdown_callbacks( /* * Flush iclog to disk if this is the last reference to the given iclog and the - * it is in the WANT_SYNC state. If the caller passes in a non-zero - * @old_tail_lsn and the current log tail does not match, there may be metadata - * on disk that must be persisted before this iclog is written. To satisfy that - * requirement, set the XLOG_ICL_NEED_FLUSH flag as a condition for writing this - * iclog with the new log tail value. + * it is in the WANT_SYNC state. + * + * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the + * log tail is updated correctly. NEED_FUA indicates that the iclog will be + * written to stable storage, and implies that a commit record is contained + * within the iclog. We need to ensure that the log tail does not move beyond + * the tail that the first commit record in the iclog ordered against, otherwise + * correct recovery of that checkpoint becomes dependent on future operations + * performed on this iclog. + * + * Hence if NEED_FUA is set and the current iclog tail lsn is empty, write the + * current tail into iclog. Once the iclog tail is set, future operations must + * not modify it, otherwise they potentially violate ordering constraints for + * the checkpoint commit that wrote the initial tail lsn value. The tail lsn in + * the iclog will get zeroed on activation of the iclog after sync, so we + * always capture the tail lsn on the iclog on the first NEED_FUA release + * regardless of the number of active reference counts on this iclog. */ int xlog_state_release_iclog( struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t old_tail_lsn) + struct xlog_in_core *iclog) { xfs_lsn_t tail_lsn; bool last_ref; @@ -544,14 +554,14 @@ xlog_state_release_iclog( /* * Grabbing the current log tail needs to be atomic w.r.t. the writing * of the tail LSN into the iclog so we guarantee that the log tail does - * not move between deciding if a cache flush is required and writing - * the LSN into the iclog below. + * not move between the first time we know that the iclog needs to be + * made stable and when we eventually submit it. */ - if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) { + if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || + (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && + !iclog->ic_header.h_tail_lsn) { tail_lsn = xlog_assign_tail_lsn(log->l_mp); - - if (old_tail_lsn && tail_lsn != old_tail_lsn) - iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); } last_ref = atomic_dec_and_test(&iclog->ic_refcnt); @@ -576,8 +586,7 @@ xlog_state_release_iclog( } iclog->ic_state = XLOG_STATE_SYNCING; - iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); - xlog_verify_tail_lsn(log, iclog, tail_lsn); + xlog_verify_tail_lsn(log, iclog); trace_xlog_iclog_syncing(iclog, _RET_IP_); spin_unlock(&log->l_icloglock); @@ -845,7 +854,7 @@ xlog_force_iclog( iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; if (iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); - return xlog_state_release_iclog(iclog->ic_log, iclog, 0); + return xlog_state_release_iclog(iclog->ic_log, iclog); } /* @@ -2299,7 +2308,7 @@ xlog_write_copy_finish( ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || xlog_is_shutdown(log)); release_iclog: - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); return error; } @@ -2516,7 +2525,7 @@ xlog_write( spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); return error; @@ -2553,6 +2562,7 @@ xlog_state_activate_iclog( memset(iclog->ic_header.h_cycle_data, 0, sizeof(iclog->ic_header.h_cycle_data)); iclog->ic_header.h_lsn = 0; + iclog->ic_header.h_tail_lsn = 0; } /* @@ -2939,7 +2949,7 @@ xlog_state_get_iclog_space( * reference to the iclog. */ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) - error = xlog_state_release_iclog(log, iclog, 0); + error = xlog_state_release_iclog(log, iclog); spin_unlock(&log->l_icloglock); if (error) return error; @@ -3581,10 +3591,10 @@ xlog_verify_grant_tail( STATIC void xlog_verify_tail_lsn( struct xlog *log, - struct xlog_in_core *iclog, - xfs_lsn_t tail_lsn) + struct xlog_in_core *iclog) { - int blocks; + xfs_lsn_t tail_lsn = be64_to_cpu(iclog->ic_header.h_tail_lsn); + int blocks; if (CYCLE_LSN(tail_lsn) == log->l_prev_cycle) { blocks = diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index c2fa07909ea62c2b8d9ee72ca4795b30a980e20c..1022b4bde0da28424ffc807b1f09b76e95d3dd86 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -705,11 +705,21 @@ xlog_cil_set_ctx_write_state( * The LSN we need to pass to the log items on transaction * commit is the LSN reported by the first log vector write, not * the commit lsn. If we use the commit record lsn then we can - * move the tail beyond the grant write head. + * move the grant write head beyond the tail LSN and overwrite + * it. */ ctx->start_lsn = lsn; wake_up_all(&cil->xc_start_wait); spin_unlock(&cil->xc_push_lock); + + /* + * Make sure the metadata we are about to overwrite in the log + * has been flushed to stable storage before this iclog is + * issued. + */ + spin_lock(&cil->xc_log->l_icloglock); + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + spin_unlock(&cil->xc_log->l_icloglock); return; } @@ -888,10 +898,7 @@ xlog_cil_push_work( struct xfs_trans_header thdr; struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; - xfs_lsn_t preflush_tail_lsn; xfs_csn_t push_seq; - struct bio bio; - DECLARE_COMPLETION_ONSTACK(bdev_flush); bool push_commit_stable; new_ctx = xlog_cil_ctx_alloc(); @@ -961,23 +968,6 @@ xlog_cil_push_work( list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); - /* - * The CIL is stable at this point - nothing new will be added to it - * because we hold the flush lock exclusively. Hence we can now issue - * a cache flush to ensure all the completed metadata in the journal we - * are about to overwrite is on stable storage. - * - * Because we are issuing this cache flush before we've written the - * tail lsn to the iclog, we can have metadata IO completions move the - * tail forwards between the completion of this flush and the iclog - * being written. In this case, we need to re-issue the cache flush - * before the iclog write. To detect whether the log tail moves, sample - * the tail LSN *before* we issue the flush. - */ - preflush_tail_lsn = atomic64_read(&log->l_tail_lsn); - xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, - &bdev_flush); - /* * Pull all the log vectors off the items in the CIL, and remove the * items from the CIL. We don't need the CIL lock here because it's only @@ -1054,12 +1044,6 @@ xlog_cil_push_work( lvhdr.lv_iovecp = &lhdr; lvhdr.lv_next = ctx->lv_chain; - /* - * Before we format and submit the first iclog, we have to ensure that - * the metadata writeback ordering cache flush is complete. - */ - wait_for_completion(&bdev_flush); - error = xlog_cil_write_chain(ctx, &lvhdr); if (error) goto out_abort_free_ticket; @@ -1118,7 +1102,7 @@ xlog_cil_push_work( if (push_commit_stable && ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); - xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn); + xlog_state_release_iclog(log, ctx->commit_iclog); /* Not safe to reference ctx now! */ @@ -1139,7 +1123,7 @@ xlog_cil_push_work( return; } spin_lock(&log->l_icloglock); - xlog_state_release_iclog(log, ctx->commit_iclog, 0); + xlog_state_release_iclog(log, ctx->commit_iclog); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index ca19a3f04bee0064ef5efa78f3abf9777537917b..eee173bdc8f48b97443b7bbe9f592e835741b64f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -48,6 +48,16 @@ enum xlog_iclog_state { { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \ { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" } +/* + * In core log flags + */ +#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ +#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ + +#define XLOG_ICL_STRINGS \ + { XLOG_ICL_NEED_FLUSH, "XLOG_ICL_NEED_FLUSH" }, \ + { XLOG_ICL_NEED_FUA, "XLOG_ICL_NEED_FUA" } + /* * Log ticket flags @@ -132,9 +142,6 @@ enum xlog_iclog_state { #define XLOG_COVER_OPS 5 -#define XLOG_ICL_NEED_FLUSH (1 << 0) /* iclog needs REQ_PREFLUSH */ -#define XLOG_ICL_NEED_FUA (1 << 1) /* iclog needs REQ_FUA */ - /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15 @@ -511,8 +518,7 @@ void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, int eventual_size); -int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, - xfs_lsn_t log_tail_lsn); +int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); /* * When we crack an atomic LSN, we sample it first so that the value will not diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 3dc54ed4315acb9a0a46cfd2385f96cd0bd7cb43..4410d0fad4925c35427deb8c90d7000c0761158c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -60,7 +60,7 @@ struct xfs_error_cfg { */ struct xfs_inodegc { struct llist_head list; - struct work_struct work; + struct delayed_work work; /* approximate count of inodes in the list */ unsigned int items; diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index b541dc3fac17c94edf0f471f0c805e44bc504bb5..e429284f78c1be7fce6f1ef6292518408757cb75 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -481,9 +481,12 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; - /* Flush inodegc work at the start of a quota reporting scan. */ + /* + * Expedite pending inodegc work at the start of a quota reporting + * scan but don't block waiting for it to complete. + */ if (id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); /* * Try to get the dquot. We don't want it allocated on disk, so don't @@ -525,7 +528,7 @@ xfs_qm_scall_getquota_next( /* Flush inodegc work at the start of a quota reporting scan. */ if (*id == 0) - xfs_inodegc_flush(mp); + xfs_inodegc_push(mp); error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 04110183f3eda3cfd75a59931c579cfd51d19a89..a79deea63a266e68937a109280345dcaaf40451a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -789,8 +789,11 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; - /* Wait for whatever inactivations are in progress. */ - xfs_inodegc_flush(mp); + /* + * Expedite background inodegc but don't wait. We do not want to block + * here waiting hours for a billion extent file to be truncated. + */ + xfs_inodegc_push(mp); statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -1101,7 +1104,7 @@ xfs_inodegc_init_percpu( gc = per_cpu_ptr(mp->m_inodegc, cpu); init_llist_head(&gc->list); gc->items = 0; - INIT_WORK(&gc->work, xfs_inodegc_worker); + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); } return 0; } @@ -1547,6 +1550,38 @@ xfs_fc_fill_super( #endif } + /* + * Don't touch the filesystem if a user tool thinks it owns the primary + * superblock. mkfs doesn't clear the flag from secondary supers, so + * we don't check them at all. + */ + if (mp->m_sb.sb_inprogress) { + xfs_warn(mp, "Offline file system operation in progress!"); + error = -EFSCORRUPTED; + goto out_free_sb; + } + + /* + * Until this is fixed only page-sized or smaller data blocks work. + */ + if (mp->m_sb.sb_blocksize > PAGE_SIZE) { + xfs_warn(mp, + "File system with blocksize %d bytes. " + "Only pagesize (%ld) or less will currently work.", + mp->m_sb.sb_blocksize, PAGE_SIZE); + error = -ENOSYS; + goto out_free_sb; + } + + /* Ensure this filesystem fits in the page cache limits */ + if (xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_dblocks) || + xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_rblocks)) { + xfs_warn(mp, + "file system too large to be mounted on this system."); + error = -EFBIG; + goto out_free_sb; + } + /* * XFS block mappings use 54 bits to store the logical block offset. * This should suffice to handle the maximum file size that the VFS diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ae96aaa3bb3190aa6c770f2d27cac4200f17b241..cdf6e23ec517c228337f2fddcc2402267c4010d3 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -187,6 +187,7 @@ DEFINE_EVENT(xfs_fs_class, name, \ TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ TP_ARGS(mp, caller_ip)) DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_push); DEFINE_FS_EVENT(xfs_inodegc_start); DEFINE_FS_EVENT(xfs_inodegc_stop); DEFINE_FS_EVENT(xfs_inodegc_queue); @@ -4010,6 +4011,7 @@ DECLARE_EVENT_CLASS(xlog_iclog_class, __field(uint32_t, state) __field(int32_t, refcount) __field(uint32_t, offset) + __field(uint32_t, flags) __field(unsigned long long, lsn) __field(unsigned long, caller_ip) ), @@ -4018,15 +4020,17 @@ DECLARE_EVENT_CLASS(xlog_iclog_class, __entry->state = iclog->ic_state; __entry->refcount = atomic_read(&iclog->ic_refcnt); __entry->offset = iclog->ic_offset; + __entry->flags = iclog->ic_flags; __entry->lsn = be64_to_cpu(iclog->ic_header.h_lsn); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx caller %pS", + TP_printk("dev %d:%d state %s refcnt %d offset %u lsn 0x%llx flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->state, XLOG_STATE_STRINGS), __entry->refcount, __entry->offset, __entry->lsn, + __print_flags(__entry->flags, "|", XLOG_ICL_STRINGS), (char *)__entry->caller_ip) ); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 342817e15ada8a32ceb88011b278a8ea9dee4130..ae645a80ded0a1a1ca521e7d1544b205c185618c 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -38,7 +38,7 @@ xfs_trans_buf_item_match( blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && - XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && + xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn && blip->bli_buf->b_length == len) { ASSERT(blip->bli_buf->b_map_count == nmaps); return blip->bli_buf; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a200750b536ccffede15ecdcde55d129897cde3e..d9ba27f4c11605c96aeb0d0c5a1d1061d2877997 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1090,7 +1090,8 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); static struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd, bool fixed); + struct io_kiocb *req, int fd, bool fixed, + unsigned int issue_flags); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -3914,7 +3915,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; in = io_file_get(req->ctx, req, sp->splice_fd_in, - (sp->flags & SPLICE_F_FD_IN_FIXED)); + (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags); if (!in) { ret = -EBADF; goto done; @@ -3954,7 +3955,7 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; in = io_file_get(req->ctx, req, sp->splice_fd_in, - (sp->flags & SPLICE_F_FD_IN_FIXED)); + (sp->flags & SPLICE_F_FD_IN_FIXED), issue_flags); if (!in) { ret = -EBADF; goto done; @@ -6742,13 +6743,16 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file } static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd) + struct io_kiocb *req, int fd, + unsigned int issue_flags) { - struct file *file; + struct file *file = NULL; unsigned long file_ptr; + io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - return NULL; + goto out; fd = array_index_nospec(fd, ctx->nr_user_files); file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; file = (struct file *) (file_ptr & FFS_MASK); @@ -6756,6 +6760,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, /* mask in overlapping REQ_F and FFS bits */ req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); io_req_set_rsrc_node(req); +out: + io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); return file; } @@ -6773,10 +6779,11 @@ static struct file *io_file_get_normal(struct io_ring_ctx *ctx, } static inline struct file *io_file_get(struct io_ring_ctx *ctx, - struct io_kiocb *req, int fd, bool fixed) + struct io_kiocb *req, int fd, bool fixed, + unsigned int issue_flags) { if (fixed) - return io_file_get_fixed(ctx, req, fd); + return io_file_get_fixed(ctx, req, fd, issue_flags); else return io_file_get_normal(ctx, req, fd); } @@ -6998,7 +7005,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (io_op_defs[req->opcode].needs_file) { req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), - (sqe_flags & IOSQE_FIXED_FILE)); + (sqe_flags & IOSQE_FIXED_FILE), + IO_URING_F_NONBLOCK); if (unlikely(!req->file)) ret = -EBADF; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 19668d614f4738d511df3289d650e55ca3bb7c8d..4196b9f84690066a75f6e67bdefbe4ff79eb0336 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -350,7 +350,8 @@ asmlinkage __visible void do_softirq(void) */ void irq_enter_rcu(void) { - if (is_idle_task(current) && !in_interrupt()) { + if (tick_nohz_full_cpu(smp_processor_id()) || + (is_idle_task(current) && !in_interrupt())) { /* * Prevent raise_softirq from needlessly waking up ksoftirqd * here, as softirq will be serviced on return from interrupt. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 33750db5b56469e9cd770a0f0a4bfe75cdd7f747..aed5d6b6ca718f90a3509d97295e413ca5419a83 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1347,6 +1347,13 @@ static inline void tick_nohz_irq_enter(void) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(ts, now); + /* + * If all CPUs are idle. We may need to update a stale jiffies value. + * Note nohz_full is a special case: a timekeeper is guaranteed to stay + * alive but it might be busy looping with interrupts disabled in some + * rare case (typically stop machine). So we must make sure we have a + * last resort. + */ if (ts->tick_stopped) tick_nohz_update_jiffies(now); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ccb65412b6704b38f88fe3f398d9529143e711a6..90ab12559092e500421936aed5f5a650a5ac0792 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -392,6 +392,7 @@ enum rxrpc_conn_proto_state { struct rxrpc_bundle { struct rxrpc_conn_parameters params; atomic_t usage; + atomic_t active; /* Number of active users */ unsigned int debug_id; bool try_upgrade; /* True if the bundle is attempting upgrade */ bool alloc_conn; /* True if someone's getting a conn */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index f5fb223aba82ab3684cc2880bd51c1b8a1c04481..34430cc9a0a3fc9ea75921b341cf1b1752a9b85c 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -40,6 +40,8 @@ __read_mostly unsigned long rxrpc_conn_idle_client_fast_expiry = 2 * HZ; DEFINE_IDR(rxrpc_client_conn_ids); static DEFINE_SPINLOCK(rxrpc_conn_id_lock); +static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); + /* * Get a connection ID and epoch for a client connection from the global pool. * The connection struct pointer is then recorded in the idr radix tree. The @@ -123,6 +125,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, bundle->params = *cp; rxrpc_get_peer(bundle->params.peer); atomic_set(&bundle->usage, 1); + atomic_set(&bundle->active, 1); spin_lock_init(&bundle->channel_lock); INIT_LIST_HEAD(&bundle->waiting_calls); } @@ -341,6 +344,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c rxrpc_free_bundle(candidate); found_bundle: rxrpc_get_bundle(bundle); + atomic_inc(&bundle->active); spin_unlock(&local->client_bundles_lock); _leave(" = %u [found]", bundle->debug_id); return bundle; @@ -438,6 +442,7 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) if (old) trace_rxrpc_client(old, -1, rxrpc_client_replace); candidate->bundle_shift = shift; + atomic_inc(&bundle->active); bundle->conns[i] = candidate; for (j = 0; j < RXRPC_MAXCALLS; j++) set_bit(shift + j, &bundle->avail_chans); @@ -728,6 +733,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, smp_rmb(); out_put_bundle: + rxrpc_deactivate_bundle(bundle); rxrpc_put_bundle(bundle); out: _leave(" = %d", ret); @@ -903,9 +909,8 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) { struct rxrpc_bundle *bundle = conn->bundle; - struct rxrpc_local *local = bundle->params.local; unsigned int bindex; - bool need_drop = false, need_put = false; + bool need_drop = false; int i; _enter("C=%x", conn->debug_id); @@ -924,15 +929,22 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) } spin_unlock(&bundle->channel_lock); - /* If there are no more connections, remove the bundle */ - if (!bundle->avail_chans) { - _debug("maybe unbundle"); - spin_lock(&local->client_bundles_lock); + if (need_drop) { + rxrpc_deactivate_bundle(bundle); + rxrpc_put_connection(conn); + } +} - for (i = 0; i < ARRAY_SIZE(bundle->conns); i++) - if (bundle->conns[i]) - break; - if (i == ARRAY_SIZE(bundle->conns) && !bundle->params.exclusive) { +/* + * Drop the active count on a bundle. + */ +static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) +{ + struct rxrpc_local *local = bundle->params.local; + bool need_put = false; + + if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) { + if (!bundle->params.exclusive) { _debug("erase bundle"); rb_erase(&bundle->local_node, &local->client_bundles); need_put = true; @@ -942,10 +954,6 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (need_put) rxrpc_put_bundle(bundle); } - - if (need_drop) - rxrpc_put_connection(conn); - _leave(""); } /*