// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/iomap.h>
#include "euler.h"
#include "dax.h"
#include "dep.h"
#include "wear.h"
#include "alloc_interface.h"

int eufs_persist_btree_node(void *root, int sta, int len);

static __always_inline void eufs_clear_pmem(void *addr, size_t size)
{
	memset(addr, 0, size);
	eufs_flush_range(addr, size);
}

static __always_inline void *eufs_find_data_block_btree(struct inode *inode,
							unsigned long blocknr,
							__le64 **parent)
{
	__le64 *bp;
	u32 height, bit_shift;
	unsigned int idx;
	struct eufs_inode_info *vi = EUFS_I(inode);

	/* inode must be a regular file */
	height = vi->i_volatile_height;
	bp = vi->i_volatile_root;

	NV_ASSERT(blocknr < (1UL << (height * EUFS_FILE_TREE_DEGREE_SHIFT)));

	if (height == 0) {
		BUG_ON(blocknr != 0);
		if (parent)
			*parent = NULL;
		return (void *)bp;
	}
	if (height == 1) {
		if (bp[blocknr] == NULL_VAL)
			BUG();
		if (parent)
			*parent = bp;
		return s2p(inode->i_sb, bp[blocknr]);
	}
	while (height > 0) {
		bit_shift = (height - 1) * EUFS_FILE_TREE_DEGREE_SHIFT;
		idx = blocknr >> bit_shift;
		if (parent)
			*parent = bp;
		bp = s2p(inode->i_sb, bp[idx]);
		if (!bp)
			return NULL;
		blocknr = blocknr & ((1 << bit_shift) - 1);
		height--;
	}
	return bp;
}

static int eufs_extend_btree_recursive_blind(struct inode *inode,
					     int level_left, __le64 *parent,
					     int sta_index,
					     int end_index, /* inclusive */
					     struct alloc_batch *ab)
{
	struct super_block *sb = inode->i_sb;
	void *p;
	long r;
	int i;

	for (i = sta_index; i <= end_index; ++i) {
		if (!level_left) {
			parent[i] = NULL_ADDR;
			continue;
		}
		/* level_left */
		p = eufs_alloc_batch_allocate_file_index(inode->i_sb, ab);
		if (!p)
			return -ENOSPC;
		parent[i] = p2s(sb, p);
		/* recur */
		r = eufs_extend_btree_recursive_blind(
			inode, level_left - 1, p, 0, EUFS_FILE_TREE_DEGREE - 1,
			ab);
		if (IS_ERR_VALUE(r))
			return r;
	}
	return 0;
}
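/*
 * Tree geometry, for illustration (assuming 4 KiB blocks holding 8-byte
 * block pointers, i.e. EUFS_FILE_TREE_DEGREE == 512 and
 * EUFS_FILE_TREE_DEGREE_SHIFT == 9):
 *   height 0: the root itself is the only data block -> capacity 1 block
 *   height 1: one index block of 512 leaf slots      -> capacity 512 blocks
 *   height 2: two index levels                       -> capacity 512^2 blocks
 * In general, a tree of height h addresses 1 << (h * DEGREE_SHIFT) blocks,
 * which is exactly what eufs_find_data_block_btree() asserts above.
 */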
/*
 * Allocate blocks from top to bottom.
 *
 * Only allocate the interior blocks which will have a leaf child block or
 * an interior child block. The unused pointers for children will NOT be
 * zeroed.
 *
 * New leaf blocks are not allocated, and their values are set to NULL_ADDR.
 */
static int eufs_extend_btree_recursive(struct inode *inode, int level_left,
				       __le64 *parent, unsigned long origin,
				       unsigned long num_blocks,
				       struct alloc_batch *ab, bool blind)
{
	struct super_block *sb = inode->i_sb;
	const unsigned long nblocks_per_slot =
		1 << (level_left * EUFS_FILE_TREE_DEGREE_SHIFT);
	unsigned long off;
	int sta_index, end_index;
	int i;
	long r;
	void *p;

	if (blind) {
		return eufs_extend_btree_recursive_blind(
			inode, level_left, parent, 0, EUFS_FILE_TREE_DEGREE - 1,
			ab);
	}

	if (origin == 0) {
		/* end_index could be zero */
		end_index = (num_blocks - 1) / nblocks_per_slot;
		r = eufs_extend_btree_recursive_blind(inode, level_left, parent,
						      0, end_index - 1, ab);
		if (IS_ERR_VALUE(r))
			return r;
		if (!level_left) {
			parent[end_index] = NULL_ADDR;
		} else {
			p = eufs_alloc_batch_allocate_file_index(inode->i_sb,
								 ab);
			if (!p)
				return -ENOSPC;
			parent[end_index] = p2s(sb, p);
			off = nblocks_per_slot * end_index;
			r = eufs_extend_btree_recursive(inode, level_left - 1,
							p, 0, num_blocks - off,
							ab, false);
			if (IS_ERR_VALUE(r))
				return r;
		}
		return 0;
	}

	sta_index = (origin - 1) / nblocks_per_slot;
	end_index = (num_blocks - 1) / nblocks_per_slot;

	/*
	 * No need to create a new sub-tree, so descend to the sub-tree
	 * rooted at parent[sta_index].
	 */
	if (sta_index == end_index) {
		if (!level_left)
			return 0;
		/* calculate the needed block count in the sub-tree */
		off = sta_index * nblocks_per_slot;
		r = eufs_extend_btree_recursive(inode, level_left - 1,
						s2p(sb, parent[sta_index]),
						origin - off, num_blocks - off,
						ab, false);
		if (IS_ERR_VALUE(r))
			return r;
		return 0;
	}

	if (!level_left) {
		for (i = sta_index + 1; i <= end_index; ++i)
			parent[i] = NULL_ADDR;
		return 0;
	}

	/* extend the sub-tree shared with existing blocks to its maximum size */
	off = sta_index * nblocks_per_slot;
	r = eufs_extend_btree_recursive(inode, level_left - 1,
					s2p(sb, parent[sta_index]),
					origin - off, nblocks_per_slot, ab,
					false);
	if (IS_ERR_VALUE(r))
		return r;

	/* new sub-trees which will be fully initialized */
	r = eufs_extend_btree_recursive_blind(inode, level_left, parent,
					      sta_index + 1, end_index - 1, ab);
	if (IS_ERR_VALUE(r))
		return r;

	/* the last new sub-tree, which may need only partial initialization */
	p = eufs_alloc_batch_allocate_file_index(inode->i_sb, ab);
	if (!p)
		return -ENOSPC;
	parent[end_index] = p2s(sb, p);
	off = end_index * nblocks_per_slot;
	r = eufs_extend_btree_recursive(inode, level_left - 1, p, 0,
					num_blocks - off, ab, false);
	if (IS_ERR_VALUE(r))
		return r;
	return 0;
}
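/*
 * Worked example for eufs_count_pages() below (illustrative, assuming a
 * tree degree of 512): a file with 1000 leaf blocks needs
 * DIV_ROUND_UP(1000, 512) = 2 height-1 index blocks plus 1 root block, so
 * eufs_count_pages(1000) = 1000 + 2 + 1 = 1003 pages for the whole tree.
 */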
static unsigned long eufs_count_pages(unsigned long leaf_blocks)
{
	unsigned long tot = leaf_blocks;

	while (leaf_blocks > 1) {
		leaf_blocks = DIV_ROUND_UP(leaf_blocks, EUFS_FILE_TREE_DEGREE);
		tot += leaf_blocks;
	}
	return tot;
}

/* So that we have page[0..num_blocks-1] */
int eufs_extend_btree(struct inode *inode, unsigned long num_blocks)
{
	struct eufs_inode_info *vi = EUFS_I(inode);
	unsigned long full_size;
	unsigned long need_blocks;
	__le64 *new_root;
	long r = 0;
	struct alloc_batch *ab = &vi->page_batch;

	if (!num_blocks)
		return 0;
	if (vi->i_volatile_tree_blocks >= num_blocks)
		/* already allocated */
		return 0;
	if (num_blocks > inode->i_sb->s_maxbytes >> EUFS_BLOCK_SIZE_BITS)
		return -EFBIG;

	/* Grow from vi->i_volatile_tree_blocks to num_blocks */
	need_blocks = eufs_count_pages(num_blocks) -
		      eufs_count_pages(vi->i_volatile_tree_blocks);
	/* Set NULL_ADDR for extended data blocks */
	need_blocks -= (num_blocks - vi->i_volatile_tree_blocks);

	r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks);
	if (IS_ERR_VALUE(r))
		return r;

	BUG_ON(!vi->i_volatile_root);
	if (!vi->i_volatile_root) {
		vi->i_volatile_root =
			eufs_alloc_batch_allocate_file_data(inode->i_sb, ab);
		BUG_ON(!vi->i_volatile_root);
		vi->i_volatile_height = 0;
	}
	if (num_blocks == 1) {
		/* Already allocated */
		goto out;
	}

	full_size = 1UL << (vi->i_volatile_height *
			    EUFS_FILE_TREE_DEGREE_SHIFT);
	while (full_size < num_blocks) {
		new_root =
			eufs_alloc_batch_allocate_file_index(inode->i_sb, ab);
		new_root[0] = p2s(inode->i_sb, vi->i_volatile_root);
		vi->i_volatile_root = new_root;
		vi->i_volatile_height++;
		full_size <<= EUFS_FILE_TREE_DEGREE_SHIFT;
	}

	BUG_ON(vi->i_volatile_height < 1);
	r = eufs_extend_btree_recursive(inode, vi->i_volatile_height - 1,
					vi->i_volatile_root,
					vi->i_volatile_tree_blocks, num_blocks,
					ab, false);
out:
	eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab);
	vi->i_volatile_tree_blocks = num_blocks;
	num_blocks <<= (inode->i_blkbits - 9);
	if (num_blocks > inode->i_blocks)
		inode->i_blocks = num_blocks;
	return r;
}
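/*
 * Structure of eufs_alloc_blocks_btree() below (summary added for clarity):
 * the range [start_block, end_block) is scanned twice.  The first pass fills
 * holes already covered by the persisted tree (blocknr < pi_tree_blocks)
 * with zeroed, individually persisted blocks, and only counts the remaining
 * holes.  The second pass serves the counted holes from a single
 * pre-allocated batch, flushing updated index pointers per cacheline.
 */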
int eufs_alloc_blocks_btree(struct inode *inode, unsigned long start_block,
			    unsigned long num_blocks, int zero)
{
	long r;
	unsigned long blocknr, need_blocks = 0,
			       end_block = start_block + num_blocks;
	long pi_tree_blocks =
		eufs_iread_tree_blocks(EUFS_FRESH_PI(EUFS_PI(inode)));
	struct eufs_inode_info *vi = EUFS_I(inode);
	struct alloc_batch *ab = &vi->page_batch;
	__le64 *parent;
	unsigned int ofs;
	void *xmem;
	int last_ofs_line = -1;

	r = eufs_extend_btree(inode, start_block + num_blocks);
	if (r)
		return r;

	if (start_block == 0)
		vi->hole_at_sta = false;

	/* The 0th data block is always allocated. */
	blocknr = start_block ? start_block : 1;
	/* TODO: needs optimization. */
	while (blocknr < end_block) {
		eufs_find_data_block_btree(inode, blocknr, &parent);
		BUG_ON(!parent);
		ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
		while (ofs < EUFS_FILE_TREE_DEGREE && blocknr < end_block) {
			if (parent[ofs] == NULL_ADDR) {
				/*
				 * The leaf blocks are not allocated before
				 * persist, e.g. through truncate() + fsync()
				 */
				if (blocknr < pi_tree_blocks) {
					xmem = eufs_zalloc_file_data(
						inode->i_sb);
					if (!xmem)
						return -ENOSPC;
					eufs_alloc_persist(inode->i_sb, xmem,
							   false);
					eufs_flush_page(xmem);
					parent[ofs] = p2s(inode->i_sb, xmem);
					eufs_flush_cacheline(&parent[ofs]);
					invalidate_inode_pages2_range(
						inode->i_mapping, blocknr,
						blocknr);
				} else
					need_blocks++;
			}
			ofs++;
			blocknr++;
		}
	}
	if (!need_blocks)
		return 0;

	/* TODO: needs optimization. */
	r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks);
	if (IS_ERR_VALUE(r))
		return r;

	blocknr = start_block ? start_block : 1;
	while (blocknr < end_block) {
		eufs_find_data_block_btree(inode, blocknr, &parent);
		BUG_ON(!parent);
		last_ofs_line = -1;
		ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
		while (ofs < EUFS_FILE_TREE_DEGREE && blocknr < end_block) {
			if (parent[ofs] == NULL_ADDR) {
				xmem = eufs_alloc_batch_allocate_file_data(
					inode->i_sb, ab);
				if (zero == EUFS_ALLOC_BLOCKS_ZERO_ALL ||
				    ((zero == EUFS_ALLOC_BLOCKS_ZERO_EDGE) &&
				     (blocknr == start_block ||
				      blocknr == end_block - 1)))
					eufs_clear_pmem(xmem, PAGE_SIZE);
				parent[ofs] = p2s(inode->i_sb, xmem);
				invalidate_inode_pages2_range(inode->i_mapping,
							      blocknr, blocknr);
				if (last_ofs_line == -1)
					last_ofs_line =
						(ofs >>
						 EUFS_PTR_CNT_SHIFT_PER_CACHELINE);
			}
			ofs++;
			if (last_ofs_line != -1 &&
			    (ofs >> EUFS_PTR_CNT_SHIFT_PER_CACHELINE) !=
				    last_ofs_line) {
				eufs_flush_cacheline(&parent[ofs - 1]);
				last_ofs_line = -1;
			}
			blocknr++;
		}
		if (last_ofs_line != -1)
			eufs_flush_cacheline(&parent[ofs - 1]);
	}
	eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab);

	return r;
}
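/*
 * For illustration (assuming 64-byte cachelines and 8-byte slots, i.e.
 * 8 pointers per cacheline and EUFS_PTR_CNT_SHIFT_PER_CACHELINE == 3):
 * (ofs >> 3) identifies the cacheline holding parent[ofs], so the loops
 * above and below flush a dirtied index cacheline only once, when the scan
 * moves past it, rather than once per updated pointer.
 */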
static int eufs_alloc_blocks_btree_for_write(struct inode *inode, loff_t pos,
					     int len)
{
	long r;
	unsigned long blocknr, need_blocks = 0;
	long pi_tree_blocks =
		eufs_iread_tree_blocks(EUFS_FRESH_PI(EUFS_PI(inode)));
	size_t file_size_block = PAGE_DIV_ROUND_UP(inode->i_size);
	struct eufs_inode_info *vi = EUFS_I(inode);
	struct alloc_batch *ab = &vi->page_batch;
	__le64 *parent;
	void *xmem;
	/* The page the first byte resides in */
	unsigned long first_page = PAGE_DIV_ROUND_DOWN(pos);
	/* The page the last byte resides in */
	unsigned long last_page = PAGE_DIV_ROUND_DOWN(pos + len - 1);
	unsigned long pending_flush_bits;
	int start_offset;
	int end_offset;
	int ofs;

	r = eufs_extend_btree(inode, last_page + 1);
	if (r)
		return r;

	/* hole_at_sta is used by SEEK_HOLE. */
	/* FIXME: We need a durable way to present hole_at_sta. */
	if (first_page == 0)
		vi->hole_at_sta = false;

	/* The 0th data block is always allocated. */
	blocknr = first_page ? first_page : 1;
	/*
	 * Can be optimized by saving the top-down parent pointers
	 * in a cursor and advancing by moving the cursor.
	 */
	while (blocknr <= last_page) {
		eufs_find_data_block_btree(inode, blocknr, &parent);
		BUG_ON(!parent);
		/* One ofs, one block */
		for (ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
		     ofs < EUFS_FILE_TREE_DEGREE && blocknr <= last_page;
		     ++ofs, ++blocknr) {
			/* Not a hole */
			if (parent[ofs] != NULL_ADDR)
				continue;
			/* Hole */
			if (blocknr < pi_tree_blocks) {
				/*
				 * TODO: optimization option: instead of
				 * writing zeros here, we could write the
				 * actual data.
				 */
				xmem = eufs_zalloc_file_data(inode->i_sb);
				if (!xmem)
					return -ENOSPC;
				eufs_alloc_persist(inode->i_sb, xmem, false);
				eufs_flush_page(xmem);
				parent[ofs] = p2s(inode->i_sb, xmem);
				eufs_flush_cacheline(&parent[ofs]);
				invalidate_inode_pages2_range(inode->i_mapping,
							      blocknr, blocknr);
			} else
				need_blocks++;
		}
	}
	if (!need_blocks)
		return 0;

	/* FIXME: This requires a rewrite */
	r = eufs_alloc_batch_pre_allocate_begin(inode->i_sb, ab, need_blocks);
	if (IS_ERR_VALUE(r))
		return r;

	start_offset = pos & (PAGE_SIZE - 1);
	end_offset = (pos + len) & (PAGE_SIZE - 1);
	blocknr = first_page ? first_page : 1;
	while (blocknr <= last_page) {
		unsigned long bit;

		eufs_find_data_block_btree(inode, blocknr, &parent);
		BUG_ON(!parent);
		/* No cacheline is pending to be flushed for this index block */
		pending_flush_bits = 0;
		for (ofs = blocknr & (EUFS_FILE_TREE_DEGREE - 1);
		     ofs < EUFS_FILE_TREE_DEGREE && blocknr <= last_page;
		     ++ofs, ++blocknr) {
			/* Not a hole */
			if (parent[ofs] != NULL_ADDR)
				continue;

			xmem = eufs_alloc_batch_allocate_file_data(inode->i_sb,
								   ab);
			if (unlikely(blocknr == first_page &&
				     (start_offset != 0)))
				eufs_clear_pmem(xmem, start_offset);
			/* Do not clear the last block which is after the EOF-block */
			if (unlikely(blocknr == last_page &&
				     (end_offset != 0) &&
				     blocknr < file_size_block))
				eufs_clear_pmem((char *)xmem + end_offset,
						PAGE_SIZE - end_offset);
			parent[ofs] = p2s(inode->i_sb, xmem);
			invalidate_inode_pages2_range(inode->i_mapping, blocknr,
						      blocknr);
			set_bit(ofs >> EUFS_PTR_CNT_SHIFT_PER_CACHELINE,
				&pending_flush_bits);
		}
		for (bit = find_first_bit(&pending_flush_bits, 64); bit < 64;
		     bit = find_next_bit(&pending_flush_bits, 64, bit + 1)) {
			ofs = bit << EUFS_PTR_CNT_SHIFT_PER_CACHELINE;
			eufs_flush_cacheline(&parent[ofs]);
		}
	}
	eufs_alloc_batch_pre_allocate_end(inode->i_sb, ab);

	return r;
}

static void eufs_free_recursive_btree_blind(struct super_block *sb,
					    __le64 *root, int level_left)
{
	int i;

	BUG_ON(!root);
	if (level_left == -1) {
		if (root != NULL_ADDR_PTR)
			nv_zfree(sb, root);
		return;
	}
	/* level_left */
	BUG_ON(root == NULL_ADDR_PTR);
	for (i = 0; i < EUFS_FILE_TREE_DEGREE; ++i) {
		eufs_free_recursive_btree_blind(sb, s2p(sb, root[i]),
						level_left - 1);
	}
	nv_zfree(sb, root);
}

static void eufs_free_recursive_btree(struct super_block *sb, __le64 *root,
				      int level_left, u64 blocks_left)
{
	u64 nblocks_per_slot;
	int i;

	BUG_ON(!root);
	BUG_ON(!blocks_left);
	if (level_left == -1) {
		if (root != NULL_ADDR_PTR)
			nv_zfree(sb, root);
		return;
	}
	/* level_left */
	BUG_ON(root == NULL_ADDR_PTR);
	nblocks_per_slot = 1 << (level_left * EUFS_FILE_TREE_DEGREE_SHIFT);
	for (i = 0; i < EUFS_FILE_TREE_DEGREE; ++i) {
		if (blocks_left >= nblocks_per_slot) {
			/* the whole sub-tree needs to be freed */
			eufs_free_recursive_btree_blind(sb, s2p(sb, root[i]),
							level_left - 1);
			blocks_left -= nblocks_per_slot;
			if (blocks_left == 0)
				break;
		} else {
			eufs_free_recursive_btree(sb, s2p(sb, root[i]),
						  level_left - 1, blocks_left);
			break;
		}
	}
	nv_zfree(sb, root);
}
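/*
 * eufs_shrink_btree() below proceeds in roughly three steps (summary added
 * for clarity):
 * 1. sync the inode so the persisted tree matches the volatile one;
 * 2. lower the root while a single child covers the new size, then publish
 *    the new root/height/block count and sync the inode again;
 * 3. free the tail of the old tree beyond the new block count, with explicit
 *    loops for heights 1 to 3, and finally free the abandoned old roots.
 */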
int eufs_shrink_btree(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct eufs_inode_info *vi = EUFS_I(inode);
	struct eufs_inode *pi = EUFS_PI(inode);
	void *root;
	u64 capacity;
	__le64 *new_root;
	__le64 *old_root;
	int new_height;
	u64 size;
	u64 blocks;
	u64 count;
	u64 blocks_left;
	u64 __maybe_unused pi_root_o;

	BUG_ON(!inode_is_locked(inode));
	BUG_ON(vi->i_volatile_height > EUFS_MAX_FILE_TREE_HEIGHT);

	pi_root_o = eufs_iread_root(pi);
	eufs_dbg("shrink btree stage 1: pi=%px vi->{s=%lld b=%lld h=%d r=%px} pi->{s=%lld b=%lld h=%d r=0x%llx}\n",
		 pi, inode->i_size, vi->i_volatile_tree_blocks,
		 vi->i_volatile_height, vi->i_volatile_root,
		 eufs_iread_size(pi), eufs_iread_tree_blocks(pi),
		 root_height(pi_root_o), root_ptr(pi_root_o));

	eufs_sync_pinode(inode, pi, false);

	capacity = PAGE_SIZE
		   << (EUFS_FILE_TREE_DEGREE_SHIFT * vi->i_volatile_height);
	new_root = vi->i_volatile_root;
	old_root = vi->i_volatile_root;
	new_height = vi->i_volatile_height;
	size = inode->i_size == 0 ? 1 : inode->i_size;
	/* old block count */
	blocks = vi->i_volatile_tree_blocks;

	/* Check whether the height should be reduced */
	for (;;) {
		capacity >>= EUFS_FILE_TREE_DEGREE_SHIFT;
		if (capacity < size || capacity < PAGE_SIZE)
			break;
		new_root = s2p(sb, new_root[0]);
		new_height--;
	}

	vi->i_volatile_root = new_root;
	vi->i_volatile_height = new_height;
	vi->i_volatile_tree_blocks = DIV_ROUND_UP(size, PAGE_SIZE);

	eufs_sync_pinode(inode, pi, false);
	eufs_alloc_batch_persist_reset(sb, &vi->page_batch);

	/* new block count, which is greater than 0 */
	count = blocks_left = vi->i_volatile_tree_blocks;
	/* shrink from old_root/_height to new_root/_height */
	root = old_root;
	if (blocks_left == blocks)
		goto out;
	if (blocks == 1)
		goto out;
	if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
		int i;
		__le64 *proot = root;

		for (i = count; i < blocks; ++i) {
			nv_free(sb, s2p(sb, proot[i]));
			proot[i] = NULL_VAL;
		}
		goto out;
	}
	if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
		int i;
		__le64 *proot = root;

		for (i = EUFS_H2_INDEX_IN_L0(count); i < EUFS_FILE_TREE_DEGREE;
		     ++i) {
			__le64 *pages = s2p(sb, proot[i]);
			int j = EUFS_H2_INDEX_IN_L1(count);

			for (; j < EUFS_FILE_TREE_DEGREE; ++j) {
				nv_free(sb, s2p(sb, pages[j]));
				pages[j] = NULL_VAL;
				count++;
				if (count >= blocks)
					break;
			}
			if (EUFS_H2_IS_FREE_L1_SUBTREE(i, blocks_left)) {
				nv_free(sb, pages);
				proot[i] = NULL_VAL;
			}
			if (count >= blocks)
				break;
		}
		goto out;
	}
	if (blocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
		int i, j, k;
		__le64 *pproot = root;

		for (i = EUFS_H3_INDEX_IN_L0(count); i < EUFS_FILE_TREE_DEGREE;
		     ++i) {
			__le64 *proot = s2p(sb, pproot[i]);

			j = EUFS_H3_INDEX_IN_L1(i, count);
			for (; j < EUFS_FILE_TREE_DEGREE; ++j) {
				__le64 *pages = s2p(sb, proot[j]);

				k = EUFS_H3_INDEX_IN_L2(count);
				for (; k < EUFS_FILE_TREE_DEGREE; ++k) {
					nv_free(sb, s2p(sb, pages[k]));
					pages[k] = NULL_VAL;
					count++;
					if (count >= blocks)
						break;
				}
				if (EUFS_H3_IS_FREE_L2_SUBTREE(i, j,
							       blocks_left)) {
					nv_free(sb, pages);
					proot[j] = NULL_VAL;
				}
				if (count >= blocks)
					break;
			}
			if (EUFS_H3_IS_FREE_L1_SUBTREE(i, blocks_left)) {
				nv_free(sb, proot);
				pproot[i] = NULL_VAL;
			}
			if (count >= blocks)
				break;
		}
	}
out:
	while (old_root != new_root) {
		__le64 *r = old_root;

		BUG_ON(!r);
		old_root = s2p(sb, r[0]);
		nv_free(sb, r);
	}
	return 0;
}

int eufs_free_btree(struct super_block *sb, void *root, int height, u64 blocks)
{
	NV_ASSERT(!(height < 0 || height > EUFS_MAX_FILE_TREE_HEIGHT));
	eufs_dbg("nvfree tree root: %px\n", root);
	if (blocks == 0)
		return 0;
	if (blocks == 1) {
		/* height == 0 */
		nv_free(sb, root);
		return 0;
	}
	eufs_free_recursive_btree(sb, (__le64 *)root, height - 1, blocks);
	return 0;
}

int eufs_persist_btree_node(void *root, int sta, int len)
{
	BUG_ON(len > EUFS_FILE_TREE_DEGREE);
	BUG_ON(len < 0);
	BUG_ON(sta + len > EUFS_FILE_TREE_DEGREE);
	BUG_ON(sta + len < 0);
	if (len == 0)
		return 0;
	eufs_ptr_fast_check(root);
	eufs_flush_range(((void **)root) + sta, len * sizeof(void *));
	return 0;
}
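/*
 * Index-helper conventions used by the persist routines below (inferred
 * from their use here; illustrative, assuming a tree degree of 512): for a
 * block count bcnt in a height-2 tree, EUFS_H2_INDEX_IN_L0(bcnt) is the
 * root-level (L0) slot where the next block would go and
 * EUFS_H2_INDEX_IN_L1(bcnt) is its slot within that child; e.g. bcnt == 513
 * yields L0 index 1 and L1 index 1.  The H3 variants do the same for the
 * three levels of a height-3 tree.
 */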
static void eufs_persist_btree_h2_subtree(struct super_block *sb, void *root,
					  int start0, int idx0, int idx1)
{
	__le64 *proot = root;
	int i;
	void *p;

	for (i = start0; i < idx0; ++i) {
		BUG_ON(proot[i] == 0);
		p = s2p(sb, proot[i]);
		eufs_ptr_fast_check(p);
		eufs_persist_btree_node(p, 0, EUFS_FILE_TREE_DEGREE);
	}
	/*
	 * According to the WARN_ON in eufs_persist_new_btree_h2,
	 * idx0 < EUFS_FILE_TREE_DEGREE if idx1 != 0. So the following code
	 * is safe.
	 */
	if (idx1 != 0) {
		p = s2p(sb, proot[idx0]);
		eufs_persist_btree_node(p, 0, idx1);
	}
}

static void eufs_persist_btree_h2_root(void *root, int start0, int idx0,
				       int idx1)
{
	int cnt = idx0 - start0;

	/*
	 * idx1 is the L1 index of the next block, so when it is not equal
	 * to 0, node[idx0] also needs persistence.
	 */
	if (idx1 != 0)
		cnt++;
	eufs_persist_btree_node(root, start0, cnt);
}

static void eufs_persist_new_btree_h2_by_idx(struct super_block *sb,
					     void *root, int start0, int idx0,
					     int idx1)
{
	eufs_persist_btree_h2_subtree(sb, root, start0, idx0, idx1);
	/* It's a new btree, so persist the whole root node */
	eufs_persist_btree_h2_root(root, 0, idx0, idx1);
}

static void eufs_persist_new_btree_h2(struct super_block *sb, void *root,
				      int start0, unsigned long bcnt)
{
	/* the L0/L1 index of the next block in a new tree with height 2 */
	int idx0 = EUFS_H2_INDEX_IN_L0(bcnt);
	int idx1 = EUFS_H2_INDEX_IN_L1(bcnt);

	/*
	 * Notice a corner case: bcnt == EUFS_FILE_BCNT_WITH_HEIGHT(2), in
	 * which (idx0 == EUFS_FILE_TREE_DEGREE && idx1 == 0)
	 */
	WARN_ON(idx0 == EUFS_FILE_TREE_DEGREE && idx1);

	eufs_persist_new_btree_h2_by_idx(sb, root, start0, idx0, idx1);
}

static void eufs_persist_inc_btree_h2_by_idx(struct super_block *sb,
					     void *root, int old_idx0,
					     int old_idx1, int new_idx0,
					     int new_idx1)
{
	__le64 *proot = root;
	void *p;
	int start;

	p = s2p(sb, proot[old_idx0]);
	if (old_idx0 == new_idx0) {
		if (old_idx0 == EUFS_FILE_TREE_DEGREE)
			return;

		eufs_persist_btree_node(p, old_idx1, new_idx1 - old_idx1);
		/* node[old_idx0] needs persistence */
		if (!old_idx1)
			eufs_persist_btree_node(root, old_idx0, 1);
		return;
	}

	eufs_persist_btree_node(p, old_idx1,
				EUFS_FILE_TREE_DEGREE - old_idx1);
	eufs_persist_btree_h2_subtree(sb, root, old_idx0 + 1, new_idx0,
				      new_idx1);

	start = old_idx0;
	/* if old_idx1 is not 0, root[start] must have already been persisted */
	if (old_idx1)
		start++;
	eufs_persist_btree_h2_root(root, start, new_idx0, new_idx1);
}

static void eufs_persist_inc_btree_h2(struct super_block *sb, void *root,
				      unsigned long old_bcnt,
				      unsigned long new_bcnt)
{
	/* the L0/L1 index of the next block in the tree */
	int old_idx0 = EUFS_H2_INDEX_IN_L0(old_bcnt);
	int old_idx1 = EUFS_H2_INDEX_IN_L1(old_bcnt);
	int new_idx0 = EUFS_H2_INDEX_IN_L0(new_bcnt);
	int new_idx1 = EUFS_H2_INDEX_IN_L1(new_bcnt);

	/*
	 * Notice a corner case: bcnt == EUFS_FILE_BCNT_WITH_HEIGHT(2), in
	 * which (idx0 == EUFS_FILE_TREE_DEGREE && idx1 == 0)
	 */
	WARN_ON(old_idx0 == EUFS_FILE_TREE_DEGREE && old_idx1);
	WARN_ON(new_idx0 == EUFS_FILE_TREE_DEGREE && new_idx1);

	eufs_persist_inc_btree_h2_by_idx(sb, root, old_idx0, old_idx1,
					 new_idx0, new_idx1);
}

static void eufs_persist_new_btree_h3(struct super_block *sb, void *root,
				      int start0, unsigned long bcnt_left)
{
	int i;
	unsigned long left = bcnt_left;
	__le64 *pproot = root;

	for (i = start0; i < EUFS_FILE_TREE_DEGREE; ++i) {
		__le64 *proot = s2p(sb, pproot[i]);
		int j;

		for (j = 0; j < EUFS_FILE_TREE_DEGREE; ++j) {
			void *p = s2p(sb, proot[j]);

			if (left >= EUFS_FILE_TREE_DEGREE) {
				eufs_persist_btree_node(
					p, 0, EUFS_FILE_TREE_DEGREE);
				left -= EUFS_FILE_TREE_DEGREE;
			} else {
				eufs_persist_btree_node(p, 0, left);
				left = 0;
				j++;
				break;
			}
		}
		eufs_persist_btree_node(proot, 0, j);
		if (!left) {
			i++;
			break;
		}
	}
	eufs_persist_btree_node(root, 0, i);
}
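/*
 * Incremental persistence for a height-3 tree (case summary added for
 * clarity): if the old and new "next block" positions share both their L0
 * and L1 slots, a single leaf-node range is flushed; if they share only the
 * L0 slot, the problem reduces to the height-2 incremental case within that
 * child; otherwise the partially filled old child is completed, fully new
 * children are persisted wholesale, and the partially filled last child is
 * handled separately.
 */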
static void eufs_persist_inc_btree_h3(struct super_block *sb, void *root,
				      unsigned long old_bcnt,
				      unsigned long new_bcnt)
{
	/* The L0/L1/L2 position of the next block in the tree */
	int o_idx0 = EUFS_H3_INDEX_IN_L0(old_bcnt);
	int o_idx1 = EUFS_H3_INDEX_IN_L1(o_idx0, old_bcnt);
	int o_idx2 = EUFS_H3_INDEX_IN_L2(old_bcnt);
	int n_idx0 = EUFS_H3_INDEX_IN_L0(new_bcnt);
	int n_idx1 = EUFS_H3_INDEX_IN_L1(n_idx0, new_bcnt);
	int n_idx2 = EUFS_H3_INDEX_IN_L2(new_bcnt);
	__le64 *pproot = root;
	__le64 *proot;
	void *p;
	int i;

	if (o_idx0 == n_idx0 && o_idx1 == n_idx1) {
		/* persist from the bottom up */
		proot = s2p(sb, pproot[o_idx0]);
		p = s2p(sb, proot[o_idx1]);
		eufs_persist_btree_node(p, o_idx2, n_idx2 - o_idx2);
		/* node[o_idx1] needs persistence */
		if (!o_idx2) {
			eufs_persist_btree_node(proot, o_idx1, 1);
			/* node[o_idx0] needs persistence */
			if (!o_idx1)
				eufs_persist_btree_node(root, o_idx0, 1);
		}
		return;
	}

	if (o_idx0 == n_idx0) {
		proot = s2p(sb, pproot[o_idx0]);
		eufs_persist_inc_btree_h2_by_idx(sb, proot, o_idx1, o_idx2,
						 n_idx1, n_idx2);
		/* node[o_idx0] needs persistence */
		if (!o_idx1 && !o_idx2)
			eufs_persist_btree_node(root, o_idx0, 1);
		return;
	}

	/*
	 * A corner case: o_idx1 == EUFS_FILE_TREE_DEGREE && o_idx2 == 0. This
	 * can be handled in the function eufs_persist_inc_btree_h2_by_idx, but
	 * we still check it here for efficiency.
	 */
	if (o_idx1 < EUFS_FILE_TREE_DEGREE) {
		proot = s2p(sb, pproot[o_idx0]);
		eufs_persist_inc_btree_h2_by_idx(sb, proot, o_idx1, o_idx2,
						 EUFS_FILE_TREE_DEGREE, 0);
	} else {
		WARN_ON(o_idx2 != 0);
	}

	for (i = o_idx0 + 1; i < n_idx0; ++i) {
		proot = s2p(sb, pproot[i]);
		eufs_persist_new_btree_h2_by_idx(sb, proot, 0,
						 EUFS_FILE_TREE_DEGREE, 0);
	}

	if (n_idx1 || n_idx2) {
		proot = s2p(sb, pproot[n_idx0]);
		eufs_persist_new_btree_h2_by_idx(sb, proot, 0, n_idx1, n_idx2);
		/* root[n_idx0] needs to be persisted */
		n_idx0++;
	}
	/* root[o_idx0] has been persisted */
	if (o_idx1 || o_idx2)
		o_idx0++;
	eufs_persist_btree_node(root, o_idx0, n_idx0 - o_idx0);
}
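/*
 * eufs_persist_btree() below dispatches on the heights implied by the old
 * and new block counts (summary added for clarity): growth within one
 * height uses the plain or incremental flush for that height; growth across
 * a height boundary first completes the old sub-tree to its full capacity
 * and then persists the newly added sub-trees.  Shrinking is intentionally
 * not handled here.
 */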
/* Only structure persistency is needed */
int eufs_persist_btree(struct super_block *sb, void *root, int height,
		       u64 old_size, u64 new_size)
{
	unsigned long old_nblocks, new_nblocks;
	__le64 *proot;
	__le64 *pproot;

	if (old_size == 0)
		old_size = 1; /* at least one block */

	NV_ASSERT(!(height < 0 || height > EUFS_MAX_FILE_TREE_HEIGHT));
	if (!root)
		return 0;
	/* persisting a shrink is not supported */
	if (old_size > new_size)
		return 0;

	old_nblocks = DIV_ROUND_UP(old_size, PAGE_SIZE);
	new_nblocks = DIV_ROUND_UP(new_size, PAGE_SIZE);
	if (old_nblocks == new_nblocks)
		return 0;

	proot = root;
	if (old_nblocks == 1) {
		/* data do not need flush */
		if (new_nblocks == 1)
			return 0;
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
			eufs_persist_btree_node(root, 0, new_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
			eufs_persist_new_btree_h2(sb, root, 0, new_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
			eufs_persist_new_btree_h3(sb, root, 0, new_nblocks);
			return 0;
		}
	} else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(1)) {
			eufs_persist_btree_node(root, old_nblocks,
						new_nblocks - old_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
			__le64 *p = s2p(sb, proot[0]);

			eufs_persist_btree_node(p, old_nblocks,
						EUFS_FILE_TREE_DEGREE -
							old_nblocks);
			eufs_persist_new_btree_h2(sb, proot, 1, new_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
			void *p;

			pproot = root;
			proot = s2p(sb, pproot[0]);
			p = s2p(sb, proot[0]);
			eufs_persist_btree_node(p, old_nblocks,
						EUFS_FILE_TREE_DEGREE -
							old_nblocks);
			eufs_persist_new_btree_h2(
				sb, proot, 1, EUFS_FILE_BCNT_WITH_HEIGHT(2));
			eufs_persist_new_btree_h3(
				sb, root, 1,
				new_nblocks - EUFS_FILE_BCNT_WITH_HEIGHT(2));
			return 0;
		}
	} else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(2)) {
			eufs_persist_inc_btree_h2(sb, root, old_nblocks,
						  new_nblocks);
			return 0;
		}
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
			pproot = root;
			proot = s2p(sb, pproot[0]);
			eufs_persist_inc_btree_h2(
				sb, proot, old_nblocks,
				EUFS_FILE_BCNT_WITH_HEIGHT(2));
			eufs_persist_new_btree_h3(
				sb, root, 1,
				new_nblocks - EUFS_FILE_BCNT_WITH_HEIGHT(2));
			return 0;
		}
	} else if (old_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
		if (new_nblocks <= EUFS_FILE_BCNT_WITH_HEIGHT(3)) {
			eufs_persist_inc_btree_h3(sb, root, old_nblocks,
						  new_nblocks);
			return 0;
		}
	}
	BUG();
	return 0;
}

static ssize_t do_mapping_read(struct address_space *mapping,
			       struct file_ra_state *_ra, struct file *filp,
			       char __user *buf, size_t len, loff_t *ppos)
{
	struct inode *inode = mapping->host;
	pgoff_t index, end_index;
	unsigned long offset;
	loff_t isize, pos;
	size_t copied = 0, error = 0;

	pos = *ppos;
	index = pos >> PAGE_SHIFT;
	offset = pos & ~PAGE_MASK;

	isize = i_size_read(inode);
	if (!isize)
		goto out;

	end_index = (isize - 1) >> PAGE_SHIFT;
	do {
		unsigned long nr, left;
		void *xmem;

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_SIZE;
		if (index >= end_index) {
			if (index > end_index)
				goto out;
			nr = ((isize - 1) & ~PAGE_MASK) + 1;
			if (nr <= offset)
				goto out;
		}
		nr = nr - offset;
		if (nr > len - copied)
			nr = len - copied;

		xmem = eufs_find_data_block_btree(inode, index, NULL);
		BUG_ON(!eufs_access_ok(inode->i_sb, xmem, PAGE_SIZE));
		if (unlikely(!xmem))
			BUG();

		/*
		 * Ok, we have the mem, so now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		if (xmem != NULL_ADDR_PTR)
			left = __copy_to_user(buf + copied, xmem + offset, nr);
		else
			left = __clear_user(buf + copied, nr);

		if (left) {
			error = -EFAULT;
			goto out;
		}

		copied += (nr - left);
		offset += (nr - left);
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
	} while (copied < len);
out:
	*ppos = pos + copied;
	if (filp)
		file_accessed(filp);
	return copied ? copied : error;
}
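/*
 * Note (added for clarity): in do_mapping_read() above, a NULL_ADDR_PTR
 * slot denotes a hole, so the routine zero-fills the user buffer with
 * __clear_user() instead of copying from persistent memory; no page cache
 * is involved on this DAX read path.
 */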
/*
 * Wrappers. We need to use the rcu read lock to avoid
 * concurrent truncate operations. No problem for write because we hold
 * i_mutex.
 */
ssize_t eufs_file_read(struct file *filp, char __user *buf, size_t len,
		       loff_t *ppos)
{
	ssize_t res;

	inode_lock_shared(file_inode(filp));
	if (!access_ok(buf, len))
		res = -EFAULT;
	else
		res = do_mapping_read(filp->f_mapping, &filp->f_ra, filp, buf,
				      len, ppos);
	inode_unlock_shared(file_inode(filp));
	return res;
}

static __always_inline size_t memcpy_to_nvmm(char *kmem, loff_t offset,
					     const char __user *buf,
					     size_t bytes)
{
	size_t copied;

	if (support_clwb && !force_nocache_write) {
		copied = bytes - __copy_from_user(kmem + offset, buf, bytes);
		eufs_flush_buffer(kmem + offset, copied, 0);
	} else {
		copied = bytes - __copy_from_user_inatomic_nocache(
					 kmem + offset, buf, bytes);
	}
	return copied;
}
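/*
 * Note on the wear-leveling path in __eufs_file_write_inode() below
 * (summary added for clarity): when wear_inc() reports that a data block
 * has crossed its wear threshold, the write allocates a replacement block,
 * copies the old contents into it, swaps the parent pointer while page
 * faults are fenced off by mmap_rwsem, and only then frees the old block.
 * Persisting the allocation batch before nv_free_rest() prevents the freed
 * block from later being replayed as "allocated" while it sits in the free
 * list.
 */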
ssize_t __eufs_file_write_inode(struct inode *inode, const char __user *buf,
				size_t count, loff_t pos, loff_t *ppos,
				bool zero, bool keep)
{
	long status = 0;
	size_t bytes;
	ssize_t written = 0;

	if (!count)
		return 0;

	eufs_dbg("file write: inode=%px count=%lx pos=%llx, zero=%d keep=%d\n",
		 inode, count, pos, zero, keep);
	do {
		unsigned long index;
		unsigned long offset;
		size_t copied;
		__le64 *parent;
		void __pmem *xmem;
		void __pmem *xmem_new = NULL;

		offset = (pos & (PAGE_SIZE - 1)); /* Within page */
		index = pos >> PAGE_SHIFT;
		bytes = PAGE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		xmem = eufs_find_data_block_btree(inode, index, &parent);
		if (!eufs_access_ok(inode->i_sb, xmem, PAGE_SIZE)) {
			dump_stack();
			BUG();
		}

		/* do no wear-leveling for 0-level btrees */
		if (xmem != NULL_ADDR_PTR && parent && !zero) {
			/* wear threshold! */
			if (!wear_inc(inode->i_sb, xmem))
				xmem_new = eufs_malloc_file_data(inode->i_sb);
		}

		if (zero) {
			copied = bytes;
			if (xmem != NULL_ADDR_PTR)
				eufs_clear_pmem((char *)xmem + offset, bytes);
		} else {
			BUG_ON(xmem == NULL_ADDR_PTR);
			copied = memcpy_to_nvmm((char *)xmem, offset, buf,
						bytes);
		}

		if (xmem_new) {
			struct eufs_inode_info *vi = EUFS_I(inode);

			eufs_dbg("inode=%px pos=%llx xmem:[%px -> %px] worn\n",
				 inode, pos, xmem, xmem_new);
			eufs_alloc_persist(inode->i_sb, xmem_new, true);
			WARN_ON(xmem !=
				s2p(inode->i_sb,
				    parent[index % EUFS_FILE_TREE_DEGREE]));
			/*
			 * disable page faults, clear all related PTEs, and
			 * remove the dax entry from the radix tree before
			 * replacing the old block
			 */
			down_write(&vi->mmap_rwsem);
			invalidate_inode_pages2_range(inode->i_mapping,
						      pos / PAGE_SIZE,
						      pos / PAGE_SIZE);
			memcpy_to_nvmm(xmem_new, 0, xmem, PAGE_SIZE);
			parent[index % EUFS_FILE_TREE_DEGREE] =
				p2s(inode->i_sb, xmem_new);
			up_write(&vi->mmap_rwsem);

			eufs_flush_cacheline(
				&parent[index % EUFS_FILE_TREE_DEGREE]);
			eufs_pbarrier();
			/*
			 * It is important to persist all previous allocations
			 * here. Otherwise, xmem might be freed before its
			 * information is handled in the page_batch, which will
			 * cause xmem to be marked as allocated (page_batch
			 * does this) when it is in the free list.
			 * xfstests/generic/299 can trigger this.
			 */
			eufs_alloc_batch_persist_reset(
				inode->i_sb, &EUFS_I(inode)->page_batch);
			nv_free_rest(inode->i_sb, xmem);
		}

		eufs_dbg("! file writing to pos=%ld xmem=%px, offset=%ld, buf=%px, bytes=%ld index=%ld, copied=%ld\n",
			 (long)pos, xmem, (long)offset, buf, (long)bytes,
			 (long)index, (long)copied);

		if (likely(copied > 0)) {
			written += copied;
			count -= copied;
			pos += copied;
			buf += copied;
		}
		if (unlikely(copied != bytes)) {
			status = -EFAULT;
			break;
		}
	} while (count);

	if (ppos)
		*ppos = pos;

	eufs_dbg("pos: %d inode->i_size: %d written: %d\n", (int)pos,
		 (int)inode->i_size, (int)written);
	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 */
	if (!keep && pos > inode->i_size)
		eufs_inode_size_write(inode, pos);

	return written ? written : status;
}

ssize_t __eufs_file_write(struct address_space *mapping,
			  const char __user *buf, size_t count, loff_t pos,
			  loff_t *ppos, bool zero, bool keep)
{
	return __eufs_file_write_inode(mapping->host, buf, count, pos, ppos,
				       zero, keep);
}
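/*
 * Note (added for clarity): eufs_file_write() below holds i_rwsem for the
 * whole write but takes mmap_rwsem only around block allocation and tail
 * zeroing.  Holding mmap_rwsem across the user-buffer copy could deadlock
 * when the source buffer is an mmap of the same file, since the resulting
 * page fault takes mmap_rwsem too; see the comments in the function body.
 */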
ssize_t eufs_file_write(struct file *filp, const char __user *buf, size_t len,
			loff_t *ppos)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct eufs_inode_info *vi = EUFS_I(inode);
	struct super_block *sb = inode->i_sb;
	ssize_t written = 0;
	loff_t pos;
	size_t count, ret;
	bool osync = false;

	inode_lock(inode);

	if (!access_ok(buf, len)) {
		ret = -EFAULT;
		goto out;
	}
	if (filp->f_flags & O_APPEND)
		pos = inode->i_size;
	else
		pos = *ppos;
	if (filp->f_flags & __O_SYNC)
		osync = true;
	count = len;
	if (count == 0) {
		ret = 0;
		goto out;
	}

	NV_ASSERT(sb->s_blocksize == PAGE_SIZE);

	ret = file_remove_privs(filp);
	if (ret)
		goto out;

	inode->i_ctime = inode->i_mtime = current_time(inode);

	/*
	 * It's a little tricky here. We should use mmap_rwsem to protect
	 * the block allocation and i_size update, but mmap_rwsem can not
	 * be taken during block writing because that will lead to
	 * dead-lock. We only use mmap_rwsem to protect the block allocation,
	 * and there are two reasons we can do that:
	 * 1. mmap fault takes mmap_rwsem before reading i_size, so it
	 *    can not read the updated i_size before the allocation is done.
	 * 2. write only extends the block tree, and will not remove or
	 *    modify the existing block mappings.
	 */
	down_write(&vi->mmap_rwsem);
	/*
	 * Possible cases for writing [pos~pos+len)
	 *
	 * Definitions
	 *  EOF: the byte after the last valid byte
	 *  EOF-page: the page that contains EOF
	 *  first: the page pos belongs to
	 *  last: the page pos+len belongs to
	 *  Macro EOP(p): the last byte of p's page
	 *
	 * IMPORTANT NOTICE: we do not guarantee that [EOF~EOP(EOF)] are
	 * zeroed! When we mmap a file, we will erase that (in DRAM) in the
	 * mmap syscall. This can concurrently happen with a write syscall
	 * which may cause consistency problems (especially when it's an
	 * append). Concurrent mmap-access and read-/write-access should be
	 * protected by the application.
	 *
	 * 1) EOF-page | first | last
	 *    area-to-zero: [EOF~EOP(EOF)]
	 * 2) EOF-page=first | last
	 *    area-to-zero: [EOF~pos) if EOF < pos
	 */
	ret = eufs_alloc_blocks_btree_for_write(inode, pos, count);
	if (ret) {
		up_write(&vi->mmap_rwsem);
		goto out;
	}

	/* If we decide to guarantee a zeroed file tail, we may use this snippet. */
	/* zeroing the part of the last block that goes beyond the new EOF */
	if (PAGE_ALIGN(pos + count) > PAGE_ALIGN(inode->i_size))
		eufs_inode_zero_range(inode, pos + count,
				      PAGE_ALIGN(pos + count));

	/*
	 * zeroing the hole created by the write.
	 * part of the hole is included in the last page that exceeds EOF,
	 * and it has already been zeroed, so only zero the remaining part.
	 */
	if (pos > inode->i_size) {
		loff_t offset = inode->i_size & (PAGE_SIZE - 1);

		if (offset || !inode->i_size) {
			/*
			 * Zero EOF~EOP(EOF).
			 * This also satisfies case 2), since [EOP(EOF)+1~pos]
			 * are holes.
			 */
			eufs_inode_zero_range_len(inode, inode->i_size,
						  PAGE_SIZE - offset);
		}
	}
	up_write(&vi->mmap_rwsem);

	written = __eufs_file_write(mapping, buf, count, pos, ppos, false,
				    false);
	if (written < 0 || written != count) {
		eufs_dbg("write incomplete/failed: written %ld len %ld pos %llx\n",
			 written, count, pos);
	}

	if (osync) {
		eufs_alloc_batch_persist_reset(sb, &EUFS_I(inode)->page_batch);
		eufs_sync_pinode(inode, EUFS_PI(inode), false);
	} else {
		request_persistence(inode);
	}

	ret = written;
out:
	inode_unlock(inode);
	return ret;
}

static int eufs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
			    unsigned int flags, struct iomap *iomap,
			    struct iomap *src)
{
	struct super_block *sb = inode->i_sb;
	struct eufs_sb_info *sbi = sb->s_fs_info;
	unsigned int blkbits = inode->i_blkbits;
	unsigned long first_block = offset >> blkbits;
	bool new = false;
	void *__pmem xmem;
	__le64 *__pmem parent;

	eufs_dbg("fault: inode=%px addr=0x%llx rw=%d length=%lld\n", inode,
		 offset, flags & IOMAP_WRITE, length);

	inode_leaf_lock(inode);
	xmem = eufs_find_data_block_btree(inode, first_block, &parent);
	/* allocate a new block for write */
	if (xmem == NULL_ADDR_PTR && (flags & IOMAP_WRITE)) {
		int ofs = first_block & (EUFS_FILE_TREE_DEGREE - 1);

		/*
		 * We cannot use the normal allocation here because it can
		 * send IPIs to gather pages and blocks. So here we need to
		 * use the non-blocking version, which uses reserved pages
		 * instead of gathering pages by IPI.
		 */
		xmem = eufs_zalloc_file_data(inode->i_sb);
		if (!xmem) {
			inode_leaf_unlock(inode);
			return -ENOSPC;
		}
		eufs_alloc_persist(inode->i_sb, xmem, false);
		/*
		 * the first block is preallocated during inode
		 * initialization, so parent should not be NULL when xmem is
		 * NULL_ADDR
		 */
		BUG_ON(!parent);
		eufs_flush_page(xmem);
		parent[ofs] = p2s(sb, xmem);
		eufs_flush_cacheline(&parent[ofs]);
		new = true;
	}
	inode_leaf_unlock(inode);

	iomap->flags = 0;
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = (u64)first_block << blkbits;
	iomap->dax_dev = sbi->s_dax_dev;
	iomap->length = 1 << blkbits;

	if (xmem == NULL_ADDR_PTR) {
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
	} else {
		iomap->type = IOMAP_MAPPED;
		iomap->addr = (xmem - sbi->virt_addr);
	}

	if (new)
		iomap->flags |= IOMAP_F_NEW;
	return 0;
}

static int eufs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
			  ssize_t written, unsigned int flags,
			  struct iomap *iomap)
{
	return 0;
}

const struct iomap_ops eufs_iomap_ops = {
	.iomap_begin = eufs_iomap_begin,
	.iomap_end = eufs_iomap_end,
};
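/*
 * Note (added for clarity): iomap->addr set above is the byte offset of the
 * block within the persistent-memory region (xmem minus the region's virtual
 * base).  The DAX iomap code combines it with s_dax_dev to map the block
 * directly into user page tables, bypassing the page cache.
 */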
static unsigned int eufs_dax_fault(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct eufs_inode_info *vi = EUFS_I(inode);
	int ret;

	if (vmf->flags & FAULT_FLAG_WRITE) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	/*
	 * i_size and the block tree must be consistent during mmap fault,
	 * else eulerfs may map to a freed block or a hole instead of an
	 * allocated block.
	 *
	 * Now i_rwsem is used to protect against the update of i_size and
	 * the block tree, but it can NOT be used in the mmap fault path,
	 * because an mmap fault may be triggered in the middle of a
	 * write or read operation when the dst or src buffer is a mapped
	 * range of the same file, and that will lead to dead-lock due to
	 * two acquisitions of the same lock (i_rwsem).
	 *
	 * So mmap_rwsem is provided. The read-lock is used in the mmap
	 * fault path, and the write-lock is used in the truncate &
	 * fallocate & write paths.
	 */
	down_read(&vi->mmap_rwsem);
	ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ret, &eufs_iomap_ops);
	up_read(&vi->mmap_rwsem);

	if (vmf->flags & FAULT_FLAG_WRITE)
		sb_end_pagefault(inode->i_sb);

	return ret;
}

const struct vm_operations_struct eufs_file_vm_ops = {
	.fault = eufs_dax_fault,
	.page_mkwrite = eufs_dax_fault,
	.pfn_mkwrite = eufs_dax_fault,
};

int eufs_dax_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_flags |= VM_MIXEDMAP;
	vma->vm_ops = &eufs_file_vm_ops;
	eufs_dbg("dax file mmapped!\n");
	return 0;
}

static loff_t eufs_seek_block(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t maxbytes = inode->i_sb->s_maxbytes;
	pgoff_t pgofs;
	loff_t data_ofs = offset, isize;
	__le64 *parent;
	void *addr;
	unsigned int ofs;

	inode_lock(inode);

	isize = i_size_read(inode);
	if (offset >= isize)
		goto fail;

	pgofs = (pgoff_t)(offset >> PAGE_SHIFT);

	if (EUFS_I(inode)->hole_at_sta && pgofs == 0) {
		if (whence == SEEK_HOLE)
			goto found;
		pgofs++;
		data_ofs = (loff_t)pgofs << PAGE_SHIFT;
	}

	while (data_ofs < isize) {
		addr = eufs_find_data_block_btree(inode, pgofs, &parent);
		ofs = pgofs & (EUFS_FILE_TREE_DEGREE - 1);
		while (ofs < EUFS_FILE_TREE_DEGREE && data_ofs < isize) {
			if (parent)
				addr = s2p(inode->i_sb, parent[ofs]);
			if (addr == NULL_ADDR_PTR && whence == SEEK_HOLE)
				goto found;
			if (addr && addr != NULL_ADDR_PTR &&
			    whence == SEEK_DATA)
				goto found;
			ofs++;
			pgofs++;
			data_ofs = (loff_t)pgofs << PAGE_SHIFT;
		}
	}
	if (whence == SEEK_DATA)
		goto fail;
found:
	if (whence == SEEK_HOLE && data_ofs > isize)
		data_ofs = isize;
	inode_unlock(inode);
	return vfs_setpos(file, data_ofs, maxbytes);
fail:
	inode_unlock(inode);
	return -ENXIO;
}

loff_t eufs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t maxbytes = inode->i_sb->s_maxbytes;

	switch (whence) {
	case SEEK_SET:
	case SEEK_CUR:
	case SEEK_END:
		return generic_file_llseek_size(file, offset, whence, maxbytes,
						i_size_read(inode));
	case SEEK_DATA:
	case SEEK_HOLE:
		if (offset < 0)
			return -ENXIO;
		return eufs_seek_block(file, offset, whence);
	}
	return -EINVAL;
}