/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#ifndef BTRFS_INODE_H
#define BTRFS_INODE_H

#include <linux/hash.h>
#include <linux/refcount.h>
#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"
#include "delayed-inode.h"

/*
 * Directory entries are looked up by f_pos (struct dir_context::pos).  The
 * '.' and '..' entries have f_pos 0 and 1 respectively, so every other entry
 * has to start at index 2 (see btrfs_real_readdir() and dir_emit_dots()).
 */
#define BTRFS_DIR_START_INDEX 2

/*
 * Bit numbers used with set_bit()/test_bit() on btrfs_inode::runtime_flags.
 * The enumerators are implicitly numbered from 0, so their order defines the
 * bit positions.
 */
enum {
	/*
	 * Set by truncate when a file that used to have good data has been
	 * truncated to zero.  When it is set the btrfs file release call will
	 * add this inode to the ordered operations list so that we make sure
	 * to flush out any new data the application may have written before
	 * commit.
	 *
	 * NOTE(review): this text used to sit above the enum and named the
	 * flag "ordered_data_close"; it appears to describe this bit - confirm.
	 */
	BTRFS_INODE_FLUSH_ON_CLOSE,
	BTRFS_INODE_DUMMY,
	BTRFS_INODE_IN_DEFRAG,
	BTRFS_INODE_HAS_ASYNC_EXTENT,
	/*
	 * Always set under the VFS' inode lock, otherwise it can cause races
	 * during fsync (we start as a fast fsync and then end up in a full
	 * fsync racing with ordered extent completion).
	 */
	BTRFS_INODE_NEEDS_FULL_SYNC,
	BTRFS_INODE_COPY_EVERYTHING,
	BTRFS_INODE_IN_DELALLOC_LIST,
	BTRFS_INODE_HAS_PROPS,
	BTRFS_INODE_SNAPSHOT_FLUSH,
	/*
	 * Set and used when logging an inode and it serves to signal that an
	 * inode does not have xattrs, so subsequent fsyncs can avoid searching
	 * for xattrs to log. This bit must be cleared whenever a xattr is added
	 * to an inode.
	 */
	BTRFS_INODE_NO_XATTRS,
	/*
	 * Set when we are in a context where we need to start a transaction and
	 * have dirty pages with the respective file range locked. This is to
	 * ensure that when reserving space for the transaction, if we are low
	 * on available space and need to flush delalloc, we will not flush
	 * delalloc for this inode, because that could result in a deadlock (on
	 * the file range, inode's io_tree).
	 */
	BTRFS_INODE_NO_DELALLOC_FLUSH,
	/*
	 * Set when we are working on enabling verity for a file. Computing and
	 * writing the whole Merkle tree can take a while so we want to prevent
	 * races where two separate tasks attempt to simultaneously start verity
	 * on the same file.
	 */
	BTRFS_INODE_VERITY_IN_PROGRESS,
	/* Set when this inode is a free space inode. */
	BTRFS_INODE_FREE_SPACE_INODE,
};

A
Aneesh 已提交
72
/* in memory btrfs inode */
C
Chris Mason 已提交
73
struct btrfs_inode {
C
Chris Mason 已提交
74
	/* which subvolume this inode belongs to */
75
	struct btrfs_root *root;
C
Chris Mason 已提交
76 77 78 79

	/* key used to find this inode on disk.  This is used by the code
	 * to read in roots of subvolumes
	 */
80
	struct btrfs_key location;
C
Chris Mason 已提交
81

82 83 84
	/*
	 * Lock for counters and all fields used to determine if the inode is in
	 * the log or not (last_trans, last_sub_trans, last_log_commit,
85 86
	 * logged_trans), to access/update new_delalloc_bytes and to update the
	 * VFS' inode number of bytes used.
87
	 */
88 89
	spinlock_t lock;

C
Chris Mason 已提交
90
	/* the extent_tree has caches of all the extent mappings to disk */
91
	struct extent_map_tree extent_tree;
C
Chris Mason 已提交
92 93

	/* the io_tree does range state (DIRTY, LOCKED etc) */
94
	struct extent_io_tree io_tree;
C
Chris Mason 已提交
95

96 97 98 99 100 101
	/*
	 * Keep track of where the inode has extent items mapped in order to
	 * make sure the i_size adjustments are accurate
	 */
	struct extent_io_tree file_extent_tree;

C
Chris Mason 已提交
102
	/* held while logging the inode in tree-log.c */
103
	struct mutex log_mutex;
C
Chris Mason 已提交
104 105

	/* used to order data wrt metadata */
106
	struct btrfs_ordered_inode_tree ordered_tree;
107

C
Chris Mason 已提交
108 109 110 111
	/* list of all the delalloc inodes in the FS.  There are times we need
	 * to write all the delalloc pages to disk, and this list is used
	 * to walk them all.
	 */
112 113
	struct list_head delalloc_inodes;

114 115 116
	/* node for the red-black tree that links inodes in subvolume root */
	struct rb_node rb_node;

117 118
	unsigned long runtime_flags;

119
	/* Keep track of who's O_SYNC/fsyncing currently */
120 121
	atomic_t sync_writers;

C
Chris Mason 已提交
122 123 124
	/* full 64 bit generation number, struct vfs_inode doesn't have a big
	 * enough field for this.
	 */
125 126
	u64 generation;

127 128 129 130
	/*
	 * transid of the trans_handle that last modified this inode
	 */
	u64 last_trans;
131 132

	/*
133
	 * transid that last logged this inode
134
	 */
135
	u64 logged_trans;
136

137
	/*
138
	 * log transid when this inode was last modified
139
	 */
140 141 142 143
	int last_sub_trans;

	/* a local copy of root's last_log_commit */
	int last_log_commit;
C
Chris Mason 已提交
144

145 146 147 148 149
	/*
	 * Total number of bytes pending delalloc, used by stat to calculate the
	 * real block usage of the file. This is used only for files.
	 */
	u64 delalloc_bytes;
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164

	union {
		/*
		 * Total number of bytes pending delalloc that fall within a file
		 * range that is either a hole or beyond EOF (and no prealloc extent
		 * exists in the range). This is always <= delalloc_bytes and this
		 * is used only for files.
		 */
		u64 new_delalloc_bytes;
		/*
		 * The offset of the last dir index key that was logged.
		 * This is used only for directories.
		 */
		u64 last_dir_index_offset;
	};
165

166 167 168 169 170 171
	/*
	 * total number of bytes pending defrag, used by stat to check whether
	 * it needs COW.
	 */
	u64 defrag_bytes;

C
Chris Mason 已提交
172 173 174 175 176
	/*
	 * the size of the file stored in the metadata on disk.  data=ordered
	 * means the in-memory i_size might be larger than the size on disk
	 * because not all the blocks are written yet.
	 */
177
	u64 disk_i_size;
C
Chris Mason 已提交
178

179
	/*
180 181 182
	 * If this is a directory then index_cnt is the counter for the index
	 * number for new files that are created. For an empty directory, this
	 * must be initialized to BTRFS_DIR_START_INDEX.
183 184
	 */
	u64 index_cnt;
C
Chris Mason 已提交
185

186 187 188
	/* Cache the directory index number to speed the dir/file remove */
	u64 dir_index;

189 190 191 192 193 194 195
	/* the fsync log has some corner cases that mean we have to check
	 * directories to see if any unlinks have been done before
	 * the directory was logged.  See tree-log.c for all the
	 * details
	 */
	u64 last_unlink_trans;

196 197 198 199 200 201 202 203 204 205 206
	/*
	 * The id/generation of the last transaction where this inode was
	 * either the source or the destination of a clone/dedupe operation.
	 * Used when logging an inode to know if there are shared extents that
	 * need special care when logging checksum items, to avoid duplicate
	 * checksum items in a log (which can lead to a corruption where we end
	 * up with missing checksum ranges after log replay).
	 * Protected by the vfs inode lock.
	 */
	u64 last_reflink_trans;

207 208 209 210 211 212
	/*
	 * Number of bytes outstanding that are going to need csums.  This is
	 * used in ENOSPC accounting.
	 */
	u64 csum_bytes;

213
	/* Backwards incompatible flags, lower half of inode_item::flags  */
214
	u32 flags;
215 216
	/* Read-only compatibility flags, upper half of inode_item::flags */
	u32 ro_flags;
217

J
Josef Bacik 已提交
218
	/*
219 220 221 222
	 * Counters to keep track of the number of extent item's we may use due
	 * to delalloc and such.  outstanding_extents is the number of extent
	 * items we think we'll end up using, and reserved_extents is the number
	 * of extent items we've reserved metadata for.
J
Josef Bacik 已提交
223
	 */
224
	unsigned outstanding_extents;
225 226

	struct btrfs_block_rsv block_rsv;
J
Josef Bacik 已提交
227

C
Chris Mason 已提交
228
	/*
229
	 * Cached values of inode properties
C
Chris Mason 已提交
230
	 */
231
	unsigned prop_compress;		/* per-file compression algorithm */
232 233 234 235 236
	/*
	 * Force compression on the file using the defrag ioctl, could be
	 * different from prop_compress and takes precedence if set
	 */
	unsigned defrag_compress;
C
Chris Mason 已提交
237

238 239
	struct btrfs_delayed_node *delayed_node;

240
	/* File creation time. */
241
	struct timespec64 i_otime;
242

243 244 245
	/* Hook into fs_info->delayed_iputs */
	struct list_head delayed_iput;

246
	struct rw_semaphore i_mmap_lock;
C
Chris Mason 已提交
247
	struct inode vfs_inode;
C
Chris Mason 已提交
248
};
249

250
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
C
Chris Mason 已提交
251 252 253 254
{
	return container_of(inode, struct btrfs_inode, vfs_inode);
}

255 256 257
static inline unsigned long btrfs_inode_hash(u64 objectid,
					     const struct btrfs_root *root)
{
258
	u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
259 260 261 262 263 264 265 266

#if BITS_PER_LONG == 32
	h = (h >> 32) ^ (h & 0xffffffff);
#endif

	return (unsigned long)h;
}

267 268 269 270 271 272
#if BITS_PER_LONG == 32

/*
 * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so
 * we use the inode's location objectid which is a u64 to avoid truncation.
 */
273
static inline u64 btrfs_ino(const struct btrfs_inode *inode)
L
Li Zefan 已提交
274
{
275
	u64 ino = inode->location.objectid;
L
Li Zefan 已提交
276

277 278
	/* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
	if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
279
		ino = inode->vfs_inode.i_ino;
L
Li Zefan 已提交
280 281 282
	return ino;
}

283 284 285 286 287 288 289 290 291
#else

static inline u64 btrfs_ino(const struct btrfs_inode *inode)
{
	return inode->vfs_inode.i_ino;
}

#endif

292
static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
293
{
294 295
	i_size_write(&inode->vfs_inode, size);
	inode->disk_i_size = size;
296 297
}

298
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
299
{
300
	return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
301 302
}

303 304 305 306 307
static inline bool is_data_inode(struct inode *inode)
{
	return btrfs_ino(BTRFS_I(inode)) != BTRFS_BTREE_INODE_OBJECTID;
}

J
Josef Bacik 已提交
308 309 310 311 312 313 314
static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
						 int mod)
{
	lockdep_assert_held(&inode->lock);
	inode->outstanding_extents += mod;
	if (btrfs_is_free_space_inode(inode))
		return;
315 316
	trace_btrfs_inode_mod_outstanding_extents(inode->root, btrfs_ino(inode),
						  mod);
J
Josef Bacik 已提交
317 318
}

319 320 321 322 323 324 325 326 327 328 329 330 331 332 333
/*
 * Called every time after doing a buffered, direct IO or memory mapped write.
 *
 * This is to ensure that if we write to a file that was previously fsynced in
 * the current transaction, then try to fsync it again in the same transaction,
 * we will know that there were changes in the file and that it needs to be
 * logged.
 */
static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
{
	spin_lock(&inode->lock);
	inode->last_sub_trans = inode->root->log_transid;
	spin_unlock(&inode->lock);
}

334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
/*
 * Should be called while holding the inode's VFS lock in exclusive mode or in a
 * context where no one else can access the inode concurrently (during inode
 * creation or when loading an inode from disk).
 */
static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode)
{
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
	/*
	 * The inode may have been part of a reflink operation in the last
	 * transaction that modified it, and then a fsync has reset the
	 * last_reflink_trans to avoid subsequent fsyncs in the same
	 * transaction to do unnecessary work. So update last_reflink_trans
	 * to the last_trans value (we have to be pessimistic and assume a
	 * reflink happened).
	 *
	 * The ->last_trans is protected by the inode's spinlock and we can
	 * have a concurrent ordered extent completion update it. Also set
	 * last_reflink_trans to ->last_trans only if the former is less than
	 * the later, because we can be called in a context where
	 * last_reflink_trans was set to the current transaction generation
	 * while ->last_trans was not yet updated in the current transaction,
	 * and therefore has a lower value.
	 */
	spin_lock(&inode->lock);
	if (inode->last_reflink_trans < inode->last_trans)
		inode->last_reflink_trans = inode->last_trans;
	spin_unlock(&inode->lock);
}

364
static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
365
{
366
	bool ret = false;
367

368 369 370
	spin_lock(&inode->lock);
	if (inode->logged_trans == generation &&
	    inode->last_sub_trans <= inode->last_log_commit &&
371 372
	    inode->last_sub_trans <= inode->root->last_log_commit)
		ret = true;
373
	spin_unlock(&inode->lock);
374
	return ret;
375 376
}

377 378 379 380 381 382 383 384 385 386 387
/*
 * Check if the inode has flags compatible with compression
 */
static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
{
	if (inode->flags & BTRFS_INODE_NODATACOW ||
	    inode->flags & BTRFS_INODE_NODATASUM)
		return false;
	return true;
}

/*
 * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
 * separate u32s. These two functions convert between the two representations.
 *
 * Combine: @flags becomes the lower 32 bits and @ro_flags the upper 32 bits
 * of the returned value.
 */
static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags)
{
	/* Widen before shifting so the upper half is not shifted out of a u32. */
	return (flags | ((u64)ro_flags << 32));
}

/*
 * Split a 64 bit inode item flags value into its two 32 bit halves: the lower
 * half is stored through @flags and the upper half through @ro_flags.
 */
static inline void btrfs_inode_split_flags(u64 inode_item_flags,
					   u32 *flags, u32 *ro_flags)
{
	const u64 low_mask = 0xffffffffULL;

	/* Store the lower half first, then the upper half. */
	*flags = (u32)(inode_item_flags & low_mask);
	*ro_flags = (u32)((inode_item_flags >> 32) & low_mask);
}

/*
 * Array of bytes with variable length, hexadecimal format 0x1234.
 *
 * CSUM_FMT is the format string fragment and CSUM_FMT_VALUE() expands to the
 * two matching arguments, in order: the byte count consumed by the '*' field
 * width, then the buffer pointer.  The macro arguments are parenthesized so
 * that expression arguments expand safely.
 */
#define CSUM_FMT				"0x%*phN"
#define CSUM_FMT_VALUE(size, bytes)		(size), (bytes)

408
void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
409
void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio,
410 411 412
			int mirror_num, enum btrfs_compression_type compress_type);
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
			    u32 pgoff, u8 *csum, const u8 * const csum_expected);
413
blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio);
414 415
bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
			u32 bio_offset, struct bio_vec *bv);
416 417 418 419 420 421 422 423 424 425 426 427 428
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
			      u64 *orig_start, u64 *orig_block_len,
			      u64 *ram_bytes, bool nowait, bool strict);

void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_inode *dir, struct btrfs_inode *inode,
		       const struct fscrypt_str *name);
int btrfs_add_link(struct btrfs_trans_handle *trans,
		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
		   const struct fscrypt_str *name, int add_backref, u64 index);
429
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
			 int front);

int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
			       bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
			      unsigned int extra_bits,
			      struct extent_state **cached_state);

/*
 * Argument bundle passed by pointer to btrfs_new_inode_prepare(),
 * btrfs_create_new_inode() and btrfs_new_inode_args_destroy().
 *
 * NOTE(review): the per-field meanings are not visible in this header; the
 * "Input"/"Output" grouping below is the only contract documented here.
 */
struct btrfs_new_inode_args {
	/* Input */
	struct inode *dir;
	struct dentry *dentry;
	struct inode *inode;
	/* presumably: the new inode is created as an orphan - TODO confirm */
	bool orphan;
	/* presumably: the new inode is a subvolume root directory - TODO confirm */
	bool subvol;

	/* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). */
	struct posix_acl *default_acl;
	struct posix_acl *acl;
	struct fscrypt_name fname;
};

int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
			    unsigned int *trans_num_items);
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
			   struct btrfs_new_inode_args *args);
void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
				     struct inode *dir);
461
 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
462
			        u32 bits);
463
void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
464
				 struct extent_state *state, u32 bits);
465
void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
466
				 struct extent_state *other);
467
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490
				 struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
void btrfs_evict_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
			      struct btrfs_root *root, struct btrfs_path *path);
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
				    struct page *page, size_t pg_offset,
				    u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
491
void btrfs_add_delayed_iput(struct btrfs_inode *inode);
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint);
int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint);
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
			     u64 start, u64 end, int *page_started,
			     unsigned long *nr_written, struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
					  struct page *page, u64 start,
					  u64 end, bool uptodate);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
					     int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
					  u64 file_offset, u64 disk_bytenr,
					  u64 disk_io_size,
					  struct page **pages);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
			   struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
			       const struct btrfs_ioctl_encoded_io_args *encoded);

ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
		       size_t done_before);
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
				  size_t done_before);

extern const struct dentry_operations btrfs_dentry_operations;

/*
 * Inode locking type flags, by default the exclusive lock is taken.
 *
 * NOTE(review): ENUM_BIT() is defined outside this header; presumably each
 * enumerator is a distinct single-bit mask so the values can be ORed together
 * and passed as the ilock_flags argument of btrfs_inode_lock() and
 * btrfs_inode_unlock() - confirm against the ENUM_BIT() definition.
 */
enum btrfs_ilock_type {
	ENUM_BIT(BTRFS_ILOCK_SHARED),
	ENUM_BIT(BTRFS_ILOCK_TRY),
	ENUM_BIT(BTRFS_ILOCK_MMAP),
};

/* Prototypes only; the definitions are not part of this header. */
int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
			      const u64 del_bytes);
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);

#endif /* BTRFS_INODE_H */