diff --git a/fs/Kconfig b/fs/Kconfig index 18f5a85b47c667a90e3cc8048a76c41fa85a7a8d..31cce5d88b1a001c69a600caf72cb99deb82ea3a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -501,6 +501,8 @@ config BTRFS_FS tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" depends on EXPERIMENTAL select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 7125716e142b4a75aa411b8b758c632877065cec..d2cf5a54a4b816d308ed7c452892fae27e009db1 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,7 +7,8 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - ref-cache.o export.o tree-log.o acl.o free-space-cache.o + ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o else # Normal Makefile diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 0000000000000000000000000000000000000000..c5470367ca5c2907d16a4580d5106e875d132b18 --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,454 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compat.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; +}; + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + struct bio *bio; + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_byte >> 9; + } + return bio; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). + * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) + bio_io_error(cb->orig_bio); + else + bio_endio(cb->orig_bio, 0); + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while(nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min(nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int page_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(sizeof(*cb), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + ret = btrfs_csum_file_bytes(root, inode, start, len); + BUG_ON(ret); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + while(bytes_left > 0) { + page = compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + atomic_inc(&cb->pending_bios); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + page_index++; + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long page_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + struct extent_map *em; + int ret; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + cb = kmalloc(sizeof(*cb), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + + cb->start = em->start; + compressed_len = em->block_len; + free_extent_map(em); + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (page_index = 0; page_index < nr_pages; page_index++) { + cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + } + cb->nr_pages = nr_pages; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (page_index = 0; page_index < nr_pages; page_index++) { + page = cb->compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + ret = btrfs_map_bio(root, READ, comp_bio, 0, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + atomic_inc(&cb->pending_bios); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + ret = btrfs_map_bio(root, READ, comp_bio, 0, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 0000000000000000000000000000000000000000..421f5b4aa7151d4704256222de8e342e8899259d --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); +void btrfs_zlib_exit(void); +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +#endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8559f39fd47fb8ed0165ffd6c686cfa146f618f4..793d8fdda24474a147edd4b520ba677a752b8404 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -400,10 +400,18 @@ struct btrfs_timespec { __le32 nsec; } __attribute__ ((__packed__)); -/* - * there is no padding here on purpose. If you want to extent the inode, - * make a new item type - */ +typedef enum { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LAST = 2, +} btrfs_compression_type; + +/* we don't understand any encryption methods right now */ +typedef enum { + BTRFS_ENCRYPTION_NONE = 0, + BTRFS_ENCRYPTION_LAST = 1, +} btrfs_encryption_type; + struct btrfs_inode_item { /* nfs style generation number */ __le64 generation; @@ -419,6 +427,7 @@ struct btrfs_inode_item { __le64 rdev; __le16 flags; __le16 compat_flags; + struct btrfs_timespec atime; struct btrfs_timespec ctime; struct btrfs_timespec mtime; @@ -454,8 +463,33 @@ struct btrfs_root_item { #define BTRFS_FILE_EXTENT_INLINE 1 struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + u8 compression; + u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ u8 type; + /* * disk space consumed by the extent, checksum blocks are included * in these numbers @@ -471,9 +505,11 @@ struct btrfs_file_extent_item { */ __le64 offset; /* - * the logical number of file blocks (no csums included) + * the logical number of file blocks (no csums included). This + * always reflects the size uncompressed and without encoding. */ __le64 num_bytes; + } __attribute__ ((__packed__)); struct btrfs_csum_item { @@ -814,6 +850,7 @@ struct btrfs_root { #define BTRFS_MOUNT_NOBARRIER (1 << 2) #define BTRFS_MOUNT_SSD (1 << 3) #define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -825,6 +862,7 @@ struct btrfs_root { #define BTRFS_INODE_NODATASUM (1 << 0) #define BTRFS_INODE_NODATACOW (1 << 1) #define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) #define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ ~BTRFS_INODE_##flag) #define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ @@ -1424,14 +1462,6 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; } -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - struct btrfs_item *e) -{ - unsigned long offset; - offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); - return btrfs_item_size(eb, e) - offset; -} - BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, disk_bytenr, 64); BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, @@ -1442,6 +1472,36 @@ BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, offset, 64); BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_file_extent_item *e) +{ + return btrfs_file_extent_ram_bytes(eb, e); +} + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); + return btrfs_item_size(eb, e) - offset; +} static inline struct btrfs_root *btrfs_sb(struct super_block *sb) { @@ -1745,10 +1805,11 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 objectid, u64 pos, u64 disk_offset, - u64 disk_num_bytes, - u64 num_bytes, u64 offset); + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, @@ -1758,6 +1819,8 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums); int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio); +int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, + u64 start, unsigned long len); struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1799,7 +1862,7 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name, int namelen); int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio); + size_t size, struct bio *bio, unsigned long bio_flags); unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0be044bb619499a8d588b4c2735bd1c402e9d79f..dc95f636a11ba0a58eb45624e565a8f8a82813e1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -83,6 +83,7 @@ struct async_submit_bio { extent_submit_bio_hook_t *submit_bio_hook; int rw; int mirror_num; + unsigned long bio_flags; struct btrfs_work work; }; @@ -115,6 +116,7 @@ struct extent_map *btree_get_extent(struct inode *inode, struct page *page, } em->start = 0; em->len = (u64)-1; + em->block_len = (u64)-1; em->block_start = 0; em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; @@ -469,12 +471,13 @@ static void run_one_async_submit(struct btrfs_work *work) wake_up(&fs_info->async_submit_wait); async->submit_bio_hook(async->inode, async->rw, async->bio, - async->mirror_num); + async->mirror_num, async->bio_flags); kfree(async); } int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, extent_submit_bio_hook_t *submit_bio_hook) { struct async_submit_bio *async; @@ -491,6 +494,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, async->submit_bio_hook = submit_bio_hook; async->work.func = run_one_async_submit; async->work.flags = 0; + async->bio_flags = bio_flags; while(atomic_read(&fs_info->async_submit_draining) && atomic_read(&fs_info->nr_async_submits)) { @@ -530,7 +534,7 @@ static int btree_csum_one_bio(struct bio *bio) } static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num) + int mirror_num, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; @@ -556,17 +560,17 @@ static int __btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, } static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num) + int mirror_num, unsigned long bio_flags) { /* * kthread helpers are used to submit writes so that checksumming * can happen in parallel across all CPUs */ if (!(rw & (1 << BIO_RW))) { - return __btree_submit_bio_hook(inode, rw, bio, mirror_num); + return __btree_submit_bio_hook(inode, rw, bio, mirror_num, 0); } return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, rw, bio, mirror_num, + inode, rw, bio, mirror_num, 0, __btree_submit_bio_hook); } @@ -1407,6 +1411,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, fs_info->btree_inode = new_inode(sb); fs_info->btree_inode->i_ino = 1; fs_info->btree_inode->i_nlink = 1; + fs_info->thread_pool_size = min(num_online_cpus() + 2, 8); INIT_LIST_HEAD(&fs_info->ordered_extents); @@ -1508,6 +1513,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, */ btrfs_init_workers(&fs_info->workers, "worker", fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->submit_workers, "submit", min_t(u64, fs_devices->num_devices, fs_info->thread_pool_size)); @@ -1559,6 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb, } fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); nodesize = btrfs_super_nodesize(disk_super); leafsize = btrfs_super_leafsize(disk_super); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f84f5058dbbb892435802c52ad9efd430d7f7900..4eb1f1408d21285bd75a7abe58dfbe571aeb94e6 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -71,6 +71,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, int metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, extent_submit_bio_hook_t *submit_bio_hook); int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 280ac1aa9b6d7d39816a24283836cdaa6de60cf4..bbf04e80a1a3c2b130b223b44c7858f52441b39e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3278,6 +3278,7 @@ static int noinline relocate_data_extent(struct inode *reloc_inode, em->start = extent_key->objectid - offset; em->len = extent_key->offset; + em->block_len = extent_key->offset; em->block_start = extent_key->objectid; em->bdev = root->fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); @@ -3314,10 +3315,14 @@ struct btrfs_ref_path { }; struct disk_extent { + u64 ram_bytes; u64 disk_bytenr; u64 disk_num_bytes; u64 offset; u64 num_bytes; + u8 compression; + u8 encryption; + u16 other_encoding; }; static int is_cowonly_root(u64 root_objectid) @@ -3631,6 +3636,11 @@ static int noinline get_new_locations(struct inode *reloc_inode, btrfs_file_extent_disk_num_bytes(leaf, fi); exts[nr].offset = btrfs_file_extent_offset(leaf, fi); exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + exts[nr].compression = btrfs_file_extent_compression(leaf, fi); + exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); + exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, + fi); WARN_ON(exts[nr].offset > 0); WARN_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); @@ -3846,6 +3856,8 @@ static int noinline replace_one_extent(struct btrfs_trans_handle *trans, new_extents[0].disk_bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, new_extents[0].disk_num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extents[0].ram_bytes); ext_offset += new_extents[0].offset; btrfs_set_file_extent_offset(leaf, fi, ext_offset); btrfs_mark_buffer_dirty(leaf); @@ -3911,6 +3923,16 @@ static int noinline replace_one_extent(struct btrfs_trans_handle *trans, new_extents[i].disk_bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, new_extents[i].disk_num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extents[i].ram_bytes); + + btrfs_set_file_extent_compression(leaf, fi, + new_extents[i].compression); + btrfs_set_file_extent_encryption(leaf, fi, + new_extents[i].encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, + new_extents[i].other_encoding); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_len); ext_offset += new_extents[i].offset; @@ -4169,6 +4191,8 @@ static int noinline replace_extents_in_leaf(struct btrfs_trans_handle *trans, ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extent->ram_bytes); btrfs_set_file_extent_disk_bytenr(leaf, fi, new_extent->disk_bytenr); btrfs_set_file_extent_disk_num_bytes(leaf, fi, @@ -4847,7 +4871,8 @@ static struct inode noinline *create_reloc_inode(struct btrfs_fs_info *fs_info, BUG_ON(err); err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, - group->key.offset, 0); + group->key.offset, 0, group->key.offset, + 0, 0, 0); BUG_ON(err); inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 563b2d12f4f29cc517bbe6a79c8120516991994b..314041fdfa43a25d5df9cc865b6831563a2c4b64 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache; static LIST_HEAD(buffers); static LIST_HEAD(states); +#define LEAK_DEBUG 1 #ifdef LEAK_DEBUG static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED; #endif @@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state); * * 1 is returned if we find something, 0 if nothing was in the tree */ -static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, - u64 *start, u64 *end, u64 max_bytes) +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) { struct rb_node *node; struct extent_state *state; @@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, u64 total_bytes = 0; spin_lock_irq(&tree->lock); + /* * this search will find all the extents that end after * our range starts. */ -search_again: node = tree_search(tree, cur_start); if (!node) { if (!found) @@ -1100,40 +1101,6 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, *end = state->end; goto out; } - if (!found && !(state->state & EXTENT_BOUNDARY)) { - struct extent_state *prev_state; - struct rb_node *prev_node = node; - while(1) { - prev_node = rb_prev(prev_node); - if (!prev_node) - break; - prev_state = rb_entry(prev_node, - struct extent_state, - rb_node); - if ((prev_state->end + 1 != state->start) || - !(prev_state->state & EXTENT_DELALLOC)) - break; - if ((cur_start - prev_state->start) * 2 > - max_bytes) - break; - state = prev_state; - node = prev_node; - } - } - if (state->state & EXTENT_LOCKED) { - DEFINE_WAIT(wait); - atomic_inc(&state->refs); - prepare_to_wait(&state->wq, &wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&tree->lock); - schedule(); - spin_lock_irq(&tree->lock); - finish_wait(&state->wq, &wait); - free_extent_state(state); - goto search_again; - } - set_state_cb(tree, state, EXTENT_LOCKED); - state->state |= EXTENT_LOCKED; if (!found) *start = state->start; found++; @@ -1151,6 +1118,208 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, return found; } +static noinline int __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return 0; + + while(nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min(nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while(nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min(nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) + lock_page(pages[i]); + page_cache_release(pages[i]); + } + pages_locked += ret; + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; +done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, + u64 *start, u64 *end, + u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes); + if (!found) { + *start = delalloc_start; + *end = delalloc_end; + return found; + } + + /* + * make sure to limit the number of pages we try to lock down + * if we're looping. + */ + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) { + delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) & + ~((u64)PAGE_CACHE_SIZE - 1); + } + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ + lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1); + if (!ret) { + unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int clear_dirty, int set_writeback, + int end_writeback) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; + + if (clear_dirty) + clear_bits |= EXTENT_DIRTY; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); + + while(nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min(nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } + if (clear_dirty) + clear_page_dirty_for_io(pages[i]); + if (set_writeback) + set_page_writeback(pages[i]); + if (end_writeback) + end_page_writeback(pages[i]); + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} +EXPORT_SYMBOL(extent_clear_unlock_delalloc); + /* * count the number of bytes in the tree that have a given bit(s) * set. This can be fairly slow, except for EXTENT_DIRTY which is @@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, return bio; } -static int submit_one_bio(int rw, struct bio *bio, int mirror_num) +static int submit_one_bio(int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags) { int ret = 0; struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; struct page *page = bvec->bv_page; struct extent_io_tree *tree = bio->bi_private; - struct rb_node *node; - struct extent_state *state; u64 start; u64 end; start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; end = start + bvec->bv_len - 1; - spin_lock_irq(&tree->lock); - node = __etree_search(tree, start, NULL, NULL); - BUG_ON(!node); - state = rb_entry(node, struct extent_state, rb_node); - while(state->end < end) { - node = rb_next(node); - state = rb_entry(node, struct extent_state, rb_node); - } - BUG_ON(state->end != end); - spin_unlock_irq(&tree->lock); - bio->bi_private = NULL; bio_get(bio); if (tree->ops && tree->ops->submit_bio_hook) tree->ops->submit_bio_hook(page->mapping->host, rw, bio, - mirror_num); + mirror_num, bio_flags); else submit_bio(rw, bio); if (bio_flagged(bio, BIO_EOPNOTSUPP)) @@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, struct bio **bio_ret, unsigned long max_pages, bio_end_io_t end_io_func, - int mirror_num) + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) { int ret = 0; struct bio *bio; int nr; + int contig = 0; + int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min(size, PAGE_CACHE_SIZE); if (bio_ret && *bio_ret) { bio = *bio_ret; - if (bio->bi_sector + (bio->bi_size >> 9) != sector || + if (old_compressed) + contig = bio->bi_sector == sector; + else + contig = bio->bi_sector + (bio->bi_size >> 9) == + sector; + + if (prev_bio_flags != bio_flags || !contig || (tree->ops && tree->ops->merge_bio_hook && - tree->ops->merge_bio_hook(page, offset, size, bio)) || - bio_add_page(bio, page, size, offset) < size) { - ret = submit_one_bio(rw, bio, mirror_num); + tree->ops->merge_bio_hook(page, offset, page_size, bio, + bio_flags)) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); bio = NULL; } else { return 0; } } - nr = bio_get_nr_vecs(bdev); + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); if (!bio) { printk("failed to allocate bio nr %d\n", nr); } - - bio_add_page(bio, page, size, offset); + bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; if (bio_ret) { *bio_ret = bio; } else { - ret = submit_one_bio(rw, bio, mirror_num); + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); } return ret; @@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len) static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, - struct bio **bio, int mirror_num) + struct bio **bio, int mirror_num, + unsigned long *bio_flags) { struct inode *inode = page->mapping->host; u64 start = (u64)page->index << PAGE_CACHE_SHIFT; @@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree, int nr = 0; size_t page_offset = 0; size_t iosize; + size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; + unsigned long this_bio_flag = 0; set_page_extent_mapped(page); end = page_end; lock_extent(tree, start, end, GFP_NOFS); + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } while (cur <= end) { if (cur >= last_byte) { char *userpage; @@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur); } BUG_ON(end < cur); + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = EXTENT_BIO_COMPRESSED; + iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); - sector = (em->block_start + extent_offset) >> 9; + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } bdev = em->bdev; block_start = em->block_start; free_extent_map(em); @@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur); unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; pnr -= page->index; ret = submit_extent_page(READ, tree, page, - sector, iosize, page_offset, + sector, disk_io_size, page_offset, bdev, bio, pnr, - end_bio_extent_readpage, mirror_num); + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); nr++; + *bio_flags = this_bio_flag; } if (ret) SetPageError(page); @@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent) { struct bio *bio = NULL; + unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(tree, page, get_extent, &bio, 0); + ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + &bio_flags); if (bio) - submit_one_bio(READ, bio, 0); + submit_one_bio(READ, bio, 0, bio_flags); return ret; } EXPORT_SYMBOL(extent_read_full_page); @@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; u64 nr_delalloc; u64 delalloc_end; + int page_started; + int compressed; WARN_ON(!PageLocked(page)); pg_offset = i_size & (PAGE_CACHE_SIZE - 1); @@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, delalloc_start = start; delalloc_end = 0; + page_started = 0; while(delalloc_end < page_end) { - nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, &delalloc_end, 128 * 1024 * 1024); if (nr_delalloc == 0) { delalloc_start = delalloc_end + 1; continue; } - tree->ops->fill_delalloc(inode, delalloc_start, - delalloc_end); - clear_extent_bit(tree, delalloc_start, - delalloc_end, - EXTENT_LOCKED | EXTENT_DELALLOC, - 1, 0, GFP_NOFS); + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started); delalloc_start = delalloc_end + 1; } + + /* did the fill delalloc function already unlock and start the IO? */ + if (page_started) { + return 0; + } + lock_extent(tree, start, page_end, GFP_NOFS); unlock_start = start; if (tree->ops && tree->ops->writepage_start_hook) { - ret = tree->ops->writepage_start_hook(page, start, page_end); + ret = tree->ops->writepage_start_hook(page, start, + page_end); if (ret == -EAGAIN) { unlock_extent(tree, start, page_end, GFP_NOFS); redirty_page_for_writepage(wbc, page); @@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, sector = (em->block_start + extent_offset) >> 9; bdev = em->bdev; block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); free_extent_map(em); em = NULL; - if (block_start == EXTENT_MAP_HOLE || + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); @@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unlock_extent(tree, unlock_start, cur + iosize -1, GFP_NOFS); - if (tree->ops && tree->ops->writepage_end_io_hook) + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) tree->ops->writepage_end_io_hook(page, cur, cur + iosize - 1, NULL, 1); - cur = cur + iosize; + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. this happens + * elsewhere + */ + nr++; + } + + cur += iosize; pg_offset += iosize; unlock_start = cur; continue; } - /* leave this out until we have a page_mkwrite call */ if (0 && !test_range_bit(tree, cur, cur + iosize - 1, EXTENT_DIRTY, 0)) { @@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset += iosize; continue; } + clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); if (tree->ops && tree->ops->writepage_io_hook) { ret = tree->ops->writepage_io_hook(page, cur, @@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ret = submit_extent_page(WRITE, tree, page, sector, iosize, pg_offset, bdev, &epd->bio, max_nr, - end_bio_extent_writepage, 0); + end_bio_extent_writepage, + 0, 0, 0); if (ret) SetPageError(page); } @@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, extent_write_cache_pages(tree, mapping, &wbc_writepages, __extent_writepage, &epd); if (epd.bio) { - submit_one_bio(WRITE, epd.bio, 0); + submit_one_bio(WRITE, epd.bio, 0, 0); } return ret; } @@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree, ret = extent_write_cache_pages(tree, mapping, wbc, __extent_writepage, &epd); if (epd.bio) { - submit_one_bio(WRITE, epd.bio, 0); + submit_one_bio(WRITE, epd.bio, 0, 0); } return ret; } @@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree, struct bio *bio = NULL; unsigned page_idx; struct pagevec pvec; + unsigned long bio_flags = 0; pagevec_init(&pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { @@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree, if (!pagevec_add(&pvec, page)) __pagevec_lru_add(&pvec); __extent_read_full_page(tree, page, get_extent, - &bio, 0); + &bio, 0, &bio_flags); } page_cache_release(page); } @@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree, __pagevec_lru_add(&pvec); BUG_ON(!list_empty(pages)); if (bio) - submit_one_bio(READ, bio, 0); + submit_one_bio(READ, bio, 0, bio_flags); return 0; } EXPORT_SYMBOL(extent_readpages); @@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree, ret = submit_extent_page(READ, tree, page, sector, iosize, page_offset, em->bdev, NULL, 1, - end_bio_extent_preparewrite, 0); + end_bio_extent_preparewrite, 0, + 0, 0); iocount++; block_start = block_start + iosize; } else { @@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map, } if (!test_range_bit(tree, em->start, extent_map_end(em) - 1, - EXTENT_LOCKED, 0)) { + EXTENT_LOCKED | EXTENT_WRITEBACK | + EXTENT_ORDERED, + 0)) { remove_extent_mapping(map, em); /* once for the rb tree */ free_extent_map(em); @@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, int inc_all_pages = 0; unsigned long num_pages; struct bio *bio = NULL; + unsigned long bio_flags = 0; if (eb->flags & EXTENT_UPTODATE) return 0; @@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ClearPageError(page); err = __extent_read_full_page(tree, page, get_extent, &bio, - mirror_num); + mirror_num, &bio_flags); if (err) { ret = err; printk("err %d from __extent_read_full_page\n", ret); @@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } if (bio) - submit_one_bio(READ, bio, mirror_num); + submit_one_bio(READ, bio, mirror_num, bio_flags); if (ret || !wait) { if (ret) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c9d1908a1ae3881d68deafb48b3f8c32b668a707..86f859b87a6ec51bffe6975f8a95704f65bc5c53 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -18,6 +18,9 @@ #define EXTENT_BOUNDARY (1 << 11) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +/* flags for bio submission */ +#define EXTENT_BIO_COMPRESSED 1 + /* * page->private values. Every page that is controlled by the extent * map has page->private set to one. @@ -28,14 +31,17 @@ struct extent_state; typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, - struct bio *bio, int mirror_num); + struct bio *bio, int mirror_num, + unsigned long bio_flags); struct extent_io_ops { - int (*fill_delalloc)(struct inode *inode, u64 start, u64 end); + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); int (*writepage_io_hook)(struct page *page, u64 start, u64 end); extent_submit_bio_hook_t *submit_bio_hook; int (*merge_bio_hook)(struct page *page, unsigned long offset, - size_t size, struct bio *bio); + size_t size, struct bio *bio, + unsigned long bio_flags); int (*readpage_io_hook)(struct page *page, u64 start, u64 end); int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, u64 start, u64 end, @@ -245,4 +251,9 @@ void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); int release_extent_buffer_tail_pages(struct extent_buffer *eb); int extent_range_uptodate(struct extent_io_tree *tree, u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int clear_dirty, int set_writeback, + int clear_writeback); #endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 74b2a29880d36a3db2e40efcf45b8784319998ee..fd3ebfb8c3c5e80bf440c21e6541d976325bcef2 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -184,6 +184,13 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) return 0; + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -239,6 +246,7 @@ int add_extent_mapping(struct extent_map_tree *tree, if (rb && mergable_maps(merge, em)) { em->start = merge->start; em->len += merge->len; + em->block_len += merge->block_len; em->block_start = merge->block_start; merge->in_tree = 0; rb_erase(&merge->rb_node, &tree->map); @@ -250,6 +258,7 @@ int add_extent_mapping(struct extent_map_tree *tree, merge = rb_entry(rb, struct extent_map, rb_node); if (rb && mergable_maps(em, merge)) { em->len += merge->len; + em->block_len += merge->len; rb_erase(&merge->rb_node, &tree->map); merge->in_tree = 0; free_extent_map(merge); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 26ac6fe0b2682ed15df0467931d0913c9af6fa4f..abbcbeb28c79d0d910ac922164b8def8a353a8ea 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -10,6 +10,7 @@ /* bits for the flags field */ #define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 struct extent_map { struct rb_node rb_node; @@ -18,6 +19,7 @@ struct extent_map { u64 start; u64 len; u64 block_start; + u64 block_len; unsigned long flags; struct block_device *bdev; atomic_t refs; @@ -38,9 +40,9 @@ static inline u64 extent_map_end(struct extent_map *em) static inline u64 extent_map_block_end(struct extent_map *em) { - if (em->block_start + em->len < em->block_start) + if (em->block_start + em->block_len < em->block_start) return (u64)-1; - return em->block_start + em->len; + return em->block_start + em->block_len; } void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6dbe88b9d7d43c2707bb6be2d270bb9eb3b4d50f..f4d3fa71bc419c3c5ef569eda6ded776ba3e605e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -31,7 +31,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 disk_offset, u64 disk_num_bytes, - u64 num_bytes, u64 offset) + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) { int ret = 0; struct btrfs_file_extent_item *item; @@ -57,8 +58,13 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); btrfs_set_file_extent_offset(leaf, item, offset); btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); btrfs_set_file_extent_generation(leaf, item, trans->transid); btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); @@ -213,6 +219,73 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, return 0; } +int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, + u64 start, unsigned long len) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + char *data; + struct page *page; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + + sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS); + if (!sums) + return -ENOMEM; + + sector_sum = sums->sums; + sums->file_offset = start; + sums->len = len; + INIT_LIST_HEAD(&sums->list); + ordered = btrfs_lookup_ordered_extent(inode, sums->file_offset); + BUG_ON(!ordered); + + while(len > 0) { + if (start >= ordered->file_offset + ordered->len || + start < ordered->file_offset) { + sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + sums = kzalloc(btrfs_ordered_sum_size(root, len), + GFP_NOFS); + BUG_ON(!sums); + sector_sum = sums->sums; + sums->len = len; + sums->file_offset = start; + ordered = btrfs_lookup_ordered_extent(inode, + sums->file_offset); + BUG_ON(!ordered); + } + + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + + data = kmap_atomic(page, KM_USER0); + sector_sum->sum = ~(u32)0; + sector_sum->sum = btrfs_csum_data(root, data, sector_sum->sum, + PAGE_CACHE_SIZE); + kunmap_atomic(data, KM_USER0); + btrfs_csum_final(sector_sum->sum, + (char *)§or_sum->sum); + sector_sum->offset = page_offset(page); + page_cache_release(page); + + sector_sum++; + total_bytes += PAGE_CACHE_SIZE; + this_sum_bytes += PAGE_CACHE_SIZE; + start += PAGE_CACHE_SIZE; + + WARN_ON(len < PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + } + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, struct bio *bio) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 69abbe19add25aa3f8485e3dcae0f2b393d86f29..0aa15436590e46bdc05396e28871d4aa3e12c072 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -95,153 +95,6 @@ static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages) } } -/* this does all the hard work for inserting an inline extent into - * the btree. Any existing inline extent is extended as required to make room, - * otherwise things are inserted as required into the btree - */ -static int noinline insert_inline_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 offset, size_t size, - struct page **pages, size_t page_offset, - int num_pages) -{ - struct btrfs_key key; - struct btrfs_path *path; - struct extent_buffer *leaf; - char *kaddr; - unsigned long ptr; - struct btrfs_file_extent_item *ei; - struct page *page; - u32 datasize; - int err = 0; - int ret; - int i; - ssize_t cur_size; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - btrfs_set_trans_block_group(trans, inode); - - key.objectid = inode->i_ino; - key.offset = offset; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) { - err = ret; - goto fail; - } - if (ret == 1) { - struct btrfs_key found_key; - - if (path->slots[0] == 0) - goto insert; - - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - if (found_key.objectid != inode->i_ino) - goto insert; - - if (found_key.type != BTRFS_EXTENT_DATA_KEY) - goto insert; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, ei) != - BTRFS_FILE_EXTENT_INLINE) { - goto insert; - } - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - ret = 0; - } - if (ret == 0) { - u32 found_size; - u64 found_end; - - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_type(leaf, ei) != - BTRFS_FILE_EXTENT_INLINE) { - err = ret; - btrfs_print_leaf(root, leaf); - printk("found wasn't inline offset %Lu inode %lu\n", - offset, inode->i_ino); - goto fail; - } - found_size = btrfs_file_extent_inline_len(leaf, - btrfs_item_nr(leaf, path->slots[0])); - found_end = key.offset + found_size; - - if (found_end < offset + size) { - btrfs_release_path(root, path); - ret = btrfs_search_slot(trans, root, &key, path, - offset + size - found_end, 1); - BUG_ON(ret != 0); - - ret = btrfs_extend_item(trans, root, path, - offset + size - found_end); - if (ret) { - err = ret; - goto fail; - } - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - inode_add_bytes(inode, offset + size - found_end); - } - if (found_end < offset) { - ptr = btrfs_file_extent_inline_start(ei) + found_size; - memset_extent_buffer(leaf, 0, ptr, offset - found_end); - } - } else { -insert: - btrfs_release_path(root, path); - datasize = offset + size - key.offset; - inode_add_bytes(inode, datasize); - datasize = btrfs_file_extent_calc_inline_size(datasize); - ret = btrfs_insert_empty_item(trans, root, path, &key, - datasize); - if (ret) { - err = ret; - printk("got bad ret %d\n", ret); - goto fail; - } - leaf = path->nodes[0]; - ei = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, ei, trans->transid); - btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); - } - ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset; - - cur_size = size; - i = 0; - while (size > 0) { - page = pages[i]; - kaddr = kmap_atomic(page, KM_USER0); - cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size); - write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size); - kunmap_atomic(kaddr, KM_USER0); - page_offset = 0; - ptr += cur_size; - size -= cur_size; - if (i >= num_pages) { - printk("i %d num_pages %d\n", i, num_pages); - } - i++; - } - btrfs_mark_buffer_dirty(leaf); -fail: - btrfs_free_path(path); - return err; -} - /* * after copy_from_user, pages need to be dirtied and we need to make * sure holes are created between the current EOF and the start of @@ -267,8 +120,6 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, u64 start_pos; u64 end_of_last_block; u64 end_pos = pos + write_bytes; - u64 inline_size; - int did_inline = 0; loff_t isize = i_size_read(inode); start_pos = pos & ~((u64)root->sectorsize - 1); @@ -314,7 +165,8 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, err = btrfs_insert_file_extent(trans, root, inode->i_ino, last_pos_in_file, - 0, 0, hole_size, 0); + 0, 0, hole_size, 0, + hole_size, 0, 0, 0); btrfs_drop_extent_cache(inode, last_pos_in_file, last_pos_in_file + hole_size - 1, 0); mutex_unlock(&BTRFS_I(inode)->extent_mutex); @@ -324,57 +176,19 @@ static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans, goto failed; } - /* - * either allocate an extent for the new bytes or setup the key - * to show we are doing inline data in the extent + /* check for reserved extents on each page, we don't want + * to reset the delalloc bit on things that already have + * extents reserved. */ - inline_size = end_pos; - if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) || - inline_size > root->fs_info->max_inline || - (inline_size & (root->sectorsize -1)) == 0 || - inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) { - /* check for reserved extents on each page, we don't want - * to reset the delalloc bit on things that already have - * extents reserved. - */ - btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); - for (i = 0; i < num_pages; i++) { - struct page *p = pages[i]; - SetPageUptodate(p); - ClearPageChecked(p); - set_page_dirty(p); - } - } else { - u64 aligned_end; - /* step one, delete the existing extents in this range */ - aligned_end = (pos + write_bytes + root->sectorsize - 1) & - ~((u64)root->sectorsize - 1); - mutex_lock(&BTRFS_I(inode)->extent_mutex); - err = btrfs_drop_extents(trans, root, inode, start_pos, - aligned_end, aligned_end, &hint_byte); - if (err) - goto failed; - if (isize > inline_size) - inline_size = min_t(u64, isize, aligned_end); - inline_size -= start_pos; - err = insert_inline_extent(trans, root, inode, start_pos, - inline_size, pages, 0, num_pages); - btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0); - BUG_ON(err); - mutex_unlock(&BTRFS_I(inode)->extent_mutex); - - /* - * an ugly way to do all the prop accounting around - * the page bits and mapping tags - */ - set_page_writeback(pages[0]); - end_page_writeback(pages[0]); - did_inline = 1; + btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); } if (end_pos > isize) { i_size_write(inode, end_pos); - if (did_inline) - BTRFS_I(inode)->disk_i_size = end_pos; btrfs_update_inode(trans, root, inode); } failed: @@ -399,6 +213,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int ret; int testend = 1; unsigned long flags; + int compressed = 0; WARN_ON(end < start); if (end == (u64)-1) { @@ -434,6 +249,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, free_extent_map(em); continue; } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); clear_bit(EXTENT_FLAG_PINNED, &em->flags); remove_extent_mapping(em_tree, em); @@ -442,6 +258,12 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->start = em->start; split->len = start - em->start; split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + split->bdev = em->bdev; split->flags = flags; ret = add_extent_mapping(em_tree, split); @@ -459,7 +281,13 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, split->bdev = em->bdev; split->flags = flags; - split->block_start = em->block_start + diff; + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + diff; + } ret = add_extent_mapping(em_tree, split); BUG_ON(ret); @@ -533,7 +361,7 @@ int btrfs_check_file(struct btrfs_root *root, struct inode *inode) struct btrfs_item *item; item = btrfs_item_nr(leaf, slot); extent_end = found_key.offset + - btrfs_file_extent_inline_len(leaf, item); + btrfs_file_extent_inline_len(leaf, extent); extent_end = (extent_end + root->sectorsize - 1) & ~((u64)root->sectorsize -1 ); } @@ -573,6 +401,10 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, u64 extent_end = 0; u64 search_start = start; u64 leaf_start; + u64 ram_bytes = 0; + u8 compression = 0; + u8 encryption = 0; + u16 other_encoding = 0; u64 root_gen; u64 root_owner; struct extent_buffer *leaf; @@ -589,6 +421,7 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, int recow; int ret; + inline_limit = 0; btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); @@ -637,6 +470,12 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, extent); + compression = btrfs_file_extent_compression(leaf, + extent); + encryption = btrfs_file_extent_encryption(leaf, + extent); + other_encoding = btrfs_file_extent_other_encoding(leaf, + extent); if (found_type == BTRFS_FILE_EXTENT_REG) { extent_end = btrfs_file_extent_disk_bytenr(leaf, @@ -646,13 +485,13 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, extent); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, + extent); found_extent = 1; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item; - item = btrfs_item_nr(leaf, slot); found_inline = 1; extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, item); + btrfs_file_extent_inline_len(leaf, extent); } } else { extent_end = search_start; @@ -680,10 +519,9 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, search_start = (extent_end + mask) & ~mask; } else search_start = extent_end; - if (end <= extent_end && start >= key.offset && found_inline) { + + if (end <= extent_end && start >= key.offset && found_inline) *hint_byte = EXTENT_MAP_INLINE; - goto out; - } if (found_extent) { read_extent_buffer(leaf, &old, (unsigned long)extent, @@ -770,12 +608,27 @@ int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, &old, (unsigned long)extent, sizeof(old)); + btrfs_set_file_extent_compression(leaf, extent, + compression); + btrfs_set_file_extent_encryption(leaf, extent, + encryption); + btrfs_set_file_extent_other_encoding(leaf, extent, + other_encoding); btrfs_set_file_extent_offset(leaf, extent, le64_to_cpu(old.offset) + end - key.offset); WARN_ON(le64_to_cpu(old.num_bytes) < (extent_end - end)); btrfs_set_file_extent_num_bytes(leaf, extent, extent_end - end); + + /* + * set the ram bytes to the size of the full extent + * before splitting. This is a worst case flag, + * but its the best we can do because we don't know + * how splitting affects compression + */ + btrfs_set_file_extent_ram_bytes(leaf, extent, + ram_bytes); btrfs_set_file_extent_type(leaf, extent, BTRFS_FILE_EXTENT_REG); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bf4bed6ca4d601b2773fe9c9bb17890637a5aa99..9797592dc86b7c7b8398a18f61a892d70a86c24d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -49,6 +49,7 @@ #include "compat.h" #include "tree-log.h" #include "ref-cache.h" +#include "compression.h" struct btrfs_iget_args { u64 ino; @@ -83,6 +84,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { }; static void btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); /* * a very lame attempt at stopping writes when the FS is 85% full. There @@ -113,58 +115,375 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, return ret; } +/* + * this does all the hard work for inserting an inline extent into + * the btree. The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static int noinline insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + struct page **compressed_pages) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + size_t datasize; + unsigned long offset; + int use_compress = 0; + + if (compressed_size && compressed_pages) { + use_compress = 1; + cur_size = compressed_size; + } + + path = btrfs_alloc_path(); if (!path) + return -ENOMEM; + + btrfs_set_trans_block_group(trans, inode); + + key.objectid = inode->i_ino; + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + inode_add_bytes(inode, size); + datasize = btrfs_file_extent_calc_inline_size(cur_size); + + inode_add_bytes(inode, size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + if (ret) { + err = ret; + printk("got bad ret %d\n", ret); + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (use_compress) { + struct page *cpage; + int i = 0; + while(compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min(compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap(cpage); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap(cpage); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + BTRFS_COMPRESS_ZLIB); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page, KM_USER0); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); + return 0; +fail: + btrfs_free_path(path); + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. + */ +static int cow_file_range_inline(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end, + size_t compressed_size, + struct page **compressed_pages) +{ + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = (end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + u64 hint_byte; + u64 data_len = inline_len; + int ret; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + mutex_lock(&BTRFS_I(inode)->extent_mutex); + ret = btrfs_drop_extents(trans, root, inode, start, + aligned_end, aligned_end, &hint_byte); + BUG_ON(ret); + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, root, inode, start, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, start, aligned_end, 0); + mutex_unlock(&BTRFS_I(inode)->extent_mutex); + return 0; +} + /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. */ -static int cow_file_range(struct inode *inode, u64 start, u64 end) +static int cow_file_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 alloc_hint = 0; u64 num_bytes; + unsigned long ram_size; + u64 orig_start; + u64 disk_num_bytes; u64 cur_alloc_size; u64 blocksize = root->sectorsize; - u64 orig_num_bytes; + u64 actual_end; struct btrfs_key ins; struct extent_map *em; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 256 * 1024; + int i; + int will_compress; trans = btrfs_join_transaction(root, 1); BUG_ON(!trans); btrfs_set_trans_block_group(trans, inode); + orig_start = start; + + /* + * compression made this loop a bit ugly, but the basic idea is to + * compress some pages but keep the total size of the compressed + * extent relatively small. If compression is off, this goto target + * is never used. + */ +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + actual_end = min_t(u64, i_size_read(inode), end + 1); + total_compressed = actual_end - start; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 256k + */ + total_compressed = min(total_compressed, max_uncompressed); num_bytes = (end - start + blocksize) & ~(blocksize - 1); num_bytes = max(blocksize, num_bytes); - orig_num_bytes = num_bytes; + disk_num_bytes = num_bytes; + total_in = 0; + ret = 0; - if (alloc_hint == EXTENT_MAP_INLINE) - goto out; + /* we do compression for mount -o compress and when the + * inode has not been flagged as nocompress + */ + if (!btrfs_test_flag(inode, NOCOMPRESS) && + btrfs_test_opt(root, COMPRESS)) { + WARN_ON(pages); + pages = kmalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + + /* we want to make sure the amount of IO required to satisfy + * a random read is reasonably small, so we limit the size + * of a compressed extent to 128k + */ + ret = btrfs_zlib_compress_pages(inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr, KM_USER0); + } + will_compress = 1; + } + } + if (start == 0) { + /* lets try to make an inline extent */ + if (ret || total_in < (end - start + 1)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. This + * is almost sure to fail, but maybe inline sizes + * will get bigger later + */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + } else { + ret = cow_file_range_inline(trans, root, inode, + start, end, + total_compressed, pages); + } + if (ret == 0) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, + 1, 1, 1); + *page_started = 1; + ret = 0; + goto free_pages_out; + } + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = (total_compressed + blocksize - 1) & + ~(blocksize - 1); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = (total_in + PAGE_CACHE_SIZE - 1) & + ~(PAGE_CACHE_SIZE - 1); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + disk_num_bytes = total_compressed; + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + btrfs_set_flag(inode, NOCOMPRESS); + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); - BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy)); mutex_lock(&BTRFS_I(inode)->extent_mutex); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); mutex_unlock(&BTRFS_I(inode)->extent_mutex); - while(num_bytes > 0) { - cur_alloc_size = min(num_bytes, root->fs_info->max_extent); + while(disk_num_bytes > 0) { + unsigned long min_bytes; + + /* + * the max size of a compressed extent is pretty small, + * make the code a little less complex by forcing + * the allocator to find a whole compressed extent at once + */ + if (will_compress) + min_bytes = disk_num_bytes; + else + min_bytes = root->sectorsize; + + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); ret = btrfs_reserve_extent(trans, root, cur_alloc_size, - root->sectorsize, 0, alloc_hint, + min_bytes, 0, alloc_hint, (u64)-1, &ins, 1); if (ret) { WARN_ON(1); - goto out; + goto free_pages_out_fail; } em = alloc_extent_map(GFP_NOFS); em->start = start; - em->len = ins.offset; + + if (will_compress) { + ram_size = num_bytes; + em->len = num_bytes; + } else { + /* ramsize == disk size */ + ram_size = ins.offset; + em->len = ins.offset; + } + em->block_start = ins.objectid; + em->block_len = ins.offset; em->bdev = root->fs_info->fs_devices->latest_bdev; + mutex_lock(&BTRFS_I(inode)->extent_mutex); set_bit(EXTENT_FLAG_PINNED, &em->flags); + + if (will_compress) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + while(1) { spin_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em); @@ -174,26 +493,95 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) break; } btrfs_drop_extent_cache(inode, start, - start + ins.offset - 1, 0); + start + ram_size - 1, 0); } mutex_unlock(&BTRFS_I(inode)->extent_mutex); cur_alloc_size = ins.offset; ret = btrfs_add_ordered_extent(inode, start, ins.objectid, - ins.offset, 0); + ram_size, cur_alloc_size, 0, + will_compress); BUG_ON(ret); - if (num_bytes < cur_alloc_size) { - printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes, + + if (disk_num_bytes < cur_alloc_size) { + printk("num_bytes %Lu cur_alloc %Lu\n", disk_num_bytes, cur_alloc_size); break; } + + if (will_compress) { + /* + * we're doing compression, we and we need to + * submit the compressed extents down to the device. + * + * We lock down all the file pages, clearing their + * dirty bits and setting them writeback. Everyone + * that wants to modify the page will wait on the + * ordered extent above. + * + * The writeback bits on the file pages are + * cleared when the compressed pages are on disk + */ + btrfs_end_transaction(trans, root); + + if (start <= page_offset(locked_page) && + page_offset(locked_page) < start + ram_size) { + *page_started = 1; + } + + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, + start + ram_size - 1, + NULL, 1, 1, 0); + + ret = btrfs_submit_compressed_write(inode, start, + ram_size, ins.objectid, + cur_alloc_size, pages, + nr_pages_ret); + + BUG_ON(ret); + trans = btrfs_join_transaction(root, 1); + if (start + ram_size < end) { + start += ram_size; + alloc_hint = ins.objectid + ins.offset; + /* pages will be freed at end_bio time */ + pages = NULL; + goto again; + } else { + /* we've written everything, time to go */ + break; + } + } + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + */ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, + locked_page, 0, 0, 0); + disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; } + + ret = 0; out: btrfs_end_transaction(trans, root); + return ret; + +free_pages_out_fail: + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, end, locked_page, 0, 0, 0); +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) + page_cache_release(pages[i]); + if (pages) + kfree(pages); + + goto out; } /* @@ -203,7 +591,8 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end) * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ -static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) +static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started) { u64 extent_start; u64 extent_end; @@ -260,6 +649,11 @@ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) extent_end = extent_start + extent_num_bytes; err = 0; + if (btrfs_file_extent_compression(leaf, item) || + btrfs_file_extent_encryption(leaf,item) || + btrfs_file_extent_other_encoding(leaf, item)) + goto not_found; + if (loops && start != extent_start) goto not_found; @@ -284,7 +678,8 @@ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) bytenr += btrfs_file_extent_offset(leaf, item); extent_num_bytes = min(end + 1, extent_end) - start; ret = btrfs_add_ordered_extent(inode, start, bytenr, - extent_num_bytes, 1); + extent_num_bytes, + extent_num_bytes, 1, 0); if (ret) { err = ret; goto out; @@ -300,7 +695,8 @@ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) not_found: btrfs_end_transaction(trans, root); btrfs_free_path(path); - return cow_file_range(inode, start, end); + return cow_file_range(inode, locked_page, start, end, + page_started); } out: WARN_ON(err); @@ -312,16 +708,19 @@ static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end) /* * extent_io.c call back to do delayed allocation processing */ -static int run_delalloc_range(struct inode *inode, u64 start, u64 end) +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; if (btrfs_test_opt(root, NODATACOW) || btrfs_test_flag(inode, NODATACOW)) - ret = run_delalloc_nocow(inode, start, end); + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started); else - ret = cow_file_range(inode, start, end); + ret = cow_file_range(inode, locked_page, start, end, + page_started); return ret; } @@ -383,7 +782,8 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, * we don't create bios that span stripes or chunks */ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, - size_t size, struct bio *bio) + size_t size, struct bio *bio, + unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; struct btrfs_mapping_tree *map_tree; @@ -413,7 +813,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, * are inserted into the btree */ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num) + int mirror_num, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -429,7 +829,7 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, * or reading the csums from the tree before a read */ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, - int mirror_num) + int mirror_num, unsigned long bio_flags) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; @@ -444,11 +844,17 @@ int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, if (!(rw & (1 << BIO_RW))) { btrfs_lookup_bio_sums(root, inode, bio); + + if (bio_flags & EXTENT_BIO_COMPRESSED) { + return btrfs_submit_compressed_read(inode, bio, + mirror_num, bio_flags); + } + goto mapit; } return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, inode, rw, bio, mirror_num, - __btrfs_submit_bio_hook); + bio_flags, __btrfs_submit_bio_hook); mapit: return btrfs_map_bio(root, rw, bio, mirror_num, 0); } @@ -539,7 +945,7 @@ void btrfs_writepage_fixup_worker(struct btrfs_work *work) * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * - * In our case any range that doesn't have the EXTENT_ORDERED bit set + * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. @@ -632,10 +1038,21 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) btrfs_set_file_extent_disk_bytenr(leaf, extent_item, ordered_extent->start); btrfs_set_file_extent_disk_num_bytes(leaf, extent_item, - ordered_extent->len); + ordered_extent->disk_len); btrfs_set_file_extent_offset(leaf, extent_item, 0); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + btrfs_set_file_extent_compression(leaf, extent_item, 1); + else + btrfs_set_file_extent_compression(leaf, extent_item, 0); + btrfs_set_file_extent_encryption(leaf, extent_item, 0); + btrfs_set_file_extent_other_encoding(leaf, extent_item, 0); + + /* ram bytes = extent_num_bytes for now */ btrfs_set_file_extent_num_bytes(leaf, extent_item, ordered_extent->len); + btrfs_set_file_extent_ram_bytes(leaf, extent_item, + ordered_extent->len); btrfs_mark_buffer_dirty(leaf); btrfs_drop_extent_cache(inode, ordered_extent->file_offset, @@ -644,7 +1061,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) mutex_unlock(&BTRFS_I(inode)->extent_mutex); ins.objectid = ordered_extent->start; - ins.offset = ordered_extent->len; + ins.offset = ordered_extent->disk_len; ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, root->root_key.objectid, @@ -714,6 +1131,7 @@ int btrfs_io_failed_hook(struct bio *failed_bio, int ret; int rw; u64 logical; + unsigned long bio_flags = 0; ret = get_state_private(failure_tree, start, &private); if (ret) { @@ -738,6 +1156,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio, } logical = start - em->start; logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + bio_flags = EXTENT_BIO_COMPRESSED; failrec->logical = logical; free_extent_map(em); set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | @@ -781,7 +1201,8 @@ int btrfs_io_failed_hook(struct bio *failed_bio, rw = READ; BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, - failrec->last_mirror); + failrec->last_mirror, + bio_flags); return 0; } @@ -1644,10 +2065,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, item_end += btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - struct btrfs_item *item = btrfs_item_nr(leaf, - path->slots[0]); item_end += btrfs_file_extent_inline_len(leaf, - item); + fi); } item_end--; } @@ -1715,7 +2134,14 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, root_owner = btrfs_header_owner(leaf); } } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - if (!del_item) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { u32 size = new_size - found_key.offset; if (root->ref_cows) { @@ -1926,7 +2352,8 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) err = btrfs_insert_file_extent(trans, root, inode->i_ino, hole_start, 0, 0, - hole_size, 0); + hole_size, 0, hole_size, + 0, 0, 0); btrfs_drop_extent_cache(inode, hole_start, (u64)-1, 0); btrfs_check_file(root, inode); @@ -2894,11 +3321,50 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, start_diff = map_start - em->start; em->start = map_start; em->len = map_len; - if (em->block_start < EXTENT_MAP_LAST_BYTE) + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { em->block_start += start_diff; + em->block_len -= start_diff; + } return add_extent_mapping(em_tree, em); } +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + + WARN_ON(pg_offset != 0); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(leaf, path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min(PAGE_CACHE_SIZE, max_size); + ret = btrfs_zlib_decompress(tmp, page, extent_offset, + inline_size, max_size); + if (ret) { + char *kaddr = kmap_atomic(page, KM_USER0); + unsigned long copy_size = min_t(u64, + PAGE_CACHE_SIZE - pg_offset, + max_size - extent_offset); + memset(kaddr + pg_offset, 0, copy_size); + kunmap_atomic(kaddr, KM_USER0); + } + kfree(tmp); + return 0; +} + /* * a bit scary, this does extent mapping from logical file offset to the disk. * the ugly parts come from merging extents from the disk with the @@ -2927,6 +3393,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_trans_handle *trans = NULL; + int compressed; again: spin_lock(&em_tree->lock); @@ -2951,6 +3418,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, em->bdev = root->fs_info->fs_devices->latest_bdev; em->start = EXTENT_MAP_HOLE; em->len = (u64)-1; + em->block_len = (u64)-1; if (!path) { path = btrfs_alloc_path(); @@ -2983,6 +3451,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, found_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; + compressed = btrfs_file_extent_compression(leaf, item); if (found_type == BTRFS_FILE_EXTENT_REG) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); @@ -3005,10 +3474,18 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, em->block_start = EXTENT_MAP_HOLE; goto insert; } - bytenr += btrfs_file_extent_offset(leaf, item); - em->block_start = bytenr; em->start = extent_start; em->len = extent_end - extent_start; + if (compressed) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->block_start = bytenr; + em->block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); + } else { + bytenr += btrfs_file_extent_offset(leaf, item); + em->block_start = bytenr; + em->block_len = em->len; + } goto insert; } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { u64 page_start; @@ -3018,8 +3495,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, size_t extent_offset; size_t copy_size; - size = btrfs_file_extent_inline_len(leaf, btrfs_item_nr(leaf, - path->slots[0])); + size = btrfs_file_extent_inline_len(leaf, item); extent_end = (extent_start + size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); if (start < extent_start || start >= extent_end) { @@ -3035,9 +3511,10 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, } em->block_start = EXTENT_MAP_INLINE; - if (!page) { + if (!page || create) { em->start = extent_start; - em->len = size; + em->len = (size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); goto out; } @@ -3048,11 +3525,22 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, em->start = extent_start + extent_offset; em->len = (copy_size + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); - map = kmap(page); + if (compressed) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); ptr = btrfs_file_extent_inline_start(item) + extent_offset; if (create == 0 && !PageUptodate(page)) { - read_extent_buffer(leaf, map + pg_offset, ptr, - copy_size); + if (btrfs_file_extent_compression(leaf, item) == + BTRFS_COMPRESS_ZLIB) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + BUG_ON(ret); + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + } flush_dcache_page(page); } else if (create && PageUptodate(page)) { if (!trans) { @@ -3063,11 +3551,12 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, trans = btrfs_join_transaction(root, 1); goto again; } + map = kmap(page); write_extent_buffer(leaf, map + pg_offset, ptr, copy_size); + kunmap(page); btrfs_mark_buffer_dirty(leaf); } - kunmap(page); set_extent_uptodate(io_tree, em->start, extent_map_end(em) - 1, GFP_NOFS); goto insert; @@ -3779,6 +4268,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_set_file_extent_generation(leaf, ei, trans->transid); btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); btrfs_mark_buffer_dirty(leaf); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2eb6caba57c278ad92a044da28b7782d009890fb..b5745bb96d40ddfc34d7d3719c5c4ab5b04df790 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -165,7 +165,8 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, * inserted. */ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, int nocow) + u64 start, u64 len, u64 disk_len, int nocow, + int compressed) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -180,9 +181,12 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; + entry->disk_len = disk_len; entry->inode = inode; if (nocow) set_bit(BTRFS_ORDERED_NOCOW, &entry->flags); + if (compressed) + set_bit(BTRFS_ORDERED_COMPRESSED, &entry->flags); /* one ref for the tree */ atomic_set(&entry->refs, 1); @@ -389,9 +393,10 @@ void btrfs_start_ordered_extent(struct inode *inode, * for pdflush to find them */ btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_NONE); - if (wait) + if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); + } } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f50f8870a1449801386846b1dd66d6dd4eb068fd..1ef464145d226a509a8f24646e0758e52e18273b 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -66,6 +66,8 @@ struct btrfs_ordered_sum { #define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ + struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -73,9 +75,12 @@ struct btrfs_ordered_extent { /* disk byte number */ u64 start; - /* length of the extent in bytes */ + /* ram length of the extent in bytes */ u64 len; + /* extent length on disk */ + u64 disk_len; + /* flags (described above) */ unsigned long flags; @@ -127,7 +132,8 @@ int btrfs_remove_ordered_extent(struct inode *inode, int btrfs_dec_test_ordered_pending(struct inode *inode, u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, int nocow); + u64 start, u64 len, u64 disk_len, int nocow, + int compressed); int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index bd9ab3e9a7f2fa44fa1795a06e6c6e59800c8ffb..64725c13aa1160de7c44e09b88995f7d090cbe09 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -115,15 +115,16 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) if (btrfs_file_extent_type(l, fi) == BTRFS_FILE_EXTENT_INLINE) { printk("\t\tinline extent data size %u\n", - btrfs_file_extent_inline_len(l, item)); + btrfs_file_extent_inline_len(l, fi)); break; } printk("\t\textent data disk bytenr %llu nr %llu\n", (unsigned long long)btrfs_file_extent_disk_bytenr(l, fi), (unsigned long long)btrfs_file_extent_disk_num_bytes(l, fi)); - printk("\t\textent data offset %llu nr %llu\n", + printk("\t\textent data offset %llu nr %llu ram %llu\n", (unsigned long long)btrfs_file_extent_offset(l, fi), - (unsigned long long)btrfs_file_extent_num_bytes(l, fi)); + (unsigned long long)btrfs_file_extent_num_bytes(l, fi), + (unsigned long long)btrfs_file_extent_ram_bytes(l, fi)); break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2e6039825b7bc19d97f79276b494a4f3fd177d92..431fdf144b585709afe6f6f50f81df2f8b0021e6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -47,6 +47,7 @@ #include "volumes.h" #include "version.h" #include "export.h" +#include "compression.h" #define BTRFS_SUPER_MAGIC 0x9123683E @@ -69,7 +70,7 @@ static void btrfs_put_super (struct super_block * sb) enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, - Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_err, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, }; static match_table_t tokens = { @@ -83,6 +84,7 @@ static match_table_t tokens = { {Opt_max_inline, "max_inline=%s"}, {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, {Opt_ssd, "ssd"}, {Opt_noacl, "noacl"}, {Opt_err, NULL}, @@ -163,6 +165,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, NODATACOW); btrfs_set_opt(info->mount_opt, NODATASUM); break; + case Opt_compress: + printk(KERN_INFO "btrfs: use compression\n"); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); @@ -622,6 +628,7 @@ static int __init init_btrfs_fs(void) err = btrfs_interface_init(); if (err) goto free_extent_map; + err = register_filesystem(&btrfs_fs_type); if (err) goto unregister_ioctl; @@ -651,6 +658,7 @@ static void __exit exit_btrfs_fs(void) unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); + btrfs_zlib_exit(); } module_init(init_btrfs_fs) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index cf618cc8b34ad3fd59d065adb01342ba027c2558..e6d579053a475b6191e31c6a8396140ab8293e98 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -540,8 +540,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, if (found_type == BTRFS_FILE_EXTENT_REG) extent_end = start + btrfs_file_extent_num_bytes(eb, item); else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - size = btrfs_file_extent_inline_len(eb, - btrfs_item_nr(eb, slot)); + size = btrfs_file_extent_inline_len(eb, item); extent_end = (start + size + mask) & ~mask; } else { ret = 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2eed7f91f51a3dca150ba8d2f61d746991167ef1..7db4cfd03a98bd5697e851c4113ee9e74c43af4c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1816,6 +1816,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, em->start = key.offset; em->len = *num_bytes; em->block_start = 0; + em->block_len = em->len; if (type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_add_system_chunk(trans, chunk_root, &key, @@ -2323,6 +2324,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em->start = logical; em->len = length; em->block_start = 0; + em->block_len = em->len; map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 0000000000000000000000000000000000000000..e99309180a11461ee3df870cf793ba453c226027 --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,637 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. + * Created by David Woodhouse + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? +*/ +#define STREAM_END_SPACE 12 + +struct workspace { + z_stream inf_strm; + z_stream def_strm; + char *buf; + struct list_head list; +}; + +static LIST_HEAD(idle_workspace); +static DEFINE_SPINLOCK(workspace_lock); +static unsigned long num_workspace; +static atomic_t alloc_workspace = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); + +/* + * this finds an available zlib workspace or allocates a new one + * NULL or an ERR_PTR is returned if things go bad. + */ +static struct workspace *find_zlib_workspace(void) +{ + struct workspace *workspace; + int ret; + int cpus = num_online_cpus(); + +again: + spin_lock(&workspace_lock); + if (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + num_workspace--; + spin_unlock(&workspace_lock); + return workspace; + + } + spin_unlock(&workspace_lock); + if (atomic_read(&alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&alloc_workspace) > cpus) + schedule(); + finish_wait(&workspace_wait, &wait); + goto again; + } + atomic_inc(&alloc_workspace); + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) { + ret = -ENOMEM; + goto fail; + } + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); + if (!workspace->def_strm.workspace) { + ret = -ENOMEM; + goto fail; + } + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!workspace->inf_strm.workspace) { + ret = -ENOMEM; + goto fail_inflate; + } + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->buf) { + ret = -ENOMEM; + goto fail_kmalloc; + } + return workspace; + +fail_kmalloc: + vfree(workspace->inf_strm.workspace); +fail_inflate: + vfree(workspace->def_strm.workspace); +fail: + kfree(workspace); + atomic_dec(&alloc_workspace); + wake_up(&workspace_wait); + return ERR_PTR(ret); +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static int free_workspace(struct workspace *workspace) +{ + spin_lock(&workspace_lock); + if (num_workspace < num_online_cpus()) { + list_add_tail(&workspace->list, &idle_workspace); + num_workspace++; + spin_unlock(&workspace_lock); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; + } + spin_unlock(&workspace_lock); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + + atomic_dec(&alloc_workspace); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct workspace *workspace; + while(!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + atomic_dec(&alloc_workspace); + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. + * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + int ret; + struct workspace *workspace; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + int out_written = 0; + int in_read = 0; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + workspace = find_zlib_workspace(); + if (!workspace) + return -1; + + if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + ret = -1; + goto out; + } + + workspace->def_strm.total_in = 0; + workspace->def_strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->def_strm.next_in = data_in; + workspace->def_strm.next_out = cpage_out; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + + out_written = 0; + in_read = 0; + + while (workspace->def_strm.total_in < len) { + ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->def_strm); + ret = -1; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->def_strm.total_in > 8192 && + workspace->def_strm.total_in < + workspace->def_strm.total_out) { + ret = -1; + goto out; + } + /* we need another page for writing out. Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->def_strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->def_strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->def_strm.avail_in == 0) { + if (workspace->def_strm.total_out > max_out) + break; + + bytes_left = len - workspace->def_strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->def_strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->def_strm.next_in = data_in; + } + } + workspace->def_strm.avail_in = 0; + ret = zlib_deflate(&workspace->def_strm, Z_FINISH); + zlib_deflateEnd(&workspace->def_strm); + + if (ret != Z_STREAM_END) { + ret = -1; + goto out; + } + + if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + ret = -1; + goto out; + } + + ret = 0; + *total_out = workspace->def_strm.total_out; + *total_in = workspace->def_strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + free_workspace(workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + char *data_in; + size_t total_out = 0; + unsigned long page_bytes_left; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + struct page *page_out; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + unsigned long start_byte; + unsigned long current_buf_start; + char *kaddr; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = min(srclen, PAGE_CACHE_SIZE); + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.total_out = 0; + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + page_out = bvec[page_out_index].bv_page; + page_bytes_left = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + while(workspace->inf_strm.total_in < srclen) { + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) { + break; + } + + /* + * buf start is the byte offset we're of the start of + * our workspace buffer + */ + buf_start = total_out; + + /* total_out is the last byte of the workspace buffer */ + total_out = workspace->inf_strm.total_out; + + working_bytes = total_out - buf_start; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + if (working_bytes == 0) { + /* we didn't make progress in this inflate + * call, we're done + */ + if (ret != Z_STREAM_END) + ret = -1; + break; + } + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) { + goto next; + } + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while(working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, + bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + pg_offset += bytes; + page_bytes_left -= bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (page_bytes_left == 0) { + page_out_index++; + if (page_out_index >= vcnt) { + ret = 0; + goto done; + } + page_out = bvec[page_out_index].bv_page; + pg_offset = 0; + page_bytes_left = PAGE_CACHE_SIZE; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) { + goto next; + } + + /* the next page in the biovec might not + * be adjacent to the last page, but it + * might still be found inside this working + * buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + + buf_offset; + } + } + } +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->inf_strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + tmp = srclen - workspace->inf_strm.total_in; + workspace->inf_strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) { + ret = -1; + } else { + ret = 0; + } +done: + zlib_inflateEnd(&workspace->inf_strm); + if (data_in) + kunmap(pages_in[page_in_index]); +out: + free_workspace(workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + unsigned long bytes_left = destlen; + unsigned long total_out = 0; + char *kaddr; + + if (destlen > PAGE_CACHE_SIZE) + return -ENOMEM; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = srclen; + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->inf_strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + + while(bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long pg_offset = 0; + + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) { + break; + } + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + if (total_out == buf_start) { + ret = -1; + break; + } + + if (total_out <= start_byte) { + goto next; + } + + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + } else { + buf_offset = 0; + } + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + } + if (ret != Z_STREAM_END && bytes_left != 0) { + ret = -1; + } else { + ret = 0; + } + zlib_inflateEnd(&workspace->inf_strm); +out: + free_workspace(workspace); + return ret; +} + +void btrfs_zlib_exit(void) +{ + free_workspaces(); +}