提交 dc2af6a6 编写于 作者: L Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable: (42 commits)
  Btrfs: hash the btree inode during  fill_super
  Btrfs: relocate file extents in clusters
  Btrfs: don't rename file into dummy directory
  Btrfs: check size of inode backref before adding hardlink
  Btrfs: fix releasepage to avoid unlocking extents we haven't locked
  Btrfs: Fix test_range_bit for whole file extents
  Btrfs: fix errors handling cached state in set/clear_extent_bit
  Btrfs: fix early enospc during balancing
  Btrfs: deal with NULL space info
  Btrfs: account for space used by the super mirrors
  Btrfs: fix extent entry threshold calculation
  Btrfs: remove dead code
  Btrfs: fix bitmap size tracking
  Btrfs: don't keep retrying a block group if we fail to allocate a cluster
  Btrfs: make balance code choose more wisely when relocating
  Btrfs: fix arithmetic error in clone ioctl
  Btrfs: add snapshot/subvolume destroy ioctl
  Btrfs: change how subvolumes are organized
  Btrfs: do not reuse objectid of deleted snapshot/subvol
  Btrfs: speed up snapshot dropping
  ...
......@@ -48,6 +48,9 @@ struct btrfs_worker_thread {
/* number of things on the pending list */
atomic_t num_pending;
/* reference counter for this struct */
atomic_t refs;
unsigned long sequence;
/* protects the pending list. */
......@@ -71,7 +74,12 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
unsigned long flags;
spin_lock_irqsave(&worker->workers->lock, flags);
worker->idle = 1;
list_move(&worker->worker_list, &worker->workers->idle_list);
/* the list may be empty if the worker is just starting */
if (!list_empty(&worker->worker_list)) {
list_move(&worker->worker_list,
&worker->workers->idle_list);
}
spin_unlock_irqrestore(&worker->workers->lock, flags);
}
}
......@@ -87,23 +95,49 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
unsigned long flags;
spin_lock_irqsave(&worker->workers->lock, flags);
worker->idle = 0;
list_move_tail(&worker->worker_list,
&worker->workers->worker_list);
if (!list_empty(&worker->worker_list)) {
list_move_tail(&worker->worker_list,
&worker->workers->worker_list);
}
spin_unlock_irqrestore(&worker->workers->lock, flags);
}
}
static noinline int run_ordered_completions(struct btrfs_workers *workers,
struct btrfs_work *work)
static void check_pending_worker_creates(struct btrfs_worker_thread *worker)
{
struct btrfs_workers *workers = worker->workers;
unsigned long flags;
rmb();
if (!workers->atomic_start_pending)
return;
spin_lock_irqsave(&workers->lock, flags);
if (!workers->atomic_start_pending)
goto out;
workers->atomic_start_pending = 0;
if (workers->num_workers >= workers->max_workers)
goto out;
spin_unlock_irqrestore(&workers->lock, flags);
btrfs_start_workers(workers, 1);
return;
out:
spin_unlock_irqrestore(&workers->lock, flags);
}
static noinline int run_ordered_completions(struct btrfs_workers *workers,
struct btrfs_work *work)
{
if (!workers->ordered)
return 0;
set_bit(WORK_DONE_BIT, &work->flags);
spin_lock_irqsave(&workers->lock, flags);
spin_lock(&workers->order_lock);
while (1) {
if (!list_empty(&workers->prio_order_list)) {
......@@ -126,45 +160,118 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags))
break;
spin_unlock_irqrestore(&workers->lock, flags);
spin_unlock(&workers->order_lock);
work->ordered_func(work);
/* now take the lock again and call the freeing code */
spin_lock_irqsave(&workers->lock, flags);
spin_lock(&workers->order_lock);
list_del(&work->order_list);
work->ordered_free(work);
}
spin_unlock_irqrestore(&workers->lock, flags);
spin_unlock(&workers->order_lock);
return 0;
}
static void put_worker(struct btrfs_worker_thread *worker)
{
if (atomic_dec_and_test(&worker->refs))
kfree(worker);
}
static int try_worker_shutdown(struct btrfs_worker_thread *worker)
{
int freeit = 0;
spin_lock_irq(&worker->lock);
spin_lock(&worker->workers->lock);
if (worker->workers->num_workers > 1 &&
worker->idle &&
!worker->working &&
!list_empty(&worker->worker_list) &&
list_empty(&worker->prio_pending) &&
list_empty(&worker->pending) &&
atomic_read(&worker->num_pending) == 0) {
freeit = 1;
list_del_init(&worker->worker_list);
worker->workers->num_workers--;
}
spin_unlock(&worker->workers->lock);
spin_unlock_irq(&worker->lock);
if (freeit)
put_worker(worker);
return freeit;
}
static struct btrfs_work *get_next_work(struct btrfs_worker_thread *worker,
struct list_head *prio_head,
struct list_head *head)
{
struct btrfs_work *work = NULL;
struct list_head *cur = NULL;
if(!list_empty(prio_head))
cur = prio_head->next;
smp_mb();
if (!list_empty(&worker->prio_pending))
goto refill;
if (!list_empty(head))
cur = head->next;
if (cur)
goto out;
refill:
spin_lock_irq(&worker->lock);
list_splice_tail_init(&worker->prio_pending, prio_head);
list_splice_tail_init(&worker->pending, head);
if (!list_empty(prio_head))
cur = prio_head->next;
else if (!list_empty(head))
cur = head->next;
spin_unlock_irq(&worker->lock);
if (!cur)
goto out_fail;
out:
work = list_entry(cur, struct btrfs_work, list);
out_fail:
return work;
}
/*
* main loop for servicing work items
*/
static int worker_loop(void *arg)
{
struct btrfs_worker_thread *worker = arg;
struct list_head *cur;
struct list_head head;
struct list_head prio_head;
struct btrfs_work *work;
INIT_LIST_HEAD(&head);
INIT_LIST_HEAD(&prio_head);
do {
spin_lock_irq(&worker->lock);
again_locked:
again:
while (1) {
if (!list_empty(&worker->prio_pending))
cur = worker->prio_pending.next;
else if (!list_empty(&worker->pending))
cur = worker->pending.next;
else
work = get_next_work(worker, &prio_head, &head);
if (!work)
break;
work = list_entry(cur, struct btrfs_work, list);
list_del(&work->list);
clear_bit(WORK_QUEUED_BIT, &work->flags);
work->worker = worker;
spin_unlock_irq(&worker->lock);
work->func(work);
......@@ -175,9 +282,13 @@ static int worker_loop(void *arg)
*/
run_ordered_completions(worker->workers, work);
spin_lock_irq(&worker->lock);
check_idle_worker(worker);
check_pending_worker_creates(worker);
}
spin_lock_irq(&worker->lock);
check_idle_worker(worker);
if (freezing(current)) {
worker->working = 0;
spin_unlock_irq(&worker->lock);
......@@ -216,8 +327,10 @@ static int worker_loop(void *arg)
spin_lock_irq(&worker->lock);
set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending))
goto again_locked;
!list_empty(&worker->prio_pending)) {
spin_unlock_irq(&worker->lock);
goto again;
}
/*
* this makes sure we get a wakeup when someone
......@@ -226,8 +339,13 @@ static int worker_loop(void *arg)
worker->working = 0;
spin_unlock_irq(&worker->lock);
if (!kthread_should_stop())
schedule();
if (!kthread_should_stop()) {
schedule_timeout(HZ * 120);
if (!worker->working &&
try_worker_shutdown(worker)) {
return 0;
}
}
}
__set_current_state(TASK_RUNNING);
}
......@@ -242,16 +360,30 @@ int btrfs_stop_workers(struct btrfs_workers *workers)
{
struct list_head *cur;
struct btrfs_worker_thread *worker;
int can_stop;
spin_lock_irq(&workers->lock);
list_splice_init(&workers->idle_list, &workers->worker_list);
while (!list_empty(&workers->worker_list)) {
cur = workers->worker_list.next;
worker = list_entry(cur, struct btrfs_worker_thread,
worker_list);
kthread_stop(worker->task);
list_del(&worker->worker_list);
kfree(worker);
atomic_inc(&worker->refs);
workers->num_workers -= 1;
if (!list_empty(&worker->worker_list)) {
list_del_init(&worker->worker_list);
put_worker(worker);
can_stop = 1;
} else
can_stop = 0;
spin_unlock_irq(&workers->lock);
if (can_stop)
kthread_stop(worker->task);
spin_lock_irq(&workers->lock);
put_worker(worker);
}
spin_unlock_irq(&workers->lock);
return 0;
}
......@@ -266,10 +398,13 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
INIT_LIST_HEAD(&workers->order_list);
INIT_LIST_HEAD(&workers->prio_order_list);
spin_lock_init(&workers->lock);
spin_lock_init(&workers->order_lock);
workers->max_workers = max;
workers->idle_thresh = 32;
workers->name = name;
workers->ordered = 0;
workers->atomic_start_pending = 0;
workers->atomic_worker_start = 0;
}
/*
......@@ -293,7 +428,9 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
INIT_LIST_HEAD(&worker->prio_pending);
INIT_LIST_HEAD(&worker->worker_list);
spin_lock_init(&worker->lock);
atomic_set(&worker->num_pending, 0);
atomic_set(&worker->refs, 1);
worker->workers = workers;
worker->task = kthread_run(worker_loop, worker,
"btrfs-%s-%d", workers->name,
......@@ -303,7 +440,6 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
kfree(worker);
goto fail;
}
spin_lock_irq(&workers->lock);
list_add_tail(&worker->worker_list, &workers->idle_list);
worker->idle = 1;
......@@ -350,7 +486,6 @@ static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers)
*/
next = workers->worker_list.next;
worker = list_entry(next, struct btrfs_worker_thread, worker_list);
atomic_inc(&worker->num_pending);
worker->sequence++;
if (worker->sequence % workers->idle_thresh == 0)
......@@ -367,28 +502,18 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
{
struct btrfs_worker_thread *worker;
unsigned long flags;
struct list_head *fallback;
again:
spin_lock_irqsave(&workers->lock, flags);
worker = next_worker(workers);
spin_unlock_irqrestore(&workers->lock, flags);
if (!worker) {
spin_lock_irqsave(&workers->lock, flags);
if (workers->num_workers >= workers->max_workers) {
struct list_head *fallback = NULL;
/*
* we have failed to find any workers, just
* return the force one
*/
if (!list_empty(&workers->worker_list))
fallback = workers->worker_list.next;
if (!list_empty(&workers->idle_list))
fallback = workers->idle_list.next;
BUG_ON(!fallback);
worker = list_entry(fallback,
struct btrfs_worker_thread, worker_list);
spin_unlock_irqrestore(&workers->lock, flags);
goto fallback;
} else if (workers->atomic_worker_start) {
workers->atomic_start_pending = 1;
goto fallback;
} else {
spin_unlock_irqrestore(&workers->lock, flags);
/* we're below the limit, start another worker */
......@@ -396,6 +521,28 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers)
goto again;
}
}
goto found;
fallback:
fallback = NULL;
/*
* we have failed to find any workers, just
* return the first one we can find.
*/
if (!list_empty(&workers->worker_list))
fallback = workers->worker_list.next;
if (!list_empty(&workers->idle_list))
fallback = workers->idle_list.next;
BUG_ON(!fallback);
worker = list_entry(fallback,
struct btrfs_worker_thread, worker_list);
found:
/*
* this makes sure the worker doesn't exit before it is placed
* onto a busy/idle list
*/
atomic_inc(&worker->num_pending);
spin_unlock_irqrestore(&workers->lock, flags);
return worker;
}
......@@ -427,7 +574,7 @@ int btrfs_requeue_work(struct btrfs_work *work)
spin_lock(&worker->workers->lock);
worker->idle = 0;
list_move_tail(&worker->worker_list,
&worker->workers->worker_list);
&worker->workers->worker_list);
spin_unlock(&worker->workers->lock);
}
if (!worker->working) {
......@@ -435,9 +582,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
worker->working = 1;
}
spin_unlock_irqrestore(&worker->lock, flags);
if (wake)
wake_up_process(worker->task);
spin_unlock_irqrestore(&worker->lock, flags);
out:
return 0;
......@@ -463,14 +610,18 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
worker = find_worker(workers);
if (workers->ordered) {
spin_lock_irqsave(&workers->lock, flags);
/*
* you're not allowed to do ordered queues from an
* interrupt handler
*/
spin_lock(&workers->order_lock);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
list_add_tail(&work->order_list,
&workers->prio_order_list);
} else {
list_add_tail(&work->order_list, &workers->order_list);
}
spin_unlock_irqrestore(&workers->lock, flags);
spin_unlock(&workers->order_lock);
} else {
INIT_LIST_HEAD(&work->order_list);
}
......@@ -481,7 +632,6 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
list_add_tail(&work->list, &worker->prio_pending);
else
list_add_tail(&work->list, &worker->pending);
atomic_inc(&worker->num_pending);
check_busy_worker(worker);
/*
......@@ -492,10 +642,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
wake = 1;
worker->working = 1;
spin_unlock_irqrestore(&worker->lock, flags);
if (wake)
wake_up_process(worker->task);
spin_unlock_irqrestore(&worker->lock, flags);
out:
return 0;
}
......@@ -73,6 +73,15 @@ struct btrfs_workers {
/* force completions in the order they were queued */
int ordered;
/* more workers required, but in an interrupt handler */
int atomic_start_pending;
/*
* are we allowed to sleep while starting workers or are we required
* to start them at a later time?
*/
int atomic_worker_start;
/* list with all the work threads. The workers on the idle thread
* may be actively servicing jobs, but they haven't yet hit the
* idle thresh limit above.
......@@ -90,6 +99,9 @@ struct btrfs_workers {
/* lock for finding the next worker thread to queue on */
spinlock_t lock;
/* lock for the ordered lists */
spinlock_t order_lock;
/* extra name for this worker, used for current->name */
char *name;
};
......
......@@ -138,6 +138,7 @@ struct btrfs_inode {
* of these.
*/
unsigned ordered_data_close:1;
unsigned dummy_inode:1;
struct inode vfs_inode;
};
......
......@@ -506,10 +506,10 @@ static noinline int add_ra_bio_pages(struct inode *inode,
*/
set_page_extent_mapped(page);
lock_extent(tree, last_offset, end, GFP_NOFS);
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, last_offset,
PAGE_CACHE_SIZE);
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
if (!em || last_offset < em->start ||
(last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
......@@ -593,11 +593,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_tree = &BTRFS_I(inode)->extent_tree;
/* we need the actual starting offset of this extent in the file */
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree,
page_offset(bio->bi_io_vec->bv_page),
PAGE_CACHE_SIZE);
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
compressed_len = em->block_len;
cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
......
......@@ -2853,6 +2853,12 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int split;
int num_doubles = 0;
l = path->nodes[0];
slot = path->slots[0];
if (extend && data_size + btrfs_item_size_nr(l, slot) +
sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
return -EOVERFLOW;
/* first try to make some room by pushing left and right */
if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
wret = push_leaf_right(trans, root, path, data_size, 0);
......
......@@ -114,6 +114,10 @@ struct btrfs_ordered_sum;
*/
#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
#define BTRFS_BTREE_INODE_OBJECTID 1
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
/*
* we can actually store much bigger names, but lets not confuse the rest
* of linux
......@@ -670,6 +674,7 @@ struct btrfs_space_info {
u64 bytes_reserved; /* total bytes the allocator has reserved for
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
u64 bytes_super; /* total bytes reserved for the super blocks */
/* delalloc accounting */
u64 bytes_delalloc; /* number of bytes reserved for allocation,
......@@ -726,6 +731,15 @@ enum btrfs_caching_type {
BTRFS_CACHE_FINISHED = 2,
};
struct btrfs_caching_control {
struct list_head list;
struct mutex mutex;
wait_queue_head_t wait;
struct btrfs_block_group_cache *block_group;
u64 progress;
atomic_t count;
};
struct btrfs_block_group_cache {
struct btrfs_key key;
struct btrfs_block_group_item item;
......@@ -733,6 +747,7 @@ struct btrfs_block_group_cache {
spinlock_t lock;
u64 pinned;
u64 reserved;
u64 bytes_super;
u64 flags;
u64 sectorsize;
int extents_thresh;
......@@ -742,8 +757,9 @@ struct btrfs_block_group_cache {
int dirty;
/* cache tracking stuff */
wait_queue_head_t caching_q;
int cached;
struct btrfs_caching_control *caching_ctl;
u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
......@@ -782,13 +798,16 @@ struct btrfs_fs_info {
/* the log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
spinlock_t fs_roots_radix_lock;
struct radix_tree_root fs_roots_radix;
/* block group cache stuff */
spinlock_t block_group_cache_lock;
struct rb_root block_group_cache_tree;
struct extent_io_tree pinned_extents;
struct extent_io_tree freed_extents[2];
struct extent_io_tree *pinned_extents;
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
......@@ -822,11 +841,7 @@ struct btrfs_fs_info {
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
struct mutex drop_mutex;
struct mutex volume_mutex;
struct mutex tree_reloc_mutex;
struct rw_semaphore extent_commit_sem;
/*
* this protects the ordered operations list only while we are
* processing all of the entries on it. This way we make
......@@ -835,10 +850,16 @@ struct btrfs_fs_info {
* before jumping into the main commit.
*/
struct mutex ordered_operations_mutex;
struct rw_semaphore extent_commit_sem;
struct rw_semaphore subvol_sem;
struct srcu_struct subvol_srcu;
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
struct list_head caching_block_groups;
atomic_t nr_async_submits;
atomic_t async_submit_draining;
......@@ -996,10 +1017,12 @@ struct btrfs_root {
u32 stripesize;
u32 type;
u64 highest_inode;
u64 last_inode_alloc;
u64 highest_objectid;
int ref_cows;
int track_dirty;
int in_radix;
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
......@@ -1920,8 +1943,8 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_update_pinned_extents(struct btrfs_root *root,
u64 bytenr, u64 num, int pin);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num, int reserved);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *leaf);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
......@@ -1971,9 +1994,10 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_io_tree *unpin);
struct btrfs_root *root);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
......@@ -1984,6 +2008,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
int btrfs_read_block_groups(struct btrfs_root *root);
int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr);
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytes_used,
u64 type, u64 chunk_objectid, u64 chunk_offset,
......@@ -2006,7 +2031,6 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
/* ctree.c */
int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
int level, int *slot);
......@@ -2100,12 +2124,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *parent);
/* root-item.c */
int btrfs_find_root_ref(struct btrfs_root *tree_root,
struct btrfs_path *path,
u64 root_id, u64 ref_id);
struct btrfs_path *path,
u64 root_id, u64 ref_id);
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
u64 root_id, u8 type, u64 ref_id,
u64 dirid, u64 sequence,
u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
const char *name, int name_len);
int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
const char *name, int name_len);
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key);
......@@ -2120,6 +2147,7 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
int btrfs_search_root(struct btrfs_root *root, u64 search_start,
u64 *found_objectid);
int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
int btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
/* dir-item.c */
......@@ -2138,6 +2166,10 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
u64 objectid, const char *name, int name_len,
int mod);
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
struct btrfs_path *path,
const char *name, int name_len);
......@@ -2160,6 +2192,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
/* inode-map.c */
int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
......@@ -2232,6 +2265,10 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct inode *parent_inode, struct inode *inode,
const char *name, int name_len, int add_backref, u64 index);
int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len);
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 new_size,
......@@ -2242,7 +2279,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
int btrfs_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root, struct dentry *dentry,
struct btrfs_root *new_root,
u64 new_dirid, u64 alloc_hint);
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio, unsigned long bio_flags);
......@@ -2258,6 +2295,7 @@ int btrfs_write_inode(struct inode *inode, int wait);
void btrfs_dirty_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
void btrfs_drop_inode(struct inode *inode);
int btrfs_init_cachep(void);
void btrfs_destroy_cachep(void);
long btrfs_ioctl_trans_end(struct file *file);
......@@ -2275,6 +2313,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
void btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct inode *inode, loff_t size);
int btrfs_invalidate_inodes(struct btrfs_root *root);
extern struct dentry_operations btrfs_dentry_operations;
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
......@@ -2290,7 +2330,7 @@ extern struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 locked_end,
u64 inline_limit, u64 *hint_block);
u64 inline_limit, u64 *hint_block, int drop_cache);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end);
......
......@@ -281,6 +281,53 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
return btrfs_match_dir_item_name(root, path, name, name_len);
}
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len)
{
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
u32 nritems;
int ret;
key.objectid = dirid;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret);
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
break;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
continue;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
di = btrfs_match_dir_item_name(root, path, name, name_len);
if (di)
return di;
path->slots[0]++;
}
return NULL;
}
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
......
......@@ -41,6 +41,7 @@
static struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void free_fs_root(struct btrfs_root *root);
static atomic_t btrfs_bdi_num = ATOMIC_INIT(0);
......@@ -123,15 +124,15 @@ static struct extent_map *btree_get_extent(struct inode *inode,
struct extent_map *em;
int ret;
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em) {
em->bdev =
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
goto out;
}
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
em = alloc_extent_map(GFP_NOFS);
if (!em) {
......@@ -144,7 +145,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
em->block_start = 0;
em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
spin_lock(&em_tree->lock);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
if (ret == -EEXIST) {
u64 failed_start = em->start;
......@@ -163,7 +164,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
free_extent_map(em);
em = NULL;
}
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
if (ret)
em = ERR_PTR(ret);
......@@ -895,8 +896,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
root->fs_info = fs_info;
root->objectid = objectid;
root->last_trans = 0;
root->highest_inode = 0;
root->last_inode_alloc = 0;
root->highest_objectid = 0;
root->name = NULL;
root->in_sysfs = 0;
root->inode_tree.rb_node = NULL;
......@@ -952,14 +952,16 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
root, fs_info, objectid);
ret = btrfs_find_last_root(tree_root, objectid,
&root->root_item, &root->root_key);
if (ret > 0)
return -ENOENT;
BUG_ON(ret);
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
root->commit_root = btrfs_root_node(root);
return 0;
}
......@@ -1095,7 +1097,6 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
struct btrfs_fs_info *fs_info = tree_root->fs_info;
struct btrfs_path *path;
struct extent_buffer *l;
u64 highest_inode;
u64 generation;
u32 blocksize;
int ret = 0;
......@@ -1110,7 +1111,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
kfree(root);
return ERR_PTR(ret);
}
goto insert;
goto out;
}
__setup_root(tree_root->nodesize, tree_root->leafsize,
......@@ -1120,39 +1121,30 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
path = btrfs_alloc_path();
BUG_ON(!path);
ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
if (ret != 0) {
if (ret > 0)
ret = -ENOENT;
goto out;
if (ret == 0) {
l = path->nodes[0];
read_extent_buffer(l, &root->root_item,
btrfs_item_ptr_offset(l, path->slots[0]),
sizeof(root->root_item));
memcpy(&root->root_key, location, sizeof(*location));
}
l = path->nodes[0];
read_extent_buffer(l, &root->root_item,
btrfs_item_ptr_offset(l, path->slots[0]),
sizeof(root->root_item));
memcpy(&root->root_key, location, sizeof(*location));
ret = 0;
out:
btrfs_release_path(root, path);
btrfs_free_path(path);
if (ret) {
kfree(root);
if (ret > 0)
ret = -ENOENT;
return ERR_PTR(ret);
}
generation = btrfs_root_generation(&root->root_item);
blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
blocksize, generation);
root->commit_root = btrfs_root_node(root);
BUG_ON(!root->node);
insert:
if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
out:
if (location->objectid != BTRFS_TREE_LOG_OBJECTID)
root->ref_cows = 1;
ret = btrfs_find_highest_inode(root, &highest_inode);
if (ret == 0) {
root->highest_inode = highest_inode;
root->last_inode_alloc = highest_inode;
}
}
return root;
}
......@@ -1187,39 +1179,66 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
return fs_info->dev_root;
if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
return fs_info->csum_root;
again:
spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)location->objectid);
spin_unlock(&fs_info->fs_roots_radix_lock);
if (root)
return root;
ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
if (ret == 0)
ret = -ENOENT;
if (ret < 0)
return ERR_PTR(ret);
root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
if (IS_ERR(root))
return root;
WARN_ON(btrfs_root_refs(&root->root_item) == 0);
set_anon_super(&root->anon_super, NULL);
ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
if (ret)
goto fail;
spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
if (ret == 0)
root->in_radix = 1;
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
if (ret) {
free_extent_buffer(root->node);
kfree(root);
return ERR_PTR(ret);
if (ret == -EEXIST) {
free_fs_root(root);
goto again;
}
goto fail;
}
if (!(fs_info->sb->s_flags & MS_RDONLY)) {
ret = btrfs_find_dead_roots(fs_info->tree_root,
root->root_key.objectid);
BUG_ON(ret);
ret = btrfs_find_dead_roots(fs_info->tree_root,
root->root_key.objectid);
WARN_ON(ret);
if (!(fs_info->sb->s_flags & MS_RDONLY))
btrfs_orphan_cleanup(root);
}
return root;
fail:
free_fs_root(root);
return ERR_PTR(ret);
}
struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *location,
const char *name, int namelen)
{
return btrfs_read_fs_root_no_name(fs_info, location);
#if 0
struct btrfs_root *root;
int ret;
......@@ -1236,7 +1255,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
kfree(root);
return ERR_PTR(ret);
}
#if 0
ret = btrfs_sysfs_add_root(root);
if (ret) {
free_extent_buffer(root->node);
......@@ -1244,9 +1263,9 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
kfree(root);
return ERR_PTR(ret);
}
#endif
root->in_sysfs = 1;
return root;
#endif
}
static int btrfs_congested_fn(void *congested_data, int bdi_bits)
......@@ -1325,9 +1344,9 @@ static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
offset = page_offset(page);
em_tree = &BTRFS_I(inode)->extent_tree;
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
if (!em) {
__unplug_io_fn(bdi, page);
return;
......@@ -1360,8 +1379,10 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
err = bdi_register(bdi, NULL, "btrfs-%d",
atomic_inc_return(&btrfs_bdi_num));
if (err)
if (err) {
bdi_destroy(bdi);
return err;
}
bdi->ra_pages = default_backing_dev_info.ra_pages;
bdi->unplug_io_fn = btrfs_unplug_io_fn;
......@@ -1451,9 +1472,12 @@ static int cleaner_kthread(void *arg)
break;
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
mutex_trylock(&root->fs_info->cleaner_mutex)) {
btrfs_clean_old_snapshots(root);
mutex_unlock(&root->fs_info->cleaner_mutex);
}
if (freezing(current)) {
refrigerator();
......@@ -1558,15 +1582,36 @@ struct btrfs_root *open_ctree(struct super_block *sb,
err = -ENOMEM;
goto fail;
}
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS);
ret = init_srcu_struct(&fs_info->subvol_srcu);
if (ret) {
err = ret;
goto fail;
}
ret = setup_bdi(fs_info, &fs_info->bdi);
if (ret) {
err = ret;
goto fail_srcu;
}
fs_info->btree_inode = new_inode(sb);
if (!fs_info->btree_inode) {
err = -ENOMEM;
goto fail_bdi;
}
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
INIT_LIST_HEAD(&fs_info->ordered_operations);
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->new_trans_lock);
spin_lock_init(&fs_info->ref_cache_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
init_completion(&fs_info->kobj_unregister);
fs_info->tree_root = tree_root;
......@@ -1585,11 +1630,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->sb = sb;
fs_info->max_extent = (u64)-1;
fs_info->max_inline = 8192 * 1024;
if (setup_bdi(fs_info, &fs_info->bdi))
goto fail_bdi;
fs_info->btree_inode = new_inode(sb);
fs_info->btree_inode->i_ino = 1;
fs_info->btree_inode->i_nlink = 1;
fs_info->metadata_ratio = 8;
fs_info->thread_pool_size = min_t(unsigned long,
......@@ -1602,6 +1642,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
sb->s_blocksize_bits = blksize_bits(4096);
sb->s_bdi = &fs_info->bdi;
fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
fs_info->btree_inode->i_nlink = 1;
/*
* we set the i_size on the btree inode to the max possible int.
* the real end of the address space is determined by all of
......@@ -1620,28 +1662,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
BTRFS_I(fs_info->btree_inode)->root = tree_root;
memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
sizeof(struct btrfs_key));
BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
insert_inode_hash(fs_info->btree_inode);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree.rb_node = NULL;
extent_io_tree_init(&fs_info->pinned_extents,
extent_io_tree_init(&fs_info->freed_extents[0],
fs_info->btree_inode->i_mapping, GFP_NOFS);
extent_io_tree_init(&fs_info->freed_extents[1],
fs_info->btree_inode->i_mapping, GFP_NOFS);
fs_info->pinned_extents = &fs_info->freed_extents[0];
fs_info->do_barriers = 1;
BTRFS_I(fs_info->btree_inode)->root = tree_root;
memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
sizeof(struct btrfs_key));
insert_inode_hash(fs_info->btree_inode);
mutex_init(&fs_info->trans_mutex);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->drop_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
mutex_init(&fs_info->tree_reloc_mutex);
init_rwsem(&fs_info->extent_commit_sem);
init_rwsem(&fs_info->subvol_sem);
btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
......@@ -1700,7 +1746,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
err = -EINVAL;
goto fail_iput;
}
printk("thread pool is %d\n", fs_info->thread_pool_size);
/*
* we need to start all the end_io workers up front because the
* queue work function gets called at interrupt time, and so it
......@@ -1745,20 +1791,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->endio_workers.idle_thresh = 4;
fs_info->endio_meta_workers.idle_thresh = 4;
fs_info->endio_write_workers.idle_thresh = 64;
fs_info->endio_meta_write_workers.idle_thresh = 64;
fs_info->endio_write_workers.idle_thresh = 2;
fs_info->endio_meta_write_workers.idle_thresh = 2;
fs_info->endio_workers.atomic_worker_start = 1;
fs_info->endio_meta_workers.atomic_worker_start = 1;
fs_info->endio_write_workers.atomic_worker_start = 1;
fs_info->endio_meta_write_workers.atomic_worker_start = 1;
btrfs_start_workers(&fs_info->workers, 1);
btrfs_start_workers(&fs_info->submit_workers, 1);
btrfs_start_workers(&fs_info->delalloc_workers, 1);
btrfs_start_workers(&fs_info->fixup_workers, 1);
btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_meta_workers,
fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_meta_write_workers,
fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_write_workers,
fs_info->thread_pool_size);
btrfs_start_workers(&fs_info->endio_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_workers, 1);
btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
btrfs_start_workers(&fs_info->endio_write_workers, 1);
fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
......@@ -1918,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
}
}
ret = btrfs_find_orphan_roots(tree_root);
BUG_ON(ret);
if (!(sb->s_flags & MS_RDONLY)) {
ret = btrfs_recover_relocation(tree_root);
BUG_ON(ret);
......@@ -1977,6 +2028,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
btrfs_mapping_tree_free(&fs_info->mapping_tree);
fail_bdi:
bdi_destroy(&fs_info->bdi);
fail_srcu:
cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
kfree(extent_root);
kfree(tree_root);
......@@ -2236,20 +2289,29 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
{
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid);
spin_unlock(&fs_info->fs_roots_radix_lock);
if (btrfs_root_refs(&root->root_item) == 0)
synchronize_srcu(&fs_info->subvol_srcu);
free_fs_root(root);
return 0;
}
static void free_fs_root(struct btrfs_root *root)
{
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
if (root->anon_super.s_dev) {
down_write(&root->anon_super.s_umount);
kill_anon_super(&root->anon_super);
}
if (root->node)
free_extent_buffer(root->node);
if (root->commit_root)
free_extent_buffer(root->commit_root);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root->name);
kfree(root);
return 0;
}
static int del_fs_roots(struct btrfs_fs_info *fs_info)
......@@ -2258,6 +2320,20 @@ static int del_fs_roots(struct btrfs_fs_info *fs_info)
struct btrfs_root *gang[8];
int i;
while (!list_empty(&fs_info->dead_roots)) {
gang[0] = list_entry(fs_info->dead_roots.next,
struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
if (gang[0]->in_radix) {
btrfs_free_fs_root(fs_info, gang[0]);
} else {
free_extent_buffer(gang[0]->node);
free_extent_buffer(gang[0]->commit_root);
kfree(gang[0]);
}
}
while (1) {
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, 0,
......@@ -2287,9 +2363,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
root_objectid = gang[ret - 1]->root_key.objectid + 1;
for (i = 0; i < ret; i++) {
root_objectid = gang[i]->root_key.objectid;
ret = btrfs_find_dead_roots(fs_info->tree_root,
root_objectid);
BUG_ON(ret);
btrfs_orphan_cleanup(gang[i]);
}
root_objectid++;
......@@ -2359,7 +2432,6 @@ int close_ctree(struct btrfs_root *root)
free_extent_buffer(root->fs_info->csum_root->commit_root);
btrfs_free_block_groups(root->fs_info);
btrfs_free_pinned_extents(root->fs_info);
del_fs_roots(fs_info);
......@@ -2378,6 +2450,7 @@ int close_ctree(struct btrfs_root *root)
btrfs_mapping_tree_free(&fs_info->mapping_tree);
bdi_destroy(&fs_info->bdi);
cleanup_srcu_struct(&fs_info->subvol_srcu);
kfree(fs_info->extent_root);
kfree(fs_info->tree_root);
......
......@@ -28,7 +28,7 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
len = BTRFS_FID_SIZE_NON_CONNECTABLE;
type = FILEID_BTRFS_WITHOUT_PARENT;
fid->objectid = BTRFS_I(inode)->location.objectid;
fid->objectid = inode->i_ino;
fid->root_objectid = BTRFS_I(inode)->root->objectid;
fid->gen = inode->i_generation;
......@@ -60,34 +60,61 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
}
static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u32 generation)
u64 root_objectid, u32 generation,
int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
struct btrfs_root *root;
struct dentry *dentry;
struct inode *inode;
struct btrfs_key key;
int index;
int err = 0;
if (objectid < BTRFS_FIRST_FREE_OBJECTID)
return ERR_PTR(-ESTALE);
key.objectid = root_objectid;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key);
if (IS_ERR(root))
return ERR_CAST(root);
index = srcu_read_lock(&fs_info->subvol_srcu);
root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(root)) {
err = PTR_ERR(root);
goto fail;
}
if (btrfs_root_refs(&root->root_item) == 0) {
err = -ENOENT;
goto fail;
}
key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
inode = btrfs_iget(sb, &key, root);
if (IS_ERR(inode))
return (void *)inode;
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
goto fail;
}
srcu_read_unlock(&fs_info->subvol_srcu, index);
if (generation != inode->i_generation) {
if (check_generation && generation != inode->i_generation) {
iput(inode);
return ERR_PTR(-ESTALE);
}
return d_obtain_alias(inode);
dentry = d_obtain_alias(inode);
if (!IS_ERR(dentry))
dentry->d_op = &btrfs_dentry_operations;
return dentry;
fail:
srcu_read_unlock(&fs_info->subvol_srcu, index);
return ERR_PTR(err);
}
static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
......@@ -111,7 +138,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
objectid = fid->parent_objectid;
generation = fid->parent_gen;
return btrfs_get_dentry(sb, objectid, root_objectid, generation);
return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
......@@ -133,66 +160,76 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
root_objectid = fid->root_objectid;
generation = fid->gen;
return btrfs_get_dentry(sb, objectid, root_objectid, generation);
return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
static struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = child->d_inode;
static struct dentry *dentry;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
int slot;
u64 objectid;
struct btrfs_root_ref *ref;
struct btrfs_key key;
struct btrfs_key found_key;
int ret;
path = btrfs_alloc_path();
key.objectid = dir->i_ino;
btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
key.offset = (u64)-1;
if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
key.objectid = root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = (u64)-1;
root = root->fs_info->tree_root;
} else {
key.objectid = dir->i_ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
}
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
/* Error */
btrfs_free_path(path);
return ERR_PTR(ret);
if (ret < 0)
goto fail;
BUG_ON(ret == 0);
if (path->slots[0] == 0) {
ret = -ENOENT;
goto fail;
}
path->slots[0]--;
leaf = path->nodes[0];
slot = path->slots[0];
if (ret) {
/* btrfs_search_slot() returns the slot where we'd want to
insert a backref for parent inode #0xFFFFFFFFFFFFFFFF.
The _real_ backref, telling us what the parent inode
_actually_ is, will be in the slot _before_ the one
that btrfs_search_slot() returns. */
if (!slot) {
/* Unless there is _no_ key in the tree before... */
btrfs_free_path(path);
return ERR_PTR(-EIO);
}
slot--;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
if (found_key.objectid != key.objectid || found_key.type != key.type) {
ret = -ENOENT;
goto fail;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
key.objectid = btrfs_root_ref_dirid(leaf, ref);
} else {
key.objectid = found_key.offset;
}
btrfs_free_path(path);
if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY)
return ERR_PTR(-EINVAL);
objectid = key.offset;
/* If we are already at the root of a subvol, return the real root */
if (objectid == dir->i_ino)
return dget(dir->i_sb->s_root);
if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
return btrfs_get_dentry(root->fs_info->sb, key.objectid,
found_key.offset, 0, 0);
}
/* Build a new key for the inode item */
key.objectid = objectid;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
if (!IS_ERR(dentry))
dentry->d_op = &btrfs_dentry_operations;
return dentry;
fail:
btrfs_free_path(path);
return ERR_PTR(ret);
}
const struct export_operations btrfs_export_ops = {
......
......@@ -32,12 +32,12 @@
#include "locking.h"
#include "free-space-cache.h"
static int update_reserved_extents(struct btrfs_root *root,
u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc,
int mark_free);
static int update_reserved_extents(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
......@@ -57,10 +57,17 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
u64 flags, int force);
static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
u64 bytenr, u64 num_bytes,
int is_data, int reserved,
struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
......@@ -153,34 +160,34 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
return ret;
}
/*
* We always set EXTENT_LOCKED for the super mirror extents so we don't
* overwrite them, so those bits need to be unset. Also, if we are unmounting
* with pinned extents still sitting there because we had a block group caching,
* we need to clear those now, since we are done.
*/
void btrfs_free_pinned_extents(struct btrfs_fs_info *info)
static int add_excluded_extent(struct btrfs_root *root,
u64 start, u64 num_bytes)
{
u64 start, end, last = 0;
int ret;
u64 end = start + num_bytes - 1;
set_extent_bits(&root->fs_info->freed_extents[0],
start, end, EXTENT_UPTODATE, GFP_NOFS);
set_extent_bits(&root->fs_info->freed_extents[1],
start, end, EXTENT_UPTODATE, GFP_NOFS);
return 0;
}
while (1) {
ret = find_first_extent_bit(&info->pinned_extents, last,
&start, &end,
EXTENT_LOCKED|EXTENT_DIRTY);
if (ret)
break;
static void free_excluded_extents(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
u64 start, end;
clear_extent_bits(&info->pinned_extents, start, end,
EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS);
last = end+1;
}
start = cache->key.objectid;
end = start + cache->key.offset - 1;
clear_extent_bits(&root->fs_info->freed_extents[0],
start, end, EXTENT_UPTODATE, GFP_NOFS);
clear_extent_bits(&root->fs_info->freed_extents[1],
start, end, EXTENT_UPTODATE, GFP_NOFS);
}
static int remove_sb_from_cache(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
static int exclude_super_stripes(struct btrfs_root *root,
struct btrfs_block_group_cache *cache)
{
struct btrfs_fs_info *fs_info = root->fs_info;
u64 bytenr;
u64 *logical;
int stripe_len;
......@@ -192,17 +199,42 @@ static int remove_sb_from_cache(struct btrfs_root *root,
cache->key.objectid, bytenr,
0, &logical, &nr, &stripe_len);
BUG_ON(ret);
while (nr--) {
try_lock_extent(&fs_info->pinned_extents,
logical[nr],
logical[nr] + stripe_len - 1, GFP_NOFS);
cache->bytes_super += stripe_len;
ret = add_excluded_extent(root, logical[nr],
stripe_len);
BUG_ON(ret);
}
kfree(logical);
}
return 0;
}
static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
struct btrfs_caching_control *ctl;
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_STARTED) {
spin_unlock(&cache->lock);
return NULL;
}
ctl = cache->caching_ctl;
atomic_inc(&ctl->count);
spin_unlock(&cache->lock);
return ctl;
}
static void put_caching_control(struct btrfs_caching_control *ctl)
{
if (atomic_dec_and_test(&ctl->count))
kfree(ctl);
}
/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
......@@ -215,9 +247,9 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
int ret;
while (start < end) {
ret = find_first_extent_bit(&info->pinned_extents, start,
ret = find_first_extent_bit(info->pinned_extents, start,
&extent_start, &extent_end,
EXTENT_DIRTY|EXTENT_LOCKED);
EXTENT_DIRTY | EXTENT_UPTODATE);
if (ret)
break;
......@@ -249,22 +281,27 @@ static int caching_kthread(void *data)
{
struct btrfs_block_group_cache *block_group = data;
struct btrfs_fs_info *fs_info = block_group->fs_info;
u64 last = 0;
struct btrfs_caching_control *caching_ctl = block_group->caching_ctl;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_path *path;
int ret = 0;
struct btrfs_key key;
struct extent_buffer *leaf;
int slot;
struct btrfs_key key;
u64 total_found = 0;
BUG_ON(!fs_info);
u64 last = 0;
u32 nritems;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
atomic_inc(&block_group->space_info->caching_threads);
exclude_super_stripes(extent_root, block_group);
spin_lock(&block_group->space_info->lock);
block_group->space_info->bytes_super += block_group->bytes_super;
spin_unlock(&block_group->space_info->lock);
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
......@@ -277,74 +314,64 @@ static int caching_kthread(void *data)
key.objectid = last;
key.offset = 0;
btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
key.type = BTRFS_EXTENT_ITEM_KEY;
again:
mutex_lock(&caching_ctl->mutex);
/* need to make sure the commit_root doesn't disappear */
down_read(&fs_info->extent_commit_sem);
ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
if (ret < 0)
goto err;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
smp_mb();
if (block_group->fs_info->closing > 1) {
if (fs_info->closing > 1) {
last = (u64)-1;
break;
}
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(fs_info->extent_root, path);
if (ret < 0)
goto err;
else if (ret)
if (path->slots[0] < nritems) {
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
} else {
ret = find_next_key(path, 0, &key);
if (ret)
break;
if (need_resched() ||
btrfs_transaction_in_commit(fs_info)) {
leaf = path->nodes[0];
/* this shouldn't happen, but if the
* leaf is empty just move on.
*/
if (btrfs_header_nritems(leaf) == 0)
break;
/*
* we need to copy the key out so that
* we are sure the next search advances
* us forward in the btree.
*/
btrfs_item_key_to_cpu(leaf, &key, 0);
btrfs_release_path(fs_info->extent_root, path);
up_read(&fs_info->extent_commit_sem);
caching_ctl->progress = last;
btrfs_release_path(extent_root, path);
up_read(&fs_info->extent_commit_sem);
mutex_unlock(&caching_ctl->mutex);
if (btrfs_transaction_in_commit(fs_info))
schedule_timeout(1);
goto again;
}
else
cond_resched();
goto again;
}
if (key.objectid < block_group->key.objectid) {
path->slots[0]++;
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid < block_group->key.objectid)
goto next;
if (key.objectid >= block_group->key.objectid +
block_group->key.offset)
break;
if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
if (key.type == BTRFS_EXTENT_ITEM_KEY) {
total_found += add_new_free_space(block_group,
fs_info, last,
key.objectid);
last = key.objectid + key.offset;
}
if (total_found > (1024 * 1024 * 2)) {
total_found = 0;
wake_up(&block_group->caching_q);
if (total_found > (1024 * 1024 * 2)) {
total_found = 0;
wake_up(&caching_ctl->wait);
}
}
next:
path->slots[0]++;
}
ret = 0;
......@@ -352,33 +379,65 @@ static int caching_kthread(void *data)
total_found += add_new_free_space(block_group, fs_info, last,
block_group->key.objectid +
block_group->key.offset);
caching_ctl->progress = (u64)-1;
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
err:
btrfs_free_path(path);
up_read(&fs_info->extent_commit_sem);
atomic_dec(&block_group->space_info->caching_threads);
wake_up(&block_group->caching_q);
free_excluded_extents(extent_root, block_group);
mutex_unlock(&caching_ctl->mutex);
wake_up(&caching_ctl->wait);
put_caching_control(caching_ctl);
atomic_dec(&block_group->space_info->caching_threads);
return 0;
}
static int cache_block_group(struct btrfs_block_group_cache *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
struct btrfs_caching_control *caching_ctl;
struct task_struct *tsk;
int ret = 0;
smp_mb();
if (cache->cached != BTRFS_CACHE_NO)
return 0;
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
BUG_ON(!caching_ctl);
INIT_LIST_HEAD(&caching_ctl->list);
mutex_init(&caching_ctl->mutex);
init_waitqueue_head(&caching_ctl->wait);
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
/* one for caching kthread, one for caching block group list */
atomic_set(&caching_ctl->count, 2);
spin_lock(&cache->lock);
if (cache->cached != BTRFS_CACHE_NO) {
spin_unlock(&cache->lock);
return ret;
kfree(caching_ctl);
return 0;
}
cache->caching_ctl = caching_ctl;
cache->cached = BTRFS_CACHE_STARTED;
spin_unlock(&cache->lock);
down_write(&fs_info->extent_commit_sem);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
up_write(&fs_info->extent_commit_sem);
atomic_inc(&cache->space_info->caching_threads);
tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n",
cache->key.objectid);
if (IS_ERR(tsk)) {
......@@ -1657,7 +1716,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
parent, ref_root, flags,
ref->objectid, ref->offset,
&ins, node->ref_mod);
update_reserved_extents(root, ins.objectid, ins.offset, 0);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent,
......@@ -1783,7 +1841,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
extent_op->flags_to_set,
&extent_op->key,
ref->level, &ins);
update_reserved_extents(root, ins.objectid, ins.offset, 0);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
node->num_bytes, parent, ref_root,
......@@ -1818,16 +1875,32 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
if (insert_reserved) {
int mark_free = 0;
struct extent_buffer *must_clean = NULL;
ret = pin_down_bytes(trans, root, NULL,
node->bytenr, node->num_bytes,
head->is_data, 1, &must_clean);
if (ret > 0)
mark_free = 1;
if (must_clean) {
clean_tree_block(NULL, root, must_clean);
btrfs_tree_unlock(must_clean);
free_extent_buffer(must_clean);
}
if (head->is_data) {
ret = btrfs_del_csums(trans, root,
node->bytenr,
node->num_bytes);
BUG_ON(ret);
}
btrfs_update_pinned_extents(root, node->bytenr,
node->num_bytes, 1);
update_reserved_extents(root, node->bytenr,
node->num_bytes, 0);
if (mark_free) {
ret = btrfs_free_reserved_extent(root,
node->bytenr,
node->num_bytes);
BUG_ON(ret);
}
}
mutex_unlock(&head->mutex);
return 0;
......@@ -2706,6 +2779,8 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root)
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
if (!meta_sinfo)
goto alloc;
again:
spin_lock(&meta_sinfo->lock);
......@@ -2717,12 +2792,13 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root)
do_div(thresh, 100);
if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
meta_sinfo->bytes_super > thresh) {
struct btrfs_trans_handle *trans;
if (!meta_sinfo->full) {
meta_sinfo->force_alloc = 1;
spin_unlock(&meta_sinfo->lock);
alloc:
trans = btrfs_start_transaction(root, 1);
if (!trans)
return -ENOMEM;
......@@ -2730,6 +2806,10 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root)
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2 * 1024 * 1024, alloc_target, 0);
btrfs_end_transaction(trans, root);
if (!meta_sinfo) {
meta_sinfo = __find_space_info(info,
alloc_target);
}
goto again;
}
spin_unlock(&meta_sinfo->lock);
......@@ -2765,13 +2845,16 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
data_sinfo = BTRFS_I(inode)->space_info;
if (!data_sinfo)
goto alloc;
again:
/* make sure we have enough space to handle the data first */
spin_lock(&data_sinfo->lock);
if (data_sinfo->total_bytes - data_sinfo->bytes_used -
data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
data_sinfo->bytes_may_use < bytes) {
data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
struct btrfs_trans_handle *trans;
/*
......@@ -2783,7 +2866,7 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
data_sinfo->force_alloc = 1;
spin_unlock(&data_sinfo->lock);
alloc:
alloc_target = btrfs_get_alloc_profile(root, 1);
trans = btrfs_start_transaction(root, 1);
if (!trans)
......@@ -2795,6 +2878,11 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
btrfs_end_transaction(trans, root);
if (ret)
return ret;
if (!data_sinfo) {
btrfs_set_inode_space_info(root, inode);
data_sinfo = BTRFS_I(inode)->space_info;
}
goto again;
}
spin_unlock(&data_sinfo->lock);
......@@ -3009,10 +3097,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
num_bytes = min(total, cache->key.offset - byte_in_group);
if (alloc) {
old_val += num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
cache->reserved -= num_bytes;
cache->space_info->bytes_used += num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
if (cache->ro)
cache->space_info->bytes_readonly -= num_bytes;
btrfs_set_block_group_used(&cache->item, old_val);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
} else {
......@@ -3057,127 +3147,136 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
return bytenr;
}
int btrfs_update_pinned_extents(struct btrfs_root *root,
u64 bytenr, u64 num, int pin)
/*
* this function must be called within transaction
*/
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int reserved)
{
u64 len;
struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache;
if (pin)
set_extent_dirty(&fs_info->pinned_extents,
bytenr, bytenr + num - 1, GFP_NOFS);
while (num > 0) {
cache = btrfs_lookup_block_group(fs_info, bytenr);
BUG_ON(!cache);
len = min(num, cache->key.offset -
(bytenr - cache->key.objectid));
if (pin) {
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += len;
cache->space_info->bytes_pinned += len;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
fs_info->total_pinned += len;
} else {
int unpin = 0;
cache = btrfs_lookup_block_group(fs_info, bytenr);
BUG_ON(!cache);
/*
* in order to not race with the block group caching, we
* only want to unpin the extent if we are cached. If
* we aren't cached, we want to start async caching this
* block group so we can free the extent the next time
* around.
*/
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
unpin = (cache->cached == BTRFS_CACHE_FINISHED);
if (likely(unpin)) {
cache->pinned -= len;
cache->space_info->bytes_pinned -= len;
fs_info->total_pinned -= len;
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned += num_bytes;
cache->space_info->bytes_pinned += num_bytes;
if (reserved) {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
if (likely(unpin))
clear_extent_dirty(&fs_info->pinned_extents,
bytenr, bytenr + len -1,
GFP_NOFS);
else
cache_block_group(cache);
btrfs_put_block_group(cache);
if (unpin)
btrfs_add_free_space(cache, bytenr, len);
}
btrfs_put_block_group(cache);
bytenr += len;
num -= len;
set_extent_dirty(fs_info->pinned_extents,
bytenr, bytenr + num_bytes - 1, GFP_NOFS);
return 0;
}
static int update_reserved_extents(struct btrfs_block_group_cache *cache,
u64 num_bytes, int reserve)
{
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
if (reserve) {
cache->reserved += num_bytes;
cache->space_info->bytes_reserved += num_bytes;
} else {
cache->reserved -= num_bytes;
cache->space_info->bytes_reserved -= num_bytes;
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
return 0;
}
static int update_reserved_extents(struct btrfs_root *root,
u64 bytenr, u64 num, int reserve)
int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
u64 len;
struct btrfs_block_group_cache *cache;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_caching_control *next;
struct btrfs_caching_control *caching_ctl;
struct btrfs_block_group_cache *cache;
while (num > 0) {
cache = btrfs_lookup_block_group(fs_info, bytenr);
BUG_ON(!cache);
len = min(num, cache->key.offset -
(bytenr - cache->key.objectid));
down_write(&fs_info->extent_commit_sem);
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
if (reserve) {
cache->reserved += len;
cache->space_info->bytes_reserved += len;
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
cache = caching_ctl->block_group;
if (block_group_cache_done(cache)) {
cache->last_byte_to_unpin = (u64)-1;
list_del_init(&caching_ctl->list);
put_caching_control(caching_ctl);
} else {
cache->reserved -= len;
cache->space_info->bytes_reserved -= len;
cache->last_byte_to_unpin = caching_ctl->progress;
}
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
btrfs_put_block_group(cache);
bytenr += len;
num -= len;
}
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
fs_info->pinned_extents = &fs_info->freed_extents[1];
else
fs_info->pinned_extents = &fs_info->freed_extents[0];
up_write(&fs_info->extent_commit_sem);
return 0;
}
int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
u64 last = 0;
u64 start;
u64 end;
struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_group_cache *cache = NULL;
u64 len;
while (1) {
ret = find_first_extent_bit(pinned_extents, last,
&start, &end, EXTENT_DIRTY);
if (ret)
break;
while (start <= end) {
if (!cache ||
start >= cache->key.objectid + cache->key.offset) {
if (cache)
btrfs_put_block_group(cache);
cache = btrfs_lookup_block_group(fs_info, start);
BUG_ON(!cache);
}
len = cache->key.objectid + cache->key.offset - start;
len = min(len, end + 1 - start);
if (start < cache->last_byte_to_unpin) {
len = min(len, cache->last_byte_to_unpin - start);
btrfs_add_free_space(cache, start, len);
}
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
cache->space_info->bytes_pinned -= len;
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
set_extent_dirty(copy, start, end, GFP_NOFS);
last = end + 1;
start += len;
}
if (cache)
btrfs_put_block_group(cache);
return 0;
}
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_io_tree *unpin)
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_io_tree *unpin;
u64 start;
u64 end;
int ret;
if (fs_info->pinned_extents == &fs_info->freed_extents[0])
unpin = &fs_info->freed_extents[1];
else
unpin = &fs_info->freed_extents[0];
while (1) {
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
......@@ -3186,10 +3285,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
ret = btrfs_discard_extent(root, start, end + 1 - start);
/* unlocks the pinned mutex */
btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
unpin_extent_range(root, start, end);
cond_resched();
}
......@@ -3199,7 +3296,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
u64 bytenr, u64 num_bytes, int is_data,
u64 bytenr, u64 num_bytes,
int is_data, int reserved,
struct extent_buffer **must_clean)
{
int err = 0;
......@@ -3231,15 +3329,15 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
}
free_extent_buffer(buf);
pinit:
btrfs_set_path_blocking(path);
if (path)
btrfs_set_path_blocking(path);
/* unlocks the pinned mutex */
btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
btrfs_pin_extent(root, bytenr, num_bytes, reserved);
BUG_ON(err < 0);
return 0;
}
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
......@@ -3413,7 +3511,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
ret = pin_down_bytes(trans, root, path, bytenr,
num_bytes, is_data, &must_clean);
num_bytes, is_data, 0, &must_clean);
if (ret > 0)
mark_free = 1;
BUG_ON(ret < 0);
......@@ -3544,8 +3642,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
/* unlocks the pinned mutex */
btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
update_reserved_extents(root, bytenr, num_bytes, 0);
btrfs_pin_extent(root, bytenr, num_bytes, 1);
ret = 0;
} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
......@@ -3585,19 +3682,33 @@ static noinline int
wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
u64 num_bytes)
{
struct btrfs_caching_control *caching_ctl;
DEFINE_WAIT(wait);
prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE);
if (block_group_cache_done(cache)) {
finish_wait(&cache->caching_q, &wait);
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
return 0;
}
schedule();
finish_wait(&cache->caching_q, &wait);
wait_event(cache->caching_q, block_group_cache_done(cache) ||
wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
(cache->free_space >= num_bytes));
put_caching_control(caching_ctl);
return 0;
}
static noinline int
wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
{
struct btrfs_caching_control *caching_ctl;
DEFINE_WAIT(wait);
caching_ctl = get_caching_control(cache);
if (!caching_ctl)
return 0;
wait_event(caching_ctl->wait, block_group_cache_done(cache));
put_caching_control(caching_ctl);
return 0;
}
......@@ -3635,6 +3746,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
int last_ptr_loop = 0;
int loop = 0;
bool found_uncached_bg = false;
bool failed_cluster_refill = false;
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
......@@ -3732,7 +3844,16 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
if (unlikely(block_group->ro))
goto loop;
if (last_ptr) {
/*
* Ok we want to try and use the cluster allocator, so lets look
* there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
* have tried the cluster allocator plenty of times at this
* point and not have found anything, so we are likely way too
* fragmented for the clustering stuff to find anything, so lets
* just skip it and let the allocator find whatever block it can
* find
*/
if (last_ptr && loop < LOOP_NO_EMPTY_SIZE) {
/*
* the refill lock keeps out other
* people trying to start a new cluster
......@@ -3807,9 +3928,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
spin_unlock(&last_ptr->refill_lock);
goto checks;
}
} else if (!cached && loop > LOOP_CACHING_NOWAIT) {
} else if (!cached && loop > LOOP_CACHING_NOWAIT
&& !failed_cluster_refill) {
spin_unlock(&last_ptr->refill_lock);
failed_cluster_refill = true;
wait_block_group_cache_progress(block_group,
num_bytes + empty_cluster + empty_size);
goto have_block_group;
......@@ -3821,13 +3944,9 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
* cluster. Free the cluster we've been trying
* to use, and go to the next block group
*/
if (loop < LOOP_NO_EMPTY_SIZE) {
btrfs_return_cluster_to_free_space(NULL,
last_ptr);
spin_unlock(&last_ptr->refill_lock);
goto loop;
}
btrfs_return_cluster_to_free_space(NULL, last_ptr);
spin_unlock(&last_ptr->refill_lock);
goto loop;
}
offset = btrfs_find_space_for_alloc(block_group, search_start,
......@@ -3881,9 +4000,12 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
search_start - offset);
BUG_ON(offset > search_start);
update_reserved_extents(block_group, num_bytes, 1);
/* we are all good, lets return */
break;
loop:
failed_cluster_refill = false;
btrfs_put_block_group(block_group);
}
up_read(&space_info->groups_sem);
......@@ -3973,12 +4095,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
up_read(&info->groups_sem);
}
static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
u64 search_end, struct btrfs_key *ins,
u64 data)
int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
u64 search_end, struct btrfs_key *ins,
u64 data)
{
int ret;
u64 search_start = 0;
......@@ -4044,25 +4166,8 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
ret = btrfs_discard_extent(root, start, len);
btrfs_add_free_space(cache, start, len);
update_reserved_extents(cache, len, 0);
btrfs_put_block_group(cache);
update_reserved_extents(root, start, len, 0);
return ret;
}
int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 num_bytes, u64 min_alloc_size,
u64 empty_size, u64 hint_byte,
u64 search_end, struct btrfs_key *ins,
u64 data)
{
int ret;
ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
empty_size, hint_byte, search_end, ins,
data);
if (!ret)
update_reserved_extents(root, ins->objectid, ins->offset, 1);
return ret;
}
......@@ -4223,15 +4328,46 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
{
int ret;
struct btrfs_block_group_cache *block_group;
struct btrfs_caching_control *caching_ctl;
u64 start = ins->objectid;
u64 num_bytes = ins->offset;
block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
cache_block_group(block_group);
wait_event(block_group->caching_q,
block_group_cache_done(block_group));
caching_ctl = get_caching_control(block_group);
ret = btrfs_remove_free_space(block_group, ins->objectid,
ins->offset);
BUG_ON(ret);
if (!caching_ctl) {
BUG_ON(!block_group_cache_done(block_group));
ret = btrfs_remove_free_space(block_group, start, num_bytes);
BUG_ON(ret);
} else {
mutex_lock(&caching_ctl->mutex);
if (start >= caching_ctl->progress) {
ret = add_excluded_extent(root, start, num_bytes);
BUG_ON(ret);
} else if (start + num_bytes <= caching_ctl->progress) {
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
BUG_ON(ret);
} else {
num_bytes = caching_ctl->progress - start;
ret = btrfs_remove_free_space(block_group,
start, num_bytes);
BUG_ON(ret);
start = caching_ctl->progress;
num_bytes = ins->objectid + ins->offset -
caching_ctl->progress;
ret = add_excluded_extent(root, start, num_bytes);
BUG_ON(ret);
}
mutex_unlock(&caching_ctl->mutex);
put_caching_control(caching_ctl);
}
update_reserved_extents(block_group, ins->offset, 1);
btrfs_put_block_group(block_group);
ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
0, owner, offset, ins, 1);
......@@ -4255,9 +4391,9 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
int ret;
u64 flags = 0;
ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
empty_size, hint_byte, search_end,
ins, 0);
ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
empty_size, hint_byte, search_end,
ins, 0);
if (ret)
return ret;
......@@ -4268,7 +4404,6 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans,
} else
BUG_ON(parent > 0);
update_reserved_extents(root, ins->objectid, ins->offset, 1);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
struct btrfs_delayed_extent_op *extent_op;
extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
......@@ -4347,452 +4482,99 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
return buf;
}
#if 0
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *leaf)
struct walk_control {
u64 refs[BTRFS_MAX_LEVEL];
u64 flags[BTRFS_MAX_LEVEL];
struct btrfs_key update_progress;
int stage;
int level;
int shared_level;
int update_ref;
int keep_locks;
int reada_slot;
int reada_count;
};
#define DROP_REFERENCE 1
#define UPDATE_BACKREF 2
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct walk_control *wc,
struct btrfs_path *path)
{
u64 disk_bytenr;
u64 num_bytes;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr;
u64 generation;
u64 refs;
u64 last = 0;
u32 nritems;
int i;
u32 blocksize;
struct btrfs_key key;
struct extent_buffer *eb;
int ret;
int slot;
int nread = 0;
BUG_ON(!btrfs_is_leaf(leaf));
nritems = btrfs_header_nritems(leaf);
if (path->slots[wc->level] < wc->reada_slot) {
wc->reada_count = wc->reada_count * 2 / 3;
wc->reada_count = max(wc->reada_count, 2);
} else {
wc->reada_count = wc->reada_count * 3 / 2;
wc->reada_count = min_t(int, wc->reada_count,
BTRFS_NODEPTRS_PER_BLOCK(root));
}
for (i = 0; i < nritems; i++) {
cond_resched();
btrfs_item_key_to_cpu(leaf, &key, i);
eb = path->nodes[wc->level];
nritems = btrfs_header_nritems(eb);
blocksize = btrfs_level_size(root, wc->level - 1);
/* only extents have references, skip everything else */
if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
continue;
fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
/* inline extents live in the btree, they don't have refs */
if (btrfs_file_extent_type(leaf, fi) ==
BTRFS_FILE_EXTENT_INLINE)
continue;
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
/* holes don't have refs */
if (disk_bytenr == 0)
continue;
num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
leaf->start, 0, key.objectid, 0);
BUG_ON(ret);
}
return 0;
}
static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_leaf_ref *ref)
{
int i;
int ret;
struct btrfs_extent_info *info;
struct refsort *sorted;
if (ref->nritems == 0)
return 0;
sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
for (i = 0; i < ref->nritems; i++) {
sorted[i].bytenr = ref->extents[i].bytenr;
sorted[i].slot = i;
}
sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
for (slot = path->slots[wc->level]; slot < nritems; slot++) {
if (nread >= wc->reada_count)
break;
/*
* the items in the ref were sorted when the ref was inserted
* into the ref cache, so this is already in order
*/
for (i = 0; i < ref->nritems; i++) {
info = ref->extents + sorted[i].slot;
ret = btrfs_free_extent(trans, root, info->bytenr,
info->num_bytes, ref->bytenr,
ref->owner, ref->generation,
info->objectid, 0);
atomic_inc(&root->fs_info->throttle_gen);
wake_up(&root->fs_info->transaction_throttle);
cond_resched();
bytenr = btrfs_node_blockptr(eb, slot);
generation = btrfs_node_ptr_generation(eb, slot);
BUG_ON(ret);
info++;
}
kfree(sorted);
return 0;
}
static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 start,
u64 len, u32 *refs)
{
int ret;
ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
BUG_ON(ret);
#if 0 /* some debugging code in case we see problems here */
/* if the refs count is one, it won't get increased again. But
* if the ref count is > 1, someone may be decreasing it at
* the same time we are.
*/
if (*refs != 1) {
struct extent_buffer *eb = NULL;
eb = btrfs_find_create_tree_block(root, start, len);
if (eb)
btrfs_tree_lock(eb);
mutex_lock(&root->fs_info->alloc_mutex);
ret = lookup_extent_ref(NULL, root, start, len, refs);
BUG_ON(ret);
mutex_unlock(&root->fs_info->alloc_mutex);
if (eb) {
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
}
if (*refs == 1) {
printk(KERN_ERR "btrfs block %llu went down to one "
"during drop_snap\n", (unsigned long long)start);
}
}
#endif
cond_resched();
return ret;
}
if (slot == path->slots[wc->level])
goto reada;
/*
* this is used while deleting old snapshots, and it drops the refs
* on a whole subtree starting from a level 1 node.
*
* The idea is to sort all the leaf pointers, and then drop the
* ref on all the leaves in order. Most of the time the leaves
* will have ref cache entries, so no leaf IOs will be required to
* find the extents they have references on.
*
* For each leaf, any references it has are also dropped in order
*
* This ends up dropping the references in something close to optimal
* order for reading and modifying the extent allocation tree.
*/
static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path)
{
u64 bytenr;
u64 root_owner;
u64 root_gen;
struct extent_buffer *eb = path->nodes[1];
struct extent_buffer *leaf;
struct btrfs_leaf_ref *ref;
struct refsort *sorted = NULL;
int nritems = btrfs_header_nritems(eb);
int ret;
int i;
int refi = 0;
int slot = path->slots[1];
u32 blocksize = btrfs_level_size(root, 0);
u32 refs;
if (nritems == 0)
goto out;
root_owner = btrfs_header_owner(eb);
root_gen = btrfs_header_generation(eb);
sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
/*
* step one, sort all the leaf pointers so we don't scribble
* randomly into the extent allocation tree
*/
for (i = slot; i < nritems; i++) {
sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
sorted[refi].slot = i;
refi++;
}
/*
* nritems won't be zero, but if we're picking up drop_snapshot
* after a crash, slot might be > 0, so double check things
* just in case.
*/
if (refi == 0)
goto out;
sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
/*
* the first loop frees everything the leaves point to
*/
for (i = 0; i < refi; i++) {
u64 ptr_gen;
bytenr = sorted[i].bytenr;
/*
* check the reference count on this leaf. If it is > 1
* we just decrement it below and don't update any
* of the refs the leaf points to.
*/
ret = drop_snap_lookup_refcount(trans, root, bytenr,
blocksize, &refs);
BUG_ON(ret);
if (refs != 1)
if (wc->stage == UPDATE_BACKREF &&
generation <= root->root_key.offset)
continue;
ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
/*
* the leaf only had one reference, which means the
* only thing pointing to this leaf is the snapshot
* we're deleting. It isn't possible for the reference
* count to increase again later
*
* The reference cache is checked for the leaf,
* and if found we'll be able to drop any refs held by
* the leaf without needing to read it in.
*/
ref = btrfs_lookup_leaf_ref(root, bytenr);
if (ref && ref->generation != ptr_gen) {
btrfs_free_leaf_ref(root, ref);
ref = NULL;
}
if (ref) {
ret = cache_drop_leaf_ref(trans, root, ref);
BUG_ON(ret);
btrfs_remove_leaf_ref(root, ref);
btrfs_free_leaf_ref(root, ref);
} else {
/*
* the leaf wasn't in the reference cache, so
* we have to read it.
*/
leaf = read_tree_block(root, bytenr, blocksize,
ptr_gen);
ret = btrfs_drop_leaf_ref(trans, root, leaf);
if (wc->stage == DROP_REFERENCE) {
ret = btrfs_lookup_extent_info(trans, root,
bytenr, blocksize,
&refs, NULL);
BUG_ON(ret);
free_extent_buffer(leaf);
}
atomic_inc(&root->fs_info->throttle_gen);
wake_up(&root->fs_info->transaction_throttle);
cond_resched();
}
/*
* run through the loop again to free the refs on the leaves.
* This is faster than doing it in the loop above because
* the leaves are likely to be clustered together. We end up
* working in nice chunks on the extent allocation tree.
*/
for (i = 0; i < refi; i++) {
bytenr = sorted[i].bytenr;
ret = btrfs_free_extent(trans, root, bytenr,
blocksize, eb->start,
root_owner, root_gen, 0, 1);
BUG_ON(ret);
BUG_ON(refs == 0);
if (refs == 1)
goto reada;
atomic_inc(&root->fs_info->throttle_gen);
wake_up(&root->fs_info->transaction_throttle);
cond_resched();
}
out:
kfree(sorted);
/*
* update the path to show we've processed the entire level 1
* node. This will get saved into the root's drop_snapshot_progress
* field so these drops are not repeated again if this transaction
* commits.
*/
path->slots[1] = nritems;
return 0;
}
/*
* helper function for drop_snapshot, this walks down the tree dropping ref
* counts as it goes.
*/
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, int *level)
{
u64 root_owner;
u64 root_gen;
u64 bytenr;
u64 ptr_gen;
struct extent_buffer *next;
struct extent_buffer *cur;
struct extent_buffer *parent;
u32 blocksize;
int ret;
u32 refs;
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
path->nodes[*level]->len, &refs);
BUG_ON(ret);
if (refs > 1)
goto out;
/*
* walk down to the last node level and free all the leaves
*/
while (*level >= 0) {
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
cur = path->nodes[*level];
if (btrfs_header_level(cur) != *level)
WARN_ON(1);
if (path->slots[*level] >=
btrfs_header_nritems(cur))
break;
/* the new code goes down to level 1 and does all the
* leaves pointed to that node in bulk. So, this check
* for level 0 will always be false.
*
* But, the disk format allows the drop_snapshot_progress
* field in the root to leave things in a state where
* a leaf will need cleaning up here. If someone crashes
* with the old code and then boots with the new code,
* we might find a leaf here.
*/
if (*level == 0) {
ret = btrfs_drop_leaf_ref(trans, root, cur);
BUG_ON(ret);
break;
if (!wc->update_ref ||
generation <= root->root_key.offset)
continue;
btrfs_node_key_to_cpu(eb, &key, slot);
ret = btrfs_comp_cpu_keys(&key,
&wc->update_progress);
if (ret < 0)
continue;
}
/*
* once we get to level one, process the whole node
* at once, including everything below it.
*/
if (*level == 1) {
ret = drop_level_one_refs(trans, root, path);
BUG_ON(ret);
reada:
ret = readahead_tree_block(root, bytenr, blocksize,
generation);
if (ret)
break;
}
bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
blocksize = btrfs_level_size(root, *level - 1);
ret = drop_snap_lookup_refcount(trans, root, bytenr,
blocksize, &refs);
BUG_ON(ret);
/*
* if there is more than one reference, we don't need
* to read that node to drop any references it has. We
* just drop the ref we hold on that node and move on to the
* next slot in this level.
*/
if (refs != 1) {
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
root_gen = btrfs_header_generation(parent);
path->slots[*level]++;
ret = btrfs_free_extent(trans, root, bytenr,
blocksize, parent->start,
root_owner, root_gen,
*level - 1, 1);
BUG_ON(ret);
atomic_inc(&root->fs_info->throttle_gen);
wake_up(&root->fs_info->transaction_throttle);
cond_resched();
continue;
}
/*
* we need to keep freeing things in the next level down.
* read the block and loop around to process it
*/
next = read_tree_block(root, bytenr, blocksize, ptr_gen);
WARN_ON(*level <= 0);
if (path->nodes[*level-1])
free_extent_buffer(path->nodes[*level-1]);
path->nodes[*level-1] = next;
*level = btrfs_header_level(next);
path->slots[*level] = 0;
cond_resched();
last = bytenr + blocksize;
nread++;
}
out:
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
if (path->nodes[*level] == root->node) {
parent = path->nodes[*level];
bytenr = path->nodes[*level]->start;
} else {
parent = path->nodes[*level + 1];
bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
}
blocksize = btrfs_level_size(root, *level);
root_owner = btrfs_header_owner(parent);
root_gen = btrfs_header_generation(parent);
/*
* cleanup and free the reference on the last node
* we processed
*/
ret = btrfs_free_extent(trans, root, bytenr, blocksize,
parent->start, root_owner, root_gen,
*level, 1);
free_extent_buffer(path->nodes[*level]);
path->nodes[*level] = NULL;
*level += 1;
BUG_ON(ret);
cond_resched();
return 0;
wc->reada_slot = slot;
}
#endif
struct walk_control {
u64 refs[BTRFS_MAX_LEVEL];
u64 flags[BTRFS_MAX_LEVEL];
struct btrfs_key update_progress;
int stage;
int level;
int shared_level;
int update_ref;
int keep_locks;
};
#define DROP_REFERENCE 1
#define UPDATE_BACKREF 2
/*
* hepler to process tree block while walking down the tree.
*
* when wc->stage == DROP_REFERENCE, this function checks
* reference count of the block. if the block is shared and
* we need update back refs for the subtree rooted at the
* block, this function changes wc->stage to UPDATE_BACKREF
*
* when wc->stage == UPDATE_BACKREF, this function updates
* back refs for pointers in the block.
*
......@@ -4805,7 +4587,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
{
int level = wc->level;
struct extent_buffer *eb = path->nodes[level];
struct btrfs_key key;
u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
int ret;
......@@ -4828,21 +4609,6 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
BUG_ON(wc->refs[level] == 0);
}
if (wc->stage == DROP_REFERENCE &&
wc->update_ref && wc->refs[level] > 1) {
BUG_ON(eb == root->node);
BUG_ON(path->slots[level] > 0);
if (level == 0)
btrfs_item_key_to_cpu(eb, &key, path->slots[level]);
else
btrfs_node_key_to_cpu(eb, &key, path->slots[level]);
if (btrfs_header_owner(eb) == root->root_key.objectid &&
btrfs_comp_cpu_keys(&key, &wc->update_progress) >= 0) {
wc->stage = UPDATE_BACKREF;
wc->shared_level = level;
}
}
if (wc->stage == DROP_REFERENCE) {
if (wc->refs[level] > 1)
return 1;
......@@ -4878,6 +4644,123 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
return 0;
}
/*
* hepler to process tree block pointer.
*
* when wc->stage == DROP_REFERENCE, this function checks
* reference count of the block pointed to. if the block
* is shared and we need update back refs for the subtree
* rooted at the block, this function changes wc->stage to
* UPDATE_BACKREF. if the block is shared and there is no
* need to update back, this function drops the reference
* to the block.
*
* NOTE: return value 1 means we should stop walking down.
*/
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct walk_control *wc)
{
u64 bytenr;
u64 generation;
u64 parent;
u32 blocksize;
struct btrfs_key key;
struct extent_buffer *next;
int level = wc->level;
int reada = 0;
int ret = 0;
generation = btrfs_node_ptr_generation(path->nodes[level],
path->slots[level]);
/*
* if the lower level block was created before the snapshot
* was created, we know there is no need to update back refs
* for the subtree
*/
if (wc->stage == UPDATE_BACKREF &&
generation <= root->root_key.offset)
return 1;
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
blocksize = btrfs_level_size(root, level - 1);
next = btrfs_find_tree_block(root, bytenr, blocksize);
if (!next) {
next = btrfs_find_create_tree_block(root, bytenr, blocksize);
reada = 1;
}
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
if (wc->stage == DROP_REFERENCE) {
ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
&wc->refs[level - 1],
&wc->flags[level - 1]);
BUG_ON(ret);
BUG_ON(wc->refs[level - 1] == 0);
if (wc->refs[level - 1] > 1) {
if (!wc->update_ref ||
generation <= root->root_key.offset)
goto skip;
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
if (ret < 0)
goto skip;
wc->stage = UPDATE_BACKREF;
wc->shared_level = level - 1;
}
}
if (!btrfs_buffer_uptodate(next, generation)) {
btrfs_tree_unlock(next);
free_extent_buffer(next);
next = NULL;
}
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(root, bytenr, blocksize, generation);
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
}
level--;
BUG_ON(level != btrfs_header_level(next));
path->nodes[level] = next;
path->slots[level] = 0;
path->locks[level] = 1;
wc->level = level;
if (wc->level == 1)
wc->reada_slot = 0;
return 0;
skip:
wc->refs[level - 1] = 0;
wc->flags[level - 1] = 0;
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
parent = path->nodes[level]->start;
} else {
BUG_ON(root->root_key.objectid !=
btrfs_header_owner(path->nodes[level]));
parent = 0;
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
root->root_key.objectid, level - 1, 0);
BUG_ON(ret);
btrfs_tree_unlock(next);
free_extent_buffer(next);
return 1;
}
/*
* hepler to process tree block while walking up the tree.
*
......@@ -4905,7 +4788,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (level < wc->shared_level)
goto out;
BUG_ON(wc->refs[level] <= 1);
ret = find_next_key(path, level + 1, &wc->update_progress);
if (ret > 0)
wc->update_ref = 0;
......@@ -4936,8 +4818,6 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
path->locks[level] = 0;
return 1;
}
} else {
BUG_ON(level != 0);
}
}
......@@ -4990,17 +4870,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct walk_control *wc)
{
struct extent_buffer *next;
struct extent_buffer *cur;
u64 bytenr;
u64 ptr_gen;
u32 blocksize;
int level = wc->level;
int ret;
while (level >= 0) {
cur = path->nodes[level];
BUG_ON(path->slots[level] >= btrfs_header_nritems(cur));
if (path->slots[level] >=
btrfs_header_nritems(path->nodes[level]))
break;
ret = walk_down_proc(trans, root, path, wc);
if (ret > 0)
......@@ -5009,20 +4885,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
if (level == 0)
break;
bytenr = btrfs_node_blockptr(cur, path->slots[level]);
blocksize = btrfs_level_size(root, level - 1);
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[level]);
next = read_tree_block(root, bytenr, blocksize, ptr_gen);
btrfs_tree_lock(next);
btrfs_set_lock_blocking(next);
level--;
BUG_ON(level != btrfs_header_level(next));
path->nodes[level] = next;
path->slots[level] = 0;
path->locks[level] = 1;
wc->level = level;
ret = do_walk_down(trans, root, path, wc);
if (ret > 0) {
path->slots[level]++;
continue;
}
level = wc->level;
}
return 0;
}
......@@ -5112,9 +4980,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
err = ret;
goto out;
}
btrfs_node_key_to_cpu(path->nodes[level], &key,
path->slots[level]);
WARN_ON(memcmp(&key, &wc->update_progress, sizeof(key)));
WARN_ON(ret > 0);
/*
* unlock our path, this is safe because only this
......@@ -5149,6 +5015,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
wc->stage = DROP_REFERENCE;
wc->update_ref = update_ref;
wc->keep_locks = 0;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
ret = walk_down_tree(trans, root, path, wc);
......@@ -5201,9 +5068,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
ret = btrfs_del_root(trans, tree_root, &root->root_key);
BUG_ON(ret);
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root);
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
NULL, NULL);
BUG_ON(ret < 0);
if (ret > 0) {
ret = btrfs_del_orphan_item(trans, tree_root,
root->root_key.objectid);
BUG_ON(ret);
}
}
if (root->in_radix) {
btrfs_free_fs_root(tree_root->fs_info, root);
} else {
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
kfree(root);
}
out:
btrfs_end_transaction(trans, tree_root);
kfree(wc);
......@@ -5255,6 +5137,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
wc->stage = DROP_REFERENCE;
wc->update_ref = 0;
wc->keep_locks = 1;
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
while (1) {
wret = walk_down_tree(trans, root, path, wc);
......@@ -5397,9 +5280,9 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
while (1) {
int ret;
spin_lock(&em_tree->lock);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
......@@ -6842,287 +6725,86 @@ int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
return 0;
}
#if 0
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 size)
{
struct btrfs_path *path;
struct btrfs_inode_item *item;
struct extent_buffer *leaf;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
path->leave_spinning = 1;
ret = btrfs_insert_empty_inode(trans, root, path, objectid);
if (ret)
goto out;
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, size);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(root, path);
out:
btrfs_free_path(path);
return ret;
}
static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
struct btrfs_block_group_cache *group)
/*
* checks to see if its even possible to relocate this block group.
*
* @return - -1 if it's not a good idea to relocate this block group, 0 if its
* ok to go ahead and try.
*/
int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
{
struct inode *inode = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root;
struct btrfs_key root_key;
u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
int err = 0;
struct btrfs_block_group_cache *block_group;
struct btrfs_space_info *space_info;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
struct btrfs_device *device;
int full = 0;
int ret = 0;
root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
root_key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(fs_info, &root_key);
if (IS_ERR(root))
return ERR_CAST(root);
block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans);
/* odd, couldn't find the block group, leave it alone */
if (!block_group)
return -1;
err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
if (err)
/* no bytes used, we're good */
if (!btrfs_block_group_used(&block_group->item))
goto out;
err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
BUG_ON(err);
err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
group->key.offset, 0, group->key.offset,
0, 0, 0);
BUG_ON(err);
inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
if (inode->i_state & I_NEW) {
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->location.objectid = objectid;
BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
BTRFS_I(inode)->location.offset = 0;
btrfs_read_locked_inode(inode);
unlock_new_inode(inode);
BUG_ON(is_bad_inode(inode));
} else {
BUG_ON(1);
}
BTRFS_I(inode)->index_cnt = group->key.objectid;
err = btrfs_orphan_add(trans, inode);
out:
btrfs_end_transaction(trans, root);
if (err) {
if (inode)
iput(inode);
inode = ERR_PTR(err);
}
return inode;
}
int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
{
struct btrfs_ordered_sum *sums;
struct btrfs_sector_sum *sector_sum;
struct btrfs_ordered_extent *ordered;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct list_head list;
size_t offset;
int ret;
u64 disk_bytenr;
INIT_LIST_HEAD(&list);
ordered = btrfs_lookup_ordered_extent(inode, file_pos);
BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
disk_bytenr + len - 1, &list);
while (!list_empty(&list)) {
sums = list_entry(list.next, struct btrfs_ordered_sum, list);
list_del_init(&sums->list);
sector_sum = sums->sums;
sums->bytenr = ordered->start;
space_info = block_group->space_info;
spin_lock(&space_info->lock);
offset = 0;
while (offset < sums->len) {
sector_sum->bytenr += ordered->start - disk_bytenr;
sector_sum++;
offset += root->sectorsize;
}
full = space_info->full;
btrfs_add_ordered_sum(inode, ordered, sums);
/*
* if this is the last block group we have in this space, we can't
* relocate it unless we're able to allocate a new chunk below.
*
* Otherwise, we need to make sure we have room in the space to handle
* all of the extents from this block group. If we can, we're good
*/
if ((space_info->total_bytes != block_group->key.offset) &&
(space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
btrfs_block_group_used(&block_group->item) <
space_info->total_bytes)) {
spin_unlock(&space_info->lock);
goto out;
}
btrfs_put_ordered_extent(ordered);
return 0;
}
int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
{
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_fs_info *info = root->fs_info;
struct extent_buffer *leaf;
struct inode *reloc_inode;
struct btrfs_block_group_cache *block_group;
struct btrfs_key key;
u64 skipped;
u64 cur_byte;
u64 total_found;
u32 nritems;
int ret;
int progress;
int pass = 0;
root = root->fs_info->extent_root;
block_group = btrfs_lookup_block_group(info, group_start);
BUG_ON(!block_group);
printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
(unsigned long long)block_group->key.objectid,
(unsigned long long)block_group->flags);
path = btrfs_alloc_path();
BUG_ON(!path);
reloc_inode = create_reloc_inode(info, block_group);
BUG_ON(IS_ERR(reloc_inode));
__alloc_chunk_for_shrink(root, block_group, 1);
set_block_group_readonly(block_group);
btrfs_start_delalloc_inodes(info->tree_root);
btrfs_wait_ordered_extents(info->tree_root, 0);
again:
skipped = 0;
total_found = 0;
progress = 0;
key.objectid = block_group->key.objectid;
key.offset = 0;
key.type = 0;
cur_byte = key.objectid;
trans = btrfs_start_transaction(info->tree_root, 1);
btrfs_commit_transaction(trans, info->tree_root);
spin_unlock(&space_info->lock);
mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_clean_old_snapshots(info->tree_root);
btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
mutex_unlock(&root->fs_info->cleaner_mutex);
/*
* ok we don't have enough space, but maybe we have free space on our
* devices to allocate new chunks for relocation, so loop through our
* alloc devices and guess if we have enough space. However, if we
* were marked as full, then we know there aren't enough chunks, and we
* can just return.
*/
ret = -1;
if (full)
goto out;
trans = btrfs_start_transaction(info->tree_root, 1);
btrfs_commit_transaction(trans, info->tree_root);
mutex_lock(&root->fs_info->chunk_mutex);
list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
u64 min_free = btrfs_block_group_used(&block_group->item);
u64 dev_offset, max_avail;
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
next:
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
if (ret == 1) {
ret = 0;
/*
* check to make sure we can actually find a chunk with enough
* space to fit our block group in.
*/
if (device->total_bytes > device->bytes_used + min_free) {
ret = find_free_dev_extent(NULL, device, min_free,
&dev_offset, &max_avail);
if (!ret)
break;
}
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid >= block_group->key.objectid +
block_group->key.offset)
break;
if (progress && need_resched()) {
btrfs_release_path(root, path);
cond_resched();
progress = 0;
continue;
ret = -1;
}
progress = 1;
if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
key.objectid + key.offset <= cur_byte) {
path->slots[0]++;
goto next;
}
total_found++;
cur_byte = key.objectid + key.offset;
btrfs_release_path(root, path);
__alloc_chunk_for_shrink(root, block_group, 0);
ret = relocate_one_extent(root, path, &key, block_group,
reloc_inode, pass);
BUG_ON(ret < 0);
if (ret > 0)
skipped++;
key.objectid = cur_byte;
key.type = 0;
key.offset = 0;
}
btrfs_release_path(root, path);
if (pass == 0) {
btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
}
if (total_found > 0) {
printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
(unsigned long long)total_found, pass);
pass++;
if (total_found == skipped && pass > 2) {
iput(reloc_inode);
reloc_inode = create_reloc_inode(info, block_group);
pass = 0;
}
goto again;
}
/* delete reloc_inode */
iput(reloc_inode);
/* unpin extents in this range */
trans = btrfs_start_transaction(info->tree_root, 1);
btrfs_commit_transaction(trans, info->tree_root);
spin_lock(&block_group->lock);
WARN_ON(block_group->pinned > 0);
WARN_ON(block_group->reserved > 0);
WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
spin_unlock(&block_group->lock);
btrfs_put_block_group(block_group);
ret = 0;
mutex_unlock(&root->fs_info->chunk_mutex);
out:
btrfs_free_path(path);
btrfs_put_block_group(block_group);
return ret;
}
#endif
static int find_first_block_group(struct btrfs_root *root,
struct btrfs_path *path, struct btrfs_key *key)
......@@ -7165,8 +6847,18 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
struct btrfs_block_group_cache *block_group;
struct btrfs_space_info *space_info;
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
down_write(&info->extent_commit_sem);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
put_caching_control(caching_ctl);
}
up_write(&info->extent_commit_sem);
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group_cache,
......@@ -7180,8 +6872,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_event(block_group->caching_q,
block_group_cache_done(block_group));
wait_block_group_cache_done(block_group);
btrfs_remove_free_space_cache(block_group);
......@@ -7251,7 +6942,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
cache->fs_info = info;
init_waitqueue_head(&cache->caching_q);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
......@@ -7273,8 +6963,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
cache->flags = btrfs_block_group_flags(&cache->item);
cache->sectorsize = root->sectorsize;
remove_sb_from_cache(root, cache);
/*
* check for two cases, either we are full, and therefore
* don't need to bother with the caching work since we won't
......@@ -7283,13 +6971,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* time, particularly in the full case.
*/
if (found_key.offset == btrfs_block_group_used(&cache->item)) {
exclude_super_stripes(root, cache);
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
free_excluded_extents(root, cache);
} else if (btrfs_block_group_used(&cache->item) == 0) {
exclude_super_stripes(root, cache);
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
add_new_free_space(cache, root->fs_info,
found_key.objectid,
found_key.objectid +
found_key.offset);
free_excluded_extents(root, cache);
}
ret = update_space_info(info, cache->flags, found_key.offset,
......@@ -7297,6 +6991,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
&space_info);
BUG_ON(ret);
cache->space_info = space_info;
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_super += cache->bytes_super;
spin_unlock(&cache->space_info->lock);
down_write(&space_info->groups_sem);
list_add_tail(&cache->list, &space_info->block_groups);
up_write(&space_info->groups_sem);
......@@ -7346,7 +7044,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
spin_lock_init(&cache->tree_lock);
init_waitqueue_head(&cache->caching_q);
INIT_LIST_HEAD(&cache->list);
INIT_LIST_HEAD(&cache->cluster_list);
......@@ -7355,15 +7052,23 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
cache->flags = type;
btrfs_set_block_group_flags(&cache->item, type);
cache->last_byte_to_unpin = (u64)-1;
cache->cached = BTRFS_CACHE_FINISHED;
remove_sb_from_cache(root, cache);
exclude_super_stripes(root, cache);
add_new_free_space(cache, root->fs_info, chunk_offset,
chunk_offset + size);
free_excluded_extents(root, cache);
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
&cache->space_info);
BUG_ON(ret);
spin_lock(&cache->space_info->lock);
cache->space_info->bytes_super += cache->bytes_super;
spin_unlock(&cache->space_info->lock);
down_write(&cache->space_info->groups_sem);
list_add_tail(&cache->list, &cache->space_info->block_groups);
up_write(&cache->space_info->groups_sem);
......@@ -7429,8 +7134,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
up_write(&block_group->space_info->groups_sem);
if (block_group->cached == BTRFS_CACHE_STARTED)
wait_event(block_group->caching_q,
block_group_cache_done(block_group));
wait_block_group_cache_done(block_group);
btrfs_remove_free_space_cache(block_group);
......
......@@ -367,10 +367,10 @@ static int insert_state(struct extent_io_tree *tree,
}
if (bits & EXTENT_DIRTY)
tree->dirty_bytes += end - start + 1;
set_state_cb(tree, state, bits);
state->state |= bits;
state->start = start;
state->end = end;
set_state_cb(tree, state, bits);
state->state |= bits;
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
struct extent_state *found;
......@@ -471,10 +471,14 @@ static int clear_state_bit(struct extent_io_tree *tree,
* bits were already set, or zero if none of the bits were already set.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int wake, int delete, gfp_t mask)
int bits, int wake, int delete,
struct extent_state **cached_state,
gfp_t mask)
{
struct extent_state *state;
struct extent_state *cached;
struct extent_state *prealloc = NULL;
struct rb_node *next_node;
struct rb_node *node;
u64 last_end;
int err;
......@@ -488,6 +492,17 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
}
spin_lock(&tree->lock);
if (cached_state) {
cached = *cached_state;
*cached_state = NULL;
cached_state = NULL;
if (cached && cached->tree && cached->start == start) {
atomic_dec(&cached->refs);
state = cached;
goto hit_next;
}
free_extent_state(cached);
}
/*
* this search will find the extents that end after
* our range starts
......@@ -496,6 +511,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (!node)
goto out;
state = rb_entry(node, struct extent_state, rb_node);
hit_next:
if (state->start > end)
goto out;
WARN_ON(state->end < start);
......@@ -531,8 +547,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
} else {
start = state->start;
}
goto search_again;
}
......@@ -550,16 +564,28 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (wake)
wake_up(&state->wq);
set |= clear_state_bit(tree, prealloc, bits,
wake, delete);
prealloc = NULL;
goto out;
}
if (state->end < end && prealloc && !need_resched())
next_node = rb_next(&state->rb_node);
else
next_node = NULL;
set |= clear_state_bit(tree, state, bits, wake, delete);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
if (start <= end && next_node) {
state = rb_entry(next_node, struct extent_state,
rb_node);
if (state->start == start)
goto hit_next;
}
goto search_again;
out:
......@@ -653,28 +679,40 @@ static void set_state_bits(struct extent_io_tree *tree,
state->state |= bits;
}
static void cache_state(struct extent_state *state,
struct extent_state **cached_ptr)
{
if (cached_ptr && !(*cached_ptr)) {
if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
*cached_ptr = state;
atomic_inc(&state->refs);
}
}
}
/*
* set some bits on a range in the tree. This may require allocations
* or sleeping, so the gfp mask is used to indicate what is allowed.
* set some bits on a range in the tree. This may require allocations or
* sleeping, so the gfp mask is used to indicate what is allowed.
*
* If 'exclusive' == 1, this will fail with -EEXIST if some part of the
* range already has the desired bits set. The start of the existing
* range is returned in failed_start in this case.
* If any of the exclusive bits are set, this will fail with -EEXIST if some
* part of the range already has the desired bits set. The start of the
* existing range is returned in failed_start in this case.
*
* [start, end] is inclusive
* This takes the tree lock.
* [start, end] is inclusive This takes the tree lock.
*/
static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int exclusive, u64 *failed_start,
int bits, int exclusive_bits, u64 *failed_start,
struct extent_state **cached_state,
gfp_t mask)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
struct rb_node *node;
int err = 0;
int set;
u64 last_start;
u64 last_end;
again:
if (!prealloc && (mask & __GFP_WAIT)) {
prealloc = alloc_extent_state(mask);
......@@ -683,6 +721,13 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
}
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->start == start && state->tree) {
node = &state->rb_node;
goto hit_next;
}
}
/*
* this search will find all the extents that end after
* our range starts.
......@@ -694,8 +739,8 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
BUG_ON(err == -EEXIST);
goto out;
}
state = rb_entry(node, struct extent_state, rb_node);
hit_next:
last_start = state->start;
last_end = state->end;
......@@ -706,17 +751,29 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
set = state->state & bits;
if (set && exclusive) {
struct rb_node *next_node;
if (state->state & exclusive_bits) {
*failed_start = state->start;
err = -EEXIST;
goto out;
}
set_state_bits(tree, state, bits);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
if (start < end && prealloc && !need_resched()) {
next_node = rb_next(node);
if (next_node) {
state = rb_entry(next_node, struct extent_state,
rb_node);
if (state->start == start)
goto hit_next;
}
}
goto search_again;
}
......@@ -737,8 +794,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* desired bit on it.
*/
if (state->start < start) {
set = state->state & bits;
if (exclusive && set) {
if (state->state & exclusive_bits) {
*failed_start = start;
err = -EEXIST;
goto out;
......@@ -750,12 +806,11 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
} else {
start = state->start;
}
goto search_again;
}
......@@ -774,6 +829,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
this_end = last_start - 1;
err = insert_state(tree, prealloc, start, this_end,
bits);
cache_state(prealloc, cached_state);
prealloc = NULL;
BUG_ON(err == -EEXIST);
if (err)
......@@ -788,8 +844,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* on the first half
*/
if (state->start <= end && state->end > end) {
set = state->state & bits;
if (exclusive && set) {
if (state->state & exclusive_bits) {
*failed_start = start;
err = -EEXIST;
goto out;
......@@ -798,6 +853,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
BUG_ON(err == -EEXIST);
set_state_bits(tree, prealloc, bits);
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
goto out;
......@@ -826,86 +882,64 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
mask);
}
int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
NULL, mask);
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
return set_extent_bit(tree, start, end, bits, 0, NULL,
mask);
NULL, mask);
}
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask)
{
return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_DIRTY,
0, NULL, mask);
EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
0, NULL, NULL, mask);
}
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
}
int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
NULL, mask);
}
int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
mask);
NULL, mask);
}
static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
NULL, mask);
}
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
mask);
NULL, mask);
}
static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
}
static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
0, NULL, mask);
}
static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
NULL, mask);
}
int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
......@@ -917,13 +951,15 @@ int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
* either insert or lock state struct between start and end use mask to tell
* us if waiting is desired.
*/
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, struct extent_state **cached_state, gfp_t mask)
{
int err;
u64 failed_start;
while (1) {
err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
&failed_start, mask);
err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
EXTENT_LOCKED, &failed_start,
cached_state, mask);
if (err == -EEXIST && (mask & __GFP_WAIT)) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
......@@ -935,27 +971,40 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
return err;
}
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
{
return lock_extent_bits(tree, start, end, 0, NULL, mask);
}
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
int err;
u64 failed_start;
err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
&failed_start, mask);
err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
&failed_start, NULL, mask);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
EXTENT_LOCKED, 1, 0, mask);
EXTENT_LOCKED, 1, 0, NULL, mask);
return 0;
}
return 1;
}
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
mask);
}
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask)
{
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
mask);
}
/*
......@@ -974,7 +1023,6 @@ int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
set_extent_dirty(tree, start, end, GFP_NOFS);
return 0;
}
......@@ -994,7 +1042,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
page_cache_release(page);
index++;
}
set_extent_writeback(tree, start, end, GFP_NOFS);
return 0;
}
......@@ -1232,6 +1279,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 delalloc_start;
u64 delalloc_end;
u64 found;
struct extent_state *cached_state = NULL;
int ret;
int loops = 0;
......@@ -1269,6 +1317,7 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
/* some of the pages are gone, lets avoid looping by
* shortening the size of the delalloc range we're searching
*/
free_extent_state(cached_state);
if (!loops) {
unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
max_bytes = PAGE_CACHE_SIZE - offset;
......@@ -1282,18 +1331,21 @@ static noinline u64 find_lock_delalloc_range(struct inode *inode,
BUG_ON(ret);
/* step three, lock the state bits for the whole range */
lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
lock_extent_bits(tree, delalloc_start, delalloc_end,
0, &cached_state, GFP_NOFS);
/* then test to make sure it is all still delalloc */
ret = test_range_bit(tree, delalloc_start, delalloc_end,
EXTENT_DELALLOC, 1);
EXTENT_DELALLOC, 1, cached_state);
if (!ret) {
unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
unlock_extent_cached(tree, delalloc_start, delalloc_end,
&cached_state, GFP_NOFS);
__unlock_for_delalloc(inode, locked_page,
delalloc_start, delalloc_end);
cond_resched();
goto again;
}
free_extent_state(cached_state);
*start = delalloc_start;
*end = delalloc_end;
out_failed:
......@@ -1307,7 +1359,8 @@ int extent_clear_unlock_delalloc(struct inode *inode,
int clear_unlock,
int clear_delalloc, int clear_dirty,
int set_writeback,
int end_writeback)
int end_writeback,
int set_private2)
{
int ret;
struct page *pages[16];
......@@ -1325,8 +1378,9 @@ int extent_clear_unlock_delalloc(struct inode *inode,
if (clear_delalloc)
clear_bits |= EXTENT_DELALLOC;
clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
set_private2))
return 0;
while (nr_pages > 0) {
......@@ -1334,6 +1388,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
min_t(unsigned long,
nr_pages, ARRAY_SIZE(pages)), pages);
for (i = 0; i < ret; i++) {
if (set_private2)
SetPagePrivate2(pages[i]);
if (pages[i] == locked_page) {
page_cache_release(pages[i]);
continue;
......@@ -1476,14 +1534,17 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int filled)
int bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
int bitset = 0;
spin_lock(&tree->lock);
node = tree_search(tree, start);
if (cached && cached->tree && cached->start == start)
node = &cached->rb_node;
else
node = tree_search(tree, start);
while (node && start <= end) {
state = rb_entry(node, struct extent_state, rb_node);
......@@ -1503,6 +1564,10 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
bitset = 0;
break;
}
if (state->end == (u64)-1)
break;
start = state->end + 1;
if (start > end)
break;
......@@ -1526,7 +1591,7 @@ static int check_page_uptodate(struct extent_io_tree *tree,
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
SetPageUptodate(page);
return 0;
}
......@@ -1540,7 +1605,7 @@ static int check_page_locked(struct extent_io_tree *tree,
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
unlock_page(page);
return 0;
}
......@@ -1552,10 +1617,7 @@ static int check_page_locked(struct extent_io_tree *tree,
static int check_page_writeback(struct extent_io_tree *tree,
struct page *page)
{
u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
u64 end = start + PAGE_CACHE_SIZE - 1;
if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
end_page_writeback(page);
end_page_writeback(page);
return 0;
}
......@@ -1613,13 +1675,11 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
}
if (!uptodate) {
clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
clear_extent_uptodate(tree, start, end, GFP_NOFS);
ClearPageUptodate(page);
SetPageError(page);
}
clear_extent_writeback(tree, start, end, GFP_ATOMIC);
if (whole_page)
end_page_writeback(page);
else
......@@ -1983,7 +2043,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
continue;
}
/* the get_extent function already copied into the page */
if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
if (test_range_bit(tree, cur, cur_end,
EXTENT_UPTODATE, 1, NULL)) {
check_page_uptodate(tree, page);
unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
cur = cur + iosize;
......@@ -2078,6 +2139,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 iosize;
u64 unlock_start;
sector_t sector;
struct extent_state *cached_state = NULL;
struct extent_map *em;
struct block_device *bdev;
int ret;
......@@ -2124,6 +2186,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_end = 0;
page_started = 0;
if (!epd->extent_locked) {
u64 delalloc_to_write = 0;
/*
* make sure the wbc mapping index is at least updated
* to this page.
......@@ -2143,8 +2206,24 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
tree->ops->fill_delalloc(inode, page, delalloc_start,
delalloc_end, &page_started,
&nr_written);
/*
* delalloc_end is already one less than the total
* length, so we don't subtract one from
* PAGE_CACHE_SIZE
*/
delalloc_to_write += (delalloc_end - delalloc_start +
PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
delalloc_start = delalloc_end + 1;
}
if (wbc->nr_to_write < delalloc_to_write) {
int thresh = 8192;
if (delalloc_to_write < thresh * 2)
thresh = delalloc_to_write;
wbc->nr_to_write = min_t(u64, delalloc_to_write,
thresh);
}
/* did the fill delalloc function already unlock and start
* the IO?
......@@ -2160,15 +2239,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done_unlocked;
}
}
lock_extent(tree, start, page_end, GFP_NOFS);
unlock_start = start;
if (tree->ops && tree->ops->writepage_start_hook) {
ret = tree->ops->writepage_start_hook(page, start,
page_end);
if (ret == -EAGAIN) {
unlock_extent(tree, start, page_end, GFP_NOFS);
redirty_page_for_writepage(wbc, page);
update_nr_written(page, wbc, nr_written);
unlock_page(page);
......@@ -2184,12 +2258,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
update_nr_written(page, wbc, nr_written + 1);
end = page_end;
if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
if (last_byte <= start) {
clear_extent_dirty(tree, start, page_end, GFP_NOFS);
unlock_extent(tree, start, page_end, GFP_NOFS);
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, start,
page_end, NULL, 1);
......@@ -2197,13 +2266,10 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
goto done;
}
set_extent_uptodate(tree, start, page_end, GFP_NOFS);
blocksize = inode->i_sb->s_blocksize;
while (cur <= end) {
if (cur >= last_byte) {
clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
if (tree->ops && tree->ops->writepage_end_io_hook)
tree->ops->writepage_end_io_hook(page, cur,
page_end, NULL, 1);
......@@ -2235,12 +2301,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
*/
if (compressed || block_start == EXTENT_MAP_HOLE ||
block_start == EXTENT_MAP_INLINE) {
clear_extent_dirty(tree, cur,
cur + iosize - 1, GFP_NOFS);
unlock_extent(tree, unlock_start, cur + iosize - 1,
GFP_NOFS);
/*
* end_io notification does not happen here for
* compressed extents
......@@ -2265,13 +2325,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
/* leave this out until we have a page_mkwrite call */
if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
EXTENT_DIRTY, 0)) {
EXTENT_DIRTY, 0, NULL)) {
cur = cur + iosize;
pg_offset += iosize;
continue;
}
clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
if (tree->ops && tree->ops->writepage_io_hook) {
ret = tree->ops->writepage_io_hook(page, cur,
cur + iosize - 1);
......@@ -2309,12 +2368,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
set_page_writeback(page);
end_page_writeback(page);
}
if (unlock_start <= page_end)
unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
unlock_page(page);
done_unlocked:
/* drop our reference on any cached states */
free_extent_state(cached_state);
return 0;
}
......@@ -2339,9 +2398,9 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
writepage_t writepage, void *data,
void (*flush_fn)(void *))
{
struct backing_dev_info *bdi = mapping->backing_dev_info;
int ret = 0;
int done = 0;
int nr_to_write_done = 0;
struct pagevec pvec;
int nr_pages;
pgoff_t index;
......@@ -2361,7 +2420,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
scanned = 1;
}
retry:
while (!done && (index <= end) &&
while (!done && !nr_to_write_done && (index <= end) &&
(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY, min(end - index,
(pgoff_t)PAGEVEC_SIZE-1) + 1))) {
......@@ -2412,12 +2471,15 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
unlock_page(page);
ret = 0;
}
if (ret || wbc->nr_to_write <= 0)
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
if (ret)
done = 1;
}
/*
* the filesystem may choose to bump up nr_to_write.
* We have to make sure to honor the new nr_to_write
* at any time
*/
nr_to_write_done = wbc->nr_to_write <= 0;
}
pagevec_release(&pvec);
cond_resched();
......@@ -2604,10 +2666,10 @@ int extent_invalidatepage(struct extent_io_tree *tree,
return 0;
lock_extent(tree, start, end, GFP_NOFS);
wait_on_extent_writeback(tree, start, end);
wait_on_page_writeback(page);
clear_extent_bit(tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
1, 1, GFP_NOFS);
1, 1, NULL, GFP_NOFS);
return 0;
}
......@@ -2687,7 +2749,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
!isnew && !PageUptodate(page) &&
(block_off_end > to || block_off_start < from) &&
!test_range_bit(tree, block_start, cur_end,
EXTENT_UPTODATE, 1)) {
EXTENT_UPTODATE, 1, NULL)) {
u64 sector;
u64 extent_offset = block_start - em->start;
size_t iosize;
......@@ -2701,7 +2763,7 @@ int extent_prepare_write(struct extent_io_tree *tree,
*/
set_extent_bit(tree, block_start,
block_start + iosize - 1,
EXTENT_LOCKED, 0, NULL, GFP_NOFS);
EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
ret = submit_extent_page(READ, tree, page,
sector, iosize, page_offset, em->bdev,
NULL, 1,
......@@ -2742,13 +2804,18 @@ int try_release_extent_state(struct extent_map_tree *map,
int ret = 1;
if (test_range_bit(tree, start, end,
EXTENT_IOBITS | EXTENT_ORDERED, 0))
EXTENT_IOBITS, 0, NULL))
ret = 0;
else {
if ((mask & GFP_NOFS) == GFP_NOFS)
mask = GFP_NOFS;
clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
1, 1, mask);
/*
* at this point we can safely clear everything except the
* locked bit and the nodatasum bit
*/
clear_extent_bit(tree, start, end,
~(EXTENT_LOCKED | EXTENT_NODATASUM),
0, 0, NULL, mask);
}
return ret;
}
......@@ -2771,29 +2838,28 @@ int try_release_extent_mapping(struct extent_map_tree *map,
u64 len;
while (start <= end) {
len = end - start + 1;
spin_lock(&map->lock);
write_lock(&map->lock);
em = lookup_extent_mapping(map, start, len);
if (!em || IS_ERR(em)) {
spin_unlock(&map->lock);
write_unlock(&map->lock);
break;
}
if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
em->start != start) {
spin_unlock(&map->lock);
write_unlock(&map->lock);
free_extent_map(em);
break;
}
if (!test_range_bit(tree, em->start,
extent_map_end(em) - 1,
EXTENT_LOCKED | EXTENT_WRITEBACK |
EXTENT_ORDERED,
0)) {
EXTENT_LOCKED | EXTENT_WRITEBACK,
0, NULL)) {
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
}
start = extent_map_end(em);
spin_unlock(&map->lock);
write_unlock(&map->lock);
/* once for us */
free_extent_map(em);
......@@ -3203,7 +3269,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
int uptodate;
unsigned long index;
ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
if (ret)
return 1;
while (start <= end) {
......@@ -3233,7 +3299,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
return 1;
ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
EXTENT_UPTODATE, 1);
EXTENT_UPTODATE, 1, NULL);
if (ret)
return ret;
......@@ -3269,7 +3335,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
return 0;
if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
EXTENT_UPTODATE, 1)) {
EXTENT_UPTODATE, 1, NULL)) {
return 0;
}
......
......@@ -13,10 +13,8 @@
#define EXTENT_DEFRAG (1 << 6)
#define EXTENT_DEFRAG_DONE (1 << 7)
#define EXTENT_BUFFER_FILLED (1 << 8)
#define EXTENT_ORDERED (1 << 9)
#define EXTENT_ORDERED_METADATA (1 << 10)
#define EXTENT_BOUNDARY (1 << 11)
#define EXTENT_NODATASUM (1 << 12)
#define EXTENT_BOUNDARY (1 << 9)
#define EXTENT_NODATASUM (1 << 10)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
/* flags for bio submission */
......@@ -142,6 +140,8 @@ int try_release_extent_state(struct extent_map_tree *map,
struct extent_io_tree *tree, struct page *page,
gfp_t mask);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, struct extent_state **cached, gfp_t mask);
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
......@@ -155,11 +155,12 @@ u64 count_range_bits(struct extent_io_tree *tree,
u64 max_bytes, unsigned long bits);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int filled);
int bits, int filled, struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
int bits, int wake, int delete, gfp_t mask);
int bits, int wake, int delete, struct extent_state **cached,
gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int bits, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
......@@ -282,5 +283,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
int clear_unlock,
int clear_delalloc, int clear_dirty,
int set_writeback,
int end_writeback);
int end_writeback,
int set_private2);
#endif
......@@ -36,7 +36,7 @@ void extent_map_exit(void)
void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
{
tree->map.rb_node = NULL;
spin_lock_init(&tree->lock);
rwlock_init(&tree->lock);
}
/**
......@@ -198,6 +198,56 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
return 0;
}
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
{
int ret = 0;
struct extent_map *merge = NULL;
struct rb_node *rb;
struct extent_map *em;
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
WARN_ON(em->start != start || !em);
if (!em)
goto out;
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(merge, em)) {
em->start = merge->start;
em->len += merge->len;
em->block_len += merge->block_len;
em->block_start = merge->block_start;
merge->in_tree = 0;
rb_erase(&merge->rb_node, &tree->map);
free_extent_map(merge);
}
}
rb = rb_next(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
if (rb && mergable_maps(em, merge)) {
em->len += merge->len;
em->block_len += merge->len;
rb_erase(&merge->rb_node, &tree->map);
merge->in_tree = 0;
free_extent_map(merge);
}
free_extent_map(em);
out:
write_unlock(&tree->lock);
return ret;
}
/**
* add_extent_mapping - add new extent map to the extent tree
* @tree: tree to insert new map in
......@@ -222,7 +272,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
ret = -EEXIST;
goto out;
}
assert_spin_locked(&tree->lock);
rb = tree_insert(&tree->map, em->start, &em->rb_node);
if (rb) {
ret = -EEXIST;
......@@ -285,7 +334,6 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *next = NULL;
u64 end = range_end(start, len);
assert_spin_locked(&tree->lock);
rb_node = __tree_search(&tree->map, start, &prev, &next);
if (!rb_node && prev) {
em = rb_entry(prev, struct extent_map, rb_node);
......@@ -318,6 +366,54 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
return em;
}
/**
* search_extent_mapping - find a nearby extent map
* @tree: tree to lookup in
* @start: byte offset to start the search
* @len: length of the lookup range
*
* Find and return the first extent_map struct in @tree that intersects the
* [start, len] range.
*
* If one can't be found, any nearby extent may be returned
*/
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len)
{
struct extent_map *em;
struct rb_node *rb_node;
struct rb_node *prev = NULL;
struct rb_node *next = NULL;
rb_node = __tree_search(&tree->map, start, &prev, &next);
if (!rb_node && prev) {
em = rb_entry(prev, struct extent_map, rb_node);
goto found;
}
if (!rb_node && next) {
em = rb_entry(next, struct extent_map, rb_node);
goto found;
}
if (!rb_node) {
em = NULL;
goto out;
}
if (IS_ERR(rb_node)) {
em = ERR_PTR(PTR_ERR(rb_node));
goto out;
}
em = rb_entry(rb_node, struct extent_map, rb_node);
goto found;
em = NULL;
goto out;
found:
atomic_inc(&em->refs);
out:
return em;
}
/**
* remove_extent_mapping - removes an extent_map from the extent tree
* @tree: extent tree to remove from
......@@ -331,7 +427,6 @@ int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
int ret = 0;
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
assert_spin_locked(&tree->lock);
rb_erase(&em->rb_node, &tree->map);
em->in_tree = 0;
return ret;
......
......@@ -31,7 +31,7 @@ struct extent_map {
struct extent_map_tree {
struct rb_root map;
spinlock_t lock;
rwlock_t lock;
};
static inline u64 extent_map_end(struct extent_map *em)
......@@ -59,4 +59,7 @@ struct extent_map *alloc_extent_map(gfp_t mask);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void extent_map_exit(void);
int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
#endif
......@@ -112,8 +112,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
int err = 0;
int i;
struct inode *inode = fdentry(file)->d_inode;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
u64 hint_byte;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
......@@ -125,22 +123,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
end_of_last_block = start_pos + num_bytes - 1;
lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
trans = btrfs_join_transaction(root, 1);
if (!trans) {
err = -ENOMEM;
goto out_unlock;
}
btrfs_set_trans_block_group(trans, inode);
hint_byte = 0;
set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
/* check for reserved extents on each page, we don't want
* to reset the delalloc bit on things that already have
* extents reserved.
*/
btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
......@@ -155,9 +137,6 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
* at this time.
*/
}
err = btrfs_end_transaction(trans, root);
out_unlock:
unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
return err;
}
......@@ -189,18 +168,18 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
if (!split2)
split2 = alloc_extent_map(GFP_NOFS);
spin_lock(&em_tree->lock);
write_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (!em) {
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
break;
}
flags = em->flags;
if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
spin_unlock(&em_tree->lock);
if (em->start <= start &&
(!testend || em->start + em->len >= start + len)) {
free_extent_map(em);
write_unlock(&em_tree->lock);
break;
}
if (start < em->start) {
......@@ -210,6 +189,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
start = em->start + em->len;
}
free_extent_map(em);
write_unlock(&em_tree->lock);
continue;
}
compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
......@@ -260,7 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
free_extent_map(split);
split = NULL;
}
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
/* once for us */
free_extent_map(em);
......@@ -289,7 +269,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
u64 start, u64 end, u64 locked_end,
u64 inline_limit, u64 *hint_byte)
u64 inline_limit, u64 *hint_byte, int drop_cache)
{
u64 extent_end = 0;
u64 search_start = start;
......@@ -314,7 +294,8 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
int ret;
inline_limit = 0;
btrfs_drop_extent_cache(inode, start, end - 1, 0);
if (drop_cache)
btrfs_drop_extent_cache(inode, start, end - 1, 0);
path = btrfs_alloc_path();
if (!path)
......
......@@ -259,7 +259,9 @@ static int link_free_space(struct btrfs_block_group_cache *block_group,
static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
{
u64 max_bytes, possible_bytes;
u64 max_bytes;
u64 bitmap_bytes;
u64 extent_bytes;
/*
* The goal is to keep the total amount of memory used per 1gb of space
......@@ -269,22 +271,27 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
max_bytes = MAX_CACHE_BYTES_PER_GIG *
(div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) +
(sizeof(struct btrfs_free_space) *
block_group->extents_thresh);
/*
* we want to account for 1 more bitmap than what we have so we can make
* sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
* we add more bitmaps.
*/
bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
if (possible_bytes > max_bytes) {
int extent_bytes = max_bytes -
(block_group->total_bitmaps * PAGE_CACHE_SIZE);
if (bitmap_bytes >= max_bytes) {
block_group->extents_thresh = 0;
return;
}
if (extent_bytes <= 0) {
block_group->extents_thresh = 0;
return;
}
/*
* we want the extent entry threshold to always be at most 1/2 the maxw
* bytes we can have, or whatever is less than that.
*/
extent_bytes = max_bytes - bitmap_bytes;
extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
block_group->extents_thresh = extent_bytes /
(sizeof(struct btrfs_free_space));
}
block_group->extents_thresh =
div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
}
static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
......@@ -403,6 +410,7 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
BUG_ON(block_group->total_bitmaps >= max_bitmaps);
info->offset = offset_to_bitmap(block_group, offset);
info->bytes = 0;
link_free_space(block_group, info);
block_group->total_bitmaps++;
......
......@@ -149,6 +149,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
ret = 0;
} else if (ret < 0) {
if (ret == -EOVERFLOW)
ret = -EMLINK;
goto out;
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
......@@ -177,8 +179,6 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_inode_item));
if (ret == 0 && objectid > root->highest_inode)
root->highest_inode = objectid;
return ret;
}
......
......@@ -43,9 +43,10 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
slot = path->slots[0] - 1;
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &found_key, slot);
*objectid = found_key.objectid;
*objectid = max_t(u64, found_key.objectid,
BTRFS_FIRST_FREE_OBJECTID - 1);
} else {
*objectid = BTRFS_FIRST_FREE_OBJECTID;
*objectid = BTRFS_FIRST_FREE_OBJECTID - 1;
}
ret = 0;
error:
......@@ -53,91 +54,27 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
return ret;
}
/*
* walks the btree of allocated inodes and find a hole.
*/
int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 dirid, u64 *objectid)
{
struct btrfs_path *path;
struct btrfs_key key;
int ret;
int slot = 0;
u64 last_ino = 0;
int start_found;
struct extent_buffer *l;
struct btrfs_key search_key;
u64 search_start = dirid;
mutex_lock(&root->objectid_mutex);
if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID &&
root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) {
*objectid = ++root->last_inode_alloc;
mutex_unlock(&root->objectid_mutex);
return 0;
}
path = btrfs_alloc_path();
BUG_ON(!path);
search_start = max(search_start, (u64)BTRFS_FIRST_FREE_OBJECTID);
search_key.objectid = search_start;
search_key.type = 0;
search_key.offset = 0;
start_found = 0;
ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
if (ret < 0)
goto error;
while (1) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto error;
if (!start_found) {
*objectid = search_start;
start_found = 1;
goto found;
}
*objectid = last_ino > search_start ?
last_ino : search_start;
goto found;
}
btrfs_item_key_to_cpu(l, &key, slot);
if (key.objectid >= search_start) {
if (start_found) {
if (last_ino < search_start)
last_ino = search_start;
if (key.objectid > last_ino) {
*objectid = last_ino;
goto found;
}
} else if (key.objectid > search_start) {
*objectid = search_start;
goto found;
}
}
if (key.objectid >= BTRFS_LAST_FREE_OBJECTID)
break;
if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_find_highest_inode(root, &root->highest_objectid);
if (ret)
goto out;
}
start_found = 1;
last_ino = key.objectid + 1;
path->slots[0]++;
if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
ret = -ENOSPC;
goto out;
}
BUG_ON(1);
found:
btrfs_release_path(root, path);
btrfs_free_path(path);
BUG_ON(*objectid < search_start);
mutex_unlock(&root->objectid_mutex);
return 0;
error:
btrfs_release_path(root, path);
btrfs_free_path(path);
*objectid = ++root->highest_objectid;
ret = 0;
out:
mutex_unlock(&root->objectid_mutex);
return ret;
}
......@@ -231,7 +231,8 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
}
ret = btrfs_drop_extents(trans, root, inode, start,
aligned_end, aligned_end, start, &hint_byte);
aligned_end, aligned_end, start,
&hint_byte, 1);
BUG_ON(ret);
if (isize > actual_end)
......@@ -240,7 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
inline_len, compressed_size,
compressed_pages);
BUG_ON(ret);
btrfs_drop_extent_cache(inode, start, aligned_end, 0);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
return 0;
}
......@@ -425,7 +426,7 @@ static noinline int compress_file_range(struct inode *inode,
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
start, end, NULL, 1, 0,
0, 1, 1, 1);
0, 1, 1, 1, 0);
ret = 0;
goto free_pages_out;
}
......@@ -611,9 +612,9 @@ static noinline int submit_compressed_extents(struct inode *inode,
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
while (1) {
spin_lock(&em_tree->lock);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
......@@ -640,7 +641,7 @@ static noinline int submit_compressed_extents(struct inode *inode,
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
NULL, 1, 1, 0, 1, 1, 0);
NULL, 1, 1, 0, 1, 1, 0, 0);
ret = btrfs_submit_compressed_write(inode,
async_extent->start,
......@@ -713,7 +714,7 @@ static noinline int cow_file_range(struct inode *inode,
extent_clear_unlock_delalloc(inode,
&BTRFS_I(inode)->io_tree,
start, end, NULL, 1, 1,
1, 1, 1, 1);
1, 1, 1, 1, 0);
*nr_written = *nr_written +
(end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
*page_started = 1;
......@@ -725,6 +726,15 @@ static noinline int cow_file_range(struct inode *inode,
BUG_ON(disk_num_bytes >
btrfs_super_total_bytes(&root->fs_info->super_copy));
read_lock(&BTRFS_I(inode)->extent_tree.lock);
em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
start, num_bytes);
if (em) {
alloc_hint = em->block_start;
free_extent_map(em);
}
read_unlock(&BTRFS_I(inode)->extent_tree.lock);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
while (disk_num_bytes > 0) {
......@@ -737,7 +747,6 @@ static noinline int cow_file_range(struct inode *inode,
em = alloc_extent_map(GFP_NOFS);
em->start = start;
em->orig_start = em->start;
ram_size = ins.offset;
em->len = ins.offset;
......@@ -747,9 +756,9 @@ static noinline int cow_file_range(struct inode *inode,
set_bit(EXTENT_FLAG_PINNED, &em->flags);
while (1) {
spin_lock(&em_tree->lock);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
......@@ -776,11 +785,14 @@ static noinline int cow_file_range(struct inode *inode,
/* we're not doing compressed IO, don't unlock the first
* page (which the caller expects to stay locked), don't
* clear any dirty bits and don't set any writeback bits
*
* Do set the Private2 bit so we know this page was properly
* setup for writepage
*/
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
start, start + ram_size - 1,
locked_page, unlock, 1,
1, 0, 0, 0);
1, 0, 0, 0, 1);
disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
......@@ -853,7 +865,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
int limit = 10 * 1024 * 1042;
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
EXTENT_DELALLOC, 1, 0, GFP_NOFS);
EXTENT_DELALLOC, 1, 0, NULL, GFP_NOFS);
while (start < end) {
async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
async_cow->inode = inode;
......@@ -1080,9 +1092,9 @@ static noinline int run_delalloc_nocow(struct inode *inode,
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
while (1) {
spin_lock(&em_tree->lock);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
......@@ -1101,7 +1113,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
cur_offset, cur_offset + num_bytes - 1,
locked_page, 1, 1, 1, 0, 0, 0);
locked_page, 1, 1, 1, 0, 0, 0, 1);
cur_offset = extent_end;
if (cur_offset > end)
break;
......@@ -1374,10 +1386,8 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
/* already ordered? We're done */
if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
EXTENT_ORDERED, 0)) {
if (PagePrivate2(page))
goto out;
}
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
......@@ -1413,11 +1423,9 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
struct inode *inode = page->mapping->host;
struct btrfs_writepage_fixup *fixup;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
EXTENT_ORDERED, 0);
if (ret)
/* this page is properly in the ordered list */
if (TestClearPagePrivate2(page))
return 0;
if (PageChecked(page))
......@@ -1455,9 +1463,19 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
BUG_ON(!path);
path->leave_spinning = 1;
/*
* we may be replacing one extent in the tree with another.
* The new extent is pinned in the extent map, and we don't want
* to drop it from the cache until it is completely in the btree.
*
* So, tell btrfs_drop_extents to leave this extent in the cache.
* the caller is expected to unpin it and allow it to be merged
* with the others.
*/
ret = btrfs_drop_extents(trans, root, inode, file_pos,
file_pos + num_bytes, locked_end,
file_pos, &hint);
file_pos, &hint, 0);
BUG_ON(ret);
ins.objectid = inode->i_ino;
......@@ -1485,7 +1503,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
inode_add_bytes(inode, num_bytes);
btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0);
ins.objectid = disk_bytenr;
ins.offset = disk_num_bytes;
......@@ -1596,6 +1613,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
ordered_extent->len,
compressed, 0, 0,
BTRFS_FILE_EXTENT_REG);
unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
ordered_extent->file_offset,
ordered_extent->len);
BUG_ON(ret);
}
unlock_extent(io_tree, ordered_extent->file_offset,
......@@ -1623,6 +1643,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate)
{
ClearPagePrivate2(page);
return btrfs_finish_ordered_io(page->mapping->host, start, end);
}
......@@ -1669,13 +1690,13 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
failrec->last_mirror = 0;
failrec->bio_flags = 0;
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, failrec->len);
if (em->start > start || em->start + em->len < start) {
free_extent_map(em);
em = NULL;
}
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
if (!em || IS_ERR(em)) {
kfree(failrec);
......@@ -1794,7 +1815,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
return 0;
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) {
test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
GFP_NOFS);
return 0;
......@@ -2352,6 +2373,69 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
return ret;
}
int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *dir, u64 objectid,
const char *name, int name_len)
{
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
u64 index;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
name, name_len, -1);
BUG_ON(!di || IS_ERR(di));
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
BUG_ON(ret);
btrfs_release_path(root, path);
ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
objectid, root->root_key.objectid,
dir->i_ino, &index, name, name_len);
if (ret < 0) {
BUG_ON(ret != -ENOENT);
di = btrfs_search_dir_index_item(root, path, dir->i_ino,
name, name_len);
BUG_ON(!di || IS_ERR(di));
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(root, path);
index = key.offset;
}
di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
index, name, name_len, -1);
BUG_ON(!di || IS_ERR(di));
leaf = path->nodes[0];
btrfs_dir_item_key_to_cpu(leaf, di, &key);
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
BUG_ON(ret);
btrfs_release_path(root, path);
btrfs_i_size_write(dir, dir->i_size - name_len * 2);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
ret = btrfs_update_inode(trans, root, dir);
BUG_ON(ret);
dir->i_sb->s_dirt = 1;
btrfs_free_path(path);
return 0;
}
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
......@@ -2361,29 +2445,31 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
struct btrfs_trans_handle *trans;
unsigned long nr = 0;
/*
* the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir
* the root of a subvolume or snapshot
*/
if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -ENOTEMPTY;
}
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir);
if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
err = btrfs_unlink_subvol(trans, root, dir,
BTRFS_I(inode)->location.objectid,
dentry->d_name.name,
dentry->d_name.len);
goto out;
}
err = btrfs_orphan_add(trans, inode);
if (err)
goto fail_trans;
goto out;
/* now the directory is empty */
err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
if (!err)
btrfs_i_size_write(inode, 0);
fail_trans:
out:
nr = trans->blocks_used;
ret = btrfs_end_transaction_throttle(trans, root);
btrfs_btree_balance_dirty(root, nr);
......@@ -2935,7 +3021,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
cur_offset,
cur_offset + hole_size,
block_end,
cur_offset, &hint_byte);
cur_offset, &hint_byte, 1);
if (err)
break;
err = btrfs_insert_file_extent(trans, root,
......@@ -3003,6 +3089,11 @@ void btrfs_delete_inode(struct inode *inode)
}
btrfs_wait_ordered_range(inode, 0, (u64)-1);
if (inode->i_nlink > 0) {
BUG_ON(btrfs_root_refs(&root->root_item) != 0);
goto no_delete;
}
btrfs_i_size_write(inode, 0);
trans = btrfs_join_transaction(root, 1);
......@@ -3070,29 +3161,67 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
* is kind of like crossing a mount point.
*/
static int fixup_tree_root_location(struct btrfs_root *root,
struct btrfs_key *location,
struct btrfs_root **sub_root,
struct dentry *dentry)
struct inode *dir,
struct dentry *dentry,
struct btrfs_key *location,
struct btrfs_root **sub_root)
{
struct btrfs_root_item *ri;
struct btrfs_path *path;
struct btrfs_root *new_root;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
int ret;
int err = 0;
if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
return 0;
if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
return 0;
path = btrfs_alloc_path();
if (!path) {
err = -ENOMEM;
goto out;
}
*sub_root = btrfs_read_fs_root(root->fs_info, location,
dentry->d_name.name,
dentry->d_name.len);
if (IS_ERR(*sub_root))
return PTR_ERR(*sub_root);
err = -ENOENT;
ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
BTRFS_I(dir)->root->root_key.objectid,
location->objectid);
if (ret) {
if (ret < 0)
err = ret;
goto out;
}
ri = &(*sub_root)->root_item;
location->objectid = btrfs_root_dirid(ri);
btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
location->offset = 0;
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino ||
btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
goto out;
return 0;
ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
(unsigned long)(ref + 1),
dentry->d_name.len);
if (ret)
goto out;
btrfs_release_path(root->fs_info->tree_root, path);
new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
if (IS_ERR(new_root)) {
err = PTR_ERR(new_root);
goto out;
}
if (btrfs_root_refs(&new_root->root_item) == 0) {
err = -ENOENT;
goto out;
}
*sub_root = new_root;
location->objectid = btrfs_root_dirid(&new_root->root_item);
location->type = BTRFS_INODE_ITEM_KEY;
location->offset = 0;
err = 0;
out:
btrfs_free_path(path);
return err;
}
static void inode_tree_add(struct inode *inode)
......@@ -3101,11 +3230,13 @@ static void inode_tree_add(struct inode *inode)
struct btrfs_inode *entry;
struct rb_node **p;
struct rb_node *parent;
again:
p = &root->inode_tree.rb_node;
parent = NULL;
if (hlist_unhashed(&inode->i_hash))
return;
spin_lock(&root->inode_lock);
while (*p) {
parent = *p;
......@@ -3132,13 +3263,87 @@ static void inode_tree_add(struct inode *inode)
static void inode_tree_del(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int empty = 0;
spin_lock(&root->inode_lock);
if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
empty = RB_EMPTY_ROOT(&root->inode_tree);
}
spin_unlock(&root->inode_lock);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
synchronize_srcu(&root->fs_info->subvol_srcu);
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
spin_unlock(&root->inode_lock);
if (empty)
btrfs_add_dead_root(root);
}
}
int btrfs_invalidate_inodes(struct btrfs_root *root)
{
struct rb_node *node;
struct rb_node *prev;
struct btrfs_inode *entry;
struct inode *inode;
u64 objectid = 0;
WARN_ON(btrfs_root_refs(&root->root_item) != 0);
spin_lock(&root->inode_lock);
again:
node = root->inode_tree.rb_node;
prev = NULL;
while (node) {
prev = node;
entry = rb_entry(node, struct btrfs_inode, rb_node);
if (objectid < entry->vfs_inode.i_ino)
node = node->rb_left;
else if (objectid > entry->vfs_inode.i_ino)
node = node->rb_right;
else
break;
}
if (!node) {
while (prev) {
entry = rb_entry(prev, struct btrfs_inode, rb_node);
if (objectid <= entry->vfs_inode.i_ino) {
node = prev;
break;
}
prev = rb_next(prev);
}
}
while (node) {
entry = rb_entry(node, struct btrfs_inode, rb_node);
objectid = entry->vfs_inode.i_ino + 1;
inode = igrab(&entry->vfs_inode);
if (inode) {
spin_unlock(&root->inode_lock);
if (atomic_read(&inode->i_count) > 1)
d_prune_aliases(inode);
/*
* btrfs_drop_inode will remove it from
* the inode cache when its usage count
* hits zero.
*/
iput(inode);
cond_resched();
spin_lock(&root->inode_lock);
goto again;
}
if (cond_resched_lock(&root->inode_lock))
goto again;
node = rb_next(node);
}
spin_unlock(&root->inode_lock);
return 0;
}
static noinline void init_btrfs_i(struct inode *inode)
......@@ -3225,15 +3430,41 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
return inode;
}
static struct inode *new_simple_dir(struct super_block *s,
struct btrfs_key *key,
struct btrfs_root *root)
{
struct inode *inode = new_inode(s);
if (!inode)
return ERR_PTR(-ENOMEM);
init_btrfs_i(inode);
BTRFS_I(inode)->root = root;
memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
BTRFS_I(inode)->dummy_inode = 1;
inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
return inode;
}
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
struct inode *inode;
struct btrfs_inode *bi = BTRFS_I(dir);
struct btrfs_root *root = bi->root;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *sub_root = root;
struct btrfs_key location;
int index;
int ret;
dentry->d_op = &btrfs_dentry_operations;
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
......@@ -3242,29 +3473,50 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
if (ret < 0)
return ERR_PTR(ret);
inode = NULL;
if (location.objectid) {
ret = fixup_tree_root_location(root, &location, &sub_root,
dentry);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
return ERR_PTR(-ENOENT);
if (location.objectid == 0)
return NULL;
if (location.type == BTRFS_INODE_ITEM_KEY) {
inode = btrfs_iget(dir->i_sb, &location, root);
return inode;
}
BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
index = srcu_read_lock(&root->fs_info->subvol_srcu);
ret = fixup_tree_root_location(root, dir, dentry,
&location, &sub_root);
if (ret < 0) {
if (ret != -ENOENT)
inode = ERR_PTR(ret);
else
inode = new_simple_dir(dir->i_sb, &location, sub_root);
} else {
inode = btrfs_iget(dir->i_sb, &location, sub_root);
if (IS_ERR(inode))
return ERR_CAST(inode);
}
srcu_read_unlock(&root->fs_info->subvol_srcu, index);
return inode;
}
static int btrfs_dentry_delete(struct dentry *dentry)
{
struct btrfs_root *root;
if (!dentry->d_inode)
return 0;
root = BTRFS_I(dentry->d_inode)->root;
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
return 0;
}
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
struct nameidata *nd)
{
struct inode *inode;
if (dentry->d_name.len > BTRFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
inode = btrfs_lookup_dentry(dir, dentry);
if (IS_ERR(inode))
return ERR_CAST(inode);
......@@ -3603,9 +3855,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
if (ret != 0)
goto fail;
if (objectid > root->highest_inode)
root->highest_inode = objectid;
inode->i_uid = current_fsuid();
if (dir && (dir->i_mode & S_ISGID)) {
......@@ -3673,26 +3922,35 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
struct inode *parent_inode, struct inode *inode,
const char *name, int name_len, int add_backref, u64 index)
{
int ret;
int ret = 0;
struct btrfs_key key;
struct btrfs_root *root = BTRFS_I(parent_inode)->root;
key.objectid = inode->i_ino;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
} else {
key.objectid = inode->i_ino;
btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
key.offset = 0;
}
if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
key.objectid, root->root_key.objectid,
parent_inode->i_ino,
index, name, name_len);
} else if (add_backref) {
ret = btrfs_insert_inode_ref(trans, root,
name, name_len, inode->i_ino,
parent_inode->i_ino, index);
}
ret = btrfs_insert_dir_item(trans, root, name, name_len,
parent_inode->i_ino,
&key, btrfs_inode_type(inode),
index);
if (ret == 0) {
if (add_backref) {
ret = btrfs_insert_inode_ref(trans, root,
name, name_len,
inode->i_ino,
parent_inode->i_ino,
index);
}
ret = btrfs_insert_dir_item(trans, root, name, name_len,
parent_inode->i_ino, &key,
btrfs_inode_type(inode), index);
BUG_ON(ret);
btrfs_i_size_write(parent_inode, parent_inode->i_size +
name_len * 2);
parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
......@@ -3875,18 +4133,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
err = btrfs_add_nondir(trans, dentry, inode, 1, index);
if (err)
drop_inode = 1;
btrfs_update_inode_block_group(trans, dir);
err = btrfs_update_inode(trans, root, inode);
if (err)
if (err) {
drop_inode = 1;
} else {
btrfs_update_inode_block_group(trans, dir);
err = btrfs_update_inode(trans, root, inode);
BUG_ON(err);
btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
}
nr = trans->blocks_used;
btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
btrfs_end_transaction_throttle(trans, root);
fail:
if (drop_inode) {
......@@ -4064,11 +4320,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
int compressed;
again:
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
if (em)
em->bdev = root->fs_info->fs_devices->latest_bdev;
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
if (em) {
if (em->start > start || em->start + em->len <= start)
......@@ -4215,6 +4471,11 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
map = kmap(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
memset(map + pg_offset + copy_size, 0,
PAGE_CACHE_SIZE - pg_offset -
copy_size);
}
kunmap(page);
}
flush_dcache_page(page);
......@@ -4259,7 +4520,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
}
err = 0;
spin_lock(&em_tree->lock);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that
......@@ -4299,7 +4560,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
err = 0;
}
}
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
out:
if (path)
btrfs_free_path(path);
......@@ -4398,13 +4659,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
u64 page_start = page_offset(page);
u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
/*
* we have the page locked, so new writeback can't start,
* and the dirty bit won't be cleared while we are here.
*
* Wait for IO on this page so that we can safely clear
* the PagePrivate2 bit and do ordered accounting
*/
wait_on_page_writeback(page);
tree = &BTRFS_I(page->mapping->host)->io_tree;
if (offset) {
btrfs_releasepage(page, GFP_NOFS);
return;
}
lock_extent(tree, page_start, page_end, GFP_NOFS);
ordered = btrfs_lookup_ordered_extent(page->mapping->host,
page_offset(page));
......@@ -4415,16 +4684,21 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
*/
clear_extent_bit(tree, page_start, page_end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_LOCKED, 1, 0, GFP_NOFS);
btrfs_finish_ordered_io(page->mapping->host,
page_start, page_end);
EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
/*
* whoever cleared the private bit is responsible
* for the finish_ordered_io
*/
if (TestClearPagePrivate2(page)) {
btrfs_finish_ordered_io(page->mapping->host,
page_start, page_end);
}
btrfs_put_ordered_extent(ordered);
lock_extent(tree, page_start, page_end, GFP_NOFS);
}
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_ORDERED,
1, 1, GFP_NOFS);
EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
1, 1, NULL, GFP_NOFS);
__btrfs_releasepage(page, GFP_NOFS);
ClearPageChecked(page);
......@@ -4521,11 +4795,14 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
}
ClearPageChecked(page);
set_page_dirty(page);
SetPageUptodate(page);
BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
if (!ret)
return VM_FAULT_LOCKED;
unlock_page(page);
out:
return ret;
......@@ -4594,11 +4871,11 @@ static void btrfs_truncate(struct inode *inode)
* create a new subvolume directory/inode (helper for the ioctl).
*/
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root, struct dentry *dentry,
struct btrfs_root *new_root,
u64 new_dirid, u64 alloc_hint)
{
struct inode *inode;
int error;
int err;
u64 index = 0;
inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
......@@ -4611,11 +4888,10 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
inode->i_nlink = 1;
btrfs_i_size_write(inode, 0);
error = btrfs_update_inode(trans, new_root, inode);
if (error)
return error;
err = btrfs_update_inode(trans, new_root, inode);
BUG_ON(err);
d_instantiate(dentry, inode);
iput(inode);
return 0;
}
......@@ -4693,6 +4969,16 @@ void btrfs_destroy_inode(struct inode *inode)
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
void btrfs_drop_inode(struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0)
generic_delete_inode(inode);
else
generic_drop_inode(inode);
}
static void init_once(void *foo)
{
struct btrfs_inode *ei = (struct btrfs_inode *) foo;
......@@ -4761,31 +5047,32 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
{
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(old_dir)->root;
struct btrfs_root *dest = BTRFS_I(new_dir)->root;
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec ctime = CURRENT_TIME;
u64 index = 0;
u64 root_objectid;
int ret;
/* we're not allowed to rename between subvolumes */
if (BTRFS_I(old_inode)->root->root_key.objectid !=
BTRFS_I(new_dir)->root->root_key.objectid)
if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
/* we only allow rename subvolume link between subvolumes */
if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
if (S_ISDIR(old_inode->i_mode) && new_inode &&
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) {
if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
(new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID))
return -ENOTEMPTY;
}
/* to rename a snapshot or subvolume, we need to juggle the
* backrefs. This isn't coded yet
*/
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
return -EXDEV;
if (S_ISDIR(old_inode->i_mode) && new_inode &&
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
ret = btrfs_check_metadata_free_space(root);
if (ret)
goto out_unlock;
return ret;
/*
* we're using rename to replace one file with another.
......@@ -4796,8 +5083,40 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
filemap_flush(old_inode->i_mapping);
/* close the racy window with snapshot create/destroy ioctl */
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&root->fs_info->subvol_sem);
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, new_dir);
if (dest != root)
btrfs_record_root_in_trans(trans, dest);
ret = btrfs_set_inode_index(new_dir, &index);
if (ret)
goto out_fail;
if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
/* force full log commit if subvolume involved. */
root->fs_info->last_trans_log_full_commit = trans->transid;
} else {
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
old_inode->i_ino,
new_dir->i_ino, index);
if (ret)
goto out_fail;
/*
* this is an ugly little race, but the rename is required
* to make sure that if we crash, the inode is either at the
* old name or the new one. pinning the log transaction lets
* us make sure we don't allow a log commit to come in after
* we unlink the name but before we add the new name back in.
*/
btrfs_pin_log_trans(root);
}
/*
* make sure the inode gets flushed if it is replacing
* something.
......@@ -4807,18 +5126,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
btrfs_add_ordered_operation(trans, root, old_inode);
}
/*
* this is an ugly little race, but the rename is required to make
* sure that if we crash, the inode is either at the old name
* or the new one. pinning the log transaction lets us make sure
* we don't allow a log commit to come in after we unlink the
* name but before we add the new name back in.
*/
btrfs_pin_log_trans(root);
btrfs_set_trans_block_group(trans, new_dir);
btrfs_inc_nlink(old_dentry->d_inode);
old_dir->i_ctime = old_dir->i_mtime = ctime;
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
......@@ -4826,47 +5133,58 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_dentry->d_parent != new_dentry->d_parent)
btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
old_dentry->d_name.name,
old_dentry->d_name.len);
if (ret)
goto out_fail;
if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) {
root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
old_dentry->d_name.name,
old_dentry->d_name.len);
} else {
btrfs_inc_nlink(old_dentry->d_inode);
ret = btrfs_unlink_inode(trans, root, old_dir,
old_dentry->d_inode,
old_dentry->d_name.name,
old_dentry->d_name.len);
}
BUG_ON(ret);
if (new_inode) {
new_inode->i_ctime = CURRENT_TIME;
ret = btrfs_unlink_inode(trans, root, new_dir,
new_dentry->d_inode,
new_dentry->d_name.name,
new_dentry->d_name.len);
if (ret)
goto out_fail;
if (unlikely(new_inode->i_ino ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
root_objectid = BTRFS_I(new_inode)->location.objectid;
ret = btrfs_unlink_subvol(trans, dest, new_dir,
root_objectid,
new_dentry->d_name.name,
new_dentry->d_name.len);
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, dest, new_dir,
new_dentry->d_inode,
new_dentry->d_name.name,
new_dentry->d_name.len);
}
BUG_ON(ret);
if (new_inode->i_nlink == 0) {
ret = btrfs_orphan_add(trans, new_dentry->d_inode);
if (ret)
goto out_fail;
BUG_ON(ret);
}
}
ret = btrfs_set_inode_index(new_dir, &index);
if (ret)
goto out_fail;
ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode,
old_inode, new_dentry->d_name.name,
new_dentry->d_name.len, 1, index);
if (ret)
goto out_fail;
ret = btrfs_add_link(trans, new_dir, old_inode,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
BUG_ON(ret);
btrfs_log_new_name(trans, old_inode, old_dir,
new_dentry->d_parent);
if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
btrfs_log_new_name(trans, old_inode, old_dir,
new_dentry->d_parent);
btrfs_end_log_trans(root);
}
out_fail:
/* this btrfs_end_log_trans just allows the current
* log-sub transaction to complete
*/
btrfs_end_log_trans(root);
btrfs_end_transaction_throttle(trans, root);
out_unlock:
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
return ret;
}
......@@ -5058,6 +5376,8 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
0, 0, 0,
BTRFS_FILE_EXTENT_PREALLOC);
BUG_ON(ret);
btrfs_drop_extent_cache(inode, cur_offset,
cur_offset + ins.offset -1, 0);
num_bytes -= ins.offset;
cur_offset += ins.offset;
alloc_hint = ins.objectid + ins.offset;
......@@ -5223,6 +5543,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
.lookup = btrfs_lookup,
.permission = btrfs_permission,
};
static struct file_operations btrfs_dir_file_operations = {
.llseek = generic_file_llseek,
.read = generic_read_dir,
......@@ -5310,3 +5631,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
.listxattr = btrfs_listxattr,
.removexattr = btrfs_removexattr,
};
struct dentry_operations btrfs_dentry_operations = {
.d_delete = btrfs_dentry_delete,
};
......@@ -230,8 +230,8 @@ static noinline int create_subvol(struct btrfs_root *root,
struct btrfs_root_item root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *new_root = root;
struct inode *dir;
struct btrfs_root *new_root;
struct inode *dir = dentry->d_parent->d_inode;
int ret;
int err;
u64 objectid;
......@@ -241,7 +241,7 @@ static noinline int create_subvol(struct btrfs_root *root,
ret = btrfs_check_metadata_free_space(root);
if (ret)
goto fail_commit;
return ret;
trans = btrfs_start_transaction(root, 1);
BUG_ON(!trans);
......@@ -304,11 +304,17 @@ static noinline int create_subvol(struct btrfs_root *root,
if (ret)
goto fail;
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
BUG_ON(IS_ERR(new_root));
btrfs_record_root_in_trans(trans, new_root);
ret = btrfs_create_subvol_root(trans, new_root, new_dirid,
BTRFS_I(dir)->block_group);
/*
* insert the directory item
*/
key.offset = (u64)-1;
dir = dentry->d_parent->d_inode;
ret = btrfs_set_inode_index(dir, &index);
BUG_ON(ret);
......@@ -322,44 +328,18 @@ static noinline int create_subvol(struct btrfs_root *root,
ret = btrfs_update_inode(trans, root, dir);
BUG_ON(ret);
/* add the backref first */
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
objectid, BTRFS_ROOT_BACKREF_KEY,
root->root_key.objectid,
objectid, root->root_key.objectid,
dir->i_ino, index, name, namelen);
BUG_ON(ret);
/* now add the forward ref */
ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
root->root_key.objectid, BTRFS_ROOT_REF_KEY,
objectid,
dir->i_ino, index, name, namelen);
BUG_ON(ret);
ret = btrfs_commit_transaction(trans, root);
if (ret)
goto fail_commit;
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
BUG_ON(!new_root);
trans = btrfs_start_transaction(new_root, 1);
BUG_ON(!trans);
ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid,
BTRFS_I(dir)->block_group);
if (ret)
goto fail;
d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
fail:
nr = trans->blocks_used;
err = btrfs_commit_transaction(trans, new_root);
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
fail_commit:
btrfs_btree_balance_dirty(root, nr);
return ret;
}
......@@ -420,14 +400,15 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
static noinline int btrfs_mksubvol(struct path *parent, char *name,
int mode, int namelen,
static noinline int btrfs_mksubvol(struct path *parent,
char *name, int namelen,
struct btrfs_root *snap_src)
{
struct inode *dir = parent->dentry->d_inode;
struct dentry *dentry;
int error;
mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
dentry = lookup_one_len(name, parent->dentry, namelen);
error = PTR_ERR(dentry);
......@@ -438,99 +419,39 @@ static noinline int btrfs_mksubvol(struct path *parent, char *name,
if (dentry->d_inode)
goto out_dput;
if (!IS_POSIXACL(parent->dentry->d_inode))
mode &= ~current_umask();
error = mnt_want_write(parent->mnt);
if (error)
goto out_dput;
error = btrfs_may_create(parent->dentry->d_inode, dentry);
error = btrfs_may_create(dir, dentry);
if (error)
goto out_drop_write;
/*
* Actually perform the low-level subvolume creation after all
* this VFS fuzz.
*
* Eventually we want to pass in an inode under which we create this
* subvolume, but for now all are under the filesystem root.
*
* Also we should pass on the mode eventually to allow creating new
* subvolume with specific mode bits.
*/
down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
goto out_up_read;
if (snap_src) {
struct dentry *dir = dentry->d_parent;
struct dentry *test = dir->d_parent;
struct btrfs_path *path = btrfs_alloc_path();
int ret;
u64 test_oid;
u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid;
test_oid = snap_src->root_key.objectid;
ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
path, parent_oid, test_oid);
if (ret == 0)
goto create;
btrfs_release_path(snap_src->fs_info->tree_root, path);
/* we need to make sure we aren't creating a directory loop
* by taking a snapshot of something that has our current
* subvol in its directory tree. So, this loops through
* the dentries and checks the forward refs for each subvolume
* to see if is references the subvolume where we are
* placing this new snapshot.
*/
while (1) {
if (!test ||
dir == snap_src->fs_info->sb->s_root ||
test == snap_src->fs_info->sb->s_root ||
test->d_inode->i_sb != snap_src->fs_info->sb) {
break;
}
if (S_ISLNK(test->d_inode->i_mode)) {
printk(KERN_INFO "Btrfs symlink in snapshot "
"path, failed\n");
error = -EMLINK;
btrfs_free_path(path);
goto out_drop_write;
}
test_oid =
BTRFS_I(test->d_inode)->root->root_key.objectid;
ret = btrfs_find_root_ref(snap_src->fs_info->tree_root,
path, test_oid, parent_oid);
if (ret == 0) {
printk(KERN_INFO "Btrfs snapshot creation "
"failed, looping\n");
error = -EMLINK;
btrfs_free_path(path);
goto out_drop_write;
}
btrfs_release_path(snap_src->fs_info->tree_root, path);
test = test->d_parent;
}
create:
btrfs_free_path(path);
error = create_snapshot(snap_src, dentry, name, namelen);
error = create_snapshot(snap_src, dentry,
name, namelen);
} else {
error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root,
dentry, name, namelen);
error = create_subvol(BTRFS_I(dir)->root, dentry,
name, namelen);
}
if (error)
goto out_drop_write;
fsnotify_mkdir(parent->dentry->d_inode, dentry);
if (!error)
fsnotify_mkdir(dir, dentry);
out_up_read:
up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
out_drop_write:
mnt_drop_write(parent->mnt);
out_dput:
dput(dentry);
out_unlock:
mutex_unlock(&parent->dentry->d_inode->i_mutex);
mutex_unlock(&dir->i_mutex);
return error;
}
static int btrfs_defrag_file(struct file *file)
{
struct inode *inode = fdentry(file)->d_inode;
......@@ -596,9 +517,8 @@ static int btrfs_defrag_file(struct file *file)
clear_page_dirty_for_io(page);
btrfs_set_extent_delalloc(inode, page_start, page_end);
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
set_page_dirty(page);
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
......@@ -609,7 +529,8 @@ static int btrfs_defrag_file(struct file *file)
return 0;
}
static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg)
static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
void __user *arg)
{
u64 new_size;
u64 old_size;
......@@ -718,10 +639,7 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
{
struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_dir_item *di;
struct btrfs_path *path;
struct file *src_file;
u64 root_dirid;
int namelen;
int ret = 0;
......@@ -739,32 +657,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
goto out;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
path, root_dirid,
vol_args->name, namelen, 0);
btrfs_free_path(path);
if (di && !IS_ERR(di)) {
ret = -EEXIST;
goto out;
}
if (IS_ERR(di)) {
ret = PTR_ERR(di);
goto out;
}
if (subvol) {
ret = btrfs_mksubvol(&file->f_path, vol_args->name,
file->f_path.dentry->d_inode->i_mode,
namelen, NULL);
ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
NULL);
} else {
struct inode *src_inode;
src_file = fget(vol_args->fd);
......@@ -781,17 +676,156 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
fput(src_file);
goto out;
}
ret = btrfs_mksubvol(&file->f_path, vol_args->name,
file->f_path.dentry->d_inode->i_mode,
namelen, BTRFS_I(src_inode)->root);
ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen,
BTRFS_I(src_inode)->root);
fput(src_file);
}
out:
kfree(vol_args);
return ret;
}
/*
* helper to check if the subvolume references other subvolumes
*/
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
struct btrfs_path *path;
struct btrfs_key key;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = root->root_key.objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = (u64)-1;
ret = btrfs_search_slot(NULL, root->fs_info->tree_root,
&key, path, 0, 0);
if (ret < 0)
goto out;
BUG_ON(ret == 0);
ret = 0;
if (path->slots[0] > 0) {
path->slots[0]--;
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid == root->root_key.objectid &&
key.type == BTRFS_ROOT_REF_KEY)
ret = -ENOTEMPTY;
}
out:
btrfs_free_path(path);
return ret;
}
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
void __user *arg)
{
struct dentry *parent = fdentry(file);
struct dentry *dentry;
struct inode *dir = parent->d_inode;
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
int namelen;
int ret;
int err = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
namelen = strlen(vol_args->name);
if (strchr(vol_args->name, '/') ||
strncmp(vol_args->name, "..", namelen) == 0) {
err = -EINVAL;
goto out;
}
err = mnt_want_write(file->f_path.mnt);
if (err)
goto out;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
dentry = lookup_one_len(vol_args->name, parent, namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_unlock_dir;
}
if (!dentry->d_inode) {
err = -ENOENT;
goto out_dput;
}
inode = dentry->d_inode;
if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out_dput;
}
dest = BTRFS_I(inode)->root;
mutex_lock(&inode->i_mutex);
err = d_invalidate(dentry);
if (err)
goto out_unlock;
down_write(&root->fs_info->subvol_sem);
err = may_destroy_subvol(dest);
if (err)
goto out_up_write;
trans = btrfs_start_transaction(root, 1);
ret = btrfs_unlink_subvol(trans, root, dir,
dest->root_key.objectid,
dentry->d_name.name,
dentry->d_name.len);
BUG_ON(ret);
btrfs_record_root_in_trans(trans, dest);
memset(&dest->root_item.drop_progress, 0,
sizeof(dest->root_item.drop_progress));
dest->root_item.drop_level = 0;
btrfs_set_root_refs(&dest->root_item, 0);
ret = btrfs_insert_orphan_item(trans,
root->fs_info->tree_root,
dest->root_key.objectid);
BUG_ON(ret);
ret = btrfs_commit_transaction(trans, root);
BUG_ON(ret);
inode->i_flags |= S_DEAD;
out_up_write:
up_write(&root->fs_info->subvol_sem);
out_unlock:
mutex_unlock(&inode->i_mutex);
if (!err) {
btrfs_invalidate_inodes(dest);
d_delete(dentry);
}
out_dput:
dput(dentry);
out_unlock_dir:
mutex_unlock(&dir->i_mutex);
mnt_drop_write(file->f_path.mnt);
out:
kfree(vol_args);
return err;
}
static int btrfs_ioctl_defrag(struct file *file)
{
struct inode *inode = fdentry(file)->d_inode;
......@@ -865,8 +899,8 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
return ret;
}
static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
{
struct inode *inode = fdentry(file)->d_inode;
struct btrfs_root *root = BTRFS_I(inode)->root;
......@@ -976,7 +1010,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
/* punch hole in destination first */
btrfs_drop_extents(trans, root, inode, off, off + len,
off + len, 0, &hint_byte);
off + len, 0, &hint_byte, 1);
/* clone data */
key.objectid = src->i_ino;
......@@ -1071,8 +1105,7 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
datao += off - key.offset;
datal -= off - key.offset;
}
if (key.offset + datao + datal + key.offset >
off + len)
if (key.offset + datao + datal > off + len)
datal = off + len - key.offset - datao;
/* disko == 0 means it's a hole */
if (!disko)
......@@ -1258,6 +1291,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SUBVOL_CREATE:
return btrfs_ioctl_snap_create(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp);
case BTRFS_IOC_DEFRAG:
return btrfs_ioctl_defrag(file);
case BTRFS_IOC_RESIZE:
......
......@@ -65,5 +65,6 @@ struct btrfs_ioctl_clone_range_args {
#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \
struct btrfs_ioctl_vol_args)
#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
struct btrfs_ioctl_vol_args)
#endif
......@@ -159,8 +159,6 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
*
* len is the length of the extent
*
* This also sets the EXTENT_ORDERED bit on the range in the inode.
*
* The tree is given a single reference on the ordered extent that was
* inserted.
*/
......@@ -181,6 +179,7 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
entry->start = start;
entry->len = len;
entry->disk_len = disk_len;
entry->bytes_left = len;
entry->inode = inode;
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
......@@ -195,9 +194,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
&entry->rb_node);
BUG_ON(node);
set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
entry_end(entry) - 1, GFP_NOFS);
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_add_tail(&entry->root_extent_list,
&BTRFS_I(inode)->root->fs_info->ordered_extents);
......@@ -241,13 +237,10 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
struct rb_node *node;
struct btrfs_ordered_extent *entry;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
int ret;
tree = &BTRFS_I(inode)->ordered_tree;
mutex_lock(&tree->mutex);
clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
GFP_NOFS);
node = tree_search(tree, file_offset);
if (!node) {
ret = 1;
......@@ -260,11 +253,16 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
goto out;
}
ret = test_range_bit(io_tree, entry->file_offset,
entry->file_offset + entry->len - 1,
EXTENT_ORDERED, 0);
if (ret == 0)
if (io_size > entry->bytes_left) {
printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
(unsigned long long)entry->bytes_left,
(unsigned long long)io_size);
}
entry->bytes_left -= io_size;
if (entry->bytes_left == 0)
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
else
ret = 1;
out:
mutex_unlock(&tree->mutex);
return ret == 0;
......@@ -476,6 +474,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
u64 orig_end;
u64 wait_end;
struct btrfs_ordered_extent *ordered;
int found;
if (start + len < start) {
orig_end = INT_LIMIT(loff_t);
......@@ -502,6 +501,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
orig_end >> PAGE_CACHE_SHIFT);
end = orig_end;
found = 0;
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, end);
if (!ordered)
......@@ -514,6 +514,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
btrfs_put_ordered_extent(ordered);
break;
}
found++;
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
btrfs_put_ordered_extent(ordered);
......@@ -521,8 +522,8 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
break;
end--;
}
if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
EXTENT_DELALLOC, 0, NULL)) {
schedule_timeout(1);
goto again;
}
......@@ -613,7 +614,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
*/
if (test_range_bit(io_tree, disk_i_size,
ordered->file_offset + ordered->len - 1,
EXTENT_DELALLOC, 0)) {
EXTENT_DELALLOC, 0, NULL)) {
goto out;
}
/*
......@@ -664,7 +665,7 @@ int btrfs_ordered_update_i_size(struct inode *inode,
*/
if (i_size_test > entry_end(ordered) &&
!test_range_bit(io_tree, entry_end(ordered), i_size_test - 1,
EXTENT_DELALLOC, 0)) {
EXTENT_DELALLOC, 0, NULL)) {
new_i_size = min_t(u64, i_size_test, i_size_read(inode));
}
BTRFS_I(inode)->disk_i_size = new_i_size;
......
......@@ -85,6 +85,9 @@ struct btrfs_ordered_extent {
/* extent length on disk */
u64 disk_len;
/* number of bytes that still need writing */
u64 bytes_left;
/* flags (described above) */
unsigned long flags;
......
......@@ -65,3 +65,23 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
return ret;
}
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset)
{
struct btrfs_path *path;
struct btrfs_key key;
int ret;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = offset;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
btrfs_free_path(path);
return ret;
}
......@@ -121,6 +121,15 @@ struct inodevec {
int nr;
};
#define MAX_EXTENTS 128
struct file_extent_cluster {
u64 start;
u64 end;
u64 boundary[MAX_EXTENTS];
unsigned int nr;
};
struct reloc_control {
/* block group to relocate */
struct btrfs_block_group_cache *block_group;
......@@ -2180,7 +2189,7 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
struct reloc_control *rc)
{
if (test_range_bit(&rc->processed_blocks, bytenr,
bytenr + blocksize - 1, EXTENT_DIRTY, 1))
bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
return 1;
return 0;
}
......@@ -2529,56 +2538,94 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
}
static noinline_for_stack
int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
u64 block_start)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
int ret = 0;
em = alloc_extent_map(GFP_NOFS);
if (!em)
return -ENOMEM;
em->start = start;
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
while (1) {
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
write_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
}
btrfs_drop_extent_cache(inode, start, end, 0);
}
unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
return ret;
}
static int relocate_file_extent_cluster(struct inode *inode,
struct file_extent_cluster *cluster)
{
u64 page_start;
u64 page_end;
unsigned long i;
unsigned long first_index;
u64 offset = BTRFS_I(inode)->index_cnt;
unsigned long index;
unsigned long last_index;
unsigned int total_read = 0;
unsigned int total_dirty = 0;
unsigned int dirty_page = 0;
struct page *page;
struct file_ra_state *ra;
struct btrfs_ordered_extent *ordered;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
int nr = 0;
int ret = 0;
if (!cluster->nr)
return 0;
ra = kzalloc(sizeof(*ra), GFP_NOFS);
if (!ra)
return -ENOMEM;
index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
mutex_lock(&inode->i_mutex);
first_index = start >> PAGE_CACHE_SHIFT;
last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
/* make sure the dirty trick played by the caller work */
while (1) {
ret = invalidate_inode_pages2_range(inode->i_mapping,
first_index, last_index);
if (ret != -EBUSY)
break;
schedule_timeout(HZ/10);
}
i_size_write(inode, cluster->end + 1 - offset);
ret = setup_extent_mapping(inode, cluster->start - offset,
cluster->end - offset, cluster->start);
if (ret)
goto out_unlock;
file_ra_state_init(ra, inode->i_mapping);
for (i = first_index ; i <= last_index; i++) {
if (total_read % ra->ra_pages == 0) {
btrfs_force_ra(inode->i_mapping, ra, NULL, i,
min(last_index, ra->ra_pages + i - 1));
}
total_read++;
again:
if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
BUG_ON(1);
page = grab_cache_page(inode->i_mapping, i);
WARN_ON(cluster->start != cluster->boundary[0]);
while (index <= last_index) {
page = find_lock_page(inode->i_mapping, index);
if (!page) {
ret = -ENOMEM;
goto out_unlock;
page_cache_sync_readahead(inode->i_mapping,
ra, NULL, index,
last_index + 1 - index);
page = grab_cache_page(inode->i_mapping, index);
if (!page) {
ret = -ENOMEM;
goto out_unlock;
}
}
if (PageReadahead(page)) {
page_cache_async_readahead(inode->i_mapping,
ra, NULL, page, index,
last_index + 1 - index);
}
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
......@@ -2589,75 +2636,79 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
goto out_unlock;
}
}
wait_on_page_writeback(page);
page_start = (u64)page->index << PAGE_CACHE_SHIFT;
page_end = page_start + PAGE_CACHE_SIZE - 1;
lock_extent(io_tree, page_start, page_end, GFP_NOFS);
ordered = btrfs_lookup_ordered_extent(inode, page_start);
if (ordered) {
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
}
lock_extent(&BTRFS_I(inode)->io_tree,
page_start, page_end, GFP_NOFS);
set_page_extent_mapped(page);
if (i == first_index)
set_extent_bits(io_tree, page_start, page_end,
if (nr < cluster->nr &&
page_start + offset == cluster->boundary[nr]) {
set_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end,
EXTENT_BOUNDARY, GFP_NOFS);
nr++;
}
btrfs_set_extent_delalloc(inode, page_start, page_end);
set_page_dirty(page);
total_dirty++;
dirty_page++;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
unlock_extent(&BTRFS_I(inode)->io_tree,
page_start, page_end, GFP_NOFS);
unlock_page(page);
page_cache_release(page);
index++;
if (nr < cluster->nr &&
page_end + 1 + offset == cluster->boundary[nr]) {
balance_dirty_pages_ratelimited_nr(inode->i_mapping,
dirty_page);
dirty_page = 0;
}
}
if (dirty_page) {
balance_dirty_pages_ratelimited_nr(inode->i_mapping,
dirty_page);
}
WARN_ON(nr != cluster->nr);
out_unlock:
mutex_unlock(&inode->i_mutex);
kfree(ra);
balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
return ret;
}
static noinline_for_stack
int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
struct file_extent_cluster *cluster)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_map *em;
u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
u64 end = start + extent_key->offset - 1;
em = alloc_extent_map(GFP_NOFS);
em->start = start;
em->len = extent_key->offset;
em->block_len = extent_key->offset;
em->block_start = extent_key->objectid;
em->bdev = root->fs_info->fs_devices->latest_bdev;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
int ret;
/* setup extent map to cheat btrfs_readpage */
lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
while (1) {
int ret;
spin_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
spin_unlock(&em_tree->lock);
if (ret != -EEXIST) {
free_extent_map(em);
break;
}
btrfs_drop_extent_cache(inode, start, end, 0);
if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
ret = relocate_file_extent_cluster(inode, cluster);
if (ret)
return ret;
cluster->nr = 0;
}
unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
return relocate_inode_pages(inode, start, extent_key->offset);
if (!cluster->nr)
cluster->start = extent_key->objectid;
else
BUG_ON(cluster->nr >= MAX_EXTENTS);
cluster->end = extent_key->objectid + extent_key->offset - 1;
cluster->boundary[cluster->nr] = extent_key->objectid;
cluster->nr++;
if (cluster->nr >= MAX_EXTENTS) {
ret = relocate_file_extent_cluster(inode, cluster);
if (ret)
return ret;
cluster->nr = 0;
}
return 0;
}
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
......@@ -3203,10 +3254,12 @@ static int check_extent_flags(u64 flags)
return 0;
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
struct rb_root blocks = RB_ROOT;
struct btrfs_key key;
struct file_extent_cluster *cluster;
struct btrfs_trans_handle *trans = NULL;
struct btrfs_path *path;
struct btrfs_extent_item *ei;
......@@ -3216,10 +3269,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
int ret;
int err = 0;
cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
if (!cluster)
return -ENOMEM;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
rc->extents_found = 0;
rc->extents_skipped = 0;
rc->search_start = rc->block_group->key.objectid;
clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
GFP_NOFS);
......@@ -3306,14 +3366,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, rc->extent_root);
btrfs_end_transaction(trans, rc->extent_root);
trans = NULL;
btrfs_btree_balance_dirty(rc->extent_root, nr);
if (rc->stage == MOVE_DATA_EXTENTS &&
(flags & BTRFS_EXTENT_FLAG_DATA)) {
rc->found_file_extent = 1;
ret = relocate_data_extent(rc->data_inode, &key);
ret = relocate_data_extent(rc->data_inode,
&key, cluster);
if (ret < 0) {
err = ret;
break;
......@@ -3328,6 +3389,14 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
btrfs_btree_balance_dirty(rc->extent_root, nr);
}
if (!err) {
ret = relocate_file_extent_cluster(rc->data_inode, cluster);
if (ret < 0)
err = ret;
}
kfree(cluster);
rc->create_reloc_root = 0;
smp_mb();
......@@ -3348,8 +3417,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
}
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 size)
struct btrfs_root *root, u64 objectid)
{
struct btrfs_path *path;
struct btrfs_inode_item *item;
......@@ -3368,7 +3436,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
btrfs_set_inode_generation(leaf, item, 1);
btrfs_set_inode_size(leaf, item, size);
btrfs_set_inode_size(leaf, item, 0);
btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
btrfs_mark_buffer_dirty(leaf);
......@@ -3404,12 +3472,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
if (err)
goto out;
err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
BUG_ON(err);
err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
group->key.offset, 0, group->key.offset,
0, 0, 0);
err = __insert_orphan_inode(trans, root, objectid);
BUG_ON(err);
key.objectid = objectid;
......@@ -3475,14 +3538,15 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
btrfs_wait_ordered_extents(fs_info->tree_root, 0);
while (1) {
mutex_lock(&fs_info->cleaner_mutex);
btrfs_clean_old_snapshots(fs_info->tree_root);
mutex_unlock(&fs_info->cleaner_mutex);
rc->extents_found = 0;
rc->extents_skipped = 0;
mutex_lock(&fs_info->cleaner_mutex);
btrfs_clean_old_snapshots(fs_info->tree_root);
ret = relocate_block_group(rc);
mutex_unlock(&fs_info->cleaner_mutex);
if (ret < 0) {
err = ret;
break;
......@@ -3514,10 +3578,10 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
}
}
filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
rc->block_group->key.objectid,
rc->block_group->key.objectid +
rc->block_group->key.offset - 1);
filemap_write_and_wait_range(fs_info->btree_inode->i_mapping,
rc->block_group->key.objectid,
rc->block_group->key.objectid +
rc->block_group->key.offset - 1);
WARN_ON(rc->block_group->pinned > 0);
WARN_ON(rc->block_group->reserved > 0);
......@@ -3530,6 +3594,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
return err;
}
static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
int ret;
trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
memset(&root->root_item.drop_progress, 0,
sizeof(root->root_item.drop_progress));
root->root_item.drop_level = 0;
btrfs_set_root_refs(&root->root_item, 0);
ret = btrfs_update_root(trans, root->fs_info->tree_root,
&root->root_key, &root->root_item);
BUG_ON(ret);
ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
BUG_ON(ret);
return 0;
}
/*
* recover relocation interrupted by system crash.
*
......@@ -3589,8 +3673,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(root->fs_info,
reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
goto out;
ret = PTR_ERR(fs_root);
if (ret != -ENOENT) {
err = ret;
goto out;
}
mark_garbage_root(reloc_root);
}
}
......
......@@ -94,17 +94,23 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
goto out;
BUG_ON(ret == 0);
if (path->slots[0] == 0) {
ret = 1;
goto out;
}
l = path->nodes[0];
BUG_ON(path->slots[0] == 0);
slot = path->slots[0] - 1;
btrfs_item_key_to_cpu(l, &found_key, slot);
if (found_key.objectid != objectid) {
if (found_key.objectid != objectid ||
found_key.type != BTRFS_ROOT_ITEM_KEY) {
ret = 1;
goto out;
}
read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
sizeof(*item));
memcpy(key, &found_key, sizeof(found_key));
if (item)
read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
sizeof(*item));
if (key)
memcpy(key, &found_key, sizeof(found_key));
ret = 0;
out:
btrfs_free_path(path);
......@@ -249,6 +255,59 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
return ret;
}
int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
{
struct extent_buffer *leaf;
struct btrfs_path *path;
struct btrfs_key key;
int err = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = 0;
while (1) {
ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
if (ret < 0) {
err = ret;
break;
}
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(tree_root, path);
if (ret < 0)
err = ret;
if (ret != 0)
break;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_release_path(tree_root, path);
if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
key.type != BTRFS_ORPHAN_ITEM_KEY)
break;
ret = btrfs_find_dead_roots(tree_root, key.offset);
if (ret) {
err = ret;
break;
}
key.offset++;
}
btrfs_free_path(path);
return err;
}
/* drop the root item for 'key' from 'root' */
int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_key *key)
......@@ -278,31 +337,57 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ret;
}
#if 0 /* this will get used when snapshot deletion is implemented */
int btrfs_del_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
u64 root_id, u8 type, u64 ref_id)
u64 root_id, u64 ref_id, u64 dirid, u64 *sequence,
const char *name, int name_len)
{
struct btrfs_path *path;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
struct btrfs_key key;
unsigned long ptr;
int err = 0;
int ret;
struct btrfs_path *path;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = root_id;
key.type = type;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
BUG_ON(ret);
ret = btrfs_del_item(trans, tree_root, path);
BUG_ON(ret);
BUG_ON(ret < 0);
if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
ptr = (unsigned long)(ref + 1);
WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
BUG_ON(ret);
} else
err = -ENOENT;
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(tree_root, path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
goto again;
}
btrfs_free_path(path);
return ret;
return err;
}
#endif
int btrfs_find_root_ref(struct btrfs_root *tree_root,
struct btrfs_path *path,
......@@ -319,7 +404,6 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
return ret;
}
/*
* add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY
* or BTRFS_ROOT_BACKREF_KEY.
......@@ -335,8 +419,7 @@ int btrfs_find_root_ref(struct btrfs_root *tree_root,
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
u64 root_id, u8 type, u64 ref_id,
u64 dirid, u64 sequence,
u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
const char *name, int name_len)
{
struct btrfs_key key;
......@@ -346,13 +429,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
unsigned long ptr;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = root_id;
key.type = type;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = ref_id;
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
BUG_ON(ret);
......@@ -366,6 +450,14 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
write_extent_buffer(leaf, name, ptr, name_len);
btrfs_mark_buffer_dirty(leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {
btrfs_release_path(tree_root, path);
key.objectid = ref_id;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = root_id;
goto again;
}
btrfs_free_path(path);
return ret;
return 0;
}
......@@ -676,6 +676,7 @@ static int btrfs_unfreeze(struct super_block *sb)
}
static const struct super_operations btrfs_super_ops = {
.drop_inode = btrfs_drop_inode,
.delete_inode = btrfs_delete_inode,
.put_super = btrfs_put_super,
.sync_fs = btrfs_sync_fs,
......
......@@ -104,7 +104,6 @@ static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
{
if (root->ref_cows && root->last_trans < trans->transid) {
WARN_ON(root == root->fs_info->extent_root);
WARN_ON(root->root_item.refs == 0);
WARN_ON(root->commit_root != root->node);
radix_tree_tag_set(&root->fs_info->fs_roots_radix,
......@@ -720,7 +719,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
key.objectid = objectid;
key.offset = 0;
/* record when the snapshot was created in key.offset */
key.offset = trans->transid;
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
old = btrfs_lock_root_node(root);
......@@ -778,24 +778,14 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
ret = btrfs_update_inode(trans, parent_root, parent_inode);
BUG_ON(ret);
/* add the backref first */
ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
pending->root_key.objectid,
BTRFS_ROOT_BACKREF_KEY,
parent_root->root_key.objectid,
parent_inode->i_ino, index, pending->name,
namelen);
BUG_ON(ret);
/* now add the forward ref */
ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
parent_root->root_key.objectid,
BTRFS_ROOT_REF_KEY,
pending->root_key.objectid,
parent_inode->i_ino, index, pending->name,
namelen);
inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
d_instantiate(pending->dentry, inode);
fail:
......@@ -874,7 +864,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
unsigned long timeout = 1;
struct btrfs_transaction *cur_trans;
struct btrfs_transaction *prev_trans = NULL;
struct extent_io_tree *pinned_copy;
DEFINE_WAIT(wait);
int ret;
int should_grow = 0;
......@@ -915,13 +904,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return 0;
}
pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS);
if (!pinned_copy)
return -ENOMEM;
extent_io_tree_init(pinned_copy,
root->fs_info->btree_inode->i_mapping, GFP_NOFS);
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
if (cur_trans->list.prev != &root->fs_info->trans_list) {
......@@ -1019,6 +1001,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = commit_cowonly_roots(trans, root);
BUG_ON(ret);
btrfs_prepare_extent_commit(trans, root);
cur_trans = root->fs_info->running_transaction;
spin_lock(&root->fs_info->new_trans_lock);
root->fs_info->running_transaction = NULL;
......@@ -1042,8 +1026,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
sizeof(root->fs_info->super_copy));
btrfs_copy_pinned(root, pinned_copy);
trans->transaction->blocked = 0;
wake_up(&root->fs_info->transaction_wait);
......@@ -1059,8 +1041,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->fs_info->tree_log_mutex);
btrfs_finish_extent_commit(trans, root, pinned_copy);
kfree(pinned_copy);
btrfs_finish_extent_commit(trans, root);
/* do the directory inserts of any pending snapshot creations */
finish_pending_snapshots(trans, root->fs_info);
......@@ -1096,8 +1077,13 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
while (!list_empty(&list)) {
root = list_entry(list.next, struct btrfs_root, root_list);
list_del_init(&root->root_list);
btrfs_drop_snapshot(root, 0);
list_del(&root->root_list);
if (btrfs_header_backref_rev(root->node) <
BTRFS_MIXED_BACKREF_REV)
btrfs_drop_snapshot(root, 0);
else
btrfs_drop_snapshot(root, 1);
}
return 0;
}
......@@ -263,8 +263,8 @@ static int process_one_buffer(struct btrfs_root *log,
struct walk_control *wc, u64 gen)
{
if (wc->pin)
btrfs_update_pinned_extents(log->fs_info->extent_root,
eb->start, eb->len, 1);
btrfs_pin_extent(log->fs_info->extent_root,
eb->start, eb->len, 0);
if (btrfs_buffer_uptodate(eb, gen)) {
if (wc->write)
......@@ -534,7 +534,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
saved_nbytes = inode_get_bytes(inode);
/* drop any overlapping extents */
ret = btrfs_drop_extents(trans, root, inode,
start, extent_end, extent_end, start, &alloc_hint);
start, extent_end, extent_end, start, &alloc_hint, 1);
BUG_ON(ret);
if (found_type == BTRFS_FILE_EXTENT_REG ||
......@@ -2841,7 +2841,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
if (parent == sb->s_root)
if (IS_ROOT(parent))
break;
parent = parent->d_parent;
......@@ -2880,6 +2880,12 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_no_trans;
}
if (root != BTRFS_I(inode)->root ||
btrfs_root_refs(&root->root_item) == 0) {
ret = 1;
goto end_no_trans;
}
ret = check_parent_dirs_for_sync(trans, inode, parent,
sb, last_committed);
if (ret)
......@@ -2907,12 +2913,15 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
break;
inode = parent->d_inode;
if (root != BTRFS_I(inode)->root)
break;
if (BTRFS_I(inode)->generation >
root->fs_info->last_trans_committed) {
ret = btrfs_log_inode(trans, root, inode, inode_only);
BUG_ON(ret);
}
if (parent == sb->s_root)
if (IS_ROOT(parent))
break;
parent = parent->d_parent;
......@@ -2951,7 +2960,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
struct btrfs_key tmp_key;
struct btrfs_root *log;
struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
u64 highest_inode;
struct walk_control wc = {
.process_func = process_one_buffer,
.stage = 0,
......@@ -3010,11 +3018,6 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
path);
BUG_ON(ret);
}
ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
if (ret == 0) {
wc.replay_dest->highest_inode = highest_inode;
wc.replay_dest->last_inode_alloc = highest_inode;
}
key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
......
......@@ -276,7 +276,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
* is now congested. Back off and let other work structs
* run instead
*/
if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
fs_info->fs_devices->open_devices > 1) {
struct io_context *ioc;
......@@ -719,10 +719,9 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
* called very infrequently and that a given device has a small number
* of extents
*/
static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 num_bytes, u64 *start,
u64 *max_avail)
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail)
{
struct btrfs_key key;
struct btrfs_root *root = device->dev_root;
......@@ -1736,6 +1735,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
extent_root = root->fs_info->extent_root;
em_tree = &root->fs_info->mapping_tree.map_tree;
ret = btrfs_can_relocate(extent_root, chunk_offset);
if (ret)
return -ENOSPC;
/* step one, relocate all the extents inside this chunk */
ret = btrfs_relocate_block_group(extent_root, chunk_offset);
BUG_ON(ret);
......@@ -1749,9 +1752,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
* step two, delete the device extents and the
* chunk tree entries
*/
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_offset, 1);
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
BUG_ON(em->start > chunk_offset ||
em->start + em->len < chunk_offset);
......@@ -1780,9 +1783,9 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
BUG_ON(ret);
spin_lock(&em_tree->lock);
write_lock(&em_tree->lock);
remove_extent_mapping(em_tree, em);
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
kfree(map);
em->bdev = NULL;
......@@ -1807,12 +1810,15 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
struct btrfs_key found_key;
u64 chunk_tree = chunk_root->root_key.objectid;
u64 chunk_type;
bool retried = false;
int failed = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
again:
key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
key.offset = (u64)-1;
key.type = BTRFS_CHUNK_ITEM_KEY;
......@@ -1842,7 +1848,10 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
found_key.objectid,
found_key.offset);
BUG_ON(ret);
if (ret == -ENOSPC)
failed++;
else if (ret)
BUG();
}
if (found_key.offset == 0)
......@@ -1850,6 +1859,14 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
key.offset = found_key.offset - 1;
}
ret = 0;
if (failed && !retried) {
failed = 0;
retried = true;
goto again;
} else if (failed && retried) {
WARN_ON(1);
ret = -ENOSPC;
}
error:
btrfs_free_path(path);
return ret;
......@@ -1894,6 +1911,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
continue;
ret = btrfs_shrink_device(device, old_size - size_to_free);
if (ret == -ENOSPC)
break;
BUG_ON(ret);
trans = btrfs_start_transaction(dev_root, 1);
......@@ -1938,9 +1957,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
chunk = btrfs_item_ptr(path->nodes[0],
path->slots[0],
struct btrfs_chunk);
key.offset = found_key.offset;
/* chunk zero is special */
if (key.offset == 0)
if (found_key.offset == 0)
break;
btrfs_release_path(chunk_root, path);
......@@ -1948,7 +1966,8 @@ int btrfs_balance(struct btrfs_root *dev_root)
chunk_root->root_key.objectid,
found_key.objectid,
found_key.offset);
BUG_ON(ret);
BUG_ON(ret && ret != -ENOSPC);
key.offset = found_key.offset - 1;
}
ret = 0;
error:
......@@ -1974,10 +1993,13 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
u64 chunk_offset;
int ret;
int slot;
int failed = 0;
bool retried = false;
struct extent_buffer *l;
struct btrfs_key key;
struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
u64 old_total = btrfs_super_total_bytes(super_copy);
u64 old_size = device->total_bytes;
u64 diff = device->total_bytes - new_size;
if (new_size >= device->total_bytes)
......@@ -1987,12 +2009,6 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (!path)
return -ENOMEM;
trans = btrfs_start_transaction(root, 1);
if (!trans) {
ret = -ENOMEM;
goto done;
}
path->reada = 2;
lock_chunks(root);
......@@ -2001,8 +2017,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
if (device->writeable)
device->fs_devices->total_rw_bytes -= diff;
unlock_chunks(root);
btrfs_end_transaction(trans, root);
again:
key.objectid = device->devid;
key.offset = (u64)-1;
key.type = BTRFS_DEV_EXTENT_KEY;
......@@ -2017,6 +2033,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
goto done;
if (ret) {
ret = 0;
btrfs_release_path(root, path);
break;
}
......@@ -2024,14 +2041,18 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid)
if (key.objectid != device->devid) {
btrfs_release_path(root, path);
break;
}
dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size)
if (key.offset + length <= new_size) {
btrfs_release_path(root, path);
break;
}
chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
......@@ -2040,8 +2061,26 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
chunk_offset);
if (ret)
if (ret && ret != -ENOSPC)
goto done;
if (ret == -ENOSPC)
failed++;
key.offset -= 1;
}
if (failed && !retried) {
failed = 0;
retried = true;
goto again;
} else if (failed && retried) {
ret = -ENOSPC;
lock_chunks(root);
device->total_bytes = old_size;
if (device->writeable)
device->fs_devices->total_rw_bytes += diff;
unlock_chunks(root);
goto done;
}
/* Shrinking succeeded, else we would be at "done". */
......@@ -2294,9 +2333,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
em->block_len = em->len;
em_tree = &extent_root->fs_info->mapping_tree.map_tree;
spin_lock(&em_tree->lock);
write_lock(&em_tree->lock);
ret = add_extent_mapping(em_tree, em);
spin_unlock(&em_tree->lock);
write_unlock(&em_tree->lock);
BUG_ON(ret);
free_extent_map(em);
......@@ -2491,9 +2530,9 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
int readonly = 0;
int i;
spin_lock(&map_tree->map_tree.lock);
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
spin_unlock(&map_tree->map_tree.lock);
read_unlock(&map_tree->map_tree.lock);
if (!em)
return 1;
......@@ -2518,11 +2557,11 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
struct extent_map *em;
while (1) {
spin_lock(&tree->map_tree.lock);
write_lock(&tree->map_tree.lock);
em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
if (em)
remove_extent_mapping(&tree->map_tree, em);
spin_unlock(&tree->map_tree.lock);
write_unlock(&tree->map_tree.lock);
if (!em)
break;
kfree(em->bdev);
......@@ -2540,9 +2579,9 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
struct extent_map_tree *em_tree = &map_tree->map_tree;
int ret;
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, len);
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
BUG_ON(!em);
BUG_ON(em->start > logical || em->start + em->len < logical);
......@@ -2604,9 +2643,9 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
atomic_set(&multi->error, 0);
}
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, logical, *length);
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
if (!em && unplug_page)
return 0;
......@@ -2763,9 +2802,9 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 stripe_nr;
int i, j, nr = 0;
spin_lock(&em_tree->lock);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, chunk_start, 1);
spin_unlock(&em_tree->lock);
read_unlock(&em_tree->lock);
BUG_ON(!em || em->start != chunk_start);
map = (struct map_lookup *)em->bdev;
......@@ -3053,9 +3092,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
spin_lock(&map_tree->map_tree.lock);
read_lock(&map_tree->map_tree.lock);
em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
spin_unlock(&map_tree->map_tree.lock);
read_unlock(&map_tree->map_tree.lock);
/* already mapped? */
if (em && em->start <= logical && em->start + em->len > logical) {
......@@ -3114,9 +3153,9 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
map->stripes[i].dev->in_fs_metadata = 1;
}
spin_lock(&map_tree->map_tree.lock);
write_lock(&map_tree->map_tree.lock);
ret = add_extent_mapping(&map_tree->map_tree, em);
spin_unlock(&map_tree->map_tree.lock);
write_unlock(&map_tree->map_tree.lock);
BUG_ON(ret);
free_extent_map(em);
......
......@@ -181,4 +181,7 @@ int btrfs_balance(struct btrfs_root *dev_root);
void btrfs_unlock_volumes(void);
void btrfs_lock_volumes(void);
int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
int find_free_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device, u64 num_bytes,
u64 *start, u64 *max_avail);
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册