author    Joe Peterson <lavajoe@gentoo.org>  2008-08-20 23:59:43 +0000
committer Joe Peterson <lavajoe@gentoo.org>  2008-08-20 23:59:43 +0000
commit    05e41b7b4f8b48bf4676c718264823d6318f26d5
tree      6810381e43e1aed88becc82a67554125dba639b7 /sys-fs
parent    Fix ChangeLog.
Apply 0.16 hotfixes (first set)
(Portage version: 2.2_rc8/cvs/Linux 2.6.26-gentoo i686)
Diffstat (limited to 'sys-fs')
-rw-r--r--  sys-fs/btrfs/ChangeLog                       |   8
-rw-r--r--  sys-fs/btrfs/btrfs-0.16-r2.ebuild            |  66
-rw-r--r--  sys-fs/btrfs/files/btrfs-0.16-hotfix-1.patch | 992
3 files changed, 1065 insertions(+), 1 deletion(-)
diff --git a/sys-fs/btrfs/ChangeLog b/sys-fs/btrfs/ChangeLog
index b972bbcbcf47..3b52470c450b 100644
--- a/sys-fs/btrfs/ChangeLog
+++ b/sys-fs/btrfs/ChangeLog
@@ -1,6 +1,12 @@
# ChangeLog for sys-fs/btrfs
# Copyright 1999-2008 Gentoo Foundation; Distributed under the GPL v2
-# $Header: /var/cvsroot/gentoo-x86/sys-fs/btrfs/ChangeLog,v 1.12 2008/08/18 17:10:32 lavajoe Exp $
+# $Header: /var/cvsroot/gentoo-x86/sys-fs/btrfs/ChangeLog,v 1.13 2008/08/20 23:59:42 lavajoe Exp $
+
+*btrfs-0.16-r2 (20 Aug 2008)
+
+ 20 Aug 2008; Joe Peterson <lavajoe@gentoo.org>
+ +files/btrfs-0.16-hotfix-1.patch, +btrfs-0.16-r2.ebuild:
+ Apply 0.16 hotfixes (first set)
+
 *btrfs-0.16-r1 (18 Aug 2008)
diff --git a/sys-fs/btrfs/btrfs-0.16-r2.ebuild b/sys-fs/btrfs/btrfs-0.16-r2.ebuild
new file mode 100644
index 000000000000..dbae32b51317
--- /dev/null
+++ b/sys-fs/btrfs/btrfs-0.16-r2.ebuild
@@ -0,0 +1,66 @@
+# Copyright 1999-2008 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+# $Header: /var/cvsroot/gentoo-x86/sys-fs/btrfs/btrfs-0.16-r2.ebuild,v 1.1 2008/08/20 23:59:42 lavajoe Exp $
+
+inherit eutils linux-mod
+
+DESCRIPTION="A checksumming copy-on-write filesystem"
+HOMEPAGE="http://btrfs.wiki.kernel.org/"
+SRC_URI="http://www.kernel.org/pub/linux/kernel/people/mason/btrfs/${P}.tar.bz2"
+
+LICENSE="GPL-2"
+SLOT="0"
+KEYWORDS="~amd64 ~x86"
+IUSE=""
+
+DEPEND=""
+RDEPEND="${DEPEND}"
+
+pkg_setup()
+{
+ linux-mod_pkg_setup
+
+ BUILD_TARGETS="all"
+ BUILD_PARAMS="KERNELDIR=${KV_OUT_DIR}"
+ MODULE_NAMES="btrfs(fs:${S})"
+
+ if ! kernel_is 2 6; then
+ eerror "Need a 2.6 kernel to compile against!"
+ die "Need a 2.6 kernel to compile against!"
+ fi
+
+ if ! linux_chkconfig_present LIBCRC32C; then
+ eerror "You need to enable LIBCRC32C in your kernel!"
+ die "You need to enable LIBCRC32C in your kernel!"
+ fi
+}
+
+src_unpack() {
+ unpack ${A}
+ cd "${S}"
+
+ # Apply hot fixes
+ epatch "${FILESDIR}/${P}-hotfix-1.patch"
+}
+
+src_install()
+{
+ linux-mod_src_install
+
+ dodoc INSTALL TODO
+}
+
+pkg_postinst() {
+ linux-mod_pkg_postinst
+
+ ewarn "WARNING: Btrfs is under heavy development and is not suitable for"
+ ewarn " any uses other than benchmarking and review."
+ ewarn " The Btrfs disk format is not yet finalized."
+ ewarn
+ ewarn " Also, it is highly recommended that the versions of"
+ ewarn " btrfs and btrfs-progs match."
+ ewarn
+ ewarn "Note: THE DISK FORMAT HAS CHANGED!"
+ ewarn " You must backup your data and re-create your btrfs"
+ ewarn " filesystem(s) for use with this version."
+}
diff --git a/sys-fs/btrfs/files/btrfs-0.16-hotfix-1.patch b/sys-fs/btrfs/files/btrfs-0.16-hotfix-1.patch
new file mode 100644
index 000000000000..dcd6c397cb9b
--- /dev/null
+++ b/sys-fs/btrfs/files/btrfs-0.16-hotfix-1.patch
@@ -0,0 +1,992 @@
+diff -Nurp btrfs-0.16/async-thread.c btrfs-0.16.new/async-thread.c
+--- btrfs-0.16/async-thread.c 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/async-thread.c 2008-08-20 17:43:19.211404694 -0600
+@@ -49,6 +49,8 @@ struct btrfs_worker_thread {
+ /* number of things on the pending list */
+ atomic_t num_pending;
+
++ unsigned long sequence;
++
+ /* protects the pending list. */
+ spinlock_t lock;
+
+@@ -153,7 +155,7 @@ int btrfs_stop_workers(struct btrfs_work
+ /*
+ * simple init on struct btrfs_workers
+ */
+-void btrfs_init_workers(struct btrfs_workers *workers, int max)
++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
+ {
+ workers->num_workers = 0;
+ INIT_LIST_HEAD(&workers->worker_list);
+@@ -161,6 +163,7 @@ void btrfs_init_workers(struct btrfs_wor
+ spin_lock_init(&workers->lock);
+ workers->max_workers = max;
+ workers->idle_thresh = 32;
++ workers->name = name;
+ }
+
+ /*
+@@ -184,7 +187,9 @@ int btrfs_start_workers(struct btrfs_wor
+ INIT_LIST_HEAD(&worker->worker_list);
+ spin_lock_init(&worker->lock);
+ atomic_set(&worker->num_pending, 0);
+- worker->task = kthread_run(worker_loop, worker, "btrfs");
++ worker->task = kthread_run(worker_loop, worker,
++ "btrfs-%s-%d", workers->name,
++ workers->num_workers + i);
+ worker->workers = workers;
+ if (IS_ERR(worker->task)) {
+ kfree(worker);
+@@ -194,6 +199,7 @@ int btrfs_start_workers(struct btrfs_wor
+
+ spin_lock_irq(&workers->lock);
+ list_add_tail(&worker->worker_list, &workers->idle_list);
++ worker->idle = 1;
+ workers->num_workers++;
+ spin_unlock_irq(&workers->lock);
+ }
+@@ -235,7 +241,10 @@ static struct btrfs_worker_thread *next_
+ */
+ next = workers->worker_list.next;
+ worker = list_entry(next, struct btrfs_worker_thread, worker_list);
+- list_move_tail(next, &workers->worker_list);
++ atomic_inc(&worker->num_pending);
++ worker->sequence++;
++ if (worker->sequence % workers->idle_thresh == 0)
++ list_move_tail(next, &workers->worker_list);
+ return worker;
+ }
+
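The next_worker() hunk above is the interesting part of the async-thread changes: instead of rotating the idle list on every queued item, a worker now receives idle_thresh items in a row before being moved to the tail, so related work batches up on one thread. A rough standalone C sketch of that rotation policy (a fixed array stands in for the kernel list; all names here are invented for illustration, not btrfs code):

#include <stdio.h>

#define NUM_WORKERS 4
#define IDLE_THRESH 3   /* items handed to a worker before rotating */

struct worker {
    int id;
    unsigned long sequence;   /* items assigned so far */
};

static struct worker workers[NUM_WORKERS];
static int head;   /* index of the current front-of-list worker */

/* Pick the front worker; rotate to the "tail" only every
 * IDLE_THRESH assignments, so consecutive items batch together. */
static struct worker *next_worker(void)
{
    struct worker *w = &workers[head];

    w->sequence++;
    if (w->sequence % IDLE_THRESH == 0)
        head = (head + 1) % NUM_WORKERS;
    return w;
}

int main(void)
{
    for (int i = 0; i < NUM_WORKERS; i++)
        workers[i].id = i;
    for (int i = 0; i < 12; i++)
        printf("item %2d -> worker %d\n", i, next_worker()->id);
    return 0;
}

Run it and the assignments come out in runs of IDLE_THRESH per worker rather than strictly alternating, which is the locality the hotfix is after.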
+diff -Nurp btrfs-0.16/async-thread.h btrfs-0.16.new/async-thread.h
+--- btrfs-0.16/async-thread.h 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/async-thread.h 2008-08-20 17:43:19.211404694 -0600
+@@ -69,11 +69,14 @@ struct btrfs_workers {
+
+ /* lock for finding the next worker thread to queue on */
+ spinlock_t lock;
++
++ /* extra name for this worker */
++ char *name;
+ };
+
+ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
+ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
+ int btrfs_stop_workers(struct btrfs_workers *workers);
+-void btrfs_init_workers(struct btrfs_workers *workers, int max);
++void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
+ int btrfs_requeue_work(struct btrfs_work *work);
+ #endif
+diff -Nurp btrfs-0.16/compat.h btrfs-0.16.new/compat.h
+--- btrfs-0.16/compat.h 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/compat.h 2008-08-20 17:43:19.501402495 -0600
+@@ -1,6 +1,9 @@
+ #ifndef _COMPAT_H_
+ #define _COMPAT_H_
+
++#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,26)
++#define trylock_page(page) (!TestSetPageLocked(page))
++#endif
+
+ /*
+ * Even if AppArmor isn't enabled, it still has different prototypes.
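The new compat.h block is the usual version-shim pattern: callers use the newer trylock_page() name everywhere, and on kernels that predate it (<= 2.6.26) the macro expresses it through the older TestSetPageLocked() primitive, which returns the previous bit, so the trylock succeeds exactly when the bit was not already set. A minimal userspace analog of the shim, with a plain int standing in for the atomic page flag (the kernel version is atomic; this sketch is single-threaded):

#include <stdio.h>

/* Pretend API level, playing the role of LINUX_VERSION_CODE. */
#define API_LEVEL 26

/* Old-style primitive: set the lock bit, return its previous value
 * (1 if it was already locked). */
static int test_and_set_locked(int *flags)
{
    int old = *flags;
    *flags = 1;
    return old;
}

#if API_LEVEL <= 26
/* Newer name defined in terms of the old primitive: trylock
 * succeeds when the bit was NOT already set. */
#define trylock(flags) (!test_and_set_locked(flags))
#endif

int main(void)
{
    int page_locked = 0;
    printf("first trylock:  %d\n", trylock(&page_locked));  /* 1: got it */
    printf("second trylock: %d\n", trylock(&page_locked));  /* 0: contended */
    return 0;
}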
+diff -Nurp btrfs-0.16/ctree.h btrfs-0.16.new/ctree.h
+--- btrfs-0.16/ctree.h 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/ctree.h 2008-08-20 17:43:19.521406669 -0600
+@@ -526,6 +526,7 @@ struct btrfs_fs_info {
+ struct btrfs_transaction *running_transaction;
+ wait_queue_head_t transaction_throttle;
+ wait_queue_head_t transaction_wait;
++ wait_queue_head_t async_submit_wait;
+ struct btrfs_super_block super_copy;
+ struct btrfs_super_block super_for_commit;
+ struct block_device *__bdev;
+@@ -544,6 +545,7 @@ struct btrfs_fs_info {
+ struct list_head hashers;
+ struct list_head dead_roots;
+ atomic_t nr_async_submits;
++ atomic_t nr_async_bios;
+
+ /*
+ * this is used by the balancing code to wait for all the pending
+@@ -1648,7 +1650,7 @@ int btrfs_csum_truncate(struct btrfs_tra
+ /* inode.c */
+
+ /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
+-#ifdef ClearPageFsMisc
++#if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
+ #define ClearPageChecked ClearPageFsMisc
+ #define SetPageChecked SetPageFsMisc
+ #define PageChecked PageFsMisc
+diff -Nurp btrfs-0.16/disk-io.c btrfs-0.16.new/disk-io.c
+--- btrfs-0.16/disk-io.c 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/disk-io.c 2008-08-20 17:43:19.541408335 -0600
+@@ -429,14 +429,38 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_
+ return 0;
+ }
+
++unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
++{
++ unsigned long limit = min_t(unsigned long,
++ info->workers.max_workers,
++ info->fs_devices->open_devices);
++ return 256 * limit;
++}
++
++int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
++{
++ return atomic_read(&info->nr_async_bios) >
++ btrfs_async_submit_limit(info);
++}
++
+ static void run_one_async_submit(struct btrfs_work *work)
+ {
+ struct btrfs_fs_info *fs_info;
+ struct async_submit_bio *async;
++ int limit;
+
+ async = container_of(work, struct async_submit_bio, work);
+ fs_info = BTRFS_I(async->inode)->root->fs_info;
++
++ limit = btrfs_async_submit_limit(fs_info);
++ limit = limit * 2 / 3;
++
+ atomic_dec(&fs_info->nr_async_submits);
++
++ if (atomic_read(&fs_info->nr_async_submits) < limit &&
++ waitqueue_active(&fs_info->async_submit_wait))
++ wake_up(&fs_info->async_submit_wait);
++
+ async->submit_bio_hook(async->inode, async->rw, async->bio,
+ async->mirror_num);
+ kfree(async);
+@@ -447,6 +471,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_
+ extent_submit_bio_hook_t *submit_bio_hook)
+ {
+ struct async_submit_bio *async;
++ int limit = btrfs_async_submit_limit(fs_info);
+
+ async = kmalloc(sizeof(*async), GFP_NOFS);
+ if (!async)
+@@ -461,6 +486,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_
+ async->work.flags = 0;
+ atomic_inc(&fs_info->nr_async_submits);
+ btrfs_queue_worker(&fs_info->workers, &async->work);
++
++ wait_event_timeout(fs_info->async_submit_wait,
++ (atomic_read(&fs_info->nr_async_submits) < limit),
++ HZ/10);
+ return 0;
+ }
+
+@@ -475,11 +504,11 @@ static int __btree_submit_bio_hook(struc
+
+ /*
+ * when we're called for a write, we're already in the async
+- * submission context. Just jump ingo btrfs_map_bio
++ * submission context. Just jump into btrfs_map_bio
+ */
+ if (rw & (1 << BIO_RW)) {
+ return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+- mirror_num, 0);
++ mirror_num, 1);
+ }
+
+ /*
+@@ -511,6 +540,12 @@ static int btree_writepage(struct page *
+ {
+ struct extent_io_tree *tree;
+ tree = &BTRFS_I(page->mapping->host)->io_tree;
++
++ if (current->flags & PF_MEMALLOC) {
++ redirty_page_for_writepage(wbc, page);
++ unlock_page(page);
++ return 0;
++ }
+ return extent_write_full_page(tree, page, btree_get_extent, wbc);
+ }
+
+@@ -522,16 +557,11 @@ static int btree_writepages(struct addre
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ u64 num_dirty;
+ u64 start = 0;
+- unsigned long thresh = 96 * 1024 * 1024;
++ unsigned long thresh = 8 * 1024 * 1024;
+
+ if (wbc->for_kupdate)
+ return 0;
+
+- if (current_is_pdflush()) {
+- thresh = 96 * 1024 * 1024;
+- } else {
+- thresh = 8 * 1024 * 1024;
+- }
+ num_dirty = count_range_bits(tree, &start, (u64)-1,
+ thresh, EXTENT_DIRTY);
+ if (num_dirty < thresh) {
+@@ -938,15 +968,13 @@ static int btrfs_congested_fn(void *cong
+ {
+ struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
+ int ret = 0;
+- int limit = 256 * info->fs_devices->open_devices;
+ struct list_head *cur;
+ struct btrfs_device *device;
+ struct backing_dev_info *bdi;
+
+ if ((bdi_bits & (1 << BDI_write_congested)) &&
+- atomic_read(&info->nr_async_submits) > limit) {
++ btrfs_congested_async(info, 0))
+ return 1;
+- }
+
+ list_for_each(cur, &info->fs_devices->devices) {
+ device = list_entry(cur, struct btrfs_device, dev_list);
+@@ -1250,6 +1278,7 @@ struct btrfs_root *open_ctree(struct sup
+ INIT_LIST_HEAD(&fs_info->space_info);
+ btrfs_mapping_init(&fs_info->mapping_tree);
+ atomic_set(&fs_info->nr_async_submits, 0);
++ atomic_set(&fs_info->nr_async_bios, 0);
+ atomic_set(&fs_info->throttles, 0);
+ atomic_set(&fs_info->throttle_gen, 0);
+ fs_info->sb = sb;
+@@ -1311,6 +1340,7 @@ struct btrfs_root *open_ctree(struct sup
+ mutex_init(&fs_info->volume_mutex);
+ init_waitqueue_head(&fs_info->transaction_throttle);
+ init_waitqueue_head(&fs_info->transaction_wait);
++ init_waitqueue_head(&fs_info->async_submit_wait);
+
+ #if 0
+ ret = add_hasher(fs_info, "crc32c");
+@@ -1347,8 +1377,11 @@ struct btrfs_root *open_ctree(struct sup
+ * queue work function gets called at interrupt time, and so it
+ * cannot dynamically grow.
+ */
+- btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
+- btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
++ btrfs_init_workers(&fs_info->workers, "worker",
++ fs_info->thread_pool_size);
++ btrfs_init_workers(&fs_info->submit_workers, "submit",
++ min_t(u64, fs_devices->num_devices,
++ fs_info->thread_pool_size));
+
+ /* a higher idle thresh on the submit workers makes it much more
+ * likely that bios will be send down in a sane order to the
+@@ -1356,9 +1389,18 @@ struct btrfs_root *open_ctree(struct sup
+ */
+ fs_info->submit_workers.idle_thresh = 64;
+
+- btrfs_init_workers(&fs_info->fixup_workers, 1);
+- btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+- btrfs_init_workers(&fs_info->endio_write_workers,
++ /* fs_info->workers is responsible for checksumming file data
++ * blocks and metadata. Using a larger idle thresh allows each
++ * worker thread to operate on things in roughly the order they
++ * were sent by the writeback daemons, improving overall locality
++ * of the IO going down the pipe.
++ */
++ fs_info->workers.idle_thresh = 128;
++
++ btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1);
++ btrfs_init_workers(&fs_info->endio_workers, "endio",
++ fs_info->thread_pool_size);
++ btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
+ fs_info->thread_pool_size);
+
+ /*
+@@ -1823,10 +1865,10 @@ void btrfs_btree_balance_dirty(struct bt
+ struct extent_io_tree *tree;
+ u64 num_dirty;
+ u64 start = 0;
+- unsigned long thresh = 16 * 1024 * 1024;
++ unsigned long thresh = 96 * 1024 * 1024;
+ tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
+
+- if (current_is_pdflush())
++ if (current_is_pdflush() || current->flags & PF_MEMALLOC)
+ return;
+
+ num_dirty = count_range_bits(tree, &start, (u64)-1,
+diff -Nurp btrfs-0.16/disk-io.h btrfs-0.16.new/disk-io.h
+--- btrfs-0.16/disk-io.h 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/disk-io.h 2008-08-20 17:43:19.541408335 -0600
+@@ -72,4 +72,6 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_
+ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
+ int rw, struct bio *bio, int mirror_num,
+ extent_submit_bio_hook_t *submit_bio_hook);
++int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
++unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
+ #endif
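Taken together, btrfs_async_submit_limit(), the wait_event_timeout() in btrfs_wq_submit_bio(), and the wake_up() in run_one_async_submit() form a simple backpressure loop: submitters sleep while the in-flight count exceeds the limit, and the worker wakes them once the backlog drops below two thirds of it. A condensed pthread sketch of that producer/consumer throttle (illustrative only; the kernel code uses waitqueues, atomics, and a timed wait rather than an unbounded one):

#include <pthread.h>
#include <stdio.h>

#define LIMIT 8
#define TOTAL 32

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t work_avail  = PTHREAD_COND_INITIALIZER;
static pthread_cond_t below_limit = PTHREAD_COND_INITIALIZER;
static int nr_async;   /* backlog, like fs_info->nr_async_submits */

/* Producer side: queue one item, then throttle while over the limit. */
static void submit(int i)
{
    pthread_mutex_lock(&lock);
    nr_async++;
    pthread_cond_signal(&work_avail);
    while (nr_async >= LIMIT)              /* kernel: wait_event_timeout() */
        pthread_cond_wait(&below_limit, &lock);
    pthread_mutex_unlock(&lock);
}

/* Consumer side: retire items, waking producers below the watermark. */
static void *worker(void *arg)
{
    for (int done = 0; done < TOTAL; done++) {
        pthread_mutex_lock(&lock);
        while (nr_async == 0)
            pthread_cond_wait(&work_avail, &lock);
        nr_async--;                        /* "submit the bio" */
        if (nr_async < LIMIT * 2 / 3)      /* same 2/3 watermark as the patch */
            pthread_cond_broadcast(&below_limit);
        pthread_mutex_unlock(&lock);
    }
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, worker, NULL);
    for (int i = 0; i < TOTAL; i++)
        submit(i);
    pthread_join(t, NULL);
    puts("all submissions throttled and drained");
    return 0;
}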
+diff -Nurp btrfs-0.16/extent_io.c btrfs-0.16.new/extent_io.c
+--- btrfs-0.16/extent_io.c 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/extent_io.c 2008-08-20 17:43:19.561407722 -0600
+@@ -14,6 +14,9 @@
+ #include <linux/pagevec.h>
+ #include "extent_io.h"
+ #include "extent_map.h"
++#include "compat.h"
++#include "ctree.h"
++#include "btrfs_inode.h"
+
+ /* temporary define until extent_map moves out of btrfs */
+ struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
+@@ -1393,15 +1396,11 @@ static int end_bio_extent_writepage(stru
+ {
+ int uptodate = err == 0;
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+- struct extent_state *state = bio->bi_private;
+- struct extent_io_tree *tree = state->tree;
+- struct rb_node *node;
++ struct extent_io_tree *tree;
+ u64 start;
+ u64 end;
+- u64 cur;
+ int whole_page;
+ int ret;
+- unsigned long flags;
+
+ #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+ if (bio->bi_size)
+@@ -1409,6 +1408,8 @@ static int end_bio_extent_writepage(stru
+ #endif
+ do {
+ struct page *page = bvec->bv_page;
++ tree = &BTRFS_I(page->mapping->host)->io_tree;
++
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+@@ -1422,7 +1423,7 @@ static int end_bio_extent_writepage(stru
+ prefetchw(&bvec->bv_page->flags);
+ if (tree->ops && tree->ops->writepage_end_io_hook) {
+ ret = tree->ops->writepage_end_io_hook(page, start,
+- end, state, uptodate);
++ end, NULL, uptodate);
+ if (ret)
+ uptodate = 0;
+ }
+@@ -1430,9 +1431,8 @@ static int end_bio_extent_writepage(stru
+ if (!uptodate && tree->ops &&
+ tree->ops->writepage_io_failed_hook) {
+ ret = tree->ops->writepage_io_failed_hook(bio, page,
+- start, end, state);
++ start, end, NULL);
+ if (ret == 0) {
+- state = NULL;
+ uptodate = (err == 0);
+ continue;
+ }
+@@ -1444,68 +1444,7 @@ static int end_bio_extent_writepage(stru
+ SetPageError(page);
+ }
+
+- /*
+- * bios can get merged in funny ways, and so we need to
+- * be careful with the state variable. We know the
+- * state won't be merged with others because it has
+- * WRITEBACK set, but we can't be sure each biovec is
+- * sequential in the file. So, if our cached state
+- * doesn't match the expected end, search the tree
+- * for the correct one.
+- */
+-
+- spin_lock_irqsave(&tree->lock, flags);
+- if (!state || state->end != end) {
+- state = NULL;
+- node = __etree_search(tree, start, NULL, NULL);
+- if (node) {
+- state = rb_entry(node, struct extent_state,
+- rb_node);
+- if (state->end != end ||
+- !(state->state & EXTENT_WRITEBACK))
+- state = NULL;
+- }
+- if (!state) {
+- spin_unlock_irqrestore(&tree->lock, flags);
+- clear_extent_writeback(tree, start,
+- end, GFP_ATOMIC);
+- goto next_io;
+- }
+- }
+- cur = end;
+- while(1) {
+- struct extent_state *clear = state;
+- cur = state->start;
+- node = rb_prev(&state->rb_node);
+- if (node) {
+- state = rb_entry(node,
+- struct extent_state,
+- rb_node);
+- } else {
+- state = NULL;
+- }
+-
+- clear_state_bit(tree, clear, EXTENT_WRITEBACK,
+- 1, 0);
+- if (cur == start)
+- break;
+- if (cur < start) {
+- WARN_ON(1);
+- break;
+- }
+- if (!node)
+- break;
+- }
+- /* before releasing the lock, make sure the next state
+- * variable has the expected bits set and corresponds
+- * to the correct offsets in the file
+- */
+- if (state && (state->end + 1 != start ||
+- !(state->state & EXTENT_WRITEBACK))) {
+- state = NULL;
+- }
+- spin_unlock_irqrestore(&tree->lock, flags);
+-next_io:
++ clear_extent_writeback(tree, start, end, GFP_ATOMIC);
+
+ if (whole_page)
+ end_page_writeback(page);
+@@ -1538,13 +1477,9 @@ static int end_bio_extent_readpage(struc
+ {
+ int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+- struct extent_state *state = bio->bi_private;
+- struct extent_io_tree *tree = state->tree;
+- struct rb_node *node;
++ struct extent_io_tree *tree;
+ u64 start;
+ u64 end;
+- u64 cur;
+- unsigned long flags;
+ int whole_page;
+ int ret;
+
+@@ -1555,6 +1490,8 @@ static int end_bio_extent_readpage(struc
+
+ do {
+ struct page *page = bvec->bv_page;
++ tree = &BTRFS_I(page->mapping->host)->io_tree;
++
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+@@ -1569,80 +1506,26 @@ static int end_bio_extent_readpage(struc
+
+ if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
+ ret = tree->ops->readpage_end_io_hook(page, start, end,
+- state);
++ NULL);
+ if (ret)
+ uptodate = 0;
+ }
+ if (!uptodate && tree->ops &&
+ tree->ops->readpage_io_failed_hook) {
+ ret = tree->ops->readpage_io_failed_hook(bio, page,
+- start, end, state);
++ start, end, NULL);
+ if (ret == 0) {
+- state = NULL;
+ uptodate =
+ test_bit(BIO_UPTODATE, &bio->bi_flags);
+ continue;
+ }
+ }
+
+- spin_lock_irqsave(&tree->lock, flags);
+- if (!state || state->end != end) {
+- state = NULL;
+- node = __etree_search(tree, start, NULL, NULL);
+- if (node) {
+- state = rb_entry(node, struct extent_state,
+- rb_node);
+- if (state->end != end ||
+- !(state->state & EXTENT_LOCKED))
+- state = NULL;
+- }
+- if (!state) {
+- spin_unlock_irqrestore(&tree->lock, flags);
+- if (uptodate)
+- set_extent_uptodate(tree, start, end,
+- GFP_ATOMIC);
+- unlock_extent(tree, start, end, GFP_ATOMIC);
+- goto next_io;
+- }
+- }
++ if (uptodate)
++ set_extent_uptodate(tree, start, end,
++ GFP_ATOMIC);
++ unlock_extent(tree, start, end, GFP_ATOMIC);
+
+- cur = end;
+- while(1) {
+- struct extent_state *clear = state;
+- cur = state->start;
+- node = rb_prev(&state->rb_node);
+- if (node) {
+- state = rb_entry(node,
+- struct extent_state,
+- rb_node);
+- } else {
+- state = NULL;
+- }
+- if (uptodate) {
+- set_state_cb(tree, clear, EXTENT_UPTODATE);
+- clear->state |= EXTENT_UPTODATE;
+- }
+- clear_state_bit(tree, clear, EXTENT_LOCKED,
+- 1, 0);
+- if (cur == start)
+- break;
+- if (cur < start) {
+- WARN_ON(1);
+- break;
+- }
+- if (!node)
+- break;
+- }
+- /* before releasing the lock, make sure the next state
+- * variable has the expected bits set and corresponds
+- * to the correct offsets in the file
+- */
+- if (state && (state->end + 1 != start ||
+- !(state->state & EXTENT_LOCKED))) {
+- state = NULL;
+- }
+- spin_unlock_irqrestore(&tree->lock, flags);
+-next_io:
+ if (whole_page) {
+ if (uptodate) {
+ SetPageUptodate(page);
+@@ -1682,8 +1565,7 @@ static int end_bio_extent_preparewrite(s
+ {
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+- struct extent_state *state = bio->bi_private;
+- struct extent_io_tree *tree = state->tree;
++ struct extent_io_tree *tree;
+ u64 start;
+ u64 end;
+
+@@ -1694,6 +1576,8 @@ static int end_bio_extent_preparewrite(s
+
+ do {
+ struct page *page = bvec->bv_page;
++ tree = &BTRFS_I(page->mapping->host)->io_tree;
++
+ start = ((u64)page->index << PAGE_CACHE_SHIFT) +
+ bvec->bv_offset;
+ end = start + bvec->bv_len - 1;
+@@ -1764,7 +1648,7 @@ static int submit_one_bio(int rw, struct
+ BUG_ON(state->end != end);
+ spin_unlock_irq(&tree->lock);
+
+- bio->bi_private = state;
++ bio->bi_private = NULL;
+
+ bio_get(bio);
+
+@@ -3055,7 +2939,7 @@ int read_extent_buffer_pages(struct exte
+ for (i = start_i; i < num_pages; i++) {
+ page = extent_buffer_page(eb, i);
+ if (!wait) {
+- if (TestSetPageLocked(page))
++ if (!trylock_page(page))
+ goto unlock_exit;
+ } else {
+ lock_page(page);
+diff -Nurp btrfs-0.16/extent_map.c btrfs-0.16.new/extent_map.c
+--- btrfs-0.16/extent_map.c 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/extent_map.c 2008-08-20 17:43:19.571405082 -0600
+@@ -207,7 +207,14 @@ int add_extent_mapping(struct extent_map
+ int ret = 0;
+ struct extent_map *merge = NULL;
+ struct rb_node *rb;
++ struct extent_map *exist;
+
++ exist = lookup_extent_mapping(tree, em->start, em->len);
++ if (exist) {
++ free_extent_map(exist);
++ ret = -EEXIST;
++ goto out;
++ }
+ assert_spin_locked(&tree->lock);
+ rb = tree_insert(&tree->map, em->start, &em->rb_node);
+ if (rb) {
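The add_extent_mapping() hotfix is a defensive check-then-insert: look up any mapping already covering [start, start+len) and fail with -EEXIST instead of inserting a duplicate into the tree. The same guard on a toy flat table (hypothetical names, a linear scan in place of the rb-tree):

#include <errno.h>
#include <stdio.h>

struct mapping { unsigned long start, len; };

static struct mapping map[64];
static int nmap;

/* Return 1 if [start, start+len) overlaps an existing mapping. */
static int lookup_overlap(unsigned long start, unsigned long len)
{
    for (int i = 0; i < nmap; i++)
        if (start < map[i].start + map[i].len &&
            map[i].start < start + len)
            return 1;
    return 0;
}

/* Check-then-insert, mirroring the hotfix: refuse with -EEXIST
 * rather than inserting an overlapping range. */
static int add_mapping(unsigned long start, unsigned long len)
{
    if (lookup_overlap(start, len))
        return -EEXIST;
    map[nmap].start = start;
    map[nmap].len = len;
    nmap++;
    return 0;
}

int main(void)
{
    printf("%d\n", add_mapping(0, 4096));     /* 0: inserted */
    printf("%d\n", add_mapping(2048, 4096));  /* -EEXIST: overlaps */
    return 0;
}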
+diff -Nurp btrfs-0.16/file-item.c btrfs-0.16.new/file-item.c
+--- btrfs-0.16/file-item.c 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/file-item.c 2008-08-20 17:43:19.571405082 -0600
+@@ -134,7 +134,6 @@ int btrfs_lookup_file_extent(struct btrf
+ return ret;
+ }
+
+-#if 0 /* broken */
+ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio)
+ {
+@@ -151,6 +150,8 @@ int btrfs_lookup_bio_sums(struct btrfs_r
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+
+ path = btrfs_alloc_path();
++ if (bio->bi_size > PAGE_CACHE_SIZE * 8)
++ path->reada = 2;
+
+ WARN_ON(bio->bi_vcnt <= 0);
+
+@@ -211,7 +212,6 @@ found:
+ btrfs_free_path(path);
+ return 0;
+ }
+-#endif
+
+ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
+ struct bio *bio)
+@@ -321,6 +321,7 @@ again:
+ file_key.offset = offset;
+ btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
+
++ mutex_lock(&BTRFS_I(inode)->csum_mutex);
+ item = btrfs_lookup_csum(trans, root, path, objectid, offset, 1);
+ if (!IS_ERR(item)) {
+ leaf = path->nodes[0];
+@@ -367,7 +368,7 @@ again:
+ ret = btrfs_search_slot(trans, root, &file_key, path,
+ BTRFS_CRC32_SIZE, 1);
+ if (ret < 0)
+- goto fail;
++ goto fail_unlock;
+ if (ret == 0) {
+ BUG();
+ }
+@@ -411,10 +412,10 @@ insert:
+ ret = btrfs_insert_empty_item(trans, root, path, &file_key,
+ ins_size);
+ if (ret < 0)
+- goto fail;
++ goto fail_unlock;
+ if (ret != 0) {
+ WARN_ON(1);
+- goto fail;
++ goto fail_unlock;
+ }
+ csum:
+ leaf = path->nodes[0];
+@@ -427,6 +428,8 @@ found:
+ item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
+ btrfs_item_size_nr(leaf, path->slots[0]));
+ eb_token = NULL;
++ mutex_unlock(&BTRFS_I(inode)->csum_mutex);
++ cond_resched();
+ next_sector:
+
+ if (!eb_token ||
+@@ -467,13 +470,18 @@ next_sector:
+ eb_token = NULL;
+ }
+ btrfs_mark_buffer_dirty(path->nodes[0]);
++ cond_resched();
+ if (total_bytes < sums->len) {
+ btrfs_release_path(root, path);
+ goto again;
+ }
+-fail:
++out:
+ btrfs_free_path(path);
+ return ret;
++
++fail_unlock:
++ mutex_unlock(&BTRFS_I(inode)->csum_mutex);
++ goto out;
+ }
+
+ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
+diff -Nurp btrfs-0.16/inode.c btrfs-0.16.new/inode.c
+--- btrfs-0.16/inode.c 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/inode.c 2008-08-20 17:43:19.601409609 -0600
+@@ -389,15 +389,15 @@ int btrfs_submit_bio_hook(struct inode *
+ ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+ BUG_ON(ret);
+
+- if (!(rw & (1 << BIO_RW))) {
+- goto mapit;
+- }
+-
+ if (btrfs_test_opt(root, NODATASUM) ||
+ btrfs_test_flag(inode, NODATASUM)) {
+ goto mapit;
+ }
+
++ if (!(rw & (1 << BIO_RW))) {
++ btrfs_lookup_bio_sums(root, inode, bio);
++ goto mapit;
++ }
+ return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+ inode, rw, bio, mirror_num,
+ __btrfs_submit_bio_hook);
+@@ -415,10 +415,8 @@ static noinline int add_pending_csums(st
+ btrfs_set_trans_block_group(trans, inode);
+ list_for_each(cur, list) {
+ sum = list_entry(cur, struct btrfs_ordered_sum, list);
+- mutex_lock(&BTRFS_I(inode)->csum_mutex);
+ btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
+ inode, sum);
+- mutex_unlock(&BTRFS_I(inode)->csum_mutex);
+ }
+ return 0;
+ }
+@@ -605,58 +603,6 @@ int btrfs_writepage_end_io_hook(struct p
+ return btrfs_finish_ordered_io(page->mapping->host, start, end);
+ }
+
+-int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
+-{
+- int ret = 0;
+- struct inode *inode = page->mapping->host;
+- struct btrfs_root *root = BTRFS_I(inode)->root;
+- struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+- struct btrfs_csum_item *item;
+- struct btrfs_path *path = NULL;
+- u32 csum;
+-
+- if (btrfs_test_opt(root, NODATASUM) ||
+- btrfs_test_flag(inode, NODATASUM))
+- return 0;
+-
+- /*
+- * It is possible there is an ordered extent that has
+- * not yet finished for this range in the file. If so,
+- * that extent will have a csum cached, and it will insert
+- * the sum after all the blocks in the extent are fully
+- * on disk. So, look for an ordered extent and use the
+- * sum if found. We have to do this before looking in the
+- * btree because csum items are pre-inserted based on
+- * the file size. btrfs_lookup_csum might find an item
+- * that still hasn't been fully filled.
+- */
+- ret = btrfs_find_ordered_sum(inode, start, &csum);
+- if (ret == 0)
+- goto found;
+-
+- ret = 0;
+- path = btrfs_alloc_path();
+- item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
+- if (IS_ERR(item)) {
+- ret = PTR_ERR(item);
+- /* a csum that isn't present is a preallocated region. */
+- if (ret == -ENOENT || ret == -EFBIG)
+- ret = 0;
+- csum = 0;
+- printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
+- start);
+- goto out;
+- }
+- read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
+- BTRFS_CRC32_SIZE);
+-found:
+- set_state_private(io_tree, start, csum);
+-out:
+- if (path)
+- btrfs_free_path(path);
+- return ret;
+-}
+-
+ struct io_failure_record {
+ struct page *page;
+ u64 start;
+@@ -1655,8 +1601,20 @@ static int btrfs_setattr(struct dentry *
+ btrfs_truncate_page(inode->i_mapping, inode->i_size);
+
+ hole_size = block_end - hole_start;
+- btrfs_wait_ordered_range(inode, hole_start, hole_size);
+- lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
++ while(1) {
++ struct btrfs_ordered_extent *ordered;
++ btrfs_wait_ordered_range(inode, hole_start, hole_size);
++
++ lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
++ ordered = btrfs_lookup_ordered_extent(inode, hole_start);
++ if (ordered) {
++ unlock_extent(io_tree, hole_start,
++ block_end - 1, GFP_NOFS);
++ btrfs_put_ordered_extent(ordered);
++ } else {
++ break;
++ }
++ }
+
+ trans = btrfs_start_transaction(root, 1);
+ btrfs_set_trans_block_group(trans, inode);
+@@ -1833,6 +1791,7 @@ static int btrfs_init_locked_inode(struc
+ inode->i_ino = args->ino;
+ BTRFS_I(inode)->root = args->root;
+ BTRFS_I(inode)->delalloc_bytes = 0;
++ inode->i_mapping->writeback_index = 0;
+ BTRFS_I(inode)->disk_i_size = 0;
+ BTRFS_I(inode)->index_cnt = (u64)-1;
+ extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
+@@ -2239,6 +2198,7 @@ static struct inode *btrfs_new_inode(str
+ mutex_init(&BTRFS_I(inode)->csum_mutex);
+ mutex_init(&BTRFS_I(inode)->extent_mutex);
+ BTRFS_I(inode)->delalloc_bytes = 0;
++ inode->i_mapping->writeback_index = 0;
+ BTRFS_I(inode)->disk_i_size = 0;
+ BTRFS_I(inode)->root = root;
+
+@@ -2486,6 +2446,7 @@ static int btrfs_create(struct inode *di
+ mutex_init(&BTRFS_I(inode)->extent_mutex);
+ BTRFS_I(inode)->delalloc_bytes = 0;
+ BTRFS_I(inode)->disk_i_size = 0;
++ inode->i_mapping->writeback_index = 0;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
+ }
+@@ -3549,6 +3510,7 @@ static int btrfs_symlink(struct inode *d
+ mutex_init(&BTRFS_I(inode)->extent_mutex);
+ BTRFS_I(inode)->delalloc_bytes = 0;
+ BTRFS_I(inode)->disk_i_size = 0;
++ inode->i_mapping->writeback_index = 0;
+ BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+ btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
+ }
+@@ -3654,7 +3616,6 @@ static struct extent_io_ops btrfs_extent
+ .fill_delalloc = run_delalloc_range,
+ .submit_bio_hook = btrfs_submit_bio_hook,
+ .merge_bio_hook = btrfs_merge_bio_hook,
+- .readpage_io_hook = btrfs_readpage_io_hook,
+ .readpage_end_io_hook = btrfs_readpage_end_io_hook,
+ .writepage_end_io_hook = btrfs_writepage_end_io_hook,
+ .writepage_start_hook = btrfs_writepage_start_hook,
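The btrfs_setattr() hunk above replaces wait-then-lock with a re-check loop: wait for ordered IO, take the extent lock, and if an ordered extent snuck in between the wait and the lock, drop the lock and retry. The control flow in skeleton form (stub functions stand in for the btrfs calls; only the shape is meant to match):

#include <stdbool.h>
#include <stdio.h>

/* Stubs playing the roles of the btrfs primitives. */
static int racer = 2;   /* pretend two ordered extents race in, then stop */
static void wait_ordered(void)    { }
static void lock_range(void)      { }
static void unlock_range(void)    { }
static bool ordered_pending(void) { return racer-- > 0; }

int main(void)
{
    for (;;) {
        wait_ordered();              /* btrfs_wait_ordered_range() */
        lock_range();                /* lock_extent() */
        if (!ordered_pending())      /* btrfs_lookup_ordered_extent() */
            break;                   /* lock held, range is quiescent */
        unlock_range();              /* a new ordered extent raced in: */
        printf("raced, retrying\n"); /* drop the lock and go again */
    }
    /* ... punch the hole under the lock ... */
    unlock_range();
    return 0;
}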
+diff -Nurp btrfs-0.16/transaction.c btrfs-0.16.new/transaction.c
+--- btrfs-0.16/transaction.c 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/transaction.c 2008-08-20 17:43:19.641404933 -0600
+@@ -303,12 +303,12 @@ int btrfs_write_and_wait_transaction(str
+ struct btrfs_root *root)
+ {
+ int ret;
+- int err;
++ int err = 0;
+ int werr = 0;
+ struct extent_io_tree *dirty_pages;
+ struct page *page;
+ struct inode *btree_inode = root->fs_info->btree_inode;
+- u64 start;
++ u64 start = 0;
+ u64 end;
+ unsigned long index;
+
+@@ -317,12 +317,13 @@ int btrfs_write_and_wait_transaction(str
+ }
+ dirty_pages = &trans->transaction->dirty_pages;
+ while(1) {
+- ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
++ ret = find_first_extent_bit(dirty_pages, start, &start, &end,
+ EXTENT_DIRTY);
+ if (ret)
+ break;
+- clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
+ while(start <= end) {
++ cond_resched();
++
+ index = start >> PAGE_CACHE_SHIFT;
+ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
+ page = find_lock_page(btree_inode->i_mapping, index);
+@@ -343,7 +344,30 @@ int btrfs_write_and_wait_transaction(str
+ page_cache_release(page);
+ }
+ }
+- err = filemap_fdatawait(btree_inode->i_mapping);
++ while(1) {
++ ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
++ EXTENT_DIRTY);
++ if (ret)
++ break;
++
++ clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
++ while(start <= end) {
++ index = start >> PAGE_CACHE_SHIFT;
++ start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
++ page = find_get_page(btree_inode->i_mapping, index);
++ if (!page)
++ continue;
++ if (PageDirty(page)) {
++ lock_page(page);
++ err = write_one_page(page, 0);
++ if (err)
++ werr = err;
++ }
++ wait_on_page_writeback(page);
++ page_cache_release(page);
++ cond_resched();
++ }
++ }
+ if (err)
+ werr = err;
+ return werr;
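The rewritten btrfs_write_and_wait_transaction() splits one clear-as-you-go loop into two passes: the first writes dirty ranges while resuming each search from the previous end (rather than restarting at 0) so it terminates without clearing anything, and the second restarts from 0, clears the dirty bits, and waits on writeback. A toy model of the resume-from-last-end search in pass one (invented data and names):

#include <stdio.h>

struct range { unsigned long start, end; int dirty; };
static struct range ranges[] = {
    { 0, 4095, 1 }, { 8192, 12287, 1 }, { 20480, 24575, 1 },
};

/* find_first_extent_bit() analog: first dirty range at or after *start. */
static int find_first_dirty(unsigned long *start, unsigned long *end)
{
    for (unsigned i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
        if (ranges[i].dirty && ranges[i].end >= *start) {
            *start = ranges[i].start > *start ? ranges[i].start : *start;
            *end = ranges[i].end;
            return 0;
        }
    return 1;   /* none left */
}

int main(void)
{
    unsigned long start = 0, end;

    /* Pass 1: write everything, resuming the search past each range
     * found instead of restarting at 0 (the hotfix), so the loop
     * terminates even though nothing has been cleared yet. */
    while (!find_first_dirty(&start, &end)) {
        printf("write %lu-%lu\n", start, end);
        start = end + 1;
    }
    /* Pass 2 (not shown): restart at 0, clear the dirty bits and wait
     * for writeback, as the second while(1) in the patch does. */
    return 0;
}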
+diff -Nurp btrfs-0.16/volumes.c btrfs-0.16.new/volumes.c
+--- btrfs-0.16/volumes.c 2008-08-05 12:13:37.000000000 -0600
++++ btrfs-0.16.new/volumes.c 2008-08-20 17:43:19.651405254 -0600
+@@ -138,12 +138,18 @@ int run_scheduled_bios(struct btrfs_devi
+ {
+ struct bio *pending;
+ struct backing_dev_info *bdi;
++ struct btrfs_fs_info *fs_info;
+ struct bio *tail;
+ struct bio *cur;
+ int again = 0;
+ unsigned long num_run = 0;
++ unsigned long limit;
+
+ bdi = device->bdev->bd_inode->i_mapping->backing_dev_info;
++ fs_info = device->dev_root->fs_info;
++ limit = btrfs_async_submit_limit(fs_info);
++ limit = limit * 2 / 3;
++
+ loop:
+ spin_lock(&device->io_lock);
+
+@@ -179,7 +185,11 @@ loop:
+ cur = pending;
+ pending = pending->bi_next;
+ cur->bi_next = NULL;
+- atomic_dec(&device->dev_root->fs_info->nr_async_submits);
++ atomic_dec(&fs_info->nr_async_bios);
++
++ if (atomic_read(&fs_info->nr_async_bios) < limit &&
++ waitqueue_active(&fs_info->async_submit_wait))
++ wake_up(&fs_info->async_submit_wait);
+
+ BUG_ON(atomic_read(&cur->bi_cnt) == 0);
+ bio_get(cur);
+@@ -2135,6 +2145,7 @@ int schedule_bio(struct btrfs_root *root
+ int rw, struct bio *bio)
+ {
+ int should_queue = 1;
++ unsigned long limit;
+
+ /* don't bother with additional async steps for reads, right now */
+ if (!(rw & (1 << BIO_RW))) {
+@@ -2145,12 +2156,12 @@ int schedule_bio(struct btrfs_root *root
+ }
+
+ /*
+- * nr_async_sumbits allows us to reliably return congestion to the
++ * nr_async_bios allows us to reliably return congestion to the
+ * higher layers. Otherwise, the async bio makes it appear we have
+ * made progress against dirty pages when we've really just put it
+ * on a queue for later
+ */
+- atomic_inc(&root->fs_info->nr_async_submits);
++ atomic_inc(&root->fs_info->nr_async_bios);
+ WARN_ON(bio->bi_next);
+ bio->bi_next = NULL;
+ bio->bi_rw |= rw;
+@@ -2171,6 +2182,11 @@ int schedule_bio(struct btrfs_root *root
+ if (should_queue)
+ btrfs_queue_worker(&root->fs_info->submit_workers,
+ &device->work);
++
++ limit = btrfs_async_submit_limit(root->fs_info);
++ wait_event_timeout(root->fs_info->async_submit_wait,
++ (atomic_read(&root->fs_info->nr_async_bios) < limit),
++ HZ/10);
+ return 0;
+ }
+