295 files changed, 7581 insertions, 5590 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index c295ba7..f0c7de7 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -41,8 +41,8 @@ extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
 extern const struct file_operations v9fs_dir_operations;
-extern struct dentry_operations v9fs_dentry_operations;
-extern struct dentry_operations v9fs_cached_dentry_operations;
+extern const struct dentry_operations v9fs_dentry_operations;
+extern const struct dentry_operations v9fs_cached_dentry_operations;
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode);
 ino_t v9fs_qid2ino(struct p9_qid *qid);
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 06dcc7c..d743252 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -104,12 +104,12 @@ void v9fs_dentry_release(struct dentry *dentry)
 	}
 }
 
-struct dentry_operations v9fs_cached_dentry_operations = {
+const struct dentry_operations v9fs_cached_dentry_operations = {
 	.d_delete = v9fs_cached_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
 
-struct dentry_operations v9fs_dentry_operations = {
+const struct dentry_operations v9fs_dentry_operations = {
 	.d_delete = v9fs_dentry_delete,
 	.d_release = v9fs_dentry_release,
 };
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 93212e4..5f8ab8a 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -168,8 +168,9 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	p9stat_free(st);
 	kfree(st);
 
-P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n");
-	return simple_set_mnt(mnt, sb);
+P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
+	simple_set_mnt(mnt, sb);
+	return 0;
 
 release_sb:
 	if (sb) {
diff --git a/fs/Kconfig b/fs/Kconfig
index 93945dd..cef8b18 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -56,61 +56,7 @@ endif # BLOCK
 
 source "fs/notify/Kconfig"
 
-config QUOTA
-	bool "Quota support"
-	help
-	  If you say Y here, you will be able to set per user limits for disk
-	  usage (also called disk quotas). Currently, it works for the
-	  ext2, ext3, and reiserfs file system. ext3 also supports journalled
-	  quotas for which you don't need to run quotacheck(8) after an unclean
-	  shutdown.
-	  For further details, read the Quota mini-HOWTO, available from
-	  <http://www.tldp.org/docs.html#howto>, or the documentation provided
-	  with the quota tools. Probably the quota support is only useful for
-	  multi user systems. If unsure, say N.
-
-config QUOTA_NETLINK_INTERFACE
-	bool "Report quota messages through netlink interface"
-	depends on QUOTA && NET
-	help
-	  If you say Y here, quota warnings (about exceeding softlimit, reaching
-	  hardlimit, etc.) will be reported through netlink interface. If unsure,
-	  say Y.
-
-config PRINT_QUOTA_WARNING
-	bool "Print quota warnings to console (OBSOLETE)"
-	depends on QUOTA
-	default y
-	help
-	  If you say Y here, quota warnings (about exceeding softlimit, reaching
-	  hardlimit, etc.) will be printed to the process' controlling terminal.
-	  Note that this behavior is currently deprecated and may go away in
-	  future. Please use notification via netlink socket instead.
-
-# Generic support for tree structured quota files. Seleted when needed.
-config QUOTA_TREE
-	 tristate
-
-config QFMT_V1
-	tristate "Old quota format support"
-	depends on QUOTA
-	help
-	  This quota format was (is) used by kernels earlier than 2.4.22. If
-	  you have quota working and you don't want to convert to new quota
-	  format say Y here.
-
-config QFMT_V2
-	tristate "Quota format v2 support"
-	depends on QUOTA
-	select QUOTA_TREE
-	help
-	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
-	  need this functionality say Y here.
-
-config QUOTACTL
-	bool
-	depends on XFS_QUOTA || QUOTA
-	default y
+source "fs/quota/Kconfig"
 
 source "fs/autofs/Kconfig"
 source "fs/autofs4/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 38bc735..6e82a30 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -51,11 +51,7 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 
-obj-$(CONFIG_QUOTA)		+= dquot.o
-obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
-obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
-obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
-obj-$(CONFIG_QUOTACTL)		+= quota.o
+obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
@@ -69,10 +65,12 @@ obj-$(CONFIG_DLM)		+= dlm/
 # Do not add any filesystems before this line
 obj-$(CONFIG_REISERFS_FS)	+= reiserfs/
 obj-$(CONFIG_EXT3_FS)		+= ext3/ # Before ext2 so root fs can be ext3
-obj-$(CONFIG_EXT4_FS)		+= ext4/ # Before ext2 so root fs can be ext4
+obj-$(CONFIG_EXT2_FS)		+= ext2/
+# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
+# unless explicitly requested by rootfstype
+obj-$(CONFIG_EXT4_FS)		+= ext4/
 obj-$(CONFIG_JBD)		+= jbd/
 obj-$(CONFIG_JBD2)		+= jbd2/
-obj-$(CONFIG_EXT2_FS)		+= ext2/
 obj-$(CONFIG_CRAMFS)		+= cramfs/
 obj-$(CONFIG_SQUASHFS)		+= squashfs/
 obj-y				+= ramfs/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 8311575..e0a85db 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -86,7 +86,7 @@ void __adfs_error(struct super_block *sb, const char *function,
 /* dir_*.c */
 extern const struct inode_operations adfs_dir_inode_operations;
 extern const struct file_operations adfs_dir_operations;
-extern struct dentry_operations adfs_dentry_operations;
+extern const struct dentry_operations adfs_dentry_operations;
 extern struct adfs_dir_ops adfs_f_dir_ops;
 extern struct adfs_dir_ops adfs_fplus_dir_ops;
 
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 85a30e9..e867ccf 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -263,7 +263,7 @@ adfs_compare(struct dentry *parent, struct qstr *entry, struct qstr *name)
 	return 0;
 }
 
-struct dentry_operations adfs_dentry_operations = {
+const struct dentry_operations adfs_dentry_operations = {
 	.d_hash		= adfs_hash,
 	.d_compare	= adfs_compare,
 };
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e9ec915..1a2d5e3 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -199,8 +199,7 @@ extern const struct address_space_operations	 affs_symlink_aops;
 extern const struct address_space_operations	 affs_aops;
 extern const struct address_space_operations	 affs_aops_ofs;
 
-extern struct dentry_operations	 affs_dentry_operations;
-extern struct dentry_operations	 affs_dentry_operations_intl;
+extern const struct dentry_operations	 affs_dentry_operations;
 
 static inline void
 affs_set_blocksize(struct super_block *sb, int size)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 8055730..7d0f0a3 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -179,14 +179,18 @@ affs_remove_link(struct dentry *dentry)
 		affs_lock_dir(dir);
 		affs_fix_dcache(dentry, link_ino);
 		retval = affs_remove_hash(dir, link_bh);
-		if (retval)
+		if (retval) {
+			affs_unlock_dir(dir);
 			goto done;
+		}
 		mark_buffer_dirty_inode(link_bh, inode);
 
 		memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32);
 		retval = affs_insert_hash(dir, bh);
-		if (retval)
+		if (retval) {
+			affs_unlock_dir(dir);
 			goto done;
+		}
 		mark_buffer_dirty_inode(bh, inode);
 
 		affs_unlock_dir(dir);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index cfcf1b6..960d336 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -19,12 +19,12 @@ static int	 affs_intl_toupper(int ch);
 static int	 affs_intl_hash_dentry(struct dentry *, struct qstr *);
 static int       affs_intl_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
 
-struct dentry_operations affs_dentry_operations = {
+const struct dentry_operations affs_dentry_operations = {
 	.d_hash		= affs_hash_dentry,
 	.d_compare	= affs_compare_dentry,
 };
 
-static struct dentry_operations affs_intl_dentry_operations = {
+static const struct dentry_operations affs_intl_dentry_operations = {
 	.d_hash		= affs_intl_hash_dentry,
 	.d_compare	= affs_intl_compare_dentry,
 };
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 99cf390..9bd7577 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -62,7 +62,7 @@ const struct inode_operations afs_dir_inode_operations = {
 	.setattr	= afs_setattr,
 };
 
-static struct dentry_operations afs_fs_dentry_operations = {
+static const struct dentry_operations afs_fs_dentry_operations = {
 	.d_revalidate	= afs_d_revalidate,
 	.d_delete	= afs_d_delete,
 	.d_release	= afs_d_release,
diff --git a/fs/aio.c b/fs/aio.c
index 8fa77e2..76da125 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -443,7 +443,7 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx)
 	req->private = NULL;
 	req->ki_iovec = NULL;
 	INIT_LIST_HEAD(&req->ki_run_list);
-	req->ki_eventfd = ERR_PTR(-EINVAL);
+	req->ki_eventfd = NULL;
 
 	/* Check if the completion queue has enough free space to
 	 * accept an event from this io.
@@ -485,8 +485,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
 {
 	assert_spin_locked(&ctx->ctx_lock);
 
-	if (!IS_ERR(req->ki_eventfd))
-		fput(req->ki_eventfd);
 	if (req->ki_dtor)
 		req->ki_dtor(req);
 	if (req->ki_iovec != &req->ki_inline_vec)
@@ -508,8 +506,11 @@ static void aio_fput_routine(struct work_struct *data)
 		list_del(&req->ki_list);
 		spin_unlock_irq(&fput_lock);
 
-		/* Complete the fput */
-		__fput(req->ki_filp);
+		/* Complete the fput(s) */
+		if (req->ki_filp != NULL)
+			__fput(req->ki_filp);
+		if (req->ki_eventfd != NULL)
+			__fput(req->ki_eventfd);
 
 		/* Link the iocb into the context's free list */
 		spin_lock_irq(&ctx->ctx_lock);
@@ -527,12 +528,14 @@ static void aio_fput_routine(struct work_struct *data)
  */
 static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 {
+	int schedule_putreq = 0;
+
 	dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n",
 		req, atomic_long_read(&req->ki_filp->f_count));
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	req->ki_users --;
+	req->ki_users--;
 	BUG_ON(req->ki_users < 0);
 	if (likely(req->ki_users))
 		return 0;
@@ -540,10 +543,23 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 	req->ki_cancel = NULL;
 	req->ki_retry = NULL;
 
-	/* Must be done under the lock to serialise against cancellation.
-	 * Call this aio_fput as it duplicates fput via the fput_work.
+	/*
+	 * Try to optimize the aio and eventfd file* puts, by avoiding to
+	 * schedule work in case it is not __fput() time. In normal cases,
+	 * we would not be holding the last reference to the file*, so
+	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count)))
+		schedule_putreq++;
+	else
+		req->ki_filp = NULL;
+	if (req->ki_eventfd != NULL) {
+		if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count)))
+			schedule_putreq++;
+		else
+			req->ki_eventfd = NULL;
+	}
+	if (unlikely(schedule_putreq)) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
@@ -571,7 +587,7 @@ int aio_put_req(struct kiocb *req)
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
 	struct mm_struct *mm = current->mm;
-	struct kioctx *ctx = NULL;
+	struct kioctx *ctx, *ret = NULL;
 	struct hlist_node *n;
 
 	rcu_read_lock();
@@ -579,12 +595,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
 		if (ctx->user_id == ctx_id && !ctx->dead) {
 			get_ioctx(ctx);
+			ret = ctx;
 			break;
 		}
 	}
 
 	rcu_read_unlock();
-	return ctx;
+	return ret;
 }
 
 /*
@@ -1009,7 +1026,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2)
 	 * eventfd. The eventfd_signal() function is safe to be called
 	 * from IRQ context.
 	 */
-	if (!IS_ERR(iocb->ki_eventfd))
+	if (iocb->ki_eventfd != NULL)
 		eventfd_signal(iocb->ki_eventfd, 1);
 
 put_rq:
@@ -1608,6 +1625,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd);
 		if (IS_ERR(req->ki_eventfd)) {
 			ret = PTR_ERR(req->ki_eventfd);
+			req->ki_eventfd = NULL;
 			goto out_put_req;
 		}
 	}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 3bbdb9d..1dd96d4 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -48,7 +48,7 @@ static struct file_system_type anon_inode_fs_type = {
 	.get_sb		= anon_inodefs_get_sb,
 	.kill_sb	= kill_anon_super,
 };
-static struct dentry_operations anon_inodefs_dentry_operations = {
+static const struct dentry_operations anon_inodefs_dentry_operations = {
 	.d_delete	= anon_inodefs_delete_dentry,
 };
 
diff --git a/fs/attr.c b/fs/attr.c
index f436019..9fe1b1b 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -173,7 +173,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 		if (!error) {
 			if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
 			    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid))
-				error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+				error = vfs_dq_transfer(inode, attr) ?
+					-EDQUOT : 0;
 			if (!error)
 				error = inode_setattr(inode, attr);
 		}
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8aacade..4a1401c 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -192,7 +192,7 @@ static int autofs_revalidate(struct dentry * dentry, struct nameidata *nd)
 	return 1;
 }
 
-static struct dentry_operations autofs_dentry_operations = {
+static const struct dentry_operations autofs_dentry_operations = {
 	.d_revalidate	= autofs_revalidate,
 };
 
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 716e12b..69c8142 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -310,7 +310,7 @@ static struct autofs_info *autofs4_mkroot(struct autofs_sb_info *sbi)
 	return ino;
 }
 
-static struct dentry_operations autofs4_sb_dentry_operations = {
+static const struct dentry_operations autofs4_sb_dentry_operations = {
 	.d_release      = autofs4_dentry_release,
 };
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2a41c2a..74b1469 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -349,13 +349,13 @@ void autofs4_dentry_release(struct dentry *de)
 }
 
 /* For dentries of directories in the root dir */
-static struct dentry_operations autofs4_root_dentry_operations = {
+static const struct dentry_operations autofs4_root_dentry_operations = {
 	.d_revalidate	= autofs4_revalidate,
 	.d_release	= autofs4_dentry_release,
 };
 
 /* For other dentries */
-static struct dentry_operations autofs4_dentry_operations = {
+static const struct dentry_operations autofs4_dentry_operations = {
 	.d_revalidate	= autofs4_revalidate,
 	.d_release	= autofs4_dentry_release,
 };
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e3ff2b9..33b7235 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1208,9 +1208,11 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 	 * check for an ELF header.  If we find one, dump the first page to
 	 * aid in determining what was mapped here.
 	 */
-	if (FILTER(ELF_HEADERS) && vma->vm_file != NULL && vma->vm_pgoff == 0) {
+	if (FILTER(ELF_HEADERS) &&
+	    vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
 		u32 __user *header = (u32 __user *) vma->vm_start;
 		u32 word;
+		mm_segment_t fs = get_fs();
 		/*
 		 * Doing it this way gets the constant folded by GCC.
 		 */
@@ -1223,7 +1225,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
 		magic.elfmag[EI_MAG1] = ELFMAG1;
 		magic.elfmag[EI_MAG2] = ELFMAG2;
 		magic.elfmag[EI_MAG3] = ELFMAG3;
-		if (get_user(word, header) == 0 && word == magic.cmp)
+		/*
+		 * Switch to the user "segment" for get_user(),
+		 * then put back what elf_core_dump() had in place.
+		 */
+		set_fs(USER_DS);
+		if (unlikely(get_user(word, header)))
+			word = 0;
+		set_fs(fs);
+		if (word == magic.cmp)
 			return PAGE_SIZE;
 	}
 
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 549b014..31c46a2 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -26,23 +26,23 @@
 #include <linux/workqueue.h>
 
 static struct kmem_cache *bio_integrity_slab __read_mostly;
+static mempool_t *bio_integrity_pool;
+static struct bio_set *integrity_bio_set;
 static struct workqueue_struct *kintegrityd_wq;
 
 /**
- * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
  * @bio:	bio to attach integrity metadata to
  * @gfp_mask:	Memory allocation mask
  * @nr_vecs:	Number of integrity metadata scatter-gather elements
- * @bs:		bio_set to allocate from
  *
  * Description: This function prepares a bio for attaching integrity
  * metadata.  nr_vecs specifies the maximum number of pages containing
  * integrity metadata that can be attached.
  */
-struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
-							 gfp_t gfp_mask,
-							 unsigned int nr_vecs,
-							 struct bio_set *bs)
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+						  gfp_t gfp_mask,
+						  unsigned int nr_vecs)
 {
 	struct bio_integrity_payload *bip;
 	struct bio_vec *iv;
@@ -50,7 +50,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	BUG_ON(bio == NULL);
 
-	bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+	bip = mempool_alloc(bio_integrity_pool, gfp_mask);
 	if (unlikely(bip == NULL)) {
 		printk(KERN_ERR "%s: could not alloc bip\n", __func__);
 		return NULL;
@@ -58,10 +58,10 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	memset(bip, 0, sizeof(*bip));
 
-	iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs);
+	iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, integrity_bio_set);
 	if (unlikely(iv == NULL)) {
 		printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
-		mempool_free(bip, bs->bio_integrity_pool);
+		mempool_free(bip, bio_integrity_pool);
 		return NULL;
 	}
 
@@ -72,35 +72,16 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
 
 	return bip;
 }
-EXPORT_SYMBOL(bio_integrity_alloc_bioset);
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio:	bio to attach integrity metadata to
- * @gfp_mask:	Memory allocation mask
- * @nr_vecs:	Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata.  nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
-						  gfp_t gfp_mask,
-						  unsigned int nr_vecs)
-{
-	return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
-}
 EXPORT_SYMBOL(bio_integrity_alloc);
 
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
- * @bs:		bio_set this bio was allocated from
  *
  * Description: Used to free the integrity portion of a bio. Usually
  * called from bio_free().
  */
-void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+void bio_integrity_free(struct bio *bio)
 {
 	struct bio_integrity_payload *bip = bio->bi_integrity;
 
@@ -111,8 +92,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
 	    && bip->bip_buf != NULL)
 		kfree(bip->bip_buf);
 
-	bvec_free_bs(bs, bip->bip_vec, bip->bip_pool);
-	mempool_free(bip, bs->bio_integrity_pool);
+	bvec_free_bs(integrity_bio_set, bip->bip_vec, bip->bip_pool);
+	mempool_free(bip, bio_integrity_pool);
 
 	bio->bi_integrity = NULL;
 }
@@ -685,19 +666,18 @@ EXPORT_SYMBOL(bio_integrity_split);
  * bio_integrity_clone - Callback for cloning bios with integrity metadata
  * @bio:	New bio
  * @bio_src:	Original bio
- * @bs:		bio_set to allocate bip from
+ * @gfp_mask:	Memory allocation mask
  *
  * Description:	Called to allocate a bip when cloning a bio
  */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
-			struct bio_set *bs)
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask)
 {
 	struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
 	struct bio_integrity_payload *bip;
 
 	BUG_ON(bip_src == NULL);
 
-	bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
+	bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
 
 	if (bip == NULL)
 		return -EIO;
@@ -713,37 +693,25 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
 }
 EXPORT_SYMBOL(bio_integrity_clone);
 
-int bioset_integrity_create(struct bio_set *bs, int pool_size)
+static int __init bio_integrity_init(void)
 {
-	bs->bio_integrity_pool = mempool_create_slab_pool(pool_size,
-							  bio_integrity_slab);
-	if (!bs->bio_integrity_pool)
-		return -1;
-
-	return 0;
-}
-EXPORT_SYMBOL(bioset_integrity_create);
+	kintegrityd_wq = create_workqueue("kintegrityd");
 
-void bioset_integrity_free(struct bio_set *bs)
-{
-	if (bs->bio_integrity_pool)
-		mempool_destroy(bs->bio_integrity_pool);
-}
-EXPORT_SYMBOL(bioset_integrity_free);
+	if (!kintegrityd_wq)
+		panic("Failed to create kintegrityd\n");
 
-void __init bio_integrity_init_slab(void)
-{
 	bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
 					SLAB_HWCACHE_ALIGN|SLAB_PANIC);
-}
 
-static int __init integrity_init(void)
-{
-	kintegrityd_wq = create_workqueue("kintegrityd");
+	bio_integrity_pool = mempool_create_slab_pool(BIO_POOL_SIZE,
+						      bio_integrity_slab);
+	if (!bio_integrity_pool)
+		panic("bio_integrity: can't allocate bip pool\n");
 
-	if (!kintegrityd_wq)
-		panic("Failed to create kintegrityd\n");
+	integrity_bio_set = bioset_create(BIO_POOL_SIZE, 0);
+	if (!integrity_bio_set)
+		panic("bio_integrity: can't allocate bio_set\n");
 
 	return 0;
 }
-subsys_initcall(integrity_init);
+subsys_initcall(bio_integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 062299a..a040cde 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -248,7 +248,7 @@ void bio_free(struct bio *bio, struct bio_set *bs)
 		bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
 	if (bio_integrity(bio))
-		bio_integrity_free(bio, bs);
+		bio_integrity_free(bio);
 
 	/*
 	 * If we have front padding, adjust the bio pointer before freeing
@@ -301,47 +301,51 @@ void bio_init(struct bio *bio)
  **/
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+	struct bio_vec *bvl = NULL;
 	struct bio *bio = NULL;
+	unsigned long idx = 0;
+	void *p = NULL;
 
 	if (bs) {
-		void *p = mempool_alloc(bs->bio_pool, gfp_mask);
-
-		if (p)
-			bio = p + bs->front_pad;
-	} else
+		p = mempool_alloc(bs->bio_pool, gfp_mask);
+		if (!p)
+			goto err;
+		bio = p + bs->front_pad;
+	} else {
 		bio = kmalloc(sizeof(*bio), gfp_mask);
+		if (!bio)
+			goto err;
+	}
 
-	if (likely(bio)) {
-		struct bio_vec *bvl = NULL;
-
-		bio_init(bio);
-		if (likely(nr_iovecs)) {
-			unsigned long uninitialized_var(idx);
-
-			if (nr_iovecs <= BIO_INLINE_VECS) {
-				idx = 0;
-				bvl = bio->bi_inline_vecs;
-				nr_iovecs = BIO_INLINE_VECS;
-			} else {
-				bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx,
-							bs);
-				nr_iovecs = bvec_nr_vecs(idx);
-			}
-			if (unlikely(!bvl)) {
-				if (bs)
-					mempool_free(bio, bs->bio_pool);
-				else
-					kfree(bio);
-				bio = NULL;
-				goto out;
-			}
-			bio->bi_flags |= idx << BIO_POOL_OFFSET;
-			bio->bi_max_vecs = nr_iovecs;
-		}
-		bio->bi_io_vec = bvl;
+	bio_init(bio);
+
+	if (unlikely(!nr_iovecs))
+		goto out_set;
+
+	if (nr_iovecs <= BIO_INLINE_VECS) {
+		bvl = bio->bi_inline_vecs;
+		nr_iovecs = BIO_INLINE_VECS;
+	} else {
+		bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+		if (unlikely(!bvl))
+			goto err_free;
+
+		nr_iovecs = bvec_nr_vecs(idx);
 	}
-out:
+	bio->bi_flags |= idx << BIO_POOL_OFFSET;
+	bio->bi_max_vecs = nr_iovecs;
+out_set:
+	bio->bi_io_vec = bvl;
+
 	return bio;
+
+err_free:
+	if (bs)
+		mempool_free(p, bs->bio_pool);
+	else
+		kfree(bio);
+err:
+	return NULL;
 }
 
 struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
@@ -462,10 +466,12 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 	if (bio_integrity(bio)) {
 		int ret;
 
-		ret = bio_integrity_clone(b, bio, fs_bio_set);
+		ret = bio_integrity_clone(b, bio, gfp_mask);
 
-		if (ret < 0)
+		if (ret < 0) {
+			bio_put(b);
 			return NULL;
+		}
 	}
 
 	return b;
@@ -1523,7 +1529,6 @@ void bioset_free(struct bio_set *bs)
 	if (bs->bio_pool)
 		mempool_destroy(bs->bio_pool);
 
-	bioset_integrity_free(bs);
 	biovec_free_pools(bs);
 	bio_put_slab(bs);
 
@@ -1564,9 +1569,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 	if (!bs->bio_pool)
 		goto bad;
 
-	if (bioset_integrity_create(bs, pool_size))
-		goto bad;
-
 	if (!biovec_create_pools(bs, pool_size))
 		return bs;
 
@@ -1583,6 +1585,13 @@ static void __init biovec_init_slabs(void)
 		int size;
 		struct biovec_slab *bvs = bvec_slabs + i;
 
+#ifndef CONFIG_BLK_DEV_INTEGRITY
+		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
+			bvs->slab = NULL;
+			continue;
+		}
+#endif
+
 		size = bvs->nr_vecs * sizeof(struct bio_vec);
 		bvs->slab = kmem_cache_create(bvs->name, size, 0,
                                 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
@@ -1597,7 +1606,6 @@ static int __init init_bio(void)
 	if (!bio_slabs)
 		panic("bio: can't allocate bios\n");
 
-	bio_integrity_init_slab();
 	biovec_init_slabs();
 
 	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b3c1eff..8c3c689 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/buffer_head.h>
+#include <linux/pagevec.h>
 #include <linux/writeback.h>
 #include <linux/mpage.h>
 #include <linux/mount.h>
@@ -174,6 +175,151 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 				iov, offset, nr_segs, blkdev_get_blocks, NULL);
 }
 
+/*
+ * Write out and wait upon all the dirty data associated with a block
+ * device via its mapping.  Does not take the superblock lock.
+ */
+int sync_blockdev(struct block_device *bdev)
+{
+	int ret = 0;
+
+	if (bdev)
+		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
+	return ret;
+}
+EXPORT_SYMBOL(sync_blockdev);
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * device.   Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
+int fsync_bdev(struct block_device *bdev)
+{
+	struct super_block *sb = get_super(bdev);
+	if (sb) {
+		int res = fsync_super(sb);
+		drop_super(sb);
+		return res;
+	}
+	return sync_blockdev(bdev);
+}
+
+/**
+ * freeze_bdev  --  lock a filesystem and force it into a consistent state
+ * @bdev:	blockdevice to lock
+ *
+ * This takes the block device bd_mount_sem to make sure no new mounts
+ * happen on bdev until thaw_bdev() is called.
+ * If a superblock is found on this device, we take the s_umount semaphore
+ * on it to make sure nobody unmounts until the snapshot creation is done.
+ * The reference counter (bd_fsfreeze_count) guarantees that only the last
+ * unfreeze process can unfreeze the frozen filesystem actually when multiple
+ * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
+ * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * actually.
+ */
+struct super_block *freeze_bdev(struct block_device *bdev)
+{
+	struct super_block *sb;
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (bdev->bd_fsfreeze_count > 0) {
+		bdev->bd_fsfreeze_count++;
+		sb = get_super(bdev);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return sb;
+	}
+	bdev->bd_fsfreeze_count++;
+
+	down(&bdev->bd_mount_sem);
+	sb = get_super(bdev);
+	if (sb && !(sb->s_flags & MS_RDONLY)) {
+		sb->s_frozen = SB_FREEZE_WRITE;
+		smp_wmb();
+
+		__fsync_super(sb);
+
+		sb->s_frozen = SB_FREEZE_TRANS;
+		smp_wmb();
+
+		sync_blockdev(sb->s_bdev);
+
+		if (sb->s_op->freeze_fs) {
+			error = sb->s_op->freeze_fs(sb);
+			if (error) {
+				printk(KERN_ERR
+					"VFS:Filesystem freeze failed\n");
+				sb->s_frozen = SB_UNFROZEN;
+				drop_super(sb);
+				up(&bdev->bd_mount_sem);
+				bdev->bd_fsfreeze_count--;
+				mutex_unlock(&bdev->bd_fsfreeze_mutex);
+				return ERR_PTR(error);
+			}
+		}
+	}
+
+	sync_blockdev(bdev);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+
+	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
+}
+EXPORT_SYMBOL(freeze_bdev);
+
+/**
+ * thaw_bdev  -- unlock filesystem
+ * @bdev:	blockdevice to unlock
+ * @sb:		associated superblock
+ *
+ * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ */
+int thaw_bdev(struct block_device *bdev, struct super_block *sb)
+{
+	int error = 0;
+
+	mutex_lock(&bdev->bd_fsfreeze_mutex);
+	if (!bdev->bd_fsfreeze_count) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return -EINVAL;
+	}
+
+	bdev->bd_fsfreeze_count--;
+	if (bdev->bd_fsfreeze_count > 0) {
+		if (sb)
+			drop_super(sb);
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return 0;
+	}
+
+	if (sb) {
+		BUG_ON(sb->s_bdev != bdev);
+		if (!(sb->s_flags & MS_RDONLY)) {
+			if (sb->s_op->unfreeze_fs) {
+				error = sb->s_op->unfreeze_fs(sb);
+				if (error) {
+					printk(KERN_ERR
+						"VFS:Filesystem thaw failed\n");
+					sb->s_frozen = SB_FREEZE_TRANS;
+					bdev->bd_fsfreeze_count++;
+					mutex_unlock(&bdev->bd_fsfreeze_mutex);
+					return error;
+				}
+			}
+			sb->s_frozen = SB_UNFROZEN;
+			smp_wmb();
+			wake_up(&sb->s_wait_unfrozen);
+		}
+		drop_super(sb);
+	}
+
+	up(&bdev->bd_mount_sem);
+	mutex_unlock(&bdev->bd_fsfreeze_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(thaw_bdev);
+
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, blkdev_get_block, wbc);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index f8fcf99..7bb3c02 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -16,3 +16,16 @@ config BTRFS_FS
 	  module will be called btrfs.
 
 	  If unsure, say N.
+
+config BTRFS_FS_POSIX_ACL
+	bool "Btrfs POSIX Access Control Lists"
+	depends on BTRFS_FS
+	select FS_POSIX_ACL
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8e2fec0..c84ca1f 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -16,11 +16,11 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/version.h>
 #include <linux/kthread.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-# include <linux/freezer.h>
+#include <linux/freezer.h>
+#include <linux/ftrace.h>
 #include "async-thread.h"
 
 #define WORK_QUEUED_BIT 0
@@ -143,6 +143,7 @@ static int worker_loop(void *arg)
 	struct btrfs_work *work;
 	do {
 		spin_lock_irq(&worker->lock);
+again_locked:
 		while (!list_empty(&worker->pending)) {
 			cur = worker->pending.next;
 			work = list_entry(cur, struct btrfs_work, list);
@@ -165,14 +166,50 @@ static int worker_loop(void *arg)
 			check_idle_worker(worker);
 
 		}
-		worker->working = 0;
 		if (freezing(current)) {
+			worker->working = 0;
+			spin_unlock_irq(&worker->lock);
 			refrigerator();
 		} else {
-			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(&worker->lock);
-			if (!kthread_should_stop())
+			if (!kthread_should_stop()) {
+				cpu_relax();
+				/*
+				 * we've dropped the lock, did someone else
+				 * jump_in?
+				 */
+				smp_mb();
+				if (!list_empty(&worker->pending))
+					continue;
+
+				/*
+				 * this short schedule allows more work to
+				 * come in without the queue functions
+				 * needing to go through wake_up_process()
+				 *
+				 * worker->working is still 1, so nobody
+				 * is going to try and wake us up
+				 */
+				schedule_timeout(1);
+				smp_mb();
+				if (!list_empty(&worker->pending))
+					continue;
+
+				/* still no more work?, sleep for real */
+				spin_lock_irq(&worker->lock);
+				set_current_state(TASK_INTERRUPTIBLE);
+				if (!list_empty(&worker->pending))
+					goto again_locked;
+
+				/*
+				 * this makes sure we get a wakeup when someone
+				 * adds something new to the queue
+				 */
+				worker->working = 0;
+				spin_unlock_irq(&worker->lock);
+
 				schedule();
+			}
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
@@ -350,13 +387,14 @@ int btrfs_requeue_work(struct btrfs_work *work)
 {
 	struct btrfs_worker_thread *worker = work->worker;
 	unsigned long flags;
+	int wake = 0;
 
 	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
 		goto out;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	atomic_inc(&worker->num_pending);
 	list_add_tail(&work->list, &worker->pending);
+	atomic_inc(&worker->num_pending);
 
 	/* by definition we're busy, take ourselves off the idle
 	 * list
@@ -368,10 +406,16 @@ int btrfs_requeue_work(struct btrfs_work *work)
 			       &worker->workers->worker_list);
 		spin_unlock_irqrestore(&worker->workers->lock, flags);
 	}
+	if (!worker->working) {
+		wake = 1;
+		worker->working = 1;
+	}
 
 	spin_unlock_irqrestore(&worker->lock, flags);
-
+	if (wake)
+		wake_up_process(worker->task);
 out:
+
 	return 0;
 }
 
@@ -398,9 +442,10 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
 	}
 
 	spin_lock_irqsave(&worker->lock, flags);
+
+	list_add_tail(&work->list, &worker->pending);
 	atomic_inc(&worker->num_pending);
 	check_busy_worker(worker);
-	list_add_tail(&work->list, &worker->pending);
 
 	/*
 	 * avoid calling into wake_up_process if this thread has already
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index a8c9693..72677ce 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,9 @@ struct btrfs_inode {
 	 */
 	struct list_head delalloc_inodes;
 
+	/* the space_info for where this inode's data allocations are done */
+	struct btrfs_space_info *space_info;
+
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
@@ -94,6 +97,11 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
+	/* total number of bytes that may be used for this inode for
+	 * delalloc
+	 */
+	u64 reserved_bytes;
+
 	/*
 	 * the size of the file stored in the metadata on disk.  data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ee848d8..ab07627 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -32,7 +32,6 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/bit_spinlock.h>
-#include <linux/version.h>
 #include <linux/pagevec.h>
 #include "compat.h"
 #include "ctree.h"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 9e46c07..37f31b5 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,22 +38,64 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int level, int slot);
 
-inline void btrfs_init_path(struct btrfs_path *p)
-{
-	memset(p, 0, sizeof(*p));
-}
-
 struct btrfs_path *btrfs_alloc_path(void)
 {
 	struct btrfs_path *path;
-	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
-	if (path) {
-		btrfs_init_path(path);
+	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
+	if (path)
 		path->reada = 1;
-	}
 	return path;
 }
 
+/*
+ * set all locked nodes in the path to blocking locks.  This should
+ * be done before scheduling
+ */
+noinline void btrfs_set_path_blocking(struct btrfs_path *p)
+{
+	int i;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
+		if (p->nodes[i] && p->locks[i])
+			btrfs_set_lock_blocking(p->nodes[i]);
+	}
+}
+
+/*
+ * reset all the locked nodes in the patch to spinning locks.
+ *
+ * held is used to keep lockdep happy, when lockdep is enabled
+ * we set held to a blocking lock before we go around and
+ * retake all the spinlocks in the path.  You can safely use NULL
+ * for held
+ */
+noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
+					struct extent_buffer *held)
+{
+	int i;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	/* lockdep really cares that we take all of these spinlocks
+	 * in the right order.  If any of the locks in the path are not
+	 * currently blocking, it is going to complain.  So, make really
+	 * really sure by forcing the path to blocking before we clear
+	 * the path blocking.
+	 */
+	if (held)
+		btrfs_set_lock_blocking(held);
+	btrfs_set_path_blocking(p);
+#endif
+
+	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
+		if (p->nodes[i] && p->locks[i])
+			btrfs_clear_lock_blocking(p->nodes[i]);
+	}
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (held)
+		btrfs_clear_lock_blocking(held);
+#endif
+}
+
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
@@ -235,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (*cow_ret == buf)
 		unlock_orig = 1;
 
-	WARN_ON(!btrfs_tree_locked(buf));
+	btrfs_assert_tree_locked(buf);
 
 	if (parent)
 		parent_start = parent->start;
@@ -261,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 						  trans->transid, level, &ins);
 		BUG_ON(ret);
 		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
-					    buf->len);
+					    buf->len, level);
 	} else {
 		cow = btrfs_alloc_free_block(trans, root, buf->len,
 					     parent_start,
@@ -272,6 +314,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
+	/* cow is set to blocking by btrfs_init_new_buffer */
+
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
@@ -388,17 +432,20 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		WARN_ON(1);
 	}
 
-	spin_lock(&root->fs_info->hash_lock);
 	if (btrfs_header_generation(buf) == trans->transid &&
 	    btrfs_header_owner(buf) == root->root_key.objectid &&
 	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 		*cow_ret = buf;
-		spin_unlock(&root->fs_info->hash_lock);
 		WARN_ON(prealloc_dest);
 		return 0;
 	}
-	spin_unlock(&root->fs_info->hash_lock);
+
 	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+
+	if (parent)
+		btrfs_set_lock_blocking(parent);
+	btrfs_set_lock_blocking(buf);
+
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0,
 				 prealloc_dest);
@@ -504,6 +551,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	if (parent_nritems == 1)
 		return 0;
 
+	btrfs_set_lock_blocking(parent);
+
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
 
@@ -564,6 +613,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			search_start = last_block;
 
 		btrfs_tree_lock(cur);
+		btrfs_set_lock_blocking(cur);
 		err = __btrfs_cow_block(trans, root, cur, parent, i,
 					&cur, search_start,
 					min(16 * blocksize,
@@ -862,6 +912,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		return 0;
 
 	mid = path->nodes[level];
+
 	WARN_ON(!path->locks[level]);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
@@ -883,8 +934,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
-		btrfs_tree_lock(child);
 		BUG_ON(!child);
+		btrfs_tree_lock(child);
+		btrfs_set_lock_blocking(child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
 		BUG_ON(ret);
 
@@ -900,6 +952,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 		add_root_to_dirty_list(root);
 		btrfs_tree_unlock(child);
+
 		path->locks[level] = 0;
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
@@ -924,6 +977,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
 		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
 		wret = btrfs_cow_block(trans, root, left,
 				       parent, pslot - 1, &left, 0);
 		if (wret) {
@@ -934,6 +988,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	right = read_node_slot(root, parent, pslot + 1);
 	if (right) {
 		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
 		wret = btrfs_cow_block(trans, root, right,
 				       parent, pslot + 1, &right, 0);
 		if (wret) {
@@ -1109,6 +1164,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 		u32 left_nr;
 
 		btrfs_tree_lock(left);
+		btrfs_set_lock_blocking(left);
+
 		left_nr = btrfs_header_nritems(left);
 		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1155,7 +1212,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	 */
 	if (right) {
 		u32 right_nr;
+
 		btrfs_tree_lock(right);
+		btrfs_set_lock_blocking(right);
+
 		right_nr = btrfs_header_nritems(right);
 		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
 			wret = 1;
@@ -1210,8 +1270,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
 	struct btrfs_disk_key disk_key;
 	u32 nritems;
 	u64 search;
-	u64 lowest_read;
-	u64 highest_read;
+	u64 target;
 	u64 nread = 0;
 	int direction = path->reada;
 	struct extent_buffer *eb;
@@ -1235,8 +1294,7 @@ static noinline void reada_for_search(struct btrfs_root *root,
 		return;
 	}
 
-	highest_read = search;
-	lowest_read = search;
+	target = search;
 
 	nritems = btrfs_header_nritems(node);
 	nr = slot;
@@ -1256,27 +1314,80 @@ static noinline void reada_for_search(struct btrfs_root *root,
 				break;
 		}
 		search = btrfs_node_blockptr(node, nr);
-		if ((search >= lowest_read && search <= highest_read) ||
-		    (search < lowest_read && lowest_read - search <= 16384) ||
-		    (search > highest_read && search - highest_read <= 16384)) {
+		if ((search <= target && target - search <= 65536) ||
+		    (search > target && search - target <= 65536)) {
 			readahead_tree_block(root, search, blocksize,
 				     btrfs_node_ptr_generation(node, nr));
 			nread += blocksize;
 		}
 		nscan++;
-		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
+		if ((nread > 65536 || nscan > 32))
 			break;
+	}
+}
 
-		if (nread > (256 * 1024) || nscan > 128)
-			break;
+/*
+ * returns -EAGAIN if it had to drop the path, or zero if everything was in
+ * cache
+ */
+static noinline int reada_for_balance(struct btrfs_root *root,
+				      struct btrfs_path *path, int level)
+{
+	int slot;
+	int nritems;
+	struct extent_buffer *parent;
+	struct extent_buffer *eb;
+	u64 gen;
+	u64 block1 = 0;
+	u64 block2 = 0;
+	int ret = 0;
+	int blocksize;
+
+	parent = path->nodes[level - 1];
+	if (!parent)
+		return 0;
 
-		if (search < lowest_read)
-			lowest_read = search;
-		if (search > highest_read)
-			highest_read = search;
+	nritems = btrfs_header_nritems(parent);
+	slot = path->slots[level];
+	blocksize = btrfs_level_size(root, level);
+
+	if (slot > 0) {
+		block1 = btrfs_node_blockptr(parent, slot - 1);
+		gen = btrfs_node_ptr_generation(parent, slot - 1);
+		eb = btrfs_find_tree_block(root, block1, blocksize);
+		if (eb && btrfs_buffer_uptodate(eb, gen))
+			block1 = 0;
+		free_extent_buffer(eb);
+	}
+	if (slot < nritems) {
+		block2 = btrfs_node_blockptr(parent, slot + 1);
+		gen = btrfs_node_ptr_generation(parent, slot + 1);
+		eb = btrfs_find_tree_block(root, block2, blocksize);
+		if (eb && btrfs_buffer_uptodate(eb, gen))
+			block2 = 0;
+		free_extent_buffer(eb);
 	}
+	if (block1 || block2) {
+		ret = -EAGAIN;
+		btrfs_release_path(root, path);
+		if (block1)
+			readahead_tree_block(root, block1, blocksize, 0);
+		if (block2)
+			readahead_tree_block(root, block2, blocksize, 0);
+
+		if (block1) {
+			eb = read_tree_block(root, block1, blocksize, 0);
+			free_extent_buffer(eb);
+		}
+		if (block1) {
+			eb = read_tree_block(root, block2, blocksize, 0);
+			free_extent_buffer(eb);
+		}
+	}
+	return ret;
 }
 
+
 /*
  * when we walk down the tree, it is usually safe to unlock the higher layers
  * in the tree.  The exceptions are when our path goes through slot 0, because
@@ -1328,6 +1439,32 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 }
 
 /*
+ * This releases any locks held in the path starting at level and
+ * going all the way up to the root.
+ *
+ * btrfs_search_slot will keep the lock held on higher nodes in a few
+ * corner cases, such as COW of the block at slot zero in the node.  This
+ * ignores those rules, and it should only be called when there are no
+ * more updates to be done higher up in the tree.
+ */
+noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
+{
+	int i;
+
+	if (path->keep_locks || path->lowest_level)
+		return;
+
+	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
+		if (!path->nodes[i])
+			continue;
+		if (!path->locks[i])
+			continue;
+		btrfs_tree_unlock(path->nodes[i]);
+		path->locks[i] = 0;
+	}
+}
+
+/*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
  * level of the path (level 0)
@@ -1387,32 +1524,30 @@ again:
 			int wret;
 
 			/* is a cow on this block not required */
-			spin_lock(&root->fs_info->hash_lock);
 			if (btrfs_header_generation(b) == trans->transid &&
 			    btrfs_header_owner(b) == root->root_key.objectid &&
 			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
-				spin_unlock(&root->fs_info->hash_lock);
 				goto cow_done;
 			}
-			spin_unlock(&root->fs_info->hash_lock);
 
 			/* ok, we have to cow, is our old prealloc the right
 			 * size?
 			 */
 			if (prealloc_block.objectid &&
 			    prealloc_block.offset != b->len) {
+				btrfs_release_path(root, p);
 				btrfs_free_reserved_extent(root,
 					   prealloc_block.objectid,
 					   prealloc_block.offset);
 				prealloc_block.objectid = 0;
+				goto again;
 			}
 
 			/*
 			 * for higher level blocks, try not to allocate blocks
 			 * with the block and the parent locks held.
 			 */
-			if (level > 1 && !prealloc_block.objectid &&
-			    btrfs_path_lock_waiting(p, level)) {
+			if (level > 0 && !prealloc_block.objectid) {
 				u32 size = b->len;
 				u64 hint = b->start;
 
@@ -1425,6 +1560,8 @@ again:
 				goto again;
 			}
 
+			btrfs_set_path_blocking(p);
+
 			wret = btrfs_cow_block(trans, root, b,
 					       p->nodes[level + 1],
 					       p->slots[level + 1],
@@ -1446,6 +1583,22 @@ cow_done:
 		if (!p->skip_locking)
 			p->locks[level] = 1;
 
+		btrfs_clear_path_blocking(p, NULL);
+
+		/*
+		 * we have a lock on b and as long as we aren't changing
+		 * the tree, there is no way to for the items in b to change.
+		 * It is safe to drop the lock on our parent before we
+		 * go through the expensive btree search on b.
+		 *
+		 * If cow is true, then we might be changing slot zero,
+		 * which may require changing the parent.  So, we can't
+		 * drop the lock until after we know which slot we're
+		 * operating on.
+		 */
+		if (!cow)
+			btrfs_unlock_up_safe(p, level + 1);
+
 		ret = check_block(root, p, level);
 		if (ret) {
 			ret = -1;
@@ -1453,6 +1606,7 @@ cow_done:
 		}
 
 		ret = bin_search(b, key, level, &slot);
+
 		if (level != 0) {
 			if (ret && slot > 0)
 				slot -= 1;
@@ -1460,7 +1614,16 @@ cow_done:
 			if ((p->search_for_split || ins_len > 0) &&
 			    btrfs_header_nritems(b) >=
 			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
-				int sret = split_node(trans, root, p, level);
+				int sret;
+
+				sret = reada_for_balance(root, p, level);
+				if (sret)
+					goto again;
+
+				btrfs_set_path_blocking(p);
+				sret = split_node(trans, root, p, level);
+				btrfs_clear_path_blocking(p, NULL);
+
 				BUG_ON(sret > 0);
 				if (sret) {
 					ret = sret;
@@ -1468,9 +1631,19 @@ cow_done:
 				}
 				b = p->nodes[level];
 				slot = p->slots[level];
-			} else if (ins_len < 0) {
-				int sret = balance_level(trans, root, p,
-							 level);
+			} else if (ins_len < 0 &&
+				   btrfs_header_nritems(b) <
+				   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
+				int sret;
+
+				sret = reada_for_balance(root, p, level);
+				if (sret)
+					goto again;
+
+				btrfs_set_path_blocking(p);
+				sret = balance_level(trans, root, p, level);
+				btrfs_clear_path_blocking(p, NULL);
+
 				if (sret) {
 					ret = sret;
 					goto done;
@@ -1504,7 +1677,7 @@ cow_done:
 				 * of the btree by dropping locks before
 				 * we read.
 				 */
-				if (level > 1) {
+				if (level > 0) {
 					btrfs_release_path(NULL, p);
 					if (tmp)
 						free_extent_buffer(tmp);
@@ -1519,6 +1692,7 @@ cow_done:
 						free_extent_buffer(tmp);
 					goto again;
 				} else {
+					btrfs_set_path_blocking(p);
 					if (tmp)
 						free_extent_buffer(tmp);
 					if (should_reada)
@@ -1528,14 +1702,29 @@ cow_done:
 					b = read_node_slot(root, b, slot);
 				}
 			}
-			if (!p->skip_locking)
-				btrfs_tree_lock(b);
+			if (!p->skip_locking) {
+				int lret;
+
+				btrfs_clear_path_blocking(p, NULL);
+				lret = btrfs_try_spin_lock(b);
+
+				if (!lret) {
+					btrfs_set_path_blocking(p);
+					btrfs_tree_lock(b);
+					btrfs_clear_path_blocking(p, b);
+				}
+			}
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
-				int sret = split_leaf(trans, root, key,
+				int sret;
+
+				btrfs_set_path_blocking(p);
+				sret = split_leaf(trans, root, key,
 						      p, ins_len, ret == 0);
+				btrfs_clear_path_blocking(p, NULL);
+
 				BUG_ON(sret > 0);
 				if (sret) {
 					ret = sret;
@@ -1549,12 +1738,16 @@ cow_done:
 	}
 	ret = 1;
 done:
+	/*
+	 * we don't really know what they plan on doing with the path
+	 * from here on, so for now just mark it as blocking
+	 */
+	btrfs_set_path_blocking(p);
 	if (prealloc_block.objectid) {
 		btrfs_free_reserved_extent(root,
 			   prealloc_block.objectid,
 			   prealloc_block.offset);
 	}
-
 	return ret;
 }
 
@@ -1578,6 +1771,8 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
 	BUG_ON(ret);
 
+	btrfs_set_lock_blocking(eb);
+
 	parent = eb;
 	while (1) {
 		level = btrfs_header_level(parent);
@@ -1602,6 +1797,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 			eb = read_tree_block(root, bytenr, blocksize,
 					     generation);
 			btrfs_tree_lock(eb);
+			btrfs_set_lock_blocking(eb);
 		}
 
 		/*
@@ -1626,6 +1822,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
 				eb = read_tree_block(root, bytenr, blocksize,
 						generation);
 				btrfs_tree_lock(eb);
+				btrfs_set_lock_blocking(eb);
 			}
 
 			ret = btrfs_cow_block(trans, root, eb, parent, slot,
@@ -2168,10 +2365,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (slot >= btrfs_header_nritems(upper) - 1)
 		return 1;
 
-	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+	btrfs_assert_tree_locked(path->nodes[1]);
 
 	right = read_node_slot(root, upper, slot + 1);
 	btrfs_tree_lock(right);
+	btrfs_set_lock_blocking(right);
+
 	free_space = btrfs_leaf_free_space(root, right);
 	if (free_space < data_size)
 		goto out_unlock;
@@ -2363,10 +2562,12 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (right_nritems == 0)
 		return 1;
 
-	WARN_ON(!btrfs_tree_locked(path->nodes[1]));
+	btrfs_assert_tree_locked(path->nodes[1]);
 
 	left = read_node_slot(root, path->nodes[1], slot - 1);
 	btrfs_tree_lock(left);
+	btrfs_set_lock_blocking(left);
+
 	free_space = btrfs_leaf_free_space(root, left);
 	if (free_space < data_size) {
 		ret = 1;
@@ -2825,6 +3026,12 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
 	path->keep_locks = 0;
 	BUG_ON(ret);
 
+	/*
+	 * make sure any changes to the path from split_leaf leave it
+	 * in a blocking state
+	 */
+	btrfs_set_path_blocking(path);
+
 	leaf = path->nodes[0];
 	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
 
@@ -3354,6 +3561,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
 		BUG();
 	}
 out:
+	btrfs_unlock_up_safe(path, 1);
 	return ret;
 }
 
@@ -3441,15 +3649,22 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	u64 root_gen = btrfs_header_generation(path->nodes[1]);
+	u64 parent_start = path->nodes[1]->start;
+	u64 parent_owner = btrfs_header_owner(path->nodes[1]);
 
 	ret = del_ptr(trans, root, path, 1, path->slots[1]);
 	if (ret)
 		return ret;
 
+	/*
+	 * btrfs_free_extent is expensive, we want to make sure we
+	 * aren't holding any locks when we call it
+	 */
+	btrfs_unlock_up_safe(path, 0);
+
 	ret = btrfs_free_extent(trans, root, bytenr,
 				btrfs_level_size(root, 0),
-				path->nodes[1]->start,
-				btrfs_header_owner(path->nodes[1]),
+				parent_start, parent_owner,
 				root_gen, 0, 1);
 	return ret;
 }
@@ -3721,6 +3936,7 @@ find_next_key:
 		 */
 		if (slot >= nritems) {
 			path->slots[level] = slot;
+			btrfs_set_path_blocking(path);
 			sret = btrfs_find_next_key(root, path, min_key, level,
 						  cache_only, min_trans);
 			if (sret == 0) {
@@ -3738,16 +3954,20 @@ find_next_key:
 			unlock_up(path, level, 1);
 			goto out;
 		}
+		btrfs_set_path_blocking(path);
 		cur = read_node_slot(root, cur, slot);
 
 		btrfs_tree_lock(cur);
+
 		path->locks[level - 1] = 1;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
+		btrfs_clear_path_blocking(path, NULL);
 	}
 out:
 	if (ret == 0)
 		memcpy(min_key, &found_key, sizeof(found_key));
+	btrfs_set_path_blocking(path);
 	return ret;
 }
 
@@ -3843,6 +4063,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	if (ret < 0)
 		return ret;
 
+	btrfs_set_path_blocking(path);
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	/*
 	 * by releasing the path above we dropped all our locks.  A balance
@@ -3873,14 +4094,16 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			free_extent_buffer(next);
 		}
 
+		/* the path was set to blocking above */
 		if (level == 1 && (path->locks[1] || path->skip_locking) &&
 		    path->reada)
 			reada_for_search(root, path, level, slot, 0);
 
 		next = read_node_slot(root, c, slot);
 		if (!path->skip_locking) {
-			WARN_ON(!btrfs_tree_locked(c));
+			btrfs_assert_tree_locked(c);
 			btrfs_tree_lock(next);
+			btrfs_set_lock_blocking(next);
 		}
 		break;
 	}
@@ -3897,12 +4120,15 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 			path->locks[level] = 1;
 		if (!level)
 			break;
+
+		btrfs_set_path_blocking(path);
 		if (level == 1 && path->locks[1] && path->reada)
 			reada_for_search(root, path, level, slot, 0);
 		next = read_node_slot(root, next, 0);
 		if (!path->skip_locking) {
-			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
+			btrfs_assert_tree_locked(path->nodes[level]);
 			btrfs_tree_lock(next);
+			btrfs_set_lock_blocking(next);
 		}
 	}
 done:
@@ -3927,6 +4153,7 @@ int btrfs_previous_item(struct btrfs_root *root,
 
 	while (1) {
 		if (path->slots[0] == 0) {
+			btrfs_set_path_blocking(path);
 			ret = btrfs_prev_leaf(root, path);
 			if (ret != 0)
 				return ret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eee060f..5e1d4e3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -43,11 +43,7 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_ACL_NOT_CACHED    ((void *)-1)
 
-#ifdef CONFIG_LOCKDEP
-# define BTRFS_MAX_LEVEL 7
-#else
-# define BTRFS_MAX_LEVEL 8
-#endif
+#define BTRFS_MAX_LEVEL 8
 
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -454,17 +450,11 @@ struct btrfs_timespec {
 	__le32 nsec;
 } __attribute__ ((__packed__));
 
-typedef enum {
+enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE = 0,
 	BTRFS_COMPRESS_ZLIB = 1,
 	BTRFS_COMPRESS_LAST = 2,
-} btrfs_compression_type;
-
-/* we don't understand any encryption methods right now */
-typedef enum {
-	BTRFS_ENCRYPTION_NONE = 0,
-	BTRFS_ENCRYPTION_LAST = 1,
-} btrfs_encryption_type;
+};
 
 struct btrfs_inode_item {
 	/* nfs style generation number */
@@ -606,13 +596,27 @@ struct btrfs_block_group_item {
 
 struct btrfs_space_info {
 	u64 flags;
-	u64 total_bytes;
-	u64 bytes_used;
-	u64 bytes_pinned;
-	u64 bytes_reserved;
-	u64 bytes_readonly;
-	int full;
-	int force_alloc;
+
+	u64 total_bytes;	/* total bytes in the space */
+	u64 bytes_used;		/* total bytes used on disk */
+	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
+				   transaction finishes */
+	u64 bytes_reserved;	/* total bytes the allocator has reserved for
+				   current allocations */
+	u64 bytes_readonly;	/* total bytes that are read only */
+
+	/* delalloc accounting */
+	u64 bytes_delalloc;	/* number of bytes reserved for allocation,
+				   this space is not necessarily reserved yet
+				   by the allocator */
+	u64 bytes_may_use;	/* number of bytes that may be used for
+				   delalloc */
+
+	int full;		/* indicates that we cannot allocate any more
+				   chunks for this space */
+	int force_alloc;	/* set if we need to force a chunk alloc for
+				   this space */
+
 	struct list_head list;
 
 	/* for block groups in our same type */
@@ -701,9 +705,7 @@ struct btrfs_fs_info {
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
 	wait_queue_head_t transaction_wait;
-
 	wait_queue_head_t async_submit_wait;
-	wait_queue_head_t tree_log_wait;
 
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
@@ -711,7 +713,6 @@ struct btrfs_fs_info {
 	struct super_block *sb;
 	struct inode *btree_inode;
 	struct backing_dev_info bdi;
-	spinlock_t hash_lock;
 	struct mutex trans_mutex;
 	struct mutex tree_log_mutex;
 	struct mutex transaction_kthread_mutex;
@@ -730,10 +731,6 @@ struct btrfs_fs_info {
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
 	atomic_t async_delalloc_pages;
-	atomic_t tree_log_writers;
-	atomic_t tree_log_commit;
-	unsigned long tree_log_batch;
-	u64 tree_log_transid;
 
 	/*
 	 * this is used by the balancing code to wait for all the pending
@@ -787,7 +784,14 @@ struct btrfs_fs_info {
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * the space_info list is almost entirely read only.  It only changes
+	 * when we add a new raid type to the FS, and that happens
+	 * very rarely.  RCU is used to protect it.
+	 */
 	struct list_head space_info;
+
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
@@ -833,7 +837,14 @@ struct btrfs_root {
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
+
 	struct mutex log_mutex;
+	wait_queue_head_t log_writer_wait;
+	wait_queue_head_t log_commit_wait[2];
+	atomic_t log_writers;
+	atomic_t log_commit[2];
+	unsigned long log_transid;
+	unsigned long log_batch;
 
 	u64 objectid;
 	u64 last_trans;
@@ -1721,7 +1732,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					     u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize);
+					    u64 bytenr, u32 blocksize,
+					    int level);
 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       u64 num_bytes, u64 parent, u64 min_bytes,
@@ -1791,6 +1803,18 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root);
 int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+int btrfs_check_metadata_free_space(struct btrfs_root *root);
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+				u64 bytes);
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+				    struct inode *inode, u64 bytes);
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+				 u64 bytes);
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+			      u64 bytes);
 /* ctree.c */
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
@@ -1840,7 +1864,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
-void btrfs_init_path(struct btrfs_path *p);
+void btrfs_set_path_blocking(struct btrfs_path *p);
+void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
+
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int slot, int nr);
 int btrfs_del_leaf(struct btrfs_trans_handle *trans,
@@ -2034,8 +2060,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 unsigned long btrfs_force_ra(struct address_space *mapping,
 			      struct file_ra_state *ra, struct file *file,
 			      pgoff_t offset, pgoff_t last_index);
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-			   int for_del);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 81a3138..6ec80c0 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -16,7 +16,6 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <linux/version.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/scatterlist.h>
@@ -76,6 +75,40 @@ struct async_submit_bio {
 	struct btrfs_work work;
 };
 
+/* These are used to set the lockdep class on the extent buffer locks.
+ * The class is set by the readpage_end_io_hook after the buffer has
+ * passed csum validation but before the pages are unlocked.
+ *
+ * The lockdep class is also set by btrfs_init_new_buffer on freshly
+ * allocated blocks.
+ *
+ * The class is based on the level in the tree block, which allows lockdep
+ * to know that lower nodes nest inside the locks of higher nodes.
+ *
+ * We also add a check to make sure the highest level of the tree is
+ * the same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this
+ * code needs update as well.
+ */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# if BTRFS_MAX_LEVEL != 8
+#  error
+# endif
+static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1];
+static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
+	/* leaf */
+	"btrfs-extent-00",
+	"btrfs-extent-01",
+	"btrfs-extent-02",
+	"btrfs-extent-03",
+	"btrfs-extent-04",
+	"btrfs-extent-05",
+	"btrfs-extent-06",
+	"btrfs-extent-07",
+	/* highest possible level */
+	"btrfs-extent-08",
+};
+#endif
+
 /*
  * extents on the btree inode are pretty simple, there's one extent
  * that covers the entire device
@@ -348,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	return ret;
 }
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
+{
+	lockdep_set_class_and_name(&eb->lock,
+			   &btrfs_eb_class[level],
+			   btrfs_eb_name[level]);
+}
+#endif
+
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
 {
@@ -393,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	}
 	found_level = btrfs_header_level(eb);
 
+	btrfs_set_buffer_lockdep_class(eb, found_level);
+
 	ret = csum_tree_block(root, eb, 1);
 	if (ret)
 		ret = -EIO;
@@ -800,7 +844,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 
 	if (ret == 0)
-		buf->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
 	else
 		WARN_ON(1);
 	return buf;
@@ -813,7 +857,11 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	struct inode *btree_inode = root->fs_info->btree_inode;
 	if (btrfs_header_generation(buf) ==
 	    root->fs_info->running_transaction->transid) {
-		WARN_ON(!btrfs_tree_locked(buf));
+		btrfs_assert_tree_locked(buf);
+
+		/* ugh, clear_extent_buffer_dirty can be expensive */
+		btrfs_set_lock_blocking(buf);
+
 		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
 					  buf);
 	}
@@ -850,6 +898,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	spin_lock_init(&root->list_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
+	init_waitqueue_head(&root->log_writer_wait);
+	init_waitqueue_head(&root->log_commit_wait[0]);
+	init_waitqueue_head(&root->log_commit_wait[1]);
+	atomic_set(&root->log_commit[0], 0);
+	atomic_set(&root->log_commit[1], 0);
+	atomic_set(&root->log_writers, 0);
+	root->log_batch = 0;
+	root->log_transid = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 
@@ -934,15 +990,16 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info)
+static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_root *root;
 	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct extent_buffer *leaf;
 
 	root = kzalloc(sizeof(*root), GFP_NOFS);
 	if (!root)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
@@ -951,12 +1008,23 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
+	/*
+	 * log trees do not get reference counted because they go away
+	 * before a real commit is actually done.  They do store pointers
+	 * to file data extents, and those reference counts still get
+	 * updated (along with back refs to the log tree).
+	 */
 	root->ref_cows = 0;
 
-	root->node = btrfs_alloc_free_block(trans, root, root->leafsize,
-					    0, BTRFS_TREE_LOG_OBJECTID,
-					    trans->transid, 0, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, BTRFS_TREE_LOG_OBJECTID,
+				      trans->transid, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		kfree(root);
+		return ERR_CAST(leaf);
+	}
 
+	root->node = leaf;
 	btrfs_set_header_nritems(root->node, 0);
 	btrfs_set_header_level(root->node, 0);
 	btrfs_set_header_bytenr(root->node, root->node->start);
@@ -968,7 +1036,48 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			    BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(root->node);
 	btrfs_tree_unlock(root->node);
-	fs_info->log_root_tree = root;
+	return root;
+}
+
+int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *log_root;
+
+	log_root = alloc_log_tree(trans, fs_info);
+	if (IS_ERR(log_root))
+		return PTR_ERR(log_root);
+	WARN_ON(fs_info->log_root_tree);
+	fs_info->log_root_tree = log_root;
+	return 0;
+}
+
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root)
+{
+	struct btrfs_root *log_root;
+	struct btrfs_inode_item *inode_item;
+
+	log_root = alloc_log_tree(trans, root->fs_info);
+	if (IS_ERR(log_root))
+		return PTR_ERR(log_root);
+
+	log_root->last_trans = trans->transid;
+	log_root->root_key.offset = root->root_key.objectid;
+
+	inode_item = &log_root->root_item.inode;
+	inode_item->generation = cpu_to_le64(1);
+	inode_item->size = cpu_to_le64(3);
+	inode_item->nlink = cpu_to_le32(1);
+	inode_item->nbytes = cpu_to_le64(root->leafsize);
+	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+
+	btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
+	btrfs_set_root_generation(&log_root->root_item, trans->transid);
+
+	WARN_ON(root->log_root);
+	root->log_root = log_root;
+	root->log_transid = 0;
 	return 0;
 }
 
@@ -1136,7 +1245,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 {
 	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
 	int ret = 0;
-	struct list_head *cur;
 	struct btrfs_device *device;
 	struct backing_dev_info *bdi;
 #if 0
@@ -1144,8 +1252,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	    btrfs_congested_async(info, 0))
 		return 1;
 #endif
-	list_for_each(cur, &info->fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
 		bdi = blk_get_backing_dev_info(device->bdev);
@@ -1163,13 +1270,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
  */
 static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
-	struct list_head *cur;
 	struct btrfs_device *device;
 	struct btrfs_fs_info *info;
 
 	info = (struct btrfs_fs_info *)bdi->unplug_io_data;
-	list_for_each(cur, &info->fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
 
@@ -1447,7 +1552,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->hashers);
 	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
-	spin_lock_init(&fs_info->hash_lock);
 	spin_lock_init(&fs_info->delalloc_lock);
 	spin_lock_init(&fs_info->new_trans_lock);
 	spin_lock_init(&fs_info->ref_cache_lock);
@@ -1535,10 +1639,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	init_waitqueue_head(&fs_info->transaction_throttle);
 	init_waitqueue_head(&fs_info->transaction_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
-	init_waitqueue_head(&fs_info->tree_log_wait);
-	atomic_set(&fs_info->tree_log_commit, 0);
-	atomic_set(&fs_info->tree_log_writers, 0);
-	fs_info->tree_log_transid = 0;
 
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
@@ -1627,6 +1727,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	 * low idle thresh
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
+	fs_info->endio_meta_workers.idle_thresh = 4;
+
 	fs_info->endio_write_workers.idle_thresh = 64;
 	fs_info->endio_meta_write_workers.idle_thresh = 64;
 
@@ -1720,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	dev_root->track_dirty = 1;
-
 	if (ret)
 		goto fail_extent_root;
 
@@ -1740,13 +1841,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
-	if (!fs_info->cleaner_kthread)
+	if (IS_ERR(fs_info->cleaner_kthread))
 		goto fail_csum_root;
 
 	fs_info->transaction_kthread = kthread_run(transaction_kthread,
 						   tree_root,
 						   "btrfs-transaction");
-	if (!fs_info->transaction_kthread)
+	if (IS_ERR(fs_info->transaction_kthread))
 		goto fail_cleaner;
 
 	if (btrfs_super_log_root(disk_super) != 0) {
@@ -1828,13 +1929,14 @@ fail_sb_buffer:
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
-fail:
+
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	bdi_destroy(&fs_info->bdi);
 
+fail:
 	kfree(extent_root);
 	kfree(tree_root);
-	bdi_destroy(&fs_info->bdi);
 	kfree(fs_info);
 	kfree(chunk_root);
 	kfree(dev_root);
@@ -1995,7 +2097,6 @@ static int write_dev_supers(struct btrfs_device *device,
 
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-	struct list_head *cur;
 	struct list_head *head = &root->fs_info->fs_devices->devices;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
@@ -2011,8 +2112,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	sb = &root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
-	list_for_each(cur, head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
 			continue;
@@ -2045,8 +2145,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	}
 
 	total_errors = 0;
-	list_for_each(cur, head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev)
 			continue;
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2260,7 +2359,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	u64 transid = btrfs_header_generation(buf);
 	struct inode *btree_inode = root->fs_info->btree_inode;
 
-	WARN_ON(!btrfs_tree_locked(buf));
+	btrfs_set_lock_blocking(buf);
+
+	btrfs_assert_tree_locked(buf);
 	if (transid != root->fs_info->generation) {
 		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
 		       "found %llu running %llu\n",
@@ -2284,7 +2385,7 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	unsigned long thresh = 32 * 1024 * 1024;
 	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
-	if (current_is_pdflush() || current->flags & PF_MEMALLOC)
+	if (current->flags & PF_MEMALLOC)
 		return;
 
 	num_dirty = count_range_bits(tree, &start, (u64)-1,
@@ -2302,14 +2403,13 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 	if (ret == 0)
-		buf->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
 	return ret;
 }
 
 int btree_lock_page_hook(struct page *page)
 {
 	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_buffer *eb;
 	unsigned long len;
@@ -2324,9 +2424,7 @@ int btree_lock_page_hook(struct page *page)
 		goto out;
 
 	btrfs_tree_lock(eb);
-	spin_lock(&root->fs_info->hash_lock);
 	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-	spin_unlock(&root->fs_info->hash_lock);
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c0ff404..95029db 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -98,5 +98,17 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
+int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root);
 int btree_lock_page_hook(struct page *page);
+
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level);
+#else
+static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb,
+						 int level)
+{
+}
+#endif
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da65..fefe83a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,8 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
-#include <linux/version.h>
+#include <linux/sort.h>
+#include <linux/rcupdate.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
@@ -30,7 +31,6 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
-#include "compat.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -61,6 +61,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			      u64 bytenr, u64 num_bytes, int alloc,
 			      int mark_free);
 
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *extent_root, u64 alloc_bytes,
+			  u64 flags, int force);
+
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
 	return (cache->flags & bits) == bits;
@@ -326,16 +330,34 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 						  u64 flags)
 {
 	struct list_head *head = &info->space_info;
-	struct list_head *cur;
 	struct btrfs_space_info *found;
-	list_for_each(cur, head) {
-		found = list_entry(cur, struct btrfs_space_info, list);
-		if (found->flags == flags)
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list) {
+		if (found->flags == flags) {
+			rcu_read_unlock();
 			return found;
+		}
 	}
+	rcu_read_unlock();
 	return NULL;
 }
 
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+	struct list_head *head = &info->space_info;
+	struct btrfs_space_info *found;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(found, head, list)
+		found->full = 0;
+	rcu_read_unlock();
+}
+
 static u64 div_factor(u64 num, int factor)
 {
 	if (factor == 10)
@@ -1326,8 +1348,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root)
 {
-	finish_current_insert(trans, root->fs_info->extent_root, 1);
-	del_pending_extents(trans, root->fs_info->extent_root, 1);
+	u64 start;
+	u64 end;
+	int ret;
+
+	while(1) {
+		finish_current_insert(trans, root->fs_info->extent_root, 1);
+		del_pending_extents(trans, root->fs_info->extent_root, 1);
+
+		/* is there more work to do? */
+		ret = find_first_extent_bit(&root->fs_info->pending_del,
+					    0, &start, &end, EXTENT_WRITEBACK);
+		if (!ret)
+			continue;
+		ret = find_first_extent_bit(&root->fs_info->extent_ins,
+					    0, &start, &end, EXTENT_WRITEBACK);
+		if (!ret)
+			continue;
+		break;
+	}
 	return 0;
 }
 
@@ -1525,15 +1564,55 @@ out:
 	return ret;
 }
 
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
-		  u32 *nr_extents)
+/* when a block goes through cow, we update the reference counts of
+ * everything that block points to.  The internal pointers of the block
+ * can be in just about any order, and it is likely to have clusters of
+ * things that are close together and clusters of things that are not.
+ *
+ * To help reduce the seeks that come with updating all of these reference
+ * counts, sort them by byte number before actual updates are done.
+ *
+ * struct refsort is used to match byte number to slot in the btree block.
+ * we sort based on the byte number and then use the slot to actually
+ * find the item.
+ *
+ * struct refsort is smaller than strcut btrfs_item and smaller than
+ * struct btrfs_key_ptr.  Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
+ */
+struct refsort {
+	u64 bytenr;
+	u32 slot;
+};
+
+/*
+ * for passing into sort()
+ */
+static int refsort_cmp(const void *a_void, const void *b_void)
+{
+	const struct refsort *a = a_void;
+	const struct refsort *b = b_void;
+
+	if (a->bytenr < b->bytenr)
+		return -1;
+	if (a->bytenr > b->bytenr)
+		return 1;
+	return 0;
+}
+
+
+noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct extent_buffer *orig_buf,
+			   struct extent_buffer *buf, u32 *nr_extents)
 {
 	u64 bytenr;
 	u64 ref_root;
 	u64 orig_root;
 	u64 ref_generation;
 	u64 orig_generation;
+	struct refsort *sorted;
 	u32 nritems;
 	u32 nr_file_extents = 0;
 	struct btrfs_key key;
@@ -1542,6 +1621,8 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int level;
 	int ret = 0;
 	int faili = 0;
+	int refi = 0;
+	int slot;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 			    u64, u64, u64, u64, u64, u64, u64, u64);
 
@@ -1553,6 +1634,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
 
+	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
+	BUG_ON(!sorted);
+
 	if (root->ref_cows) {
 		process_func = __btrfs_inc_extent_ref;
 	} else {
@@ -1565,6 +1649,11 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		process_func = __btrfs_update_extent_ref;
 	}
 
+	/*
+	 * we make two passes through the items.  In the first pass we
+	 * only record the byte number and slot.  Then we sort based on
+	 * byte number and do the actual work based on the sorted results
+	 */
 	for (i = 0; i < nritems; i++) {
 		cond_resched();
 		if (level == 0) {
@@ -1581,6 +1670,32 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 				continue;
 
 			nr_file_extents++;
+			sorted[refi].bytenr = bytenr;
+			sorted[refi].slot = i;
+			refi++;
+		} else {
+			bytenr = btrfs_node_blockptr(buf, i);
+			sorted[refi].bytenr = bytenr;
+			sorted[refi].slot = i;
+			refi++;
+		}
+	}
+	/*
+	 * if refi == 0, we didn't actually put anything into the sorted
+	 * array and we're done
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		cond_resched();
+		slot = sorted[i].slot;
+		bytenr = sorted[i].bytenr;
+
+		if (level == 0) {
+			btrfs_item_key_to_cpu(buf, &key, slot);
 
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
@@ -1589,25 +1704,25 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 					   key.objectid);
 
 			if (ret) {
-				faili = i;
+				faili = slot;
 				WARN_ON(1);
 				goto fail;
 			}
 		} else {
-			bytenr = btrfs_node_blockptr(buf, i);
 			ret = process_func(trans, root, bytenr,
 					   orig_buf->start, buf->start,
 					   orig_root, ref_root,
 					   orig_generation, ref_generation,
 					   level - 1);
 			if (ret) {
-				faili = i;
+				faili = slot;
 				WARN_ON(1);
 				goto fail;
 			}
 		}
 	}
 out:
+	kfree(sorted);
 	if (nr_extents) {
 		if (level == 0)
 			*nr_extents = nr_file_extents;
@@ -1616,6 +1731,7 @@ out:
 	}
 	return 0;
 fail:
+	kfree(sorted);
 	WARN_ON(1);
 	return ret;
 }
@@ -1808,7 +1924,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	list_add(&found->list, &info->space_info);
 	INIT_LIST_HEAD(&found->block_groups);
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
@@ -1818,9 +1933,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	found->bytes_pinned = 0;
 	found->bytes_reserved = 0;
 	found->bytes_readonly = 0;
+	found->bytes_delalloc = 0;
 	found->full = 0;
 	found->force_alloc = 0;
 	*space_info = found;
+	list_add_rcu(&found->list, &info->space_info);
 	return 0;
 }
 
@@ -1881,6 +1998,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return flags;
 }
 
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	u64 alloc_profile;
+
+	if (data) {
+		alloc_profile = info->avail_data_alloc_bits &
+			info->data_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
+	} else if (root == root->fs_info->chunk_root) {
+		alloc_profile = info->avail_system_alloc_bits &
+			info->system_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
+	} else {
+		alloc_profile = info->avail_metadata_alloc_bits &
+			info->metadata_alloc_profile;
+		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
+	}
+
+	return btrfs_reduce_alloc_profile(root, data);
+}
+
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+{
+	u64 alloc_target;
+
+	alloc_target = btrfs_get_alloc_profile(root, 1);
+	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+						       alloc_target);
+}
+
+/*
+ * for now this just makes sure we have at least 5% of our metadata space free
+ * for use.
+ */
+int btrfs_check_metadata_free_space(struct btrfs_root *root)
+{
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_space_info *meta_sinfo;
+	u64 alloc_target, thresh;
+	int committed = 0, ret;
+
+	/* get the space info for where the metadata will live */
+	alloc_target = btrfs_get_alloc_profile(root, 0);
+	meta_sinfo = __find_space_info(info, alloc_target);
+
+again:
+	spin_lock(&meta_sinfo->lock);
+	if (!meta_sinfo->full)
+		thresh = meta_sinfo->total_bytes * 80;
+	else
+		thresh = meta_sinfo->total_bytes * 95;
+
+	do_div(thresh, 100);
+
+	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+		struct btrfs_trans_handle *trans;
+		if (!meta_sinfo->full) {
+			meta_sinfo->force_alloc = 1;
+			spin_unlock(&meta_sinfo->lock);
+
+			trans = btrfs_start_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     2 * 1024 * 1024, alloc_target, 0);
+			btrfs_end_transaction(trans, root);
+			goto again;
+		}
+		spin_unlock(&meta_sinfo->lock);
+
+		if (!committed) {
+			committed = 1;
+			trans = btrfs_join_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+		return -ENOSPC;
+	}
+	spin_unlock(&meta_sinfo->lock);
+
+	return 0;
+}
+
+/*
+ * This will check the space that the inode allocates from to make sure we have
+ * enough space for bytes.
+ */
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+				u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+	int ret = 0, committed = 0;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	data_sinfo = BTRFS_I(inode)->space_info;
+again:
+	/* make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
+	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
+	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
+	    data_sinfo->bytes_may_use < bytes) {
+		struct btrfs_trans_handle *trans;
+
+		/*
+		 * if we don't have enough free bytes in this space then we need
+		 * to alloc a new chunk.
+		 */
+		if (!data_sinfo->full) {
+			u64 alloc_target;
+
+			data_sinfo->force_alloc = 1;
+			spin_unlock(&data_sinfo->lock);
+
+			alloc_target = btrfs_get_alloc_profile(root, 1);
+			trans = btrfs_start_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+
+			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+					     bytes + 2 * 1024 * 1024,
+					     alloc_target, 0);
+			btrfs_end_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+		spin_unlock(&data_sinfo->lock);
+
+		/* commit the current transaction and try again */
+		if (!committed) {
+			committed = 1;
+			trans = btrfs_join_transaction(root, 1);
+			if (!trans)
+				return -ENOMEM;
+			ret = btrfs_commit_transaction(trans, root);
+			if (ret)
+				return ret;
+			goto again;
+		}
+
+		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
+		       ", %llu bytes_used, %llu bytes_reserved, "
+		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use"
+		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
+		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
+		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
+		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
+		return -ENOSPC;
+	}
+	data_sinfo->bytes_may_use += bytes;
+	BTRFS_I(inode)->reserved_bytes += bytes;
+	spin_unlock(&data_sinfo->lock);
+
+	return btrfs_check_metadata_free_space(root);
+}
+
+/*
+ * if there was an error for whatever reason after calling
+ * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ */
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+				    struct inode *inode, u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+
+	/* make sure bytes are sectorsize aligned */
+	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+	data_sinfo = BTRFS_I(inode)->space_info;
+	spin_lock(&data_sinfo->lock);
+	data_sinfo->bytes_may_use -= bytes;
+	BTRFS_I(inode)->reserved_bytes -= bytes;
+	spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are adding a delalloc extent to the inode's io_tree */
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+				  u64 bytes)
+{
+	struct btrfs_space_info *data_sinfo;
+
+	/* get the space info for where this inode will be storing its data */
+	data_sinfo = BTRFS_I(inode)->space_info;
+
+	/* make sure we have enough space to handle the data first */
+	spin_lock(&data_sinfo->lock);
+	data_sinfo->bytes_delalloc += bytes;
+
+	/*
+	 * we are adding a delalloc extent without calling
+	 * btrfs_check_data_free_space first.  This happens on a weird
+	 * writepage condition, but shouldn't hurt our accounting
+	 */
+	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
+		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
+		BTRFS_I(inode)->reserved_bytes = 0;
+	} else {
+		data_sinfo->bytes_may_use -= bytes;
+		BTRFS_I(inode)->reserved_bytes -= bytes;
+	}
+
+	spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are clearing an delalloc extent from the inode's io_tree */
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+			      u64 bytes)
+{
+	struct btrfs_space_info *info;
+
+	info = BTRFS_I(inode)->space_info;
+
+	spin_lock(&info->lock);
+	info->bytes_delalloc -= bytes;
+	spin_unlock(&info->lock);
+}
+
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force)
@@ -2137,13 +2481,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans,
 	u64 end;
 	u64 priv;
 	u64 search = 0;
-	u64 skipped = 0;
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_path *path;
 	struct pending_extent_op *extent_op, *tmp;
 	struct list_head insert_list, update_list;
 	int ret;
-	int num_inserts = 0, max_inserts;
+	int num_inserts = 0, max_inserts, restart = 0;
 
 	path = btrfs_alloc_path();
 	INIT_LIST_HEAD(&insert_list);
@@ -2159,18 +2502,19 @@ again:
 		ret = find_first_extent_bit(&info->extent_ins, search, &start,
 					    &end, EXTENT_WRITEBACK);
 		if (ret) {
-			if (skipped && all && !num_inserts) {
-				skipped = 0;
+			if (restart && !num_inserts &&
+			    list_empty(&update_list)) {
+				restart = 0;
 				search = 0;
 				continue;
 			}
-			mutex_unlock(&info->extent_ins_mutex);
 			break;
 		}
 
 		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
 		if (!ret) {
-			skipped = 1;
+			if (all)
+				restart = 1;
 			search = end + 1;
 			if (need_resched()) {
 				mutex_unlock(&info->extent_ins_mutex);
@@ -2189,7 +2533,7 @@ again:
 			list_add_tail(&extent_op->list, &insert_list);
 			search = end + 1;
 			if (num_inserts == max_inserts) {
-				mutex_unlock(&info->extent_ins_mutex);
+				restart = 1;
 				break;
 			}
 		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
@@ -2205,7 +2549,6 @@ again:
 	 * somebody marked this thing for deletion then just unlock it and be
 	 * done, the free_extents will handle it
 	 */
-	mutex_lock(&info->extent_ins_mutex);
 	list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
 		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
 				  extent_op->bytenr + extent_op->num_bytes - 1,
@@ -2227,6 +2570,10 @@ again:
 	if (!list_empty(&update_list)) {
 		ret = update_backrefs(trans, extent_root, path, &update_list);
 		BUG_ON(ret);
+
+		/* we may have COW'ed new blocks, so lets start over */
+		if (all)
+			restart = 1;
 	}
 
 	/*
@@ -2234,9 +2581,9 @@ again:
 	 * need to make sure everything is cleaned then reset everything and
 	 * go back to the beginning
 	 */
-	if (!num_inserts && all && skipped) {
+	if (!num_inserts && restart) {
 		search = 0;
-		skipped = 0;
+		restart = 0;
 		INIT_LIST_HEAD(&update_list);
 		INIT_LIST_HEAD(&insert_list);
 		goto again;
@@ -2293,27 +2640,19 @@ again:
 	BUG_ON(ret);
 
 	/*
-	 * if we broke out of the loop in order to insert stuff because we hit
-	 * the maximum number of inserts at a time we can handle, then loop
-	 * back and pick up where we left off
-	 */
-	if (num_inserts == max_inserts) {
-		INIT_LIST_HEAD(&insert_list);
-		INIT_LIST_HEAD(&update_list);
-		num_inserts = 0;
-		goto again;
-	}
-
-	/*
-	 * again, if we need to make absolutely sure there are no more pending
-	 * extent operations left and we know that we skipped some, go back to
-	 * the beginning and do it all again
+	 * if restart is set for whatever reason we need to go back and start
+	 * searching through the pending list again.
+	 *
+	 * We just inserted some extents, which could have resulted in new
+	 * blocks being allocated, which would result in new blocks needing
+	 * updates, so if all is set we _must_ restart to get the updated
+	 * blocks.
 	 */
-	if (all && skipped) {
+	if (restart || all) {
 		INIT_LIST_HEAD(&insert_list);
 		INIT_LIST_HEAD(&update_list);
 		search = 0;
-		skipped = 0;
+		restart = 0;
 		num_inserts = 0;
 		goto again;
 	}
@@ -2547,6 +2886,7 @@ again:
 		if (ret) {
 			if (all && skipped && !nr) {
 				search = 0;
+				skipped = 0;
 				continue;
 			}
 			mutex_unlock(&info->extent_ins_mutex);
@@ -2633,6 +2973,8 @@ again:
 		goto again;
 	}
 
+	if (!err)
+		finish_current_insert(trans, extent_root, 0);
 	return err;
 }
 
@@ -2700,13 +3042,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	/* if metadata always pin */
 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
 		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-			struct btrfs_block_group_cache *cache;
-
-			/* btrfs_free_reserved_extent */
-			cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-			BUG_ON(!cache);
-			btrfs_add_free_space(cache, bytenr, num_bytes);
-			put_block_group(cache);
+			mutex_lock(&root->fs_info->pinned_mutex);
+			btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+			mutex_unlock(&root->fs_info->pinned_mutex);
 			update_reserved_extents(root, bytenr, num_bytes, 0);
 			return 0;
 		}
@@ -2787,7 +3125,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 
 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
 		last_ptr = &root->fs_info->last_alloc;
-		empty_cluster = 64 * 1024;
+		if (!btrfs_test_opt(root, SSD))
+			empty_cluster = 64 * 1024;
 	}
 
 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3014,16 +3353,18 @@ loop_check:
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 {
 	struct btrfs_block_group_cache *cache;
-	struct list_head *l;
 
 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
 				    info->bytes_pinned - info->bytes_reserved),
 	       (info->full) ? "" : "not ");
+	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
+	       " may_use=%llu, used=%llu\n", info->total_bytes,
+	       info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
+	       info->bytes_used);
 
 	down_read(&info->groups_sem);
-	list_for_each(l, &info->block_groups) {
-		cache = list_entry(l, struct btrfs_block_group_cache, list);
+	list_for_each_entry(cache, &info->block_groups, list) {
 		spin_lock(&cache->lock);
 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
 		       "%llu pinned %llu reserved\n",
@@ -3047,24 +3388,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	u64 search_start = 0;
-	u64 alloc_profile;
 	struct btrfs_fs_info *info = root->fs_info;
 
-	if (data) {
-		alloc_profile = info->avail_data_alloc_bits &
-			info->data_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-	} else if (root == root->fs_info->chunk_root) {
-		alloc_profile = info->avail_system_alloc_bits &
-			info->system_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-	} else {
-		alloc_profile = info->avail_metadata_alloc_bits &
-			info->metadata_alloc_profile;
-		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-	}
+	data = btrfs_get_alloc_profile(root, data);
 again:
-	data = btrfs_reduce_alloc_profile(root, data);
 	/*
 	 * the only place that sets empty_size is btrfs_realloc_node, which
 	 * is not called recursively on allocations
@@ -3332,7 +3659,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
 
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize)
+					    u64 bytenr, u32 blocksize,
+					    int level)
 {
 	struct extent_buffer *buf;
 
@@ -3340,9 +3668,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 	if (!buf)
 		return ERR_PTR(-ENOMEM);
 	btrfs_set_header_generation(buf, trans->transid);
+	btrfs_set_buffer_lockdep_class(buf, level);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
+
+	btrfs_set_lock_blocking(buf);
 	btrfs_set_buffer_uptodate(buf);
+
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		set_extent_dirty(&root->dirty_log_pages, buf->start,
 			 buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3683,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 			 buf->start + buf->len - 1, GFP_NOFS);
 	}
 	trans->blocks_used++;
+	/* this returns a buffer locked for blocking */
 	return buf;
 }
 
@@ -3379,7 +3712,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 		return ERR_PTR(ret);
 	}
 
-	buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
+				    blocksize, level);
 	return buf;
 }
 
@@ -3388,36 +3722,73 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	u64 leaf_owner;
 	u64 leaf_generation;
+	struct refsort *sorted;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int nritems;
 	int ret;
+	int refi = 0;
+	int slot;
 
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
 	leaf_owner = btrfs_header_owner(leaf);
 	leaf_generation = btrfs_header_generation(leaf);
 
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+	/* we do this loop twice.  The first time we build a list
+	 * of the extents we have a reference on, then we sort the list
+	 * by bytenr.  The second time around we actually do the
+	 * extent freeing.
+	 */
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 		cond_resched();
 
 		btrfs_item_key_to_cpu(leaf, &key, i);
+
+		/* only extents have references, skip everything else */
 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 			continue;
+
 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+		/* inline extents live in the btree, they don't have refs */
 		if (btrfs_file_extent_type(leaf, fi) ==
 		    BTRFS_FILE_EXTENT_INLINE)
 			continue;
-		/*
-		 * FIXME make sure to insert a trans record that
-		 * repeats the snapshot del on crash
-		 */
+
 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+		/* holes don't have refs */
 		if (disk_bytenr == 0)
 			continue;
 
+		sorted[refi].bytenr = disk_bytenr;
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	for (i = 0; i < refi; i++) {
+		u64 disk_bytenr;
+
+		disk_bytenr = sorted[i].bytenr;
+		slot = sorted[i].slot;
+
+		cond_resched();
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
 				btrfs_file_extent_disk_num_bytes(leaf, fi),
 				leaf->start, leaf_owner, leaf_generation,
@@ -3428,6 +3799,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		wake_up(&root->fs_info->transaction_throttle);
 		cond_resched();
 	}
+out:
+	kfree(sorted);
 	return 0;
 }
 
@@ -3437,9 +3810,25 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
 	int i;
 	int ret;
-	struct btrfs_extent_info *info = ref->extents;
+	struct btrfs_extent_info *info;
+	struct refsort *sorted;
+
+	if (ref->nritems == 0)
+		return 0;
 
+	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
 	for (i = 0; i < ref->nritems; i++) {
+		sorted[i].bytenr = ref->extents[i].bytenr;
+		sorted[i].slot = i;
+	}
+	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the items in the ref were sorted when the ref was inserted
+	 * into the ref cache, so this is already in order
+	 */
+	for (i = 0; i < ref->nritems; i++) {
+		info = ref->extents + sorted[i].slot;
 		ret = __btrfs_free_extent(trans, root, info->bytenr,
 					  info->num_bytes, ref->bytenr,
 					  ref->owner, ref->generation,
@@ -3453,6 +3842,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		info++;
 	}
 
+	kfree(sorted);
 	return 0;
 }
 
@@ -3497,6 +3887,152 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
 }
 
 /*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order.  Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path)
+{
+	u64 bytenr;
+	u64 root_owner;
+	u64 root_gen;
+	struct extent_buffer *eb = path->nodes[1];
+	struct extent_buffer *leaf;
+	struct btrfs_leaf_ref *ref;
+	struct refsort *sorted = NULL;
+	int nritems = btrfs_header_nritems(eb);
+	int ret;
+	int i;
+	int refi = 0;
+	int slot = path->slots[1];
+	u32 blocksize = btrfs_level_size(root, 0);
+	u32 refs;
+
+	if (nritems == 0)
+		goto out;
+
+	root_owner = btrfs_header_owner(eb);
+	root_gen = btrfs_header_generation(eb);
+	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+
+	/*
+	 * step one, sort all the leaf pointers so we don't scribble
+	 * randomly into the extent allocation tree
+	 */
+	for (i = slot; i < nritems; i++) {
+		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+		sorted[refi].slot = i;
+		refi++;
+	}
+
+	/*
+	 * nritems won't be zero, but if we're picking up drop_snapshot
+	 * after a crash, slot might be > 0, so double check things
+	 * just in case.
+	 */
+	if (refi == 0)
+		goto out;
+
+	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+	/*
+	 * the first loop frees everything the leaves point to
+	 */
+	for (i = 0; i < refi; i++) {
+		u64 ptr_gen;
+
+		bytenr = sorted[i].bytenr;
+
+		/*
+		 * check the reference count on this leaf.  If it is > 1
+		 * we just decrement it below and don't update any
+		 * of the refs the leaf points to.
+		 */
+		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+		BUG_ON(ret);
+		if (refs != 1)
+			continue;
+
+		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+		/*
+		 * the leaf only had one reference, which means the
+		 * only thing pointing to this leaf is the snapshot
+		 * we're deleting.  It isn't possible for the reference
+		 * count to increase again later
+		 *
+		 * The reference cache is checked for the leaf,
+		 * and if found we'll be able to drop any refs held by
+		 * the leaf without needing to read it in.
+		 */
+		ref = btrfs_lookup_leaf_ref(root, bytenr);
+		if (ref && ref->generation != ptr_gen) {
+			btrfs_free_leaf_ref(root, ref);
+			ref = NULL;
+		}
+		if (ref) {
+			ret = cache_drop_leaf_ref(trans, root, ref);
+			BUG_ON(ret);
+			btrfs_remove_leaf_ref(root, ref);
+			btrfs_free_leaf_ref(root, ref);
+		} else {
+			/*
+			 * the leaf wasn't in the reference cache, so
+			 * we have to read it.
+			 */
+			leaf = read_tree_block(root, bytenr, blocksize,
+					       ptr_gen);
+			ret = btrfs_drop_leaf_ref(trans, root, leaf);
+			BUG_ON(ret);
+			free_extent_buffer(leaf);
+		}
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+
+	/*
+	 * run through the loop again to free the refs on the leaves.
+	 * This is faster than doing it in the loop above because
+	 * the leaves are likely to be clustered together.  We end up
+	 * working in nice chunks on the extent allocation tree.
+	 */
+	for (i = 0; i < refi; i++) {
+		bytenr = sorted[i].bytenr;
+		ret = __btrfs_free_extent(trans, root, bytenr,
+					blocksize, eb->start,
+					root_owner, root_gen, 0, 1);
+		BUG_ON(ret);
+
+		atomic_inc(&root->fs_info->throttle_gen);
+		wake_up(&root->fs_info->transaction_throttle);
+		cond_resched();
+	}
+out:
+	kfree(sorted);
+
+	/*
+	 * update the path to show we've processed the entire level 1
+	 * node.  This will get saved into the root's drop_snapshot_progress
+	 * field so these drops are not repeated again if this transaction
+	 * commits.
+	 */
+	path->slots[1] = nritems;
+	return 0;
+}
+
+/*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
  */
@@ -3511,7 +4047,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
-	struct btrfs_leaf_ref *ref;
 	u32 blocksize;
 	int ret;
 	u32 refs;
@@ -3538,17 +4073,46 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 		if (path->slots[*level] >=
 		    btrfs_header_nritems(cur))
 			break;
+
+		/* the new code goes down to level 1 and does all the
+		 * leaves pointed to that node in bulk.  So, this check
+		 * for level 0 will always be false.
+		 *
+		 * But, the disk format allows the drop_snapshot_progress
+		 * field in the root to leave things in a state where
+		 * a leaf will need cleaning up here.  If someone crashes
+		 * with the old code and then boots with the new code,
+		 * we might find a leaf here.
+		 */
 		if (*level == 0) {
 			ret = btrfs_drop_leaf_ref(trans, root, cur);
 			BUG_ON(ret);
 			break;
 		}
+
+		/*
+		 * once we get to level one, process the whole node
+		 * at once, including everything below it.
+		 */
+		if (*level == 1) {
+			ret = drop_level_one_refs(trans, root, path);
+			BUG_ON(ret);
+			break;
+		}
+
 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
 		blocksize = btrfs_level_size(root, *level - 1);
 
 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
 		BUG_ON(ret);
+
+		/*
+		 * if there is more than one reference, we don't need
+		 * to read that node to drop any references it has.  We
+		 * just drop the ref we hold on that node and move on to the
+		 * next slot in this level.
+		 */
 		if (refs != 1) {
 			parent = path->nodes[*level];
 			root_owner = btrfs_header_owner(parent);
@@ -3567,46 +4131,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
 			continue;
 		}
+
 		/*
-		 * at this point, we have a single ref, and since the
-		 * only place referencing this extent is a dead root
-		 * the reference count should never go higher.
-		 * So, we don't need to check it again
+		 * we need to keep freeing things in the next level down.
+		 * read the block and loop around to process it
 		 */
-		if (*level == 1) {
-			ref = btrfs_lookup_leaf_ref(root, bytenr);
-			if (ref && ref->generation != ptr_gen) {
-				btrfs_free_leaf_ref(root, ref);
-				ref = NULL;
-			}
-			if (ref) {
-				ret = cache_drop_leaf_ref(trans, root, ref);
-				BUG_ON(ret);
-				btrfs_remove_leaf_ref(root, ref);
-				btrfs_free_leaf_ref(root, ref);
-				*level = 0;
-				break;
-			}
-		}
-		next = btrfs_find_tree_block(root, bytenr, blocksize);
-		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-			free_extent_buffer(next);
-
-			next = read_tree_block(root, bytenr, blocksize,
-					       ptr_gen);
-			cond_resched();
-#if 0
-			/*
-			 * this is a debugging check and can go away
-			 * the ref should never go all the way down to 1
-			 * at this point
-			 */
-			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-						&refs);
-			BUG_ON(ret);
-			WARN_ON(refs != 1);
-#endif
-		}
+		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
 			free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +4161,16 @@ out:
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
+	/*
+	 * cleanup and free the reference on the last node
+	 * we processed
+	 */
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  parent->start, root_owner, root_gen,
 				  *level, 1);
 	free_extent_buffer(path->nodes[*level]);
 	path->nodes[*level] = NULL;
+
 	*level += 1;
 	BUG_ON(ret);
 
@@ -3687,6 +4222,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 
 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
 		btrfs_tree_lock(next);
+		btrfs_set_lock_blocking(next);
 
 		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
 					      &refs);
@@ -3754,6 +4290,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
 			struct extent_buffer *node;
 			struct btrfs_disk_key disk_key;
+
+			/*
+			 * there is more work to do in this level.
+			 * Update the drop_progress marker to reflect
+			 * the work we've done so far, and then bump
+			 * the slot number
+			 */
 			node = path->nodes[i];
 			path->slots[i]++;
 			*level = i;
@@ -3765,6 +4308,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 			return 0;
 		} else {
 			struct extent_buffer *parent;
+
+			/*
+			 * this whole node is done, free our reference
+			 * on it and go up one level
+			 */
 			if (path->nodes[*level] == root->node)
 				parent = path->nodes[*level];
 			else
@@ -3891,13 +4439,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	BUG_ON(!btrfs_tree_locked(parent));
+	btrfs_assert_tree_locked(parent);
 	parent_level = btrfs_header_level(parent);
 	extent_buffer_get(parent);
 	path->nodes[parent_level] = parent;
 	path->slots[parent_level] = btrfs_header_nritems(parent);
 
-	BUG_ON(!btrfs_tree_locked(node));
+	btrfs_assert_tree_locked(node);
 	level = btrfs_header_level(node);
 	extent_buffer_get(node);
 	path->nodes[level] = node;
@@ -4444,7 +4992,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 	u64 lock_end = 0;
 	u64 num_bytes;
 	u64 ext_offset;
-	u64 first_pos;
+	u64 search_end = (u64)-1;
 	u32 nritems;
 	int nr_scaned = 0;
 	int extent_locked = 0;
@@ -4452,7 +5000,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
 	int ret;
 
 	memcpy(&key, leaf_key, sizeof(key));
-	first_pos = INT_LIMIT(loff_t) - extent_key->offset;
 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
 		if (key.objectid < ref_path->owner_objectid ||
 		    (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +5048,7 @@ next:
 			if ((key.objectid > ref_path->owner_objectid) ||
 			    (key.objectid == ref_path->owner_objectid &&
 			     key.type > BTRFS_EXTENT_DATA_KEY) ||
-			    (key.offset >= first_pos + extent_key->offset))
+			    key.offset >= search_end)
 				break;
 		}
 
@@ -4534,8 +5081,10 @@ next:
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 		ext_offset = btrfs_file_extent_offset(leaf, fi);
 
-		if (first_pos > key.offset - ext_offset)
-			first_pos = key.offset - ext_offset;
+		if (search_end == (u64)-1) {
+			search_end = key.offset - ext_offset +
+				btrfs_file_extent_ram_bytes(leaf, fi);
+		}
 
 		if (!extent_locked) {
 			lock_start = key.offset;
@@ -4724,7 +5273,7 @@ next:
 		}
 skip:
 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
-		    key.offset >= first_pos + extent_key->offset)
+		    key.offset >= search_end)
 			break;
 
 		cond_resched();
@@ -4778,6 +5327,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
 		ref->bytenr = buf->start;
 		ref->owner = btrfs_header_owner(buf);
 		ref->generation = btrfs_header_generation(buf);
+
 		ret = btrfs_add_leaf_ref(root, ref, 0);
 		WARN_ON(ret);
 		btrfs_free_leaf_ref(root, ref);
@@ -5351,7 +5901,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 			prev_block = block_start;
 		}
 
+		mutex_lock(&extent_root->fs_info->trans_mutex);
 		btrfs_record_root_in_trans(found_root);
+		mutex_unlock(&extent_root->fs_info->trans_mutex);
 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
 			/*
 			 * try to update data extent references while
@@ -5789,6 +6341,7 @@ out:
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
 	struct rb_node *n;
 
 	spin_lock(&info->block_group_cache_lock);
@@ -5810,6 +6363,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		spin_lock(&info->block_group_cache_lock);
 	}
 	spin_unlock(&info->block_group_cache_lock);
+
+	/* now that all the block groups are freed, go through and
+	 * free all the space_info structs.  This is only called during
+	 * the final stages of unmount, and so we know nobody is
+	 * using them.  We call synchronize_rcu() once before we start,
+	 * just to be on the safe side.
+	 */
+	synchronize_rcu();
+
+	while(!list_empty(&info->space_info)) {
+		space_info = list_entry(info->space_info.next,
+					struct btrfs_space_info,
+					list);
+
+		list_del(&space_info->list);
+		kfree(space_info);
+	}
 	return 0;
 }
 
@@ -5957,9 +6527,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
-	btrfs_remove_free_space_cache(block_group);
+	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+	btrfs_remove_free_space_cache(block_group);
 	down_write(&block_group->space_info->groups_sem);
 	list_del(&block_group->list);
 	up_write(&block_group->space_info->groups_sem);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e086d40..ebe6b29 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -9,7 +9,6 @@
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
-#include <linux/version.h>
 #include <linux/writeback.h>
 #include <linux/pagevec.h>
 #include "extent_io.h"
@@ -31,7 +30,7 @@ static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
 #define LEAK_DEBUG 0
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 static DEFINE_SPINLOCK(leak_lock);
 #endif
 
@@ -120,7 +119,7 @@ void extent_io_tree_init(struct extent_io_tree *tree,
 static struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	unsigned long flags;
 #endif
 
@@ -130,7 +129,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 	state->state = 0;
 	state->private = 0;
 	state->tree = NULL;
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&state->leak_list, &states);
 	spin_unlock_irqrestore(&leak_lock, flags);
@@ -145,11 +144,11 @@ static void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 		unsigned long flags;
 #endif
 		WARN_ON(state->tree);
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 		spin_lock_irqsave(&leak_lock, flags);
 		list_del(&state->leak_list);
 		spin_unlock_irqrestore(&leak_lock, flags);
@@ -416,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 
 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 	if (node) {
-		struct extent_state *found;
-		found = rb_entry(node, struct extent_state, rb_node);
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
@@ -2378,11 +2375,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
 	int scanned = 0;
 	int range_whole = 0;
 
-	if (wbc->nonblocking && bdi_write_congested(bdi)) {
-		wbc->encountered_congestion = 1;
-		return 0;
-	}
-
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
@@ -2855,6 +2847,98 @@ out:
 	return sector;
 }
 
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len, get_extent_t *get_extent)
+{
+	int ret;
+	u64 off = start;
+	u64 max = start + len;
+	u32 flags = 0;
+	u64 disko = 0;
+	struct extent_map *em = NULL;
+	int end = 0;
+	u64 em_start = 0, em_len = 0;
+	unsigned long emflags;
+	ret = 0;
+
+	if (len == 0)
+		return -EINVAL;
+
+	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
+		GFP_NOFS);
+	em = get_extent(inode, NULL, 0, off, max - off, 0);
+	if (!em)
+		goto out;
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out;
+	}
+	while (!end) {
+		off = em->start + em->len;
+		if (off >= max)
+			end = 1;
+
+		em_start = em->start;
+		em_len = em->len;
+
+		disko = 0;
+		flags = 0;
+
+		switch (em->block_start) {
+		case EXTENT_MAP_LAST_BYTE:
+			end = 1;
+			flags |= FIEMAP_EXTENT_LAST;
+			break;
+		case EXTENT_MAP_HOLE:
+			flags |= FIEMAP_EXTENT_UNWRITTEN;
+			break;
+		case EXTENT_MAP_INLINE:
+			flags |= (FIEMAP_EXTENT_DATA_INLINE |
+				  FIEMAP_EXTENT_NOT_ALIGNED);
+			break;
+		case EXTENT_MAP_DELALLOC:
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
+			break;
+		default:
+			disko = em->block_start;
+			break;
+		}
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			flags |= FIEMAP_EXTENT_ENCODED;
+
+		emflags = em->flags;
+		free_extent_map(em);
+		em = NULL;
+
+		if (!end) {
+			em = get_extent(inode, NULL, 0, off, max - off, 0);
+			if (!em)
+				goto out;
+			if (IS_ERR(em)) {
+				ret = PTR_ERR(em);
+				goto out;
+			}
+			emflags = em->flags;
+		}
+		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
+			flags |= FIEMAP_EXTENT_LAST;
+			end = 1;
+		}
+
+		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+					em_len, flags);
+		if (ret)
+			goto out_free;
+	}
+out_free:
+	free_extent_map(em);
+out:
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
+			GFP_NOFS);
+	return ret;
+}
+
 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
@@ -2892,15 +2976,17 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	unsigned long flags;
 #endif
 
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	eb->start = start;
 	eb->len = len;
-	mutex_init(&eb->mutex);
-#ifdef LEAK_DEBUG
+	spin_lock_init(&eb->lock);
+	init_waitqueue_head(&eb->lock_wq);
+
+#if LEAK_DEBUG
 	spin_lock_irqsave(&leak_lock, flags);
 	list_add(&eb->leak_list, &buffers);
 	spin_unlock_irqrestore(&leak_lock, flags);
@@ -2912,7 +2998,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 
 static void __free_extent_buffer(struct extent_buffer *eb)
 {
-#ifdef LEAK_DEBUG
+#if LEAK_DEBUG
 	unsigned long flags;
 	spin_lock_irqsave(&leak_lock, flags);
 	list_del(&eb->leak_list);
@@ -2980,8 +3066,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 		unlock_page(p);
 	}
 	if (uptodate)
-		eb->flags |= EXTENT_UPTODATE;
-	eb->flags |= EXTENT_BUFFER_FILLED;
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	spin_lock(&tree->buffer_lock);
 	exists = buffer_tree_insert(tree, start, &eb->rb_node);
@@ -3135,7 +3220,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	unsigned long num_pages;
 
 	num_pages = num_extent_pages(eb->start, eb->len);
-	eb->flags &= ~EXTENT_UPTODATE;
+	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
 			      GFP_NOFS);
@@ -3206,7 +3291,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 	struct page *page;
 	int pg_uptodate = 1;
 
-	if (eb->flags & EXTENT_UPTODATE)
+	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3242,7 +3327,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
 
-	if (eb->flags & EXTENT_UPTODATE)
+	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
@@ -3273,7 +3358,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 	if (all_uptodate) {
 		if (start_i == 0)
-			eb->flags |= EXTENT_UPTODATE;
+			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 		goto unlock_exit;
 	}
 
@@ -3309,7 +3394,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (!ret)
-		eb->flags |= EXTENT_UPTODATE;
+		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	return ret;
 
 unlock_exit:
@@ -3406,7 +3491,6 @@ int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
 		unmap_extent_buffer(eb, eb->map_token, km);
 		eb->map_token = NULL;
 		save = 1;
-		WARN_ON(!mutex_is_locked(&eb->mutex));
 	}
 	err = map_private_extent_buffer(eb, start, min_len, token, map,
 				       map_start, map_len, km);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c5b483a..1f9df88 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -22,6 +22,10 @@
 /* flags for bio submission */
 #define EXTENT_BIO_COMPRESSED 1
 
+/* these are bit numbers for test/set bit */
+#define EXTENT_BUFFER_UPTODATE 0
+#define EXTENT_BUFFER_BLOCKING 1
+
 /*
  * page->private values.  Every page that is controlled by the extent
  * map has page->private set to one.
@@ -95,11 +99,19 @@ struct extent_buffer {
 	unsigned long map_start;
 	unsigned long map_len;
 	struct page *first_page;
+	unsigned long bflags;
 	atomic_t refs;
-	int flags;
 	struct list_head leak_list;
 	struct rb_node rb_node;
-	struct mutex mutex;
+
+	/* the spinlock is used to protect most operations */
+	spinlock_t lock;
+
+	/*
+	 * when we keep the lock held while blocking, waiters go onto
+	 * the wq
+	 */
+	wait_queue_head_t lock_wq;
 };
 
 struct extent_map_tree;
@@ -193,6 +205,8 @@ int extent_commit_write(struct extent_io_tree *tree,
 			unsigned from, unsigned to);
 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
 		get_extent_t *get_extent);
+int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len, get_extent_t *get_extent);
 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 4a83e33..50da69d 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,7 +3,6 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <linux/version.h>
 #include <linux/hardirq.h>
 #include "extent_map.h"
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9026833..dc78954 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -29,7 +29,6 @@
 #include <linux/writeback.h>
 #include <linux/statfs.h>
 #include <linux/compat.h>
-#include <linux/version.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -1092,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);
 
-		ret = btrfs_check_free_space(root, write_bytes, 0);
+		ret = btrfs_check_data_free_space(root, inode, write_bytes);
 		if (ret)
 			goto out;
 
 		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, last_index,
 				    write_bytes);
-		if (ret)
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			goto out;
+		}
 
 		ret = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, buf);
 		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			btrfs_drop_pages(pages, num_pages);
 			goto out;
 		}
@@ -1112,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		ret = dirty_and_release_pages(NULL, root, file, pages,
 					      num_pages, pos, write_bytes);
 		btrfs_drop_pages(pages, num_pages);
-		if (ret)
+		if (ret) {
+			btrfs_free_reserved_data_space(root, inode,
+						       write_bytes);
 			goto out;
+		}
 
 		if (will_write) {
 			btrfs_fdatawrite_range(inode->i_mapping, pos,
@@ -1137,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
+	if (ret)
+		err = ret;
 
 out_nolock:
 	kfree(pages);
@@ -1215,15 +1224,15 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	}
 	mutex_unlock(&root->fs_info->trans_mutex);
 
-	root->fs_info->tree_log_batch++;
+	root->log_batch++;
 	filemap_fdatawrite(inode->i_mapping);
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
-	root->fs_info->tree_log_batch++;
+	root->log_batch++;
 
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-	if (file->private_data)
+	if (file && file->private_data)
 		btrfs_ioctl_trans_end(file);
 
 	trans = btrfs_start_transaction(root, 1);
@@ -1232,7 +1241,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		goto out;
 	}
 
-	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
+	ret = btrfs_log_dentry_safe(trans, root, dentry);
 	if (ret < 0)
 		goto out;
 
@@ -1246,7 +1255,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	 * file again, but that will end up using the synchronization
 	 * inside btrfs_sync_log to keep things safe.
 	 */
-	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+	mutex_unlock(&dentry->d_inode->i_mutex);
 
 	if (ret > 0) {
 		ret = btrfs_commit_transaction(trans, root);
@@ -1254,7 +1263,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 		btrfs_sync_log(trans, root);
 		ret = btrfs_end_transaction(trans, root);
 	}
-	mutex_lock(&file->f_dentry->d_inode->i_mutex);
+	mutex_lock(&dentry->d_inode->i_mutex);
 out:
 	return ret > 0 ? EIO : ret;
 }
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 2aa7987..cc7334d 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
 	search_key.type = 0;
 	search_key.offset = 0;
 
-	btrfs_init_path(path);
 	start_found = 0;
 	ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0);
 	if (ret < 0)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8adfe05..7d4f948 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -34,7 +34,6 @@
 #include <linux/statfs.h>
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
-#include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/posix_acl.h>
 #include <linux/falloc.h>
@@ -51,6 +50,7 @@
 #include "tree-log.h"
 #include "ref-cache.h"
 #include "compression.h"
+#include "locking.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -91,32 +91,14 @@ static noinline int cow_file_range(struct inode *inode,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
 
-/*
- * a very lame attempt at stopping writes when the FS is 85% full.  There
- * are countless ways this is incorrect, but it is better than nothing.
- */
-int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
-			   int for_del)
+static int btrfs_init_inode_security(struct inode *inode,  struct inode *dir)
 {
-	u64 total;
-	u64 used;
-	u64 thresh;
-	int ret = 0;
-
-	spin_lock(&root->fs_info->delalloc_lock);
-	total = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	used = btrfs_super_bytes_used(&root->fs_info->super_copy);
-	if (for_del)
-		thresh = total * 90;
-	else
-		thresh = total * 85;
-
-	do_div(thresh, 100);
+	int err;
 
-	if (used + root->fs_info->delalloc_bytes + num_required > thresh)
-		ret = -ENOSPC;
-	spin_unlock(&root->fs_info->delalloc_lock);
-	return ret;
+	err = btrfs_init_acl(inode, dir);
+	if (!err)
+		err = btrfs_xattr_security_init(inode, dir);
+	return err;
 }
 
 /*
@@ -350,6 +332,19 @@ again:
 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
 	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
 
+	/*
+	 * we don't want to send crud past the end of i_size through
+	 * compression, that's just a waste of CPU time.  So, if the
+	 * end of the file is before the start of our current
+	 * requested range of bytes, we bail out to the uncompressed
+	 * cleanup code that can deal with all of this.
+	 *
+	 * It isn't really the fastest way to fix things, but this is a
+	 * very uncommon corner.
+	 */
+	if (actual_end <= start)
+		goto cleanup_and_bail_uncompressed;
+
 	total_compressed = actual_end - start;
 
 	/* we want to make sure that amount of ram required to uncompress
@@ -494,6 +489,7 @@ again:
 			goto again;
 		}
 	} else {
+cleanup_and_bail_uncompressed:
 		/*
 		 * No compression, but we still need to write the pages in
 		 * the file we've been given so far.  redirty the locked
@@ -1166,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	 */
 	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
 		spin_lock(&root->fs_info->delalloc_lock);
 		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
 		root->fs_info->delalloc_bytes += end - start + 1;
@@ -1199,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
 			       (unsigned long long)end - start + 1,
 			       (unsigned long long)
 			       root->fs_info->delalloc_bytes);
+			btrfs_delalloc_free_space(root, inode, (u64)-1);
 			root->fs_info->delalloc_bytes = 0;
 			BTRFS_I(inode)->delalloc_bytes = 0;
 		} else {
+			btrfs_delalloc_free_space(root, inode,
+						  end - start + 1);
 			root->fs_info->delalloc_bytes -= end - start + 1;
 			BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
 		}
@@ -1324,12 +1324,11 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 			     struct inode *inode, u64 file_offset,
 			     struct list_head *list)
 {
-	struct list_head *cur;
 	struct btrfs_ordered_sum *sum;
 
 	btrfs_set_trans_block_group(trans, inode);
-	list_for_each(cur, list) {
-		sum = list_entry(cur, struct btrfs_ordered_sum, list);
+
+	list_for_each_entry(sum, list, list) {
 		btrfs_csum_file_blocks(trans,
 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
 	}
@@ -2013,6 +2012,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 
 	alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
+
 	BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0,
 						alloc_group_block, 0);
 	btrfs_free_path(path);
@@ -2039,6 +2039,7 @@ void btrfs_read_locked_inode(struct inode *inode)
 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 		break;
 	default:
+		inode->i_op = &btrfs_special_inode_operations;
 		init_special_inode(inode, inode->i_mode, rdev);
 		break;
 	}
@@ -2108,6 +2109,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 		goto failed;
 	}
 
+	btrfs_unlock_up_safe(path, 1);
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				  struct btrfs_inode_item);
@@ -2219,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	root = BTRFS_I(dir)->root;
 
-	ret = btrfs_check_free_space(root, 1, 1);
-	if (ret)
-		goto fail;
-
 	trans = btrfs_start_transaction(root, 1);
 
 	btrfs_set_trans_block_group(trans, dir);
@@ -2235,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	nr = trans->blocks_used;
 
 	btrfs_end_transaction_throttle(trans, root);
-fail:
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -2258,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -ENOTEMPTY;
 	}
 
-	ret = btrfs_check_free_space(root, 1, 1);
-	if (ret)
-		goto fail;
-
 	trans = btrfs_start_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, dir);
 
@@ -2278,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 fail_trans:
 	nr = trans->blocks_used;
 	ret = btrfs_end_transaction_throttle(trans, root);
-fail:
 	btrfs_btree_balance_dirty(root, nr);
 
 	if (ret && !err)
@@ -2429,6 +2421,8 @@ next_node:
 			ref->generation = leaf_gen;
 			ref->nritems = 0;
 
+			btrfs_sort_leaf_ref(ref);
+
 			ret = btrfs_add_leaf_ref(root, ref, 0);
 			WARN_ON(ret);
 			btrfs_free_leaf_ref(root, ref);
@@ -2476,7 +2470,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	u32 found_type;
+	u32 found_type = (u8)-1;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
 	u64 extent_start = 0;
@@ -2503,8 +2497,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	key.offset = (u64)-1;
 	key.type = (u8)-1;
 
-	btrfs_init_path(path);
-
 search_again:
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
@@ -2663,6 +2655,8 @@ next:
 			if (pending_del_nr)
 				goto del_pending;
 			btrfs_release_path(root, path);
+			if (found_type == BTRFS_INODE_ITEM_KEY)
+				break;
 			goto search_again;
 		}
 
@@ -2679,6 +2673,8 @@ del_pending:
 			BUG_ON(ret);
 			pending_del_nr = 0;
 			btrfs_release_path(root, path);
+			if (found_type == BTRFS_INODE_ITEM_KEY)
+				break;
 			goto search_again;
 		}
 	}
@@ -2788,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	if (size <= hole_start)
 		return 0;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		return err;
 
@@ -2984,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->last_trans = 0;
 	bi->logged_trans = 0;
 	bi->delalloc_bytes = 0;
+	bi->reserved_bytes = 0;
 	bi->disk_i_size = 0;
 	bi->flags = 0;
 	bi->index_cnt = (u64)-1;
@@ -3005,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
 	inode->i_ino = args->ino;
 	init_btrfs_i(inode);
 	BTRFS_I(inode)->root = args->root;
+	btrfs_set_inode_space_info(args->root, inode);
 	return 0;
 }
 
@@ -3265,7 +3263,7 @@ skip:
 
 	/* Reached end of directory/root. Bump pos past the last item. */
 	if (key_type == BTRFS_DIR_INDEX_KEY)
-		filp->f_pos = INT_LIMIT(typeof(filp->f_pos));
+		filp->f_pos = INT_LIMIT(off_t);
 	else
 		filp->f_pos++;
 nopos:
@@ -3425,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	BTRFS_I(inode)->index_cnt = 2;
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
+	btrfs_set_inode_space_info(root, inode);
 
 	if (mode & S_IFDIR)
 		owner = 0;
@@ -3458,7 +3457,14 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		root->highest_inode = objectid;
 
 	inode->i_uid = current_fsuid();
-	inode->i_gid = current_fsgid();
+
+	if (dir && (dir->i_mode & S_ISGID)) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current_fsgid();
+
 	inode->i_mode = mode;
 	inode->i_ino = objectid;
 	inode_set_bytes(inode, 0);
@@ -3565,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 
@@ -3586,7 +3592,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -3628,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	u64 objectid;
 	u64 index = 0;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 	trans = btrfs_start_transaction(root, 1);
@@ -3649,7 +3655,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -3696,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		return -ENOENT;
 
 	btrfs_inc_nlink(inode);
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto fail;
 	err = btrfs_set_inode_index(dir, &index);
@@ -3742,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto out_unlock;
 
@@ -3772,7 +3778,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	drop_on_err = 1;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err)
 		goto out_fail;
 
@@ -4158,9 +4164,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 	return -EINVAL;
 }
 
-static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
+static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
 {
-	return extent_bmap(mapping, iblock, btrfs_get_extent);
+	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
 }
 
 int btrfs_readpage(struct file *file, struct page *page)
@@ -4223,7 +4230,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	if (PageWriteback(page) || PageDirty(page))
 		return 0;
-	return __btrfs_releasepage(page, gfp_flags);
+	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
 }
 
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
@@ -4298,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	u64 page_start;
 	u64 page_end;
 
-	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
+	ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;
 
@@ -4311,6 +4318,7 @@ again:
 
 	if ((page->mapping != inode->i_mapping) ||
 	    (page_start >= size)) {
+		btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
@@ -4593,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
 		return -EXDEV;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto out_unlock;
 
@@ -4711,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
-	err = btrfs_check_free_space(root, 1, 0);
+	err = btrfs_check_metadata_free_space(root);
 	if (err)
 		goto out_fail;
 
@@ -4733,7 +4741,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		goto out_unlock;
 
-	err = btrfs_init_acl(inode, dir);
+	err = btrfs_init_inode_security(inode, dir);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock;
@@ -4987,13 +4995,24 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.clear_bit_hook = btrfs_clear_bit_hook,
 };
 
+/*
+ * btrfs doesn't support the bmap operation because swapfiles
+ * use bmap to make a mapping of extents in the file.  They assume
+ * these extents won't change over the life of the file and they
+ * use the bmap result to do IO directly to the drive.
+ *
+ * the btrfs bmap call would return logical addresses that aren't
+ * suitable for IO and they also will change frequently as COW
+ * operations happen.  So, swapfile + btrfs == corruption.
+ *
+ * For now we're avoiding this by dropping bmap.
+ */
 static struct address_space_operations btrfs_aops = {
 	.readpage	= btrfs_readpage,
 	.writepage	= btrfs_writepage,
 	.writepages	= btrfs_writepages,
 	.readpages	= btrfs_readpages,
 	.sync_page	= block_sync_page,
-	.bmap		= btrfs_bmap,
 	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
@@ -5017,6 +5036,7 @@ static struct inode_operations btrfs_file_inode_operations = {
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
 	.fallocate	= btrfs_fallocate,
+	.fiemap		= btrfs_fiemap,
 };
 static struct inode_operations btrfs_special_inode_operations = {
 	.getattr	= btrfs_getattr,
@@ -5032,4 +5052,8 @@ static struct inode_operations btrfs_symlink_inode_operations = {
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
 	.permission	= btrfs_permission,
+	.setxattr	= btrfs_setxattr,
+	.getxattr	= btrfs_getxattr,
+	.listxattr	= btrfs_listxattr,
+	.removexattr	= btrfs_removexattr,
 };
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index c2aa33e..bca729f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -38,7 +38,6 @@
 #include <linux/compat.h>
 #include <linux/bit_spinlock.h>
 #include <linux/security.h>
-#include <linux/version.h>
 #include <linux/xattr.h>
 #include <linux/vmalloc.h>
 #include "compat.h"
@@ -71,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	u64 index = 0;
 	unsigned long nr = 1;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto fail_commit;
 
@@ -204,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	ret = btrfs_check_free_space(root, 1, 0);
+	ret = btrfs_check_metadata_free_space(root);
 	if (ret)
 		goto fail_unlock;
 
@@ -375,7 +374,7 @@ static int btrfs_defrag_file(struct file *file)
 	unsigned long i;
 	int ret;
 
-	ret = btrfs_check_free_space(root, inode->i_size, 0);
+	ret = btrfs_check_data_free_space(root, inode, inode->i_size);
 	if (ret)
 		return -ENOSPC;
 
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 39bae77..47b0a88 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -25,64 +25,203 @@
 #include "extent_io.h"
 #include "locking.h"
 
+static inline void spin_nested(struct extent_buffer *eb)
+{
+	spin_lock(&eb->lock);
+}
+
 /*
- * locks the per buffer mutex in an extent buffer.  This uses adaptive locks
- * and the spin is not tuned very extensively.  The spinning does make a big
- * difference in almost every workload, but spinning for the right amount of
- * time needs some help.
- *
- * In general, we want to spin as long as the lock holder is doing btree
- * searches, and we should give up if they are in more expensive code.
+ * Setting a lock to blocking will drop the spinlock and set the
+ * flag that forces other procs who want the lock to wait.  After
+ * this you can safely schedule with the lock held.
  */
+void btrfs_set_lock_blocking(struct extent_buffer *eb)
+{
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		spin_unlock(&eb->lock);
+	}
+	/* exit with the spin lock released and the bit set */
+}
 
-int btrfs_tree_lock(struct extent_buffer *eb)
+/*
+ * clearing the blocking flag will take the spinlock again.
+ * After this you can't safely schedule
+ */
+void btrfs_clear_lock_blocking(struct extent_buffer *eb)
 {
-	int i;
+	if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+		spin_nested(eb);
+		clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags);
+		smp_mb__after_clear_bit();
+	}
+	/* exit with the spin lock held */
+}
 
-	if (mutex_trylock(&eb->mutex))
-		return 0;
+/*
+ * unfortunately, many of the places that currently set a lock to blocking
+ * don't end up blocking for every long, and often they don't block
+ * at all.  For a dbench 50 run, if we don't spin one the blocking bit
+ * at all, the context switch rate can jump up to 400,000/sec or more.
+ *
+ * So, we're still stuck with this crummy spin on the blocking bit,
+ * at least until the most common causes of the short blocks
+ * can be dealt with.
+ */
+static int btrfs_spin_on_block(struct extent_buffer *eb)
+{
+	int i;
 	for (i = 0; i < 512; i++) {
 		cpu_relax();
-		if (mutex_trylock(&eb->mutex))
-			return 0;
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		if (need_resched())
+			break;
 	}
-	cpu_relax();
-	mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb));
 	return 0;
 }
 
-int btrfs_try_tree_lock(struct extent_buffer *eb)
+/*
+ * This is somewhat different from trylock.  It will take the
+ * spinlock but if it finds the lock is set to blocking, it will
+ * return without the lock held.
+ *
+ * returns 1 if it was able to take the lock and zero otherwise
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_try_spin_lock(struct extent_buffer *eb)
 {
-	return mutex_trylock(&eb->mutex);
+	int i;
+
+	spin_nested(eb);
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		return 1;
+	spin_unlock(&eb->lock);
+
+	/* spin for a bit on the BLOCKING flag */
+	for (i = 0; i < 2; i++) {
+		if (!btrfs_spin_on_block(eb))
+			break;
+
+		spin_nested(eb);
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 1;
+		spin_unlock(&eb->lock);
+	}
+	return 0;
 }
 
-int btrfs_tree_unlock(struct extent_buffer *eb)
+/*
+ * the autoremove wake function will return 0 if it tried to wake up
+ * a process that was already awake, which means that process won't
+ * count as an exclusive wakeup.  The waitq code will continue waking
+ * procs until it finds one that was actually sleeping.
+ *
+ * For btrfs, this isn't quite what we want.  We want a single proc
+ * to be notified that the lock is ready for taking.  If that proc
+ * already happen to be awake, great, it will loop around and try for
+ * the lock.
+ *
+ * So, btrfs_wake_function always returns 1, even when the proc that we
+ * tried to wake up was already awake.
+ */
+static int btrfs_wake_function(wait_queue_t *wait, unsigned mode,
+			       int sync, void *key)
 {
-	mutex_unlock(&eb->mutex);
-	return 0;
+	autoremove_wake_function(wait, mode, sync, key);
+	return 1;
 }
 
-int btrfs_tree_locked(struct extent_buffer *eb)
+/*
+ * returns with the extent buffer spinlocked.
+ *
+ * This will spin and/or wait as required to take the lock, and then
+ * return with the spinlock held.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
+ */
+int btrfs_tree_lock(struct extent_buffer *eb)
 {
-	return mutex_is_locked(&eb->mutex);
+	DEFINE_WAIT(wait);
+	wait.func = btrfs_wake_function;
+
+	while(1) {
+		spin_nested(eb);
+
+		/* nobody is blocking, exit with the spinlock held */
+		if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			return 0;
+
+		/*
+		 * we have the spinlock, but the real owner is blocking.
+		 * wait for them
+		 */
+		spin_unlock(&eb->lock);
+
+		/*
+		 * spin for a bit, and if the blocking flag goes away,
+		 * loop around
+		 */
+		if (btrfs_spin_on_block(eb))
+			continue;
+
+		prepare_to_wait_exclusive(&eb->lock_wq, &wait,
+					  TASK_UNINTERRUPTIBLE);
+
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+			schedule();
+
+		finish_wait(&eb->lock_wq, &wait);
+	}
+	return 0;
 }
 
 /*
- * btrfs_search_slot uses this to decide if it should drop its locks
- * before doing something expensive like allocating free blocks for cow.
+ * Very quick trylock, this does not spin or schedule.  It returns
+ * 1 with the spinlock held if it was able to take the lock, or it
+ * returns zero if it was unable to take the lock.
+ *
+ * After this call, scheduling is not safe without first calling
+ * btrfs_set_lock_blocking()
  */
-int btrfs_path_lock_waiting(struct btrfs_path *path, int level)
+int btrfs_try_tree_lock(struct extent_buffer *eb)
 {
-	int i;
-	struct extent_buffer *eb;
-	for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) {
-		eb = path->nodes[i];
-		if (!eb)
-			break;
-		smp_mb();
-		if (!list_empty(&eb->mutex.wait_list))
-			return 1;
+	if (spin_trylock(&eb->lock)) {
+		if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
+			/*
+			 * we've got the spinlock, but the real owner is
+			 * blocking.  Drop the spinlock and return failure
+			 */
+			spin_unlock(&eb->lock);
+			return 0;
+		}
+		return 1;
 	}
+	/* someone else has the spinlock giveup */
 	return 0;
 }
 
+int btrfs_tree_unlock(struct extent_buffer *eb)
+{
+	/*
+	 * if we were a blocking owner, we don't have the spinlock held
+	 * just clear the bit and look for waiters
+	 */
+	if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		smp_mb__after_clear_bit();
+	else
+		spin_unlock(&eb->lock);
+
+	if (waitqueue_active(&eb->lock_wq))
+		wake_up(&eb->lock_wq);
+	return 0;
+}
+
+void btrfs_assert_tree_locked(struct extent_buffer *eb)
+{
+	if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+		assert_spin_locked(&eb->lock);
+}
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index bc1faef..6c4ce45 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,7 +21,11 @@
 
 int btrfs_tree_lock(struct extent_buffer *eb);
 int btrfs_tree_unlock(struct extent_buffer *eb);
-int btrfs_tree_locked(struct extent_buffer *eb);
+
 int btrfs_try_tree_lock(struct extent_buffer *eb);
-int btrfs_path_lock_waiting(struct btrfs_path *path, int level);
+int btrfs_try_spin_lock(struct extent_buffer *eb);
+
+void btrfs_set_lock_blocking(struct extent_buffer *eb);
+void btrfs_clear_lock_blocking(struct extent_buffer *eb);
+void btrfs_assert_tree_locked(struct extent_buffer *eb);
 #endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a209401..77c2411 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -613,7 +613,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 	struct btrfs_sector_sum *sector_sums;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-	struct list_head *cur;
 	unsigned long num_sectors;
 	unsigned long i;
 	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
@@ -624,8 +623,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
 		return 1;
 
 	mutex_lock(&tree->mutex);
-	list_for_each_prev(cur, &ordered->list) {
-		ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list);
+	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
 		if (disk_bytenr >= ordered_sum->bytenr) {
 			num_sectors = ordered_sum->len / sectorsize;
 			sector_sums = ordered_sum->sums;
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index 6f0acc4..d0cc62b 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/sort.h>
 #include "ctree.h"
 #include "ref-cache.h"
 #include "transaction.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index 16f3183..bc283ad 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -73,5 +73,4 @@ int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
 int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
 			   int shared);
 int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
-
 #endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index db9fb3b..19a4daf 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -37,7 +37,6 @@
 #include <linux/ctype.h>
 #include <linux/namei.h>
 #include <linux/miscdevice.h>
-#include <linux/version.h>
 #include <linux/magic.h>
 #include "compat.h"
 #include "ctree.h"
@@ -380,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	btrfs_start_delalloc_inodes(root);
 	btrfs_wait_ordered_extents(root, 0);
 
-	btrfs_clean_old_snapshots(root);
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	sb->s_dirt = 0;
@@ -512,6 +510,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
 
+	ret = btrfs_parse_options(root, data);
+	if (ret)
+		return -EINVAL;
+
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 
@@ -583,17 +585,18 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
 	struct btrfs_ioctl_vol_args *vol;
 	struct btrfs_fs_devices *fs_devices;
 	int ret = -ENOTTY;
-	int len;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	vol = kmalloc(sizeof(*vol), GFP_KERNEL);
+	if (!vol)
+		return -ENOMEM;
+
 	if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) {
 		ret = -EFAULT;
 		goto out;
 	}
-	len = strnlen(vol->name, BTRFS_PATH_NAME_MAX);
 
 	switch (cmd) {
 	case BTRFS_IOC_SCAN_DEV:
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8a08f94..4112d53 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
 		num_bytes -= btrfs_root_used(&dirty->root->root_item);
 		bytes_used = btrfs_root_used(&root->root_item);
 		if (num_bytes) {
+			mutex_lock(&root->fs_info->trans_mutex);
 			btrfs_record_root_in_trans(root);
+			mutex_unlock(&root->fs_info->trans_mutex);
 			btrfs_set_root_used(&root->root_item,
 					    bytes_used - num_bytes);
 		}
@@ -852,11 +854,9 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_pending_snapshot *pending;
 	struct list_head *head = &trans->transaction->pending_snapshots;
-	struct list_head *cur;
 	int ret;
 
-	list_for_each(cur, head) {
-		pending = list_entry(cur, struct btrfs_pending_snapshot, list);
+	list_for_each_entry(pending, head, list) {
 		ret = create_pending_snapshot(trans, fs_info, pending);
 		BUG_ON(ret);
 	}
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 3e8358c..98d25fa 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -74,6 +74,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		u32 nritems;
 
 		root_node = btrfs_lock_root_node(root);
+		btrfs_set_lock_blocking(root_node);
 		nritems = btrfs_header_nritems(root_node);
 		root->defrag_max.objectid = 0;
 		/* from above we know this is not a leaf */
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d81cda2..9c462fb 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -78,104 +78,6 @@ static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
  */
 
 /*
- * btrfs_add_log_tree adds a new per-subvolume log tree into the
- * tree of log tree roots.  This must be called with a tree log transaction
- * running (see start_log_trans).
- */
-static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root)
-{
-	struct btrfs_key key;
-	struct btrfs_root_item root_item;
-	struct btrfs_inode_item *inode_item;
-	struct extent_buffer *leaf;
-	struct btrfs_root *new_root = root;
-	int ret;
-	u64 objectid = root->root_key.objectid;
-
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID,
-				      trans->transid, 0, 0, 0);
-	if (IS_ERR(leaf)) {
-		ret = PTR_ERR(leaf);
-		return ret;
-	}
-
-	btrfs_set_header_nritems(leaf, 0);
-	btrfs_set_header_level(leaf, 0);
-	btrfs_set_header_bytenr(leaf, leaf->start);
-	btrfs_set_header_generation(leaf, trans->transid);
-	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
-
-	write_extent_buffer(leaf, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(leaf),
-			    BTRFS_FSID_SIZE);
-	btrfs_mark_buffer_dirty(leaf);
-
-	inode_item = &root_item.inode;
-	memset(inode_item, 0, sizeof(*inode_item));
-	inode_item->generation = cpu_to_le64(1);
-	inode_item->size = cpu_to_le64(3);
-	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nbytes = cpu_to_le64(root->leafsize);
-	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
-
-	btrfs_set_root_bytenr(&root_item, leaf->start);
-	btrfs_set_root_generation(&root_item, trans->transid);
-	btrfs_set_root_level(&root_item, 0);
-	btrfs_set_root_refs(&root_item, 0);
-	btrfs_set_root_used(&root_item, 0);
-
-	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
-	root_item.drop_level = 0;
-
-	btrfs_tree_unlock(leaf);
-	free_extent_buffer(leaf);
-	leaf = NULL;
-
-	btrfs_set_root_dirid(&root_item, 0);
-
-	key.objectid = BTRFS_TREE_LOG_OBJECTID;
-	key.offset = objectid;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
-				&root_item);
-	if (ret)
-		goto fail;
-
-	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
-					       &key);
-	BUG_ON(!new_root);
-
-	WARN_ON(root->log_root);
-	root->log_root = new_root;
-
-	/*
-	 * log trees do not get reference counted because they go away
-	 * before a real commit is actually done.  They do store pointers
-	 * to file data extents, and those reference counts still get
-	 * updated (along with back refs to the log tree).
-	 */
-	new_root->ref_cows = 0;
-	new_root->last_trans = trans->transid;
-
-	/*
-	 * we need to make sure the root block for this new tree
-	 * is marked as dirty in the dirty_log_pages tree.  This
-	 * is how it gets flushed down to disk at tree log commit time.
-	 *
-	 * the tree logging mutex keeps others from coming in and changing
-	 * the new_root->node, so we can safely access it here
-	 */
-	set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
-			 new_root->node->start + new_root->node->len - 1,
-			 GFP_NOFS);
-
-fail:
-	return ret;
-}
-
-/*
  * start a sub transaction and setup the log tree
  * this increments the log tree writer count to make the people
  * syncing the tree wait for us to finish
@@ -184,6 +86,14 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root)
 {
 	int ret;
+
+	mutex_lock(&root->log_mutex);
+	if (root->log_root) {
+		root->log_batch++;
+		atomic_inc(&root->log_writers);
+		mutex_unlock(&root->log_mutex);
+		return 0;
+	}
 	mutex_lock(&root->fs_info->tree_log_mutex);
 	if (!root->fs_info->log_root_tree) {
 		ret = btrfs_init_log_root_tree(trans, root->fs_info);
@@ -193,9 +103,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
 		ret = btrfs_add_log_tree(trans, root);
 		BUG_ON(ret);
 	}
-	atomic_inc(&root->fs_info->tree_log_writers);
-	root->fs_info->tree_log_batch++;
 	mutex_unlock(&root->fs_info->tree_log_mutex);
+	root->log_batch++;
+	atomic_inc(&root->log_writers);
+	mutex_unlock(&root->log_mutex);
 	return 0;
 }
 
@@ -212,13 +123,12 @@ static int join_running_log_trans(struct btrfs_root *root)
 	if (!root->log_root)
 		return -ENOENT;
 
-	mutex_lock(&root->fs_info->tree_log_mutex);
+	mutex_lock(&root->log_mutex);
 	if (root->log_root) {
 		ret = 0;
-		atomic_inc(&root->fs_info->tree_log_writers);
-		root->fs_info->tree_log_batch++;
+		atomic_inc(&root->log_writers);
 	}
-	mutex_unlock(&root->fs_info->tree_log_mutex);
+	mutex_unlock(&root->log_mutex);
 	return ret;
 }
 
@@ -228,10 +138,11 @@ static int join_running_log_trans(struct btrfs_root *root)
  */
 static int end_log_trans(struct btrfs_root *root)
 {
-	atomic_dec(&root->fs_info->tree_log_writers);
-	smp_mb();
-	if (waitqueue_active(&root->fs_info->tree_log_wait))
-		wake_up(&root->fs_info->tree_log_wait);
+	if (atomic_dec_and_test(&root->log_writers)) {
+		smp_mb();
+		if (waitqueue_active(&root->log_writer_wait))
+			wake_up(&root->log_writer_wait);
+	}
 	return 0;
 }
 
@@ -1704,6 +1615,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 				btrfs_tree_lock(next);
 				clean_tree_block(trans, root, next);
+				btrfs_set_lock_blocking(next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1750,6 +1662,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		next = path->nodes[*level];
 		btrfs_tree_lock(next);
 		clean_tree_block(trans, root, next);
+		btrfs_set_lock_blocking(next);
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 
@@ -1807,6 +1720,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 
 				btrfs_tree_lock(next);
 				clean_tree_block(trans, root, next);
+				btrfs_set_lock_blocking(next);
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
@@ -1879,6 +1793,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
 			btrfs_tree_lock(next);
 			clean_tree_block(trans, log, next);
+			btrfs_set_lock_blocking(next);
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
@@ -1902,26 +1817,65 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 		}
 	}
 	btrfs_free_path(path);
-	if (wc->free)
-		free_extent_buffer(log->node);
 	return ret;
 }
 
-static int wait_log_commit(struct btrfs_root *log)
+/*
+ * helper function to update the item for a given subvolumes log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *log)
+{
+	int ret;
+
+	if (log->log_transid == 1) {
+		/* insert root item on the first sync */
+		ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	} else {
+		ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	}
+	return ret;
+}
+
+static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
 {
 	DEFINE_WAIT(wait);
-	u64 transid = log->fs_info->tree_log_transid;
+	int index = transid % 2;
 
+	/*
+	 * we only allow two pending log transactions at a time,
+	 * so we know that if ours is more than 2 older than the
+	 * current transaction, we're done
+	 */
 	do {
-		prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
-		mutex_unlock(&log->fs_info->tree_log_mutex);
-		if (atomic_read(&log->fs_info->tree_log_commit))
+		prepare_to_wait(&root->log_commit_wait[index],
+				&wait, TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&root->log_mutex);
+		if (root->log_transid < transid + 2 &&
+		    atomic_read(&root->log_commit[index]))
 			schedule();
-		finish_wait(&log->fs_info->tree_log_wait, &wait);
-		mutex_lock(&log->fs_info->tree_log_mutex);
-	} while (transid == log->fs_info->tree_log_transid &&
-		atomic_read(&log->fs_info->tree_log_commit));
+		finish_wait(&root->log_commit_wait[index], &wait);
+		mutex_lock(&root->log_mutex);
+	} while (root->log_transid < transid + 2 &&
+		 atomic_read(&root->log_commit[index]));
+	return 0;
+}
+
+static int wait_for_writer(struct btrfs_root *root)
+{
+	DEFINE_WAIT(wait);
+	while (atomic_read(&root->log_writers)) {
+		prepare_to_wait(&root->log_writer_wait,
+				&wait, TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&root->log_mutex);
+		if (atomic_read(&root->log_writers))
+			schedule();
+		mutex_lock(&root->log_mutex);
+		finish_wait(&root->log_writer_wait, &wait);
+	}
 	return 0;
 }
 
@@ -1933,57 +1887,114 @@ static int wait_log_commit(struct btrfs_root *log)
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
 		   struct btrfs_root *root)
 {
+	int index1;
+	int index2;
 	int ret;
-	unsigned long batch;
 	struct btrfs_root *log = root->log_root;
+	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
 
-	mutex_lock(&log->fs_info->tree_log_mutex);
-	if (atomic_read(&log->fs_info->tree_log_commit)) {
-		wait_log_commit(log);
-		goto out;
+	mutex_lock(&root->log_mutex);
+	index1 = root->log_transid % 2;
+	if (atomic_read(&root->log_commit[index1])) {
+		wait_log_commit(root, root->log_transid);
+		mutex_unlock(&root->log_mutex);
+		return 0;
 	}
-	atomic_set(&log->fs_info->tree_log_commit, 1);
+	atomic_set(&root->log_commit[index1], 1);
+
+	/* wait for previous tree log sync to complete */
+	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
+		wait_log_commit(root, root->log_transid - 1);
 
 	while (1) {
-		batch = log->fs_info->tree_log_batch;
-		mutex_unlock(&log->fs_info->tree_log_mutex);
+		unsigned long batch = root->log_batch;
+		mutex_unlock(&root->log_mutex);
 		schedule_timeout_uninterruptible(1);
-		mutex_lock(&log->fs_info->tree_log_mutex);
-
-		while (atomic_read(&log->fs_info->tree_log_writers)) {
-			DEFINE_WAIT(wait);
-			prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
-					TASK_UNINTERRUPTIBLE);
-			mutex_unlock(&log->fs_info->tree_log_mutex);
-			if (atomic_read(&log->fs_info->tree_log_writers))
-				schedule();
-			mutex_lock(&log->fs_info->tree_log_mutex);
-			finish_wait(&log->fs_info->tree_log_wait, &wait);
-		}
-		if (batch == log->fs_info->tree_log_batch)
+		mutex_lock(&root->log_mutex);
+		wait_for_writer(root);
+		if (batch == root->log_batch)
 			break;
 	}
 
 	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
-	ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
-			       &root->fs_info->log_root_tree->dirty_log_pages);
+
+	btrfs_set_root_bytenr(&log->root_item, log->node->start);
+	btrfs_set_root_generation(&log->root_item, trans->transid);
+	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+
+	root->log_batch = 0;
+	root->log_transid++;
+	log->log_transid = root->log_transid;
+	smp_mb();
+	/*
+	 * log tree has been flushed to disk, new modifications of
+	 * the log will be written to new positions. so it's safe to
+	 * allow log writers to go in.
+	 */
+	mutex_unlock(&root->log_mutex);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	log_root_tree->log_batch++;
+	atomic_inc(&log_root_tree->log_writers);
+	mutex_unlock(&log_root_tree->log_mutex);
+
+	ret = update_log_root(trans, log);
+	BUG_ON(ret);
+
+	mutex_lock(&log_root_tree->log_mutex);
+	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
+		smp_mb();
+		if (waitqueue_active(&log_root_tree->log_writer_wait))
+			wake_up(&log_root_tree->log_writer_wait);
+	}
+
+	index2 = log_root_tree->log_transid % 2;
+	if (atomic_read(&log_root_tree->log_commit[index2])) {
+		wait_log_commit(log_root_tree, log_root_tree->log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		goto out;
+	}
+	atomic_set(&log_root_tree->log_commit[index2], 1);
+
+	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
+		wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+
+	wait_for_writer(log_root_tree);
+
+	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
+				&log_root_tree->dirty_log_pages);
 	BUG_ON(ret);
 
 	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
-				 log->fs_info->log_root_tree->node->start);
+				log_root_tree->node->start);
 	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
-		       btrfs_header_level(log->fs_info->log_root_tree->node));
+				btrfs_header_level(log_root_tree->node));
+
+	log_root_tree->log_batch = 0;
+	log_root_tree->log_transid++;
+	smp_mb();
+
+	mutex_unlock(&log_root_tree->log_mutex);
+
+	/*
+	 * nobody else is going to jump in and write the the ctree
+	 * super here because the log_commit atomic below is protecting
+	 * us.  We must be called with a transaction handle pinning
+	 * the running transaction open, so a full commit can't hop
+	 * in and cause problems either.
+	 */
+	write_ctree_super(trans, root->fs_info->tree_root, 2);
 
-	write_ctree_super(trans, log->fs_info->tree_root, 2);
-	log->fs_info->tree_log_transid++;
-	log->fs_info->tree_log_batch = 0;
-	atomic_set(&log->fs_info->tree_log_commit, 0);
+	atomic_set(&log_root_tree->log_commit[index2], 0);
 	smp_mb();
-	if (waitqueue_active(&log->fs_info->tree_log_wait))
-		wake_up(&log->fs_info->tree_log_wait);
+	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
+		wake_up(&log_root_tree->log_commit_wait[index2]);
 out:
-	mutex_unlock(&log->fs_info->tree_log_mutex);
+	atomic_set(&root->log_commit[index1], 0);
+	smp_mb();
+	if (waitqueue_active(&root->log_commit_wait[index1]))
+		wake_up(&root->log_commit_wait[index1]);
 	return 0;
 }
 
@@ -2019,38 +2030,18 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 				   start, end, GFP_NOFS);
 	}
 
-	log = root->log_root;
-	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
-			     &log->root_key);
-	BUG_ON(ret);
+	if (log->log_transid > 0) {
+		ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+				     &log->root_key);
+		BUG_ON(ret);
+	}
 	root->log_root = NULL;
-	kfree(root->log_root);
+	free_extent_buffer(log->node);
+	kfree(log);
 	return 0;
 }
 
 /*
- * helper function to update the item for a given subvolumes log root
- * in the tree of log roots
- */
-static int update_log_root(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *log)
-{
-	u64 bytenr = btrfs_root_bytenr(&log->root_item);
-	int ret;
-
-	if (log->node->start == bytenr)
-		return 0;
-
-	btrfs_set_root_bytenr(&log->root_item, log->node->start);
-	btrfs_set_root_generation(&log->root_item, trans->transid);
-	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
-	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
-				&log->root_key, &log->root_item);
-	BUG_ON(ret);
-	return ret;
-}
-
-/*
  * If both a file and directory are logged, and unlinks or renames are
  * mixed in, we have a few interesting corners:
  *
@@ -2711,11 +2702,6 @@ next_slot:
 
 	btrfs_free_path(path);
 	btrfs_free_path(dst_path);
-
-	mutex_lock(&root->fs_info->tree_log_mutex);
-	ret = update_log_root(trans, log);
-	BUG_ON(ret);
-	mutex_unlock(&root->fs_info->tree_log_mutex);
 out:
 	return 0;
 }
@@ -2846,7 +2832,9 @@ again:
 		BUG_ON(!wc.replay_dest);
 
 		wc.replay_dest->log_root = log;
+		mutex_lock(&fs_info->trans_mutex);
 		btrfs_record_root_in_trans(wc.replay_dest);
+		mutex_unlock(&fs_info->trans_mutex);
 		ret = walk_log_tree(trans, log, &wc);
 		BUG_ON(ret);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3451e1c..dd06e18 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -20,7 +20,6 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/random.h>
-#include <linux/version.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -104,10 +103,8 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
 						   u64 devid, u8 *uuid)
 {
 	struct btrfs_device *dev;
-	struct list_head *cur;
 
-	list_for_each(cur, head) {
-		dev = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(dev, head, dev_list) {
 		if (dev->devid == devid &&
 		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
 			return dev;
@@ -118,11 +115,9 @@ static noinline struct btrfs_device *__find_device(struct list_head *head,
 
 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
 {
-	struct list_head *cur;
 	struct btrfs_fs_devices *fs_devices;
 
-	list_for_each(cur, &fs_uuids) {
-		fs_devices = list_entry(cur, struct btrfs_fs_devices, list);
+	list_for_each_entry(fs_devices, &fs_uuids, list) {
 		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 			return fs_devices;
 	}
@@ -159,6 +154,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 loop:
 	spin_lock(&device->io_lock);
 
+loop_lock:
 	/* take all the bios off the list at once and process them
 	 * later on (without the lock held).  But, remember the
 	 * tail and other pointers so the bios can be properly reinserted
@@ -208,7 +204,7 @@ loop:
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi) &&
+		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
 		    fs_info->fs_devices->open_devices > 1) {
 			struct bio *old_head;
 
@@ -220,7 +216,8 @@ loop:
 				tail->bi_next = old_head;
 			else
 				device->pending_bio_tail = tail;
-			device->running_pending = 0;
+
+			device->running_pending = 1;
 
 			spin_unlock(&device->io_lock);
 			btrfs_requeue_work(&device->work);
@@ -229,6 +226,11 @@ loop:
 	}
 	if (again)
 		goto loop;
+
+	spin_lock(&device->io_lock);
+	if (device->pending_bios)
+		goto loop_lock;
+	spin_unlock(&device->io_lock);
 done:
 	return 0;
 }
@@ -345,14 +347,11 @@ error:
 
 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *tmp;
-	struct list_head *cur;
-	struct btrfs_device *device;
+	struct btrfs_device *device, *next;
 
 	mutex_lock(&uuid_mutex);
 again:
-	list_for_each_safe(cur, tmp, &fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->in_fs_metadata)
 			continue;
 
@@ -383,14 +382,12 @@ again:
 
 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *cur;
 	struct btrfs_device *device;
 
 	if (--fs_devices->opened > 0)
 		return 0;
 
-	list_for_each(cur, &fs_devices->devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		if (device->bdev) {
 			close_bdev_exclusive(device->bdev, device->mode);
 			fs_devices->open_devices--;
@@ -439,7 +436,6 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 {
 	struct block_device *bdev;
 	struct list_head *head = &fs_devices->devices;
-	struct list_head *cur;
 	struct btrfs_device *device;
 	struct block_device *latest_bdev = NULL;
 	struct buffer_head *bh;
@@ -450,8 +446,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 	int seeding = 1;
 	int ret = 0;
 
-	list_for_each(cur, head) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, head, dev_list) {
 		if (device->bdev)
 			continue;
 		if (!device->name)
@@ -578,7 +573,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		       *(unsigned long long *)disk_super->fsid,
 		       *(unsigned long long *)(disk_super->fsid + 8));
 	}
-	printk(KERN_INFO "devid %llu transid %llu %s\n",
+	printk(KERN_CONT "devid %llu transid %llu %s\n",
 	       (unsigned long long)devid, (unsigned long long)transid, path);
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 
@@ -1017,14 +1012,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	}
 
 	if (strcmp(device_path, "missing") == 0) {
-		struct list_head *cur;
 		struct list_head *devices;
 		struct btrfs_device *tmp;
 
 		device = NULL;
 		devices = &root->fs_info->fs_devices->devices;
-		list_for_each(cur, devices) {
-			tmp = list_entry(cur, struct btrfs_device, dev_list);
+		list_for_each_entry(tmp, devices, dev_list) {
 			if (tmp->in_fs_metadata && !tmp->bdev) {
 				device = tmp;
 				break;
@@ -1280,7 +1273,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device;
 	struct block_device *bdev;
-	struct list_head *cur;
 	struct list_head *devices;
 	struct super_block *sb = root->fs_info->sb;
 	u64 total_bytes;
@@ -1304,8 +1296,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	mutex_lock(&root->fs_info->volume_mutex);
 
 	devices = &root->fs_info->fs_devices->devices;
-	list_for_each(cur, devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, devices, dev_list) {
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
 			goto error;
@@ -1383,6 +1374,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		ret = btrfs_add_device(trans, root, device);
 	}
 
+	/*
+	 * we've got more storage, clear any full flags on the space
+	 * infos
+	 */
+	btrfs_clear_space_info_full(root->fs_info);
+
 	unlock_chunks(root);
 	btrfs_commit_transaction(trans, root);
 
@@ -1468,6 +1465,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 	device->fs_devices->total_rw_bytes += diff;
 
 	device->total_bytes = new_size;
+	btrfs_clear_space_info_full(device->dev_root->fs_info);
+
 	return btrfs_update_device(trans, device);
 }
 
@@ -1704,7 +1703,6 @@ static u64 div_factor(u64 num, int factor)
 int btrfs_balance(struct btrfs_root *dev_root)
 {
 	int ret;
-	struct list_head *cur;
 	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
 	struct btrfs_device *device;
 	u64 old_size;
@@ -1723,8 +1721,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
 	dev_root = dev_root->fs_info->dev_root;
 
 	/* step one make some room on all the devices */
-	list_for_each(cur, devices) {
-		device = list_entry(cur, struct btrfs_device, dev_list);
+	list_for_each_entry(device, devices, dev_list) {
 		old_size = device->total_bytes;
 		size_to_free = div_factor(old_size, 1);
 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
@@ -2905,10 +2902,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		free_extent_map(em);
 	}
 
-	map = kzalloc(sizeof(*map), GFP_NOFS);
-	if (!map)
-		return -ENOMEM;
-
 	em = alloc_extent_map(GFP_NOFS);
 	if (!em)
 		return -ENOMEM;
@@ -3117,6 +3110,8 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	if (!sb)
 		return -ENOMEM;
 	btrfs_set_buffer_uptodate(sb);
+	btrfs_set_buffer_lockdep_class(sb, 0);
+
 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 7f332e2..a9d3bf4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/rwsem.h>
 #include <linux/xattr.h>
+#include <linux/security.h>
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "transaction.h"
@@ -45,9 +46,12 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 	/* lookup the xattr by name */
 	di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name,
 				strlen(name), 0);
-	if (!di || IS_ERR(di)) {
+	if (!di) {
 		ret = -ENODATA;
 		goto out;
+	} else if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto out;
 	}
 
 	leaf = path->nodes[0];
@@ -62,6 +66,14 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 		ret = -ERANGE;
 		goto out;
 	}
+
+	/*
+	 * The way things are packed into the leaf is like this
+	 * |struct btrfs_dir_item|name|data|
+	 * where name is the xattr name, so security.foo, and data is the
+	 * content of the xattr.  data_ptr points to the location in memory
+	 * where the data starts in the in memory leaf
+	 */
 	data_ptr = (unsigned long)((char *)(di + 1) +
 				   btrfs_dir_name_len(leaf, di));
 	read_extent_buffer(leaf, buffer, data_ptr,
@@ -86,7 +98,7 @@ int __btrfs_setxattr(struct inode *inode, const char *name,
 	if (!path)
 		return -ENOMEM;
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	btrfs_set_trans_block_group(trans, inode);
 
 	/* first lets see if we already have this xattr */
@@ -176,7 +188,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
-	ret = 0;
 	advance = 0;
 	while (1) {
 		leaf = path->nodes[0];
@@ -320,3 +331,34 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
 		return -EOPNOTSUPP;
 	return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
 }
+
+int btrfs_xattr_security_init(struct inode *inode, struct inode *dir)
+{
+	int err;
+	size_t len;
+	void *value;
+	char *suffix;
+	char *name;
+
+	err = security_inode_init_security(inode, dir, &suffix, &value, &len);
+	if (err) {
+		if (err == -EOPNOTSUPP)
+			return 0;
+		return err;
+	}
+
+	name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
+		       GFP_NOFS);
+	if (!name) {
+		err = -ENOMEM;
+	} else {
+		strcpy(name, XATTR_SECURITY_PREFIX);
+		strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
+		err = __btrfs_setxattr(inode, name, value, len, 0);
+		kfree(name);
+	}
+
+	kfree(suffix);
+	kfree(value);
+	return err;
+}
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5b1d08f..c71e9c3 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -36,4 +36,6 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
 		const void *value, size_t size, int flags);
 extern int btrfs_removexattr(struct dentry *dentry, const char *name);
 
+extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir);
+
 #endif /* __XATTR__ */
diff --git a/fs/buffer.c b/fs/buffer.c
index b58208f..a2fd743 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -166,151 +166,6 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 }
 
 /*
- * Write out and wait upon all the dirty data associated with a block
- * device via its mapping.  Does not take the superblock lock.
- */
-int sync_blockdev(struct block_device *bdev)
-{
-	int ret = 0;
-
-	if (bdev)
-		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
-	return ret;
-}
-EXPORT_SYMBOL(sync_blockdev);
-
-/*
- * Write out and wait upon all dirty data associated with this
- * device.   Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_bdev(struct block_device *bdev)
-{
-	struct super_block *sb = get_super(bdev);
-	if (sb) {
-		int res = fsync_super(sb);
-		drop_super(sb);
-		return res;
-	}
-	return sync_blockdev(bdev);
-}
-
-/**
- * freeze_bdev  --  lock a filesystem and force it into a consistent state
- * @bdev:	blockdevice to lock
- *
- * This takes the block device bd_mount_sem to make sure no new mounts
- * happen on bdev until thaw_bdev() is called.
- * If a superblock is found on this device, we take the s_umount semaphore
- * on it to make sure nobody unmounts until the snapshot creation is done.
- * The reference counter (bd_fsfreeze_count) guarantees that only the last
- * unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
- * actually.
- */
-struct super_block *freeze_bdev(struct block_device *bdev)
-{
-	struct super_block *sb;
-	int error = 0;
-
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (bdev->bd_fsfreeze_count > 0) {
-		bdev->bd_fsfreeze_count++;
-		sb = get_super(bdev);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return sb;
-	}
-	bdev->bd_fsfreeze_count++;
-
-	down(&bdev->bd_mount_sem);
-	sb = get_super(bdev);
-	if (sb && !(sb->s_flags & MS_RDONLY)) {
-		sb->s_frozen = SB_FREEZE_WRITE;
-		smp_wmb();
-
-		__fsync_super(sb);
-
-		sb->s_frozen = SB_FREEZE_TRANS;
-		smp_wmb();
-
-		sync_blockdev(sb->s_bdev);
-
-		if (sb->s_op->freeze_fs) {
-			error = sb->s_op->freeze_fs(sb);
-			if (error) {
-				printk(KERN_ERR
-					"VFS:Filesystem freeze failed\n");
-				sb->s_frozen = SB_UNFROZEN;
-				drop_super(sb);
-				up(&bdev->bd_mount_sem);
-				bdev->bd_fsfreeze_count--;
-				mutex_unlock(&bdev->bd_fsfreeze_mutex);
-				return ERR_PTR(error);
-			}
-		}
-	}
-
-	sync_blockdev(bdev);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-
-	return sb;	/* thaw_bdev releases s->s_umount and bd_mount_sem */
-}
-EXPORT_SYMBOL(freeze_bdev);
-
-/**
- * thaw_bdev  -- unlock filesystem
- * @bdev:	blockdevice to unlock
- * @sb:		associated superblock
- *
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
- */
-int thaw_bdev(struct block_device *bdev, struct super_block *sb)
-{
-	int error = 0;
-
-	mutex_lock(&bdev->bd_fsfreeze_mutex);
-	if (!bdev->bd_fsfreeze_count) {
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return -EINVAL;
-	}
-
-	bdev->bd_fsfreeze_count--;
-	if (bdev->bd_fsfreeze_count > 0) {
-		if (sb)
-			drop_super(sb);
-		mutex_unlock(&bdev->bd_fsfreeze_mutex);
-		return 0;
-	}
-
-	if (sb) {
-		BUG_ON(sb->s_bdev != bdev);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			if (sb->s_op->unfreeze_fs) {
-				error = sb->s_op->unfreeze_fs(sb);
-				if (error) {
-					printk(KERN_ERR
-						"VFS:Filesystem thaw failed\n");
-					sb->s_frozen = SB_FREEZE_TRANS;
-					bdev->bd_fsfreeze_count++;
-					mutex_unlock(&bdev->bd_fsfreeze_mutex);
-					return error;
-				}
-			}
-			sb->s_frozen = SB_UNFROZEN;
-			smp_wmb();
-			wake_up(&sb->s_wait_unfrozen);
-		}
-		drop_super(sb);
-	}
-
-	up(&bdev->bd_mount_sem);
-	mutex_unlock(&bdev->bd_fsfreeze_mutex);
-	return 0;
-}
-EXPORT_SYMBOL(thaw_bdev);
-
-/*
  * Various filesystems appear to want __find_get_block to be non-blocking.
  * But it's the page lock which protects the buffers.  To get around this,
  * we get exclusion from try_to_free_buffers with the blockdev mapping's
@@ -760,15 +615,9 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  * If warn is true, then emit a warning if the page is not uptodate and has
  * not been truncated.
  */
-static int __set_page_dirty(struct page *page,
+static void __set_page_dirty(struct page *page,
 		struct address_space *mapping, int warn)
 {
-	if (unlikely(!mapping))
-		return !TestSetPageDirty(page);
-
-	if (TestSetPageDirty(page))
-		return 0;
-
 	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
@@ -777,6 +626,7 @@ static int __set_page_dirty(struct page *page,
 			__inc_zone_page_state(page, NR_FILE_DIRTY);
 			__inc_bdi_stat(mapping->backing_dev_info,
 					BDI_RECLAIMABLE);
+			task_dirty_inc(current);
 			task_io_account_write(PAGE_CACHE_SIZE);
 		}
 		radix_tree_tag_set(&mapping->page_tree,
@@ -784,8 +634,6 @@ static int __set_page_dirty(struct page *page,
 	}
 	spin_unlock_irq(&mapping->tree_lock);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-
-	return 1;
 }
 
 /*
@@ -815,6 +663,7 @@ static int __set_page_dirty(struct page *page,
  */
 int __set_page_dirty_buffers(struct page *page)
 {
+	int newly_dirty;
 	struct address_space *mapping = page_mapping(page);
 
 	if (unlikely(!mapping))
@@ -830,9 +679,12 @@ int __set_page_dirty_buffers(struct page *page)
 			bh = bh->b_this_page;
 		} while (bh != head);
 	}
+	newly_dirty = !TestSetPageDirty(page);
 	spin_unlock(&mapping->private_lock);
 
-	return __set_page_dirty(page, mapping, 1);
+	if (newly_dirty)
+		__set_page_dirty(page, mapping, 1);
+	return newly_dirty;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
 
@@ -1261,8 +1113,11 @@ void mark_buffer_dirty(struct buffer_head *bh)
 			return;
 	}
 
-	if (!test_set_buffer_dirty(bh))
-		__set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
+	if (!test_set_buffer_dirty(bh)) {
+		struct page *page = bh->b_page;
+		if (!TestSetPageDirty(page))
+			__set_page_dirty(page, page_mapping(page), 0);
+	}
 }
 
 /*
@@ -2688,7 +2543,7 @@ int nobh_write_end(struct file *file, struct address_space *mapping,
 	struct buffer_head *bh;
 	BUG_ON(fsdata != NULL && page_has_buffers(page));
 
-	if (unlikely(copied < len) && !page_has_buffers(page))
+	if (unlikely(copied < len) && head)
 		attach_nobh_buffers(page, head);
 	if (page_has_buffers(page))
 		return generic_write_end(file, mapping, pos, len,
@@ -3108,7 +2963,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	if (test_clear_buffer_dirty(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_write_sync;
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE, bh);
 		wait_on_buffer(bh);
 		if (buffer_eopnotsupp(bh)) {
 			clear_buffer_eopnotsupp(bh);
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 73ac7eb..6598400 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,22 @@
+Version 1.57
+------------
+Improve support for multiple security contexts to the same server. We
+used to use the same "vcnumber" for all connections which could cause
+the server to treat subsequent connections, especially those that
+are authenticated as guest, as reconnections, invalidating the earlier
+user's smb session.  This fix allows cifs to mount multiple times to the
+same server with different userids without risking invalidating earlier
+established security contexts.  fsync now sends SMB Flush operation
+to better ensure that we wait for server to write all of the data to
+server disk (not just write it over the network).  Add new mount
+parameter to allow user to disable sending the (slow) SMB flush on
+fsync if desired (fsync still flushes all cached write data to the server).
+Posix file open support added (turned off after one attempt if server
+fails to support it properly, as with Samba server versions prior to 3.3.2)
+Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
+little memory for the "nativeFileSystem" field returned by the server
+during mount). 
+
 Version 1.56
 ------------
 Add "forcemandatorylock" mount option to allow user to use mandatory
@@ -7,7 +26,10 @@ specified and user does not have access to query information about the
 top of the share.  Fix problem in 2.6.28 resolving DFS paths to
 Samba servers (worked to Windows).  Fix rmdir so that pending search
 (readdir) requests do not get invalid results which include the now
-removed directory.
+removed directory.  Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
+when using DFS.  Add better file create support to servers which support
+the CIFS POSIX protocol extensions (this adds support for new flags
+on create, and improves semantics for write of locked ranges).
 
 Version 1.55
 ------------
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 341a989..6994a0f 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -118,6 +118,18 @@ config CIFS_DEBUG2
 	   option can be turned off unless you are debugging
 	   cifs problems.  If unsure, say N.
 
+config CIFS_DFS_UPCALL
+	  bool "DFS feature support"
+	  depends on CIFS && KEYS
+	  help
+	    Distributed File System (DFS) support is used to access shares
+	    transparently in an enterprise name space, even if the share
+	    moves to a different server.  This feature also enables
+	    an upcall mechanism for CIFS which contacts userspace helper
+	    utilities to provide server name resolution (host names to
+	    IP addresses) which is needed for implicit mounts of DFS junction
+	    points. If unsure, say N.
+
 config CIFS_EXPERIMENTAL
 	  bool "CIFS Experimental Features (EXPERIMENTAL)"
 	  depends on CIFS && EXPERIMENTAL
@@ -131,12 +143,3 @@ config CIFS_EXPERIMENTAL
 	    (which is disabled by default). See the file fs/cifs/README
 	    for more details.  If unsure, say N.
 
-config CIFS_DFS_UPCALL
-	  bool "DFS feature support (EXPERIMENTAL)"
-	  depends on CIFS_EXPERIMENTAL
-	  depends on KEYS
-	  help
-	    Enables an upcall mechanism for CIFS which contacts userspace
-	    helper utilities to provide server name resolution (host names to
-	    IP addresses) which is needed for implicit mounts of DFS junction
-	    points. If unsure, say N.
diff --git a/fs/cifs/README b/fs/cifs/README
index da4515e..0743418 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -472,6 +472,19 @@ A partial list of the supported mount options follows:
 		even if the cifs server would support posix advisory locks.
 		"forcemand" is accepted as a shorter form of this mount
 		option.
+ nostrictsync   If this mount option is set, when an application does an
+		fsync call then the cifs client does not send an SMB Flush
+		to the server (to force the server to write all dirty data
+		for this file immediately to disk), although cifs still sends
+		all dirty (cached) file data to the server and waits for the
+		server to respond to the write.  Since SMB Flush can be
+		very slow, and some servers may be reliable enough (to risk
+		delaying slightly flushing the data to disk on the server),
+		turning on this option may be useful to improve performance for
+		applications that fsync too much, at a small risk of server
+		crash.  If this mount option is not set, by default cifs will
+		send an SMB flush request (and wait for a response) on every
+		fsync call.
  nodfs          Disable DFS (global name space support) even if the
 		server claims to support it.  This can help work around
 		a problem with parsing of DFS paths with Samba server
@@ -692,13 +705,14 @@ require this helper. Note that NTLMv2 security (which does not require the
 cifs.upcall helper program), instead of using Kerberos, is sufficient for
 some use cases.
 
-Enabling DFS support (used to access shares transparently in an MS-DFS
-global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled.  In
-addition, DFS support for target shares which are specified as UNC
+DFS support allows transparent redirection to shares in an MS-DFS name space.
+In addition, DFS support for target shares which are specified as UNC
 names which begin with host names (rather than IP addresses) requires
 a user space helper (such as cifs.upcall) to be present in order to
 translate host names to ip address, and the user space helper must also
-be configured in the file /etc/request-key.conf
+be configured in the file /etc/request-key.conf.  Samba, Windows servers and
+many NAS appliances support DFS as a way of constructing a global name
+space to ease network configuration and improve reliability.
 
 To use cifs Kerberos and DFS support, the Linux keyutils package should be
 installed and something like the following lines should be added to the
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 490e34b..877e4d9 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -340,6 +340,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 				seq_printf(m, "\nWrites: %d Bytes: %lld",
 					atomic_read(&tcon->num_writes),
 					(long long)(tcon->bytes_written));
+				seq_printf(m, "\nFlushes: %d",
+					atomic_read(&tcon->num_flushes));
 				seq_printf(m, "\nLocks: %d HardLinks: %d "
 					      "Symlinks: %d",
 					atomic_read(&tcon->num_locks),
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 85c0a74..5fdbf8a 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -104,9 +104,9 @@ static char *cifs_get_share_name(const char *node_name)
 
 
 /**
- * compose_mount_options	-	creates mount options for refferral
+ * cifs_compose_mount_options	-	creates mount options for refferral
  * @sb_mountdata:	parent/root DFS mount options (template)
- * @dentry:		point where we are going to mount
+ * @fullpath:		full path in UNC format
  * @ref:		server's referral
  * @devname:		pointer for saving device name
  *
@@ -116,8 +116,8 @@ static char *cifs_get_share_name(const char *node_name)
  * Returns: pointer to new mount options or ERR_PTR.
  * Caller is responcible for freeing retunrned value if it is not error.
  */
-static char *compose_mount_options(const char *sb_mountdata,
-				   struct dentry *dentry,
+char *cifs_compose_mount_options(const char *sb_mountdata,
+				   const char *fullpath,
 				   const struct dfs_info3_param *ref,
 				   char **devname)
 {
@@ -128,7 +128,6 @@ static char *compose_mount_options(const char *sb_mountdata,
 	char *srvIP = NULL;
 	char sep = ',';
 	int off, noff;
-	char *fullpath;
 
 	if (sb_mountdata == NULL)
 		return ERR_PTR(-EINVAL);
@@ -202,17 +201,6 @@ static char *compose_mount_options(const char *sb_mountdata,
 		goto compose_mount_options_err;
 	}
 
-	/*
-	 * this function gives us a path with a double backslash prefix. We
-	 * require a single backslash for DFS. Temporarily increment fullpath
-	 * to put it in the proper form and decrement before freeing it.
-	 */
-	fullpath = build_path_from_dentry(dentry);
-	if (!fullpath) {
-		rc = -ENOMEM;
-		goto compose_mount_options_err;
-	}
-	++fullpath;
 	tkn_e = strchr(tkn_e + 1, '\\');
 	if (tkn_e || (strlen(fullpath) - ref->path_consumed)) {
 		strncat(mountdata, &sep, 1);
@@ -221,8 +209,6 @@ static char *compose_mount_options(const char *sb_mountdata,
 			strcat(mountdata, tkn_e + 1);
 		strcat(mountdata, fullpath + ref->path_consumed);
 	}
-	--fullpath;
-	kfree(fullpath);
 
 	/*cFYI(1,("%s: parent mountdata: %s", __func__,sb_mountdata));*/
 	/*cFYI(1, ("%s: submount mountdata: %s", __func__, mountdata ));*/
@@ -245,10 +231,20 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
 	struct vfsmount *mnt;
 	char *mountdata;
 	char *devname = NULL;
+	char *fullpath;
 
 	cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-	mountdata = compose_mount_options(cifs_sb->mountdata,
-						dentry, ref, &devname);
+	/*
+	 * this function gives us a path with a double backslash prefix. We
+	 * require a single backslash for DFS.
+	 */
+	fullpath = build_path_from_dentry(dentry);
+	if (!fullpath)
+		return ERR_PTR(-ENOMEM);
+
+	mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
+			fullpath + 1, ref, &devname);
+	kfree(fullpath);
 
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index c4c306f..4797787 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -32,6 +32,7 @@
 #define CIFS_MOUNT_OVERR_GID    0x800 /* override gid returned from server    */
 #define CIFS_MOUNT_DYNPERM      0x1000 /* allow in-memory only mode setting   */
 #define CIFS_MOUNT_NOPOSIXBRL   0x2000 /* mandatory not posix byte range lock */
+#define CIFS_MOUNT_NOSSYNC      0x4000 /* don't do slow SMBflush on every sync*/
 
 struct cifs_sb_info {
 	struct cifsTconInfo *tcon;	/* primary mount */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 13ea532..38491fd 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -606,7 +606,8 @@ cifs_get_sb(struct file_system_type *fs_type,
 		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 }
 
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7ac4818..77e190d 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -78,8 +78,8 @@ extern int cifs_dir_open(struct inode *inode, struct file *file);
 extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
 
 /* Functions related to dir entries */
-extern struct dentry_operations cifs_dentry_ops;
-extern struct dentry_operations cifs_ci_dentry_ops;
+extern const struct dentry_operations cifs_dentry_ops;
+extern const struct dentry_operations cifs_ci_dentry_ops;
 
 /* Functions related to symlinks */
 extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.56"
+#define CIFS_VERSION   "1.57"
 #endif				/* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 94c1ca0..9fbf4df 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -164,9 +164,12 @@ struct TCP_Server_Info {
 	/* multiplexed reads or writes */
 	unsigned int maxBuf;	/* maxBuf specifies the maximum */
 	/* message size the server can send or receive for non-raw SMBs */
-	unsigned int maxRw;	/* maxRw specifies the maximum */
+	unsigned int max_rw;	/* maxRw specifies the maximum */
 	/* message size the server can send or receive for */
 	/* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
+	unsigned int max_vcs;	/* maximum number of smb sessions, at least
+				   those that can be specified uniquely with
+				   vcnumbers */
 	char sessid[4];		/* unique token id for this session */
 	/* (returned on Negotiate */
 	int capabilities; /* allow selective disabling of caps by smb sess */
@@ -210,6 +213,7 @@ struct cifsSesInfo {
 	unsigned overrideSecFlg;  /* if non-zero override global sec flags */
 	__u16 ipc_tid;		/* special tid for connection to IPC share */
 	__u16 flags;
+	__u16 vcnum;
 	char *serverOS;		/* name of operating system underlying server */
 	char *serverNOS;	/* name of network operating system of server */
 	char *serverDomain;	/* security realm of server */
@@ -250,6 +254,7 @@ struct cifsTconInfo {
 	atomic_t num_smbs_sent;
 	atomic_t num_writes;
 	atomic_t num_reads;
+	atomic_t num_flushes;
 	atomic_t num_oplock_brks;
 	atomic_t num_opens;
 	atomic_t num_closes;
@@ -294,6 +299,7 @@ struct cifsTconInfo {
 	bool unix_ext:1;  /* if false disable Linux extensions to CIFS protocol
 				for this mount even if server would support */
 	bool local_lease:1; /* check leases (only) on local system not remote */
+	bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
 	bool need_reconnect:1; /* connection reset, tid now invalid */
 	/* BB add field for back pointer to sb struct(s)? */
 };
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index b4e2e9f0..b370489 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1,7 +1,7 @@
 /*
  *   fs/cifs/cifspdu.h
  *
- *   Copyright (c) International Business Machines  Corp., 2002,2008
+ *   Copyright (c) International Business Machines  Corp., 2002,2009
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -23,6 +23,7 @@
 #define _CIFSPDU_H
 
 #include <net/sock.h>
+#include "smbfsctl.h"
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 #define LANMAN_PROT 0
@@ -34,15 +35,15 @@
 #define POSIX_PROT  (CIFS_PROT+1)
 #define BAD_PROT 0xFFFF
 
-/* SMB command codes */
-/*
- * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+/* SMB command codes:
+ * Note some commands have minimal (wct=0,bcc=0), or uninteresting, responses
  * (ie which include no useful data other than the SMB error code itself).
- * Knowing this helps avoid response buffer allocations and copy in some cases
+ * This can allow us to avoid response buffer allocations and copy in some cases
  */
 #define SMB_COM_CREATE_DIRECTORY      0x00 /* trivial response */
 #define SMB_COM_DELETE_DIRECTORY      0x01 /* trivial response */
 #define SMB_COM_CLOSE                 0x04 /* triv req/rsp, timestamp ignored */
+#define SMB_COM_FLUSH                 0x05 /* triv req/rsp */
 #define SMB_COM_DELETE                0x06 /* trivial response */
 #define SMB_COM_RENAME                0x07 /* trivial response */
 #define SMB_COM_QUERY_INFORMATION     0x08 /* aka getattr */
@@ -790,6 +791,12 @@ typedef struct smb_com_close_rsp {
 	__u16 ByteCount;	/* bct = 0 */
 } __attribute__((packed)) CLOSE_RSP;
 
+typedef struct smb_com_flush_req {
+	struct smb_hdr hdr;	/* wct = 1 */
+	__u16 FileID;
+	__u16 ByteCount;	/* 0 */
+} __attribute__((packed)) FLUSH_REQ;
+
 typedef struct smb_com_findclose_req {
 	struct smb_hdr hdr; /* wct = 1 */
 	__u16 FileID;
@@ -1924,19 +1931,19 @@ typedef struct smb_com_transaction2_get_dfs_refer_req {
 #define DFS_TYPE_ROOT 0x0001
 
 /* Referral Entry Flags */
-#define DFS_NAME_LIST_REF 0x0200
+#define DFS_NAME_LIST_REF 0x0200 /* set for domain or DC referral responses */
+#define DFS_TARGET_SET_BOUNDARY 0x0400 /* only valid with version 4 dfs req */
 
-typedef struct dfs_referral_level_3 {
-	__le16 VersionNumber;
+typedef struct dfs_referral_level_3 { /* version 4 is same, + one flag bit */
+	__le16 VersionNumber;  /* must be 3 or 4 */
 	__le16 Size;
 	__le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */
-	__le16 ReferralEntryFlags; /* 0x0200 bit set only for domain
-				      or DC referral responce */
+	__le16 ReferralEntryFlags;
 	__le32 TimeToLive;
 	__le16 DfsPathOffset;
 	__le16 DfsAlternatePathOffset;
 	__le16 NetworkAddressOffset; /* offset of the link target */
-	__le16 ServiceSiteGuid;
+	__u8   ServiceSiteGuid[16];  /* MBZ, ignored */
 } __attribute__((packed)) REFERRAL3;
 
 typedef struct smb_com_transaction_get_dfs_refer_rsp {
@@ -1946,48 +1953,15 @@ typedef struct smb_com_transaction_get_dfs_refer_rsp {
 	__u8 Pad;
 	__le16 PathConsumed;
 	__le16 NumberOfReferrals;
-	__le16 DFSFlags;
-	__u16 Pad2;
+	__le32 DFSFlags;
 	REFERRAL3 referrals[1];	/* array of level 3 dfs_referral structures */
 	/* followed by the strings pointed to by the referral structures */
 } __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_RSP;
 
 /* DFS Flags */
-#define DFSREF_REFERRAL_SERVER  0x0001
-#define DFSREF_STORAGE_SERVER   0x0002
-
-/* IOCTL information */
-/*
- * List of ioctl function codes that look to be of interest to remote clients
- * like this one.  Need to do some experimentation to make sure they all work
- * remotely.  Some of the following, such as the encryption/compression ones
- * would be invoked from tools via a specialized hook into the VFS rather
- * than via the standard vfs entry points
- */
-#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
-#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
-#define FSCTL_REQUEST_BATCH_OPLOCK   0x00090008
-#define FSCTL_LOCK_VOLUME            0x00090018
-#define FSCTL_UNLOCK_VOLUME          0x0009001C
-#define FSCTL_GET_COMPRESSION        0x0009003C
-#define FSCTL_SET_COMPRESSION        0x0009C040
-#define FSCTL_REQUEST_FILTER_OPLOCK  0x0009008C
-#define FSCTL_FILESYS_GET_STATISTICS 0x00090090
-#define FSCTL_SET_REPARSE_POINT      0x000900A4
-#define FSCTL_GET_REPARSE_POINT      0x000900A8
-#define FSCTL_DELETE_REPARSE_POINT   0x000900AC
-#define FSCTL_SET_SPARSE             0x000900C4
-#define FSCTL_SET_ZERO_DATA          0x000900C8
-#define FSCTL_SET_ENCRYPTION         0x000900D7
-#define FSCTL_ENCRYPTION_FSCTL_IO    0x000900DB
-#define FSCTL_WRITE_RAW_ENCRYPTED    0x000900DF
-#define FSCTL_READ_RAW_ENCRYPTED     0x000900E3
-#define FSCTL_SIS_COPYFILE           0x00090100
-#define FSCTL_SIS_LINK_FILES         0x0009C104
-
-#define IO_REPARSE_TAG_MOUNT_POINT   0xA0000003
-#define IO_REPARSE_TAG_HSM           0xC0000004
-#define IO_REPARSE_TAG_SIS           0x80000007
+#define DFSREF_REFERRAL_SERVER  0x00000001 /* all targets are DFS roots */
+#define DFSREF_STORAGE_SERVER   0x00000002 /* no further ref requests needed */
+#define DFSREF_TARGET_FAILBACK  0x00000004 /* only for DFS referral version 4 */
 
 /*
  ************************************************************************
@@ -2508,8 +2482,6 @@ struct data_blob {
 	6) Use nanosecond timestamps throughout all time fields if
 	   corresponding attribute flag is set
 	7) sendfile - handle based copy
-	8) Direct i/o
-	9) Misc fcntls?
 
 	what about fixing 64 bit alignment
 
@@ -2628,7 +2600,5 @@ typedef struct file_chattr_info {
 	__le64	mode; /* list of actual attribute bits on this inode */
 } __attribute__((packed)) FILE_CHATTR_INFO;  /* ext attributes
 						(chattr, chflags) level 0x206 */
-
-#endif
-
+#endif 				/* POSIX */
 #endif				/* _CIFSPDU_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 382ba62..4167716 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -42,7 +42,11 @@ extern void _FreeXid(unsigned int);
 #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid()));
 #define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));}
 extern char *build_path_from_dentry(struct dentry *);
+extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb);
 extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
+extern char *cifs_compose_mount_options(const char *sb_mountdata,
+		const char *fullpath, const struct dfs_info3_param *ref,
+		char **devname);
 /* extern void renew_parental_timestamps(struct dentry *direntry);*/
 extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *,
 			struct smb_hdr * /* input */ ,
@@ -91,6 +95,12 @@ extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
 
+extern int cifs_posix_open(char *full_path, struct inode **pinode,
+			   struct super_block *sb, int mode, int oflags,
+			   int *poplock, __u16 *pnetfid, int xid);
+extern void posix_fill_in_inode(struct inode *tmp_inode,
+				FILE_UNIX_BASIC_INFO *pData, int isNewInode);
+extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum);
 extern int cifs_get_inode_info(struct inode **pinode,
 			const unsigned char *search_path,
 			FILE_ALL_INFO *pfile_info,
@@ -277,6 +287,9 @@ extern int CIFSPOSIXCreate(const int xid, struct cifsTconInfo *tcon,
 extern int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon,
 			const int smb_file_id);
 
+extern int CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon,
+			const int smb_file_id);
+
 extern int CIFSSMBRead(const int xid, struct cifsTconInfo *tcon,
 			const int netfid, unsigned int count,
 			const __u64 lseek, unsigned int *nbytes, char **buf,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 552642a..bc09c99 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 		server->maxReq = le16_to_cpu(rsp->MaxMpxCount);
 		server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize),
 				(__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
+		server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs);
 		GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey);
 		/* even though we do not use raw we might as well set this
 		accurately, in case we ever find a need for it */
 		if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) {
-			server->maxRw = 0xFF00;
+			server->max_rw = 0xFF00;
 			server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE;
 		} else {
-			server->maxRw = 0;/* we do not need to use raw anyway */
+			server->max_rw = 0;/* do not need to use raw anyway */
 			server->capabilities = CAP_MPX_MODE;
 		}
 		tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone);
@@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 	/* probably no need to store and check maxvcs */
 	server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize),
 			(__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE);
-	server->maxRw = le32_to_cpu(pSMBr->MaxRawSize);
+	server->max_rw = le32_to_cpu(pSMBr->MaxRawSize);
 	cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf));
 	GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey);
 	server->capabilities = le32_to_cpu(pSMBr->Capabilities);
@@ -1933,6 +1934,27 @@ CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
 }
 
 int
+CIFSSMBFlush(const int xid, struct cifsTconInfo *tcon, int smb_file_id)
+{
+	int rc = 0;
+	FLUSH_REQ *pSMB = NULL;
+	cFYI(1, ("In CIFSSMBFlush"));
+
+	rc = small_smb_init(SMB_COM_FLUSH, 1, tcon, (void **) &pSMB);
+	if (rc)
+		return rc;
+
+	pSMB->FileID = (__u16) smb_file_id;
+	pSMB->ByteCount = 0;
+	rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0);
+	cifs_stats_inc(&tcon->num_flushes);
+	if (rc)
+		cERROR(1, ("Send error in Flush = %d", rc));
+
+	return rc;
+}
+
+int
 CIFSSMBRename(const int xid, struct cifsTconInfo *tcon,
 	      const char *fromName, const char *toName,
 	      const struct nls_table *nls_codepage, int remap)
@@ -2355,8 +2377,10 @@ winCreateHardLinkRetry:
 				     PATH_MAX, nls_codepage, remap);
 		name_len++;	/* trailing null */
 		name_len *= 2;
-		pSMB->OldFileName[name_len] = 0;	/* pad */
-		pSMB->OldFileName[name_len + 1] = 0x04;
+
+		/* protocol specifies ASCII buffer format (0x04) for unicode */
+		pSMB->OldFileName[name_len] = 0x04;
+		pSMB->OldFileName[name_len + 1] = 0x00; /* pad */
 		name_len2 =
 		    cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
 				     toName, PATH_MAX, nls_codepage, remap);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2209be9..0de3b561 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -23,7 +23,6 @@
 #include <linux/string.h>
 #include <linux/list.h>
 #include <linux/wait.h>
-#include <linux/ipv6.h>
 #include <linux/pagemap.h>
 #include <linux/ctype.h>
 #include <linux/utsname.h>
@@ -35,6 +34,7 @@
 #include <linux/freezer.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -95,6 +95,7 @@ struct smb_vol {
 	bool local_lease:1; /* check leases only on local system, not remote */
 	bool noblocksnd:1;
 	bool noautotune:1;
+	bool nostrictsync:1; /* do not force expensive SMBflush on every sync */
 	unsigned int rsize;
 	unsigned int wsize;
 	unsigned int sockopt;
@@ -1274,6 +1275,10 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->intr = 0;
 		} else if (strnicmp(data, "intr", 4) == 0) {
 			vol->intr = 1;
+		} else if (strnicmp(data, "nostrictsync", 12) == 0) {
+			vol->nostrictsync = 1;
+		} else if (strnicmp(data, "strictsync", 10) == 0) {
+			vol->nostrictsync = 0;
 		} else if (strnicmp(data, "serverino", 7) == 0) {
 			vol->server_ino = 1;
 		} else if (strnicmp(data, "noserverino", 9) == 0) {
@@ -1379,8 +1384,8 @@ cifs_find_tcp_session(struct sockaddr_storage *addr)
 		     server->addr.sockAddr.sin_addr.s_addr))
 			continue;
 		else if (addr->ss_family == AF_INET6 &&
-			 memcmp(&server->addr.sockAddr6.sin6_addr,
-				&addr6->sin6_addr, sizeof(addr6->sin6_addr)))
+			 !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr,
+					  &addr6->sin6_addr))
 			continue;
 
 		++server->srv_count;
@@ -2160,6 +2165,8 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL;
 	if (pvolume_info->nobrl)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL;
+	if (pvolume_info->nostrictsync)
+		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC;
 	if (pvolume_info->mand_lock)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL;
 	if (pvolume_info->cifs_acl)
@@ -2180,6 +2187,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info,
 			   "mount option supported"));
 }
 
+static int
+is_path_accessible(int xid, struct cifsTconInfo *tcon,
+		   struct cifs_sb_info *cifs_sb, const char *full_path)
+{
+	int rc;
+	__u64 inode_num;
+	FILE_ALL_INFO *pfile_info;
+
+	rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num,
+				   cifs_sb->local_nls,
+				   cifs_sb->mnt_cifs_flags &
+						CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc != -EOPNOTSUPP)
+		return rc;
+
+	pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (pfile_info == NULL)
+		return -ENOMEM;
+
+	rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info,
+			      0 /* not legacy */, cifs_sb->local_nls,
+			      cifs_sb->mnt_cifs_flags &
+				CIFS_MOUNT_MAP_SPECIAL_CHR);
+	kfree(pfile_info);
+	return rc;
+}
+
 int
 cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	   char *mount_data, const char *devname)
@@ -2190,6 +2224,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
 	struct cifsSesInfo *pSesInfo = NULL;
 	struct cifsTconInfo *tcon = NULL;
 	struct TCP_Server_Info *srvTcp = NULL;
+	char   *full_path;
 
 	xid = GetXid();
 
@@ -2426,6 +2461,23 @@ mount_fail_check:
 		cifs_sb->rsize = min(cifs_sb->rsize,
 			       (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE));
 
+	if (!rc && cifs_sb->prepathlen) {
+		/* build_path_to_root works only when we have a valid tcon */
+		full_path = cifs_build_path_to_root(cifs_sb);
+		if (full_path == NULL) {
+			rc = -ENOMEM;
+			goto mount_fail_check;
+		}
+		rc = is_path_accessible(xid, tcon, cifs_sb, full_path);
+		if (rc) {
+			cERROR(1, ("Path %s in not accessible: %d",
+						full_path, rc));
+			kfree(full_path);
+			goto mount_fail_check;
+		}
+		kfree(full_path);
+	}
+
 	/* volume_info->password is freed above when existing session found
 	(in which case it is not needed anymore) but when new sesion is created
 	the password ptr is put in the new session structure (in which case the
@@ -3622,7 +3674,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
 			    BCC(smb_buffer_response)) {
 				kfree(tcon->nativeFileSystem);
 				tcon->nativeFileSystem =
-				    kzalloc(length + 2, GFP_KERNEL);
+				    kzalloc(2*(length + 1), GFP_KERNEL);
 				if (tcon->nativeFileSystem)
 					cifs_strfromUCS_le(
 						tcon->nativeFileSystem,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 964aad0..2f35cccf 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -3,7 +3,7 @@
  *
  *   vfs operations that deal with dentries
  *
- *   Copyright (C) International Business Machines  Corp., 2002,2008
+ *   Copyright (C) International Business Machines  Corp., 2002,2009
  *   Author(s): Steve French (sfrench@us.ibm.com)
  *
  *   This library is free software; you can redistribute it and/or modify
@@ -129,6 +129,80 @@ cifs_bp_rename_retry:
 	return full_path;
 }
 
+int cifs_posix_open(char *full_path, struct inode **pinode,
+		    struct super_block *sb, int mode, int oflags,
+		    int *poplock, __u16 *pnetfid, int xid)
+{
+	int rc;
+	__u32 oplock;
+	FILE_UNIX_BASIC_INFO *presp_data;
+	__u32 posix_flags = 0;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+
+	cFYI(1, ("posix open %s", full_path));
+
+	presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL);
+	if (presp_data == NULL)
+		return -ENOMEM;
+
+/* So far cifs posix extensions can only map the following flags.
+   There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but
+   so far we do not seem to need them, and we can treat them as local only */
+	if ((oflags & (FMODE_READ | FMODE_WRITE)) ==
+		(FMODE_READ | FMODE_WRITE))
+		posix_flags = SMB_O_RDWR;
+	else if (oflags & FMODE_READ)
+		posix_flags = SMB_O_RDONLY;
+	else if (oflags & FMODE_WRITE)
+		posix_flags = SMB_O_WRONLY;
+	if (oflags & O_CREAT)
+		posix_flags |= SMB_O_CREAT;
+	if (oflags & O_EXCL)
+		posix_flags |= SMB_O_EXCL;
+	if (oflags & O_TRUNC)
+		posix_flags |= SMB_O_TRUNC;
+	if (oflags & O_APPEND)
+		posix_flags |= SMB_O_APPEND;
+	if (oflags & O_SYNC)
+		posix_flags |= SMB_O_SYNC;
+	if (oflags & O_DIRECTORY)
+		posix_flags |= SMB_O_DIRECTORY;
+	if (oflags & O_NOFOLLOW)
+		posix_flags |= SMB_O_NOFOLLOW;
+	if (oflags & O_DIRECT)
+		posix_flags |= SMB_O_DIRECT;
+
+
+	rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode,
+			pnetfid, presp_data, &oplock, full_path,
+			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
+					CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc)
+		goto posix_open_ret;
+
+	if (presp_data->Type == cpu_to_le32(-1))
+		goto posix_open_ret; /* open ok, caller does qpathinfo */
+
+	/* get new inode and set it up */
+	if (!pinode)
+		goto posix_open_ret; /* caller does not need info */
+
+	if (*pinode == NULL)
+		*pinode = cifs_new_inode(sb, &presp_data->UniqueId);
+	/* else an inode was passed in. Update its info, don't create one */
+
+	/* We do not need to close the file if new_inode fails since
+	   the caller will retry qpathinfo as long as inode is null */
+	if (*pinode == NULL)
+		goto posix_open_ret;
+
+	posix_fill_in_inode(*pinode, presp_data, 1);
+
+posix_open_ret:
+	kfree(presp_data);
+	return rc;
+}
+
 static void setup_cifs_dentry(struct cifsTconInfo *tcon,
 			      struct dentry *direntry,
 			      struct inode *newinode)
@@ -150,7 +224,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	int xid;
 	int create_options = CREATE_NOT_DIR;
 	int oplock = 0;
-	/* BB below access is too much for the mknod to request */
+	int oflags;
+	/*
+	 * BB below access is probably too much for mknod to request
+	 *    but we have to do query and setpathinfo so requesting
+	 *    less could fail (unless we want to request getatr and setatr
+	 *    permissions (only).  At least for POSIX we do not have to
+	 *    request so much.
+	 */
 	int desiredAccess = GENERIC_READ | GENERIC_WRITE;
 	__u16 fileHandle;
 	struct cifs_sb_info *cifs_sb;
@@ -174,13 +255,43 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	}
 
 	mode &= ~current->fs->umask;
+	if (oplockEnabled)
+		oplock = REQ_OPLOCK;
 
-	if (nd && (nd->flags & LOOKUP_OPEN)) {
-		int oflags = nd->intent.open.flags;
+	if (nd && (nd->flags & LOOKUP_OPEN))
+		oflags = nd->intent.open.flags;
+	else
+		oflags = FMODE_READ;
+
+	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		rc = cifs_posix_open(full_path, &newinode, inode->i_sb,
+				     mode, oflags, &oplock, &fileHandle, xid);
+		/* EIO could indicate that (posix open) operation is not
+		   supported, despite what server claimed in capability
+		   negotation.  EREMOTE indicates DFS junction, which is not
+		   handled in posix open */
+
+		if ((rc == 0) && (newinode == NULL))
+			goto cifs_create_get_file_info; /* query inode info */
+		else if (rc == 0) /* success, no need to query */
+			goto cifs_create_set_dentry;
+		else if ((rc != -EIO) && (rc != -EREMOTE) &&
+			 (rc != -EOPNOTSUPP)) /* path not found or net err */
+			goto cifs_create_out;
+		/* else fallthrough to retry, using older open call, this is
+		   case where server does not support this SMB level, and
+		   falsely claims capability (also get here for DFS case
+		   which should be rare for path not covered on files) */
+	}
 
+	if (nd && (nd->flags & LOOKUP_OPEN)) {
+		/* if the file is going to stay open, then we
+		   need to set the desired access properly */
 		desiredAccess = 0;
 		if (oflags & FMODE_READ)
-			desiredAccess |= GENERIC_READ;
+			desiredAccess |= GENERIC_READ; /* is this too little? */
 		if (oflags & FMODE_WRITE) {
 			desiredAccess |= GENERIC_WRITE;
 			if (!(oflags & FMODE_READ))
@@ -199,8 +310,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 
 	/* BB add processing to set equivalent of mode - e.g. via CreateX with
 	   ACLs */
-	if (oplockEnabled)
-		oplock = REQ_OPLOCK;
 
 	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 	if (buf == NULL) {
@@ -233,116 +342,112 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	}
 	if (rc) {
 		cFYI(1, ("cifs_create returned 0x%x", rc));
-	} else {
-		/* If Open reported that we actually created a file
-		then we now have to set the mode if possible */
-		if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
-			struct cifs_unix_set_info_args args = {
+		goto cifs_create_out;
+	}
+
+	/* If Open reported that we actually created a file
+	   then we now have to set the mode if possible */
+	if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) {
+		struct cifs_unix_set_info_args args = {
 				.mode	= mode,
 				.ctime	= NO_CHANGE_64,
 				.atime	= NO_CHANGE_64,
 				.mtime	= NO_CHANGE_64,
 				.device	= 0,
-			};
+		};
 
-			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
-				args.uid = (__u64) current_fsuid();
-				if (inode->i_mode & S_ISGID)
-					args.gid = (__u64) inode->i_gid;
-				else
-					args.gid = (__u64) current_fsgid();
-			} else {
-				args.uid = NO_CHANGE_64;
-				args.gid = NO_CHANGE_64;
-			}
-			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
-				cifs_sb->local_nls,
-				cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
+			args.uid = (__u64) current_fsuid();
+			if (inode->i_mode & S_ISGID)
+				args.gid = (__u64) inode->i_gid;
+			else
+				args.gid = (__u64) current_fsgid();
 		} else {
-			/* BB implement mode setting via Windows security
-			   descriptors e.g. */
-			/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
-
-			/* Could set r/o dos attribute if mode & 0222 == 0 */
+			args.uid = NO_CHANGE_64;
+			args.gid = NO_CHANGE_64;
 		}
+		CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
+			cifs_sb->local_nls,
+			cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	} else {
+		/* BB implement mode setting via Windows security
+		   descriptors e.g. */
+		/* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/
 
-		/* server might mask mode so we have to query for it */
-		if (tcon->unix_ext)
-			rc = cifs_get_inode_info_unix(&newinode, full_path,
-						 inode->i_sb, xid);
-		else {
-			rc = cifs_get_inode_info(&newinode, full_path,
-						 buf, inode->i_sb, xid,
-						 &fileHandle);
-			if (newinode) {
-				if (cifs_sb->mnt_cifs_flags &
-				    CIFS_MOUNT_DYNPERM)
-					newinode->i_mode = mode;
-				if ((oplock & CIFS_CREATE_ACTION) &&
-				    (cifs_sb->mnt_cifs_flags &
-				     CIFS_MOUNT_SET_UID)) {
-					newinode->i_uid = current_fsuid();
-					if (inode->i_mode & S_ISGID)
-						newinode->i_gid =
-							inode->i_gid;
-					else
-						newinode->i_gid =
-							current_fsgid();
-				}
+		/* Could set r/o dos attribute if mode & 0222 == 0 */
+	}
+
+cifs_create_get_file_info:
+	/* server might mask mode so we have to query for it */
+	if (tcon->unix_ext)
+		rc = cifs_get_inode_info_unix(&newinode, full_path,
+					      inode->i_sb, xid);
+	else {
+		rc = cifs_get_inode_info(&newinode, full_path, buf,
+					 inode->i_sb, xid, &fileHandle);
+		if (newinode) {
+			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+				newinode->i_mode = mode;
+			if ((oplock & CIFS_CREATE_ACTION) &&
+			    (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
+				newinode->i_uid = current_fsuid();
+				if (inode->i_mode & S_ISGID)
+					newinode->i_gid = inode->i_gid;
+				else
+					newinode->i_gid = current_fsgid();
 			}
 		}
+	}
 
-		if (rc != 0) {
-			cFYI(1, ("Create worked, get_inode_info failed rc = %d",
-				 rc));
-		} else
-			setup_cifs_dentry(tcon, direntry, newinode);
-
-		if ((nd == NULL /* nfsd case - nfs srv does not set nd */) ||
-			(!(nd->flags & LOOKUP_OPEN))) {
-			/* mknod case - do not leave file open */
-			CIFSSMBClose(xid, tcon, fileHandle);
-		} else if (newinode) {
-			struct cifsFileInfo *pCifsFile =
-			   kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
-
-			if (pCifsFile == NULL)
-				goto cifs_create_out;
-			pCifsFile->netfid = fileHandle;
-			pCifsFile->pid = current->tgid;
-			pCifsFile->pInode = newinode;
-			pCifsFile->invalidHandle = false;
-			pCifsFile->closePend     = false;
-			init_MUTEX(&pCifsFile->fh_sem);
-			mutex_init(&pCifsFile->lock_mutex);
-			INIT_LIST_HEAD(&pCifsFile->llist);
-			atomic_set(&pCifsFile->wrtPending, 0);
-
-			/* set the following in open now
+cifs_create_set_dentry:
+	if (rc == 0)
+		setup_cifs_dentry(tcon, direntry, newinode);
+	else
+		cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc));
+
+	/* nfsd case - nfs srv does not set nd */
+	if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) {
+		/* mknod case - do not leave file open */
+		CIFSSMBClose(xid, tcon, fileHandle);
+	} else if (newinode) {
+		struct cifsFileInfo *pCifsFile =
+			kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+
+		if (pCifsFile == NULL)
+			goto cifs_create_out;
+		pCifsFile->netfid = fileHandle;
+		pCifsFile->pid = current->tgid;
+		pCifsFile->pInode = newinode;
+		pCifsFile->invalidHandle = false;
+		pCifsFile->closePend     = false;
+		init_MUTEX(&pCifsFile->fh_sem);
+		mutex_init(&pCifsFile->lock_mutex);
+		INIT_LIST_HEAD(&pCifsFile->llist);
+		atomic_set(&pCifsFile->wrtPending, 0);
+
+		/* set the following in open now
 				pCifsFile->pfile = file; */
-			write_lock(&GlobalSMBSeslock);
-			list_add(&pCifsFile->tlist, &tcon->openFileList);
-			pCifsInode = CIFS_I(newinode);
-			if (pCifsInode) {
-				/* if readable file instance put first in list*/
-				if (write_only) {
-					list_add_tail(&pCifsFile->flist,
-						&pCifsInode->openFileList);
-				} else {
-					list_add(&pCifsFile->flist,
-						&pCifsInode->openFileList);
-				}
-				if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
-					pCifsInode->clientCanCacheAll = true;
-					pCifsInode->clientCanCacheRead = true;
-					cFYI(1, ("Exclusive Oplock inode %p",
-						newinode));
-				} else if ((oplock & 0xF) == OPLOCK_READ)
-					pCifsInode->clientCanCacheRead = true;
+		write_lock(&GlobalSMBSeslock);
+		list_add(&pCifsFile->tlist, &tcon->openFileList);
+		pCifsInode = CIFS_I(newinode);
+		if (pCifsInode) {
+			/* if readable file instance put first in list*/
+			if (write_only) {
+				list_add_tail(&pCifsFile->flist,
+					      &pCifsInode->openFileList);
+			} else {
+				list_add(&pCifsFile->flist,
+					 &pCifsInode->openFileList);
 			}
-			write_unlock(&GlobalSMBSeslock);
+			if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+				pCifsInode->clientCanCacheAll = true;
+				pCifsInode->clientCanCacheRead = true;
+				cFYI(1, ("Exclusive Oplock inode %p",
+					newinode));
+			} else if ((oplock & 0xF) == OPLOCK_READ)
+				pCifsInode->clientCanCacheRead = true;
 		}
+		write_unlock(&GlobalSMBSeslock);
 	}
 cifs_create_out:
 	kfree(buf);
@@ -596,7 +701,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
 	return rc;
 }     */
 
-struct dentry_operations cifs_dentry_ops = {
+const struct dentry_operations cifs_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
 /* d_delete:       cifs_d_delete,      */ /* not needed except for debugging */
 };
@@ -634,7 +739,7 @@ static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
 	return 1;
 }
 
-struct dentry_operations cifs_ci_dentry_ops = {
+const struct dentry_operations cifs_ci_dentry_ops = {
 	.d_revalidate = cifs_d_revalidate,
 	.d_hash = cifs_ci_hash,
 	.d_compare = cifs_ci_compare,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 12bb656..81747ac 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -78,8 +78,36 @@ static inline int cifs_convert_flags(unsigned int flags)
 	return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES |
 		FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA |
 		FILE_READ_DATA);
+}
 
+static inline fmode_t cifs_posix_convert_flags(unsigned int flags)
+{
+	fmode_t posix_flags = 0;
 
+	if ((flags & O_ACCMODE) == O_RDONLY)
+		posix_flags = FMODE_READ;
+	else if ((flags & O_ACCMODE) == O_WRONLY)
+		posix_flags = FMODE_WRITE;
+	else if ((flags & O_ACCMODE) == O_RDWR) {
+		/* GENERIC_ALL is too much permission to request
+		   can cause unnecessary access denied on create */
+		/* return GENERIC_ALL; */
+		posix_flags = FMODE_READ | FMODE_WRITE;
+	}
+	/* can not map O_CREAT or O_EXCL or O_TRUNC flags when
+	   reopening a file.  They had their effect on the original open */
+	if (flags & O_APPEND)
+		posix_flags |= (fmode_t)O_APPEND;
+	if (flags & O_SYNC)
+		posix_flags |= (fmode_t)O_SYNC;
+	if (flags & O_DIRECTORY)
+		posix_flags |= (fmode_t)O_DIRECTORY;
+	if (flags & O_NOFOLLOW)
+		posix_flags |= (fmode_t)O_NOFOLLOW;
+	if (flags & O_DIRECT)
+		posix_flags |= (fmode_t)O_DIRECT;
+
+	return posix_flags;
 }
 
 static inline int cifs_get_disposition(unsigned int flags)
@@ -97,6 +125,80 @@ static inline int cifs_get_disposition(unsigned int flags)
 }
 
 /* all arguments to this function must be checked for validity in caller */
+static inline int cifs_posix_open_inode_helper(struct inode *inode,
+			struct file *file, struct cifsInodeInfo *pCifsInode,
+			struct cifsFileInfo *pCifsFile, int oplock, u16 netfid)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+/*	struct timespec temp; */   /* BB REMOVEME BB */
+
+	file->private_data = kmalloc(sizeof(struct cifsFileInfo), GFP_KERNEL);
+	if (file->private_data == NULL)
+		return -ENOMEM;
+	pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
+	write_lock(&GlobalSMBSeslock);
+	list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList);
+
+	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
+	if (pCifsInode == NULL) {
+		write_unlock(&GlobalSMBSeslock);
+		return -EINVAL;
+	}
+
+	/* want handles we can use to read with first
+	   in the list so we do not have to walk the
+	   list to search for one in write_begin */
+	if ((file->f_flags & O_ACCMODE) == O_WRONLY) {
+		list_add_tail(&pCifsFile->flist,
+			      &pCifsInode->openFileList);
+	} else {
+		list_add(&pCifsFile->flist,
+			 &pCifsInode->openFileList);
+	}
+
+	if (pCifsInode->clientCanCacheRead) {
+		/* we have the inode open somewhere else
+		   no need to discard cache data */
+		goto psx_client_can_cache;
+	}
+
+	/* BB FIXME need to fix this check to move it earlier into posix_open
+	   BB  fIX following section BB FIXME */
+
+	/* if not oplocked, invalidate inode pages if mtime or file
+	   size changed */
+/*	temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
+			   (file->f_path.dentry->d_inode->i_size ==
+			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
+		cFYI(1, ("inode unchanged on server"));
+	} else {
+		if (file->f_path.dentry->d_inode->i_mapping) {
+			rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
+			if (rc != 0)
+				CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
+		}
+		cFYI(1, ("invalidating remote inode since open detected it "
+			 "changed"));
+		invalidate_remote_inode(file->f_path.dentry->d_inode);
+	} */
+
+psx_client_can_cache:
+	if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) {
+		pCifsInode->clientCanCacheAll = true;
+		pCifsInode->clientCanCacheRead = true;
+		cFYI(1, ("Exclusive Oplock granted on inode %p",
+			 file->f_path.dentry->d_inode));
+	} else if ((oplock & 0xF) == OPLOCK_READ)
+		pCifsInode->clientCanCacheRead = true;
+
+	/* will have to change the unlock if we reenable the
+	   filemap_fdatawrite (which does not seem necessary */
+	write_unlock(&GlobalSMBSeslock);
+	return 0;
+}
+
+/* all arguments to this function must be checked for validity in caller */
 static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	struct cifsInodeInfo *pCifsInode, struct cifsFileInfo *pCifsFile,
 	struct cifsTconInfo *pTcon, int *oplock, FILE_ALL_INFO *buf,
@@ -167,7 +269,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	int rc = -EACCES;
 	int xid, oplock;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 	struct list_head *tmp;
@@ -180,7 +282,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	xid = GetXid();
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 
 	if (file->f_flags & O_CREAT) {
 		/* search inode for this file and fill in file->private_data */
@@ -220,6 +322,45 @@ int cifs_open(struct inode *inode, struct file *file)
 
 	cFYI(1, ("inode = 0x%p file flags are 0x%x for %s",
 		 inode, file->f_flags, full_path));
+
+	if (oplockEnabled)
+		oplock = REQ_OPLOCK;
+	else
+		oplock = 0;
+
+	if (!tcon->broken_posix_open && tcon->unix_ext &&
+	    (tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		/* can not refresh inode info since size could be stale */
+		rc = cifs_posix_open(full_path, &inode, inode->i_sb,
+				     cifs_sb->mnt_file_mode /* ignored */,
+				     oflags, &oplock, &netfid, xid);
+		if (rc == 0) {
+			cFYI(1, ("posix open succeeded"));
+			/* no need for special case handling of setting mode
+			   on read only files needed here */
+
+			cifs_posix_open_inode_helper(inode, file, pCifsInode,
+						     pCifsFile, oplock, netfid);
+			goto out;
+		} else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
+			if (tcon->ses->serverNOS)
+				cERROR(1, ("server %s of type %s returned"
+					   " unexpected error on SMB posix open"
+					   ", disabling posix open support."
+					   " Check if server update available.",
+					   tcon->ses->serverName,
+					   tcon->ses->serverNOS));
+			tcon->broken_posix_open = true;
+		} else if ((rc != -EIO) && (rc != -EREMOTE) &&
+			 (rc != -EOPNOTSUPP)) /* path not found or net err */
+			goto out;
+		/* else fallthrough to retry open the old way on network i/o
+		   or DFS errors */
+	}
+
 	desiredAccess = cifs_convert_flags(file->f_flags);
 
 /*********************************************************************
@@ -248,11 +389,6 @@ int cifs_open(struct inode *inode, struct file *file)
 
 	disposition = cifs_get_disposition(file->f_flags);
 
-	if (oplockEnabled)
-		oplock = REQ_OPLOCK;
-	else
-		oplock = 0;
-
 	/* BB pass O_SYNC flag through on file attributes .. BB */
 
 	/* Also refresh inode by passing in file_info buf returned by SMBOpen
@@ -269,7 +405,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	}
 
 	if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS)
-		rc = CIFSSMBOpen(xid, pTcon, full_path, disposition,
+		rc = CIFSSMBOpen(xid, tcon, full_path, disposition,
 			 desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
 				 & CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -278,7 +414,7 @@ int cifs_open(struct inode *inode, struct file *file)
 
 	if (rc == -EIO) {
 		/* Old server, try legacy style OpenX */
-		rc = SMBLegacyOpen(xid, pTcon, full_path, disposition,
+		rc = SMBLegacyOpen(xid, tcon, full_path, disposition,
 			desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf,
 			cifs_sb->local_nls, cifs_sb->mnt_cifs_flags
 				& CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -295,12 +431,12 @@ int cifs_open(struct inode *inode, struct file *file)
 	}
 	pCifsFile = cifs_init_private(file->private_data, inode, file, netfid);
 	write_lock(&GlobalSMBSeslock);
-	list_add(&pCifsFile->tlist, &pTcon->openFileList);
+	list_add(&pCifsFile->tlist, &tcon->openFileList);
 
 	pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
 	if (pCifsInode) {
 		rc = cifs_open_inode_helper(inode, file, pCifsInode,
-					    pCifsFile, pTcon,
+					    pCifsFile, tcon,
 					    &oplock, buf, full_path, xid);
 	} else {
 		write_unlock(&GlobalSMBSeslock);
@@ -309,7 +445,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	if (oplock & CIFS_CREATE_ACTION) {
 		/* time to set mode which we can not set earlier due to
 		   problems creating new read-only files */
-		if (pTcon->unix_ext) {
+		if (tcon->unix_ext) {
 			struct cifs_unix_set_info_args args = {
 				.mode	= inode->i_mode,
 				.uid	= NO_CHANGE_64,
@@ -319,7 +455,7 @@ int cifs_open(struct inode *inode, struct file *file)
 				.mtime	= NO_CHANGE_64,
 				.device	= 0,
 			};
-			CIFSSMBUnixSetInfo(xid, pTcon, full_path, &args,
+			CIFSSMBUnixSetInfo(xid, tcon, full_path, &args,
 					    cifs_sb->local_nls,
 					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -349,7 +485,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	int rc = -EACCES;
 	int xid, oplock;
 	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct cifsTconInfo *tcon;
 	struct cifsFileInfo *pCifsFile;
 	struct cifsInodeInfo *pCifsInode;
 	struct inode *inode;
@@ -387,7 +523,7 @@ static int cifs_reopen_file(struct file *file, bool can_flush)
 	}
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	pTcon = cifs_sb->tcon;
+	tcon = cifs_sb->tcon;
 
 /* can not grab rename sem here because various ops, including
    those that already have the rename sem can end up causing writepage
@@ -404,20 +540,37 @@ reopen_error_exit:
 
 	cFYI(1, ("inode = 0x%p file flags 0x%x for %s",
 		 inode, file->f_flags, full_path));
-	desiredAccess = cifs_convert_flags(file->f_flags);
 
 	if (oplockEnabled)
 		oplock = REQ_OPLOCK;
 	else
 		oplock = 0;
 
+	if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) &&
+	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
+			le64_to_cpu(tcon->fsUnixInfo.Capability))) {
+		int oflags = (int) cifs_posix_convert_flags(file->f_flags);
+		/* can not refresh inode info since size could be stale */
+		rc = cifs_posix_open(full_path, NULL, inode->i_sb,
+				     cifs_sb->mnt_file_mode /* ignored */,
+				     oflags, &oplock, &netfid, xid);
+		if (rc == 0) {
+			cFYI(1, ("posix reopen succeeded"));
+			goto reopen_success;
+		}
+		/* fallthrough to retry open the old way on errors, especially
+		   in the reconnect path it is important to retry hard */
+	}
+
+	desiredAccess = cifs_convert_flags(file->f_flags);
+
 	/* Can not refresh inode by passing in file_info buf to be returned
 	   by SMBOpen and then calling get_inode_info with returned buf
 	   since file might have write behind data that needs to be flushed
 	   and server version of file size can be stale. If we knew for sure
 	   that inode was not dirty locally we could do this */
 
-	rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess,
+	rc = CIFSSMBOpen(xid, tcon, full_path, disposition, desiredAccess,
 			 CREATE_NOT_DIR, &netfid, &oplock, NULL,
 			 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
 				CIFS_MOUNT_MAP_SPECIAL_CHR);
@@ -426,6 +579,7 @@ reopen_error_exit:
 		cFYI(1, ("cifs_open returned 0x%x", rc));
 		cFYI(1, ("oplock: %d", oplock));
 	} else {
+reopen_success:
 		pCifsFile->netfid = netfid;
 		pCifsFile->invalidHandle = false;
 		up(&pCifsFile->fh_sem);
@@ -439,7 +593,7 @@ reopen_error_exit:
 			   go to server to get inode info */
 				pCifsInode->clientCanCacheAll = false;
 				pCifsInode->clientCanCacheRead = false;
-				if (pTcon->unix_ext)
+				if (tcon->unix_ext)
 					rc = cifs_get_inode_info_unix(&inode,
 						full_path, inode->i_sb, xid);
 				else
@@ -467,7 +621,6 @@ reopen_error_exit:
 			cifs_relock_file(pCifsFile);
 		}
 	}
-
 	kfree(full_path);
 	FreeXid(xid);
 	return rc;
@@ -1523,6 +1676,9 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 {
 	int xid;
 	int rc = 0;
+	struct cifsTconInfo *tcon;
+	struct cifsFileInfo *smbfile =
+		(struct cifsFileInfo *)file->private_data;
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	xid = GetXid();
@@ -1534,7 +1690,12 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
 	if (rc == 0) {
 		rc = CIFS_I(inode)->write_behind_rc;
 		CIFS_I(inode)->write_behind_rc = 0;
+		tcon = CIFS_SB(inode->i_sb)->tcon;
+		if (!rc && tcon && smbfile &&
+		   !(CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC))
+			rc = CIFSSMBFlush(xid, tcon, smbfile->netfid);
 	}
+
 	FreeXid(xid);
 	return rc;
 }
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index bcf7b51..a8797cc 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
 	pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
 }
 
+/**
+ * cifs_new inode - create new inode, initialize, and hash it
+ * @sb - pointer to superblock
+ * @inum - if valid pointer and serverino is enabled, replace i_ino with val
+ *
+ * Create a new inode, initialize it for CIFS and hash it. Returns the new
+ * inode or NULL if one couldn't be allocated.
+ *
+ * If the share isn't mounted with "serverino" or inum is a NULL pointer then
+ * we'll just use the inode number assigned by new_inode(). Note that this can
+ * mean i_ino collisions since the i_ino assigned by new_inode is not
+ * guaranteed to be unique.
+ */
+struct inode *
+cifs_new_inode(struct super_block *sb, __u64 *inum)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (inode == NULL)
+		return NULL;
+
+	/*
+	 * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we
+	 *     stop passing inum as ptr. Are there sanity checks we can use to
+	 *     ensure that the server is really filling in that field? Also,
+	 *     if serverino is disabled, perhaps we should be using iunique()?
+	 */
+	if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
+		inode->i_ino = (unsigned long) *inum;
+
+	/*
+	 * must set this here instead of cifs_alloc_inode since VFS will
+	 * clobber i_flags
+	 */
+	if (sb->s_flags & MS_NOATIME)
+		inode->i_flags |= S_NOATIME | S_NOCMTIME;
+
+	insert_inode_hash(inode);
+
+	return inode;
+}
+
 int cifs_get_inode_info_unix(struct inode **pinode,
 	const unsigned char *full_path, struct super_block *sb, int xid)
 {
@@ -233,22 +276,11 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	/* get new inode */
 	if (*pinode == NULL) {
-		*pinode = new_inode(sb);
+		*pinode = cifs_new_inode(sb, &find_data.UniqueId);
 		if (*pinode == NULL) {
 			rc = -ENOMEM;
 			goto cgiiu_exit;
 		}
-		/* Is an i_ino of zero legal? */
-		/* note ino incremented to unique num in new_inode */
-		/* Are there sanity checks we can use to ensure that
-		   the server is really filling in that field? */
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-			(*pinode)->i_ino = (unsigned long)find_data.UniqueId;
-
-		if (sb->s_flags & MS_NOATIME)
-			(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-
-		insert_inode_hash(*pinode);
 	}
 
 	inode = *pinode;
@@ -465,11 +497,9 @@ int cifs_get_inode_info(struct inode **pinode,
 
 	/* get new inode */
 	if (*pinode == NULL) {
-		*pinode = new_inode(sb);
-		if (*pinode == NULL) {
-			rc = -ENOMEM;
-			goto cgii_exit;
-		}
+		__u64 inode_num;
+		__u64 *pinum = &inode_num;
+
 		/* Is an i_ino of zero legal? Can we use that to check
 		   if the server supports returning inode numbers?  Are
 		   there other sanity checks we can use to ensure that
@@ -486,22 +516,26 @@ int cifs_get_inode_info(struct inode **pinode,
 
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
 			int rc1 = 0;
-			__u64 inode_num;
 
 			rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
-					full_path, &inode_num,
+					full_path, pinum,
 					cifs_sb->local_nls,
 					cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
 			if (rc1) {
 				cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+				pinum = NULL;
 				/* BB EOPNOSUPP disable SERVER_INUM? */
-			} else /* do we need cast or hash to ino? */
-				(*pinode)->i_ino = inode_num;
-		} /* else ino incremented to unique num in new_inode*/
-		if (sb->s_flags & MS_NOATIME)
-			(*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
-		insert_inode_hash(*pinode);
+			}
+		} else {
+			pinum = NULL;
+		}
+
+		*pinode = cifs_new_inode(sb, pinum);
+		if (*pinode == NULL) {
+			rc = -ENOMEM;
+			goto cgii_exit;
+		}
 	}
 	inode = *pinode;
 	cifsInfo = CIFS_I(inode);
@@ -621,7 +655,7 @@ static const struct inode_operations cifs_ipc_inode_ops = {
 	.lookup = cifs_lookup,
 };
 
-static char *build_path_to_root(struct cifs_sb_info *cifs_sb)
+char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 {
 	int pplen = cifs_sb->prepathlen;
 	int dfsplen;
@@ -678,7 +712,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
 		return inode;
 
 	cifs_sb = CIFS_SB(inode->i_sb);
-	full_path = build_path_to_root(cifs_sb);
+	full_path = cifs_build_path_to_root(cifs_sb);
 	if (full_path == NULL)
 		return ERR_PTR(-ENOMEM);
 
@@ -729,6 +763,9 @@ cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid,
 	struct cifsTconInfo *pTcon = cifs_sb->tcon;
 	FILE_BASIC_INFO	info_buf;
 
+	if (attrs == NULL)
+		return -EINVAL;
+
 	if (attrs->ia_valid & ATTR_ATIME) {
 		set_time = true;
 		info_buf.LastAccessTime =
@@ -1017,7 +1054,7 @@ out_reval:
 	return rc;
 }
 
-static void posix_fill_in_inode(struct inode *tmp_inode,
+void posix_fill_in_inode(struct inode *tmp_inode,
 	FILE_UNIX_BASIC_INFO *pData, int isNewInode)
 {
 	struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode);
@@ -1114,24 +1151,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode)
 			else
 				direntry->d_op = &cifs_dentry_ops;
 
-			newinode = new_inode(inode->i_sb);
+			newinode = cifs_new_inode(inode->i_sb,
+						  &pInfo->UniqueId);
 			if (newinode == NULL) {
 				kfree(pInfo);
 				goto mkdir_get_info;
 			}
 
-			/* Is an i_ino of zero legal? */
-			/* Are there sanity checks we can use to ensure that
-			   the server is really filling in that field? */
-			if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
-				newinode->i_ino =
-					(unsigned long)pInfo->UniqueId;
-			} /* note ino incremented to unique num in new_inode */
-			if (inode->i_sb->s_flags & MS_NOATIME)
-				newinode->i_flags |= S_NOATIME | S_NOCMTIME;
 			newinode->i_nlink = 2;
-
-			insert_inode_hash(newinode);
 			d_instantiate(direntry, newinode);
 
 			/* we already checked in POSIXCreate whether
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 9f51f9b..c2c01ff 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 }
 #endif /* DEBUG2 */
 
-/* Returns one if new inode created (which therefore needs to be hashed) */
+/* Returns 1 if new inode created, 2 if both dentry and inode were */
 /* Might check in the future if inode number changed so we can rehash inode */
-static int construct_dentry(struct qstr *qstring, struct file *file,
-	struct inode **ptmp_inode, struct dentry **pnew_dentry)
+static int
+construct_dentry(struct qstr *qstring, struct file *file,
+		 struct inode **ptmp_inode, struct dentry **pnew_dentry,
+		 __u64 *inum)
 {
-	struct dentry *tmp_dentry;
-	struct cifs_sb_info *cifs_sb;
-	struct cifsTconInfo *pTcon;
+	struct dentry *tmp_dentry = NULL;
+	struct super_block *sb = file->f_path.dentry->d_sb;
 	int rc = 0;
 
 	cFYI(1, ("For %s", qstring->name));
-	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
-	pTcon = cifs_sb->tcon;
 
 	qstring->hash = full_name_hash(qstring->name, qstring->len);
 	tmp_dentry = d_lookup(file->f_path.dentry, qstring);
 	if (tmp_dentry) {
+		/* BB: overwrite old name? i.e. tmp_dentry->d_name and
+		 * tmp_dentry->d_name.len??
+		 */
 		cFYI(0, ("existing dentry with inode 0x%p",
 			 tmp_dentry->d_inode));
 		*ptmp_inode = tmp_dentry->d_inode;
-/* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/
 		if (*ptmp_inode == NULL) {
-			*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
+			*ptmp_inode = cifs_new_inode(sb, inum);
 			if (*ptmp_inode == NULL)
 				return rc;
 			rc = 1;
 		}
-		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 	} else {
 		tmp_dentry = d_alloc(file->f_path.dentry, qstring);
 		if (tmp_dentry == NULL) {
@@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file,
 			return rc;
 		}
 
-		*ptmp_inode = new_inode(file->f_path.dentry->d_sb);
-		if (pTcon->nocase)
+		if (CIFS_SB(sb)->tcon->nocase)
 			tmp_dentry->d_op = &cifs_ci_dentry_ops;
 		else
 			tmp_dentry->d_op = &cifs_dentry_ops;
+
+		*ptmp_inode = cifs_new_inode(sb, inum);
 		if (*ptmp_inode == NULL)
 			return rc;
-		if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME)
-			(*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME;
 		rc = 2;
 	}
 
@@ -822,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
 /* inode num, inode type and filename returned */
 static int cifs_get_name_from_search_buf(struct qstr *pqst,
 	char *current_entry, __u16 level, unsigned int unicode,
-	struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum)
+	struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum)
 {
 	int rc = 0;
 	unsigned int len = 0;
@@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst,
 			len = strnlen(filename, PATH_MAX);
 		}
 
-		/* BB fixme - hash low and high 32 bits if not 64 bit arch BB */
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
-			*pinum = pFindData->UniqueId;
+		*pinum = pFindData->UniqueId;
 	} else if (level == SMB_FIND_FILE_DIRECTORY_INFO) {
 		FILE_DIRECTORY_INFO *pFindData =
 			(FILE_DIRECTORY_INFO *)current_entry;
@@ -907,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	struct qstr qstring;
 	struct cifsFileInfo *pCifsF;
 	unsigned int obj_type;
-	ino_t  inum;
+	__u64  inum;
 	struct cifs_sb_info *cifs_sb;
 	struct inode *tmp_inode;
 	struct dentry *tmp_dentry;
@@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file,
 	if (rc)
 		return rc;
 
-	rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry);
+	/* only these two infolevels return valid inode numbers */
+	if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX ||
+	    pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO)
+		rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+					&inum);
+	else
+		rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry,
+					NULL);
+
 	if ((tmp_inode == NULL) || (tmp_dentry == NULL))
 		return -ENOMEM;
 
-	if (rc) {
-		/* inode created, we need to hash it with right inode number */
-		if (inum != 0) {
-			/* BB fixme - hash the 2 32 quantities bits together if
-			 *  necessary BB */
-			tmp_inode->i_ino = inum;
-		}
-		insert_inode_hash(tmp_inode);
-	}
-
 	/* we pass in rc below, indicating whether it is a new inode,
 	   so we can figure out whether to invalidate the inode cached
 	   data if the file has changed */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 5f22de7..5c68b42 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -34,15 +34,99 @@
 extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8,
 			 unsigned char *p24);
 
+/* Checks if this is the first smb session to be reconnected after
+   the socket has been reestablished (so we know whether to use vc 0).
+   Called while holding the cifs_tcp_ses_lock, so do not block */
+static bool is_first_ses_reconnect(struct cifsSesInfo *ses)
+{
+	struct list_head *tmp;
+	struct cifsSesInfo *tmp_ses;
+
+	list_for_each(tmp, &ses->server->smb_ses_list) {
+		tmp_ses = list_entry(tmp, struct cifsSesInfo,
+				     smb_ses_list);
+		if (tmp_ses->need_reconnect == false)
+			return false;
+	}
+	/* could not find a session that was already connected,
+	   this must be the first one we are reconnecting */
+	return true;
+}
+
+/*
+ *	vc number 0 is treated specially by some servers, and should be the
+ *      first one we request.  After that we can use vcnumbers up to maxvcs,
+ *	one for each smb session (some Windows versions set maxvcs incorrectly
+ *	so maxvc=1 can be ignored).  If we have too many vcs, we can reuse
+ *	any vc but zero (some servers reset the connection on vcnum zero)
+ *
+ */
+static __le16 get_next_vcnum(struct cifsSesInfo *ses)
+{
+	__u16 vcnum = 0;
+	struct list_head *tmp;
+	struct cifsSesInfo *tmp_ses;
+	__u16 max_vcs = ses->server->max_vcs;
+	__u16 i;
+	int free_vc_found = 0;
+
+	/* Quoting the MS-SMB specification: "Windows-based SMB servers set this
+	field to one but do not enforce this limit, which allows an SMB client
+	to establish more virtual circuits than allowed by this value ... but
+	other server implementations can enforce this limit." */
+	if (max_vcs < 2)
+		max_vcs = 0xFFFF;
+
+	write_lock(&cifs_tcp_ses_lock);
+	if ((ses->need_reconnect) && is_first_ses_reconnect(ses))
+			goto get_vc_num_exit;  /* vcnum will be zero */
+	for (i = ses->server->srv_count - 1; i < max_vcs; i++) {
+		if (i == 0) /* this is the only connection, use vc 0 */
+			break;
+
+		free_vc_found = 1;
+
+		list_for_each(tmp, &ses->server->smb_ses_list) {
+			tmp_ses = list_entry(tmp, struct cifsSesInfo,
+					     smb_ses_list);
+			if (tmp_ses->vcnum == i) {
+				free_vc_found = 0;
+				break; /* found duplicate, try next vcnum */
+			}
+		}
+		if (free_vc_found)
+			break; /* we found a vcnumber that will work - use it */
+	}
+
+	if (i == 0)
+		vcnum = 0; /* for most common case, ie if one smb session, use
+			      vc zero.  Also for case when no free vcnum, zero
+			      is safest to send (some clients only send zero) */
+	else if (free_vc_found == 0)
+		vcnum = 1;  /* we can not reuse vc=0 safely, since some servers
+				reset all uids on that, but 1 is ok. */
+	else
+		vcnum = i;
+	ses->vcnum = vcnum;
+get_vc_num_exit:
+	write_unlock(&cifs_tcp_ses_lock);
+
+	return le16_to_cpu(vcnum);
+}
+
 static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 {
 	__u32 capabilities = 0;
 
 	/* init fields common to all four types of SessSetup */
-	/* note that header is initialized to zero in header_assemble */
+	/* Note that offsets for first seven fields in req struct are same  */
+	/*	in CIFS Specs so does not matter which of 3 forms of struct */
+	/*	that we use in next few lines                               */
+	/* Note that header is initialized to zero in header_assemble */
 	pSMB->req.AndXCommand = 0xFF;
 	pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf);
 	pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq);
+	pSMB->req.VcNumber = get_next_vcnum(ses);
 
 	/* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */
 
@@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB)
 	if (ses->capabilities & CAP_UNIX)
 		capabilities |= CAP_UNIX;
 
-	/* BB check whether to init vcnum BB */
 	return capabilities;
 }
 
@@ -228,7 +311,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
 
 	kfree(ses->serverOS);
 	/* UTF-8 string will not grow more than four times as big as UCS-16 */
-	ses->serverOS = kzalloc(4 * len, GFP_KERNEL);
+	ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
 	if (ses->serverOS != NULL)
 		cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp);
 	data += 2 * (len + 1);
@@ -241,7 +324,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft,
 		return rc;
 
 	kfree(ses->serverNOS);
-	ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */
+	ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL);
 	if (ses->serverNOS != NULL) {
 		cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len,
 				   nls_cp);
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
new file mode 100644
index 0000000..7056b89
--- /dev/null
+++ b/fs/cifs/smbfsctl.h
@@ -0,0 +1,84 @@
+/*
+ *   fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions
+ *
+ *   Copyright (c) International Business Machines  Corp., 2002,2009
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *
+ *   This library is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation; either version 2.1 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This library is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ *   the GNU Lesser General Public License for more details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License
+ *   along with this library; if not, write to the Free Software
+ *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* IOCTL information */
+/*
+ * List of ioctl/fsctl function codes that are or could be useful in the
+ * future to remote clients like cifs or SMB2 client.  There is probably
+ * a slightly larger set of fsctls that NTFS local filesystem could handle,
+ * including the seven below that we do not have struct definitions for.
+ * Even with protocol definitions for most of these now available, we still
+ * need to do some experimentation to identify which are practical to do
+ * remotely.  Some of the following, such as the encryption/compression ones
+ * could be invoked from tools via a specialized hook into the VFS rather
+ * than via the standard vfs entry points
+ */
+#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000
+#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004
+#define FSCTL_REQUEST_BATCH_OPLOCK   0x00090008
+#define FSCTL_LOCK_VOLUME            0x00090018
+#define FSCTL_UNLOCK_VOLUME          0x0009001C
+#define FSCTL_IS_PATHNAME_VALID      0x0009002C /* BB add struct */
+#define FSCTL_GET_COMPRESSION        0x0009003C /* BB add struct */
+#define FSCTL_SET_COMPRESSION        0x0009C040 /* BB add struct */
+#define FSCTL_QUERY_FAT_BPB          0x00090058 /* BB add struct */
+/* Verify the next FSCTL number, we had it as 0x00090090 before */
+#define FSCTL_FILESYSTEM_GET_STATS   0x00090060 /* BB add struct */
+#define FSCTL_GET_NTFS_VOLUME_DATA   0x00090064 /* BB add struct */
+#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */
+#define FSCTL_IS_VOLUME_DIRTY        0x00090078 /* BB add struct */
+#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */
+#define FSCTL_REQUEST_FILTER_OPLOCK  0x0009008C
+#define FSCTL_FIND_FILES_BY_SID      0x0009008F /* BB add struct */
+#define FSCTL_SET_OBJECT_ID          0x00090098 /* BB add struct */
+#define FSCTL_GET_OBJECT_ID          0x0009009C /* BB add struct */
+#define FSCTL_DELETE_OBJECT_ID       0x000900A0 /* BB add struct */
+#define FSCTL_SET_REPARSE_POINT      0x000900A4 /* BB add struct */
+#define FSCTL_GET_REPARSE_POINT      0x000900A8 /* BB add struct */
+#define FSCTL_DELETE_REPARSE_POINT   0x000900AC /* BB add struct */
+#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
+#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
+#define FSCTL_SET_SPARSE             0x000900C4 /* BB add struct */
+#define FSCTL_SET_ZERO_DATA          0x000900C8 /* BB add struct */
+#define FSCTL_SET_ENCRYPTION         0x000900D7 /* BB add struct */
+#define FSCTL_ENCRYPTION_FSCTL_IO    0x000900DB /* BB add struct */
+#define FSCTL_WRITE_RAW_ENCRYPTED    0x000900DF /* BB add struct */
+#define FSCTL_READ_RAW_ENCRYPTED     0x000900E3 /* BB add struct */
+#define FSCTL_READ_FILE_USN_DATA     0x000900EB /* BB add struct */
+#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */
+#define FSCTL_SIS_COPYFILE           0x00090100 /* BB add struct */
+#define FSCTL_RECALL_FILE            0x00090117 /* BB add struct */
+#define FSCTL_QUERY_SPARING_INFO     0x00090138 /* BB add struct */
+#define FSCTL_SET_ZERO_ON_DEALLOC    0x00090194 /* BB add struct */
+#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */
+#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */
+#define FSCTL_SET_DEFECT_MANAGEMENT  0x00098134 /* BB add struct */
+#define FSCTL_SIS_LINK_FILES         0x0009C104
+#define FSCTL_PIPE_PEEK              0x0011400C /* BB add struct */
+#define FSCTL_PIPE_TRANSCEIVE        0x0011C017 /* BB add struct */
+/* strange that the number for this op is not sequential with previous op */
+#define FSCTL_PIPE_WAIT              0x00110018 /* BB add struct */
+#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
+#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
+
+#define IO_REPARSE_TAG_MOUNT_POINT   0xA0000003
+#define IO_REPARSE_TAG_HSM           0xC0000004
+#define IO_REPARSE_TAG_SIS           0x80000007
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 75b1fa9..4bb9d0a 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -59,7 +59,7 @@ static int coda_return_EIO(void)
 }
 #define CODA_EIO_ERROR ((void *) (coda_return_EIO))
 
-static struct dentry_operations coda_dentry_operations =
+static const struct dentry_operations coda_dentry_operations =
 {
 	.d_revalidate	= coda_dentry_revalidate,
 	.d_delete	= coda_dentry_delete,
diff --git a/fs/compat.c b/fs/compat.c
index 65a070e..55efdfe 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -378,6 +378,34 @@ out:
 	return error;
 }
 
+/*
+ * This is a copy of sys_ustat, just dealing with a structure layout.
+ * Given how simple this syscall is that apporach is more maintainable
+ * than the various conversion hacks.
+ */
+asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
+{
+	struct super_block *sb;
+	struct compat_ustat tmp;
+	struct kstatfs sbuf;
+	int err;
+
+	sb = user_get_super(new_decode_dev(dev));
+	if (!sb)
+		return -EINVAL;
+	err = vfs_statfs(sb->s_root, &sbuf);
+	drop_super(sb);
+	if (err)
+		return err;
+
+	memset(&tmp, 0, sizeof(struct compat_ustat));
+	tmp.f_tfree = sbuf.f_bfree;
+	tmp.f_tinode = sbuf.f_ffree;
+	if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
+		return -EFAULT;
+	return 0;
+}
+
 static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
 {
 	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
@@ -1392,16 +1420,22 @@ int compat_do_execve(char * filename,
 {
 	struct linux_binprm *bprm;
 	struct file *file;
+	struct files_struct *displaced;
 	int retval;
 
+	retval = unshare_files(&displaced);
+	if (retval)
+		goto out_ret;
+
 	retval = -ENOMEM;
 	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
 	if (!bprm)
-		goto out_ret;
+		goto out_files;
 
 	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
 	if (retval < 0)
 		goto out_free;
+	current->in_execve = 1;
 
 	retval = -ENOMEM;
 	bprm->cred = prepare_exec_creds();
@@ -1454,9 +1488,12 @@ int compat_do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
+	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
+	if (displaced)
+		put_files_struct(displaced);
 	return retval;
 
 out:
@@ -1470,11 +1507,15 @@ out_file:
 	}
 
 out_unlock:
+	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 
 out_free:
 	free_bprm(bprm);
 
+out_files:
+	if (displaced)
+		reset_files_struct(displaced);
 out_ret:
 	return retval;
 }
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c8f8d59..ff78668 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -522,6 +522,11 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg)
 		if (err)
 			return -EFAULT;
 		break;
+	case SIOCSHWTSTAMP:
+		if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
+			return -EFAULT;
+		ifr.ifr_data = compat_ptr(uifr32->ifr_ifru.ifru_data);
+		break;
 	default:
 		if (copy_from_user(&ifr, uifr32, sizeof(*uifr32)))
 			return -EFAULT;
@@ -785,7 +790,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg)
 
 	if (copy_in_user(&sgio->status, &sgio32->status,
 			 (4 * sizeof(unsigned char)) +
-			 (2 * sizeof(unsigned (short))) +
+			 (2 * sizeof(unsigned short)) +
 			 (3 * sizeof(int))))
 		return -EFAULT;
 
@@ -1913,6 +1918,9 @@ COMPATIBLE_IOCTL(FIONREAD)  /* This is also TIOCINQ */
 /* 0x00 */
 COMPATIBLE_IOCTL(FIBMAP)
 COMPATIBLE_IOCTL(FIGETBSZ)
+/* 'X' - originally XFS but some now in the VFS */
+COMPATIBLE_IOCTL(FIFREEZE)
+COMPATIBLE_IOCTL(FITHAW)
 /* RAID */
 COMPATIBLE_IOCTL(RAID_VERSION)
 COMPATIBLE_IOCTL(GET_ARRAY_INFO)
@@ -1938,6 +1946,8 @@ ULONG_IOCTL(SET_BITMAP_FILE)
 /* Big K */
 COMPATIBLE_IOCTL(PIO_FONT)
 COMPATIBLE_IOCTL(GIO_FONT)
+COMPATIBLE_IOCTL(PIO_CMAP)
+COMPATIBLE_IOCTL(GIO_CMAP)
 ULONG_IOCTL(KDSIGACCEPT)
 COMPATIBLE_IOCTL(KDGETKEYCODE)
 COMPATIBLE_IOCTL(KDSETKEYCODE)
@@ -1988,6 +1998,8 @@ COMPATIBLE_IOCTL(TUNSETGROUP)
 COMPATIBLE_IOCTL(TUNGETFEATURES)
 COMPATIBLE_IOCTL(TUNSETOFFLOAD)
 COMPATIBLE_IOCTL(TUNSETTXFILTER)
+COMPATIBLE_IOCTL(TUNGETSNDBUF)
+COMPATIBLE_IOCTL(TUNSETSNDBUF)
 /* Big V */
 COMPATIBLE_IOCTL(VT_SETMODE)
 COMPATIBLE_IOCTL(VT_GETMODE)
@@ -2561,6 +2573,7 @@ HANDLE_IOCTL(SIOCSIFMAP, dev_ifsioc)
 HANDLE_IOCTL(SIOCGIFADDR, dev_ifsioc)
 HANDLE_IOCTL(SIOCSIFADDR, dev_ifsioc)
 HANDLE_IOCTL(SIOCSIFHWBROADCAST, dev_ifsioc)
+HANDLE_IOCTL(SIOCSHWTSTAMP, dev_ifsioc)
 
 /* ioctls used by appletalk ddp.c */
 HANDLE_IOCTL(SIOCATALKDIFADDR, dev_ifsioc)
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8e93341..05373db 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -72,7 +72,7 @@ static int configfs_d_delete(struct dentry *dentry)
 	return 1;
 }
 
-static struct dentry_operations configfs_dentry_ops = {
+static const struct dentry_operations configfs_dentry_ops = {
 	.d_iput		= configfs_d_iput,
 	/* simple_delete_dentry() isn't exported */
 	.d_delete	= configfs_d_delete,
diff --git a/fs/dcache.c b/fs/dcache.c
index 937df0f..90bbd7e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1180,7 +1180,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	iput(inode);
 	return res;
 }
-EXPORT_SYMBOL_GPL(d_obtain_alias);
+EXPORT_SYMBOL(d_obtain_alias);
 
 /**
  * d_splice_alias - splice a disconnected dentry into the tree if one exists
@@ -1247,15 +1247,18 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 	struct dentry *found;
 	struct dentry *new;
 
-	/* Does a dentry matching the name exist already? */
+	/*
+	 * First check if a dentry matching the name already exists,
+	 * if not go ahead and create it now.
+	 */
 	found = d_hash_and_lookup(dentry->d_parent, name);
-	/* If not, create it now and return */
 	if (!found) {
 		new = d_alloc(dentry->d_parent, name);
 		if (!new) {
 			error = -ENOMEM;
 			goto err_out;
 		}
+
 		found = d_splice_alias(inode, new);
 		if (found) {
 			dput(new);
@@ -1263,61 +1266,46 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
 		}
 		return new;
 	}
-	/* Matching dentry exists, check if it is negative. */
+
+	/*
+	 * If a matching dentry exists, and it's not negative use it.
+	 *
+	 * Decrement the reference count to balance the iget() done
+	 * earlier on.
+	 */
 	if (found->d_inode) {
 		if (unlikely(found->d_inode != inode)) {
 			/* This can't happen because bad inodes are unhashed. */
 			BUG_ON(!is_bad_inode(inode));
 			BUG_ON(!is_bad_inode(found->d_inode));
 		}
-		/*
-		 * Already have the inode and the dentry attached, decrement
-		 * the reference count to balance the iget() done
-		 * earlier on.  We found the dentry using d_lookup() so it
-		 * cannot be disconnected and thus we do not need to worry
-		 * about any NFS/disconnectedness issues here.
-		 */
 		iput(inode);
 		return found;
 	}
+
 	/*
 	 * Negative dentry: instantiate it unless the inode is a directory and
-	 * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED),
-	 * in which case d_move() that in place of the found dentry.
+	 * already has a dentry.
 	 */
-	if (!S_ISDIR(inode->i_mode)) {
-		/* Not a directory; everything is easy. */
-		d_instantiate(found, inode);
-		return found;
-	}
 	spin_lock(&dcache_lock);
-	if (list_empty(&inode->i_dentry)) {
-		/*
-		 * Directory without a 'disconnected' dentry; we need to do
-		 * d_instantiate() by hand because it takes dcache_lock which
-		 * we already hold.
-		 */
+	if (!S_ISDIR(inode->i_mode) || list_empty(&inode->i_dentry)) {
 		__d_instantiate(found, inode);
 		spin_unlock(&dcache_lock);
 		security_d_instantiate(found, inode);
 		return found;
 	}
+
 	/*
-	 * Directory with a 'disconnected' dentry; get a reference to the
-	 * 'disconnected' dentry.
+	 * In case a directory already has a (disconnected) entry grab a
+	 * reference to it, move it in place and use it.
 	 */
 	new = list_entry(inode->i_dentry.next, struct dentry, d_alias);
 	dget_locked(new);
 	spin_unlock(&dcache_lock);
-	/* Do security vodoo. */
 	security_d_instantiate(found, inode);
-	/* Move new in place of found. */
 	d_move(new, found);
-	/* Balance the iget() we did above. */
 	iput(inode);
-	/* Throw away found. */
 	dput(found);
-	/* Use new as the actual dentry. */
 	return new;
 
 err_out:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5f3231b..63a4a59 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb)
 
 	fsi->ptmx_dentry = dentry;
 	rc = 0;
-
-	printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
-			inode->i_ino);
 out:
 	mutex_unlock(&root->d_inode->i_mutex);
 	return rc;
@@ -325,179 +322,81 @@ static int compare_init_pts_sb(struct super_block *s, void *p)
 }
 
 /*
- * Safely parse the mount options in @data and update @opts.
+ * devpts_get_sb()
  *
- * devpts ends up parsing options two times during mount, due to the
- * two modes of operation it supports. The first parse occurs in
- * devpts_get_sb() when determining the mode (single-instance or
- * multi-instance mode). The second parse happens in devpts_remount()
- * or new_pts_mount() depending on the mode.
+ *     If the '-o newinstance' mount option was specified, mount a new
+ *     (private) instance of devpts.  PTYs created in this instance are
+ *     independent of the PTYs in other devpts instances.
  *
- * Parsing of options modifies the @data making subsequent parsing
- * incorrect. So make a local copy of @data and parse it.
+ *     If the '-o newinstance' option was not specified, mount/remount the
+ *     initial kernel mount of devpts.  This type of mount gives the
+ *     legacy, single-instance semantics.
  *
- * Return: 0 On success, -errno on error
- */
-static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts)
-{
-	int rc;
-	void *datacp;
-
-	if (!data)
-		return 0;
-
-	/* Use kstrdup() ?  */
-	datacp = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!datacp)
-		return -ENOMEM;
-
-	memcpy(datacp, data, PAGE_SIZE);
-	rc = parse_mount_options((char *)datacp, PARSE_MOUNT, opts);
-	kfree(datacp);
-
-	return rc;
-}
-
-/*
- * Mount a new (private) instance of devpts.  PTYs created in this
- * instance are independent of the PTYs in other devpts instances.
- */
-static int new_pts_mount(struct file_system_type *fs_type, int flags,
-		void *data, struct vfsmount *mnt)
-{
-	int err;
-	struct pts_fs_info *fsi;
-	struct pts_mount_opts *opts;
-
-	printk(KERN_NOTICE "devpts: newinstance mount\n");
-
-	err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
-	if (err)
-		return err;
-
-	fsi = DEVPTS_SB(mnt->mnt_sb);
-	opts = &fsi->mount_opts;
-
-	err = parse_mount_options(data, PARSE_MOUNT, opts);
-	if (err)
-		goto fail;
-
-	err = mknod_ptmx(mnt->mnt_sb);
-	if (err)
-		goto fail;
-
-	return 0;
-
-fail:
-	dput(mnt->mnt_sb->s_root);
-	deactivate_super(mnt->mnt_sb);
-	return err;
-}
-
-/*
- * Check if 'newinstance' mount option was specified in @data.
+ *     The 'newinstance' option is needed to support multiple namespace
+ *     semantics in devpts while preserving backward compatibility of the
+ *     current 'single-namespace' semantics. i.e all mounts of devpts
+ *     without the 'newinstance' mount option should bind to the initial
+ *     kernel mount, like get_sb_single().
  *
- * Return: -errno  	on error (eg: invalid mount options specified)
- * 	 : 1 		if 'newinstance' mount option was specified
- * 	 : 0 		if 'newinstance' mount option was NOT specified
- */
-static int is_new_instance_mount(void *data)
-{
-	int rc;
-	struct pts_mount_opts opts;
-
-	if (!data)
-		return 0;
-
-	rc = safe_parse_mount_options(data, &opts);
-	if (!rc)
-		rc = opts.newinstance;
-
-	return rc;
-}
-
-/*
- * get_init_pts_sb()
+ *     Mounts with 'newinstance' option create a new, private namespace.
  *
- *     This interface is needed to support multiple namespace semantics in
- *     devpts while preserving backward compatibility of the current 'single-
- *     namespace' semantics. i.e all mounts of devpts without the 'newinstance'
- *     mount option should bind to the initial kernel mount, like
- *     get_sb_single().
+ *     NOTE:
  *
- *     Mounts with 'newinstance' option create a new private namespace.
- *
- *     But for single-mount semantics, devpts cannot use get_sb_single(),
+ *     For single-mount semantics, devpts cannot use get_sb_single(),
  *     because get_sb_single()/sget() find and use the super-block from
  *     the most recent mount of devpts. But that recent mount may be a
  *     'newinstance' mount and get_sb_single() would pick the newinstance
  *     super-block instead of the initial super-block.
- *
- *     This interface is identical to get_sb_single() except that it
- *     consistently selects the 'single-namespace' superblock even in the
- *     presence of the private namespace (i.e 'newinstance') super-blocks.
  */
-static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
-		void *data, struct vfsmount *mnt)
+static int devpts_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	struct super_block *s;
 	int error;
+	struct pts_mount_opts opts;
+	struct super_block *s;
+
+	memset(&opts, 0, sizeof(opts));
+	if (data) {
+		error = parse_mount_options(data, PARSE_MOUNT, &opts);
+		if (error)
+			return error;
+	}
+
+	if (opts.newinstance)
+		s = sget(fs_type, NULL, set_anon_super, NULL);
+	else
+		s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
 
-	s = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
 	if (IS_ERR(s))
 		return PTR_ERR(s);
 
 	if (!s->s_root) {
 		s->s_flags = flags;
 		error = devpts_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
-		if (error) {
-			up_write(&s->s_umount);
-			deactivate_super(s);
-			return error;
-		}
+		if (error)
+			goto out_undo_sget;
 		s->s_flags |= MS_ACTIVE;
 	}
-	do_remount_sb(s, flags, data, 0);
-	return simple_set_mnt(mnt, s);
-}
 
-/*
- * Mount or remount the initial kernel mount of devpts. This type of
- * mount maintains the legacy, single-instance semantics, while the
- * kernel still allows multiple-instances.
- */
-static int init_pts_mount(struct file_system_type *fs_type, int flags,
-		void *data, struct vfsmount *mnt)
-{
-	int err;
+	simple_set_mnt(mnt, s);
 
-	err = get_init_pts_sb(fs_type, flags, data, mnt);
-	if (err)
-		return err;
+	memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
 
-	err = mknod_ptmx(mnt->mnt_sb);
-	if (err) {
-		dput(mnt->mnt_sb->s_root);
-		deactivate_super(mnt->mnt_sb);
-	}
+	error = mknod_ptmx(s);
+	if (error)
+		goto out_dput;
 
-	return err;
-}
-
-static int devpts_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	int new;
-
-	new = is_new_instance_mount(data);
-	if (new < 0)
-		return new;
+	return 0;
 
-	if (new)
-		return new_pts_mount(fs_type, flags, data, mnt);
+out_dput:
+	dput(s->s_root);
 
-	return init_pts_mount(fs_type, flags, data, mnt);
+out_undo_sget:
+	up_write(&s->s_umount);
+	deactivate_super(s);
+	return error;
 }
+
 #else
 /*
  * This supports only the legacy single-instance semantics (no
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 92969f8..858fba1 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -156,7 +156,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
 
 	bucket = dir_hash(ls, name, namelen);
 
-	write_lock(&ls->ls_dirtbl[bucket].lock);
+	spin_lock(&ls->ls_dirtbl[bucket].lock);
 
 	de = search_bucket(ls, name, namelen, bucket);
 
@@ -173,7 +173,7 @@ void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen
 	list_del(&de->list);
 	kfree(de);
  out:
-	write_unlock(&ls->ls_dirtbl[bucket].lock);
+	spin_unlock(&ls->ls_dirtbl[bucket].lock);
 }
 
 void dlm_dir_clear(struct dlm_ls *ls)
@@ -185,14 +185,14 @@ void dlm_dir_clear(struct dlm_ls *ls)
 	DLM_ASSERT(list_empty(&ls->ls_recover_list), );
 
 	for (i = 0; i < ls->ls_dirtbl_size; i++) {
-		write_lock(&ls->ls_dirtbl[i].lock);
+		spin_lock(&ls->ls_dirtbl[i].lock);
 		head = &ls->ls_dirtbl[i].list;
 		while (!list_empty(head)) {
 			de = list_entry(head->next, struct dlm_direntry, list);
 			list_del(&de->list);
 			put_free_de(ls, de);
 		}
-		write_unlock(&ls->ls_dirtbl[i].lock);
+		spin_unlock(&ls->ls_dirtbl[i].lock);
 	}
 }
 
@@ -307,17 +307,17 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 
 	bucket = dir_hash(ls, name, namelen);
 
-	write_lock(&ls->ls_dirtbl[bucket].lock);
+	spin_lock(&ls->ls_dirtbl[bucket].lock);
 	de = search_bucket(ls, name, namelen, bucket);
 	if (de) {
 		*r_nodeid = de->master_nodeid;
-		write_unlock(&ls->ls_dirtbl[bucket].lock);
+		spin_unlock(&ls->ls_dirtbl[bucket].lock);
 		if (*r_nodeid == nodeid)
 			return -EEXIST;
 		return 0;
 	}
 
-	write_unlock(&ls->ls_dirtbl[bucket].lock);
+	spin_unlock(&ls->ls_dirtbl[bucket].lock);
 
 	if (namelen > DLM_RESNAME_MAXLEN)
 		return -EINVAL;
@@ -330,7 +330,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 	de->length = namelen;
 	memcpy(de->name, name, namelen);
 
-	write_lock(&ls->ls_dirtbl[bucket].lock);
+	spin_lock(&ls->ls_dirtbl[bucket].lock);
 	tmp = search_bucket(ls, name, namelen, bucket);
 	if (tmp) {
 		kfree(de);
@@ -339,7 +339,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
 		list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list);
 	}
 	*r_nodeid = de->master_nodeid;
-	write_unlock(&ls->ls_dirtbl[bucket].lock);
+	spin_unlock(&ls->ls_dirtbl[bucket].lock);
 	return 0;
 }
 
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 076e86f..d01ca0a 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -99,7 +99,7 @@ struct dlm_direntry {
 
 struct dlm_dirtable {
 	struct list_head	list;
-	rwlock_t		lock;
+	spinlock_t		lock;
 };
 
 struct dlm_rsbtable {
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 01e7d39..205ec95 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -835,7 +835,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
 		lkb->lkb_wait_count++;
 		hold_lkb(lkb);
 
-		log_debug(ls, "add overlap %x cur %d new %d count %d flags %x",
+		log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
 			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
 			  lkb->lkb_wait_count, lkb->lkb_flags);
 		goto out;
@@ -851,7 +851,7 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
 	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
  out:
 	if (error)
-		log_error(ls, "add_to_waiters %x error %d flags %x %d %d %s",
+		log_error(ls, "addwait error %x %d flags %x %d %d %s",
 			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
 			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
 	mutex_unlock(&ls->ls_waiters_mutex);
@@ -863,23 +863,55 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype)
    request reply on the requestqueue) between dlm_recover_waiters_pre() which
    set RESEND and dlm_recover_waiters_post() */
 
-static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
+static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
+				struct dlm_message *ms)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int overlap_done = 0;
 
 	if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
+		log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id);
 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
 		overlap_done = 1;
 		goto out_del;
 	}
 
 	if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
+		log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id);
 		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
 		overlap_done = 1;
 		goto out_del;
 	}
 
+	/* Cancel state was preemptively cleared by a successful convert,
+	   see next comment, nothing to do. */
+
+	if ((mstype == DLM_MSG_CANCEL_REPLY) &&
+	    (lkb->lkb_wait_type != DLM_MSG_CANCEL)) {
+		log_debug(ls, "remwait %x cancel_reply wait_type %d",
+			  lkb->lkb_id, lkb->lkb_wait_type);
+		return -1;
+	}
+
+	/* Remove for the convert reply, and premptively remove for the
+	   cancel reply.  A convert has been granted while there's still
+	   an outstanding cancel on it (the cancel is moot and the result
+	   in the cancel reply should be 0).  We preempt the cancel reply
+	   because the app gets the convert result and then can follow up
+	   with another op, like convert.  This subsequent op would see the
+	   lingering state of the cancel and fail with -EBUSY. */
+
+	if ((mstype == DLM_MSG_CONVERT_REPLY) &&
+	    (lkb->lkb_wait_type == DLM_MSG_CONVERT) &&
+	    is_overlap_cancel(lkb) && ms && !ms->m_result) {
+		log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
+			  lkb->lkb_id);
+		lkb->lkb_wait_type = 0;
+		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
+		lkb->lkb_wait_count--;
+		goto out_del;
+	}
+
 	/* N.B. type of reply may not always correspond to type of original
 	   msg due to lookup->request optimization, verify others? */
 
@@ -888,8 +920,8 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 		goto out_del;
 	}
 
-	log_error(ls, "remove_from_waiters lkid %x flags %x types %d %d",
-		  lkb->lkb_id, lkb->lkb_flags, mstype, lkb->lkb_wait_type);
+	log_error(ls, "remwait error %x reply %d flags %x no wait_type",
+		  lkb->lkb_id, mstype, lkb->lkb_flags);
 	return -1;
 
  out_del:
@@ -899,7 +931,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 	   this would happen */
 
 	if (overlap_done && lkb->lkb_wait_type) {
-		log_error(ls, "remove_from_waiters %x reply %d give up on %d",
+		log_error(ls, "remwait error %x reply %d wait_type %d overlap",
 			  lkb->lkb_id, mstype, lkb->lkb_wait_type);
 		lkb->lkb_wait_count--;
 		lkb->lkb_wait_type = 0;
@@ -921,7 +953,7 @@ static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
 	int error;
 
 	mutex_lock(&ls->ls_waiters_mutex);
-	error = _remove_from_waiters(lkb, mstype);
+	error = _remove_from_waiters(lkb, mstype, NULL);
 	mutex_unlock(&ls->ls_waiters_mutex);
 	return error;
 }
@@ -936,7 +968,7 @@ static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
 
 	if (ms != &ls->ls_stub_ms)
 		mutex_lock(&ls->ls_waiters_mutex);
-	error = _remove_from_waiters(lkb, ms->m_type);
+	error = _remove_from_waiters(lkb, ms->m_type, ms);
 	if (ms != &ls->ls_stub_ms)
 		mutex_unlock(&ls->ls_waiters_mutex);
 	return error;
@@ -2083,6 +2115,11 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 	lkb->lkb_timeout_cs = args->timeout;
 	rv = 0;
  out:
+	if (rv)
+		log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
+			  rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
+			  lkb->lkb_status, lkb->lkb_wait_type,
+			  lkb->lkb_resource->res_name);
 	return rv;
 }
 
@@ -2149,6 +2186,13 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 			goto out;
 		}
 
+		/* there's nothing to cancel */
+		if (lkb->lkb_status == DLM_LKSTS_GRANTED &&
+		    !lkb->lkb_wait_type) {
+			rv = -EBUSY;
+			goto out;
+		}
+
 		switch (lkb->lkb_wait_type) {
 		case DLM_MSG_LOOKUP:
 		case DLM_MSG_REQUEST:
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index aa32e5f..cd8e2df 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -487,7 +487,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace,
 		goto out_lkbfree;
 	for (i = 0; i < size; i++) {
 		INIT_LIST_HEAD(&ls->ls_dirtbl[i].list);
-		rwlock_init(&ls->ls_dirtbl[i].lock);
+		spin_lock_init(&ls->ls_dirtbl[i].lock);
 	}
 
 	INIT_LIST_HEAD(&ls->ls_waiters);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 103a5eb..609108a 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -2,7 +2,7 @@
 *******************************************************************************
 **
 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
-**  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
+**  Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
 **
 **  This copyrighted material is made available to anyone wishing to use,
 **  modify, copy, or redistribute it subject to the terms and conditions
@@ -21,7 +21,7 @@
  *
  * Cluster nodes are referred to by their nodeids. nodeids are
  * simply 32 bit numbers to the locking module - if they need to
- * be expanded for the cluster infrastructure then that is it's
+ * be expanded for the cluster infrastructure then that is its
  * responsibility. It is this layer's
  * responsibility to resolve these into IP address or
  * whatever it needs for inter-node communication.
@@ -36,9 +36,9 @@
  * of high load. Also, this way, the sending thread can collect together
  * messages bound for one node and send them in one block.
  *
- * lowcomms will choose to use wither TCP or SCTP as its transport layer
+ * lowcomms will choose to use either TCP or SCTP as its transport layer
  * depending on the configuration variable 'protocol'. This should be set
- * to 0 (default) for TCP or 1 for SCTP. It shouldbe configured using a
+ * to 0 (default) for TCP or 1 for SCTP. It should be configured using a
  * cluster-wide mechanism as it must be the same on all nodes of the cluster
  * for the DLM to function.
  *
@@ -48,11 +48,11 @@
 #include <net/sock.h>
 #include <net/tcp.h>
 #include <linux/pagemap.h>
-#include <linux/idr.h>
 #include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/sctp.h>
 #include <net/sctp/user.h>
+#include <net/ipv6.h>
 
 #include "dlm_internal.h"
 #include "lowcomms.h"
@@ -60,6 +60,7 @@
 #include "config.h"
 
 #define NEEDED_RMEM (4*1024*1024)
+#define CONN_HASH_SIZE 32
 
 struct cbuf {
 	unsigned int base;
@@ -114,6 +115,7 @@ struct connection {
 	int retries;
 #define MAX_CONNECT_RETRIES 3
 	int sctp_assoc;
+	struct hlist_node list;
 	struct connection *othercon;
 	struct work_struct rwork; /* Receive workqueue */
 	struct work_struct swork; /* Send workqueue */
@@ -138,14 +140,37 @@ static int dlm_local_count;
 static struct workqueue_struct *recv_workqueue;
 static struct workqueue_struct *send_workqueue;
 
-static DEFINE_IDR(connections_idr);
+static struct hlist_head connection_hash[CONN_HASH_SIZE];
 static DEFINE_MUTEX(connections_lock);
-static int max_nodeid;
 static struct kmem_cache *con_cache;
 
 static void process_recv_sockets(struct work_struct *work);
 static void process_send_sockets(struct work_struct *work);
 
+
+/* This is deliberately very simple because most clusters have simple
+   sequential nodeids, so we should be able to go straight to a connection
+   struct in the array */
+static inline int nodeid_hash(int nodeid)
+{
+	return nodeid & (CONN_HASH_SIZE-1);
+}
+
+static struct connection *__find_con(int nodeid)
+{
+	int r;
+	struct hlist_node *h;
+	struct connection *con;
+
+	r = nodeid_hash(nodeid);
+
+	hlist_for_each_entry(con, h, &connection_hash[r], list) {
+		if (con->nodeid == nodeid)
+			return con;
+	}
+	return NULL;
+}
+
 /*
  * If 'allocation' is zero then we don't attempt to create a new
  * connection structure for this node.
@@ -154,31 +179,17 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
 {
 	struct connection *con = NULL;
 	int r;
-	int n;
 
-	con = idr_find(&connections_idr, nodeid);
+	con = __find_con(nodeid);
 	if (con || !alloc)
 		return con;
 
-	r = idr_pre_get(&connections_idr, alloc);
-	if (!r)
-		return NULL;
-
 	con = kmem_cache_zalloc(con_cache, alloc);
 	if (!con)
 		return NULL;
 
-	r = idr_get_new_above(&connections_idr, con, nodeid, &n);
-	if (r) {
-		kmem_cache_free(con_cache, con);
-		return NULL;
-	}
-
-	if (n != nodeid) {
-		idr_remove(&connections_idr, n);
-		kmem_cache_free(con_cache, con);
-		return NULL;
-	}
+	r = nodeid_hash(nodeid);
+	hlist_add_head(&con->list, &connection_hash[r]);
 
 	con->nodeid = nodeid;
 	mutex_init(&con->sock_mutex);
@@ -189,19 +200,30 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc)
 
 	/* Setup action pointers for child sockets */
 	if (con->nodeid) {
-		struct connection *zerocon = idr_find(&connections_idr, 0);
+		struct connection *zerocon = __find_con(0);
 
 		con->connect_action = zerocon->connect_action;
 		if (!con->rx_action)
 			con->rx_action = zerocon->rx_action;
 	}
 
-	if (nodeid > max_nodeid)
-		max_nodeid = nodeid;
-
 	return con;
 }
 
+/* Loop round all connections */
+static void foreach_conn(void (*conn_func)(struct connection *c))
+{
+	int i;
+	struct hlist_node *h, *n;
+	struct connection *con;
+
+	for (i = 0; i < CONN_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(con, h, n, &connection_hash[i], list){
+			conn_func(con);
+		}
+	}
+}
+
 static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 {
 	struct connection *con;
@@ -217,14 +239,17 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 static struct connection *assoc2con(int assoc_id)
 {
 	int i;
+	struct hlist_node *h;
 	struct connection *con;
 
 	mutex_lock(&connections_lock);
-	for (i=0; i<=max_nodeid; i++) {
-		con = __nodeid2con(i, 0);
-		if (con && con->sctp_assoc == assoc_id) {
-			mutex_unlock(&connections_lock);
-			return con;
+
+	for (i = 0 ; i < CONN_HASH_SIZE; i++) {
+		hlist_for_each_entry(con, h, &connection_hash[i], list) {
+			if (con && con->sctp_assoc == assoc_id) {
+				mutex_unlock(&connections_lock);
+				return con;
+			}
 		}
 	}
 	mutex_unlock(&connections_lock);
@@ -250,8 +275,7 @@ static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr)
 	} else {
 		struct sockaddr_in6 *in6  = (struct sockaddr_in6 *) &addr;
 		struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr;
-		memcpy(&ret6->sin6_addr, &in6->sin6_addr,
-		       sizeof(in6->sin6_addr));
+		ipv6_addr_copy(&ret6->sin6_addr, &in6->sin6_addr);
 	}
 
 	return 0;
@@ -376,25 +400,23 @@ static void sctp_send_shutdown(sctp_assoc_t associd)
 		log_print("send EOF to node failed: %d", ret);
 }
 
+static void sctp_init_failed_foreach(struct connection *con)
+{
+	con->sctp_assoc = 0;
+	if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
+		if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
+			queue_work(send_workqueue, &con->swork);
+	}
+}
+
 /* INIT failed but we don't know which node...
    restart INIT on all pending nodes */
 static void sctp_init_failed(void)
 {
-	int i;
-	struct connection *con;
-
 	mutex_lock(&connections_lock);
-	for (i=1; i<=max_nodeid; i++) {
-		con = __nodeid2con(i, 0);
-		if (!con)
-			continue;
-		con->sctp_assoc = 0;
-		if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
-			if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
-				queue_work(send_workqueue, &con->swork);
-			}
-		}
-	}
+
+	foreach_conn(sctp_init_failed_foreach);
+
 	mutex_unlock(&connections_lock);
 }
 
@@ -1313,13 +1335,10 @@ out_connect:
 
 static void clean_one_writequeue(struct connection *con)
 {
-	struct list_head *list;
-	struct list_head *temp;
+	struct writequeue_entry *e, *safe;
 
 	spin_lock(&con->writequeue_lock);
-	list_for_each_safe(list, temp, &con->writequeue) {
-		struct writequeue_entry *e =
-			list_entry(list, struct writequeue_entry, list);
+	list_for_each_entry_safe(e, safe, &con->writequeue, list) {
 		list_del(&e->list);
 		free_entry(e);
 	}
@@ -1369,14 +1388,7 @@ static void process_send_sockets(struct work_struct *work)
 /* Discard all entries on the write queues */
 static void clean_writequeues(void)
 {
-	int nodeid;
-
-	for (nodeid = 1; nodeid <= max_nodeid; nodeid++) {
-		struct connection *con = __nodeid2con(nodeid, 0);
-
-		if (con)
-			clean_one_writequeue(con);
-	}
+	foreach_conn(clean_one_writequeue);
 }
 
 static void work_stop(void)
@@ -1406,23 +1418,29 @@ static int work_start(void)
 	return 0;
 }
 
-void dlm_lowcomms_stop(void)
+static void stop_conn(struct connection *con)
 {
-	int i;
-	struct connection *con;
+	con->flags |= 0x0F;
+	if (con->sock)
+		con->sock->sk->sk_user_data = NULL;
+}
 
+static void free_conn(struct connection *con)
+{
+	close_connection(con, true);
+	if (con->othercon)
+		kmem_cache_free(con_cache, con->othercon);
+	hlist_del(&con->list);
+	kmem_cache_free(con_cache, con);
+}
+
+void dlm_lowcomms_stop(void)
+{
 	/* Set all the flags to prevent any
 	   socket activity.
 	*/
 	mutex_lock(&connections_lock);
-	for (i = 0; i <= max_nodeid; i++) {
-		con = __nodeid2con(i, 0);
-		if (con) {
-			con->flags |= 0x0F;
-			if (con->sock)
-				con->sock->sk->sk_user_data = NULL;
-		}
-	}
+	foreach_conn(stop_conn);
 	mutex_unlock(&connections_lock);
 
 	work_stop();
@@ -1430,25 +1448,20 @@ void dlm_lowcomms_stop(void)
 	mutex_lock(&connections_lock);
 	clean_writequeues();
 
-	for (i = 0; i <= max_nodeid; i++) {
-		con = __nodeid2con(i, 0);
-		if (con) {
-			close_connection(con, true);
-			if (con->othercon)
-				kmem_cache_free(con_cache, con->othercon);
-			kmem_cache_free(con_cache, con);
-		}
-	}
-	max_nodeid = 0;
+	foreach_conn(free_conn);
+
 	mutex_unlock(&connections_lock);
 	kmem_cache_destroy(con_cache);
-	idr_init(&connections_idr);
 }
 
 int dlm_lowcomms_start(void)
 {
 	int error = -EINVAL;
 	struct connection *con;
+	int i;
+
+	for (i = 0; i < CONN_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&connection_hash[i]);
 
 	init_local();
 	if (!dlm_local_count) {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 065149e..ebce994 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2006-2008 Red Hat, Inc.  All rights reserved.
+ * Copyright (C) 2006-2009 Red Hat, Inc.  All rights reserved.
  *
  * This copyrighted material is made available to anyone wishing to use,
  * modify, copy, or redistribute it subject to the terms and conditions
@@ -84,7 +84,7 @@ struct dlm_lock_result32 {
 
 static void compat_input(struct dlm_write_request *kb,
 			 struct dlm_write_request32 *kb32,
-			 size_t count)
+			 int namelen)
 {
 	kb->version[0] = kb32->version[0];
 	kb->version[1] = kb32->version[1];
@@ -96,8 +96,7 @@ static void compat_input(struct dlm_write_request *kb,
 	    kb->cmd == DLM_USER_REMOVE_LOCKSPACE) {
 		kb->i.lspace.flags = kb32->i.lspace.flags;
 		kb->i.lspace.minor = kb32->i.lspace.minor;
-		memcpy(kb->i.lspace.name, kb32->i.lspace.name, count -
-			offsetof(struct dlm_write_request32, i.lspace.name));
+		memcpy(kb->i.lspace.name, kb32->i.lspace.name, namelen);
 	} else if (kb->cmd == DLM_USER_PURGE) {
 		kb->i.purge.nodeid = kb32->i.purge.nodeid;
 		kb->i.purge.pid = kb32->i.purge.pid;
@@ -115,8 +114,7 @@ static void compat_input(struct dlm_write_request *kb,
 		kb->i.lock.bastaddr = (void *)(long)kb32->i.lock.bastaddr;
 		kb->i.lock.lksb = (void *)(long)kb32->i.lock.lksb;
 		memcpy(kb->i.lock.lvb, kb32->i.lock.lvb, DLM_USER_LVB_LEN);
-		memcpy(kb->i.lock.name, kb32->i.lock.name, count -
-			offsetof(struct dlm_write_request32, i.lock.name));
+		memcpy(kb->i.lock.name, kb32->i.lock.name, namelen);
 	}
 }
 
@@ -539,9 +537,16 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 #ifdef CONFIG_COMPAT
 	if (!kbuf->is64bit) {
 		struct dlm_write_request32 *k32buf;
+		int namelen = 0;
+
+		if (count > sizeof(struct dlm_write_request32))
+			namelen = count - sizeof(struct dlm_write_request32);
+
 		k32buf = (struct dlm_write_request32 *)kbuf;
-		kbuf = kmalloc(count + 1 + (sizeof(struct dlm_write_request) -
-			       sizeof(struct dlm_write_request32)), GFP_KERNEL);
+
+		/* add 1 after namelen so that the name string is terminated */
+		kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
+			       GFP_KERNEL);
 		if (!kbuf) {
 			kfree(k32buf);
 			return -ENOMEM;
@@ -549,7 +554,8 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 
 		if (proc)
 			set_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags);
-		compat_input(kbuf, k32buf, count + 1);
+
+		compat_input(kbuf, k32buf, namelen);
 		kfree(k32buf);
 	}
 #endif
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 3e5637f..44d725f 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -18,7 +18,7 @@ static void drop_pagecache_sb(struct super_block *sb)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		if (inode->i_state & (I_FREEING|I_WILL_FREE))
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 			continue;
 		if (inode->i_mapping->nrpages == 0)
 			continue;
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index c01e043..8b65f28 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs(
 	list_for_each_entry(global_auth_tok,
 			    &mount_crypt_stat->global_auth_tok_list,
 			    mount_crypt_stat_list) {
+		if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK)
+			continue;
 		rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig);
 		if (rc) {
 			printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc);
@@ -1322,14 +1324,13 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max,
 }
 
 static int
-ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat,
-				    struct dentry *ecryptfs_dentry,
-				    char *virt)
+ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry,
+				    char *virt, size_t virt_len)
 {
 	int rc;
 
 	rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt,
-				  0, crypt_stat->num_header_bytes_at_front);
+				  0, virt_len);
 	if (rc)
 		printk(KERN_ERR "%s: Error attempting to write header "
 		       "information to lower file; rc = [%d]\n", __func__,
@@ -1339,7 +1340,6 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat,
 
 static int
 ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
-				 struct ecryptfs_crypt_stat *crypt_stat,
 				 char *page_virt, size_t size)
 {
 	int rc;
@@ -1349,6 +1349,17 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry,
 	return rc;
 }
 
+static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask,
+					       unsigned int order)
+{
+	struct page *page;
+
+	page = alloc_pages(gfp_mask | __GFP_ZERO, order);
+	if (page)
+		return (unsigned long) page_address(page);
+	return 0;
+}
+
 /**
  * ecryptfs_write_metadata
  * @ecryptfs_dentry: The eCryptfs dentry
@@ -1365,7 +1376,9 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 {
 	struct ecryptfs_crypt_stat *crypt_stat =
 		&ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat;
+	unsigned int order;
 	char *virt;
+	size_t virt_len;
 	size_t size = 0;
 	int rc = 0;
 
@@ -1381,33 +1394,35 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
 		rc = -EINVAL;
 		goto out;
 	}
+	virt_len = crypt_stat->num_header_bytes_at_front;
+	order = get_order(virt_len);
 	/* Released in this function */
-	virt = (char *)get_zeroed_page(GFP_KERNEL);
+	virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order);
 	if (!virt) {
 		printk(KERN_ERR "%s: Out of memory\n", __func__);
 		rc = -ENOMEM;
 		goto out;
 	}
-	rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size,
-					 crypt_stat, ecryptfs_dentry);
+	rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat,
+					 ecryptfs_dentry);
 	if (unlikely(rc)) {
 		printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n",
 		       __func__, rc);
 		goto out_free;
 	}
 	if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR)
-		rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry,
-						      crypt_stat, virt, size);
+		rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt,
+						      size);
 	else
-		rc = ecryptfs_write_metadata_to_contents(crypt_stat,
-							 ecryptfs_dentry, virt);
+		rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt,
+							 virt_len);
 	if (rc) {
 		printk(KERN_ERR "%s: Error writing metadata out to lower file; "
 		       "rc = [%d]\n", __func__, rc);
 		goto out_free;
 	}
 out_free:
-	free_page((unsigned long)virt);
+	free_pages((unsigned long)virt, order);
 out:
 	return rc;
 }
@@ -1716,7 +1731,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
 {
 	int rc = 0;
 
-	(*copied_name) = kmalloc((name_size + 2), GFP_KERNEL);
+	(*copied_name) = kmalloc((name_size + 1), GFP_KERNEL);
 	if (!(*copied_name)) {
 		rc = -ENOMEM;
 		goto out;
@@ -1726,7 +1741,7 @@ static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size,
 						 * in printing out the
 						 * string in debug
 						 * messages */
-	(*copied_name_size) = (name_size + 1);
+	(*copied_name_size) = name_size;
 out:
 	return rc;
 }
@@ -2206,17 +2221,19 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name,
 					 struct dentry *ecryptfs_dir_dentry,
 					 const char *name, size_t name_size)
 {
+	struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
+		&ecryptfs_superblock_to_private(
+			ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
 	char *decoded_name;
 	size_t decoded_name_size;
 	size_t packet_size;
 	int rc = 0;
 
-	if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
+	if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)
+	    && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED)
+	    && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE)
 	    && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX,
 			ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) {
-		struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
-			&ecryptfs_superblock_to_private(
-				ecryptfs_dir_dentry->d_sb)->mount_crypt_stat;
 		const char *orig_name = name;
 		size_t orig_name_size = name_size;
 
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 5e59658..2dda5ad 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -89,7 +89,7 @@ static void ecryptfs_d_release(struct dentry *dentry)
 	return;
 }
 
-struct dentry_operations ecryptfs_dops = {
+const struct dentry_operations ecryptfs_dops = {
 	.d_revalidate = ecryptfs_d_revalidate,
 	.d_release = ecryptfs_d_release,
 };
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c11fc95..064c582 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -328,6 +328,7 @@ struct ecryptfs_dentry_info {
  */
 struct ecryptfs_global_auth_tok {
 #define ECRYPTFS_AUTH_TOK_INVALID 0x00000001
+#define ECRYPTFS_AUTH_TOK_FNEK    0x00000002
 	u32 flags;
 	struct list_head mount_crypt_stat_list;
 	struct key *global_auth_tok_key;
@@ -579,7 +580,7 @@ extern const struct inode_operations ecryptfs_main_iops;
 extern const struct inode_operations ecryptfs_dir_iops;
 extern const struct inode_operations ecryptfs_symlink_iops;
 extern const struct super_operations ecryptfs_sops;
-extern struct dentry_operations ecryptfs_dops;
+extern const struct dentry_operations ecryptfs_dops;
 extern struct address_space_operations ecryptfs_aops;
 extern int ecryptfs_verbosity;
 extern unsigned int ecryptfs_message_buf_len;
@@ -619,7 +620,6 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
 		       u32 flags);
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 					struct dentry *lower_dentry,
-					struct ecryptfs_crypt_stat *crypt_stat,
 					struct inode *ecryptfs_dir_inode,
 					struct nameidata *ecryptfs_nd);
 int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
@@ -696,7 +696,7 @@ ecryptfs_write_header_metadata(char *virt,
 int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig);
 int
 ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
-			   char *sig);
+			   char *sig, u32 global_auth_tok_flags);
 int ecryptfs_get_global_auth_tok_for_sig(
 	struct ecryptfs_global_auth_tok **global_auth_tok,
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 5697899..55b3145 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -246,7 +246,6 @@ out:
  */
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 					struct dentry *lower_dentry,
-					struct ecryptfs_crypt_stat *crypt_stat,
 					struct inode *ecryptfs_dir_inode,
 					struct nameidata *ecryptfs_nd)
 {
@@ -254,6 +253,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 	struct vfsmount *lower_mnt;
 	struct inode *lower_inode;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat;
+	struct ecryptfs_crypt_stat *crypt_stat;
 	char *page_virt = NULL;
 	u64 file_size;
 	int rc = 0;
@@ -314,6 +314,11 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 			goto out_free_kmem;
 		}
 	}
+	crypt_stat = &ecryptfs_inode_to_private(
+					ecryptfs_dentry->d_inode)->crypt_stat;
+	/* TODO: lock for crypt_stat comparison */
+	if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
+			ecryptfs_set_default_sizes(crypt_stat);
 	rc = ecryptfs_read_and_validate_header_region(page_virt,
 						      ecryptfs_dentry->d_inode);
 	if (rc) {
@@ -362,9 +367,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 {
 	char *encrypted_and_encoded_name = NULL;
 	size_t encrypted_and_encoded_name_size;
-	struct ecryptfs_crypt_stat *crypt_stat = NULL;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
-	struct ecryptfs_inode_info *inode_info;
 	struct dentry *lower_dir_dentry, *lower_dentry;
 	int rc = 0;
 
@@ -388,26 +391,15 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	}
 	if (lower_dentry->d_inode)
 		goto lookup_and_interpose;
-	inode_info =  ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
-	if (inode_info) {
-		crypt_stat = &inode_info->crypt_stat;
-		/* TODO: lock for crypt_stat comparison */
-		if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED))
-			ecryptfs_set_default_sizes(crypt_stat);
-	}
-	if (crypt_stat)
-		mount_crypt_stat = crypt_stat->mount_crypt_stat;
-	else
-		mount_crypt_stat = &ecryptfs_superblock_to_private(
-			ecryptfs_dentry->d_sb)->mount_crypt_stat;
-	if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES))
-	    && !(mount_crypt_stat && (mount_crypt_stat->flags
-				     & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
+	mount_crypt_stat = &ecryptfs_superblock_to_private(
+				ecryptfs_dentry->d_sb)->mount_crypt_stat;
+	if (!(mount_crypt_stat
+	    && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES)))
 		goto lookup_and_interpose;
 	dput(lower_dentry);
 	rc = ecryptfs_encrypt_and_encode_filename(
 		&encrypted_and_encoded_name, &encrypted_and_encoded_name_size,
-		crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name,
+		NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name,
 		ecryptfs_dentry->d_name.len);
 	if (rc) {
 		printk(KERN_ERR "%s: Error attempting to encrypt and encode "
@@ -426,7 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	}
 lookup_and_interpose:
 	rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
-						 crypt_stat, ecryptfs_dir_inode,
+						 ecryptfs_dir_inode,
 						 ecryptfs_nd);
 	goto out;
 out_d_drop:
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index ff53942..e4a6223 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -2375,7 +2375,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache;
 
 int
 ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
-			     char *sig)
+			     char *sig, u32 global_auth_tok_flags)
 {
 	struct ecryptfs_global_auth_tok *new_auth_tok;
 	int rc = 0;
@@ -2389,6 +2389,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
 		goto out;
 	}
 	memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX);
+	new_auth_tok->flags = global_auth_tok_flags;
 	new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
 	mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
 	list_add(&new_auth_tok->mount_crypt_stat_list,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 789cf2e..aed56c2 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 		case ecryptfs_opt_ecryptfs_sig:
 			sig_src = args[0].from;
 			rc = ecryptfs_add_global_auth_tok(mount_crypt_stat,
-							  sig_src);
+							  sig_src, 0);
 			if (rc) {
 				printk(KERN_ERR "Error attempting to register "
 				       "global sig; rc = [%d]\n", rc);
@@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
 				ECRYPTFS_SIG_SIZE_HEX] = '\0';
 			rc = ecryptfs_add_global_auth_tok(
 				mount_crypt_stat,
-				mount_crypt_stat->global_default_fnek_sig);
+				mount_crypt_stat->global_default_fnek_sig,
+				ECRYPTFS_AUTH_TOK_FNEK);
 			if (rc) {
 				printk(KERN_ERR "Error attempting to register "
 				       "global fnek sig [%s]; rc = [%d]\n",
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 011b9b8..c5c424f 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -417,10 +417,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	ep_unregister_pollwait(ep, epi);
 
 	/* Remove the current item from the list of epoll hooks */
-	spin_lock(&file->f_ep_lock);
+	spin_lock(&file->f_lock);
 	if (ep_is_linked(&epi->fllink))
 		list_del_init(&epi->fllink);
-	spin_unlock(&file->f_ep_lock);
+	spin_unlock(&file->f_lock);
 
 	rb_erase(&epi->rbn, &ep->rbr);
 
@@ -538,7 +538,7 @@ void eventpoll_release_file(struct file *file)
 	struct epitem *epi;
 
 	/*
-	 * We don't want to get "file->f_ep_lock" because it is not
+	 * We don't want to get "file->f_lock" because it is not
 	 * necessary. It is not necessary because we're in the "struct file"
 	 * cleanup path, and this means that noone is using this file anymore.
 	 * So, for example, epoll_ctl() cannot hit here sicne if we reach this
@@ -547,6 +547,8 @@ void eventpoll_release_file(struct file *file)
 	 * will correctly serialize the operation. We do need to acquire
 	 * "ep->mtx" after "epmutex" because ep_remove() requires it when called
 	 * from anywhere but ep_free().
+	 *
+	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
 	 */
 	mutex_lock(&epmutex);
 
@@ -785,9 +787,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		goto error_unregister;
 
 	/* Add the current item to the list of active epoll hook for this file */
-	spin_lock(&tfile->f_ep_lock);
+	spin_lock(&tfile->f_lock);
 	list_add_tail(&epi->fllink, &tfile->f_ep_links);
-	spin_unlock(&tfile->f_ep_lock);
+	spin_unlock(&tfile->f_lock);
 
 	/*
 	 * Add the current item to the RB tree. All RB tree operations are
diff --git a/fs/exec.c b/fs/exec.c
index 0dd60a0..c5128fb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -45,6 +45,7 @@
 #include <linux/proc_fs.h>
 #include <linux/mount.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <linux/syscalls.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
@@ -127,6 +128,9 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 				 MAY_READ | MAY_EXEC | MAY_OPEN);
 	if (error)
 		goto exit;
+	error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
+	if (error)
+		goto exit;
 
 	file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
 	error = PTR_ERR(file);
@@ -674,6 +678,9 @@ struct file *open_exec(const char *name)
 	err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
 	if (err)
 		goto out_path_put;
+	err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
+	if (err)
+		goto out_path_put;
 
 	file = nameidata_to_filp(&nd, O_RDONLY|O_LARGEFILE);
 	if (IS_ERR(file))
@@ -1051,14 +1058,26 @@ EXPORT_SYMBOL(install_exec_creds);
  */
 void check_unsafe_exec(struct linux_binprm *bprm)
 {
-	struct task_struct *p = current;
+	struct task_struct *p = current, *t;
+	unsigned long flags;
+	unsigned n_fs, n_sighand;
 
 	bprm->unsafe = tracehook_unsafe_exec(p);
 
-	if (atomic_read(&p->fs->count) > 1 ||
-	    atomic_read(&p->files->count) > 1 ||
-	    atomic_read(&p->sighand->count) > 1)
+	n_fs = 1;
+	n_sighand = 1;
+	lock_task_sighand(p, &flags);
+	for (t = next_thread(p); t != p; t = next_thread(t)) {
+		if (t->fs == p->fs)
+			n_fs++;
+		n_sighand++;
+	}
+
+	if (atomic_read(&p->fs->count) > n_fs ||
+	    atomic_read(&p->sighand->count) > n_sighand)
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
+
+	unlock_task_sighand(p, &flags);
 }
 
 /* 
@@ -1168,6 +1187,9 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 	retval = security_bprm_check(bprm);
 	if (retval)
 		return retval;
+	retval = ima_bprm_check(bprm);
+	if (retval)
+		return retval;
 
 	/* kernel module loader fixup */
 	/* so we don't try to load run modprobe in kernel space. */
@@ -1268,6 +1290,7 @@ int do_execve(char * filename,
 	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
 	if (retval < 0)
 		goto out_free;
+	current->in_execve = 1;
 
 	retval = -ENOMEM;
 	bprm->cred = prepare_exec_creds();
@@ -1321,6 +1344,7 @@ int do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
+	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
@@ -1339,6 +1363,7 @@ out_file:
 	}
 
 out_unlock:
+	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 
 out_free:
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 4a29d63..7f8d2e5 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -570,7 +570,7 @@ do_more:
 error_return:
 	brelse(bitmap_bh);
 	release_blocks(sb, freed);
-	DQUOT_FREE_BLOCK(inode, freed);
+	vfs_dq_free_block(inode, freed);
 }
 
 /**
@@ -1247,7 +1247,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
+	if (vfs_dq_alloc_block(inode, num)) {
 		*errp = -EDQUOT;
 		return 0;
 	}
@@ -1409,7 +1409,7 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
+	vfs_dq_free_block(inode, *count-num);
 	*count = num;
 	return ret_block;
 
@@ -1420,7 +1420,7 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
+		vfs_dq_free_block(inode, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 66321a8..15387c9 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -121,8 +121,8 @@ void ext2_free_inode (struct inode * inode)
 	if (!is_bad_inode(inode)) {
 		/* Quota is already initialized in iput() */
 		ext2_xattr_delete_inode(inode);
-	    	DQUOT_FREE_INODE(inode);
-		DQUOT_DROP(inode);
+		vfs_dq_free_inode(inode);
+		vfs_dq_drop(inode);
 	}
 
 	es = EXT2_SB(sb)->s_es;
@@ -586,7 +586,7 @@ got:
 		goto fail_drop;
 	}
 
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -605,10 +605,10 @@ got:
 	return inode;
 
 fail_free_drop:
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
 fail_drop:
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 23fff2f..b43b9556 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1444,7 +1444,7 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
 		return error;
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		error = DQUOT_TRANSFER(inode, iattr) ? -EDQUOT : 0;
+		error = vfs_dq_transfer(inode, iattr) ? -EDQUOT : 0;
 		if (error)
 			return error;
 	}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index da8bdea..f983225 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	es = sbi->s_es;
 	if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
 	    (old_mount_opt & EXT2_MOUNT_XIP)) &&
-	    invalidate_inodes(sb))
-		ext2_warning(sb, __func__, "busy inodes while remounting "\
-			     "xip remain in cache (no functional problem)");
+	    invalidate_inodes(sb)) {
+		ext2_warning(sb, __func__, "refusing change of xip flag "
+			     "with busy inodes while remounting");
+		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
+		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
+	}
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 	if (*flags & MS_RDONLY) {
@@ -1328,6 +1331,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
 				sb->s_blocksize - offset : toread;
 
 		tmp_bh.b_state = 0;
+		tmp_bh.b_size = sb->s_blocksize;
 		err = ext2_get_block(inode, blk, &tmp_bh, 0);
 		if (err < 0)
 			return err;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 987a526..7913531 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -642,7 +642,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 				ea_bdebug(new_bh, "reusing block");
 
 				error = -EDQUOT;
-				if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+				if (vfs_dq_alloc_block(inode, 1)) {
 					unlock_buffer(new_bh);
 					goto cleanup;
 				}
@@ -699,7 +699,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 		 * as if nothing happened and cleanup the unused block */
 		if (error && error != -ENOSPC) {
 			if (new_bh && new_bh != old_bh)
-				DQUOT_FREE_BLOCK(inode, 1);
+				vfs_dq_free_block(inode, 1);
 			goto cleanup;
 		}
 	} else
@@ -731,7 +731,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
 			le32_add_cpu(&HDR(old_bh)->h_refcount, -1);
 			if (ce)
 				mb_cache_entry_release(ce);
-			DQUOT_FREE_BLOCK(inode, 1);
+			vfs_dq_free_block(inode, 1);
 			mark_buffer_dirty(old_bh);
 			ea_bdebug(old_bh, "refcount now=%d",
 				le32_to_cpu(HDR(old_bh)->h_refcount));
@@ -794,7 +794,7 @@ ext2_xattr_delete_inode(struct inode *inode)
 		mark_buffer_dirty(bh);
 		if (IS_SYNC(inode))
 			sync_dirty_buffer(bh);
-		DQUOT_FREE_BLOCK(inode, 1);
+		vfs_dq_free_block(inode, 1);
 	}
 	EXT2_I(inode)->i_file_acl = 0;
 
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 0dbf1c0..225202d 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -676,7 +676,7 @@ void ext3_free_blocks(handle_t *handle, struct inode *inode,
 	}
 	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
-		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+		vfs_dq_free_block(inode, dquot_freed_blocks);
 	return;
 }
 
@@ -1502,7 +1502,7 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (DQUOT_ALLOC_BLOCK(inode, num)) {
+	if (vfs_dq_alloc_block(inode, num)) {
 		*errp = -EDQUOT;
 		return 0;
 	}
@@ -1714,7 +1714,7 @@ allocated:
 
 	*errp = 0;
 	brelse(bitmap_bh);
-	DQUOT_FREE_BLOCK(inode, *count-num);
+	vfs_dq_free_block(inode, *count-num);
 	*count = num;
 	return ret_block;
 
@@ -1729,7 +1729,7 @@ out:
 	 * Undo the block allocation
 	 */
 	if (!performed_allocation)
-		DQUOT_FREE_BLOCK(inode, *count);
+		vfs_dq_free_block(inode, *count);
 	brelse(bitmap_bh);
 	return 0;
 }
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 8de6c72..dd13d60 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -123,10 +123,10 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	DQUOT_INIT(inode);
+	vfs_dq_init(inode);
 	ext3_xattr_delete_inode(handle, inode);
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
 
 	is_directory = S_ISDIR(inode->i_mode);
 
@@ -589,7 +589,7 @@ got:
 		sizeof(struct ext3_inode) - EXT3_GOOD_OLD_INODE_SIZE : 0;
 
 	ret = inode;
-	if(DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -620,10 +620,10 @@ really_out:
 	return ret;
 
 fail_free_drop:
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
 fail_drop:
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 5fa453b..4a09ff1 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1435,6 +1435,10 @@ static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 	return 0;
 }
 
+static int buffer_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+	return !buffer_mapped(bh);
+}
 /*
  * Note that we always start a transaction even if we're not journalling
  * data.  This is to preserve ordering: any hole instantiation within
@@ -1505,6 +1509,15 @@ static int ext3_ordered_writepage(struct page *page,
 	if (ext3_journal_current_handle())
 		goto out_fail;
 
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, inode->i_sb->s_blocksize,
+				(1 << BH_Dirty)|(1 << BH_Uptodate));
+	} else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+		/* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
+		return block_write_full_page(page, NULL, wbc);
+	}
+	page_bufs = page_buffers(page);
+
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 
 	if (IS_ERR(handle)) {
@@ -1512,11 +1525,6 @@ static int ext3_ordered_writepage(struct page *page,
 		goto out_fail;
 	}
 
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, inode->i_sb->s_blocksize,
-				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
-	page_bufs = page_buffers(page);
 	walk_page_buffers(handle, page_bufs, 0,
 			PAGE_CACHE_SIZE, NULL, bget_one);
 
@@ -3055,7 +3063,7 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
 		if (error) {
 			ext3_journal_stop(handle);
 			return error;
@@ -3146,7 +3154,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
 		ret = 2 * (bpp + indirects) + 2;
 
 #ifdef CONFIG_QUOTA
-	/* We know that structure was already allocated during DQUOT_INIT so
+	/* We know that structure was already allocated during vfs_dq_init so
 	 * we will be updating only the data blocks + inodes */
 	ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
 #endif
@@ -3237,7 +3245,7 @@ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * i_size has been changed by generic_commit_write() and we thus need
  * to include the updated inode in the current transaction.
  *
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, vfs_dq_alloc_space() will always dirty the inode when blocks
  * are allocated to the file.
  *
  * If the inode is marked synchronous, we don't honour that here - doing
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 4db4ffa..e2fc63c 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2049,7 +2049,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2108,7 +2108,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2272,7 +2272,7 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
 	if (new_dentry->d_inode)
-		DQUOT_INIT(new_dentry->d_inode);
+		vfs_dq_init(new_dentry->d_inode);
 	handle = ext3_journal_start(old_dir, 2 *
 					EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b70d90e..9e5b8e3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -707,8 +707,6 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
-static int ext3_dquot_initialize(struct inode *inode, int type);
-static int ext3_dquot_drop(struct inode *inode);
 static int ext3_write_dquot(struct dquot *dquot);
 static int ext3_acquire_dquot(struct dquot *dquot);
 static int ext3_release_dquot(struct dquot *dquot);
@@ -723,8 +721,8 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
 static struct dquot_operations ext3_quota_operations = {
-	.initialize	= ext3_dquot_initialize,
-	.drop		= ext3_dquot_drop,
+	.initialize	= dquot_initialize,
+	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
 	.alloc_inode	= dquot_alloc_inode,
 	.free_space	= dquot_free_space,
@@ -1438,7 +1436,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
 		}
 
 		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
 				"%s: truncating inode %lu to %Ld bytes\n",
@@ -2428,12 +2426,13 @@ static void ext3_write_super (struct super_block * sb)
 
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
-	sb->s_dirt = 0;
-	if (wait)
-		ext3_force_commit(sb);
-	else
-		journal_start_commit(EXT3_SB(sb)->s_journal, NULL);
+	tid_t target;
 
+	sb->s_dirt = 0;
+	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
+		if (wait)
+			log_wait_commit(EXT3_SB(sb)->s_journal, target);
+	}
 	return 0;
 }
 
@@ -2701,7 +2700,7 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
  * Process 1                         Process 2
  * ext3_create()                     quota_sync()
  *   journal_start()                   write_dquot()
- *   DQUOT_INIT()                        down(dqio_mutex)
+ *   vfs_dq_init()                       down(dqio_mutex)
  *     down(dqio_mutex)                    journal_start()
  *
  */
@@ -2713,44 +2712,6 @@ static inline struct inode *dquot_to_inode(struct dquot *dquot)
 	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
 }
 
-static int ext3_dquot_initialize(struct inode *inode, int type)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may create quota structure so we need to reserve enough blocks */
-	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	ret = dquot_initialize(inode, type);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-static int ext3_dquot_drop(struct inode *inode)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may delete quota structure so we need to reserve enough blocks */
-	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_DEL_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle)) {
-		/*
-		 * We call dquot_drop() anyway to at least release references
-		 * to quota structures so that umount does not hang.
-		 */
-		dquot_drop(inode);
-		return PTR_ERR(handle);
-	}
-	ret = dquot_drop(inode);
-	err = ext3_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
 static int ext3_write_dquot(struct dquot *dquot)
 {
 	int ret, err;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 175414a..83b7be8 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -498,7 +498,7 @@ ext3_xattr_release_block(handle_t *handle, struct inode *inode,
 		error = ext3_journal_dirty_metadata(handle, bh);
 		if (IS_SYNC(inode))
 			handle->h_sync = 1;
-		DQUOT_FREE_BLOCK(inode, 1);
+		vfs_dq_free_block(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
 		if (ce)
@@ -774,7 +774,7 @@ inserted:
 				/* The old block is released after updating
 				   the inode. */
 				error = -EDQUOT;
-				if (DQUOT_ALLOC_BLOCK(inode, 1))
+				if (vfs_dq_alloc_block(inode, 1))
 					goto cleanup;
 				error = ext3_journal_get_write_access(handle,
 								      new_bh);
@@ -848,7 +848,7 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	DQUOT_FREE_BLOCK(inode, 1);
+	vfs_dq_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9a50b80..38f40d5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -536,7 +536,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	ext4_mb_free_blocks(handle, inode, block, count,
 			    metadata, &dquot_freed_blocks);
 	if (dquot_freed_blocks)
-		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+		vfs_dq_free_block(inode, dquot_freed_blocks);
 	return;
 }
 
@@ -609,7 +609,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
  */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
+	    (*retries)++ > 3 ||
+	    !EXT4_SB(sb)->s_journal)
 		return 0;
 
 	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index aafc9eb..6083bb38 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -20,6 +20,7 @@
 #include <linux/blkdev.h>
 #include <linux/magic.h>
 #include <linux/jbd2.h>
+#include <linux/quota.h>
 #include "ext4_i.h"
 
 /*
@@ -868,7 +869,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
 {
 	unsigned len = le16_to_cpu(dlen);
 
-	if (len == EXT4_MAX_REC_LEN)
+	if (len == EXT4_MAX_REC_LEN || len == 0)
 		return 1 << 16;
 	return len;
 }
@@ -1098,6 +1099,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
 		struct address_space *mapping, loff_t from);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern qsize_t ext4_get_reserved_space(struct inode *inode);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e2eab19..e0aa4fe 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1122,7 +1122,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path,
 	struct ext4_extent_idx *ix;
 	struct ext4_extent *ex;
 	ext4_fsblk_t block;
-	int depth, ee_len;
+	int depth;	/* Note, NOT eh_depth; depth from top of tree */
+	int ee_len;
 
 	BUG_ON(path == NULL);
 	depth = path->p_depth;
@@ -1179,7 +1180,8 @@ got_index:
 		if (bh == NULL)
 			return -EIO;
 		eh = ext_block_hdr(bh);
-		if (ext4_ext_check_header(inode, eh, depth)) {
+		/* subtract from p_depth to get proper eh_depth */
+		if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
 			put_bh(bh);
 			return -EIO;
 		}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4fb86a0..fb51b40 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -188,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	struct ext4_group_desc *gdp;
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi;
-	int fatal = 0, err, count;
+	int fatal = 0, err, count, cleared;
 	ext4_group_t flex_group;
 
 	if (atomic_read(&inode->i_count) > 1) {
@@ -220,10 +220,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	DQUOT_INIT(inode);
+	vfs_dq_init(inode);
 	ext4_xattr_delete_inode(handle, inode);
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
 
 	is_directory = S_ISDIR(inode->i_mode);
 
@@ -248,8 +248,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		goto error_return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
-	if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
-					bit, bitmap_bh->b_data))
+	spin_lock(sb_bgl_lock(sbi, block_group));
+	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
+	spin_unlock(sb_bgl_lock(sbi, block_group));
+	if (!cleared)
 		ext4_error(sb, "ext4_free_inode",
 			   "bit already cleared for inode %lu", ino);
 	else {
@@ -696,6 +698,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	struct inode *ret;
 	ext4_group_t i;
 	int free = 0;
+	static int once = 1;
 	ext4_group_t flex_group;
 
 	/* Cannot create files in a deleted directory */
@@ -715,6 +718,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 
 	if (sbi->s_log_groups_per_flex) {
 		ret2 = find_group_flex(sb, dir, &group);
+		if (ret2 == -1) {
+			ret2 = find_group_other(sb, dir, &group);
+			if (ret2 == 0 && once)
+				once = 0;
+				printk(KERN_NOTICE "ext4: find_group_flex "
+				       "failed, fallback succeeded dir %lu\n",
+				       dir->i_ino);
+		}
 		goto got_group;
 	}
 
@@ -904,7 +915,7 @@ got:
 	ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
 
 	ret = inode;
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto fail_drop;
 	}
@@ -945,10 +956,10 @@ really_out:
 	return ret;
 
 fail_free_drop:
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
 fail_drop:
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	inode->i_nlink = 0;
 	unlock_new_inode(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 03ba20b..71d3ecd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,8 +47,10 @@
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
 					      loff_t new_size)
 {
-	return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
-						   new_size);
+	return jbd2_journal_begin_ordered_truncate(
+					EXT4_SB(inode->i_sb)->s_journal,
+					&EXT4_I(inode)->jinode,
+					new_size);
 }
 
 static void ext4_invalidatepage(struct page *page, unsigned long offset);
@@ -973,6 +975,17 @@ out:
 	return err;
 }
 
+qsize_t ext4_get_reserved_space(struct inode *inode)
+{
+	unsigned long long total;
+
+	spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+	total = EXT4_I(inode)->i_reserved_data_blocks +
+		EXT4_I(inode)->i_reserved_meta_blocks;
+	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	return total;
+}
 /*
  * Calculate the number of metadata blocks need to reserve
  * to allocate @blocks for non extent file based file
@@ -1034,8 +1047,14 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 	/* update per-inode reservations */
 	BUG_ON(used  > EXT4_I(inode)->i_reserved_data_blocks);
 	EXT4_I(inode)->i_reserved_data_blocks -= used;
-
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	/*
+	 * free those over-booking quota for metadata blocks
+	 */
+
+	if (mdb_free)
+		vfs_dq_release_reservation_block(inode, mdb_free);
 }
 
 /*
@@ -1366,6 +1385,10 @@ retry:
 		goto out;
 	}
 
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	flags |= AOP_FLAG_NOFS;
+
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		ext4_journal_stop(handle);
@@ -1375,7 +1398,7 @@ retry:
 	*pagep = page;
 
 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext4_get_block);
+				ext4_get_block);
 
 	if (!ret && ext4_should_journal_data(inode)) {
 		ret = walk_page_buffers(handle, page_buffers(page),
@@ -1547,8 +1570,8 @@ static int ext4_journalled_write_end(struct file *file,
 static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
 {
 	int retries = 0;
-       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       unsigned long md_needed, mdblocks, total = 0;
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned long md_needed, mdblocks, total = 0;
 
 	/*
 	 * recalculate the amount of metadata blocks to reserve
@@ -1564,12 +1587,23 @@ repeat:
 	md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
 	total = md_needed + nrblocks;
 
+	/*
+	 * Make quota reservation here to prevent quota overflow
+	 * later. Real quota accounting is done at pages writeout
+	 * time.
+	 */
+	if (vfs_dq_reserve_block(inode, total)) {
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+		return -EDQUOT;
+	}
+
 	if (ext4_claim_free_blocks(sbi, total)) {
 		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
 			yield();
 			goto repeat;
 		}
+		vfs_dq_release_reservation_block(inode, total);
 		return -ENOSPC;
 	}
 	EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
@@ -1623,6 +1657,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
 	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
 	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+	vfs_dq_release_reservation_block(inode, release);
 }
 
 static void ext4_da_page_release_reservation(struct page *page,
@@ -2437,6 +2473,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	int no_nrwrite_index_update;
 	int pages_written = 0;
 	long pages_skipped;
+	int range_cyclic, cycled = 1, io_done = 0;
 	int needed_blocks, ret = 0, nr_to_writebump = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
 
@@ -2488,9 +2525,15 @@ static int ext4_da_writepages(struct address_space *mapping,
 	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
 		range_whole = 1;
 
-	if (wbc->range_cyclic)
+	range_cyclic = wbc->range_cyclic;
+	if (wbc->range_cyclic) {
 		index = mapping->writeback_index;
-	else
+		if (index)
+			cycled = 0;
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
+		wbc->range_end  = LLONG_MAX;
+		wbc->range_cyclic = 0;
+	} else
 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
 
 	mpd.wbc = wbc;
@@ -2504,6 +2547,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	wbc->no_nrwrite_index_update = 1;
 	pages_skipped = wbc->pages_skipped;
 
+retry:
 	while (!ret && wbc->nr_to_write > 0) {
 
 		/*
@@ -2530,7 +2574,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 
 		ext4_journal_stop(handle);
 
-		if (mpd.retval == -ENOSPC) {
+		if ((mpd.retval == -ENOSPC) && sbi->s_journal) {
 			/* commit the transaction which would
 			 * free blocks released in the transaction
 			 * and try again
@@ -2546,6 +2590,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 			pages_written += mpd.pages_written;
 			wbc->pages_skipped = pages_skipped;
 			ret = 0;
+			io_done = 1;
 		} else if (wbc->nr_to_write)
 			/*
 			 * There is no more writeout needed
@@ -2554,6 +2599,13 @@ static int ext4_da_writepages(struct address_space *mapping,
 			 */
 			break;
 	}
+	if (!io_done && !cycled) {
+		cycled = 1;
+		index = 0;
+		wbc->range_start = index << PAGE_CACHE_SHIFT;
+		wbc->range_end  = mapping->writeback_index - 1;
+		goto retry;
+	}
 	if (pages_skipped != wbc->pages_skipped)
 		printk(KERN_EMERG "This should not happen leaving %s "
 				"with nr_to_write = %ld ret = %d\n",
@@ -2561,6 +2613,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 
 	/* Update index */
 	index += pages_written;
+	wbc->range_cyclic = range_cyclic;
 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
 		/*
 		 * set the writeback_index so that range_cyclic
@@ -2648,6 +2701,9 @@ retry:
 		ret = PTR_ERR(handle);
 		goto out;
 	}
+	/* We cannot recurse into the filesystem as the transaction is already
+	 * started */
+	flags |= AOP_FLAG_NOFS;
 
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
@@ -4586,7 +4642,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			error = PTR_ERR(handle);
 			goto err_out;
 		}
-		error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+		error = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
 		if (error) {
 			ext4_journal_stop(handle);
 			return error;
@@ -4965,7 +5021,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * i_size has been changed by generic_commit_write() and we thus need
  * to include the updated inode in the current transaction.
  *
- * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+ * Also, vfs_dq_alloc_block() will always dirty the inode when blocks
  * are allocated to the file.
  *
  * If the inode is marked synchronous, we don't honour that here - doing
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index deba54f..b038188 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1447,7 +1447,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
 	struct ext4_free_extent *gex = &ac->ac_g_ex;
 
 	BUG_ON(ex->fe_len <= 0);
-	BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+	BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
 	BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
 	BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
 
@@ -3086,9 +3086,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
 		/* release all the reserved blocks if non delalloc */
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
-	else
+	else {
 		percpu_counter_sub(&sbi->s_dirtyblocks_counter,
 						ac->ac_b_ex.fe_len);
+		/* convert reserved quota blocks to real quota blocks */
+		vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len);
+	}
 
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
@@ -3292,7 +3295,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	}
 	BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
 			start > ac->ac_o_ex.fe_logical);
-	BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
+	BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
 
 	/* now prepare goal request */
 
@@ -3589,6 +3592,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 			struct super_block *sb, struct ext4_prealloc_space *pa)
 {
 	ext4_group_t grp;
+	ext4_fsblk_t grp_blk;
 
 	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
 		return;
@@ -3603,8 +3607,12 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	pa->pa_deleted = 1;
 	spin_unlock(&pa->pa_lock);
 
-	/* -1 is to protect from crossing allocation group */
-	ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL);
+	grp_blk = pa->pa_pstart;
+	/* If linear, pa_pstart may be in the next group when pa is used up */
+	if (pa->pa_linear)
+		grp_blk--;
+
+	ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
 
 	/*
 	 * possible race:
@@ -3693,6 +3701,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 	pa->pa_free = pa->pa_len;
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
+	INIT_LIST_HEAD(&pa->pa_inode_list);
+	INIT_LIST_HEAD(&pa->pa_group_list);
 	pa->pa_deleted = 0;
 	pa->pa_linear = 0;
 
@@ -3755,6 +3765,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 	atomic_set(&pa->pa_count, 1);
 	spin_lock_init(&pa->pa_lock);
 	INIT_LIST_HEAD(&pa->pa_inode_list);
+	INIT_LIST_HEAD(&pa->pa_group_list);
 	pa->pa_deleted = 0;
 	pa->pa_linear = 1;
 
@@ -4476,23 +4487,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 			pa->pa_free -= ac->ac_b_ex.fe_len;
 			pa->pa_len -= ac->ac_b_ex.fe_len;
 			spin_unlock(&pa->pa_lock);
-			/*
-			 * We want to add the pa to the right bucket.
-			 * Remove it from the list and while adding
-			 * make sure the list to which we are adding
-			 * doesn't grow big.
-			 */
-			if (likely(pa->pa_free)) {
-				spin_lock(pa->pa_obj_lock);
-				list_del_rcu(&pa->pa_inode_list);
-				spin_unlock(pa->pa_obj_lock);
-				ext4_mb_add_n_trim(ac);
-			}
 		}
-		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
 	if (ac->alloc_semp)
 		up_read(ac->alloc_semp);
+	if (pa) {
+		/*
+		 * We want to add the pa to the right bucket.
+		 * Remove it from the list and while adding
+		 * make sure the list to which we are adding
+		 * doesn't grow big.  We need to release
+		 * alloc_semp before calling ext4_mb_add_n_trim()
+		 */
+		if (pa->pa_linear && likely(pa->pa_free)) {
+			spin_lock(pa->pa_obj_lock);
+			list_del_rcu(&pa->pa_inode_list);
+			spin_unlock(pa->pa_obj_lock);
+			ext4_mb_add_n_trim(ac);
+		}
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
+	}
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
 	if (ac->ac_buddy_page)
@@ -4533,7 +4547,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 	struct ext4_sb_info *sbi;
 	struct super_block *sb;
 	ext4_fsblk_t block = 0;
-	unsigned int inquota;
+	unsigned int inquota = 0;
 	unsigned int reserv_blks = 0;
 
 	sb = ar->inode->i_sb;
@@ -4551,9 +4565,17 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 		   (unsigned long long) ar->pleft,
 		   (unsigned long long) ar->pright);
 
-	if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
-		/*
-		 * With delalloc we already reserved the blocks
+	/*
+	 * For delayed allocation, we could skip the ENOSPC and
+	 * EDQUOT check, as blocks and quotas have been already
+	 * reserved when data being copied into pagecache.
+	 */
+	if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+	else {
+		/* Without delayed allocation we need to verify
+		 * there is enough free blocks to do block allocation
+		 * and verify allocation doesn't exceed the quota limits.
 		 */
 		while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
 			/* let others to free the space */
@@ -4565,19 +4587,16 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			return 0;
 		}
 		reserv_blks = ar->len;
+		while (ar->len && vfs_dq_alloc_block(ar->inode, ar->len)) {
+			ar->flags |= EXT4_MB_HINT_NOPREALLOC;
+			ar->len--;
+		}
+		inquota = ar->len;
+		if (ar->len == 0) {
+			*errp = -EDQUOT;
+			goto out3;
+		}
 	}
-	while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
-		ar->flags |= EXT4_MB_HINT_NOPREALLOC;
-		ar->len--;
-	}
-	if (ar->len == 0) {
-		*errp = -EDQUOT;
-		goto out3;
-	}
-	inquota = ar->len;
-
-	if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
-		ar->flags |= EXT4_MB_DELALLOC_RESERVED;
 
 	ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
 	if (!ac) {
@@ -4643,8 +4662,8 @@ repeat:
 out2:
 	kmem_cache_free(ext4_ac_cachep, ac);
 out1:
-	if (ar->len < inquota)
-		DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
+	if (inquota && ar->len < inquota)
+		vfs_dq_free_block(ar->inode, inquota - ar->len);
 out3:
 	if (!ar->len) {
 		if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 734abca..fe64d9f 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode)
 					+ 1);
 	if (IS_ERR(handle)) {
 		retval = PTR_ERR(handle);
-		goto err_out;
+		return retval;
 	}
 	tmp_inode = ext4_new_inode(handle,
 				inode->i_sb->s_root->d_inode,
@@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode)
 	if (IS_ERR(tmp_inode)) {
 		retval = -ENOMEM;
 		ext4_journal_stop(handle);
-		tmp_inode = NULL;
-		goto err_out;
+		return retval;
 	}
 	i_size_write(tmp_inode, i_size_read(inode));
 	/*
@@ -618,8 +617,7 @@ err_out:
 
 	ext4_journal_stop(handle);
 
-	if (tmp_inode)
-		iput(tmp_inode);
+	iput(tmp_inode);
 
 	return retval;
 }
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ba702bd..8341024 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2092,7 +2092,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2151,7 +2151,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
-	DQUOT_INIT(dentry->d_inode);
+	vfs_dq_init(dentry->d_inode);
 	handle = ext4_journal_start(dir, EXT4_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2318,7 +2318,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
 	if (new_dentry->d_inode)
-		DQUOT_INIT(new_dentry->d_inode);
+		vfs_dq_init(new_dentry->d_inode);
 	handle = ext4_journal_start(old_dir, 2 *
 					EXT4_DATA_TRANS_BLOCKS(old_dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5f06a5..f7371a6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -926,8 +926,6 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_
 #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
 
-static int ext4_dquot_initialize(struct inode *inode, int type);
-static int ext4_dquot_drop(struct inode *inode);
 static int ext4_write_dquot(struct dquot *dquot);
 static int ext4_acquire_dquot(struct dquot *dquot);
 static int ext4_release_dquot(struct dquot *dquot);
@@ -942,9 +940,13 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 				const char *data, size_t len, loff_t off);
 
 static struct dquot_operations ext4_quota_operations = {
-	.initialize	= ext4_dquot_initialize,
-	.drop		= ext4_dquot_drop,
+	.initialize	= dquot_initialize,
+	.drop		= dquot_drop,
 	.alloc_space	= dquot_alloc_space,
+	.reserve_space	= dquot_reserve_space,
+	.claim_space	= dquot_claim_space,
+	.release_rsv	= dquot_release_reserved_space,
+	.get_reserved_space = ext4_get_reserved_space,
 	.alloc_inode	= dquot_alloc_inode,
 	.free_space	= dquot_free_space,
 	.free_inode	= dquot_free_inode,
@@ -1802,7 +1804,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		}
 
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 		if (inode->i_nlink) {
 			printk(KERN_DEBUG
 				"%s: truncating inode %lu to %lld bytes\n",
@@ -3046,14 +3048,17 @@ static void ext4_write_super(struct super_block *sb)
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	int ret = 0;
+	tid_t target;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
 	if (EXT4_SB(sb)->s_journal) {
-		if (wait)
-			ret = ext4_force_commit(sb);
-		else
- 			jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+		if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
+					      &target)) {
+			if (wait)
+				jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
+						     target);
+		}
 	} else {
 		ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
 	}
@@ -3088,7 +3093,6 @@ static int ext4_freeze(struct super_block *sb)
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
 		error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
 		if (error)
 			goto out;
@@ -3365,8 +3369,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
  * is locked for write. Otherwise the are possible deadlocks:
  * Process 1                         Process 2
  * ext4_create()                     quota_sync()
- *   jbd2_journal_start()                   write_dquot()
- *   DQUOT_INIT()                        down(dqio_mutex)
+ *   jbd2_journal_start()                  write_dquot()
+ *   vfs_dq_init()                         down(dqio_mutex)
  *     down(dqio_mutex)                    jbd2_journal_start()
  *
  */
@@ -3378,44 +3382,6 @@ static inline struct inode *dquot_to_inode(struct dquot *dquot)
 	return sb_dqopt(dquot->dq_sb)->files[dquot->dq_type];
 }
 
-static int ext4_dquot_initialize(struct inode *inode, int type)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may create quota structure so we need to reserve enough blocks */
-	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_INIT_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle))
-		return PTR_ERR(handle);
-	ret = dquot_initialize(inode, type);
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
-static int ext4_dquot_drop(struct inode *inode)
-{
-	handle_t *handle;
-	int ret, err;
-
-	/* We may delete quota structure so we need to reserve enough blocks */
-	handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb));
-	if (IS_ERR(handle)) {
-		/*
-		 * We call dquot_drop() anyway to at least release references
-		 * to quota structures so that umount does not hang.
-		 */
-		dquot_drop(inode);
-		return PTR_ERR(handle);
-	}
-	ret = dquot_drop(inode);
-	err = ext4_journal_stop(handle);
-	if (!ret)
-		ret = err;
-	return ret;
-}
-
 static int ext4_write_dquot(struct dquot *dquot)
 {
 	int ret, err;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 157ce65..62b31c2 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -490,7 +490,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 		error = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (IS_SYNC(inode))
 			ext4_handle_sync(handle);
-		DQUOT_FREE_BLOCK(inode, 1);
+		vfs_dq_free_block(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
 		if (ce)
@@ -784,7 +784,7 @@ inserted:
 				/* The old block is released after updating
 				   the inode. */
 				error = -EDQUOT;
-				if (DQUOT_ALLOC_BLOCK(inode, 1))
+				if (vfs_dq_alloc_block(inode, 1))
 					goto cleanup;
 				error = ext4_journal_get_write_access(handle,
 								      new_bh);
@@ -860,7 +860,7 @@ cleanup:
 	return error;
 
 cleanup_dquot:
-	DQUOT_FREE_BLOCK(inode, 1);
+	vfs_dq_free_block(inode, 1);
 	goto cleanup;
 
 bad_block:
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 6b74d09..de0004f 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 	sector_t blocknr;
 
 	/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
-	mutex_lock(&mapping->host->i_mutex);
+	down_read(&mapping->host->i_alloc_sem);
 	blocknr = generic_block_bmap(mapping, block, fat_get_block);
-	mutex_unlock(&mapping->host->i_mutex);
+	up_read(&mapping->host->i_alloc_sem);
 
 	return blocknr;
 }
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index 7ba03a4..da3f361 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -188,7 +188,7 @@ old_compare:
 	goto out;
 }
 
-static struct dentry_operations msdos_dentry_operations = {
+static const struct dentry_operations msdos_dentry_operations = {
 	.d_hash		= msdos_hash,
 	.d_compare	= msdos_cmp,
 };
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 8ae32e3..a0e00e3 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -166,13 +166,13 @@ static int vfat_cmp(struct dentry *dentry, struct qstr *a, struct qstr *b)
 	return 1;
 }
 
-static struct dentry_operations vfat_ci_dentry_ops = {
+static const struct dentry_operations vfat_ci_dentry_ops = {
 	.d_revalidate	= vfat_revalidate_ci,
 	.d_hash		= vfat_hashi,
 	.d_compare	= vfat_cmpi,
 };
 
-static struct dentry_operations vfat_dentry_ops = {
+static const struct dentry_operations vfat_dentry_ops = {
 	.d_revalidate	= vfat_revalidate,
 	.d_hash		= vfat_hash,
 	.d_compare	= vfat_cmp,
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bd215cc..d865ca66 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -141,7 +141,7 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
 	return ret;
 }
 
-#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | FASYNC | O_DIRECT | O_NOATIME)
+#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
@@ -177,21 +177,21 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 		return error;
 
 	/*
-	 * We still need a lock here for now to keep multiple FASYNC calls
-	 * from racing with each other.
+	 * ->fasync() is responsible for setting the FASYNC bit.
 	 */
-	lock_kernel();
-	if ((arg ^ filp->f_flags) & FASYNC) {
-		if (filp->f_op && filp->f_op->fasync) {
-			error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
-			if (error < 0)
-				goto out;
-		}
+	if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op &&
+			filp->f_op->fasync) {
+		error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
+		if (error < 0)
+			goto out;
+		if (error > 0)
+			error = 0;
 	}
-
+	spin_lock(&filp->f_lock);
 	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
+	spin_unlock(&filp->f_lock);
+
  out:
-	unlock_kernel();
 	return error;
 }
 
@@ -516,7 +516,7 @@ static DEFINE_RWLOCK(fasync_lock);
 static struct kmem_cache *fasync_cache __read_mostly;
 
 /*
- * fasync_helper() is used by some character device drivers (mainly mice)
+ * fasync_helper() is used by almost all character device drivers
  * to set up the fasync queue. It returns negative on error, 0 if it did
  * no changes and positive if it added/deleted the entry.
  */
@@ -555,6 +555,13 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
 		result = 1;
 	}
 out:
+	/* Fix up FASYNC bit while still holding fasync_lock */
+	spin_lock(&filp->f_lock);
+	if (on)
+		filp->f_flags |= FASYNC;
+	else
+		filp->f_flags &= ~FASYNC;
+	spin_unlock(&filp->f_lock);
 	write_unlock_irq(&fasync_lock);
 	return result;
 }
diff --git a/fs/file_table.c b/fs/file_table.c
index bbeeac6..b74a8e1 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <linux/eventpoll.h>
 #include <linux/rcupdate.h>
 #include <linux/mount.h>
@@ -127,6 +128,7 @@ struct file *get_empty_filp(void)
 	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	f->f_cred = get_cred(cred);
+	spin_lock_init(&f->f_lock);
 	eventpoll_init_file(f);
 	/* f->f_version: 0 */
 	return f;
@@ -279,6 +281,7 @@ void __fput(struct file *file)
 	if (file->f_op && file->f_op->release)
 		file->f_op->release(inode, file);
 	security_file_free(file);
+	ima_file_free(file);
 	if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e5eaa62..e3fe991 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -274,6 +274,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	BUG_ON(inode->i_state & I_SYNC);
+	WARN_ON(inode->i_state & I_NEW);
 
 	/* Set I_SYNC, reset I_DIRTY */
 	dirty = inode->i_state & I_DIRTY;
@@ -298,6 +299,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
+	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (!(inode->i_state & I_DIRTY) &&
@@ -470,6 +472,11 @@ void generic_sync_sb_inodes(struct super_block *sb,
 			break;
 		}
 
+		if (inode->i_state & I_NEW) {
+			requeue_io(inode);
+			continue;
+		}
+
 		if (wbc->nonblocking && bdi_write_congested(bdi)) {
 			wbc->encountered_congestion = 1;
 			if (!sb_is_blkdev_sb(sb))
@@ -531,7 +538,7 @@ void generic_sync_sb_inodes(struct super_block *sb,
 		list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
 			struct address_space *mapping;
 
-			if (inode->i_state & (I_FREEING|I_WILL_FREE))
+			if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
 				continue;
 			mapping = inode->i_mapping;
 			if (mapping->nrpages == 0)
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index fdff346..06da052 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -224,7 +224,7 @@ static int invalid_nodeid(u64 nodeid)
 	return !nodeid || nodeid == FUSE_ROOT_ID;
 }
 
-struct dentry_operations fuse_dentry_operations = {
+const struct dentry_operations fuse_dentry_operations = {
 	.d_revalidate	= fuse_dentry_revalidate,
 };
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 5e64b81..6fc5aed 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -493,7 +493,7 @@ static inline u64 get_node_id(struct inode *inode)
 /** Device operations */
 extern const struct file_operations fuse_dev_operations;
 
-extern struct dentry_operations fuse_dentry_operations;
+extern const struct dentry_operations fuse_dentry_operations;
 
 /**
  * Get a filled in inode
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index e563a64..3a981b7 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,10 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
 	depends on EXPERIMENTAL && (64BIT || LBD)
+	select DLM if GFS2_FS_LOCKING_DLM
+	select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
+	select SYSFS if GFS2_FS_LOCKING_DLM
+	select IP_SCTP if DLM_SCTP
 	select FS_POSIX_ACL
 	select CRC32
 	help
@@ -18,17 +22,16 @@ config GFS2_FS
 	  the locking module below. Documentation and utilities for GFS2 can
 	  be found here: http://sources.redhat.com/cluster
 
-	  The "nolock" lock module is now built in to GFS2 by default.
+	  The "nolock" lock module is now built in to GFS2 by default. If
+	  you want to use the DLM, be sure to enable HOTPLUG and IPv4/6
+	  networking.
 
 config GFS2_FS_LOCKING_DLM
-	tristate "GFS2 DLM locking module"
-	depends on GFS2_FS && SYSFS && NET && INET && (IPV6 || IPV6=n)
-	select IP_SCTP if DLM_SCTP
-	select CONFIGFS_FS
-	select DLM
+	bool "GFS2 DLM locking"
+	depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG
 	help
 	  Multiple node locking module for GFS2
 
-	  Most users of GFS2 will require this module. It provides the locking
+	  Most users of GFS2 will require this. It provides the locking
 	  interface between GFS2 and the DLM, which is required to use GFS2
 	  in a cluster environment.
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index c1b4ec6a..a851ea4 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,9 +1,9 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
-	glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
+	glops.o inode.o log.o lops.o main.o meta_io.o \
 	mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
 	ops_fstype.o ops_inode.o ops_super.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
 
-obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
+gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
 
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index e335dce..43764f4 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -15,7 +15,6 @@
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 11ffc56..3a5d3f8 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b7c8e5c..aef4d0c 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -60,7 +60,6 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/vmalloc.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c
index f114ba2..dee9b03 100644
--- a/fs/gfs2/eaops.c
+++ b/fs/gfs2/eaops.c
@@ -14,7 +14,6 @@
 #include <linux/capability.h>
 #include <linux/xattr.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 0d1c76d..899763a 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/xattr.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 6b983ae..3984e47 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -10,7 +10,6 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/delay.h>
 #include <linux/sort.h>
@@ -18,7 +17,6 @@
 #include <linux/kallsyms.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/list.h>
-#include <linux/lm_interface.h>
 #include <linux/wait.h>
 #include <linux/module.h>
 #include <linux/rwsem.h>
@@ -155,13 +153,10 @@ static void glock_free(struct gfs2_glock *gl)
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct inode *aspace = gl->gl_aspace;
 
-	if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
-		sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
-
 	if (aspace)
 		gfs2_aspace_put(aspace);
 
-	kmem_cache_free(gfs2_glock_cachep, gl);
+	sdp->sd_lockstruct.ls_ops->lm_put_lock(gfs2_glock_cachep, gl);
 }
 
 /**
@@ -172,6 +167,7 @@ static void glock_free(struct gfs2_glock *gl)
 
 static void gfs2_glock_hold(struct gfs2_glock *gl)
 {
+	GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0);
 	atomic_inc(&gl->gl_ref);
 }
 
@@ -211,17 +207,15 @@ int gfs2_glock_put(struct gfs2_glock *gl)
 			atomic_dec(&lru_count);
 		}
 		spin_unlock(&lru_lock);
-		GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
-		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru));
 		GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
 		glock_free(gl);
 		rv = 1;
 		goto out;
 	}
-	write_unlock(gl_lock_addr(gl->gl_hash));
 	/* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */
 	if (atomic_read(&gl->gl_ref) == 2)
 		gfs2_glock_schedule_for_reclaim(gl);
+	write_unlock(gl_lock_addr(gl->gl_hash));
 out:
 	return rv;
 }
@@ -256,27 +250,6 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
 }
 
 /**
- * gfs2_glock_find() - Find glock by lock number
- * @sdp: The GFS2 superblock
- * @name: The lock name
- *
- * Returns: NULL, or the struct gfs2_glock with the requested number
- */
-
-static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
-					  const struct lm_lockname *name)
-{
-	unsigned int hash = gl_hash(sdp, name);
-	struct gfs2_glock *gl;
-
-	read_lock(gl_lock_addr(hash));
-	gl = search_bucket(hash, sdp, name);
-	read_unlock(gl_lock_addr(hash));
-
-	return gl;
-}
-
-/**
  * may_grant - check if its ok to grant a new lock
  * @gl: The glock
  * @gh: The lock request which we wish to grant
@@ -523,7 +496,7 @@ out_locked:
 }
 
 static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
-				 unsigned int cur_state, unsigned int req_state,
+				 unsigned int req_state,
 				 unsigned int flags)
 {
 	int ret = LM_OUT_ERROR;
@@ -532,7 +505,7 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
 		return req_state == LM_ST_UNLOCKED ? 0 : req_state;
 
 	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+		ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock,
 							 req_state, flags);
 	return ret;
 }
@@ -575,7 +548,7 @@ __acquires(&gl->gl_spin)
 	    gl->gl_state == LM_ST_DEFERRED) &&
 	    !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 		lck_flags |= LM_FLAG_TRY_1CB;
-	ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
+	ret = gfs2_lm_lock(sdp, gl, target, lck_flags);
 
 	if (!(ret & LM_OUT_ASYNC)) {
 		finish_xmote(gl, ret);
@@ -624,10 +597,11 @@ __acquires(&gl->gl_spin)
 
 	GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
 
+	down_read(&gfs2_umount_flush_sem);
 	if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
 	    gl->gl_demote_state != gl->gl_state) {
 		if (find_first_holder(gl))
-			goto out;
+			goto out_unlock;
 		if (nonblock)
 			goto out_sched;
 		set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
@@ -638,23 +612,26 @@ __acquires(&gl->gl_spin)
 			gfs2_demote_wake(gl);
 		ret = do_promote(gl);
 		if (ret == 0)
-			goto out;
+			goto out_unlock;
 		if (ret == 2)
-			return;
+			goto out_sem;
 		gh = find_first_waiter(gl);
 		gl->gl_target = gh->gh_state;
 		if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
 			do_error(gl, 0); /* Fail queued try locks */
 	}
 	do_xmote(gl, gh, gl->gl_target);
+out_sem:
+	up_read(&gfs2_umount_flush_sem);
 	return;
 
 out_sched:
 	gfs2_glock_hold(gl);
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gfs2_glock_put(gl);
-out:
+out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
+	goto out_sem;
 }
 
 static void glock_work_func(struct work_struct *work)
@@ -681,18 +658,6 @@ static void glock_work_func(struct work_struct *work)
 		gfs2_glock_put(gl);
 }
 
-static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		     void **lockp)
-{
-	int error = -EIO;
-	if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
-		return 0;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
-				sdp->sd_lockstruct.ls_lockspace, name, lockp);
-	return error;
-}
-
 /**
  * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
  * @sdp: The GFS2 superblock
@@ -719,10 +684,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl = search_bucket(hash, sdp, &name);
 	read_unlock(gl_lock_addr(hash));
 
-	if (gl || !create) {
-		*glp = gl;
+	*glp = gl;
+	if (gl)
 		return 0;
-	}
+	if (!create)
+		return -ENOENT;
 
 	gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL);
 	if (!gl)
@@ -736,7 +702,9 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	gl->gl_demote_state = LM_ST_EXCLUSIVE;
 	gl->gl_hash = hash;
 	gl->gl_ops = glops;
-	gl->gl_stamp = jiffies;
+	snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number);
+	memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb));
+	gl->gl_lksb.sb_lvbptr = gl->gl_lvb;
 	gl->gl_tchange = jiffies;
 	gl->gl_object = NULL;
 	gl->gl_sbd = sdp;
@@ -753,10 +721,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 		}
 	}
 
-	error = gfs2_lm_get_lock(sdp, &name, &gl->gl_lock);
-	if (error)
-		goto fail_aspace;
-
 	write_lock(gl_lock_addr(hash));
 	tmp = search_bucket(hash, sdp, &name);
 	if (tmp) {
@@ -772,9 +736,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 
 	return 0;
 
-fail_aspace:
-	if (gl->gl_aspace)
-		gfs2_aspace_put(gl->gl_aspace);
 fail:
 	kmem_cache_free(gfs2_glock_cachep, gl);
 	return error;
@@ -966,7 +927,7 @@ do_cancel:
 	if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
 		spin_unlock(&gl->gl_spin);
 		if (sdp->sd_lockstruct.ls_ops->lm_cancel)
-			sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
+			sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
 		spin_lock(&gl->gl_spin);
 	}
 	return;
@@ -1051,7 +1012,6 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
 			spin_lock(&gl->gl_spin);
 			clear_bit(GLF_LOCK, &gl->gl_flags);
 		}
-		gl->gl_stamp = jiffies;
 		if (list_empty(&gl->gl_holders) &&
 		    !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
 		    !test_bit(GLF_DEMOTE, &gl->gl_flags))
@@ -1240,70 +1200,13 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
 		gfs2_glock_dq_uninit(&ghs[x]);
 }
 
-static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
-{
-	int error = -EIO;
-	if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
-		return 0;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
-	return error;
-}
-
-/**
- * gfs2_lvb_hold - attach a LVB from a glock
- * @gl: The glock in question
- *
- */
-
-int gfs2_lvb_hold(struct gfs2_glock *gl)
-{
-	int error;
-
-	if (!atomic_read(&gl->gl_lvb_count)) {
-		error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
-		if (error) 
-			return error;
-		gfs2_glock_hold(gl);
-	}
-	atomic_inc(&gl->gl_lvb_count);
-
-	return 0;
-}
-
-/**
- * gfs2_lvb_unhold - detach a LVB from a glock
- * @gl: The glock in question
- *
- */
-
-void gfs2_lvb_unhold(struct gfs2_glock *gl)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-
-	gfs2_glock_hold(gl);
-	gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
-	if (atomic_dec_and_test(&gl->gl_lvb_count)) {
-		if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
-			sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
-		gl->gl_lvb = NULL;
-		gfs2_glock_put(gl);
-	}
-	gfs2_glock_put(gl);
-}
-
-static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
-			unsigned int state)
+void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
 {
-	struct gfs2_glock *gl;
 	unsigned long delay = 0;
 	unsigned long holdtime;
 	unsigned long now = jiffies;
 
-	gl = gfs2_glock_find(sdp, name);
-	if (!gl)
-		return;
-
+	gfs2_glock_hold(gl);
 	holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
 	if (time_before(now, holdtime))
 		delay = holdtime - now;
@@ -1317,74 +1220,33 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
 		gfs2_glock_put(gl);
 }
 
-static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
-{
-	struct gfs2_jdesc *jd;
-
-	spin_lock(&sdp->sd_jindex_spin);
-	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (jd->jd_jid != jid)
-			continue;
-		jd->jd_dirty = 1;
-		break;
-	}
-	spin_unlock(&sdp->sd_jindex_spin);
-}
-
 /**
- * gfs2_glock_cb - Callback used by locking module
- * @sdp: Pointer to the superblock
- * @type: Type of callback
- * @data: Type dependent data pointer
+ * gfs2_glock_complete - Callback used by locking
+ * @gl: Pointer to the glock
+ * @ret: The return value from the dlm
  *
- * Called by the locking module when it wants to tell us something.
- * Either we need to drop a lock, one of our ASYNC requests completed, or
- * a journal from another client needs to be recovered.
  */
 
-void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
+void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 {
-	struct gfs2_sbd *sdp = cb_data;
-
-	switch (type) {
-	case LM_CB_NEED_E:
-		blocking_cb(sdp, data, LM_ST_UNLOCKED);
-		return;
-
-	case LM_CB_NEED_D:
-		blocking_cb(sdp, data, LM_ST_DEFERRED);
-		return;
-
-	case LM_CB_NEED_S:
-		blocking_cb(sdp, data, LM_ST_SHARED);
-		return;
-
-	case LM_CB_ASYNC: {
-		struct lm_async_cb *async = data;
-		struct gfs2_glock *gl;
-
-		down_read(&gfs2_umount_flush_sem);
-		gl = gfs2_glock_find(sdp, &async->lc_name);
-		if (gfs2_assert_warn(sdp, gl))
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	gl->gl_reply = ret;
+	if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
+		struct gfs2_holder *gh;
+		spin_lock(&gl->gl_spin);
+		gh = find_first_waiter(gl);
+		if ((!(gh && (gh->gh_flags & LM_FLAG_NOEXP)) &&
+		     (gl->gl_target != LM_ST_UNLOCKED)) ||
+		    ((ret & ~LM_OUT_ST_MASK) != 0))
+			set_bit(GLF_FROZEN, &gl->gl_flags);
+		spin_unlock(&gl->gl_spin);
+		if (test_bit(GLF_FROZEN, &gl->gl_flags))
 			return;
-		gl->gl_reply = async->lc_ret;
-		set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
-		if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
-			gfs2_glock_put(gl);
-		up_read(&gfs2_umount_flush_sem);
-		return;
-	}
-
-	case LM_CB_NEED_RECOVERY:
-		gfs2_jdesc_make_dirty(sdp, *(unsigned int *)data);
-		if (sdp->sd_recoverd_process)
-			wake_up_process(sdp->sd_recoverd_process);
-		return;
-
-	default:
-		gfs2_assert_warn(sdp, 0);
-		return;
 	}
+	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+	gfs2_glock_hold(gl);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		gfs2_glock_put(gl);
 }
 
 /**
@@ -1515,6 +1377,25 @@ out:
 	return has_entries;
 }
 
+
+/**
+ * thaw_glock - thaw out a glock which has an unprocessed reply waiting
+ * @gl: The glock to thaw
+ *
+ * N.B. When we freeze a glock, we leave a ref to the glock outstanding,
+ * so this has to result in the ref count being dropped by one.
+ */
+
+static void thaw_glock(struct gfs2_glock *gl)
+{
+	if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+		return;
+	set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+	gfs2_glock_hold(gl);
+	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+		gfs2_glock_put(gl);
+}
+
 /**
  * clear_glock - look at a glock and see if we can free it from glock cache
  * @gl: the glock to look at
@@ -1540,6 +1421,20 @@ static void clear_glock(struct gfs2_glock *gl)
 }
 
 /**
+ * gfs2_glock_thaw - Thaw any frozen glocks
+ * @sdp: The super block
+ *
+ */
+
+void gfs2_glock_thaw(struct gfs2_sbd *sdp)
+{
+	unsigned x;
+
+	for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
+		examine_bucket(thaw_glock, sdp, x);
+}
+
+/**
  * gfs2_gl_hash_clear - Empty out the glock hash table
  * @sdp: the filesystem
  * @wait: wait until it's all gone
@@ -1619,7 +1514,7 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
 	if (flags & LM_FLAG_NOEXP)
 		*p++ = 'e';
 	if (flags & LM_FLAG_ANY)
-		*p++ = 'a';
+		*p++ = 'A';
 	if (flags & LM_FLAG_PRIORITY)
 		*p++ = 'p';
 	if (flags & GL_ASYNC)
@@ -1683,6 +1578,10 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
 		*p++ = 'i';
 	if (test_bit(GLF_REPLY_PENDING, gflags))
 		*p++ = 'r';
+	if (test_bit(GLF_INITIAL, gflags))
+		*p++ = 'I';
+	if (test_bit(GLF_FROZEN, gflags))
+		*p++ = 'F';
 	*p = 0;
 	return buf;
 }
@@ -1717,14 +1616,13 @@ static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 	dtime *= 1000000/HZ; /* demote time in uSec */
 	if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
 		dtime = 0;
-	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
+	gfs2_print_dbg(seq, "G:  s:%s n:%u/%llu f:%s t:%s d:%s/%llu a:%d r:%d\n",
 		  state2str(gl->gl_state),
 		  gl->gl_name.ln_type,
 		  (unsigned long long)gl->gl_name.ln_number,
 		  gflags2str(gflags_buf, &gl->gl_flags),
 		  state2str(gl->gl_target),
 		  state2str(gl->gl_demote_state), dtime,
-		  atomic_read(&gl->gl_lvb_count),
 		  atomic_read(&gl->gl_ail_count),
 		  atomic_read(&gl->gl_ref));
 
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 543ec7e..a602a28 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -11,15 +11,130 @@
 #define __GLOCK_DOT_H__
 
 #include <linux/sched.h>
+#include <linux/parser.h>
 #include "incore.h"
 
-/* Flags for lock requests; used in gfs2_holder gh_flag field.
-   From lm_interface.h:
+/* Options for hostdata parser */
+
+enum {
+	Opt_jid,
+	Opt_id,
+	Opt_first,
+	Opt_nodir,
+	Opt_err,
+};
+
+/*
+ * lm_lockname types
+ */
+
+#define LM_TYPE_RESERVED	0x00
+#define LM_TYPE_NONDISK		0x01
+#define LM_TYPE_INODE		0x02
+#define LM_TYPE_RGRP		0x03
+#define LM_TYPE_META		0x04
+#define LM_TYPE_IOPEN		0x05
+#define LM_TYPE_FLOCK		0x06
+#define LM_TYPE_PLOCK		0x07
+#define LM_TYPE_QUOTA		0x08
+#define LM_TYPE_JOURNAL		0x09
+
+/*
+ * lm_lock() states
+ *
+ * SHARED is compatible with SHARED, not with DEFERRED or EX.
+ * DEFERRED is compatible with DEFERRED, not with SHARED or EX.
+ */
+
+#define LM_ST_UNLOCKED		0
+#define LM_ST_EXCLUSIVE		1
+#define LM_ST_DEFERRED		2
+#define LM_ST_SHARED		3
+
+/*
+ * lm_lock() flags
+ *
+ * LM_FLAG_TRY
+ * Don't wait to acquire the lock if it can't be granted immediately.
+ *
+ * LM_FLAG_TRY_1CB
+ * Send one blocking callback if TRY is set and the lock is not granted.
+ *
+ * LM_FLAG_NOEXP
+ * GFS sets this flag on lock requests it makes while doing journal recovery.
+ * These special requests should not be blocked due to the recovery like
+ * ordinary locks would be.
+ *
+ * LM_FLAG_ANY
+ * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may
+ * also be granted in SHARED.  The preferred state is whichever is compatible
+ * with other granted locks, or the specified state if no other locks exist.
+ *
+ * LM_FLAG_PRIORITY
+ * Override fairness considerations.  Suppose a lock is held in a shared state
+ * and there is a pending request for the deferred state.  A shared lock
+ * request with the priority flag would be allowed to bypass the deferred
+ * request and directly join the other shared lock.  A shared lock request
+ * without the priority flag might be forced to wait until the deferred
+ * requested had acquired and released the lock.
+ */
+
 #define LM_FLAG_TRY		0x00000001
 #define LM_FLAG_TRY_1CB		0x00000002
 #define LM_FLAG_NOEXP		0x00000004
 #define LM_FLAG_ANY		0x00000008
-#define LM_FLAG_PRIORITY	0x00000010 */
+#define LM_FLAG_PRIORITY	0x00000010
+#define GL_ASYNC		0x00000040
+#define GL_EXACT		0x00000080
+#define GL_SKIP			0x00000100
+#define GL_ATIME		0x00000200
+#define GL_NOCACHE		0x00000400
+  
+/*
+ * lm_lock() and lm_async_cb return flags
+ *
+ * LM_OUT_ST_MASK
+ * Masks the lower two bits of lock state in the returned value.
+ *
+ * LM_OUT_CANCELED
+ * The lock request was canceled.
+ *
+ * LM_OUT_ASYNC
+ * The result of the request will be returned in an LM_CB_ASYNC callback.
+ *
+ */
+
+#define LM_OUT_ST_MASK		0x00000003
+#define LM_OUT_CANCELED		0x00000008
+#define LM_OUT_ASYNC		0x00000080
+#define LM_OUT_ERROR		0x00000100
+
+/*
+ * lm_recovery_done() messages
+ */
+
+#define LM_RD_GAVEUP		308
+#define LM_RD_SUCCESS		309
+
+#define GLR_TRYFAILED		13
+
+struct lm_lockops {
+	const char *lm_proto_name;
+	int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
+ 	void (*lm_unmount) (struct gfs2_sbd *sdp);
+	void (*lm_withdraw) (struct gfs2_sbd *sdp);
+	void (*lm_put_lock) (struct kmem_cache *cachep, void *gl);
+	unsigned int (*lm_lock) (struct gfs2_glock *gl,
+				 unsigned int req_state, unsigned int flags);
+	void (*lm_cancel) (struct gfs2_glock *gl);
+	const match_table_t *lm_tokens;
+};
+
+#define LM_FLAG_TRY		0x00000001
+#define LM_FLAG_TRY_1CB		0x00000002
+#define LM_FLAG_NOEXP		0x00000004
+#define LM_FLAG_ANY		0x00000008
+#define LM_FLAG_PRIORITY	0x00000010
 
 #define GL_ASYNC		0x00000040
 #define GL_EXACT		0x00000080
@@ -128,10 +243,12 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
 int gfs2_lvb_hold(struct gfs2_glock *gl);
 void gfs2_lvb_unhold(struct gfs2_glock *gl);
 
-void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
+void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
+void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
 void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
 void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
 void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
+void gfs2_glock_thaw(struct gfs2_sbd *sdp);
 
 int __init gfs2_glock_init(void);
 void gfs2_glock_exit(void);
@@ -141,4 +258,6 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
 int gfs2_register_debugfs(void);
 void gfs2_unregister_debugfs(void);
 
+extern const struct lm_lockops gfs2_dlm_ops;
+
 #endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 8522d3a..bf23a62 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -12,7 +12,6 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/bio.h>
 
 #include "gfs2.h"
@@ -38,20 +37,25 @@
 static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
-	unsigned int blocks;
 	struct list_head *head = &gl->gl_ail_list;
 	struct gfs2_bufdata *bd;
 	struct buffer_head *bh;
-	int error;
+	struct gfs2_trans tr;
 
-	blocks = atomic_read(&gl->gl_ail_count);
-	if (!blocks)
-		return;
+	memset(&tr, 0, sizeof(tr));
+	tr.tr_revokes = atomic_read(&gl->gl_ail_count);
 
-	error = gfs2_trans_begin(sdp, 0, blocks);
-	if (gfs2_assert_withdraw(sdp, !error))
+	if (!tr.tr_revokes)
 		return;
 
+	/* A shortened, inline version of gfs2_trans_begin() */
+	tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
+	tr.tr_ip = (unsigned long)__builtin_return_address(0);
+	INIT_LIST_HEAD(&tr.tr_list_buf);
+	gfs2_log_reserve(sdp, tr.tr_reserved);
+	BUG_ON(current->journal_info);
+	current->journal_info = &tr;
+
 	gfs2_log_lock(sdp);
 	while (!list_empty(head)) {
 		bd = list_entry(head->next, struct gfs2_bufdata,
@@ -72,29 +76,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 }
 
 /**
- * gfs2_pte_inval - Sync and invalidate all PTEs associated with a glock
- * @gl: the glock
- *
- */
-
-static void gfs2_pte_inval(struct gfs2_glock *gl)
-{
-	struct gfs2_inode *ip;
-	struct inode *inode;
-
-	ip = gl->gl_object;
-	inode = &ip->i_inode;
-	if (!ip || !S_ISREG(inode->i_mode))
-		return;
-
-	unmap_shared_mapping_range(inode->i_mapping, 0, 0);
-	if (test_bit(GIF_SW_PAGED, &ip->i_flags))
-		set_bit(GLF_DIRTY, &gl->gl_flags);
-
-}
-
-/**
- * meta_go_sync - sync out the metadata for this glock
+ * rgrp_go_sync - sync out the metadata for this glock
  * @gl: the glock
  *
  * Called when demoting or unlocking an EX glock.  We must flush
@@ -102,36 +84,42 @@ static void gfs2_pte_inval(struct gfs2_glock *gl)
  * not return to caller to demote/unlock the glock until I/O is complete.
  */
 
-static void meta_go_sync(struct gfs2_glock *gl)
+static void rgrp_go_sync(struct gfs2_glock *gl)
 {
-	if (gl->gl_state != LM_ST_EXCLUSIVE)
+	struct address_space *metamapping = gl->gl_aspace->i_mapping;
+	int error;
+
+	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
 		return;
+	BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
 
-	if (test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) {
-		gfs2_log_flush(gl->gl_sbd, gl);
-		gfs2_meta_sync(gl);
-		gfs2_ail_empty_gl(gl);
-	}
+	gfs2_log_flush(gl->gl_sbd, gl);
+	filemap_fdatawrite(metamapping);
+	error = filemap_fdatawait(metamapping);
+        mapping_set_error(metamapping, error);
+	gfs2_ail_empty_gl(gl);
 }
 
 /**
- * meta_go_inval - invalidate the metadata for this glock
+ * rgrp_go_inval - invalidate the metadata for this glock
  * @gl: the glock
  * @flags:
  *
+ * We never used LM_ST_DEFERRED with resource groups, so that we
+ * should always see the metadata flag set here.
+ *
  */
 
-static void meta_go_inval(struct gfs2_glock *gl, int flags)
+static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
-	if (!(flags & DIO_METADATA))
-		return;
+	struct address_space *mapping = gl->gl_aspace->i_mapping;
 
-	gfs2_meta_inval(gl);
-	if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex))
-		gl->gl_sbd->sd_rindex_uptodate = 0;
-	else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) {
-		struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
+	BUG_ON(!(flags & DIO_METADATA));
+	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+	truncate_inode_pages(mapping, 0);
 
+	if (gl->gl_object) {
+		struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
 		rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
 	}
 }
@@ -148,48 +136,54 @@ static void inode_go_sync(struct gfs2_glock *gl)
 	struct address_space *metamapping = gl->gl_aspace->i_mapping;
 	int error;
 
-	if (gl->gl_state != LM_ST_UNLOCKED)
-		gfs2_pte_inval(gl);
-	if (gl->gl_state != LM_ST_EXCLUSIVE)
-		return;
-
 	if (ip && !S_ISREG(ip->i_inode.i_mode))
 		ip = NULL;
+	if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+		unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
+	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
+		return;
 
-	if (test_bit(GLF_DIRTY, &gl->gl_flags)) {
-		gfs2_log_flush(gl->gl_sbd, gl);
-		filemap_fdatawrite(metamapping);
-		if (ip) {
-			struct address_space *mapping = ip->i_inode.i_mapping;
-			filemap_fdatawrite(mapping);
-			error = filemap_fdatawait(mapping);
-			mapping_set_error(mapping, error);
-		}
-		error = filemap_fdatawait(metamapping);
-		mapping_set_error(metamapping, error);
-		clear_bit(GLF_DIRTY, &gl->gl_flags);
-		gfs2_ail_empty_gl(gl);
+	BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE);
+
+	gfs2_log_flush(gl->gl_sbd, gl);
+	filemap_fdatawrite(metamapping);
+	if (ip) {
+		struct address_space *mapping = ip->i_inode.i_mapping;
+		filemap_fdatawrite(mapping);
+		error = filemap_fdatawait(mapping);
+		mapping_set_error(mapping, error);
 	}
+	error = filemap_fdatawait(metamapping);
+	mapping_set_error(metamapping, error);
+	gfs2_ail_empty_gl(gl);
 }
 
 /**
  * inode_go_inval - prepare a inode glock to be released
  * @gl: the glock
  * @flags:
+ * 
+ * Normally we invlidate everything, but if we are moving into
+ * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we
+ * can keep hold of the metadata, since it won't have changed.
  *
  */
 
 static void inode_go_inval(struct gfs2_glock *gl, int flags)
 {
 	struct gfs2_inode *ip = gl->gl_object;
-	int meta = (flags & DIO_METADATA);
 
-	if (meta) {
-		gfs2_meta_inval(gl);
+	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
+
+	if (flags & DIO_METADATA) {
+		struct address_space *mapping = gl->gl_aspace->i_mapping;
+		truncate_inode_pages(mapping, 0);
 		if (ip)
 			set_bit(GIF_INVALID, &ip->i_flags);
 	}
 
+	if (ip == GFS2_I(gl->gl_sbd->sd_rindex))
+		gl->gl_sbd->sd_rindex_uptodate = 0;
 	if (ip && S_ISREG(ip->i_inode.i_mode))
 		truncate_inode_pages(ip->i_inode.i_mapping, 0);
 }
@@ -390,20 +384,7 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl)
 	return 0;
 }
 
-/**
- * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock
- * @gl: the glock
- *
- * Returns: 1 if it's ok
- */
-
-static int quota_go_demote_ok(const struct gfs2_glock *gl)
-{
-	return !atomic_read(&gl->gl_lvb_count);
-}
-
 const struct gfs2_glock_operations gfs2_meta_glops = {
-	.go_xmote_th = meta_go_sync,
 	.go_type = LM_TYPE_META,
 };
 
@@ -418,8 +399,8 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
 };
 
 const struct gfs2_glock_operations gfs2_rgrp_glops = {
-	.go_xmote_th = meta_go_sync,
-	.go_inval = meta_go_inval,
+	.go_xmote_th = rgrp_go_sync,
+	.go_inval = rgrp_go_inval,
 	.go_demote_ok = rgrp_go_demote_ok,
 	.go_lock = rgrp_go_lock,
 	.go_unlock = rgrp_go_unlock,
@@ -448,7 +429,6 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
 };
 
 const struct gfs2_glock_operations gfs2_quota_glops = {
-	.go_demote_ok = quota_go_demote_ok,
 	.go_type = LM_TYPE_QUOTA,
 };
 
@@ -456,3 +436,15 @@ const struct gfs2_glock_operations gfs2_journal_glops = {
 	.go_type = LM_TYPE_JOURNAL,
 };
 
+const struct gfs2_glock_operations *gfs2_glops_list[] = {
+	[LM_TYPE_META] = &gfs2_meta_glops,
+	[LM_TYPE_INODE] = &gfs2_inode_glops,
+	[LM_TYPE_RGRP] = &gfs2_rgrp_glops,
+	[LM_TYPE_NONDISK] = &gfs2_trans_glops,
+	[LM_TYPE_IOPEN] = &gfs2_iopen_glops,
+	[LM_TYPE_FLOCK] = &gfs2_flock_glops,
+	[LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
+	[LM_TYPE_QUOTA] = &gfs2_quota_glops,
+	[LM_TYPE_JOURNAL] = &gfs2_journal_glops,
+};
+
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index a1d9b5b..b3aa2e3 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -21,5 +21,6 @@ extern const struct gfs2_glock_operations gfs2_flock_glops;
 extern const struct gfs2_glock_operations gfs2_nondisk_glops;
 extern const struct gfs2_glock_operations gfs2_quota_glops;
 extern const struct gfs2_glock_operations gfs2_journal_glops;
+extern const struct gfs2_glock_operations *gfs2_glops_list[];
 
 #endif /* __GLOPS_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 608849d..399d1b9 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,8 @@
 
 #include <linux/fs.h>
 #include <linux/workqueue.h>
+#include <linux/dlm.h>
+#include <linux/buffer_head.h>
 
 #define DIO_WAIT	0x00000010
 #define DIO_METADATA	0x00000020
@@ -26,6 +28,7 @@ struct gfs2_trans;
 struct gfs2_ail;
 struct gfs2_jdesc;
 struct gfs2_sbd;
+struct lm_lockops;
 
 typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
 
@@ -121,6 +124,28 @@ struct gfs2_bufdata {
 	struct list_head bd_ail_gl_list;
 };
 
+/*
+ * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
+ * prefix of lock_dlm_ gets awkward.
+ */
+
+#define GDLM_STRNAME_BYTES	25
+#define GDLM_LVB_SIZE		32
+
+enum {
+	DFL_BLOCK_LOCKS		= 0,
+};
+
+struct lm_lockname {
+	u64 ln_number;
+	unsigned int ln_type;
+};
+
+#define lm_name_equal(name1, name2) \
+        (((name1)->ln_number == (name2)->ln_number) && \
+         ((name1)->ln_type == (name2)->ln_type))
+
+
 struct gfs2_glock_operations {
 	void (*go_xmote_th) (struct gfs2_glock *gl);
 	int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
@@ -162,6 +187,8 @@ enum {
 	GLF_LFLUSH			= 7,
 	GLF_INVALIDATE_IN_PROGRESS	= 8,
 	GLF_REPLY_PENDING		= 9,
+	GLF_INITIAL			= 10,
+	GLF_FROZEN			= 11,
 };
 
 struct gfs2_glock {
@@ -176,16 +203,15 @@ struct gfs2_glock {
 	unsigned int gl_target;
 	unsigned int gl_reply;
 	unsigned int gl_hash;
+	unsigned int gl_req;
 	unsigned int gl_demote_state; /* state requested by remote node */
 	unsigned long gl_demote_time; /* time of first demote request */
 	struct list_head gl_holders;
 
 	const struct gfs2_glock_operations *gl_ops;
-	void *gl_lock;
-	char *gl_lvb;
-	atomic_t gl_lvb_count;
-
-	unsigned long gl_stamp;
+	char gl_strname[GDLM_STRNAME_BYTES];
+	struct dlm_lksb gl_lksb;
+	char gl_lvb[32];
 	unsigned long gl_tchange;
 	void *gl_object;
 
@@ -283,7 +309,9 @@ enum {
 
 struct gfs2_quota_data {
 	struct list_head qd_list;
-	unsigned int qd_count;
+	struct list_head qd_reclaim;
+
+	atomic_t qd_count;
 
 	u32 qd_id;
 	unsigned long qd_flags;		/* QDF_... */
@@ -303,7 +331,6 @@ struct gfs2_quota_data {
 
 	u64 qd_sync_gen;
 	unsigned long qd_last_warn;
-	unsigned long qd_last_touched;
 };
 
 struct gfs2_trans {
@@ -390,7 +417,7 @@ struct gfs2_args {
 	unsigned int ar_suiddir:1;		/* suiddir support */
 	unsigned int ar_data:2;			/* ordered/writeback */
 	unsigned int ar_meta:1;			/* mount metafs */
-	unsigned int ar_num_glockd;		/* Number of glockd threads */
+	unsigned int ar_discard:1;		/* discard requests */
 };
 
 struct gfs2_tune {
@@ -406,7 +433,6 @@ struct gfs2_tune {
 	unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */
 	unsigned int gt_quota_scale_num; /* Numerator */
 	unsigned int gt_quota_scale_den; /* Denominator */
-	unsigned int gt_quota_cache_secs;
 	unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
 	unsigned int gt_new_files_jdata;
 	unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
@@ -445,6 +471,31 @@ struct gfs2_sb_host {
 
 	char sb_lockproto[GFS2_LOCKNAME_LEN];
 	char sb_locktable[GFS2_LOCKNAME_LEN];
+	u8 sb_uuid[16];
+};
+
+/*
+ * lm_mount() return values
+ *
+ * ls_jid - the journal ID this node should use
+ * ls_first - this node is the first to mount the file system
+ * ls_lockspace - lock module's context for this file system
+ * ls_ops - lock module's functions
+ */
+
+struct lm_lockstruct {
+	u32 ls_id;
+	unsigned int ls_jid;
+	unsigned int ls_first;
+	unsigned int ls_first_done;
+	unsigned int ls_nodir;
+	const struct lm_lockops *ls_ops;
+	unsigned long ls_flags;
+	dlm_lockspace_t *ls_dlm;
+
+	int ls_recover_jid;
+	int ls_recover_jid_done;
+	int ls_recover_jid_status;
 };
 
 struct gfs2_sbd {
@@ -520,7 +571,6 @@ struct gfs2_sbd {
 	spinlock_t sd_jindex_spin;
 	struct mutex sd_jindex_mutex;
 	unsigned int sd_journals;
-	unsigned long sd_jindex_refresh_time;
 
 	struct gfs2_jdesc *sd_jdesc;
 	struct gfs2_holder sd_journal_gh;
@@ -540,7 +590,6 @@ struct gfs2_sbd {
 
 	struct list_head sd_quota_list;
 	atomic_t sd_quota_count;
-	spinlock_t sd_quota_spin;
 	struct mutex sd_quota_mutex;
 	wait_queue_head_t sd_quota_wait;
 	struct list_head sd_trunc_list;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 3b87c18..7b277d44 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -16,7 +16,6 @@
 #include <linux/sort.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/security.h>
 #include <linux/time.h>
 
@@ -137,16 +136,16 @@ void gfs2_set_iop(struct inode *inode)
 
 	if (S_ISREG(mode)) {
 		inode->i_op = &gfs2_file_iops;
-		if (sdp->sd_args.ar_localflocks)
-			inode->i_fop = &gfs2_file_fops_nolock;
+		if (gfs2_localflocks(sdp))
+			inode->i_fop = gfs2_file_fops_nolock;
 		else
-			inode->i_fop = &gfs2_file_fops;
+			inode->i_fop = gfs2_file_fops;
 	} else if (S_ISDIR(mode)) {
 		inode->i_op = &gfs2_dir_iops;
-		if (sdp->sd_args.ar_localflocks)
-			inode->i_fop = &gfs2_dir_fops_nolock;
+		if (gfs2_localflocks(sdp))
+			inode->i_fop = gfs2_dir_fops_nolock;
 		else
-			inode->i_fop = &gfs2_dir_fops;
+			inode->i_fop = gfs2_dir_fops;
 	} else if (S_ISLNK(mode)) {
 		inode->i_op = &gfs2_symlink_iops;
 	} else {
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d532936..dca4fee 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -101,12 +101,26 @@ void gfs2_dinode_print(const struct gfs2_inode *ip);
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
 extern const struct inode_operations gfs2_symlink_iops;
-extern const struct file_operations gfs2_file_fops;
-extern const struct file_operations gfs2_dir_fops;
-extern const struct file_operations gfs2_file_fops_nolock;
-extern const struct file_operations gfs2_dir_fops_nolock;
+extern const struct file_operations *gfs2_file_fops_nolock;
+extern const struct file_operations *gfs2_dir_fops_nolock;
 
 extern void gfs2_set_inode_flags(struct inode *inode);
+ 
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+extern const struct file_operations *gfs2_file_fops;
+extern const struct file_operations *gfs2_dir_fops;
+static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
+{
+	return sdp->sd_args.ar_localflocks;
+}
+#else /* Single node only */
+#define gfs2_file_fops NULL
+#define gfs2_dir_fops NULL
+static inline int gfs2_localflocks(const struct gfs2_sbd *sdp)
+{
+	return 1;
+}
+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
 
 #endif /* __INODE_DOT_H__ */
 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
new file mode 100644
index 0000000..46df988
--- /dev/null
+++ b/fs/gfs2/lock_dlm.c
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
+ * Copyright (C) 2004-2009 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#include <linux/fs.h>
+#include <linux/dlm.h>
+#include <linux/types.h>
+#include <linux/gfs2_ondisk.h>
+
+#include "incore.h"
+#include "glock.h"
+#include "util.h"
+
+
+static void gdlm_ast(void *arg)
+{
+	struct gfs2_glock *gl = arg;
+	unsigned ret = gl->gl_state;
+
+	BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
+
+	if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID)
+		memset(gl->gl_lvb, 0, GDLM_LVB_SIZE);
+
+	switch (gl->gl_lksb.sb_status) {
+	case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
+		kmem_cache_free(gfs2_glock_cachep, gl);
+		return;
+	case -DLM_ECANCEL: /* Cancel while getting lock */
+		ret |= LM_OUT_CANCELED;
+		goto out;
+	case -EAGAIN: /* Try lock fails */
+		goto out;
+	case -EINVAL: /* Invalid */
+	case -ENOMEM: /* Out of memory */
+		ret |= LM_OUT_ERROR;
+		goto out;
+	case 0: /* Success */
+		break;
+	default: /* Something unexpected */
+		BUG();
+	}
+
+	ret = gl->gl_req;
+	if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) {
+		if (gl->gl_req == LM_ST_SHARED)
+			ret = LM_ST_DEFERRED;
+		else if (gl->gl_req == LM_ST_DEFERRED)
+			ret = LM_ST_SHARED;
+		else
+			BUG();
+	}
+
+	set_bit(GLF_INITIAL, &gl->gl_flags);
+	gfs2_glock_complete(gl, ret);
+	return;
+out:
+	if (!test_bit(GLF_INITIAL, &gl->gl_flags))
+		gl->gl_lksb.sb_lkid = 0;
+	gfs2_glock_complete(gl, ret);
+}
+
+static void gdlm_bast(void *arg, int mode)
+{
+	struct gfs2_glock *gl = arg;
+
+	switch (mode) {
+	case DLM_LOCK_EX:
+		gfs2_glock_cb(gl, LM_ST_UNLOCKED);
+		break;
+	case DLM_LOCK_CW:
+		gfs2_glock_cb(gl, LM_ST_DEFERRED);
+		break;
+	case DLM_LOCK_PR:
+		gfs2_glock_cb(gl, LM_ST_SHARED);
+		break;
+	default:
+		printk(KERN_ERR "unknown bast mode %d", mode);
+		BUG();
+	}
+}
+
+/* convert gfs lock-state to dlm lock-mode */
+
+static int make_mode(const unsigned int lmstate)
+{
+	switch (lmstate) {
+	case LM_ST_UNLOCKED:
+		return DLM_LOCK_NL;
+	case LM_ST_EXCLUSIVE:
+		return DLM_LOCK_EX;
+	case LM_ST_DEFERRED:
+		return DLM_LOCK_CW;
+	case LM_ST_SHARED:
+		return DLM_LOCK_PR;
+	}
+	printk(KERN_ERR "unknown LM state %d", lmstate);
+	BUG();
+	return -1;
+}
+
+static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
+		      const int req)
+{
+	u32 lkf = 0;
+
+	if (gfs_flags & LM_FLAG_TRY)
+		lkf |= DLM_LKF_NOQUEUE;
+
+	if (gfs_flags & LM_FLAG_TRY_1CB) {
+		lkf |= DLM_LKF_NOQUEUE;
+		lkf |= DLM_LKF_NOQUEUEBAST;
+	}
+
+	if (gfs_flags & LM_FLAG_PRIORITY) {
+		lkf |= DLM_LKF_NOORDER;
+		lkf |= DLM_LKF_HEADQUE;
+	}
+
+	if (gfs_flags & LM_FLAG_ANY) {
+		if (req == DLM_LOCK_PR)
+			lkf |= DLM_LKF_ALTCW;
+		else if (req == DLM_LOCK_CW)
+			lkf |= DLM_LKF_ALTPR;
+		else
+			BUG();
+	}
+
+	if (lkid != 0) 
+		lkf |= DLM_LKF_CONVERT;
+
+	lkf |= DLM_LKF_VALBLK;
+
+	return lkf;
+}
+
+static unsigned int gdlm_lock(struct gfs2_glock *gl,
+			      unsigned int req_state, unsigned int flags)
+{
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	int error;
+	int req;
+	u32 lkf;
+
+	gl->gl_req = req_state;
+	req = make_mode(req_state);
+	lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
+
+	/*
+	 * Submit the actual lock request.
+	 */
+
+	error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname,
+			 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
+	if (error == -EAGAIN)
+		return 0;
+	if (error)
+		return LM_OUT_ERROR;
+	return LM_OUT_ASYNC;
+}
+
+static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr)
+{
+	struct gfs2_glock *gl = ptr;
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	int error;
+
+	if (gl->gl_lksb.sb_lkid == 0) {
+		kmem_cache_free(cachep, gl);
+		return;
+	}
+
+	error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
+			   NULL, gl);
+	if (error) {
+		printk(KERN_ERR "gdlm_unlock %x,%llx err=%d\n",
+		       gl->gl_name.ln_type,
+		       (unsigned long long)gl->gl_name.ln_number, error);
+		return;
+	}
+}
+
+static void gdlm_cancel(struct gfs2_glock *gl)
+{
+	struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct;
+	dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl);
+}
+
+static int gdlm_mount(struct gfs2_sbd *sdp, const char *fsname)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	int error;
+
+	if (fsname == NULL) {
+		fs_info(sdp, "no fsname found\n");
+		return -EINVAL;
+	}
+
+	error = dlm_new_lockspace(fsname, strlen(fsname), &ls->ls_dlm,
+				  DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
+				  (ls->ls_nodir ? DLM_LSFL_NODIR : 0),
+				  GDLM_LVB_SIZE);
+	if (error)
+		printk(KERN_ERR "dlm_new_lockspace error %d", error);
+
+	return error;
+}
+
+static void gdlm_unmount(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	if (ls->ls_dlm) {
+		dlm_release_lockspace(ls->ls_dlm, 2);
+		ls->ls_dlm = NULL;
+	}
+}
+
+static const match_table_t dlm_tokens = {
+	{ Opt_jid, "jid=%d"},
+	{ Opt_id, "id=%d"},
+	{ Opt_first, "first=%d"},
+	{ Opt_nodir, "nodir=%d"},
+	{ Opt_err, NULL },
+};
+
+const struct lm_lockops gfs2_dlm_ops = {
+	.lm_proto_name = "lock_dlm",
+	.lm_mount = gdlm_mount,
+	.lm_unmount = gdlm_unmount,
+	.lm_put_lock = gdlm_put_lock,
+	.lm_lock = gdlm_lock,
+	.lm_cancel = gdlm_cancel,
+	.lm_tokens = &dlm_tokens,
+};
+
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
deleted file mode 100644
index 523243a..0000000
--- a/fs/gfs2/locking.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <linux/wait.h>
-#include <linux/sched.h>
-#include <linux/kmod.h>
-#include <linux/fs.h>
-#include <linux/delay.h>
-#include <linux/lm_interface.h>
-
-struct lmh_wrapper {
-	struct list_head lw_list;
-	const struct lm_lockops *lw_ops;
-};
-
-static int nolock_mount(char *table_name, char *host_data,
-			lm_callback_t cb, void *cb_data,
-			unsigned int min_lvb_size, int flags,
-			struct lm_lockstruct *lockstruct,
-			struct kobject *fskobj);
-
-/* List of registered low-level locking protocols.  A file system selects one
-   of them by name at mount time, e.g. lock_nolock, lock_dlm. */
-
-static const struct lm_lockops nolock_ops = {
-	.lm_proto_name = "lock_nolock",
-	.lm_mount = nolock_mount,
-};
-
-static struct lmh_wrapper nolock_proto  = {
-	.lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
-	.lw_ops = &nolock_ops,
-};
-
-static LIST_HEAD(lmh_list);
-static DEFINE_MUTEX(lmh_lock);
-
-static int nolock_mount(char *table_name, char *host_data,
-			lm_callback_t cb, void *cb_data,
-			unsigned int min_lvb_size, int flags,
-			struct lm_lockstruct *lockstruct,
-			struct kobject *fskobj)
-{
-	char *c;
-	unsigned int jid;
-
-	c = strstr(host_data, "jid=");
-	if (!c)
-		jid = 0;
-	else {
-		c += 4;
-		sscanf(c, "%u", &jid);
-	}
-
-	lockstruct->ls_jid = jid;
-	lockstruct->ls_first = 1;
-	lockstruct->ls_lvb_size = min_lvb_size;
-	lockstruct->ls_ops = &nolock_ops;
-	lockstruct->ls_flags = LM_LSFLAG_LOCAL;
-
-	return 0;
-}
-
-/**
- * gfs2_register_lockproto - Register a low-level locking protocol
- * @proto: the protocol definition
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-int gfs2_register_lockproto(const struct lm_lockops *proto)
-{
-	struct lmh_wrapper *lw;
-
-	mutex_lock(&lmh_lock);
-
-	list_for_each_entry(lw, &lmh_list, lw_list) {
-		if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
-			mutex_unlock(&lmh_lock);
-			printk(KERN_INFO "GFS2: protocol %s already exists\n",
-			       proto->lm_proto_name);
-			return -EEXIST;
-		}
-	}
-
-	lw = kzalloc(sizeof(struct lmh_wrapper), GFP_KERNEL);
-	if (!lw) {
-		mutex_unlock(&lmh_lock);
-		return -ENOMEM;
-	}
-
-	lw->lw_ops = proto;
-	list_add(&lw->lw_list, &lmh_list);
-
-	mutex_unlock(&lmh_lock);
-
-	return 0;
-}
-
-/**
- * gfs2_unregister_lockproto - Unregister a low-level locking protocol
- * @proto: the protocol definition
- *
- */
-
-void gfs2_unregister_lockproto(const struct lm_lockops *proto)
-{
-	struct lmh_wrapper *lw;
-
-	mutex_lock(&lmh_lock);
-
-	list_for_each_entry(lw, &lmh_list, lw_list) {
-		if (!strcmp(lw->lw_ops->lm_proto_name, proto->lm_proto_name)) {
-			list_del(&lw->lw_list);
-			mutex_unlock(&lmh_lock);
-			kfree(lw);
-			return;
-		}
-	}
-
-	mutex_unlock(&lmh_lock);
-
-	printk(KERN_WARNING "GFS2: can't unregister lock protocol %s\n",
-	       proto->lm_proto_name);
-}
-
-/**
- * gfs2_mount_lockproto - Mount a lock protocol
- * @proto_name - the name of the protocol
- * @table_name - the name of the lock space
- * @host_data - data specific to this host
- * @cb - the callback to the code using the lock module
- * @sdp - The GFS2 superblock
- * @min_lvb_size - the mininum LVB size that the caller can deal with
- * @flags - LM_MFLAG_*
- * @lockstruct - a structure returned describing the mount
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
-			 lm_callback_t cb, void *cb_data,
-			 unsigned int min_lvb_size, int flags,
-			 struct lm_lockstruct *lockstruct,
-			 struct kobject *fskobj)
-{
-	struct lmh_wrapper *lw = NULL;
-	int try = 0;
-	int error, found;
-
-
-retry:
-	mutex_lock(&lmh_lock);
-
-	if (list_empty(&nolock_proto.lw_list))
-		list_add(&nolock_proto.lw_list, &lmh_list);
-
-	found = 0;
-	list_for_each_entry(lw, &lmh_list, lw_list) {
-		if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
-			found = 1;
-			break;
-		}
-	}
-
-	if (!found) {
-		if (!try && capable(CAP_SYS_MODULE)) {
-			try = 1;
-			mutex_unlock(&lmh_lock);
-			request_module(proto_name);
-			goto retry;
-		}
-		printk(KERN_INFO "GFS2: can't find protocol %s\n", proto_name);
-		error = -ENOENT;
-		goto out;
-	}
-
-	if (lw->lw_ops->lm_owner &&
-	    !try_module_get(lw->lw_ops->lm_owner)) {
-		try = 0;
-		mutex_unlock(&lmh_lock);
-		msleep(1000);
-		goto retry;
-	}
-
-	error = lw->lw_ops->lm_mount(table_name, host_data, cb, cb_data,
-				     min_lvb_size, flags, lockstruct, fskobj);
-	if (error)
-		module_put(lw->lw_ops->lm_owner);
-out:
-	mutex_unlock(&lmh_lock);
-	return error;
-}
-
-void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
-{
-	mutex_lock(&lmh_lock);
-	if (lockstruct->ls_ops->lm_unmount)
-		lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
-	if (lockstruct->ls_ops->lm_owner)
-		module_put(lockstruct->ls_ops->lm_owner);
-	mutex_unlock(&lmh_lock);
-}
-
-/**
- * gfs2_withdraw_lockproto - abnormally unmount a lock module
- * @lockstruct: the lockstruct passed into mount
- *
- */
-
-void gfs2_withdraw_lockproto(struct lm_lockstruct *lockstruct)
-{
-	mutex_lock(&lmh_lock);
-	lockstruct->ls_ops->lm_withdraw(lockstruct->ls_lockspace);
-	if (lockstruct->ls_ops->lm_owner)
-		module_put(lockstruct->ls_ops->lm_owner);
-	mutex_unlock(&lmh_lock);
-}
-
-EXPORT_SYMBOL_GPL(gfs2_register_lockproto);
-EXPORT_SYMBOL_GPL(gfs2_unregister_lockproto);
-
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
deleted file mode 100644
index 2609bb6..0000000
--- a/fs/gfs2/locking/dlm/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
-lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o
-
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
deleted file mode 100644
index 2482c90..0000000
--- a/fs/gfs2/locking/dlm/lock.c
+++ /dev/null
@@ -1,708 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include "lock_dlm.h"
-
-static char junk_lvb[GDLM_LVB_SIZE];
-
-
-/* convert dlm lock-mode to gfs lock-state */
-
-static s16 gdlm_make_lmstate(s16 dlmmode)
-{
-	switch (dlmmode) {
-	case DLM_LOCK_IV:
-	case DLM_LOCK_NL:
-		return LM_ST_UNLOCKED;
-	case DLM_LOCK_EX:
-		return LM_ST_EXCLUSIVE;
-	case DLM_LOCK_CW:
-		return LM_ST_DEFERRED;
-	case DLM_LOCK_PR:
-		return LM_ST_SHARED;
-	}
-	gdlm_assert(0, "unknown DLM mode %d", dlmmode);
-	return -1;
-}
-
-/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
-   thread gets to it. */
-
-static void queue_submit(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-
-	spin_lock(&ls->async_lock);
-	list_add_tail(&lp->delay_list, &ls->submit);
-	spin_unlock(&ls->async_lock);
-	wake_up(&ls->thread_wait);
-}
-
-static void wake_up_ast(struct gdlm_lock *lp)
-{
-	clear_bit(LFL_AST_WAIT, &lp->flags);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&lp->flags, LFL_AST_WAIT);
-}
-
-static void gdlm_delete_lp(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-
-	spin_lock(&ls->async_lock);
-	if (!list_empty(&lp->delay_list))
-		list_del_init(&lp->delay_list);
-	ls->all_locks_count--;
-	spin_unlock(&ls->async_lock);
-
-	kfree(lp);
-}
-
-static void gdlm_queue_delayed(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-
-	spin_lock(&ls->async_lock);
-	list_add_tail(&lp->delay_list, &ls->delayed);
-	spin_unlock(&ls->async_lock);
-}
-
-static void process_complete(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-	struct lm_async_cb acb;
-
-	memset(&acb, 0, sizeof(acb));
-
-	if (lp->lksb.sb_status == -DLM_ECANCEL) {
-		log_info("complete dlm cancel %x,%llx flags %lx",
-		 	 lp->lockname.ln_type,
-			 (unsigned long long)lp->lockname.ln_number,
-			 lp->flags);
-
-		lp->req = lp->cur;
-		acb.lc_ret |= LM_OUT_CANCELED;
-		if (lp->cur == DLM_LOCK_IV)
-			lp->lksb.sb_lkid = 0;
-		goto out;
-	}
-
-	if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
-		if (lp->lksb.sb_status != -DLM_EUNLOCK) {
-			log_info("unlock sb_status %d %x,%llx flags %lx",
-				 lp->lksb.sb_status, lp->lockname.ln_type,
-				 (unsigned long long)lp->lockname.ln_number,
-				 lp->flags);
-			return;
-		}
-
-		lp->cur = DLM_LOCK_IV;
-		lp->req = DLM_LOCK_IV;
-		lp->lksb.sb_lkid = 0;
-
-		if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
-			gdlm_delete_lp(lp);
-			return;
-		}
-		goto out;
-	}
-
-	if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
-		memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
-
-	if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
-		if (lp->req == DLM_LOCK_PR)
-			lp->req = DLM_LOCK_CW;
-		else if (lp->req == DLM_LOCK_CW)
-			lp->req = DLM_LOCK_PR;
-	}
-
-	/*
-	 * A canceled lock request.  The lock was just taken off the delayed
-	 * list and was never even submitted to dlm.
-	 */
-
-	if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
-		log_info("complete internal cancel %x,%llx",
-		 	 lp->lockname.ln_type,
-			 (unsigned long long)lp->lockname.ln_number);
-		lp->req = lp->cur;
-		acb.lc_ret |= LM_OUT_CANCELED;
-		goto out;
-	}
-
-	/*
-	 * An error occured.
-	 */
-
-	if (lp->lksb.sb_status) {
-		/* a "normal" error */
-		if ((lp->lksb.sb_status == -EAGAIN) &&
-		    (lp->lkf & DLM_LKF_NOQUEUE)) {
-			lp->req = lp->cur;
-			if (lp->cur == DLM_LOCK_IV)
-				lp->lksb.sb_lkid = 0;
-			goto out;
-		}
-
-		/* this could only happen with cancels I think */
-		log_info("ast sb_status %d %x,%llx flags %lx",
-			 lp->lksb.sb_status, lp->lockname.ln_type,
-			 (unsigned long long)lp->lockname.ln_number,
-			 lp->flags);
-		return;
-	}
-
-	/*
-	 * This is an AST for an EX->EX conversion for sync_lvb from GFS.
-	 */
-
-	if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
-		wake_up_ast(lp);
-		return;
-	}
-
-	/*
-	 * A lock has been demoted to NL because it initially completed during
-	 * BLOCK_LOCKS.  Now it must be requested in the originally requested
-	 * mode.
-	 */
-
-	if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
-		gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
-			    lp->lockname.ln_type,
-			    (unsigned long long)lp->lockname.ln_number);
-		gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
-			    lp->lockname.ln_type,
-			    (unsigned long long)lp->lockname.ln_number);
-
-		lp->cur = DLM_LOCK_NL;
-		lp->req = lp->prev_req;
-		lp->prev_req = DLM_LOCK_IV;
-		lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
-		set_bit(LFL_NOCACHE, &lp->flags);
-
-		if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-		    !test_bit(LFL_NOBLOCK, &lp->flags))
-			gdlm_queue_delayed(lp);
-		else
-			queue_submit(lp);
-		return;
-	}
-
-	/*
-	 * A request is granted during dlm recovery.  It may be granted
-	 * because the locks of a failed node were cleared.  In that case,
-	 * there may be inconsistent data beneath this lock and we must wait
-	 * for recovery to complete to use it.  When gfs recovery is done this
-	 * granted lock will be converted to NL and then reacquired in this
-	 * granted state.
-	 */
-
-	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-	    !test_bit(LFL_NOBLOCK, &lp->flags) &&
-	    lp->req != DLM_LOCK_NL) {
-
-		lp->cur = lp->req;
-		lp->prev_req = lp->req;
-		lp->req = DLM_LOCK_NL;
-		lp->lkf |= DLM_LKF_CONVERT;
-		lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
-		log_debug("rereq %x,%llx id %x %d,%d",
-			  lp->lockname.ln_type,
-			  (unsigned long long)lp->lockname.ln_number,
-			  lp->lksb.sb_lkid, lp->cur, lp->req);
-
-		set_bit(LFL_REREQUEST, &lp->flags);
-		queue_submit(lp);
-		return;
-	}
-
-	/*
-	 * DLM demoted the lock to NL before it was granted so GFS must be
-	 * told it cannot cache data for this lock.
-	 */
-
-	if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
-		set_bit(LFL_NOCACHE, &lp->flags);
-
-out:
-	/*
-	 * This is an internal lock_dlm lock
-	 */
-
-	if (test_bit(LFL_INLOCK, &lp->flags)) {
-		clear_bit(LFL_NOBLOCK, &lp->flags);
-		lp->cur = lp->req;
-		wake_up_ast(lp);
-		return;
-	}
-
-	/*
-	 * Normal completion of a lock request.  Tell GFS it now has the lock.
-	 */
-
-	clear_bit(LFL_NOBLOCK, &lp->flags);
-	lp->cur = lp->req;
-
-	acb.lc_name = lp->lockname;
-	acb.lc_ret |= gdlm_make_lmstate(lp->cur);
-
-	ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
-}
-
-static void gdlm_ast(void *astarg)
-{
-	struct gdlm_lock *lp = astarg;
-	clear_bit(LFL_ACTIVE, &lp->flags);
-	process_complete(lp);
-}
-
-static void process_blocking(struct gdlm_lock *lp, int bast_mode)
-{
-	struct gdlm_ls *ls = lp->ls;
-	unsigned int cb = 0;
-
-	switch (gdlm_make_lmstate(bast_mode)) {
-	case LM_ST_EXCLUSIVE:
-		cb = LM_CB_NEED_E;
-		break;
-	case LM_ST_DEFERRED:
-		cb = LM_CB_NEED_D;
-		break;
-	case LM_ST_SHARED:
-		cb = LM_CB_NEED_S;
-		break;
-	default:
-		gdlm_assert(0, "unknown bast mode %u", bast_mode);
-	}
-
-	ls->fscb(ls->sdp, cb, &lp->lockname);
-}
-
-
-static void gdlm_bast(void *astarg, int mode)
-{
-	struct gdlm_lock *lp = astarg;
-
-	if (!mode) {
-		printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
-			lp->lockname.ln_type,
-			(unsigned long long)lp->lockname.ln_number);
-		return;
-	}
-
-	process_blocking(lp, mode);
-}
-
-/* convert gfs lock-state to dlm lock-mode */
-
-static s16 make_mode(s16 lmstate)
-{
-	switch (lmstate) {
-	case LM_ST_UNLOCKED:
-		return DLM_LOCK_NL;
-	case LM_ST_EXCLUSIVE:
-		return DLM_LOCK_EX;
-	case LM_ST_DEFERRED:
-		return DLM_LOCK_CW;
-	case LM_ST_SHARED:
-		return DLM_LOCK_PR;
-	}
-	gdlm_assert(0, "unknown LM state %d", lmstate);
-	return -1;
-}
-
-
-/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
-   DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
-
-static void check_cur_state(struct gdlm_lock *lp, unsigned int cur_state)
-{
-	s16 cur = make_mode(cur_state);
-	if (lp->cur != DLM_LOCK_IV)
-		gdlm_assert(lp->cur == cur, "%d, %d", lp->cur, cur);
-}
-
-static inline unsigned int make_flags(struct gdlm_lock *lp,
-				      unsigned int gfs_flags,
-				      s16 cur, s16 req)
-{
-	unsigned int lkf = 0;
-
-	if (gfs_flags & LM_FLAG_TRY)
-		lkf |= DLM_LKF_NOQUEUE;
-
-	if (gfs_flags & LM_FLAG_TRY_1CB) {
-		lkf |= DLM_LKF_NOQUEUE;
-		lkf |= DLM_LKF_NOQUEUEBAST;
-	}
-
-	if (gfs_flags & LM_FLAG_PRIORITY) {
-		lkf |= DLM_LKF_NOORDER;
-		lkf |= DLM_LKF_HEADQUE;
-	}
-
-	if (gfs_flags & LM_FLAG_ANY) {
-		if (req == DLM_LOCK_PR)
-			lkf |= DLM_LKF_ALTCW;
-		else if (req == DLM_LOCK_CW)
-			lkf |= DLM_LKF_ALTPR;
-	}
-
-	if (lp->lksb.sb_lkid != 0) {
-		lkf |= DLM_LKF_CONVERT;
-	}
-
-	if (lp->lvb)
-		lkf |= DLM_LKF_VALBLK;
-
-	return lkf;
-}
-
-/* make_strname - convert GFS lock numbers to a string */
-
-static inline void make_strname(const struct lm_lockname *lockname,
-				struct gdlm_strname *str)
-{
-	sprintf(str->name, "%8x%16llx", lockname->ln_type,
-		(unsigned long long)lockname->ln_number);
-	str->namelen = GDLM_STRNAME_BYTES;
-}
-
-static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
-			  struct gdlm_lock **lpp)
-{
-	struct gdlm_lock *lp;
-
-	lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS);
-	if (!lp)
-		return -ENOMEM;
-
-	lp->lockname = *name;
-	make_strname(name, &lp->strname);
-	lp->ls = ls;
-	lp->cur = DLM_LOCK_IV;
-	INIT_LIST_HEAD(&lp->delay_list);
-
-	spin_lock(&ls->async_lock);
-	ls->all_locks_count++;
-	spin_unlock(&ls->async_lock);
-
-	*lpp = lp;
-	return 0;
-}
-
-int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
-		  void **lockp)
-{
-	struct gdlm_lock *lp;
-	int error;
-
-	error = gdlm_create_lp(lockspace, name, &lp);
-
-	*lockp = lp;
-	return error;
-}
-
-void gdlm_put_lock(void *lock)
-{
-	gdlm_delete_lp(lock);
-}
-
-unsigned int gdlm_do_lock(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-	int error, bast = 1;
-
-	/*
-	 * When recovery is in progress, delay lock requests for submission
-	 * once recovery is done.  Requests for recovery (NOEXP) and unlocks
-	 * can pass.
-	 */
-
-	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
-	    !test_bit(LFL_NOBLOCK, &lp->flags) && lp->req != DLM_LOCK_NL) {
-		gdlm_queue_delayed(lp);
-		return LM_OUT_ASYNC;
-	}
-
-	/*
-	 * Submit the actual lock request.
-	 */
-
-	if (test_bit(LFL_NOBAST, &lp->flags))
-		bast = 0;
-
-	set_bit(LFL_ACTIVE, &lp->flags);
-
-	log_debug("lk %x,%llx id %x %d,%d %x", lp->lockname.ln_type,
-		  (unsigned long long)lp->lockname.ln_number, lp->lksb.sb_lkid,
-		  lp->cur, lp->req, lp->lkf);
-
-	error = dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
-			 lp->strname.name, lp->strname.namelen, 0, gdlm_ast,
-			 lp, bast ? gdlm_bast : NULL);
-
-	if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
-		lp->lksb.sb_status = -EAGAIN;
-		gdlm_ast(lp);
-		error = 0;
-	}
-
-	if (error) {
-		log_error("%s: gdlm_lock %x,%llx err=%d cur=%d req=%d lkf=%x "
-			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
-			  (unsigned long long)lp->lockname.ln_number, error,
-			  lp->cur, lp->req, lp->lkf, lp->flags);
-		return LM_OUT_ERROR;
-	}
-	return LM_OUT_ASYNC;
-}
-
-static unsigned int gdlm_do_unlock(struct gdlm_lock *lp)
-{
-	struct gdlm_ls *ls = lp->ls;
-	unsigned int lkf = 0;
-	int error;
-
-	set_bit(LFL_DLM_UNLOCK, &lp->flags);
-	set_bit(LFL_ACTIVE, &lp->flags);
-
-	if (lp->lvb)
-		lkf = DLM_LKF_VALBLK;
-
-	log_debug("un %x,%llx %x %d %x", lp->lockname.ln_type,
-		  (unsigned long long)lp->lockname.ln_number,
-		  lp->lksb.sb_lkid, lp->cur, lkf);
-
-	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, lkf, NULL, lp);
-
-	if (error) {
-		log_error("%s: gdlm_unlock %x,%llx err=%d cur=%d req=%d lkf=%x "
-			  "flags=%lx", ls->fsname, lp->lockname.ln_type,
-			  (unsigned long long)lp->lockname.ln_number, error,
-			  lp->cur, lp->req, lp->lkf, lp->flags);
-		return LM_OUT_ERROR;
-	}
-	return LM_OUT_ASYNC;
-}
-
-unsigned int gdlm_lock(void *lock, unsigned int cur_state,
-		       unsigned int req_state, unsigned int flags)
-{
-	struct gdlm_lock *lp = lock;
-
-	if (req_state == LM_ST_UNLOCKED)
-		return gdlm_unlock(lock, cur_state);
-
-	if (req_state == LM_ST_UNLOCKED)
-		return gdlm_unlock(lock, cur_state);
-
-	clear_bit(LFL_DLM_CANCEL, &lp->flags);
-	if (flags & LM_FLAG_NOEXP)
-		set_bit(LFL_NOBLOCK, &lp->flags);
-
-	check_cur_state(lp, cur_state);
-	lp->req = make_mode(req_state);
-	lp->lkf = make_flags(lp, flags, lp->cur, lp->req);
-
-	return gdlm_do_lock(lp);
-}
-
-unsigned int gdlm_unlock(void *lock, unsigned int cur_state)
-{
-	struct gdlm_lock *lp = lock;
-
-	clear_bit(LFL_DLM_CANCEL, &lp->flags);
-	if (lp->cur == DLM_LOCK_IV)
-		return 0;
-	return gdlm_do_unlock(lp);
-}
-
-void gdlm_cancel(void *lock)
-{
-	struct gdlm_lock *lp = lock;
-	struct gdlm_ls *ls = lp->ls;
-	int error, delay_list = 0;
-
-	if (test_bit(LFL_DLM_CANCEL, &lp->flags))
-		return;
-
-	log_info("gdlm_cancel %x,%llx flags %lx", lp->lockname.ln_type,
-		 (unsigned long long)lp->lockname.ln_number, lp->flags);
-
-	spin_lock(&ls->async_lock);
-	if (!list_empty(&lp->delay_list)) {
-		list_del_init(&lp->delay_list);
-		delay_list = 1;
-	}
-	spin_unlock(&ls->async_lock);
-
-	if (delay_list) {
-		set_bit(LFL_CANCEL, &lp->flags);
-		set_bit(LFL_ACTIVE, &lp->flags);
-		gdlm_ast(lp);
-		return;
-	}
-
-	if (!test_bit(LFL_ACTIVE, &lp->flags) ||
-	    test_bit(LFL_DLM_UNLOCK, &lp->flags)) {
-		log_info("gdlm_cancel skip %x,%llx flags %lx",
-		 	 lp->lockname.ln_type,
-			 (unsigned long long)lp->lockname.ln_number, lp->flags);
-		return;
-	}
-
-	/* the lock is blocked in the dlm */
-
-	set_bit(LFL_DLM_CANCEL, &lp->flags);
-	set_bit(LFL_ACTIVE, &lp->flags);
-
-	error = dlm_unlock(ls->dlm_lockspace, lp->lksb.sb_lkid, DLM_LKF_CANCEL,
-			   NULL, lp);
-
-	log_info("gdlm_cancel rv %d %x,%llx flags %lx", error,
-		 lp->lockname.ln_type,
-		 (unsigned long long)lp->lockname.ln_number, lp->flags);
-
-	if (error == -EBUSY)
-		clear_bit(LFL_DLM_CANCEL, &lp->flags);
-}
-
-static int gdlm_add_lvb(struct gdlm_lock *lp)
-{
-	char *lvb;
-
-	lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
-	if (!lvb)
-		return -ENOMEM;
-
-	lp->lksb.sb_lvbptr = lvb;
-	lp->lvb = lvb;
-	return 0;
-}
-
-static void gdlm_del_lvb(struct gdlm_lock *lp)
-{
-	kfree(lp->lvb);
-	lp->lvb = NULL;
-	lp->lksb.sb_lvbptr = NULL;
-}
-
-static int gdlm_ast_wait(void *word)
-{
-	schedule();
-	return 0;
-}
-
-/* This can do a synchronous dlm request (requiring a lock_dlm thread to get
-   the completion) because gfs won't call hold_lvb() during a callback (from
-   the context of a lock_dlm thread). */
-
-static int hold_null_lock(struct gdlm_lock *lp)
-{
-	struct gdlm_lock *lpn = NULL;
-	int error;
-
-	if (lp->hold_null) {
-		printk(KERN_INFO "lock_dlm: lvb already held\n");
-		return 0;
-	}
-
-	error = gdlm_create_lp(lp->ls, &lp->lockname, &lpn);
-	if (error)
-		goto out;
-
-	lpn->lksb.sb_lvbptr = junk_lvb;
-	lpn->lvb = junk_lvb;
-
-	lpn->req = DLM_LOCK_NL;
-	lpn->lkf = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
-	set_bit(LFL_NOBAST, &lpn->flags);
-	set_bit(LFL_INLOCK, &lpn->flags);
-	set_bit(LFL_AST_WAIT, &lpn->flags);
-
-	gdlm_do_lock(lpn);
-	wait_on_bit(&lpn->flags, LFL_AST_WAIT, gdlm_ast_wait, TASK_UNINTERRUPTIBLE);
-	error = lpn->lksb.sb_status;
-	if (error) {
-		printk(KERN_INFO "lock_dlm: hold_null_lock dlm error %d\n",
-		       error);
-		gdlm_delete_lp(lpn);
-		lpn = NULL;
-	}
-out:
-	lp->hold_null = lpn;
-	return error;
-}
-
-/* This cannot do a synchronous dlm request (requiring a lock_dlm thread to get
-   the completion) because gfs may call unhold_lvb() during a callback (from
-   the context of a lock_dlm thread) which could cause a deadlock since the
-   other lock_dlm thread could be engaged in recovery. */
-
-static void unhold_null_lock(struct gdlm_lock *lp)
-{
-	struct gdlm_lock *lpn = lp->hold_null;
-
-	gdlm_assert(lpn, "%x,%llx", lp->lockname.ln_type,
-		    (unsigned long long)lp->lockname.ln_number);
-	lpn->lksb.sb_lvbptr = NULL;
-	lpn->lvb = NULL;
-	set_bit(LFL_UNLOCK_DELETE, &lpn->flags);
-	gdlm_do_unlock(lpn);
-	lp->hold_null = NULL;
-}
-
-/* Acquire a NL lock because gfs requires the value block to remain
-   intact on the resource while the lvb is "held" even if it's holding no locks
-   on the resource. */
-
-int gdlm_hold_lvb(void *lock, char **lvbp)
-{
-	struct gdlm_lock *lp = lock;
-	int error;
-
-	error = gdlm_add_lvb(lp);
-	if (error)
-		return error;
-
-	*lvbp = lp->lvb;
-
-	error = hold_null_lock(lp);
-	if (error)
-		gdlm_del_lvb(lp);
-
-	return error;
-}
-
-void gdlm_unhold_lvb(void *lock, char *lvb)
-{
-	struct gdlm_lock *lp = lock;
-
-	unhold_null_lock(lp);
-	gdlm_del_lvb(lp);
-}
-
-void gdlm_submit_delayed(struct gdlm_ls *ls)
-{
-	struct gdlm_lock *lp, *safe;
-
-	spin_lock(&ls->async_lock);
-	list_for_each_entry_safe(lp, safe, &ls->delayed, delay_list) {
-		list_del_init(&lp->delay_list);
-		list_add_tail(&lp->delay_list, &ls->submit);
-	}
-	spin_unlock(&ls->async_lock);
-	wake_up(&ls->thread_wait);
-}
-
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
deleted file mode 100644
index 3c98e7c..0000000
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef LOCK_DLM_DOT_H
-#define LOCK_DLM_DOT_H
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/list.h>
-#include <linux/socket.h>
-#include <linux/delay.h>
-#include <linux/kthread.h>
-#include <linux/kobject.h>
-#include <linux/fcntl.h>
-#include <linux/wait.h>
-#include <net/sock.h>
-
-#include <linux/dlm.h>
-#include <linux/dlm_plock.h>
-#include <linux/lm_interface.h>
-
-/*
- * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a
- * prefix of lock_dlm_ gets awkward.  Externally, GFS refers to this module
- * as "lock_dlm".
- */
-
-#define GDLM_STRNAME_BYTES	24
-#define GDLM_LVB_SIZE		32
-#define GDLM_DROP_COUNT		0
-#define GDLM_DROP_PERIOD	60
-#define GDLM_NAME_LEN		128
-
-/* GFS uses 12 bytes to identify a resource (32 bit type + 64 bit number).
-   We sprintf these numbers into a 24 byte string of hex values to make them
-   human-readable (to make debugging simpler.) */
-
-struct gdlm_strname {
-	unsigned char		name[GDLM_STRNAME_BYTES];
-	unsigned short		namelen;
-};
-
-enum {
-	DFL_BLOCK_LOCKS		= 0,
-	DFL_SPECTATOR		= 1,
-	DFL_WITHDRAW		= 2,
-};
-
-struct gdlm_ls {
-	u32		id;
-	int			jid;
-	int			first;
-	int			first_done;
-	unsigned long		flags;
-	struct kobject		kobj;
-	char			clustername[GDLM_NAME_LEN];
-	char			fsname[GDLM_NAME_LEN];
-	int			fsflags;
-	dlm_lockspace_t		*dlm_lockspace;
-	lm_callback_t		fscb;
-	struct gfs2_sbd		*sdp;
-	int			recover_jid;
-	int			recover_jid_done;
-	int			recover_jid_status;
-	spinlock_t		async_lock;
-	struct list_head	delayed;
-	struct list_head	submit;
-	u32		all_locks_count;
-	wait_queue_head_t	wait_control;
-	struct task_struct	*thread;
-	wait_queue_head_t	thread_wait;
-};
-
-enum {
-	LFL_NOBLOCK		= 0,
-	LFL_NOCACHE		= 1,
-	LFL_DLM_UNLOCK		= 2,
-	LFL_DLM_CANCEL		= 3,
-	LFL_SYNC_LVB		= 4,
-	LFL_FORCE_PROMOTE	= 5,
-	LFL_REREQUEST		= 6,
-	LFL_ACTIVE		= 7,
-	LFL_INLOCK		= 8,
-	LFL_CANCEL		= 9,
-	LFL_NOBAST		= 10,
-	LFL_HEADQUE		= 11,
-	LFL_UNLOCK_DELETE	= 12,
-	LFL_AST_WAIT		= 13,
-};
-
-struct gdlm_lock {
-	struct gdlm_ls		*ls;
-	struct lm_lockname	lockname;
-	struct gdlm_strname	strname;
-	char			*lvb;
-	struct dlm_lksb		lksb;
-
-	s16			cur;
-	s16			req;
-	s16			prev_req;
-	u32			lkf;		/* dlm flags DLM_LKF_ */
-	unsigned long		flags;		/* lock_dlm flags LFL_ */
-
-	struct list_head	delay_list;	/* delayed */
-	struct gdlm_lock	*hold_null;	/* NL lock for hold_lvb */
-};
-
-#define gdlm_assert(assertion, fmt, args...)                                  \
-do {                                                                          \
-	if (unlikely(!(assertion))) {                                         \
-		printk(KERN_EMERG "lock_dlm: fatal assertion failed \"%s\"\n" \
-				  "lock_dlm:  " fmt "\n",                     \
-				  #assertion, ##args);                        \
-		BUG();                                                        \
-	}                                                                     \
-} while (0)
-
-#define log_print(lev, fmt, arg...) printk(lev "lock_dlm: " fmt "\n" , ## arg)
-#define log_info(fmt, arg...)  log_print(KERN_INFO , fmt , ## arg)
-#define log_error(fmt, arg...) log_print(KERN_ERR , fmt , ## arg)
-#ifdef LOCK_DLM_LOG_DEBUG
-#define log_debug(fmt, arg...) log_print(KERN_DEBUG , fmt , ## arg)
-#else
-#define log_debug(fmt, arg...)
-#endif
-
-/* sysfs.c */
-
-int gdlm_sysfs_init(void);
-void gdlm_sysfs_exit(void);
-int gdlm_kobject_setup(struct gdlm_ls *, struct kobject *);
-void gdlm_kobject_release(struct gdlm_ls *);
-
-/* thread.c */
-
-int gdlm_init_threads(struct gdlm_ls *);
-void gdlm_release_threads(struct gdlm_ls *);
-
-/* lock.c */
-
-void gdlm_submit_delayed(struct gdlm_ls *);
-unsigned int gdlm_do_lock(struct gdlm_lock *);
-
-int gdlm_get_lock(void *, struct lm_lockname *, void **);
-void gdlm_put_lock(void *);
-unsigned int gdlm_lock(void *, unsigned int, unsigned int, unsigned int);
-unsigned int gdlm_unlock(void *, unsigned int);
-void gdlm_cancel(void *);
-int gdlm_hold_lvb(void *, char **);
-void gdlm_unhold_lvb(void *, char *);
-
-/* mount.c */
-
-extern const struct lm_lockops gdlm_ops;
-
-#endif
-
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
deleted file mode 100644
index b9a03a7..0000000
--- a/fs/gfs2/locking/dlm/main.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/init.h>
-
-#include "lock_dlm.h"
-
-static int __init init_lock_dlm(void)
-{
-	int error;
-
-	error = gfs2_register_lockproto(&gdlm_ops);
-	if (error) {
-		printk(KERN_WARNING "lock_dlm:  can't register protocol: %d\n",
-		       error);
-		return error;
-	}
-
-	error = gdlm_sysfs_init();
-	if (error) {
-		gfs2_unregister_lockproto(&gdlm_ops);
-		return error;
-	}
-
-	printk(KERN_INFO
-	       "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
-	return 0;
-}
-
-static void __exit exit_lock_dlm(void)
-{
-	gdlm_sysfs_exit();
-	gfs2_unregister_lockproto(&gdlm_ops);
-}
-
-module_init(init_lock_dlm);
-module_exit(exit_lock_dlm);
-
-MODULE_DESCRIPTION("GFS DLM Locking Module");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
deleted file mode 100644
index 1aa7eb6..0000000
--- a/fs/gfs2/locking/dlm/mount.c
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include "lock_dlm.h"
-
-const struct lm_lockops gdlm_ops;
-
-
-static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
-				 int flags, char *table_name)
-{
-	struct gdlm_ls *ls;
-	char buf[256], *p;
-
-	ls = kzalloc(sizeof(struct gdlm_ls), GFP_KERNEL);
-	if (!ls)
-		return NULL;
-
-	ls->fscb = cb;
-	ls->sdp = sdp;
-	ls->fsflags = flags;
-	spin_lock_init(&ls->async_lock);
-	INIT_LIST_HEAD(&ls->delayed);
-	INIT_LIST_HEAD(&ls->submit);
-	init_waitqueue_head(&ls->thread_wait);
-	init_waitqueue_head(&ls->wait_control);
-	ls->jid = -1;
-
-	strncpy(buf, table_name, 256);
-	buf[255] = '\0';
-
-	p = strchr(buf, ':');
-	if (!p) {
-		log_info("invalid table_name \"%s\"", table_name);
-		kfree(ls);
-		return NULL;
-	}
-	*p = '\0';
-	p++;
-
-	strncpy(ls->clustername, buf, GDLM_NAME_LEN);
-	strncpy(ls->fsname, p, GDLM_NAME_LEN);
-
-	return ls;
-}
-
-static int make_args(struct gdlm_ls *ls, char *data_arg, int *nodir)
-{
-	char data[256];
-	char *options, *x, *y;
-	int error = 0;
-
-	memset(data, 0, 256);
-	strncpy(data, data_arg, 255);
-
-	if (!strlen(data)) {
-		log_error("no mount options, (u)mount helpers not installed");
-		return -EINVAL;
-	}
-
-	for (options = data; (x = strsep(&options, ":")); ) {
-		if (!*x)
-			continue;
-
-		y = strchr(x, '=');
-		if (y)
-			*y++ = 0;
-
-		if (!strcmp(x, "jid")) {
-			if (!y) {
-				log_error("need argument to jid");
-				error = -EINVAL;
-				break;
-			}
-			sscanf(y, "%u", &ls->jid);
-
-		} else if (!strcmp(x, "first")) {
-			if (!y) {
-				log_error("need argument to first");
-				error = -EINVAL;
-				break;
-			}
-			sscanf(y, "%u", &ls->first);
-
-		} else if (!strcmp(x, "id")) {
-			if (!y) {
-				log_error("need argument to id");
-				error = -EINVAL;
-				break;
-			}
-			sscanf(y, "%u", &ls->id);
-
-		} else if (!strcmp(x, "nodir")) {
-			if (!y) {
-				log_error("need argument to nodir");
-				error = -EINVAL;
-				break;
-			}
-			sscanf(y, "%u", nodir);
-
-		} else {
-			log_error("unkonwn option: %s", x);
-			error = -EINVAL;
-			break;
-		}
-	}
-
-	return error;
-}
-
-static int gdlm_mount(char *table_name, char *host_data,
-			lm_callback_t cb, void *cb_data,
-			unsigned int min_lvb_size, int flags,
-			struct lm_lockstruct *lockstruct,
-			struct kobject *fskobj)
-{
-	struct gdlm_ls *ls;
-	int error = -ENOMEM, nodir = 0;
-
-	if (min_lvb_size > GDLM_LVB_SIZE)
-		goto out;
-
-	ls = init_gdlm(cb, cb_data, flags, table_name);
-	if (!ls)
-		goto out;
-
-	error = make_args(ls, host_data, &nodir);
-	if (error)
-		goto out;
-
-	error = gdlm_init_threads(ls);
-	if (error)
-		goto out_free;
-
-	error = gdlm_kobject_setup(ls, fskobj);
-	if (error)
-		goto out_thread;
-
-	error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname),
-				  &ls->dlm_lockspace,
-				  DLM_LSFL_FS | DLM_LSFL_NEWEXCL |
-				  (nodir ? DLM_LSFL_NODIR : 0),
-				  GDLM_LVB_SIZE);
-	if (error) {
-		log_error("dlm_new_lockspace error %d", error);
-		goto out_kobj;
-	}
-
-	lockstruct->ls_jid = ls->jid;
-	lockstruct->ls_first = ls->first;
-	lockstruct->ls_lockspace = ls;
-	lockstruct->ls_ops = &gdlm_ops;
-	lockstruct->ls_flags = 0;
-	lockstruct->ls_lvb_size = GDLM_LVB_SIZE;
-	return 0;
-
-out_kobj:
-	gdlm_kobject_release(ls);
-out_thread:
-	gdlm_release_threads(ls);
-out_free:
-	kfree(ls);
-out:
-	return error;
-}
-
-static void gdlm_unmount(void *lockspace)
-{
-	struct gdlm_ls *ls = lockspace;
-
-	log_debug("unmount flags %lx", ls->flags);
-
-	/* FIXME: serialize unmount and withdraw in case they
-	   happen at once.  Also, if unmount follows withdraw,
-	   wait for withdraw to finish. */
-
-	if (test_bit(DFL_WITHDRAW, &ls->flags))
-		goto out;
-
-	gdlm_kobject_release(ls);
-	dlm_release_lockspace(ls->dlm_lockspace, 2);
-	gdlm_release_threads(ls);
-	BUG_ON(ls->all_locks_count);
-out:
-	kfree(ls);
-}
-
-static void gdlm_recovery_done(void *lockspace, unsigned int jid,
-                               unsigned int message)
-{
-	char env_jid[20];
-	char env_status[20];
-	char *envp[] = { env_jid, env_status, NULL };
-	struct gdlm_ls *ls = lockspace;
-	ls->recover_jid_done = jid;
-	ls->recover_jid_status = message;
-	sprintf(env_jid, "JID=%d", jid);
-	sprintf(env_status, "RECOVERY=%s",
-		message == LM_RD_SUCCESS ? "Done" : "Failed");
-	kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
-}
-
-static void gdlm_others_may_mount(void *lockspace)
-{
-	char *message = "FIRSTMOUNT=Done";
-	char *envp[] = { message, NULL };
-	struct gdlm_ls *ls = lockspace;
-	ls->first_done = 1;
-	kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp);
-}
-
-/* Userspace gets the offline uevent, blocks new gfs locks on
-   other mounters, and lets us know (sets WITHDRAW flag).  Then,
-   userspace leaves the mount group while we leave the lockspace. */
-
-static void gdlm_withdraw(void *lockspace)
-{
-	struct gdlm_ls *ls = lockspace;
-
-	kobject_uevent(&ls->kobj, KOBJ_OFFLINE);
-
-	wait_event_interruptible(ls->wait_control,
-				 test_bit(DFL_WITHDRAW, &ls->flags));
-
-	dlm_release_lockspace(ls->dlm_lockspace, 2);
-	gdlm_release_threads(ls);
-	gdlm_kobject_release(ls);
-}
-
-static int gdlm_plock(void *lockspace, struct lm_lockname *name,
-	       struct file *file, int cmd, struct file_lock *fl)
-{
-	struct gdlm_ls *ls = lockspace;
-	return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl);
-}
-
-static int gdlm_punlock(void *lockspace, struct lm_lockname *name,
-		 struct file *file, struct file_lock *fl)
-{
-	struct gdlm_ls *ls = lockspace;
-	return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl);
-}
-
-static int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
-		   struct file *file, struct file_lock *fl)
-{
-	struct gdlm_ls *ls = lockspace;
-	return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl);
-}
-
-const struct lm_lockops gdlm_ops = {
-	.lm_proto_name = "lock_dlm",
-	.lm_mount = gdlm_mount,
-	.lm_others_may_mount = gdlm_others_may_mount,
-	.lm_unmount = gdlm_unmount,
-	.lm_withdraw = gdlm_withdraw,
-	.lm_get_lock = gdlm_get_lock,
-	.lm_put_lock = gdlm_put_lock,
-	.lm_lock = gdlm_lock,
-	.lm_unlock = gdlm_unlock,
-	.lm_plock = gdlm_plock,
-	.lm_punlock = gdlm_punlock,
-	.lm_plock_get = gdlm_plock_get,
-	.lm_cancel = gdlm_cancel,
-	.lm_hold_lvb = gdlm_hold_lvb,
-	.lm_unhold_lvb = gdlm_unhold_lvb,
-	.lm_recovery_done = gdlm_recovery_done,
-	.lm_owner = THIS_MODULE,
-};
-
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
deleted file mode 100644
index 9b7edcf..0000000
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/ctype.h>
-#include <linux/stat.h>
-
-#include "lock_dlm.h"
-
-static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
-}
-
-static ssize_t block_show(struct gdlm_ls *ls, char *buf)
-{
-	ssize_t ret;
-	int val = 0;
-
-	if (test_bit(DFL_BLOCK_LOCKS, &ls->flags))
-		val = 1;
-	ret = sprintf(buf, "%d\n", val);
-	return ret;
-}
-
-static ssize_t block_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
-	ssize_t ret = len;
-	int val;
-
-	val = simple_strtol(buf, NULL, 0);
-
-	if (val == 1)
-		set_bit(DFL_BLOCK_LOCKS, &ls->flags);
-	else if (val == 0) {
-		clear_bit(DFL_BLOCK_LOCKS, &ls->flags);
-		gdlm_submit_delayed(ls);
-	} else {
-		ret = -EINVAL;
-	}
-	return ret;
-}
-
-static ssize_t withdraw_show(struct gdlm_ls *ls, char *buf)
-{
-	ssize_t ret;
-	int val = 0;
-
-	if (test_bit(DFL_WITHDRAW, &ls->flags))
-		val = 1;
-	ret = sprintf(buf, "%d\n", val);
-	return ret;
-}
-
-static ssize_t withdraw_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
-	ssize_t ret = len;
-	int val;
-
-	val = simple_strtol(buf, NULL, 0);
-
-	if (val == 1)
-		set_bit(DFL_WITHDRAW, &ls->flags);
-	else
-		ret = -EINVAL;
-	wake_up(&ls->wait_control);
-	return ret;
-}
-
-static ssize_t id_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%u\n", ls->id);
-}
-
-static ssize_t jid_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->jid);
-}
-
-static ssize_t first_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->first);
-}
-
-static ssize_t first_done_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->first_done);
-}
-
-static ssize_t recover_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->recover_jid);
-}
-
-static ssize_t recover_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
-	ls->recover_jid = simple_strtol(buf, NULL, 0);
-	ls->fscb(ls->sdp, LM_CB_NEED_RECOVERY, &ls->recover_jid);
-	return len;
-}
-
-static ssize_t recover_done_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->recover_jid_done);
-}
-
-static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
-{
-	return sprintf(buf, "%d\n", ls->recover_jid_status);
-}
-
-struct gdlm_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gdlm_ls *, char *);
-	ssize_t (*store)(struct gdlm_ls *, const char *, size_t);
-};
-
-#define GDLM_ATTR(_name,_mode,_show,_store) \
-static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
-
-GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
-GDLM_ATTR(block,          0644, block_show,          block_store);
-GDLM_ATTR(withdraw,       0644, withdraw_show,       withdraw_store);
-GDLM_ATTR(id,             0444, id_show,             NULL);
-GDLM_ATTR(jid,            0444, jid_show,            NULL);
-GDLM_ATTR(first,          0444, first_show,          NULL);
-GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
-GDLM_ATTR(recover,        0644, recover_show,        recover_store);
-GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
-
-static struct attribute *gdlm_attrs[] = {
-	&gdlm_attr_proto_name.attr,
-	&gdlm_attr_block.attr,
-	&gdlm_attr_withdraw.attr,
-	&gdlm_attr_id.attr,
-	&gdlm_attr_jid.attr,
-	&gdlm_attr_first.attr,
-	&gdlm_attr_first_done.attr,
-	&gdlm_attr_recover.attr,
-	&gdlm_attr_recover_done.attr,
-	&gdlm_attr_recover_status.attr,
-	NULL,
-};
-
-static ssize_t gdlm_attr_show(struct kobject *kobj, struct attribute *attr,
-			      char *buf)
-{
-	struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
-	struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
-	return a->show ? a->show(ls, buf) : 0;
-}
-
-static ssize_t gdlm_attr_store(struct kobject *kobj, struct attribute *attr,
-			       const char *buf, size_t len)
-{
-	struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
-	struct gdlm_attr *a = container_of(attr, struct gdlm_attr, attr);
-	return a->store ? a->store(ls, buf, len) : len;
-}
-
-static struct sysfs_ops gdlm_attr_ops = {
-	.show  = gdlm_attr_show,
-	.store = gdlm_attr_store,
-};
-
-static struct kobj_type gdlm_ktype = {
-	.default_attrs = gdlm_attrs,
-	.sysfs_ops     = &gdlm_attr_ops,
-};
-
-static struct kset *gdlm_kset;
-
-int gdlm_kobject_setup(struct gdlm_ls *ls, struct kobject *fskobj)
-{
-	int error;
-
-	ls->kobj.kset = gdlm_kset;
-	error = kobject_init_and_add(&ls->kobj, &gdlm_ktype, fskobj,
-				     "lock_module");
-	if (error)
-		log_error("can't register kobj %d", error);
-	kobject_uevent(&ls->kobj, KOBJ_ADD);
-
-	return error;
-}
-
-void gdlm_kobject_release(struct gdlm_ls *ls)
-{
-	kobject_put(&ls->kobj);
-}
-
-static int gdlm_uevent(struct kset *kset, struct kobject *kobj,
-		       struct kobj_uevent_env *env)
-{
-        struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj);
-        add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname);
-        add_uevent_var(env, "LOCKPROTO=lock_dlm");
-        return 0;
-}
-
-static struct kset_uevent_ops gdlm_uevent_ops = {
-	.uevent = gdlm_uevent,
-};
-
-
-int gdlm_sysfs_init(void)
-{
-	gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj);
-	if (!gdlm_kset) {
-		printk(KERN_WARNING "%s: can not create kset\n", __func__);
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-void gdlm_sysfs_exit(void)
-{
-	kset_unregister(gdlm_kset);
-}
-
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
deleted file mode 100644
index 38823ef..0000000
--- a/fs/gfs2/locking/dlm/thread.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include "lock_dlm.h"
-
-static inline int no_work(struct gdlm_ls *ls)
-{
-	int ret;
-
-	spin_lock(&ls->async_lock);
-	ret = list_empty(&ls->submit);
-	spin_unlock(&ls->async_lock);
-
-	return ret;
-}
-
-static int gdlm_thread(void *data)
-{
-	struct gdlm_ls *ls = (struct gdlm_ls *) data;
-	struct gdlm_lock *lp = NULL;
-
-	while (!kthread_should_stop()) {
-		wait_event_interruptible(ls->thread_wait,
-				!no_work(ls) || kthread_should_stop());
-
-		spin_lock(&ls->async_lock);
-
-		if (!list_empty(&ls->submit)) {
-			lp = list_entry(ls->submit.next, struct gdlm_lock,
-					delay_list);
-			list_del_init(&lp->delay_list);
-			spin_unlock(&ls->async_lock);
-			gdlm_do_lock(lp);
-			spin_lock(&ls->async_lock);
-		}
-		spin_unlock(&ls->async_lock);
-	}
-
-	return 0;
-}
-
-int gdlm_init_threads(struct gdlm_ls *ls)
-{
-	struct task_struct *p;
-	int error;
-
-	p = kthread_run(gdlm_thread, ls, "lock_dlm");
-	error = IS_ERR(p);
-	if (error) {
-		log_error("can't start lock_dlm thread %d", error);
-		return error;
-	}
-	ls->thread = p;
-
-	return 0;
-}
-
-void gdlm_release_threads(struct gdlm_ls *ls)
-{
-	kthread_stop(ls->thread);
-}
-
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ad30585..98918a7 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -14,7 +14,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 4390f6f..80e4f5f 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,7 +13,6 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7cacfde..a6892ed 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,7 +14,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/atomic.h>
 
 #include "gfs2.h"
@@ -23,6 +22,12 @@
 #include "sys.h"
 #include "util.h"
 #include "glock.h"
+#include "quota.h"
+
+static struct shrinker qd_shrinker = {
+	.shrink = gfs2_shrink_qd_memory,
+	.seeks = DEFAULT_SEEKS,
+};
 
 static void gfs2_init_inode_once(void *foo)
 {
@@ -41,8 +46,6 @@ static void gfs2_init_glock_once(void *foo)
 	INIT_HLIST_NODE(&gl->gl_list);
 	spin_lock_init(&gl->gl_spin);
 	INIT_LIST_HEAD(&gl->gl_holders);
-	gl->gl_lvb = NULL;
-	atomic_set(&gl->gl_lvb_count, 0);
 	INIT_LIST_HEAD(&gl->gl_lru);
 	INIT_LIST_HEAD(&gl->gl_ail_list);
 	atomic_set(&gl->gl_ail_count, 0);
@@ -100,6 +103,8 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_quotad_cachep)
 		goto fail;
 
+	register_shrinker(&qd_shrinker);
+
 	error = register_filesystem(&gfs2_fs_type);
 	if (error)
 		goto fail;
@@ -117,6 +122,7 @@ static int __init init_gfs2_fs(void)
 fail_unregister:
 	unregister_filesystem(&gfs2_fs_type);
 fail:
+	unregister_shrinker(&qd_shrinker);
 	gfs2_glock_exit();
 
 	if (gfs2_quotad_cachep)
@@ -145,6 +151,7 @@ fail:
 
 static void __exit exit_gfs2_fs(void)
 {
+	unregister_shrinker(&qd_shrinker);
 	gfs2_glock_exit();
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0985362..8d6f132 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -19,7 +19,6 @@
 #include <linux/delay.h>
 #include <linux/bio.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -90,27 +89,6 @@ void gfs2_aspace_put(struct inode *aspace)
 }
 
 /**
- * gfs2_meta_inval - Invalidate all buffers associated with a glock
- * @gl: the glock
- *
- */
-
-void gfs2_meta_inval(struct gfs2_glock *gl)
-{
-	struct gfs2_sbd *sdp = gl->gl_sbd;
-	struct inode *aspace = gl->gl_aspace;
-	struct address_space *mapping = gl->gl_aspace->i_mapping;
-
-	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
-
-	atomic_inc(&aspace->i_writecount);
-	truncate_inode_pages(mapping, 0);
-	atomic_dec(&aspace->i_writecount);
-
-	gfs2_assert_withdraw(sdp, !mapping->nrpages);
-}
-
-/**
  * gfs2_meta_sync - Sync all buffers associated with a glock
  * @gl: The glock
  *
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index b1a5f36..de270c2 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -40,7 +40,6 @@ static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh,
 struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp);
 void gfs2_aspace_put(struct inode *aspace);
 
-void gfs2_meta_inval(struct gfs2_glock *gl);
 void gfs2_meta_sync(struct gfs2_glock *gl);
 
 struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index 3cb0a44..f7e8527 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -12,12 +12,11 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/parser.h>
 
 #include "gfs2.h"
 #include "incore.h"
-#include "mount.h"
+#include "super.h"
 #include "sys.h"
 #include "util.h"
 
@@ -37,11 +36,15 @@ enum {
 	Opt_quota_off,
 	Opt_quota_account,
 	Opt_quota_on,
+	Opt_quota,
+	Opt_noquota,
 	Opt_suiddir,
 	Opt_nosuiddir,
 	Opt_data_writeback,
 	Opt_data_ordered,
 	Opt_meta,
+	Opt_discard,
+	Opt_nodiscard,
 	Opt_err,
 };
 
@@ -61,11 +64,15 @@ static const match_table_t tokens = {
 	{Opt_quota_off, "quota=off"},
 	{Opt_quota_account, "quota=account"},
 	{Opt_quota_on, "quota=on"},
+	{Opt_quota, "quota"},
+	{Opt_noquota, "noquota"},
 	{Opt_suiddir, "suiddir"},
 	{Opt_nosuiddir, "nosuiddir"},
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_data_ordered, "data=ordered"},
 	{Opt_meta, "meta"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
 	{Opt_err, NULL}
 };
 
@@ -77,101 +84,46 @@ static const match_table_t tokens = {
  * Return: errno
  */
 
-int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
+int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 {
-	struct gfs2_args *args = &sdp->sd_args;
-	char *data = data_arg;
-	char *options, *o, *v;
-	int error = 0;
-
-	if (!remount) {
-		/*  Set some defaults  */
-		args->ar_quota = GFS2_QUOTA_DEFAULT;
-		args->ar_data = GFS2_DATA_DEFAULT;
-	}
+	char *o;
+	int token;
+	substring_t tmp[MAX_OPT_ARGS];
 
 	/* Split the options into tokens with the "," character and
 	   process them */
 
-	for (options = data; (o = strsep(&options, ",")); ) {
-		int token;
-		substring_t tmp[MAX_OPT_ARGS];
-
-		if (!*o)
+	while (1) {
+		o = strsep(&options, ",");
+		if (o == NULL)
+			break;
+		if (*o == '\0')
 			continue;
 
 		token = match_token(o, tokens, tmp);
 		switch (token) {
 		case Opt_lockproto:
-			v = match_strdup(&tmp[0]);
-			if (!v) {
-				fs_info(sdp, "no memory for lockproto\n");
-				error = -ENOMEM;
-				goto out_error;
-			}
-
-			if (remount && strcmp(v, args->ar_lockproto)) {
-				kfree(v);
-				goto cant_remount;
-			}
-			
-			strncpy(args->ar_lockproto, v, GFS2_LOCKNAME_LEN);
-			args->ar_lockproto[GFS2_LOCKNAME_LEN - 1] = 0;
-			kfree(v);
+			match_strlcpy(args->ar_lockproto, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
 			break;
 		case Opt_locktable:
-			v = match_strdup(&tmp[0]);
-			if (!v) {
-				fs_info(sdp, "no memory for locktable\n");
-				error = -ENOMEM;
-				goto out_error;
-			}
-
-			if (remount && strcmp(v, args->ar_locktable)) {
-				kfree(v);
-				goto cant_remount;
-			}
-
-			strncpy(args->ar_locktable, v, GFS2_LOCKNAME_LEN);
-			args->ar_locktable[GFS2_LOCKNAME_LEN - 1]  = 0;
-			kfree(v);
+			match_strlcpy(args->ar_locktable, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
 			break;
 		case Opt_hostdata:
-			v = match_strdup(&tmp[0]);
-			if (!v) {
-				fs_info(sdp, "no memory for hostdata\n");
-				error = -ENOMEM;
-				goto out_error;
-			}
-
-			if (remount && strcmp(v, args->ar_hostdata)) {
-				kfree(v);
-				goto cant_remount;
-			}
-
-			strncpy(args->ar_hostdata, v, GFS2_LOCKNAME_LEN);
-			args->ar_hostdata[GFS2_LOCKNAME_LEN - 1] = 0;
-			kfree(v);
+			match_strlcpy(args->ar_hostdata, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
 			break;
 		case Opt_spectator:
-			if (remount && !args->ar_spectator)
-				goto cant_remount;
 			args->ar_spectator = 1;
-			sdp->sd_vfs->s_flags |= MS_RDONLY;
 			break;
 		case Opt_ignore_local_fs:
-			if (remount && !args->ar_ignore_local_fs)
-				goto cant_remount;
 			args->ar_ignore_local_fs = 1;
 			break;
 		case Opt_localflocks:
-			if (remount && !args->ar_localflocks)
-				goto cant_remount;
 			args->ar_localflocks = 1;
 			break;
 		case Opt_localcaching:
-			if (remount && !args->ar_localcaching)
-				goto cant_remount;
 			args->ar_localcaching = 1;
 			break;
 		case Opt_debug:
@@ -181,25 +133,23 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 			args->ar_debug = 0;
 			break;
 		case Opt_upgrade:
-			if (remount && !args->ar_upgrade)
-				goto cant_remount;
 			args->ar_upgrade = 1;
 			break;
 		case Opt_acl:
 			args->ar_posix_acl = 1;
-			sdp->sd_vfs->s_flags |= MS_POSIXACL;
 			break;
 		case Opt_noacl:
 			args->ar_posix_acl = 0;
-			sdp->sd_vfs->s_flags &= ~MS_POSIXACL;
 			break;
 		case Opt_quota_off:
+		case Opt_noquota:
 			args->ar_quota = GFS2_QUOTA_OFF;
 			break;
 		case Opt_quota_account:
 			args->ar_quota = GFS2_QUOTA_ACCOUNT;
 			break;
 		case Opt_quota_on:
+		case Opt_quota:
 			args->ar_quota = GFS2_QUOTA_ON;
 			break;
 		case Opt_suiddir:
@@ -215,29 +165,21 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount)
 			args->ar_data = GFS2_DATA_ORDERED;
 			break;
 		case Opt_meta:
-			if (remount && args->ar_meta != 1)
-				goto cant_remount;
 			args->ar_meta = 1;
 			break;
+		case Opt_discard:
+			args->ar_discard = 1;
+			break;
+		case Opt_nodiscard:
+			args->ar_discard = 0;
+			break;
 		case Opt_err:
 		default:
-			fs_info(sdp, "unknown option: %s\n", o);
-			error = -EINVAL;
-			goto out_error;
+			fs_info(sdp, "invalid mount option: %s\n", o);
+			return -EINVAL;
 		}
 	}
 
-out_error:
-	if (error)
-		fs_info(sdp, "invalid mount option(s)\n");
-
-	if (data != data_arg)
-		kfree(data);
-
-	return error;
-
-cant_remount:
-	fs_info(sdp, "can't remount with option %s\n", o);
-	return -EINVAL;
+	return 0;
 }
 
diff --git a/fs/gfs2/mount.h b/fs/gfs2/mount.h
deleted file mode 100644
index 401288a..0000000
--- a/fs/gfs2/mount.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __MOUNT_DOT_H__
-#define __MOUNT_DOT_H__
-
-struct gfs2_sbd;
-
-int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount);
-
-#endif /* __MOUNT_DOT_H__ */
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 4ddab67..a6dde17 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -19,7 +19,6 @@
 #include <linux/writeback.h>
 #include <linux/swap.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/backing-dev.h>
 
 #include "gfs2.h"
@@ -442,6 +441,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 	 */
 	if (unlikely(page->index)) {
 		zero_user(page, 0, PAGE_CACHE_SIZE);
+		SetPageUptodate(page);
 		return 0;
 	}
 
@@ -1096,6 +1096,7 @@ static const struct address_space_operations gfs2_writeback_aops = {
 	.releasepage = gfs2_releasepage,
 	.direct_IO = gfs2_direct_IO,
 	.migratepage = buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
 };
 
 static const struct address_space_operations gfs2_ordered_aops = {
@@ -1111,6 +1112,7 @@ static const struct address_space_operations gfs2_ordered_aops = {
 	.releasepage = gfs2_releasepage,
 	.direct_IO = gfs2_direct_IO,
 	.migratepage = buffer_migrate_page,
+	.is_partially_uptodate = block_is_partially_uptodate,
 };
 
 static const struct address_space_operations gfs2_jdata_aops = {
@@ -1125,6 +1127,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
 	.bmap = gfs2_bmap,
 	.invalidatepage = gfs2_invalidatepage,
 	.releasepage = gfs2_releasepage,
+	.is_partially_uptodate = block_is_partially_uptodate,
 };
 
 void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index c2ad363..022c66c 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -108,7 +107,7 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str)
 	return 0;
 }
 
-struct dentry_operations gfs2_dops = {
+const struct dentry_operations gfs2_dops = {
 	.d_revalidate = gfs2_drevalidate,
 	.d_hash = gfs2_dhash,
 };
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 7fdeb14..9200ef2 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -14,7 +14,6 @@
 #include <linux/exportfs.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 93fe41b..3b9e8de 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -20,9 +20,10 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/ext2_fs.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/writeback.h>
 #include <asm/uaccess.h>
+#include <linux/dlm.h>
+#include <linux/dlm_plock.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -354,7 +355,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 	if (ret)
 		goto out;
 
+	set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
 	set_bit(GIF_SW_PAGED, &ip->i_flags);
+
 	ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required);
 	if (ret || !alloc_required)
 		goto out_unlock;
@@ -560,57 +563,24 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
 	return ret;
 }
 
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+
 /**
  * gfs2_setlease - acquire/release a file lease
  * @file: the file pointer
  * @arg: lease type
  * @fl: file lock
  *
+ * We don't currently have a way to enforce a lease across the whole
+ * cluster; until we do, disable leases (by just returning -EINVAL),
+ * unless the administrator has requested purely local locking.
+ *
  * Returns: errno
  */
 
 static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
 {
-	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
-
-	/*
-	 * We don't currently have a way to enforce a lease across the whole
-	 * cluster; until we do, disable leases (by just returning -EINVAL),
-	 * unless the administrator has requested purely local locking.
-	 */
-	if (!sdp->sd_args.ar_localflocks)
-		return -EINVAL;
-	return generic_setlease(file, arg, fl);
-}
-
-static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		      struct file *file, struct file_lock *fl)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
-				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
-	return error;
-}
-
-static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		  struct file *file, int cmd, struct file_lock *fl)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_plock(
-				sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
-	return error;
-}
-
-static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
-		    struct file *file, struct file_lock *fl)
-{
-	int error = -EIO;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		error = sdp->sd_lockstruct.ls_ops->lm_punlock(
-				sdp->sd_lockstruct.ls_lockspace, name, file, fl);
-	return error;
+	return -EINVAL;
 }
 
 /**
@@ -626,9 +596,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 {
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
 	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
-	struct lm_lockname name =
-		{ .ln_number = ip->i_no_addr,
-		  .ln_type = LM_TYPE_PLOCK };
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
@@ -640,12 +608,14 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 		cmd = F_SETLK;
 		fl->fl_type = F_UNLCK;
 	}
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
 	if (IS_GETLK(cmd))
-		return gfs2_lm_plock_get(sdp, &name, file, fl);
+		return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
 	else if (fl->fl_type == F_UNLCK)
-		return gfs2_lm_punlock(sdp, &name, file, fl);
+		return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
 	else
-		return gfs2_lm_plock(sdp, &name, file, cmd, fl);
+		return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
 }
 
 static int do_flock(struct file *file, int cmd, struct file_lock *fl)
@@ -732,7 +702,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 	}
 }
 
-const struct file_operations gfs2_file_fops = {
+const struct file_operations *gfs2_file_fops = &(const struct file_operations){
 	.llseek		= gfs2_llseek,
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
@@ -750,7 +720,7 @@ const struct file_operations gfs2_file_fops = {
 	.setlease	= gfs2_setlease,
 };
 
-const struct file_operations gfs2_dir_fops = {
+const struct file_operations *gfs2_dir_fops = &(const struct file_operations){
 	.readdir	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
@@ -760,7 +730,9 @@ const struct file_operations gfs2_dir_fops = {
 	.flock		= gfs2_flock,
 };
 
-const struct file_operations gfs2_file_fops_nolock = {
+#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
+
+const struct file_operations *gfs2_file_fops_nolock = &(const struct file_operations){
 	.llseek		= gfs2_llseek,
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
@@ -773,10 +745,10 @@ const struct file_operations gfs2_file_fops_nolock = {
 	.fsync		= gfs2_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
-	.setlease	= gfs2_setlease,
+	.setlease	= generic_setlease,
 };
 
-const struct file_operations gfs2_dir_fops_nolock = {
+const struct file_operations *gfs2_dir_fops_nolock = &(const struct file_operations){
 	.readdir	= gfs2_readdir,
 	.unlocked_ioctl	= gfs2_ioctl,
 	.open		= gfs2_open,
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index f91eebd..51883b3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,7 +17,6 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -25,7 +24,6 @@
 #include "glock.h"
 #include "glops.h"
 #include "inode.h"
-#include "mount.h"
 #include "recovery.h"
 #include "rgrp.h"
 #include "super.h"
@@ -64,7 +62,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	gt->gt_quota_warn_period = 10;
 	gt->gt_quota_scale_num = 1;
 	gt->gt_quota_scale_den = 1;
-	gt->gt_quota_cache_secs = 300;
 	gt->gt_quota_quantum = 60;
 	gt->gt_new_files_jdata = 0;
 	gt->gt_max_readahead = 1 << 18;
@@ -100,7 +97,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	mutex_init(&sdp->sd_jindex_mutex);
 
 	INIT_LIST_HEAD(&sdp->sd_quota_list);
-	spin_lock_init(&sdp->sd_quota_spin);
 	mutex_init(&sdp->sd_quota_mutex);
 	init_waitqueue_head(&sdp->sd_quota_wait);
 	INIT_LIST_HEAD(&sdp->sd_trunc_list);
@@ -238,6 +234,7 @@ static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf)
 
 	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
 	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
+	memcpy(sb->sb_uuid, str->sb_uuid, 16);
 }
 
 /**
@@ -299,15 +296,15 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
 	__free_page(page);
 	return 0;
 }
+
 /**
  * gfs2_read_sb - Read super block
  * @sdp: The GFS2 superblock
- * @gl: the glock for the superblock (assumed to be held)
  * @silent: Don't print message if mount fails
  *
  */
 
-static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
+static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 {
 	u32 hash_blocks, ind_blocks, leaf_blocks;
 	u32 tmp_blocks;
@@ -527,7 +524,7 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
 		return ret;
 	}
 
-	ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent);
+	ret = gfs2_read_sb(sdp, silent);
 	if (ret) {
 		fs_err(sdp, "can't read superblock: %d\n", ret);
 		goto out;
@@ -630,13 +627,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
 	return rc;
 }
 
-static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
+static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
 {
-	if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
-		return;
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
-					sdp->sd_lockstruct.ls_lockspace);
+	char *message = "FIRSTMOUNT=Done";
+	char *envp[] = { message, NULL };
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ls->ls_first_done = 1;
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
 /**
@@ -796,7 +793,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 			}
 		}
 
-		gfs2_lm_others_may_mount(sdp);
+		gfs2_others_may_mount(sdp);
 	} else if (!sdp->sd_args.ar_spectator) {
 		error = gfs2_recover_journal(sdp->sd_jdesc);
 		if (error) {
@@ -1005,7 +1002,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo)
 		goto fail_quotad;
 
 	sdp->sd_log_flush_time = jiffies;
-	sdp->sd_jindex_refresh_time = jiffies;
 
 	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
 	error = IS_ERR(p);
@@ -1033,6 +1029,17 @@ fail:
 	return error;
 }
 
+static const match_table_t nolock_tokens = {
+	{ Opt_jid, "jid=%d\n", },
+	{ Opt_err, NULL },
+};
+
+static const struct lm_lockops nolock_ops = {
+	.lm_proto_name = "lock_nolock",
+	.lm_put_lock = kmem_cache_free,
+	.lm_tokens = &nolock_tokens,
+};
+
 /**
  * gfs2_lm_mount - mount a locking protocol
  * @sdp: the filesystem
@@ -1044,31 +1051,73 @@ fail:
 
 static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 {
-	char *proto = sdp->sd_proto_name;
-	char *table = sdp->sd_table_name;
-	int flags = LM_MFLAG_CONV_NODROP;
-	int error;
+	const struct lm_lockops *lm;
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	struct gfs2_args *args = &sdp->sd_args;
+	const char *proto = sdp->sd_proto_name;
+	const char *table = sdp->sd_table_name;
+	const char *fsname;
+	char *o, *options;
+	int ret;
 
-	if (sdp->sd_args.ar_spectator)
-		flags |= LM_MFLAG_SPECTATOR;
+	if (!strcmp("lock_nolock", proto)) {
+		lm = &nolock_ops;
+		sdp->sd_args.ar_localflocks = 1;
+		sdp->sd_args.ar_localcaching = 1;
+#ifdef CONFIG_GFS2_FS_LOCKING_DLM
+	} else if (!strcmp("lock_dlm", proto)) {
+		lm = &gfs2_dlm_ops;
+#endif
+	} else {
+		printk(KERN_INFO "GFS2: can't find protocol %s\n", proto);
+		return -ENOENT;
+	}
 
 	fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
 
-	error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
-				     gfs2_glock_cb, sdp,
-				     GFS2_MIN_LVB_SIZE, flags,
-				     &sdp->sd_lockstruct, &sdp->sd_kobj);
-	if (error) {
-		fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
-			proto, table, sdp->sd_args.ar_hostdata);
-		goto out;
-	}
+	ls->ls_ops = lm;
+	ls->ls_first = 1;
+	ls->ls_id = 0;
 
-	if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
-	    gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
-				  GFS2_MIN_LVB_SIZE)) {
-		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
-		goto out;
+	for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) {
+		substring_t tmp[MAX_OPT_ARGS];
+		int token, option;
+
+		if (!o || !*o)
+			continue;
+
+		token = match_token(o, *lm->lm_tokens, tmp);
+		switch (token) {
+		case Opt_jid:
+			ret = match_int(&tmp[0], &option);
+			if (ret || option < 0) 
+				goto hostdata_error;
+			ls->ls_jid = option;
+			break;
+		case Opt_id:
+			ret = match_int(&tmp[0], &option);
+			if (ret)
+				goto hostdata_error;
+			ls->ls_id = option;
+			break;
+		case Opt_first:
+			ret = match_int(&tmp[0], &option);
+			if (ret || (option != 0 && option != 1))
+				goto hostdata_error;
+			ls->ls_first = option;
+			break;
+		case Opt_nodir:
+			ret = match_int(&tmp[0], &option);
+			if (ret || (option != 0 && option != 1))
+				goto hostdata_error;
+			ls->ls_nodir = option;
+			break;
+		case Opt_err:
+		default:
+hostdata_error:
+			fs_info(sdp, "unknown hostdata (%s)\n", o);
+			return -EINVAL;
+		}
 	}
 
 	if (sdp->sd_args.ar_spectator)
@@ -1077,22 +1126,25 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
 		snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
 			 sdp->sd_lockstruct.ls_jid);
 
-	fs_info(sdp, "Joined cluster. Now mounting FS...\n");
-
-	if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
-	    !sdp->sd_args.ar_ignore_local_fs) {
-		sdp->sd_args.ar_localflocks = 1;
-		sdp->sd_args.ar_localcaching = 1;
+	fsname = strchr(table, ':');
+	if (fsname)
+		fsname++;
+	if (lm->lm_mount == NULL) {
+		fs_info(sdp, "Now mounting FS...\n");
+		return 0;
 	}
-
-out:
-	return error;
+	ret = lm->lm_mount(sdp, fsname);
+	if (ret == 0)
+		fs_info(sdp, "Joined cluster. Now mounting FS...\n");
+	return ret;
 }
 
 void gfs2_lm_unmount(struct gfs2_sbd *sdp)
 {
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		gfs2_unmount_lockproto(&sdp->sd_lockstruct);
+	const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops;
+	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+	    lm->lm_unmount)
+		lm->lm_unmount(sdp);
 }
 
 /**
@@ -1116,12 +1168,20 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 		return -ENOMEM;
 	}
 
-	error = gfs2_mount_args(sdp, (char *)data, 0);
+	sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
+	sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
+
+	error = gfs2_mount_args(sdp, &sdp->sd_args, data);
 	if (error) {
 		printk(KERN_WARNING "GFS2: can't parse mount arguments\n");
 		goto fail;
 	}
 
+	if (sdp->sd_args.ar_spectator)
+                sb->s_flags |= MS_RDONLY;
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= MS_POSIXACL;
+
 	sb->s_magic = GFS2_MAGIC;
 	sb->s_op = &gfs2_super_ops;
 	sb->s_export_op = &gfs2_export_ops;
@@ -1199,6 +1259,8 @@ fail_sb:
 		dput(sdp->sd_root_dir);
 	if (sdp->sd_master_dir)
 		dput(sdp->sd_master_dir);
+	if (sb->s_root)
+		dput(sb->s_root);
 	sb->s_root = NULL;
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4987754..abd5429 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,7 +18,6 @@
 #include <linux/posix_acl.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/fiemap.h>
 #include <asm/uaccess.h>
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 320323d..4580195 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -19,7 +19,6 @@
 #include <linux/delay.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/time.h>
 
 #include "gfs2.h"
@@ -27,7 +26,6 @@
 #include "glock.h"
 #include "inode.h"
 #include "log.h"
-#include "mount.h"
 #include "quota.h"
 #include "recovery.h"
 #include "rgrp.h"
@@ -40,6 +38,8 @@
 #include "bmap.h"
 #include "meta_io.h"
 
+#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
+
 /**
  * gfs2_write_inode - Make sure the inode is stable on the disk
  * @inode: The inode
@@ -435,25 +435,45 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
 	int error;
 
-	error = gfs2_mount_args(sdp, data, 1);
+	error = gfs2_mount_args(sdp, &args, data);
 	if (error)
 		return error;
 
+	/* Not allowed to change locking details */
+	if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
+	    strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
+	    strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
+		return -EINVAL;
+
+	/* Some flags must not be changed */
+	if (args_neq(&args, &sdp->sd_args, spectator) ||
+	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
+	    args_neq(&args, &sdp->sd_args, localflocks) ||
+	    args_neq(&args, &sdp->sd_args, localcaching) ||
+	    args_neq(&args, &sdp->sd_args, meta))
+		return -EINVAL;
+
 	if (sdp->sd_args.ar_spectator)
 		*flags |= MS_RDONLY;
-	else {
-		if (*flags & MS_RDONLY) {
-			if (!(sb->s_flags & MS_RDONLY))
-				error = gfs2_make_fs_ro(sdp);
-		} else if (!(*flags & MS_RDONLY) &&
-			   (sb->s_flags & MS_RDONLY)) {
+
+	if ((sb->s_flags ^ *flags) & MS_RDONLY) {
+		if (*flags & MS_RDONLY)
+			error = gfs2_make_fs_ro(sdp);
+		else
 			error = gfs2_make_fs_rw(sdp);
-		}
+		if (error)
+			return error;
 	}
 
-	return error;
+	sdp->sd_args = args;
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= MS_POSIXACL;
+	else
+		sb->s_flags &= ~MS_POSIXACL;
+	return 0;
 }
 
 /**
@@ -588,6 +608,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 		}
 		seq_printf(s, ",data=%s", state);
 	}
+	if (args->ar_discard)
+		seq_printf(s, ",discard");
 
 	return 0;
 }
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index b08d096..8d53f66 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -45,7 +45,6 @@
 #include <linux/fs.h>
 #include <linux/bio.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
@@ -80,6 +79,51 @@ struct gfs2_quota_change_host {
 	u32 qc_id;
 };
 
+static LIST_HEAD(qd_lru_list);
+static atomic_t qd_lru_count = ATOMIC_INIT(0);
+static spinlock_t qd_lru_lock = SPIN_LOCK_UNLOCKED;
+
+int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+{
+	struct gfs2_quota_data *qd;
+	struct gfs2_sbd *sdp;
+
+	if (nr == 0)
+		goto out;
+
+	if (!(gfp_mask & __GFP_FS))
+		return -1;
+
+	spin_lock(&qd_lru_lock);
+	while (nr && !list_empty(&qd_lru_list)) {
+		qd = list_entry(qd_lru_list.next,
+				struct gfs2_quota_data, qd_reclaim);
+		sdp = qd->qd_gl->gl_sbd;
+
+		/* Free from the filesystem-specific list */
+		list_del(&qd->qd_list);
+
+		gfs2_assert_warn(sdp, !qd->qd_change);
+		gfs2_assert_warn(sdp, !qd->qd_slot_count);
+		gfs2_assert_warn(sdp, !qd->qd_bh_count);
+
+		gfs2_glock_put(qd->qd_gl);
+		atomic_dec(&sdp->sd_quota_count);
+
+		/* Delete it from the common reclaim list */
+		list_del_init(&qd->qd_reclaim);
+		atomic_dec(&qd_lru_count);
+		spin_unlock(&qd_lru_lock);
+		kmem_cache_free(gfs2_quotad_cachep, qd);
+		spin_lock(&qd_lru_lock);
+		nr--;
+	}
+	spin_unlock(&qd_lru_lock);
+
+out:
+	return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;
+}
+
 static u64 qd2offset(struct gfs2_quota_data *qd)
 {
 	u64 offset;
@@ -100,22 +144,18 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
 	if (!qd)
 		return -ENOMEM;
 
-	qd->qd_count = 1;
+	atomic_set(&qd->qd_count, 1);
 	qd->qd_id = id;
 	if (user)
 		set_bit(QDF_USER, &qd->qd_flags);
 	qd->qd_slot = -1;
+	INIT_LIST_HEAD(&qd->qd_reclaim);
 
 	error = gfs2_glock_get(sdp, 2 * (u64)id + !user,
 			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
 	if (error)
 		goto fail;
 
-	error = gfs2_lvb_hold(qd->qd_gl);
-	gfs2_glock_put(qd->qd_gl);
-	if (error)
-		goto fail;
-
 	*qdp = qd;
 
 	return 0;
@@ -135,11 +175,17 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
 
 	for (;;) {
 		found = 0;
-		spin_lock(&sdp->sd_quota_spin);
+		spin_lock(&qd_lru_lock);
 		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
 			if (qd->qd_id == id &&
 			    !test_bit(QDF_USER, &qd->qd_flags) == !user) {
-				qd->qd_count++;
+				if (!atomic_read(&qd->qd_count) &&
+				    !list_empty(&qd->qd_reclaim)) {
+					/* Remove it from reclaim list */
+					list_del_init(&qd->qd_reclaim);
+					atomic_dec(&qd_lru_count);
+				}
+				atomic_inc(&qd->qd_count);
 				found = 1;
 				break;
 			}
@@ -155,11 +201,11 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
 			new_qd = NULL;
 		}
 
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 
 		if (qd || !create) {
 			if (new_qd) {
-				gfs2_lvb_unhold(new_qd->qd_gl);
+				gfs2_glock_put(new_qd->qd_gl);
 				kmem_cache_free(gfs2_quotad_cachep, new_qd);
 			}
 			*qdp = qd;
@@ -175,21 +221,18 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create,
 static void qd_hold(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-
-	spin_lock(&sdp->sd_quota_spin);
-	gfs2_assert(sdp, qd->qd_count);
-	qd->qd_count++;
-	spin_unlock(&sdp->sd_quota_spin);
+	gfs2_assert(sdp, atomic_read(&qd->qd_count));
+	atomic_inc(&qd->qd_count);
 }
 
 static void qd_put(struct gfs2_quota_data *qd)
 {
-	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-	spin_lock(&sdp->sd_quota_spin);
-	gfs2_assert(sdp, qd->qd_count);
-	if (!--qd->qd_count)
-		qd->qd_last_touched = jiffies;
-	spin_unlock(&sdp->sd_quota_spin);
+	if (atomic_dec_and_lock(&qd->qd_count, &qd_lru_lock)) {
+		/* Add to the reclaim list */
+		list_add_tail(&qd->qd_reclaim, &qd_lru_list);
+		atomic_inc(&qd_lru_count);
+		spin_unlock(&qd_lru_lock);
+	}
 }
 
 static int slot_get(struct gfs2_quota_data *qd)
@@ -198,10 +241,10 @@ static int slot_get(struct gfs2_quota_data *qd)
 	unsigned int c, o = 0, b;
 	unsigned char byte = 0;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 
 	if (qd->qd_slot_count++) {
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 		return 0;
 	}
 
@@ -225,13 +268,13 @@ found:
 
 	sdp->sd_quota_bitmap[c][o] |= 1 << b;
 
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	return 0;
 
 fail:
 	qd->qd_slot_count--;
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 	return -ENOSPC;
 }
 
@@ -239,23 +282,23 @@ static void slot_hold(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	qd->qd_slot_count++;
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 }
 
 static void slot_put(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	if (!--qd->qd_slot_count) {
 		gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
 		qd->qd_slot = -1;
 	}
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 }
 
 static int bh_get(struct gfs2_quota_data *qd)
@@ -330,7 +373,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	if (sdp->sd_vfs->s_flags & MS_RDONLY)
 		return 0;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 
 	list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
 		if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
@@ -341,8 +384,8 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 		list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
 
 		set_bit(QDF_LOCKED, &qd->qd_flags);
-		gfs2_assert_warn(sdp, qd->qd_count);
-		qd->qd_count++;
+		gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
+		atomic_inc(&qd->qd_count);
 		qd->qd_change_sync = qd->qd_change;
 		gfs2_assert_warn(sdp, qd->qd_slot_count);
 		qd->qd_slot_count++;
@@ -354,7 +397,7 @@ static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp)
 	if (!found)
 		qd = NULL;
 
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	if (qd) {
 		gfs2_assert_warn(sdp, qd->qd_change_sync);
@@ -379,24 +422,24 @@ static int qd_trylock(struct gfs2_quota_data *qd)
 	if (sdp->sd_vfs->s_flags & MS_RDONLY)
 		return 0;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 
 	if (test_bit(QDF_LOCKED, &qd->qd_flags) ||
 	    !test_bit(QDF_CHANGE, &qd->qd_flags)) {
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 		return 0;
 	}
 
 	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
 
 	set_bit(QDF_LOCKED, &qd->qd_flags);
-	gfs2_assert_warn(sdp, qd->qd_count);
-	qd->qd_count++;
+	gfs2_assert_warn(sdp, atomic_read(&qd->qd_count));
+	atomic_inc(&qd->qd_count);
 	qd->qd_change_sync = qd->qd_change;
 	gfs2_assert_warn(sdp, qd->qd_slot_count);
 	qd->qd_slot_count++;
 
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	gfs2_assert_warn(sdp, qd->qd_change_sync);
 	if (bh_get(qd)) {
@@ -556,9 +599,9 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
 	x = be64_to_cpu(qc->qc_change) + change;
 	qc->qc_change = cpu_to_be64(x);
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	qd->qd_change = x;
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	if (!x) {
 		gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags));
@@ -802,8 +845,8 @@ restart:
 		loff_t pos;
 		gfs2_glock_dq_uninit(q_gh);
 		error = gfs2_glock_nq_init(qd->qd_gl,
-					  LM_ST_EXCLUSIVE, GL_NOCACHE,
-					  q_gh);
+					   LM_ST_EXCLUSIVE, GL_NOCACHE,
+					   q_gh);
 		if (error)
 			return error;
 
@@ -820,7 +863,6 @@ restart:
 
 		gfs2_glock_dq_uninit(&i_gh);
 
-
 		gfs2_quota_in(&q, buf);
 		qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb;
 		qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC);
@@ -890,9 +932,9 @@ static int need_sync(struct gfs2_quota_data *qd)
 	if (!qd->qd_qb.qb_limit)
 		return 0;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	value = qd->qd_change;
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	spin_lock(&gt->gt_spin);
 	num = gt->gt_quota_scale_num;
@@ -985,9 +1027,9 @@ int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid)
 			continue;
 
 		value = (s64)be64_to_cpu(qd->qd_qb.qb_value);
-		spin_lock(&sdp->sd_quota_spin);
+		spin_lock(&qd_lru_lock);
 		value += qd->qd_change;
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 
 		if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) {
 			print_message(qd, "exceeded");
@@ -1171,13 +1213,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 			qd->qd_change = qc.qc_change;
 			qd->qd_slot = slot;
 			qd->qd_slot_count = 1;
-			qd->qd_last_touched = jiffies;
 
-			spin_lock(&sdp->sd_quota_spin);
+			spin_lock(&qd_lru_lock);
 			gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
 			list_add(&qd->qd_list, &sdp->sd_quota_list);
 			atomic_inc(&sdp->sd_quota_count);
-			spin_unlock(&sdp->sd_quota_spin);
+			spin_unlock(&qd_lru_lock);
 
 			found++;
 		}
@@ -1197,73 +1238,48 @@ fail:
 	return error;
 }
 
-static void gfs2_quota_scan(struct gfs2_sbd *sdp)
-{
-	struct gfs2_quota_data *qd, *safe;
-	LIST_HEAD(dead);
-
-	spin_lock(&sdp->sd_quota_spin);
-	list_for_each_entry_safe(qd, safe, &sdp->sd_quota_list, qd_list) {
-		if (!qd->qd_count &&
-		    time_after_eq(jiffies, qd->qd_last_touched +
-			        gfs2_tune_get(sdp, gt_quota_cache_secs) * HZ)) {
-			list_move(&qd->qd_list, &dead);
-			gfs2_assert_warn(sdp,
-					 atomic_read(&sdp->sd_quota_count) > 0);
-			atomic_dec(&sdp->sd_quota_count);
-		}
-	}
-	spin_unlock(&sdp->sd_quota_spin);
-
-	while (!list_empty(&dead)) {
-		qd = list_entry(dead.next, struct gfs2_quota_data, qd_list);
-		list_del(&qd->qd_list);
-
-		gfs2_assert_warn(sdp, !qd->qd_change);
-		gfs2_assert_warn(sdp, !qd->qd_slot_count);
-		gfs2_assert_warn(sdp, !qd->qd_bh_count);
-
-		gfs2_lvb_unhold(qd->qd_gl);
-		kmem_cache_free(gfs2_quotad_cachep, qd);
-	}
-}
-
 void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 {
 	struct list_head *head = &sdp->sd_quota_list;
 	struct gfs2_quota_data *qd;
 	unsigned int x;
 
-	spin_lock(&sdp->sd_quota_spin);
+	spin_lock(&qd_lru_lock);
 	while (!list_empty(head)) {
 		qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
 
-		if (qd->qd_count > 1 ||
-		    (qd->qd_count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
+		if (atomic_read(&qd->qd_count) > 1 ||
+		    (atomic_read(&qd->qd_count) &&
+		     !test_bit(QDF_CHANGE, &qd->qd_flags))) {
 			list_move(&qd->qd_list, head);
-			spin_unlock(&sdp->sd_quota_spin);
+			spin_unlock(&qd_lru_lock);
 			schedule();
-			spin_lock(&sdp->sd_quota_spin);
+			spin_lock(&qd_lru_lock);
 			continue;
 		}
 
 		list_del(&qd->qd_list);
+		/* Also remove if this qd exists in the reclaim list */
+		if (!list_empty(&qd->qd_reclaim)) {
+			list_del_init(&qd->qd_reclaim);
+			atomic_dec(&qd_lru_count);
+		}
 		atomic_dec(&sdp->sd_quota_count);
-		spin_unlock(&sdp->sd_quota_spin);
+		spin_unlock(&qd_lru_lock);
 
-		if (!qd->qd_count) {
+		if (!atomic_read(&qd->qd_count)) {
 			gfs2_assert_warn(sdp, !qd->qd_change);
 			gfs2_assert_warn(sdp, !qd->qd_slot_count);
 		} else
 			gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
 		gfs2_assert_warn(sdp, !qd->qd_bh_count);
 
-		gfs2_lvb_unhold(qd->qd_gl);
+		gfs2_glock_put(qd->qd_gl);
 		kmem_cache_free(gfs2_quotad_cachep, qd);
 
-		spin_lock(&sdp->sd_quota_spin);
+		spin_lock(&qd_lru_lock);
 	}
-	spin_unlock(&sdp->sd_quota_spin);
+	spin_unlock(&qd_lru_lock);
 
 	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
 
@@ -1341,9 +1357,6 @@ int gfs2_quotad(void *data)
 		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
 				   &quotad_timeo, &tune->gt_quota_quantum);
 
-		/* FIXME: This should be turned into a shrinker */
-		gfs2_quota_scan(sdp);
-
 		/* Check for & recover partially truncated inodes */
 		quotad_check_trunc_list(sdp);
 
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index cec9032..0fa5fa6 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -49,4 +49,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
+extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index efd09c3..247e8f7 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/lm_interface.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
@@ -427,20 +426,23 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
 }
 
 
-static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
-				  unsigned int message)
+static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+                               unsigned int message)
 {
-	if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
-		return;
-
-	if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
-		sdp->sd_lockstruct.ls_ops->lm_recovery_done(
-			sdp->sd_lockstruct.ls_lockspace, jid, message);
+	char env_jid[20];
+	char env_status[20];
+	char *envp[] = { env_jid, env_status, NULL };
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+        ls->ls_recover_jid_done = jid;
+        ls->ls_recover_jid_status = message;
+	sprintf(env_jid, "JID=%d", jid);
+	sprintf(env_status, "RECOVERY=%s",
+		message == LM_RD_SUCCESS ? "Done" : "Failed");
+        kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
-
 /**
- * gfs2_recover_journal - recovery a given journal
+ * gfs2_recover_journal - recover a given journal
  * @jd: the struct gfs2_jdesc describing the journal
  *
  * Acquire the journal's lock, check to see if the journal is clean, and
@@ -561,7 +563,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
 	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
 		gfs2_glock_dq_uninit(&ji_gh);
 
-	gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
+	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
 
 	if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
 		gfs2_glock_dq_uninit(&j_gh);
@@ -581,7 +583,7 @@ fail_gunlock_j:
 	fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done");
 
 fail:
-	gfs2_lm_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
+	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
 	return error;
 }
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8b01c63..f03d024 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -13,8 +13,8 @@
 #include <linux/buffer_head.h>
 #include <linux/fs.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <linux/prefetch.h>
+#include <linux/blkdev.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -132,81 +132,90 @@ static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
 }
 
 /**
+ * gfs2_bit_search
+ * @ptr: Pointer to bitmap data
+ * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
+ * @state: The state we are searching for
+ *
+ * We xor the bitmap data with a patter which is the bitwise opposite
+ * of what we are looking for, this gives rise to a pattern of ones
+ * wherever there is a match. Since we have two bits per entry, we
+ * take this pattern, shift it down by one place and then and it with
+ * the original. All the even bit positions (0,2,4, etc) then represent
+ * successful matches, so we mask with 0x55555..... to remove the unwanted
+ * odd bit positions.
+ *
+ * This allows searching of a whole u64 at once (32 blocks) with a
+ * single test (on 64 bit arches).
+ */
+
+static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
+{
+	u64 tmp;
+	static const u64 search[] = {
+		[0] = 0xffffffffffffffffULL,
+		[1] = 0xaaaaaaaaaaaaaaaaULL,
+		[2] = 0x5555555555555555ULL,
+		[3] = 0x0000000000000000ULL,
+	};
+	tmp = le64_to_cpu(*ptr) ^ search[state];
+	tmp &= (tmp >> 1);
+	tmp &= mask;
+	return tmp;
+}
+
+/**
  * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
  *       a block in a given allocation state.
  * @buffer: the buffer that holds the bitmaps
- * @buflen: the length (in bytes) of the buffer
+ * @len: the length (in bytes) of the buffer
  * @goal: start search at this block's bit-pair (within @buffer)
- * @old_state: GFS2_BLKST_XXX the state of the block we're looking for.
+ * @state: GFS2_BLKST_XXX the state of the block we're looking for.
  *
  * Scope of @goal and returned block number is only within this bitmap buffer,
  * not entire rgrp or filesystem.  @buffer will be offset from the actual
- * beginning of a bitmap block buffer, skipping any header structures.
+ * beginning of a bitmap block buffer, skipping any header structures, but
+ * headers are always a multiple of 64 bits long so that the buffer is
+ * always aligned to a 64 bit boundary.
+ *
+ * The size of the buffer is in bytes, but is it assumed that it is
+ * always ok to to read a complete multiple of 64 bits at the end
+ * of the block in case the end is no aligned to a natural boundary.
  *
  * Return: the block number (bitmap buffer scope) that was found
  */
 
-static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
-		       u8 old_state)
+static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
+		       u32 goal, u8 state)
 {
-	const u8 *byte, *start, *end;
-	int bit, startbit;
-	u32 g1, g2, misaligned;
-	unsigned long *plong;
-	unsigned long lskipval;
-
-	lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
-	g1 = (goal / GFS2_NBBY);
-	start = buffer + g1;
-	byte = start;
-        end = buffer + buflen;
-	g2 = ALIGN(g1, sizeof(unsigned long));
-	plong = (unsigned long *)(buffer + g2);
-	startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
-	misaligned = g2 - g1;
-	if (!misaligned)
-		goto ulong_aligned;
-/* parse the bitmap a byte at a time */
-misaligned:
-	while (byte < end) {
-		if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
-			return goal +
-				(((byte - start) * GFS2_NBBY) +
-				 ((bit - startbit) >> 1));
-		}
-		bit += GFS2_BIT_SIZE;
-		if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
-			bit = 0;
-			byte++;
-			misaligned--;
-			if (!misaligned) {
-				plong = (unsigned long *)byte;
-				goto ulong_aligned;
-			}
-		}
-	}
-	return BFITNOENT;
-
-/* parse the bitmap a unsigned long at a time */
-ulong_aligned:
-	/* Stop at "end - 1" or else prefetch can go past the end and segfault.
-	   We could "if" it but we'd lose some of the performance gained.
-	   This way will only slow down searching the very last 4/8 bytes
-	   depending on architecture.  I've experimented with several ways
-	   of writing this section such as using an else before the goto
-	   but this one seems to be the fastest. */
-	while ((unsigned char *)plong < end - sizeof(unsigned long)) {
-		prefetch(plong + 1);
-		if (((*plong) & LBITMASK) != lskipval)
-			break;
-		plong++;
-	}
-	if ((unsigned char *)plong < end) {
-		byte = (const u8 *)plong;
-		misaligned += sizeof(unsigned long) - 1;
-		goto misaligned;
+	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
+	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
+	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
+	u64 tmp;
+	u64 mask = 0x5555555555555555ULL;
+	u32 bit;
+
+	BUG_ON(state > 3);
+
+	/* Mask off bits we don't care about at the start of the search */
+	mask <<= spoint;
+	tmp = gfs2_bit_search(ptr, mask, state);
+	ptr++;
+	while(tmp == 0 && ptr < end) {
+		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
+		ptr++;
 	}
-	return BFITNOENT;
+	/* Mask off any bits which are more than len bytes from the start */
+	if (ptr == end && (len & (sizeof(u64) - 1)))
+		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
+	/* Didn't find anything, so return */
+	if (tmp == 0)
+		return BFITNOENT;
+	ptr--;
+	bit = fls64(tmp);
+	bit--;		/* fls64 always adds one to the bit count */
+	bit /= 2;	/* two bits per entry in the bitmap */
+	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
 }
 
 /**
@@ -831,6 +840,58 @@ void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd)
 	spin_unlock(&sdp->sd_rindex_spin);
 }
 
+static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+				    const struct gfs2_bitmap *bi)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	struct block_device *bdev = sb->s_bdev;
+	const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
+					   bdev_hardsect_size(sb->s_bdev);
+	u64 blk;
+	sector_t start = 0;
+	sector_t nr_sects = 0;
+	int rv;
+	unsigned int x;
+
+	for (x = 0; x < bi->bi_len; x++) {
+		const u8 *orig = bi->bi_bh->b_data + bi->bi_offset + x;
+		const u8 *clone = bi->bi_clone + bi->bi_offset + x;
+		u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
+		diff &= 0x55;
+		if (diff == 0)
+			continue;
+		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
+		blk *= sects_per_blk; /* convert to sectors */
+		while(diff) {
+			if (diff & 1) {
+				if (nr_sects == 0)
+					goto start_new_extent;
+				if ((start + nr_sects) != blk) {
+					rv = blkdev_issue_discard(bdev, start,
+							    nr_sects, GFP_NOFS);
+					if (rv)
+						goto fail;
+					nr_sects = 0;
+start_new_extent:
+					start = blk;
+				}
+				nr_sects += sects_per_blk;
+			}
+			diff >>= 2;
+			blk += sects_per_blk;
+		}
+	}
+	if (nr_sects) {
+		rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS);
+		if (rv)
+			goto fail;
+	}
+	return;
+fail:
+	fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv);
+	sdp->sd_args.ar_discard = 0;
+}
+
 void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 {
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
@@ -841,6 +902,8 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 		struct gfs2_bitmap *bi = rgd->rd_bits + x;
 		if (!bi->bi_clone)
 			continue;
+		if (sdp->sd_args.ar_discard)
+			gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
 		memcpy(bi->bi_clone + bi->bi_offset,
 		       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
 	}
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 141b781..601913e 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -15,7 +15,6 @@
 #include <linux/crc32.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/bio.h>
-#include <linux/lm_interface.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -339,7 +338,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
 				    struct gfs2_holder *t_gh)
 {
 	struct gfs2_inode *ip;
-	struct gfs2_holder ji_gh;
 	struct gfs2_jdesc *jd;
 	struct lfcc *lfcc;
 	LIST_HEAD(list);
@@ -387,7 +385,6 @@ out:
 		gfs2_glock_dq_uninit(&lfcc->gh);
 		kfree(lfcc);
 	}
-	gfs2_glock_dq_uninit(&ji_gh);
 	return error;
 }
 
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index f6b8b00..b56413e 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -14,7 +14,7 @@
 #include <linux/dcache.h>
 #include "incore.h"
 
-void gfs2_lm_unmount(struct gfs2_sbd *sdp);
+extern void gfs2_lm_unmount(struct gfs2_sbd *sdp);
 
 static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 {
@@ -27,27 +27,29 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
 
 void gfs2_jindex_free(struct gfs2_sbd *sdp);
 
-struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
-int gfs2_jdesc_check(struct gfs2_jdesc *jd);
+extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data);
 
-int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
-			      struct gfs2_inode **ipp);
+extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid);
+extern int gfs2_jdesc_check(struct gfs2_jdesc *jd);
 
-int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
+extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename,
+				     struct gfs2_inode **ipp);
 
-int gfs2_statfs_init(struct gfs2_sbd *sdp);
-void gfs2_statfs_change(struct gfs2_sbd *sdp,
-			s64 total, s64 free, s64 dinodes);
-int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp);
 
-int gfs2_freeze_fs(struct gfs2_sbd *sdp);
-void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
+extern int gfs2_statfs_init(struct gfs2_sbd *sdp);
+extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free,
+			       s64 dinodes);
+extern int gfs2_statfs_sync(struct gfs2_sbd *sdp);
+
+extern int gfs2_freeze_fs(struct gfs2_sbd *sdp);
+extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp);
 
 extern struct file_system_type gfs2_fs_type;
 extern struct file_system_type gfs2meta_fs_type;
 extern const struct export_operations gfs2_export_ops;
 extern const struct super_operations gfs2_super_ops;
-extern struct dentry_operations gfs2_dops;
+extern const struct dentry_operations gfs2_dops;
 
 #endif /* __SUPER_DOT_H__ */
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 26c1fa7..7655f502 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -14,9 +14,8 @@
 #include <linux/buffer_head.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
+#include <linux/gfs2_ondisk.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -25,6 +24,7 @@
 #include "glock.h"
 #include "quota.h"
 #include "util.h"
+#include "glops.h"
 
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
@@ -37,6 +37,30 @@ static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf)
 	return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname);
 }
 
+static int gfs2_uuid_valid(const u8 *uuid)
+{
+	int i;
+
+	for (i = 0; i < 16; i++) {
+		if (uuid[i])
+			return 1;
+	}
+	return 0;
+}
+
+static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	const u8 *uuid = sdp->sd_sb.sb_uuid;
+	buf[0] = '\0';
+	if (!gfs2_uuid_valid(uuid))
+		return 0;
+	return snprintf(buf, PAGE_SIZE, "%02X%02X%02X%02X-%02X%02X-"
+			"%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n",
+			uuid[0], uuid[1], uuid[2], uuid[3], uuid[4], uuid[5],
+			uuid[6], uuid[7], uuid[8], uuid[9], uuid[10], uuid[11],
+			uuid[12], uuid[13], uuid[14], uuid[15]);
+}
+
 static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
 {
 	unsigned int count;
@@ -148,6 +172,46 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf,
 	return len;
 }
 
+static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	struct gfs2_glock *gl;
+	const struct gfs2_glock_operations *glops;
+	unsigned int glmode;
+	unsigned int gltype;
+	unsigned long long glnum;
+	char mode[16];
+	int rv;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum,
+		    mode);
+	if (rv != 3)
+		return -EINVAL;
+
+	if (strcmp(mode, "EX") == 0)
+		glmode = LM_ST_UNLOCKED;
+	else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0))
+		glmode = LM_ST_DEFERRED;
+	else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0))
+		glmode = LM_ST_SHARED;
+	else
+		return -EINVAL;
+
+	if (gltype > LM_TYPE_JOURNAL)
+		return -EINVAL;
+	glops = gfs2_glops_list[gltype];
+	if (glops == NULL)
+		return -EINVAL;
+	rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl);
+	if (rv)
+		return rv;
+	gfs2_glock_cb(gl, glmode);
+	gfs2_glock_put(gl);
+	return len;
+}
+
 struct gfs2_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct gfs2_sbd *, char *);
@@ -159,22 +223,26 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
 
 GFS2_ATTR(id,                  0444, id_show,       NULL);
 GFS2_ATTR(fsname,              0444, fsname_show,   NULL);
+GFS2_ATTR(uuid,                0444, uuid_show,     NULL);
 GFS2_ATTR(freeze,              0644, freeze_show,   freeze_store);
 GFS2_ATTR(withdraw,            0644, withdraw_show, withdraw_store);
 GFS2_ATTR(statfs_sync,         0200, NULL,          statfs_sync_store);
 GFS2_ATTR(quota_sync,          0200, NULL,          quota_sync_store);
 GFS2_ATTR(quota_refresh_user,  0200, NULL,          quota_refresh_user_store);
 GFS2_ATTR(quota_refresh_group, 0200, NULL,          quota_refresh_group_store);
+GFS2_ATTR(demote_rq,           0200, NULL,	    demote_rq_store);
 
 static struct attribute *gfs2_attrs[] = {
 	&gfs2_attr_id.attr,
 	&gfs2_attr_fsname.attr,
+	&gfs2_attr_uuid.attr,
 	&gfs2_attr_freeze.attr,
 	&gfs2_attr_withdraw.attr,
 	&gfs2_attr_statfs_sync.attr,
 	&gfs2_attr_quota_sync.attr,
 	&gfs2_attr_quota_refresh_user.attr,
 	&gfs2_attr_quota_refresh_group.attr,
+	&gfs2_attr_demote_rq.attr,
 	NULL,
 };
 
@@ -224,14 +292,145 @@ static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
 
 LOCKSTRUCT_ATTR(jid,      "%u\n");
 LOCKSTRUCT_ATTR(first,    "%u\n");
-LOCKSTRUCT_ATTR(lvb_size, "%u\n");
-LOCKSTRUCT_ATTR(flags,    "%d\n");
 
 static struct attribute *lockstruct_attrs[] = {
 	&lockstruct_attr_jid.attr,
 	&lockstruct_attr_first.attr,
-	&lockstruct_attr_lvb_size.attr,
-	&lockstruct_attr_flags.attr,
+	NULL,
+};
+
+/*
+ * lock_module. Originally from lock_dlm
+ */
+
+static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf)
+{
+	const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops;
+	return sprintf(buf, "%s\n", ops->lm_proto_name);
+}
+
+static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ssize_t ret;
+	int val = 0;
+
+	if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
+		val = 1;
+	ret = sprintf(buf, "%d\n", val);
+	return ret;
+}
+
+static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ssize_t ret = len;
+	int val;
+
+	val = simple_strtol(buf, NULL, 0);
+
+	if (val == 1)
+		set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+	else if (val == 0) {
+		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
+		smp_mb__after_clear_bit();
+		gfs2_glock_thaw(sdp);
+	} else {
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%u\n", ls->ls_id);
+}
+
+static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_first);
+}
+
+static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_first_done);
+}
+
+static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_recover_jid);
+}
+
+static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+{
+	struct gfs2_jdesc *jd;
+
+	spin_lock(&sdp->sd_jindex_spin);
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (jd->jd_jid != jid)
+			continue;
+		jd->jd_dirty = 1;
+		break;
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+}
+
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
+	gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
+	if (sdp->sd_recoverd_process)
+		wake_up_process(sdp->sd_recoverd_process);
+	return len;
+}
+
+static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_recover_jid_done);
+}
+
+static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
+}
+
+struct gdlm_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *sdp, char *);
+	ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t);
+};
+
+#define GDLM_ATTR(_name,_mode,_show,_store) \
+static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+
+GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
+GDLM_ATTR(block,          0644, block_show,          block_store);
+GDLM_ATTR(withdraw,       0644, withdraw_show,       withdraw_store);
+GDLM_ATTR(id,             0444, lkid_show,           NULL);
+GDLM_ATTR(first,          0444, lkfirst_show,        NULL);
+GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
+GDLM_ATTR(recover,        0644, recover_show,        recover_store);
+GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+
+static struct attribute *lock_module_attrs[] = {
+	&gdlm_attr_proto_name.attr,
+	&gdlm_attr_block.attr,
+	&gdlm_attr_withdraw.attr,
+	&gdlm_attr_id.attr,
+	&lockstruct_attr_jid.attr,
+	&gdlm_attr_first.attr,
+	&gdlm_attr_first_done.attr,
+	&gdlm_attr_recover.attr,
+	&gdlm_attr_recover_done.attr,
+	&gdlm_attr_recover_status.attr,
 	NULL,
 };
 
@@ -373,7 +572,6 @@ TUNE_ATTR(complain_secs, 0);
 TUNE_ATTR(statfs_slow, 0);
 TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
-TUNE_ATTR(quota_cache_secs, 1);
 TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
 TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
@@ -389,7 +587,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_complain_secs.attr,
 	&tune_attr_statfs_slow.attr,
 	&tune_attr_quota_simul_sync.attr,
-	&tune_attr_quota_cache_secs.attr,
 	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
 	&tune_attr_recoverd_secs.attr,
@@ -414,6 +611,11 @@ static struct attribute_group tune_group = {
 	.attrs = tune_attrs,
 };
 
+static struct attribute_group lock_module_group = {
+	.name = "lock_module",
+	.attrs = lock_module_attrs,
+};
+
 int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 {
 	int error;
@@ -436,9 +638,15 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail_args;
 
+	error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
+	if (error)
+		goto fail_tune;
+
 	kobject_uevent(&sdp->sd_kobj, KOBJ_ADD);
 	return 0;
 
+fail_tune:
+	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 fail_args:
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 fail_lockstruct:
@@ -455,15 +663,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
+	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
 }
 
+
 static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
 		       struct kobj_uevent_env *env)
 {
 	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	const u8 *uuid = sdp->sd_sb.sb_uuid;
+
 	add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
 	add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
+	if (gfs2_uuid_valid(uuid)) {
+		add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-"
+			       "%02X%02X-%02X%02X%02X%02X%02X%02X",
+			       uuid[0], uuid[1], uuid[2], uuid[3], uuid[4],
+			       uuid[5], uuid[6], uuid[7], uuid[8], uuid[9],
+			       uuid[10], uuid[11], uuid[12], uuid[13],
+			       uuid[14], uuid[15]);
+	}
 	return 0;
 }
 
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index f677b8a..053752d 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -12,9 +12,8 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
-#include <linux/gfs2_ondisk.h>
 #include <linux/kallsyms.h>
-#include <linux/lm_interface.h>
+#include <linux/gfs2_ondisk.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -88,9 +87,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 
 	if (!tr->tr_touched) {
 		gfs2_log_release(sdp, tr->tr_reserved);
-		gfs2_glock_dq(&tr->tr_t_gh);
-		gfs2_holder_uninit(&tr->tr_t_gh);
-		kfree(tr);
+		if (tr->tr_t_gh.gh_gl) {
+			gfs2_glock_dq(&tr->tr_t_gh);
+			gfs2_holder_uninit(&tr->tr_t_gh);
+			kfree(tr);
+		}
 		return;
 	}
 
@@ -106,9 +107,11 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 	}
 
 	gfs2_log_commit(sdp, tr);
-        gfs2_glock_dq(&tr->tr_t_gh);
-        gfs2_holder_uninit(&tr->tr_t_gh);
-        kfree(tr);
+	if (tr->tr_t_gh.gh_gl) {
+		gfs2_glock_dq(&tr->tr_t_gh);
+		gfs2_holder_uninit(&tr->tr_t_gh);
+		kfree(tr);
+	}
 
 	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
 		gfs2_log_flush(sdp, NULL);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 374f50e..9d12b11 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -13,7 +13,6 @@
 #include <linux/buffer_head.h>
 #include <linux/crc32.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/lm_interface.h>
 #include <asm/uaccess.h>
 
 #include "gfs2.h"
@@ -35,6 +34,8 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
 
 int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
 {
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	const struct lm_lockops *lm = ls->ls_ops;
 	va_list args;
 
 	if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
@@ -47,8 +48,12 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
 	fs_err(sdp, "about to withdraw this file system\n");
 	BUG_ON(sdp->sd_args.ar_debug);
 
-	fs_err(sdp, "telling LM to withdraw\n");
-	gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
+	kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+
+	if (lm->lm_unmount) {
+		fs_err(sdp, "telling LM to unmount\n");
+		lm->lm_unmount(sdp);
+	}
 	fs_err(sdp, "withdrawn\n");
 	dump_stack();
 
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 9955232..052387e 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -213,7 +213,7 @@ extern void hfs_mdb_put(struct super_block *);
 extern int hfs_part_find(struct super_block *, sector_t *, sector_t *);
 
 /* string.c */
-extern struct dentry_operations hfs_dentry_operations;
+extern const struct dentry_operations hfs_dentry_operations;
 
 extern int hfs_hash_dentry(struct dentry *, struct qstr *);
 extern int hfs_strcmp(const unsigned char *, unsigned int,
diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c
index 5bf89ec..7478f5c 100644
--- a/fs/hfs/sysdep.c
+++ b/fs/hfs/sysdep.c
@@ -31,7 +31,7 @@ static int hfs_revalidate_dentry(struct dentry *dentry, struct nameidata *nd)
 	return 1;
 }
 
-struct dentry_operations hfs_dentry_operations =
+const struct dentry_operations hfs_dentry_operations =
 {
 	.d_revalidate	= hfs_revalidate_dentry,
 	.d_hash		= hfs_hash_dentry,
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index f027a90..5c10d80 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -327,7 +327,7 @@ void hfsplus_file_truncate(struct inode *);
 /* inode.c */
 extern const struct address_space_operations hfsplus_aops;
 extern const struct address_space_operations hfsplus_btree_aops;
-extern struct dentry_operations hfsplus_dentry_operations;
+extern const struct dentry_operations hfsplus_dentry_operations;
 
 void hfsplus_inode_read_fork(struct inode *, struct hfsplus_fork_raw *);
 void hfsplus_inode_write_fork(struct inode *, struct hfsplus_fork_raw *);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index f105ee9..1bcf597 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -137,7 +137,7 @@ const struct address_space_operations hfsplus_aops = {
 	.writepages	= hfsplus_writepages,
 };
 
-struct dentry_operations hfsplus_dentry_operations = {
+const struct dentry_operations hfsplus_dentry_operations = {
 	.d_hash       = hfsplus_hash_dentry,
 	.d_compare    = hfsplus_compare_dentry,
 };
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 5c538e0..fe02ad4 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -31,12 +31,12 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
 
 #define FILE_HOSTFS_I(file) HOSTFS_I((file)->f_path.dentry->d_inode)
 
-int hostfs_d_delete(struct dentry *dentry)
+static int hostfs_d_delete(struct dentry *dentry)
 {
 	return 1;
 }
 
-struct dentry_operations hostfs_dentry_ops = {
+static const struct dentry_operations hostfs_dentry_ops = {
 	.d_delete		= hostfs_d_delete,
 };
 
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index 0831912..940d6d1 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -49,7 +49,7 @@ static int hpfs_compare_dentry(struct dentry *dentry, struct qstr *a, struct qst
 	return 0;
 }
 
-static struct dentry_operations hpfs_dentry_operations = {
+static const struct dentry_operations hpfs_dentry_operations = {
 	.d_hash		= hpfs_hash_dentry,
 	.d_compare	= hpfs_compare_dentry,
 };
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37..9b800d9 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	if (hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
-				len >> huge_page_shift(h), vma))
+				len >> huge_page_shift(h), vma,
+				vma->vm_flags))
 		goto out;
 
 	ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
 			can_do_mlock());
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
 	int error = -ENOMEM;
 	struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 
 	error = -ENOMEM;
 	if (hugetlb_reserve_pages(inode, 0,
-			size >> huge_page_shift(hstate_inode(inode)), NULL))
+			size >> huge_page_shift(hstate_inode(inode)), NULL,
+			acctflag))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
diff --git a/fs/inode.c b/fs/inode.c
index 913ab2d..d06d6d2 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,7 @@
 #include <linux/hash.h>
 #include <linux/swap.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
@@ -147,13 +148,13 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_cdev = NULL;
 	inode->i_rdev = 0;
 	inode->dirtied_when = 0;
-	if (security_inode_alloc(inode)) {
-		if (inode->i_sb->s_op->destroy_inode)
-			inode->i_sb->s_op->destroy_inode(inode);
-		else
-			kmem_cache_free(inode_cachep, (inode));
-		return NULL;
-	}
+
+	if (security_inode_alloc(inode))
+		goto out_free_inode;
+
+	/* allocate and initialize an i_integrity */
+	if (ima_inode_alloc(inode))
+		goto out_free_security;
 
 	spin_lock_init(&inode->i_lock);
 	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -189,6 +190,15 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_mapping = mapping;
 
 	return inode;
+
+out_free_security:
+	security_inode_free(inode);
+out_free_inode:
+	if (inode->i_sb->s_op->destroy_inode)
+		inode->i_sb->s_op->destroy_inode(inode);
+	else
+		kmem_cache_free(inode_cachep, (inode));
+	return NULL;
 }
 EXPORT_SYMBOL(inode_init_always);
 
@@ -284,7 +294,7 @@ void clear_inode(struct inode *inode)
 	BUG_ON(!(inode->i_state & I_FREEING));
 	BUG_ON(inode->i_state & I_CLEAR);
 	inode_sync_wait(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	if (inode->i_sb->s_op->clear_inode)
 		inode->i_sb->s_op->clear_inode(inode);
 	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
@@ -356,9 +366,12 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose)
 		if (tmp == head)
 			break;
 		inode = list_entry(tmp, struct inode, i_sb_list);
+		if (inode->i_state & I_NEW)
+			continue;
 		invalidate_inode_buffers(inode);
 		if (!atomic_read(&inode->i_count)) {
 			list_move(&inode->i_list, dispose);
+			WARN_ON(inode->i_state & I_NEW);
 			inode->i_state |= I_FREEING;
 			count++;
 			continue;
@@ -460,6 +473,7 @@ static void prune_icache(int nr_to_scan)
 				continue;
 		}
 		list_move(&inode->i_list, &freeable);
+		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_FREEING;
 		nr_pruned++;
 	}
@@ -656,6 +670,7 @@ void unlock_new_inode(struct inode *inode)
 	 * just created it (so there can be no old holders
 	 * that haven't tested I_LOCK).
 	 */
+	WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW));
 	inode->i_state &= ~(I_LOCK|I_NEW);
 	wake_up_inode(inode);
 }
@@ -1145,6 +1160,7 @@ void generic_delete_inode(struct inode *inode)
 
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
+	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
@@ -1154,7 +1170,7 @@ void generic_delete_inode(struct inode *inode)
 	if (op->delete_inode) {
 		void (*delete)(struct inode *) = op->delete_inode;
 		if (!is_bad_inode(inode))
-			DQUOT_INIT(inode);
+			vfs_dq_init(inode);
 		/* Filesystems implementing their own
 		 * s_op->delete_inode are required to call
 		 * truncate_inode_pages and clear_inode()
@@ -1186,16 +1202,19 @@ static void generic_forget_inode(struct inode *inode)
 			spin_unlock(&inode_lock);
 			return;
 		}
+		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state |= I_WILL_FREE;
 		spin_unlock(&inode_lock);
 		write_inode_now(inode, 1);
 		spin_lock(&inode_lock);
+		WARN_ON(inode->i_state & I_NEW);
 		inode->i_state &= ~I_WILL_FREE;
 		inodes_stat.nr_unused--;
 		hlist_del_init(&inode->i_hash);
 	}
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
+	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
@@ -1283,6 +1302,40 @@ sector_t bmap(struct inode * inode, sector_t block)
 }
 EXPORT_SYMBOL(bmap);
 
+/*
+ * With relative atime, only update atime if the previous atime is
+ * earlier than either the ctime or mtime or if at least a day has
+ * passed since the last atime update.
+ */
+static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
+			     struct timespec now)
+{
+
+	if (!(mnt->mnt_flags & MNT_RELATIME))
+		return 1;
+	/*
+	 * Is mtime younger than atime? If yes, update atime:
+	 */
+	if (timespec_compare(&inode->i_mtime, &inode->i_atime) >= 0)
+		return 1;
+	/*
+	 * Is ctime younger than atime? If yes, update atime:
+	 */
+	if (timespec_compare(&inode->i_ctime, &inode->i_atime) >= 0)
+		return 1;
+
+	/*
+	 * Is the previous atime value older than a day? If yes,
+	 * update atime:
+	 */
+	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+		return 1;
+	/*
+	 * Good, we can skip the atime update:
+	 */
+	return 0;
+}
+
 /**
  *	touch_atime	-	update the access time
  *	@mnt: mount the inode is accessed on
@@ -1310,17 +1363,12 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
 		goto out;
 	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
 		goto out;
-	if (mnt->mnt_flags & MNT_RELATIME) {
-		/*
-		 * With relative atime, only update atime if the previous
-		 * atime is earlier than either the ctime or mtime.
-		 */
-		if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 &&
-		    timespec_compare(&inode->i_ctime, &inode->i_atime) < 0)
-			goto out;
-	}
 
 	now = current_fs_time(inode->i_sb);
+
+	if (!relatime_need_update(mnt, inode, now))
+		goto out;
+
 	if (timespec_equal(&inode->i_atime, &now))
 		goto out;
 
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 240ec639..ac2d47e 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -404,10 +404,12 @@ static int ioctl_fionbio(struct file *filp, int __user *argp)
 	if (O_NONBLOCK != O_NDELAY)
 		flag |= O_NDELAY;
 #endif
+	spin_lock(&filp->f_lock);
 	if (on)
 		filp->f_flags |= flag;
 	else
 		filp->f_flags &= ~flag;
+	spin_unlock(&filp->f_lock);
 	return error;
 }
 
@@ -425,18 +427,12 @@ static int ioctl_fioasync(unsigned int fd, struct file *filp,
 	/* Did FASYNC state change ? */
 	if ((flag ^ filp->f_flags) & FASYNC) {
 		if (filp->f_op && filp->f_op->fasync)
+			/* fasync() adjusts filp->f_flags */
 			error = filp->f_op->fasync(fd, filp, on);
 		else
 			error = -ENOTTY;
 	}
-	if (error)
-		return error;
-
-	if (on)
-		filp->f_flags |= FASYNC;
-	else
-		filp->f_flags &= ~FASYNC;
-	return error;
+	return error < 0 ? error : 0;
 }
 
 static int ioctl_fsfreeze(struct file *filp)
@@ -499,17 +495,11 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		break;
 
 	case FIONBIO:
-		/* BKL needed to avoid races tweaking f_flags */
-		lock_kernel();
 		error = ioctl_fionbio(filp, argp);
-		unlock_kernel();
 		break;
 
 	case FIOASYNC:
-		/* BKL needed to avoid races tweaking f_flags */
-		lock_kernel();
 		error = ioctl_fioasync(fd, filp, argp);
-		unlock_kernel();
 		break;
 
 	case FIOQSIZE:
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 6147ec3..13d2edd 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -114,7 +114,7 @@ static const struct super_operations isofs_sops = {
 };
 
 
-static struct dentry_operations isofs_dentry_ops[] = {
+static const struct dentry_operations isofs_dentry_ops[] = {
 	{
 		.d_hash		= isofs_hash,
 		.d_compare	= isofs_dentry_cmp,
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 9e4fa52..e79c078 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal)
 }
 
 /*
- * Called under j_state_lock.  Returns true if a transaction was started.
+ * Called under j_state_lock.  Returns true if a transaction commit was started.
  */
 int __log_start_commit(journal_t *journal, tid_t target)
 {
@@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal)
 
 /*
  * Start a commit of the current running transaction (if any).  Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
  */
 int journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid)
 	if (journal->j_running_transaction) {
 		tid_t tid = journal->j_running_transaction->t_tid;
 
-		ret = __log_start_commit(journal, tid);
-		if (ret && ptid)
+		__log_start_commit(journal, tid);
+		/* There's a running transaction and we've just made sure
+		 * it's commit has been scheduled. */
+		if (ptid)
 			*ptid = tid;
-	} else if (journal->j_committing_transaction && ptid) {
+		ret = 1;
+	} else if (journal->j_committing_transaction) {
 		/*
 		 * If ext3_write_super() recently started a commit, then we
 		 * have to wait for completion of that transaction
 		 */
-		*ptid = journal->j_committing_transaction->t_tid;
+		if (ptid)
+			*ptid = journal->j_committing_transaction->t_tid;
 		ret = 1;
 	}
 	spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index eb34300..5814410 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal)
 }
 
 /*
- * Called under j_state_lock.  Returns true if a transaction was started.
+ * Called under j_state_lock.  Returns true if a transaction commit was started.
  */
 int __jbd2_log_start_commit(journal_t *journal, tid_t target)
 {
@@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
 
 /*
  * Start a commit of the current running transaction (if any).  Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
  */
 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 	if (journal->j_running_transaction) {
 		tid_t tid = journal->j_running_transaction->t_tid;
 
-		ret = __jbd2_log_start_commit(journal, tid);
-		if (ret && ptid)
+		__jbd2_log_start_commit(journal, tid);
+		/* There's a running transaction and we've just made sure
+		 * it's commit has been scheduled. */
+		if (ptid)
 			*ptid = tid;
-	} else if (journal->j_committing_transaction && ptid) {
+		ret = 1;
+	} else if (journal->j_committing_transaction) {
 		/*
 		 * If ext3_write_super() recently started a commit, then we
 		 * have to wait for completion of that transaction
 		 */
-		*ptid = journal->j_committing_transaction->t_tid;
+		if (ptid)
+			*ptid = journal->j_committing_transaction->t_tid;
 		ret = 1;
 	}
 	spin_unlock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 46b4e34..28ce21d 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2129,26 +2129,46 @@ done:
 }
 
 /*
- * This function must be called when inode is journaled in ordered mode
- * before truncation happens. It starts writeout of truncated part in
- * case it is in the committing transaction so that we stand to ordered
- * mode consistency guarantees.
+ * File truncate and transaction commit interact with each other in a
+ * non-trivial way.  If a transaction writing data block A is
+ * committing, we cannot discard the data by truncate until we have
+ * written them.  Otherwise if we crashed after the transaction with
+ * write has committed but before the transaction with truncate has
+ * committed, we could see stale data in block A.  This function is a
+ * helper to solve this problem.  It starts writeout of the truncated
+ * part in case it is in the committing transaction.
+ *
+ * Filesystem code must call this function when inode is journaled in
+ * ordered mode before truncation happens and after the inode has been
+ * placed on orphan list with the new inode size. The second condition
+ * avoids the race that someone writes new data and we start
+ * committing the transaction after this function has been called but
+ * before a transaction for truncate is started (and furthermore it
+ * allows us to optimize the case where the addition to orphan list
+ * happens in the same transaction as write --- we don't have to write
+ * any data in such case).
  */
-int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+int jbd2_journal_begin_ordered_truncate(journal_t *journal,
+					struct jbd2_inode *jinode,
 					loff_t new_size)
 {
-	journal_t *journal;
-	transaction_t *commit_trans;
+	transaction_t *inode_trans, *commit_trans;
 	int ret = 0;
 
-	if (!inode->i_transaction && !inode->i_next_transaction)
+	/* This is a quick check to avoid locking if not necessary */
+	if (!jinode->i_transaction)
 		goto out;
-	journal = inode->i_transaction->t_journal;
+	/* Locks are here just to force reading of recent values, it is
+	 * enough that the transaction was not committing before we started
+	 * a transaction adding the inode to orphan list */
 	spin_lock(&journal->j_state_lock);
 	commit_trans = journal->j_committing_transaction;
 	spin_unlock(&journal->j_state_lock);
-	if (inode->i_transaction == commit_trans) {
-		ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+	spin_lock(&journal->j_list_lock);
+	inode_trans = jinode->i_transaction;
+	spin_unlock(&journal->j_list_lock);
+	if (inode_trans == commit_trans) {
+		ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping,
 			new_size, LLONG_MAX);
 		if (ret)
 			jbd2_journal_abort(journal, ret);
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 3cceef4..e958010 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -95,13 +95,17 @@ static int jffs2_garbage_collect_thread(void *_c)
 			spin_unlock(&c->erase_completion_lock);
 			
 
-		/* This thread is purely an optimisation. But if it runs when
-		   other things could be running, it actually makes things a
-		   lot worse. Use yield() and put it at the back of the runqueue
-		   every time. Especially during boot, pulling an inode in
-		   with read_inode() is much preferable to having the GC thread
-		   get there first. */
-		yield();
+		/* Problem - immediately after bootup, the GCD spends a lot
+		 * of time in places like jffs2_kill_fragtree(); so much so
+		 * that userspace processes (like gdm and X) are starved
+		 * despite plenty of cond_resched()s and renicing.  Yield()
+		 * doesn't help, either (presumably because userspace and GCD
+		 * are generally competing for a higher latency resource -
+		 * disk).
+		 * This forces the GCD to slow the hell down.   Pulling an
+		 * inode in with read_inode() is much preferable to having
+		 * the GC thread get there first. */
+		schedule_timeout_interruptible(msecs_to_jiffies(50));
 
 		/* Put_super will send a SIGKILL and then wait on the sem.
 		 */
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 6ca08ad..1fc1e92 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -220,7 +220,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 				struct jffs2_tmp_dnode_info *tn)
 {
 	uint32_t fn_end = tn->fn->ofs + tn->fn->size;
-	struct jffs2_tmp_dnode_info *this;
+	struct jffs2_tmp_dnode_info *this, *ptn;
 
 	dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw));
 
@@ -251,11 +251,18 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 	if (this) {
 		/* If the node is coincident with another at a lower address,
 		   back up until the other node is found. It may be relevant */
-		while (this->overlapped)
-			this = tn_prev(this);
-
-		/* First node should never be marked overlapped */
-		BUG_ON(!this);
+		while (this->overlapped) {
+			ptn = tn_prev(this);
+			if (!ptn) {
+				/*
+				 * We killed a node which set the overlapped
+				 * flags during the scan. Fix it up.
+				 */
+				this->overlapped = 0;
+				break;
+			}
+			this = ptn;
+		}
 		dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole");
 	}
 
@@ -360,7 +367,17 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c,
 			}
 			if (!this->overlapped)
 				break;
-			this = tn_prev(this);
+
+			ptn = tn_prev(this);
+			if (!ptn) {
+				/*
+				 * We killed a node which set the overlapped
+				 * flags during the scan. Fix it up.
+				 */
+				this->overlapped = 0;
+				break;
+			}
+			this = ptn;
 		}
 	}
 
@@ -456,8 +473,15 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c,
 		eat_last(&rii->tn_root, &last->rb);
 		ver_insert(&ver_root, last);
 
-		if (unlikely(last->overlapped))
-			continue;
+		if (unlikely(last->overlapped)) {
+			if (pen)
+				continue;
+			/*
+			 * We killed a node which set the overlapped
+			 * flags during the scan. Fix it up.
+			 */
+			last->overlapped = 0;
+		}
 
 		/* Now we have a bunch of nodes in reverse version
 		   order, in the tree at ver_root. Most of the time,
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index d3e5c33..a166c16 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -233,7 +233,7 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr)
 
 	if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
 	    (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-		if (DQUOT_TRANSFER(inode, iattr))
+		if (vfs_dq_transfer(inode, iattr))
 			return -EDQUOT;
 	}
 
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b00ee9f..b2ae190 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -158,9 +158,9 @@ void jfs_delete_inode(struct inode *inode)
 		/*
 		 * Free the inode from the quota allocation.
 		 */
-		DQUOT_INIT(inode);
-		DQUOT_FREE_INODE(inode);
-		DQUOT_DROP(inode);
+		vfs_dq_init(inode);
+		vfs_dq_free_inode(inode);
+		vfs_dq_drop(inode);
 	}
 
 	clear_inode(inode);
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 4dcc058..925871e 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -381,10 +381,10 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 		 * It's time to move the inline table to an external
 		 * page and begin to build the xtree
 		 */
-		if (DQUOT_ALLOC_BLOCK(ip, sbi->nbperpage))
+		if (vfs_dq_alloc_block(ip, sbi->nbperpage))
 			goto clean_up;
 		if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) {
-			DQUOT_FREE_BLOCK(ip, sbi->nbperpage);
+			vfs_dq_free_block(ip, sbi->nbperpage);
 			goto clean_up;
 		}
 
@@ -408,7 +408,7 @@ static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot)
 			memcpy(&jfs_ip->i_dirtable, temp_table,
 			       sizeof (temp_table));
 			dbFree(ip, xaddr, sbi->nbperpage);
-			DQUOT_FREE_BLOCK(ip, sbi->nbperpage);
+			vfs_dq_free_block(ip, sbi->nbperpage);
 			goto clean_up;
 		}
 		ip->i_size = PSIZE;
@@ -1027,7 +1027,7 @@ static int dtSplitUp(tid_t tid,
 			n = xlen;
 
 		/* Allocate blocks to quota. */
-		if (DQUOT_ALLOC_BLOCK(ip, n)) {
+		if (vfs_dq_alloc_block(ip, n)) {
 			rc = -EDQUOT;
 			goto extendOut;
 		}
@@ -1308,7 +1308,7 @@ static int dtSplitUp(tid_t tid,
 
 	/* Rollback quota allocation */
 	if (rc && quota_allocation)
-		DQUOT_FREE_BLOCK(ip, quota_allocation);
+		vfs_dq_free_block(ip, quota_allocation);
 
       dtSplitUp_Exit:
 
@@ -1369,7 +1369,7 @@ static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split,
 		return -EIO;
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
 		release_metapage(rmp);
 		return -EDQUOT;
 	}
@@ -1916,7 +1916,7 @@ static int dtSplitRoot(tid_t tid,
 	rp = rmp->data;
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
 		release_metapage(rmp);
 		return -EDQUOT;
 	}
@@ -2287,7 +2287,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
 	xlen = lengthPXD(&fp->header.self);
 
 	/* Free quota allocation. */
-	DQUOT_FREE_BLOCK(ip, xlen);
+	vfs_dq_free_block(ip, xlen);
 
 	/* free/invalidate its buffer page */
 	discard_metapage(fmp);
@@ -2363,7 +2363,7 @@ static int dtDeleteUp(tid_t tid, struct inode *ip,
 				xlen = lengthPXD(&p->header.self);
 
 				/* Free quota allocation */
-				DQUOT_FREE_BLOCK(ip, xlen);
+				vfs_dq_free_block(ip, xlen);
 
 				/* free/invalidate its buffer page */
 				discard_metapage(mp);
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 16bc326..bbbd5f2 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -141,7 +141,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
 	}
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, nxlen)) {
+	if (vfs_dq_alloc_block(ip, nxlen)) {
 		dbFree(ip, nxaddr, (s64) nxlen);
 		mutex_unlock(&JFS_IP(ip)->commit_mutex);
 		return -EDQUOT;
@@ -164,7 +164,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
 	 */
 	if (rc) {
 		dbFree(ip, nxaddr, nxlen);
-		DQUOT_FREE_BLOCK(ip, nxlen);
+		vfs_dq_free_block(ip, nxlen);
 		mutex_unlock(&JFS_IP(ip)->commit_mutex);
 		return (rc);
 	}
@@ -256,7 +256,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 		goto exit;
 
 	/* Allocat blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, nxlen)) {
+	if (vfs_dq_alloc_block(ip, nxlen)) {
 		dbFree(ip, nxaddr, (s64) nxlen);
 		mutex_unlock(&JFS_IP(ip)->commit_mutex);
 		return -EDQUOT;
@@ -297,7 +297,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 		/* extend the extent */
 		if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) {
 			dbFree(ip, xaddr + xlen, delta);
-			DQUOT_FREE_BLOCK(ip, nxlen);
+			vfs_dq_free_block(ip, nxlen);
 			goto exit;
 		}
 	} else {
@@ -308,7 +308,7 @@ int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr)
 		 */
 		if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) {
 			dbFree(ip, nxaddr, nxlen);
-			DQUOT_FREE_BLOCK(ip, nxlen);
+			vfs_dq_free_block(ip, nxlen);
 			goto exit;
 		}
 	}
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index d4d142c..dc0e021 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -116,7 +116,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	/*
 	 * Allocate inode to quota.
 	 */
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		rc = -EDQUOT;
 		goto fail_drop;
 	}
@@ -162,7 +162,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
 	return inode;
 
 fail_drop:
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 fail_unlock:
 	inode->i_nlink = 0;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index adb2faf..1eff7db 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -47,5 +47,5 @@ extern const struct file_operations jfs_dir_operations;
 extern const struct inode_operations jfs_file_inode_operations;
 extern const struct file_operations jfs_file_operations;
 extern const struct inode_operations jfs_symlink_inode_operations;
-extern struct dentry_operations jfs_ci_dentry_operations;
+extern const struct dentry_operations jfs_ci_dentry_operations;
 #endif				/* _H_JFS_INODE */
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 44380fb..d654a64 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -585,10 +585,10 @@ int xtInsert(tid_t tid,		/* transaction id */
 			hint = addressXAD(xad) + lengthXAD(xad) - 1;
 		} else
 			hint = 0;
-		if ((rc = DQUOT_ALLOC_BLOCK(ip, xlen)))
+		if ((rc = vfs_dq_alloc_block(ip, xlen)))
 			goto out;
 		if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) {
-			DQUOT_FREE_BLOCK(ip, xlen);
+			vfs_dq_free_block(ip, xlen);
 			goto out;
 		}
 	}
@@ -617,7 +617,7 @@ int xtInsert(tid_t tid,		/* transaction id */
 			/* undo data extent allocation */
 			if (*xaddrp == 0) {
 				dbFree(ip, xaddr, (s64) xlen);
-				DQUOT_FREE_BLOCK(ip, xlen);
+				vfs_dq_free_block(ip, xlen);
 			}
 			return rc;
 		}
@@ -985,7 +985,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
 	rbn = addressPXD(pxd);
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
 		rc = -EDQUOT;
 		goto clean_up;
 	}
@@ -1195,7 +1195,7 @@ xtSplitPage(tid_t tid, struct inode *ip,
 
 	/* Rollback quota allocation. */
 	if (quota_allocation)
-		DQUOT_FREE_BLOCK(ip, quota_allocation);
+		vfs_dq_free_block(ip, quota_allocation);
 
 	return (rc);
 }
@@ -1252,7 +1252,7 @@ xtSplitRoot(tid_t tid,
 		return -EIO;
 
 	/* Allocate blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, lengthPXD(pxd))) {
+	if (vfs_dq_alloc_block(ip, lengthPXD(pxd))) {
 		release_metapage(rmp);
 		return -EDQUOT;
 	}
@@ -3680,7 +3680,7 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
 		ip->i_size = newsize;
 
 	/* update quota allocation to reflect freed blocks */
-	DQUOT_FREE_BLOCK(ip, nfreed);
+	vfs_dq_free_block(ip, nfreed);
 
 	/*
 	 * free tlock of invalidated pages
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index b4de56b..514ee2e 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -35,7 +35,7 @@
 /*
  * forward references
  */
-struct dentry_operations jfs_ci_dentry_operations;
+const struct dentry_operations jfs_ci_dentry_operations;
 
 static s64 commitZeroLink(tid_t, struct inode *);
 
@@ -356,7 +356,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry)
 	jfs_info("jfs_rmdir: dip:0x%p name:%s", dip, dentry->d_name.name);
 
 	/* Init inode for quota operations. */
-	DQUOT_INIT(ip);
+	vfs_dq_init(ip);
 
 	/* directory must be empty to be removed */
 	if (!dtEmpty(ip)) {
@@ -483,7 +483,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry)
 	jfs_info("jfs_unlink: dip:0x%p name:%s", dip, dentry->d_name.name);
 
 	/* Init inode for quota operations. */
-	DQUOT_INIT(ip);
+	vfs_dq_init(ip);
 
 	if ((rc = get_UCSname(&dname, dentry)))
 		goto out;
@@ -1136,7 +1136,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	} else if (new_ip) {
 		IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL);
 		/* Init inode for quota operations. */
-		DQUOT_INIT(new_ip);
+		vfs_dq_init(new_ip);
 	}
 
 	/*
@@ -1595,7 +1595,7 @@ out:
 	return result;
 }
 
-struct dentry_operations jfs_ci_dentry_operations =
+const struct dentry_operations jfs_ci_dentry_operations =
 {
 	.d_hash = jfs_ci_hash,
 	.d_compare = jfs_ci_compare,
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 9b7f2cd..61dfa81 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -260,14 +260,14 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
 	nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits;
 
 	/* Allocate new blocks to quota. */
-	if (DQUOT_ALLOC_BLOCK(ip, nblocks)) {
+	if (vfs_dq_alloc_block(ip, nblocks)) {
 		return -EDQUOT;
 	}
 
 	rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno);
 	if (rc) {
 		/*Rollback quota allocation. */
-		DQUOT_FREE_BLOCK(ip, nblocks);
+		vfs_dq_free_block(ip, nblocks);
 		return rc;
 	}
 
@@ -332,7 +332,7 @@ static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size,
 
       failed:
 	/* Rollback quota allocation. */
-	DQUOT_FREE_BLOCK(ip, nblocks);
+	vfs_dq_free_block(ip, nblocks);
 
 	dbFree(ip, blkno, nblocks);
 	return rc;
@@ -538,7 +538,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
 
 	if (blocks_needed > current_blocks) {
 		/* Allocate new blocks to quota. */
-		if (DQUOT_ALLOC_BLOCK(inode, blocks_needed))
+		if (vfs_dq_alloc_block(inode, blocks_needed))
 			return -EDQUOT;
 
 		quota_allocation = blocks_needed;
@@ -602,7 +602,7 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size)
       clean_up:
 	/* Rollback quota allocation */
 	if (quota_allocation)
-		DQUOT_FREE_BLOCK(inode, quota_allocation);
+		vfs_dq_free_block(inode, quota_allocation);
 
 	return (rc);
 }
@@ -677,7 +677,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf,
 
 	/* If old blocks exist, they must be removed from quota allocation. */
 	if (old_blocks)
-		DQUOT_FREE_BLOCK(inode, old_blocks);
+		vfs_dq_free_block(inode, old_blocks);
 
 	inode->i_ctime = CURRENT_TIME;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 49b4409..4910a36 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -44,7 +44,7 @@ static int simple_delete_dentry(struct dentry *dentry)
  */
 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
-	static struct dentry_operations simple_dentry_operations = {
+	static const struct dentry_operations simple_dentry_operations = {
 		.d_delete = simple_delete_dentry,
 	};
 
@@ -242,7 +242,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 
 Enomem:
 	up_write(&s->s_umount);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 1f3b0fc..aedc47a 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
 	return 0;
 }
 
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
+						 struct in6_addr *addr_mapped)
+{
+	const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
+
+	switch (sap->sa_family) {
+	case AF_INET6:
+		return &((const struct sockaddr_in6 *)sap)->sin6_addr;
+	case AF_INET:
+		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
+		return addr_mapped;
+	}
+
+	return NULL;
+}
+
+/*
+ * If lockd is using a PF_INET6 listener, all incoming requests appear
+ * to come from AF_INET6 remotes.  The address of AF_INET remotes are
+ * mapped to AF_INET6 automatically by the network layer.  In case the
+ * user passed an AF_INET server address at mount time, ensure both
+ * addresses are AF_INET6 before comparing them.
+ */
+static int nlmclnt_cmp_addr(const struct nlm_host *host,
+			    const struct sockaddr *sap)
+{
+	const struct in6_addr *addr1;
+	const struct in6_addr *addr2;
+	struct in6_addr addr1_mapped;
+	struct in6_addr addr2_mapped;
+
+	addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
+	if (likely(addr1 != NULL)) {
+		addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
+		if (likely(addr2 != NULL))
+			return ipv6_addr_equal(addr1, addr2);
+	}
+
+	return 0;
+}
+#else	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
+static int nlmclnt_cmp_addr(const struct nlm_host *host,
+			    const struct sockaddr *sap)
+{
+	return nlm_cmp_addr(nlm_addr(host), sap);
+}
+#endif	/* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
+
 /*
  * The server lockd has called us back to tell us the lock was granted
  */
@@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 		 */
 		if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
 			continue;
-		if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
+		if (!nlmclnt_cmp_addr(block->b_host, addr))
 			continue;
 		if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
 			continue;
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 6063a8e..763b78a 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -427,7 +427,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			goto out;
 		case -EAGAIN:
 			ret = nlm_lck_denied;
-			goto out;
+			break;
 		case FILE_LOCK_DEFERRED:
 			if (wait)
 				break;
@@ -443,6 +443,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			goto out;
 	}
 
+	ret = nlm_lck_denied;
+	if (!wait)
+		goto out;
+
 	ret = nlm_lck_blocked;
 
 	/* Append to list of blocked */
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index d1d1eb8..618865b 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -3,7 +3,7 @@
  *
  *  Copyright (C) 1991, 1992  Linus Torvalds
  *
- *  Copyright (C) 1996  Gertjan van Wingerde    (gertjan@cs.vu.nl)
+ *  Copyright (C) 1996  Gertjan van Wingerde
  *	Minix V2 fs support.
  *
  *  Modified for 680x0 by Andreas Schwab
diff --git a/fs/namei.c b/fs/namei.c
index bbc15c2..d040ce1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -24,6 +24,7 @@
 #include <linux/fsnotify.h>
 #include <linux/personality.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 #include <linux/syscalls.h>
 #include <linux/mount.h>
 #include <linux/audit.h>
@@ -850,6 +851,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 		if (err == -EAGAIN)
 			err = inode_permission(nd->path.dentry->d_inode,
 					       MAY_EXEC);
+		if (!err)
+			err = ima_path_check(&nd->path, MAY_EXEC);
  		if (err)
 			break;
 
@@ -1470,7 +1473,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	error = security_inode_create(dir, dentry, mode);
 	if (error)
 		return error;
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->create(dir, dentry, mode, nd);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -1486,29 +1489,32 @@ int may_open(struct path *path, int acc_mode, int flag)
 	if (!inode)
 		return -ENOENT;
 
-	if (S_ISLNK(inode->i_mode))
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFLNK:
 		return -ELOOP;
-	
-	if (S_ISDIR(inode->i_mode) && (acc_mode & MAY_WRITE))
-		return -EISDIR;
-
-	/*
-	 * FIFO's, sockets and device files are special: they don't
-	 * actually live on the filesystem itself, and as such you
-	 * can write to them even if the filesystem is read-only.
-	 */
-	if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
-	    	flag &= ~O_TRUNC;
-	} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
+	case S_IFDIR:
+		if (acc_mode & MAY_WRITE)
+			return -EISDIR;
+		break;
+	case S_IFBLK:
+	case S_IFCHR:
 		if (path->mnt->mnt_flags & MNT_NODEV)
 			return -EACCES;
-
+		/*FALLTHRU*/
+	case S_IFIFO:
+	case S_IFSOCK:
 		flag &= ~O_TRUNC;
+		break;
 	}
 
 	error = inode_permission(inode, acc_mode);
 	if (error)
 		return error;
+
+	error = ima_path_check(path,
+			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+	if (error)
+		return error;
 	/*
 	 * An append-only file must be opened in append mode for writing.
 	 */
@@ -1544,7 +1550,7 @@ int may_open(struct path *path, int acc_mode, int flag)
 			error = security_path_truncate(path, 0,
 					       ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
 		if (!error) {
-			DQUOT_INIT(inode);
+			vfs_dq_init(inode);
 
 			error = do_truncate(dentry, 0,
 					    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
@@ -1555,7 +1561,7 @@ int may_open(struct path *path, int acc_mode, int flag)
 			return error;
 	} else
 		if (flag & FMODE_WRITE)
-			DQUOT_INIT(inode);
+			vfs_dq_init(inode);
 
 	return 0;
 }
@@ -1938,7 +1944,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	if (error)
 		return error;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->mknod(dir, dentry, mode, dev);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -2037,7 +2043,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	if (error)
 		return error;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->mkdir(dir, dentry, mode);
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
@@ -2123,7 +2129,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->rmdir)
 		return -EPERM;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 
 	mutex_lock(&dentry->d_inode->i_mutex);
 	dentry_unhash(dentry);
@@ -2210,7 +2216,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 	if (!dir->i_op->unlink)
 		return -EPERM;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 
 	mutex_lock(&dentry->d_inode->i_mutex);
 	if (d_mountpoint(dentry))
@@ -2321,7 +2327,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
 	if (error)
 		return error;
 
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->symlink(dir, dentry, oldname);
 	if (!error)
 		fsnotify_create(dir, dentry);
@@ -2405,7 +2411,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		return error;
 
 	mutex_lock(&inode->i_mutex);
-	DQUOT_INIT(dir);
+	vfs_dq_init(dir);
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
@@ -2604,8 +2610,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (!old_dir->i_op->rename)
 		return -EPERM;
 
-	DQUOT_INIT(old_dir);
-	DQUOT_INIT(new_dir);
+	vfs_dq_init(old_dir);
+	vfs_dq_init(new_dir);
 
 	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 228d8c4..0a42e0e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -397,11 +397,10 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	spin_unlock(&vfsmount_lock);
 }
 
-int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
+void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
 	mnt->mnt_root = dget(sb->s_root);
-	return 0;
 }
 
 EXPORT_SYMBOL(simple_set_mnt);
@@ -614,9 +613,11 @@ static inline void __mntput(struct vfsmount *mnt)
 	 */
 	for_each_possible_cpu(cpu) {
 		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		if (cpu_writer->mnt != mnt)
-			continue;
 		spin_lock(&cpu_writer->lock);
+		if (cpu_writer->mnt != mnt) {
+			spin_unlock(&cpu_writer->lock);
+			continue;
+		}
 		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
 		cpu_writer->count = 0;
 		/*
@@ -778,6 +779,7 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
 		{ MNT_NOATIME, ",noatime" },
 		{ MNT_NODIRATIME, ",nodiratime" },
 		{ MNT_RELATIME, ",relatime" },
+		{ MNT_STRICTATIME, ",strictatime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
@@ -1917,6 +1919,9 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (data_page)
 		((char *)data_page)[PAGE_SIZE - 1] = 0;
 
+	/* Default to relatime */
+	mnt_flags |= MNT_RELATIME;
+
 	/* Separate the per-mountpoint flags */
 	if (flags & MS_NOSUID)
 		mnt_flags |= MNT_NOSUID;
@@ -1928,13 +1933,14 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		mnt_flags |= MNT_NOATIME;
 	if (flags & MS_NODIRATIME)
 		mnt_flags |= MNT_NODIRATIME;
-	if (flags & MS_RELATIME)
-		mnt_flags |= MNT_RELATIME;
+	if (flags & MS_STRICTATIME)
+		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
 	if (flags & MS_RDONLY)
 		mnt_flags |= MNT_READONLY;
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
-		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
+		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
+		   MS_STRICTATIME);
 
 	/* ... and get the mountpoint */
 	retval = kern_path(dir_name, LOOKUP_FOLLOW, &path);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 07e9715..9c59072 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -79,7 +79,7 @@ static int ncp_hash_dentry(struct dentry *, struct qstr *);
 static int ncp_compare_dentry (struct dentry *, struct qstr *, struct qstr *);
 static int ncp_delete_dentry(struct dentry *);
 
-static struct dentry_operations ncp_dentry_operations =
+static const struct dentry_operations ncp_dentry_operations =
 {
 	.d_revalidate	= ncp_lookup_validate,
 	.d_hash		= ncp_hash_dentry,
@@ -87,7 +87,7 @@ static struct dentry_operations ncp_dentry_operations =
 	.d_delete	= ncp_delete_dentry,
 };
 
-struct dentry_operations ncp_root_dentry_operations =
+const struct dentry_operations ncp_root_dentry_operations =
 {
 	.d_hash		= ncp_hash_dentry,
 	.d_compare	= ncp_compare_dentry,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 9b728f3..574158a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -255,6 +255,32 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
 	}
 	return 0;
 }
+
+/*
+ * Test if two ip6 socket addresses refer to the same socket by
+ * comparing relevant fields. The padding bytes specifically, are not
+ * compared. sin6_flowinfo is not compared because it only affects QoS
+ * and sin6_scope_id is only compared if the address is "link local"
+ * because "link local" addresses need only be unique to a specific
+ * link. Conversely, ordinary unicast addresses might have different
+ * sin6_scope_id.
+ *
+ * The caller should ensure both socket addresses are AF_INET6.
+ */
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
+	const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
+
+	if (!ipv6_addr_equal(&saddr1->sin6_addr,
+			     &saddr1->sin6_addr))
+		return 0;
+	if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+	    saddr1->sin6_scope_id != saddr2->sin6_scope_id)
+		return 0;
+	return saddr1->sin6_port == saddr2->sin6_port;
+}
 #else
 static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
 				 const struct sockaddr_in *sa2)
@@ -270,9 +296,52 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
 	return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
 			(const struct sockaddr_in *)sa2);
 }
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
+				const struct sockaddr * sa2)
+{
+	return 0;
+}
 #endif
 
 /*
+ * Test if two ip4 socket addresses refer to the same socket, by
+ * comparing relevant fields. The padding bytes specifically, are
+ * not compared.
+ *
+ * The caller should ensure both socket addresses are AF_INET.
+ */
+static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
+				const struct sockaddr *sa2)
+{
+	const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
+	const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
+
+	if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
+		return 0;
+	return saddr1->sin_port == saddr2->sin_port;
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields.
+ */
+static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
+			    const struct sockaddr *sa2)
+{
+	if (sa1->sa_family != sa2->sa_family)
+		return 0;
+
+	switch (sa1->sa_family) {
+	case AF_INET:
+		return nfs_sockaddr_cmp_ip4(sa1, sa2);
+	case AF_INET6:
+		return nfs_sockaddr_cmp_ip6(sa1, sa2);
+	}
+	return 0;
+}
+
+/*
  * Find a client by IP address and protocol version
  * - returns NULL if no such client
  */
@@ -344,8 +413,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
 static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data)
 {
 	struct nfs_client *clp;
+	const struct sockaddr *sap = data->addr;
 
 	list_for_each_entry(clp, &nfs_client_list, cl_share_link) {
+	        const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr;
 		/* Don't match clients that failed to initialise properly */
 		if (clp->cl_cons_state < 0)
 			continue;
@@ -358,7 +429,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 			continue;
 
 		/* Match the full socket address */
-		if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0)
+		if (!nfs_sockaddr_cmp(sap, clap))
 			continue;
 
 		atomic_inc(&clp->cl_count);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e35c819..78bf72f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -899,7 +899,7 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
 	iput(inode);
 }
 
-struct dentry_operations nfs_dentry_operations = {
+const struct dentry_operations nfs_dentry_operations = {
 	.d_revalidate	= nfs_lookup_revalidate,
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
@@ -967,7 +967,7 @@ out:
 #ifdef CONFIG_NFS_V4
 static int nfs_open_revalidate(struct dentry *, struct nameidata *);
 
-struct dentry_operations nfs4_dentry_operations = {
+const struct dentry_operations nfs4_dentry_operations = {
 	.d_revalidate	= nfs_open_revalidate,
 	.d_delete	= nfs_dentry_delete,
 	.d_iput		= nfs_dentry_iput,
@@ -1892,8 +1892,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
 	cache.cred = cred;
 	cache.jiffies = jiffies;
 	status = NFS_PROTO(inode)->access(inode, &cache);
-	if (status != 0)
+	if (status != 0) {
+		if (status == -ESTALE) {
+			nfs_zap_caches(inode);
+			if (!S_ISDIR(inode->i_mode))
+				set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+		}
 		return status;
+	}
 	nfs_access_add_cache(inode, &cache);
 out:
 	if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index cef6255..6bbf0e6 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 {
 	struct nfs_server *server = NFS_SERVER(inode);
 	struct nfs_fattr fattr;
-	struct page *pages[NFSACL_MAXPAGES] = { };
+	struct page *pages[NFSACL_MAXPAGES];
 	struct nfs3_setaclargs args = {
 		.inode = inode,
 		.mask = NFS_ACL,
@@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 		.rpc_argp	= &args,
 		.rpc_resp	= &fattr,
 	};
-	int status, count;
+	int status;
 
 	status = -EOPNOTSUPP;
 	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	if (S_ISDIR(inode->i_mode)) {
 		args.mask |= NFS_DFACL;
 		args.acl_default = dfacl;
+		args.len = nfsacl_size(acl, dfacl);
+	} else
+		args.len = nfsacl_size(acl, NULL);
+
+	if (args.len > NFS_ACL_INLINE_BUFSIZE) {
+		unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT);
+
+		status = -ENOMEM;
+		do {
+			args.pages[args.npages] = alloc_page(GFP_KERNEL);
+			if (args.pages[args.npages] == NULL)
+				goto out_freepages;
+			args.npages++;
+		} while (args.npages < npages);
 	}
 
 	dprintk("NFS call setacl\n");
@@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	nfs_zap_acl_cache(inode);
 	dprintk("NFS reply setacl: %d\n", status);
 
-	/* pages may have been allocated at the xdr layer. */
-	for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
-		__free_page(args.pages[count]);
-
 	switch (status) {
 		case 0:
 			status = nfs_refresh_inode(inode, &fattr);
@@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 		case -ENOTSUPP:
 			status = -EOPNOTSUPP;
 	}
+out_freepages:
+	while (args.npages != 0) {
+		args.npages--;
+		__free_page(args.pages[args.npages]);
+	}
 out:
 	return status;
 }
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 11cddde..6cdeacf 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -82,8 +82,10 @@
 #define NFS3_commitres_sz	(1+NFS3_wcc_data_sz+2)
 
 #define ACL3_getaclargs_sz	(NFS3_fh_sz+1)
-#define ACL3_setaclargs_sz	(NFS3_fh_sz+1+2*(2+5*3))
-#define ACL3_getaclres_sz	(1+NFS3_post_op_attr_sz+1+2*(2+5*3))
+#define ACL3_setaclargs_sz	(NFS3_fh_sz+1+ \
+				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
+#define ACL3_getaclres_sz	(1+NFS3_post_op_attr_sz+1+ \
+				XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE))
 #define ACL3_setaclres_sz	(1+NFS3_post_op_attr_sz)
 
 /*
@@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p,
                    struct nfs3_setaclargs *args)
 {
 	struct xdr_buf *buf = &req->rq_snd_buf;
-	unsigned int base, len_in_head, len = nfsacl_size(
-		(args->mask & NFS_ACL)   ? args->acl_access  : NULL,
-		(args->mask & NFS_DFACL) ? args->acl_default : NULL);
-	int count, err;
+	unsigned int base;
+	int err;
 
 	p = xdr_encode_fhandle(p, NFS_FH(args->inode));
 	*p++ = htonl(args->mask);
-	base = (char *)p - (char *)buf->head->iov_base;
-	/* put as much of the acls into head as possible. */
-	len_in_head = min_t(unsigned int, buf->head->iov_len - base, len);
-	len -= len_in_head;
-	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2));
-
-	for (count = 0; (count << PAGE_SHIFT) < len; count++) {
-		args->pages[count] = alloc_page(GFP_KERNEL);
-		if (!args->pages[count]) {
-			while (count)
-				__free_page(args->pages[--count]);
-			return -ENOMEM;
-		}
-	}
-	xdr_encode_pages(buf, args->pages, 0, len);
+	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+	base = req->rq_slen;
+
+	if (args->npages != 0)
+		xdr_encode_pages(buf, args->pages, 0, args->len);
+	else
+		req->rq_slen += args->len;
 
 	err = nfsacl_encode(buf, base, args->inode,
 			    (args->mask & NFS_ACL) ?
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 4e4d332..84345de 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -179,7 +179,7 @@ struct nfs4_state_recovery_ops {
 	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
 };
 
-extern struct dentry_operations nfs4_dentry_operations;
+extern const struct dentry_operations nfs4_dentry_operations;
 extern const struct inode_operations nfs4_dir_inode_operations;
 
 /* inode.c */
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 30befc3..2a2a0a7 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -21,7 +21,9 @@
 #define NFSDBG_FACILITY		NFSDBG_VFS
 
 /*
- * Check if fs_root is valid
+ * Convert the NFSv4 pathname components into a standard posix path.
+ *
+ * Note that the resulting string will be placed at the end of the buffer
  */
 static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname,
 					 char *buffer, ssize_t buflen)
@@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 {
 	struct vfsmount *mnt = ERR_PTR(-ENOENT);
 	char *mnt_path;
-	int page2len;
+	unsigned int maxbuflen;
 	unsigned int s;
 
 	mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE);
 	if (IS_ERR(mnt_path))
 		return mnt;
 	mountdata->mnt_path = mnt_path;
-	page2 += strlen(mnt_path) + 1;
-	page2len = PAGE_SIZE - strlen(mnt_path) - 1;
+	maxbuflen = mnt_path - 1 - page2;
 
 	for (s = 0; s < location->nservers; s++) {
 		const struct nfs4_string *buf = &location->servers[s];
 		struct sockaddr_storage addr;
 
-		if (buf->len <= 0 || buf->len >= PAGE_SIZE)
+		if (buf->len <= 0 || buf->len >= maxbuflen)
 			continue;
 
 		mountdata->addr = (struct sockaddr *)&addr;
@@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 			continue;
 		nfs_set_port(mountdata->addr, NFS_PORT);
 
-		strncpy(page2, buf->data, page2len);
-		page2[page2len] = '\0';
+		memcpy(page2, buf->data, buf->len);
+		page2[buf->len] = '\0';
 		mountdata->hostname = page2;
 
 		snprintf(page, PAGE_SIZE, "%s:%s",
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f65953b..9250067 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2596,6 +2596,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_LOOKUPP]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_NVERIFY]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_OPEN]		= (nfsd4_enc)nfsd4_encode_open,
+	[OP_OPENATTR]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_OPEN_CONFIRM]	= (nfsd4_enc)nfsd4_encode_open_confirm,
 	[OP_OPEN_DOWNGRADE]	= (nfsd4_enc)nfsd4_encode_open_downgrade,
 	[OP_PUTFH]		= (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6e50aaa..78376b6 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -356,7 +356,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 			put_write_access(inode);
 			goto out_nfserr;
 		}
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 	}
 
 	/* sanitize the mode change */
@@ -723,7 +723,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 		else
 			flags = O_WRONLY|O_LARGEFILE;
 
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 	}
 	*filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt),
 			    flags, cred);
@@ -998,8 +998,11 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 
 	if (!EX_ISSYNC(exp))
 		stable = 0;
-	if (stable && !EX_WGATHER(exp))
+	if (stable && !EX_WGATHER(exp)) {
+		spin_lock(&file->f_lock);
 		file->f_flags |= O_SYNC;
+		spin_unlock(&file->f_lock);
+	}
 
 	/* Write the data. */
 	oldfs = get_fs(); set_fs(KERNEL_DS);
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28..220c13f 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih,
 	int ret;
 
 	do {
-		if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL)))
+		if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS)))
 			return -ENOSPC;
 		ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd);
 	} while (ret == -EAGAIN);
@@ -380,6 +380,14 @@ void inotify_unmount_inodes(struct list_head *list)
 		struct list_head *watches;
 
 		/*
+		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
+		 * I_WILL_FREE, or I_NEW which is fine because by that point
+		 * the inode cannot have any associated watches.
+		 */
+		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
+			continue;
+
+		/*
 		 * If i_count is zero, the inode cannot have any watches and
 		 * doing an __iget/iput with MS_ACTIVE clear would actually
 		 * evict all inodes with zero i_count from icache which is
@@ -388,14 +396,6 @@ void inotify_unmount_inodes(struct list_head *list)
 		if (!atomic_read(&inode->i_count))
 			continue;
 
-		/*
-		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, or
-		 * I_WILL_FREE which is fine because by that point the inode
-		 * cannot have any associated watches.
-		 */
-		if (inode->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))
-			continue;
-
 		need_iput_tmp = need_iput;
 		need_iput = NULL;
 		/* In case inotify_remove_watch_locked() drops a reference. */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d861096..19e3a96 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
 
 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
 	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
-			(OCFS2_I(inode)->ip_clusters != rec->e_cpos),
+			(OCFS2_I(inode)->ip_clusters !=
+			 le32_to_cpu(rec->e_cpos)),
 			"Device %s, asking for sparse allocation: inode %llu, "
 			"cpos %u, clusters %u\n",
 			osb->dev_str,
@@ -4796,6 +4797,29 @@ out:
 	return ret;
 }
 
+static int ocfs2_replace_extent_rec(struct inode *inode,
+				    handle_t *handle,
+				    struct ocfs2_path *path,
+				    struct ocfs2_extent_list *el,
+				    int split_index,
+				    struct ocfs2_extent_rec *split_rec)
+{
+	int ret;
+
+	ret = ocfs2_path_bh_journal_access(handle, inode, path,
+					   path_num_items(path) - 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el->l_recs[split_index] = *split_rec;
+
+	ocfs2_journal_dirty(handle, path_leaf_bh(path));
+out:
+	return ret;
+}
+
 /*
  * Mark part or all of the extent record at split_index in the leaf
  * pointed to by path as written. This removes the unwritten
@@ -4885,7 +4909,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
 
 	if (ctxt.c_contig_type == CONTIG_NONE) {
 		if (ctxt.c_split_covers_rec)
-			el->l_recs[split_index] = *split_rec;
+			ret = ocfs2_replace_extent_rec(inode, handle,
+						       path, el,
+						       split_index, split_rec);
 		else
 			ret = ocfs2_split_and_insert(inode, handle, path, et,
 						     &last_eb_bh, split_index,
@@ -5390,6 +5416,9 @@ int ocfs2_remove_btree_range(struct inode *inode,
 		goto out;
 	}
 
+	vfs_dq_free_space_nodirty(inode,
+				  ocfs2_clusters_to_bytes(inode->i_sb, len));
+
 	ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
 				  dealloc);
 	if (ret) {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a067a6c..8e1709a 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page,
 	size = i_size_read(inode);
 
 	if (size > PAGE_CACHE_SIZE ||
-	    size > ocfs2_max_inline_data(inode->i_sb)) {
+	    size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) {
 		ocfs2_error(inode->i_sb,
 			    "Inode %llu has with inline data has bad size: %Lu",
 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
 	int ret, written = 0;
 	loff_t end = pos + len;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_dinode *di = NULL;
 
 	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
 	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
@@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
 	/*
 	 * Check whether the write can fit.
 	 */
-	if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb))
+	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+	if (mmap_page ||
+	    end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di))
 		return 0;
 
 do_inline_write:
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index b1cc7c3..7d60448 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -38,6 +38,7 @@
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
+#include "super.h"
 
 
 static int ocfs2_dentry_revalidate(struct dentry *dentry,
@@ -294,6 +295,34 @@ out_attach:
 	return ret;
 }
 
+static DEFINE_SPINLOCK(dentry_list_lock);
+
+/* We limit the number of dentry locks to drop in one go. We have
+ * this limit so that we don't starve other users of ocfs2_wq. */
+#define DL_INODE_DROP_COUNT 64
+
+/* Drop inode references from dentry locks */
+void ocfs2_drop_dl_inodes(struct work_struct *work)
+{
+	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+					       dentry_lock_work);
+	struct ocfs2_dentry_lock *dl;
+	int drop_count = DL_INODE_DROP_COUNT;
+
+	spin_lock(&dentry_list_lock);
+	while (osb->dentry_lock_list && drop_count--) {
+		dl = osb->dentry_lock_list;
+		osb->dentry_lock_list = dl->dl_next;
+		spin_unlock(&dentry_list_lock);
+		iput(dl->dl_inode);
+		kfree(dl);
+		spin_lock(&dentry_list_lock);
+	}
+	if (osb->dentry_lock_list)
+		queue_work(ocfs2_wq, &osb->dentry_lock_work);
+	spin_unlock(&dentry_list_lock);
+}
+
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -318,16 +347,23 @@ out_attach:
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
 				   struct ocfs2_dentry_lock *dl)
 {
-	iput(dl->dl_inode);
 	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
 	ocfs2_lock_res_free(&dl->dl_lockres);
-	kfree(dl);
+
+	/* We leave dropping of inode reference to ocfs2_wq as that can
+	 * possibly lead to inode deletion which gets tricky */
+	spin_lock(&dentry_list_lock);
+	if (!osb->dentry_lock_list)
+		queue_work(ocfs2_wq, &osb->dentry_lock_work);
+	dl->dl_next = osb->dentry_lock_list;
+	osb->dentry_lock_list = dl;
+	spin_unlock(&dentry_list_lock);
 }
 
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl)
 {
-	int unlock = 0;
+	int unlock;
 
 	BUG_ON(dl->dl_count == 0);
 
@@ -419,7 +455,7 @@ out_move:
 	d_move(dentry, target);
 }
 
-struct dentry_operations ocfs2_dentry_ops = {
+const struct dentry_operations ocfs2_dentry_ops = {
 	.d_revalidate		= ocfs2_dentry_revalidate,
 	.d_iput			= ocfs2_dentry_iput,
 };
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h
index c091c34..faa12e7 100644
--- a/fs/ocfs2/dcache.h
+++ b/fs/ocfs2/dcache.h
@@ -26,11 +26,16 @@
 #ifndef OCFS2_DCACHE_H
 #define OCFS2_DCACHE_H
 
-extern struct dentry_operations ocfs2_dentry_ops;
+extern const struct dentry_operations ocfs2_dentry_ops;
 
 struct ocfs2_dentry_lock {
+	/* Use count of dentry lock */
 	unsigned int		dl_count;
-	u64			dl_parent_blkno;
+	union {
+		/* Linked list of dentry locks to release */
+		struct ocfs2_dentry_lock *dl_next;
+		u64			dl_parent_blkno;
+	};
 
 	/*
 	 * The ocfs2_dentry_lock keeps an inode reference until
@@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode,
 void ocfs2_dentry_lock_put(struct ocfs2_super *osb,
 			   struct ocfs2_dentry_lock *dl);
 
+void ocfs2_drop_dl_inodes(struct work_struct *work);
+
 struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno,
 				      int skip_unhashed);
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 54e182a..0a28139 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1849,12 +1849,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
 		if (!mle) {
 			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
 			    res->owner != assert->node_idx) {
-				mlog(ML_ERROR, "assert_master from "
-					  "%u, but current owner is "
-					  "%u! (%.*s)\n",
-				       assert->node_idx, res->owner,
-				       namelen, name);
-				goto kill;
+				mlog(ML_ERROR, "DIE! Mastery assert from %u, "
+				     "but current owner is %u! (%.*s)\n",
+				     assert->node_idx, res->owner, namelen,
+				     name);
+				__dlm_print_one_lock_resource(res);
+				BUG();
 			}
 		} else if (mle->type != DLM_MLE_MIGRATION) {
 			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d129520..4060bb3 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,8 +181,7 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 		spin_lock(&res->spinlock);
 		/* This ensures that clear refmap is sent after the set */
-		__dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
-						  DLM_LOCK_RES_MIGRATING));
+		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
 		spin_unlock(&res->spinlock);
 
 		/* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 86ca085..fcf879e 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -117,11 +117,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 	else
 		BUG_ON(res->owner == dlm->node_num);
 
-	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->ast_lock);
 	/* We want to be sure that we're not freeing a lock
 	 * that still has AST's pending... */
 	in_use = !list_empty(&lock->ast_list);
-	spin_unlock(&dlm->spinlock);
+	spin_unlock(&dlm->ast_lock);
 	if (in_use) {
 	       mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock "
 		    "while waiting for an ast!", res->lockname.len,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b0c4cad..7219a86 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -320,9 +320,14 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres);
 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 						int convert);
-#define ocfs2_log_dlm_error(_func, _err, _lockres) do {			\
-	mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
-	     _err, _func, _lockres->l_name);				\
+#define ocfs2_log_dlm_error(_func, _err, _lockres) do {					\
+	if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY)				\
+		mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n",	\
+		     _err, _func, _lockres->l_name);					\
+	else										\
+		mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n",	\
+		     _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name,	\
+		     (unsigned int)ocfs2_get_dentry_lock_ino(_lockres));		\
 } while (0)
 static int ocfs2_downconvert_thread(void *arg);
 static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -2860,6 +2865,10 @@ static void ocfs2_unlock_ast(void *opaque, int error)
 	case OCFS2_UNLOCK_CANCEL_CONVERT:
 		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
 		lockres->l_action = OCFS2_AST_INVALID;
+		/* Downconvert thread may have requeued this lock, we
+		 * need to wake it. */
+		if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
+			ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres));
 		break;
 	case OCFS2_UNLOCK_DROP_LOCK:
 		lockres->l_level = DLM_LOCK_IV;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3c3532e..172850a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
 static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
 					       loff_t new_size)
 {
-	return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode,
-						   new_size);
+	return jbd2_journal_begin_ordered_truncate(
+				OCFS2_SB(inode->i_sb)->journal->j_journal,
+				&OCFS2_I(inode)->ip_jinode,
+				new_size);
 }
 
 #endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 084aba8..4b11762 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -532,7 +532,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
 
 		fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
 
-		fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb));
+		fe->id2.i_data.id_count = cpu_to_le16(
+				ocfs2_max_inline_data_with_xattr(osb->sb, fe));
 	} else {
 		fel = &fe->id2.i_list;
 		fel->l_tree_depth = 0;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index ad5c24a..946d3c3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -210,6 +210,7 @@ struct ocfs2_journal;
 struct ocfs2_slot_info;
 struct ocfs2_recovery_map;
 struct ocfs2_quota_recovery;
+struct ocfs2_dentry_lock;
 struct ocfs2_super
 {
 	struct task_struct *commit_task;
@@ -325,6 +326,11 @@ struct ocfs2_super
 	struct list_head blocked_lock_list;
 	unsigned long blocked_lock_count;
 
+	/* List of dentry locks to release. Anyone can add locks to
+	 * the list, ocfs2_wq processes the list  */
+	struct ocfs2_dentry_lock *dentry_lock_list;
+	struct work_struct dentry_lock_work;
+
 	wait_queue_head_t		osb_mount_event;
 
 	/* Truncate log info */
@@ -335,6 +341,9 @@ struct ocfs2_super
 	struct ocfs2_node_map		osb_recovering_orphan_dirs;
 	unsigned int			*osb_orphan_wipes;
 	wait_queue_head_t		osb_wipe_event;
+
+	/* used to protect metaecc calculation check of xattr. */
+	spinlock_t osb_xattr_lock;
 };
 
 #define OCFS2_SB(sb)	    ((struct ocfs2_super *)(sb)->s_fs_info)
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index c7ae45a..2332ef7 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -1070,12 +1070,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
 		 offsetof(struct ocfs2_dinode, id2.i_symlink);
 }
 
-static inline int ocfs2_max_inline_data(struct super_block *sb)
-{
-	return sb->s_blocksize -
-		offsetof(struct ocfs2_dinode, id2.i_data.id_data);
-}
-
 static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb,
 						   struct ocfs2_dinode *di)
 {
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index f4efa89..1ed0f7c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -754,7 +754,9 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 	if (dquot->dq_flags & mask)
 		sync = 1;
 	spin_unlock(&dq_data_lock);
-	if (!sync) {
+	/* This is a slight hack but we can't afford getting global quota
+	 * lock if we already have a transaction started. */
+	if (!sync || journal_current_handle()) {
 		status = ocfs2_write_dquot(dquot);
 		goto out;
 	}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 43ed113..7ac83a8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1537,6 +1537,13 @@ static int ocfs2_get_sector(struct super_block *sb,
 	unlock_buffer(*bh);
 	ll_rw_block(READ, 1, bh);
 	wait_on_buffer(*bh);
+	if (!buffer_uptodate(*bh)) {
+		mlog_errno(-EIO);
+		brelse(*bh);
+		*bh = NULL;
+		return -EIO;
+	}
+
 	return 0;
 }
 
@@ -1747,6 +1754,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	INIT_LIST_HEAD(&osb->blocked_lock_list);
 	osb->blocked_lock_count = 0;
 	spin_lock_init(&osb->osb_lock);
+	spin_lock_init(&osb->osb_xattr_lock);
 	ocfs2_init_inode_steal_slot(osb);
 
 	atomic_set(&osb->alloc_stats.moves, 0);
@@ -1887,6 +1895,9 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery);
 	journal->j_state = OCFS2_JOURNAL_FREE;
 
+	INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes);
+	osb->dentry_lock_list = NULL;
+
 	/* get some pseudo constants for clustersize bits */
 	osb->s_clustersize_bits =
 		le32_to_cpu(di->id2.i_super.s_clustersize_bits);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e1d638a..2563df8 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -82,13 +82,14 @@ struct ocfs2_xattr_set_ctxt {
 
 #define OCFS2_XATTR_ROOT_SIZE	(sizeof(struct ocfs2_xattr_def_value_root))
 #define OCFS2_XATTR_INLINE_SIZE	80
+#define OCFS2_XATTR_HEADER_GAP	4
 #define OCFS2_XATTR_FREE_IN_IBODY	(OCFS2_MIN_XATTR_INLINE_SIZE \
 					 - sizeof(struct ocfs2_xattr_header) \
-					 - sizeof(__u32))
+					 - OCFS2_XATTR_HEADER_GAP)
 #define OCFS2_XATTR_FREE_IN_BLOCK(ptr)	((ptr)->i_sb->s_blocksize \
 					 - sizeof(struct ocfs2_xattr_block) \
 					 - sizeof(struct ocfs2_xattr_header) \
-					 - sizeof(__u32))
+					 - OCFS2_XATTR_HEADER_GAP)
 
 static struct ocfs2_xattr_def_value_root def_xv = {
 	.xv.xr_list.l_count = cpu_to_le16(1),
@@ -274,10 +275,12 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
 			       bucket->bu_blocks, bucket->bu_bhs, 0,
 			       NULL);
 	if (!rc) {
+		spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 		rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
 						 bucket->bu_bhs,
 						 bucket->bu_blocks,
 						 &bucket_xh(bucket)->xh_check);
+		spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 		if (rc)
 			mlog_errno(rc);
 	}
@@ -310,9 +313,11 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
 {
 	int i;
 
+	spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 	ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
 				   bucket->bu_bhs, bucket->bu_blocks,
 				   &bucket_xh(bucket)->xh_check);
+	spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock);
 
 	for (i = 0; i < bucket->bu_blocks; i++)
 		ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
@@ -542,8 +547,12 @@ int ocfs2_calc_xattr_init(struct inode *dir,
 	 * when blocksize = 512, may reserve one more cluser for
 	 * xattr bucket, otherwise reserve one metadata block
 	 * for them is ok.
+	 * If this is a new directory with inline data,
+	 * we choose to reserve the entire inline area for
+	 * directory contents and force an external xattr block.
 	 */
 	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) ||
 	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
 		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
 		if (ret) {
@@ -1507,7 +1516,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
 		last += 1;
 	}
 
-	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
 	if (free < 0)
 		return -EIO;
 
@@ -2190,7 +2199,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
 		last += 1;
 	}
 
-	free = min_offs - ((void *)last - xs->base) - sizeof(__u32);
+	free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP;
 	if (free < 0)
 		return 0;
 
@@ -2592,8 +2601,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode,
 
 	if (!ret) {
 		/* Update inode ctime. */
-		ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
-					   OCFS2_JOURNAL_ACCESS_WRITE);
+		ret = ocfs2_journal_access_di(ctxt->handle, inode,
+					      xis->inode_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -4729,13 +4739,6 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	vb.vb_xv = (struct ocfs2_xattr_value_root *)
 		(vb.vb_bh->b_data + offset % blocksize);
 
-	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
-						OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
-	}
-
 	/*
 	 * From here on out we have to dirty the bucket.  The generic
 	 * value calls only modify one of the bucket's bhs, but we need
@@ -4748,12 +4751,18 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_dirty;
+		goto out;
+	}
+
+	ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+						OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
 
 	xe->xe_value_size = cpu_to_le64(len);
 
-out_dirty:
 	ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
 
 out:
@@ -4786,19 +4795,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
 						char *val,
 						int value_len)
 {
-	int offset;
+	int ret, offset, block_off;
 	struct ocfs2_xattr_value_root *xv;
 	struct ocfs2_xattr_entry *xe = xs->here;
+	struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
+	void *base;
 
 	BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe));
 
-	offset = le16_to_cpu(xe->xe_name_offset) +
-		 OCFS2_XATTR_SIZE(xe->xe_name_len);
+	ret = ocfs2_xattr_bucket_get_name_value(inode, xh,
+						xe - xh->xh_entries,
+						&block_off,
+						&offset);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-	xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
+	base = bucket_block(xs->bucket, block_off);
+	xv = (struct ocfs2_xattr_value_root *)(base + offset +
+		 OCFS2_XATTR_SIZE(xe->xe_name_len));
 
-	return __ocfs2_xattr_set_value_outside(inode, handle,
-					       xv, val, value_len);
+	ret = __ocfs2_xattr_set_value_outside(inode, handle,
+					      xv, val, value_len);
+	if (ret)
+		mlog_errno(ret);
+out:
+	return ret;
 }
 
 static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -5061,8 +5084,8 @@ try_again:
 	xh_free_start = le16_to_cpu(xh->xh_free_start);
 	header_size = sizeof(struct ocfs2_xattr_header) +
 			count * sizeof(struct ocfs2_xattr_entry);
-	max_free = OCFS2_XATTR_BUCKET_SIZE -
-		le16_to_cpu(xh->xh_name_value_len) - header_size;
+	max_free = OCFS2_XATTR_BUCKET_SIZE - header_size -
+		le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP;
 
 	mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
 			"of %u which exceed block size\n",
@@ -5095,7 +5118,7 @@ try_again:
 			need = 0;
 	}
 
-	free = xh_free_start - header_size;
+	free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
 	/*
 	 * We need to make sure the new name/value pair
 	 * can exist in the same block.
@@ -5128,7 +5151,8 @@ try_again:
 			}
 
 			xh_free_start = le16_to_cpu(xh->xh_free_start);
-			free = xh_free_start - header_size;
+			free = xh_free_start - header_size
+				- OCFS2_XATTR_HEADER_GAP;
 			if (xh_free_start % blocksize < need)
 				free -= xh_free_start % blocksize;
 
diff --git a/fs/open.c b/fs/open.c
index a3a78ce..75b6167 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -273,7 +273,7 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
 	if (!error)
 		error = security_path_truncate(&path, length, 0);
 	if (!error) {
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 		error = do_truncate(path.dentry, length, 0, NULL);
 	}
 
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d72024..38e337d 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -400,7 +400,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	pdev->devt = devt;
 
 	/* delay uevent until 'holders' subdir is created */
-	pdev->uevent_suppress = 1;
+	dev_set_uevent_suppress(pdev, 1);
 	err = device_add(pdev);
 	if (err)
 		goto out_put;
@@ -410,7 +410,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	if (!p->holder_dir)
 		goto out_del;
 
-	pdev->uevent_suppress = 0;
+	dev_set_uevent_suppress(pdev, 0);
 	if (flags & ADDPART_FLAG_WHOLEDISK) {
 		err = device_create_file(pdev, &dev_attr_whole_disk);
 		if (err)
@@ -422,7 +422,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	rcu_assign_pointer(ptbl->part[partno], p);
 
 	/* suppress uevent if the disk supresses it */
-	if (!ddev->uevent_suppress)
+	if (!dev_get_uevent_suppress(pdev))
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
 	return p;
@@ -455,7 +455,7 @@ void register_disk(struct gendisk *disk)
 	dev_set_name(ddev, disk->disk_name);
 
 	/* delay uevents, until we scanned partition table */
-	ddev->uevent_suppress = 1;
+	dev_set_uevent_suppress(ddev, 1);
 
 	if (device_add(ddev))
 		return;
@@ -490,7 +490,7 @@ void register_disk(struct gendisk *disk)
 
 exit:
 	/* announce disk after possible partitions are created */
-	ddev->uevent_suppress = 0;
+	dev_set_uevent_suppress(ddev, 0);
 	kobject_uevent(&ddev->kobj, KOBJ_ADD);
 
 	/* announce possible partitions */
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 1e064c4..4629768 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -21,20 +21,38 @@
  * compute the block number from a
  * cyl-cyl-head-head structure
  */
-static inline int
+static sector_t
 cchh2blk (struct vtoc_cchh *ptr, struct hd_geometry *geo) {
-        return ptr->cc * geo->heads * geo->sectors +
-	       ptr->hh * geo->sectors;
+
+	sector_t cyl;
+	__u16 head;
+
+	/*decode cylinder and heads for large volumes */
+	cyl = ptr->hh & 0xFFF0;
+	cyl <<= 12;
+	cyl |= ptr->cc;
+	head = ptr->hh & 0x000F;
+	return cyl * geo->heads * geo->sectors +
+	       head * geo->sectors;
 }
 
 /*
  * compute the block number from a
  * cyl-cyl-head-head-block structure
  */
-static inline int
+static sector_t
 cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
-        return ptr->cc * geo->heads * geo->sectors +
-		ptr->hh * geo->sectors +
+
+	sector_t cyl;
+	__u16 head;
+
+	/*decode cylinder and heads for large volumes */
+	cyl = ptr->hh & 0xFFF0;
+	cyl <<= 12;
+	cyl |= ptr->cc;
+	head = ptr->hh & 0x000F;
+	return	cyl * geo->heads * geo->sectors +
+		head * geo->sectors +
 		ptr->b;
 }
 
@@ -43,14 +61,15 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
 int
 ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int blocksize, offset, size,res;
-	loff_t i_size;
+	int blocksize, res;
+	loff_t i_size, offset, size, fmt_size;
 	dasd_information2_t *info;
 	struct hd_geometry *geo;
 	char type[5] = {0,};
 	char name[7] = {0,};
 	union label_t {
-		struct vtoc_volume_label vol;
+		struct vtoc_volume_label_cdl vol;
+		struct vtoc_volume_label_ldl lnx;
 		struct vtoc_cms_label cms;
 	} *label;
 	unsigned char *data;
@@ -85,14 +104,16 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	if (data == NULL)
 		goto out_readerr;
 
-	strncpy (type, data, 4);
-	if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD")))
-		strncpy(name, data + 8, 6);
-	else
-		strncpy(name, data + 4, 6);
 	memcpy(label, data, sizeof(union label_t));
 	put_dev_sector(sect);
 
+	if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) {
+		strncpy(type, label->vol.vollbl, 4);
+		strncpy(name, label->vol.volid, 6);
+	} else {
+		strncpy(type, label->lnx.vollbl, 4);
+		strncpy(name, label->lnx.volid, 6);
+	}
 	EBCASC(type, 4);
 	EBCASC(name, 6);
 
@@ -110,36 +131,54 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 			/*
 			 * VM style CMS1 labeled disk
 			 */
+			blocksize = label->cms.block_size;
 			if (label->cms.disk_offset != 0) {
 				printk("CMS1/%8s(MDSK):", name);
 				/* disk is reserved minidisk */
-				blocksize = label->cms.block_size;
 				offset = label->cms.disk_offset;
 				size = (label->cms.block_count - 1)
 					* (blocksize >> 9);
 			} else {
 				printk("CMS1/%8s:", name);
 				offset = (info->label_block + 1);
-				size = i_size >> 9;
+				size = label->cms.block_count
+					* (blocksize >> 9);
 			}
+			put_partition(state, 1, offset*(blocksize >> 9),
+				      size-offset*(blocksize >> 9));
 		} else {
-			/*
-			 * Old style LNX1 or unlabeled disk
-			 */
-			if (strncmp(type, "LNX1", 4) == 0)
-				printk ("LNX1/%8s:", name);
-			else
+			if (strncmp(type, "LNX1", 4) == 0) {
+				printk("LNX1/%8s:", name);
+				if (label->lnx.ldl_version == 0xf2) {
+					fmt_size = label->lnx.formatted_blocks
+						* (blocksize >> 9);
+				} else if (!strcmp(info->type, "ECKD")) {
+					/* formated w/o large volume support */
+					fmt_size = geo->cylinders * geo->heads
+					      * geo->sectors * (blocksize >> 9);
+				} else {
+					/* old label and no usable disk geometry
+					 * (e.g. DIAG) */
+					fmt_size = i_size >> 9;
+				}
+				size = i_size >> 9;
+				if (fmt_size < size)
+					size = fmt_size;
+				offset = (info->label_block + 1);
+			} else {
+				/* unlabeled disk */
 				printk("(nonl)");
-			offset = (info->label_block + 1);
-			size = i_size >> 9;
-		}
-		put_partition(state, 1, offset*(blocksize >> 9),
+				size = i_size >> 9;
+				offset = (info->label_block + 1);
+			}
+			put_partition(state, 1, offset*(blocksize >> 9),
 				      size-offset*(blocksize >> 9));
+		}
 	} else if (info->format == DASD_FORMAT_CDL) {
 		/*
 		 * New style CDL formatted disk
 		 */
-		unsigned int blk;
+		sector_t blk;
 		int counter;
 
 		/*
@@ -166,7 +205,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 				/* skip FMT4 / FMT5 / FMT7 labels */
 				if (f1.DS1FMTID == _ascebc['4']
 				    || f1.DS1FMTID == _ascebc['5']
-				    || f1.DS1FMTID == _ascebc['7']) {
+				    || f1.DS1FMTID == _ascebc['7']
+				    || f1.DS1FMTID == _ascebc['9']) {
 					blk++;
 					data = read_dev_sector(bdev, blk *
 							       (blocksize/512),
@@ -174,8 +214,9 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 					continue;
 				}
 
-				/* only FMT1 valid at this point */
-				if (f1.DS1FMTID != _ascebc['1'])
+				/* only FMT1 and 8 labels valid at this point */
+				if (f1.DS1FMTID != _ascebc['1'] &&
+				    f1.DS1FMTID != _ascebc['8'])
 					break;
 
 				/* OK, we got valid partition data */
diff --git a/fs/pipe.c b/fs/pipe.c
index 3a48ba5..4af7aa5 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -667,10 +667,7 @@ pipe_read_fasync(int fd, struct file *filp, int on)
 	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
 	mutex_unlock(&inode->i_mutex);
 
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 
@@ -684,10 +681,7 @@ pipe_write_fasync(int fd, struct file *filp, int on)
 	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
 	mutex_unlock(&inode->i_mutex);
 
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 
@@ -699,18 +693,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on)
 	int retval;
 
 	mutex_lock(&inode->i_mutex);
-
 	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
-
-	if (retval >= 0)
+	if (retval >= 0) {
 		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
-
+		if (retval < 0) /* this can happen only if on == T */
+			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
+	}
 	mutex_unlock(&inode->i_mutex);
-
-	if (retval < 0)
-		return retval;
-
-	return 0;
+	return retval;
 }
 
 
@@ -870,7 +860,7 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
 				dentry->d_inode->i_ino);
 }
 
-static struct dentry_operations pipefs_dentry_operations = {
+static const struct dentry_operations pipefs_dentry_operations = {
 	.d_delete	= pipefs_delete_dentry,
 	.d_dname	= pipefs_dname,
 };
@@ -1034,11 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
 	return error;
 }
 
-int do_pipe(int *fd)
-{
-	return do_pipe_flags(fd, 0);
-}
-
 /*
  * sys_pipe() is the normal C calling standard for creating
  * a pipe. It's not the way Unix traditionally does this, though.
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 0c9de19..e0afd32 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -146,15 +146,22 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
 	return count;
 }
 
-static struct fs_struct *get_fs_struct(struct task_struct *task)
+static int get_fs_path(struct task_struct *task, struct path *path, bool root)
 {
 	struct fs_struct *fs;
+	int result = -ENOENT;
+
 	task_lock(task);
 	fs = task->fs;
-	if(fs)
-		atomic_inc(&fs->count);
+	if (fs) {
+		read_lock(&fs->lock);
+		*path = root ? fs->root : fs->pwd;
+		path_get(path);
+		read_unlock(&fs->lock);
+		result = 0;
+	}
 	task_unlock(task);
-	return fs;
+	return result;
 }
 
 static int get_nr_threads(struct task_struct *tsk)
@@ -172,42 +179,24 @@ static int get_nr_threads(struct task_struct *tsk)
 static int proc_cwd_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
-	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
 
 	if (task) {
-		fs = get_fs_struct(task);
+		result = get_fs_path(task, path, 0);
 		put_task_struct(task);
 	}
-	if (fs) {
-		read_lock(&fs->lock);
-		*path = fs->pwd;
-		path_get(&fs->pwd);
-		read_unlock(&fs->lock);
-		result = 0;
-		put_fs_struct(fs);
-	}
 	return result;
 }
 
 static int proc_root_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
-	struct fs_struct *fs = NULL;
 	int result = -ENOENT;
 
 	if (task) {
-		fs = get_fs_struct(task);
+		result = get_fs_path(task, path, 1);
 		put_task_struct(task);
 	}
-	if (fs) {
-		read_lock(&fs->lock);
-		*path = fs->root;
-		path_get(&fs->root);
-		read_unlock(&fs->lock);
-		result = 0;
-		put_fs_struct(fs);
-	}
 	return result;
 }
 
@@ -596,7 +585,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 	struct task_struct *task = get_proc_task(inode);
 	struct nsproxy *nsp;
 	struct mnt_namespace *ns = NULL;
-	struct fs_struct *fs = NULL;
 	struct path root;
 	struct proc_mounts *p;
 	int ret = -EINVAL;
@@ -610,22 +598,16 @@ static int mounts_open_common(struct inode *inode, struct file *file,
 				get_mnt_ns(ns);
 		}
 		rcu_read_unlock();
-		if (ns)
-			fs = get_fs_struct(task);
+		if (ns && get_fs_path(task, &root, 1) == 0)
+			ret = 0;
 		put_task_struct(task);
 	}
 
 	if (!ns)
 		goto err;
-	if (!fs)
+	if (ret)
 		goto err_put_ns;
 
-	read_lock(&fs->lock);
-	root = fs->root;
-	path_get(&root);
-	read_unlock(&fs->lock);
-	put_fs_struct(fs);
-
 	ret = -ENOMEM;
 	p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
 	if (!p)
@@ -1545,7 +1527,7 @@ static int pid_delete_dentry(struct dentry * dentry)
 	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
 }
 
-static struct dentry_operations pid_dentry_operations =
+static const struct dentry_operations pid_dentry_operations =
 {
 	.d_revalidate	= pid_revalidate,
 	.d_delete	= pid_delete_dentry,
@@ -1717,7 +1699,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
-static struct dentry_operations tid_fd_dentry_operations =
+static const struct dentry_operations tid_fd_dentry_operations =
 {
 	.d_revalidate	= tid_fd_revalidate,
 	.d_delete	= pid_delete_dentry,
@@ -2339,7 +2321,7 @@ static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return 0;
 }
 
-static struct dentry_operations proc_base_dentry_operations =
+static const struct dentry_operations proc_base_dentry_operations =
 {
 	.d_revalidate	= proc_base_revalidate,
 	.d_delete	= pid_delete_dentry,
@@ -3066,7 +3048,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	int retval = -ENOENT;
 	ino_t ino;
 	int tid;
-	unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
 	struct pid_namespace *ns;
 
 	task = get_proc_task(inode);
@@ -3083,18 +3064,18 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 		goto out_no_task;
 	retval = 0;
 
-	switch (pos) {
+	switch ((unsigned long)filp->f_pos) {
 	case 0:
 		ino = inode->i_ino;
-		if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0)
+		if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
 			goto out;
-		pos++;
+		filp->f_pos++;
 		/* fall through */
 	case 1:
 		ino = parent_ino(dentry);
-		if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0)
+		if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
 			goto out;
-		pos++;
+		filp->f_pos++;
 		/* fall through */
 	}
 
@@ -3104,9 +3085,9 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 	ns = filp->f_dentry->d_sb->s_fs_info;
 	tid = (int)filp->f_version;
 	filp->f_version = 0;
-	for (task = first_tid(leader, tid, pos - 2, ns);
+	for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
 	     task;
-	     task = next_tid(task), pos++) {
+	     task = next_tid(task), filp->f_pos++) {
 		tid = task_pid_nr_ns(task, ns);
 		if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
 			/* returning this tgid failed, save it as the first
@@ -3117,7 +3098,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
 		}
 	}
 out:
-	filp->f_pos = pos;
 	put_task_struct(leader);
 out_no_task:
 	return retval;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index db7fa5c..5d2989e 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -363,7 +363,7 @@ static int proc_delete_dentry(struct dentry * dentry)
 	return 1;
 }
 
-static struct dentry_operations proc_dentry_operations =
+static const struct dentry_operations proc_dentry_operations =
 {
 	.d_delete	= proc_delete_dentry,
 };
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 3e76bb9..d8bb5c6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -485,8 +485,10 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
 			}
 		}
 		unlock_new_inode(inode);
-	} else
+	} else {
 	       module_put(de->owner);
+	       de_put(de);
+	}
 	return inode;
 
 out_ino:
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 767d95a..e998383 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -80,7 +80,7 @@ static const struct file_operations proc_kpagecount_operations = {
 #define KPF_RECLAIM    9
 #define KPF_BUDDY     10
 
-#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos)
+#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos)
 
 static ssize_t kpageflags_read(struct file *file, char __user *buf,
 			     size_t count, loff_t *ppos)
@@ -107,7 +107,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
 		else
 			kflags = ppage->flags;
 
-		uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) |
+		uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) |
 			kpf_copy_bit(kflags, KPF_ERROR, PG_error) |
 			kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) |
 			kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) |
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 94fcfff..9b1e4e9 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -7,7 +7,7 @@
 #include <linux/security.h>
 #include "internal.h"
 
-static struct dentry_operations proc_sys_dentry_operations;
+static const struct dentry_operations proc_sys_dentry_operations;
 static const struct file_operations proc_sys_file_operations;
 static const struct inode_operations proc_sys_inode_operations;
 static const struct file_operations proc_sys_dir_file_operations;
@@ -396,7 +396,7 @@ static int proc_sys_compare(struct dentry *dir, struct qstr *qstr,
 	return !sysctl_is_seen(PROC_I(dentry->d_inode)->sysctl);
 }
 
-static struct dentry_operations proc_sys_dentry_operations = {
+static const struct dentry_operations proc_sys_dentry_operations = {
 	.d_revalidate	= proc_sys_revalidate,
 	.d_delete	= proc_sys_delete,
 	.d_compare	= proc_sys_compare,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f6299a2..1e15a2b 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -83,7 +83,8 @@ static int proc_get_sb(struct file_system_type *fs_type,
 		ns->proc_mnt = mnt;
 	}
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 }
 
 static void proc_kill_sb(struct super_block *sb)
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
new file mode 100644
index 0000000..8047e01
--- /dev/null
+++ b/fs/quota/Kconfig
@@ -0,0 +1,59 @@
+#
+#  Quota configuration
+#
+
+config QUOTA
+	bool "Quota support"
+	help
+	  If you say Y here, you will be able to set per user limits for disk
+	  usage (also called disk quotas). Currently, it works for the
+	  ext2, ext3, and reiserfs file system. ext3 also supports journalled
+	  quotas for which you don't need to run quotacheck(8) after an unclean
+	  shutdown.
+	  For further details, read the Quota mini-HOWTO, available from
+	  <http://www.tldp.org/docs.html#howto>, or the documentation provided
+	  with the quota tools. Probably the quota support is only useful for
+	  multi user systems. If unsure, say N.
+
+config QUOTA_NETLINK_INTERFACE
+	bool "Report quota messages through netlink interface"
+	depends on QUOTA && NET
+	help
+	  If you say Y here, quota warnings (about exceeding softlimit, reaching
+	  hardlimit, etc.) will be reported through netlink interface. If unsure,
+	  say Y.
+
+config PRINT_QUOTA_WARNING
+	bool "Print quota warnings to console (OBSOLETE)"
+	depends on QUOTA
+	default y
+	help
+	  If you say Y here, quota warnings (about exceeding softlimit, reaching
+	  hardlimit, etc.) will be printed to the process' controlling terminal.
+	  Note that this behavior is currently deprecated and may go away in
+	  future. Please use notification via netlink socket instead.
+
+# Generic support for tree structured quota files. Selected when needed.
+config QUOTA_TREE
+	 tristate
+
+config QFMT_V1
+	tristate "Old quota format support"
+	depends on QUOTA
+	help
+	  This quota format was (is) used by kernels earlier than 2.4.22. If
+	  you have quota working and you don't want to convert to new quota
+	  format say Y here.
+
+config QFMT_V2
+	tristate "Quota format v2 support"
+	depends on QUOTA
+	select QUOTA_TREE
+	help
+	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
+	  need this functionality say Y here.
+
+config QUOTACTL
+	bool
+	depends on XFS_QUOTA || QUOTA
+	default y
diff --git a/fs/quota/Makefile b/fs/quota/Makefile
new file mode 100644
index 0000000..385a083
--- /dev/null
+++ b/fs/quota/Makefile
@@ -0,0 +1,14 @@
+#
+# Makefile for the Linux filesystems.
+#
+# 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
+# Rewritten to use lists instead of if-statements.
+#
+
+obj-y :=
+
+obj-$(CONFIG_QUOTA)		+= dquot.o
+obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
+obj-$(CONFIG_QFMT_V2)		+= quota_v2.o
+obj-$(CONFIG_QUOTA_TREE)	+= quota_tree.o
+obj-$(CONFIG_QUOTACTL)		+= quota.o
diff --git a/fs/dquot.c b/fs/quota/dquot.c
index bca3cac..2ca967a 100644
--- a/fs/dquot.c
+++ b/fs/quota/dquot.c
@@ -129,9 +129,10 @@
  * i_mutex on quota files is special (it's below dqio_mutex)
  */
 
-static DEFINE_SPINLOCK(dq_list_lock);
-static DEFINE_SPINLOCK(dq_state_lock);
-DEFINE_SPINLOCK(dq_data_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
+EXPORT_SYMBOL(dq_data_lock);
 
 static char *quotatypes[] = INITQFNAMES;
 static struct quota_format_type *quota_formats;	/* List of registered formats */
@@ -148,35 +149,46 @@ int register_quota_format(struct quota_format_type *fmt)
 	spin_unlock(&dq_list_lock);
 	return 0;
 }
+EXPORT_SYMBOL(register_quota_format);
 
 void unregister_quota_format(struct quota_format_type *fmt)
 {
 	struct quota_format_type **actqf;
 
 	spin_lock(&dq_list_lock);
-	for (actqf = &quota_formats; *actqf && *actqf != fmt; actqf = &(*actqf)->qf_next);
+	for (actqf = &quota_formats; *actqf && *actqf != fmt;
+	     actqf = &(*actqf)->qf_next)
+		;
 	if (*actqf)
 		*actqf = (*actqf)->qf_next;
 	spin_unlock(&dq_list_lock);
 }
+EXPORT_SYMBOL(unregister_quota_format);
 
 static struct quota_format_type *find_quota_format(int id)
 {
 	struct quota_format_type *actqf;
 
 	spin_lock(&dq_list_lock);
-	for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next);
+	for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
+	     actqf = actqf->qf_next)
+		;
 	if (!actqf || !try_module_get(actqf->qf_owner)) {
 		int qm;
 
 		spin_unlock(&dq_list_lock);
 		
-		for (qm = 0; module_names[qm].qm_fmt_id && module_names[qm].qm_fmt_id != id; qm++);
-		if (!module_names[qm].qm_fmt_id || request_module(module_names[qm].qm_mod_name))
+		for (qm = 0; module_names[qm].qm_fmt_id &&
+			     module_names[qm].qm_fmt_id != id; qm++)
+			;
+		if (!module_names[qm].qm_fmt_id ||
+		    request_module(module_names[qm].qm_mod_name))
 			return NULL;
 
 		spin_lock(&dq_list_lock);
-		for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id; actqf = actqf->qf_next);
+		for (actqf = quota_formats; actqf && actqf->qf_fmt_id != id;
+		     actqf = actqf->qf_next)
+			;
 		if (actqf && !try_module_get(actqf->qf_owner))
 			actqf = NULL;
 	}
@@ -215,6 +227,7 @@ static unsigned int dq_hash_bits, dq_hash_mask;
 static struct hlist_head *dquot_hash;
 
 struct dqstats dqstats;
+EXPORT_SYMBOL(dqstats);
 
 static inline unsigned int
 hashfn(const struct super_block *sb, unsigned int id, int type)
@@ -230,7 +243,8 @@ hashfn(const struct super_block *sb, unsigned int id, int type)
  */
 static inline void insert_dquot_hash(struct dquot *dquot)
 {
-	struct hlist_head *head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type);
+	struct hlist_head *head;
+	head = dquot_hash + hashfn(dquot->dq_sb, dquot->dq_id, dquot->dq_type);
 	hlist_add_head(&dquot->dq_hash, head);
 }
 
@@ -239,17 +253,19 @@ static inline void remove_dquot_hash(struct dquot *dquot)
 	hlist_del_init(&dquot->dq_hash);
 }
 
-static inline struct dquot *find_dquot(unsigned int hashent, struct super_block *sb, unsigned int id, int type)
+static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
+				unsigned int id, int type)
 {
 	struct hlist_node *node;
 	struct dquot *dquot;
 
 	hlist_for_each (node, dquot_hash+hashent) {
 		dquot = hlist_entry(node, struct dquot, dq_hash);
-		if (dquot->dq_sb == sb && dquot->dq_id == id && dquot->dq_type == type)
+		if (dquot->dq_sb == sb && dquot->dq_id == id &&
+		    dquot->dq_type == type)
 			return dquot;
 	}
-	return NODQUOT;
+	return NULL;
 }
 
 /* Add a dquot to the tail of the free list */
@@ -309,6 +325,7 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
 	spin_unlock(&dq_list_lock);
 	return 0;
 }
+EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 
 /* This function needs dq_list_lock */
 static inline int clear_dquot_dirty(struct dquot *dquot)
@@ -345,8 +362,10 @@ int dquot_acquire(struct dquot *dquot)
 	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
 		ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot);
 		/* Write the info if needed */
-		if (info_dirty(&dqopt->info[dquot->dq_type]))
-			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(dquot->dq_sb, dquot->dq_type);
+		if (info_dirty(&dqopt->info[dquot->dq_type])) {
+			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
+						dquot->dq_sb, dquot->dq_type);
+		}
 		if (ret < 0)
 			goto out_iolock;
 		if (ret2 < 0) {
@@ -360,6 +379,7 @@ out_iolock:
 	mutex_unlock(&dquot->dq_lock);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_acquire);
 
 /*
  *	Write dquot to disk
@@ -380,8 +400,10 @@ int dquot_commit(struct dquot *dquot)
 	 * => we have better not writing it */
 	if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
 		ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot);
-		if (info_dirty(&dqopt->info[dquot->dq_type]))
-			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(dquot->dq_sb, dquot->dq_type);
+		if (info_dirty(&dqopt->info[dquot->dq_type])) {
+			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
+						dquot->dq_sb, dquot->dq_type);
+		}
 		if (ret >= 0)
 			ret = ret2;
 	}
@@ -389,6 +411,7 @@ out_sem:
 	mutex_unlock(&dqopt->dqio_mutex);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_commit);
 
 /*
  *	Release dquot
@@ -406,8 +429,10 @@ int dquot_release(struct dquot *dquot)
 	if (dqopt->ops[dquot->dq_type]->release_dqblk) {
 		ret = dqopt->ops[dquot->dq_type]->release_dqblk(dquot);
 		/* Write the info */
-		if (info_dirty(&dqopt->info[dquot->dq_type]))
-			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(dquot->dq_sb, dquot->dq_type);
+		if (info_dirty(&dqopt->info[dquot->dq_type])) {
+			ret2 = dqopt->ops[dquot->dq_type]->write_file_info(
+						dquot->dq_sb, dquot->dq_type);
+		}
 		if (ret >= 0)
 			ret = ret2;
 	}
@@ -417,6 +442,7 @@ out_dqlock:
 	mutex_unlock(&dquot->dq_lock);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_release);
 
 void dquot_destroy(struct dquot *dquot)
 {
@@ -516,6 +542,7 @@ out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_scan_active);
 
 int vfs_quota_sync(struct super_block *sb, int type)
 {
@@ -533,7 +560,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
 		spin_lock(&dq_list_lock);
 		dirty = &dqopt->info[cnt].dqi_dirty_list;
 		while (!list_empty(dirty)) {
-			dquot = list_first_entry(dirty, struct dquot, dq_dirty);
+			dquot = list_first_entry(dirty, struct dquot,
+						 dq_dirty);
 			/* Dirty and inactive can be only bad dquot... */
 			if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
 				clear_dquot_dirty(dquot);
@@ -563,6 +591,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
 
 	return 0;
 }
+EXPORT_SYMBOL(vfs_quota_sync);
 
 /* Free unused dquots from cache */
 static void prune_dqcache(int count)
@@ -672,6 +701,7 @@ we_slept:
 	put_dquot_last(dquot);
 	spin_unlock(&dq_list_lock);
 }
+EXPORT_SYMBOL(dqput);
 
 struct dquot *dquot_alloc(struct super_block *sb, int type)
 {
@@ -685,7 +715,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 
 	dquot = sb->dq_op->alloc_dquot(sb, type);
 	if(!dquot)
-		return NODQUOT;
+		return NULL;
 
 	mutex_init(&dquot->dq_lock);
 	INIT_LIST_HEAD(&dquot->dq_free);
@@ -711,10 +741,10 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
 	unsigned int hashent = hashfn(sb, id, type);
-	struct dquot *dquot = NODQUOT, *empty = NODQUOT;
+	struct dquot *dquot = NULL, *empty = NULL;
 
         if (!sb_has_quota_active(sb, type))
-		return NODQUOT;
+		return NULL;
 we_slept:
 	spin_lock(&dq_list_lock);
 	spin_lock(&dq_state_lock);
@@ -725,15 +755,17 @@ we_slept:
 	}
 	spin_unlock(&dq_state_lock);
 
-	if ((dquot = find_dquot(hashent, sb, id, type)) == NODQUOT) {
-		if (empty == NODQUOT) {
+	dquot = find_dquot(hashent, sb, id, type);
+	if (!dquot) {
+		if (!empty) {
 			spin_unlock(&dq_list_lock);
-			if ((empty = get_empty_dquot(sb, type)) == NODQUOT)
+			empty = get_empty_dquot(sb, type);
+			if (!empty)
 				schedule();	/* Try to wait for a moment... */
 			goto we_slept;
 		}
 		dquot = empty;
-		empty = NODQUOT;
+		empty = NULL;
 		dquot->dq_id = id;
 		/* all dquots go on the inuse_list */
 		put_inuse(dquot);
@@ -749,13 +781,14 @@ we_slept:
 		dqstats.lookups++;
 		spin_unlock(&dq_list_lock);
 	}
-	/* Wait for dq_lock - after this we know that either dquot_release() is already
-	 * finished or it will be canceled due to dq_count > 1 test */
+	/* Wait for dq_lock - after this we know that either dquot_release() is
+	 * already finished or it will be canceled due to dq_count > 1 test */
 	wait_on_dquot(dquot);
-	/* Read the dquot and instantiate it (everything done only if needed) */
-	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && sb->dq_op->acquire_dquot(dquot) < 0) {
+	/* Read the dquot / allocate space in quota file */
+	if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) &&
+	    sb->dq_op->acquire_dquot(dquot) < 0) {
 		dqput(dquot);
-		dquot = NODQUOT;
+		dquot = NULL;
 		goto out;
 	}
 #ifdef __DQUOT_PARANOIA
@@ -767,6 +800,7 @@ out:
 
 	return dquot;
 }
+EXPORT_SYMBOL(dqget);
 
 static int dqinit_needed(struct inode *inode, int type)
 {
@@ -775,9 +809,9 @@ static int dqinit_needed(struct inode *inode, int type)
 	if (IS_NOQUOTA(inode))
 		return 0;
 	if (type != -1)
-		return inode->i_dquot[type] == NODQUOT;
+		return !inode->i_dquot[type];
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			return 1;
 	return 0;
 }
@@ -789,12 +823,12 @@ static void add_dquot_ref(struct super_block *sb, int type)
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW))
+			continue;
 		if (!atomic_read(&inode->i_writecount))
 			continue;
 		if (!dqinit_needed(inode, type))
 			continue;
-		if (inode->i_state & (I_FREEING|I_WILL_FREE))
-			continue;
 
 		__iget(inode);
 		spin_unlock(&inode_lock);
@@ -813,7 +847,10 @@ static void add_dquot_ref(struct super_block *sb, int type)
 	iput(old_inode);
 }
 
-/* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */
+/*
+ * Return 0 if dqput() won't block.
+ * (note that 1 doesn't necessarily mean blocking)
+ */
 static inline int dqput_blocks(struct dquot *dquot)
 {
 	if (atomic_read(&dquot->dq_count) <= 1)
@@ -821,22 +858,27 @@ static inline int dqput_blocks(struct dquot *dquot)
 	return 0;
 }
 
-/* Remove references to dquots from inode - add dquot to list for freeing if needed */
-/* We can't race with anybody because we hold dqptr_sem for writing... */
+/*
+ * Remove references to dquots from inode and add dquot to list for freeing
+ * if we have the last referece to dquot
+ * We can't race with anybody because we hold dqptr_sem for writing...
+ */
 static int remove_inode_dquot_ref(struct inode *inode, int type,
 				  struct list_head *tofree_head)
 {
 	struct dquot *dquot = inode->i_dquot[type];
 
-	inode->i_dquot[type] = NODQUOT;
-	if (dquot != NODQUOT) {
+	inode->i_dquot[type] = NULL;
+	if (dquot) {
 		if (dqput_blocks(dquot)) {
 #ifdef __DQUOT_PARANOIA
 			if (atomic_read(&dquot->dq_count) != 1)
 				printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count));
 #endif
 			spin_lock(&dq_list_lock);
-			list_add(&dquot->dq_free, tofree_head);	/* As dquot must have currently users it can't be on the free list... */
+			/* As dquot must have currently users it can't be on
+			 * the free list... */
+			list_add(&dquot->dq_free, tofree_head);
 			spin_unlock(&dq_list_lock);
 			return 1;
 		}
@@ -846,19 +888,22 @@ static int remove_inode_dquot_ref(struct inode *inode, int type,
 	return 0;
 }
 
-/* Free list of dquots - called from inode.c */
-/* dquots are removed from inodes, no new references can be got so we are the only ones holding reference */
+/*
+ * Free list of dquots
+ * Dquots are removed from inodes and no new references can be got so we are
+ * the only ones holding reference
+ */
 static void put_dquot_list(struct list_head *tofree_head)
 {
 	struct list_head *act_head;
 	struct dquot *dquot;
 
 	act_head = tofree_head->next;
-	/* So now we have dquots on the list... Just free them */
 	while (act_head != tofree_head) {
 		dquot = list_entry(act_head, struct dquot, dq_free);
 		act_head = act_head->next;
-		list_del_init(&dquot->dq_free);	/* Remove dquot from the list so we won't have problems... */
+		/* Remove dquot from the list so we won't have problems... */
+		list_del_init(&dquot->dq_free);
 		dqput(dquot);
 	}
 }
@@ -870,6 +915,12 @@ static void remove_dquot_ref(struct super_block *sb, int type,
 
 	spin_lock(&inode_lock);
 	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		/*
+		 *  We have to scan also I_NEW inodes because they can already
+		 *  have quota pointer initialized. Luckily, we need to touch
+		 *  only quota pointers and these have separate locking
+		 *  (dqptr_sem).
+		 */
 		if (!IS_NOQUOTA(inode))
 			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
@@ -899,7 +950,29 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
 	dquot->dq_dqb.dqb_curspace += number;
 }
 
-static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
+static inline void dquot_resv_space(struct dquot *dquot, qsize_t number)
+{
+	dquot->dq_dqb.dqb_rsvspace += number;
+}
+
+/*
+ * Claim reserved quota space
+ */
+static void dquot_claim_reserved_space(struct dquot *dquot,
+						qsize_t number)
+{
+	WARN_ON(dquot->dq_dqb.dqb_rsvspace < number);
+	dquot->dq_dqb.dqb_curspace += number;
+	dquot->dq_dqb.dqb_rsvspace -= number;
+}
+
+static inline
+void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
+{
+	dquot->dq_dqb.dqb_rsvspace -= number;
+}
+
+static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 {
 	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
 	    dquot->dq_dqb.dqb_curinodes >= number)
@@ -911,7 +984,7 @@ static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
 	clear_bit(DQ_INODES_B, &dquot->dq_flags);
 }
 
-static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
+static void dquot_decr_space(struct dquot *dquot, qsize_t number)
 {
 	if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
 	    dquot->dq_dqb.dqb_curspace >= number)
@@ -938,7 +1011,7 @@ static int warning_issued(struct dquot *dquot, const int warntype)
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 static int flag_print_warnings = 1;
 
-static inline int need_print_warning(struct dquot *dquot)
+static int need_print_warning(struct dquot *dquot)
 {
 	if (!flag_print_warnings)
 		return 0;
@@ -1057,10 +1130,7 @@ static void send_warning(const struct dquot *dquot, const char warntype)
 		goto attr_err_out;
 	genlmsg_end(skb, msg_head);
 
-	ret = genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
-	if (ret < 0 && ret != -ESRCH)
-		printk(KERN_ERR
-			"VFS: Failed to send notification message: %d\n", ret);
+	genlmsg_multicast(skb, 0, quota_genl_family.id, GFP_NOFS);
 	return;
 attr_err_out:
 	printk(KERN_ERR "VFS: Not enough space to compose quota message!\n");
@@ -1068,13 +1138,17 @@ err_out:
 	kfree_skb(skb);
 }
 #endif
-
-static inline void flush_warnings(struct dquot * const *dquots, char *warntype)
+/*
+ * Write warnings to the console and send warning messages over netlink.
+ *
+ * Note that this function can sleep.
+ */
+static void flush_warnings(struct dquot *const *dquots, char *warntype)
 {
 	int i;
 
 	for (i = 0; i < MAXQUOTAS; i++)
-		if (dquots[i] != NODQUOT && warntype[i] != QUOTA_NL_NOWARN &&
+		if (dquots[i] && warntype[i] != QUOTA_NL_NOWARN &&
 		    !warning_issued(dquots[i], warntype[i])) {
 #ifdef CONFIG_PRINT_QUOTA_WARNING
 			print_warning(dquots[i], warntype[i]);
@@ -1085,42 +1159,47 @@ static inline void flush_warnings(struct dquot * const *dquots, char *warntype)
 		}
 }
 
-static inline char ignore_hardlimit(struct dquot *dquot)
+static int ignore_hardlimit(struct dquot *dquot)
 {
 	struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type];
 
 	return capable(CAP_SYS_RESOURCE) &&
-	    (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || !(info->dqi_flags & V1_DQF_RSQUASH));
+	       (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
+		!(info->dqi_flags & V1_DQF_RSQUASH));
 }
 
 /* needs dq_data_lock */
 static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 {
+	qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes;
+
 	*warntype = QUOTA_NL_NOWARN;
 	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
 	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
 	if (dquot->dq_dqb.dqb_ihardlimit &&
-	   (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_ihardlimit &&
+	    newinodes > dquot->dq_dqb.dqb_ihardlimit &&
             !ignore_hardlimit(dquot)) {
 		*warntype = QUOTA_NL_IHARDWARN;
 		return NO_QUOTA;
 	}
 
 	if (dquot->dq_dqb.dqb_isoftlimit &&
-	   (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_isoftlimit &&
-	    dquot->dq_dqb.dqb_itime && get_seconds() >= dquot->dq_dqb.dqb_itime &&
+	    newinodes > dquot->dq_dqb.dqb_isoftlimit &&
+	    dquot->dq_dqb.dqb_itime &&
+	    get_seconds() >= dquot->dq_dqb.dqb_itime &&
             !ignore_hardlimit(dquot)) {
 		*warntype = QUOTA_NL_ISOFTLONGWARN;
 		return NO_QUOTA;
 	}
 
 	if (dquot->dq_dqb.dqb_isoftlimit &&
-	   (dquot->dq_dqb.dqb_curinodes + inodes) > dquot->dq_dqb.dqb_isoftlimit &&
+	    newinodes > dquot->dq_dqb.dqb_isoftlimit &&
 	    dquot->dq_dqb.dqb_itime == 0) {
 		*warntype = QUOTA_NL_ISOFTWARN;
-		dquot->dq_dqb.dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
+		dquot->dq_dqb.dqb_itime = get_seconds() +
+		    sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace;
 	}
 
 	return QUOTA_OK;
@@ -1129,13 +1208,19 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
 /* needs dq_data_lock */
 static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
 {
+	qsize_t tspace;
+	struct super_block *sb = dquot->dq_sb;
+
 	*warntype = QUOTA_NL_NOWARN;
-	if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+	if (!sb_has_quota_limits_enabled(sb, dquot->dq_type) ||
 	    test_bit(DQ_FAKE_B, &dquot->dq_flags))
 		return QUOTA_OK;
 
+	tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
+		+ space;
+
 	if (dquot->dq_dqb.dqb_bhardlimit &&
-	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
+	    tspace > dquot->dq_dqb.dqb_bhardlimit &&
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 			*warntype = QUOTA_NL_BHARDWARN;
@@ -1143,8 +1228,9 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
-	    dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
+	    tspace > dquot->dq_dqb.dqb_bsoftlimit &&
+	    dquot->dq_dqb.dqb_btime &&
+	    get_seconds() >= dquot->dq_dqb.dqb_btime &&
             !ignore_hardlimit(dquot)) {
 		if (!prealloc)
 			*warntype = QUOTA_NL_BSOFTLONGWARN;
@@ -1152,11 +1238,12 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 	}
 
 	if (dquot->dq_dqb.dqb_bsoftlimit &&
-	    dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
+	    tspace > dquot->dq_dqb.dqb_bsoftlimit &&
 	    dquot->dq_dqb.dqb_btime == 0) {
 		if (!prealloc) {
 			*warntype = QUOTA_NL_BSOFTWARN;
-			dquot->dq_dqb.dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace;
+			dquot->dq_dqb.dqb_btime = get_seconds() +
+			    sb_dqopt(sb)->info[dquot->dq_type].dqi_bgrace;
 		}
 		else
 			/*
@@ -1171,15 +1258,18 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
 
 static int info_idq_free(struct dquot *dquot, qsize_t inodes)
 {
+	qsize_t newinodes;
+
 	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
 	    dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
 	    !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
 		return QUOTA_NL_NOWARN;
 
-	if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
+	newinodes = dquot->dq_dqb.dqb_curinodes - inodes;
+	if (newinodes <= dquot->dq_dqb.dqb_isoftlimit)
 		return QUOTA_NL_ISOFTBELOW;
 	if (dquot->dq_dqb.dqb_curinodes >= dquot->dq_dqb.dqb_ihardlimit &&
-	    dquot->dq_dqb.dqb_curinodes - inodes < dquot->dq_dqb.dqb_ihardlimit)
+	    newinodes < dquot->dq_dqb.dqb_ihardlimit)
 		return QUOTA_NL_IHARDBELOW;
 	return QUOTA_NL_NOWARN;
 }
@@ -1206,7 +1296,7 @@ int dquot_initialize(struct inode *inode, int type)
 {
 	unsigned int id = 0;
 	int cnt, ret = 0;
-	struct dquot *got[MAXQUOTAS] = { NODQUOT, NODQUOT };
+	struct dquot *got[MAXQUOTAS] = { NULL, NULL };
 	struct super_block *sb = inode->i_sb;
 
 	/* First test before acquiring mutex - solves deadlocks when we
@@ -1239,9 +1329,9 @@ int dquot_initialize(struct inode *inode, int type)
 		/* Avoid races with quotaoff() */
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
-		if (inode->i_dquot[cnt] == NODQUOT) {
+		if (!inode->i_dquot[cnt]) {
 			inode->i_dquot[cnt] = got[cnt];
-			got[cnt] = NODQUOT;
+			got[cnt] = NULL;
 		}
 	}
 out_err:
@@ -1251,6 +1341,7 @@ out_err:
 		dqput(got[cnt]);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_initialize);
 
 /*
  * 	Release all quotas referenced by inode
@@ -1263,7 +1354,7 @@ int dquot_drop(struct inode *inode)
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		put[cnt] = inode->i_dquot[cnt];
-		inode->i_dquot[cnt] = NODQUOT;
+		inode->i_dquot[cnt] = NULL;
 	}
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 
@@ -1271,6 +1362,7 @@ int dquot_drop(struct inode *inode)
 		dqput(put[cnt]);
 	return 0;
 }
+EXPORT_SYMBOL(dquot_drop);
 
 /* Wrapper to remove references to quota structures from inode */
 void vfs_dq_drop(struct inode *inode)
@@ -1287,12 +1379,13 @@ void vfs_dq_drop(struct inode *inode)
 		 * must assure that nobody can come after the DQUOT_DROP and
 		 * add quota pointers back anyway */
 		for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-			if (inode->i_dquot[cnt] != NODQUOT)
+			if (inode->i_dquot[cnt])
 				break;
 		if (cnt < MAXQUOTAS)
 			inode->i_sb->dq_op->drop(inode);
 	}
 }
+EXPORT_SYMBOL(vfs_dq_drop);
 
 /*
  * Following four functions update i_blocks+i_bytes fields and
@@ -1306,51 +1399,93 @@ void vfs_dq_drop(struct inode *inode)
 /*
  * This operation can block, but only after everything is updated
  */
-int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
+int __dquot_alloc_space(struct inode *inode, qsize_t number,
+			int warn, int reserve)
 {
-	int cnt, ret = NO_QUOTA;
+	int cnt, ret = QUOTA_OK;
 	char warntype[MAXQUOTAS];
 
-	/* First test before acquiring mutex - solves deadlocks when we
-         * re-enter the quota code and are already holding the mutex */
-	if (IS_NOQUOTA(inode)) {
-out_add:
-		inode_add_bytes(inode, number);
-		return QUOTA_OK;
-	}
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = QUOTA_NL_NOWARN;
 
-	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	if (IS_NOQUOTA(inode)) {	/* Now we can do reliable test... */
-		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
-		goto out_add;
-	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
-		if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA)
-			goto warn_put_all;
+		if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
+		    == NO_QUOTA) {
+			ret = NO_QUOTA;
+			goto out_unlock;
+		}
 	}
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
-		dquot_incr_space(inode->i_dquot[cnt], number);
+		if (reserve)
+			dquot_resv_space(inode->i_dquot[cnt], number);
+		else
+			dquot_incr_space(inode->i_dquot[cnt], number);
 	}
-	inode_add_bytes(inode, number);
-	ret = QUOTA_OK;
-warn_put_all:
+	if (!reserve)
+		inode_add_bytes(inode, number);
+out_unlock:
 	spin_unlock(&dq_data_lock);
-	if (ret == QUOTA_OK)
-		/* Dirtify all the dquots - this can block when journalling */
-		for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-			if (inode->i_dquot[cnt])
-				mark_dquot_dirty(inode->i_dquot[cnt]);
 	flush_warnings(inode->i_dquot, warntype);
+	return ret;
+}
+
+int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
+{
+	int cnt, ret = QUOTA_OK;
+
+	/*
+	 * First test before acquiring mutex - solves deadlocks when we
+	 * re-enter the quota code and are already holding the mutex
+	 */
+	if (IS_NOQUOTA(inode)) {
+		inode_add_bytes(inode, number);
+		goto out;
+	}
+
+	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	if (IS_NOQUOTA(inode)) {
+		inode_add_bytes(inode, number);
+		goto out_unlock;
+	}
+
+	ret = __dquot_alloc_space(inode, number, warn, 0);
+	if (ret == NO_QUOTA)
+		goto out_unlock;
+
+	/* Dirtify all the dquots - this can block when journalling */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (inode->i_dquot[cnt])
+			mark_dquot_dirty(inode->i_dquot[cnt]);
+out_unlock:
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+out:
 	return ret;
 }
+EXPORT_SYMBOL(dquot_alloc_space);
+
+int dquot_reserve_space(struct inode *inode, qsize_t number, int warn)
+{
+	int ret = QUOTA_OK;
+
+	if (IS_NOQUOTA(inode))
+		goto out;
+
+	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	if (IS_NOQUOTA(inode))
+		goto out_unlock;
+
+	ret = __dquot_alloc_space(inode, number, warn, 1);
+out_unlock:
+	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(dquot_reserve_space);
 
 /*
  * This operation can block, but only after everything is updated
@@ -1373,14 +1508,15 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
-		if (check_idq(inode->i_dquot[cnt], number, warntype+cnt) == NO_QUOTA)
+		if (check_idq(inode->i_dquot[cnt], number, warntype+cnt)
+		    == NO_QUOTA)
 			goto warn_put_all;
 	}
 
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
 		dquot_incr_inodes(inode->i_dquot[cnt], number);
 	}
@@ -1396,6 +1532,73 @@ warn_put_all:
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_alloc_inode);
+
+int dquot_claim_space(struct inode *inode, qsize_t number)
+{
+	int cnt;
+	int ret = QUOTA_OK;
+
+	if (IS_NOQUOTA(inode)) {
+		inode_add_bytes(inode, number);
+		goto out;
+	}
+
+	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	if (IS_NOQUOTA(inode))	{
+		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+		inode_add_bytes(inode, number);
+		goto out;
+	}
+
+	spin_lock(&dq_data_lock);
+	/* Claim reserved quotas to allocated quotas */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt])
+			dquot_claim_reserved_space(inode->i_dquot[cnt],
+							number);
+	}
+	/* Update inode bytes */
+	inode_add_bytes(inode, number);
+	spin_unlock(&dq_data_lock);
+	/* Dirtify all the dquots - this can block when journalling */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+		if (inode->i_dquot[cnt])
+			mark_dquot_dirty(inode->i_dquot[cnt]);
+	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(dquot_claim_space);
+
+/*
+ * Release reserved quota space
+ */
+void dquot_release_reserved_space(struct inode *inode, qsize_t number)
+{
+	int cnt;
+
+	if (IS_NOQUOTA(inode))
+		goto out;
+
+	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	if (IS_NOQUOTA(inode))
+		goto out_unlock;
+
+	spin_lock(&dq_data_lock);
+	/* Release reserved dquots */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt])
+			dquot_free_reserved_space(inode->i_dquot[cnt], number);
+	}
+	spin_unlock(&dq_data_lock);
+
+out_unlock:
+	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+out:
+	return;
+}
+EXPORT_SYMBOL(dquot_release_reserved_space);
 
 /*
  * This operation can block, but only after everything is updated
@@ -1421,7 +1624,7 @@ out_sub:
 	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
 		warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
 		dquot_decr_space(inode->i_dquot[cnt], number);
@@ -1436,6 +1639,7 @@ out_sub:
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return QUOTA_OK;
 }
+EXPORT_SYMBOL(dquot_free_space);
 
 /*
  * This operation can block, but only after everything is updated
@@ -1458,7 +1662,7 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
 	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt] == NODQUOT)
+		if (!inode->i_dquot[cnt])
 			continue;
 		warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number);
 		dquot_decr_inodes(inode->i_dquot[cnt], number);
@@ -1472,6 +1676,20 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return QUOTA_OK;
 }
+EXPORT_SYMBOL(dquot_free_inode);
+
+/*
+ * call back function, get reserved quota space from underlying fs
+ */
+qsize_t dquot_get_reserved_space(struct inode *inode)
+{
+	qsize_t reserved_space = 0;
+
+	if (sb_any_quota_active(inode->i_sb) &&
+	    inode->i_sb->dq_op->get_reserved_space)
+		reserved_space = inode->i_sb->dq_op->get_reserved_space(inode);
+	return reserved_space;
+}
 
 /*
  * Transfer the number of inode and blocks from one diskquota to an other.
@@ -1481,7 +1699,8 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
  */
 int dquot_transfer(struct inode *inode, struct iattr *iattr)
 {
-	qsize_t space;
+	qsize_t space, cur_space;
+	qsize_t rsv_space = 0;
 	struct dquot *transfer_from[MAXQUOTAS];
 	struct dquot *transfer_to[MAXQUOTAS];
 	int cnt, ret = QUOTA_OK;
@@ -1496,22 +1715,16 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		return QUOTA_OK;
 	/* Initialize the arrays */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		transfer_from[cnt] = NODQUOT;
-		transfer_to[cnt] = NODQUOT;
+		transfer_from[cnt] = NULL;
+		transfer_to[cnt] = NULL;
 		warntype_to[cnt] = QUOTA_NL_NOWARN;
-		switch (cnt) {
-			case USRQUOTA:
-				if (!chuid)
-					continue;
-				transfer_to[cnt] = dqget(inode->i_sb, iattr->ia_uid, cnt);
-				break;
-			case GRPQUOTA:
-				if (!chgid)
-					continue;
-				transfer_to[cnt] = dqget(inode->i_sb, iattr->ia_gid, cnt);
-				break;
-		}
 	}
+	if (chuid)
+		transfer_to[USRQUOTA] = dqget(inode->i_sb, iattr->ia_uid,
+					      USRQUOTA);
+	if (chgid)
+		transfer_to[GRPQUOTA] = dqget(inode->i_sb, iattr->ia_gid,
+					      GRPQUOTA);
 
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	/* Now recheck reliably when holding dqptr_sem */
@@ -1520,10 +1733,12 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		goto put_all;
 	}
 	spin_lock(&dq_data_lock);
-	space = inode_get_bytes(inode);
+	cur_space = inode_get_bytes(inode);
+	rsv_space = dquot_get_reserved_space(inode);
+	space = cur_space + rsv_space;
 	/* Build the transfer_from list and check the limits */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (transfer_to[cnt] == NODQUOT)
+		if (!transfer_to[cnt])
 			continue;
 		transfer_from[cnt] = inode->i_dquot[cnt];
 		if (check_idq(transfer_to[cnt], 1, warntype_to + cnt) ==
@@ -1539,7 +1754,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		/*
 		 * Skip changes for same uid or gid or for turned off quota-type.
 		 */
-		if (transfer_to[cnt] == NODQUOT)
+		if (!transfer_to[cnt])
 			continue;
 
 		/* Due to IO error we might not have transfer_from[] structure */
@@ -1549,11 +1764,14 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 			warntype_from_space[cnt] =
 				info_bdq_free(transfer_from[cnt], space);
 			dquot_decr_inodes(transfer_from[cnt], 1);
-			dquot_decr_space(transfer_from[cnt], space);
+			dquot_decr_space(transfer_from[cnt], cur_space);
+			dquot_free_reserved_space(transfer_from[cnt],
+						  rsv_space);
 		}
 
 		dquot_incr_inodes(transfer_to[cnt], 1);
-		dquot_incr_space(transfer_to[cnt], space);
+		dquot_incr_space(transfer_to[cnt], cur_space);
+		dquot_resv_space(transfer_to[cnt], rsv_space);
 
 		inode->i_dquot[cnt] = transfer_to[cnt];
 	}
@@ -1567,7 +1785,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 		if (transfer_to[cnt]) {
 			mark_dquot_dirty(transfer_to[cnt]);
 			/* The reference we got is transferred to the inode */
-			transfer_to[cnt] = NODQUOT;
+			transfer_to[cnt] = NULL;
 		}
 	}
 warn_put_all:
@@ -1585,10 +1803,11 @@ over_quota:
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	/* Clear dquot pointers we don't want to dqput() */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		transfer_from[cnt] = NODQUOT;
+		transfer_from[cnt] = NULL;
 	ret = NO_QUOTA;
 	goto warn_put_all;
 }
+EXPORT_SYMBOL(dquot_transfer);
 
 /* Wrapper for transferring ownership of an inode */
 int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
@@ -1600,7 +1819,7 @@ int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
 	}
 	return 0;
 }
-
+EXPORT_SYMBOL(vfs_dq_transfer);
 
 /*
  * Write info of quota file to disk
@@ -1615,6 +1834,7 @@ int dquot_commit_info(struct super_block *sb, int type)
 	mutex_unlock(&dqopt->dqio_mutex);
 	return ret;
 }
+EXPORT_SYMBOL(dquot_commit_info);
 
 /*
  * Definitions of diskquota operations.
@@ -1700,8 +1920,8 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 		drop_dquot_ref(sb, cnt);
 		invalidate_dquots(sb, cnt);
 		/*
-		 * Now all dquots should be invalidated, all writes done so we should be only
-		 * users of the info. No locks needed.
+		 * Now all dquots should be invalidated, all writes done so we
+		 * should be only users of the info. No locks needed.
 		 */
 		if (info_dirty(&dqopt->info[cnt]))
 			sb->dq_op->write_info(sb, cnt);
@@ -1739,10 +1959,12 @@ int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
 			/* If quota was reenabled in the meantime, we have
 			 * nothing to do */
 			if (!sb_has_quota_loaded(sb, cnt)) {
-				mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
+				mutex_lock_nested(&toputinode[cnt]->i_mutex,
+						  I_MUTEX_QUOTA);
 				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
 				  S_NOATIME | S_NOQUOTA);
-				truncate_inode_pages(&toputinode[cnt]->i_data, 0);
+				truncate_inode_pages(&toputinode[cnt]->i_data,
+						     0);
 				mutex_unlock(&toputinode[cnt]->i_mutex);
 				mark_inode_dirty(toputinode[cnt]);
 			}
@@ -1767,13 +1989,14 @@ put_inodes:
 		}
 	return ret;
 }
+EXPORT_SYMBOL(vfs_quota_disable);
 
 int vfs_quota_off(struct super_block *sb, int type, int remount)
 {
 	return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
 				 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
 }
-
+EXPORT_SYMBOL(vfs_quota_off);
 /*
  *	Turn quotas on on a device
  */
@@ -1831,7 +2054,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		 * possible) Also nobody should write to the file - we use
 		 * special IO operations which ignore the immutable bit. */
 		down_write(&dqopt->dqptr_sem);
-		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
+		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
+					     S_NOQUOTA);
 		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
 		up_write(&dqopt->dqptr_sem);
 		sb->dq_op->drop(inode);
@@ -1850,7 +2074,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 	dqopt->info[type].dqi_fmt_id = format_id;
 	INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
 	mutex_lock(&dqopt->dqio_mutex);
-	if ((error = dqopt->ops[type]->read_file_info(sb, type)) < 0) {
+	error = dqopt->ops[type]->read_file_info(sb, type);
+	if (error < 0) {
 		mutex_unlock(&dqopt->dqio_mutex);
 		goto out_file_init;
 	}
@@ -1930,6 +2155,7 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 					     DQUOT_LIMITS_ENABLED);
 	return error;
 }
+EXPORT_SYMBOL(vfs_quota_on_path);
 
 int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 		 int remount)
@@ -1947,6 +2173,7 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
 	}
 	return error;
 }
+EXPORT_SYMBOL(vfs_quota_on);
 
 /*
  * More powerful function for turning on quotas allowing setting
@@ -1993,6 +2220,7 @@ out_lock:
 load_quota:
 	return vfs_load_quota_inode(inode, type, format_id, flags);
 }
+EXPORT_SYMBOL(vfs_quota_enable);
 
 /*
  * This function is used when filesystem needs to initialize quotas
@@ -2022,6 +2250,7 @@ out:
 	dput(dentry);
 	return error;
 }
+EXPORT_SYMBOL(vfs_quota_on_mount);
 
 /* Wrapper to turn on quotas when remounting rw */
 int vfs_dq_quota_on_remount(struct super_block *sb)
@@ -2038,6 +2267,7 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
 	}
 	return ret;
 }
+EXPORT_SYMBOL(vfs_dq_quota_on_remount);
 
 static inline qsize_t qbtos(qsize_t blocks)
 {
@@ -2057,7 +2287,7 @@ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	spin_lock(&dq_data_lock);
 	di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
 	di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
-	di->dqb_curspace = dm->dqb_curspace;
+	di->dqb_curspace = dm->dqb_curspace + dm->dqb_rsvspace;
 	di->dqb_ihardlimit = dm->dqb_ihardlimit;
 	di->dqb_isoftlimit = dm->dqb_isoftlimit;
 	di->dqb_curinodes = dm->dqb_curinodes;
@@ -2067,18 +2297,20 @@ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	spin_unlock(&dq_data_lock);
 }
 
-int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di)
+int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
+		  struct if_dqblk *di)
 {
 	struct dquot *dquot;
 
 	dquot = dqget(sb, id, type);
-	if (dquot == NODQUOT)
+	if (!dquot)
 		return -ESRCH;
 	do_get_dqblk(dquot, di);
 	dqput(dquot);
 
 	return 0;
 }
+EXPORT_SYMBOL(vfs_get_dqblk);
 
 /* Generic routine for setting common part of quota structure */
 static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
@@ -2097,7 +2329,7 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 
 	spin_lock(&dq_data_lock);
 	if (di->dqb_valid & QIF_SPACE) {
-		dm->dqb_curspace = di->dqb_curspace;
+		dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace;
 		check_blim = 1;
 		__set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
 	}
@@ -2130,22 +2362,25 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	}
 
 	if (check_blim) {
-		if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
+		if (!dm->dqb_bsoftlimit ||
+		    dm->dqb_curspace < dm->dqb_bsoftlimit) {
 			dm->dqb_btime = 0;
 			clear_bit(DQ_BLKS_B, &dquot->dq_flags);
-		}
-		else if (!(di->dqb_valid & QIF_BTIME))	/* Set grace only if user hasn't provided his own... */
+		} else if (!(di->dqb_valid & QIF_BTIME))
+			/* Set grace only if user hasn't provided his own... */
 			dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
 	}
 	if (check_ilim) {
-		if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) {
+		if (!dm->dqb_isoftlimit ||
+		    dm->dqb_curinodes < dm->dqb_isoftlimit) {
 			dm->dqb_itime = 0;
 			clear_bit(DQ_INODES_B, &dquot->dq_flags);
-		}
-		else if (!(di->dqb_valid & QIF_ITIME))	/* Set grace only if user hasn't provided his own... */
+		} else if (!(di->dqb_valid & QIF_ITIME))
+			/* Set grace only if user hasn't provided his own... */
 			dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
 	}
-	if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit)
+	if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
+	    dm->dqb_isoftlimit)
 		clear_bit(DQ_FAKE_B, &dquot->dq_flags);
 	else
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
@@ -2155,7 +2390,8 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
 	return 0;
 }
 
-int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di)
+int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
+		  struct if_dqblk *di)
 {
 	struct dquot *dquot;
 	int rc;
@@ -2170,6 +2406,7 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 out:
 	return rc;
 }
+EXPORT_SYMBOL(vfs_set_dqblk);
 
 /* Generic routine for getting common part of quota file information */
 int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
@@ -2191,6 +2428,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return 0;
 }
+EXPORT_SYMBOL(vfs_get_dqinfo);
 
 /* Generic routine for setting common part of quota file information */
 int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
@@ -2210,7 +2448,8 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	if (ii->dqi_valid & IIF_IGRACE)
 		mi->dqi_igrace = ii->dqi_igrace;
 	if (ii->dqi_valid & IIF_FLAGS)
-		mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | (ii->dqi_flags & DQF_MASK);
+		mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) |
+				(ii->dqi_flags & DQF_MASK);
 	spin_unlock(&dq_data_lock);
 	mark_info_dirty(sb, type);
 	/* Force write to disk */
@@ -2219,6 +2458,7 @@ out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return err;
 }
+EXPORT_SYMBOL(vfs_set_dqinfo);
 
 struct quotactl_ops vfs_quotactl_ops = {
 	.quota_on	= vfs_quota_on,
@@ -2368,43 +2608,10 @@ static int __init dquot_init(void)
 
 #ifdef CONFIG_QUOTA_NETLINK_INTERFACE
 	if (genl_register_family(&quota_genl_family) != 0)
-		printk(KERN_ERR "VFS: Failed to create quota netlink interface.\n");
+		printk(KERN_ERR
+		       "VFS: Failed to create quota netlink interface.\n");
 #endif
 
 	return 0;
 }
 module_init(dquot_init);
-
-EXPORT_SYMBOL(register_quota_format);
-EXPORT_SYMBOL(unregister_quota_format);
-EXPORT_SYMBOL(dqstats);
-EXPORT_SYMBOL(dq_data_lock);
-EXPORT_SYMBOL(vfs_quota_enable);
-EXPORT_SYMBOL(vfs_quota_on);
-EXPORT_SYMBOL(vfs_quota_on_path);
-EXPORT_SYMBOL(vfs_quota_on_mount);
-EXPORT_SYMBOL(vfs_quota_disable);
-EXPORT_SYMBOL(vfs_quota_off);
-EXPORT_SYMBOL(dquot_scan_active);
-EXPORT_SYMBOL(vfs_quota_sync);
-EXPORT_SYMBOL(vfs_get_dqinfo);
-EXPORT_SYMBOL(vfs_set_dqinfo);
-EXPORT_SYMBOL(vfs_get_dqblk);
-EXPORT_SYMBOL(vfs_set_dqblk);
-EXPORT_SYMBOL(dquot_commit);
-EXPORT_SYMBOL(dquot_commit_info);
-EXPORT_SYMBOL(dquot_acquire);
-EXPORT_SYMBOL(dquot_release);
-EXPORT_SYMBOL(dquot_mark_dquot_dirty);
-EXPORT_SYMBOL(dquot_initialize);
-EXPORT_SYMBOL(dquot_drop);
-EXPORT_SYMBOL(vfs_dq_drop);
-EXPORT_SYMBOL(dqget);
-EXPORT_SYMBOL(dqput);
-EXPORT_SYMBOL(dquot_alloc_space);
-EXPORT_SYMBOL(dquot_alloc_inode);
-EXPORT_SYMBOL(dquot_free_space);
-EXPORT_SYMBOL(dquot_free_inode);
-EXPORT_SYMBOL(dquot_transfer);
-EXPORT_SYMBOL(vfs_dq_transfer);
-EXPORT_SYMBOL(vfs_dq_quota_on_remount);
diff --git a/fs/quota.c b/fs/quota/quota.c
index d76ada9..b7f5a46 100644
--- a/fs/quota.c
+++ b/fs/quota/quota.c
@@ -20,7 +20,8 @@
 #include <linux/types.h>
 
 /* Check validity of generic quotactl commands */
-static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
+static int generic_quotactl_valid(struct super_block *sb, int type, int cmd,
+				  qid_t id)
 {
 	if (type >= MAXQUOTAS)
 		return -EINVAL;
@@ -72,7 +73,8 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 		case Q_SETINFO:
 		case Q_SETQUOTA:
 		case Q_GETQUOTA:
-			/* This is just informative test so we are satisfied without a lock */
+			/* This is just an informative test so we are satisfied
+			 * without the lock */
 			if (!sb_has_quota_active(sb, type))
 				return -ESRCH;
 	}
@@ -92,7 +94,8 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
 }
 
 /* Check validity of XFS Quota Manager commands */
-static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
+static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd,
+			      qid_t id)
 {
 	if (type >= XQM_MAXQUOTAS)
 		return -EINVAL;
@@ -142,7 +145,8 @@ static int xqm_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t i
 	return 0;
 }
 
-static int check_quotactl_valid(struct super_block *sb, int type, int cmd, qid_t id)
+static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
+				qid_t id)
 {
 	int error;
 
@@ -180,7 +184,8 @@ static void quota_sync_sb(struct super_block *sb, int type)
 			continue;
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
-		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
+		mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex,
+				  I_MUTEX_QUOTA);
 		truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
 		mutex_unlock(&sb_dqopt(sb)->files[cnt]->i_mutex);
 	}
@@ -200,14 +205,15 @@ void sync_dquots(struct super_block *sb, int type)
 	spin_lock(&sb_lock);
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		/* This test just improves performance so it needn't be reliable... */
+		/* This test just improves performance so it needn't be
+		 * reliable... */
 		for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 			if (type != -1 && type != cnt)
 				continue;
 			if (!sb_has_quota_active(sb, cnt))
 				continue;
 			if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
-			    list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
+			   list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
 				continue;
 			break;
 		}
@@ -227,7 +233,8 @@ restart:
 }
 
 /* Copy parameters and call proper function */
-static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __user *addr)
+static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
+		       void __user *addr)
 {
 	int ret;
 
@@ -235,7 +242,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 		case Q_QUOTAON: {
 			char *pathname;
 
-			if (IS_ERR(pathname = getname(addr)))
+			pathname = getname(addr);
+			if (IS_ERR(pathname))
 				return PTR_ERR(pathname);
 			ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
 			putname(pathname);
@@ -261,7 +269,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 		case Q_GETINFO: {
 			struct if_dqinfo info;
 
-			if ((ret = sb->s_qcop->get_info(sb, type, &info)))
+			ret = sb->s_qcop->get_info(sb, type, &info);
+			if (ret)
 				return ret;
 			if (copy_to_user(addr, &info, sizeof(info)))
 				return -EFAULT;
@@ -277,7 +286,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 		case Q_GETQUOTA: {
 			struct if_dqblk idq;
 
-			if ((ret = sb->s_qcop->get_dqblk(sb, type, id, &idq)))
+			ret = sb->s_qcop->get_dqblk(sb, type, id, &idq);
+			if (ret)
 				return ret;
 			if (copy_to_user(addr, &idq, sizeof(idq)))
 				return -EFAULT;
@@ -322,7 +332,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
 		case Q_XGETQUOTA: {
 			struct fs_disk_quota fdq;
 
-			if ((ret = sb->s_qcop->get_xquota(sb, type, id, &fdq)))
+			ret = sb->s_qcop->get_xquota(sb, type, id, &fdq);
+			if (ret)
 				return ret;
 			if (copy_to_user(addr, &fdq, sizeof(fdq)))
 				return -EFAULT;
@@ -341,7 +352,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
  * look up a superblock on which quota ops will be performed
  * - use the name of a block device to find the superblock thereon
  */
-static inline struct super_block *quotactl_block(const char __user *special)
+static struct super_block *quotactl_block(const char __user *special)
 {
 #ifdef CONFIG_BLOCK
 	struct block_device *bdev;
diff --git a/fs/quota_tree.c b/fs/quota/quota_tree.c
index 953404c..f81f4bc 100644
--- a/fs/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -22,8 +22,6 @@ MODULE_LICENSE("GPL");
 
 #define __QUOTA_QT_PARANOIA
 
-typedef char *dqbuf_t;
-
 static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
 {
 	unsigned int epb = info->dqi_usable_bs >> 2;
@@ -35,46 +33,42 @@ static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
 }
 
 /* Number of entries in one blocks */
-static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
+static int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
 {
 	return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader))
 	       / info->dqi_entry_size;
 }
 
-static dqbuf_t getdqbuf(size_t size)
+static char *getdqbuf(size_t size)
 {
-	dqbuf_t buf = kmalloc(size, GFP_NOFS);
+	char *buf = kmalloc(size, GFP_NOFS);
 	if (!buf)
-		printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
+		printk(KERN_WARNING
+		       "VFS: Not enough memory for quota buffers.\n");
 	return buf;
 }
 
-static inline void freedqbuf(dqbuf_t buf)
-{
-	kfree(buf);
-}
-
-static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+static ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
 {
 	struct super_block *sb = info->dqi_sb;
 
 	memset(buf, 0, info->dqi_usable_bs);
-	return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
+	return sb->s_op->quota_read(sb, info->dqi_type, buf,
 	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
 }
 
-static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
 {
 	struct super_block *sb = info->dqi_sb;
 
-	return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
+	return sb->s_op->quota_write(sb, info->dqi_type, buf,
 	       info->dqi_usable_bs, blk << info->dqi_blocksize_bits);
 }
 
 /* Remove empty block from list and return it */
 static int get_free_dqblk(struct qtree_mem_dqinfo *info)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	int ret, blk;
 
@@ -98,12 +92,12 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info)
 	mark_info_dirty(info->dqi_sb, info->dqi_type);
 	ret = blk;
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
 /* Insert empty block to the list */
-static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk)
 {
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	int err;
@@ -120,9 +114,10 @@ static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
 }
 
 /* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
+			       uint blk)
 {
-	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	char *tmpbuf = getdqbuf(info->dqi_usable_bs);
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	uint nextblk = le32_to_cpu(dh->dqdh_next_free);
 	uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
@@ -153,21 +148,24 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint
 		info->dqi_free_entry = nextblk;
 		mark_info_dirty(info->dqi_sb, info->dqi_type);
 	}
-	freedqbuf(tmpbuf);
+	kfree(tmpbuf);
 	dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
 	/* No matter whether write succeeds block is out of list */
 	if (write_blk(info, blk, buf) < 0)
-		printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
+		printk(KERN_ERR
+		       "VFS: Can't write block (%u) with free entries.\n",
+		       blk);
 	return 0;
 out_buf:
-	freedqbuf(tmpbuf);
+	kfree(tmpbuf);
 	return err;
 }
 
 /* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
+			       uint blk)
 {
-	dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+	char *tmpbuf = getdqbuf(info->dqi_usable_bs);
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	int err;
 
@@ -188,12 +186,12 @@ static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint
 		if (err < 0)
 			goto out_buf;
 	}
-	freedqbuf(tmpbuf);
+	kfree(tmpbuf);
 	info->dqi_free_entry = blk;
 	mark_info_dirty(info->dqi_sb, info->dqi_type);
 	return 0;
 out_buf:
-	freedqbuf(tmpbuf);
+	kfree(tmpbuf);
 	return err;
 }
 
@@ -215,7 +213,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 {
 	uint blk, i;
 	struct qt_disk_dqdbheader *dh;
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	char *ddquot;
 
 	*err = 0;
@@ -233,11 +231,12 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 		blk = get_free_dqblk(info);
 		if ((int)blk < 0) {
 			*err = blk;
-			freedqbuf(buf);
+			kfree(buf);
 			return 0;
 		}
 		memset(buf, 0, info->dqi_usable_bs);
-		/* This is enough as block is already zeroed and entry list is empty... */
+		/* This is enough as the block is already zeroed and the entry
+		 * list is empty... */
 		info->dqi_free_entry = blk;
 		mark_info_dirty(dquot->dq_sb, dquot->dq_type);
 	}
@@ -253,9 +252,12 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	}
 	le16_add_cpu(&dh->dqdh_entries, 1);
 	/* Find free structure in block */
-	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
-	     i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
-	     i++, ddquot += info->dqi_entry_size);
+	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
+	for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
+		if (qtree_entry_unused(info, ddquot))
+			break;
+		ddquot += info->dqi_entry_size;
+	}
 #ifdef __QUOTA_QT_PARANOIA
 	if (i == qtree_dqstr_in_blk(info)) {
 		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
@@ -273,10 +275,10 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 	dquot->dq_off = (blk << info->dqi_blocksize_bits) +
 			sizeof(struct qt_disk_dqdbheader) +
 			i * info->dqi_entry_size;
-	freedqbuf(buf);
+	kfree(buf);
 	return blk;
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return 0;
 }
 
@@ -284,7 +286,7 @@ out_buf:
 static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			  uint *treeblk, int depth)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	int ret = 0, newson = 0, newact = 0;
 	__le32 *ref;
 	uint newblk;
@@ -333,7 +335,7 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		put_free_dqblk(info, buf, *treeblk);
 	}
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -346,14 +348,15 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
 }
 
 /*
- *	We don't have to be afraid of deadlocks as we never have quotas on quota files...
+ * We don't have to be afraid of deadlocks as we never have quotas on quota
+ * files...
  */
 int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 {
 	int type = dquot->dq_type;
 	struct super_block *sb = dquot->dq_sb;
 	ssize_t ret;
-	dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
+	char *ddquot = getdqbuf(info->dqi_entry_size);
 
 	if (!ddquot)
 		return -ENOMEM;
@@ -364,15 +367,15 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 		if (ret < 0) {
 			printk(KERN_ERR "VFS: Error %zd occurred while "
 					"creating quota.\n", ret);
-			freedqbuf(ddquot);
+			kfree(ddquot);
 			return ret;
 		}
 	}
 	spin_lock(&dq_data_lock);
 	info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
 	spin_unlock(&dq_data_lock);
-	ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
-					info->dqi_entry_size, dquot->dq_off);
+	ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size,
+				    dquot->dq_off);
 	if (ret != info->dqi_entry_size) {
 		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
 		       sb->s_id);
@@ -382,7 +385,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 		ret = 0;
 	}
 	dqstats.writes++;
-	freedqbuf(ddquot);
+	kfree(ddquot);
 
 	return ret;
 }
@@ -393,7 +396,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			uint blk)
 {
 	struct qt_disk_dqdbheader *dh;
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	int ret = 0;
 
 	if (!buf)
@@ -444,7 +447,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	}
 	dquot->dq_off = 0;	/* Quota is now unattached */
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -452,7 +455,7 @@ out_buf:
 static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		       uint *blk, int depth)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	int ret = 0;
 	uint newblk;
 	__le32 *ref = (__le32 *)buf;
@@ -475,9 +478,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		int i;
 		ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
 		/* Block got empty? */
-		for (i = 0;
-		     i < (info->dqi_usable_bs >> 2) && !ref[i];
-		     i++);
+		for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++)
+			;
 		/* Don't put the root block into the free block list */
 		if (i == (info->dqi_usable_bs >> 2)
 		    && *blk != QT_TREEOFF) {
@@ -491,7 +493,7 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 		}
 	}
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -510,7 +512,7 @@ EXPORT_SYMBOL(qtree_delete_dquot);
 static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 				 struct dquot *dquot, uint blk)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	loff_t ret = 0;
 	int i;
 	char *ddquot;
@@ -522,9 +524,12 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
 		goto out_buf;
 	}
-	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
-	     i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
-	     i++, ddquot += info->dqi_entry_size);
+	ddquot = buf + sizeof(struct qt_disk_dqdbheader);
+	for (i = 0; i < qtree_dqstr_in_blk(info); i++) {
+		if (info->dqi_ops->is_id(ddquot, dquot))
+			break;
+		ddquot += info->dqi_entry_size;
+	}
 	if (i == qtree_dqstr_in_blk(info)) {
 		printk(KERN_ERR "VFS: Quota for id %u referenced "
 		  "but not present.\n", dquot->dq_id);
@@ -535,7 +540,7 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 		  qt_disk_dqdbheader) + i * info->dqi_entry_size;
 	}
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -543,7 +548,7 @@ out_buf:
 static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 				struct dquot *dquot, uint blk, int depth)
 {
-	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *buf = getdqbuf(info->dqi_usable_bs);
 	loff_t ret = 0;
 	__le32 *ref = (__le32 *)buf;
 
@@ -563,7 +568,7 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 	else
 		ret = find_block_dqentry(info, dquot, blk);
 out_buf:
-	freedqbuf(buf);
+	kfree(buf);
 	return ret;
 }
 
@@ -579,7 +584,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	int type = dquot->dq_type;
 	struct super_block *sb = dquot->dq_sb;
 	loff_t offset;
-	dqbuf_t ddquot;
+	char *ddquot;
 	int ret = 0;
 
 #ifdef __QUOTA_QT_PARANOIA
@@ -607,8 +612,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	ddquot = getdqbuf(info->dqi_entry_size);
 	if (!ddquot)
 		return -ENOMEM;
-	ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
-				   info->dqi_entry_size, dquot->dq_off);
+	ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
+				   dquot->dq_off);
 	if (ret != info->dqi_entry_size) {
 		if (ret >= 0)
 			ret = -EIO;
@@ -616,7 +621,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 				"structure for id %u.\n", dquot->dq_id);
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 		memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
-		freedqbuf(ddquot);
+		kfree(ddquot);
 		goto out;
 	}
 	spin_lock(&dq_data_lock);
@@ -627,7 +632,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	    !dquot->dq_dqb.dqb_isoftlimit)
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 	spin_unlock(&dq_data_lock);
-	freedqbuf(ddquot);
+	kfree(ddquot);
 out:
 	dqstats.reads++;
 	return ret;
@@ -638,7 +643,8 @@ EXPORT_SYMBOL(qtree_read_dquot);
  * the only one operating on dquot (thanks to dq_lock) */
 int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 {
-	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
+	if (test_bit(DQ_FAKE_B, &dquot->dq_flags) &&
+	    !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
 		return qtree_delete_dquot(info, dquot);
 	return 0;
 }
diff --git a/fs/quota_tree.h b/fs/quota/quota_tree.h
index a1ab8db..a1ab8db 100644
--- a/fs/quota_tree.h
+++ b/fs/quota/quota_tree.h
diff --git a/fs/quota_v1.c b/fs/quota/quota_v1.c
index b4af1c6..0edcf42 100644
--- a/fs/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -62,11 +62,14 @@ static int v1_read_dqblk(struct dquot *dquot)
 
 	/* Set structure to 0s in case read fails/is after end of file */
 	memset(&dqblk, 0, sizeof(struct v1_disk_dqblk));
-	dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id));
+	dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, (char *)&dqblk,
+			sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id));
 
 	v1_disk2mem_dqblk(&dquot->dq_dqb, &dqblk);
-	if (dquot->dq_dqb.dqb_bhardlimit == 0 && dquot->dq_dqb.dqb_bsoftlimit == 0 &&
-	    dquot->dq_dqb.dqb_ihardlimit == 0 && dquot->dq_dqb.dqb_isoftlimit == 0)
+	if (dquot->dq_dqb.dqb_bhardlimit == 0 &&
+	    dquot->dq_dqb.dqb_bsoftlimit == 0 &&
+	    dquot->dq_dqb.dqb_ihardlimit == 0 &&
+	    dquot->dq_dqb.dqb_isoftlimit == 0)
 		set_bit(DQ_FAKE_B, &dquot->dq_flags);
 	dqstats.reads++;
 
@@ -81,13 +84,16 @@ static int v1_commit_dqblk(struct dquot *dquot)
 
 	v1_mem2disk_dqblk(&dqblk, &dquot->dq_dqb);
 	if (dquot->dq_id == 0) {
-		dqblk.dqb_btime = sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace;
-		dqblk.dqb_itime = sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace;
+		dqblk.dqb_btime =
+			sb_dqopt(dquot->dq_sb)->info[type].dqi_bgrace;
+		dqblk.dqb_itime =
+			sb_dqopt(dquot->dq_sb)->info[type].dqi_igrace;
 	}
 	ret = 0;
 	if (sb_dqopt(dquot->dq_sb)->files[type])
-		ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, (char *)&dqblk,
-					sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id));
+		ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
+			(char *)&dqblk, sizeof(struct v1_disk_dqblk),
+			v1_dqoff(dquot->dq_id));
 	if (ret != sizeof(struct v1_disk_dqblk)) {
 		printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
 			dquot->dq_sb->s_id);
@@ -130,15 +136,20 @@ static int v1_check_quota_file(struct super_block *sb, int type)
 		return 0;
 	blocks = isize >> BLOCK_SIZE_BITS;
 	off = isize & (BLOCK_SIZE - 1);
-	if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) % sizeof(struct v1_disk_dqblk))
+	if ((blocks % sizeof(struct v1_disk_dqblk) * BLOCK_SIZE + off) %
+	    sizeof(struct v1_disk_dqblk))
 		return 0;
-	/* Doublecheck whether we didn't get file with new format - with old quotactl() this could happen */
-	size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
+	/* Doublecheck whether we didn't get file with new format - with old
+	 * quotactl() this could happen */
+	size = sb->s_op->quota_read(sb, type, (char *)&dqhead,
+				    sizeof(struct v2_disk_dqheader), 0);
 	if (size != sizeof(struct v2_disk_dqheader))
 		return 1;	/* Probably not new format */
 	if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type])
 		return 1;	/* Definitely not new format */
-	printk(KERN_INFO "VFS: %s: Refusing to turn on old quota format on given file. It probably contains newer quota format.\n", sb->s_id);
+	printk(KERN_INFO
+	       "VFS: %s: Refusing to turn on old quota format on given file."
+	       " It probably contains newer quota format.\n", sb->s_id);
         return 0;		/* Seems like a new format file -> refuse it */
 }
 
@@ -148,7 +159,9 @@ static int v1_read_file_info(struct super_block *sb, int type)
 	struct v1_disk_dqblk dqblk;
 	int ret;
 
-	if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) {
+	ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				sizeof(struct v1_disk_dqblk), v1_dqoff(0));
+	if (ret != sizeof(struct v1_disk_dqblk)) {
 		if (ret >= 0)
 			ret = -EIO;
 		goto out;
@@ -157,8 +170,10 @@ static int v1_read_file_info(struct super_block *sb, int type)
 	/* limits are stored as unsigned 32-bit data */
 	dqopt->info[type].dqi_maxblimit = 0xffffffff;
 	dqopt->info[type].dqi_maxilimit = 0xffffffff;
-	dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
-	dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
+	dqopt->info[type].dqi_igrace =
+			dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
+	dqopt->info[type].dqi_bgrace =
+			dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
 out:
 	return ret;
 }
@@ -170,8 +185,9 @@ static int v1_write_file_info(struct super_block *sb, int type)
 	int ret;
 
 	dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY;
-	if ((ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
-	    sizeof(struct v1_disk_dqblk), v1_dqoff(0))) != sizeof(struct v1_disk_dqblk)) {
+	ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+				sizeof(struct v1_disk_dqblk), v1_dqoff(0));
+	if (ret != sizeof(struct v1_disk_dqblk)) {
 		if (ret >= 0)
 			ret = -EIO;
 		goto out;
diff --git a/fs/quota_v2.c b/fs/quota/quota_v2.c
index b618b56..a5475fb 100644
--- a/fs/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -54,7 +54,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
 	static const uint quota_magics[] = V2_INITQMAGICS;
 	static const uint quota_versions[] = V2_INITQVERSIONS;
  
-	size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0);
+	size = sb->s_op->quota_read(sb, type, (char *)&dqhead,
+				    sizeof(struct v2_disk_dqheader), 0);
 	if (size != sizeof(struct v2_disk_dqheader)) {
 		printk("quota_v2: failed read expected=%zd got=%zd\n",
 			sizeof(struct v2_disk_dqheader), size);
diff --git a/fs/quotaio_v1.h b/fs/quota/quotaio_v1.h
index 746654b..746654b 100644
--- a/fs/quotaio_v1.h
+++ b/fs/quota/quotaio_v1.h
diff --git a/fs/quotaio_v2.h b/fs/quota/quotaio_v2.h
index 530fe58..530fe58 100644
--- a/fs/quotaio_v2.h
+++ b/fs/quota/quotaio_v2.h
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index b9b567a..995ef1d 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -18,7 +18,6 @@
 #include <linux/string.h>
 #include <linux/backing-dev.h>
 #include <linux/ramfs.h>
-#include <linux/quotaops.h>
 #include <linux/pagevec.h>
 #include <linux/mman.h>
 
@@ -114,6 +113,9 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 		if (!pagevec_add(&lru_pvec, page))
 			__pagevec_lru_add_file(&lru_pvec);
 
+		/* prevent the page from being discarded on memory pressure */
+		SetPageDirty(page);
+
 		unlock_page(page);
 	}
 
@@ -126,6 +128,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 	return -EFBIG;
 
  add_error:
+	pagevec_lru_add_file(&lru_pvec);
 	page_cache_release(pages + loop);
 	for (loop++; loop < npages; loop++)
 		__free_page(pages + loop);
@@ -201,11 +204,6 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
 	if (ret)
 		return ret;
 
-	/* by providing our own setattr() method, we skip this quotaism */
-	if ((old_ia_valid & ATTR_UID && ia->ia_uid != inode->i_uid) ||
-	    (old_ia_valid & ATTR_GID && ia->ia_gid != inode->i_gid))
-		ret = DQUOT_TRANSFER(inode, ia) ? -EDQUOT : 0;
-
 	/* pick out size-changing events */
 	if (ia->ia_valid & ATTR_SIZE) {
 		loff_t size = i_size_read(inode);
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 4646caa..f32d142 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -430,7 +430,7 @@ static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
 
 	journal_mark_dirty(th, s, sbh);
 	if (for_unformatted)
-		DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
+		vfs_dq_free_block_nodirty(inode, 1);
 }
 
 void reiserfs_free_block(struct reiserfs_transaction_handle *th,
@@ -1055,7 +1055,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 			       amount_needed, hint->inode->i_uid);
 #endif
 		quota_ret =
-		    DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
+		    vfs_dq_alloc_block_nodirty(hint->inode, amount_needed);
 		if (quota_ret)	/* Quota exceeded? */
 			return QUOTA_EXCEEDED;
 		if (hint->preallocate && hint->prealloc_size) {
@@ -1064,8 +1064,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 				       "reiserquota: allocating (prealloc) %d blocks id=%u",
 				       hint->prealloc_size, hint->inode->i_uid);
 #endif
-			quota_ret =
-			    DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode,
+			quota_ret = vfs_dq_prealloc_block_nodirty(hint->inode,
 							 hint->prealloc_size);
 			if (quota_ret)
 				hint->preallocate = hint->prealloc_size = 0;
@@ -1098,7 +1097,10 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 					       nr_allocated,
 					       hint->inode->i_uid);
 #endif
-				DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated);	/* Free not allocated blocks */
+				/* Free not allocated blocks */
+				vfs_dq_free_block_nodirty(hint->inode,
+					amount_needed + hint->prealloc_size -
+					nr_allocated);
 			}
 			while (nr_allocated--)
 				reiserfs_free_block(hint->th, hint->inode,
@@ -1129,7 +1131,7 @@ static inline int blocknrs_and_prealloc_arrays_from_search_start
 			       REISERFS_I(hint->inode)->i_prealloc_count,
 			       hint->inode->i_uid);
 #endif
-		DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed +
+		vfs_dq_free_block_nodirty(hint->inode, amount_needed +
 					 hint->prealloc_size - nr_allocated -
 					 REISERFS_I(hint->inode)->
 					 i_prealloc_count);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 55fce92..823227a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -53,7 +53,7 @@ void reiserfs_delete_inode(struct inode *inode)
 		 * after delete_object so that quota updates go into the same transaction as
 		 * stat data deletion */
 		if (!err) 
-			DQUOT_FREE_INODE(inode);
+			vfs_dq_free_inode(inode);
 
 		if (journal_end(&th, inode->i_sb, jbegin_count))
 			goto out;
@@ -1763,7 +1763,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
 	BUG_ON(!th->t_trans_id);
 
-	if (DQUOT_ALLOC_INODE(inode)) {
+	if (vfs_dq_alloc_inode(inode)) {
 		err = -EDQUOT;
 		goto out_end_trans;
 	}
@@ -1947,12 +1947,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 	INODE_PKEY(inode)->k_objectid = 0;
 
 	/* Quota change must be inside a transaction for journaling */
-	DQUOT_FREE_INODE(inode);
+	vfs_dq_free_inode(inode);
 
       out_end_trans:
 	journal_end(th, th->t_super, th->t_blocks_allocated);
 	/* Drop can be outside and it needs more credits so it's better to have it outside */
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	inode->i_flags |= S_NOQUOTA;
 	make_bad_inode(inode);
 
@@ -3119,7 +3119,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
 				if (error)
 					goto out;
 				error =
-				    DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+				    vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
 				if (error) {
 					journal_end(&th, inode->i_sb,
 						    jbegin_count);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 738967f..639d635 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -555,7 +555,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
 */
 static int drop_new_inode(struct inode *inode)
 {
-	DQUOT_DROP(inode);
+	vfs_dq_drop(inode);
 	make_bad_inode(inode);
 	inode->i_flags |= S_NOQUOTA;
 	iput(inode);
@@ -563,7 +563,7 @@ static int drop_new_inode(struct inode *inode)
 }
 
 /* utility function that does setup for reiserfs_new_inode.  
-** DQUOT_INIT needs lots of credits so it's better to have it
+** vfs_dq_init needs lots of credits so it's better to have it
 ** outside of a transaction, so we had to pull some bits of
 ** reiserfs_new_inode out into this func.
 */
@@ -586,7 +586,7 @@ static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
 	} else {
 		inode->i_gid = current_fsgid();
 	}
-	DQUOT_INIT(inode);
+	vfs_dq_init(inode);
 	return 0;
 }
 
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index abbc64d..73aaa33 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1297,7 +1297,7 @@ int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct treepath
 		       "reiserquota delete_item(): freeing %u, id=%u type=%c",
 		       quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
 #endif
-	DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
+	vfs_dq_free_space_nodirty(p_s_inode, quota_cut_bytes);
 
 	/* Return deleted body length */
 	return n_ret_value;
@@ -1383,7 +1383,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
 					       quota_cut_bytes, inode->i_uid,
 					       key2type(key));
 #endif
-				DQUOT_FREE_SPACE_NODIRTY(inode,
+				vfs_dq_free_space_nodirty(inode,
 							 quota_cut_bytes);
 			}
 			break;
@@ -1734,7 +1734,7 @@ int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
 		       "reiserquota cut_from_item(): freeing %u id=%u type=%c",
 		       quota_cut_bytes, p_s_inode->i_uid, '?');
 #endif
-	DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
+	vfs_dq_free_space_nodirty(p_s_inode, quota_cut_bytes);
 	return n_ret_value;
 }
 
@@ -1971,7 +1971,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 		       key2type(&(p_s_key->on_disk_key)));
 #endif
 
-	if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) {
+	if (vfs_dq_alloc_space_nodirty(inode, n_pasted_size)) {
 		pathrelse(p_s_search_path);
 		return -EDQUOT;
 	}
@@ -2027,7 +2027,7 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
 		       n_pasted_size, inode->i_uid,
 		       key2type(&(p_s_key->on_disk_key)));
 #endif
-	DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size);
+	vfs_dq_free_space_nodirty(inode, n_pasted_size);
 	return retval;
 }
 
@@ -2060,7 +2060,7 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
 #endif
 		/* We can't dirty inode here. It would be immediately written but
 		 * appropriate stat item isn't inserted yet... */
-		if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) {
+		if (vfs_dq_alloc_space_nodirty(inode, quota_bytes)) {
 			pathrelse(p_s_path);
 			return -EDQUOT;
 		}
@@ -2112,6 +2112,6 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct treepath
 		       quota_bytes, inode->i_uid, head2type(p_s_ih));
 #endif
 	if (inode)
-		DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes);
+		vfs_dq_free_space_nodirty(inode, quota_bytes);
 	return retval;
 }
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f3c820b..5dbafb7 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -250,7 +250,7 @@ static int finish_unfinished(struct super_block *s)
 			retval = remove_save_link_only(s, &save_link_key, 0);
 			continue;
 		}
-		DQUOT_INIT(inode);
+		vfs_dq_init(inode);
 
 		if (truncate && S_ISDIR(inode->i_mode)) {
 			/* We got a truncate request for a dir which is impossible.
@@ -629,8 +629,6 @@ static const struct super_operations reiserfs_sops = {
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 
-static int reiserfs_dquot_initialize(struct inode *, int);
-static int reiserfs_dquot_drop(struct inode *);
 static int reiserfs_write_dquot(struct dquot *);
 static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
@@ -639,8 +637,8 @@ static int reiserfs_write_info(struct super_block *, int);
 static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
 
 static struct dquot_operations reiserfs_quota_operations = {
-	.initialize = reiserfs_dquot_initialize,
-	.drop = reiserfs_dquot_drop,
+	.initialize = dquot_initialize,
+	.drop = dquot_drop,
 	.alloc_space = dquot_alloc_space,
 	.alloc_inode = dquot_alloc_inode,
 	.free_space = dquot_free_space,
@@ -1896,58 +1894,6 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 }
 
 #ifdef CONFIG_QUOTA
-static int reiserfs_dquot_initialize(struct inode *inode, int type)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-
-	/* We may create quota structure so we need to reserve enough blocks */
-	reiserfs_write_lock(inode->i_sb);
-	ret =
-	    journal_begin(&th, inode->i_sb,
-			  2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
-	if (ret)
-		goto out;
-	ret = dquot_initialize(inode, type);
-	err =
-	    journal_end(&th, inode->i_sb,
-			2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
-	if (!ret && err)
-		ret = err;
-      out:
-	reiserfs_write_unlock(inode->i_sb);
-	return ret;
-}
-
-static int reiserfs_dquot_drop(struct inode *inode)
-{
-	struct reiserfs_transaction_handle th;
-	int ret, err;
-
-	/* We may delete quota structure so we need to reserve enough blocks */
-	reiserfs_write_lock(inode->i_sb);
-	ret =
-	    journal_begin(&th, inode->i_sb,
-			  2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
-	if (ret) {
-		/*
-		 * We call dquot_drop() anyway to at least release references
-		 * to quota structures so that umount does not hang.
-		 */
-		dquot_drop(inode);
-		goto out;
-	}
-	ret = dquot_drop(inode);
-	err =
-	    journal_end(&th, inode->i_sb,
-			2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
-	if (!ret && err)
-		ret = err;
-      out:
-	reiserfs_write_unlock(inode->i_sb);
-	return ret;
-}
-
 static int reiserfs_write_dquot(struct dquot *dquot)
 {
 	struct reiserfs_transaction_handle th;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ad92461..ae881cc 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -1136,7 +1136,7 @@ xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
 	return 1;
 }
 
-static struct dentry_operations xattr_lookup_poison_ops = {
+static const struct dentry_operations xattr_lookup_poison_ops = {
 	.d_compare = xattr_lookup_poison,
 };
 
diff --git a/fs/seq_file.c b/fs/seq_file.c
index b569ff1..a1a4cfe 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -48,12 +48,78 @@ int seq_open(struct file *file, const struct seq_operations *op)
 	 */
 	file->f_version = 0;
 
-	/* SEQ files support lseek, but not pread/pwrite */
-	file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE);
+	/*
+	 * seq_files support lseek() and pread().  They do not implement
+	 * write() at all, but we clear FMODE_PWRITE here for historical
+	 * reasons.
+	 *
+	 * If a client of seq_files a) implements file.write() and b) wishes to
+	 * support pwrite() then that client will need to implement its own
+	 * file.open() which calls seq_open() and then sets FMODE_PWRITE.
+	 */
+	file->f_mode &= ~FMODE_PWRITE;
 	return 0;
 }
 EXPORT_SYMBOL(seq_open);
 
+static int traverse(struct seq_file *m, loff_t offset)
+{
+	loff_t pos = 0, index;
+	int error = 0;
+	void *p;
+
+	m->version = 0;
+	index = 0;
+	m->count = m->from = 0;
+	if (!offset) {
+		m->index = index;
+		return 0;
+	}
+	if (!m->buf) {
+		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
+		if (!m->buf)
+			return -ENOMEM;
+	}
+	p = m->op->start(m, &index);
+	while (p) {
+		error = PTR_ERR(p);
+		if (IS_ERR(p))
+			break;
+		error = m->op->show(m, p);
+		if (error < 0)
+			break;
+		if (unlikely(error)) {
+			error = 0;
+			m->count = 0;
+		}
+		if (m->count == m->size)
+			goto Eoverflow;
+		if (pos + m->count > offset) {
+			m->from = offset - pos;
+			m->count -= m->from;
+			m->index = index;
+			break;
+		}
+		pos += m->count;
+		m->count = 0;
+		if (pos == offset) {
+			index++;
+			m->index = index;
+			break;
+		}
+		p = m->op->next(m, p, &index);
+	}
+	m->op->stop(m, p);
+	m->index = index;
+	return error;
+
+Eoverflow:
+	m->op->stop(m, p);
+	kfree(m->buf);
+	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
+	return !m->buf ? -ENOMEM : -EAGAIN;
+}
+
 /**
  *	seq_read -	->read() method for sequential files.
  *	@file: the file to read from
@@ -73,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
 	int err = 0;
 
 	mutex_lock(&m->lock);
+
+	/* Don't assume *ppos is where we left it */
+	if (unlikely(*ppos != m->read_pos)) {
+		m->read_pos = *ppos;
+		while ((err = traverse(m, *ppos)) == -EAGAIN)
+			;
+		if (err) {
+			/* With prejudice... */
+			m->read_pos = 0;
+			m->version = 0;
+			m->index = 0;
+			m->count = 0;
+			goto Done;
+		}
+	}
+
 	/*
 	 * seq_file->op->..m_start/m_stop/m_next may do special actions
 	 * or optimisations based on the file->f_version, so we want to
@@ -172,8 +254,10 @@ Fill:
 Done:
 	if (!copied)
 		copied = err;
-	else
+	else {
 		*ppos += copied;
+		m->read_pos += copied;
+	}
 	file->f_version = m->version;
 	mutex_unlock(&m->lock);
 	return copied;
@@ -186,63 +270,6 @@ Efault:
 }
 EXPORT_SYMBOL(seq_read);
 
-static int traverse(struct seq_file *m, loff_t offset)
-{
-	loff_t pos = 0, index;
-	int error = 0;
-	void *p;
-
-	m->version = 0;
-	index = 0;
-	m->count = m->from = 0;
-	if (!offset) {
-		m->index = index;
-		return 0;
-	}
-	if (!m->buf) {
-		m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL);
-		if (!m->buf)
-			return -ENOMEM;
-	}
-	p = m->op->start(m, &index);
-	while (p) {
-		error = PTR_ERR(p);
-		if (IS_ERR(p))
-			break;
-		error = m->op->show(m, p);
-		if (error < 0)
-			break;
-		if (unlikely(error)) {
-			error = 0;
-			m->count = 0;
-		}
-		if (m->count == m->size)
-			goto Eoverflow;
-		if (pos + m->count > offset) {
-			m->from = offset - pos;
-			m->count -= m->from;
-			m->index = index;
-			break;
-		}
-		pos += m->count;
-		m->count = 0;
-		if (pos == offset) {
-			index++;
-			m->index = index;
-			break;
-		}
-		p = m->op->next(m, p, &index);
-	}
-	m->op->stop(m, p);
-	return error;
-
-Eoverflow:
-	m->op->stop(m, p);
-	kfree(m->buf);
-	m->buf = kmalloc(m->size <<= 1, GFP_KERNEL);
-	return !m->buf ? -ENOMEM : -EAGAIN;
-}
-
 /**
  *	seq_lseek -	->llseek() method for sequential files.
  *	@file: the file in question
@@ -265,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin)
 			if (offset < 0)
 				break;
 			retval = offset;
-			if (offset != file->f_pos) {
+			if (offset != m->read_pos) {
 				while ((retval=traverse(m, offset)) == -EAGAIN)
 					;
 				if (retval) {
 					/* with extreme prejudice... */
 					file->f_pos = 0;
+					m->read_pos = 0;
 					m->version = 0;
 					m->index = 0;
 					m->count = 0;
 				} else {
+					m->read_pos = offset;
 					retval = file->f_pos = offset;
 				}
 			}
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index e7ddd03..3e4803b 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -277,7 +277,7 @@ static int smb_hash_dentry(struct dentry *, struct qstr *);
 static int smb_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
 static int smb_delete_dentry(struct dentry *);
 
-static struct dentry_operations smbfs_dentry_operations =
+static const struct dentry_operations smbfs_dentry_operations =
 {
 	.d_revalidate	= smb_lookup_validate,
 	.d_hash		= smb_hash_dentry,
@@ -285,7 +285,7 @@ static struct dentry_operations smbfs_dentry_operations =
 	.d_delete	= smb_delete_dentry,
 };
 
-static struct dentry_operations smbfs_dentry_operations_case =
+static const struct dentry_operations smbfs_dentry_operations_case =
 {
 	.d_revalidate	= smb_lookup_validate,
 	.d_delete	= smb_delete_dentry,
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index c837dfc..2a79603 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb,
  * generated a larger block - this does occasionally happen with zlib).
  */
 int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
-			int length, u64 *next_index, int srclength)
+			int length, u64 *next_index, int srclength, int pages)
 {
 	struct squashfs_sb_info *msblk = sb->s_fs_info;
 	struct buffer_head **bh;
@@ -184,7 +184,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 				offset = 0;
 			}
 
-			if (msblk->stream.avail_out == 0) {
+			if (msblk->stream.avail_out == 0 && page < pages) {
 				msblk->stream.next_out = buffer[page++];
 				msblk->stream.avail_out = PAGE_CACHE_SIZE;
 			}
@@ -201,25 +201,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
 				zlib_init = 1;
 			}
 
-			zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH);
+			zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH);
 
 			if (msblk->stream.avail_in == 0 && k < b)
 				put_bh(bh[k++]);
 		} while (zlib_err == Z_OK);
 
 		if (zlib_err != Z_STREAM_END) {
-			ERROR("zlib_inflate returned unexpected result"
-				" 0x%x, srclength %d, avail_in %d,"
-				" avail_out %d\n", zlib_err, srclength,
-				msblk->stream.avail_in,
-				msblk->stream.avail_out);
+			ERROR("zlib_inflate error, data probably corrupt\n");
 			goto release_mutex;
 		}
 
 		zlib_err = zlib_inflateEnd(&msblk->stream);
 		if (zlib_err != Z_OK) {
-			ERROR("zlib_inflateEnd returned unexpected result 0x%x,"
-				" srclength %d\n", zlib_err, srclength);
+			ERROR("zlib_inflate error, data probably corrupt\n");
 			goto release_mutex;
 		}
 		length = msblk->stream.total_out;
@@ -268,7 +263,8 @@ block_release:
 		put_bh(bh[k]);
 
 read_failure:
-	ERROR("sb_bread failed reading block 0x%llx\n", cur_index);
+	ERROR("squashfs_read_data failed to read block 0x%llx\n",
+					(unsigned long long) index);
 	kfree(bh);
 	return -EIO;
 }
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index f29eda1..1c4739e 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
 
 			entry->length = squashfs_read_data(sb, entry->data,
 				block, length, &entry->next_index,
-				cache->block_size);
+				cache->block_size, cache->pages);
 
 			spin_lock(&cache->lock);
 
@@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block,
 	for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
 		data[i] = buffer;
 	res = squashfs_read_data(sb, data, block, length |
-		SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length);
+		SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages);
 	kfree(data);
 	return res;
 }
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 7a63398..9101dbd 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 	type = le16_to_cpu(sqshb_ino->inode_type);
 	switch (type) {
 	case SQUASHFS_REG_TYPE: {
-		unsigned int frag_offset, frag_size, frag;
+		unsigned int frag_offset, frag;
+		int frag_size;
 		u64 frag_blk;
 		struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg;
 
@@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		break;
 	}
 	case SQUASHFS_LREG_TYPE: {
-		unsigned int frag_offset, frag_size, frag;
+		unsigned int frag_offset, frag;
+		int frag_size;
 		u64 frag_blk;
 		struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg;
 
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index 6b2515d..0e9feb6 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode)
 
 /* block.c */
 extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *,
-				int);
+				int, int);
 
 /* cache.c */
 extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 071df5b..681ec0d 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -389,7 +389,7 @@ static int __init init_squashfs_fs(void)
 		return err;
 	}
 
-	printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) "
+	printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) "
 		"Phillip Lougher\n");
 
 	return 0;
diff --git a/fs/super.c b/fs/super.c
index 645e540..2ba4815 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		 * lock ordering than usbfs:
 		 */
 		lockdep_set_class(&s->s_lock, &type->s_lock_key);
-		down_write(&s->s_umount);
+		/*
+		 * sget() can have s_umount recursion.
+		 *
+		 * When it cannot find a suitable sb, it allocates a new
+		 * one (this one), and tries again to find a suitable old
+		 * one.
+		 *
+		 * In case that succeeds, it will acquire the s_umount
+		 * lock of the old one. Since these are clearly distrinct
+		 * locks, and this object isn't exposed yet, there's no
+		 * risk of deadlocks.
+		 *
+		 * Annotate this by putting this lock in a different
+		 * subclass.
+		 */
+		down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
 		s->s_count = S_BIAS;
 		atomic_set(&s->s_active, 1);
 		mutex_init(&s->s_vfs_rename_mutex);
@@ -182,7 +197,7 @@ void deactivate_super(struct super_block *s)
 	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
 		s->s_count -= S_BIAS-1;
 		spin_unlock(&sb_lock);
-		DQUOT_OFF(s, 0);
+		vfs_dq_off(s, 0);
 		down_write(&s->s_umount);
 		fs->kill_sb(s);
 		put_filesystem(fs);
@@ -251,7 +266,7 @@ EXPORT_SYMBOL(unlock_super);
 void __fsync_super(struct super_block *sb)
 {
 	sync_inodes_sb(sb, 0);
-	DQUOT_SYNC(sb);
+	vfs_dq_sync(sb);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
@@ -301,7 +316,7 @@ void generic_shutdown_super(struct super_block *sb)
 		/*
 		 * wait for asynchronous fs operations to finish before going further
 		 */
-		async_synchronize_full_special(&sb->s_async_list);
+		async_synchronize_full_domain(&sb->s_async_list);
 
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
@@ -356,8 +371,10 @@ retry:
 				continue;
 			if (!grab_super(old))
 				goto retry;
-			if (s)
+			if (s) {
+				up_write(&s->s_umount);
 				destroy_super(s);
+			}
 			return old;
 		}
 	}
@@ -372,6 +389,7 @@ retry:
 	err = set(s, data);
 	if (err) {
 		spin_unlock(&sb_lock);
+		up_write(&s->s_umount);
 		destroy_super(s);
 		return ERR_PTR(err);
 	}
@@ -470,7 +488,7 @@ restart:
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		async_synchronize_full_special(&sb->s_async_list);
+		async_synchronize_full_domain(&sb->s_async_list);
 		if (sb->s_root && (wait || sb->s_dirt))
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
@@ -637,7 +655,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 			mark_files_ro(sb);
 		else if (!fs_may_remount_ro(sb))
 			return -EBUSY;
-		retval = DQUOT_OFF(sb, 1);
+		retval = vfs_dq_off(sb, 1);
 		if (retval < 0 && retval != -ENOSYS)
 			return -EBUSY;
 	}
@@ -652,11 +670,11 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
 	if (remount_rw)
-		DQUOT_ON_REMOUNT(sb);
+		vfs_dq_quota_on_remount(sb);
 	return 0;
 }
 
-static void do_emergency_remount(unsigned long foo)
+static void do_emergency_remount(struct work_struct *work)
 {
 	struct super_block *sb;
 
@@ -679,12 +697,19 @@ static void do_emergency_remount(unsigned long foo)
 		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
+	kfree(work);
 	printk("Emergency Remount complete\n");
 }
 
 void emergency_remount(void)
 {
-	pdflush_operation(do_emergency_remount, 0);
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_emergency_remount);
+		schedule_work(work);
+	}
 }
 
 /*
@@ -813,7 +838,8 @@ int get_sb_bdev(struct file_system_type *fs_type,
 		bdev->bd_super = s;
 	}
 
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 
 error_s:
 	error = PTR_ERR(s);
@@ -859,7 +885,8 @@ int get_sb_nodev(struct file_system_type *fs_type,
 		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 }
 
 EXPORT_SYMBOL(get_sb_nodev);
@@ -891,7 +918,8 @@ int get_sb_single(struct file_system_type *fs_type,
 		s->s_flags |= MS_ACTIVE;
 	}
 	do_remount_sb(s, flags, data, 0);
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 }
 
 EXPORT_SYMBOL(get_sb_single);
diff --git a/fs/sync.c b/fs/sync.c
index a16d53e..7abc65f 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -25,7 +25,7 @@ static void do_sync(unsigned long wait)
 {
 	wakeup_pdflush(0);
 	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	DQUOT_SYNC(NULL);
+	vfs_dq_sync(NULL);
 	sync_supers();		/* Write the superblocks */
 	sync_filesystems(0);	/* Start syncing the filesystems */
 	sync_filesystems(wait);	/* Waitingly sync the filesystems */
@@ -42,9 +42,21 @@ SYSCALL_DEFINE0(sync)
 	return 0;
 }
 
+static void do_sync_work(struct work_struct *work)
+{
+	do_sync(0);
+	kfree(work);
+}
+
 void emergency_sync(void)
 {
-	pdflush_operation(do_sync, 0);
+	struct work_struct *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (work) {
+		INIT_WORK(work, do_sync_work);
+		schedule_work(work);
+	}
 }
 
 /*
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index f2c478c..07703d3 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -21,15 +21,28 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
+#include <linux/mm.h>
 
 #include <asm/uaccess.h>
 
 #include "sysfs.h"
 
+/*
+ * There's one bin_buffer for each open file.
+ *
+ * filp->private_data points to bin_buffer and
+ * sysfs_dirent->s_bin_attr.buffers points to a the bin_buffer s
+ * sysfs_dirent->s_bin_attr.buffers is protected by sysfs_bin_lock
+ */
+static DEFINE_MUTEX(sysfs_bin_lock);
+
 struct bin_buffer {
-	struct mutex	mutex;
-	void		*buffer;
-	int		mmapped;
+	struct mutex			mutex;
+	void				*buffer;
+	int				mmapped;
+	struct vm_operations_struct 	*vm_ops;
+	struct file			*file;
+	struct hlist_node		list;
 };
 
 static int
@@ -168,6 +181,175 @@ out_free:
 	return count;
 }
 
+static void bin_vma_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+
+	if (!bb->vm_ops || !bb->vm_ops->open)
+		return;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return;
+
+	bb->vm_ops->open(vma);
+
+	sysfs_put_active_two(attr_sd);
+}
+
+static void bin_vma_close(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+
+	if (!bb->vm_ops || !bb->vm_ops->close)
+		return;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return;
+
+	bb->vm_ops->close(vma);
+
+	sysfs_put_active_two(attr_sd);
+}
+
+static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops || !bb->vm_ops->fault)
+		return VM_FAULT_SIGBUS;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return VM_FAULT_SIGBUS;
+
+	ret = bb->vm_ops->fault(vma, vmf);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+
+static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops)
+		return -EINVAL;
+
+	if (!bb->vm_ops->page_mkwrite)
+		return 0;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return -EINVAL;
+
+	ret = bb->vm_ops->page_mkwrite(vma, page);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+
+static int bin_access(struct vm_area_struct *vma, unsigned long addr,
+		  void *buf, int len, int write)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops || !bb->vm_ops->access)
+		return -EINVAL;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return -EINVAL;
+
+	ret = bb->vm_ops->access(vma, addr, buf, len, write);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int bin_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops || !bb->vm_ops->set_policy)
+		return 0;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return -EINVAL;
+
+	ret = bb->vm_ops->set_policy(vma, new);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+
+static struct mempolicy *bin_get_policy(struct vm_area_struct *vma,
+					unsigned long addr)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	struct mempolicy *pol;
+
+	if (!bb->vm_ops || !bb->vm_ops->get_policy)
+		return vma->vm_policy;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return vma->vm_policy;
+
+	pol = bb->vm_ops->get_policy(vma, addr);
+
+	sysfs_put_active_two(attr_sd);
+	return pol;
+}
+
+static int bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
+			const nodemask_t *to, unsigned long flags)
+{
+	struct file *file = vma->vm_file;
+	struct bin_buffer *bb = file->private_data;
+	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
+	int ret;
+
+	if (!bb->vm_ops || !bb->vm_ops->migrate)
+		return 0;
+
+	if (!sysfs_get_active_two(attr_sd))
+		return 0;
+
+	ret = bb->vm_ops->migrate(vma, from, to, flags);
+
+	sysfs_put_active_two(attr_sd);
+	return ret;
+}
+#endif
+
+static struct vm_operations_struct bin_vm_ops = {
+	.open		= bin_vma_open,
+	.close		= bin_vma_close,
+	.fault		= bin_fault,
+	.page_mkwrite	= bin_page_mkwrite,
+	.access		= bin_access,
+#ifdef CONFIG_NUMA
+	.set_policy	= bin_set_policy,
+	.get_policy	= bin_get_policy,
+	.migrate	= bin_migrate,
+#endif
+};
+
 static int mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct bin_buffer *bb = file->private_data;
@@ -179,18 +361,37 @@ static int mmap(struct file *file, struct vm_area_struct *vma)
 	mutex_lock(&bb->mutex);
 
 	/* need attr_sd for attr, its parent for kobj */
+	rc = -ENODEV;
 	if (!sysfs_get_active_two(attr_sd))
-		return -ENODEV;
+		goto out_unlock;
 
 	rc = -EINVAL;
-	if (attr->mmap)
-		rc = attr->mmap(kobj, attr, vma);
+	if (!attr->mmap)
+		goto out_put;
+
+	rc = attr->mmap(kobj, attr, vma);
+	if (rc)
+		goto out_put;
 
-	if (rc == 0 && !bb->mmapped)
-		bb->mmapped = 1;
-	else
-		sysfs_put_active_two(attr_sd);
+	/*
+	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
+	 * to satisfy versions of X which crash if the mmap fails: that
+	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
+	 */
+	if (vma->vm_file != file)
+		goto out_put;
 
+	rc = -EINVAL;
+	if (bb->mmapped && bb->vm_ops != vma->vm_ops)
+		goto out_put;
+
+	rc = 0;
+	bb->mmapped = 1;
+	bb->vm_ops = vma->vm_ops;
+	vma->vm_ops = &bin_vm_ops;
+out_put:
+	sysfs_put_active_two(attr_sd);
+out_unlock:
 	mutex_unlock(&bb->mutex);
 
 	return rc;
@@ -223,8 +424,13 @@ static int open(struct inode * inode, struct file * file)
 		goto err_out;
 
 	mutex_init(&bb->mutex);
+	bb->file = file;
 	file->private_data = bb;
 
+	mutex_lock(&sysfs_bin_lock);
+	hlist_add_head(&bb->list, &attr_sd->s_bin_attr.buffers);
+	mutex_unlock(&sysfs_bin_lock);
+
 	/* open succeeded, put active references */
 	sysfs_put_active_two(attr_sd);
 	return 0;
@@ -237,11 +443,12 @@ static int open(struct inode * inode, struct file * file)
 
 static int release(struct inode * inode, struct file * file)
 {
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
 	struct bin_buffer *bb = file->private_data;
 
-	if (bb->mmapped)
-		sysfs_put_active_two(attr_sd);
+	mutex_lock(&sysfs_bin_lock);
+	hlist_del(&bb->list);
+	mutex_unlock(&sysfs_bin_lock);
+
 	kfree(bb->buffer);
 	kfree(bb);
 	return 0;
@@ -256,6 +463,26 @@ const struct file_operations bin_fops = {
 	.release	= release,
 };
 
+
+void unmap_bin_file(struct sysfs_dirent *attr_sd)
+{
+	struct bin_buffer *bb;
+	struct hlist_node *tmp;
+
+	if (sysfs_type(attr_sd) != SYSFS_KOBJ_BIN_ATTR)
+		return;
+
+	mutex_lock(&sysfs_bin_lock);
+
+	hlist_for_each_entry(bb, tmp, &attr_sd->s_bin_attr.buffers, list) {
+		struct inode *inode = bb->file->f_path.dentry->d_inode;
+
+		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+	}
+
+	mutex_unlock(&sysfs_bin_lock);
+}
+
 /**
  *	sysfs_create_bin_file - create binary file for object.
  *	@kobj:	object.
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 82d3b79..d88d0fa 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -302,7 +302,7 @@ static void sysfs_d_iput(struct dentry * dentry, struct inode * inode)
 	iput(inode);
 }
 
-static struct dentry_operations sysfs_dentry_ops = {
+static const struct dentry_operations sysfs_dentry_ops = {
 	.d_iput		= sysfs_d_iput,
 };
 
@@ -434,6 +434,26 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 }
 
 /**
+ *	sysfs_pathname - return full path to sysfs dirent
+ *	@sd: sysfs_dirent whose path we want
+ *	@path: caller allocated buffer
+ *
+ *	Gives the name "/" to the sysfs_root entry; any path returned
+ *	is relative to wherever sysfs is mounted.
+ *
+ *	XXX: does no error checking on @path size
+ */
+static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
+{
+	if (sd->s_parent) {
+		sysfs_pathname(sd->s_parent, path);
+		strcat(path, "/");
+	}
+	strcat(path, sd->s_name);
+	return path;
+}
+
+/**
  *	sysfs_add_one - add sysfs_dirent to parent
  *	@acxt: addrm context to use
  *	@sd: sysfs_dirent to be added
@@ -458,8 +478,16 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 	int ret;
 
 	ret = __sysfs_add_one(acxt, sd);
-	WARN(ret == -EEXIST, KERN_WARNING "sysfs: duplicate filename '%s' "
-		       "can not be created\n", sd->s_name);
+	if (ret == -EEXIST) {
+		char *path = kzalloc(PATH_MAX, GFP_KERNEL);
+		WARN(1, KERN_WARNING
+		     "sysfs: cannot create duplicate filename '%s'\n",
+		     (path == NULL) ? sd->s_name :
+		     strcat(strcat(sysfs_pathname(acxt->parent_sd, path), "/"),
+		            sd->s_name));
+		kfree(path);
+	}
+
 	return ret;
 }
 
@@ -581,6 +609,7 @@ void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
 
 		sysfs_drop_dentry(sd);
 		sysfs_deactivate(sd);
+		unmap_bin_file(sd);
 		sysfs_put(sd);
 	}
 }
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1f4a3f8..289c43a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -659,13 +659,16 @@ void sysfs_remove_file_from_group(struct kobject *kobj,
 EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
 
 struct sysfs_schedule_callback_struct {
-	struct kobject 		*kobj;
+	struct list_head	workq_list;
+	struct kobject		*kobj;
 	void			(*func)(void *);
 	void			*data;
 	struct module		*owner;
 	struct work_struct	work;
 };
 
+static DEFINE_MUTEX(sysfs_workq_mutex);
+static LIST_HEAD(sysfs_workq);
 static void sysfs_schedule_callback_work(struct work_struct *work)
 {
 	struct sysfs_schedule_callback_struct *ss = container_of(work,
@@ -674,6 +677,9 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
 	(ss->func)(ss->data);
 	kobject_put(ss->kobj);
 	module_put(ss->owner);
+	mutex_lock(&sysfs_workq_mutex);
+	list_del(&ss->workq_list);
+	mutex_unlock(&sysfs_workq_mutex);
 	kfree(ss);
 }
 
@@ -695,15 +701,25 @@ static void sysfs_schedule_callback_work(struct work_struct *work)
  * until @func returns.
  *
  * Returns 0 if the request was submitted, -ENOMEM if storage could not
- * be allocated, -ENODEV if a reference to @owner isn't available.
+ * be allocated, -ENODEV if a reference to @owner isn't available,
+ * -EAGAIN if a callback has already been scheduled for @kobj.
  */
 int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 		void *data, struct module *owner)
 {
-	struct sysfs_schedule_callback_struct *ss;
+	struct sysfs_schedule_callback_struct *ss, *tmp;
 
 	if (!try_module_get(owner))
 		return -ENODEV;
+
+	mutex_lock(&sysfs_workq_mutex);
+	list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list)
+		if (ss->kobj == kobj) {
+			mutex_unlock(&sysfs_workq_mutex);
+			return -EAGAIN;
+		}
+	mutex_unlock(&sysfs_workq_mutex);
+
 	ss = kmalloc(sizeof(*ss), GFP_KERNEL);
 	if (!ss) {
 		module_put(owner);
@@ -715,6 +731,10 @@ int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *),
 	ss->data = data;
 	ss->owner = owner;
 	INIT_WORK(&ss->work, sysfs_schedule_callback_work);
+	INIT_LIST_HEAD(&ss->workq_list);
+	mutex_lock(&sysfs_workq_mutex);
+	list_add_tail(&ss->workq_list, &sysfs_workq);
+	mutex_unlock(&sysfs_workq_mutex);
 	schedule_work(&ss->work);
 	return 0;
 }
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index dfa3d94..555f0ff 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -147,6 +147,7 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
 {
 	struct bin_attribute *bin_attr;
 
+	inode->i_private = sysfs_get(sd);
 	inode->i_mapping->a_ops = &sysfs_aops;
 	inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
 	inode->i_op = &sysfs_inode_operations;
@@ -214,6 +215,22 @@ struct inode * sysfs_get_inode(struct sysfs_dirent *sd)
 	return inode;
 }
 
+/*
+ * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
+ * To prevent the sysfs inode numbers from being freed prematurely we take a
+ * reference to sysfs_dirent from the sysfs inode.  A
+ * super_operations.delete_inode() implementation is needed to drop that
+ * reference upon inode destruction.
+ */
+void sysfs_delete_inode(struct inode *inode)
+{
+	struct sysfs_dirent *sd  = inode->i_private;
+
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+	sysfs_put(sd);
+}
+
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name)
 {
 	struct sysfs_addrm_cxt acxt;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index ab343e3..4974995 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -17,11 +17,10 @@
 #include <linux/pagemap.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/magic.h>
 
 #include "sysfs.h"
 
-/* Random magic number */
-#define SYSFS_MAGIC 0x62656572
 
 static struct vfsmount *sysfs_mount;
 struct super_block * sysfs_sb = NULL;
@@ -30,6 +29,7 @@ struct kmem_cache *sysfs_dir_cachep;
 static const struct super_operations sysfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
+	.delete_inode	= sysfs_delete_inode,
 };
 
 struct sysfs_dirent sysfs_root = {
@@ -53,7 +53,9 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
 	sysfs_sb = sb;
 
 	/* get root inode, initialize and unlock it */
+	mutex_lock(&sysfs_mutex);
 	inode = sysfs_get_inode(&sysfs_root);
+	mutex_unlock(&sysfs_mutex);
 	if (!inode) {
 		pr_debug("sysfs: could not get root inode\n");
 		return -ENOMEM;
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 93c6d6b..3fa0d984 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -28,6 +28,7 @@ struct sysfs_elem_attr {
 
 struct sysfs_elem_bin_attr {
 	struct bin_attribute	*bin_attr;
+	struct hlist_head	buffers;
 };
 
 /*
@@ -145,6 +146,7 @@ static inline void __sysfs_put(struct sysfs_dirent *sd)
  * inode.c
  */
 struct inode *sysfs_get_inode(struct sysfs_dirent *sd);
+void sysfs_delete_inode(struct inode *inode);
 int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
 int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name);
 int sysfs_inode_init(void);
@@ -163,6 +165,7 @@ int sysfs_add_file_mode(struct sysfs_dirent *dir_sd,
  * bin.c
  */
 extern const struct file_operations bin_fops;
+void unmap_bin_file(struct sysfs_dirent *attr_sd);
 
 /*
  * symlink.c
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index a1f1ef3..33e047b 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -38,7 +38,7 @@ static int sysv_hash(struct dentry *dentry, struct qstr *qstr)
 	return 0;
 }
 
-struct dentry_operations sysv_dentry_operations = {
+const struct dentry_operations sysv_dentry_operations = {
 	.d_hash		= sysv_hash,
 };
 
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 38ebe3f..5784a31 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -170,7 +170,7 @@ extern const struct file_operations sysv_file_operations;
 extern const struct file_operations sysv_dir_operations;
 extern const struct address_space_operations sysv_aops;
 extern const struct super_operations sysv_sops;
-extern struct dentry_operations sysv_dentry_operations;
+extern const struct dentry_operations sysv_dentry_operations;
 
 
 enum {
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 6a123b8..b042bd7 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK);
 
-	if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK))
-		return -EINVAL;
-	if (clockid != CLOCK_MONOTONIC &&
-	    clockid != CLOCK_REALTIME)
+	if ((flags & ~TFD_CREATE_FLAGS) ||
+	    (clockid != CLOCK_MONOTONIC &&
+	     clockid != CLOCK_REALTIME))
 		return -EINVAL;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 	hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS);
 
 	ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
-			       flags & (O_CLOEXEC | O_NONBLOCK));
+			       flags & TFD_SHARED_FCNTL_FLAGS);
 	if (ufd < 0)
 		kfree(ctx);
 
@@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
 		return -EFAULT;
 
-	if (!timespec_valid(&ktmr.it_value) ||
+	if ((flags & ~TFD_SETTIME_FLAGS) ||
+	    !timespec_valid(&ktmr.it_value) ||
 	    !timespec_valid(&ktmr.it_interval))
 		return -EINVAL;
 
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 175f9c5..f393620 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -689,7 +689,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
 }
 
 /**
- * ubifs_get_free_space - return amount of free space.
+ * ubifs_get_free_space_nolock - return amount of free space.
  * @c: UBIFS file-system description object
  *
  * This function calculates amount of free space to report to user-space.
@@ -704,16 +704,14 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
  * traditional file-systems, because they have way less overhead than UBIFS.
  * So, to keep users happy, UBIFS tries to take the overhead into account.
  */
-long long ubifs_get_free_space(struct ubifs_info *c)
+long long ubifs_get_free_space_nolock(struct ubifs_info *c)
 {
-	int min_idx_lebs, rsvd_idx_lebs, lebs;
+	int rsvd_idx_lebs, lebs;
 	long long available, outstanding, free;
 
-	spin_lock(&c->space_lock);
-	min_idx_lebs = c->min_idx_lebs;
-	ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
+	ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	available = ubifs_calc_available(c, min_idx_lebs);
+	available = ubifs_calc_available(c, c->min_idx_lebs);
 
 	/*
 	 * When reporting free space to user-space, UBIFS guarantees that it is
@@ -726,15 +724,14 @@ long long ubifs_get_free_space(struct ubifs_info *c)
 	 * Note, the calculations below are similar to what we have in
 	 * 'do_budget_space()', so refer there for comments.
 	 */
-	if (min_idx_lebs > c->lst.idx_lebs)
-		rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs;
+	if (c->min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
 	else
 		rsvd_idx_lebs = 0;
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
 	       c->lst.taken_empty_lebs;
 	lebs -= rsvd_idx_lebs;
 	available += lebs * (c->dark_wm - c->leb_overhead);
-	spin_unlock(&c->space_lock);
 
 	if (available > outstanding)
 		free = ubifs_reported_space(c, available - outstanding);
@@ -742,3 +739,21 @@ long long ubifs_get_free_space(struct ubifs_info *c)
 		free = 0;
 	return free;
 }
+
+/**
+ * ubifs_get_free_space - return amount of free space.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and retuns amount of free space to report to
+ * user-space.
+ */
+long long ubifs_get_free_space(struct ubifs_info *c)
+{
+	long long free;
+
+	spin_lock(&c->space_lock);
+	free = ubifs_get_free_space_nolock(c);
+	spin_unlock(&c->space_lock);
+
+	return free;
+}
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 792c5a1..e975bd8 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -620,9 +620,11 @@ void dbg_dump_budg(struct ubifs_info *c)
 	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
 	printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
 	       c->gc_lnum, c->ihead_lnum);
-	for (i = 0; i < c->jhead_cnt; i++)
-		printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
-		       c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
+	/* If we are in R/O mode, journal heads do not exist */
+	if (c->jheads)
+		for (i = 0; i < c->jhead_cnt; i++)
+			printk(KERN_DEBUG "\tjhead %d\t LEB %d\n",
+			       c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum);
 	for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
 		bud = rb_entry(rb, struct ubifs_bud, rb);
 		printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
@@ -637,10 +639,7 @@ void dbg_dump_budg(struct ubifs_info *c)
 	/* Print budgeting predictions */
 	available = ubifs_calc_available(c, c->min_idx_lebs);
 	outstanding = c->budg_data_growth + c->budg_dd_growth;
-	if (available > outstanding)
-		free = ubifs_reported_space(c, available - outstanding);
-	else
-		free = 0;
+	free = ubifs_get_free_space_nolock(c);
 	printk(KERN_DEBUG "Budgeting predictions:\n");
 	printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
 	       available, outstanding, free);
@@ -861,6 +860,65 @@ void dbg_dump_index(struct ubifs_info *c)
 }
 
 /**
+ * dbg_save_space_info - save information about flash space.
+ * @c: UBIFS file-system description object
+ *
+ * This function saves information about UBIFS free space, dirty space, etc, in
+ * order to check it later.
+ */
+void dbg_save_space_info(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d = c->dbg;
+
+	ubifs_get_lp_stats(c, &d->saved_lst);
+
+	spin_lock(&c->space_lock);
+	d->saved_free = ubifs_get_free_space_nolock(c);
+	spin_unlock(&c->space_lock);
+}
+
+/**
+ * dbg_check_space_info - check flash space information.
+ * @c: UBIFS file-system description object
+ *
+ * This function compares current flash space information with the information
+ * which was saved when the 'dbg_save_space_info()' function was called.
+ * Returns zero if the information has not changed, and %-EINVAL it it has
+ * changed.
+ */
+int dbg_check_space_info(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d = c->dbg;
+	struct ubifs_lp_stats lst;
+	long long avail, free;
+
+	spin_lock(&c->space_lock);
+	avail = ubifs_calc_available(c, c->min_idx_lebs);
+	spin_unlock(&c->space_lock);
+	free = ubifs_get_free_space(c);
+
+	if (free != d->saved_free) {
+		ubifs_err("free space changed from %lld to %lld",
+			  d->saved_free, free);
+		goto out;
+	}
+
+	return 0;
+
+out:
+	ubifs_msg("saved lprops statistics dump");
+	dbg_dump_lstats(&d->saved_lst);
+	ubifs_get_lp_stats(c, &lst);
+	ubifs_msg("current lprops statistics dump");
+	dbg_dump_lstats(&d->saved_lst);
+	spin_lock(&c->space_lock);
+	dbg_dump_budg(c);
+	spin_unlock(&c->space_lock);
+	dump_stack();
+	return -EINVAL;
+}
+
+/**
  * dbg_check_synced_i_size - check synchronized inode size.
  * @inode: inode to check
  *
@@ -1349,7 +1407,7 @@ int dbg_check_tnc(struct ubifs_info *c, int extra)
  * @c: UBIFS file-system description object
  * @leaf_cb: called for each leaf node
  * @znode_cb: called for each indexing node
- * @priv: private date which is passed to callbacks
+ * @priv: private data which is passed to callbacks
  *
  * This function walks the UBIFS index and calls the @leaf_cb for each leaf
  * node and @znode_cb for each indexing node. Returns zero in case of success
@@ -2409,7 +2467,7 @@ void ubifs_debugging_exit(struct ubifs_info *c)
  * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
  * contain the stuff specific to particular file-system mounts.
  */
-static struct dentry *debugfs_rootdir;
+static struct dentry *dfs_rootdir;
 
 /**
  * dbg_debugfs_init - initialize debugfs file-system.
@@ -2421,9 +2479,9 @@ static struct dentry *debugfs_rootdir;
  */
 int dbg_debugfs_init(void)
 {
-	debugfs_rootdir = debugfs_create_dir("ubifs", NULL);
-	if (IS_ERR(debugfs_rootdir)) {
-		int err = PTR_ERR(debugfs_rootdir);
+	dfs_rootdir = debugfs_create_dir("ubifs", NULL);
+	if (IS_ERR(dfs_rootdir)) {
+		int err = PTR_ERR(dfs_rootdir);
 		ubifs_err("cannot create \"ubifs\" debugfs directory, "
 			  "error %d\n", err);
 		return err;
@@ -2437,7 +2495,7 @@ int dbg_debugfs_init(void)
  */
 void dbg_debugfs_exit(void)
 {
-	debugfs_remove(debugfs_rootdir);
+	debugfs_remove(dfs_rootdir);
 }
 
 static int open_debugfs_file(struct inode *inode, struct file *file)
@@ -2452,13 +2510,13 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
 	struct ubifs_info *c = file->private_data;
 	struct ubifs_debug_info *d = c->dbg;
 
-	if (file->f_path.dentry == d->dump_lprops)
+	if (file->f_path.dentry == d->dfs_dump_lprops)
 		dbg_dump_lprops(c);
-	else if (file->f_path.dentry == d->dump_budg) {
+	else if (file->f_path.dentry == d->dfs_dump_budg) {
 		spin_lock(&c->space_lock);
 		dbg_dump_budg(c);
 		spin_unlock(&c->space_lock);
-	} else if (file->f_path.dentry == d->dump_tnc) {
+	} else if (file->f_path.dentry == d->dfs_dump_tnc) {
 		mutex_lock(&c->tnc_mutex);
 		dbg_dump_tnc(c);
 		mutex_unlock(&c->tnc_mutex);
@@ -2469,7 +2527,7 @@ static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
 	return count;
 }
 
-static const struct file_operations debugfs_fops = {
+static const struct file_operations dfs_fops = {
 	.open = open_debugfs_file,
 	.write = write_debugfs_file,
 	.owner = THIS_MODULE,
@@ -2494,36 +2552,32 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
 	struct dentry *dent;
 	struct ubifs_debug_info *d = c->dbg;
 
-	sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
-	d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name,
-					      debugfs_rootdir);
-	if (IS_ERR(d->debugfs_dir)) {
-		err = PTR_ERR(d->debugfs_dir);
+	sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
+	d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
+	if (IS_ERR(d->dfs_dir)) {
+		err = PTR_ERR(d->dfs_dir);
 		ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
-			  d->debugfs_dir_name, err);
+			  d->dfs_dir_name, err);
 		goto out;
 	}
 
 	fname = "dump_lprops";
-	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
-				   &debugfs_fops);
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
-	d->dump_lprops = dent;
+	d->dfs_dump_lprops = dent;
 
 	fname = "dump_budg";
-	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
-				   &debugfs_fops);
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
-	d->dump_budg = dent;
+	d->dfs_dump_budg = dent;
 
 	fname = "dump_tnc";
-	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
-				   &debugfs_fops);
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
 	if (IS_ERR(dent))
 		goto out_remove;
-	d->dump_tnc = dent;
+	d->dfs_dump_tnc = dent;
 
 	return 0;
 
@@ -2531,7 +2585,7 @@ out_remove:
 	err = PTR_ERR(dent);
 	ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
 		  fname, err);
-	debugfs_remove_recursive(d->debugfs_dir);
+	debugfs_remove_recursive(d->dfs_dir);
 out:
 	return err;
 }
@@ -2542,7 +2596,7 @@ out:
  */
 void dbg_debugfs_exit_fs(struct ubifs_info *c)
 {
-	debugfs_remove_recursive(c->dbg->debugfs_dir);
+	debugfs_remove_recursive(c->dbg->dfs_dir);
 }
 
 #endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 9820d69..c1cd73b 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -41,15 +41,17 @@
  * @chk_lpt_wastage: used by LPT tree size checker
  * @chk_lpt_lebs: used by LPT tree size checker
  * @new_nhead_offs: used by LPT tree size checker
- * @new_ihead_lnum: used by debugging to check ihead_lnum
- * @new_ihead_offs: used by debugging to check ihead_offs
+ * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
+ * @new_ihead_offs: used by debugging to check @c->ihead_offs
  *
- * debugfs_dir_name: name of debugfs directory containing this file-system's
- *                   files
- * debugfs_dir: direntry object of the file-system debugfs directory
- * dump_lprops: "dump lprops" debugfs knob
- * dump_budg: "dump budgeting information" debugfs knob
- * dump_tnc: "dump TNC" debugfs knob
+ * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
+ * @saved_free: saved free space (used by 'dbg_save_space_info()')
+ *
+ * dfs_dir_name: name of debugfs directory containing this file-system's files
+ * dfs_dir: direntry object of the file-system debugfs directory
+ * dfs_dump_lprops: "dump lprops" debugfs knob
+ * dfs_dump_budg: "dump budgeting information" debugfs knob
+ * dfs_dump_tnc: "dump TNC" debugfs knob
  */
 struct ubifs_debug_info {
 	void *buf;
@@ -69,11 +71,14 @@ struct ubifs_debug_info {
 	int new_ihead_lnum;
 	int new_ihead_offs;
 
-	char debugfs_dir_name[100];
-	struct dentry *debugfs_dir;
-	struct dentry *dump_lprops;
-	struct dentry *dump_budg;
-	struct dentry *dump_tnc;
+	struct ubifs_lp_stats saved_lst;
+	long long saved_free;
+
+	char dfs_dir_name[100];
+	struct dentry *dfs_dir;
+	struct dentry *dfs_dump_lprops;
+	struct dentry *dfs_dump_budg;
+	struct dentry *dfs_dump_tnc;
 };
 
 #define ubifs_assert(expr) do {                                                \
@@ -297,7 +302,8 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
 		   dbg_znode_callback znode_cb, void *priv);
 
 /* Checking functions */
-
+void dbg_save_space_info(struct ubifs_info *c);
+int dbg_check_space_info(struct ubifs_info *c);
 int dbg_check_lprops(struct ubifs_info *c);
 int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot);
@@ -439,6 +445,8 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
 
 #define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
 #define dbg_old_index_check_init(c, zroot)         0
+#define dbg_save_space_info(c)                     ({})
+#define dbg_check_space_info(c)                    0
 #define dbg_check_old_index(c, zroot)              0
 #define dbg_check_cats(c)                          0
 #define dbg_check_ltab(c)                          0
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index f448ab1..f55d523 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -482,30 +482,29 @@ static int ubifs_dir_release(struct inode *dir, struct file *file)
 }
 
 /**
- * lock_2_inodes - lock two UBIFS inodes.
+ * lock_2_inodes - a wrapper for locking two UBIFS inodes.
  * @inode1: first inode
  * @inode2: second inode
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
  */
 static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
 {
-	if (inode1->i_ino < inode2->i_ino) {
-		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_2);
-		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_3);
-	} else {
-		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
-		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_3);
-	}
+	mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+	mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
 }
 
 /**
- * unlock_2_inodes - unlock two UBIFS inodes inodes.
+ * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
  * @inode1: first inode
  * @inode2: second inode
  */
 static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
 {
-	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 	mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 }
 
 static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
@@ -527,6 +526,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
 		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
@@ -580,6 +581,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 	dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
 		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
@@ -667,7 +670,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
 		dentry->d_name.name, inode->i_ino, dir->i_ino);
-
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 	err = check_dir_empty(c, dentry->d_inode);
 	if (err)
 		return err;
@@ -922,59 +926,30 @@ out_budg:
 }
 
 /**
- * lock_3_inodes - lock three UBIFS inodes for rename.
+ * lock_3_inodes - a wrapper for locking three UBIFS inodes.
  * @inode1: first inode
  * @inode2: second inode
  * @inode3: third inode
  *
- * For 'ubifs_rename()', @inode1 may be the same as @inode2 whereas @inode3 may
- * be null.
+ * This function is used for 'ubifs_rename()' and @inode1 may be the same as
+ * @inode2 whereas @inode3 may be %NULL.
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
  */
 static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
 			  struct inode *inode3)
 {
-	struct inode *i1, *i2, *i3;
-
-	if (!inode3) {
-		if (inode1 != inode2) {
-			lock_2_inodes(inode1, inode2);
-			return;
-		}
-		mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
-		return;
-	}
-
-	if (inode1 == inode2) {
-		lock_2_inodes(inode1, inode3);
-		return;
-	}
-
-	/* 3 different inodes */
-	if (inode1 < inode2) {
-		i3 = inode2;
-		if (inode1 < inode3) {
-			i1 = inode1;
-			i2 = inode3;
-		} else {
-			i1 = inode3;
-			i2 = inode1;
-		}
-	} else {
-		i3 = inode1;
-		if (inode2 < inode3) {
-			i1 = inode2;
-			i2 = inode3;
-		} else {
-			i1 = inode3;
-			i2 = inode2;
-		}
-	}
-	mutex_lock_nested(&ubifs_inode(i1)->ui_mutex, WB_MUTEX_1);
-	lock_2_inodes(i2, i3);
+	mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+	if (inode2 != inode1)
+		mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+	if (inode3)
+		mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3);
 }
 
 /**
- * unlock_3_inodes - unlock three UBIFS inodes for rename.
+ * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename.
  * @inode1: first inode
  * @inode2: second inode
  * @inode3: third inode
@@ -982,11 +957,11 @@ static void lock_3_inodes(struct inode *inode1, struct inode *inode2,
 static void unlock_3_inodes(struct inode *inode1, struct inode *inode2,
 			    struct inode *inode3)
 {
-	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
-	if (inode1 != inode2)
-		mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
 	if (inode3)
 		mutex_unlock(&ubifs_inode(inode3)->ui_mutex);
+	if (inode1 != inode2)
+		mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
 }
 
 static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -1020,6 +995,11 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		"dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
 		old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
 		new_dentry->d_name.name, new_dir->i_ino);
+	ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
+	if (unlink)
+		ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
+
 
 	if (unlink && is_dir) {
 		err = check_dir_empty(c, new_inode);
@@ -1199,7 +1179,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	return 0;
 }
 
-struct inode_operations ubifs_dir_inode_operations = {
+const struct inode_operations ubifs_dir_inode_operations = {
 	.lookup      = ubifs_lookup,
 	.create      = ubifs_create,
 	.link        = ubifs_link,
@@ -1219,7 +1199,7 @@ struct inode_operations ubifs_dir_inode_operations = {
 #endif
 };
 
-struct file_operations ubifs_dir_operations = {
+const struct file_operations ubifs_dir_operations = {
 	.llseek         = ubifs_dir_llseek,
 	.release        = ubifs_dir_release,
 	.read           = generic_read_dir,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index bf37374..93b6de5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -432,7 +432,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
 	struct page *page;
 
-
 	ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
 
 	if (unlikely(c->ro_media))
@@ -1541,7 +1540,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-struct address_space_operations ubifs_file_address_operations = {
+const struct address_space_operations ubifs_file_address_operations = {
 	.readpage       = ubifs_readpage,
 	.writepage      = ubifs_writepage,
 	.write_begin    = ubifs_write_begin,
@@ -1551,7 +1550,7 @@ struct address_space_operations ubifs_file_address_operations = {
 	.releasepage    = ubifs_releasepage,
 };
 
-struct inode_operations ubifs_file_inode_operations = {
+const struct inode_operations ubifs_file_inode_operations = {
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
 #ifdef CONFIG_UBIFS_FS_XATTR
@@ -1562,14 +1561,14 @@ struct inode_operations ubifs_file_inode_operations = {
 #endif
 };
 
-struct inode_operations ubifs_symlink_inode_operations = {
+const struct inode_operations ubifs_symlink_inode_operations = {
 	.readlink    = generic_readlink,
 	.follow_link = ubifs_follow_link,
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
 };
 
-struct file_operations ubifs_file_operations = {
+const struct file_operations ubifs_file_operations = {
 	.llseek         = generic_file_llseek,
 	.read           = do_sync_read,
 	.write          = do_sync_write,
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9832f9a..a711d33 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -31,6 +31,26 @@
  * to be reused. Garbage collection will cause the number of dirty index nodes
  * to grow, however sufficient space is reserved for the index to ensure the
  * commit will never run out of space.
+ *
+ * Notes about dead watermark. At current UBIFS implementation we assume that
+ * LEBs which have less than @c->dead_wm bytes of free + dirty space are full
+ * and not worth garbage-collecting. The dead watermark is one min. I/O unit
+ * size, or min. UBIFS node size, depending on what is greater. Indeed, UBIFS
+ * Garbage Collector has to synchronize the GC head's write buffer before
+ * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
+ * actually reclaim even very small pieces of dirty space by garbage collecting
+ * enough dirty LEBs, but we do not bother doing this at this implementation.
+ *
+ * Notes about dark watermark. The results of GC work depends on how big are
+ * the UBIFS nodes GC deals with. Large nodes make GC waste more space. Indeed,
+ * if GC move data from LEB A to LEB B and nodes in LEB A are large, GC would
+ * have to waste large pieces of free space at the end of LEB B, because nodes
+ * from LEB A would not fit. And the worst situation is when all nodes are of
+ * maximum size. So dark watermark is the amount of free + dirty space in LEB
+ * which are guaranteed to be reclaimable. If LEB has less space, the GC migh
+ * be unable to reclaim it. So, LEBs with free + dirty greater than dark
+ * watermark are "good" LEBs from GC's point of few. The other LEBs are not so
+ * good, and GC takes extra care when moving them.
  */
 
 #include <linux/pagemap.h>
@@ -381,7 +401,7 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
 
 		/*
 		 * Don't release the LEB until after the next commit, because
-		 * it may contain date which is needed for recovery. So
+		 * it may contain data which is needed for recovery. So
 		 * although we freed this LEB, it will become usable only after
 		 * the commit.
 		 */
@@ -810,8 +830,9 @@ out:
  * ubifs_destroy_idx_gc - destroy idx_gc list.
  * @c: UBIFS file-system description object
  *
- * This function destroys the idx_gc list. It is called when unmounting or
- * remounting read-only so locks are not needed.
+ * This function destroys the @c->idx_gc list. It is called when unmounting
+ * so locks are not needed. Returns zero in case of success and a negative
+ * error code in case of failure.
  */
 void ubifs_destroy_idx_gc(struct ubifs_info *c)
 {
@@ -824,7 +845,6 @@ void ubifs_destroy_idx_gc(struct ubifs_info *c)
 		list_del(&idx_gc->list);
 		kfree(idx_gc);
 	}
-
 }
 
 /**
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 0168271..e8e632a 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -29,7 +29,7 @@
  * would have been wasted for padding to the nearest minimal I/O unit boundary.
  * Instead, data first goes to the write-buffer and is flushed when the
  * buffer is full or when it is not used for some time (by timer). This is
- * similarto the mechanism is used by JFFS2.
+ * similar to the mechanism is used by JFFS2.
  *
  * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
  * mutexes defined inside these objects. Since sometimes upper-level code
@@ -75,7 +75,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
  * @lnum: logical eraseblock number
  * @offs: offset within the logical eraseblock
  * @quiet: print no messages
- * @chk_crc: indicates whether to always check the CRC
+ * @must_chk_crc: indicates whether to always check the CRC
  *
  * This function checks node magic number and CRC checksum. This function also
  * validates node length to prevent UBIFS from becoming crazy when an attacker
@@ -83,11 +83,17 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
  * node length in the common header could cause UBIFS to read memory outside of
  * allocated buffer when checking the CRC checksum.
  *
- * This function returns zero in case of success %-EUCLEAN in case of bad CRC
- * or magic.
+ * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
+ * true, which is controlled by corresponding UBIFS mount option. However, if
+ * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
+ * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is
+ * ignored and CRC is checked.
+ *
+ * This function returns zero in case of success and %-EUCLEAN in case of bad
+ * CRC or magic.
  */
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet, int chk_crc)
+		     int offs, int quiet, int must_chk_crc)
 {
 	int err = -EINVAL, type, node_len;
 	uint32_t crc, node_crc, magic;
@@ -123,9 +129,9 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		   node_len > c->ranges[type].max_len)
 		goto out_len;
 
-	if (!chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc)
-		if (c->no_chk_data_crc)
-			return 0;
+	if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc &&
+	     c->no_chk_data_crc)
+		return 0;
 
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 9b7c54e..a11ca09 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -208,7 +208,7 @@ again:
 	offs = 0;
 
 out:
-	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, UBI_SHORTTERM);
+	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
 	if (err)
 		goto out_unlock;
 
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index dfd2bce..4cdd284 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -635,10 +635,10 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
  * @c: UBIFS file-system description object
  * @st: return statistics
  */
-void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *st)
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst)
 {
 	spin_lock(&c->space_lock);
-	memcpy(st, &c->lst, sizeof(struct ubifs_lp_stats));
+	memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats));
 	spin_unlock(&c->space_lock);
 }
 
@@ -678,6 +678,9 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
 
 out:
 	ubifs_release_lprops(c);
+	if (err)
+		ubifs_err("cannot change properties of LEB %d, error %d",
+			  lnum, err);
 	return err;
 }
 
@@ -714,6 +717,9 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty,
 
 out:
 	ubifs_release_lprops(c);
+	if (err)
+		ubifs_err("cannot update properties of LEB %d, error %d",
+			  lnum, err);
 	return err;
 }
 
@@ -737,6 +743,8 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp)
 	lpp = ubifs_lpt_lookup(c, lnum);
 	if (IS_ERR(lpp)) {
 		err = PTR_ERR(lpp);
+		ubifs_err("cannot read properties of LEB %d, error %d",
+			  lnum, err);
 		goto out;
 	}
 
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 96ca957..3216a1f 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -556,23 +556,23 @@ no_space:
 }
 
 /**
- * next_pnode - find next pnode.
+ * next_pnode_to_dirty - find next pnode to dirty.
  * @c: UBIFS file-system description object
  * @pnode: pnode
  *
- * This function returns the next pnode or %NULL if there are no more pnodes.
+ * This function returns the next pnode to dirty or %NULL if there are no more
+ * pnodes.  Note that pnodes that have never been written (lnum == 0) are
+ * skipped.
  */
-static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
-				      struct ubifs_pnode *pnode)
+static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
+					       struct ubifs_pnode *pnode)
 {
 	struct ubifs_nnode *nnode;
 	int iip;
 
 	/* Try to go right */
 	nnode = pnode->parent;
-	iip = pnode->iip + 1;
-	if (iip < UBIFS_LPT_FANOUT) {
-		/* We assume here that LEB zero is never an LPT LEB */
+	for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
 		if (nnode->nbranch[iip].lnum)
 			return ubifs_get_pnode(c, nnode, iip);
 	}
@@ -583,8 +583,11 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
 		nnode = nnode->parent;
 		if (!nnode)
 			return NULL;
-		/* We assume here that LEB zero is never an LPT LEB */
-	} while (iip >= UBIFS_LPT_FANOUT || !nnode->nbranch[iip].lnum);
+		for (; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+       } while (iip >= UBIFS_LPT_FANOUT);
 
 	/* Go right */
 	nnode = ubifs_get_nnode(c, nnode, iip);
@@ -593,12 +596,29 @@ static struct ubifs_pnode *next_pnode(struct ubifs_info *c,
 
 	/* Go down to level 1 */
 	while (nnode->level > 1) {
-		nnode = ubifs_get_nnode(c, nnode, 0);
+		for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+		if (iip >= UBIFS_LPT_FANOUT) {
+			/*
+			 * Should not happen, but we need to keep going
+			 * if it does.
+			 */
+			iip = 0;
+		}
+		nnode = ubifs_get_nnode(c, nnode, iip);
 		if (IS_ERR(nnode))
 			return (void *)nnode;
 	}
 
-	return ubifs_get_pnode(c, nnode, 0);
+	for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
+		if (nnode->nbranch[iip].lnum)
+			break;
+	if (iip >= UBIFS_LPT_FANOUT)
+		/* Should not happen, but we need to keep going if it does */
+		iip = 0;
+	return ubifs_get_pnode(c, nnode, iip);
 }
 
 /**
@@ -688,7 +708,7 @@ static int make_tree_dirty(struct ubifs_info *c)
 	pnode = pnode_lookup(c, 0);
 	while (pnode) {
 		do_make_pnode_dirty(c, pnode);
-		pnode = next_pnode(c, pnode);
+		pnode = next_pnode_to_dirty(c, pnode);
 		if (IS_ERR(pnode))
 			return PTR_ERR(pnode);
 	}
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index 71d5493..a88f338 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -354,7 +354,7 @@ int ubifs_write_master(struct ubifs_info *c)
 	int err, lnum, offs, len;
 
 	if (c->ro_media)
-		return -EINVAL;
+		return -EROFS;
 
 	lnum = UBIFS_MST_LNUM;
 	offs = c->mst_offs + c->mst_node_alsz;
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9e6f403..152a7b3 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -46,7 +46,7 @@
  * Orphans are accumulated in a rb-tree. When an inode's link count drops to
  * zero, the inode number is added to the rb-tree. It is removed from the tree
  * when the inode is deleted.  Any new orphans that are in the orphan tree when
- * the commit is run, are written to the orphan area in 1 or more orph nodes.
+ * the commit is run, are written to the orphan area in 1 or more orphan nodes.
  * If the orphan area is full, it is consolidated to make space.  There is
  * always enough space because validation prevents the user from creating more
  * than the maximum number of orphans allowed.
@@ -231,7 +231,7 @@ static int tot_avail_orphs(struct ubifs_info *c)
 }
 
 /**
- * do_write_orph_node - write a node
+ * do_write_orph_node - write a node to the orphan head.
  * @c: UBIFS file-system description object
  * @len: length of node
  * @atomic: write atomically
@@ -264,11 +264,11 @@ static int do_write_orph_node(struct ubifs_info *c, int len, int atomic)
 }
 
 /**
- * write_orph_node - write an orph node
+ * write_orph_node - write an orphan node.
  * @c: UBIFS file-system description object
  * @atomic: write atomically
  *
- * This function builds an orph node from the cnext list and writes it to the
+ * This function builds an orphan node from the cnext list and writes it to the
  * orphan head. On success, %0 is returned, otherwise a negative error code
  * is returned.
  */
@@ -326,11 +326,11 @@ static int write_orph_node(struct ubifs_info *c, int atomic)
 }
 
 /**
- * write_orph_nodes - write orph nodes until there are no more to commit
+ * write_orph_nodes - write orphan nodes until there are no more to commit.
  * @c: UBIFS file-system description object
  * @atomic: write atomically
  *
- * This function writes orph nodes for all the orphans to commit. On success,
+ * This function writes orphan nodes for all the orphans to commit. On success,
  * %0 is returned, otherwise a negative error code is returned.
  */
 static int write_orph_nodes(struct ubifs_info *c, int atomic)
@@ -478,14 +478,14 @@ int ubifs_orphan_end_commit(struct ubifs_info *c)
 }
 
 /**
- * clear_orphans - erase all LEBs used for orphans.
+ * ubifs_clear_orphans - erase all LEBs used for orphans.
  * @c: UBIFS file-system description object
  *
  * If recovery is not required, then the orphans from the previous session
  * are not needed. This function locates the LEBs used to record
  * orphans, and un-maps them.
  */
-static int clear_orphans(struct ubifs_info *c)
+int ubifs_clear_orphans(struct ubifs_info *c)
 {
 	int lnum, err;
 
@@ -547,9 +547,9 @@ static int insert_dead_orphan(struct ubifs_info *c, ino_t inum)
  * do_kill_orphans - remove orphan inodes from the index.
  * @c: UBIFS file-system description object
  * @sleb: scanned LEB
- * @last_cmt_no: cmt_no of last orph node read is passed and returned here
+ * @last_cmt_no: cmt_no of last orphan node read is passed and returned here
  * @outofdate: whether the LEB is out of date is returned here
- * @last_flagged: whether the end orph node is encountered
+ * @last_flagged: whether the end orphan node is encountered
  *
  * This function is a helper to the 'kill_orphans()' function. It goes through
  * every orphan node in a LEB and for every inode number recorded, removes
@@ -580,8 +580,8 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 		/*
 		 * The commit number on the master node may be less, because
 		 * of a failed commit. If there are several failed commits in a
-		 * row, the commit number written on orph nodes will continue to
-		 * increase (because the commit number is adjusted here) even
+		 * row, the commit number written on orphan nodes will continue
+		 * to increase (because the commit number is adjusted here) even
 		 * though the commit number on the master node stays the same
 		 * because the master node has not been re-written.
 		 */
@@ -589,9 +589,9 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
 			c->cmt_no = cmt_no;
 		if (cmt_no < *last_cmt_no && *last_flagged) {
 			/*
-			 * The last orph node had a higher commit number and was
-			 * flagged as the last written for that commit number.
-			 * That makes this orph node, out of date.
+			 * The last orphan node had a higher commit number and
+			 * was flagged as the last written for that commit
+			 * number. That makes this orphan node, out of date.
 			 */
 			if (!first) {
 				ubifs_err("out of order commit number %llu in "
@@ -658,10 +658,10 @@ static int kill_orphans(struct ubifs_info *c)
 	/*
 	 * Orph nodes always start at c->orph_first and are written to each
 	 * successive LEB in turn. Generally unused LEBs will have been unmapped
-	 * but may contain out of date orph nodes if the unmap didn't go
-	 * through. In addition, the last orph node written for each commit is
+	 * but may contain out of date orphan nodes if the unmap didn't go
+	 * through. In addition, the last orphan node written for each commit is
 	 * marked (top bit of orph->cmt_no is set to 1). It is possible that
-	 * there are orph nodes from the next commit (i.e. the commit did not
+	 * there are orphan nodes from the next commit (i.e. the commit did not
 	 * complete successfully). In that case, no orphans will have been lost
 	 * due to the way that orphans are written, and any orphans added will
 	 * be valid orphans anyway and so can be deleted.
@@ -718,7 +718,7 @@ int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only)
 	if (unclean)
 		err = kill_orphans(c);
 	else if (!read_only)
-		err = clear_orphans(c);
+		err = ubifs_clear_orphans(c);
 
 	return err;
 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 89556ee..c5c9835 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -397,6 +397,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_namelen = UBIFS_MAX_NLEN;
 	buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]);
 	buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]);
+	ubifs_assert(buf->f_bfree <= c->block_cnt);
 	return 0;
 }
 
@@ -432,33 +433,24 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	int i, err;
 	struct ubifs_info *c = sb->s_fs_info;
 	struct writeback_control wbc = {
-		.sync_mode   = wait ? WB_SYNC_ALL : WB_SYNC_NONE,
+		.sync_mode   = WB_SYNC_ALL,
 		.range_start = 0,
 		.range_end   = LLONG_MAX,
 		.nr_to_write = LONG_MAX,
 	};
 
 	/*
-	 * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an
-	 * advisory thing to help the file system shove lots of data into the
-	 * queues. If some gets missed then it'll be picked up on the second
+	 * Zero @wait is just an advisory thing to help the file system shove
+	 * lots of data into the queues, and there will be the second
 	 * '->sync_fs()' call, with non-zero @wait.
 	 */
+	if (!wait)
+		return 0;
 
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
 	/*
-	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
-	 * do this if it waits for an already running commit.
-	 */
-	for (i = 0; i < c->jhead_cnt; i++) {
-		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
-		if (err)
-			return err;
-	}
-
-	/*
 	 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
 	 * pages, so synchronize them first, then commit the journal. Strictly
 	 * speaking, it is not necessary to commit the journal here,
@@ -469,6 +461,16 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	 */
 	generic_sync_sb_inodes(sb, &wbc);
 
+	/*
+	 * Synchronize write buffers, because 'ubifs_run_commit()' does not
+	 * do this if it waits for an already running commit.
+	 */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			return err;
+	}
+
 	err = ubifs_run_commit(c);
 	if (err)
 		return err;
@@ -572,15 +574,8 @@ static int init_constants_early(struct ubifs_info *c)
 	c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX;
 
 	/*
-	 * Initialize dead and dark LEB space watermarks.
-	 *
-	 * Dead space is the space which cannot be used. Its watermark is
-	 * equivalent to min. I/O unit or minimum node size if it is greater
-	 * then min. I/O unit.
-	 *
-	 * Dark space is the space which might be used, or might not, depending
-	 * on which node should be written to the LEB. Its watermark is
-	 * equivalent to maximum UBIFS node size.
+	 * Initialize dead and dark LEB space watermarks. See gc.c for comments
+	 * about these values.
 	 */
 	c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size);
 	c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size);
@@ -741,12 +736,12 @@ static void init_constants_master(struct ubifs_info *c)
  * take_gc_lnum - reserve GC LEB.
  * @c: UBIFS file-system description object
  *
- * This function ensures that the LEB reserved for garbage collection is
- * unmapped and is marked as "taken" in lprops. We also have to set free space
- * to LEB size and dirty space to zero, because lprops may contain out-of-date
- * information if the file-system was un-mounted before it has been committed.
- * This function returns zero in case of success and a negative error code in
- * case of failure.
+ * This function ensures that the LEB reserved for garbage collection is marked
+ * as "taken" in lprops. We also have to set free space to LEB size and dirty
+ * space to zero, because lprops may contain out-of-date information if the
+ * file-system was un-mounted before it has been committed. This function
+ * returns zero in case of success and a negative error code in case of
+ * failure.
  */
 static int take_gc_lnum(struct ubifs_info *c)
 {
@@ -757,10 +752,6 @@ static int take_gc_lnum(struct ubifs_info *c)
 		return -EINVAL;
 	}
 
-	err = ubifs_leb_unmap(c, c->gc_lnum);
-	if (err)
-		return err;
-
 	/* And we have to tell lprops that this LEB is taken */
 	err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0,
 				  LPROPS_TAKEN, 0, 0);
@@ -966,13 +957,16 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
 
 		token = match_token(p, tokens, args);
 		switch (token) {
+		/*
+		 * %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
+		 * We accepte them in order to be backware-compatible. But this
+		 * should be removed at some point.
+		 */
 		case Opt_fast_unmount:
 			c->mount_opts.unmount_mode = 2;
-			c->fast_unmount = 1;
 			break;
 		case Opt_norm_unmount:
 			c->mount_opts.unmount_mode = 1;
-			c->fast_unmount = 0;
 			break;
 		case Opt_bulk_read:
 			c->mount_opts.bulk_read = 2;
@@ -1094,12 +1088,7 @@ static int check_free_space(struct ubifs_info *c)
 		ubifs_err("insufficient free space to mount in read/write mode");
 		dbg_dump_budg(c);
 		dbg_dump_lprops(c);
-		/*
-		 * We return %-EINVAL instead of %-ENOSPC because it seems to
-		 * be the closest error code mentioned in the mount function
-		 * documentation.
-		 */
-		return -EINVAL;
+		return -ENOSPC;
 	}
 	return 0;
 }
@@ -1286,10 +1275,19 @@ static int mount_ubifs(struct ubifs_info *c)
 			if (err)
 				goto out_orphans;
 			err = ubifs_rcvry_gc_commit(c);
-		} else
+		} else {
 			err = take_gc_lnum(c);
-		if (err)
-			goto out_orphans;
+			if (err)
+				goto out_orphans;
+
+			/*
+			 * GC LEB may contain garbage if there was an unclean
+			 * reboot, and it should be un-mapped.
+			 */
+			err = ubifs_leb_unmap(c, c->gc_lnum);
+			if (err)
+				return err;
+		}
 
 		err = dbg_check_lprops(c);
 		if (err)
@@ -1298,6 +1296,16 @@ static int mount_ubifs(struct ubifs_info *c)
 		err = ubifs_recover_size(c);
 		if (err)
 			goto out_orphans;
+	} else {
+		/*
+		 * Even if we mount read-only, we have to set space in GC LEB
+		 * to proper value because this affects UBIFS free space
+		 * reporting. We do not want to have a situation when
+		 * re-mounting from R/O to R/W changes amount of free space.
+		 */
+		err = take_gc_lnum(c);
+		if (err)
+			goto out_orphans;
 	}
 
 	spin_lock(&ubifs_infos_lock);
@@ -1310,14 +1318,17 @@ static int mount_ubifs(struct ubifs_info *c)
 		else {
 			c->need_recovery = 0;
 			ubifs_msg("recovery completed");
+			/* GC LEB has to be empty and taken at this point */
+			ubifs_assert(c->lst.taken_empty_lebs == 1);
 		}
-	}
+	} else
+		ubifs_assert(c->lst.taken_empty_lebs == 1);
 
-	err = dbg_debugfs_init_fs(c);
+	err = dbg_check_filesystem(c);
 	if (err)
 		goto out_infos;
 
-	err = dbg_check_filesystem(c);
+	err = dbg_debugfs_init_fs(c);
 	if (err)
 		goto out_infos;
 
@@ -1351,7 +1362,6 @@ static int mount_ubifs(struct ubifs_info *c)
 	       c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],
 	       c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],
 	       c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);
-	dbg_msg("fast unmount:        %d", c->fast_unmount);
 	dbg_msg("big_lpt              %d", c->big_lpt);
 	dbg_msg("log LEBs:            %d (%d - %d)",
 		c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
@@ -1475,10 +1485,8 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 {
 	int err, lnum;
 
-	if (c->ro_media)
-		return -EINVAL;
-
 	mutex_lock(&c->umount_mutex);
+	dbg_save_space_info(c);
 	c->remounting_rw = 1;
 	c->always_chk_crc = 1;
 
@@ -1514,6 +1522,12 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 		err = ubifs_recover_inl_heads(c, c->sbuf);
 		if (err)
 			goto out;
+	} else {
+		/* A readonly mount is not allowed to have orphans */
+		ubifs_assert(c->tot_orphans == 0);
+		err = ubifs_clear_orphans(c);
+		if (err)
+			goto out;
 	}
 
 	if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) {
@@ -1569,7 +1583,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	if (c->need_recovery)
 		err = ubifs_rcvry_gc_commit(c);
 	else
-		err = take_gc_lnum(c);
+		err = ubifs_leb_unmap(c, c->gc_lnum);
 	if (err)
 		goto out;
 
@@ -1582,8 +1596,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	c->vfs_sb->s_flags &= ~MS_RDONLY;
 	c->remounting_rw = 0;
 	c->always_chk_crc = 0;
+	err = dbg_check_space_info(c);
 	mutex_unlock(&c->umount_mutex);
-	return 0;
+	return err;
 
 out:
 	vfree(c->orph_buf);
@@ -1603,43 +1618,18 @@ out:
 }
 
 /**
- * commit_on_unmount - commit the journal when un-mounting.
- * @c: UBIFS file-system description object
- *
- * This function is called during un-mounting and re-mounting, and it commits
- * the journal unless the "fast unmount" mode is enabled.
- */
-static void commit_on_unmount(struct ubifs_info *c)
-{
-	struct super_block *sb = c->vfs_sb;
-	long long bud_bytes;
-
-	/*
-	 * This function is called before the background thread is stopped, so
-	 * we may race with ongoing commit, which means we have to take
-	 * @c->bud_lock to access @c->bud_bytes.
-	 */
-	spin_lock(&c->buds_lock);
-	bud_bytes = c->bud_bytes;
-	spin_unlock(&c->buds_lock);
-
-	if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
-		ubifs_run_commit(c);
-}
-
-/**
  * ubifs_remount_ro - re-mount in read-only mode.
  * @c: UBIFS file-system description object
  *
- * We rely on VFS to have stopped writing. Possibly the background thread could
- * be running a commit, however kthread_stop will wait in that case.
+ * We assume VFS has stopped writing. Possibly the background thread could be
+ * running a commit, however kthread_stop will wait in that case.
  */
 static void ubifs_remount_ro(struct ubifs_info *c)
 {
 	int i, err;
 
 	ubifs_assert(!c->need_recovery);
-	commit_on_unmount(c);
+	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
 
 	mutex_lock(&c->umount_mutex);
 	if (c->bgt) {
@@ -1647,27 +1637,29 @@ static void ubifs_remount_ro(struct ubifs_info *c)
 		c->bgt = NULL;
 	}
 
+	dbg_save_space_info(c);
+
 	for (i = 0; i < c->jhead_cnt; i++) {
 		ubifs_wbuf_sync(&c->jheads[i].wbuf);
 		del_timer_sync(&c->jheads[i].wbuf.timer);
 	}
 
-	if (!c->ro_media) {
-		c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
-		c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
-		c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
-		err = ubifs_write_master(c);
-		if (err)
-			ubifs_ro_mode(c, err);
-	}
+	c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY);
+	c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
+	c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum);
+	err = ubifs_write_master(c);
+	if (err)
+		ubifs_ro_mode(c, err);
 
-	ubifs_destroy_idx_gc(c);
 	free_wbufs(c);
 	vfree(c->orph_buf);
 	c->orph_buf = NULL;
 	vfree(c->ileb_buf);
 	c->ileb_buf = NULL;
 	ubifs_lpt_free(c, 1);
+	err = dbg_check_space_info(c);
+	if (err)
+		ubifs_ro_mode(c, err);
 	mutex_unlock(&c->umount_mutex);
 }
 
@@ -1760,11 +1752,20 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
+		if (c->ro_media) {
+			ubifs_msg("cannot re-mount due to prior errors");
+			return -EROFS;
+		}
 		err = ubifs_remount_rw(c);
 		if (err)
 			return err;
-	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY))
+	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
+		if (c->ro_media) {
+			ubifs_msg("cannot re-mount due to prior errors");
+			return -EROFS;
+		}
 		ubifs_remount_ro(c);
+	}
 
 	if (c->bulk_read == 1)
 		bu_init(c);
@@ -1774,10 +1775,11 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		c->bu.buf = NULL;
 	}
 
+	ubifs_assert(c->lst.taken_empty_lebs == 1);
 	return 0;
 }
 
-struct super_operations ubifs_super_operations = {
+const struct super_operations ubifs_super_operations = {
 	.alloc_inode   = ubifs_alloc_inode,
 	.destroy_inode = ubifs_destroy_inode,
 	.put_super     = ubifs_put_super,
@@ -2032,7 +2034,8 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	/* 'fill_super()' opens ubi again so we must close it here */
 	ubi_close_volume(ubi);
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 
 out_deact:
 	up_write(&sb->s_umount);
@@ -2044,15 +2047,6 @@ out_close:
 
 static void ubifs_kill_sb(struct super_block *sb)
 {
-	struct ubifs_info *c = sb->s_fs_info;
-
-	/*
-	 * We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
-	 * in order to be outside BKL.
-	 */
-	if (sb->s_root)
-		commit_on_unmount(c);
-	/* The un-mount routine is actually done in put_super() */
 	generic_shutdown_super(sb);
 }
 
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f7e36f5..fa28a84 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -443,6 +443,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
  * This function performs that same function as ubifs_read_node except that
  * it does not require that there is actually a node present and instead
  * the return code indicates if a node was read.
+ *
+ * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
+ * is true (it is controlled by corresponding mount option). However, if
+ * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always
+ * checked.
  */
 static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 			 int len, int lnum, int offs)
@@ -470,9 +475,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
 	if (node_len != len)
 		return 0;
 
-	if (type == UBIFS_DATA_NODE && !c->always_chk_crc)
-		if (c->no_chk_data_crc)
-			return 0;
+	if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc)
+		return 1;
 
 	crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
 	node_crc = le32_to_cpu(ch->crc);
@@ -1506,7 +1510,7 @@ out:
  *
  * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function
  * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares
- * maxumum possible amount of nodes for bulk-read.
+ * maximum possible amount of nodes for bulk-read.
  */
 int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
 {
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index fc2a4cc..039a68b 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -426,9 +426,9 @@ struct ubifs_unclean_leb {
  * LEB properties flags.
  *
  * LPROPS_UNCAT: not categorized
- * LPROPS_DIRTY: dirty > 0, not index
+ * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index
  * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sze and index
- * LPROPS_FREE: free > 0, not empty, not index
+ * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index
  * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
  * LPROPS_EMPTY: LEB is empty, not taken
  * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken
@@ -961,7 +961,6 @@ struct ubifs_debug_info;
  * @cs_lock: commit state lock
  * @cmt_wq: wait queue to sleep on if the log is full and a commit is running
  *
- * @fast_unmount: do not run journal commit before un-mounting
  * @big_lpt: flag that LPT is too big to write whole during commit
  * @no_chk_data_crc: do not check CRCs when reading data nodes (except during
  *                   recovery)
@@ -1202,7 +1201,6 @@ struct ubifs_info {
 	spinlock_t cs_lock;
 	wait_queue_head_t cmt_wq;
 
-	unsigned int fast_unmount:1;
 	unsigned int big_lpt:1;
 	unsigned int no_chk_data_crc:1;
 	unsigned int bulk_read:1;
@@ -1405,13 +1403,13 @@ extern struct list_head ubifs_infos;
 extern spinlock_t ubifs_infos_lock;
 extern atomic_long_t ubifs_clean_zn_cnt;
 extern struct kmem_cache *ubifs_inode_slab;
-extern struct super_operations ubifs_super_operations;
-extern struct address_space_operations ubifs_file_address_operations;
-extern struct file_operations ubifs_file_operations;
-extern struct inode_operations ubifs_file_inode_operations;
-extern struct file_operations ubifs_dir_operations;
-extern struct inode_operations ubifs_dir_inode_operations;
-extern struct inode_operations ubifs_symlink_inode_operations;
+extern const struct super_operations ubifs_super_operations;
+extern const struct address_space_operations ubifs_file_address_operations;
+extern const struct file_operations ubifs_file_operations;
+extern const struct inode_operations ubifs_file_inode_operations;
+extern const struct file_operations ubifs_dir_operations;
+extern const struct inode_operations ubifs_dir_inode_operations;
+extern const struct inode_operations ubifs_symlink_inode_operations;
 extern struct backing_dev_info ubifs_backing_dev_info;
 extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT];
 
@@ -1428,7 +1426,7 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
 		     int offs, int dtype);
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
-		     int offs, int quiet, int chk_crc);
+		     int offs, int quiet, int must_chk_crc);
 void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
 void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last);
 int ubifs_io_init(struct ubifs_info *c);
@@ -1495,6 +1493,7 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode,
 void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
 			 struct ubifs_budget_req *req);
 long long ubifs_get_free_space(struct ubifs_info *c);
+long long ubifs_get_free_space_nolock(struct ubifs_info *c);
 int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
 void ubifs_convert_page_budget(struct ubifs_info *c);
 long long ubifs_reported_space(const struct ubifs_info *c, long long free);
@@ -1603,6 +1602,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum);
 int ubifs_orphan_start_commit(struct ubifs_info *c);
 int ubifs_orphan_end_commit(struct ubifs_info *c);
 int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only);
+int ubifs_clear_orphans(struct ubifs_info *c);
 
 /* lpt.c */
 int ubifs_calc_lpt_geom(struct ubifs_info *c);
@@ -1646,7 +1646,7 @@ const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
 					   const struct ubifs_lprops *lp,
 					   int free, int dirty, int flags,
 					   int idx_gc_cnt);
-void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *stats);
+void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst);
 void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
 		      int cat);
 void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops,
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 1b809bd4..2bb788a 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -206,7 +206,7 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
 					((char *)bh->b_data)[(bit + i) >> 3]);
 			} else {
 				if (inode)
-					DQUOT_FREE_BLOCK(inode, 1);
+					vfs_dq_free_block(inode, 1);
 				udf_add_free_space(sbi, sbi->s_partition, 1);
 			}
 		}
@@ -261,11 +261,11 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
 		while (bit < (sb->s_blocksize << 3) && block_count > 0) {
 			if (!udf_test_bit(bit, bh->b_data))
 				goto out;
-			else if (DQUOT_PREALLOC_BLOCK(inode, 1))
+			else if (vfs_dq_prealloc_block(inode, 1))
 				goto out;
 			else if (!udf_clear_bit(bit, bh->b_data)) {
 				udf_debug("bit already cleared for block %d\n", bit);
-				DQUOT_FREE_BLOCK(inode, 1);
+				vfs_dq_free_block(inode, 1);
 				goto out;
 			}
 			block_count--;
@@ -393,7 +393,7 @@ got_block:
 	/*
 	 * Check quota for allocation of this block.
 	 */
-	if (inode && DQUOT_ALLOC_BLOCK(inode, 1)) {
+	if (inode && vfs_dq_alloc_block(inode, 1)) {
 		mutex_unlock(&sbi->s_alloc_mutex);
 		*err = -EDQUOT;
 		return 0;
@@ -452,7 +452,7 @@ static void udf_table_free_blocks(struct super_block *sb,
 	/* We do this up front - There are some error conditions that
 	   could occure, but.. oh well */
 	if (inode)
-		DQUOT_FREE_BLOCK(inode, count);
+		vfs_dq_free_block(inode, count);
 	if (udf_add_free_space(sbi, sbi->s_partition, count))
 		mark_buffer_dirty(sbi->s_lvid_bh);
 
@@ -700,7 +700,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
 		epos.offset -= adsize;
 
 		alloc_count = (elen >> sb->s_blocksize_bits);
-		if (inode && DQUOT_PREALLOC_BLOCK(inode,
+		if (inode && vfs_dq_prealloc_block(inode,
 			alloc_count > block_count ? block_count : alloc_count))
 			alloc_count = 0;
 		else if (alloc_count > block_count) {
@@ -806,7 +806,7 @@ static int udf_table_new_block(struct super_block *sb,
 	goal_eloc.logicalBlockNum++;
 	goal_elen -= sb->s_blocksize;
 
-	if (inode && DQUOT_ALLOC_BLOCK(inode, 1)) {
+	if (inode && vfs_dq_alloc_block(inode, 1)) {
 		brelse(goal_epos.bh);
 		mutex_unlock(&sbi->s_alloc_mutex);
 		*err = -EDQUOT;
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 31fc842..47dbe56 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -36,8 +36,8 @@ void udf_free_inode(struct inode *inode)
 	 * Note: we must free any quota before locking the superblock,
 	 * as writing the quota to disk may need the lock as well.
 	 */
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
 
 	clear_inode(inode);
 
@@ -154,8 +154,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
 	insert_inode_hash(inode);
 	mark_inode_dirty(inode);
 
-	if (DQUOT_ALLOC_INODE(inode)) {
-		DQUOT_DROP(inode);
+	if (vfs_dq_alloc_inode(inode)) {
+		vfs_dq_drop(inode);
 		inode->i_flags |= S_NOQUOTA;
 		inode->i_nlink = 0;
 		iput(inode);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 0d9ada1..54c16ec 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -85,7 +85,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 				   "bit already cleared for fragment %u", i);
 	}
 	
-	DQUOT_FREE_BLOCK (inode, count);
+	vfs_dq_free_block(inode, count);
 
 	
 	fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
@@ -195,7 +195,7 @@ do_more:
 		ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 		if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 			ufs_clusteracct (sb, ucpi, blkno, 1);
-		DQUOT_FREE_BLOCK(inode, uspi->s_fpb);
+		vfs_dq_free_block(inode, uspi->s_fpb);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
 		uspi->cs_total.cs_nbfree++;
@@ -556,7 +556,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 		fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
 	for (i = oldcount; i < newcount; i++)
 		ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
-	if(DQUOT_ALLOC_BLOCK(inode, count)) {
+	if (vfs_dq_alloc_block(inode, count)) {
 		*err = -EDQUOT;
 		return 0;
 	}
@@ -664,7 +664,7 @@ cg_found:
 		for (i = count; i < uspi->s_fpb; i++)
 			ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
 		i = uspi->s_fpb - count;
-		DQUOT_FREE_BLOCK(inode, i);
+		vfs_dq_free_block(inode, i);
 
 		fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
 		uspi->cs_total.cs_nffree += i;
@@ -676,7 +676,7 @@ cg_found:
 	result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
 	if (result == INVBLOCK)
 		return 0;
-	if(DQUOT_ALLOC_BLOCK(inode, count)) {
+	if (vfs_dq_alloc_block(inode, count)) {
 		*err = -EDQUOT;
 		return 0;
 	}
@@ -747,7 +747,7 @@ gotit:
 	ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
 	if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
 		ufs_clusteracct (sb, ucpi, blkno, -1);
-	if(DQUOT_ALLOC_BLOCK(inode, uspi->s_fpb)) {
+	if (vfs_dq_alloc_block(inode, uspi->s_fpb)) {
 		*err = -EDQUOT;
 		return INVBLOCK;
 	}
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 6f5dcf0..3527c00 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -95,8 +95,8 @@ void ufs_free_inode (struct inode * inode)
 
 	is_directory = S_ISDIR(inode->i_mode);
 
-	DQUOT_FREE_INODE(inode);
-	DQUOT_DROP(inode);
+	vfs_dq_free_inode(inode);
+	vfs_dq_drop(inode);
 
 	clear_inode (inode);
 
@@ -355,8 +355,8 @@ cg_found:
 
 	unlock_super (sb);
 
-	if (DQUOT_ALLOC_INODE(inode)) {
-		DQUOT_DROP(inode);
+	if (vfs_dq_alloc_inode(inode)) {
+		vfs_dq_drop(inode);
 		err = -EDQUOT;
 		goto fail_without_unlock;
 	}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 39f8778..3d2512c 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -622,7 +622,6 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	mode_t mode;
-	unsigned i;
 
 	/*
 	 * Copy data to the in-core inode.
@@ -655,11 +654,12 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 
 	
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
-			ufsi->i_u1.i_data[i] = ufs_inode->ui_u2.ui_addr.ui_db[i];
+		memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr,
+		       sizeof(ufs_inode->ui_u2.ui_addr));
 	} else {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
-			ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i];
+		memcpy(ufsi->i_u1.i_symlink, ufs_inode->ui_u2.ui_symlink,
+		       sizeof(ufs_inode->ui_u2.ui_symlink) - 1);
+		ufsi->i_u1.i_symlink[sizeof(ufs_inode->ui_u2.ui_symlink) - 1] = 0;
 	}
 	return 0;
 }
@@ -669,7 +669,6 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
 	struct ufs_inode_info *ufsi = UFS_I(inode);
 	struct super_block *sb = inode->i_sb;
 	mode_t mode;
-	unsigned i;
 
 	UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino);
 	/*
@@ -704,12 +703,12 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode)
 	*/
 
 	if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
-			ufsi->i_u1.u2_i_data[i] =
-				ufs2_inode->ui_u2.ui_addr.ui_db[i];
+		memcpy(ufsi->i_u1.u2_i_data, &ufs2_inode->ui_u2.ui_addr,
+		       sizeof(ufs2_inode->ui_u2.ui_addr));
 	} else {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
-			ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i];
+		memcpy(ufsi->i_u1.i_symlink, ufs2_inode->ui_u2.ui_symlink,
+		       sizeof(ufs2_inode->ui_u2.ui_symlink) - 1);
+		ufsi->i_u1.i_symlink[sizeof(ufs2_inode->ui_u2.ui_symlink) - 1] = 0;
 	}
 	return 0;
 }
@@ -781,7 +780,6 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 {
 	struct super_block *sb = inode->i_sb;
  	struct ufs_inode_info *ufsi = UFS_I(inode);
- 	unsigned i;
 
 	ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
 	ufs_inode->ui_nlink = cpu_to_fs16(sb, inode->i_nlink);
@@ -809,12 +807,12 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode)
 		/* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */
 		ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.i_data[0];
 	} else if (inode->i_blocks) {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
-			ufs_inode->ui_u2.ui_addr.ui_db[i] = ufsi->i_u1.i_data[i];
+		memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.i_data,
+		       sizeof(ufs_inode->ui_u2.ui_addr));
 	}
 	else {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
-			ufs_inode->ui_u2.ui_symlink[i] = ufsi->i_u1.i_symlink[i];
+		memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink,
+		       sizeof(ufs_inode->ui_u2.ui_symlink));
 	}
 
 	if (!inode->i_nlink)
@@ -825,7 +823,6 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
 {
 	struct super_block *sb = inode->i_sb;
  	struct ufs_inode_info *ufsi = UFS_I(inode);
- 	unsigned i;
 
 	UFSD("ENTER\n");
 	ufs_inode->ui_mode = cpu_to_fs16(sb, inode->i_mode);
@@ -850,11 +847,11 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode)
 		/* ufs_inode->ui_u2.ui_addr.ui_db[0] = cpu_to_fs32(sb, inode->i_rdev); */
 		ufs_inode->ui_u2.ui_addr.ui_db[0] = ufsi->i_u1.u2_i_data[0];
 	} else if (inode->i_blocks) {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++)
-			ufs_inode->ui_u2.ui_addr.ui_db[i] = ufsi->i_u1.u2_i_data[i];
+		memcpy(&ufs_inode->ui_u2.ui_addr, ufsi->i_u1.u2_i_data,
+		       sizeof(ufs_inode->ui_u2.ui_addr));
 	} else {
-		for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++)
-			ufs_inode->ui_u2.ui_symlink[i] = ufsi->i_u1.i_symlink[i];
+		memcpy(&ufs_inode->ui_u2.ui_symlink, ufsi->i_u1.i_symlink,
+		       sizeof(ufs_inode->ui_u2.ui_symlink));
  	}
 
 	if (!inode->i_nlink)
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index e3a9b1f..23119fe 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -147,7 +147,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 	} else {
 		/* fast symlink */
 		inode->i_op = &ufs_fast_symlink_inode_operations;
-		memcpy((char*)&UFS_I(inode)->i_u1.i_data,symname,l);
+		memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l);
 		inode->i_size = l-1;
 	}
 	mark_inode_dirty(inode);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index e65212d..e1c1fc5 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -41,7 +41,7 @@
  * Stefan Reinauer <stepan@home.culture.mipt.ru>
  *
  * Module usage counts added on 96/04/29 by
- * Gertjan van Wingerde <gertjan@cs.vu.nl>
+ * Gertjan van Wingerde <gwingerde@gmail.com>
  *
  * Clean swab support on 19970406 by
  * Francois-Rene Rideau <fare@tunes.org>
@@ -636,6 +636,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	unsigned block_size, super_block_size;
 	unsigned flags;
 	unsigned super_block_offset;
+	unsigned maxsymlen;
 	int ret = -EINVAL;
 
 	uspi = NULL;
@@ -1069,6 +1070,16 @@ magic_found:
 		uspi->s_maxsymlinklen =
 		    fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen);
 
+	if (uspi->fs_magic == UFS2_MAGIC)
+		maxsymlen = 2 * 4 * (UFS_NDADDR + UFS_NINDIR);
+	else
+		maxsymlen = 4 * (UFS_NDADDR + UFS_NINDIR);
+	if (uspi->s_maxsymlinklen > maxsymlen) {
+		ufs_warning(sb, __func__, "ufs_read_super: excessive maximum "
+			    "fast symlink size (%u)\n", uspi->s_maxsymlinklen);
+		uspi->s_maxsymlinklen = maxsymlen;
+	}
+
 	inode = ufs_iget(sb, UFS_ROOTINO);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 11c0351..69b3427 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -23,7 +23,7 @@ struct ufs_sb_info {
 struct ufs_inode_info {
 	union {
 		__fs32	i_data[15];
-		__u8	i_symlink[4*15];
+		__u8	i_symlink[2 * 4 * 15];
 		__fs64	u2_i_data[15];
 	} i_u1;
 	__u32	i_flags;
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index d71dc44..aa1016b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -34,6 +34,12 @@
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
 
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
 STATIC int xfsbufd_wakeup(int, gfp_t);
@@ -166,6 +172,75 @@ test_page_region(
 }
 
 /*
+ *	Mapping of multi-page buffers into contiguous virtual space
+ */
+
+typedef struct a_list {
+	void		*vm_addr;
+	struct a_list	*next;
+} a_list_t;
+
+static a_list_t		*as_free_head;
+static int		as_list_len;
+static DEFINE_SPINLOCK(as_lock);
+
+/*
+ *	Try to batch vunmaps because they are costly.
+ */
+STATIC void
+free_address(
+	void		*addr)
+{
+	a_list_t	*aentry;
+
+#ifdef CONFIG_XEN
+	/*
+	 * Xen needs to be able to make sure it can get an exclusive
+	 * RO mapping of pages it wants to turn into a pagetable.  If
+	 * a newly allocated page is also still being vmap()ed by xfs,
+	 * it will cause pagetable construction to fail.  This is a
+	 * quick workaround to always eagerly unmap pages so that Xen
+	 * is happy.
+	 */
+	vunmap(addr);
+	return;
+#endif
+
+	aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
+	if (likely(aentry)) {
+		spin_lock(&as_lock);
+		aentry->next = as_free_head;
+		aentry->vm_addr = addr;
+		as_free_head = aentry;
+		as_list_len++;
+		spin_unlock(&as_lock);
+	} else {
+		vunmap(addr);
+	}
+}
+
+STATIC void
+purge_addresses(void)
+{
+	a_list_t	*aentry, *old;
+
+	if (as_free_head == NULL)
+		return;
+
+	spin_lock(&as_lock);
+	aentry = as_free_head;
+	as_free_head = NULL;
+	as_list_len = 0;
+	spin_unlock(&as_lock);
+
+	while ((old = aentry) != NULL) {
+		vunmap(aentry->vm_addr);
+		aentry = aentry->next;
+		kfree(old);
+	}
+}
+
+/*
  *	Internal xfs_buf_t object manipulation
  */
 
@@ -264,7 +339,7 @@ xfs_buf_free(
 		uint		i;
 
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
-                       vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count);
+			free_address(bp->b_addr - bp->b_offset);
 
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
@@ -386,8 +461,10 @@ _xfs_buf_map_pages(
 		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 		bp->b_flags |= XBF_MAPPED;
 	} else if (flags & XBF_MAPPED) {
-               bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-                                       -1, PAGE_KERNEL);
+		if (as_list_len > 64)
+			purge_addresses();
+		bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
+					VM_MAP, PAGE_KERNEL);
 		if (unlikely(bp->b_addr == NULL))
 			return -ENOMEM;
 		bp->b_addr += bp->b_offset;
@@ -1364,10 +1441,12 @@ xfs_unregister_buftarg(
 
 void
 xfs_free_buftarg(
-	xfs_buftarg_t		*btp)
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*btp)
 {
 	xfs_flush_buftarg(btp, 1);
-	xfs_blkdev_issue_flush(btp);
+	if (mp->m_flags & XFS_MOUNT_BARRIER)
+		xfs_blkdev_issue_flush(btp);
 	xfs_free_bufhash(btp);
 	iput(btp->bt_mapping->host);
 
@@ -1672,6 +1751,8 @@ xfsbufd(
 			count++;
 		}
 
+		if (as_list_len > 0)
+			purge_addresses();
 		if (count)
 			blk_run_address_space(target->bt_mapping);
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 288ae7c..9b4d666 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
  *	Handling of buftargs.
  */
 extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
-extern void xfs_free_buftarg(xfs_buftarg_t *);
+extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c71e226..32ae502 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -734,15 +734,15 @@ xfs_close_devices(
 {
 	if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
 		struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
-		xfs_free_buftarg(mp->m_logdev_targp);
+		xfs_free_buftarg(mp, mp->m_logdev_targp);
 		xfs_blkdev_put(logdev);
 	}
 	if (mp->m_rtdev_targp) {
 		struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
-		xfs_free_buftarg(mp->m_rtdev_targp);
+		xfs_free_buftarg(mp, mp->m_rtdev_targp);
 		xfs_blkdev_put(rtdev);
 	}
-	xfs_free_buftarg(mp->m_ddev_targp);
+	xfs_free_buftarg(mp, mp->m_ddev_targp);
 }
 
 /*
@@ -811,9 +811,9 @@ xfs_open_devices(
 
  out_free_rtdev_targ:
 	if (mp->m_rtdev_targp)
-		xfs_free_buftarg(mp->m_rtdev_targp);
+		xfs_free_buftarg(mp, mp->m_rtdev_targp);
  out_free_ddev_targ:
-	xfs_free_buftarg(mp->m_ddev_targp);
+	xfs_free_buftarg(mp, mp->m_ddev_targp);
  out_close_rtdev:
 	if (rtdev)
 		xfs_blkdev_put(rtdev);
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 2ed0353..a608e72 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -371,7 +371,11 @@ xfs_quiesce_attr(
 	/* flush inodes and push all remaining buffers out to disk */
 	xfs_quiesce_fs(mp);
 
-	ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
+	/*
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
+	 */
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
 
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp, 1);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b4c1ee7..f8278cf 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -55,17 +55,11 @@ xfs_swapext(
 	struct file	*file, *target_file;
 	int		error = 0;
 
-	sxp = kmem_alloc(sizeof(xfs_swapext_t), KM_MAYFAIL);
-	if (!sxp) {
-		error = XFS_ERROR(ENOMEM);
-		goto out;
-	}
-
 	/* Pull information for the target fd */
 	file = fget((int)sxp->sx_fdtarget);
 	if (!file) {
 		error = XFS_ERROR(EINVAL);
-		goto out_free_sxp;
+		goto out;
 	}
 
 	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) {
@@ -109,8 +103,6 @@ xfs_swapext(
 	fput(target_file);
  out_put_file:
 	fput(file);
- out_free_sxp:
-	kmem_free(sxp);
  out:
 	return error;
 }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e2fb621..478e587 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -246,9 +246,6 @@ xfs_iget_cache_miss(
 		goto out_destroy;
 	}
 
-	if (lock_flags)
-		xfs_ilock(ip, lock_flags);
-
 	/*
 	 * Preload the radix tree so we can insert safely under the
 	 * write spinlock. Note that we cannot sleep inside the preload
@@ -256,7 +253,16 @@ xfs_iget_cache_miss(
 	 */
 	if (radix_tree_preload(GFP_KERNEL)) {
 		error = EAGAIN;
-		goto out_unlock;
+		goto out_destroy;
+	}
+
+	/*
+	 * Because the inode hasn't been added to the radix-tree yet it can't
+	 * be found by another thread, so we can do the non-sleeping lock here.
+	 */
+	if (lock_flags) {
+		if (!xfs_ilock_nowait(ip, lock_flags))
+			BUG();
 	}
 
 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
@@ -284,7 +290,6 @@ xfs_iget_cache_miss(
 out_preload_end:
 	write_unlock(&pag->pag_ici_lock);
 	radix_tree_preload_end();
-out_unlock:
 	if (lock_flags)
 		xfs_iunlock(ip, lock_flags);
 out_destroy:
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 35cca98..61af610 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -70,16 +70,21 @@ STATIC void	xlog_recover_check_summary(xlog_t *);
 xfs_buf_t *
 xlog_get_bp(
 	xlog_t		*log,
-	int		num_bblks)
+	int		nbblks)
 {
-	ASSERT(num_bblks > 0);
+	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+		XFS_ERROR_REPORT("xlog_get_bp(1)",
+				 XFS_ERRLEVEL_HIGH, log->l_mp);
+		return NULL;
+	}
 
 	if (log->l_sectbb_log) {
-		if (num_bblks > 1)
-			num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
-		num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
+		if (nbblks > 1)
+			nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
+		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
 	}
-	return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
+	return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
 }
 
 void
@@ -102,6 +107,13 @@ xlog_bread(
 {
 	int		error;
 
+	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+		XFS_ERROR_REPORT("xlog_bread(1)",
+				 XFS_ERRLEVEL_HIGH, log->l_mp);
+		return EFSCORRUPTED;
+	}
+
 	if (log->l_sectbb_log) {
 		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
 		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -139,6 +151,13 @@ xlog_bwrite(
 {
 	int		error;
 
+	if (nbblks <= 0 || nbblks > log->l_logBBsize) {
+		xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks);
+		XFS_ERROR_REPORT("xlog_bwrite(1)",
+				 XFS_ERRLEVEL_HIGH, log->l_mp);
+		return EFSCORRUPTED;
+	}
+
 	if (log->l_sectbb_log) {
 		blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
 		nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
@@ -1436,10 +1455,19 @@ xlog_recover_add_to_trans(
 	item = item->ri_prev;
 
 	if (item->ri_total == 0) {		/* first region to be added */
-		item->ri_total	= in_f->ilf_size;
-		ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
-		item->ri_buf = kmem_zalloc((item->ri_total *
-					    sizeof(xfs_log_iovec_t)), KM_SLEEP);
+		if (in_f->ilf_size == 0 ||
+		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
+			xlog_warn(
+	"XFS: bad number of regions (%d) in inode log format",
+				  in_f->ilf_size);
+			ASSERT(0);
+			return XFS_ERROR(EIO);
+		}
+
+		item->ri_total = in_f->ilf_size;
+		item->ri_buf =
+			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+				    KM_SLEEP);
 	}
 	ASSERT(item->ri_total > item->ri_cnt);
 	/* Description region is ri_buf[0] */