summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-16 17:52:55 (GMT)
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-16 17:52:55 (GMT)
commitadd096909da63ef32d6766f6771c07c9f16c6ee5 (patch)
tree58594bcf68cbb6f777d5270d098ab8ca69cbaee3 /fs
parente245befce7af0a1e1347079ed62695b059594bd4 (diff)
parent54c57dc3b6578356c0a428c767d4bf080254a2ee (diff)
downloadlinux-fsl-qoriq-add096909da63ef32d6766f6771c07c9f16c6ee5.tar.xz
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (32 commits) [PATCH] ocfs2: zero_user_page conversion ocfs2: Support xfs style space reservation ioctls ocfs2: support for removing file regions ocfs2: update truncate handling of partial clusters ocfs2: btree support for removal of arbirtrary extents ocfs2: Support creation of unwritten extents ocfs2: support writing of unwritten extents ocfs2: small cleanup of ocfs2_write_begin_nolock() ocfs2: btree changes for unwritten extents ocfs2: abstract btree growing calls ocfs2: use all extent block suballocators ocfs2: plug truncate into cached dealloc routines ocfs2: simplify deallocation locking ocfs2: harden buffer check during mapping of page blocks ocfs2: shared writeable mmap ocfs2: factor out write aops into nolock variants ocfs2: rework ocfs2_buffered_write_cluster() ocfs2: take ip_alloc_sem during entire truncate ocfs2: Add "preferred slot" mount option [KJ PATCH] Replacing memset(<addr>,0,PAGE_SIZE) with clear_page() in fs/ocfs2/dlm/dlmrecovery.c ...
Diffstat (limited to 'fs')
-rw-r--r--fs/configfs/configfs_internal.h7
-rw-r--r--fs/configfs/dir.c289
-rw-r--r--fs/configfs/file.c28
-rw-r--r--fs/configfs/item.c29
-rw-r--r--fs/dlm/config.c20
-rw-r--r--fs/ocfs2/alloc.c2676
-rw-r--r--fs/ocfs2/alloc.h43
-rw-r--r--fs/ocfs2/aops.c1015
-rw-r--r--fs/ocfs2/aops.h61
-rw-r--r--fs/ocfs2/cluster/heartbeat.c96
-rw-r--r--fs/ocfs2/cluster/heartbeat.h6
-rw-r--r--fs/ocfs2/cluster/nodemanager.c42
-rw-r--r--fs/ocfs2/cluster/nodemanager.h5
-rw-r--r--fs/ocfs2/cluster/tcp.c21
-rw-r--r--fs/ocfs2/dir.c2
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c8
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c40
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c79
-rw-r--r--fs/ocfs2/dlmglue.c6
-rw-r--r--fs/ocfs2/endian.h5
-rw-r--r--fs/ocfs2/extent_map.c41
-rw-r--r--fs/ocfs2/file.c702
-rw-r--r--fs/ocfs2/file.h10
-rw-r--r--fs/ocfs2/heartbeat.c10
-rw-r--r--fs/ocfs2/ioctl.c15
-rw-r--r--fs/ocfs2/journal.c6
-rw-r--r--fs/ocfs2/journal.h2
-rw-r--r--fs/ocfs2/mmap.c167
-rw-r--r--fs/ocfs2/namei.c2
-rw-r--r--fs/ocfs2/ocfs2.h14
-rw-r--r--fs/ocfs2/ocfs2_fs.h33
-rw-r--r--fs/ocfs2/slot_map.c12
-rw-r--r--fs/ocfs2/suballoc.c46
-rw-r--r--fs/ocfs2/suballoc.h17
-rw-r--r--fs/ocfs2/super.c27
-rw-r--r--fs/ocfs2/super.h2
36 files changed, 4548 insertions, 1036 deletions
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index 7b48c03..3b0185f 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -29,10 +29,11 @@
struct configfs_dirent {
atomic_t s_count;
+ int s_dependent_count;
struct list_head s_sibling;
struct list_head s_children;
struct list_head s_links;
- void * s_element;
+ void * s_element;
int s_type;
umode_t s_mode;
struct dentry * s_dentry;
@@ -41,8 +42,8 @@ struct configfs_dirent {
#define CONFIGFS_ROOT 0x0001
#define CONFIGFS_DIR 0x0002
-#define CONFIGFS_ITEM_ATTR 0x0004
-#define CONFIGFS_ITEM_LINK 0x0020
+#define CONFIGFS_ITEM_ATTR 0x0004
+#define CONFIGFS_ITEM_LINK 0x0020
#define CONFIGFS_USET_DIR 0x0040
#define CONFIGFS_USET_DEFAULT 0x0080
#define CONFIGFS_USET_DROPPING 0x0100
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5e6e37e..2f436d4 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -355,6 +355,10 @@ static int configfs_detach_prep(struct dentry *dentry)
/* Mark that we've taken i_mutex */
sd->s_type |= CONFIGFS_USET_DROPPING;
+ /*
+ * Yup, recursive. If there's a problem, blame
+ * deep nesting of default_groups
+ */
ret = configfs_detach_prep(sd->s_dentry);
if (!ret)
continue;
@@ -562,7 +566,7 @@ static int populate_groups(struct config_group *group)
/*
* All of link_obj/unlink_obj/link_group/unlink_group require that
- * subsys->su_sem is held.
+ * subsys->su_mutex is held.
*/
static void unlink_obj(struct config_item *item)
@@ -714,6 +718,28 @@ static void configfs_detach_group(struct config_item *item)
}
/*
+ * After the item has been detached from the filesystem view, we are
+ * ready to tear it out of the hierarchy. Notify the client before
+ * we do that so they can perform any cleanup that requires
+ * navigating the hierarchy. A client does not need to provide this
+ * callback. The subsystem semaphore MUST be held by the caller, and
+ * references must be valid for both items. It also assumes the
+ * caller has validated ci_type.
+ */
+static void client_disconnect_notify(struct config_item *parent_item,
+ struct config_item *item)
+{
+ struct config_item_type *type;
+
+ type = parent_item->ci_type;
+ BUG_ON(!type);
+
+ if (type->ct_group_ops && type->ct_group_ops->disconnect_notify)
+ type->ct_group_ops->disconnect_notify(to_config_group(parent_item),
+ item);
+}
+
+/*
* Drop the initial reference from make_item()/make_group()
* This function assumes that reference is held on item
* and that item holds a valid reference to the parent. Also, it
@@ -733,11 +759,244 @@ static void client_drop_item(struct config_item *parent_item,
*/
if (type->ct_group_ops && type->ct_group_ops->drop_item)
type->ct_group_ops->drop_item(to_config_group(parent_item),
- item);
+ item);
else
config_item_put(item);
}
+#ifdef DEBUG
+static void configfs_dump_one(struct configfs_dirent *sd, int level)
+{
+ printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd));
+
+#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type);
+ type_print(CONFIGFS_ROOT);
+ type_print(CONFIGFS_DIR);
+ type_print(CONFIGFS_ITEM_ATTR);
+ type_print(CONFIGFS_ITEM_LINK);
+ type_print(CONFIGFS_USET_DIR);
+ type_print(CONFIGFS_USET_DEFAULT);
+ type_print(CONFIGFS_USET_DROPPING);
+#undef type_print
+}
+
+static int configfs_dump(struct configfs_dirent *sd, int level)
+{
+ struct configfs_dirent *child_sd;
+ int ret = 0;
+
+ configfs_dump_one(sd, level);
+
+ if (!(sd->s_type & (CONFIGFS_DIR|CONFIGFS_ROOT)))
+ return 0;
+
+ list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+ ret = configfs_dump(child_sd, level + 2);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+#endif
+
+
+/*
+ * configfs_depend_item() and configfs_undepend_item()
+ *
+ * WARNING: Do not call these from a configfs callback!
+ *
+ * This describes these functions and their helpers.
+ *
+ * Allow another kernel system to depend on a config_item. If this
+ * happens, the item cannot go away until the dependant can live without
+ * it. The idea is to give client modules as simple an interface as
+ * possible. When a system asks them to depend on an item, they just
+ * call configfs_depend_item(). If the item is live and the client
+ * driver is in good shape, we'll happily do the work for them.
+ *
+ * Why is the locking complex? Because configfs uses the VFS to handle
+ * all locking, but this function is called outside the normal
+ * VFS->configfs path. So it must take VFS locks to prevent the
+ * VFS->configfs stuff (configfs_mkdir(), configfs_rmdir(), etc). This is
+ * why you can't call these functions underneath configfs callbacks.
+ *
+ * Note, btw, that this can be called at *any* time, even when a configfs
+ * subsystem isn't registered, or when configfs is loading or unloading.
+ * Just like configfs_register_subsystem(). So we take the same
+ * precautions. We pin the filesystem. We lock each i_mutex _in_order_
+ * on our way down the tree. If we can find the target item in the
+ * configfs tree, it must be part of the subsystem tree as well, so we
+ * do not need the subsystem semaphore. Holding the i_mutex chain locks
+ * out mkdir() and rmdir(), who might be racing us.
+ */
+
+/*
+ * configfs_depend_prep()
+ *
+ * Only subdirectories count here. Files (CONFIGFS_NOT_PINNED) are
+ * attributes. This is similar but not the same to configfs_detach_prep().
+ * Note that configfs_detach_prep() expects the parent to be locked when it
+ * is called, but we lock the parent *inside* configfs_depend_prep(). We
+ * do that so we can unlock it if we find nothing.
+ *
+ * Here we do a depth-first search of the dentry hierarchy looking for
+ * our object. We take i_mutex on each step of the way down. IT IS
+ * ESSENTIAL THAT i_mutex LOCKING IS ORDERED. If we come back up a branch,
+ * we'll drop the i_mutex.
+ *
+ * If the target is not found, -ENOENT is bubbled up and we have released
+ * all locks. If the target was found, the locks will be cleared by
+ * configfs_depend_rollback().
+ *
+ * This adds a requirement that all config_items be unique!
+ *
+ * This is recursive because the locking traversal is tricky. There isn't
+ * much on the stack, though, so folks that need this function - be careful
+ * about your stack! Patches will be accepted to make it iterative.
+ */
+static int configfs_depend_prep(struct dentry *origin,
+ struct config_item *target)
+{
+ struct configfs_dirent *child_sd, *sd = origin->d_fsdata;
+ int ret = 0;
+
+ BUG_ON(!origin || !sd);
+
+ /* Lock this guy on the way down */
+ mutex_lock(&sd->s_dentry->d_inode->i_mutex);
+ if (sd->s_element == target) /* Boo-yah */
+ goto out;
+
+ list_for_each_entry(child_sd, &sd->s_children, s_sibling) {
+ if (child_sd->s_type & CONFIGFS_DIR) {
+ ret = configfs_depend_prep(child_sd->s_dentry,
+ target);
+ if (!ret)
+ goto out; /* Child path boo-yah */
+ }
+ }
+
+ /* We looped all our children and didn't find target */
+ mutex_unlock(&sd->s_dentry->d_inode->i_mutex);
+ ret = -ENOENT;
+
+out:
+ return ret;
+}
+
+/*
+ * This is ONLY called if configfs_depend_prep() did its job. So we can
+ * trust the entire path from item back up to origin.
+ *
+ * We walk backwards from item, unlocking each i_mutex. We finish by
+ * unlocking origin.
+ */
+static void configfs_depend_rollback(struct dentry *origin,
+ struct config_item *item)
+{
+ struct dentry *dentry = item->ci_dentry;
+
+ while (dentry != origin) {
+ mutex_unlock(&dentry->d_inode->i_mutex);
+ dentry = dentry->d_parent;
+ }
+
+ mutex_unlock(&origin->d_inode->i_mutex);
+}
+
+int configfs_depend_item(struct configfs_subsystem *subsys,
+ struct config_item *target)
+{
+ int ret;
+ struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+ struct config_item *s_item = &subsys->su_group.cg_item;
+
+ /*
+ * Pin the configfs filesystem. This means we can safely access
+ * the root of the configfs filesystem.
+ */
+ ret = configfs_pin_fs();
+ if (ret)
+ return ret;
+
+ /*
+ * Next, lock the root directory. We're going to check that the
+ * subsystem is really registered, and so we need to lock out
+ * configfs_[un]register_subsystem().
+ */
+ mutex_lock(&configfs_sb->s_root->d_inode->i_mutex);
+
+ root_sd = configfs_sb->s_root->d_fsdata;
+
+ list_for_each_entry(p, &root_sd->s_children, s_sibling) {
+ if (p->s_type & CONFIGFS_DIR) {
+ if (p->s_element == s_item) {
+ subsys_sd = p;
+ break;
+ }
+ }
+ }
+
+ if (!subsys_sd) {
+ ret = -ENOENT;
+ goto out_unlock_fs;
+ }
+
+ /* Ok, now we can trust subsys/s_item */
+
+ /* Scan the tree, locking i_mutex recursively, return 0 if found */
+ ret = configfs_depend_prep(subsys_sd->s_dentry, target);
+ if (ret)
+ goto out_unlock_fs;
+
+ /* We hold all i_mutexes from the subsystem down to the target */
+ p = target->ci_dentry->d_fsdata;
+ p->s_dependent_count += 1;
+
+ configfs_depend_rollback(subsys_sd->s_dentry, target);
+
+out_unlock_fs:
+ mutex_unlock(&configfs_sb->s_root->d_inode->i_mutex);
+
+ /*
+ * If we succeeded, the fs is pinned via other methods. If not,
+ * we're done with it anyway. So release_fs() is always right.
+ */
+ configfs_release_fs();
+
+ return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item);
+
+/*
+ * Release the dependent linkage. This is much simpler than
+ * configfs_depend_item() because we know that that the client driver is
+ * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
+ */
+void configfs_undepend_item(struct configfs_subsystem *subsys,
+ struct config_item *target)
+{
+ struct configfs_dirent *sd;
+
+ /*
+ * Since we can trust everything is pinned, we just need i_mutex
+ * on the item.
+ */
+ mutex_lock(&target->ci_dentry->d_inode->i_mutex);
+
+ sd = target->ci_dentry->d_fsdata;
+ BUG_ON(sd->s_dependent_count < 1);
+
+ sd->s_dependent_count -= 1;
+
+ /*
+ * After this unlock, we cannot trust the item to stay alive!
+ * DO NOT REFERENCE item after this unlock.
+ */
+ mutex_unlock(&target->ci_dentry->d_inode->i_mutex);
+}
+EXPORT_SYMBOL(configfs_undepend_item);
static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
@@ -783,7 +1042,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name);
- down(&subsys->su_sem);
+ mutex_lock(&subsys->su_mutex);
group = NULL;
item = NULL;
if (type->ct_group_ops->make_group) {
@@ -797,7 +1056,7 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (item)
link_obj(parent_item, item);
}
- up(&subsys->su_sem);
+ mutex_unlock(&subsys->su_mutex);
kfree(name);
if (!item) {
@@ -841,13 +1100,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
out_unlink:
if (ret) {
/* Tear down everything we built up */
- down(&subsys->su_sem);
+ mutex_lock(&subsys->su_mutex);
+
+ client_disconnect_notify(parent_item, item);
if (group)
unlink_group(group);
else
unlink_obj(item);
client_drop_item(parent_item, item);
- up(&subsys->su_sem);
+
+ mutex_unlock(&subsys->su_mutex);
if (module_got)
module_put(owner);
@@ -881,6 +1143,13 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
if (sd->s_type & CONFIGFS_USET_DEFAULT)
return -EPERM;
+ /*
+ * Here's where we check for dependents. We're protected by
+ * i_mutex.
+ */
+ if (sd->s_dependent_count)
+ return -EBUSY;
+
/* Get a working ref until we have the child */
parent_item = configfs_get_config_item(dentry->d_parent);
subsys = to_config_group(parent_item)->cg_subsys;
@@ -910,17 +1179,19 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry)
if (sd->s_type & CONFIGFS_USET_DIR) {
configfs_detach_group(item);
- down(&subsys->su_sem);
+ mutex_lock(&subsys->su_mutex);
+ client_disconnect_notify(parent_item, item);
unlink_group(to_config_group(item));
} else {
configfs_detach_item(item);
- down(&subsys->su_sem);
+ mutex_lock(&subsys->su_mutex);
+ client_disconnect_notify(parent_item, item);
unlink_obj(item);
}
client_drop_item(parent_item, item);
- up(&subsys->su_sem);
+ mutex_unlock(&subsys->su_mutex);
/* Drop our reference from above */
config_item_put(item);
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index 3527c7c..a3658f9 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -27,19 +27,26 @@
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
-#include <asm/semaphore.h>
#include <linux/configfs.h>
#include "configfs_internal.h"
+/*
+ * A simple attribute can only be 4096 characters. Why 4k? Because the
+ * original code limited it to PAGE_SIZE. That's a bad idea, though,
+ * because an attribute of 16k on ia64 won't work on x86. So we limit to
+ * 4k, our minimum common page size.
+ */
+#define SIMPLE_ATTR_SIZE 4096
struct configfs_buffer {
size_t count;
loff_t pos;
char * page;
struct configfs_item_operations * ops;
- struct semaphore sem;
+ struct mutex mutex;
int needs_read_fill;
};
@@ -69,7 +76,7 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
count = ops->show_attribute(item,attr,buffer->page);
buffer->needs_read_fill = 0;
- BUG_ON(count > (ssize_t)PAGE_SIZE);
+ BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
if (count >= 0)
buffer->count = count;
else
@@ -102,7 +109,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
struct configfs_buffer * buffer = file->private_data;
ssize_t retval = 0;
- down(&buffer->sem);
+ mutex_lock(&buffer->mutex);
if (buffer->needs_read_fill) {
if ((retval = fill_read_buffer(file->f_path.dentry,buffer)))
goto out;
@@ -112,7 +119,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp
retval = simple_read_from_buffer(buf, count, ppos, buffer->page,
buffer->count);
out:
- up(&buffer->sem);
+ mutex_unlock(&buffer->mutex);
return retval;
}
@@ -137,8 +144,8 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size
if (!buffer->page)
return -ENOMEM;
- if (count >= PAGE_SIZE)
- count = PAGE_SIZE - 1;
+ if (count >= SIMPLE_ATTR_SIZE)
+ count = SIMPLE_ATTR_SIZE - 1;
error = copy_from_user(buffer->page,buf,count);
buffer->needs_read_fill = 1;
/* if buf is assumed to contain a string, terminate it by \0,
@@ -193,13 +200,13 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
struct configfs_buffer * buffer = file->private_data;
ssize_t len;
- down(&buffer->sem);
+ mutex_lock(&buffer->mutex);
len = fill_write_buffer(buffer, buf, count);
if (len > 0)
len = flush_write_buffer(file->f_path.dentry, buffer, count);
if (len > 0)
*ppos += len;
- up(&buffer->sem);
+ mutex_unlock(&buffer->mutex);
return len;
}
@@ -253,7 +260,7 @@ static int check_perm(struct inode * inode, struct file * file)
error = -ENOMEM;
goto Enomem;
}
- init_MUTEX(&buffer->sem);
+ mutex_init(&buffer->mutex);
buffer->needs_read_fill = 1;
buffer->ops = ops;
file->private_data = buffer;
@@ -292,6 +299,7 @@ static int configfs_release(struct inode * inode, struct file * filp)
if (buffer) {
if (buffer->page)
free_page((unsigned long)buffer->page);
+ mutex_destroy(&buffer->mutex);
kfree(buffer);
}
return 0;
diff --git a/fs/configfs/item.c b/fs/configfs/item.c
index 2442120..76dc4c3 100644
--- a/fs/configfs/item.c
+++ b/fs/configfs/item.c
@@ -62,7 +62,6 @@ void config_item_init(struct config_item * item)
* dynamically allocated string that @item->ci_name points to.
* Otherwise, use the static @item->ci_namebuf array.
*/
-
int config_item_set_name(struct config_item * item, const char * fmt, ...)
{
int error = 0;
@@ -139,12 +138,7 @@ struct config_item * config_item_get(struct config_item * item)
return item;
}
-/**
- * config_item_cleanup - free config_item resources.
- * @item: item.
- */
-
-void config_item_cleanup(struct config_item * item)
+static void config_item_cleanup(struct config_item * item)
{
struct config_item_type * t = item->ci_type;
struct config_group * s = item->ci_group;
@@ -179,39 +173,35 @@ void config_item_put(struct config_item * item)
kref_put(&item->ci_kref, config_item_release);
}
-
/**
* config_group_init - initialize a group for use
* @k: group
*/
-
void config_group_init(struct config_group *group)
{
config_item_init(&group->cg_item);
INIT_LIST_HEAD(&group->cg_children);
}
-
/**
- * config_group_find_obj - search for item in group.
+ * config_group_find_item - search for item in group.
* @group: group we're looking in.
* @name: item's name.
*
- * Lock group via @group->cg_subsys, and iterate over @group->cg_list,
- * looking for a matching config_item. If matching item is found
- * take a reference and return the item.
+ * Iterate over @group->cg_list, looking for a matching config_item.
+ * If matching item is found take a reference and return the item.
+ * Caller must have locked group via @group->cg_subsys->su_mtx.
*/
-
-struct config_item * config_group_find_obj(struct config_group * group, const char * name)
+struct config_item *config_group_find_item(struct config_group *group,
+ const char *name)
{
struct list_head * entry;
struct config_item * ret = NULL;
- /* XXX LOCKING! */
list_for_each(entry,&group->cg_children) {
struct config_item * item = to_item(entry);
if (config_item_name(item) &&
- !strcmp(config_item_name(item), name)) {
+ !strcmp(config_item_name(item), name)) {
ret = config_item_get(item);
break;
}
@@ -219,9 +209,8 @@ struct config_item * config_group_find_obj(struct config_group * group, const ch
return ret;
}
-
EXPORT_SYMBOL(config_item_init);
EXPORT_SYMBOL(config_group_init);
EXPORT_SYMBOL(config_item_get);
EXPORT_SYMBOL(config_item_put);
-EXPORT_SYMBOL(config_group_find_obj);
+EXPORT_SYMBOL(config_group_find_item);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 5069b2c..2f8e3c8 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -133,14 +133,6 @@ static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
return len;
}
-#define __CONFIGFS_ATTR(_name,_mode,_read,_write) { \
- .attr = { .ca_name = __stringify(_name), \
- .ca_mode = _mode, \
- .ca_owner = THIS_MODULE }, \
- .show = _read, \
- .store = _write, \
-}
-
#define CLUSTER_ATTR(name, check_zero) \
static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len) \
{ \
@@ -615,7 +607,7 @@ static struct clusters clusters_root = {
int dlm_config_init(void)
{
config_group_init(&clusters_root.subsys.su_group);
- init_MUTEX(&clusters_root.subsys.su_sem);
+ mutex_init(&clusters_root.subsys.su_mutex);
return configfs_register_subsystem(&clusters_root.subsys);
}
@@ -759,9 +751,9 @@ static struct space *get_space(char *name)
if (!space_list)
return NULL;
- down(&space_list->cg_subsys->su_sem);
- i = config_group_find_obj(space_list, name);
- up(&space_list->cg_subsys->su_sem);
+ mutex_lock(&space_list->cg_subsys->su_mutex);
+ i = config_group_find_item(space_list, name);
+ mutex_unlock(&space_list->cg_subsys->su_mutex);
return to_space(i);
}
@@ -780,7 +772,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
if (!comm_list)
return NULL;
- down(&clusters_root.subsys.su_sem);
+ mutex_lock(&clusters_root.subsys.su_mutex);
list_for_each_entry(i, &comm_list->cg_children, ci_entry) {
cm = to_comm(i);
@@ -800,7 +792,7 @@ static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr)
break;
}
}
- up(&clusters_root.subsys.su_sem);
+ mutex_unlock(&clusters_root.subsys.su_mutex);
if (!found)
cm = NULL;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7..f5e11f4 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,8 @@
#include "buffer_head_io.h"
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ struct ocfs2_extent_block *eb);
/*
* Structures which describe a path through a btree, and functions to
@@ -117,6 +119,31 @@ static void ocfs2_free_path(struct ocfs2_path *path)
}
/*
+ * All the elements of src into dest. After this call, src could be freed
+ * without affecting dest.
+ *
+ * Both paths should have the same root. Any non-root elements of dest
+ * will be freed.
+ */
+static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+ int i;
+
+ BUG_ON(path_root_bh(dest) != path_root_bh(src));
+ BUG_ON(path_root_el(dest) != path_root_el(src));
+
+ ocfs2_reinit_path(dest, 1);
+
+ for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+ dest->p_node[i].bh = src->p_node[i].bh;
+ dest->p_node[i].el = src->p_node[i].el;
+
+ if (dest->p_node[i].bh)
+ get_bh(dest->p_node[i].bh);
+ }
+}
+
+/*
* Make the *dest path the same as src and re-initialize src path to
* have a root only.
*/
@@ -212,10 +239,41 @@ out:
return ret;
}
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
+{
+ int ret = -1;
+ int i;
+ struct ocfs2_extent_rec *rec;
+ u32 rec_end, rec_start, clusters;
+
+ for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+ rec = &el->l_recs[i];
+
+ rec_start = le32_to_cpu(rec->e_cpos);
+ clusters = ocfs2_rec_clusters(el, rec);
+
+ rec_end = rec_start + clusters;
+
+ if (v_cluster >= rec_start && v_cluster < rec_end) {
+ ret = i;
+ break;
+ }
+ }
+
+ return ret;
+}
+
enum ocfs2_contig_type {
CONTIG_NONE = 0,
CONTIG_LEFT,
- CONTIG_RIGHT
+ CONTIG_RIGHT,
+ CONTIG_LEFTRIGHT,
};
@@ -253,6 +311,14 @@ static enum ocfs2_contig_type
{
u64 blkno = le64_to_cpu(insert_rec->e_blkno);
+ /*
+ * Refuse to coalesce extent records with different flag
+ * fields - we don't want to mix unwritten extents with user
+ * data.
+ */
+ if (ext->e_flags != insert_rec->e_flags)
+ return CONTIG_NONE;
+
if (ocfs2_extents_adjacent(ext, insert_rec) &&
ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
return CONTIG_RIGHT;
@@ -277,7 +343,14 @@ enum ocfs2_append_type {
APPEND_TAIL,
};
+enum ocfs2_split_type {
+ SPLIT_NONE = 0,
+ SPLIT_LEFT,
+ SPLIT_RIGHT,
+};
+
struct ocfs2_insert_type {
+ enum ocfs2_split_type ins_split;
enum ocfs2_append_type ins_appending;
enum ocfs2_contig_type ins_contig;
int ins_contig_index;
@@ -285,6 +358,13 @@ struct ocfs2_insert_type {
int ins_tree_depth;
};
+struct ocfs2_merge_ctxt {
+ enum ocfs2_contig_type c_contig_type;
+ int c_has_empty_extent;
+ int c_split_covers_rec;
+ int c_used_tail_recs;
+};
+
/*
* How many free extents have we got before we need more meta data?
*/
@@ -384,13 +464,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
eb->h_blkno = cpu_to_le64(first_blkno);
eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
-
-#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
- /* we always use slot zero's suballocator */
- eb->h_suballoc_slot = 0;
-#else
eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
-#endif
eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
eb->h_list.l_count =
cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
@@ -461,7 +535,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct buffer_head *eb_bh,
- struct buffer_head *last_eb_bh,
+ struct buffer_head **last_eb_bh,
struct ocfs2_alloc_context *meta_ac)
{
int status, new_blocks, i;
@@ -476,7 +550,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
mlog_entry_void();
- BUG_ON(!last_eb_bh);
+ BUG_ON(!last_eb_bh || !*last_eb_bh);
fe = (struct ocfs2_dinode *) fe_bh->b_data;
@@ -507,7 +581,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
goto bail;
}
- eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+ eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
@@ -568,7 +642,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* journal_dirty erroring as it won't unless we've aborted the
* handle (in which case we would never be here) so reserving
* the write with journal_access is all we need to do. */
- status = ocfs2_journal_access(handle, inode, last_eb_bh,
+ status = ocfs2_journal_access(handle, inode, *last_eb_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
@@ -601,10 +675,10 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* next_leaf on the previously last-extent-block. */
fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
- eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
- status = ocfs2_journal_dirty(handle, last_eb_bh);
+ status = ocfs2_journal_dirty(handle, *last_eb_bh);
if (status < 0)
mlog_errno(status);
status = ocfs2_journal_dirty(handle, fe_bh);
@@ -616,6 +690,14 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
mlog_errno(status);
}
+ /*
+ * Some callers want to track the rightmost leaf so pass it
+ * back here.
+ */
+ brelse(*last_eb_bh);
+ get_bh(new_eb_bhs[0]);
+ *last_eb_bh = new_eb_bhs[0];
+
status = 0;
bail:
if (new_eb_bhs) {
@@ -829,6 +911,87 @@ bail:
}
/*
+ * Grow a b-tree so that it has more records.
+ *
+ * We might shift the tree depth in which case existing paths should
+ * be considered invalid.
+ *
+ * Tree depth after the grow is returned via *final_depth.
+ *
+ * *last_eb_bh will be updated by ocfs2_add_branch().
+ */
+static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
+ struct buffer_head *di_bh, int *final_depth,
+ struct buffer_head **last_eb_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret, shift;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *bh = NULL;
+
+ BUG_ON(meta_ac == NULL);
+
+ shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
+ if (shift < 0) {
+ ret = shift;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /* We traveled all the way to the bottom of the allocation tree
+ * and didn't find room for any more extents - we need to add
+ * another tree level */
+ if (shift) {
+ BUG_ON(bh);
+ mlog(0, "need to shift tree depth (current = %d)\n", depth);
+
+ /* ocfs2_shift_tree_depth will return us a buffer with
+ * the new extent block (so we can pass that to
+ * ocfs2_add_branch). */
+ ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
+ meta_ac, &bh);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ depth++;
+ if (depth == 1) {
+ /*
+ * Special case: we have room now if we shifted from
+ * tree_depth 0, so no more work needs to be done.
+ *
+ * We won't be calling add_branch, so pass
+ * back *last_eb_bh as the new leaf. At depth
+ * zero, it should always be null so there's
+ * no reason to brelse.
+ */
+ BUG_ON(*last_eb_bh);
+ get_bh(bh);
+ *last_eb_bh = bh;
+ goto out;
+ }
+ }
+
+ /* call ocfs2_add_branch to add the final part of the tree with
+ * the new data. */
+ mlog(0, "add branch. bh = %p\n", bh);
+ ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
+ meta_ac);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ if (final_depth)
+ *final_depth = depth;
+ brelse(bh);
+ return ret;
+}
+
+/*
* This is only valid for leaf nodes, which are the only ones that can
* have empty extents anyway.
*/
@@ -934,6 +1097,22 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
}
+static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
+{
+ int size, num_recs = le16_to_cpu(el->l_next_free_rec);
+
+ BUG_ON(num_recs == 0);
+
+ if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+ num_recs--;
+ size = num_recs * sizeof(struct ocfs2_extent_rec);
+ memmove(&el->l_recs[0], &el->l_recs[1], size);
+ memset(&el->l_recs[num_recs], 0,
+ sizeof(struct ocfs2_extent_rec));
+ el->l_next_free_rec = cpu_to_le16(num_recs);
+ }
+}
+
/*
* Create an empty extent record .
*
@@ -1211,6 +1390,10 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
* immediately to their right.
*/
left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+ if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
+ BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
+ left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
+ }
left_clusters -= le32_to_cpu(left_rec->e_cpos);
left_rec->e_int_clusters = cpu_to_le32(left_clusters);
@@ -1531,10 +1714,16 @@ out:
return ret;
}
+/*
+ * Extend the transaction by enough credits to complete the rotation,
+ * and still leave at least the original number of credits allocated
+ * to this transaction.
+ */
static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
+ int op_credits,
struct ocfs2_path *path)
{
- int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
+ int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
if (handle->h_buffer_credits < credits)
return ocfs2_extend_trans(handle, credits);
@@ -1568,6 +1757,29 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
return 0;
}
+static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
+{
+ int next_free = le16_to_cpu(el->l_next_free_rec);
+ unsigned int range;
+ struct ocfs2_extent_rec *rec;
+
+ if (next_free == 0)
+ return 0;
+
+ rec = &el->l_recs[0];
+ if (ocfs2_is_empty_extent(rec)) {
+ /* Empty list. */
+ if (next_free == 1)
+ return 0;
+ rec = &el->l_recs[1];
+ }
+
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+ if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+ return 1;
+ return 0;
+}
+
/*
* Rotate all the records in a btree right one record, starting at insert_cpos.
*
@@ -1586,11 +1798,12 @@ static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
*/
static int ocfs2_rotate_tree_right(struct inode *inode,
handle_t *handle,
+ enum ocfs2_split_type split,
u32 insert_cpos,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
- int ret, start;
+ int ret, start, orig_credits = handle->h_buffer_credits;
u32 cpos;
struct ocfs2_path *left_path = NULL;
@@ -1657,9 +1870,9 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
(unsigned long long)
path_leaf_bh(left_path)->b_blocknr);
- if (ocfs2_rotate_requires_path_adjustment(left_path,
+ if (split == SPLIT_NONE &&
+ ocfs2_rotate_requires_path_adjustment(left_path,
insert_cpos)) {
- mlog(0, "Path adjustment required\n");
/*
* We've rotated the tree as much as we
@@ -1687,7 +1900,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
right_path->p_tree_depth);
ret = ocfs2_extend_rotate_transaction(handle, start,
- right_path);
+ orig_credits, right_path);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1700,6 +1913,24 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
goto out;
}
+ if (split != SPLIT_NONE &&
+ ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
+ insert_cpos)) {
+ /*
+ * A rotate moves the rightmost left leaf
+ * record over to the leftmost right leaf
+ * slot. If we're doing an extent split
+ * instead of a real insert, then we have to
+ * check that the extent to be split wasn't
+ * just moved over. If it was, then we can
+ * exit here, passing left_path back -
+ * ocfs2_split_extent() is smart enough to
+ * search both leaves.
+ */
+ *ret_left_path = left_path;
+ goto out_ret_path;
+ }
+
/*
* There is no need to re-read the next right path
* as we know that it'll be our current left
@@ -1722,6 +1953,1031 @@ out_ret_path:
return ret;
}
+static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *path)
+{
+ int i, idx;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_block *eb;
+ u32 range;
+
+ /* Path should always be rightmost. */
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+ BUG_ON(eb->h_next_leaf_blk != 0ULL);
+
+ el = &eb->h_list;
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+ idx = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[idx];
+ range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+
+ for (i = 0; i < path->p_tree_depth; i++) {
+ el = path->p_node[i].el;
+ idx = le16_to_cpu(el->l_next_free_rec) - 1;
+ rec = &el->l_recs[idx];
+
+ rec->e_int_clusters = cpu_to_le32(range);
+ le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
+
+ ocfs2_journal_dirty(handle, path->p_node[i].bh);
+ }
+}
+
+static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_path *path, int unlink_start)
+{
+ int ret, i;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+ struct buffer_head *bh;
+
+ for(i = unlink_start; i < path_num_items(path); i++) {
+ bh = path->p_node[i].bh;
+
+ eb = (struct ocfs2_extent_block *)bh->b_data;
+ /*
+ * Not all nodes might have had their final count
+ * decremented by the caller - handle this here.
+ */
+ el = &eb->h_list;
+ if (le16_to_cpu(el->l_next_free_rec) > 1) {
+ mlog(ML_ERROR,
+ "Inode %llu, attempted to remove extent block "
+ "%llu with %u records\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(el->l_next_free_rec));
+
+ ocfs2_journal_dirty(handle, bh);
+ ocfs2_remove_from_cache(inode, bh);
+ continue;
+ }
+
+ el->l_next_free_rec = 0;
+ memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+ ocfs2_journal_dirty(handle, bh);
+
+ ret = ocfs2_cache_extent_block_free(dealloc, eb);
+ if (ret)
+ mlog_errno(ret);
+
+ ocfs2_remove_from_cache(inode, bh);
+ }
+}
+
+static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int i;
+ struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+ struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_block *eb;
+
+ el = path_leaf_el(left_path);
+
+ eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
+
+ for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
+ if (root_el->l_recs[i].e_blkno == eb->h_blkno)
+ break;
+
+ BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
+
+ memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
+ le16_add_cpu(&root_el->l_next_free_rec, -1);
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+ eb->h_next_leaf_blk = 0;
+
+ ocfs2_journal_dirty(handle, root_bh);
+ ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+
+ ocfs2_unlink_path(inode, handle, dealloc, right_path,
+ subtree_index + 1);
+}
+
+static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ int subtree_index,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ int *deleted)
+{
+ int ret, i, del_right_subtree = 0, right_has_empty = 0;
+ struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
+ struct ocfs2_extent_block *eb;
+
+ *deleted = 0;
+
+ right_leaf_el = path_leaf_el(right_path);
+ left_leaf_el = path_leaf_el(left_path);
+ root_bh = left_path->p_node[subtree_index].bh;
+ BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+ if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
+ return 0;
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
+ if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
+ /*
+ * It's legal for us to proceed if the right leaf is
+ * the rightmost one and it has an empty extent. There
+ * are two cases to handle - whether the leaf will be
+ * empty after removal or not. If the leaf isn't empty
+ * then just remove the empty extent up front. The
+ * next block will handle empty leaves by flagging
+ * them for unlink.
+ *
+ * Non rightmost leaves will throw -EAGAIN and the
+ * caller can manually move the subtree and retry.
+ */
+
+ if (eb->h_next_leaf_blk != 0ULL)
+ return -EAGAIN;
+
+ if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
+ ret = ocfs2_journal_access(handle, inode,
+ path_leaf_bh(right_path),
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_remove_empty_extent(right_leaf_el);
+ } else
+ right_has_empty = 1;
+ }
+
+ if (eb->h_next_leaf_blk == 0ULL &&
+ le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
+ /*
+ * We have to update i_last_eb_blk during the meta
+ * data delete.
+ */
+ ret = ocfs2_journal_access(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ del_right_subtree = 1;
+ }
+
+ /*
+ * Getting here with an empty extent in the right path implies
+ * that it's the rightmost path and will be deleted.
+ */
+ BUG_ON(right_has_empty && !del_right_subtree);
+
+ ret = ocfs2_journal_access(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+ ret = ocfs2_journal_access(handle, inode,
+ right_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode,
+ left_path->p_node[i].bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (!right_has_empty) {
+ /*
+ * Only do this if we're moving a real
+ * record. Otherwise, the action is delayed until
+ * after removal of the right path in which case we
+ * can do a simple shift to remove the empty extent.
+ */
+ ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
+ memset(&right_leaf_el->l_recs[0], 0,
+ sizeof(struct ocfs2_extent_rec));
+ }
+ if (eb->h_next_leaf_blk == 0ULL) {
+ /*
+ * Move recs over to get rid of empty extent, decrease
+ * next_free. This is allowed to remove the last
+ * extent in our leaf (setting l_next_free_rec to
+ * zero) - the delete code below won't care.
+ */
+ ocfs2_remove_empty_extent(right_leaf_el);
+ }
+
+ ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
+ if (ret)
+ mlog_errno(ret);
+ ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
+ if (ret)
+ mlog_errno(ret);
+
+ if (del_right_subtree) {
+ ocfs2_unlink_subtree(inode, handle, left_path, right_path,
+ subtree_index, dealloc);
+ ocfs2_update_edge_lengths(inode, handle, left_path);
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+ di->i_last_eb_blk = eb->h_blkno;
+
+ /*
+ * Removal of the extent in the left leaf was skipped
+ * above so we could delete the right path
+ * 1st.
+ */
+ if (right_has_empty)
+ ocfs2_remove_empty_extent(left_leaf_el);
+
+ ret = ocfs2_journal_dirty(handle, di_bh);
+ if (ret)
+ mlog_errno(ret);
+
+ *deleted = 1;
+ } else
+ ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+ subtree_index);
+
+out:
+ return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the right of the current one.
+ *
+ * Will return zero if the path passed in is already the rightmost path.
+ *
+ * This looks similar, but is subtly different to
+ * ocfs2_find_cpos_for_left_leaf().
+ */
+static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
+ struct ocfs2_path *path, u32 *cpos)
+{
+ int i, j, ret = 0;
+ u64 blkno;
+ struct ocfs2_extent_list *el;
+
+ *cpos = 0;
+
+ if (path->p_tree_depth == 0)
+ return 0;
+
+ blkno = path_leaf_bh(path)->b_blocknr;
+
+ /* Start at the tree node just above the leaf and work our way up. */
+ i = path->p_tree_depth - 1;
+ while (i >= 0) {
+ int next_free;
+
+ el = path->p_node[i].el;
+
+ /*
+ * Find the extent record just after the one in our
+ * path.
+ */
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+ if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+ if (j == (next_free - 1)) {
+ if (i == 0) {
+ /*
+ * We've determined that the
+ * path specified is already
+ * the rightmost one - return a
+ * cpos of zero.
+ */
+ goto out;
+ }
+ /*
+ * The rightmost record points to our
+ * leaf - we need to travel up the
+ * tree one level.
+ */
+ goto next_node;
+ }
+
+ *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
+ goto out;
+ }
+ }
+
+ /*
+ * If we got here, we never found a valid node where
+ * the tree indicated one should be.
+ */
+ ocfs2_error(sb,
+ "Invalid extent tree at extent block %llu\n",
+ (unsigned long long)blkno);
+ ret = -EROFS;
+ goto out;
+
+next_node:
+ blkno = path->p_node[i].bh->b_blocknr;
+ i--;
+ }
+
+out:
+ return ret;
+}
+
+static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
+ handle_t *handle,
+ struct buffer_head *bh,
+ struct ocfs2_extent_list *el)
+{
+ int ret;
+
+ if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+ return 0;
+
+ ret = ocfs2_journal_access(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_remove_empty_extent(el);
+
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int __ocfs2_rotate_tree_left(struct inode *inode,
+ handle_t *handle, int orig_credits,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_path **empty_extent_path)
+{
+ int ret, subtree_root, deleted;
+ u32 right_cpos;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_path *right_path = NULL;
+
+ BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
+
+ *empty_extent_path = NULL;
+
+ ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
+ &right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ left_path = ocfs2_new_path(path_root_bh(path),
+ path_root_el(path));
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_cp_path(left_path, path);
+
+ right_path = ocfs2_new_path(path_root_bh(path),
+ path_root_el(path));
+ if (!right_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (right_cpos) {
+ ret = ocfs2_find_path(inode, right_path, right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ subtree_root = ocfs2_find_subtree_root(inode, left_path,
+ right_path);
+
+ mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
+ subtree_root,
+ (unsigned long long)
+ right_path->p_node[subtree_root].bh->b_blocknr,
+ right_path->p_tree_depth);
+
+ ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
+ orig_credits, left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
+ right_path, subtree_root,
+ dealloc, &deleted);
+ if (ret == -EAGAIN) {
+ /*
+ * The rotation has to temporarily stop due to
+ * the right subtree having an empty
+ * extent. Pass it back to the caller for a
+ * fixup.
+ */
+ *empty_extent_path = right_path;
+ right_path = NULL;
+ goto out;
+ }
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * The subtree rotate might have removed records on
+ * the rightmost edge. If so, then rotation is
+ * complete.
+ */
+ if (deleted)
+ break;
+
+ ocfs2_mv_path(left_path, right_path);
+
+ ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
+ &right_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ ocfs2_free_path(right_path);
+ ocfs2_free_path(left_path);
+
+ return ret;
+}
+
+static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, subtree_index;
+ u32 cpos;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+
+ /*
+ * XXX: This code assumes that the root is an inode, which is
+ * true for now but may change as tree code gets generic.
+ */
+ di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ ret = -EIO;
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has invalid path root",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ goto out;
+ }
+
+ /*
+ * There's two ways we handle this depending on
+ * whether path is the only existing one.
+ */
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(inode, handle, path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (cpos) {
+ /*
+ * We have a path to the left of this one - it needs
+ * an update too.
+ */
+ left_path = ocfs2_new_path(path_root_bh(path),
+ path_root_el(path));
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(inode, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(inode, handle, left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
+
+ ocfs2_unlink_subtree(inode, handle, left_path, path,
+ subtree_index, dealloc);
+ ocfs2_update_edge_lengths(inode, handle, left_path);
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
+ di->i_last_eb_blk = eb->h_blkno;
+ } else {
+ /*
+ * 'path' is also the leftmost path which
+ * means it must be the only one. This gets
+ * handled differently because we want to
+ * revert the inode back to having extents
+ * in-line.
+ */
+ ocfs2_unlink_path(inode, handle, dealloc, path, 1);
+
+ el = &di->id2.i_list;
+ el->l_tree_depth = 0;
+ el->l_next_free_rec = 0;
+ memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+
+ di->i_last_eb_blk = 0;
+ }
+
+ ocfs2_journal_dirty(handle, path_root_bh(path));
+
+out:
+ ocfs2_free_path(left_path);
+ return ret;
+}
+
+/*
+ * Left rotation of btree records.
+ *
+ * In many ways, this is (unsurprisingly) the opposite of right
+ * rotation. We start at some non-rightmost path containing an empty
+ * extent in the leaf block. The code works its way to the rightmost
+ * path by rotating records to the left in every subtree.
+ *
+ * This is used by any code which reduces the number of extent records
+ * in a leaf. After removal, an empty record should be placed in the
+ * leftmost list position.
+ *
+ * This won't handle a length update of the rightmost path records if
+ * the rightmost tree leaf record is removed so the caller is
+ * responsible for detecting and correcting that.
+ */
+static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *path,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, orig_credits = handle->h_buffer_credits;
+ struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+
+ el = path_leaf_el(path);
+ if (!ocfs2_is_empty_extent(&el->l_recs[0]))
+ return 0;
+
+ if (path->p_tree_depth == 0) {
+rightmost_no_delete:
+ /*
+ * In-inode extents. This is trivially handled, so do
+ * it up front.
+ */
+ ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
+ path_leaf_bh(path),
+ path_leaf_el(path));
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Handle rightmost branch now. There's several cases:
+ * 1) simple rotation leaving records in there. That's trivial.
+ * 2) rotation requiring a branch delete - there's no more
+ * records left. Two cases of this:
+ * a) There are branches to the left.
+ * b) This is also the leftmost (the only) branch.
+ *
+ * 1) is handled via ocfs2_rotate_rightmost_leaf_left()
+ * 2a) we need the left branch so that we can update it with the unlink
+ * 2b) we need to bring the inode back to inline extents.
+ */
+
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+ el = &eb->h_list;
+ if (eb->h_next_leaf_blk == 0) {
+ /*
+ * This gets a bit tricky if we're going to delete the
+ * rightmost path. Get the other cases out of the way
+ * 1st.
+ */
+ if (le16_to_cpu(el->l_next_free_rec) > 1)
+ goto rightmost_no_delete;
+
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ret = -EIO;
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has empty extent block at %llu",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)le64_to_cpu(eb->h_blkno));
+ goto out;
+ }
+
+ /*
+ * XXX: The caller can not trust "path" any more after
+ * this as it will have been deleted. What do we do?
+ *
+ * In theory the rotate-for-merge code will never get
+ * here because it'll always ask for a rotate in a
+ * nonempty list.
+ */
+
+ ret = ocfs2_remove_rightmost_path(inode, handle, path,
+ dealloc);
+ if (ret)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Now we can loop, remembering the path we get from -EAGAIN
+ * and restarting from there.
+ */
+try_rotate:
+ ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
+ dealloc, &restart_path);
+ if (ret && ret != -EAGAIN) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ while (ret == -EAGAIN) {
+ tmp_path = restart_path;
+ restart_path = NULL;
+
+ ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
+ tmp_path, dealloc,
+ &restart_path);
+ if (ret && ret != -EAGAIN) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ocfs2_free_path(tmp_path);
+ tmp_path = NULL;
+
+ if (ret == 0)
+ goto try_rotate;
+ }
+
+out:
+ ocfs2_free_path(tmp_path);
+ ocfs2_free_path(restart_path);
+ return ret;
+}
+
+static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
+ int index)
+{
+ struct ocfs2_extent_rec *rec = &el->l_recs[index];
+ unsigned int size;
+
+ if (rec->e_leaf_clusters == 0) {
+ /*
+ * We consumed all of the merged-from record. An empty
+ * extent cannot exist anywhere but the 1st array
+ * position, so move things over if the merged-from
+ * record doesn't occupy that position.
+ *
+ * This creates a new empty extent so the caller
+ * should be smart enough to have removed any existing
+ * ones.
+ */
+ if (index > 0) {
+ BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+ size = index * sizeof(struct ocfs2_extent_rec);
+ memmove(&el->l_recs[1], &el->l_recs[0], size);
+ }
+
+ /*
+ * Always memset - the caller doesn't check whether it
+ * created an empty extent, so there could be junk in
+ * the other fields.
+ */
+ memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+ }
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the beginning of the record at index + 1.
+ */
+static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
+ handle_t *handle,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_extent_list *el, int index)
+{
+ int ret;
+ unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+ struct ocfs2_extent_rec *left_rec;
+ struct ocfs2_extent_rec *right_rec;
+
+ BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
+
+ left_rec = &el->l_recs[index];
+ right_rec = &el->l_recs[index + 1];
+
+ ret = ocfs2_journal_access(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
+
+ le32_add_cpu(&right_rec->e_cpos, -split_clusters);
+ le64_add_cpu(&right_rec->e_blkno,
+ -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+ le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
+
+ ocfs2_cleanup_merge(el, index);
+
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+/*
+ * Remove split_rec clusters from the record at index and merge them
+ * onto the tail of the record at index - 1.
+ */
+static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
+ handle_t *handle,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_extent_list *el, int index)
+{
+ int ret, has_empty_extent = 0;
+ unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
+ struct ocfs2_extent_rec *left_rec;
+ struct ocfs2_extent_rec *right_rec;
+
+ BUG_ON(index <= 0);
+
+ left_rec = &el->l_recs[index - 1];
+ right_rec = &el->l_recs[index];
+ if (ocfs2_is_empty_extent(&el->l_recs[0]))
+ has_empty_extent = 1;
+
+ ret = ocfs2_journal_access(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (has_empty_extent && index == 1) {
+ /*
+ * The easy case - we can just plop the record right in.
+ */
+ *left_rec = *split_rec;
+
+ has_empty_extent = 0;
+ } else {
+ le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
+ }
+
+ le32_add_cpu(&right_rec->e_cpos, split_clusters);
+ le64_add_cpu(&right_rec->e_blkno,
+ ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
+ le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
+
+ ocfs2_cleanup_merge(el, index);
+
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ return ret;
+}
+
+static int ocfs2_try_to_merge_extent(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_path *left_path,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ struct ocfs2_merge_ctxt *ctxt)
+
+{
+ int ret = 0, delete_tail_recs = 0;
+ struct ocfs2_extent_list *el = path_leaf_el(left_path);
+ struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+
+ BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
+
+ if (ctxt->c_split_covers_rec) {
+ delete_tail_recs++;
+
+ if (ctxt->c_contig_type == CONTIG_LEFTRIGHT ||
+ ctxt->c_has_empty_extent)
+ delete_tail_recs++;
+
+ if (ctxt->c_has_empty_extent) {
+ /*
+ * The merge code will need to create an empty
+ * extent to take the place of the newly
+ * emptied slot. Remove any pre-existing empty
+ * extents - having more than one in a leaf is
+ * illegal.
+ */
+ ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ split_index--;
+ rec = &el->l_recs[split_index];
+ }
+ }
+
+ if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
+ /*
+ * Left-right contig implies this.
+ */
+ BUG_ON(!ctxt->c_split_covers_rec);
+ BUG_ON(split_index == 0);
+
+ /*
+ * Since the leftright insert always covers the entire
+ * extent, this call will delete the insert record
+ * entirely, resulting in an empty extent record added to
+ * the extent block.
+ *
+ * Since the adding of an empty extent shifts
+ * everything back to the right, there's no need to
+ * update split_index here.
+ */
+ ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path),
+ handle, split_rec, el, split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We can only get this from logic error above.
+ */
+ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+
+ /*
+ * The left merge left us with an empty extent, remove
+ * it.
+ */
+ ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ split_index--;
+ rec = &el->l_recs[split_index];
+
+ /*
+ * Note that we don't pass split_rec here on purpose -
+ * we've merged it into the left side.
+ */
+ ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path),
+ handle, rec, el, split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
+
+ ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+ dealloc);
+ /*
+ * Error from this last rotate is not critical, so
+ * print but don't bubble it up.
+ */
+ if (ret)
+ mlog_errno(ret);
+ ret = 0;
+ } else {
+ /*
+ * Merge a record to the left or right.
+ *
+ * 'contig_type' is relative to the existing record,
+ * so for example, if we're "right contig", it's to
+ * the record on the left (hence the left merge).
+ */
+ if (ctxt->c_contig_type == CONTIG_RIGHT) {
+ ret = ocfs2_merge_rec_left(inode,
+ path_leaf_bh(left_path),
+ handle, split_rec, el,
+ split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ ret = ocfs2_merge_rec_right(inode,
+ path_leaf_bh(left_path),
+ handle, split_rec, el,
+ split_index);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ if (ctxt->c_split_covers_rec) {
+ /*
+ * The merge may have left an empty extent in
+ * our leaf. Try to rotate it away.
+ */
+ ret = ocfs2_rotate_tree_left(inode, handle, left_path,
+ dealloc);
+ if (ret)
+ mlog_errno(ret);
+ ret = 0;
+ }
+ }
+
+out:
+ return ret;
+}
+
+static void ocfs2_subtract_from_rec(struct super_block *sb,
+ enum ocfs2_split_type split,
+ struct ocfs2_extent_rec *rec,
+ struct ocfs2_extent_rec *split_rec)
+{
+ u64 len_blocks;
+
+ len_blocks = ocfs2_clusters_to_blocks(sb,
+ le16_to_cpu(split_rec->e_leaf_clusters));
+
+ if (split == SPLIT_LEFT) {
+ /*
+ * Region is on the left edge of the existing
+ * record.
+ */
+ le32_add_cpu(&rec->e_cpos,
+ le16_to_cpu(split_rec->e_leaf_clusters));
+ le64_add_cpu(&rec->e_blkno, len_blocks);
+ le16_add_cpu(&rec->e_leaf_clusters,
+ -le16_to_cpu(split_rec->e_leaf_clusters));
+ } else {
+ /*
+ * Region is on the right edge of the existing
+ * record.
+ */
+ le16_add_cpu(&rec->e_leaf_clusters,
+ -le16_to_cpu(split_rec->e_leaf_clusters));
+ }
+}
+
/*
* Do the final bits of extent record insertion at the target leaf
* list. If this leaf is part of an allocation tree, it is assumed
@@ -1738,6 +2994,15 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+ if (insert->ins_split != SPLIT_NONE) {
+ i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
+ BUG_ON(i == -1);
+ rec = &el->l_recs[i];
+ ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
+ insert_rec);
+ goto rotate;
+ }
+
/*
* Contiguous insert - either left or right.
*/
@@ -1792,6 +3057,7 @@ static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
return;
}
+rotate:
/*
* Ok, we have to rotate.
*
@@ -1815,13 +3081,53 @@ static inline void ocfs2_update_dinode_clusters(struct inode *inode,
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
+static void ocfs2_adjust_rightmost_records(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_path *path,
+ struct ocfs2_extent_rec *insert_rec)
+{
+ int ret, i, next_free;
+ struct buffer_head *bh;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_rec *rec;
+
+ /*
+ * Update everything except the leaf block.
+ */
+ for (i = 0; i < path->p_tree_depth; i++) {
+ bh = path->p_node[i].bh;
+ el = path->p_node[i].el;
+
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (next_free == 0) {
+ ocfs2_error(inode->i_sb,
+ "Dinode %llu has a bad extent list",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ ret = -EIO;
+ return;
+ }
+
+ rec = &el->l_recs[next_free - 1];
+
+ rec->e_int_clusters = insert_rec->e_cpos;
+ le32_add_cpu(&rec->e_int_clusters,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
+ le32_add_cpu(&rec->e_int_clusters,
+ -le32_to_cpu(rec->e_cpos));
+
+ ret = ocfs2_journal_dirty(handle, bh);
+ if (ret)
+ mlog_errno(ret);
+
+ }
+}
+
static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
struct ocfs2_extent_rec *insert_rec,
struct ocfs2_path *right_path,
struct ocfs2_path **ret_left_path)
{
- int ret, i, next_free;
- struct buffer_head *bh;
+ int ret, next_free;
struct ocfs2_extent_list *el;
struct ocfs2_path *left_path = NULL;
@@ -1887,40 +3193,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
goto out;
}
- el = path_root_el(right_path);
- bh = path_root_bh(right_path);
- i = 0;
- while (1) {
- struct ocfs2_extent_rec *rec;
-
- next_free = le16_to_cpu(el->l_next_free_rec);
- if (next_free == 0) {
- ocfs2_error(inode->i_sb,
- "Dinode %llu has a bad extent list",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- ret = -EIO;
- goto out;
- }
-
- rec = &el->l_recs[next_free - 1];
-
- rec->e_int_clusters = insert_rec->e_cpos;
- le32_add_cpu(&rec->e_int_clusters,
- le16_to_cpu(insert_rec->e_leaf_clusters));
- le32_add_cpu(&rec->e_int_clusters,
- -le32_to_cpu(rec->e_cpos));
-
- ret = ocfs2_journal_dirty(handle, bh);
- if (ret)
- mlog_errno(ret);
-
- /* Don't touch the leaf node */
- if (++i >= right_path->p_tree_depth)
- break;
-
- bh = right_path->p_node[i].bh;
- el = right_path->p_node[i].el;
- }
+ ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
*ret_left_path = left_path;
ret = 0;
@@ -1931,6 +3204,83 @@ out:
return ret;
}
+static void ocfs2_split_record(struct inode *inode,
+ struct ocfs2_path *left_path,
+ struct ocfs2_path *right_path,
+ struct ocfs2_extent_rec *split_rec,
+ enum ocfs2_split_type split)
+{
+ int index;
+ u32 cpos = le32_to_cpu(split_rec->e_cpos);
+ struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
+ struct ocfs2_extent_rec *rec, *tmprec;
+
+ right_el = path_leaf_el(right_path);;
+ if (left_path)
+ left_el = path_leaf_el(left_path);
+
+ el = right_el;
+ insert_el = right_el;
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index != -1) {
+ if (index == 0 && left_path) {
+ BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
+
+ /*
+ * This typically means that the record
+ * started in the left path but moved to the
+ * right as a result of rotation. We either
+ * move the existing record to the left, or we
+ * do the later insert there.
+ *
+ * In this case, the left path should always
+ * exist as the rotate code will have passed
+ * it back for a post-insert update.
+ */
+
+ if (split == SPLIT_LEFT) {
+ /*
+ * It's a left split. Since we know
+ * that the rotate code gave us an
+ * empty extent in the left path, we
+ * can just do the insert there.
+ */
+ insert_el = left_el;
+ } else {
+ /*
+ * Right split - we have to move the
+ * existing record over to the left
+ * leaf. The insert will be into the
+ * newly created empty extent in the
+ * right leaf.
+ */
+ tmprec = &right_el->l_recs[index];
+ ocfs2_rotate_leaf(left_el, tmprec);
+ el = left_el;
+
+ memset(tmprec, 0, sizeof(*tmprec));
+ index = ocfs2_search_extent_list(left_el, cpos);
+ BUG_ON(index == -1);
+ }
+ }
+ } else {
+ BUG_ON(!left_path);
+ BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
+ /*
+ * Left path is easy - we can just allow the insert to
+ * happen.
+ */
+ el = left_el;
+ insert_el = left_el;
+ index = ocfs2_search_extent_list(el, cpos);
+ BUG_ON(index == -1);
+ }
+
+ rec = &el->l_recs[index];
+ ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
+ ocfs2_rotate_leaf(insert_el, split_rec);
+}
+
/*
* This function only does inserts on an allocation b-tree. For dinode
* lists, ocfs2_insert_at_leaf() is called directly.
@@ -1948,7 +3298,6 @@ static int ocfs2_insert_path(struct inode *inode,
{
int ret, subtree_index;
struct buffer_head *leaf_bh = path_leaf_bh(right_path);
- struct ocfs2_extent_list *el;
/*
* Pass both paths to the journal. The majority of inserts
@@ -1984,9 +3333,18 @@ static int ocfs2_insert_path(struct inode *inode,
}
}
- el = path_leaf_el(right_path);
+ if (insert->ins_split != SPLIT_NONE) {
+ /*
+ * We could call ocfs2_insert_at_leaf() for some types
+ * of splits, but it's easier to just let one seperate
+ * function sort it all out.
+ */
+ ocfs2_split_record(inode, left_path, right_path,
+ insert_rec, insert->ins_split);
+ } else
+ ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
+ insert, inode);
- ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
ret = ocfs2_journal_dirty(handle, leaf_bh);
if (ret)
mlog_errno(ret);
@@ -2075,7 +3433,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
* can wind up skipping both of these two special cases...
*/
if (rotate) {
- ret = ocfs2_rotate_tree_right(inode, handle,
+ ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
le32_to_cpu(insert_rec->e_cpos),
right_path, &left_path);
if (ret) {
@@ -2100,8 +3458,9 @@ static int ocfs2_do_insert_extent(struct inode *inode,
}
out_update_clusters:
- ocfs2_update_dinode_clusters(inode, di,
- le16_to_cpu(insert_rec->e_leaf_clusters));
+ if (type->ins_split == SPLIT_NONE)
+ ocfs2_update_dinode_clusters(inode, di,
+ le16_to_cpu(insert_rec->e_leaf_clusters));
ret = ocfs2_journal_dirty(handle, di_bh);
if (ret)
@@ -2114,6 +3473,44 @@ out:
return ret;
}
+static enum ocfs2_contig_type
+ocfs2_figure_merge_contig_type(struct inode *inode,
+ struct ocfs2_extent_list *el, int index,
+ struct ocfs2_extent_rec *split_rec)
+{
+ struct ocfs2_extent_rec *rec;
+ enum ocfs2_contig_type ret = CONTIG_NONE;
+
+ /*
+ * We're careful to check for an empty extent record here -
+ * the merge code will know what to do if it sees one.
+ */
+
+ if (index > 0) {
+ rec = &el->l_recs[index - 1];
+ if (index == 1 && ocfs2_is_empty_extent(rec)) {
+ if (split_rec->e_cpos == el->l_recs[index].e_cpos)
+ ret = CONTIG_RIGHT;
+ } else {
+ ret = ocfs2_extent_contig(inode, rec, split_rec);
+ }
+ }
+
+ if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) {
+ enum ocfs2_contig_type contig_type;
+
+ rec = &el->l_recs[index + 1];
+ contig_type = ocfs2_extent_contig(inode, rec, split_rec);
+
+ if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
+ ret = CONTIG_LEFTRIGHT;
+ else if (ret == CONTIG_NONE)
+ ret = contig_type;
+ }
+
+ return ret;
+}
+
static void ocfs2_figure_contig_type(struct inode *inode,
struct ocfs2_insert_type *insert,
struct ocfs2_extent_list *el,
@@ -2205,6 +3602,8 @@ static int ocfs2_figure_insert_type(struct inode *inode,
struct ocfs2_path *path = NULL;
struct buffer_head *bh = NULL;
+ insert->ins_split = SPLIT_NONE;
+
el = &di->id2.i_list;
insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
@@ -2327,9 +3726,10 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
u32 cpos,
u64 start_blk,
u32 new_clusters,
+ u8 flags,
struct ocfs2_alloc_context *meta_ac)
{
- int status, shift;
+ int status;
struct buffer_head *last_eb_bh = NULL;
struct buffer_head *bh = NULL;
struct ocfs2_insert_type insert = {0, };
@@ -2350,6 +3750,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
rec.e_cpos = cpu_to_le32(cpos);
rec.e_blkno = cpu_to_le64(start_blk);
rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+ rec.e_flags = flags;
status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
&insert);
@@ -2364,55 +3765,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
insert.ins_free_records, insert.ins_tree_depth);
- /*
- * Avoid growing the tree unless we're out of records and the
- * insert type requres one.
- */
- if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
- goto out_add;
-
- shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
- if (shift < 0) {
- status = shift;
- mlog_errno(status);
- goto bail;
- }
-
- /* We traveled all the way to the bottom of the allocation tree
- * and didn't find room for any more extents - we need to add
- * another tree level */
- if (shift) {
- BUG_ON(bh);
- mlog(0, "need to shift tree depth "
- "(current = %d)\n", insert.ins_tree_depth);
-
- /* ocfs2_shift_tree_depth will return us a buffer with
- * the new extent block (so we can pass that to
- * ocfs2_add_branch). */
- status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
- meta_ac, &bh);
- if (status < 0) {
+ if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) {
+ status = ocfs2_grow_tree(inode, handle, fe_bh,
+ &insert.ins_tree_depth, &last_eb_bh,
+ meta_ac);
+ if (status) {
mlog_errno(status);
goto bail;
}
- insert.ins_tree_depth++;
- /* Special case: we have room now if we shifted from
- * tree_depth 0 */
- if (insert.ins_tree_depth == 1)
- goto out_add;
- }
-
- /* call ocfs2_add_branch to add the final part of the tree with
- * the new data. */
- mlog(0, "add branch. bh = %p\n", bh);
- status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
- meta_ac);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
}
-out_add:
/* Finally, we can add clusters. This might rotate the tree for us. */
status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
if (status < 0)
@@ -2431,7 +3793,720 @@ bail:
return status;
}
-static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+static void ocfs2_make_right_split_rec(struct super_block *sb,
+ struct ocfs2_extent_rec *split_rec,
+ u32 cpos,
+ struct ocfs2_extent_rec *rec)
+{
+ u32 rec_cpos = le32_to_cpu(rec->e_cpos);
+ u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
+
+ memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
+
+ split_rec->e_cpos = cpu_to_le32(cpos);
+ split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
+
+ split_rec->e_blkno = rec->e_blkno;
+ le64_add_cpu(&split_rec->e_blkno,
+ ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
+
+ split_rec->e_flags = rec->e_flags;
+}
+
+static int ocfs2_split_and_insert(struct inode *inode,
+ handle_t *handle,
+ struct ocfs2_path *path,
+ struct buffer_head *di_bh,
+ struct buffer_head **last_eb_bh,
+ int split_index,
+ struct ocfs2_extent_rec *orig_split_rec,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret = 0, depth;
+ unsigned int insert_range, rec_range, do_leftright = 0;
+ struct ocfs2_extent_rec tmprec;
+ struct ocfs2_extent_list *rightmost_el;
+ struct ocfs2_extent_rec rec;
+ struct ocfs2_extent_rec split_rec = *orig_split_rec;
+ struct ocfs2_insert_type insert;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_dinode *di;
+
+leftright:
+ /*
+ * Store a copy of the record on the stack - it might move
+ * around as the tree is manipulated below.
+ */
+ rec = path_leaf_el(path)->l_recs[split_index];
+
+ di = (struct ocfs2_dinode *)di_bh->b_data;
+ rightmost_el = &di->id2.i_list;
+
+ depth = le16_to_cpu(rightmost_el->l_tree_depth);
+ if (depth) {
+ BUG_ON(!(*last_eb_bh));
+ eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
+ rightmost_el = &eb->h_list;
+ }
+
+ if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+ le16_to_cpu(rightmost_el->l_count)) {
+ int old_depth = depth;
+
+ ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (old_depth != depth) {
+ eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
+ rightmost_el = &eb->h_list;
+ }
+ }
+
+ memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+ insert.ins_appending = APPEND_NONE;
+ insert.ins_contig = CONTIG_NONE;
+ insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
+ - le16_to_cpu(rightmost_el->l_next_free_rec);
+ insert.ins_tree_depth = depth;
+
+ insert_range = le32_to_cpu(split_rec.e_cpos) +
+ le16_to_cpu(split_rec.e_leaf_clusters);
+ rec_range = le32_to_cpu(rec.e_cpos) +
+ le16_to_cpu(rec.e_leaf_clusters);
+
+ if (split_rec.e_cpos == rec.e_cpos) {
+ insert.ins_split = SPLIT_LEFT;
+ } else if (insert_range == rec_range) {
+ insert.ins_split = SPLIT_RIGHT;
+ } else {
+ /*
+ * Left/right split. We fake this as a right split
+ * first and then make a second pass as a left split.
+ */
+ insert.ins_split = SPLIT_RIGHT;
+
+ ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
+ &rec);
+
+ split_rec = tmprec;
+
+ BUG_ON(do_leftright);
+ do_leftright = 1;
+ }
+
+ ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
+ &insert);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (do_leftright == 1) {
+ u32 cpos;
+ struct ocfs2_extent_list *el;
+
+ do_leftright++;
+ split_rec = *orig_split_rec;
+
+ ocfs2_reinit_path(path, 1);
+
+ cpos = le32_to_cpu(split_rec.e_cpos);
+ ret = ocfs2_find_path(inode, path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+ split_index = ocfs2_search_extent_list(el, cpos);
+ goto leftright;
+ }
+out:
+
+ return ret;
+}
+
+/*
+ * Mark part or all of the extent record at split_index in the leaf
+ * pointed to by path as written. This removes the unwritten
+ * extent flag.
+ *
+ * Care is taken to handle contiguousness so as to not grow the tree.
+ *
+ * meta_ac is not strictly necessary - we only truly need it if growth
+ * of the tree is required. All other cases will degrade into a less
+ * optimal tree layout.
+ *
+ * last_eb_bh should be the rightmost leaf block for any inode with a
+ * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call.
+ *
+ * This code is optimized for readability - several passes might be
+ * made over certain portions of the tree. All of those blocks will
+ * have been brought into cache (and pinned via the journal), so the
+ * extra overhead is not expressed in terms of disk reads.
+ */
+static int __ocfs2_mark_extent_written(struct inode *inode,
+ struct buffer_head *di_bh,
+ handle_t *handle,
+ struct ocfs2_path *path,
+ int split_index,
+ struct ocfs2_extent_rec *split_rec,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret = 0;
+ struct ocfs2_extent_list *el = path_leaf_el(path);
+ struct buffer_head *eb_bh, *last_eb_bh = NULL;
+ struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
+ struct ocfs2_merge_ctxt ctxt;
+ struct ocfs2_extent_list *rightmost_el;
+
+ if (!rec->e_flags & OCFS2_EXT_UNWRITTEN) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
+ ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
+ (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
+ ret = -EIO;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb_bh = path_leaf_bh(path);
+ ret = ocfs2_journal_access(handle, inode, eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el,
+ split_index,
+ split_rec);
+
+ /*
+ * The core merge / split code wants to know how much room is
+ * left in this inodes allocation tree, so we pass the
+ * rightmost extent list.
+ */
+ if (path->p_tree_depth) {
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+ le64_to_cpu(di->i_last_eb_blk),
+ &last_eb_bh, OCFS2_BH_CACHED, inode);
+ if (ret) {
+ mlog_exit(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ ret = -EROFS;
+ goto out;
+ }
+
+ rightmost_el = &eb->h_list;
+ } else
+ rightmost_el = path_root_el(path);
+
+ ctxt.c_used_tail_recs = le16_to_cpu(rightmost_el->l_next_free_rec);
+ if (ctxt.c_used_tail_recs > 0 &&
+ ocfs2_is_empty_extent(&rightmost_el->l_recs[0]))
+ ctxt.c_used_tail_recs--;
+
+ if (rec->e_cpos == split_rec->e_cpos &&
+ rec->e_leaf_clusters == split_rec->e_leaf_clusters)
+ ctxt.c_split_covers_rec = 1;
+ else
+ ctxt.c_split_covers_rec = 0;
+
+ ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+ mlog(0, "index: %d, contig: %u, used_tail_recs: %u, "
+ "has_empty: %u, split_covers: %u\n", split_index,
+ ctxt.c_contig_type, ctxt.c_used_tail_recs,
+ ctxt.c_has_empty_extent, ctxt.c_split_covers_rec);
+
+ if (ctxt.c_contig_type == CONTIG_NONE) {
+ if (ctxt.c_split_covers_rec)
+ el->l_recs[split_index] = *split_rec;
+ else
+ ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
+ &last_eb_bh, split_index,
+ split_rec, meta_ac);
+ if (ret)
+ mlog_errno(ret);
+ } else {
+ ret = ocfs2_try_to_merge_extent(inode, handle, path,
+ split_index, split_rec,
+ dealloc, &ctxt);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+ ocfs2_journal_dirty(handle, eb_bh);
+
+out:
+ brelse(last_eb_bh);
+ return ret;
+}
+
+/*
+ * Mark the already-existing extent at cpos as written for len clusters.
+ *
+ * If the existing extent is larger than the request, initiate a
+ * split. An attempt will be made at merging with adjacent extents.
+ *
+ * The caller is responsible for passing down meta_ac if we'll need it.
+ */
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+ handle_t *handle, u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, index;
+ u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
+ struct ocfs2_extent_rec split_rec;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_list *el;
+
+ mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
+ inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
+
+ if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
+ ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
+ "that are being written to, but the feature bit "
+ "is not set in the super block.",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * XXX: This should be fixed up so that we just re-insert the
+ * next extent records.
+ */
+ ocfs2_extent_map_trunc(inode, 0);
+
+ left_path = ocfs2_new_inode_path(di_bh);
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(inode, left_path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ el = path_leaf_el(left_path);
+
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
+ split_rec.e_cpos = cpu_to_le32(cpos);
+ split_rec.e_leaf_clusters = cpu_to_le16(len);
+ split_rec.e_blkno = cpu_to_le64(start_blkno);
+ split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
+ split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
+
+ ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
+ index, &split_rec, meta_ac, dealloc);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ ocfs2_free_path(left_path);
+ return ret;
+}
+
+static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
+ handle_t *handle, struct ocfs2_path *path,
+ int index, u32 new_range,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int ret, depth, credits = handle->h_buffer_credits;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+ struct buffer_head *last_eb_bh = NULL;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *rightmost_el, *el;
+ struct ocfs2_extent_rec split_rec;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_insert_type insert;
+
+ /*
+ * Setup the record to split before we grow the tree.
+ */
+ el = path_leaf_el(path);
+ rec = &el->l_recs[index];
+ ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
+
+ depth = path->p_tree_depth;
+ if (depth > 0) {
+ ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+ le64_to_cpu(di->i_last_eb_blk),
+ &last_eb_bh, OCFS2_BH_CACHED, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ rightmost_el = &eb->h_list;
+ } else
+ rightmost_el = path_leaf_el(path);
+
+ credits += path->p_tree_depth + ocfs2_extend_meta_needed(di);
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
+ le16_to_cpu(rightmost_el->l_count)) {
+ int old_depth = depth;
+
+ ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
+ meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (old_depth != depth) {
+ eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+ rightmost_el = &eb->h_list;
+ }
+ }
+
+ memset(&insert, 0, sizeof(struct ocfs2_insert_type));
+ insert.ins_appending = APPEND_NONE;
+ insert.ins_contig = CONTIG_NONE;
+ insert.ins_split = SPLIT_RIGHT;
+ insert.ins_free_records = le16_to_cpu(rightmost_el->l_count)
+ - le16_to_cpu(rightmost_el->l_next_free_rec);
+ insert.ins_tree_depth = depth;
+
+ ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
+ if (ret)
+ mlog_errno(ret);
+
+out:
+ brelse(last_eb_bh);
+ return ret;
+}
+
+static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
+ struct ocfs2_path *path, int index,
+ struct ocfs2_cached_dealloc_ctxt *dealloc,
+ u32 cpos, u32 len)
+{
+ int ret;
+ u32 left_cpos, rec_range, trunc_range;
+ int wants_rotate = 0, is_rightmost_tree_rec = 0;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_path *left_path = NULL;
+ struct ocfs2_extent_list *el = path_leaf_el(path);
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_extent_block *eb;
+
+ if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
+ ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ index--;
+ }
+
+ if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
+ path->p_tree_depth) {
+ /*
+ * Check whether this is the rightmost tree record. If
+ * we remove all of this record or part of its right
+ * edge then an update of the record lengths above it
+ * will be required.
+ */
+ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
+ if (eb->h_next_leaf_blk == 0)
+ is_rightmost_tree_rec = 1;
+ }
+
+ rec = &el->l_recs[index];
+ if (index == 0 && path->p_tree_depth &&
+ le32_to_cpu(rec->e_cpos) == cpos) {
+ /*
+ * Changing the leftmost offset (via partial or whole
+ * record truncate) of an interior (or rightmost) path
+ * means we have to update the subtree that is formed
+ * by this leaf and the one to it's left.
+ *
+ * There are two cases we can skip:
+ * 1) Path is the leftmost one in our inode tree.
+ * 2) The leaf is rightmost and will be empty after
+ * we remove the extent record - the rotate code
+ * knows how to update the newly formed edge.
+ */
+
+ ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
+ &left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
+ left_path = ocfs2_new_path(path_root_bh(path),
+ path_root_el(path));
+ if (!left_path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(inode, left_path, left_cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ }
+
+ ret = ocfs2_extend_rotate_transaction(handle, 0,
+ handle->h_buffer_credits,
+ path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(inode, handle, path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access_path(inode, handle, left_path);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+ trunc_range = cpos + len;
+
+ if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
+ int next_free;
+
+ memset(rec, 0, sizeof(*rec));
+ ocfs2_cleanup_merge(el, index);
+ wants_rotate = 1;
+
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (is_rightmost_tree_rec && next_free > 1) {
+ /*
+ * We skip the edge update if this path will
+ * be deleted by the rotate code.
+ */
+ rec = &el->l_recs[next_free - 1];
+ ocfs2_adjust_rightmost_records(inode, handle, path,
+ rec);
+ }
+ } else if (le32_to_cpu(rec->e_cpos) == cpos) {
+ /* Remove leftmost portion of the record. */
+ le32_add_cpu(&rec->e_cpos, len);
+ le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
+ le16_add_cpu(&rec->e_leaf_clusters, -len);
+ } else if (rec_range == trunc_range) {
+ /* Remove rightmost portion of the record */
+ le16_add_cpu(&rec->e_leaf_clusters, -len);
+ if (is_rightmost_tree_rec)
+ ocfs2_adjust_rightmost_records(inode, handle, path, rec);
+ } else {
+ /* Caller should have trapped this. */
+ mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
+ "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ le32_to_cpu(rec->e_cpos),
+ le16_to_cpu(rec->e_leaf_clusters), cpos, len);
+ BUG();
+ }
+
+ if (left_path) {
+ int subtree_index;
+
+ subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
+ ocfs2_complete_edge_insert(inode, handle, left_path, path,
+ subtree_index);
+ }
+
+ ocfs2_journal_dirty(handle, path_leaf_bh(path));
+
+ ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+out:
+ ocfs2_free_path(left_path);
+ return ret;
+}
+
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+ u32 cpos, u32 len, handle_t *handle,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret, index;
+ u32 rec_range, trunc_range;
+ struct ocfs2_extent_rec *rec;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_path *path;
+
+ ocfs2_extent_map_trunc(inode, 0);
+
+ path = ocfs2_new_inode_path(di_bh);
+ if (!path) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_find_path(inode, path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu has an extent at cpos %u which can no "
+ "longer be found.\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * We have 3 cases of extent removal:
+ * 1) Range covers the entire extent rec
+ * 2) Range begins or ends on one edge of the extent rec
+ * 3) Range is in the middle of the extent rec (no shared edges)
+ *
+ * For case 1 we remove the extent rec and left rotate to
+ * fill the hole.
+ *
+ * For case 2 we just shrink the existing extent rec, with a
+ * tree update if the shrinking edge is also the edge of an
+ * extent block.
+ *
+ * For case 3 we do a right split to turn the extent rec into
+ * something case 2 can handle.
+ */
+ rec = &el->l_recs[index];
+ rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
+ trunc_range = cpos + len;
+
+ BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
+
+ mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
+ "(cpos %u, len %u)\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
+ le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
+
+ if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
+ ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
+ cpos, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ } else {
+ ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
+ trunc_range, meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * The split could have manipulated the tree enough to
+ * move the record location, so we have to look for it again.
+ */
+ ocfs2_reinit_path(path, 1);
+
+ ret = ocfs2_find_path(inode, path, cpos);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ el = path_leaf_el(path);
+ index = ocfs2_search_extent_list(el, cpos);
+ if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu: split at cpos %u lost record.",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ cpos);
+ ret = -EROFS;
+ goto out;
+ }
+
+ /*
+ * Double check our values here. If anything is fishy,
+ * it's easier to catch it at the top level.
+ */
+ rec = &el->l_recs[index];
+ rec_range = le32_to_cpu(rec->e_cpos) +
+ ocfs2_rec_clusters(el, rec);
+ if (rec_range != trunc_range) {
+ ocfs2_error(inode->i_sb,
+ "Inode %llu: error after split at cpos %u"
+ "trunc len %u, existing record is (%u,%u)",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ cpos, len, le32_to_cpu(rec->e_cpos),
+ ocfs2_rec_clusters(el, rec));
+ ret = -EROFS;
+ goto out;
+ }
+
+ ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
+ cpos, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+out:
+ ocfs2_free_path(path);
+ return ret;
+}
+
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
{
struct buffer_head *tl_bh = osb->osb_tl_bh;
struct ocfs2_dinode *di;
@@ -2464,10 +4539,10 @@ static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
return current_tail == new_start;
}
-static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
- handle_t *handle,
- u64 start_blk,
- unsigned int num_clusters)
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+ handle_t *handle,
+ u64 start_blk,
+ unsigned int num_clusters)
{
int status, index;
unsigned int start_cluster, tl_count;
@@ -2623,7 +4698,7 @@ bail:
}
/* Expects you to already be holding tl_inode->i_mutex */
-static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
int status;
unsigned int num_to_flush;
@@ -2957,6 +5032,219 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
return status;
}
+/*
+ * Delayed de-allocation of suballocator blocks.
+ *
+ * Some sets of block de-allocations might involve multiple suballocator inodes.
+ *
+ * The locking for this can get extremely complicated, especially when
+ * the suballocator inodes to delete from aren't known until deep
+ * within an unrelated codepath.
+ *
+ * ocfs2_extent_block structures are a good example of this - an inode
+ * btree could have been grown by any number of nodes each allocating
+ * out of their own suballoc inode.
+ *
+ * These structures allow the delay of block de-allocation until a
+ * later time, when locking of multiple cluster inodes won't cause
+ * deadlock.
+ */
+
+/*
+ * Describes a single block free from a suballocator
+ */
+struct ocfs2_cached_block_free {
+ struct ocfs2_cached_block_free *free_next;
+ u64 free_blk;
+ unsigned int free_bit;
+};
+
+struct ocfs2_per_slot_free_list {
+ struct ocfs2_per_slot_free_list *f_next_suballocator;
+ int f_inode_type;
+ int f_slot;
+ struct ocfs2_cached_block_free *f_first;
+};
+
+static int ocfs2_free_cached_items(struct ocfs2_super *osb,
+ int sysfile_type,
+ int slot,
+ struct ocfs2_cached_block_free *head)
+{
+ int ret;
+ u64 bg_blkno;
+ handle_t *handle;
+ struct inode *inode;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_cached_block_free *tmp;
+
+ inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
+ if (!inode) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mutex_lock(&inode->i_mutex);
+
+ ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_mutex;
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_unlock;
+ }
+
+ while (head) {
+ bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
+ head->free_bit);
+ mlog(0, "Free bit: (bit %u, blkno %llu)\n",
+ head->free_bit, (unsigned long long)head->free_blk);
+
+ ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
+ head->free_bit, bg_blkno, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_journal;
+ }
+
+ ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_journal;
+ }
+
+ tmp = head;
+ head = head->free_next;
+ kfree(tmp);
+ }
+
+out_journal:
+ ocfs2_commit_trans(osb, handle);
+
+out_unlock:
+ ocfs2_meta_unlock(inode, 1);
+ brelse(di_bh);
+out_mutex:
+ mutex_unlock(&inode->i_mutex);
+ iput(inode);
+out:
+ while(head) {
+ /* Premature exit may have left some dangling items. */
+ tmp = head;
+ head = head->free_next;
+ kfree(tmp);
+ }
+
+ return ret;
+}
+
+int ocfs2_run_deallocs(struct ocfs2_super *osb,
+ struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+ int ret = 0, ret2;
+ struct ocfs2_per_slot_free_list *fl;
+
+ if (!ctxt)
+ return 0;
+
+ while (ctxt->c_first_suballocator) {
+ fl = ctxt->c_first_suballocator;
+
+ if (fl->f_first) {
+ mlog(0, "Free items: (type %u, slot %d)\n",
+ fl->f_inode_type, fl->f_slot);
+ ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
+ fl->f_slot, fl->f_first);
+ if (ret2)
+ mlog_errno(ret2);
+ if (!ret)
+ ret = ret2;
+ }
+
+ ctxt->c_first_suballocator = fl->f_next_suballocator;
+ kfree(fl);
+ }
+
+ return ret;
+}
+
+static struct ocfs2_per_slot_free_list *
+ocfs2_find_per_slot_free_list(int type,
+ int slot,
+ struct ocfs2_cached_dealloc_ctxt *ctxt)
+{
+ struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
+
+ while (fl) {
+ if (fl->f_inode_type == type && fl->f_slot == slot)
+ return fl;
+
+ fl = fl->f_next_suballocator;
+ }
+
+ fl = kmalloc(sizeof(*fl), GFP_NOFS);
+ if (fl) {
+ fl->f_inode_type = type;
+ fl->f_slot = slot;
+ fl->f_first = NULL;
+ fl->f_next_suballocator = ctxt->c_first_suballocator;
+
+ ctxt->c_first_suballocator = fl;
+ }
+ return fl;
+}
+
+static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ int type, int slot, u64 blkno,
+ unsigned int bit)
+{
+ int ret;
+ struct ocfs2_per_slot_free_list *fl;
+ struct ocfs2_cached_block_free *item;
+
+ fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
+ if (fl == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ item = kmalloc(sizeof(*item), GFP_NOFS);
+ if (item == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
+ type, slot, bit, (unsigned long long)blkno);
+
+ item->free_blk = blkno;
+ item->free_bit = bit;
+ item->free_next = fl->f_first;
+
+ fl->f_first = item;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ struct ocfs2_extent_block *eb)
+{
+ return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
+ le16_to_cpu(eb->h_suballoc_slot),
+ le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(eb->h_suballoc_bit));
+}
+
/* This function will figure out whether the currently last extent
* block will be deleted, and if it will, what the new last extent
* block will be so we can update his h_next_leaf_blk field, as well
@@ -3238,27 +5526,10 @@ delete:
BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
- if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
- /*
- * This code only understands how to
- * lock the suballocator in slot 0,
- * which is fine because allocation is
- * only ever done out of that
- * suballocator too. A future version
- * might change that however, so avoid
- * a free if we don't know how to
- * handle it. This way an fs incompat
- * bit will not be necessary.
- */
- ret = ocfs2_free_extent_block(handle,
- tc->tc_ext_alloc_inode,
- tc->tc_ext_alloc_bh,
- eb);
-
- /* An error here is not fatal. */
- if (ret < 0)
- mlog_errno(ret);
- }
+ ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
+ /* An error here is not fatal. */
+ if (ret < 0)
+ mlog_errno(ret);
} else {
deleted_eb = 0;
}
@@ -3397,9 +5668,9 @@ static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
return ocfs2_journal_dirty_data(handle, bh);
}
-static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
- struct page **pages, int numpages,
- u64 phys, handle_t *handle)
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
+ loff_t end, struct page **pages,
+ int numpages, u64 phys, handle_t *handle)
{
int i, ret, partial = 0;
void *kaddr;
@@ -3412,26 +5683,14 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
if (numpages == 0)
goto out;
- from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
- if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
- /*
- * Since 'from' has been capped to a value below page
- * size, this calculation won't be able to overflow
- * 'to'
- */
- to = ocfs2_align_bytes_to_clusters(sb, from);
-
- /*
- * The truncate tail in this case should never contain
- * more than one page at maximum. The loop below also
- * assumes this.
- */
- BUG_ON(numpages != 1);
- }
-
+ to = PAGE_CACHE_SIZE;
for(i = 0; i < numpages; i++) {
page = pages[i];
+ from = start & (PAGE_CACHE_SIZE - 1);
+ if ((end >> PAGE_CACHE_SHIFT) == page->index)
+ to = end & (PAGE_CACHE_SIZE - 1);
+
BUG_ON(from > PAGE_CACHE_SIZE);
BUG_ON(to > PAGE_CACHE_SIZE);
@@ -3468,10 +5727,7 @@ static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
flush_dcache_page(page);
- /*
- * Every page after the 1st one should be completely zero'd.
- */
- from = 0;
+ start = (page->index + 1) << PAGE_CACHE_SHIFT;
}
out:
if (pages) {
@@ -3484,24 +5740,26 @@ out:
}
}
-static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
- int *num, u64 *phys)
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
+ struct page **pages, int *num, u64 *phys)
{
int i, numpages = 0, ret = 0;
- unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
unsigned int ext_flags;
struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
unsigned long index;
- u64 next_cluster_bytes;
+ loff_t last_page_bytes;
BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+ BUG_ON(start > end);
- /* Cluster boundary, so we don't need to grab any pages. */
- if ((isize & (csize - 1)) == 0)
+ if (start == end)
goto out;
- ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+ BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
+ (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
+
+ ret = ocfs2_extent_map_get_blocks(inode, start >> sb->s_blocksize_bits,
phys, NULL, &ext_flags);
if (ret) {
mlog_errno(ret);
@@ -3517,8 +5775,8 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
if (ext_flags & OCFS2_EXT_UNWRITTEN)
goto out;
- next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
- index = isize >> PAGE_CACHE_SHIFT;
+ last_page_bytes = PAGE_ALIGN(end);
+ index = start >> PAGE_CACHE_SHIFT;
do {
pages[numpages] = grab_cache_page(mapping, index);
if (!pages[numpages]) {
@@ -3529,7 +5787,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page *
numpages++;
index++;
- } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
+ } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
out:
if (ret != 0) {
@@ -3558,11 +5816,10 @@ out:
* otherwise block_write_full_page() will skip writeout of pages past
* i_size. The new_i_size parameter is passed for this reason.
*/
-int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
- u64 new_i_size)
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+ u64 range_start, u64 range_end)
{
int ret, numpages;
- loff_t endbyte;
struct page **pages = NULL;
u64 phys;
@@ -3581,7 +5838,8 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
goto out;
}
- ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+ ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
+ &numpages, &phys);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3590,17 +5848,16 @@ int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
if (numpages == 0)
goto out;
- ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
- handle);
+ ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
+ numpages, phys, handle);
/*
* Initiate writeout of the pages we zero'd here. We don't
* wait on them - the truncate_inode_pages() call later will
* do that for us.
*/
- endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
- ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
- endbyte - 1, SYNC_FILE_RANGE_WRITE);
+ ret = do_sync_mapping_range(inode->i_mapping, range_start,
+ range_end - 1, SYNC_FILE_RANGE_WRITE);
if (ret)
mlog_errno(ret);
@@ -3631,8 +5888,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
mlog_entry_void();
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode));
@@ -3754,7 +6009,6 @@ start:
goto start;
bail:
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_schedule_truncate_log_flush(osb, 1);
@@ -3764,6 +6018,8 @@ bail:
if (handle)
ocfs2_commit_trans(osb, handle);
+ ocfs2_run_deallocs(osb, &tc->tc_dealloc);
+
ocfs2_free_path(path);
/* This will drop the ext_alloc cluster lock for us */
@@ -3774,23 +6030,18 @@ bail:
}
/*
- * Expects the inode to already be locked. This will figure out which
- * inodes need to be locked and will put them on the returned truncate
- * context.
+ * Expects the inode to already be locked.
*/
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
struct ocfs2_truncate_context **tc)
{
- int status, metadata_delete, i;
+ int status;
unsigned int new_i_clusters;
struct ocfs2_dinode *fe;
struct ocfs2_extent_block *eb;
- struct ocfs2_extent_list *el;
struct buffer_head *last_eb_bh = NULL;
- struct inode *ext_alloc_inode = NULL;
- struct buffer_head *ext_alloc_bh = NULL;
mlog_entry_void();
@@ -3810,12 +6061,9 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
mlog_errno(status);
goto bail;
}
+ ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
- metadata_delete = 0;
if (fe->id2.i_list.l_tree_depth) {
- /* If we have a tree, then the truncate may result in
- * metadata deletes. Figure this out from the
- * rightmost leaf block.*/
status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
&last_eb_bh, OCFS2_BH_CACHED, inode);
if (status < 0) {
@@ -3830,43 +6078,10 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
status = -EIO;
goto bail;
}
- el = &(eb->h_list);
-
- i = 0;
- if (ocfs2_is_empty_extent(&el->l_recs[0]))
- i = 1;
- /*
- * XXX: Should we check that next_free_rec contains
- * the extent?
- */
- if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
- metadata_delete = 1;
}
(*tc)->tc_last_eb_bh = last_eb_bh;
- if (metadata_delete) {
- mlog(0, "Will have to delete metadata for this trunc. "
- "locking allocator.\n");
- ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0);
- if (!ext_alloc_inode) {
- status = -ENOMEM;
- mlog_errno(status);
- goto bail;
- }
-
- mutex_lock(&ext_alloc_inode->i_mutex);
- (*tc)->tc_ext_alloc_inode = ext_alloc_inode;
-
- status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1);
- if (status < 0) {
- mlog_errno(status);
- goto bail;
- }
- (*tc)->tc_ext_alloc_bh = ext_alloc_bh;
- (*tc)->tc_ext_alloc_locked = 1;
- }
-
status = 0;
bail:
if (status < 0) {
@@ -3880,16 +6095,13 @@ bail:
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
{
- if (tc->tc_ext_alloc_inode) {
- if (tc->tc_ext_alloc_locked)
- ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1);
-
- mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex);
- iput(tc->tc_ext_alloc_inode);
- }
-
- if (tc->tc_ext_alloc_bh)
- brelse(tc->tc_ext_alloc_bh);
+ /*
+ * The caller is responsible for completing deallocation
+ * before freeing the context.
+ */
+ if (tc->tc_dealloc.c_first_suballocator != NULL)
+ mlog(ML_NOTICE,
+ "Truncate completion has non-empty dealloc context\n");
if (tc->tc_last_eb_bh)
brelse(tc->tc_last_eb_bh);
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fbcb593..990df48 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -34,7 +34,17 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
u32 cpos,
u64 start_blk,
u32 new_clusters,
+ u8 flags,
struct ocfs2_alloc_context *meta_ac);
+struct ocfs2_cached_dealloc_ctxt;
+int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
+ handle_t *handle, u32 cpos, u32 len, u32 phys,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
+ u32 cpos, u32 len, handle_t *handle,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
struct ocfs2_dinode *fe);
@@ -62,17 +72,41 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode **tl_copy);
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
struct ocfs2_dinode *tl_copy);
+int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb);
+int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+ handle_t *handle,
+ u64 start_blk,
+ unsigned int num_clusters);
+int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+
+/*
+ * Process local structure which describes the block unlinks done
+ * during an operation. This is populated via
+ * ocfs2_cache_block_dealloc().
+ *
+ * ocfs2_run_deallocs() should be called after the potentially
+ * de-allocating routines. No journal handles should be open, and most
+ * locks should have been dropped.
+ */
+struct ocfs2_cached_dealloc_ctxt {
+ struct ocfs2_per_slot_free_list *c_first_suballocator;
+};
+static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
+{
+ c->c_first_suballocator = NULL;
+}
+int ocfs2_run_deallocs(struct ocfs2_super *osb,
+ struct ocfs2_cached_dealloc_ctxt *ctxt);
struct ocfs2_truncate_context {
- struct inode *tc_ext_alloc_inode;
- struct buffer_head *tc_ext_alloc_bh;
+ struct ocfs2_cached_dealloc_ctxt tc_dealloc;
int tc_ext_alloc_locked; /* is it cluster locked? */
/* these get destroyed once it's passed to ocfs2_commit_truncate. */
struct buffer_head *tc_last_eb_bh;
};
-int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
- u64 new_i_size);
+int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
+ u64 range_start, u64 range_end);
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
struct inode *inode,
struct buffer_head *fe_bh,
@@ -84,6 +118,7 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
u32 cpos, struct buffer_head **leaf_bh);
+int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster);
/*
* Helper function to look at the # of clusters in an extent record.
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a480b09..84bf6e7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
bh = bh->b_this_page, block_start += bsize) {
block_end = block_start + bsize;
+ clear_buffer_new(bh);
+
/*
* Ignore blocks outside of our i/o range -
* they may belong to unallocated clusters.
@@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
* For an allocating write with cluster size >= page
* size, we always write the entire page.
*/
-
- if (buffer_new(bh))
- clear_buffer_new(bh);
+ if (new)
+ set_buffer_new(bh);
if (!buffer_mapped(bh)) {
map_bh(bh, inode->i_sb, *p_blkno);
@@ -711,7 +712,8 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
if (!buffer_uptodate(bh))
set_buffer_uptodate(bh);
} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
- (block_start < from || block_end > to)) {
+ !buffer_new(bh) &&
+ (block_start < from || block_end > to)) {
ll_rw_block(READ, 1, &bh);
*wait_bh++=bh;
}
@@ -738,18 +740,13 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
bh = head;
block_start = 0;
do {
- void *kaddr;
-
block_end = block_start + bsize;
if (block_end <= from)
goto next_bh;
if (block_start >= to)
break;
- kaddr = kmap_atomic(page, KM_USER0);
- memset(kaddr+block_start, 0, bh->b_size);
- flush_dcache_page(page);
- kunmap_atomic(kaddr, KM_USER0);
+ zero_user_page(page, block_start, bh->b_size, KM_USER0);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
@@ -761,217 +758,240 @@ next_bh:
return ret;
}
+#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
+#define OCFS2_MAX_CTXT_PAGES 1
+#else
+#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#endif
+
+#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+
/*
- * This will copy user data from the buffer page in the splice
- * context.
- *
- * For now, we ignore SPLICE_F_MOVE as that would require some extra
- * communication out all the way to ocfs2_write().
+ * Describe the state of a single cluster to be written to.
*/
-int ocfs2_map_and_write_splice_data(struct inode *inode,
- struct ocfs2_write_ctxt *wc, u64 *p_blkno,
- unsigned int *ret_from, unsigned int *ret_to)
+struct ocfs2_write_cluster_desc {
+ u32 c_cpos;
+ u32 c_phys;
+ /*
+ * Give this a unique field because c_phys eventually gets
+ * filled.
+ */
+ unsigned c_new;
+ unsigned c_unwritten;
+};
+
+static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d)
{
- int ret;
- unsigned int to, from, cluster_start, cluster_end;
- char *src, *dst;
- struct ocfs2_splice_write_priv *sp = wc->w_private;
- struct pipe_buffer *buf = sp->s_buf;
- unsigned long bytes, src_from;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ return d->c_new || d->c_unwritten;
+}
- ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
- &cluster_end);
+struct ocfs2_write_ctxt {
+ /* Logical cluster position / len of write */
+ u32 w_cpos;
+ u32 w_clen;
- from = sp->s_offset;
- src_from = sp->s_buf_offset;
- bytes = wc->w_count;
+ struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
- if (wc->w_large_pages) {
- /*
- * For cluster size < page size, we have to
- * calculate pos within the cluster and obey
- * the rightmost boundary.
- */
- bytes = min(bytes, (unsigned long)(osb->s_clustersize
- - (wc->w_pos & (osb->s_clustersize - 1))));
- }
- to = from + bytes;
+ /*
+ * This is true if page_size > cluster_size.
+ *
+ * It triggers a set of special cases during write which might
+ * have to deal with allocating writes to partial pages.
+ */
+ unsigned int w_large_pages;
+
+ /*
+ * Pages involved in this write.
+ *
+ * w_target_page is the page being written to by the user.
+ *
+ * w_pages is an array of pages which always contains
+ * w_target_page, and in the case of an allocating write with
+ * page_size < cluster size, it will contain zero'd and mapped
+ * pages adjacent to w_target_page which need to be written
+ * out in so that future reads from that region will get
+ * zero's.
+ */
+ struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
+ unsigned int w_num_pages;
+ struct page *w_target_page;
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
- BUG_ON(from < cluster_start);
- BUG_ON(to > cluster_end);
+ /*
+ * ocfs2_write_end() uses this to know what the real range to
+ * write in the target should be.
+ */
+ unsigned int w_target_from;
+ unsigned int w_target_to;
- if (wc->w_this_page_new)
- ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
- cluster_start, cluster_end, 1);
- else
- ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
- from, to, 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
+ /*
+ * We could use journal_current_handle() but this is cleaner,
+ * IMHO -Mark
+ */
+ handle_t *w_handle;
+
+ struct buffer_head *w_di_bh;
+
+ struct ocfs2_cached_dealloc_ctxt w_dealloc;
+};
+
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+ int i;
+
+ for(i = 0; i < wc->w_num_pages; i++) {
+ if (wc->w_pages[i] == NULL)
+ continue;
+
+ unlock_page(wc->w_pages[i]);
+ mark_page_accessed(wc->w_pages[i]);
+ page_cache_release(wc->w_pages[i]);
}
- src = buf->ops->map(sp->s_pipe, buf, 1);
- dst = kmap_atomic(wc->w_this_page, KM_USER1);
- memcpy(dst + from, src + src_from, bytes);
- kunmap_atomic(wc->w_this_page, KM_USER1);
- buf->ops->unmap(sp->s_pipe, buf, src);
+ brelse(wc->w_di_bh);
+ kfree(wc);
+}
+
+static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
+ struct ocfs2_super *osb, loff_t pos,
+ unsigned len, struct buffer_head *di_bh)
+{
+ struct ocfs2_write_ctxt *wc;
+
+ wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
+ if (!wc)
+ return -ENOMEM;
- wc->w_finished_copy = 1;
+ wc->w_cpos = pos >> osb->s_clustersize_bits;
+ wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
+ get_bh(di_bh);
+ wc->w_di_bh = di_bh;
- *ret_from = from;
- *ret_to = to;
-out:
+ if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+ wc->w_large_pages = 1;
+ else
+ wc->w_large_pages = 0;
+
+ ocfs2_init_dealloc_ctxt(&wc->w_dealloc);
+
+ *wcp = wc;
- return bytes ? (unsigned int)bytes : ret;
+ return 0;
}
/*
- * This will copy user data from the iovec in the buffered write
- * context.
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
*/
-int ocfs2_map_and_write_user_data(struct inode *inode,
- struct ocfs2_write_ctxt *wc, u64 *p_blkno,
- unsigned int *ret_from, unsigned int *ret_to)
+static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
{
- int ret;
- unsigned int to, from, cluster_start, cluster_end;
- unsigned long bytes, src_from;
- char *dst;
- struct ocfs2_buffered_write_priv *bp = wc->w_private;
- const struct iovec *cur_iov = bp->b_cur_iov;
- char __user *buf;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ unsigned int block_start, block_end;
+ struct buffer_head *head, *bh;
- ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
- &cluster_end);
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ return;
- buf = cur_iov->iov_base + bp->b_cur_off;
- src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+ bh = head = page_buffers(page);
+ block_start = 0;
+ do {
+ block_end = block_start + bh->b_size;
- from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+ if (buffer_new(bh)) {
+ if (block_end > from && block_start < to) {
+ if (!PageUptodate(page)) {
+ unsigned start, end;
- /*
- * This is a lot of comparisons, but it reads quite
- * easily, which is important here.
- */
- /* Stay within the src page */
- bytes = PAGE_SIZE - src_from;
- /* Stay within the vector */
- bytes = min(bytes,
- (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
- /* Stay within count */
- bytes = min(bytes, (unsigned long)wc->w_count);
- /*
- * For clustersize > page size, just stay within
- * target page, otherwise we have to calculate pos
- * within the cluster and obey the rightmost
- * boundary.
- */
- if (wc->w_large_pages) {
- /*
- * For cluster size < page size, we have to
- * calculate pos within the cluster and obey
- * the rightmost boundary.
- */
- bytes = min(bytes, (unsigned long)(osb->s_clustersize
- - (wc->w_pos & (osb->s_clustersize - 1))));
- } else {
- /*
- * cluster size > page size is the most common
- * case - we just stay within the target page
- * boundary.
- */
- bytes = min(bytes, PAGE_CACHE_SIZE - from);
- }
+ start = max(from, block_start);
+ end = min(to, block_end);
- to = from + bytes;
+ zero_user_page(page, start, end - start, KM_USER0);
+ set_buffer_uptodate(bh);
+ }
- BUG_ON(from > PAGE_CACHE_SIZE);
- BUG_ON(to > PAGE_CACHE_SIZE);
- BUG_ON(from < cluster_start);
- BUG_ON(to > cluster_end);
+ clear_buffer_new(bh);
+ mark_buffer_dirty(bh);
+ }
+ }
- if (wc->w_this_page_new)
- ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
- cluster_start, cluster_end, 1);
- else
- ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
- from, to, 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
- dst = kmap(wc->w_this_page);
- memcpy(dst + from, bp->b_src_buf + src_from, bytes);
- kunmap(wc->w_this_page);
+/*
+ * Only called when we have a failure during allocating write to write
+ * zero's to the newly allocated region.
+ */
+static void ocfs2_write_failure(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ loff_t user_pos, unsigned user_len)
+{
+ int i;
+ unsigned from, to;
+ struct page *tmppage;
- /*
- * XXX: This is slow, but simple. The caller of
- * ocfs2_buffered_write_cluster() is responsible for
- * passing through the iovecs, so it's difficult to
- * predict what our next step is in here after our
- * initial write. A future version should be pushing
- * that iovec manipulation further down.
- *
- * By setting this, we indicate that a copy from user
- * data was done, and subsequent calls for this
- * cluster will skip copying more data.
- */
- wc->w_finished_copy = 1;
+ ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
- *ret_from = from;
- *ret_to = to;
-out:
+ if (wc->w_large_pages) {
+ from = wc->w_target_from;
+ to = wc->w_target_to;
+ } else {
+ from = 0;
+ to = PAGE_CACHE_SIZE;
+ }
+
+ for(i = 0; i < wc->w_num_pages; i++) {
+ tmppage = wc->w_pages[i];
- return bytes ? (unsigned int)bytes : ret;
+ if (ocfs2_should_order_data(inode))
+ walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+ from, to, NULL,
+ ocfs2_journal_dirty_data);
+
+ block_commit_write(tmppage, from, to);
+ }
}
-/*
- * Map, fill and write a page to disk.
- *
- * The work of copying data is done via callback. Newly allocated
- * pages which don't take user data will be zero'd (set 'new' to
- * indicate an allocating write)
- *
- * Returns a negative error code or the number of bytes copied into
- * the page.
- */
-static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
- u64 *p_blkno, struct page *page,
- struct ocfs2_write_ctxt *wc, int new)
+static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
+ struct ocfs2_write_ctxt *wc,
+ struct page *page, u32 cpos,
+ loff_t user_pos, unsigned user_len,
+ int new)
{
- int ret, copied = 0;
- unsigned int from = 0, to = 0;
+ int ret;
+ unsigned int map_from = 0, map_to = 0;
unsigned int cluster_start, cluster_end;
- unsigned int zero_from = 0, zero_to = 0;
+ unsigned int user_data_from = 0, user_data_to = 0;
- ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+ ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
&cluster_start, &cluster_end);
- if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
- && !wc->w_finished_copy) {
-
- wc->w_this_page = page;
- wc->w_this_page_new = new;
- ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
- if (ret < 0) {
+ if (page == wc->w_target_page) {
+ map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+ map_to = map_from + user_len;
+
+ if (new)
+ ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+ cluster_start, cluster_end,
+ new);
+ else
+ ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+ map_from, map_to, new);
+ if (ret) {
mlog_errno(ret);
goto out;
}
- copied = ret;
-
- zero_from = from;
- zero_to = to;
+ user_data_from = map_from;
+ user_data_to = map_to;
if (new) {
- from = cluster_start;
- to = cluster_end;
+ map_from = cluster_start;
+ map_to = cluster_end;
}
+
+ wc->w_target_from = map_from;
+ wc->w_target_to = map_to;
} else {
/*
* If we haven't allocated the new page yet, we
@@ -980,11 +1000,11 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
*/
BUG_ON(!new);
- from = cluster_start;
- to = cluster_end;
+ map_from = cluster_start;
+ map_to = cluster_end;
ret = ocfs2_map_page_blocks(page, p_blkno, inode,
- cluster_start, cluster_end, 1);
+ cluster_start, cluster_end, new);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1003,108 +1023,113 @@ static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
*/
if (new && !PageUptodate(page))
ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
- wc->w_cpos, zero_from, zero_to);
+ cpos, user_data_from, user_data_to);
flush_dcache_page(page);
- if (ocfs2_should_order_data(inode)) {
- ret = walk_page_buffers(handle,
- page_buffers(page),
- from, to, NULL,
- ocfs2_journal_dirty_data);
- if (ret < 0)
- mlog_errno(ret);
- }
-
- /*
- * We don't use generic_commit_write() because we need to
- * handle our own i_size update.
- */
- ret = block_commit_write(page, from, to);
- if (ret)
- mlog_errno(ret);
out:
-
- return copied ? copied : ret;
+ return ret;
}
/*
- * Do the actual write of some data into an inode. Optionally allocate
- * in order to fulfill the write.
- *
- * cpos is the logical cluster offset within the file to write at
- *
- * 'phys' is the physical mapping of that offset. a 'phys' value of
- * zero indicates that allocation is required. In this case, data_ac
- * and meta_ac should be valid (meta_ac can be null if metadata
- * allocation isn't required).
+ * This function will only grab one clusters worth of pages.
*/
-static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
- struct buffer_head *di_bh,
- struct ocfs2_alloc_context *data_ac,
- struct ocfs2_alloc_context *meta_ac,
- struct ocfs2_write_ctxt *wc)
+static int ocfs2_grab_pages_for_write(struct address_space *mapping,
+ struct ocfs2_write_ctxt *wc,
+ u32 cpos, loff_t user_pos, int new,
+ struct page *mmap_page)
{
- int ret, i, numpages = 1, new;
- unsigned int copied = 0;
- u32 tmp_pos;
- u64 v_blkno, p_blkno;
- struct address_space *mapping = file->f_mapping;
+ int ret = 0, i;
+ unsigned long start, target_index, index;
struct inode *inode = mapping->host;
- unsigned long index, start;
- struct page **cpages;
- new = phys == 0 ? 1 : 0;
+ target_index = user_pos >> PAGE_CACHE_SHIFT;
/*
* Figure out how many pages we'll be manipulating here. For
* non allocating write, we just change the one
* page. Otherwise, we'll need a whole clusters worth.
*/
- if (new)
- numpages = ocfs2_pages_per_cluster(inode->i_sb);
-
- cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
- if (!cpages) {
- ret = -ENOMEM;
- mlog_errno(ret);
- return ret;
- }
-
- /*
- * Fill our page array first. That way we've grabbed enough so
- * that we can zero and flush if we error after adding the
- * extent.
- */
if (new) {
- start = ocfs2_align_clusters_to_page_index(inode->i_sb,
- wc->w_cpos);
- v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+ wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
+ start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
} else {
- start = wc->w_pos >> PAGE_CACHE_SHIFT;
- v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+ wc->w_num_pages = 1;
+ start = target_index;
}
- for(i = 0; i < numpages; i++) {
+ for(i = 0; i < wc->w_num_pages; i++) {
index = start + i;
- cpages[i] = find_or_create_page(mapping, index, GFP_NOFS);
- if (!cpages[i]) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
+ if (index == target_index && mmap_page) {
+ /*
+ * ocfs2_pagemkwrite() is a little different
+ * and wants us to directly use the page
+ * passed in.
+ */
+ lock_page(mmap_page);
+
+ if (mmap_page->mapping != mapping) {
+ unlock_page(mmap_page);
+ /*
+ * Sanity check - the locking in
+ * ocfs2_pagemkwrite() should ensure
+ * that this code doesn't trigger.
+ */
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ page_cache_get(mmap_page);
+ wc->w_pages[i] = mmap_page;
+ } else {
+ wc->w_pages[i] = find_or_create_page(mapping, index,
+ GFP_NOFS);
+ if (!wc->w_pages[i]) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
}
+
+ if (index == target_index)
+ wc->w_target_page = wc->w_pages[i];
}
+out:
+ return ret;
+}
+
+/*
+ * Prepare a single cluster for write one cluster into the file.
+ */
+static int ocfs2_write_cluster(struct address_space *mapping,
+ u32 phys, unsigned int unwritten,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_write_ctxt *wc, u32 cpos,
+ loff_t user_pos, unsigned user_len)
+{
+ int ret, i, new, should_zero = 0;
+ u64 v_blkno, p_blkno;
+ struct inode *inode = mapping->host;
+
+ new = phys == 0 ? 1 : 0;
+ if (new || unwritten)
+ should_zero = 1;
if (new) {
+ u32 tmp_pos;
+
/*
* This is safe to call with the page locks - it won't take
* any additional semaphores or cluster locks.
*/
- tmp_pos = wc->w_cpos;
+ tmp_pos = cpos;
ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
- &tmp_pos, 1, di_bh, handle,
- data_ac, meta_ac, NULL);
+ &tmp_pos, 1, 0, wc->w_di_bh,
+ wc->w_handle, data_ac,
+ meta_ac, NULL);
/*
* This shouldn't happen because we must have already
* calculated the correct meta data allocation required. The
@@ -1121,159 +1146,433 @@ static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
mlog_errno(ret);
goto out;
}
+ } else if (unwritten) {
+ ret = ocfs2_mark_extent_written(inode, wc->w_di_bh,
+ wc->w_handle, cpos, 1, phys,
+ meta_ac, &wc->w_dealloc);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
}
+ if (should_zero)
+ v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
+ else
+ v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
+
+ /*
+ * The only reason this should fail is due to an inability to
+ * find the extent added.
+ */
ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
NULL);
if (ret < 0) {
-
- /*
- * XXX: Should we go readonly here?
- */
-
- mlog_errno(ret);
+ ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
+ "at logical block %llu",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)v_blkno);
goto out;
}
BUG_ON(p_blkno == 0);
- for(i = 0; i < numpages; i++) {
- ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
- wc, new);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
+ for(i = 0; i < wc->w_num_pages; i++) {
+ int tmpret;
+
+ tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
+ wc->w_pages[i], cpos,
+ user_pos, user_len,
+ should_zero);
+ if (tmpret) {
+ mlog_errno(tmpret);
+ if (ret == 0)
+ tmpret = ret;
}
-
- copied += ret;
}
+ /*
+ * We only have cleanup to do in case of allocating write.
+ */
+ if (ret && new)
+ ocfs2_write_failure(inode, wc, user_pos, user_len);
+
out:
- for(i = 0; i < numpages; i++) {
- unlock_page(cpages[i]);
- mark_page_accessed(cpages[i]);
- page_cache_release(cpages[i]);
+
+ return ret;
+}
+
+static int ocfs2_write_cluster_by_desc(struct address_space *mapping,
+ struct ocfs2_alloc_context *data_ac,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_write_ctxt *wc,
+ loff_t pos, unsigned len)
+{
+ int ret, i;
+ struct ocfs2_write_cluster_desc *desc;
+
+ for (i = 0; i < wc->w_clen; i++) {
+ desc = &wc->w_desc[i];
+
+ ret = ocfs2_write_cluster(mapping, desc->c_phys,
+ desc->c_unwritten, data_ac, meta_ac,
+ wc, desc->c_cpos, pos, len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
}
- kfree(cpages);
- return copied ? copied : ret;
+ ret = 0;
+out:
+ return ret;
}
-static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
- struct ocfs2_super *osb, loff_t pos,
- size_t count, ocfs2_page_writer *cb,
- void *cb_priv)
+/*
+ * ocfs2_write_end() wants to know which parts of the target page it
+ * should complete the write on. It's easiest to compute them ahead of
+ * time when a more complete view of the write is available.
+ */
+static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
+ struct ocfs2_write_ctxt *wc,
+ loff_t pos, unsigned len, int alloc)
{
- wc->w_count = count;
- wc->w_pos = pos;
- wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
- wc->w_finished_copy = 0;
+ struct ocfs2_write_cluster_desc *desc;
- if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
- wc->w_large_pages = 1;
- else
- wc->w_large_pages = 0;
+ wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+ wc->w_target_to = wc->w_target_from + len;
- wc->w_write_data_page = cb;
- wc->w_private = cb_priv;
+ if (alloc == 0)
+ return;
+
+ /*
+ * Allocating write - we may have different boundaries based
+ * on page size and cluster size.
+ *
+ * NOTE: We can no longer compute one value from the other as
+ * the actual write length and user provided length may be
+ * different.
+ */
+
+ if (wc->w_large_pages) {
+ /*
+ * We only care about the 1st and last cluster within
+ * our range and whether they should be zero'd or not. Either
+ * value may be extended out to the start/end of a
+ * newly allocated cluster.
+ */
+ desc = &wc->w_desc[0];
+ if (ocfs2_should_zero_cluster(desc))
+ ocfs2_figure_cluster_boundaries(osb,
+ desc->c_cpos,
+ &wc->w_target_from,
+ NULL);
+
+ desc = &wc->w_desc[wc->w_clen - 1];
+ if (ocfs2_should_zero_cluster(desc))
+ ocfs2_figure_cluster_boundaries(osb,
+ desc->c_cpos,
+ NULL,
+ &wc->w_target_to);
+ } else {
+ wc->w_target_from = 0;
+ wc->w_target_to = PAGE_CACHE_SIZE;
+ }
}
/*
- * Write a cluster to an inode. The cluster may not be allocated yet,
- * in which case it will be. This only exists for buffered writes -
- * O_DIRECT takes a more "traditional" path through the kernel.
- *
- * The caller is responsible for incrementing pos, written counts, etc
+ * Populate each single-cluster write descriptor in the write context
+ * with information about the i/o to be done.
*
- * For file systems that don't support sparse files, pre-allocation
- * and page zeroing up until cpos should be done prior to this
- * function call.
- *
- * Callers should be holding i_sem, and the rw cluster lock.
- *
- * Returns the number of user bytes written, or less than zero for
- * error.
+ * Returns the number of clusters that will have to be allocated, as
+ * well as a worst case estimate of the number of extent records that
+ * would have to be created during a write to an unwritten region.
*/
-ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
- size_t count, ocfs2_page_writer *actor,
- void *priv)
+static int ocfs2_populate_write_desc(struct inode *inode,
+ struct ocfs2_write_ctxt *wc,
+ unsigned int *clusters_to_alloc,
+ unsigned int *extents_to_split)
+{
+ int ret;
+ struct ocfs2_write_cluster_desc *desc;
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+ u32 phys = 0;
+ int i;
+
+ *clusters_to_alloc = 0;
+ *extents_to_split = 0;
+
+ for (i = 0; i < wc->w_clen; i++) {
+ desc = &wc->w_desc[i];
+ desc->c_cpos = wc->w_cpos + i;
+
+ if (num_clusters == 0) {
+ /*
+ * Need to look up the next extent record.
+ */
+ ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
+ &num_clusters, &ext_flags);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Assume worst case - that we're writing in
+ * the middle of the extent.
+ *
+ * We can assume that the write proceeds from
+ * left to right, in which case the extent
+ * insert code is smart enough to coalesce the
+ * next splits into the previous records created.
+ */
+ if (ext_flags & OCFS2_EXT_UNWRITTEN)
+ *extents_to_split = *extents_to_split + 2;
+ } else if (phys) {
+ /*
+ * Only increment phys if it doesn't describe
+ * a hole.
+ */
+ phys++;
+ }
+
+ desc->c_phys = phys;
+ if (phys == 0) {
+ desc->c_new = 1;
+ *clusters_to_alloc = *clusters_to_alloc + 1;
+ }
+ if (ext_flags & OCFS2_EXT_UNWRITTEN)
+ desc->c_unwritten = 1;
+
+ num_clusters--;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata,
+ struct buffer_head *di_bh, struct page *mmap_page)
{
int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
- ssize_t written = 0;
- u32 phys;
- struct inode *inode = file->f_mapping->host;
+ unsigned int clusters_to_alloc, extents_to_split;
+ struct ocfs2_write_ctxt *wc;
+ struct inode *inode = mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
handle_t *handle;
- struct ocfs2_write_ctxt wc;
-
- ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
- ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
if (ret) {
mlog_errno(ret);
- goto out;
+ return ret;
}
- di = (struct ocfs2_dinode *)di_bh->b_data;
-
- /*
- * Take alloc sem here to prevent concurrent lookups. That way
- * the mapping, zeroing and tree manipulation within
- * ocfs2_write() will be safe against ->readpage(). This
- * should also serve to lock out allocation from a shared
- * writeable region.
- */
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
- ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
+ ret = ocfs2_populate_write_desc(inode, wc, &clusters_to_alloc,
+ &extents_to_split);
if (ret) {
mlog_errno(ret);
- goto out_meta;
+ goto out;
}
- /* phys == 0 means that allocation is required. */
- if (phys == 0) {
- ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+ di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
+ /*
+ * We set w_target_from, w_target_to here so that
+ * ocfs2_write_end() knows which range in the target page to
+ * write out. An allocation requires that we write the entire
+ * cluster range.
+ */
+ if (clusters_to_alloc || extents_to_split) {
+ /*
+ * XXX: We are stretching the limits of
+ * ocfs2_lock_allocators(). It greatly over-estimates
+ * the work to be done.
+ */
+ ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
+ extents_to_split, &data_ac, &meta_ac);
if (ret) {
mlog_errno(ret);
- goto out_meta;
+ goto out;
}
- credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
- }
+ credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+ clusters_to_alloc);
- ret = ocfs2_data_lock(inode, 1);
- if (ret) {
- mlog_errno(ret);
- goto out_meta;
}
+ ocfs2_set_target_boundaries(osb, wc, pos, len,
+ clusters_to_alloc + extents_to_split);
+
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
- goto out_data;
+ goto out;
}
- written = ocfs2_write(file, phys, handle, di_bh, data_ac,
- meta_ac, &wc);
- if (written < 0) {
- ret = written;
+ wc->w_handle = handle;
+
+ /*
+ * We don't want this to fail in ocfs2_write_end(), so do it
+ * here.
+ */
+ ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
mlog_errno(ret);
goto out_commit;
}
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ /*
+ * Fill our page array first. That way we've grabbed enough so
+ * that we can zero and flush if we error after adding the
+ * extent.
+ */
+ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+ clusters_to_alloc + extents_to_split,
+ mmap_page);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
- pos += written;
+ ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
+ len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ *pagep = wc->w_target_page;
+ *fsdata = wc;
+ return 0;
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+
+out:
+ ocfs2_free_write_ctxt(wc);
+
+ if (data_ac)
+ ocfs2_free_alloc_context(data_ac);
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+ return ret;
+}
+
+int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ struct inode *inode = mapping->host;
+
+ ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /*
+ * Take alloc sem here to prevent concurrent lookups. That way
+ * the mapping, zeroing and tree manipulation within
+ * ocfs2_write() will be safe against ->readpage(). This
+ * should also serve to lock out allocation from a shared
+ * writeable region.
+ */
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ret = ocfs2_data_lock(inode, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_fail;
+ }
+
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+ fsdata, di_bh, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_fail_data;
+ }
+
+ brelse(di_bh);
+
+ return 0;
+
+out_fail_data:
+ ocfs2_data_unlock(inode, 1);
+out_fail:
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ brelse(di_bh);
+ ocfs2_meta_unlock(inode, 1);
+
+ return ret;
+}
+
+int ocfs2_write_end_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ int i;
+ unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+ struct inode *inode = mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_write_ctxt *wc = fsdata;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+ handle_t *handle = wc->w_handle;
+ struct page *tmppage;
+
+ if (unlikely(copied < len)) {
+ if (!PageUptodate(wc->w_target_page))
+ copied = 0;
+
+ ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
+ start+len);
+ }
+ flush_dcache_page(wc->w_target_page);
+
+ for(i = 0; i < wc->w_num_pages; i++) {
+ tmppage = wc->w_pages[i];
+
+ if (tmppage == wc->w_target_page) {
+ from = wc->w_target_from;
+ to = wc->w_target_to;
+
+ BUG_ON(from > PAGE_CACHE_SIZE ||
+ to > PAGE_CACHE_SIZE ||
+ to < from);
+ } else {
+ /*
+ * Pages adjacent to the target (if any) imply
+ * a hole-filling write in which case we want
+ * to flush their entire range.
+ */
+ from = 0;
+ to = PAGE_CACHE_SIZE;
+ }
+
+ if (ocfs2_should_order_data(inode))
+ walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+ from, to, NULL,
+ ocfs2_journal_dirty_data);
+
+ block_commit_write(tmppage, from, to);
+ }
+
+ pos += copied;
if (pos > inode->i_size) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
@@ -1283,29 +1582,31 @@ ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+ ocfs2_journal_dirty(handle, wc->w_di_bh);
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret)
- mlog_errno(ret);
-
-out_commit:
ocfs2_commit_trans(osb, handle);
-out_data:
- ocfs2_data_unlock(inode, 1);
+ ocfs2_run_deallocs(osb, &wc->w_dealloc);
+
+ ocfs2_free_write_ctxt(wc);
+
+ return copied;
+}
+
+int ocfs2_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ int ret;
+ struct inode *inode = mapping->host;
-out_meta:
+ ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+
+ ocfs2_data_unlock(inode, 1);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
ocfs2_meta_unlock(inode, 1);
-out:
- brelse(di_bh);
- if (data_ac)
- ocfs2_free_alloc_context(data_ac);
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
-
- return written ? written : ret;
+ return ret;
}
const struct address_space_operations ocfs2_aops = {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 45821d4..389579b 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -42,57 +42,22 @@ int walk_page_buffers( handle_t *handle,
int (*fn)( handle_t *handle,
struct buffer_head *bh));
-struct ocfs2_write_ctxt;
-typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
- u64 *, unsigned int *, unsigned int *);
+int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata);
-ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
- size_t count, ocfs2_page_writer *actor,
- void *priv);
+int ocfs2_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
-struct ocfs2_write_ctxt {
- size_t w_count;
- loff_t w_pos;
- u32 w_cpos;
- unsigned int w_finished_copy;
+int ocfs2_write_end_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata);
- /* This is true if page_size > cluster_size */
- unsigned int w_large_pages;
-
- /* Filler callback and private data */
- ocfs2_page_writer *w_write_data_page;
- void *w_private;
-
- /* Only valid for the filler callback */
- struct page *w_this_page;
- unsigned int w_this_page_new;
-};
-
-struct ocfs2_buffered_write_priv {
- char *b_src_buf;
- const struct iovec *b_cur_iov; /* Current iovec */
- size_t b_cur_off; /* Offset in the
- * current iovec */
-};
-int ocfs2_map_and_write_user_data(struct inode *inode,
- struct ocfs2_write_ctxt *wc,
- u64 *p_blkno,
- unsigned int *ret_from,
- unsigned int *ret_to);
-
-struct ocfs2_splice_write_priv {
- struct splice_desc *s_sd;
- struct pipe_buffer *s_buf;
- struct pipe_inode_info *s_pipe;
- /* Neither offset value is ever larger than one page */
- unsigned int s_offset;
- unsigned int s_buf_offset;
-};
-int ocfs2_map_and_write_splice_data(struct inode *inode,
- struct ocfs2_write_ctxt *wc,
- u64 *p_blkno,
- unsigned int *ret_from,
- unsigned int *ret_to);
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata,
+ struct buffer_head *di_bh, struct page *mmap_page);
/* all ocfs2_dio_end_io()'s fault */
#define ocfs2_iocb_is_rw_locked(iocb) \
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9791134..2bd7f78 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1335,6 +1335,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
ret = wait_event_interruptible(o2hb_steady_queue,
atomic_read(&reg->hr_steady_iterations) == 0);
if (ret) {
+ /* We got interrupted (hello ptrace!). Clean up */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
reg->hr_task = NULL;
@@ -1345,7 +1346,16 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
goto out;
}
- ret = count;
+ /* Ok, we were woken. Make sure it wasn't by drop_item() */
+ spin_lock(&o2hb_live_lock);
+ hb_task = reg->hr_task;
+ spin_unlock(&o2hb_live_lock);
+
+ if (hb_task)
+ ret = count;
+ else
+ ret = -EIO;
+
out:
if (filp)
fput(filp);
@@ -1523,6 +1533,15 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
if (hb_task)
kthread_stop(hb_task);
+ /*
+ * If we're racing a dev_write(), we need to wake them. They will
+ * check reg->hr_task
+ */
+ if (atomic_read(&reg->hr_steady_iterations) != 0) {
+ atomic_set(&reg->hr_steady_iterations, 0);
+ wake_up(&o2hb_steady_queue);
+ }
+
config_item_put(item);
}
@@ -1665,7 +1684,67 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
}
EXPORT_SYMBOL_GPL(o2hb_setup_callback);
-int o2hb_register_callback(struct o2hb_callback_func *hc)
+static struct o2hb_region *o2hb_find_region(const char *region_uuid)
+{
+ struct o2hb_region *p, *reg = NULL;
+
+ assert_spin_locked(&o2hb_live_lock);
+
+ list_for_each_entry(p, &o2hb_all_regions, hr_all_item) {
+ if (!strcmp(region_uuid, config_item_name(&p->hr_item))) {
+ reg = p;
+ break;
+ }
+ }
+
+ return reg;
+}
+
+static int o2hb_region_get(const char *region_uuid)
+{
+ int ret = 0;
+ struct o2hb_region *reg;
+
+ spin_lock(&o2hb_live_lock);
+
+ reg = o2hb_find_region(region_uuid);
+ if (!reg)
+ ret = -ENOENT;
+ spin_unlock(&o2hb_live_lock);
+
+ if (ret)
+ goto out;
+
+ ret = o2nm_depend_this_node();
+ if (ret)
+ goto out;
+
+ ret = o2nm_depend_item(&reg->hr_item);
+ if (ret)
+ o2nm_undepend_this_node();
+
+out:
+ return ret;
+}
+
+static void o2hb_region_put(const char *region_uuid)
+{
+ struct o2hb_region *reg;
+
+ spin_lock(&o2hb_live_lock);
+
+ reg = o2hb_find_region(region_uuid);
+
+ spin_unlock(&o2hb_live_lock);
+
+ if (reg) {
+ o2nm_undepend_item(&reg->hr_item);
+ o2nm_undepend_this_node();
+ }
+}
+
+int o2hb_register_callback(const char *region_uuid,
+ struct o2hb_callback_func *hc)
{
struct o2hb_callback_func *tmp;
struct list_head *iter;
@@ -1681,6 +1760,12 @@ int o2hb_register_callback(struct o2hb_callback_func *hc)
goto out;
}
+ if (region_uuid) {
+ ret = o2hb_region_get(region_uuid);
+ if (ret)
+ goto out;
+ }
+
down_write(&o2hb_callback_sem);
list_for_each(iter, &hbcall->list) {
@@ -1702,16 +1787,21 @@ out:
}
EXPORT_SYMBOL_GPL(o2hb_register_callback);
-void o2hb_unregister_callback(struct o2hb_callback_func *hc)
+void o2hb_unregister_callback(const char *region_uuid,
+ struct o2hb_callback_func *hc)
{
BUG_ON(hc->hc_magic != O2HB_CB_MAGIC);
mlog(ML_HEARTBEAT, "on behalf of %p for funcs %p\n",
__builtin_return_address(0), hc);
+ /* XXX Can this happen _with_ a region reference? */
if (list_empty(&hc->hc_item))
return;
+ if (region_uuid)
+ o2hb_region_put(region_uuid);
+
down_write(&o2hb_callback_sem);
list_del_init(&hc->hc_item);
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index cc6d40b..35397dd 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -69,8 +69,10 @@ void o2hb_setup_callback(struct o2hb_callback_func *hc,
o2hb_cb_func *func,
void *data,
int priority);
-int o2hb_register_callback(struct o2hb_callback_func *hc);
-void o2hb_unregister_callback(struct o2hb_callback_func *hc);
+int o2hb_register_callback(const char *region_uuid,
+ struct o2hb_callback_func *hc);
+void o2hb_unregister_callback(const char *region_uuid,
+ struct o2hb_callback_func *hc);
void o2hb_fill_node_map(unsigned long *map,
unsigned bytes);
void o2hb_init(void);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 9f5ad0f..af2070d 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -900,6 +900,46 @@ static struct o2nm_cluster_group o2nm_cluster_group = {
},
};
+int o2nm_depend_item(struct config_item *item)
+{
+ return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+void o2nm_undepend_item(struct config_item *item)
+{
+ configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+}
+
+int o2nm_depend_this_node(void)
+{
+ int ret = 0;
+ struct o2nm_node *local_node;
+
+ local_node = o2nm_get_node_by_num(o2nm_this_node());
+ if (!local_node) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = o2nm_depend_item(&local_node->nd_item);
+ o2nm_node_put(local_node);
+
+out:
+ return ret;
+}
+
+void o2nm_undepend_this_node(void)
+{
+ struct o2nm_node *local_node;
+
+ local_node = o2nm_get_node_by_num(o2nm_this_node());
+ BUG_ON(!local_node);
+
+ o2nm_undepend_item(&local_node->nd_item);
+ o2nm_node_put(local_node);
+}
+
+
static void __exit exit_o2nm(void)
{
if (ocfs2_table_header)
@@ -934,7 +974,7 @@ static int __init init_o2nm(void)
goto out_sysctl;
config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
- init_MUTEX(&o2nm_cluster_group.cs_subsys.su_sem);
+ mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
ret = configfs_register_subsystem(&o2nm_cluster_group.cs_subsys);
if (ret) {
printk(KERN_ERR "nodemanager: Registration returned %d\n", ret);
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 0705221..7c86036 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -77,4 +77,9 @@ struct o2nm_node *o2nm_get_node_by_ip(__be32 addr);
void o2nm_node_get(struct o2nm_node *node);
void o2nm_node_put(struct o2nm_node *node);
+int o2nm_depend_item(struct config_item *item);
+void o2nm_undepend_item(struct config_item *item);
+int o2nm_depend_this_node(void);
+void o2nm_undepend_this_node(void);
+
#endif /* O2CLUSTER_NODEMANAGER_H */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 0b229a9..f0bdfd9 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -261,14 +261,12 @@ out:
static void o2net_complete_nodes_nsw(struct o2net_node *nn)
{
- struct list_head *iter, *tmp;
+ struct o2net_status_wait *nsw, *tmp;
unsigned int num_kills = 0;
- struct o2net_status_wait *nsw;
assert_spin_locked(&nn->nn_lock);
- list_for_each_safe(iter, tmp, &nn->nn_status_list) {
- nsw = list_entry(iter, struct o2net_status_wait, ns_node_item);
+ list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) {
o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0);
num_kills++;
}
@@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler);
void o2net_unregister_handler_list(struct list_head *list)
{
- struct list_head *pos, *n;
- struct o2net_msg_handler *nmh;
+ struct o2net_msg_handler *nmh, *n;
write_lock(&o2net_handler_lock);
- list_for_each_safe(pos, n, list) {
- nmh = list_entry(pos, struct o2net_msg_handler,
- nh_unregister_item);
+ list_for_each_entry_safe(nmh, n, list, nh_unregister_item) {
mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n",
nmh->nh_func, nmh->nh_msg_type, nmh->nh_key);
rb_erase(&nmh->nh_node, &o2net_handler_tree);
@@ -1638,8 +1633,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
void o2net_unregister_hb_callbacks(void)
{
- o2hb_unregister_callback(&o2net_hb_up);
- o2hb_unregister_callback(&o2net_hb_down);
+ o2hb_unregister_callback(NULL, &o2net_hb_up);
+ o2hb_unregister_callback(NULL, &o2net_hb_down);
}
int o2net_register_hb_callbacks(void)
@@ -1651,9 +1646,9 @@ int o2net_register_hb_callbacks(void)
o2hb_setup_callback(&o2net_hb_up, O2HB_NODE_UP_CB,
o2net_hb_node_up_cb, NULL, O2NET_HB_PRI);
- ret = o2hb_register_callback(&o2net_hb_up);
+ ret = o2hb_register_callback(NULL, &o2net_hb_up);
if (ret == 0)
- ret = o2hb_register_callback(&o2net_hb_down);
+ ret = o2hb_register_callback(NULL, &o2net_hb_down);
if (ret)
o2net_unregister_hb_callbacks();
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index c441ef1..0d5fdde 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -368,7 +368,7 @@ int ocfs2_do_extend_dir(struct super_block *sb,
u32 offset = OCFS2_I(dir)->ip_clusters;
status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
- 1, parent_fe_bh, handle,
+ 1, 0, parent_fe_bh, handle,
data_ac, meta_ac, NULL);
BUG_ON(status == -EAGAIN);
if (status < 0) {
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index d836b98..6954565 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1128,8 +1128,8 @@ bail:
static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm)
{
- o2hb_unregister_callback(&dlm->dlm_hb_up);
- o2hb_unregister_callback(&dlm->dlm_hb_down);
+ o2hb_unregister_callback(NULL, &dlm->dlm_hb_up);
+ o2hb_unregister_callback(NULL, &dlm->dlm_hb_down);
o2net_unregister_handler_list(&dlm->dlm_domain_handlers);
}
@@ -1141,13 +1141,13 @@ static int dlm_register_domain_handlers(struct dlm_ctxt *dlm)
o2hb_setup_callback(&dlm->dlm_hb_down, O2HB_NODE_DOWN_CB,
dlm_hb_node_down_cb, dlm, DLM_HB_NODE_DOWN_PRI);
- status = o2hb_register_callback(&dlm->dlm_hb_down);
+ status = o2hb_register_callback(NULL, &dlm->dlm_hb_down);
if (status)
goto bail;
o2hb_setup_callback(&dlm->dlm_hb_up, O2HB_NODE_UP_CB,
dlm_hb_node_up_cb, dlm, DLM_HB_NODE_UP_PRI);
- status = o2hb_register_callback(&dlm->dlm_hb_up);
+ status = o2hb_register_callback(NULL, &dlm->dlm_hb_up);
if (status)
goto bail;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 6edffca..65b2b9b 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
static void dlm_dump_mles(struct dlm_ctxt *dlm)
{
struct dlm_master_list_entry *mle;
- struct list_head *iter;
mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
spin_lock(&dlm->master_lock);
- list_for_each(iter, &dlm->master_list) {
- mle = list_entry(iter, struct dlm_master_list_entry, list);
+ list_for_each_entry(mle, &dlm->master_list, list)
dlm_print_one_mle(mle);
- }
spin_unlock(&dlm->master_lock);
}
int dlm_dump_all_mles(const char __user *data, unsigned int len)
{
- struct list_head *iter;
struct dlm_ctxt *dlm;
spin_lock(&dlm_domain_lock);
- list_for_each(iter, &dlm_domains) {
- dlm = list_entry (iter, struct dlm_ctxt, list);
+ list_for_each_entry(dlm, &dlm_domains, list) {
mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
dlm_dump_mles(dlm);
}
@@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
char *name, unsigned int namelen)
{
struct dlm_master_list_entry *tmpmle;
- struct list_head *iter;
assert_spin_locked(&dlm->master_lock);
- list_for_each(iter, &dlm->master_list) {
- tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
+ list_for_each_entry(tmpmle, &dlm->master_list, list) {
if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
continue;
dlm_get_mle(tmpmle);
@@ -472,13 +465,10 @@ static int dlm_find_mle(struct dlm_ctxt *dlm,
void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
{
struct dlm_master_list_entry *mle;
- struct list_head *iter;
assert_spin_locked(&dlm->spinlock);
- list_for_each(iter, &dlm->mle_hb_events) {
- mle = list_entry(iter, struct dlm_master_list_entry,
- hb_events);
+ list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
if (node_up)
dlm_mle_node_up(dlm, mle, NULL, idx);
else
@@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
int ret;
int i;
int count = 0;
- struct list_head *queue, *iter;
+ struct list_head *queue;
struct dlm_lock *lock;
assert_spin_locked(&res->spinlock);
@@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
ret = 0;
queue = &res->granted;
for (i = 0; i < 3; i++) {
- list_for_each(iter, queue) {
- lock = list_entry(iter, struct dlm_lock, list);
+ list_for_each_entry(lock, queue, list) {
++count;
if (lock->ml.node == dlm->node_num) {
mlog(0, "found a lock owned by this node still "
@@ -2923,18 +2912,16 @@ again:
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- struct list_head *iter, *iter2;
struct list_head *queue = &res->granted;
int i, bit;
- struct dlm_lock *lock;
+ struct dlm_lock *lock, *next;
assert_spin_locked(&res->spinlock);
BUG_ON(res->owner == dlm->node_num);
for (i=0; i<3; i++) {
- list_for_each_safe(iter, iter2, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, queue, list) {
if (lock->ml.node != dlm->node_num) {
mlog(0, "putting lock for node %u\n",
lock->ml.node);
@@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
{
int i;
struct list_head *queue = &res->granted;
- struct list_head *iter;
struct dlm_lock *lock;
int nodenum;
@@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
for (i=0; i<3; i++) {
- list_for_each(iter, queue) {
+ list_for_each_entry(lock, queue, list) {
/* up to the caller to make sure this node
* is alive */
- lock = list_entry (iter, struct dlm_lock, list);
if (lock->ml.node != dlm->node_num) {
spin_unlock(&res->spinlock);
return lock->ml.node;
@@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
{
- struct list_head *iter, *iter2;
- struct dlm_master_list_entry *mle;
+ struct dlm_master_list_entry *mle, *next;
struct dlm_lock_resource *res;
unsigned int hash;
@@ -3245,9 +3229,7 @@ top:
/* clean the master list */
spin_lock(&dlm->master_lock);
- list_for_each_safe(iter, iter2, &dlm->master_list) {
- mle = list_entry(iter, struct dlm_master_list_entry, list);
-
+ list_for_each_entry_safe(mle, next, &dlm->master_list, list) {
BUG_ON(mle->type != DLM_MLE_BLOCK &&
mle->type != DLM_MLE_MASTER &&
mle->type != DLM_MLE_MIGRATION);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 671c4ed..a2c3316 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struct *work)
struct dlm_ctxt *dlm =
container_of(work, struct dlm_ctxt, dispatched_work);
LIST_HEAD(tmp_list);
- struct list_head *iter, *iter2;
- struct dlm_work_item *item;
+ struct dlm_work_item *item, *next;
dlm_workfunc_t *workfunc;
int tot=0;
@@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struct *work)
list_splice_init(&dlm->work_list, &tmp_list);
spin_unlock(&dlm->work_lock);
- list_for_each_safe(iter, iter2, &tmp_list) {
+ list_for_each_entry(item, &tmp_list, list) {
tot++;
}
mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
- list_for_each_safe(iter, iter2, &tmp_list) {
- item = list_entry(iter, struct dlm_work_item, list);
+ list_for_each_entry_safe(item, next, &tmp_list, list) {
workfunc = item->func;
list_del_init(&item->list);
@@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
{
int status = 0;
struct dlm_reco_node_data *ndata;
- struct list_head *iter;
int all_nodes_done;
int destroy = 0;
int pass = 0;
@@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
/* safe to access the node data list without a lock, since this
* process is the only one to change the list */
- list_for_each(iter, &dlm->reco.node_data) {
- ndata = list_entry (iter, struct dlm_reco_node_data, list);
+ list_for_each_entry(ndata, &dlm->reco.node_data, list) {
BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
@@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
* done, or if anyone died */
all_nodes_done = 1;
spin_lock(&dlm_reco_state_lock);
- list_for_each(iter, &dlm->reco.node_data) {
- ndata = list_entry (iter, struct dlm_reco_node_data, list);
-
+ list_for_each_entry(ndata, &dlm->reco.node_data, list) {
mlog(0, "checking recovery state of node %u\n",
ndata->node_num);
switch (ndata->state) {
@@ -774,16 +768,14 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
{
- struct list_head *iter, *iter2;
- struct dlm_reco_node_data *ndata;
+ struct dlm_reco_node_data *ndata, *next;
LIST_HEAD(tmplist);
spin_lock(&dlm_reco_state_lock);
list_splice_init(&dlm->reco.node_data, &tmplist);
spin_unlock(&dlm_reco_state_lock);
- list_for_each_safe(iter, iter2, &tmplist) {
- ndata = list_entry (iter, struct dlm_reco_node_data, list);
+ list_for_each_entry_safe(ndata, next, &tmplist, list) {
list_del_init(&ndata->list);
kfree(ndata);
}
@@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
struct dlm_lock_resource *res;
struct dlm_ctxt *dlm;
LIST_HEAD(resources);
- struct list_head *iter;
int ret;
u8 dead_node, reco_master;
int skip_all_done = 0;
@@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
/* any errors returned will be due to the new_master dying,
* the dlm_reco_thread should detect this */
- list_for_each(iter, &resources) {
- res = list_entry (iter, struct dlm_lock_resource, recovering);
+ list_for_each_entry(res, &resources, recovering) {
ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
DLM_MRES_RECOVERY);
if (ret < 0) {
@@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
{
struct dlm_ctxt *dlm = data;
struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf;
- struct list_head *iter;
struct dlm_reco_node_data *ndata = NULL;
int ret = -EINVAL;
@@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data,
dlm->reco.dead_node, done->node_idx, dlm->node_num);
spin_lock(&dlm_reco_state_lock);
- list_for_each(iter, &dlm->reco.node_data) {
- ndata = list_entry (iter, struct dlm_reco_node_data, list);
+ list_for_each_entry(ndata, &dlm->reco.node_data, list) {
if (ndata->node_num != done->node_idx)
continue;
@@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
struct list_head *list,
u8 dead_node)
{
- struct dlm_lock_resource *res;
- struct list_head *iter, *iter2;
+ struct dlm_lock_resource *res, *next;
struct dlm_lock *lock;
spin_lock(&dlm->spinlock);
- list_for_each_safe(iter, iter2, &dlm->reco.resources) {
- res = list_entry (iter, struct dlm_lock_resource, recovering);
+ list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -1169,7 +1155,7 @@ static void dlm_init_migratable_lockres(struct dlm_migratable_lockres *mres,
u8 flags, u8 master)
{
/* mres here is one full page */
- memset(mres, 0, PAGE_SIZE);
+ clear_page(mres);
mres->lockname_len = namelen;
memcpy(mres->lockname, lockname, namelen);
mres->num_locks = 0;
@@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_migratable_lockres *mres,
u8 send_to, u8 flags)
{
- struct list_head *queue, *iter;
+ struct list_head *queue;
int total_locks, i;
u64 mig_cookie = 0;
struct dlm_lock *lock;
@@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
total_locks = 0;
for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) {
queue = dlm_list_idx_to_ptr(res, i);
- list_for_each(iter, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
-
+ list_for_each_entry(lock, queue, list) {
/* add another lock. */
total_locks++;
if (!dlm_add_lock_to_array(lock, mres, i))
@@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
struct dlm_lockstatus *lksb = NULL;
int ret = 0;
int i, j, bad;
- struct list_head *iter;
struct dlm_lock *lock = NULL;
u8 from = O2NM_MAX_NODES;
unsigned int added = 0;
@@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) {
tmpq = dlm_list_idx_to_ptr(res, j);
- list_for_each(iter, tmpq) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry(lock, tmpq, list) {
if (lock->ml.cookie != ml->cookie)
lock = NULL;
else
@@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
int i;
- struct list_head *queue, *iter, *iter2;
- struct dlm_lock *lock;
+ struct list_head *queue;
+ struct dlm_lock *lock, *next;
res->state |= DLM_LOCK_RES_RECOVERING;
if (!list_empty(&res->recovering)) {
@@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
/* find any pending locks and put them back on proper list */
for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) {
queue = dlm_list_idx_to_ptr(res, i);
- list_for_each_safe(iter, iter2, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, queue, list) {
dlm_lock_get(lock);
if (lock->convert_pending) {
/* move converting lock back to granted */
@@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
u8 dead_node, u8 new_master)
{
int i;
- struct list_head *iter, *iter2;
struct hlist_node *hash_iter;
struct hlist_head *bucket;
-
- struct dlm_lock_resource *res;
+ struct dlm_lock_resource *res, *next;
mlog_entry_void();
assert_spin_locked(&dlm->spinlock);
- list_for_each_safe(iter, iter2, &dlm->reco.resources) {
- res = list_entry (iter, struct dlm_lock_resource, recovering);
+ list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
if (res->owner == dead_node) {
list_del_init(&res->recovering);
spin_lock(&res->spinlock);
@@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalidation(struct dlm_lock *lock, int local)
static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res, u8 dead_node)
{
- struct list_head *iter, *queue;
+ struct list_head *queue;
struct dlm_lock *lock;
int blank_lvb = 0, local = 0;
int i;
@@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) {
queue = dlm_list_idx_to_ptr(res, i);
- list_for_each(iter, queue) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry(lock, queue, list) {
if (lock->ml.node == search_node) {
if (dlm_lvb_needs_invalidation(lock, local)) {
/* zero the lksb lvb and lockres lvb */
@@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res, u8 dead_node)
{
- struct list_head *iter, *tmpiter;
- struct dlm_lock *lock;
+ struct dlm_lock *lock, *next;
unsigned int freed = 0;
/* this node is the lockres master:
@@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
assert_spin_locked(&res->spinlock);
/* TODO: check pending_asts, pending_basts here */
- list_for_each_safe(iter, tmpiter, &res->granted) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, &res->granted, list) {
if (lock->ml.node == dead_node) {
list_del_init(&lock->list);
dlm_lock_put(lock);
freed++;
}
}
- list_for_each_safe(iter, tmpiter, &res->converting) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, &res->converting, list) {
if (lock->ml.node == dead_node) {
list_del_init(&lock->list);
dlm_lock_put(lock);
freed++;
}
}
- list_for_each_safe(iter, tmpiter, &res->blocked) {
- lock = list_entry (iter, struct dlm_lock, list);
+ list_for_each_entry_safe(lock, next, &res->blocked, list) {
if (lock->ml.node == dead_node) {
list_del_init(&lock->list);
dlm_lock_put(lock);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index d1bd305..f71250e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_lock_level(int level)
static void lockres_set_flags(struct ocfs2_lock_res *lockres,
unsigned long newflags)
{
- struct list_head *pos, *tmp;
- struct ocfs2_mask_waiter *mw;
+ struct ocfs2_mask_waiter *mw, *tmp;
assert_spin_locked(&lockres->l_lock);
lockres->l_flags = newflags;
- list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
- mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
+ list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) {
if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
continue;
diff --git a/fs/ocfs2/endian.h b/fs/ocfs2/endian.h
index f226b22..ff25762 100644
--- a/fs/ocfs2/endian.h
+++ b/fs/ocfs2/endian.h
@@ -32,6 +32,11 @@ static inline void le32_add_cpu(__le32 *var, u32 val)
*var = cpu_to_le32(le32_to_cpu(*var) + val);
}
+static inline void le64_add_cpu(__le64 *var, u64 val)
+{
+ *var = cpu_to_le64(le64_to_cpu(*var) + val);
+}
+
static inline void le32_and_cpu(__le32 *var, u32 val)
{
*var = cpu_to_le32(le32_to_cpu(*var) & val);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index ba2b2ab..03c1d365 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
*/
void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
{
- struct list_head *p, *n;
- struct ocfs2_extent_map_item *emi;
+ struct ocfs2_extent_map_item *emi, *n;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_extent_map *em = &oi->ip_extent_map;
LIST_HEAD(tmp_list);
unsigned int range;
spin_lock(&oi->ip_lock);
- list_for_each_safe(p, n, &em->em_list) {
- emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
-
+ list_for_each_entry_safe(emi, n, &em->em_list, ei_list) {
if (emi->ei_cpos >= cpos) {
/* Full truncate of this record. */
list_move(&emi->ei_list, &tmp_list);
@@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
}
spin_unlock(&oi->ip_lock);
- list_for_each_safe(p, n, &tmp_list) {
- emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+ list_for_each_entry_safe(emi, n, &tmp_list, ei_list) {
list_del(&emi->ei_list);
kfree(emi);
}
@@ -377,37 +373,6 @@ out:
return ret;
}
-/*
- * Return the index of the extent record which contains cluster #v_cluster.
- * -1 is returned if it was not found.
- *
- * Should work fine on interior and exterior nodes.
- */
-static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
- u32 v_cluster)
-{
- int ret = -1;
- int i;
- struct ocfs2_extent_rec *rec;
- u32 rec_end, rec_start, clusters;
-
- for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
- rec = &el->l_recs[i];
-
- rec_start = le32_to_cpu(rec->e_cpos);
- clusters = ocfs2_rec_clusters(el, rec);
-
- rec_end = rec_start + clusters;
-
- if (v_cluster >= rec_start && v_cluster < rec_end) {
- ret = i;
- break;
- }
- }
-
- return ret;
-}
-
int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
unsigned int *extent_flags)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4979b66..f04c7aa 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
int status;
handle_t *handle;
struct ocfs2_dinode *di;
+ u64 cluster_bytes;
mlog_entry_void();
@@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
/*
* Do this before setting i_size.
*/
- status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+ cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+ status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
+ cluster_bytes);
if (status) {
mlog_errno(status);
goto out_commit;
@@ -326,9 +329,6 @@ static int ocfs2_truncate_file(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)new_i_size);
- unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
- truncate_inode_pages(inode->i_mapping, new_i_size);
-
fe = (struct ocfs2_dinode *) di_bh->b_data;
if (!OCFS2_IS_VALID_DINODE(fe)) {
OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
@@ -363,16 +363,23 @@ static int ocfs2_truncate_file(struct inode *inode,
if (new_i_size == le64_to_cpu(fe->i_size))
goto bail;
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
/* This forces other nodes to sync and drop their pages. Do
* this even if we have a truncate without allocation change -
* ocfs2 cluster sizes can be much greater than page size, so
* we have to truncate them anyway. */
status = ocfs2_data_lock(inode, 1);
if (status < 0) {
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
mlog_errno(status);
goto bail;
}
+ unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+ truncate_inode_pages(inode->i_mapping, new_i_size);
+
/* alright, we're going to need to do a full blown alloc size
* change. Orphan the inode so that recovery can complete the
* truncate if necessary. This does the task of marking
@@ -399,6 +406,8 @@ static int ocfs2_truncate_file(struct inode *inode,
bail_unlock_data:
ocfs2_data_unlock(inode, 1);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
bail:
mlog_exit(status);
@@ -419,6 +428,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode,
u32 *logical_offset,
u32 clusters_to_add,
+ int mark_unwritten,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
@@ -431,9 +441,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
enum ocfs2_alloc_restarted reason = RESTART_NONE;
u32 bit_off, num_bits;
u64 block;
+ u8 flags = 0;
BUG_ON(!clusters_to_add);
+ if (mark_unwritten)
+ flags = OCFS2_EXT_UNWRITTEN;
+
free_extents = ocfs2_num_free_extents(osb, inode, fe);
if (free_extents < 0) {
status = free_extents;
@@ -483,7 +497,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
*logical_offset, block, num_bits,
- meta_ac);
+ flags, meta_ac);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -516,25 +530,31 @@ leave:
* For a given allocation, determine which allocators will need to be
* accessed, and lock them, reserving the appropriate number of bits.
*
- * Called from ocfs2_extend_allocation() for file systems which don't
- * support holes, and from ocfs2_write() for file systems which
- * understand sparse inodes.
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
*/
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
- u32 clusters_to_add,
+ u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac)
{
- int ret, num_free_extents;
+ int ret = 0, num_free_extents;
+ unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
*meta_ac = NULL;
- *data_ac = NULL;
+ if (data_ac)
+ *data_ac = NULL;
+
+ BUG_ON(clusters_to_add != 0 && data_ac == NULL);
mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
- "clusters_to_add = %u\n",
+ "clusters_to_add = %u, extents_to_split = %u\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
- le32_to_cpu(di->i_clusters), clusters_to_add);
+ le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
num_free_extents = ocfs2_num_free_extents(osb, inode, di);
if (num_free_extents < 0) {
@@ -552,9 +572,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
*
* Most of the time we'll only be seeing this 1 cluster at a time
* anyway.
+ *
+ * Always lock for any unwritten extents - we might want to
+ * add blocks during a split.
*/
if (!num_free_extents ||
- (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
+ (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
if (ret < 0) {
if (ret != -ENOSPC)
@@ -563,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
}
}
+ if (clusters_to_add == 0)
+ goto out;
+
ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
if (ret < 0) {
if (ret != -ENOSPC)
@@ -585,14 +611,13 @@ out:
return ret;
}
-static int ocfs2_extend_allocation(struct inode *inode,
- u32 clusters_to_add)
+static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+ u32 clusters_to_add, int mark_unwritten)
{
int status = 0;
int restart_func = 0;
- int drop_alloc_sem = 0;
int credits;
- u32 prev_clusters, logical_start;
+ u32 prev_clusters;
struct buffer_head *bh = NULL;
struct ocfs2_dinode *fe = NULL;
handle_t *handle = NULL;
@@ -607,7 +632,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
* This function only exists for file systems which don't
* support holes.
*/
- BUG_ON(ocfs2_sparse_alloc(osb));
+ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
OCFS2_BH_CACHED, inode);
@@ -623,19 +648,10 @@ static int ocfs2_extend_allocation(struct inode *inode,
goto leave;
}
- logical_start = OCFS2_I(inode)->ip_clusters;
-
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
- /* blocks peope in read/write from reading our allocation
- * until we're done changing it. We depend on i_mutex to block
- * other extend/truncate calls while we're here. Ordering wrt
- * start_trans is important here -- always do it before! */
- down_write(&OCFS2_I(inode)->ip_alloc_sem);
- drop_alloc_sem = 1;
-
- status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+ status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
&meta_ac);
if (status) {
mlog_errno(status);
@@ -668,6 +684,7 @@ restarted_transaction:
inode,
&logical_start,
clusters_to_add,
+ mark_unwritten,
bh,
handle,
data_ac,
@@ -720,10 +737,6 @@ restarted_transaction:
OCFS2_I(inode)->ip_clusters, i_size_read(inode));
leave:
- if (drop_alloc_sem) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
- drop_alloc_sem = 0;
- }
if (handle) {
ocfs2_commit_trans(osb, handle);
handle = NULL;
@@ -749,6 +762,25 @@ leave:
return status;
}
+static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+ u32 clusters_to_add, int mark_unwritten)
+{
+ int ret;
+
+ /*
+ * The alloc sem blocks peope in read/write from reading our
+ * allocation until we're done changing it. We depend on
+ * i_mutex to block other extend/truncate calls while we're
+ * here.
+ */
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
+ mark_unwritten);
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ return ret;
+}
+
/* Some parts of this taken from generic_cont_expand, which turned out
* to be too fragile to do exactly what we need without us having to
* worry about recursive locking in ->prepare_write() and
@@ -890,7 +922,9 @@ static int ocfs2_extend_file(struct inode *inode,
}
if (clusters_to_add) {
- ret = ocfs2_extend_allocation(inode, clusters_to_add);
+ ret = ocfs2_extend_allocation(inode,
+ OCFS2_I(inode)->ip_clusters,
+ clusters_to_add, 0);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
@@ -995,6 +1029,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
goto bail_unlock;
}
+ /*
+ * This will intentionally not wind up calling vmtruncate(),
+ * since all the work for a size change has been done above.
+ * Otherwise, we could get into problems with truncate as
+ * ip_alloc_sem is used there to protect against i_size
+ * changes.
+ */
status = inode_setattr(inode, attr);
if (status < 0) {
mlog_errno(status);
@@ -1070,17 +1111,16 @@ out:
return ret;
}
-static int ocfs2_write_remove_suid(struct inode *inode)
+static int __ocfs2_write_remove_suid(struct inode *inode,
+ struct buffer_head *bh)
{
int ret;
- struct buffer_head *bh = NULL;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di;
mlog_entry("(Inode %llu, mode 0%o)\n",
- (unsigned long long)oi->ip_blkno, inode->i_mode);
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (handle == NULL) {
@@ -1089,17 +1129,11 @@ static int ocfs2_write_remove_suid(struct inode *inode)
goto out;
}
- ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_trans;
- }
-
ret = ocfs2_journal_access(handle, inode, bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
- goto out_bh;
+ goto out_trans;
}
inode->i_mode &= ~S_ISUID;
@@ -1112,8 +1146,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
ret = ocfs2_journal_dirty(handle, bh);
if (ret < 0)
mlog_errno(ret);
-out_bh:
- brelse(bh);
+
out_trans:
ocfs2_commit_trans(osb, handle);
out:
@@ -1159,6 +1192,460 @@ out:
return ret;
}
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+ int ret;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+ ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+ oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = __ocfs2_write_remove_suid(inode, bh);
+out:
+ brelse(bh);
+ return ret;
+}
+
+/*
+ * Allocate enough extents to cover the region starting at byte offset
+ * start for len bytes. Existing extents are skipped, any extents
+ * added are marked as "unwritten".
+ */
+static int ocfs2_allocate_unwritten_extents(struct inode *inode,
+ u64 start, u64 len)
+{
+ int ret;
+ u32 cpos, phys_cpos, clusters, alloc_size;
+
+ /*
+ * We consider both start and len to be inclusive.
+ */
+ cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
+ clusters -= cpos;
+
+ while (clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+ &alloc_size, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * Hole or existing extent len can be arbitrary, so
+ * cap it to our own allocation request.
+ */
+ if (alloc_size > clusters)
+ alloc_size = clusters;
+
+ if (phys_cpos) {
+ /*
+ * We already have an allocation at this
+ * region so we can safely skip it.
+ */
+ goto next;
+ }
+
+ ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+next:
+ cpos += alloc_size;
+ clusters -= alloc_size;
+ }
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static int __ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh,
+ u32 cpos, u32 phys_cpos, u32 len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+ int ret;
+ u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ handle_t *handle;
+ struct ocfs2_alloc_context *meta_ac = NULL;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+ ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+ if (ret) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+ if (handle == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_journal_access(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
+ dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ OCFS2_I(inode)->ip_clusters -= len;
+ di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+
+ ret = ocfs2_journal_dirty(handle, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_commit;
+ }
+
+ ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+ if (ret)
+ mlog_errno(ret);
+
+out_commit:
+ ocfs2_commit_trans(osb, handle);
+out:
+ mutex_unlock(&tl_inode->i_mutex);
+
+ if (meta_ac)
+ ocfs2_free_alloc_context(meta_ac);
+
+ return ret;
+}
+
+/*
+ * Truncate a byte range, avoiding pages within partial clusters. This
+ * preserves those pages for the zeroing code to write to.
+ */
+static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
+ u64 byte_len)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ loff_t start, end;
+ struct address_space *mapping = inode->i_mapping;
+
+ start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
+ end = byte_start + byte_len;
+ end = end & ~(osb->s_clustersize - 1);
+
+ if (start < end) {
+ unmap_mapping_range(mapping, start, end - start, 0);
+ truncate_inode_pages_range(mapping, start, end - 1);
+ }
+}
+
+static int ocfs2_zero_partial_clusters(struct inode *inode,
+ u64 start, u64 len)
+{
+ int ret = 0;
+ u64 tmpend, end = start + len;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ unsigned int csize = osb->s_clustersize;
+ handle_t *handle;
+
+ /*
+ * The "start" and "end" values are NOT necessarily part of
+ * the range whose allocation is being deleted. Rather, this
+ * is what the user passed in with the request. We must zero
+ * partial clusters here. There's no need to worry about
+ * physical allocation - the zeroing code knows to skip holes.
+ */
+ mlog(0, "byte start: %llu, end: %llu\n",
+ (unsigned long long)start, (unsigned long long)end);
+
+ /*
+ * If both edges are on a cluster boundary then there's no
+ * zeroing required as the region is part of the allocation to
+ * be truncated.
+ */
+ if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
+ goto out;
+
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (handle == NULL) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ /*
+ * We want to get the byte offset of the end of the 1st cluster.
+ */
+ tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
+ if (tmpend > end)
+ tmpend = end;
+
+ mlog(0, "1st range: start: %llu, tmpend: %llu\n",
+ (unsigned long long)start, (unsigned long long)tmpend);
+
+ ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
+ if (ret)
+ mlog_errno(ret);
+
+ if (tmpend < end) {
+ /*
+ * This may make start and end equal, but the zeroing
+ * code will skip any work in that case so there's no
+ * need to catch it up here.
+ */
+ start = end & ~(osb->s_clustersize - 1);
+
+ mlog(0, "2nd range: start: %llu, end: %llu\n",
+ (unsigned long long)start, (unsigned long long)end);
+
+ ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
+ if (ret)
+ mlog_errno(ret);
+ }
+
+ ocfs2_commit_trans(osb, handle);
+out:
+ return ret;
+}
+
+static int ocfs2_remove_inode_range(struct inode *inode,
+ struct buffer_head *di_bh, u64 byte_start,
+ u64 byte_len)
+{
+ int ret = 0;
+ u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_cached_dealloc_ctxt dealloc;
+
+ ocfs2_init_dealloc_ctxt(&dealloc);
+
+ if (byte_len == 0)
+ return 0;
+
+ trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
+ trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
+ if (trunc_len >= trunc_start)
+ trunc_len -= trunc_start;
+ else
+ trunc_len = 0;
+
+ mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)byte_start,
+ (unsigned long long)byte_len, trunc_start, trunc_len);
+
+ ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos = trunc_start;
+ while (trunc_len) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+ &alloc_size, NULL);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (alloc_size > trunc_len)
+ alloc_size = trunc_len;
+
+ /* Only do work for non-holes */
+ if (phys_cpos != 0) {
+ ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
+ phys_cpos, alloc_size,
+ &dealloc);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+
+ cpos += alloc_size;
+ trunc_len -= alloc_size;
+ }
+
+ ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
+
+out:
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &dealloc);
+
+ return ret;
+}
+
+/*
+ * Parts of this function taken from xfs_change_file_space()
+ */
+int ocfs2_change_file_space(struct file *file, unsigned int cmd,
+ struct ocfs2_space_resv *sr)
+{
+ int ret;
+ s64 llen;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *di_bh = NULL;
+ handle_t *handle;
+ unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits);
+
+ if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
+ !ocfs2_writes_unwritten_extents(osb))
+ return -ENOTTY;
+ else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
+ !ocfs2_sparse_alloc(osb))
+ return -ENOTTY;
+
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EBADF;
+
+ if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+ return -EROFS;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * This prevents concurrent writes on other nodes
+ */
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_rw_unlock;
+ }
+
+ if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+ ret = -EPERM;
+ goto out_meta_unlock;
+ }
+
+ switch (sr->l_whence) {
+ case 0: /*SEEK_SET*/
+ break;
+ case 1: /*SEEK_CUR*/
+ sr->l_start += file->f_pos;
+ break;
+ case 2: /*SEEK_END*/
+ sr->l_start += i_size_read(inode);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out_meta_unlock;
+ }
+ sr->l_whence = 0;
+
+ llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
+
+ if (sr->l_start < 0
+ || sr->l_start > max_off
+ || (sr->l_start + llen) < 0
+ || (sr->l_start + llen) > max_off) {
+ ret = -EINVAL;
+ goto out_meta_unlock;
+ }
+
+ if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
+ if (sr->l_len <= 0) {
+ ret = -EINVAL;
+ goto out_meta_unlock;
+ }
+ }
+
+ if (should_remove_suid(file->f_path.dentry)) {
+ ret = __ocfs2_write_remove_suid(inode, di_bh);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_meta_unlock;
+ }
+ }
+
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ switch (cmd) {
+ case OCFS2_IOC_RESVSP:
+ case OCFS2_IOC_RESVSP64:
+ /*
+ * This takes unsigned offsets, but the signed ones we
+ * pass have been checked against overflow above.
+ */
+ ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
+ sr->l_len);
+ break;
+ case OCFS2_IOC_UNRESVSP:
+ case OCFS2_IOC_UNRESVSP64:
+ ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
+ sr->l_len);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ if (ret) {
+ mlog_errno(ret);
+ goto out_meta_unlock;
+ }
+
+ /*
+ * We update c/mtime for these changes
+ */
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ mlog_errno(ret);
+ goto out_meta_unlock;
+ }
+
+ inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+ ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
+ if (ret < 0)
+ mlog_errno(ret);
+
+ ocfs2_commit_trans(osb, handle);
+
+out_meta_unlock:
+ brelse(di_bh);
+ ocfs2_meta_unlock(inode, 1);
+out_rw_unlock:
+ ocfs2_rw_unlock(inode, 1);
+
+ mutex_unlock(&inode->i_mutex);
+out:
+ return ret;
+}
+
static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
loff_t *ppos,
size_t count,
@@ -1329,15 +1816,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
*basep = base;
}
-static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
+static struct page * ocfs2_get_write_source(char **ret_src_buf,
const struct iovec *cur_iov,
size_t iov_offset)
{
int ret;
- char *buf;
+ char *buf = cur_iov->iov_base + iov_offset;
struct page *src_page = NULL;
+ unsigned long off;
- buf = cur_iov->iov_base + iov_offset;
+ off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
if (!segment_eq(get_fs(), KERNEL_DS)) {
/*
@@ -1349,18 +1837,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp
(unsigned long)buf & PAGE_CACHE_MASK, 1,
0, 0, &src_page, NULL);
if (ret == 1)
- bp->b_src_buf = kmap(src_page);
+ *ret_src_buf = kmap(src_page) + off;
else
src_page = ERR_PTR(-EFAULT);
} else {
- bp->b_src_buf = buf;
+ *ret_src_buf = buf;
}
return src_page;
}
-static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
- struct page *page)
+static void ocfs2_put_write_source(struct page *page)
{
if (page) {
kunmap(page);
@@ -1376,10 +1863,12 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
{
int ret = 0;
ssize_t copied, total = 0;
- size_t iov_offset = 0;
+ size_t iov_offset = 0, bytes;
+ loff_t pos;
const struct iovec *cur_iov = iov;
- struct ocfs2_buffered_write_priv bp;
- struct page *page;
+ struct page *user_page, *page;
+ char *buf, *dst;
+ void *fsdata;
/*
* handle partial DIO write. Adjust cur_iov if needed.
@@ -1387,21 +1876,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
do {
- bp.b_cur_off = iov_offset;
- bp.b_cur_iov = cur_iov;
+ pos = *ppos;
- page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
- if (IS_ERR(page)) {
- ret = PTR_ERR(page);
+ user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
+ if (IS_ERR(user_page)) {
+ ret = PTR_ERR(user_page);
goto out;
}
- copied = ocfs2_buffered_write_cluster(file, *ppos, count,
- ocfs2_map_and_write_user_data,
- &bp);
+ /* Stay within our page boundaries */
+ bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
+ (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
+ /* Stay within the vector boundary */
+ bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
+ /* Stay within count */
+ bytes = min(bytes, count);
+
+ page = NULL;
+ ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
+ &page, &fsdata);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- ocfs2_put_write_source(&bp, page);
+ dst = kmap_atomic(page, KM_USER0);
+ memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
+ kunmap_atomic(dst, KM_USER0);
+ flush_dcache_page(page);
+ ocfs2_put_write_source(user_page);
+ copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
+ bytes, page, fsdata);
if (copied < 0) {
mlog_errno(copied);
ret = copied;
@@ -1409,7 +1915,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
}
total += copied;
- *ppos = *ppos + copied;
+ *ppos = pos + copied;
count -= copied;
ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
@@ -1579,52 +2085,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
struct pipe_buffer *buf,
struct splice_desc *sd)
{
- int ret, count, total = 0;
+ int ret, count;
ssize_t copied = 0;
- struct ocfs2_splice_write_priv sp;
+ struct file *file = sd->u.file;
+ unsigned int offset;
+ struct page *page = NULL;
+ void *fsdata;
+ char *src, *dst;
ret = buf->ops->confirm(pipe, buf);
if (ret)
goto out;
- sp.s_sd = sd;
- sp.s_buf = buf;
- sp.s_pipe = pipe;
- sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
- sp.s_buf_offset = buf->offset;
-
+ offset = sd->pos & ~PAGE_CACHE_MASK;
count = sd->len;
- if (count + sp.s_offset > PAGE_CACHE_SIZE)
- count = PAGE_CACHE_SIZE - sp.s_offset;
+ if (count + offset > PAGE_CACHE_SIZE)
+ count = PAGE_CACHE_SIZE - offset;
- do {
- /*
- * splice wants us to copy up to one page at a
- * time. For pagesize > cluster size, this means we
- * might enter ocfs2_buffered_write_cluster() more
- * than once, so keep track of our progress here.
- */
- copied = ocfs2_buffered_write_cluster(sd->u.file,
- (loff_t)sd->pos + total,
- count,
- ocfs2_map_and_write_splice_data,
- &sp);
- if (copied < 0) {
- mlog_errno(copied);
- ret = copied;
- goto out;
- }
+ ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
+ &page, &fsdata);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
- count -= copied;
- sp.s_offset += copied;
- sp.s_buf_offset += copied;
- total += copied;
- } while (count);
+ src = buf->ops->map(pipe, buf, 1);
+ dst = kmap_atomic(page, KM_USER1);
+ memcpy(dst + offset, src + buf->offset, count);
+ kunmap_atomic(page, KM_USER1);
+ buf->ops->unmap(pipe, buf, src);
- ret = 0;
+ copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
+ page, fsdata);
+ if (copied < 0) {
+ mlog_errno(copied);
+ ret = copied;
+ goto out;
+ }
out:
- return total ? total : ret;
+ return copied ? copied : ret;
}
static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index a4dd1fa..36fe27f 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,15 +39,16 @@ enum ocfs2_alloc_restarted {
};
int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
struct inode *inode,
- u32 *cluster_start,
+ u32 *logical_offset,
u32 clusters_to_add,
+ int mark_unwritten,
struct buffer_head *fe_bh,
handle_t *handle,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
- enum ocfs2_alloc_restarted *reason);
+ enum ocfs2_alloc_restarted *reason_ret);
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
- u32 clusters_to_add,
+ u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
struct ocfs2_alloc_context **meta_ac);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
@@ -61,4 +62,7 @@ int ocfs2_should_update_atime(struct inode *inode,
int ocfs2_update_inode_atime(struct inode *inode,
struct buffer_head *bh);
+int ocfs2_change_file_space(struct file *file, unsigned int cmd,
+ struct ocfs2_space_resv *sr);
+
#endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index b25ef63..352eb4a 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -157,16 +157,16 @@ int ocfs2_register_hb_callbacks(struct ocfs2_super *osb)
if (ocfs2_mount_local(osb))
return 0;
- status = o2hb_register_callback(&osb->osb_hb_down);
+ status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_down);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = o2hb_register_callback(&osb->osb_hb_up);
+ status = o2hb_register_callback(osb->uuid_str, &osb->osb_hb_up);
if (status < 0) {
mlog_errno(status);
- o2hb_unregister_callback(&osb->osb_hb_down);
+ o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
}
bail:
@@ -178,8 +178,8 @@ void ocfs2_clear_hb_callbacks(struct ocfs2_super *osb)
if (ocfs2_mount_local(osb))
return;
- o2hb_unregister_callback(&osb->osb_hb_down);
- o2hb_unregister_callback(&osb->osb_hb_up);
+ o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_down);
+ o2hb_unregister_callback(osb->uuid_str, &osb->osb_hb_up);
}
void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index f3ad21a..bd68c3f 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -14,6 +14,7 @@
#include "ocfs2.h"
#include "alloc.h"
#include "dlmglue.h"
+#include "file.h"
#include "inode.h"
#include "journal.h"
@@ -115,6 +116,7 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
{
unsigned int flags;
int status;
+ struct ocfs2_space_resv sr;
switch (cmd) {
case OCFS2_IOC_GETFLAGS:
@@ -130,6 +132,14 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
return ocfs2_set_inode_attr(inode, flags,
OCFS2_FL_MODIFIABLE);
+ case OCFS2_IOC_RESVSP:
+ case OCFS2_IOC_RESVSP64:
+ case OCFS2_IOC_UNRESVSP:
+ case OCFS2_IOC_UNRESVSP64:
+ if (copy_from_user(&sr, (int __user *) arg, sizeof(sr)))
+ return -EFAULT;
+
+ return ocfs2_change_file_space(filp, cmd, &sr);
default:
return -ENOTTY;
}
@@ -148,6 +158,11 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case OCFS2_IOC32_SETFLAGS:
cmd = OCFS2_IOC_SETFLAGS;
break;
+ case OCFS2_IOC_RESVSP:
+ case OCFS2_IOC_RESVSP64:
+ case OCFS2_IOC_UNRESVSP:
+ case OCFS2_IOC_UNRESVSP64:
+ break;
default:
return -ENOIOCTLCMD;
}
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index dc11880..dbfb20b 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
container_of(work, struct ocfs2_journal, j_recovery_work);
struct ocfs2_super *osb = journal->j_osb;
struct ocfs2_dinode *la_dinode, *tl_dinode;
- struct ocfs2_la_recovery_item *item;
- struct list_head *p, *n;
+ struct ocfs2_la_recovery_item *item, *n;
LIST_HEAD(tmp_la_list);
mlog_entry_void();
@@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
spin_unlock(&journal->j_lock);
- list_for_each_safe(p, n, &tmp_la_list) {
- item = list_entry(p, struct ocfs2_la_recovery_item, lri_list);
+ list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
list_del_init(&item->lri_list);
mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 3db5de4..ce60aab 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -289,6 +289,8 @@ int ocfs2_journal_dirty_data(handle_t *handle,
#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
+ OCFS2_TRUNCATE_LOG_UPDATE)
+#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
+
/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
* bitmap block for the new bit) */
#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af01158..d79aa12 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -37,11 +37,29 @@
#include "ocfs2.h"
+#include "aops.h"
#include "dlmglue.h"
#include "file.h"
#include "inode.h"
#include "mmap.h"
+static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
+{
+ /* The best way to deal with signals in the vm path is
+ * to block them upfront, rather than allowing the
+ * locking paths to return -ERESTARTSYS. */
+ sigfillset(blocked);
+
+ /* We should technically never get a bad return value
+ * from sigprocmask */
+ return sigprocmask(SIG_BLOCK, blocked, oldset);
+}
+
+static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
+{
+ return sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
static struct page *ocfs2_nopage(struct vm_area_struct * area,
unsigned long address,
int *type)
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
type);
- /* The best way to deal with signals in this path is
- * to block them upfront, rather than allowing the
- * locking paths to return -ERESTARTSYS. */
- sigfillset(&blocked);
-
- /* We should technically never get a bad ret return
- * from sigprocmask */
- ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+ ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
page = filemap_nopage(area, address, type);
- ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+ ret = ocfs2_vm_op_unblock_sigs(&oldset);
if (ret < 0)
mlog_errno(ret);
out:
@@ -76,28 +87,136 @@ out:
return page;
}
-static struct vm_operations_struct ocfs2_file_vm_ops = {
- .nopage = ocfs2_nopage,
-};
+static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+ struct page *page)
+{
+ int ret;
+ struct address_space *mapping = inode->i_mapping;
+ loff_t pos = page->index << PAGE_CACHE_SHIFT;
+ unsigned int len = PAGE_CACHE_SIZE;
+ pgoff_t last_index;
+ struct page *locked_page = NULL;
+ void *fsdata;
+ loff_t size = i_size_read(inode);
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+ /*
+ * Another node might have truncated while we were waiting on
+ * cluster locks.
+ */
+ last_index = size >> PAGE_CACHE_SHIFT;
+ if (page->index > last_index) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * The i_size check above doesn't catch the case where nodes
+ * truncated and then re-extended the file. We'll re-check the
+ * page mapping after taking the page lock inside of
+ * ocfs2_write_begin_nolock().
+ */
+ if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Call ocfs2_write_begin() and ocfs2_write_end() to take
+ * advantage of the allocation code there. We pass a write
+ * length of the whole page (chopped to i_size) to make sure
+ * the whole thing is allocated.
+ *
+ * Since we know the page is up to date, we don't have to
+ * worry about ocfs2_write_begin() skipping some buffer reads
+ * because the "write" would invalidate their data.
+ */
+ if (page->index == last_index)
+ len = size & ~PAGE_CACHE_MASK;
+
+ ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+ &fsdata, di_bh, page);
+ if (ret) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+ fsdata);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ BUG_ON(ret != len);
+ ret = 0;
+out:
+ return ret;
+}
+
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
- int ret = 0, lock_level = 0;
- struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
+ struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+ struct buffer_head *di_bh = NULL;
+ sigset_t blocked, oldset;
+ int ret, ret2;
+
+ ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ /*
+ * The cluster locks taken will block a truncate from another
+ * node. Taking the data lock will also ensure that we don't
+ * attempt page truncation as part of a downconvert.
+ */
+ ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
- * Only support shared writeable mmap for local mounts which
- * don't know about holes.
+ * The alloc sem should be enough to serialize with
+ * ocfs2_truncate_file() changing i_size as well as any thread
+ * modifying the inode btree.
*/
- if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
- ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
- ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
- mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
- /* This is -EINVAL because generic_file_readonly_mmap
- * returns it in a similar situation. */
- return -EINVAL;
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ ret = ocfs2_data_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out_meta_unlock;
}
+ ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+
+ ocfs2_data_unlock(inode, 1);
+
+out_meta_unlock:
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+ brelse(di_bh);
+ ocfs2_meta_unlock(inode, 1);
+
+out:
+ ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
+ if (ret2 < 0)
+ mlog_errno(ret2);
+
+ return ret;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+ .nopage = ocfs2_nopage,
+ .page_mkwrite = ocfs2_page_mkwrite,
+};
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ int ret = 0, lock_level = 0;
+
ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
file->f_vfsmnt, &lock_level);
if (ret < 0) {
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 36289e6..d430fda 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1674,7 +1674,7 @@ static int ocfs2_symlink(struct inode *dir,
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations;
- status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
+ status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0,
new_fe_bh,
handle, data_ac, NULL,
NULL);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a860633..5cc90a4 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -219,6 +219,7 @@ struct ocfs2_super
u16 max_slots;
s16 node_num;
s16 slot_num;
+ s16 preferred_slot;
int s_sectsize_bits;
int s_clustersize;
int s_clustersize_bits;
@@ -305,6 +306,19 @@ static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
+{
+ /*
+ * Support for sparse files is a pre-requisite
+ */
+ if (!ocfs2_sparse_alloc(osb))
+ return 0;
+
+ if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_UNWRITTEN)
+ return 1;
+ return 0;
+}
+
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index f0d9eb0..82f8a75 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,7 @@
#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
| OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
-#define OCFS2_FEATURE_RO_COMPAT_SUPP 0
+#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
/*
* Heartbeat-only devices are missing journals and other files. The
@@ -116,6 +116,11 @@
*/
#define OCFS2_FEATURE_COMPAT_BACKUP_SB 0x0001
+/*
+ * Unwritten extents support.
+ */
+#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
+
/* The byte offset of the first backup block will be 1G.
* The following will be 4G, 16G, 64G, 256G and 1T.
*/
@@ -170,6 +175,32 @@
#define OCFS2_IOC32_SETFLAGS _IOW('f', 2, int)
/*
+ * Space reservation / allocation / free ioctls and argument structure
+ * are designed to be compatible with XFS.
+ *
+ * ALLOCSP* and FREESP* are not and will never be supported, but are
+ * included here for completeness.
+ */
+struct ocfs2_space_resv {
+ __s16 l_type;
+ __s16 l_whence;
+ __s64 l_start;
+ __s64 l_len; /* len == 0 means until end of file */
+ __s32 l_sysid;
+ __u32 l_pid;
+ __s32 l_pad[4]; /* reserve area */
+};
+
+#define OCFS2_IOC_ALLOCSP _IOW ('X', 10, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP _IOW ('X', 11, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP _IOW ('X', 40, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP _IOW ('X', 41, struct ocfs2_space_resv)
+#define OCFS2_IOC_ALLOCSP64 _IOW ('X', 36, struct ocfs2_space_resv)
+#define OCFS2_IOC_FREESP64 _IOW ('X', 37, struct ocfs2_space_resv)
+#define OCFS2_IOC_RESVSP64 _IOW ('X', 42, struct ocfs2_space_resv)
+#define OCFS2_IOC_UNRESVSP64 _IOW ('X', 43, struct ocfs2_space_resv)
+
+/*
* Journal Flags (ocfs2_dinode.id1.journal1.i_flags)
*/
#define OCFS2_JOURNAL_DIRTY_FL (0x00000001) /* Journal needs recovery */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index d8b7906..af4882b 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -121,17 +121,25 @@ static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
return ret;
}
-static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si)
+static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred)
{
int i;
s16 ret = OCFS2_INVALID_SLOT;
+ if (preferred >= 0 && preferred < si->si_num_slots) {
+ if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) {
+ ret = preferred;
+ goto out;
+ }
+ }
+
for(i = 0; i < si->si_num_slots; i++) {
if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) {
ret = (s16) i;
break;
}
}
+out:
return ret;
}
@@ -248,7 +256,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
if (slot == OCFS2_INVALID_SLOT) {
/* if no slot yet, then just take 1st available
* one. */
- slot = __ocfs2_find_empty_slot(si);
+ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
if (slot == OCFS2_INVALID_SLOT) {
spin_unlock(&si->si_lock);
mlog(ML_ERROR, "no free slots available!\n");
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index e343762..d9c5c9f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -98,14 +98,6 @@ static int ocfs2_relink_block_group(handle_t *handle,
u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
u32 wanted);
-static int ocfs2_free_suballoc_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct buffer_head *alloc_bh,
- unsigned int start_bit,
- u64 bg_blkno,
- unsigned int count);
-static inline u64 ocfs2_which_suballoc_group(u64 block,
- unsigned int bit);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
u64 bg_blkno,
u16 bg_bit_off);
@@ -496,13 +488,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
(*ac)->ac_which = OCFS2_AC_USE_META;
-
-#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
- slot = 0;
-#else
slot = osb->slot_num;
-#endif
-
(*ac)->ac_group_search = ocfs2_block_group_search;
status = ocfs2_reserve_suballoc_bits(osb, (*ac),
@@ -1626,12 +1612,12 @@ bail:
/*
* expects the suballoc inode to already be locked.
*/
-static int ocfs2_free_suballoc_bits(handle_t *handle,
- struct inode *alloc_inode,
- struct buffer_head *alloc_bh,
- unsigned int start_bit,
- u64 bg_blkno,
- unsigned int count)
+int ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count)
{
int status = 0;
u32 tmp_used;
@@ -1703,13 +1689,6 @@ bail:
return status;
}
-static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
-{
- u64 group = block - (u64) bit;
-
- return group;
-}
-
int ocfs2_free_dinode(handle_t *handle,
struct inode *inode_alloc_inode,
struct buffer_head *inode_alloc_bh,
@@ -1723,19 +1702,6 @@ int ocfs2_free_dinode(handle_t *handle,
inode_alloc_bh, bit, bg_blkno, 1);
}
-int ocfs2_free_extent_block(handle_t *handle,
- struct inode *eb_alloc_inode,
- struct buffer_head *eb_alloc_bh,
- struct ocfs2_extent_block *eb)
-{
- u64 blk = le64_to_cpu(eb->h_blkno);
- u16 bit = le16_to_cpu(eb->h_suballoc_bit);
- u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
-
- return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
- bit, bg_blkno, 1);
-}
-
int ocfs2_free_clusters(handle_t *handle,
struct inode *bitmap_inode,
struct buffer_head *bitmap_bh,
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 1a3c94c..f212dc0 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -86,20 +86,29 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb,
u32 *cluster_start,
u32 *num_clusters);
+int ocfs2_free_suballoc_bits(handle_t *handle,
+ struct inode *alloc_inode,
+ struct buffer_head *alloc_bh,
+ unsigned int start_bit,
+ u64 bg_blkno,
+ unsigned int count);
int ocfs2_free_dinode(handle_t *handle,
struct inode *inode_alloc_inode,
struct buffer_head *inode_alloc_bh,
struct ocfs2_dinode *di);
-int ocfs2_free_extent_block(handle_t *handle,
- struct inode *eb_alloc_inode,
- struct buffer_head *eb_alloc_bh,
- struct ocfs2_extent_block *eb);
int ocfs2_free_clusters(handle_t *handle,
struct inode *bitmap_inode,
struct buffer_head *bitmap_bh,
u64 start_blk,
unsigned int num_clusters);
+static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
+{
+ u64 group = block - (u64) bit;
+
+ return group;
+}
+
static inline u32 ocfs2_cluster_from_desc(struct ocfs2_super *osb,
u64 bg_blkno)
{
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 86b559c..3a5a1ed 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -82,7 +82,8 @@ MODULE_AUTHOR("Oracle");
MODULE_LICENSE("GPL");
static int ocfs2_parse_options(struct super_block *sb, char *options,
- unsigned long *mount_opt, int is_remount);
+ unsigned long *mount_opt, s16 *slot,
+ int is_remount);
static void ocfs2_put_super(struct super_block *sb);
static int ocfs2_mount_volume(struct super_block *sb);
static int ocfs2_remount(struct super_block *sb, int *flags, char *data);
@@ -114,8 +115,6 @@ static void ocfs2_write_super(struct super_block *sb);
static struct inode *ocfs2_alloc_inode(struct super_block *sb);
static void ocfs2_destroy_inode(struct inode *inode);
-static unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
-
static const struct super_operations ocfs2_sops = {
.statfs = ocfs2_statfs,
.alloc_inode = ocfs2_alloc_inode,
@@ -140,6 +139,7 @@ enum {
Opt_data_ordered,
Opt_data_writeback,
Opt_atime_quantum,
+ Opt_slot,
Opt_err,
};
@@ -154,6 +154,7 @@ static match_table_t tokens = {
{Opt_data_ordered, "data=ordered"},
{Opt_data_writeback, "data=writeback"},
{Opt_atime_quantum, "atime_quantum=%u"},
+ {Opt_slot, "preferred_slot=%u"},
{Opt_err, NULL}
};
@@ -318,7 +319,7 @@ static void ocfs2_destroy_inode(struct inode *inode)
/* From xfs_super.c:xfs_max_file_offset
* Copyright (c) 2000-2004 Silicon Graphics, Inc.
*/
-static unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
+unsigned long long ocfs2_max_file_offset(unsigned int blockshift)
{
unsigned int pagefactor = 1;
unsigned int bitshift = BITS_PER_LONG - 1;
@@ -355,9 +356,10 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
int incompat_features;
int ret = 0;
unsigned long parsed_options;
+ s16 slot;
struct ocfs2_super *osb = OCFS2_SB(sb);
- if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
+ if (!ocfs2_parse_options(sb, data, &parsed_options, &slot, 1)) {
ret = -EINVAL;
goto out;
}
@@ -534,6 +536,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
struct dentry *root;
int status, sector_size;
unsigned long parsed_opt;
+ s16 slot;
struct inode *inode = NULL;
struct ocfs2_super *osb = NULL;
struct buffer_head *bh = NULL;
@@ -541,7 +544,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
mlog_entry("%p, %p, %i", sb, data, silent);
- if (!ocfs2_parse_options(sb, data, &parsed_opt, 0)) {
+ if (!ocfs2_parse_options(sb, data, &parsed_opt, &slot, 0)) {
status = -EINVAL;
goto read_super_error;
}
@@ -571,6 +574,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
brelse(bh);
bh = NULL;
osb->s_mount_opt = parsed_opt;
+ osb->preferred_slot = slot;
sb->s_magic = OCFS2_SUPER_MAGIC;
@@ -713,6 +717,7 @@ static struct file_system_type ocfs2_fs_type = {
static int ocfs2_parse_options(struct super_block *sb,
char *options,
unsigned long *mount_opt,
+ s16 *slot,
int is_remount)
{
int status;
@@ -722,6 +727,7 @@ static int ocfs2_parse_options(struct super_block *sb,
options ? options : "(none)");
*mount_opt = 0;
+ *slot = OCFS2_INVALID_SLOT;
if (!options) {
status = 1;
@@ -782,6 +788,15 @@ static int ocfs2_parse_options(struct super_block *sb,
else
osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
break;
+ case Opt_slot:
+ option = 0;
+ if (match_int(&args[0], &option)) {
+ status = 0;
+ goto bail;
+ }
+ if (option)
+ *slot = (s16)option;
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h
index 783f527..3b9cb3d 100644
--- a/fs/ocfs2/super.h
+++ b/fs/ocfs2/super.h
@@ -45,4 +45,6 @@ void __ocfs2_abort(struct super_block *sb,
#define ocfs2_abort(sb, fmt, args...) __ocfs2_abort(sb, __PRETTY_FUNCTION__, fmt, ##args)
+unsigned long long ocfs2_max_file_offset(unsigned int blockshift);
+
#endif /* OCFS2_SUPER_H */