Home Home > GIT Browse
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2019-03-20 11:14:50 +0000
committerFilipe Manana <fdmanana@suse.com>2019-03-20 11:14:50 +0000
commite1d910066fd394492356f6de5fbc48dc69a04283 (patch)
treee58fcf5dcaa7307e1c0352a34309d968894f36da
parentdb613f3d4e08211f4f2861d96fb6e3db5866936c (diff)
Btrfs: fix fsync after succession of renames of different files
(bsc#1126481).
-rw-r--r--patches.suse/btrfs-fix-fsync-after-succession-of-renames-of-diffe.patch361
-rw-r--r--series.conf1
2 files changed, 362 insertions, 0 deletions
diff --git a/patches.suse/btrfs-fix-fsync-after-succession-of-renames-of-diffe.patch b/patches.suse/btrfs-fix-fsync-after-succession-of-renames-of-diffe.patch
new file mode 100644
index 0000000000..61d6d54289
--- /dev/null
+++ b/patches.suse/btrfs-fix-fsync-after-succession-of-renames-of-diffe.patch
@@ -0,0 +1,361 @@
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 13 Feb 2019 12:14:03 +0000
+Patch-mainline: 5.1-rc1
+Git-commit: 6b5fc433a7ad6711052d1aa4be0debc6316b219f
+References: bsc#1126481
+Subject: [PATCH] Btrfs: fix fsync after succession of renames of different
+ files
+
+After a succession of rename operations of different files and fsyncing
+one of them, such that each file gets a new name that corresponds to an
+old name of another file, we can end up with a log that will cause a
+failure when attempted to replay at mount time (an EEXIST error).
+We currently have correct behaviour when such succession of renames
+involves only two files, but if there are more files involved, we end up
+not logging all the inodes that are needed, therefore resulting in a
+failure when attempting to replay the log.
+
+Example:
+
+ $ mkfs.btrfs -f /dev/sdb
+ $ mount /dev/sdb /mnt
+
+ $ mkdir /mnt/testdir
+ $ touch /mnt/testdir/fname1
+ $ touch /mnt/testdir/fname2
+
+ $ sync
+
+ $ mv /mnt/testdir/fname1 /mnt/testdir/fname3
+ $ mv /mnt/testdir/fname2 /mnt/testdir/fname4
+ $ ln /mnt/testdir/fname3 /mnt/testdir/fname2
+
+ $ touch /mnt/testdir/fname1
+ $ xfs_io -c "fsync" /mnt/testdir/fname1
+
+ <power failure>
+
+ $ mount /dev/sdb /mnt
+ mount: mount /dev/sdb on /mnt failed: File exists
+
+So fix this by checking all inode dependencies when logging an inode. That
+is, if one logged inode A has a new name that matches the old name of some
+other inode B, check if inode B has a new name that matches the old name
+of some other inode C, and so on. This fix is implemented not by doing any
+recursive function calls but by using an iterative method using a linked
+list that is used in a first-in-first-out fashion.
+
+A test case for fstests follows soon.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+---
+ fs/btrfs/tree-log.c | 241 ++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 197 insertions(+), 44 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index c1feff1ffceb..ce394b750544 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1359,6 +1359,67 @@ static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
+ return ret;
+ }
+
++static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
++ struct inode *dir, struct inode *inode, const char *name,
++ int namelen, u64 ref_index)
++{
++ struct btrfs_dir_item *dir_item;
++ struct btrfs_key key;
++ struct btrfs_path *path;
++ struct inode *other_inode = NULL;
++ int ret;
++
++ path = btrfs_alloc_path();
++ if (!path)
++ return -ENOMEM;
++
++ dir_item = btrfs_lookup_dir_item(NULL, root, path,
++ btrfs_ino(BTRFS_I(dir)),
++ name, namelen, 0);
++ if (!dir_item) {
++ btrfs_release_path(path);
++ goto add_link;
++ } else if (IS_ERR(dir_item)) {
++ ret = PTR_ERR(dir_item);
++ goto out;
++ }
++
++ /*
++ * Our inode's dentry collides with the dentry of another inode which is
++ * in the log but not yet processed since it has a higher inode number.
++ * So delete that other dentry.
++ */
++ btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
++ btrfs_release_path(path);
++ other_inode = read_one_inode(root, key.objectid);
++ if (!other_inode) {
++ ret = -ENOENT;
++ goto out;
++ }
++ ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
++ name, namelen);
++ if (ret)
++ goto out;
++ /*
++ * If we dropped the link count to 0, bump it so that later the iput()
++ * on the inode will not free it. We will fixup the link count later.
++ */
++ if (other_inode->i_nlink == 0)
++ inc_nlink(other_inode);
++
++ ret = btrfs_run_delayed_items(trans, trans->fs_info);
++ if (ret)
++ goto out;
++add_link:
++ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
++ name, namelen, 0, ref_index);
++out:
++ iput(other_inode);
++ btrfs_free_path(path);
++
++ return ret;
++}
++
+ /*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+@@ -1495,9 +1556,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+ goto out;
+
+ /* insert our name */
+- ret = btrfs_add_link(trans, BTRFS_I(dir),
+- BTRFS_I(inode),
+- name, namelen, 0, ref_index);
++ ret = add_link(trans, root, dir, inode, name, namelen,
++ ref_index);
+ if (ret)
+ goto out;
+
+@@ -4960,8 +5020,12 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+ btrfs_dir_item_key_to_cpu(search_path->nodes[0],
+ di, &di_key);
+ if (di_key.type == BTRFS_INODE_ITEM_KEY) {
+- ret = 1;
+- *other_ino = di_key.objectid;
++ if (di_key.objectid != key->objectid) {
++ ret = 1;
++ *other_ino = di_key.objectid;
++ } else {
++ ret = 0;
++ }
+ } else {
+ ret = -EAGAIN;
+ }
+@@ -4981,6 +5045,126 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb,
+ return ret;
+ }
+
++struct btrfs_ino_list {
++ u64 ino;
++ struct list_head list;
++};
++
++static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
++ struct btrfs_root *root,
++ struct btrfs_path *path,
++ struct btrfs_log_ctx *ctx,
++ u64 ino)
++{
++ struct btrfs_ino_list *ino_elem;
++ LIST_HEAD(inode_list);
++ int ret = 0;
++
++ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
++ if (!ino_elem)
++ return -ENOMEM;
++ ino_elem->ino = ino;
++ list_add_tail(&ino_elem->list, &inode_list);
++
++ while (!list_empty(&inode_list)) {
++ struct btrfs_fs_info *fs_info = root->fs_info;
++ struct btrfs_key key;
++ struct inode *inode;
++
++ ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
++ list);
++ ino = ino_elem->ino;
++ list_del(&ino_elem->list);
++ kfree(ino_elem);
++ if (ret)
++ continue;
++
++ btrfs_release_path(path);
++
++ key.objectid = ino;
++ key.type = BTRFS_INODE_ITEM_KEY;
++ key.offset = 0;
++ inode = btrfs_iget(fs_info->sb, &key, root, NULL);
++ /*
++ * If the other inode that had a conflicting dir entry was
++ * deleted in the current transaction, we don't need to do more
++ * work nor fallback to a transaction commit.
++ */
++ if (IS_ERR(inode)) {
++ ret = PTR_ERR(inode);
++ if (ret == -ENOENT)
++ ret = 0;
++ continue;
++ }
++ /*
++ * We are safe logging the other inode without acquiring its
++ * lock as long as we log with the LOG_INODE_EXISTS mode. We
++ * are safe against concurrent renames of the other inode as
++ * well because during a rename we pin the log and update the
++ * log with the new name before we unpin it.
++ */
++ ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
++ LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
++ if (ret) {
++ iput(inode);
++ continue;
++ }
++
++ key.objectid = ino;
++ key.type = BTRFS_INODE_REF_KEY;
++ key.offset = 0;
++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
++ if (ret < 0) {
++ iput(inode);
++ continue;
++ }
++
++ while (true) {
++ struct extent_buffer *leaf = path->nodes[0];
++ int slot = path->slots[0];
++ u64 other_ino = 0;
++
++ if (slot >= btrfs_header_nritems(leaf)) {
++ ret = btrfs_next_leaf(root, path);
++ if (ret < 0) {
++ break;
++ } else if (ret > 0) {
++ ret = 0;
++ break;
++ }
++ continue;
++ }
++
++ btrfs_item_key_to_cpu(leaf, &key, slot);
++ if (key.objectid != ino ||
++ (key.type != BTRFS_INODE_REF_KEY &&
++ key.type != BTRFS_INODE_EXTREF_KEY)) {
++ ret = 0;
++ break;
++ }
++
++ ret = btrfs_check_ref_name_override(leaf, slot, &key,
++ BTRFS_I(inode), &other_ino);
++ if (ret < 0)
++ break;
++ if (ret > 0) {
++ ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
++ if (!ino_elem) {
++ ret = -ENOMEM;
++ break;
++ }
++ ino_elem->ino = other_ino;
++ list_add_tail(&ino_elem->list, &inode_list);
++ ret = 0;
++ }
++ path->slots[0]++;
++ }
++ iput(inode);
++ }
++
++ return ret;
++}
++
+ /* log a single inode in the tree log.
+ * At least one parent directory for this inode must exist in the tree
+ * or be logged already.
+@@ -5022,6 +5206,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ u64 logged_isize = 0;
+ bool need_log_inode_item = true;
+ bool xattrs_logged = false;
++ bool recursive_logging = (inode_only == LOG_OTHER_INODE);
+
+ path = btrfs_alloc_path();
+ if (!path)
+@@ -5163,7 +5348,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+
+ if ((min_key.type == BTRFS_INODE_REF_KEY ||
+ min_key.type == BTRFS_INODE_EXTREF_KEY) &&
+- inode->generation == trans->transid) {
++ inode->generation == trans->transid &&
++ !recursive_logging) {
+ u64 other_ino = 0;
+
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+@@ -5174,9 +5360,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ goto out_unlock;
+ } else if (ret > 0 && ctx &&
+ other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+- struct btrfs_key inode_key;
+- struct inode *other_inode;
+-
+ if (ins_nr > 0) {
+ ins_nr++;
+ } else {
+@@ -5192,43 +5375,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+ goto out_unlock;
+ }
+ ins_nr = 0;
+- btrfs_release_path(path);
+- inode_key.objectid = other_ino;
+- inode_key.type = BTRFS_INODE_ITEM_KEY;
+- inode_key.offset = 0;
+- other_inode = btrfs_iget(fs_info->sb,
+- &inode_key, root,
+- NULL);
+- /*
+- * If the other inode that had a conflicting dir
+- * entry was deleted in the current transaction,
+- * we don't need to do more work nor fallback to
+- * a transaction commit.
+- */
+- if (other_inode == ERR_PTR(-ENOENT)) {
+- goto next_key;
+- } else if (IS_ERR(other_inode)) {
+- err = PTR_ERR(other_inode);
+- goto out_unlock;
+- }
+- /*
+- * We are safe logging the other inode without
+- * acquiring its i_mutex as long as we log with
+- * the LOG_INODE_EXISTS mode. We're safe against
+- * concurrent renames of the other inode as well
+- * because during a rename we pin the log and
+- * update the log with the new name before we
+- * unpin it.
+- */
+- err = btrfs_log_inode(trans, root,
+- BTRFS_I(other_inode),
+- LOG_OTHER_INODE, 0, LLONG_MAX,
+- ctx);
+- iput(other_inode);
++
++ err = log_conflicting_inodes(trans, root, path,
++ ctx, other_ino);
+ if (err)
+ goto out_unlock;
+- else
+- goto next_key;
++ btrfs_release_path(path);
++ goto next_key;
+ }
+ }
+
+--
+2.19.0
+
diff --git a/series.conf b/series.conf
index 2511fd7715..0ba9ccd433 100644
--- a/series.conf
+++ b/series.conf
@@ -20814,6 +20814,7 @@
patches.suse/0003-btrfs-qgroup-Introduce-per-root-swapped-blocks-infra.patch
patches.suse/0004-btrfs-qgroup-Use-delayed-subtree-rescan-for-balance.patch
patches.suse/0005-btrfs-qgroup-Cleanup-old-subtree-swap-code.patch
+ patches.suse/btrfs-fix-fsync-after-succession-of-renames-of-diffe.patch
patches.suse/btrfs-ensure-that-a-dup-or-raid1-block-group-has-exactly-two-stripes.patch
patches.arch/powerpc-pseries-Perform-full-re-add-of-CPU-for-topol.patch
patches.suse/powerpc-64s-clear-on-stack-exception-marker-upon-exception-return.patch