Home Home > GIT Browse > SLE15-SP1
diff options
authorDave Chinner <dchinner@redhat.com>2014-05-20 08:18:09 +1000
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2014-08-14 09:38:26 +0800
commit23647e98786630a3d0ef65cc0304dc75f2da0eb7 (patch)
parenta7129df1c448ee418dd53c00404bfc5a168aca53 (diff)
xfs: log vector rounding leaks log space
commit 110dc24ad2ae4e9b94b08632fe1eb2fcdff83045 upstream. The addition of direct formatting of log items into the CIL linear buffer added alignment restrictions that the start of each vector needed to be 64 bit aligned. Hence padding was added in xlog_finish_iovec() to round up the vector length to ensure the next vector started with the correct alignment. This adds a small number of bytes to the size of the linear buffer that is otherwise unused. The issue is that we then use the linear buffer size to determine the log space used by the log item, and this includes the unused space. Hence when we account for space used by the log item, it's more than is actually written into the iclogs, and hence we slowly leak this space. This results on log hangs when reserving space, with threads getting stuck with these stack traces: Call Trace: [<ffffffff81d15989>] schedule+0x29/0x70 [<ffffffff8150d3a2>] xlog_grant_head_wait+0xa2/0x1a0 [<ffffffff8150d55d>] xlog_grant_head_check+0xbd/0x140 [<ffffffff8150ee33>] xfs_log_reserve+0x103/0x220 [<ffffffff814b7f05>] xfs_trans_reserve+0x2f5/0x310 ..... The 4 bytes is significant. Brain Foster did all the hard work in tracking down a reproducable leak to inode chunk allocation (it went away with the ikeep mount option). His rough numbers were that creating 50,000 inodes leaked 11 log blocks. This turns out to be roughly 800 inode chunks or 1600 inode cluster buffers. That works out at roughly 4 bytes per cluster buffer logged, and at that I started looking for a 4 byte leak in the buffer logging code. What I found was that a struct xfs_buf_log_format structure for an inode cluster buffer is 28 bytes in length. This gets rounded up to 32 bytes, but the vector length remains 28 bytes. Hence the CIL ticket reservation is decremented by 32 bytes (via lv->lv_buf_len) for that vector rather than 28 bytes which are written into the log. The fix for this problem is to separately track the bytes used by the log vectors in the item and use that instead of the buffer length when accounting for the log space that will be used by the formatted log item. Again, thanks to Brian Foster for doing all the hard work and long hours to isolate this leak and make finding the bug relatively simple. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Brian Foster <bfoster@redhat.com> Signed-off-by: Dave Chinner <david@fromorbit.com> Cc: Bill <billstuff2001@sbcglobal.net> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2 files changed, 17 insertions, 9 deletions
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index b0f4ef77fa70..bf9781e9fd92 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -24,7 +24,8 @@ struct xfs_log_vec {
struct xfs_log_iovec *lv_iovecp; /* iovec array */
struct xfs_log_item *lv_item; /* owner */
char *lv_buf; /* formatted buffer */
- int lv_buf_len; /* size of formatted buffer */
+ int lv_bytes; /* accounted space in buffer */
+ int lv_buf_len; /* aligned size of buffer */
int lv_size; /* size of allocated lv */
@@ -52,15 +53,21 @@ xlog_prepare_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec **vecp,
return vec->i_addr;
+ * We need to make sure the next buffer is naturally aligned for the biggest
+ * basic data type we put into it. We already accounted for this padding when
+ * sizing the buffer.
+ *
+ * However, this padding does not get written into the log, and hence we have to
+ * track the space used by the log vectors separately to prevent log space hangs
+ * due to inaccurate accounting (i.e. a leak) of the used log space through the
+ * CIL context ticket.
+ */
static inline void
xlog_finish_iovec(struct xfs_log_vec *lv, struct xfs_log_iovec *vec, int len)
- /*
- * We need to make sure the next buffer is naturally aligned for the
- * biggest basic data type we put into it. We already accounted for
- * this when sizing the buffer.
- */
lv->lv_buf_len += round_up(len, sizeof(uint64_t));
+ lv->lv_bytes += len;
vec->i_len = len;
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4ef6fdbced78..bcfbaae4702c 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -97,7 +97,7 @@ xfs_cil_prepare_item(
/* Account for the new LV being passed in */
if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) {
- *diff_len += lv->lv_buf_len;
+ *diff_len += lv->lv_bytes;
*diff_iovecs += lv->lv_niovecs;
@@ -111,7 +111,7 @@ xfs_cil_prepare_item(
else if (old_lv != lv) {
ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
- *diff_len -= old_lv->lv_buf_len;
+ *diff_len -= old_lv->lv_bytes;
*diff_iovecs -= old_lv->lv_niovecs;
@@ -239,7 +239,7 @@ xlog_cil_insert_format_items(
* that the space reservation accounting is correct.
*diff_iovecs -= lv->lv_niovecs;
- *diff_len -= lv->lv_buf_len;
+ *diff_len -= lv->lv_bytes;
} else {
/* allocate new data chunk */
lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
@@ -259,6 +259,7 @@ xlog_cil_insert_format_items(
/* The allocated data region lies beyond the iovec region */
lv->lv_buf_len = 0;
+ lv->lv_bytes = 0;
lv->lv_buf = (char *)lv + buf_size - nbytes;
ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));