ocfs2: implement directory read-ahead

Uptodate.c now knows about read-ahead buffers. Use some more aggressive logic in ocfs2_readdir(). The two functions which currently use directory read-ahead are ocfs2_find_entry() and ocfs2_readdir(). Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

ocfs2: implement directory read-ahead
Uptodate.c now knows about read-ahead buffers. Use some more aggressive logic in ocfs2_readdir(). The two functions which currently use directory read-ahead are ocfs2_find_entry() and ocfs2_readdir(). Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
aa958874 · Mark Fasheh · e0b4096d · aa958874 · aa958874 · aa958874
Commit aa958874 authored Apr 21, 2006 by Mark Fasheh
7 changed files
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -100,6 +100,9 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 	mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n",
 		   (unsigned long long)block, nr, flags, inode);
+	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
+	       (!inode || !(flags & OCFS2_BH_CACHED)));
 	if (osb == NULL || osb->sb == NULL || bhs == NULL) {
 		status = -EINVAL;
 		mlog_errno(status);
@@ -140,6 +143,30 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 		bh = bhs[i];
 		ignore_cache = 0;
+		/* There are three read-ahead cases here which we need to
+		 * be concerned with. All three assume a buffer has
+		 * previously been submitted with OCFS2_BH_READAHEAD
+		 * and it hasn't yet completed I/O.
+		 *
+		 * 1) The current request is sync to disk. This rarely
+		 *    happens these days, and never when performance
+		 *    matters - the code can just wait on the buffer
+		 *    lock and re-submit.
+		 *
+		 * 2) The current request is cached, but not
+		 *    readahead. ocfs2_buffer_uptodate() will return
+		 *    false anyway, so we'll wind up waiting on the
+		 *    buffer lock to do I/O. We re-check the request
+		 *    with after getting the lock to avoid a re-submit.
+		 *
+		 * 3) The current request is readahead (and so must
+		 *    also be a caching one). We short circuit if the
+		 *    buffer is locked (under I/O) and if it's in the
+		 *    uptodate cache. The re-check from #2 catches the
+		 *    case that the previous read-ahead completes just
+		 *    before our is-it-in-flight check.
+		 */
 		if (flags & OCFS2_BH_CACHED &&
 		    !ocfs2_buffer_uptodate(inode, bh)) {
 			mlog(ML_UPTODATE,
@@ -169,6 +196,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 				continue;
 			}
+			/* A read-ahead request was made - if the
+			 * buffer is already under read-ahead from a
+			 * previously submitted request than we are
+			 * done here. */
+			if ((flags & OCFS2_BH_READAHEAD)
+			    && ocfs2_buffer_read_ahead(inode, bh))
+				continue;
 			lock_buffer(bh);
 			if (buffer_jbd(bh)) {
 #ifdef CATCH_BH_JBD_RACES
@@ -181,13 +216,22 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 				continue;
 #endif
 			}
+			/* Re-check ocfs2_buffer_uptodate() as a
+			 * previously read-ahead buffer may have
+			 * completed I/O while we were waiting for the
+			 * buffer lock. */
+			if ((flags & OCFS2_BH_CACHED)
+			    && !(flags & OCFS2_BH_READAHEAD)
+			    && ocfs2_buffer_uptodate(inode, bh)) {
+				unlock_buffer(bh);
+				continue;
+			}
 			clear_buffer_uptodate(bh);
 			get_bh(bh); /* for end_buffer_read_sync() */
 			bh->b_end_io = end_buffer_read_sync;
-			if (flags & OCFS2_BH_READAHEAD)
+			submit_bh(READ, bh);
-				submit_bh(READA, bh);
-			else
-				submit_bh(READ, bh);
 			continue;
 		}
 	}
@@ -197,34 +241,39 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 	for (i = (nr - 1); i >= 0; i--) {
 		bh = bhs[i];
-		/* We know this can't have changed as we hold the
+		if (!(flags & OCFS2_BH_READAHEAD)) {
-		 * inode sem. Avoid doing any work on the bh if the
+			/* We know this can't have changed as we hold the
-		 * journal has it. */
+			 * inode sem. Avoid doing any work on the bh if the
-		if (!buffer_jbd(bh))
+			 * journal has it. */
-			wait_on_buffer(bh);
+			if (!buffer_jbd(bh))
+				wait_on_buffer(bh);
-		if (!buffer_uptodate(bh)) {
-			/* Status won't be cleared from here on out,
+			if (!buffer_uptodate(bh)) {
-			 * so we can safely record this and loop back
+				/* Status won't be cleared from here on out,
-			 * to cleanup the other buffers. Don't need to
+				 * so we can safely record this and loop back
-			 * remove the clustered uptodate information
+				 * to cleanup the other buffers. Don't need to
-			 * for this bh as it's not marked locally
+				 * remove the clustered uptodate information
-			 * uptodate. */
+				 * for this bh as it's not marked locally
-			status = -EIO;
+				 * uptodate. */
-			brelse(bh);
+				status = -EIO;
-			bhs[i] = NULL;
+				brelse(bh);
-			continue;
+				bhs[i] = NULL;
+				continue;
+			}
 		}
+		/* Always set the buffer in the cache, even if it was
+		 * a forced read, or read-ahead which hasn't yet
+		 * completed. */
 		if (inode)
 			ocfs2_set_buffer_uptodate(inode, bh);
 	}
 	if (inode)
 		mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
-	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s\n", 
+	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 
 	     (unsigned long long)block, nr,
-	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
+	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags);
 bail:

--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -49,7 +49,7 @@ int ocfs2_read_blocks(struct ocfs2_super          *osb,
 #define OCFS2_BH_CACHED            1
-#define OCFS2_BH_READAHEAD         8	/* use this to pass READA down to submit_bh */
+#define OCFS2_BH_READAHEAD         8
 static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off,
 				   struct buffer_head **bh, int flags,

--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -74,14 +74,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 {
 	int error = 0;
-	unsigned long offset, blk;
+	unsigned long offset, blk, last_ra_blk = 0;
-	int i, num, stored;
+	int i, stored;
 	struct buffer_head * bh, * tmp;
 	struct ocfs2_dir_entry * de;
 	int err;
 	struct inode *inode = filp->f_dentry->d_inode;
 	struct super_block * sb = inode->i_sb;
-	int have_disk_lock = 0;
+	unsigned int ra_sectors = 16;
 	mlog_entry("dirino=%llu\n",
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -95,9 +95,8 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			mlog_errno(error);
 		/* we haven't got any yet, so propagate the error. */
 		stored = error;
-		goto bail;
+		goto bail_nolock;
 	}
-	have_disk_lock = 1;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
@@ -113,16 +112,21 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 			continue;
 		}
-		/*
+		/* The idea here is to begin with 8k read-ahead and to stay
-		 * Do the readahead (8k)
+		 * 4k ahead of our current position.
-		 */
+		 *
-		if (!offset) {
+		 * TODO: Use the pagecache for this. We just need to
-			for (i = 16 >> (sb->s_blocksize_bits - 9), num = 0;
+		 * make sure it's cluster-safe... */
+		if (!last_ra_blk
+		    || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
+			for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
 			     i > 0; i--) {
 				tmp = ocfs2_bread(inode, ++blk, &err, 1);
 				if (tmp)
 					brelse(tmp);
 			}
+			last_ra_blk = blk;
+			ra_sectors = 8;
 		}
 revalidate:
@@ -194,9 +198,9 @@ revalidate:
 	stored = 0;
 bail:
-	if (have_disk_lock)
+	ocfs2_meta_unlock(inode, 0);
-		ocfs2_meta_unlock(inode, 0);
+bail_nolock:
 	mlog_exit(stored);
 	return stored;

--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1050,12 +1050,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 	u64 p_blkno;
 	int readflags = OCFS2_BH_CACHED;
-#if 0
-	/* only turn this on if we know we can deal with read_block
-	 * returning nothing */
 	if (reada)
 		readflags |= OCFS2_BH_READAHEAD;
-#endif
 	if (((u64)block << inode->i_sb->s_blocksize_bits) >=
 	    i_size_read(inode)) {

--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -56,6 +56,7 @@
 #include "journal.h"
 #include "namei.h"
 #include "suballoc.h"
+#include "super.h"
 #include "symlink.h"
 #include "sysfile.h"
 #include "uptodate.h"
@@ -1962,13 +1963,8 @@ restart:
 				}
 				num++;
-				/* XXX: questionable readahead stuff here */
 				bh = ocfs2_bread(dir, b++, &err, 1);
 				bh_use[ra_max] = bh;
-#if 0		// ???
-				if (bh)
-					ll_rw_block(READ, 1, &bh);
-#endif
 			}
 		}
 		if ((bh = bh_use[ra_ptr++]) == NULL)
@@ -1976,6 +1972,10 @@ restart:
 		wait_on_buffer(bh);
 		if (!buffer_uptodate(bh)) {
 			/* read error, skip block & hope for the best */
+			ocfs2_error(dir->i_sb, "reading directory %llu, "
+				    "offset %lu\n",
+				    (unsigned long long)OCFS2_I(dir)->ip_blkno,
+				    block);
 			brelse(bh);
 			goto next;
 		}

--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -206,7 +206,10 @@ static int ocfs2_buffer_cached(struct ocfs2_inode_info *oi,
 }
 /* Warning: even if it returns true, this does *not* guarantee that
- * the block is stored in our inode metadata cache. */
+ * the block is stored in our inode metadata cache. 
+ * 
+ * This can be called under lock_buffer()
+ */
 int ocfs2_buffer_uptodate(struct inode *inode,
 			  struct buffer_head *bh)
 {
@@ -226,6 +229,16 @@ int ocfs2_buffer_uptodate(struct inode *inode,
 	return ocfs2_buffer_cached(OCFS2_I(inode), bh);
 }
+/* 
+ * Determine whether a buffer is currently out on a read-ahead request.
+ * ip_io_sem should be held to serialize submitters with the logic here.
+ */
+int ocfs2_buffer_read_ahead(struct inode *inode,
+			    struct buffer_head *bh)
+{
+	return buffer_locked(bh) && ocfs2_buffer_cached(OCFS2_I(inode), bh);
+}
 /* Requires ip_lock */
 static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
 				     sector_t block)
@@ -403,7 +416,11 @@ out_free:
 *
 * Note that this function may actually fail to insert the block if
 * memory cannot be allocated. This is not fatal however (but may
- * result in a performance penalty) */
+ * result in a performance penalty)
+ *
+ * Readahead buffers can be passed in here before the I/O request is
+ * completed.
+ */
 void ocfs2_set_buffer_uptodate(struct inode *inode,
 			       struct buffer_head *bh)
 {

--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -40,5 +40,7 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
 				   struct buffer_head *bh);
 void ocfs2_remove_from_cache(struct inode *inode,
 			     struct buffer_head *bh);
+int ocfs2_buffer_read_ahead(struct inode *inode,
+			    struct buffer_head *bh);
 #endif /* OCFS2_UPTODATE_H */