Commit 30773840 authored by Theodore Ts'o's avatar Theodore Ts'o

ext4: add fsync batch tuning knobs

Add new mount options, min_batch_time and max_batch_time, which
controls how long the jbd2 layer should wait for additional filesystem
operations to get batched with a synchronous write transaction.
Signed-off-by: default avatar"Theodore Ts'o" <tytso@mit.edu>
parent d7cfa468
...@@ -283,6 +283,35 @@ delalloc (*) Deferring block allocation until write-out time. ...@@ -283,6 +283,35 @@ delalloc (*) Deferring block allocation until write-out time.
nodelalloc Disable delayed allocation. Blocks are allocation nodelalloc Disable delayed allocation. Blocks are allocation
when data is copied from user to page cache. when data is copied from user to page cache.
max_batch_time=usec Maximum amount of time ext4 should wait for
additional filesystem operations to be batch
together with a synchronous write operation.
Since a synchronous write operation is going to
force a commit and then a wait for the I/O
complete, it doesn't cost much, and can be a
huge throughput win, we wait for a small amount
of time to see if any other transactions can
piggyback on the synchronous write. The
algorithm used is designed to automatically tune
for the speed of the disk, by measuring the
amount of time (on average) that it takes to
finish committing a transaction. Call this time
the "commit time". If the time that the
transactoin has been running is less than the
commit time, ext4 will try sleeping for the
commit time to see if other operations will join
the transaction. The commit time is capped by
the max_batch_time, which defaults to 15000us
(15ms). This optimization can be turned off
entirely by setting max_batch_time to 0.
min_batch_time=usec This parameter sets the commit time (as
described above) to be at least min_batch_time.
It defaults to zero microseconds. Increasing
this parameter may improve the throughput of
multi-threaded, synchronous workloads on very
fast disks, at the cost of increasing latency.
Data Mode Data Mode
========= =========
There are 3 different data modes: There are 3 different data modes:
......
...@@ -328,6 +328,7 @@ struct ext4_mount_options { ...@@ -328,6 +328,7 @@ struct ext4_mount_options {
uid_t s_resuid; uid_t s_resuid;
gid_t s_resgid; gid_t s_resgid;
unsigned long s_commit_interval; unsigned long s_commit_interval;
u32 s_min_batch_time, s_max_batch_time;
#ifdef CONFIG_QUOTA #ifdef CONFIG_QUOTA
int s_jquota_fmt; int s_jquota_fmt;
char *s_qf_names[MAXQUOTAS]; char *s_qf_names[MAXQUOTAS];
...@@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ...@@ -805,6 +806,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_ORDERED 0x0040
#define EXT4_DEFM_JMODE_WBACK 0x0060 #define EXT4_DEFM_JMODE_WBACK 0x0060
/*
* Default journal batch times
*/
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
/* /*
* Structure of a directory entry * Structure of a directory entry
*/ */
......
...@@ -74,6 +74,8 @@ struct ext4_sb_info { ...@@ -74,6 +74,8 @@ struct ext4_sb_info {
struct journal_s *s_journal; struct journal_s *s_journal;
struct list_head s_orphan; struct list_head s_orphan;
unsigned long s_commit_interval; unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
struct block_device *journal_bdev; struct block_device *journal_bdev;
#ifdef CONFIG_JBD2_DEBUG #ifdef CONFIG_JBD2_DEBUG
struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */
......
...@@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) ...@@ -705,10 +705,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
#endif #endif
if (!test_opt(sb, RESERVATION)) if (!test_opt(sb, RESERVATION))
seq_puts(seq, ",noreservation"); seq_puts(seq, ",noreservation");
if (sbi->s_commit_interval) { if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
seq_printf(seq, ",commit=%u", seq_printf(seq, ",commit=%u",
(unsigned) (sbi->s_commit_interval / HZ)); (unsigned) (sbi->s_commit_interval / HZ));
} }
if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
seq_printf(seq, ",min_batch_time=%u",
(unsigned) sbi->s_min_batch_time);
}
if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
seq_printf(seq, ",max_batch_time=%u",
(unsigned) sbi->s_min_batch_time);
}
/* /*
* We're changing the default of barrier mount option, so * We're changing the default of barrier mount option, so
* let's always display its mount state so it's clear what its * let's always display its mount state so it's clear what its
...@@ -874,7 +883,8 @@ enum { ...@@ -874,7 +883,8 @@ enum {
Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit, Opt_journal_checksum, Opt_journal_async_commit,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_data_err_abort, Opt_data_err_ignore, Opt_data_err_abort, Opt_data_err_ignore,
...@@ -913,6 +923,8 @@ static const match_table_t tokens = { ...@@ -913,6 +923,8 @@ static const match_table_t tokens = {
{Opt_nobh, "nobh"}, {Opt_nobh, "nobh"},
{Opt_bh, "bh"}, {Opt_bh, "bh"},
{Opt_commit, "commit=%u"}, {Opt_commit, "commit=%u"},
{Opt_min_batch_time, "min_batch_time=%u"},
{Opt_max_batch_time, "max_batch_time=%u"},
{Opt_journal_update, "journal=update"}, {Opt_journal_update, "journal=update"},
{Opt_journal_inum, "journal=%u"}, {Opt_journal_inum, "journal=%u"},
{Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_dev, "journal_dev=%u"},
...@@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb, ...@@ -1131,6 +1143,22 @@ static int parse_options(char *options, struct super_block *sb,
option = JBD2_DEFAULT_MAX_COMMIT_AGE; option = JBD2_DEFAULT_MAX_COMMIT_AGE;
sbi->s_commit_interval = HZ * option; sbi->s_commit_interval = HZ * option;
break; break;
case Opt_max_batch_time:
if (match_int(&args[0], &option))
return 0;
if (option < 0)
return 0;
if (option == 0)
option = EXT4_DEF_MAX_BATCH_TIME;
sbi->s_max_batch_time = option;
break;
case Opt_min_batch_time:
if (match_int(&args[0], &option))
return 0;
if (option < 0)
return 0;
sbi->s_min_batch_time = option;
break;
case Opt_data_journal: case Opt_data_journal:
data_opt = EXT4_MOUNT_JOURNAL_DATA; data_opt = EXT4_MOUNT_JOURNAL_DATA;
goto datacheck; goto datacheck;
...@@ -1979,6 +2007,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ...@@ -1979,6 +2007,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
sbi->s_resgid = le16_to_cpu(es->s_def_resgid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, RESERVATION);
set_opt(sbi->s_mount_opt, BARRIER); set_opt(sbi->s_mount_opt, BARRIER);
...@@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) ...@@ -2524,11 +2555,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
{ {
struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_commit_interval)
journal->j_commit_interval = sbi->s_commit_interval; journal->j_commit_interval = sbi->s_commit_interval;
/* We could also set up an ext4-specific default for the commit journal->j_min_batch_time = sbi->s_min_batch_time;
* interval here, but for now we'll just fall back to the jbd journal->j_max_batch_time = sbi->s_max_batch_time;
* default. */
spin_lock(&journal->j_state_lock); spin_lock(&journal->j_state_lock);
if (test_opt(sb, BARRIER)) if (test_opt(sb, BARRIER))
...@@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ...@@ -3042,6 +3071,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
old_opts.s_resuid = sbi->s_resuid; old_opts.s_resuid = sbi->s_resuid;
old_opts.s_resgid = sbi->s_resgid; old_opts.s_resgid = sbi->s_resgid;
old_opts.s_commit_interval = sbi->s_commit_interval; old_opts.s_commit_interval = sbi->s_commit_interval;
old_opts.s_min_batch_time = sbi->s_min_batch_time;
old_opts.s_max_batch_time = sbi->s_max_batch_time;
#ifdef CONFIG_QUOTA #ifdef CONFIG_QUOTA
old_opts.s_jquota_fmt = sbi->s_jquota_fmt; old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) for (i = 0; i < MAXQUOTAS; i++)
...@@ -3178,6 +3209,8 @@ restore_opts: ...@@ -3178,6 +3209,8 @@ restore_opts:
sbi->s_resuid = old_opts.s_resuid; sbi->s_resuid = old_opts.s_resuid;
sbi->s_resgid = old_opts.s_resgid; sbi->s_resgid = old_opts.s_resgid;
sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_commit_interval = old_opts.s_commit_interval;
sbi->s_min_batch_time = old_opts.s_min_batch_time;
sbi->s_max_batch_time = old_opts.s_max_batch_time;
#ifdef CONFIG_QUOTA #ifdef CONFIG_QUOTA
sbi->s_jquota_fmt = old_opts.s_jquota_fmt; sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
for (i = 0; i < MAXQUOTAS; i++) { for (i = 0; i < MAXQUOTAS; i++) {
......
...@@ -964,6 +964,8 @@ static journal_t * journal_init_common (void) ...@@ -964,6 +964,8 @@ static journal_t * journal_init_common (void)
spin_lock_init(&journal->j_state_lock); spin_lock_init(&journal->j_state_lock);
journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE);
journal->j_min_batch_time = 0;
journal->j_max_batch_time = 15000; /* 15ms */
/* The journal is marked for error until we succeed with recovery! */ /* The journal is marked for error until we succeed with recovery! */
journal->j_flags = JBD2_ABORT; journal->j_flags = JBD2_ABORT;
......
...@@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle) ...@@ -1255,8 +1255,10 @@ int jbd2_journal_stop(handle_t *handle)
trans_time = ktime_to_ns(ktime_sub(ktime_get(), trans_time = ktime_to_ns(ktime_sub(ktime_get(),
transaction->t_start_time)); transaction->t_start_time));
commit_time = max_t(u64, commit_time,
1000*journal->j_min_batch_time);
commit_time = min_t(u64, commit_time, commit_time = min_t(u64, commit_time,
1000*jiffies_to_usecs(1)); 1000*journal->j_max_batch_time);
if (trans_time < commit_time) { if (trans_time < commit_time) {
ktime_t expires = ktime_add_ns(ktime_get(), ktime_t expires = ktime_add_ns(ktime_get(),
......
...@@ -956,6 +956,14 @@ struct journal_s ...@@ -956,6 +956,14 @@ struct journal_s
*/ */
u64 j_average_commit_time; u64 j_average_commit_time;
/*
* minimum and maximum times that we should wait for
* additional filesystem operations to get batched into a
* synchronous handle in microseconds
*/
u32 j_min_batch_time;
u32 j_max_batch_time;
/* This function is called when a transaction is closed */ /* This function is called when a transaction is closed */
void (*j_commit_callback)(journal_t *, void (*j_commit_callback)(journal_t *,
transaction_t *); transaction_t *);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment