Commit 58617d5e authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: Remove automatic enabling of the HUGE_FILE feature flag
  ext4: Replace hackish ext4_mb_poll_new_transaction with commit callback
  ext4: Update Documentation/filesystems/ext4.txt
  ext4: Remove unused mount options: nomballoc, mballoc, nocheck
  ext4: Remove compile warnings when building w/o CONFIG_PROC_FS
  ext4: Add missing newlines to printk messages
  ext4: Fix file fragmentation during large file write.
  vfs: Add no_nrwrite_index_update writeback control flag
  vfs: Remove the range_cont writeback mode.
  ext4: Use tag dirty lookup during mpage_da_submit_io
  ext4: let the block device know when unused blocks can be discarded
  ext4: Don't reuse released data blocks until transaction commits
  ext4: Use an rbtree for tracking blocks freed during transaction.
  ext4: Do mballoc init before doing filesystem recovery
  ext4: Free ext4_prealloc_space using kmem_cache_free
  ext4: Fix Kconfig typo for ext4dev
  ext4: Remove an old reference to ext4dev in Makefile comment
parents 26e9a397 f287a1a5
......@@ -2,19 +2,24 @@
Ext4 Filesystem
===============
This is a development version of the ext4 filesystem, an advanced level
of the ext3 filesystem which incorporates scalability and reliability
enhancements for supporting large filesystems (64 bit) in keeping with
increasing disk capacities and state-of-the-art feature requirements.
Ext4 is an advanced level of the ext3 filesystem which incorporates
scalability and reliability enhancements for supporting large filesystems
(64 bit) in keeping with increasing disk capacities and state-of-the-art
feature requirements.
Mailing list: linux-ext4@vger.kernel.org
Mailing list: linux-ext4@vger.kernel.org
Web site: http://ext4.wiki.kernel.org
1. Quick usage instructions:
===========================
Note: More extensive information for getting started with ext4 can be
found at the ext4 wiki site at the URL:
http://ext4.wiki.kernel.org/index.php/Ext4_Howto
- Compile and install the latest version of e2fsprogs (as of this
writing version 1.41) from:
writing version 1.41.3) from:
http://sourceforge.net/project/showfiles.php?group_id=2406
......@@ -36,11 +41,9 @@ Mailing list: linux-ext4@vger.kernel.org
# mke2fs -t ext4 /dev/hda1
Or configure an existing ext3 filesystem to support extents and set
the test_fs flag to indicate that it's ok for an in-development
filesystem to touch this filesystem:
Or to configure an existing ext3 filesystem to support extents:
# tune2fs -O extents -E test_fs /dev/hda1
# tune2fs -O extents /dev/hda1
If the filesystem was created with 128 byte inodes, it can be
converted to use 256 byte for greater efficiency via:
......@@ -104,8 +107,8 @@ exist yet so I'm not sure they're in the near-term roadmap.
The big performance win will come with mballoc, delalloc and flex_bg
grouping of bitmaps and inode tables. Some test results available here:
- http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
- http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
- http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html
- http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html
3. Options
==========
......@@ -214,9 +217,6 @@ noreservation
bsddf (*) Make 'df' act like BSD.
minixdf Make 'df' act like Minix.
check=none Don't do extra checking of bitmaps on mount.
nocheck
debug Extra debugging information is sent to syslog.
errors=remount-ro(*) Remount the filesystem read-only on an error.
......@@ -253,8 +253,6 @@ nobh (a) cache disk block mapping information
"nobh" option tries to avoid associating buffer
heads (supported only for "writeback" mode).
mballoc (*) Use the multiple block allocator for block allocation
nomballoc disabled multiple block allocator for block allocation.
stripe=n Number of filesystem blocks that mballoc will try
to use for allocation size and alignment. For RAID5/6
systems this should be the number of data
......
......@@ -160,7 +160,7 @@ config EXT4_FS
filesystem initially.
To compile this file system support as a module, choose M here. The
module will be called ext4dev.
module will be called ext4.
If unsure, say N.
......
......@@ -71,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3
obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4dev
obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4
obj-$(CONFIG_JBD) += jbd/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_EXT2_FS) += ext2/
......
......@@ -568,8 +568,16 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
/* this isn't the right place to decide whether block is metadata
* inode.c/extents.c knows better, but for safety ... */
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) ||
ext4_should_journal_data(inode))
if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
metadata = 1;
/* We need to make sure we don't reuse
 * a released block until the transaction commits.
 * Writeback mode has weak data consistency, so
 * don't force data to be treated as metadata when
 * freeing blocks in writeback mode.
 */
if (metadata == 0 && !ext4_should_writeback_data(inode))
metadata = 1;
sb = inode->i_sb;
......
......@@ -511,7 +511,6 @@ do { \
/*
* Mount flags
*/
#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */
#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
......
......@@ -99,9 +99,6 @@ struct ext4_sb_info {
struct inode *s_buddy_cache;
long s_blocks_reserved;
spinlock_t s_reserve_lock;
struct list_head s_active_transaction;
struct list_head s_closed_transaction;
struct list_head s_committed_transaction;
spinlock_t s_md_lock;
tid_t s_last_transaction;
unsigned short *s_mb_offsets, *s_mb_maxs;
......
......@@ -1648,6 +1648,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
int ret = 0, err, nr_pages, i;
unsigned long index, end;
struct pagevec pvec;
long pages_skipped;
BUG_ON(mpd->next_page <= mpd->first_page);
pagevec_init(&pvec, 0);
......@@ -1655,20 +1656,30 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
end = mpd->next_page - 1;
while (index <= end) {
/* XXX: optimize tail */
nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
/*
* We can use PAGECACHE_TAG_DIRTY lookup here because
* even though we have cleared the dirty flag on the page
* We still keep the page in the radix tree with tag
* PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io.
* The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback
* which is called via the below writepage callback.
*/
nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY,
min(end - index,
(pgoff_t)PAGEVEC_SIZE-1) + 1);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
index = page->index;
if (index > end)
break;
index++;
pages_skipped = mpd->wbc->pages_skipped;
err = mapping->a_ops->writepage(page, mpd->wbc);
if (!err)
if (!err && (pages_skipped == mpd->wbc->pages_skipped))
/*
* have successfully written the page
* without skipping the same
*/
mpd->pages_written++;
/*
* In error case, we have to continue because
......@@ -2104,7 +2115,6 @@ static int mpage_da_writepages(struct address_space *mapping,
struct writeback_control *wbc,
struct mpage_da_data *mpd)
{
long to_write;
int ret;
if (!mpd->get_block)
......@@ -2119,19 +2129,18 @@ static int mpage_da_writepages(struct address_space *mapping,
mpd->pages_written = 0;
mpd->retval = 0;
to_write = wbc->nr_to_write;
ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
/*
* Handle last extent of pages
*/
if (!mpd->io_done && mpd->next_page != mpd->first_page) {
if (mpage_da_map_blocks(mpd) == 0)
mpage_da_submit_io(mpd);
}
wbc->nr_to_write = to_write - mpd->pages_written;
mpd->io_done = 1;
ret = MPAGE_DA_EXTENT_TAIL;
}
wbc->nr_to_write -= mpd->pages_written;
return ret;
}
......@@ -2360,12 +2369,14 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
static int ext4_da_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
pgoff_t index;
int range_whole = 0;
handle_t *handle = NULL;
loff_t range_start = 0;
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
int no_nrwrite_index_update;
long pages_written = 0, pages_skipped;
int needed_blocks, ret = 0, nr_to_writebump = 0;
long to_write, pages_skipped = 0;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
/*
......@@ -2385,23 +2396,26 @@ static int ext4_da_writepages(struct address_space *mapping,
nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
wbc->nr_to_write = sbi->s_mb_stream_request;
}
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
if (!wbc->range_cyclic)
/*
* If range_cyclic is not set force range_cont
* and save the old writeback_index
*/
wbc->range_cont = 1;
range_start = wbc->range_start;
pages_skipped = wbc->pages_skipped;
if (wbc->range_cyclic)
index = mapping->writeback_index;
else
index = wbc->range_start >> PAGE_CACHE_SHIFT;
mpd.wbc = wbc;
mpd.inode = mapping->host;
restart_loop:
to_write = wbc->nr_to_write;
while (!ret && to_write > 0) {
/*
* we don't want write_cache_pages to update
* nr_to_write and writeback_index
*/
no_nrwrite_index_update = wbc->no_nrwrite_index_update;
wbc->no_nrwrite_index_update = 1;
pages_skipped = wbc->pages_skipped;
while (!ret && wbc->nr_to_write > 0) {
/*
* we insert one extent at a time. So we need
......@@ -2422,48 +2436,53 @@ restart_loop:
dump_stack();
goto out_writepages;
}
to_write -= wbc->nr_to_write;
mpd.get_block = ext4_da_get_block_write;
ret = mpage_da_writepages(mapping, wbc, &mpd);
ext4_journal_stop(handle);
if (mpd.retval == -ENOSPC)
if (mpd.retval == -ENOSPC) {
/* commit the transaction which would
* free blocks released in the transaction
* and try again
*/
jbd2_journal_force_commit_nested(sbi->s_journal);
/* reset the retry count */
if (ret == MPAGE_DA_EXTENT_TAIL) {
wbc->pages_skipped = pages_skipped;
ret = 0;
} else if (ret == MPAGE_DA_EXTENT_TAIL) {
/*
* got one extent now try with
* rest of the pages
*/
to_write += wbc->nr_to_write;
pages_written += mpd.pages_written;
wbc->pages_skipped = pages_skipped;
ret = 0;
} else if (wbc->nr_to_write) {
} else if (wbc->nr_to_write)
/*
 * There is no more writeout needed
 * or we requested a nonblocking writeout
 * and we found the device congested
 */
to_write += wbc->nr_to_write;
break;
}
wbc->nr_to_write = to_write;
}
if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
/* We skipped pages in this loop */
wbc->range_start = range_start;
wbc->nr_to_write = to_write +
wbc->pages_skipped - pages_skipped;
wbc->pages_skipped = pages_skipped;
goto restart_loop;
}
if (pages_skipped != wbc->pages_skipped)
printk(KERN_EMERG "This should not happen leaving %s "
"with nr_to_write = %ld ret = %d\n",
__func__, wbc->nr_to_write, ret);
/* Update index */
index += pages_written;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
/*
* set the writeback_index so that range_cyclic
* mode will write it back later
*/
mapping->writeback_index = index;
out_writepages:
wbc->nr_to_write = to_write - nr_to_writebump;
wbc->range_start = range_start;
if (!no_nrwrite_index_update)
wbc->no_nrwrite_index_update = 0;
wbc->nr_to_write -= nr_to_writebump;
return ret;
}
......@@ -4175,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle,
struct inode *inode = &(ei->vfs_inode);
u64 i_blocks = inode->i_blocks;
struct super_block *sb = inode->i_sb;
int err = 0;
if (i_blocks <= ~0U) {
/*
......@@ -4185,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle,
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = 0;
ei->i_flags &= ~EXT4_HUGE_FILE_FL;
} else if (i_blocks <= 0xffffffffffffULL) {
return 0;
}
if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
return -EFBIG;
if (i_blocks <= 0xffffffffffffULL) {
/*
* i_blocks can be represented in a 48 bit variable
* as multiple of 512 bytes
*/
err = ext4_update_rocompat_feature(handle, sb,
EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
if (err)
goto err_out;
/* i_block is stored in the split 48 bit fields */
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
ei->i_flags &= ~EXT4_HUGE_FILE_FL;
} else {
/*
* i_blocks should be represented in a 48 bit variable
* as multiple of file system block size
*/
err = ext4_update_rocompat_feature(handle, sb,
EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
if (err)
goto err_out;
ei->i_flags |= EXT4_HUGE_FILE_FL;
/* i_block is stored in file system block size */
i_blocks = i_blocks >> (inode->i_blkbits - 9);
raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
}
err_out:
return err;
return 0;
}
/*
......
......@@ -2300,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
}
INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
meta_group_info[i]->bb_free_root.rb_node = NULL;;
#ifdef DOUBLE_CHECK
{
......@@ -2522,9 +2523,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
}
spin_lock_init(&sbi->s_md_lock);
INIT_LIST_HEAD(&sbi->s_active_transaction);
INIT_LIST_HEAD(&sbi->s_closed_transaction);
INIT_LIST_HEAD(&sbi->s_committed_transaction);
spin_lock_init(&sbi->s_bal_lock);
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
......@@ -2553,6 +2551,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);
sbi->s_journal->j_commit_callback = release_blocks_on_commit;
printk(KERN_INFO "EXT4-fs: mballoc enabled\n");
return 0;
}
......@@ -2568,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
list_del(&pa->pa_group_list);
count++;
kfree(pa);
kmem_cache_free(ext4_pspace_cachep, pa);
}
if (count)
mb_debug("mballoc: %u PAs left\n", count);
......@@ -2582,15 +2582,6 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_group_info *grinfo;
struct ext4_sb_info *sbi = EXT4_SB(sb);
/* release freed, non-committed blocks */
spin_lock(&sbi->s_md_lock);
list_splice_init(&sbi->s_closed_transaction,
&sbi->s_committed_transaction);
list_splice_init(&sbi->s_active_transaction,
&sbi->s_committed_transaction);
spin_unlock(&sbi->s_md_lock);
ext4_mb_free_committed_blocks(sb);
if (sbi->s_group_info) {
for (i = 0; i < sbi->s_groups_count; i++) {
grinfo = ext4_get_group_info(sb, i);
......@@ -2644,61 +2635,57 @@ int ext4_mb_release(struct super_block *sb)
return 0;
}
static noinline_for_stack void
ext4_mb_free_committed_blocks(struct super_block *sb)
/*
* This function is called by the jbd2 layer once the commit has finished,
* so we know we can free the blocks that were released with that commit.
*/
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
int i;
int count = 0;
int count2 = 0;
struct ext4_free_metadata *md;
struct super_block *sb = journal->j_private;
struct ext4_buddy e4b;
struct ext4_group_info *db;
int err, count = 0, count2 = 0;
struct ext4_free_data *entry;
ext4_fsblk_t discard_block;
struct list_head *l, *ltmp;
if (list_empty(&sbi->s_committed_transaction))
return;
/* there is committed blocks to be freed yet */
do {
/* get next array of blocks */
md = NULL;
spin_lock(&sbi->s_md_lock);
if (!list_empty(&sbi->s_committed_transaction)) {
md = list_entry(sbi->s_committed_transaction.next,
struct ext4_free_metadata, list);
list_del(&md->list);
}
spin_unlock(&sbi->s_md_lock);
if (md == NULL)
break;
list_for_each_safe(l, ltmp, &txn->t_private_list) {
entry = list_entry(l, struct ext4_free_data, list);
mb_debug("gonna free %u blocks in group %lu (0x%p):",
md->num, md->group, md);
entry->count, entry->group, entry);
err = ext4_mb_load_buddy(sb, md->group, &e4b);
err = ext4_mb_load_buddy(sb, entry->group, &e4b);
/* we expect to find existing buddy because it's pinned */
BUG_ON(err != 0);
db = e4b.bd_info;
/* there are blocks to put in buddy to make them really free */
count += md->num;
count += entry->count;
count2++;
ext4_lock_group(sb, md->group);
for (i = 0; i < md->num; i++) {
mb_debug(" %u", md->blocks[i]);
mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
ext4_lock_group(sb, entry->group);
/* Take it out of per group rb tree */
rb_erase(&entry->node, &(db->bb_free_root));
mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
if (!db->bb_free_root.rb_node) {
/* No more items in the per group rb tree
* balance refcounts from ext4_mb_free_metadata()
*/
page_cache_release(e4b.bd_buddy_page);
page_cache_release(e4b.bd_bitmap_page);
}
mb_debug("\n");
ext4_unlock_group(sb, md->group);
/* balance refcounts from ext4_mb_free_metadata() */
page_cache_release(e4b.bd_buddy_page);
page_cache_release(e4b.bd_bitmap_page);
kfree(md);
ext4_unlock_group(sb, entry->group);
discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb)
+ entry->start_blk
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id,
(unsigned long long) discard_block, entry->count);
sb_issue_discard(sb, discard_block, entry->count);
kmem_cache_free(ext4_free_ext_cachep, entry);
ext4_mb_release_desc(&e4b);
} while (md);
}
mb_debug("freed %u blocks in %u structures\n", count, count2);
}
......@@ -2712,6 +2699,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
{
#ifdef CONFIG_PROC_FS
mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct proc_dir_entry *proc;
......@@ -2735,10 +2723,14 @@ err_out:
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
return -ENOMEM;
#else
return 0;
#endif
}
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
{
#ifdef CONFIG_PROC_FS
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_proc == NULL)
......@@ -2750,7 +2742,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
#endif
return 0;
}
......@@ -2771,6 +2763,16 @@ int __init init_ext4_mballoc(void)
kmem_cache_destroy(ext4_pspace_cachep);
return -ENOMEM;
}
ext4_free_ext_cachep =
kmem_cache_create("ext4_free_block_extents",
sizeof(struct ext4_free_data),
0, SLAB_RECLAIM_ACCOUNT, NULL);
if (ext4_free_ext_cachep == NULL) {
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
return -ENOMEM;
}
return 0;
}
......@@ -2779,6 +2781,7 @@ void exit_ext4_mballoc(void)
/* XXX: synchronize_rcu(); */
kmem_cache_destroy(ext4_pspace_cachep);
kmem_cache_destroy(ext4_ac_cachep);
kmem_cache_destroy(ext4_free_ext_cachep);
}
......@@ -4324,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto out1;
}
ext4_mb_poll_new_transaction(sb, handle);
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
......@@ -4384,35 +4385,20 @@ out1:
return block;
}
static void ext4_mb_poll_new_transaction(struct super_block *sb,
handle_t *handle)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (sbi->s_last_transaction == handle->h_transaction->t_tid)
return;
/* new transaction! time to close the last one and free blocks for
 * the committed transaction. we know that only one transaction can be
 * active, so the previous transaction may still be being logged and
 * the transaction before the previous one is known to be already
 * logged. this means that now we may free blocks freed in all
 * transactions before the previous one. hope I'm clear enough ... */
spin_lock(&sbi->s_md_lock);
if (sbi->s_last_transaction != handle->h_transaction->t_tid) {
mb_debug("new transaction %lu, old %lu\n",
(unsigned long) handle->h_transaction->t_tid,
(unsigned long) sbi->s_last_transaction);
list_splice_init(&sbi->s_closed_transaction,
&sbi->s_committed_transaction);
list_splice_init(&sbi->s_active_transaction,
&sbi->s_closed_transaction);
sbi->s_last_transaction = handle->h_transaction->t_tid;
}
spin_unlock(&sbi->s_md_lock);
ext4_mb_free_committed_blocks(sb);
/*
 * Decide whether two free data extents may be coalesced into one.
 *
 * Merging is allowed only when every one of these conditions holds:
 *   - both extents were freed by the same transaction (same t_tid),
 *   - both extents are associated with the same group,
 *   - entry2 starts at the block right after the last block of entry1.
 *
 * Returns 1 when the extents can be merged, 0 otherwise.
 */
static int can_merge(struct ext4_free_data *entry1,
		     struct ext4_free_data *entry2)
{
	/* Extents freed by different transactions must stay separate. */
	if (entry1->t_tid != entry2->t_tid)
		return 0;

	/* Extents from different groups can never be contiguous here. */
	if (entry1->group != entry2->group)
		return 0;

	/* Physically contiguous: entry1 ends exactly where entry2 begins. */
	return (entry1->start_blk + entry1->count) == entry2->start_blk;
}
static noinline_for_stack int
......@@ -4422,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
struct ext4_group_info *db = e4b->bd_info;
struct super_block *sb = e4b->bd_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_free_metadata *md;
int i;
struct ext4_free_data *entry, *new_entry;
struct rb_node **n = &db->bb_free_root.rb_node, *node;
struct rb_node *parent = NULL, *new_node;
BUG_ON(e4b->bd_bitmap_page == NULL);
BUG_ON(e4b->bd_buddy_page == NULL);
new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
new_entry->start_blk = block;
new_entry->group = group;
new_entry->count = count;
new_entry->t_tid = handle->h_transaction->t_tid;
new_node = &new_entry->node;
ext4_lock_group(sb, group);
for (i = 0; i < count; i++) {
md = db->bb_md_cur;
if (md && db->bb_tid != handle->h_transaction->t_tid) {
db->bb_md_cur = NULL;
md = NULL;
if (!*n) {
/* first free block extent. We need to
 * protect buddy cache from being freed,
 * otherwise we'll refresh it from
 * on-disk bitmap and lose not-yet-available
 * blocks */
page_cache_get(e4b->bd_buddy_page);
page_cache_get(e4b->bd_bitmap_page);
}
while (*n) {
parent = *n;
entry = rb_entry(parent, struct ext4_free_data, node);
if (block < entry->start_blk)
n = &(*n)->rb_left;
else if (block >= (entry->start_blk + entry->count))
n = &(*n)->rb_right;
else {
ext4_error(sb, __func__,
"Double free of blocks %d (%d %d)\n",
block, entry->start_blk, entry->count);
return 0;
}
}
if (md == NULL) {
ext4_unlock_group(sb, group);
md = kmalloc(sizeof(*md), GFP_NOFS);
if (md == NULL)
return -ENOMEM;
md->num = 0;
md->group = group;
ext4_lock_group(sb, group);
if (db->bb_md_cur == NULL) {
spin_lock(&sbi->s_md_lock);
list_add(&md->list, &sbi->s_active_transaction);
spin_unlock(&sbi->s_md_lock);
/* protect buddy cache from being freed,
* otherwise we'll refresh it from
* on-disk bitmap and lose not-yet-available
* blocks */
page_cache_get(e4b->bd_buddy_page);
page_cache_get(e4b->bd_bitmap_page);
db->bb_md_cur = md;
db->bb_tid = handle->h_transaction->t_tid;
mb_debug("new md 0x%p for group %lu\n",
md, md->group);
} else {
kfree(md);
md = db->bb_md_cur;
}
rb_link_node(new_node, parent, n);
rb_insert_color(new_node, &db->bb_free_root);
/* Now try to see the extent can be merged to left and right */
node = rb_prev(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, node);
if (can_merge(entry, new_entry)) {
new_entry->start_blk = entry->start_blk;
new_entry->count += entry->count;
rb_erase(node, &(db->bb_free_root));
spin_lock(&sbi->s_md_lock);
list_del(&entry->list);
spin_unlock(&sbi->s_md_lock);
kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS);
md->blocks[md->num] = block + i;
md->num++;
if (md->num == EXT4_BB_MAX_BLOCKS) {
/* no more space, put full container on a sb's list */
db->bb_md_cur = NULL;
node = rb_next(new_node);
if (node) {
entry = rb_entry(node, struct ext4_free_data, node);
if (can_merge(new_entry, entry)) {
new_entry->count += entry->count;
rb_erase(node, &(db->bb_free_root));
spin_lock(&sbi->s_md_lock);
list_del(&entry->list);
spin_unlock(&sbi->s_md_lock);
kmem_cache_free(ext4_free_ext_cachep, entry);
}
}
/* Add the extent to transaction's private list */
spin_lock(&sbi->s_md_lock);
list_add(&new_entry->list, &handle->h_transaction->t_private_list);
spin_unlock(&sbi->s_md_lock);
ext4_unlock_group(sb, group);
return 0;
}
......@@ -4500,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
*freed = 0;
ext4_mb_poll_new_transaction(sb, handle);
sbi = EXT4_SB(sb);
es = EXT4_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
......
......@@ -18,6 +18,8 @@
#include <linux/pagemap.h>
#include <linux/seq_file.h>
#include <linux/version.h>
#include <linux/blkdev.h>
#include <linux/marker.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "group.h"
......@@ -98,23 +100,29 @@
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
static struct kmem_cache *ext4_free_ext_cachep;
#ifdef EXT4_BB_MAX_BLOCKS
#undef EXT4_BB_MAX_BLOCKS
#endif
#define EXT4_BB_MAX_BLOCKS 30
struct ext4_free_data {
/* this links the free block information from group_info */
struct rb_node node;
struct ext4_free_metadata {
ext4_group_t group;
unsigned short num;
ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS];
/* this links the free block information from ext4_sb_info */
struct list_head list;
/* group which free block extent belongs */
ext4_group_t group;
/* free block extent */
ext4_grpblk_t start_blk;
ext4_grpblk_t count;
/* transaction which freed this extent */
tid_t t_tid;
};
struct ext4_group_info {
unsigned long bb_state;
unsigned long bb_tid;
struct ext4_free_metadata *bb_md_cur;
struct rb_root bb_free_root;
unsigned short bb_first_free;
unsigned short bb_free;
unsigned short bb_fragments;
......@@ -261,8 +269,6 @@ struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *);
static void ext4_mb_free_committed_blocks(struct super_block *);
static void ext4_mb_return_to_preallocation(struct inode *inode,
struct ext4_buddy *e4b, sector_t block,
int count);
......@@ -270,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *,
struct super_block *, struct ext4_prealloc_space *pa);
static int ext4_mb_init_per_dev_proc(struct super_block *sb);
static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
......
......@@ -374,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb)
*/
}
/*
 * Turn on a COMPAT feature flag in the superblock if it is not set yet.
 *
 * @handle: journal handle under which the superblock buffer is modified
 * @sb:     superblock whose feature set is updated
 * @compat: COMPAT feature bit(s) to set
 *
 * When the feature is already present nothing is touched.  Otherwise write
 * access to the superblock buffer is requested from the journal, the
 * feature is set, sb->s_dirt and handle->h_sync are set to 1, and the
 * buffer is handed back to the journal as dirty metadata.
 *
 * Returns 0 on success or when the feature was already set, otherwise the
 * error reported by the journal helper that failed.
 */
int ext4_update_compat_feature(handle_t *handle,
					struct super_block *sb, __u32 compat)
{
	int err;

	/* Nothing to do if the feature is already advertised. */
	if (EXT4_HAS_COMPAT_FEATURE(sb, compat))
		return 0;

	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
	if (err)
		return err;

	EXT4_SET_COMPAT_FEATURE(sb, compat);
	sb->s_dirt = 1;
	handle->h_sync = 1;
	BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
		     "call ext4_journal_dirty_metadata");
	return ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
}
/*
 * Turn on an RO_COMPAT feature flag in the superblock if it is not set yet.
 *
 * @handle:   journal handle under which the superblock buffer is modified
 * @sb:       superblock whose feature set is updated
 * @rocompat: RO_COMPAT feature bit(s) to set
 *
 * When the feature is already present nothing is touched.  Otherwise write
 * access to the superblock buffer is requested from the journal, the
 * feature is set, sb->s_dirt and handle->h_sync are set to 1, and the
 * buffer is handed back to the journal as dirty metadata.
 *
 * Returns 0 on success or when the feature was already set, otherwise the
 * error reported by the journal helper that failed.
 */
int ext4_update_rocompat_feature(handle_t *handle,
					struct super_block *sb, __u32 rocompat)
{
	int err;

	/* Nothing to do if the feature is already advertised. */
	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat))
		return 0;

	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
	if (err)
		return err;

	EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat);
	sb->s_dirt = 1;
	handle->h_sync = 1;
	BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
		     "call ext4_journal_dirty_metadata");
	return ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
}
/*
 * Turn on an INCOMPAT feature flag in the superblock if it is not set yet.
 *
 * @handle:   journal handle under which the superblock buffer is modified
 * @sb:       superblock whose feature set is updated
 * @incompat: INCOMPAT feature bit(s) to set
 *
 * When the feature is already present nothing is touched.  Otherwise write
 * access to the superblock buffer is requested from the journal, the
 * feature is set, sb->s_dirt and handle->h_sync are set to 1, and the
 * buffer is handed back to the journal as dirty metadata.
 *
 * Returns 0 on success or when the feature was already set, otherwise the
 * error reported by the journal helper that failed.
 */
int ext4_update_incompat_feature(handle_t *handle,
					struct super_block *sb, __u32 incompat)
{
	int err;

	/* Nothing to do if the feature is already advertised. */
	if (EXT4_HAS_INCOMPAT_FEATURE(sb, incompat))
		return 0;

	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
	if (err)
		return err;

	EXT4_SET_INCOMPAT_FEATURE(sb, incompat);
	sb->s_dirt = 1;
	handle->h_sync = 1;
	BUFFER_TRACE(EXT4_SB(sb)->s_sbh,
		     "call ext4_journal_dirty_metadata");
	return ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
}
/*
* Open the external journal device
*/
......@@ -904,7 +844,7 @@ static const struct export_operations ext4_export_ops = {
enum {
Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
......@@ -915,7 +855,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_inode_readahead_blks
};
......@@ -933,8 +873,6 @@ static const match_table_t tokens = {
{Opt_err_panic, "errors=panic"},
{Opt_err_ro, "errors=remount-ro"},
{Opt_nouid32, "nouid32"},
{Opt_nocheck, "nocheck"},
{Opt_nocheck, "check=none"},
{Opt_debug, "debug"},
{Opt_oldalloc, "oldalloc"},
{Opt_orlov, "orlov"},
......@@ -973,8 +911,6 @@ static const match_table_t tokens = {
{Opt_extents, "extents"},
{Opt_noextents, "noextents"},
{Opt_i_version, "i_version"},
{Opt_mballoc, "mballoc"},
{Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
{Opt_delalloc, "delalloc"},
......@@ -1073,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_nouid32:
set_opt(sbi->s_mount_opt, NO_UID32);
break;
case Opt_nocheck:
clear_opt(sbi->s_mount_opt, CHECK);
break;
case Opt_debug:
set_opt(sbi->s_mount_opt, DEBUG);
break;
......@@ -1618,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb)
if (block_bitmap < first_block || block_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Block bitmap for group %lu not in group "
"(block %llu)!", i, block_bitmap);
"(block %llu)!\n", i, block_bitmap);
return 0;
}
inode_bitmap = ext4_inode_bitmap(sb, gdp);
if (inode_bitmap < first_block || inode_bitmap > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode bitmap for group %lu not in group "
"(block %llu)!", i, inode_bitmap);
"(block %llu)!\n", i, inode_bitmap);
return 0;
}
inode_table = ext4_inode_table(sb, gdp);
......@@ -1633,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb)
inode_table + sbi->s_itb_per_group - 1 > last_block) {
printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
"Inode table for group %lu not in group "
"(block %llu)!", i, inode_table);
"(block %llu)!\n", i, inode_table);
return 0;
}
spin_lock(sb_bgl_lock(sbi, i));
......@@ -1778,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
*
* Note, this does *not* consider any metadata overhead for vfs i_blocks.
*/
static loff_t ext4_max_size(int blkbits)
static loff_t ext4_max_size(int blkbits, int has_huge_files)
{
loff_t res;
loff_t upper_limit = MAX_LFS_FILESIZE;
/* small i_blocks in vfs inode? */
if (sizeof(blkcnt_t) < sizeof(u64)) {
if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
* CONFIG_LSF is not enabled implies the inode
* i_block represent total blocks in 512 bytes
......@@ -1814,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits)
* block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks.
* We need to be 1 filesystem block less than the 2^48 sector limit.
*/
static loff_t ext4_max_bitmap_size(int bits)
static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
{
loff_t res = EXT4_NDIR_BLOCKS;
int meta_blocks;
......@@ -1827,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits)
* total number of 512 bytes blocks of the file
*/
if (sizeof(blkcnt_t) < sizeof(u64)) {
if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
* CONFIG_LSF is not enabled implies the inode
* i_block represent total blocks in 512 bytes
* 32 == size of vfs inode i_blocks * 8
* If !has_huge_files or CONFIG_LSF is not enabled, the
* inode i_blocks represents the total blocks in units of
* 512 bytes; 32 == size of vfs inode i_blocks * 8
*/
upper_limit = (1LL << 32) - 1;
......@@ -1940,7 +1873,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
int blocksize;
int db_count;
int i;
int needs_recovery;
int needs_recovery, has_huge_files;
__le32 features;
__u64 blocks_count;
int err;
......@@ -2081,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_id, le32_to_cpu(features));
goto failed_mount;
}
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
if (has_huge_files) {
/*
* A large-file-size enabled file system can only be
* mounted if the kernel is built with CONFIG_LSF
......@@ -2131,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
......@@ -2456,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"available.\n");
}
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
"requested data journaling mode\n");
clear_opt(sbi->s_mount_opt, DELALLOC);
} else if (test_opt(sb, DELALLOC))
printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
ext4_ext_init(sb);
err = ext4_mb_init(sb, needs_recovery);
if (err) {
printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
err);
goto failed_mount4;
}
/*
* akpm: core read_super() calls in here with the superblock locked.
* That deadlocks, because orphan cleanup needs to lock the superblock
......@@ -2475,21 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
"requested data journaling mode\n");
clear_opt(sbi->s_mount_opt, DELALLOC);
} else if (test_opt(sb, DELALLOC))
printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
ext4_ext_init(sb);
err = ext4_mb_init(sb, needs_recovery);
if (err) {
printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
err);
goto failed_mount4;
}
lock_kernel();
return 0;
......
......@@ -995,6 +995,9 @@ restart_loop:
}
spin_unlock(&journal->j_list_lock);
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
journal->j_devname, commit_transaction->t_tid,
journal->j_tail_sequence);
......
......@@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
INIT_LIST_HEAD(&transaction->t_inode_list);
INIT_LIST_HEAD(&transaction->t_private_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
......
......@@ -641,6 +641,11 @@ struct transaction_s
*/
int t_handle_count;
/*
* For use by the filesystem to store fs-specific data
* structures associated with the transaction
*/
struct list_head t_private_list;
};
struct transaction_run_stats_s {
......@@ -935,6 +940,10 @@ struct journal_s
pid_t j_last_sync_writer;
/* This function is called when a transaction is closed */
void (*j_commit_callback)(journal_t *,
transaction_t *);
/*
* Journal statistics
*/
......
......@@ -63,7 +63,15 @@ struct writeback_control {
unsigned for_writepages:1; /* This is a writepages() call */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned more_io:1; /* more io to be dispatched */
unsigned range_cont:1;
/*
* write_cache_pages() won't update wbc->nr_to_write and
* mapping->writeback_index if no_nrwrite_index_update
* is set. write_cache_pages() may write more than we
* requested and we want to make sure nr_to_write and
* writeback_index are updated in a consistent manner
* so we use a single control to update them
*/
unsigned no_nrwrite_index_update:1;
};
/*
......
......@@ -876,6 +876,7 @@ int write_cache_pages(struct address_space *mapping,
pgoff_t end; /* Inclusive */
int scanned = 0;
int range_whole = 0;
long nr_to_write = wbc->nr_to_write;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
......@@ -939,7 +940,7 @@ retry:
unlock_page(page);
ret = 0;
}
if (ret || (--(wbc->nr_to_write) <= 0))
if (ret || (--nr_to_write <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
......@@ -958,11 +959,12 @@ retry:
index = 0;
goto retry;
}
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = index;
if (!wbc->no_nrwrite_index_update) {
if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
mapping->writeback_index = index;
wbc->nr_to_write = nr_to_write;
}
if (wbc->range_cont)
wbc->range_start = index << PAGE_CACHE_SHIFT;
return ret;
}
EXPORT_SYMBOL(write_cache_pages);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment