Commit c8e20be0 authored by Dave Chinner's avatar Dave Chinner Committed by Alex Elder

xfs: reclaim inodes under a write lock

Make the inode tree reclaim walk exclusive to avoid races with
concurrent sync walkers and lookups. This is a version of a patch
posted by Christoph Hellwig that avoids all the code duplication.
Signed-off-by: default avatarDave Chinner <david@fromorbit.com>
Reviewed-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarAlex Elder <aelder@sgi.com>
parent 7284ce6c
...@@ -65,7 +65,6 @@ xfs_inode_ag_lookup( ...@@ -65,7 +65,6 @@ xfs_inode_ag_lookup(
* as the tree is sparse and a gang lookup walks to find * as the tree is sparse and a gang lookup walks to find
* the number of objects requested. * the number of objects requested.
*/ */
read_lock(&pag->pag_ici_lock);
if (tag == XFS_ICI_NO_TAG) { if (tag == XFS_ICI_NO_TAG) {
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)&ip, *first_index, 1); (void **)&ip, *first_index, 1);
...@@ -74,7 +73,7 @@ xfs_inode_ag_lookup( ...@@ -74,7 +73,7 @@ xfs_inode_ag_lookup(
(void **)&ip, *first_index, 1, tag); (void **)&ip, *first_index, 1, tag);
} }
if (!nr_found) if (!nr_found)
goto unlock; return NULL;
/* /*
* Update the index for the next lookup. Catch overflows * Update the index for the next lookup. Catch overflows
...@@ -84,13 +83,8 @@ xfs_inode_ag_lookup( ...@@ -84,13 +83,8 @@ xfs_inode_ag_lookup(
*/ */
*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
goto unlock; return NULL;
return ip; return ip;
unlock:
read_unlock(&pag->pag_ici_lock);
return NULL;
} }
STATIC int STATIC int
...@@ -100,7 +94,8 @@ xfs_inode_ag_walk( ...@@ -100,7 +94,8 @@ xfs_inode_ag_walk(
int (*execute)(struct xfs_inode *ip, int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags), struct xfs_perag *pag, int flags),
int flags, int flags,
int tag) int tag,
int exclusive)
{ {
struct xfs_perag *pag = &mp->m_perag[ag]; struct xfs_perag *pag = &mp->m_perag[ag];
uint32_t first_index; uint32_t first_index;
...@@ -114,10 +109,20 @@ restart: ...@@ -114,10 +109,20 @@ restart:
int error = 0; int error = 0;
xfs_inode_t *ip; xfs_inode_t *ip;
if (exclusive)
write_lock(&pag->pag_ici_lock);
else
read_lock(&pag->pag_ici_lock);
ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
if (!ip) if (!ip) {
if (exclusive)
write_unlock(&pag->pag_ici_lock);
else
read_unlock(&pag->pag_ici_lock);
break; break;
}
/* execute releases pag->pag_ici_lock */
error = execute(ip, pag, flags); error = execute(ip, pag, flags);
if (error == EAGAIN) { if (error == EAGAIN) {
skipped++; skipped++;
...@@ -125,9 +130,8 @@ restart: ...@@ -125,9 +130,8 @@ restart:
} }
if (error) if (error)
last_error = error; last_error = error;
/*
* bail out if the filesystem is corrupted. /* bail out if the filesystem is corrupted. */
*/
if (error == EFSCORRUPTED) if (error == EFSCORRUPTED)
break; break;
...@@ -148,7 +152,8 @@ xfs_inode_ag_iterator( ...@@ -148,7 +152,8 @@ xfs_inode_ag_iterator(
int (*execute)(struct xfs_inode *ip, int (*execute)(struct xfs_inode *ip,
struct xfs_perag *pag, int flags), struct xfs_perag *pag, int flags),
int flags, int flags,
int tag) int tag,
int exclusive)
{ {
int error = 0; int error = 0;
int last_error = 0; int last_error = 0;
...@@ -157,7 +162,8 @@ xfs_inode_ag_iterator( ...@@ -157,7 +162,8 @@ xfs_inode_ag_iterator(
for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
if (!mp->m_perag[ag].pag_ici_init) if (!mp->m_perag[ag].pag_ici_init)
continue; continue;
error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); error = xfs_inode_ag_walk(mp, ag, execute, flags, tag,
exclusive);
if (error) { if (error) {
last_error = error; last_error = error;
if (error == EFSCORRUPTED) if (error == EFSCORRUPTED)
...@@ -181,11 +187,7 @@ xfs_sync_inode_valid( ...@@ -181,11 +187,7 @@ xfs_sync_inode_valid(
return EFSCORRUPTED; return EFSCORRUPTED;
} }
/* /* If we can't get a reference on the inode, it must be in reclaim. */
* If we can't get a reference on the inode, it must be in reclaim.
* Leave it for the reclaim code to flush. Also avoid inodes that
* haven't been fully initialised.
*/
if (!igrab(inode)) { if (!igrab(inode)) {
read_unlock(&pag->pag_ici_lock); read_unlock(&pag->pag_ici_lock);
return ENOENT; return ENOENT;
...@@ -282,7 +284,7 @@ xfs_sync_data( ...@@ -282,7 +284,7 @@ xfs_sync_data(
ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
XFS_ICI_NO_TAG); XFS_ICI_NO_TAG, 0);
if (error) if (error)
return XFS_ERROR(error); return XFS_ERROR(error);
...@@ -304,7 +306,7 @@ xfs_sync_attr( ...@@ -304,7 +306,7 @@ xfs_sync_attr(
ASSERT((flags & ~SYNC_WAIT) == 0); ASSERT((flags & ~SYNC_WAIT) == 0);
return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
XFS_ICI_NO_TAG); XFS_ICI_NO_TAG, 0);
} }
STATIC int STATIC int
...@@ -664,60 +666,6 @@ xfs_syncd_stop( ...@@ -664,60 +666,6 @@ xfs_syncd_stop(
kthread_stop(mp->m_sync_task); kthread_stop(mp->m_sync_task);
} }
STATIC int
xfs_reclaim_inode(
xfs_inode_t *ip,
int sync_mode)
{
xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
/* The hash lock here protects a thread in xfs_iget_core from
* racing with us on linking the inode back with a vnode.
* Once we have the XFS_IRECLAIM flag set it will not touch
* us.
*/
write_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
!__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
return -EAGAIN;
}
__xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
xfs_put_perag(ip->i_mount, pag);
/*
* If the inode is still dirty, then flush it out. If the inode
* is not in the AIL, then it will be OK to flush it delwri as
* long as xfs_iflush() does not keep any references to the inode.
* We leave that decision up to xfs_iflush() since it has the
* knowledge of whether it's OK to simply do a delwri flush of
* the inode or whether we need to wait until the inode is
* pulled from the AIL.
* We get the flush lock regardless, though, just to make sure
* we don't free it while it is being flushed.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_iflock(ip);
/*
* In the case of a forced shutdown we rely on xfs_iflush() to
* wait for the inode to be unpinned before returning an error.
*/
if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
/* synchronize with xfs_iflush_done */
xfs_iflock(ip);
xfs_ifunlock(ip);
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_ireclaim(ip);
return 0;
}
void void
__xfs_inode_set_reclaim_tag( __xfs_inode_set_reclaim_tag(
struct xfs_perag *pag, struct xfs_perag *pag,
...@@ -760,19 +708,55 @@ __xfs_inode_clear_reclaim_tag( ...@@ -760,19 +708,55 @@ __xfs_inode_clear_reclaim_tag(
} }
STATIC int STATIC int
xfs_reclaim_inode_now( xfs_reclaim_inode(
struct xfs_inode *ip, struct xfs_inode *ip,
struct xfs_perag *pag, struct xfs_perag *pag,
int flags) int sync_mode)
{ {
/* ignore if already under reclaim */ /*
if (xfs_iflags_test(ip, XFS_IRECLAIM)) { * The radix tree lock here protects a thread in xfs_iget from racing
read_unlock(&pag->pag_ici_lock); * with us starting reclaim on the inode. Once we have the
* XFS_IRECLAIM flag set it will not touch us.
*/
spin_lock(&ip->i_flags_lock);
ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
/* ignore as it is already under reclaim */
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
return 0; return 0;
} }
read_unlock(&pag->pag_ici_lock); __xfs_iflags_set(ip, XFS_IRECLAIM);
spin_unlock(&ip->i_flags_lock);
write_unlock(&pag->pag_ici_lock);
/*
* If the inode is still dirty, then flush it out. If the inode
* is not in the AIL, then it will be OK to flush it delwri as
* long as xfs_iflush() does not keep any references to the inode.
* We leave that decision up to xfs_iflush() since it has the
* knowledge of whether it's OK to simply do a delwri flush of
* the inode or whether we need to wait until the inode is
* pulled from the AIL.
* We get the flush lock regardless, though, just to make sure
* we don't free it while it is being flushed.
*/
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_iflock(ip);
return xfs_reclaim_inode(ip, flags); /*
* In the case of a forced shutdown we rely on xfs_iflush() to
* wait for the inode to be unpinned before returning an error.
*/
if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
/* synchronize with xfs_iflush_done */
xfs_iflock(ip);
xfs_ifunlock(ip);
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_ireclaim(ip);
return 0;
} }
int int
...@@ -780,6 +764,6 @@ xfs_reclaim_inodes( ...@@ -780,6 +764,6 @@ xfs_reclaim_inodes(
xfs_mount_t *mp, xfs_mount_t *mp,
int mode) int mode)
{ {
return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
XFS_ICI_RECLAIM_TAG); XFS_ICI_RECLAIM_TAG, 1);
} }
...@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, ...@@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag);
int xfs_inode_ag_iterator(struct xfs_mount *mp, int xfs_inode_ag_iterator(struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
int flags, int tag); int flags, int tag, int write_lock);
#endif #endif
...@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes( ...@@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes(
uint flags) uint flags)
{ {
ASSERT(mp->m_quotainfo); ASSERT(mp->m_quotainfo);
xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0);
} }
/*------------------------------------------------------------------------*/ /*------------------------------------------------------------------------*/
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment