Commit a022fe09 authored by Christoph Hellwig's avatar Christoph Hellwig Committed by Felix Blyakher

xfs: fix locking in xfs_iget_cache_hit

The locking in xfs_iget_cache_hit currently has numerous problems:

 - we clear the reclaim tag without i_flags_lock which protects
   modifications to it
 - we call inode_init_always which can sleep with pag_ici_lock
   held (this is oss.sgi.com BZ #819)
 - we acquire and drop i_flags_lock a lot and thus provide no
   consistency between the various flags we set/clear under it

This patch fixes all that with a major revamp of the locking in
the function.  The new version acquires i_flags_lock early and
only drops it once we need to call into inode_init_always or before
calling xfs_ilock.

This patch fixes a bug seen in the wild where we race modifying the
reclaim tag.
Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Reviewed-by: default avatarFelix Blyakher <felixb@sgi.com>
Reviewed-by: default avatarEric Sandeen <sandeen@sandeen.net>
Signed-off-by: default avatarFelix Blyakher <felixb@sgi.com>
parent 79dd43bb
...@@ -708,6 +708,16 @@ xfs_reclaim_inode( ...@@ -708,6 +708,16 @@ xfs_reclaim_inode(
return 0; return 0;
} }
void
__xfs_inode_set_reclaim_tag(
struct xfs_perag *pag,
struct xfs_inode *ip)
{
radix_tree_tag_set(&pag->pag_ici_root,
XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
XFS_ICI_RECLAIM_TAG);
}
/* /*
* We set the inode flag atomically with the radix tree tag. * We set the inode flag atomically with the radix tree tag.
* Once we get tag lookups on the radix tree, this inode flag * Once we get tag lookups on the radix tree, this inode flag
...@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag( ...@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag(
read_lock(&pag->pag_ici_lock); read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock); spin_lock(&ip->i_flags_lock);
radix_tree_tag_set(&pag->pag_ici_root, __xfs_inode_set_reclaim_tag(pag, ip);
XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
__xfs_iflags_set(ip, XFS_IRECLAIMABLE); __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock); read_unlock(&pag->pag_ici_lock);
......
...@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); ...@@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
struct xfs_inode *ip); struct xfs_inode *ip);
......
...@@ -134,80 +134,82 @@ xfs_iget_cache_hit( ...@@ -134,80 +134,82 @@ xfs_iget_cache_hit(
int flags, int flags,
int lock_flags) __releases(pag->pag_ici_lock) int lock_flags) __releases(pag->pag_ici_lock)
{ {
struct inode *inode = VFS_I(ip);
struct xfs_mount *mp = ip->i_mount; struct xfs_mount *mp = ip->i_mount;
int error = EAGAIN; int error;
spin_lock(&ip->i_flags_lock);
/* /*
* If INEW is set this inode is being set up * If we are racing with another cache hit that is currently
* If IRECLAIM is set this inode is being torn down * instantiating this inode or currently recycling it out of
* Pause and try again. * reclaimabe state, wait for the initialisation to complete
* before continuing.
*
* XXX(hch): eventually we should do something equivalent to
* wait_on_inode to wait for these flags to be cleared
* instead of polling for it.
*/ */
if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
XFS_STATS_INC(xs_ig_frecycle); XFS_STATS_INC(xs_ig_frecycle);
error = EAGAIN;
goto out_error; goto out_error;
} }
/* If IRECLAIMABLE is set, we've torn down the vfs inode part */ /*
if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { * If lookup is racing with unlink return an error immediately.
*/
/* if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
* If lookup is racing with unlink, then we should return an error = ENOENT;
* error immediately so we don't remove it from the reclaim goto out_error;
* list and potentially leak the inode. }
*/
if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
error = ENOENT;
goto out_error;
}
/*
* If IRECLAIMABLE is set, we've torn down the VFS inode already.
* Need to carefully get it back into useable state.
*/
if (ip->i_flags & XFS_IRECLAIMABLE) {
xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/* /*
* We need to re-initialise the VFS inode as it has been * We need to set XFS_INEW atomically with clearing the
* 'freed' by the VFS. Do this here so we can deal with * reclaimable tag so that we do have an indicator of the
* errors cleanly, then tag it so it can be set up correctly * inode still being initialized.
* later.
*/ */
if (!inode_init_always(mp->m_super, VFS_I(ip))) { ip->i_flags |= XFS_INEW;
error = ENOMEM; ip->i_flags &= ~XFS_IRECLAIMABLE;
goto out_error; __xfs_inode_clear_reclaim_tag(mp, pag, ip);
}
/* spin_unlock(&ip->i_flags_lock);
* We must set the XFS_INEW flag before clearing the read_unlock(&pag->pag_ici_lock);
* XFS_IRECLAIMABLE flag so that if a racing lookup does
* not find the XFS_IRECLAIMABLE above but has the igrab()
* below succeed we can safely check XFS_INEW to detect
* that this inode is still being initialised.
*/
xfs_iflags_set(ip, XFS_INEW);
xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
/* clear the radix tree reclaim flag as well. */ error = -inode_init_always(mp->m_super, inode);
__xfs_inode_clear_reclaim_tag(mp, pag, ip); if (error) {
} else if (!igrab(VFS_I(ip))) { /*
* Re-initializing the inode failed, and we are in deep
* trouble. Try to re-add it to the reclaim list.
*/
read_lock(&pag->pag_ici_lock);
spin_lock(&ip->i_flags_lock);
ip->i_flags &= ~XFS_INEW;
ip->i_flags |= XFS_IRECLAIMABLE;
__xfs_inode_set_reclaim_tag(pag, ip);
goto out_error;
}
inode->i_state = I_LOCK|I_NEW;
} else {
/* If the VFS inode is being torn down, pause and try again. */ /* If the VFS inode is being torn down, pause and try again. */
XFS_STATS_INC(xs_ig_frecycle); if (!igrab(inode)) {
goto out_error; error = EAGAIN;
} else if (xfs_iflags_test(ip, XFS_INEW)) { goto out_error;
/* }
* We are racing with another cache hit that is
* currently recycling this inode out of the XFS_IRECLAIMABLE
* state. Wait for the initialisation to complete before
* continuing.
*/
wait_on_inode(VFS_I(ip));
}
if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { /* We've got a live one. */
error = ENOENT; spin_unlock(&ip->i_flags_lock);
iput(VFS_I(ip)); read_unlock(&pag->pag_ici_lock);
goto out_error;
} }
/* We've got a live one. */
read_unlock(&pag->pag_ici_lock);
if (lock_flags != 0) if (lock_flags != 0)
xfs_ilock(ip, lock_flags); xfs_ilock(ip, lock_flags);
...@@ -217,6 +219,7 @@ xfs_iget_cache_hit( ...@@ -217,6 +219,7 @@ xfs_iget_cache_hit(
return 0; return 0;
out_error: out_error:
spin_unlock(&ip->i_flags_lock);
read_unlock(&pag->pag_ici_lock); read_unlock(&pag->pag_ici_lock);
return error; return error;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment