Commit 99f89551 authored by Eric W. Biederman, committed by Linus Torvalds

[PATCH] proc: don't lock task_structs indefinitely

Every inode in /proc holds a reference to a struct task_struct.  If a
directory or file is opened and remains open after the task exits, this
pinning continues.  With 8K stacks on a 32bit machine the amount pinned per
file descriptor is about 10K.

Normally I would figure a reasonable per user process limit is about 100
processes.  With 80 processes, each with 1000 file descriptors, I can trigger
the OOM killer on a 32bit kernel, because I have pinned about 800MB of useless
data.

This patch replaces the struct task_struct pointer with a pointer to a struct
task_ref which has a struct task_struct pointer.  That way the pinning of dead
tasks does not happen.

The code now has to contend with the fact that the task may now exit at any
time.  This is a little, but not much, more complicated.

With this change it takes about 1000 processes each opening up 1000 file
descriptors before I can trigger the OOM killer.  Much better.

[mlp@google.com: task_mmu small fixes]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Paul Jackson <pj@sgi.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Albert Cahalan <acahalan@gmail.com>
Signed-off-by: Prasanna Meda <mlp@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 8578cea7
This diff is collapsed.
......@@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de)
static void proc_delete_inode(struct inode *inode)
{
struct proc_dir_entry *de;
struct task_struct *tsk;
truncate_inode_pages(&inode->i_data, 0);
/* Let go of any associated process */
tsk = PROC_I(inode)->task;
if (tsk)
put_task_struct(tsk);
/* Stop tracking associated processes */
tref_put(PROC_I(inode)->tref);
/* Let go of any associated proc directory entry */
de = PROC_I(inode)->pde;
......@@ -94,7 +91,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
if (!ei)
return NULL;
ei->task = NULL;
ei->tref = NULL;
ei->fd = 0;
ei->op.proc_get_link = NULL;
ei->pde = NULL;
......
......@@ -10,6 +10,7 @@
*/
#include <linux/proc_fs.h>
#include <linux/task_ref.h>
struct vmalloc_info {
unsigned long used;
......@@ -41,13 +42,23 @@ extern struct file_operations proc_maps_operations;
extern struct file_operations proc_numa_maps_operations;
extern struct file_operations proc_smaps_operations;
extern struct file_operations proc_maps_operations;
extern struct file_operations proc_numa_maps_operations;
extern struct file_operations proc_smaps_operations;
void free_proc_entry(struct proc_dir_entry *de);
int proc_init_inodecache(void);
static inline struct task_struct *proc_task(struct inode *inode)
/* Return the task_ref pointer stored in the proc-private part of @inode. */
static inline struct task_ref *proc_tref(struct inode *inode)
{
return PROC_I(inode)->tref;
}
static inline struct task_struct *get_proc_task(struct inode *inode)
{
return PROC_I(inode)->task;
return get_tref_task(proc_tref(inode));
}
static inline int proc_fd(struct inode *inode)
......
......@@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount *
{
struct vm_area_struct * vma;
int result = -ENOENT;
struct task_struct *task = proc_task(inode);
struct mm_struct * mm = get_task_mm(task);
struct task_struct *task = get_proc_task(inode);
struct mm_struct * mm = NULL;
if (task) {
mm = get_task_mm(task);
put_task_struct(task);
}
if (!mm)
goto out;
down_read(&mm->mmap_sem);
......@@ -120,7 +124,8 @@ struct mem_size_stats
static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
{
struct task_struct *task = m->private;
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
struct vm_area_struct *vma = v;
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
......@@ -295,12 +300,16 @@ static int show_smap(struct seq_file *m, void *v)
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct task_struct *task = m->private;
struct proc_maps_private *priv = m->private;
unsigned long last_addr = m->version;
struct mm_struct *mm;
struct vm_area_struct *vma, *tail_vma;
struct vm_area_struct *vma, *tail_vma = NULL;
loff_t l = *pos;
/* Clear the per syscall fields in priv */
priv->task = NULL;
priv->tail_vma = NULL;
/*
* We remember last_addr rather than next_addr to hit with
* mmap_cache most of the time. We have zero last_addr at
......@@ -311,11 +320,15 @@ static void *m_start(struct seq_file *m, loff_t *pos)
if (last_addr == -1UL)
return NULL;
mm = get_task_mm(task);
priv->task = get_tref_task(priv->tref);
if (!priv->task)
return NULL;
mm = get_task_mm(priv->task);
if (!mm)
return NULL;
tail_vma = get_gate_vma(task);
priv->tail_vma = tail_vma = get_gate_vma(priv->task);
down_read(&mm->mmap_sem);
/* Start with last addr hint */
......@@ -350,11 +363,9 @@ out:
return tail_vma;
}
static void m_stop(struct seq_file *m, void *v)
static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
struct task_struct *task = m->private;
struct vm_area_struct *vma = v;
if (vma && vma != get_gate_vma(task)) {
if (vma && vma != priv->tail_vma) {
struct mm_struct *mm = vma->vm_mm;
up_read(&mm->mmap_sem);
mmput(mm);
......@@ -363,17 +374,27 @@ static void m_stop(struct seq_file *m, void *v)
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
struct task_struct *task = m->private;
struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
struct vm_area_struct *tail_vma = get_gate_vma(task);
struct vm_area_struct *tail_vma = priv->tail_vma;
(*pos)++;
if (vma && (vma != tail_vma) && vma->vm_next)
return vma->vm_next;
m_stop(m, v);
vma_stop(priv, vma);
return (vma != tail_vma)? tail_vma: NULL;
}
/*
 * End one pass over the maps seq_file (presumably installed as the
 * ->stop callback next to m_start/m_next -- confirm in the ops table).
 *
 * @m: seq_file whose ->private points to a struct proc_maps_private
 * @v: the vma the iteration stopped on, or NULL
 */
static void m_stop(struct seq_file *m, void *v)
{
struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
/* Drops mmap_sem and the mm reference unless vma is NULL or the tail vma. */
vma_stop(priv, vma);
/* Pairs with get_tref_task() in m_start(); NULL if no task was obtained. */
if (priv->task)
put_task_struct(priv->task);
}
static struct seq_operations proc_pid_maps_op = {
.start = m_start,
.next = m_next,
......@@ -391,11 +412,18 @@ static struct seq_operations proc_pid_smaps_op = {
static int do_maps_open(struct inode *inode, struct file *file,
struct seq_operations *ops)
{
struct task_struct *task = proc_task(inode);
int ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = task;
struct proc_maps_private *priv;
int ret = -ENOMEM;
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (priv) {
priv->tref = proc_tref(inode);
ret = seq_open(file, ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = priv;
} else {
kfree(priv);
}
}
return ret;
}
......@@ -409,7 +437,7 @@ struct file_operations proc_maps_operations = {
.open = maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
.release = seq_release_private,
};
#ifdef CONFIG_NUMA
......@@ -431,7 +459,7 @@ struct file_operations proc_numa_maps_operations = {
.open = numa_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
.release = seq_release_private,
};
#endif
......@@ -444,5 +472,5 @@ struct file_operations proc_smaps_operations = {
.open = smaps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
.release = seq_release_private,
};
......@@ -246,7 +246,7 @@ extern void kclist_add(struct kcore_list *, void *, size_t);
#endif
struct proc_inode {
struct task_struct *task;
struct task_ref *tref;
int fd;
union {
int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **);
......@@ -266,4 +266,10 @@ static inline struct proc_dir_entry *PDE(const struct inode *inode)
return PROC_I(inode)->pde;
}
/*
 * Per-open state for the maps-style seq_files, stored in seq_file->private
 * by do_maps_open().
 */
struct proc_maps_private {
/* Task reference copied from the inode (proc_tref()) when the file is opened. */
struct task_ref *tref;
/* Task obtained via get_tref_task() in m_start(); reset to NULL at the top of m_start() and put in m_stop(). */
struct task_struct *task;
/* Result of get_gate_vma(task) recorded by m_start(); reset to NULL at the top of m_start(). */
struct vm_area_struct *tail_vma;
};
#endif /* _LINUX_PROC_FS_H */
......@@ -50,6 +50,7 @@
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/task_ref.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>
......@@ -2442,31 +2443,43 @@ void __cpuset_memory_pressure_bump(void)
*/
static int proc_cpuset_show(struct seq_file *m, void *v)
{
struct task_ref *tref;
struct task_struct *tsk;
char *buf;
int retval = 0;
int retval;
retval = -ENOMEM;
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
return -ENOMEM;
goto out;
retval = -ESRCH;
tref = m->private;
tsk = get_tref_task(tref);
if (!tsk)
goto out_free;
tsk = m->private;
retval = -EINVAL;
mutex_lock(&manage_mutex);
retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
if (retval < 0)
goto out;
goto out_unlock;
seq_puts(m, buf);
seq_putc(m, '\n');
out:
out_unlock:
mutex_unlock(&manage_mutex);
put_task_struct(tsk);
out_free:
kfree(buf);
out:
return retval;
}
static int cpuset_open(struct inode *inode, struct file *file)
{
struct task_struct *tsk = PROC_I(inode)->task;
return single_open(file, proc_cpuset_show, tsk);
struct task_ref *tref = PROC_I(inode)->tref;
return single_open(file, proc_cpuset_show, tref);
}
struct file_operations proc_cpuset_operations = {
......
......@@ -1821,7 +1821,7 @@ static inline void check_huge_range(struct vm_area_struct *vma,
int show_numa_map(struct seq_file *m, void *v)
{
struct task_struct *task = m->private;
struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
struct numa_maps *md;
struct file *file = vma->vm_file;
......@@ -1837,7 +1837,7 @@ int show_numa_map(struct seq_file *m, void *v)
return 0;
mpol_to_str(buffer, sizeof(buffer),
get_vma_policy(task, vma, vma->vm_start));
get_vma_policy(priv->task, vma, vma->vm_start));
seq_printf(m, "%08lx %s", vma->vm_start, buffer);
......@@ -1891,7 +1891,7 @@ out:
kfree(md);
if (m->count < m->size)
m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment