diff -ruN -X excl linux-lt-2.4.0-test7.prev/Documentation/Configure.help linux-lt-2.4.0-test7/Documentation/Configure.help
--- linux-lt-2.4.0-test7.prev/Documentation/Configure.help	Fri Aug 25 11:19:29 2000
+++ linux-lt-2.4.0-test7/Documentation/Configure.help	Fri Aug 25 17:12:24 2000
@@ -14210,6 +14210,25 @@
   better 32 MB RAM to avoid excessive linking time. This is only
   useful for kernel hackers. If unsure, say N.
 
+Per user resource management
+CONFIG_USER_RESOURCE
+  This option provides accounting of, and lets you configure limits on,
+  each user's consumption of exhaustible system resources. The most
+  important resource it controls is unswappable memory (either mlock'ed
+  or used by internal kernel structures and buffers). The main goal is
+  to protect processes from running short of important resources
+  because of accidental misbehaviour of other processes or malicious
+  activity aimed at ``killing'' the system. It is worth mentioning that
+  the resource limits configured by setrlimit(2) do not give an
+  acceptable level of protection, because they cover only a small
+  fraction of resources and work on a per-process basis. Per-process
+  accounting doesn't prevent malicious users from spawning a lot of
+  resource-consuming processes.
+
+Report resource usage in /proc
+CONFIG_USER_RESOURCE_PROC
+  Allows a system administrator to inspect resource accounts and limits.
+
 Magic System Request Key support
 CONFIG_MAGIC_SYSRQ
   If you say Y here, you will have some control over the system even
diff -ruN -X excl linux-lt-2.4.0-test7.prev/arch/arm/kernel/ecard.c linux-lt-2.4.0-test7/arch/arm/kernel/ecard.c
--- linux-lt-2.4.0-test7.prev/arch/arm/kernel/ecard.c	Fri Aug 25 11:19:29 2000
+++ linux-lt-2.4.0-test7/arch/arm/kernel/ecard.c	Fri Aug 25 17:12:24 2000
@@ -242,7 +242,7 @@
 		return 0;
 	}
 
-	mm = mm_alloc();
+	mm = mm_alloc(NULL);
 	if (mm) {
 		struct mm_struct *active_mm = current->active_mm;
diff -ruN -X excl linux-lt-2.4.0-test7.prev/arch/i386/config.in linux-lt-2.4.0-test7/arch/i386/config.in
--- linux-lt-2.4.0-test7.prev/arch/i386/config.in	Fri Aug 25 11:19:29 2000
+++ linux-lt-2.4.0-test7/arch/i386/config.in	Fri Aug 25 17:12:24 2000
@@ -354,6 +354,16 @@
 source drivers/usb/Config.in
 
 mainmenu_option next_comment
+comment 'Resource management'
+if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
+   bool 'Per user resource management (EXPERIMENTAL)' CONFIG_USER_RESOURCE
+   if [ "$CONFIG_USER_RESOURCE" = "y" -a "$CONFIG_PROC_FS" = "y" ]; then
+      bool '  Report resource usage in /proc' CONFIG_USER_RESOURCE_PROC
+   fi
+fi
+endmenu
+
+mainmenu_option next_comment
 comment 'Kernel hacking'
 
 #bool 'Debug kmalloc/kfree' CONFIG_DEBUG_MALLOC
diff -ruN -X excl linux-lt-2.4.0-test7.prev/arch/i386/kernel/entry.S linux-lt-2.4.0-test7/arch/i386/kernel/entry.S
--- linux-lt-2.4.0-test7.prev/arch/i386/kernel/entry.S	Fri Aug 25 11:19:29 2000
+++ linux-lt-2.4.0-test7/arch/i386/kernel/entry.S	Fri Aug 25 17:19:08 2000
@@ -643,6 +643,9 @@
 	.long SYMBOL_NAME(sys_madvise)
 	.long SYMBOL_NAME(sys_getdents64)	/* 220 */
 	.long SYMBOL_NAME(sys_fcntl64)
+	.long SYMBOL_NAME(sys_getluid)
+	.long SYMBOL_NAME(sys_setluid)
+	.long SYMBOL_NAME(sys_setublimit)
 
 	/*
 	 * NOTE!!
This doesn't have to be exact - we just have diff -ruN -X excl linux-lt-2.4.0-test7.prev/arch/i386/mm/fault.c linux-lt-2.4.0-test7/arch/i386/mm/fault.c --- linux-lt-2.4.0-test7.prev/arch/i386/mm/fault.c Fri May 26 10:43:41 2000 +++ linux-lt-2.4.0-test7/arch/i386/mm/fault.c Fri Aug 25 17:12:24 2000 @@ -205,6 +205,7 @@ goto out_of_memory; } +normal_return: /* * Did it hit the DOS screen memory VA from vm86 mode? */ @@ -293,10 +294,28 @@ * us unable to handle the page fault gracefully. */ out_of_memory: + + if (error_code & 4) { + struct task_struct *worst; + read_lock(&tasklist_lock); + worst = select_worst_task(); + printk(KERN_ERR "VM: killing process %s\n", worst->comm); + if (worst != current) { + force_sig(SIGKILL, worst); + worst->policy = SCHED_FIFO; + worst->rt_priority = 1000; + current->policy |= SCHED_YIELD; + read_unlock(&tasklist_lock); + schedule(); + goto normal_return; + } else { + read_unlock(&tasklist_lock); + up(&mm->mmap_sem); + do_exit(SIGKILL); + } + /* Never reached */ + } up(&mm->mmap_sem); - printk("VM: killing process %s\n", tsk->comm); - if (error_code & 4) - do_exit(SIGKILL); goto no_context; do_sigbus: diff -ruN -X excl linux-lt-2.4.0-test7.prev/arch/i386/mm/init.c linux-lt-2.4.0-test7/arch/i386/mm/init.c --- linux-lt-2.4.0-test7.prev/arch/i386/mm/init.c Thu Aug 10 11:42:08 2000 +++ linux-lt-2.4.0-test7/arch/i386/mm/init.c Fri Aug 25 17:12:24 2000 @@ -133,7 +133,7 @@ return (pte_t *) pmd_page(*pmd) + offset; } -pte_t *get_pte_slow(pmd_t *pmd, unsigned long offset) +pte_t *get_pte_slow(struct user_beancounter *bc, pmd_t *pmd, unsigned long offset) { unsigned long pte; @@ -145,9 +145,11 @@ return (pte_t *)pte + offset; } set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(get_bad_pte_table()))); + uncharge_kmem(bc, PAGE_SIZE); return NULL; } free_page(pte); + uncharge_kmem(bc, PAGE_SIZE); if (pmd_bad(*pmd)) { __handle_bad_pmd(pmd); return NULL; @@ -163,9 +165,9 @@ if(pgd_quicklist) free_pgd_slow(get_pgd_fast()), freed++; if(pmd_quicklist) - free_pmd_slow(get_pmd_fast()), freed++; + free_pmd_slow_kernel(get_pmd_fast()), freed++; if(pte_quicklist) - free_pte_slow(get_pte_fast()), freed++; + free_pte_slow_kernel(get_pte_fast()), freed++; } while(pgtable_cache_size > low); } return freed; diff -ruN -X excl linux-lt-2.4.0-test7.prev/arch/ppc/mm/fault.c linux-lt-2.4.0-test7/arch/ppc/mm/fault.c --- linux-lt-2.4.0-test7.prev/arch/ppc/mm/fault.c Thu Aug 3 16:47:17 2000 +++ linux-lt-2.4.0-test7/arch/ppc/mm/fault.c Fri Aug 25 17:12:24 2000 @@ -157,6 +157,7 @@ goto out_of_memory; } +normal_return: up(&mm->mmap_sem); /* * keep track of tlb+htab misses that are good addrs but @@ -189,9 +190,24 @@ */ out_of_memory: up(&mm->mmap_sem); - printk("VM: killing process %s\n", current->comm); - if (user_mode(regs)) - do_exit(SIGKILL); + if (user_mode(regs)) { + struct task_struct *worst; + read_lock(&tasklist_lock); + worst = select_worst_task(); + printk(KERN_ERR "VM: killing process %s\n", worst->comm); + if (worst != current) { + force_sig(SIGKILL, worst); + worst->policy = SCHED_FIFO; + worst->rt_priority = 1000; + current->policy |= SCHED_YIELD; + read_unlock(&tasklist_lock); + schedule(); + goto normal_return; + } else { + read_unlock(&tasklist_lock); + do_exit(SIGKILL); + } + } bad_page_fault(regs, address); return; diff -ruN -X excl linux-lt-2.4.0-test7.prev/arch/sparc/mm/fault.c linux-lt-2.4.0-test7/arch/sparc/mm/fault.c --- linux-lt-2.4.0-test7.prev/arch/sparc/mm/fault.c Wed May 17 17:56:27 2000 +++ linux-lt-2.4.0-test7/arch/sparc/mm/fault.c Fri Aug 25 17:12:24 2000 @@ -261,6 +261,7 @@ 
default: goto out_of_memory; } +normal_return: up(&mm->mmap_sem); return; @@ -325,9 +326,24 @@ */ out_of_memory: up(&mm->mmap_sem); - printk("VM: killing process %s\n", tsk->comm); - if (from_user) - do_exit(SIGKILL); + if (from_user) { + struct task_struct *worst; + read_lock(&tasklist_lock); + worst = select_worst_task(); + printk(KERN_ERR "VM: killing process %s\n", worst->comm); + if (worst != current) { + force_sig(SIGKILL, worst); + worst->policy = SCHED_FIFO; + worst->rt_priority = 1000; + current->policy |= SCHED_YIELD; + read_unlock(&tasklist_lock); + schedule(); + goto normal_return; + } else { + read_unlock(&tasklist_lock); + do_exit(SIGKILL); + } + } goto no_context; do_sigbus: diff -ruN -X excl linux-lt-2.4.0-test7.prev/arch/sparc64/mm/fault.c linux-lt-2.4.0-test7/arch/sparc64/mm/fault.c --- linux-lt-2.4.0-test7.prev/arch/sparc64/mm/fault.c Fri Aug 25 11:19:29 2000 +++ linux-lt-2.4.0-test7/arch/sparc64/mm/fault.c Fri Aug 25 17:12:24 2000 @@ -308,6 +308,7 @@ goto out_of_memory; } +normal_return: up(&mm->mmap_sem); goto fault_done; @@ -329,9 +330,24 @@ */ out_of_memory: up(&mm->mmap_sem); - printk("VM: killing process %s\n", current->comm); - if (!(regs->tstate & TSTATE_PRIV)) - do_exit(SIGKILL); + if (!(regs->tstate & TSTATE_PRIV)) { + struct task_struct *worst; + read_lock(&tasklist_lock); + worst = select_worst_task(); + printk(KERN_ERR "VM: killing process %s\n", worst->comm); + if (worst != current) { + force_sig(SIGKILL, worst); + worst->policy = SCHED_FIFO; + worst->rt_priority = 1000; + current->policy |= SCHED_YIELD; + read_unlock(&tasklist_lock); + schedule(); + goto normal_return; + } else { + read_unlock(&tasklist_lock); + do_exit(SIGKILL); + } + } goto handle_kernel_fault; do_sigbus: diff -ruN -X excl linux-lt-2.4.0-test7.prev/drivers/char/pty.c linux-lt-2.4.0-test7/drivers/char/pty.c --- linux-lt-2.4.0-test7.prev/drivers/char/pty.c Thu Aug 3 16:47:56 2000 +++ linux-lt-2.4.0-test7/drivers/char/pty.c Fri Aug 25 17:12:24 2000 @@ -98,6 +98,12 @@ tty_unregister_devfs (&tty->link->driver, MINOR (tty->device)); tty_vhangup(tty->link); } + if ((tty->driver.subtype == PTY_TYPE_MASTER) && + (test_bit(TTY_BEANCOUNTER_CHARGED, &tty->flags))) + { + uncharge_pty(tty->charged_bc); + clear_bit(TTY_BEANCOUNTER_CHARGED, &tty->flags); + } } /* @@ -317,6 +323,16 @@ line = MINOR(tty->device) - tty->driver.minor_start; if ((line < 0) || (line >= NR_PTYS)) goto out; + + if ((tty->driver.subtype == PTY_TYPE_MASTER) && + (!test_bit(TTY_BEANCOUNTER_CHARGED, &tty->flags))) + { + if (charge_pty(tty->charged_bc)) + goto out; + set_bit(TTY_BEANCOUNTER_CHARGED, &tty->flags); + } + /* for all further failures uncharge is done in pty_close() */ + pty = (struct pty_struct *)(tty->driver.driver_state) + line; tty->driver_data = pty; diff -ruN -X excl linux-lt-2.4.0-test7.prev/fs/binfmt_elf.c linux-lt-2.4.0-test7/fs/binfmt_elf.c --- linux-lt-2.4.0-test7.prev/fs/binfmt_elf.c Thu Aug 10 11:42:09 2000 +++ linux-lt-2.4.0-test7/fs/binfmt_elf.c Fri Aug 25 17:12:24 2000 @@ -1201,11 +1201,11 @@ pte_t *pte; pgd = pgd_offset(vma->vm_mm, addr); - pmd = pmd_alloc(pgd, addr); + pmd = pmd_alloc(vma->vm_mm->beancounter, pgd, addr); if (!pmd) goto end_coredump; - pte = pte_alloc(pmd, addr); + pte = pte_alloc(vma->vm_mm->beancounter, pmd, addr); if (!pte) goto end_coredump; if (!pte_present(*pte) && diff -ruN -X excl linux-lt-2.4.0-test7.prev/fs/exec.c linux-lt-2.4.0-test7/fs/exec.c --- linux-lt-2.4.0-test7.prev/fs/exec.c Fri Aug 25 11:19:31 2000 +++ linux-lt-2.4.0-test7/fs/exec.c Fri Aug 25 17:12:24 2000 @@ -264,13 
+264,13 @@
 	if (page_count(page) != 1)
 		printk("mem_map disagrees with %p at %08lx\n", page, address);
 	pgd = pgd_offset(tsk->mm, address);
-	pmd = pmd_alloc(pgd, address);
+	pmd = pmd_alloc(tsk->mm->beancounter, pgd, address);
 	if (!pmd) {
 		__free_page(page);
 		force_sig(SIGKILL, tsk);
 		return;
 	}
-	pte = pte_alloc(pmd, address);
+	pte = pte_alloc(tsk->mm->beancounter, pmd, address);
 	if (!pte) {
 		__free_page(page);
 		force_sig(SIGKILL, tsk);
@@ -299,7 +299,11 @@
 	bprm->loader += stack_base;
 	bprm->exec += stack_base;
 
-	mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	mpnt = NULL;
+	if (!charge_memory(current->mm->beancounter,
+			STACK_TOP - (PAGE_MASK & (unsigned long) bprm->p),
+			VM_STACK_FLAGS, 1))
+		mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
 	if (!mpnt)
 		return -ENOMEM;
 
@@ -398,7 +402,7 @@
 		return 0;
 	}
 
-	mm = mm_alloc();
+	mm = mm_alloc(current->login_bc);
 	if (mm) {
 		struct mm_struct *active_mm = current->active_mm;
diff -ruN -X excl linux-lt-2.4.0-test7.prev/fs/locks.c linux-lt-2.4.0-test7/fs/locks.c
--- linux-lt-2.4.0-test7.prev/fs/locks.c	Fri Aug 25 11:19:31 2000
+++ linux-lt-2.4.0-test7/fs/locks.c	Fri Aug 25 17:56:20 2000
@@ -123,16 +123,34 @@
 static kmem_cache_t *filelock_cache;
 
 /* Allocate an empty lock structure. */
-static struct file_lock *locks_alloc_lock(void)
+static struct file_lock *locks_alloc_lock(int do_charge)
 {
-	struct file_lock *fl;
-	fl = kmem_cache_alloc(filelock_cache, SLAB_KERNEL);
+	struct file_lock *fl = NULL;
+	struct user_beancounter *bc = current->login_bc;
+	if (do_charge) {
+		get_beancounter(bc);
+		if (charge_flock(bc, sizeof(struct file_lock)))
+			goto out_with_put;
+	}
+	fl = (struct file_lock *) kmem_cache_alloc(filelock_cache, SLAB_KERNEL);
+	if (!fl)
+		goto out_uncharge;
+	fl->charged_bc = bc;
+	return fl;
+
+out_uncharge:
+	if (do_charge) {
+		uncharge_flock(bc, sizeof(struct file_lock));
+out_with_put:
+		put_beancounter(bc);
+	}
 	return fl;
 }
 
 /* Free a lock which is not in use. */
-static inline void locks_free_lock(struct file_lock *fl)
+static inline void locks_free_lock(struct file_lock *fl, int do_uncharge)
 {
+	struct user_beancounter *bc;
 	if (fl == NULL) {
 		BUG();
 		return;
@@ -147,6 +165,11 @@
 	if (!list_empty(&fl->fl_link))
 		panic("Attempting to free lock on active lock list");
 
+	if (do_uncharge) {
+		bc = fl->charged_bc;
+		uncharge_flock(bc, sizeof(struct file_lock));
+		put_beancounter(bc);
+	}
 	kmem_cache_free(filelock_cache, fl);
 }
 
@@ -173,6 +196,7 @@
  */
 static void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
+	new->charged_bc = fl->charged_bc;
 	new->fl_owner = fl->fl_owner;
 	new->fl_pid = fl->fl_pid;
 	new->fl_file = fl->fl_file;
@@ -189,7 +213,7 @@
 /* Fill in a file_lock structure with an appropriate FLOCK lock. */
 static struct file_lock *flock_make_lock(struct file *filp, unsigned int type)
 {
-	struct file_lock *fl = locks_alloc_lock();
+	struct file_lock *fl = locks_alloc_lock(1);
 	if (fl == NULL)
 		return NULL;
 
@@ -427,7 +451,7 @@
 		fl->fl_type = F_UNLCK;
 		lock(fl->fl_file, F_SETLK, fl);
 	}
-	locks_free_lock(fl);
+	locks_free_lock(fl, 1);
 }
 
 /* Determine if lock sys_fl blocks lock caller_fl. Common functionality
@@ -566,8 +590,16 @@
 			  size_t count)
 {
 	struct file_lock *fl;
-	struct file_lock *new_fl = locks_alloc_lock();
+	/*
+	 * We don't charge here because I believe it would be nonsense
+	 * for a read() operation to fail at the limits boundary.
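+	 * (The convention in this file: locks_alloc_lock(1) charges
+	 * UB_NUMFLOCK and kernel memory to current->login_bc and takes a
+	 * reference on the beancounter, so it must be paired with
+	 * locks_free_lock(fl, 1); locks_alloc_lock(0) skips the accounting
+	 * and pairs with locks_free_lock(fl, 0).)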
+ */ + const int do_charge = 0; + struct file_lock *new_fl = locks_alloc_lock(do_charge); int error; + + if (new_fl == NULL) + return -ENOLCK; new_fl->fl_owner = current->files; new_fl->fl_pid = current->pid; @@ -618,7 +650,7 @@ } } unlock_kernel(); - locks_free_lock(new_fl); + locks_free_lock(new_fl, do_charge); return error; } @@ -697,7 +729,7 @@ out: if (new_fl) - locks_free_lock(new_fl); + locks_free_lock(new_fl, 1); return error; } @@ -728,10 +760,13 @@ * We may need two file_lock structures for this operation, * so we get them in advance to avoid races. */ - new_fl = locks_alloc_lock(); - new_fl2 = locks_alloc_lock(); - error = -ENOLCK; /* "no luck" */ - if (!(new_fl && new_fl2)) + if (caller->fl_type != F_UNLCK) + new_fl = locks_alloc_lock(1); + else + new_fl = NULL; + new_fl2 = locks_alloc_lock(0); + error = -ENOLCK; /* "no luck" */ + if (!((caller->fl_type == F_UNLCK || new_fl) && new_fl2)) goto out; if (caller->fl_type != F_UNLCK) { @@ -858,19 +893,31 @@ if (!added) { if (caller->fl_type == F_UNLCK) goto out; + error = -ENOLCK; + if (right && (left == right)) + if (charge_flock(current->login_bc, + sizeof(struct file_lock))) + goto out; locks_copy_lock(new_fl, caller); locks_insert_lock(before, new_fl); new_fl = NULL; + error = 0; } if (right) { if (left == right) { /* The new lock breaks the old one in two pieces, * so we have to use the second new lock. */ + error = -ENOLCK; + if (added) + if (charge_flock(current->login_bc, + sizeof(struct file_lock))) + goto out; left = new_fl2; new_fl2 = NULL; locks_copy_lock(left, right); locks_insert_lock(before, left); + error = 0; } right->fl_start = caller->fl_end + 1; locks_wake_up_blocks(right, 0); @@ -884,9 +931,9 @@ * Free any unused locks. */ if (new_fl) - locks_free_lock(new_fl); + locks_free_lock(new_fl, 1); if (new_fl2) - locks_free_lock(new_fl2); + locks_free_lock(new_fl2, 0); return error; } @@ -941,9 +988,14 @@ int fcntl_getlk(unsigned int fd, struct flock *l) { struct file *filp; - struct file_lock *fl, *file_lock = locks_alloc_lock(); + /* Maybe we shouldn't charge here? */ + const int do_charge = 1; + struct file_lock *fl, *file_lock = locks_alloc_lock(do_charge); struct flock flock; int error; + + if (file_lock == NULL) + return -ENOLCK; error = -EFAULT; if (copy_from_user(&flock, l, sizeof(flock))) @@ -1002,7 +1054,7 @@ out_putf: fput(filp); out: - locks_free_lock(file_lock); + locks_free_lock(file_lock, do_charge); return error; } @@ -1012,10 +1064,16 @@ int fcntl_setlk(unsigned int fd, unsigned int cmd, struct flock *l) { struct file *filp; - struct file_lock *file_lock = locks_alloc_lock(); + + const int do_charge = 1; + struct file_lock *file_lock = locks_alloc_lock(do_charge); + struct flock flock; struct inode *inode; int error; + + if (file_lock == NULL) + return -ENOLCK; /* * This might block, so we do it before checking the inode. 
@@ -1101,7 +1159,7 @@ out_putf: fput(filp); out: - locks_free_lock(file_lock); + locks_free_lock(file_lock, do_charge); return error; } @@ -1112,7 +1170,7 @@ int fcntl_getlk64(unsigned int fd, struct flock64 *l) { struct file *filp; - struct file_lock *fl, *file_lock = locks_alloc_lock(); + struct file_lock *fl, *file_lock = locks_alloc_lock(0); struct flock64 flock; int error; @@ -1161,7 +1219,7 @@ out_putf: fput(filp); out: - locks_free_lock(file_lock); + locks_free_lock(file_lock, 0); return error; } @@ -1171,7 +1229,7 @@ int fcntl_setlk64(unsigned int fd, unsigned int cmd, struct flock64 *l) { struct file *filp; - struct file_lock *file_lock = locks_alloc_lock(); + struct file_lock *file_lock = locks_alloc_lock(0); struct flock64 flock; struct inode *inode; int error; @@ -1245,7 +1303,7 @@ out_putf: fput(filp); out: - locks_free_lock(file_lock); + locks_free_lock(file_lock, 0); return error; } #endif /* BITS_PER_LONG == 32 */ diff -ruN -X excl linux-lt-2.4.0-test7.prev/fs/select.c linux-lt-2.4.0-test7/fs/select.c --- linux-lt-2.4.0-test7.prev/fs/select.c Thu Aug 3 16:48:19 2000 +++ linux-lt-2.4.0-test7/fs/select.c Fri Aug 25 17:12:24 2000 @@ -55,6 +55,7 @@ void poll_freewait(poll_table* pt) { struct poll_table_page * p = pt->table; + unsigned long int freed_mem = 0; while (p) { struct poll_table_entry * entry; struct poll_table_page *old; @@ -68,7 +69,9 @@ old = p; p = p->next; free_page((unsigned long) old); + freed_mem += PAGE_SIZE; } + uncharge_kmem(current->login_bc, freed_mem); } void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) @@ -77,13 +80,11 @@ if (!table || POLL_TABLE_FULL(table)) { struct poll_table_page *new_table; - + if (charge_kmem(current->login_bc, PAGE_SIZE, 1)) + goto out_nomem; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); - if (!new_table) { - p->error = -ENOMEM; - __set_current_state(TASK_RUNNING); - return; - } + if (!new_table) + goto out_nomem_uncharge; new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; @@ -100,6 +101,14 @@ init_waitqueue_entry(&entry->wait, current); add_wait_queue(wait_address,&entry->wait); } + return; + +out_nomem_uncharge: + uncharge_kmem(current->login_bc, PAGE_SIZE); +out_nomem: + p->error = -ENOMEM; + __set_current_state(TASK_RUNNING); + return; } #define __IN(fds, n) (fds->in + n) @@ -235,12 +244,26 @@ static void *select_bits_alloc(int size) { - return kmalloc(6 * size, GFP_KERNEL); + const unsigned long int charged_mem = 6*size; + void *p; + + if (charge_kmem(current->login_bc, charged_mem, 1)) + return NULL; + + p = kmalloc(charged_mem, GFP_KERNEL); + if (p == NULL) + goto out_uncharge; + return p; + +out_uncharge: + uncharge_kmem(current->login_bc, charged_mem); + return NULL; } static void select_bits_free(void *bits, int size) { kfree(bits); + uncharge_kmem(current->login_bc, 6*size); } /* @@ -411,6 +434,7 @@ struct pollfd **fds; poll_table table, *wait; int nchunks, nleft; + unsigned long int charged_mem; /* Do a sanity check on nfds ... 
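+ * (The charge_kmem() call below accounts for the worst case up front:
+ * the array of chunk pointers plus one page for each POLLFD_PER_PAGE
+ * descriptors; the matching uncharge_kmem() at the `out:' label undoes
+ * the whole charge in one go.)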
*/ if (nfds > current->files->max_fds) @@ -430,6 +454,11 @@ wait = NULL; err = -ENOMEM; + charged_mem = (1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *) + + PAGE_SIZE*((nfds+POLLFD_PER_PAGE-1)/POLLFD_PER_PAGE); + if (charge_kmem(current->login_bc, charged_mem, 1)) + goto out_nouncharge; + fds = NULL; if (nfds != 0) { fds = (struct pollfd **)kmalloc( @@ -487,6 +516,8 @@ if (nfds != 0) kfree(fds); out: + uncharge_kmem(current->login_bc, charged_mem); +out_nouncharge: poll_freewait(&table); return err; } diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/asm-i386/pgalloc-2level.h linux-lt-2.4.0-test7/include/asm-i386/pgalloc-2level.h --- linux-lt-2.4.0-test7.prev/include/asm-i386/pgalloc-2level.h Sun Nov 21 02:09:05 1999 +++ linux-lt-2.4.0-test7/include/asm-i386/pgalloc-2level.h Fri Aug 25 17:12:24 2000 @@ -12,12 +12,16 @@ extern __inline__ void free_pmd_fast(pmd_t *pmd) { } extern __inline__ void free_pmd_slow(pmd_t *pmd) { } +extern __inline__ void free_pmd_slow_kernel(pmd_t *pmd) { } -extern inline pmd_t * pmd_alloc(pgd_t *pgd, unsigned long address) +extern inline pmd_t * pmd_alloc_kernel(pgd_t *pgd, unsigned long address) { if (!pgd) BUG(); return (pmd_t *) pgd; } +#define pmd_free_kernel(pmd) free_pmd_slow(pmd) +#define pmd_alloc(bc, pgd, address) pmd_alloc_kernel(pgd, address) +#define pmd_free(bc, pmd) pmd_free_kernel(pmd) #endif /* _I386_PGALLOC_2LEVEL_H */ diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/asm-i386/pgalloc-3level.h linux-lt-2.4.0-test7/include/asm-i386/pgalloc-3level.h --- linux-lt-2.4.0-test7.prev/include/asm-i386/pgalloc-3level.h Sat Dec 4 03:12:23 1999 +++ linux-lt-2.4.0-test7/include/asm-i386/pgalloc-3level.h Fri Aug 25 17:12:24 2000 @@ -37,12 +37,18 @@ pgtable_cache_size++; } -extern __inline__ void free_pmd_slow(pmd_t *pmd) +extern __inline__ void free_pmd_slow_kernel(pmd_t *pmd) { free_page((unsigned long)pmd); } -extern inline pmd_t * pmd_alloc(pgd_t *pgd, unsigned long address) +extern __inline__ void free_pmd_slow(struct user_beancounter *bc, pmd_t *pmd) +{ + free_page((unsigned long)pmd); + uncharge_kmem(bc, PAGE_SIZE); +} + +extern inline pmd_t * pmd_alloc_kernel(pgd_t *pgd, unsigned long address) { if (!pgd) BUG(); @@ -64,5 +70,39 @@ } return (pmd_t *)pgd_page(*pgd) + address; } + +extern inline pmd_t * pmd_alloc(struct user_beancounter *bc, pgd_t *pgd, unsigned long address) +{ + if (!pgd) + BUG(); + address = (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); + if (pgd_none(*pgd)) { + pmd_t *page; + + if (charge_kmem(bc, PAGE_SIZE, 0)) + return NULL; + + page = get_pmd_fast(); + if (!page) + page = get_pmd_slow(); + if (page) { + if (pgd_none(*pgd)) { + set_pgd(pgd, __pgd(1 + __pa(page))); + __flush_tlb(); + return page + address; + } else { + free_pmd_fast(page); + uncharge_kmem(bc, PAGE_SIZE); + } + } else { + uncharge_kmem(bc, PAGE_SIZE); + return NULL; + } + } + return (pmd_t *)pgd_page(*pgd) + address; +} + +#define pmd_free_kernel free_pmd_slow_kernel +#define pmd_free free_pmd_slow #endif /* _I386_PGALLOC_3LEVEL_H */ diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/asm-i386/pgalloc.h linux-lt-2.4.0-test7/include/asm-i386/pgalloc.h --- linux-lt-2.4.0-test7.prev/include/asm-i386/pgalloc.h Thu Aug 3 16:48:19 2000 +++ linux-lt-2.4.0-test7/include/asm-i386/pgalloc.h Fri Aug 25 17:12:24 2000 @@ -65,8 +65,28 @@ free_page((unsigned long)pgd); } -extern pte_t *get_pte_slow(pmd_t *pmd, unsigned long address_preadjusted); -extern pte_t *get_pte_kernel_slow(pmd_t *pmd, unsigned long address_preadjusted); +extern __inline__ pgd_t 
*pgd_alloc(struct user_beancounter *bc) +{ + pgd_t *ret; + + if (charge_kmem(bc, PAGE_SIZE, 1)) + return NULL; + ret = get_pgd_fast(); + if (ret == NULL) + uncharge_kmem(bc, PAGE_SIZE); + return ret; +} + +extern __inline__ void pgd_free(struct user_beancounter *bc, pgd_t *pgd) +{ + free_pgd_slow(pgd); + uncharge_kmem(bc, PAGE_SIZE); +} + +extern pte_t *get_pte_slow(struct user_beancounter *bc, pmd_t *pmd, + unsigned long address_preadjusted); +extern pte_t *get_pte_kernel_slow(pmd_t *pmd, + unsigned long address_preadjusted); extern __inline__ pte_t *get_pte_fast(void) { @@ -80,22 +100,27 @@ return (pte_t *)ret; } -extern __inline__ void free_pte_fast(pte_t *pte) +extern __inline__ void free_pte_fast(struct user_beancounter *bc, pte_t *pte) { *(unsigned long *)pte = (unsigned long) pte_quicklist; pte_quicklist = (unsigned long *) pte; pgtable_cache_size++; + uncharge_kmem(bc, PAGE_SIZE); } -extern __inline__ void free_pte_slow(pte_t *pte) +extern __inline__ void free_pte_slow_kernel(pte_t *pte) { free_page((unsigned long)pte); } -#define pte_free_kernel(pte) free_pte_slow(pte) -#define pte_free(pte) free_pte_slow(pte) -#define pgd_free(pgd) free_pgd_slow(pgd) -#define pgd_alloc() get_pgd_fast() +#define pte_free_kernel(pte) free_pte_slow_kernel(pte) +#define pte_free(bc, pte) free_pte_slow(bc, pte) + +extern __inline__ void free_pte_slow(struct user_beancounter *bc, pte_t *pte) +{ + free_page((unsigned long)pte); + uncharge_kmem(bc, PAGE_SIZE); +} extern inline pte_t * pte_alloc_kernel(pmd_t * pmd, unsigned long address) { @@ -117,7 +142,7 @@ return (pte_t *) pmd_page(*pmd) + address; } -extern inline pte_t * pte_alloc(pmd_t * pmd, unsigned long address) +extern inline pte_t * pte_alloc(struct user_beancounter *bc, pmd_t * pmd, unsigned long address) { address = (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); @@ -128,10 +153,13 @@ return (pte_t *)pmd_page(*pmd) + address; getnew: { - unsigned long page = (unsigned long) get_pte_fast(); + unsigned long page; + if (charge_kmem(bc, PAGE_SIZE, 0)) + return NULL; + page = (unsigned long) get_pte_fast(); if (!page) - return get_pte_slow(pmd, address); + return get_pte_slow(bc, pmd, address); set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(page))); return (pte_t *)page + address; } @@ -139,16 +167,6 @@ __handle_bad_pmd(pmd); return NULL; } - -/* - * allocating and freeing a pmd is trivial: the 1-entry pmd is - * inside the pgd, so has no extra memory associated with it. - * (In the PAE case we free the page.) 
- */
-#define pmd_free(pmd)		free_pmd_slow(pmd)
-
-#define pmd_free_kernel		pmd_free
-#define pmd_alloc_kernel	pmd_alloc
 
 extern int do_check_pgt_cache(int, int);
diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/asm-i386/unistd.h linux-lt-2.4.0-test7/include/asm-i386/unistd.h
--- linux-lt-2.4.0-test7.prev/include/asm-i386/unistd.h	Fri Aug 25 11:19:32 2000
+++ linux-lt-2.4.0-test7/include/asm-i386/unistd.h	Sun Aug 27 17:31:31 2000
@@ -227,6 +227,9 @@
 #define __NR_madvise1		219	/* delete when C lib stub is removed */
 #define __NR_getdents64		220
 #define __NR_fcntl64		221
+#define __NR_getluid		222
+#define __NR_setluid		223
+#define __NR_setublimit		224
 
 /* user-visible error numbers are in the range -1 - -124: see */
diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/linux/beancounter.h linux-lt-2.4.0-test7/include/linux/beancounter.h
--- linux-lt-2.4.0-test7.prev/include/linux/beancounter.h	Thu Jan  1 07:30:00 1970
+++ linux-lt-2.4.0-test7/include/linux/beancounter.h	Fri Aug 25 17:12:24 2000
@@ -0,0 +1,202 @@
+#ifndef _LINUX_BEANCOUNTER_H
+#define _LINUX_BEANCOUNTER_H
+
+/*
+ * Resource list.
+ */
+
+#define UB_KMEMSIZE	0	/* Unswappable kernel memory size including
+				   struct task, page directories, etc.
+				   Still unimplemented are:
+				   socket buffers, files, temporary
+				   space for poll, siginfo
+				 */
+#define UB_LOCKEDPAGES	1	/* Mlock()ed pages. */
+#define UB_TOTVMPAGES	2	/* Address space size in pages. */
+#define UB_SHMPAGES	3	/* IPC SHM segment size. */
+#define UB_ZSHMPAGES	4	/* Anonymous shared memory. */
+#define UB_NUMPROC	5	/* Number of processes. */
+#define UB_RESPAGES	6	/* All resident pages, for swapout guarantee. */
+#define UB_SPCGUARPAGES	7	/* Guarantees for address space allocations.
+				   Only the barrier is used, no accounting.
+				 */
+#define UB_OOMGUARPAGES	8	/* Guarantees against OOM kill.
+				   Only the limit is used, no accounting.
+				 */
+#define UB_NUMSOCK	9	/* Number of sockets. */
+#define UB_NUMFLOCK	10	/* Number of file locks. */
+#define UB_NUMPTY	11	/* Number of PTYs. */
+#define UB_NUMSIGINFO	12	/* Number of siginfos. */
+#define UB_RESOURCES	13
+
+
+#ifdef __KERNEL__
+
+#include
+#include
+
+/*
+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
+ */
+#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
+
+
+/*
+ * Resource management structures
+ * Serialization issues:
+ *   beancounter list management is protected via a per hash entry lock
+ *   task pointers and the luid are set only for the current task and only once
+ *   refcount is managed atomically
+ *   value and limit comparison and change are protected by the
+ *   per-beancounter spinlock
+ */
+
+struct user_beancounter
+{
+	atomic_t		ub_refcount;
+	struct user_beancounter	*ub_next;
+	spinlock_t		ub_lock;
+	uid_t			ub_uid;
+	/* dynamic swap-out priority */
+	signed long		ub_swp_pri;
+	/* total weight of resident pages, see mm/kubd.c */
+	unsigned long long	ub_held_pages;
+	/* consumed resources */
+	unsigned long		ub_held[UB_RESOURCES];
+	/* maximum amount of consumed resources through the whole lifetime */
+	unsigned long		ub_maxheld[UB_RESOURCES];
+	/* A barrier over which resource allocations fail gracefully:
+	 * if the amount of consumed memory goes over the barrier, further
+	 * sbrk() or mmap() calls fail, but the existing processes are not
+	 * killed.
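+	 * In __charge_beancounter_locked() terms: the hard ub_limit below
+	 * is enforced on every charge, while the barrier is checked only for
+	 * `strict' charges; non-strict ones (e.g. page table pages and stack
+	 * growth) may run between the barrier and the limit.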
+	 */
+	unsigned long		ub_barrier[UB_RESOURCES];
+	/* hard resource limit */
+	unsigned long		ub_limit[UB_RESOURCES];
+};
+
+
+#ifndef CONFIG_USER_RESOURCE
+
+extern inline void get_beancounter(struct user_beancounter *ub) {;}
+extern inline void put_beancounter(struct user_beancounter *ub) {;}
+
+#define UB_DECLARE_CHARGE(name, dclargs, realargs) \
+extern inline int charge_##name dclargs { return 0; }
+#define UB_DECLARE_UNCHARGE(name, dclargs, realargs) \
+extern inline void uncharge_##name dclargs { ; }
+
+#else
+
+/*
+ * NULL beancounter is handled separately for better performance of tasks
+ * without a beancounter. But I'm not sure how much it gains in real life.
+ *   --SAW
+ */
+
+extern void __put_beancounter(struct user_beancounter *ub);
+
+extern struct task_struct *select_worst_task(void);
+
+extern inline void put_beancounter(struct user_beancounter *ub)
+{
+	if (ub == NULL)
+		return;
+	__put_beancounter(ub);
+}
+
+/*
+ * Create a new beancounter reference
+ */
+
+extern inline void get_beancounter(struct user_beancounter *ub)
+{
+	if (ub == NULL)
+		return;
+#if 0
+	printk("get beancounter %p for %.20s pid %d\n", ub, current->comm, current->pid);
+#endif
+	atomic_inc(&ub->ub_refcount);
+}
+
+#define UB_DECLARE_CHARGE(name, dclargs, realargs) \
+extern int __charge_##name dclargs; \
+ \
+extern inline int charge_##name dclargs \
+{ \
+	if (ub == NULL) \
+		return 0; \
+	return __charge_##name realargs; \
+}
+
+#define UB_DECLARE_UNCHARGE(name, dclargs, realargs) \
+extern void __uncharge_##name dclargs; \
+ \
+extern inline void uncharge_##name dclargs \
+{ \
+	if (ub == NULL) \
+		return; \
+	__uncharge_##name realargs; \
+}
+
+#endif /* CONFIG_USER_RESOURCE */
+
+
+/*
+ * Resource charging
+ *   Change the user's account and compare against limits
+ */
+
+UB_DECLARE_CHARGE(task, (struct user_beancounter *ub), (ub))
+UB_DECLARE_UNCHARGE(task, (struct user_beancounter *ub), (ub))
+UB_DECLARE_CHARGE(memory,
+	(struct user_beancounter *ub, unsigned long size, unsigned vm_flags,
+	 int strict),
+	(ub, size, vm_flags, strict))
+UB_DECLARE_UNCHARGE(memory,
+	(struct user_beancounter *ub, unsigned long size, unsigned vm_flags),
+	(ub, size, vm_flags))
+UB_DECLARE_CHARGE(locked_mem,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_UNCHARGE(locked_mem,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_CHARGE(kmem,
+	(struct user_beancounter *ub, unsigned long size, int strict),
+	(ub, size, strict))
+UB_DECLARE_UNCHARGE(kmem,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_CHARGE(shmpages,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_UNCHARGE(shmpages,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_CHARGE(sock,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_UNCHARGE(sock,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_CHARGE(flock,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_UNCHARGE(flock,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_CHARGE(pty,
+	(struct user_beancounter *ub),
+	(ub))
+UB_DECLARE_UNCHARGE(pty,
+	(struct user_beancounter *ub),
+	(ub))
+UB_DECLARE_CHARGE(siginfo,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+UB_DECLARE_UNCHARGE(siginfo,
+	(struct user_beancounter *ub, unsigned long size),
+	(ub, size))
+
+#undef UB_DECLARE_CHARGE
+#undef UB_DECLARE_UNCHARGE
+
+#endif /* __KERNEL__ */
+#endif /*
_LINUX_BEANCOUNTER_H */ diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/linux/fs.h linux-lt-2.4.0-test7/include/linux/fs.h --- linux-lt-2.4.0-test7.prev/include/linux/fs.h Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/include/linux/fs.h Fri Aug 25 17:12:24 2000 @@ -507,6 +507,7 @@ */ typedef struct files_struct *fl_owner_t; +struct user_beancounter; struct file_lock { struct file_lock *fl_next; /* singly linked list for this inode */ struct list_head fl_link; /* doubly linked list of all locks */ @@ -527,6 +528,8 @@ union { struct nfs_lock_info nfs_fl; } fl_u; + + struct user_beancounter *charged_bc; }; /* The following constant reflects the upper bound of the file/locking space */ diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/linux/mm.h linux-lt-2.4.0-test7/include/linux/mm.h --- linux-lt-2.4.0-test7.prev/include/linux/mm.h Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/include/linux/mm.h Fri Aug 25 17:12:24 2000 @@ -10,6 +10,7 @@ #include #include #include +#include extern unsigned long max_mapnr; extern unsigned long num_physpages; @@ -92,6 +93,8 @@ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ +#define VM_ANON 0x00040000 /* Anonymous shared memory */ + #define VM_STACK_FLAGS 0x00000177 #define VM_READHINTMASK (VM_SEQ_READ | VM_RAND_READ) @@ -468,7 +471,10 @@ address &= PAGE_MASK; grow = (vma->vm_start - address) >> PAGE_SHIFT; if (vma->vm_end - address > current->rlim[RLIMIT_STACK].rlim_cur || - ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) > current->rlim[RLIMIT_AS].rlim_cur) + ((vma->vm_mm->total_vm + grow) << PAGE_SHIFT) + > current->rlim[RLIMIT_AS].rlim_cur || + charge_memory(vma->vm_mm->beancounter, vma->vm_start - address, + (vma->vm_flags & VM_LOCKED), 0)) return -ENOMEM; vma->vm_start = address; vma->vm_pgoff -= grow; diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/linux/net.h linux-lt-2.4.0-test7/include/linux/net.h --- linux-lt-2.4.0-test7.prev/include/linux/net.h Wed May 17 17:56:29 2000 +++ linux-lt-2.4.0-test7/include/linux/net.h Fri Aug 25 17:12:24 2000 @@ -62,6 +62,7 @@ #define SOCK_ASYNC_WAITDATA 1 #define SOCK_NOSPACE 2 +struct user_beancounter; struct socket { socket_state state; @@ -76,6 +77,7 @@ short type; unsigned char passcred; + struct user_beancounter *beancounter; }; #define SOCK_INODE(S) ((S)->inode) diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/linux/sched.h linux-lt-2.4.0-test7/include/linux/sched.h --- linux-lt-2.4.0-test7.prev/include/linux/sched.h Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/include/linux/sched.h Fri Aug 25 17:12:24 2000 @@ -190,6 +190,7 @@ /* Number of map areas at which the AVL tree is activated. This is arbitrary. */ #define AVL_MIN_MAP_COUNT 32 +struct user_beancounter; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct vm_area_struct * mmap_avl; /* tree of VMAs */ @@ -209,6 +210,7 @@ unsigned long cpu_vm_mask; unsigned long swap_cnt; /* number of pages to swap on next pass */ unsigned long swap_address; + struct user_beancounter * beancounter; /* * This is an architecture-specific pointer: the portable * part of Linux does not know about any segments. 
@@ -227,7 +229,7 @@ 0, 0, 0, \ 0, 0, 0, 0, \ 0, 0, 0, \ - 0, 0, 0, 0, NULL } + 0, 0, 0, 0, NULL, NULL } struct signal_struct { atomic_t count; @@ -343,7 +345,14 @@ gid_t groups[NGROUPS]; kernel_cap_t cap_effective, cap_inheritable, cap_permitted; int keep_capabilities:1; +/* resource accounting */ struct user_struct *user; + /* Charging object for this task */ + struct user_beancounter *task_bc; + /* The login user id */ + uid_t luid; + /* Charging object corresponding to the login id */ + struct user_beancounter *login_bc; /* limits */ struct rlimit rlim[RLIM_NLIMITS]; unsigned short used_math; @@ -443,6 +452,9 @@ cap_inheritable: CAP_INIT_INH_SET, \ cap_permitted: CAP_FULL_SET, \ keep_capabilities: 0, \ + task_bc: NULL, \ + luid: (uid_t)-1, \ + login_bc: NULL, \ rlim: INIT_RLIMITS, \ user: INIT_USER, \ comm: "swapper", \ @@ -675,7 +687,7 @@ /* * Routines for handling mm_structs */ -extern struct mm_struct * mm_alloc(void); +extern struct mm_struct * mm_alloc(struct user_beancounter *); extern struct mm_struct * start_lazy_tlb(void); extern void end_lazy_tlb(struct mm_struct *mm); diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/linux/signal.h linux-lt-2.4.0-test7/include/linux/signal.h --- linux-lt-2.4.0-test7.prev/include/linux/signal.h Sun Mar 19 04:10:59 2000 +++ linux-lt-2.4.0-test7/include/linux/signal.h Fri Aug 25 17:12:24 2000 @@ -9,10 +9,12 @@ * Real Time signals may be queued. */ +struct user_beancounter; struct signal_queue { struct signal_queue *next; siginfo_t info; + struct user_beancounter *charged_bc; }; /* diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/linux/tty.h linux-lt-2.4.0-test7/include/linux/tty.h --- linux-lt-2.4.0-test7.prev/include/linux/tty.h Thu Aug 3 16:47:58 2000 +++ linux-lt-2.4.0-test7/include/linux/tty.h Fri Aug 25 17:12:24 2000 @@ -256,6 +256,7 @@ * the size of this structure, and it needs to be done with care. * - TYT, 9/14/92 */ +struct user_beancounter; struct tty_struct { int magic; struct tty_driver driver; @@ -306,6 +307,7 @@ unsigned int canon_column; struct semaphore atomic_read; spinlock_t read_lock; + struct user_beancounter *charged_bc; }; /* tty magic number */ @@ -332,6 +334,7 @@ #define TTY_HW_COOK_IN 15 #define TTY_PTY_LOCK 16 #define TTY_NO_WRITE_SPLIT 17 +#define TTY_BEANCOUNTER_CHARGED 18 #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/linux/ubhash.h linux-lt-2.4.0-test7/include/linux/ubhash.h --- linux-lt-2.4.0-test7.prev/include/linux/ubhash.h Thu Jan 1 07:30:00 1970 +++ linux-lt-2.4.0-test7/include/linux/ubhash.h Fri Aug 25 17:12:24 2000 @@ -0,0 +1,23 @@ +#ifndef _LINUX_UBHASH_H +#define _LINUX_UBHASH_H + +#ifdef __KERNEL__ + +#include + +#define UB_HASH_SIZE 256 +#define ub_hash_fun(x) ( ( ((x) >> 8) ^ (x) ) & (UB_HASH_SIZE - 1) ) + +struct ub_hash_slot { + spinlock_t ubh_lock; + struct user_beancounter *ubh_beans; +} ub_hash[UB_HASH_SIZE]; + +#define lock_beancounters(slot, flags) \ + spin_lock_irqsave(&slot->ubh_lock, flags) + +#define unlock_beancounters(slot, flags) \ + spin_unlock_irqrestore(&slot->ubh_lock, flags) + +#endif /* __KERNEL__ */ +#endif /* _LINUX_UBHASH_H */ diff -ruN -X excl linux-lt-2.4.0-test7.prev/include/net/sock.h linux-lt-2.4.0-test7/include/net/sock.h --- linux-lt-2.4.0-test7.prev/include/net/sock.h Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/include/net/sock.h Fri Aug 25 17:12:24 2000 @@ -408,7 +408,8 @@ int linger2; }; - +struct user_beancounter; + /* * This structure really needs to be cleaned up. 
* Most of it is for TCP, and not used by any of @@ -500,6 +501,7 @@ socket_lock_t lock; /* Synchronizer... */ int rcvbuf; /* Size of receive buffer in bytes */ + int rcvbuf_charged; /* for resource control */ wait_queue_head_t *sleep; /* Sock wait queue */ struct dst_entry *dst_cache; /* Destination cache */ @@ -514,6 +516,7 @@ __u32 saddr; /* Sending source */ unsigned int allocation; /* Allocation mode */ int sndbuf; /* Size of send buffer in bytes */ + int sndbuf_charged; /* for resource control */ struct sock *prev; /* Not all are volatile, but some are, so we might as well say they all are. @@ -651,6 +654,9 @@ /* Identd and reporting IO signals */ struct socket *socket; + /* Accounting */ + struct user_beancounter *beancounter; + /* RPC layer private data */ void *user_data; @@ -1301,5 +1307,7 @@ extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; +extern __u32 sysctl_wmem_default; +extern __u32 sysctl_rmem_default; #endif /* _SOCK_H */ diff -ruN -X excl linux-lt-2.4.0-test7.prev/init/main.c linux-lt-2.4.0-test7/init/main.c --- linux-lt-2.4.0-test7.prev/init/main.c Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/init/main.c Fri Aug 25 17:13:50 2000 @@ -88,6 +88,7 @@ extern void init_modules(void); extern void sock_init(void); extern void fork_init(unsigned long); +extern void beancounter_init(unsigned long); extern void mca_init(void); extern void sbus_init(void); extern void ppc_init(void); @@ -562,6 +563,7 @@ fork_init(mempages); proc_caches_init(); + beancounter_init(mempages); vfs_caches_init(mempages); buffer_init(mempages); page_cache_init(mempages); diff -ruN -X excl linux-lt-2.4.0-test7.prev/ipc/shm.c linux-lt-2.4.0-test7/ipc/shm.c --- linux-lt-2.4.0-test7.prev/ipc/shm.c Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/ipc/shm.c Fri Aug 25 17:12:24 2000 @@ -75,6 +75,7 @@ struct shmid_kernel /* private to the kernel */ { struct kern_ipc_perm shm_perm; + struct user_beancounter *beancounter; size_t shm_segsz; unsigned long shm_nattch; unsigned long shm_npages; /* size of segment (pages) */ @@ -684,6 +685,23 @@ /* Now we set them to the real values */ old_dir = shp->shm_dir; old_pages = shp->shm_npages; + if (new_pages > old_pages) { + if (shp->id == zero_id) + error = charge_memory (shp->beancounter, + new_pages - old_pages, VM_ANON, 1); + else + error = charge_shmpages (shp->beancounter, + new_pages - old_pages); + } else { + if (shp->id == zero_id) + uncharge_memory (shp->beancounter, + old_pages - new_pages, VM_ANON); + else + uncharge_shmpages (shp->beancounter, + old_pages - new_pages); + } + if (error) + goto out; if (old_dir){ pte_t *swap; int i,j; @@ -753,6 +771,7 @@ struct shmid_kernel *shp; int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT; int id; + int retval; if (namelen > SHM_NAME_LEN) return -ENAMETOOLONG; @@ -763,12 +782,18 @@ if (shm_tot + numpages >= shm_ctlall) return -ENOSPC; + if ((retval = charge_shmpages(current->login_bc, numpages)) != 0) + return retval; + shp = seg_alloc(numpages, namelen ? 
		namelen : SHM_FMT_LEN + 1);
-	if (IS_ERR(shp))
+	if (IS_ERR(shp)) {
+		uncharge_shmpages(current->login_bc, numpages);
 		return PTR_ERR(shp);
+	}
 	id = shm_addid(shp);
 	if(id == -1) {
 		seg_free(shp, 1);
+		uncharge_shmpages(current->login_bc, numpages);
 		return -ENOSPC;
 	}
 	shp->shm_perm.key = key;
@@ -779,6 +804,7 @@
 	shp->shm_atim = shp->shm_dtim = 0;
 	shp->shm_ctim = CURRENT_TIME;
 	shp->id = shm_buildid(id,shp->shm_perm.seq);
+	shp->beancounter = current->login_bc;
 	if (namelen != 0) {
 		shp->shm_namelen = namelen;
 		memcpy (shp->shm_name, name, namelen);
@@ -839,6 +865,7 @@
 	shp = shm_rmid(shmid);
 	shm_unlock(shmid);
 	up(&shm_ids.sem);
+	uncharge_shmpages(shp->beancounter, shp->shm_npages);
 	seg_free(shp, 1);
 	clear_inode(ino);
 }
@@ -1044,6 +1071,13 @@
 	if(shp==NULL)
 		return -EINVAL;
 	err = shm_checkid(shp,shmid);
+	if(err)
+		goto out_unlock;
+	if(cmd==SHM_LOCK)
+		err = charge_locked_mem(shp->beancounter,
+					shp->shm_npages);
+	else
+		uncharge_locked_mem(shp->beancounter, shp->shm_npages);
 	if(err)
 		goto out_unlock;
 	if(cmd==SHM_LOCK)
diff -ruN -X excl linux-lt-2.4.0-test7.prev/kernel/Makefile linux-lt-2.4.0-test7/kernel/Makefile
--- linux-lt-2.4.0-test7.prev/kernel/Makefile	Thu Aug 10 11:42:10 2000
+++ linux-lt-2.4.0-test7/kernel/Makefile	Fri Aug 25 17:12:24 2000
@@ -10,7 +10,7 @@
 O_TARGET := kernel.o
 O_OBJS = sched.o dma.o fork.o exec_domain.o panic.o printk.o \
 	module.o exit.o itimer.o info.o time.o softirq.o resource.o \
-	sysctl.o acct.o capability.o ptrace.o timer.o user.o
+	sysctl.o acct.o capability.o ptrace.o timer.o user.o beancounter.o
 
 OX_OBJS += signal.o sys.o
 
@@ -31,5 +31,8 @@
 endif
 
 CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
+
+# debugging:
+CFLAGS_beancounter.o := -fno-omit-frame-pointer
 
 include $(TOPDIR)/Rules.make
diff -ruN -X excl linux-lt-2.4.0-test7.prev/kernel/beancounter.c linux-lt-2.4.0-test7/kernel/beancounter.c
--- linux-lt-2.4.0-test7.prev/kernel/beancounter.c	Thu Jan  1 07:30:00 1970
+++ linux-lt-2.4.0-test7/kernel/beancounter.c	Fri Aug 25 17:47:34 2000
@@ -0,0 +1,859 @@
+/*
+ *  linux/kernel/beancounter.c
+ *
+ *  Copyright (C)  1998       Alan Cox
+ *                 1998-2000  Andrey V. Savochkin
+ *
+ *  TODO:
+ *   - a more intelligent limit check in mremap(): currently the new size is
+ *     charged and _then_ the old size is uncharged
+ *   - limits on the number of file descriptors
+ *   - problem: bad pmd page handling
+ *   - think about unserialized accesses to guarantee fields
+ *   - sizeof(struct inode) and omem for sockets
+ *   - think about the sizeof(struct inode) charge in general and limits
+ *     for the number of files
+ *
+ * Changes:
+ *   1999/08/17  Marcelo Tosatti
+ *	- Set the "barrier" and "limit" parts of limits atomically.
+ *   1999/10/06  Marcelo Tosatti
+ *	- setublimit system call.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#ifdef CONFIG_USER_RESOURCE_PROC
+#include
+#endif
+
+/*
+ * Various debugging stuff
+ *   1	creation/destruction
+ *   2	charge operations
+ *   4	resource limit change
+ *   8	proc output
+ */
+#define UB_DEBUG 0
+
+
+
+#ifdef CONFIG_USER_RESOURCE
+
+
+
+extern int max_threads;
+
+static kmem_cache_t *ub_cachep;
+
+static struct user_beancounter default_beancounter;
+
+/*
+ * Per user resource beancounting. Resources are tied to their luid.
+ * The resource structure itself is tagged both to the process and
+ * the charging resources (a socket doesn't want to have to search for
+ * things at irq time, for example). Reference counters keep things
+ * in hand.
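+ * Beancounters live in a UB_HASH_SIZE-slot hash table (see
+ * <linux/ubhash.h>) keyed by the luid; each slot is protected by its
+ * own spinlock, so lookups for different users rarely contend.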
+ *
+ * The case where a user creates a resource, kills all his processes and
+ * then starts new ones is correctly handled this way. The refcounters
+ * mean the old entry is still around with the resource tied to it.
+ */
+static struct user_beancounter *get_beancounter_byuid(uid_t uid, int create)
+{
+	struct user_beancounter *ub, *walkp;
+	unsigned long flags;
+	struct ub_hash_slot *slot = &ub_hash[ub_hash_fun(uid)];
+
+	ub = (struct user_beancounter *)kmem_cache_alloc(ub_cachep, GFP_KERNEL);
+	if (ub == NULL)
+		return NULL;
+
+	lock_beancounters(slot, flags);
+
+	walkp = slot->ubh_beans;
+	while (walkp != NULL && walkp->ub_uid != uid)
+		walkp = walkp->ub_next;
+
+	if (walkp == NULL) {
+		if (create) {
+#if UB_DEBUG & 1
+			printk(KERN_DEBUG "Creating ub %p in slot %p\n", ub, slot);
+#endif
+			memcpy(ub, &default_beancounter, sizeof(*ub));
+			ub->ub_next = slot->ubh_beans;
+			slot->ubh_beans = ub;
+			ub->ub_uid = uid;
+			walkp = ub;
+			ub = NULL;
+		}
+		/* if create isn't set, just return NULL */
+	} else {
+		/* beancounter already exists */
+		atomic_inc(&walkp->ub_refcount);
+	}
+
+	if (ub != NULL)
+		kmem_cache_free(ub_cachep, ub);
+	unlock_beancounters(slot, flags);
+	return walkp;
+}
+
+
+void __put_beancounter(struct user_beancounter *ub)
+{
+	struct user_beancounter **ubptr;
+	unsigned long flags;
+	struct ub_hash_slot *slot = &ub_hash[ub_hash_fun(ub->ub_uid)];
+
+#if UB_DEBUG & 1
+	printk(KERN_DEBUG "__put bc %p (cnt %d) for %.20s pid %d cur %08lx cpu %d.\n",
+		ub, atomic_read(&ub->ub_refcount),
+		current->comm, current->pid,
+		(unsigned long)current, smp_processor_id());
+#endif
+	lock_beancounters(slot, flags);
+
+	if (!atomic_dec_and_test(&ub->ub_refcount)) {
+		unlock_beancounters(slot, flags);
+		return;
+	}
+
+	/*
+	 * Ok, it's a has-bean... neither the user nor any user-charged
+	 * objects exist.
+	 */
+	ubptr = &slot->ubh_beans;
+
+	while (*ubptr != NULL) {
+		if (*ubptr == ub) {
+			{
+				int i;
+				for (i = 0; i < UB_RESOURCES; i++)
+					if (i != UB_RESPAGES && ub->ub_held[i])
+						printk(KERN_DEBUG "Ub %p holds %lu in %d on destroy\n",
+							ub, ub->ub_held[i], i);
+			}
+			*ubptr = ub->ub_next;
+			kmem_cache_free(ub_cachep, ub);
+			unlock_beancounters(slot, flags);
+			return;
+		}
+		ubptr = &((*ubptr)->ub_next);
+	}
+#if UB_DEBUG & 1
+	printk(KERN_ERR "Invalid beancounter '%p' passed to free.\n", ub);
+	printk(KERN_DEBUG "Slot %p.\n", slot);
+#endif
+	unlock_beancounters(slot, flags);
+}
+
+
+/*
+ * Generic resource charging stuff
+ */
+
+static int __charge_beancounter_locked(struct user_beancounter *ub,
+	int resource, unsigned long val, int strict)
+{
+	int retval;
+
+#if UB_DEBUG & 2
+	printk(KERN_DEBUG "Charging %lu for %d of %p with %lu\n",
+		val, resource, ub, ub->ub_held[resource]);
+#endif
+	retval = -ENOMEM;
+	/* ub_held <= UB_MAXVALUE, val <= UB_MAXVALUE, and only one addition
+	 * at a time is possible, so an overflow is impossible.
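+	 * (Both operands are bounded by UB_MAXVALUE == LONG_MAX, so their
+	 * sum still fits in an unsigned long.)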
+ */ + ub->ub_held[resource] += val; + if (ub->ub_held[resource] > ub->ub_limit[resource]) + goto out; + if (strict && ub->ub_held[resource] > ub->ub_barrier[resource]) + goto out; + if (ub->ub_maxheld[resource] < ub->ub_held[resource]) + ub->ub_maxheld[resource] = ub->ub_held[resource]; + return 0; + +out: + ub->ub_held[resource] -= val; + return retval; +} + +int __charge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val, int strict) +{ + int retval; + unsigned long flags; + + retval = -EINVAL; + if (val > UB_MAXVALUE) + goto out; + + spin_lock_irqsave(&ub->ub_lock, flags); + retval = __charge_beancounter_locked(ub, resource, val, strict); + spin_unlock_irqrestore(&ub->ub_lock, flags); +out: + return retval; +} + +static inline void __uncharge_beancounter_locked(struct user_beancounter *ub, + int resource, unsigned long val) +{ +#if UB_DEBUG & 2 + printk(KERN_DEBUG "Uncharging %lu for %d of %p with %lu\n", + val, resource, ub, ub->ub_held[resource]); +#endif + if (ub->ub_held[resource] < val) + printk(KERN_ERR "Uncharging %lu for %d of %p with %lu\n", + val, resource, ub, ub->ub_held[resource]); + ub->ub_held[resource] -= val; +} + +void __uncharge_beancounter(struct user_beancounter *ub, + int resource, unsigned long val) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, resource, val); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +/* + * Charge resources on a new task creation + * + * User beancounter here is a login_bc of a parent task + * (which will be equal to task_bc of the new task after allocation + * and initialisation). + * Currently only stack+struct task size is charged. + */ +int __charge_task(struct user_beancounter *ub) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + retval = __charge_beancounter_locked(ub, UB_KMEMSIZE, THREAD_SIZE, 1); + if (!retval) { + retval = __charge_beancounter_locked(ub, UB_NUMPROC, 1, 1); + if (retval) + goto fail; + } +out: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return retval; + +fail: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, THREAD_SIZE); + goto out; +} + +void __uncharge_task(struct user_beancounter *ub) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, THREAD_SIZE); + __uncharge_beancounter_locked(ub, UB_NUMPROC, 1); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +/* + * Different memory type accounting + */ +int __charge_memory(struct user_beancounter *ub, unsigned long size, + unsigned vm_flags, int strict) +{ + int retval; + unsigned long flags; + + size >>= PAGE_SHIFT; + + retval = -EINVAL; + if (size > UB_MAXVALUE) + goto out; + + spin_lock_irqsave(&ub->ub_lock, flags); + retval = __charge_beancounter_locked(ub, UB_TOTVMPAGES, size, strict); + if (retval) + goto out_unlock; + if (vm_flags & VM_LOCKED) { + retval = __charge_beancounter_locked(ub, UB_LOCKEDPAGES, + size, strict); + if (retval) + goto totvm_restore; + } + if (vm_flags & VM_ANON) { + retval = __charge_beancounter_locked(ub, UB_ZSHMPAGES, + size, strict); + if (retval) + goto locked_restore; + } +out_unlock: + spin_unlock_irqrestore(&ub->ub_lock, flags); +out: + return retval; + +locked_restore: + __uncharge_beancounter_locked(ub, UB_LOCKEDPAGES, size); +totvm_restore: + __uncharge_beancounter_locked(ub, UB_TOTVMPAGES, size); + goto out_unlock; +} + +void __uncharge_memory(struct user_beancounter *ub, unsigned long size, + unsigned vm_flags) +{ + unsigned long 
flags; + + size >>= PAGE_SHIFT; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_TOTVMPAGES, size); + if (vm_flags & VM_LOCKED) + __uncharge_beancounter_locked(ub, UB_LOCKEDPAGES, size); + if (vm_flags & VM_ANON) + __uncharge_beancounter_locked(ub, UB_ZSHMPAGES, size); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + + +int __charge_locked_mem(struct user_beancounter *ub, unsigned long size) +{ + return __charge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT, 1); +} + +void __uncharge_locked_mem(struct user_beancounter *ub, unsigned long size) +{ + __uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT); +} + +int __charge_kmem(struct user_beancounter *ub, unsigned long size, int strict) +{ + return __charge_beancounter(ub, UB_KMEMSIZE, size, strict); +} + +void __uncharge_kmem(struct user_beancounter *ub, unsigned long size) +{ + __uncharge_beancounter(ub, UB_KMEMSIZE, size); +} + +int __charge_shmpages(struct user_beancounter *ub, unsigned long size) +{ + return __charge_beancounter(ub, UB_SHMPAGES, size, 1); +} + +void __uncharge_shmpages(struct user_beancounter *ub, unsigned long size) +{ + __uncharge_beancounter(ub, UB_SHMPAGES, size); +} + +struct task_struct *select_worst_task(void) +{ + struct task_struct * p; + struct task_struct * worst; + long ub_maxover = 0; + + worst = p = init_task.next_task; + for (; p != &init_task; p = p->next_task) { + struct mm_struct *mm = p->mm; + struct user_beancounter *ub; + long ub_overdraft = 0; /* ub current overdraft */ + + if (!mm) /* don't touch the kernel threads */ + continue; + else + ub = mm->beancounter; + + if (ub) { + ub_overdraft = + ub->ub_held[UB_TOTVMPAGES] + - ub->ub_limit[UB_OOMGUARPAGES]; + if (ub_overdraft < 0) + /* processes without overdraft are not + * preferred over ones without beancounter */ + ub_overdraft = 0; + } + if (ub_overdraft > ub_maxover || + (ub_overdraft == ub_maxover && + mm->total_vm > worst->mm->total_vm) ) { + ub_maxover = ub_overdraft; + worst = p; + } + } + return worst; +} + +/* + * File-related accounting + */ +int __charge_sock(struct user_beancounter *ub, unsigned long size) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + retval = __charge_beancounter_locked(ub, UB_KMEMSIZE, size, 1); + if (!retval) { + retval = __charge_beancounter_locked(ub, UB_NUMSOCK, 1, 1); + if (retval) + goto fail; + } +out: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return retval; + +fail: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); + goto out; +} + +void __uncharge_sock(struct user_beancounter *ub, unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); + __uncharge_beancounter_locked(ub, UB_NUMSOCK, 1); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +int __charge_flock(struct user_beancounter *ub, unsigned long size) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + retval = __charge_beancounter_locked(ub, UB_KMEMSIZE, size, 1); + if (!retval) { + retval = __charge_beancounter_locked(ub, UB_NUMFLOCK, 1, 1); + if (retval) + goto fail; + } +out: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return retval; + +fail: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); + goto out; +} + +void __uncharge_flock(struct user_beancounter *ub, unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); + 
__uncharge_beancounter_locked(ub, UB_NUMFLOCK, 1); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + +int __charge_pty(struct user_beancounter *ub) +{ + return __charge_beancounter(ub, UB_NUMPTY, 1, 1); +} + +void __uncharge_pty(struct user_beancounter *ub) +{ + __uncharge_beancounter(ub, UB_NUMPTY, 1); +} + +/* + * Accounting for other resources + */ +int __charge_siginfo(struct user_beancounter *ub, unsigned long size) +{ + int retval; + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + retval = __charge_beancounter_locked(ub, UB_KMEMSIZE, size, 1); + if (!retval) { + retval = __charge_beancounter_locked(ub, UB_NUMSIGINFO, 1, 1); + if (retval) + goto fail; + } +out: + spin_unlock_irqrestore(&ub->ub_lock, flags); + return retval; + +fail: + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); + goto out; +} + +void __uncharge_siginfo(struct user_beancounter *ub, unsigned long size) +{ + unsigned long flags; + + spin_lock_irqsave(&ub->ub_lock, flags); + __uncharge_beancounter_locked(ub, UB_KMEMSIZE, size); + __uncharge_beancounter_locked(ub, UB_NUMSIGINFO, 1); + spin_unlock_irqrestore(&ub->ub_lock, flags); +} + + +#ifdef CONFIG_USER_RESOURCE_PROC + +#if BITS_PER_LONG == 32 +#define UB_PROC_LINE_SHIFT 6 +#define UB_PROC_LINE_TEXT (5+2+11+1+10+1+10+1+10+1+10) +#else +#define UB_PROC_LINE_SHIFT 7 +#define UB_PROC_LINE_TEXT (10+2+11+1+20+1+20+1+20+1+20) +#endif +#define UB_PROC_LINE_LEN (1 << UB_PROC_LINE_SHIFT) +#define UB_PROC_LINE_SPACES (UB_PROC_LINE_LEN - UB_PROC_LINE_TEXT - 1) + +static const char *ub_rnames[UB_RESOURCES] = { + "kmemsize", + "lockedpages", + "totvmpages", + "ipcshmpages", + "anonshpages", + "numproc", + "rsspages", + "vmspaceguar", + "oomguar", + "numsock", + "numflock", + "numpty", + "numsiginfo", +}; + +static void out_proc_head(char *buf) +{ +#if BITS_PER_LONG == 32 + sprintf(buf, " uid resource held max barrier" + " limit"); +#else + sprintf(buf, " uid resource held" + " max barrier" + " limit"); +#endif + memset(buf + UB_PROC_LINE_TEXT, ' ', UB_PROC_LINE_SPACES); + buf[UB_PROC_LINE_LEN - 1] = '\n'; +} + +static void out_proc_beancounter(char *buf, struct user_beancounter *ub, int r) +{ + if (!r) +#if BITS_PER_LONG == 32 + sprintf(buf, "%5u: %-11s %10lu %10lu %10lu %10lu", + (unsigned)ub->ub_uid, ub_rnames[r], + ub->ub_held[r], ub->ub_maxheld[r], + ub->ub_barrier[r], ub->ub_limit[r]); +#else + sprintf(buf, "%10u: %-11s %20lu %20lu %20lu %20lu", + (unsigned)ub->ub_uid, ub_rnames[r], + ub->ub_held[r], ub->ub_maxheld[r], + ub->ub_barrier[r], ub->ub_limit[r]); +#endif + else +#if BITS_PER_LONG == 32 + sprintf(buf, " %-11s %10lu %10lu %10lu %10lu", + ub_rnames[r], + ub->ub_held[r], ub->ub_maxheld[r], + ub->ub_barrier[r], ub->ub_limit[r]); +#else + sprintf(buf, " %-11s %20lu %20lu %20lu %20lu", + ub_rnames[r], + ub->ub_held[r], ub->ub_maxheld[r], + ub->ub_barrier[r], ub->ub_limit[r]); +#endif + memset(buf + UB_PROC_LINE_TEXT, ' ', UB_PROC_LINE_SPACES); + buf[UB_PROC_LINE_LEN - 1] = '\n'; +} + +static ssize_t ub_proc_read(struct file *file, char *usrbuf, size_t len, + loff_t *poff) +{ + ssize_t retval; + char *buf; + unsigned long flags; + int i, resource; + struct ub_hash_slot *slot; + struct user_beancounter *ub; + size_t n; + int rem, produced, job, tocopy; + const int is_capable = capable(CAP_DAC_OVERRIDE) || + capable(CAP_DAC_READ_SEARCH); + + retval = -ENOBUFS; + buf = (char *)__get_free_page(GFP_KERNEL); + if (buf == NULL) + goto out; + + retval = 0; + if (!is_capable && current->luid == (uid_t)-1) + goto out_free; + if (*poff < 0) + goto 
out_free; +again: + i = 0; + slot = ub_hash; + n = *poff >> UB_PROC_LINE_SHIFT; /* in lines */ + rem = *poff & (UB_PROC_LINE_LEN - 1); /* in bytes */ + produced = 0; + if (!n) { + out_proc_head(buf); + produced += UB_PROC_LINE_LEN; + n++; + } + n--; + while (1) { + lock_beancounters(slot, flags); + for (ub = slot->ubh_beans; ub != NULL && n >= UB_RESOURCES; + ub = ub->ub_next) + if (is_capable || current->luid == ub->ub_uid) + n -= UB_RESOURCES; + if (ub != NULL) + break; + unlock_beancounters(slot, flags); + if (++i >= UB_HASH_SIZE) + goto out_free; + ++slot; + } + rem += n << UB_PROC_LINE_SHIFT; + job = PAGE_SIZE; + if (len < PAGE_SIZE - rem) + job = rem + len; + while (produced < job) { + if (is_capable || (current->luid == ub->ub_uid)) + for (resource = 0; produced < job && resource < UB_RESOURCES; + resource++, produced += UB_PROC_LINE_LEN) + out_proc_beancounter(buf + produced, ub, resource); + if (produced >= job) + break; + ub = ub->ub_next; +checkub: + if (ub != NULL) + continue; + if (++i >= UB_HASH_SIZE) + break; + unlock_beancounters(slot, flags); + ++slot; + lock_beancounters(slot, flags); + ub = slot->ubh_beans; + goto checkub; + } + unlock_beancounters(slot, flags); +#if UB_DEBUG & 8 + printk(KERN_DEBUG "UB_PROC: produced %d, job %d, rem %d\n", + produced, job, rem); +#endif + if (produced <= rem) + goto out_free; + /* Temporary buffer `buf' contains `produced' bytes. + * Extract no more than `len' bytes at offset `rem'. + */ + tocopy = produced - rem; + if (len < tocopy) + tocopy = len; + if (!tocopy) + goto out_free; + if (copy_to_user(usrbuf, buf + rem, tocopy)) + goto fail; + *poff += tocopy; + len -= tocopy; + retval += tocopy; + if (!len) + goto out_free; + usrbuf += tocopy; + goto again; +fail: + retval = -EFAULT; +out_free: + free_page((unsigned long)buf); +out: + return retval; +} + +static struct file_operations ub_file_operations = { + read: ub_proc_read, +}; + +#endif /* defined CONFIG_USER_RESOURCE_PROC */ + + +#endif /* defined CONFIG_USER_RESOURCE */ + + +/* + * Initialisation + */ + +void __init beancounter_init(unsigned long mempages) +{ +#ifdef CONFIG_USER_RESOURCE + int k; + + ub_cachep = kmem_cache_create("user_beancounters", + sizeof(struct user_beancounter), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + atomic_set(&default_beancounter.ub_refcount, 1); + spin_lock_init(&default_beancounter.ub_lock); + + /* + * Default settings + */ + memset(&default_beancounter.ub_held, + 0, sizeof(default_beancounter.ub_held)); + default_beancounter.ub_limit[UB_KMEMSIZE] = + mempages > (192*1024*1024 >> PAGE_SHIFT) ? + 32*1024*1024 : + (mempages << PAGE_SHIFT) / 6; + default_beancounter.ub_limit[UB_LOCKEDPAGES] = 8; + default_beancounter.ub_limit[UB_TOTVMPAGES] = + UB_MAXVALUE; /* swappable */ + default_beancounter.ub_limit[UB_SHMPAGES] = 64; + default_beancounter.ub_limit[UB_ZSHMPAGES] = 1024; + default_beancounter.ub_limit[UB_NUMPROC] = max_threads / 2; + default_beancounter.ub_limit[UB_NUMSOCK] = 1024; + default_beancounter.ub_limit[UB_NUMFLOCK] = 1024; + default_beancounter.ub_limit[UB_NUMPTY] = 16; + default_beancounter.ub_limit[UB_NUMSIGINFO] = 1024; + for (k = 0; k < UB_RESOURCES; k++) + default_beancounter.ub_barrier[k] = + default_beancounter.ub_limit[k]; + + /* + * Initialise the beancounter hash.
+ */ + for (k = 0; k < UB_HASH_SIZE; k++) + spin_lock_init(&ub_hash[k].ubh_lock); + +#ifdef CONFIG_USER_RESOURCE_PROC + { + struct proc_dir_entry *entry; + entry = create_proc_entry("user_beancounters", S_IRUGO, NULL); + if (entry) + entry->proc_fops = &ub_file_operations; + } +#endif +#endif +} + + +/* + * The (rather boring) getluid syscall + */ +asmlinkage long sys_getluid(void) +{ + return current->luid != (uid_t)-1 ? current->luid : -EINVAL; +} + +/* + * The setluid syscall + */ +asmlinkage long sys_setluid(uid_t uid) +{ +#ifdef CONFIG_USER_RESOURCE + struct user_beancounter *ub; +#endif + int error; + + /* You may not disown a setluid */ + error = -EINVAL; + if (uid == (uid_t)-1) + goto out; + + /* You may only set an luid as root */ + error = -EPERM; + if (!capable(CAP_SETUID)) + goto out; + + /* The luid once set is irrevocable to all */ + if (current->luid != (uid_t)-1) + goto out; + +#ifdef CONFIG_USER_RESOURCE + /* Ok - set up a beancounter entry for this user */ + error = -ENOBUFS; + ub = get_beancounter_byuid(uid, 1); + if (ub == NULL) + goto out; + + printk(KERN_DEBUG "setluid, bean %p (count %d) for %.20s pid %d\n", + ub, atomic_read(&ub->ub_refcount), + current->comm, current->pid); + /* Install it */ + current->login_bc = ub; +#endif + + /* Take on our new luid and report it as OK */ + current->luid = uid; + + error = 0; +out: + return error; +} + +/* + * The setublimit syscall + */ +asmlinkage long sys_setublimit(uid_t uid, unsigned long resource, + struct rlimit *rlim) +{ + int error; +#ifdef CONFIG_USER_RESOURCE + unsigned long flags; + struct user_beancounter *ub; + struct rlimit new_rlim; + + error = -EPERM; + if (!capable(CAP_SYS_RESOURCE)) + goto out; + + error = -EINVAL; + if (resource >= UB_RESOURCES) + goto out; + + error = -EFAULT; + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + goto out; + + error = -EINVAL; + if (new_rlim.rlim_cur < 0 || new_rlim.rlim_cur > UB_MAXVALUE || + new_rlim.rlim_max < 0 || new_rlim.rlim_max > UB_MAXVALUE) + goto out; + + error = -EINVAL; + ub = get_beancounter_byuid(uid, 0); + if (ub == NULL) { +#if UB_DEBUG & 4 + printk(KERN_DEBUG "No login bc for uid %d\n", uid); +#endif + goto out; + } + + spin_lock_irqsave(&ub->ub_lock, flags); + ub->ub_barrier[resource] = new_rlim.rlim_cur; + ub->ub_limit[resource] = new_rlim.rlim_max; + spin_unlock_irqrestore(&ub->ub_lock, flags); + + __put_beancounter(ub); + + error = 0; +out: +#else + error = -ENOSYS; +#endif + return error; +} diff -ruN -X excl linux-lt-2.4.0-test7.prev/kernel/exit.c linux-lt-2.4.0-test7/kernel/exit.c --- linux-lt-2.4.0-test7.prev/kernel/exit.c Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/kernel/exit.c Fri Aug 25 17:12:24 2000 @@ -12,6 +12,7 @@ #ifdef CONFIG_BSD_PROCESS_ACCT #include <linux/acct.h> #endif +#include <linux/beancounter.h> #include #include @@ -61,6 +62,8 @@ current->counter += p->counter; if (current->counter >= MAX_COUNTER) current->counter = MAX_COUNTER; + uncharge_task(p->task_bc); + put_beancounter(p->task_bc); free_task_struct(p); } else { printk("task releasing itself\n"); @@ -454,6 +457,12 @@ tsk->state = TASK_ZOMBIE; tsk->exit_code = code; exit_notify(); + /* + * The login beancounter makes no sense beyond this point. + * The task_struct memory will be uncharged and tsk->task_bc dropped + * in release().
+ */ + put_beancounter(tsk->login_bc); put_exec_domain(tsk->exec_domain); if (tsk->binfmt && tsk->binfmt->module) __MOD_DEC_USE_COUNT(tsk->binfmt->module); diff -ruN -X excl linux-lt-2.4.0-test7.prev/kernel/fork.c linux-lt-2.4.0-test7/kernel/fork.c --- linux-lt-2.4.0-test7.prev/kernel/fork.c Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/kernel/fork.c Fri Aug 25 17:46:48 2000 @@ -18,6 +18,7 @@ #include #include #include +#include <linux/beancounter.h> #include #include @@ -126,6 +127,7 @@ int retval; flush_cache_mm(current->mm); + mm->total_vm = 0; mm->locked_vm = 0; mm->mmap = NULL; mm->mmap_avl = NULL; @@ -143,9 +145,16 @@ retval = -ENOMEM; if(mpnt->vm_flags & VM_DONTCOPY) continue; + retval = charge_memory(mm->beancounter, + mpnt->vm_end - mpnt->vm_start, + (mpnt->vm_flags & ~VM_LOCKED), 1); + if (retval) + goto out; + retval = -ENOMEM; tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!tmp) - goto fail_nomem; + goto out_uncharge; + mm->total_vm += (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; *tmp = *mpnt; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; @@ -182,28 +191,38 @@ pprev = &tmp->vm_next; if (retval) - goto fail_nomem; + goto out; } retval = 0; if (mm->map_count >= AVL_MIN_MAP_COUNT) build_mmap_avl(mm); -fail_nomem: +out: flush_tlb_mm(current->mm); return retval; + +out_uncharge: + uncharge_memory(mm->beancounter, mpnt->vm_end - mpnt->vm_start, + (mpnt->vm_flags & ~VM_LOCKED)); + /* linked VM areas will be uncharged when the list is destroyed */ + goto out; } #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) -static struct mm_struct * mm_init(struct mm_struct * mm) +static struct mm_struct * mm_init(struct mm_struct * mm, + struct user_beancounter * ub) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_MUTEX(&mm->mmap_sem); mm->page_table_lock = SPIN_LOCK_UNLOCKED; - mm->pgd = pgd_alloc(); - if (mm->pgd) + mm->pgd = pgd_alloc(ub); + if (mm->pgd) { + mm->beancounter = ub; + get_beancounter(ub); return mm; + } kmem_cache_free(mm_cachep, mm); return NULL; } @@ -212,14 +231,14 @@ /* * Allocate and initialize an mm_struct. */ -struct mm_struct * mm_alloc(void) +struct mm_struct * mm_alloc(struct user_beancounter * ub) { struct mm_struct * mm; mm = allocate_mm(); if (mm) { memset(mm, 0, sizeof(*mm)); - return mm_init(mm); + return mm_init(mm, ub); } return NULL; } @@ -232,8 +251,9 @@ inline void __mmdrop(struct mm_struct *mm) { if (mm == &init_mm) BUG(); - pgd_free(mm->pgd); + pgd_free(mm->beancounter, mm->pgd); destroy_context(mm); + put_beancounter(mm->beancounter); kmem_cache_free(mm_cachep, mm); } @@ -305,7 +325,7 @@ /* Copy the current MM stuff.. */ memcpy(mm, current->mm, sizeof(*mm)); - if (!mm_init(mm)) + if (!mm_init(mm, tsk->login_bc)) goto fail_nomem; tsk->mm = mm; @@ -533,7 +553,7 @@ */ int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs) { - int retval = -ENOMEM; + int retval; struct task_struct *p; DECLARE_MUTEX_LOCKED(sem); @@ -545,9 +565,14 @@ current->vfork_sem = &sem; + retval = charge_task(current->login_bc); + if (retval) + goto fork_out; + + retval = -ENOMEM; p = alloc_task_struct(); if (!p) - goto fork_out; + goto fork_out_uncharge; *p = *current; @@ -618,6 +643,12 @@ p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; + /* Clone the beancounter reference. + * The parent (current) keeps the reference count nonzero.
*/ + get_beancounter(p->login_bc); /* copied intact */ + p->task_bc = p->login_bc; + get_beancounter(p->task_bc); + retval = -ENOMEM; /* copy all the process information */ if (copy_files(clone_flags, p)) @@ -685,6 +716,8 @@ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup: + put_beancounter(p->task_bc); + put_beancounter(p->login_bc); put_exec_domain(p->exec_domain); if (p->binfmt && p->binfmt->module) __MOD_DEC_USE_COUNT(p->binfmt->module); @@ -693,6 +726,11 @@ free_uid(p->user); bad_fork_free: free_task_struct(p); + uncharge_task(current->login_bc); + goto fork_out; + +fork_out_uncharge: + uncharge_task(current->login_bc); goto fork_out; } diff -ruN -X excl linux-lt-2.4.0-test7.prev/kernel/signal.c linux-lt-2.4.0-test7/kernel/signal.c --- linux-lt-2.4.0-test7.prev/kernel/signal.c Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/kernel/signal.c Fri Aug 25 17:12:24 2000 @@ -97,9 +97,12 @@ t->sigqueue_tail = &t->sigqueue; while (q) { + struct user_beancounter *bc = q->charged_bc; n = q->next; kmem_cache_free(signal_queue_cachep, q); atomic_dec(&nr_queued_signals); + uncharge_siginfo(bc, sizeof(struct signal_queue)); + put_beancounter(bc); q = n; } } @@ -196,11 +199,14 @@ if (q->info.si_signo == sig) break; if (q) { + struct user_beancounter *bc = q->charged_bc; if ((*pp = q->next) == NULL) current->sigqueue_tail = pp; copy_siginfo(info, &q->info); - kmem_cache_free(signal_queue_cachep,q); + kmem_cache_free(signal_queue_cachep, q); atomic_dec(&nr_queued_signals); + uncharge_siginfo(bc, sizeof(struct signal_queue)); + put_beancounter(bc); /* Then see if this signal is still pending. (Non rt signals may not be queued twice.) @@ -278,10 +284,13 @@ if (q->info.si_signo == sig) break; if (q) { + struct user_beancounter *bc = q->charged_bc; if ((*pp = q->next) == NULL) t->sigqueue_tail = pp; - kmem_cache_free(signal_queue_cachep,q); + kmem_cache_free(signal_queue_cachep, q); atomic_dec(&nr_queued_signals); + uncharge_siginfo(bc, sizeof(struct signal_queue)); + put_beancounter(bc); } return 1; } @@ -399,8 +408,19 @@ pass on the info struct. 
*/ if (atomic_read(&nr_queued_signals) < max_queued_signals) { - q = (struct signal_queue *) - kmem_cache_alloc(signal_queue_cachep, GFP_ATOMIC); + struct user_beancounter *bc = current->login_bc; + if (!charge_siginfo(bc, sizeof(struct signal_queue))) { + q = (struct signal_queue *) + kmem_cache_alloc(signal_queue_cachep, + GFP_ATOMIC); + if (q) { + get_beancounter(bc); + q->charged_bc = bc; + } + else + uncharge_siginfo(bc, + sizeof(struct signal_queue)); + } } if (q) { @@ -956,10 +976,13 @@ if (q->info.si_signo != sig) pp = &q->next; else { + struct user_beancounter *bc = q->charged_bc; if ((*pp = q->next) == NULL) current->sigqueue_tail = pp; kmem_cache_free(signal_queue_cachep, q); atomic_dec(&nr_queued_signals); + uncharge_siginfo(bc, sizeof(struct signal_queue)); + put_beancounter(bc); } q = *pp; } diff -ruN -X excl linux-lt-2.4.0-test7.prev/mm/Makefile linux-lt-2.4.0-test7/mm/Makefile --- linux-lt-2.4.0-test7.prev/mm/Makefile Tue Dec 7 02:14:13 1999 +++ linux-lt-2.4.0-test7/mm/Makefile Fri Aug 25 17:12:24 2000 @@ -16,4 +16,8 @@ O_OBJS += highmem.o endif +ifeq ($(CONFIG_USER_RESOURCE),y) +O_OBJS += kubd.o +endif + include $(TOPDIR)/Rules.make diff -ruN -X excl linux-lt-2.4.0-test7.prev/mm/kubd.c linux-lt-2.4.0-test7/mm/kubd.c --- linux-lt-2.4.0-test7.prev/mm/kubd.c Thu Jan 1 07:30:00 1970 +++ linux-lt-2.4.0-test7/mm/kubd.c Fri Aug 25 17:12:24 2000 @@ -0,0 +1,235 @@ +/* + * linux/mm/kubd.c + * + * Copyright (C) 2000 Andrey Moruga + * + * TODO: + * - consider what should be done (if anything) for bad pgd/pmd entries + * + * Changes: + * 2000/07/28 Andrey V. Savochkin + * - some cosmetic changes + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Page weights (it's a scale for our fixed-point arithmetic). + * Restrictions: + * - UB_PAGE_WEIGHT should be considerably greater than the maximum number of + * different mm's in the system (it defines the precision of the calculations) + * - MAX_ULONGLONG/UB_PAGE_WEIGHT should be greater than the number of + * physical pages (to avoid overflows) + */ +#define UB_PAGE_WEIGHT_SHIFT 24 +#define UB_PAGE_WEIGHT (1 << UB_PAGE_WEIGHT_SHIFT) + +/* + * Various debugging stuff + * 1 main cycle + * 2 pte + */ +#define UB_DEBUG 2 + +static inline void kcharge_pte (pte_t *pte, struct user_beancounter *ub) +{ + struct page *page; + int nref; + + if (!pte_present(*pte)) + return; + + page = pte_page(*pte); + if ((page-mem_map >= max_mapnr) || PageReserved(page)) + return; + + nref = page_count(page) - !!page->buffers + - (PageSwapCache(page) ?
1 : !!page->mapping); + /* now charge the beancounter with page_weight / nreferences */ + if (nref <= 0) { +#if UB_DEBUG & 2 + printk("Error: reference counter is %d\n", nref); + printk("Page is %p, counter %d\n", page, page_count(page)); + printk("Buffers is %p, Mapping %p\n", page->buffers, + page->mapping); + printk("PageSwapCache(page) is %d\n", PageSwapCache(page)); + printk("PageReserved(page) is %d\n", PageReserved(page)); + printk("page-mem_map is %ld, max_mapnr %ld\n", (long)(page-mem_map), + max_mapnr); +#endif + } else + ub->ub_held_pages += UB_PAGE_WEIGHT / nref; +} + +static inline void kcharge_pmd (pmd_t *pmd, unsigned long address, + unsigned long end, struct user_beancounter *ub) +{ + pte_t *pte; + unsigned long pmd_end; + + if (pmd_none(*pmd)) + return; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + return; + } + + pte = pte_offset(pmd, address); + + pmd_end = (address + PMD_SIZE) & PMD_MASK; + if (end > pmd_end) + end = pmd_end; + + do { + kcharge_pte(pte, ub); + address += PAGE_SIZE; + pte++; + } while (address && (address < end)); +} + + +static inline void kcharge_pgd (pgd_t *dir, unsigned long address, + unsigned long end, struct user_beancounter *ub) +{ + pmd_t *pmd; + unsigned long pgd_end; + + if (pgd_none(*dir)) + return; + if (pgd_bad(*dir)) { + pgd_ERROR(*dir); + pgd_clear(dir); + return; + } + pmd = pmd_offset(dir, address); + + pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK; + if (pgd_end && (end > pgd_end)) + end = pgd_end; + + do { + kcharge_pmd(pmd, address, end, ub); + + address = (address + PMD_SIZE) & PMD_MASK; + pmd++; + } while (address && (address < end)); +} + +static inline void kcharge_vma (struct vm_area_struct *vma, + struct user_beancounter *ub) +{ + unsigned long address = vma->vm_start; + unsigned long end = vma->vm_end; + pgd_t *pgdir; + + pgdir = pgd_offset(vma->vm_mm, address); + do { + /* go through pgd */ + kcharge_pgd(pgdir, address, end, ub); + address = (address + PGDIR_SIZE) & PGDIR_MASK; + pgdir++; + } while (address && (address < end)); +} + +static inline void kcharge_mm (struct mm_struct *mm, + struct user_beancounter *ub) +{ + struct vm_area_struct *vma; + + vmlist_access_lock(mm); + for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) + kcharge_vma (vma, ub); + vmlist_access_unlock(mm); + +} + +/* The background daemon, started as a kernel thread, which repeatedly + * calculates and charges the current physical memory consumption */ +int kubd(void *unused) +{ + struct task_struct *tsk = current; + struct ub_hash_slot *slot; + struct task_struct *p; + struct user_beancounter *ub; + unsigned long flags; + long i; + + tsk->session = 1; + tsk->pgrp = 1; + strcpy(tsk->comm, "kubd"); + sigfillset(&tsk->blocked); + +repeat: + /* first of all, clear the current ub_held_pages values */ + slot = ub_hash; + for (i = 0; i < UB_HASH_SIZE; i++) { + lock_beancounters(slot, flags); + for (ub = slot->ubh_beans; ub != NULL; ub = ub->ub_next) + ub->ub_held_pages = 0; + unlock_beancounters(slot, flags); + slot++; + } + + /* now, go through all the processes -> mm -> vma -> pgd -> pmd + * -> page to find out how many processes reference the page */ + /* we have to keep the tasklist locked */ + read_lock(&tasklist_lock); + for (p = init_task.next_task; p != &init_task; + p = p->next_task) { + + struct mm_struct *mm = p->mm; + struct user_beancounter *ub; + + if (!p->swappable || !mm) + continue; + ub = mm->beancounter; + if (!ub) + continue; + kcharge_mm(mm, ub); + } + read_unlock(&tasklist_lock); + + /* now the calculations are done; + * set
ub_held[UB_RESPAGES] values */ + slot = ub_hash; + for (i = 0; i < UB_HASH_SIZE; i++) { + lock_beancounters(slot, flags); + for (ub = slot->ubh_beans; ub != NULL; ub = ub->ub_next) { + ub->ub_held[UB_RESPAGES] = + ub->ub_held_pages >> UB_PAGE_WEIGHT_SHIFT; + if (ub->ub_maxheld[UB_RESPAGES] < + ub->ub_held[UB_RESPAGES]) + ub->ub_maxheld[UB_RESPAGES] = + ub->ub_held[UB_RESPAGES]; + } + unlock_beancounters(slot, flags); + slot++; + } + /* work's done, time to sleep for a bit */ + tsk->state = TASK_INTERRUPTIBLE; +#if UB_DEBUG & 1 + printk("Gonna sleep now\n"); +#endif + schedule_timeout(HZ); +#if UB_DEBUG & 1 + printk("Just woke up\n"); +#endif + goto repeat; +} + +static int __init kubd_init(void) +{ + printk("Starting kubd\n"); + kernel_thread(kubd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); + return 0; +} + +module_init(kubd_init) diff -ruN -X excl linux-lt-2.4.0-test7.prev/mm/memory.c linux-lt-2.4.0-test7/mm/memory.c --- linux-lt-2.4.0-test7.prev/mm/memory.c Thu Aug 10 11:42:10 2000 +++ linux-lt-2.4.0-test7/mm/memory.c Sun Aug 27 17:40:14 2000 @@ -73,7 +73,7 @@ * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ -static inline void free_one_pmd(pmd_t * dir) +static inline void free_one_pmd(struct mm_struct *mm, pmd_t * dir) { pte_t * pte; @@ -86,10 +86,10 @@ } pte = pte_offset(dir, 0); pmd_clear(dir); - pte_free(pte); + pte_free(mm->beancounter, pte); } -static inline void free_one_pgd(pgd_t * dir) +static inline void free_one_pgd(struct mm_struct *mm, pgd_t * dir) { int j; pmd_t * pmd; @@ -104,8 +104,8 @@ pmd = pmd_offset(dir, 0); pgd_clear(dir); for (j = 0; j < PTRS_PER_PMD ; j++) - free_one_pmd(pmd+j); - pmd_free(pmd); + free_one_pmd(mm, pmd+j); + pmd_free(mm->beancounter, pmd); } /* Low and high watermarks for page table cache.
@@ -130,7 +130,7 @@ page_dir += first; do { - free_one_pgd(page_dir); + free_one_pgd(mm, page_dir); page_dir++; } while (--nr); @@ -178,7 +178,7 @@ continue; } if (pgd_none(*dst_pgd)) { - if (!pmd_alloc(dst_pgd, 0)) + if (!pmd_alloc(dst->beancounter, dst_pgd, 0)) goto nomem; } @@ -201,7 +201,7 @@ goto cont_copy_pmd_range; } if (pmd_none(*dst_pmd)) { - if (!pte_alloc(dst_pmd, 0)) + if (!pte_alloc(dst->beancounter, dst_pmd, 0)) goto nomem; } @@ -377,8 +377,12 @@ */ if (mm->rss > freed) mm->rss -= freed; - else + else { +#if 0 + printk(KERN_ERR "Negative rss!\n"); +#endif mm->rss = 0; + } } @@ -650,7 +654,8 @@ } while (address && (address < end)); } -static inline int zeromap_pmd_range(pmd_t * pmd, unsigned long address, +static inline int zeromap_pmd_range(struct mm_struct *mm, + pmd_t * pmd, unsigned long address, unsigned long size, pgprot_t prot) { unsigned long end; @@ -660,7 +665,7 @@ if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc(pmd, address); + pte_t * pte = pte_alloc(mm->beancounter, pmd, address); if (!pte) return -ENOMEM; zeromap_pte_range(pte, address, end - address, prot); @@ -682,11 +687,11 @@ if (address >= end) BUG(); do { - pmd_t *pmd = pmd_alloc(dir, address); + pmd_t *pmd = pmd_alloc(current->mm->beancounter, dir, address); error = -ENOMEM; if (!pmd) break; - error = zeromap_pmd_range(pmd, address, end - address, prot); + error = zeromap_pmd_range(current->mm, pmd, address, end - address, prot); if (error) break; address = (address + PGDIR_SIZE) & PGDIR_MASK; @@ -725,7 +730,8 @@ } while (address && (address < end)); } -static inline int remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, +static inline int remap_pmd_range(struct mm_struct *mm, + pmd_t * pmd, unsigned long address, unsigned long size, unsigned long phys_addr, pgprot_t prot) { unsigned long end; @@ -736,7 +742,7 @@ end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc(pmd, address); + pte_t * pte = pte_alloc(mm->beancounter, pmd, address); if (!pte) return -ENOMEM; remap_pte_range(pte, address, end - address, address + phys_addr, prot); @@ -759,11 +765,11 @@ if (from >= end) BUG(); do { - pmd_t *pmd = pmd_alloc(dir, from); + pmd_t *pmd = pmd_alloc(current->mm->beancounter, dir, from); error = -ENOMEM; if (!pmd) break; - error = remap_pmd_range(pmd, from, end - from, phys_addr + from, prot); + error = remap_pmd_range(current->mm, pmd, from, end - from, phys_addr + from, prot); if (error) break; from = (from + PGDIR_SIZE) & PGDIR_MASK; @@ -1205,6 +1211,7 @@ * Ok, the entry was present, we need to get the page table * lock to synchronize with kswapd, and verify that the entry * didn't change from under us.. + * RED PEN: ok, what if it really changed between these two checks? 
SAW */ spin_lock(&mm->page_table_lock); if (pte_val(entry) == pte_val(*pte)) { @@ -1232,10 +1239,10 @@ pmd_t *pmd; pgd = pgd_offset(mm, address); - pmd = pmd_alloc(pgd, address); + pmd = pmd_alloc(mm->beancounter, pgd, address); if (pmd) { - pte_t * pte = pte_alloc(pmd, address); + pte_t * pte = pte_alloc(mm->beancounter, pmd, address); if (pte) ret = handle_pte_fault(mm, vma, address, write_access, pte); } diff -ruN -X excl linux-lt-2.4.0-test7.prev/mm/mlock.c linux-lt-2.4.0-test7/mm/mlock.c --- linux-lt-2.4.0-test7.prev/mm/mlock.c Fri Aug 25 12:04:32 2000 +++ linux-lt-2.4.0-test7/mm/mlock.c Fri Aug 25 17:12:24 2000 @@ -8,6 +8,7 @@ #include #include #include +#include <linux/beancounter.h> #include #include @@ -116,6 +117,13 @@ if (newflags == vma->vm_flags) return 0; + if (newflags & VM_LOCKED) { + retval = charge_locked_mem(vma->vm_mm->beancounter, + end - start); + if (retval) + return retval; + } + if (start == vma->vm_start) { if (end == vma->vm_end) retval = mlock_fixup_all(vma, newflags); @@ -133,8 +141,17 @@ if (newflags & VM_LOCKED) { pages = -pages; make_pages_present(start, end); + } else { + /* successfully unlocked some memory */ + uncharge_locked_mem(vma->vm_mm->beancounter, + end - start); } vma->vm_mm->locked_vm -= pages; + } else { + /* memory was charged and fixup failed; uncharge the mem */ + if (newflags & VM_LOCKED) + uncharge_locked_mem(vma->vm_mm->beancounter, + end - start); } return retval; } diff -ruN -X excl linux-lt-2.4.0-test7.prev/mm/mmap.c linux-lt-2.4.0-test7/mm/mmap.c --- linux-lt-2.4.0-test7.prev/mm/mmap.c Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/mm/mmap.c Fri Aug 25 17:12:24 2000 @@ -12,6 +12,7 @@ #include #include #include +#include <linux/beancounter.h> #include #include @@ -52,13 +53,18 @@ * (buffers+cache), use the minimum values. Allow an extra 2% * of num_physpages for safety margin. */ - long free; /* Sometimes we want to use more memory than we have.
*/ if (sysctl_overcommit_memory) return 1; + /* Memory guarantees for "good" processes */ + if (current->mm->beancounter && + (current->mm->beancounter->ub_held[UB_TOTVMPAGES] + pages + <= current->mm->beancounter->ub_barrier[UB_SPCGUARPAGES])) + return 1; + free = atomic_read(&buffermem_pages); free += atomic_read(&page_cache_size); free += nr_free_pages(); @@ -269,7 +275,7 @@ } else { vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_SHARED) - vma->vm_flags |= VM_SHARED | VM_MAYSHARE; + vma->vm_flags |= VM_SHARED | VM_MAYSHARE | VM_ANON; } vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f]; vma->vm_ops = NULL; @@ -293,22 +299,25 @@ !vm_enough_memory(len >> PAGE_SHIFT)) goto free_vma; + if (charge_memory(vma->vm_mm->beancounter, len, vma->vm_flags, 1)) + goto free_vma; + if (file) { if (vma->vm_flags & VM_DENYWRITE) { error = deny_write_access(file); if (error) - goto free_vma; + goto uncharge_and_free_vma; correct_wcount = 1; } vma->vm_file = file; get_file(file); error = file->f_op->mmap(file, vma); if (error) - goto unmap_and_free_vma; + goto unmap_uncharge_and_free_vma; } else if (flags & MAP_SHARED) { error = map_zero_setup(vma); if (error) - goto free_vma; + goto uncharge_and_free_vma; } /* @@ -331,7 +340,7 @@ } return addr; -unmap_and_free_vma: +unmap_uncharge_and_free_vma: if (correct_wcount) atomic_inc(&file->f_dentry->d_inode->i_writecount); vma->vm_file = NULL; @@ -340,6 +349,8 @@ flush_cache_range(mm, vma->vm_start, vma->vm_end); zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); flush_tlb_range(mm, vma->vm_start, vma->vm_end); +uncharge_and_free_vma: + uncharge_memory(vma->vm_mm->beancounter, len, vma->vm_flags); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -527,6 +538,7 @@ area->vm_ops->close(area); if (area->vm_file) fput(area->vm_file); + uncharge_memory(area->vm_mm->beancounter, len, area->vm_flags); kmem_cache_free(vm_area_cachep, area); return extra; } @@ -566,6 +578,8 @@ insert_vm_struct(mm, area); vmlist_modify_unlock(mm); + /* uncharge memory at the last moment */ + uncharge_memory(area->vm_mm->beancounter, len, area->vm_flags); return extra; } @@ -784,12 +798,17 @@ if (!vm_enough_memory(len >> PAGE_SHIFT)) return -ENOMEM; + if (charge_memory(mm->beancounter, len, + vm_flags(PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_PRIVATE) | mm->def_flags, 1)) + return -ENOMEM; + /* * create a vma struct for an anonymous mapping */ vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); if (!vma) - return -ENOMEM; + goto fail_uncharge; vma->vm_mm = mm; vma->vm_start = addr; @@ -822,6 +841,12 @@ make_pages_present(addr, addr + len); } return addr; + +fail_uncharge: + uncharge_memory(mm->beancounter, len, + vm_flags(PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_PRIVATE) | mm->def_flags); + return -ENOMEM; } /* Build the AVL tree corresponding to the VMA list.
*/ @@ -865,6 +890,7 @@ zap_page_range(mm, start, size); if (mpnt->vm_file) fput(mpnt->vm_file); + uncharge_memory(mm->beancounter, size, mpnt->vm_flags); kmem_cache_free(vm_area_cachep, mpnt); mpnt = next; } diff -ruN -X excl linux-lt-2.4.0-test7.prev/mm/mremap.c linux-lt-2.4.0-test7/mm/mremap.c --- linux-lt-2.4.0-test7.prev/mm/mremap.c Thu Aug 3 16:47:24 2000 +++ linux-lt-2.4.0-test7/mm/mremap.c Fri Aug 25 17:12:24 2000 @@ -9,6 +9,7 @@ #include #include #include +#include <linux/beancounter.h> #include #include @@ -51,9 +52,9 @@ pmd_t * pmd; pte_t * pte = NULL; - pmd = pmd_alloc(pgd_offset(mm, addr), addr); + pmd = pmd_alloc(mm->beancounter, pgd_offset(mm, addr), addr); if (pmd) - pte = pte_alloc(pmd, addr); + pte = pte_alloc(mm->beancounter, pmd, addr); return pte; } @@ -241,6 +242,9 @@ !(flags & MAP_NORESERVE) && !vm_enough_memory((new_len - old_len) >> PAGE_SHIFT)) goto out; + ret = charge_memory(vma->vm_mm->beancounter, new_len, vma->vm_flags, 1); + if (ret) + goto out; /* old_len exactly to the end of the area.. * And we're not relocating the area. @@ -264,6 +268,8 @@ addr + new_len); } ret = addr; + uncharge_memory(vma->vm_mm->beancounter, old_len, + vma->vm_flags); goto out; } } @@ -277,10 +283,20 @@ if (!(flags & MREMAP_FIXED)) { new_addr = get_unmapped_area(0, new_len); if (!new_addr) - goto out; + goto out_uncharge; } ret = move_vma(vma, addr, old_len, new_len, new_addr); } + + /* + * On failure, uncharge the initially charged size. + * On success the old size is uncharged in do_munmap called from + * move_vma. */ +out_uncharge: + if (ret == -ENOMEM) + uncharge_memory(vma->vm_mm->beancounter, new_len, + vma->vm_flags); out: return ret; } diff -ruN -X excl linux-lt-2.4.0-test7.prev/mm/vmscan.c linux-lt-2.4.0-test7/mm/vmscan.c --- linux-lt-2.4.0-test7.prev/mm/vmscan.c Thu Aug 10 11:42:10 2000 +++ linux-lt-2.4.0-test7/mm/vmscan.c Fri Aug 25 17:12:24 2000 @@ -11,6 +11,8 @@ * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). */ +#define UB_VM_DEBUG 0 + #include #include #include @@ -51,6 +53,9 @@ if (mm->swap_cnt) mm->swap_cnt--; + if (mm->beancounter && mm->beancounter->ub_swp_pri) + mm->beancounter->ub_swp_pri--; + /* Don't look at this pte if it's been accessed recently. */ if (pte_young(pte)) { /* @@ -337,6 +342,17 @@ return 0; } +static inline void update_ub_swp_pri(struct mm_struct* mm) +{ + if (mm->beancounter) { + mm->beancounter->ub_swp_pri = + mm->beancounter->ub_held[UB_RESPAGES] + - mm->beancounter->ub_barrier[UB_RESPAGES]; + if (mm->beancounter->ub_swp_pri < 0) + mm->beancounter->ub_swp_pri = 0; + } +} + /* * Select the task with maximal swap_cnt and try to swap out a page. * N.B. This function returns only 0 or 1. Return values != 1 from @@ -345,6 +361,7 @@ static int swap_out(unsigned int priority, int gfp_mask) { struct task_struct * p; + struct task_struct * pb; int counter; int __ret = 0; @@ -369,6 +386,8 @@ for (; counter >= 0; counter--) { unsigned long max_cnt = 0; + unsigned long max_overdraft = 0; + signed long ub_swp_pri = 0; struct mm_struct *best = NULL; int pid = 0; int assign = 0; @@ -381,25 +400,52 @@ continue; if (mm->rss <= 0) continue; - /* Refresh swap_cnt? */ - if (assign == 1) + /* Refresh swap_cnt and ub_swp_pri?
*/ + if (assign == 1) { + /* updates of beancounter->ub_swp_pri happen only + * here, under the kernel lock - we do not need + * the ub_lock yet */ + update_ub_swp_pri(mm); mm->swap_cnt = mm->rss; - if (mm->swap_cnt > max_cnt) { + } + if (mm->beancounter) { + ub_swp_pri = mm->beancounter->ub_swp_pri; + if (ub_swp_pri <= 0) + continue; + } + else + ub_swp_pri = 0; + if ((ub_swp_pri > max_overdraft && mm->swap_cnt > 0) || + (ub_swp_pri == max_overdraft && + mm->swap_cnt > max_cnt)) { + max_overdraft = ub_swp_pri; max_cnt = mm->swap_cnt; best = mm; pid = p->pid; + pb = p; } } read_unlock(&tasklist_lock); if (!best) { if (!assign) { assign = 1; +#if UB_VM_DEBUG + printk(KERN_DEBUG "Reassign happened...\n"); +#endif goto select; } goto out; } else { int ret; +#if UB_VM_DEBUG + printk("best is pid %d swap_cnt %d cmd %.20s ub %c\n", + pid, best->swap_cnt, pb->comm, + best->beancounter?'+':'-'); + if (best->beancounter) + printk("ub_swp_pri is %d\n", + best->beancounter->ub_swp_pri); +#endif atomic_inc(&best->mm_count); ret = swap_out_mm(best, gfp_mask); mmdrop(best); diff -ruN -X excl linux-lt-2.4.0-test7.prev/net/core/sock.c linux-lt-2.4.0-test7/net/core/sock.c --- linux-lt-2.4.0-test7.prev/net/core/sock.c Fri Aug 25 11:19:32 2000 +++ linux-lt-2.4.0-test7/net/core/sock.c Fri Aug 25 17:22:20 2000 @@ -169,7 +169,7 @@ #ifdef CONFIG_FILTER struct sk_filter *filter; #endif - int val; + int val, t_val; int valbool; int err; struct linger ling; @@ -231,8 +231,16 @@ if (val > sysctl_wmem_max) val = sysctl_wmem_max; + t_val = max(val*2,SOCK_MIN_SNDBUF); + if (t_val > sk->sndbuf_charged && sk->beancounter) { + if ((ret = charge_kmem(sk->beancounter, + t_val - sk->sndbuf_charged, 1))) + break; + sk->sndbuf_charged = t_val; + } + sk->userlocks |= SOCK_SNDBUF_LOCK; - sk->sndbuf = max(val*2,SOCK_MIN_SNDBUF); + sk->sndbuf = t_val; /* * Wake up sending tasks if we @@ -252,7 +260,14 @@ sk->userlocks |= SOCK_RCVBUF_LOCK; /* FIXME: is this lower bound the right one?
*/ - sk->rcvbuf = max(val*2,SOCK_MIN_RCVBUF); + t_val = max(val*2,SOCK_MIN_RCVBUF); + if (t_val > sk->rcvbuf_charged && sk->beancounter) { + if ((ret = charge_kmem(sk->beancounter, + t_val - sk->rcvbuf_charged, 1))) + break; + sk->rcvbuf_charged = t_val; + } + sk->rcvbuf = t_val; break; case SO_KEEPALIVE: @@ -584,6 +599,8 @@ void sk_free(struct sock *sk) { + struct user_beancounter *bc; + int charged; #ifdef CONFIG_FILTER struct sk_filter *filter; #endif @@ -602,7 +619,13 @@ if (atomic_read(&sk->omem_alloc)) printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc)); + bc = sk->beancounter; + charged = sizeof(struct sock) + sk->rcvbuf_charged + sk->sndbuf_charged; + kmem_cache_free(sk_cachep, sk); + + uncharge_sock(bc, charged); + put_beancounter(bc); } void __init sk_init(void) diff -ruN -X excl linux-lt-2.4.0-test7.prev/net/ipv4/af_inet.c linux-lt-2.4.0-test7/net/ipv4/af_inet.c --- linux-lt-2.4.0-test7.prev/net/ipv4/af_inet.c Fri Aug 25 15:31:09 2000 +++ linux-lt-2.4.0-test7/net/ipv4/af_inet.c Fri Aug 25 17:12:24 2000 @@ -81,6 +81,7 @@ #include #include #include +#include <linux/beancounter.h> #include #include @@ -308,6 +309,12 @@ { struct sock *sk; struct proto *prot; + int charge; + + charge = sizeof(struct sock) + + sysctl_wmem_default + sysctl_rmem_default; + if (charge_sock(sock->beancounter, charge)) + goto ret_oom; sock->state = SS_UNCONNECTED; sk = sk_alloc(PF_INET, GFP_KERNEL, 1); @@ -354,6 +361,10 @@ sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT; sock_init_data(sock,sk); + sk->beancounter = sock->beancounter; + get_beancounter(sk->beancounter); + sk->rcvbuf_charged = sysctl_rmem_default; + sk->sndbuf_charged = sysctl_wmem_default; sk->destruct = inet_sock_destruct; @@ -399,17 +410,22 @@ free_and_badtype: sk_free(sk); + uncharge_sock(sock->beancounter, charge); return -ESOCKTNOSUPPORT; free_and_badperm: sk_free(sk); + uncharge_sock(sock->beancounter, charge); return -EPERM; free_and_noproto: sk_free(sk); + uncharge_sock(sock->beancounter, charge); return -EPROTONOSUPPORT; do_oom: + uncharge_sock(sock->beancounter, charge); +ret_oom: return -ENOBUFS; } @@ -419,7 +435,7 @@ * function we are destroying the object and from then on nobody * should refer to it.
*/ - + int inet_release(struct socket *sock) { struct sock *sk = sock->sk; diff -ruN -X excl linux-lt-2.4.0-test7.prev/net/ipv4/tcp_minisocks.c linux-lt-2.4.0-test7/net/ipv4/tcp_minisocks.c --- linux-lt-2.4.0-test7.prev/net/ipv4/tcp_minisocks.c Fri Aug 25 11:19:33 2000 +++ linux-lt-2.4.0-test7/net/ipv4/tcp_minisocks.c Fri Aug 25 17:36:54 2000 @@ -641,9 +641,16 @@ */ struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb) { - struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0); + struct sock *newsk; - if(newsk != NULL) { + if (charge_sock(sk->beancounter, + sizeof(struct sock) + sk->rcvbuf + sk->sndbuf)) + goto charge_fail; + newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0); + if (newsk == NULL) + goto alloc_fail; + + { struct tcp_opt *newtp; #ifdef CONFIG_FILTER struct sk_filter *filter; @@ -752,6 +759,10 @@ tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newsk->socket = NULL; newsk->sleep = NULL; + /* newsk->beancounter = sk->beancounter; -- copied */ + get_beancounter(newsk->beancounter); + newsk->rcvbuf_charged = sk->rcvbuf; + newsk->sndbuf_charged = sk->sndbuf; newtp->tstamp_ok = req->tstamp_ok; if((newtp->sack_ok = req->sack_ok) != 0) { @@ -786,6 +797,15 @@ TCP_ECN_openreq_child(newtp, req); } return newsk; + +alloc_fail: + uncharge_sock(sk->beancounter, + sizeof(struct sock) + sk->rcvbuf + sk->sndbuf); + return NULL; +charge_fail: + if (net_ratelimit()) + printk(KERN_WARNING "no resources, can't create socket.\n"); + return NULL; } /* diff -ruN -X excl linux-lt-2.4.0-test7.prev/net/socket.c linux-lt-2.4.0-test7/net/socket.c --- linux-lt-2.4.0-test7.prev/net/socket.c Fri Aug 25 12:04:16 2000 +++ linux-lt-2.4.0-test7/net/socket.c Fri Aug 25 17:12:24 2000 @@ -71,6 +71,7 @@ #include #include #include +#include <linux/beancounter.h> #if defined(CONFIG_KMOD) && defined(CONFIG_NET) #include @@ -466,6 +467,10 @@ sock->file = NULL; sockets_in_use[smp_processor_id()].counter++; + + sock->beancounter = current->login_bc; + get_beancounter(sock->beancounter); + return sock; } @@ -503,6 +508,9 @@ return; } sock->file=NULL; + + put_beancounter(sock->beancounter); + sock->beancounter = NULL; } int sock_sendmsg(struct socket *sock, struct msghdr *msg, int size)
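
A note on the accounting primitives at the top of this patch: __charge_sock(), __charge_flock() and __charge_siginfo() are three copies of the same compound-charge pattern - take ub_lock once, charge UB_KMEMSIZE, then charge the object count, and roll the kmem charge back if the count hits its limit. If further paired resources get added, the copies could be folded into one helper along these lines (a sketch against this patch's locked primitives, not code the patch contains):

static int charge_kmem_pair(struct user_beancounter *ub,
		int count_resource, unsigned long size)
{
	int retval;
	unsigned long flags;

	spin_lock_irqsave(&ub->ub_lock, flags);
	retval = __charge_beancounter_locked(ub, UB_KMEMSIZE, size, 1);
	if (!retval) {
		retval = __charge_beancounter_locked(ub, count_resource, 1, 1);
		if (retval)
			/* roll back so a failed charge never leaks kmem */
			__uncharge_beancounter_locked(ub, UB_KMEMSIZE, size);
	}
	spin_unlock_irqrestore(&ub->ub_lock, flags);
	return retval;
}

__charge_sock(ub, size) would then reduce to charge_kmem_pair(ub, UB_NUMSOCK, size), and likewise for UB_NUMFLOCK and UB_NUMSIGINFO.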
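
The /proc/user_beancounters format is deliberately seekable: every record is padded to exactly UB_PROC_LINE_LEN bytes, so ub_proc_read() can turn any file offset into a (record, byte-in-record) pair with one shift and one mask instead of regenerating the whole report on each read(2). A toy userspace check of that arithmetic, using the 32-bit constants from this patch:

#include <stdio.h>

#define UB_PROC_LINE_SHIFT 6			/* 32-bit layout: 64-byte lines */
#define UB_PROC_LINE_LEN (1 << UB_PROC_LINE_SHIFT)

int main(void)
{
	long poff = 200;	/* an arbitrary file offset */

	printf("record %ld, byte %ld\n",
	       poff >> UB_PROC_LINE_SHIFT,	/* 200 / 64 = record 3 */
	       poff & (UB_PROC_LINE_LEN - 1));	/* 200 % 64 = byte 8 */
	return 0;
}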
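
For anyone who wants to poke at the new syscalls from userspace, a minimal sketch follows. There are no userspace headers yet, so the syscall numbers (my reading of the i386 assignments in this patch - check entry.S for your tree) and the UB_NUMPTY index (position 11 in the ub_rnames[] order) are both assumptions defined by hand. Run it as root: setluid needs CAP_SETUID and can be set only once, setublimit needs CAP_SYS_RESOURCE.

/* ub-demo.c - exercise getluid/setluid/setublimit (sketch) */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/resource.h>

#define __NR_getluid	222	/* assumed i386 slots */
#define __NR_setluid	223
#define __NR_setublimit	224

#define UB_NUMPTY	11	/* index taken from the ub_rnames[] order */

int main(void)
{
	struct rlimit rl = { 4, 4 };	/* rlim_cur -> barrier, rlim_max -> limit */

	if (syscall(__NR_setluid, (uid_t)500) != 0)
		perror("setluid");	/* EPERM, or EINVAL if already set */
	printf("luid: %ld\n", syscall(__NR_getluid));

	/* allow uid 500 at most 4 ptys */
	if (syscall(__NR_setublimit, (uid_t)500, (unsigned long)UB_NUMPTY, &rl) != 0)
		perror("setublimit");
	return 0;
}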
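
The kubd bookkeeping is easiest to see with concrete numbers. Each resident page adds UB_PAGE_WEIGHT/nref to every one of the nref mms referencing it, so a page shared N ways still costs about one page in total, split evenly, and the accumulated sums collapse back into page counts with a single right shift. A toy rendition of the arithmetic (userspace, not kernel code):

#include <stdio.h>

#define UB_PAGE_WEIGHT_SHIFT 24
#define UB_PAGE_WEIGHT (1ULL << UB_PAGE_WEIGHT_SHIFT)

int main(void)
{
	/* beancounter A maps 10 private pages plus one page
	 * shared with beancounter B (nref == 2 for that page) */
	unsigned long long held_a = 10 * UB_PAGE_WEIGHT + UB_PAGE_WEIGHT / 2;
	unsigned long long held_b = UB_PAGE_WEIGHT / 2;

	/* what kubd stores in ub_held[UB_RESPAGES] */
	printf("A: %llu pages, B: %llu pages\n",
	       held_a >> UB_PAGE_WEIGHT_SHIFT,	/* 10 - 10.5 rounds down */
	       held_b >> UB_PAGE_WEIGHT_SHIFT);	/* 0 */
	return 0;
}

This also makes the restrictions in the kubd.c header comment concrete: the 2^24 weight keeps the per-reference quantum well above zero for any realistic number of mms, and the 64-bit accumulators would need on the order of 2^40 physical pages before overflowing.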