diff -urN linux-2.4.16-reiserfspatches-immutable/Makefile linux-2.4.16-reiserfspatches-immutable-ctx4/Makefile
--- linux-2.4.16-reiserfspatches-immutable/Makefile	Mon Dec 10 13:12:57 2001
+++ linux-2.4.16-reiserfspatches-immutable-ctx4/Makefile	Mon Dec 10 15:01:37 2001
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 16
-EXTRAVERSION =
+EXTRAVERSION =ctx-4
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
diff -urN linux-2.4.16-reiserfspatches-immutable/arch/i386/kernel/entry.S linux-2.4.16-reiserfspatches-immutable-ctx4/arch/i386/kernel/entry.S
--- linux-2.4.16-reiserfspatches-immutable/arch/i386/kernel/entry.S	Mon Dec 10 13:12:28 2001
+++ linux-2.4.16-reiserfspatches-immutable-ctx4/arch/i386/kernel/entry.S	Mon Dec 10 15:01:37 2001
@@ -622,6 +622,8 @@
 	.long SYMBOL_NAME(sys_ni_syscall)	/* Reserved for Security */
 	.long SYMBOL_NAME(sys_gettid)
 	.long SYMBOL_NAME(sys_readahead)	/* 225 */
+	.long SYMBOL_NAME(sys_new_s_context)
+	.long SYMBOL_NAME(sys_set_ipv4root)
 
 	.rept NR_syscalls-(.-sys_call_table)/4
 		.long SYMBOL_NAME(sys_ni_syscall)
diff -urN linux-2.4.16-reiserfspatches-immutable/arch/i386/kernel/ptrace.c linux-2.4.16-reiserfspatches-immutable-ctx4/arch/i386/kernel/ptrace.c
--- linux-2.4.16-reiserfspatches-immutable/arch/i386/kernel/ptrace.c	Mon Dec 10 13:12:42 2001
+++ linux-2.4.16-reiserfspatches-immutable-ctx4/arch/i386/kernel/ptrace.c	Mon Dec 10 15:01:37 2001
@@ -170,7 +170,7 @@
 	if (child)
 		get_task_struct(child);
 	read_unlock(&tasklist_lock);
-	if (!child)
+	if (!child || child->s_context != current->s_context)
 		goto out;
 	ret = -EPERM;
diff -urN linux-2.4.16-reiserfspatches-immutable/fs/exec.c linux-2.4.16-reiserfspatches-immutable-ctx4/fs/exec.c
--- linux-2.4.16-reiserfspatches-immutable/fs/exec.c	Mon Dec 10 13:12:37 2001
+++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/exec.c	Mon Dec 10 15:01:37 2001
@@ -685,7 +685,7 @@
 	kernel_cap_t new_permitted, working;
 	int do_unlock = 0;
 
-	new_permitted = cap_intersect(bprm->cap_permitted, cap_bset);
+	new_permitted = cap_intersect(bprm->cap_permitted, current->cap_bset);
 	working = cap_intersect(bprm->cap_inheritable,
 				current->cap_inheritable);
 	new_permitted = cap_combine(new_permitted, working);
diff -urN linux-2.4.16-reiserfspatches-immutable/fs/ext2/ialloc.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext2/ialloc.c~
--- linux-2.4.16-reiserfspatches-immutable/fs/ext2/ialloc.c~	Thu Jan  1 01:00:00 1970
+++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext2/ialloc.c~	Mon Dec 10 14:28:03 2001
@@ -0,0 +1,510 @@
+/*
+ *  linux/fs/ext2/ialloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  BSD ufs-inspired inode and directory allocation by
+ *  Stephen Tweedie (sct@dcs.ed.ac.uk), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/ext2_fs.h>
+#include <linux/locks.h>
+#include <linux/quotaops.h>
+
+
+/*
+ * ialloc.c contains the inodes allocation and deallocation routines
+ */
+
+/*
+ * The free inodes are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.
The descriptors are loaded in memory + * when a file system is mounted (see ext2_read_super). + */ + + +/* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return buffer_head of bitmap on success or NULL. + */ +static struct buffer_head *read_inode_bitmap (struct super_block * sb, + unsigned long block_group) +{ + struct ext2_group_desc *desc; + struct buffer_head *bh = NULL; + + desc = ext2_get_group_desc(sb, block_group, NULL); + if (!desc) + goto error_out; + + bh = bread(sb->s_dev, le32_to_cpu(desc->bg_inode_bitmap), + sb->s_blocksize); + if (!bh) + ext2_error (sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %lu", + block_group, (unsigned long) desc->bg_inode_bitmap); +error_out: + return bh; +} + +/* + * load_inode_bitmap loads the inode bitmap for a blocks group + * + * It maintains a cache for the last bitmaps loaded. This cache is managed + * with a LRU algorithm. + * + * Notes: + * 1/ There is one cache per mounted file system. + * 2/ If the file system contains less than EXT2_MAX_GROUP_LOADED groups, + * this function reads the bitmap without maintaining a LRU cache. + * + * Return the buffer_head of the bitmap or the ERR_PTR(error) + */ +static struct buffer_head *load_inode_bitmap (struct super_block * sb, + unsigned int block_group) +{ + int i, slot = 0; + struct ext2_sb_info *sbi = &sb->u.ext2_sb; + struct buffer_head *bh = sbi->s_inode_bitmap[0]; + + if (block_group >= sbi->s_groups_count) + ext2_panic (sb, "load_inode_bitmap", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + + if (sbi->s_loaded_inode_bitmaps > 0 && + sbi->s_inode_bitmap_number[0] == block_group && bh) + goto found; + + if (sbi->s_groups_count <= EXT2_MAX_GROUP_LOADED) { + slot = block_group; + bh = sbi->s_inode_bitmap[slot]; + if (!bh) + goto read_it; + if (sbi->s_inode_bitmap_number[slot] == slot) + goto found; + ext2_panic (sb, "load_inode_bitmap", + "block_group != inode_bitmap_number"); + } + + bh = NULL; + for (i = 0; i < sbi->s_loaded_inode_bitmaps && + sbi->s_inode_bitmap_number[i] != block_group; + i++) + ; + if (i < sbi->s_loaded_inode_bitmaps) + bh = sbi->s_inode_bitmap[i]; + else if (sbi->s_loaded_inode_bitmaps < EXT2_MAX_GROUP_LOADED) + sbi->s_loaded_inode_bitmaps++; + else + brelse (sbi->s_inode_bitmap[--i]); + + while (i--) { + sbi->s_inode_bitmap_number[i+1] = sbi->s_inode_bitmap_number[i]; + sbi->s_inode_bitmap[i+1] = sbi->s_inode_bitmap[i]; + } + +read_it: + if (!bh) + bh = read_inode_bitmap (sb, block_group); + sbi->s_inode_bitmap_number[slot] = block_group; + sbi->s_inode_bitmap[slot] = bh; + if (!bh) + return ERR_PTR(-EIO); +found: + return bh; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. 
+ */
+void ext2_free_inode (struct inode * inode)
+{
+	struct super_block * sb = inode->i_sb;
+	int is_directory;
+	unsigned long ino;
+	struct buffer_head * bh;
+	struct buffer_head * bh2;
+	unsigned long block_group;
+	unsigned long bit;
+	struct ext2_group_desc * desc;
+	struct ext2_super_block * es;
+
+	ino = inode->i_ino;
+	ext2_debug ("freeing inode %lu\n", ino);
+
+	/*
+	 * Note: we must free any quota before locking the superblock,
+	 * as writing the quota to disk may need the lock as well.
+	 */
+	if (!is_bad_inode(inode)) {
+		/* Quota is already initialized in iput() */
+		DQUOT_FREE_INODE(inode);
+		DQUOT_DROP(inode);
+	}
+
+	lock_super (sb);
+	es = sb->u.ext2_sb.s_es;
+	is_directory = S_ISDIR(inode->i_mode);
+
+	/* Do this BEFORE marking the inode not in use or returning an error */
+	clear_inode (inode);
+
+	if (ino < EXT2_FIRST_INO(sb) ||
+	    ino > le32_to_cpu(es->s_inodes_count)) {
+		ext2_error (sb, "ext2_free_inode",
+			    "reserved or nonexistent inode %lu", ino);
+		goto error_return;
+	}
+	block_group = (ino - 1) / EXT2_INODES_PER_GROUP(sb);
+	bit = (ino - 1) % EXT2_INODES_PER_GROUP(sb);
+	bh = load_inode_bitmap (sb, block_group);
+	if (IS_ERR(bh))
+		goto error_return;
+
+	/* Ok, now we can actually update the inode bitmaps.. */
+	if (!ext2_clear_bit (bit, bh->b_data))
+		ext2_error (sb, "ext2_free_inode",
+			    "bit already cleared for inode %lu", ino);
+	else {
+		desc = ext2_get_group_desc (sb, block_group, &bh2);
+		if (desc) {
+			desc->bg_free_inodes_count =
+				cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) + 1);
+			if (is_directory)
+				desc->bg_used_dirs_count =
+					cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) - 1);
+		}
+		mark_buffer_dirty(bh2);
+		es->s_free_inodes_count =
+			cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
+		mark_buffer_dirty(sb->u.ext2_sb.s_sbh);
+	}
+	mark_buffer_dirty(bh);
+	if (sb->s_flags & MS_SYNCHRONOUS) {
+		ll_rw_block (WRITE, 1, &bh);
+		wait_on_buffer (bh);
+	}
+	sb->s_dirt = 1;
+error_return:
+	unlock_super (sb);
+}
+
+/*
+ * There are two policies for allocating an inode.  If the new inode is
+ * a directory, then a forward search is made for a block group with both
+ * free space and a low directory-to-inode ratio; if that fails, then of
+ * the groups with above-average free space, that group with the fewest
+ * directories already is chosen.
+ *
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */ + +static int find_group_dir(struct super_block *sb, int parent_group) +{ + struct ext2_super_block * es = sb->u.ext2_sb.s_es; + int ngroups = sb->u.ext2_sb.s_groups_count; + int avefreei = le32_to_cpu(es->s_free_inodes_count) / ngroups; + struct ext2_group_desc *desc, *best_desc = NULL; + struct buffer_head *bh, *best_bh = NULL; + int group, best_group = -1; + + for (group = 0; group < ngroups; group++) { + desc = ext2_get_group_desc (sb, group, &bh); + if (!desc || !desc->bg_free_inodes_count) + continue; + if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + continue; + if (!best_desc || + (le16_to_cpu(desc->bg_free_blocks_count) > + le16_to_cpu(best_desc->bg_free_blocks_count))) { + best_group = group; + best_desc = desc; + best_bh = bh; + } + } + if (!best_desc) + return -1; + best_desc->bg_free_inodes_count = + cpu_to_le16(le16_to_cpu(best_desc->bg_free_inodes_count) - 1); + best_desc->bg_used_dirs_count = + cpu_to_le16(le16_to_cpu(best_desc->bg_used_dirs_count) + 1); + mark_buffer_dirty(best_bh); + return best_group; +} + +static int find_group_other(struct super_block *sb, int parent_group) +{ + int ngroups = sb->u.ext2_sb.s_groups_count; + struct ext2_group_desc *desc; + struct buffer_head *bh; + int group, i; + + /* + * Try to place the inode in its parent directory + */ + group = parent_group; + desc = ext2_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + goto found; + + /* + * Use a quadratic hash to find a group with a + * free inode + */ + for (i = 1; i < ngroups; i <<= 1) { + group += i; + if (group >= ngroups) + group -= ngroups; + desc = ext2_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + goto found; + } + + /* + * That failed: try linear search for a free inode + */ + group = parent_group + 1; + for (i = 2; i < ngroups; i++) { + if (++group >= ngroups) + group = 0; + desc = ext2_get_group_desc (sb, group, &bh); + if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + goto found; + } + + return -1; + +found: + desc->bg_free_inodes_count = + cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) - 1); + mark_buffer_dirty(bh); + return group; +} + +struct inode * ext2_new_inode (const struct inode * dir, int mode) +{ + struct super_block * sb; + struct buffer_head * bh; + struct buffer_head * bh2; + int group, i; + ino_t ino; + struct inode * inode; + struct ext2_group_desc * desc; + struct ext2_super_block * es; + int err; + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + lock_super (sb); + es = sb->u.ext2_sb.s_es; +repeat: + if (S_ISDIR(mode)) + group = find_group_dir(sb, dir->u.ext2_i.i_block_group); + else + group = find_group_other(sb, dir->u.ext2_i.i_block_group); + + err = -ENOSPC; + if (group == -1) + goto fail; + + err = -EIO; + bh = load_inode_bitmap (sb, group); + if (IS_ERR(bh)) + goto fail2; + + i = ext2_find_first_zero_bit ((unsigned long *) bh->b_data, + EXT2_INODES_PER_GROUP(sb)); + if (i >= EXT2_INODES_PER_GROUP(sb)) + goto bad_count; + ext2_set_bit (i, bh->b_data); + + mark_buffer_dirty(bh); + if (sb->s_flags & MS_SYNCHRONOUS) { + ll_rw_block (WRITE, 1, &bh); + wait_on_buffer (bh); + } + + ino = group * EXT2_INODES_PER_GROUP(sb) + i + 1; + if (ino < EXT2_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext2_error (sb, "ext2_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d,inode=%ld", group, ino); + err = -EIO; + goto fail2; + } + + es->s_free_inodes_count = + 
cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); + mark_buffer_dirty(sb->u.ext2_sb.s_sbh); + sb->s_dirt = 1; + inode->i_uid = current->fsuid; + if (test_opt (sb, GRPID)) + inode->i_gid = dir->i_gid; + else if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current->fsgid; + inode->i_mode = mode; + + inode->i_ino = ino; + inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->u.ext2_i.i_new_inode = 1; + inode->u.ext2_i.i_flags = dir->u.ext2_i.i_flags; + if (S_ISLNK(mode)) + inode->u.ext2_i.i_flags &= ~(EXT2_IMMUTABLE_FILE_FL|EXT2_IMMUTABLE_LINK_FL|EXT2_APPEND_FL); + inode->u.ext2_i.i_block_group = group; + if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) + inode->i_flags |= S_SYNC; + insert_inode_hash(inode); + inode->i_generation = event++; + mark_inode_dirty(inode); + + unlock_super (sb); + if(DQUOT_ALLOC_INODE(inode)) { + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + iput(inode); + return ERR_PTR(-EDQUOT); + } + ext2_debug ("allocating inode %lu\n", inode->i_ino); + return inode; + +fail2: + desc = ext2_get_group_desc (sb, group, &bh2); + desc->bg_free_inodes_count = + cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) + 1); + if (S_ISDIR(mode)) + desc->bg_used_dirs_count = + cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) - 1); + mark_buffer_dirty(bh2); +fail: + unlock_super(sb); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); + +bad_count: + ext2_error (sb, "ext2_new_inode", + "Free inodes count corrupted in group %d", + group); + /* Is it really ENOSPC? */ + err = -ENOSPC; + if (sb->s_flags & MS_RDONLY) + goto fail; + + desc = ext2_get_group_desc (sb, group, &bh2); + desc->bg_free_inodes_count = 0; + mark_buffer_dirty(bh2); + goto repeat; +} + +unsigned long ext2_count_free_inodes (struct super_block * sb) +{ +#ifdef EXT2FS_DEBUG + struct ext2_super_block * es; + unsigned long desc_count = 0, bitmap_count = 0; + int i; + + lock_super (sb); + es = sb->u.ext2_sb.s_es; + for (i = 0; i < sb->u.ext2_sb.s_groups_count; i++) { + struct ext2_group_desc *desc = ext2_get_group_desc (sb, i, NULL); + struct buffer_head *bh; + unsigned x; + + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_inodes_count); + bh = load_inode_bitmap (sb, i); + if (IS_ERR(bh)) + continue; + + x = ext2_count_free (bh, EXT2_INODES_PER_GROUP(sb) / 8); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(desc->bg_free_inodes_count), x); + bitmap_count += x; + } + printk("ext2_count_free_inodes: stored = %lu, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + unlock_super (sb); + return desc_count; +#else + return le32_to_cpu(sb->u.ext2_sb.s_es->s_free_inodes_count); +#endif +} + +#ifdef CONFIG_EXT2_CHECK +/* Called at mount-time, super-block is locked */ +void ext2_check_inodes_bitmap (struct super_block * sb) +{ + struct ext2_super_block * es = sb->u.ext2_sb.s_es; + unsigned long desc_count = 0, bitmap_count = 0; + int i; + + for (i = 0; i < sb->u.ext2_sb.s_groups_count; i++) { + struct ext2_group_desc *desc = ext2_get_group_desc(sb, i, NULL); + struct buffer_head *bh; + unsigned x; + + if (!desc) + continue; + desc_count += le16_to_cpu(desc->bg_free_inodes_count); + bh = load_inode_bitmap (sb, i); + if (IS_ERR(bh)) + continue; + + x = ext2_count_free (bh, EXT2_INODES_PER_GROUP(sb) / 8); + 
if (le16_to_cpu(desc->bg_free_inodes_count) != x)
+			ext2_error (sb, "ext2_check_inodes_bitmap",
+				    "Wrong free inodes count in group %d, "
+				    "stored = %d, counted = %lu", i,
+				    le16_to_cpu(desc->bg_free_inodes_count), x);
+		bitmap_count += x;
+	}
+	if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count)
+		ext2_error (sb, "ext2_check_inodes_bitmap",
+			    "Wrong free inodes count in super block, "
+			    "stored = %lu, counted = %lu",
+			    (unsigned long)le32_to_cpu(es->s_free_inodes_count),
+			    bitmap_count);
+}
+#endif
diff -urN linux-2.4.16-reiserfspatches-immutable/fs/ext2/inode.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext2/inode.c~
--- linux-2.4.16-reiserfspatches-immutable/fs/ext2/inode.c~	Thu Jan  1 01:00:00 1970
+++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext2/inode.c~	Mon Dec 10 14:28:03 2001
@@ -0,0 +1,1165 @@
+/*
+ *  linux/fs/ext2/inode.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  from
+ *
+ *  linux/fs/minix/inode.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Goal-directed block allocation by Stephen Tweedie
+ * 	(sct@dcs.ed.ac.uk), 1993, 1998
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ *  64-bit file support on 64-bit platforms by Jakub Jelinek
+ * 	(jj@sunsite.ms.mff.cuni.cz)
+ *
+ *  Assorted race fixes, rewrite of ext2_get_block() by Al Viro, 2000
+ */
+
+#include <linux/fs.h>
+#include <linux/ext2_fs.h>
+#include <linux/locks.h>
+#include <linux/smp_lock.h>
+#include <linux/sched.h>
+#include <linux/highuid.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+
+MODULE_AUTHOR("Remy Card and others");
+MODULE_DESCRIPTION("Second Extended Filesystem");
+MODULE_LICENSE("GPL");
+
+
+static int ext2_update_inode(struct inode * inode, int do_sync);
+
+/*
+ * Called at each iput()
+ */
+void ext2_put_inode (struct inode * inode)
+{
+	ext2_discard_prealloc (inode);
+}
+
+/*
+ * Called at the last iput() if i_nlink is zero.
+ */
+void ext2_delete_inode (struct inode * inode)
+{
+	lock_kernel();
+
+	if (is_bad_inode(inode) ||
+	    inode->i_ino == EXT2_ACL_IDX_INO ||
+	    inode->i_ino == EXT2_ACL_DATA_INO)
+		goto no_delete;
+	inode->u.ext2_i.i_dtime	= CURRENT_TIME;
+	mark_inode_dirty(inode);
+	ext2_update_inode(inode, IS_SYNC(inode));
+	inode->i_size = 0;
+	if (inode->i_blocks)
+		ext2_truncate (inode);
+	ext2_free_inode (inode);
+
+	unlock_kernel();
+	return;
+no_delete:
+	unlock_kernel();
+	clear_inode(inode);	/* We must guarantee clearing of inode...
*/ +} + +void ext2_discard_prealloc (struct inode * inode) +{ +#ifdef EXT2_PREALLOCATE + lock_kernel(); + /* Writer: ->i_prealloc* */ + if (inode->u.ext2_i.i_prealloc_count) { + unsigned short total = inode->u.ext2_i.i_prealloc_count; + unsigned long block = inode->u.ext2_i.i_prealloc_block; + inode->u.ext2_i.i_prealloc_count = 0; + inode->u.ext2_i.i_prealloc_block = 0; + /* Writer: end */ + ext2_free_blocks (inode, block, total); + } + unlock_kernel(); +#endif +} + +static int ext2_alloc_block (struct inode * inode, unsigned long goal, int *err) +{ +#ifdef EXT2FS_DEBUG + static unsigned long alloc_hits = 0, alloc_attempts = 0; +#endif + unsigned long result; + + +#ifdef EXT2_PREALLOCATE + /* Writer: ->i_prealloc* */ + if (inode->u.ext2_i.i_prealloc_count && + (goal == inode->u.ext2_i.i_prealloc_block || + goal + 1 == inode->u.ext2_i.i_prealloc_block)) + { + result = inode->u.ext2_i.i_prealloc_block++; + inode->u.ext2_i.i_prealloc_count--; + /* Writer: end */ + ext2_debug ("preallocation hit (%lu/%lu).\n", + ++alloc_hits, ++alloc_attempts); + } else { + ext2_discard_prealloc (inode); + ext2_debug ("preallocation miss (%lu/%lu).\n", + alloc_hits, ++alloc_attempts); + if (S_ISREG(inode->i_mode)) + result = ext2_new_block (inode, goal, + &inode->u.ext2_i.i_prealloc_count, + &inode->u.ext2_i.i_prealloc_block, err); + else + result = ext2_new_block (inode, goal, 0, 0, err); + } +#else + result = ext2_new_block (inode, goal, 0, 0, err); +#endif + return result; +} + +typedef struct { + u32 *p; + u32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext2_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * + * To store the locations of file's data ext2 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
+ */ + +static int ext2_block_to_path(struct inode *inode, long i_block, int offsets[4]) +{ + int ptrs = EXT2_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT2_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT2_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + + if (i_block < 0) { + ext2_warning (inode->i_sb, "ext2_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT2_IND_BLOCK; + offsets[n++] = i_block; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT2_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT2_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + } else { + ext2_warning (inode->i_sb, "ext2_block_to_path", "block > big"); + } + return n; +} + +/** + * ext2_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + */ +static Indirect *ext2_get_branch(struct inode *inode, + int depth, + int *offsets, + Indirect chain[4], + int *err) +{ + kdev_t dev = inode->i_dev; + int size = inode->i_sb->s_blocksize; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, inode->u.ext2_i.i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = bread(dev, le32_to_cpu(p->key), size); + if (!bh) + goto failure; + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (u32*)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +changed: + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext2_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. 
+ *
+ *	This function returns the preferred place for block allocation.
+ *	It is used when the heuristic for sequential allocation fails.
+ *	Rules are:
+ *	  + if there is a block to the left of our position - allocate near it.
+ *	  + if pointer will live in indirect block - allocate near that block.
+ *	  + if pointer will live in inode - allocate in the same cylinder group.
+ *	Caller must make sure that @ind is valid and will stay that way.
+ */
+
+static inline unsigned long ext2_find_near(struct inode *inode, Indirect *ind)
+{
+	u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext2_i.i_data;
+	u32 *p;
+
+	/* Try to find previous block */
+	for (p = ind->p - 1; p >= start; p--)
+		if (*p)
+			return le32_to_cpu(*p);
+
+	/* No such thing, so let's try location of indirect block */
+	if (ind->bh)
+		return ind->bh->b_blocknr;
+
+	/*
+	 * It is going to be referred to from the inode itself? OK, just put
+	 * it into the same cylinder group then.
+	 */
+	return (inode->u.ext2_i.i_block_group *
+		EXT2_BLOCKS_PER_GROUP(inode->i_sb)) +
+	       le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_first_data_block);
+}
+
+/**
+ *	ext2_find_goal - find a preferred place for allocation.
+ *	@inode: owner
+ *	@block: block we want
+ *	@chain: chain of indirect blocks
+ *	@partial: pointer to the last triple within a chain
+ *	@goal: place to store the result.
+ *
+ *	Normally this function finds the preferred place for block allocation,
+ *	stores it in *@goal and returns zero. If the branch had been changed
+ *	under us we return -EAGAIN.
+ */
+
+static inline int ext2_find_goal(struct inode *inode,
+				 long block,
+				 Indirect chain[4],
+				 Indirect *partial,
+				 unsigned long *goal)
+{
+	/* Writer: ->i_next_alloc* */
+	if (block == inode->u.ext2_i.i_next_alloc_block + 1) {
+		inode->u.ext2_i.i_next_alloc_block++;
+		inode->u.ext2_i.i_next_alloc_goal++;
+	}
+	/* Writer: end */
+	/* Reader: pointers, ->i_next_alloc* */
+	if (verify_chain(chain, partial)) {
+		/*
+		 * try the heuristic for sequential allocation,
+		 * failing that at least try to get decent locality.
+		 */
+		if (block == inode->u.ext2_i.i_next_alloc_block)
+			*goal = inode->u.ext2_i.i_next_alloc_goal;
+		if (!*goal)
+			*goal = ext2_find_near(inode, partial);
+		return 0;
+	}
+	/* Reader: end */
+	return -EAGAIN;
+}
+
+/**
+ *	ext2_alloc_branch - allocate and set up a chain of blocks.
+ *	@inode: owner
+ *	@num: depth of the chain (number of blocks to allocate)
+ *	@offsets: offsets (in the blocks) to store the pointers to next.
+ *	@branch: place to store the chain in.
+ *
+ *	This function allocates @num blocks, zeroes out all but the last one,
+ *	links them into chain and (if we are synchronous) writes them to disk.
+ *	In other words, it prepares a branch that can be spliced onto the
+ *	inode. It stores the information about that chain in the branch[], in
+ *	the same format as ext2_get_branch() would do. We are calling it after
+ *	we had read the existing part of chain and partial points to the last
+ *	triple of that (one with zero ->key). Upon the exit we have the same
+ *	picture as after the successful ext2_get_block(), except that in one
+ *	place chain is disconnected - *branch->p is still zero (we did not
+ *	set the last link), but branch->key contains the number that should
+ *	be placed into *branch->p to fill that gap.
+ *
+ *	If allocation fails we free all blocks we've allocated (and forget
+ *	their buffer_heads) and return the error value from the failed
+ *	ext2_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+ *	as described above and return 0.
+ */ + +static int ext2_alloc_branch(struct inode *inode, + int num, + unsigned long goal, + int *offsets, + Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int n = 0; + int err; + int i; + int parent = ext2_alloc_block(inode, goal, &err); + + branch[0].key = cpu_to_le32(parent); + if (parent) for (n = 1; n < num; n++) { + struct buffer_head *bh; + /* Allocate the next block */ + int nr = ext2_alloc_block(inode, parent, &err); + if (!nr) + break; + branch[n].key = cpu_to_le32(nr); + /* + * Get buffer_head for parent block, zero it out and set + * the pointer to new one, then send parent to disk. + */ + bh = getblk(inode->i_dev, parent, blocksize); + lock_buffer(bh); + memset(bh->b_data, 0, blocksize); + branch[n].bh = bh; + branch[n].p = (u32*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) { + ll_rw_block (WRITE, 1, &bh); + wait_on_buffer (bh); + } + parent = nr; + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < n; i++) + bforget(branch[i].bh); + for (i = 0; i < n; i++) + ext2_free_blocks(inode, le32_to_cpu(branch[i].key), 1); + return err; +} + +/** + * ext2_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext2_alloc_branch) + * @where: location of missing link + * @num: number of blocks we are adding + * + * This function verifies that chain (up to the missing link) had not + * changed, fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. Otherwise (== chain had been changed) + * we free the new blocks (forgetting their buffer_heads, indeed) and + * return -EAGAIN. + */ + +static inline int ext2_splice_branch(struct inode *inode, + long block, + Indirect chain[4], + Indirect *where, + int num) +{ + int i; + + /* Verify that place we are splicing to is still there and vacant */ + + /* Writer: pointers, ->i_next_alloc* */ + if (!verify_chain(chain, where-1) || *where->p) + /* Writer: end */ + goto changed; + + /* That's it */ + + *where->p = where->key; + inode->u.ext2_i.i_next_alloc_block = block; + inode->u.ext2_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key); + + /* Writer: end */ + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME; + + /* had we spliced it onto indirect block? */ + if (where->bh) { + mark_buffer_dirty_inode(where->bh, inode); + if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) { + ll_rw_block (WRITE, 1, &where->bh); + wait_on_buffer(where->bh); + } + } + + if (IS_SYNC(inode) || inode->u.ext2_i.i_osync) + ext2_sync_inode (inode); + else + mark_inode_dirty(inode); + return 0; + +changed: + for (i = 1; i < num; i++) + bforget(where[i].bh); + for (i = 0; i < num; i++) + ext2_free_blocks(inode, le32_to_cpu(where[i].key), 1); + return -EAGAIN; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. 
So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + */ + +static int ext2_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + unsigned long goal; + int left; + int depth = ext2_block_to_path(inode, iblock, offsets); + + if (depth == 0) + goto out; + + lock_kernel(); +reread: + partial = ext2_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { +got_it: + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key); + bh_result->b_state |= (1UL << BH_Mapped); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + unlock_kernel(); +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. + */ + if (err == -EAGAIN) + goto changed; + + if (ext2_find_goal(inode, iblock, chain, partial, &goal) < 0) + goto changed; + + left = (chain + depth) - partial; + err = ext2_alloc_branch(inode, left, goal, + offsets+(partial-chain), partial); + if (err) + goto cleanup; + + if (ext2_splice_branch(inode, iblock, chain, partial, left) < 0) + goto changed; + + bh_result->b_state |= (1UL << BH_New); + goto got_it; + +changed: + while (partial > chain) { + brelse(partial->bh); + partial--; + } + goto reread; +} + +static int ext2_writepage(struct page *page) +{ + return block_write_full_page(page,ext2_get_block); +} +static int ext2_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,ext2_get_block); +} +static int ext2_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) +{ + return block_prepare_write(page,from,to,ext2_get_block); +} +static int ext2_bmap(struct address_space *mapping, long block) +{ + return generic_block_bmap(mapping,block,ext2_get_block); +} +static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +{ + return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block); +} +struct address_space_operations ext2_aops = { + readpage: ext2_readpage, + writepage: ext2_writepage, + sync_page: block_sync_page, + prepare_write: ext2_prepare_write, + commit_write: generic_commit_write, + bmap: ext2_bmap, + direct_IO: ext2_direct_IO, +}; + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(u32 *p, u32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext2_find_shared - find the indirect blocks for partial truncation. 
+ *	@inode:	  inode in question
+ *	@depth:	  depth of the affected branch
+ *	@offsets: offsets of pointers in that branch (see ext2_block_to_path)
+ *	@chain:	  place to store the pointers to partial indirect blocks
+ *	@top:	  place to store the (detached) top of branch
+ *
+ *	This is a helper function used by ext2_truncate().
+ *
+ *	When we do truncate() we may have to clean the ends of several indirect
+ *	blocks but leave the blocks themselves alive. Block is partially
+ *	truncated if some data below the new i_size is referred from it (and
+ *	it is on the path to the first completely truncated data block, indeed).
+ *	We have to free the top of that path along with everything to the right
+ *	of the path. Since no allocation past the truncation point is possible
+ *	until ext2_truncate() finishes, we may safely do the latter, but top
+ *	of branch may require special attention - pageout below the truncation
+ *	point might try to populate it.
+ *
+ *	We atomically detach the top of branch from the tree, store the block
+ *	number of its root in *@top, pointers to buffer_heads of partially
+ *	truncated blocks - in @chain[].bh and pointers to their last elements
+ *	that should not be removed - in @chain[].p. Return value is the pointer
+ *	to last filled element of @chain.
+ *
+ *	The work left to caller to do the actual freeing of subtrees:
+ *		a) free the subtree starting from *@top
+ *		b) free the subtrees whose roots are stored in
+ *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
+ *		c) free the subtrees growing from the inode past the @chain[0].p
+ *			(no partially truncated stuff there).
+ */
+
+static Indirect *ext2_find_shared(struct inode *inode,
+				int depth,
+				int offsets[4],
+				Indirect chain[4],
+				u32 *top)
+{
+	Indirect *partial, *p;
+	int k, err;
+
+	*top = 0;
+	for (k = depth; k > 1 && !offsets[k-1]; k--)
+		;
+	partial = ext2_get_branch(inode, k, offsets, chain, &err);
+	/* Writer: pointers */
+	if (!partial)
+		partial = chain + k-1;
+	/*
+	 * If the branch acquired continuation since we've looked at it -
+	 * fine, it should all survive and (new) top doesn't belong to us.
+	 */
+	if (!partial->key && *partial->p)
+		/* Writer: end */
+		goto no_top;
+	for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
+		;
+	/*
+	 * OK, we've found the last block that must survive. The rest of our
+	 * branch should be detached before unlocking. However, if that rest
+	 * of branch is all ours and does not grow immediately from the inode
+	 * it's easier to cheat and just decrement partial->p.
+	 */
+	if (p == chain + k - 1 && p > chain) {
+		p->p--;
+	} else {
+		*top = *p->p;
+		*p->p = 0;
+	}
+	/* Writer: end */
+
+	while(partial > p)
+	{
+		brelse(partial->bh);
+		partial--;
+	}
+no_top:
+	return partial;
+}
+
+/**
+ *	ext2_free_data - free a list of data blocks
+ *	@inode:	inode we are dealing with
+ *	@p:	array of block numbers
+ *	@q:	points immediately past the end of array
+ *
+ *	We are freeing all blocks referred from that array (numbers are
+ *	stored as little-endian 32-bit) and updating @inode->i_blocks
+ *	appropriately.
+ */ +static inline void ext2_free_data(struct inode *inode, u32 *p, u32 *q) +{ + unsigned long block_to_free = 0, count = 0; + unsigned long nr; + + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); + if (nr) { + *p = 0; + /* accumulate blocks to free if they're contiguous */ + if (count == 0) + goto free_this; + else if (block_to_free == nr - count) + count++; + else { + mark_inode_dirty(inode); + ext2_free_blocks (inode, block_to_free, count); + free_this: + block_to_free = nr; + count = 1; + } + } + } + if (count > 0) { + mark_inode_dirty(inode); + ext2_free_blocks (inode, block_to_free, count); + } +} + +/** + * ext2_free_branches - free an array of branches + * @inode: inode we are dealing with + * @p: array of block numbers + * @q: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks refered from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. + */ +static void ext2_free_branches(struct inode *inode, u32 *p, u32 *q, int depth) +{ + struct buffer_head * bh; + unsigned long nr; + + if (depth--) { + int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); + for ( ; p < q ; p++) { + nr = le32_to_cpu(*p); + if (!nr) + continue; + *p = 0; + bh = bread (inode->i_dev, nr, inode->i_sb->s_blocksize); + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext2_error(inode->i_sb, "ext2_free_branches", + "Read failure, inode=%ld, block=%ld", + inode->i_ino, nr); + continue; + } + ext2_free_branches(inode, + (u32*)bh->b_data, + (u32*)bh->b_data + addr_per_block, + depth); + bforget(bh); + ext2_free_blocks(inode, nr, 1); + mark_inode_dirty(inode); + } + } else + ext2_free_data(inode, p, q); +} + +void ext2_truncate (struct inode * inode) +{ + u32 *i_data = inode->u.ext2_i.i_data; + int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + int nr = 0; + int n; + long iblock; + unsigned blocksize; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE_FILE(inode)) + return; + + ext2_discard_prealloc(inode); + + blocksize = inode->i_sb->s_blocksize; + iblock = (inode->i_size + blocksize-1) + >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); + + block_truncate_page(inode->i_mapping, inode->i_size, ext2_get_block); + + n = ext2_block_to_path(inode, iblock, offsets); + if (n == 0) + return; + + if (n == 1) { + ext2_free_data(inode, i_data+offsets[0], + i_data + EXT2_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext2_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (already detached) */ + if (nr) { + if (partial == chain) + mark_inode_dirty(inode); + else + mark_buffer_dirty_inode(partial->bh, inode); + ext2_free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext2_free_branches(inode, + partial->p + 1, + (u32*)partial->bh->b_data + addr_per_block, + (chain+n-1) - partial); + mark_buffer_dirty_inode(partial->bh, inode); + if (IS_SYNC(inode)) { + ll_rw_block (WRITE, 1, &partial->bh); + wait_on_buffer (partial->bh); + } + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT2_IND_BLOCK]; + if (nr) { + i_data[EXT2_IND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, 
&nr, &nr+1, 1); + } + case EXT2_IND_BLOCK: + nr = i_data[EXT2_DIND_BLOCK]; + if (nr) { + i_data[EXT2_DIND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 2); + } + case EXT2_DIND_BLOCK: + nr = i_data[EXT2_TIND_BLOCK]; + if (nr) { + i_data[EXT2_TIND_BLOCK] = 0; + mark_inode_dirty(inode); + ext2_free_branches(inode, &nr, &nr+1, 3); + } + case EXT2_TIND_BLOCK: + ; + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (IS_SYNC(inode)) + ext2_sync_inode (inode); + else + mark_inode_dirty(inode); +} + +void ext2_read_inode (struct inode * inode) +{ + struct buffer_head * bh; + struct ext2_inode * raw_inode; + unsigned long block_group; + unsigned long group_desc; + unsigned long desc; + unsigned long block; + unsigned long offset; + struct ext2_group_desc * gdp; + + if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO && + inode->i_ino != EXT2_ACL_DATA_INO && + inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) { + ext2_error (inode->i_sb, "ext2_read_inode", + "bad inode number: %lu", inode->i_ino); + goto bad_inode; + } + block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + if (block_group >= inode->i_sb->u.ext2_sb.s_groups_count) { + ext2_error (inode->i_sb, "ext2_read_inode", + "group >= groups count"); + goto bad_inode; + } + group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(inode->i_sb); + desc = block_group & (EXT2_DESC_PER_BLOCK(inode->i_sb) - 1); + bh = inode->i_sb->u.ext2_sb.s_group_desc[group_desc]; + if (!bh) { + ext2_error (inode->i_sb, "ext2_read_inode", + "Descriptor not loaded"); + goto bad_inode; + } + + gdp = (struct ext2_group_desc *) bh->b_data; + /* + * Figure out the offset within the block group inode table + */ + offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) * + EXT2_INODE_SIZE(inode->i_sb); + block = le32_to_cpu(gdp[desc].bg_inode_table) + + (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb)); + if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) { + ext2_error (inode->i_sb, "ext2_read_inode", + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, block); + goto bad_inode; + } + offset &= (EXT2_BLOCK_SIZE(inode->i_sb) - 1); + raw_inode = (struct ext2_inode *) (bh->b_data + offset); + + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime = le32_to_cpu(raw_inode->i_atime); + inode->i_ctime = le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); + inode->u.ext2_i.i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. 
+ * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0 && (inode->i_mode == 0 || inode->u.ext2_i.i_dtime)) { + /* this inode is deleted */ + brelse (bh); + goto bad_inode; + } + inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + inode->i_version = ++event; + inode->u.ext2_i.i_flags = le32_to_cpu(raw_inode->i_flags); + inode->u.ext2_i.i_faddr = le32_to_cpu(raw_inode->i_faddr); + inode->u.ext2_i.i_frag_no = raw_inode->i_frag; + inode->u.ext2_i.i_frag_size = raw_inode->i_fsize; + inode->u.ext2_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (S_ISREG(inode->i_mode)) + inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + else + inode->u.ext2_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + inode->u.ext2_i.i_prealloc_count = 0; + inode->u.ext2_i.i_block_group = block_group; + + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT2_N_BLOCKS; block++) + inode->u.ext2_i.i_data[block] = raw_inode->i_block[block]; + + if (inode->i_ino == EXT2_ACL_IDX_INO || + inode->i_ino == EXT2_ACL_DATA_INO) + /* Nothing to do */ ; + else if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext2_file_inode_operations; + inode->i_fop = &ext2_file_operations; + inode->i_mapping->a_ops = &ext2_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext2_dir_inode_operations; + inode->i_fop = &ext2_dir_operations; + inode->i_mapping->a_ops = &ext2_aops; + } else if (S_ISLNK(inode->i_mode)) { + if (!inode->i_blocks) + inode->i_op = &ext2_fast_symlink_inode_operations; + else { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ext2_aops; + } + } else + init_special_inode(inode, inode->i_mode, + le32_to_cpu(raw_inode->i_block[0])); + brelse (bh); + inode->i_attr_flags = 0; + if (inode->u.ext2_i.i_flags & EXT2_SYNC_FL) { + inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; + inode->i_flags |= S_SYNC; + } + if (inode->u.ext2_i.i_flags & EXT2_APPEND_FL) { + inode->i_attr_flags |= ATTR_FLAG_APPEND; + inode->i_flags |= S_APPEND; + } + if (inode->u.ext2_i.i_flags & EXT2_IMMUTABLE_FILE_FL) { + inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE_FILE; + inode->i_flags |= S_IMMUTABLE_FILE; + } + if (inode->u.ext2_i.i_flags & EXT2_IMMUTABLE_LINK_FL) { + inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE_LINK; + inode->i_flags |= S_IMMUTABLE_LINK; + } + if (inode->u.ext2_i.i_flags & EXT2_NOATIME_FL) { + inode->i_attr_flags |= ATTR_FLAG_NOATIME; + inode->i_flags |= S_NOATIME; + } + return; + +bad_inode: + make_bad_inode(inode); + return; +} + +static int ext2_update_inode(struct inode * inode, int do_sync) +{ + struct buffer_head * bh; + struct ext2_inode * raw_inode; + unsigned long block_group; + unsigned long group_desc; + unsigned long desc; + unsigned long block; + unsigned long offset; + int err = 0; + struct ext2_group_desc * gdp; + + if ((inode->i_ino != EXT2_ROOT_INO && + inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) { + ext2_error (inode->i_sb, "ext2_write_inode", + "bad inode number: %lu", inode->i_ino); + return -EIO; + } + block_group = (inode->i_ino - 1) / EXT2_INODES_PER_GROUP(inode->i_sb); + if (block_group >= 
inode->i_sb->u.ext2_sb.s_groups_count) { + ext2_error (inode->i_sb, "ext2_write_inode", + "group >= groups count"); + return -EIO; + } + group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(inode->i_sb); + desc = block_group & (EXT2_DESC_PER_BLOCK(inode->i_sb) - 1); + bh = inode->i_sb->u.ext2_sb.s_group_desc[group_desc]; + if (!bh) { + ext2_error (inode->i_sb, "ext2_write_inode", + "Descriptor not loaded"); + return -EIO; + } + gdp = (struct ext2_group_desc *) bh->b_data; + /* + * Figure out the offset within the block group inode table + */ + offset = ((inode->i_ino - 1) % EXT2_INODES_PER_GROUP(inode->i_sb)) * + EXT2_INODE_SIZE(inode->i_sb); + block = le32_to_cpu(gdp[desc].bg_inode_table) + + (offset >> EXT2_BLOCK_SIZE_BITS(inode->i_sb)); + if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) { + ext2_error (inode->i_sb, "ext2_write_inode", + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, block); + return -EIO; + } + offset &= EXT2_BLOCK_SIZE(inode->i_sb) - 1; + raw_inode = (struct ext2_inode *) (bh->b_data + offset); + + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if(!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!inode->u.ext2_i.i_dtime) { + raw_inode->i_uid_high = cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(inode->i_size); + raw_inode->i_atime = cpu_to_le32(inode->i_atime); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(inode->u.ext2_i.i_dtime); + raw_inode->i_flags = cpu_to_le32(inode->u.ext2_i.i_flags); + raw_inode->i_faddr = cpu_to_le32(inode->u.ext2_i.i_faddr); + raw_inode->i_frag = inode->u.ext2_i.i_frag_no; + raw_inode->i_fsize = inode->u.ext2_i.i_frag_size; + raw_inode->i_file_acl = cpu_to_le32(inode->u.ext2_i.i_file_acl); + if (S_ISDIR(inode->i_mode)) + raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext2_i.i_dir_acl); + else { + raw_inode->i_size_high = cpu_to_le32(inode->i_size >> 32); + if (inode->i_size > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT2_HAS_RO_COMPAT_FEATURE(sb, + EXT2_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT2_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT2_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. 
+					 */
+					lock_kernel();
+					ext2_update_dynamic_rev(sb);
+					EXT2_SET_RO_COMPAT_FEATURE(sb,
+						EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
+					unlock_kernel();
+					ext2_write_super(sb);
+				}
+			}
+		}
+
+	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		raw_inode->i_block[0] = cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
+	else for (block = 0; block < EXT2_N_BLOCKS; block++)
+		raw_inode->i_block[block] = inode->u.ext2_i.i_data[block];
+	mark_buffer_dirty(bh);
+	if (do_sync) {
+		ll_rw_block (WRITE, 1, &bh);
+		wait_on_buffer (bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+			printk ("IO error syncing ext2 inode ["
+				"%s:%08lx]\n",
+				bdevname(inode->i_dev), inode->i_ino);
+			err = -EIO;
+		}
+	}
+	brelse (bh);
+	return err;
+}
+
+void ext2_write_inode (struct inode * inode, int wait)
+{
+	lock_kernel();
+	ext2_update_inode (inode, wait);
+	unlock_kernel();
+}
+
+int ext2_sync_inode (struct inode *inode)
+{
+	return ext2_update_inode (inode, 1);
+}
diff -urN linux-2.4.16-reiserfspatches-immutable/fs/ext2/ioctl.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext2/ioctl.c~
--- linux-2.4.16-reiserfspatches-immutable/fs/ext2/ioctl.c~	Thu Jan  1 01:00:00 1970
+++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext2/ioctl.c~	Mon Dec 10 14:28:03 2001
@@ -0,0 +1,98 @@
+/*
+ * linux/fs/ext2/ioctl.c
+ *
+ * Copyright (C) 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ */
+
+#include <linux/fs.h>
+#include <linux/ext2_fs.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+
+
+int ext2_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+		unsigned long arg)
+{
+	unsigned int flags;
+
+	ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+	switch (cmd) {
+	case EXT2_IOC_GETFLAGS:
+		flags = inode->u.ext2_i.i_flags & EXT2_FL_USER_VISIBLE;
+		return put_user(flags, (int *) arg);
+	case EXT2_IOC_SETFLAGS: {
+		unsigned int oldflags;
+
+		if (IS_RDONLY(inode))
+			return -EROFS;
+
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EPERM;
+
+		if (get_user(flags, (int *) arg))
+			return -EFAULT;
+
+		oldflags = inode->u.ext2_i.i_flags;
+
+		/*
+		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+		 * the relevant capability.
+		 *
+		 * This test looks nicer. Thanks to Pauline Middelink
+		 */
+		if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FILE_FL | EXT2_IMMUTABLE_LINK_FL)) {
+			if (!capable(CAP_LINUX_IMMUTABLE))
+				return -EPERM;
+		}
+
+		flags = flags & EXT2_FL_USER_MODIFIABLE;
+		flags |= oldflags & ~EXT2_FL_USER_MODIFIABLE;
+		inode->u.ext2_i.i_flags = flags;
+
+		if (flags & EXT2_SYNC_FL)
+			inode->i_flags |= S_SYNC;
+		else
+			inode->i_flags &= ~S_SYNC;
+		if (flags & EXT2_APPEND_FL)
+			inode->i_flags |= S_APPEND;
+		else
+			inode->i_flags &= ~S_APPEND;
+
+		if (flags & EXT2_IMMUTABLE_FILE_FL)
+			inode->i_flags |= S_IMMUTABLE_FILE;
+		else
+			inode->i_flags &= ~S_IMMUTABLE_FILE;
+
+		if (flags & EXT2_IMMUTABLE_LINK_FL)
+			inode->i_flags |= S_IMMUTABLE_LINK;
+		else
+			inode->i_flags &= ~S_IMMUTABLE_LINK;
+
+		if (flags & EXT2_NOATIME_FL)
+			inode->i_flags |= S_NOATIME;
+		else
+			inode->i_flags &= ~S_NOATIME;
+		inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty(inode);
+		return 0;
+	}
+	case EXT2_IOC_GETVERSION:
+		return put_user(inode->i_generation, (int *) arg);
+	case EXT2_IOC_SETVERSION:
+		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+			return -EPERM;
+		if (IS_RDONLY(inode))
+			return -EROFS;
+		if (get_user(inode->i_generation, (int *) arg))
+			return -EFAULT;
+		inode->i_ctime = CURRENT_TIME;
+		mark_inode_dirty(inode);
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
diff -urN linux-2.4.16-reiserfspatches-immutable/fs/ext3/ialloc.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext3/ialloc.c~
--- linux-2.4.16-reiserfspatches-immutable/fs/ext3/ialloc.c~	Thu Jan  1 01:00:00 1970
+++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext3/ialloc.c~	Mon Dec 10 14:28:03 2001
@@ -0,0 +1,664 @@
+/*
+ *  linux/fs/ext3/ialloc.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Laboratoire MASI - Institut Blaise Pascal
+ * Universite Pierre et Marie Curie (Paris VI)
+ *
+ *  BSD ufs-inspired inode and directory allocation by
+ *  Stephen Tweedie (sct@redhat.com), 1993
+ *  Big-endian to little-endian byte-swapping/bitmaps by
+ *        David S. Miller (davem@caip.rutgers.edu), 1995
+ */
+
+#include <linux/config.h>
+#include <linux/fs.h>
+#include <linux/jbd.h>
+#include <linux/ext3_fs.h>
+#include <linux/ext3_jbd.h>
+#include <linux/sched.h>
+#include <linux/stat.h>
+#include <linux/locks.h>
+#include <linux/quotaops.h>
+
+#include <asm/bitops.h>
+#include <asm/byteorder.h>
+
+/*
+ * ialloc.c contains the inodes allocation and deallocation routines
+ */
+
+/*
+ * The free inodes are managed by bitmaps.  A file system contains several
+ * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+ * block for inodes, N blocks for the inode table and data blocks.
+ *
+ * The file system contains group descriptors which are located after the
+ * super block.  Each descriptor contains the number of the bitmap block and
+ * the free blocks count in the block.  The descriptors are loaded in memory
+ * when a file system is mounted (see ext3_read_super).
+ */
+
+
+/*
+ * Read the inode allocation bitmap for a given block_group, reading
+ * into the specified slot in the superblock's bitmap cache.
+ *
+ * Return >=0 on success or a -ve error code.
+ */ +static int read_inode_bitmap (struct super_block * sb, + unsigned long block_group, + unsigned int bitmap_nr) +{ + struct ext3_group_desc * gdp; + struct buffer_head * bh = NULL; + int retval = 0; + + gdp = ext3_get_group_desc (sb, block_group, NULL); + if (!gdp) { + retval = -EIO; + goto error_out; + } + bh = bread (sb->s_dev, + le32_to_cpu(gdp->bg_inode_bitmap), sb->s_blocksize); + if (!bh) { + ext3_error (sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %lu", + block_group, (unsigned long) gdp->bg_inode_bitmap); + retval = -EIO; + } + /* + * On IO error, just leave a zero in the superblock's block pointer for + * this group. The IO will be retried next time. + */ +error_out: + sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group; + sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh; + return retval; +} + +/* + * load_inode_bitmap loads the inode bitmap for a blocks group + * + * It maintains a cache for the last bitmaps loaded. This cache is managed + * with a LRU algorithm. + * + * Notes: + * 1/ There is one cache per mounted file system. + * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, + * this function reads the bitmap without maintaining a LRU cache. + * + * Return the slot used to store the bitmap, or a -ve error code. + */ +static int load_inode_bitmap (struct super_block * sb, + unsigned int block_group) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long inode_bitmap_number; + struct buffer_head * inode_bitmap; + int i, j, retval = 0; + + if (block_group >= sbi->s_groups_count) + ext3_panic (sb, "load_inode_bitmap", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + if (sbi->s_loaded_inode_bitmaps > 0 && + sbi->s_inode_bitmap_number[0] == block_group && + sbi->s_inode_bitmap[0] != NULL) + return 0; + if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) { + if (sbi->s_inode_bitmap[block_group]) { + if (sbi->s_inode_bitmap_number[block_group] != + block_group) + ext3_panic(sb, "load_inode_bitmap", + "block_group != inode_bitmap_number"); + return block_group; + } + retval = read_inode_bitmap(sb, block_group, block_group); + if (retval < 0) + return retval; + return block_group; + } + + for (i = 0; i < sbi->s_loaded_inode_bitmaps && + sbi->s_inode_bitmap_number[i] != block_group; i++) + /* do nothing */; + if (i < sbi->s_loaded_inode_bitmaps && + sbi->s_inode_bitmap_number[i] == block_group) { + inode_bitmap_number = sbi->s_inode_bitmap_number[i]; + inode_bitmap = sbi->s_inode_bitmap[i]; + for (j = i; j > 0; j--) { + sbi->s_inode_bitmap_number[j] = + sbi->s_inode_bitmap_number[j - 1]; + sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; + } + sbi->s_inode_bitmap_number[0] = inode_bitmap_number; + sbi->s_inode_bitmap[0] = inode_bitmap; + + /* + * There's still one special case here --- if inode_bitmap == 0 + * then our last attempt to read the bitmap failed and we have + * just ended up caching that failure. Try again to read it. 
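The cache policy that load_inode_bitmap() above implements (most-recently-used bitmap kept in slot 0, eviction from the tail) can be modelled in user space. A hedged sketch with invented names — bitmap_cache, cache_lookup, load_fn standing in for read_inode_bitmap() — not kernel code:

#include <stdlib.h>

#define MAX_LOADED 8			/* stands in for EXT3_MAX_GROUP_LOADED */

struct bitmap_cache {
	int loaded;			/* slots in use */
	unsigned int group[MAX_LOADED];	/* which block group per slot */
	void *bitmap[MAX_LOADED];	/* cached bitmap data per slot */
};

/* Return the cached bitmap for `group', loading on demand and promoting
 * the entry to the front; the oldest entry falls off the end. */
void *cache_lookup(struct bitmap_cache *c, unsigned int group,
		   void *(*load_fn)(unsigned int))
{
	int i, j;
	void *bm;

	for (i = 0; i < c->loaded && c->group[i] != group; i++)
		;
	if (i < c->loaded) {			/* hit */
		bm = c->bitmap[i];
	} else {				/* miss: make room up front */
		if (c->loaded < MAX_LOADED)
			i = c->loaded++;
		else
			free(c->bitmap[--i]);	/* evict the LRU tail */
		bm = load_fn(group);
	}
	for (j = i; j > 0; j--) {		/* slide others back */
		c->group[j] = c->group[j - 1];
		c->bitmap[j] = c->bitmap[j - 1];
	}
	c->group[0] = group;			/* MRU lives in slot 0 */
	c->bitmap[0] = bm;
	return bm;
}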
+ */ + if (!inode_bitmap) + retval = read_inode_bitmap (sb, block_group, 0); + } else { + if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED) + sbi->s_loaded_inode_bitmaps++; + else + brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]); + for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) { + sbi->s_inode_bitmap_number[j] = + sbi->s_inode_bitmap_number[j - 1]; + sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; + } + retval = read_inode_bitmap (sb, block_group, 0); + } + return retval; +} + +/* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. + */ +void ext3_free_inode (handle_t *handle, struct inode * inode) +{ + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head * bh; + struct buffer_head * bh2; + unsigned long block_group; + unsigned long bit; + int bitmap_nr; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + int fatal = 0, err; + + if (!inode->i_dev) { + printk ("ext3_free_inode: inode has no device\n"); + return; + } + if (atomic_read(&inode->i_count) > 1) { + printk ("ext3_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk ("ext3_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk("ext3_free_inode: inode on nonexistent device\n"); + return; + } + + ino = inode->i_ino; + ext3_debug ("freeing inode %lu\n", ino); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + clear_inode (inode); + + lock_super (sb); + es = sb->u.ext3_sb.s_es; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_free_inode", + "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_nr = load_inode_bitmap (sb, block_group); + if (bitmap_nr < 0) + goto error_return; + + bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; + + BUFFER_TRACE(bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh); + if (fatal) + goto error_return; + + /* Ok, now we can actually update the inode bitmaps.. 
*/ + if (!ext3_clear_bit (bit, bh->b_data)) + ext3_error (sb, "ext3_free_inode", + "bit already cleared for inode %lu", ino); + else { + gdp = ext3_get_group_desc (sb, block_group, &bh2); + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access"); + fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (fatal) goto error_return; + + if (gdp) { + gdp->bg_free_inodes_count = cpu_to_le16( + le16_to_cpu(gdp->bg_free_inodes_count) + 1); + if (is_directory) + gdp->bg_used_dirs_count = cpu_to_le16( + le16_to_cpu(gdp->bg_used_dirs_count) - 1); + } + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + es->s_free_inodes_count = + cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1); + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, + "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + sb->s_dirt = 1; +error_return: + ext3_std_error(sb, fatal); + unlock_super(sb); +} + +/* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. + * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ +struct inode * ext3_new_inode (handle_t *handle, + const struct inode * dir, int mode) +{ + struct super_block * sb; + struct buffer_head * bh; + struct buffer_head * bh2; + int i, j, avefreei; + struct inode * inode; + int bitmap_nr; + struct ext3_group_desc * gdp; + struct ext3_group_desc * tmp; + struct ext3_super_block * es; + int err = 0; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + init_rwsem(&inode->u.ext3_i.truncate_sem); + + lock_super (sb); + es = sb->u.ext3_sb.s_es; +repeat: + gdp = NULL; + i = 0; + + if (S_ISDIR(mode)) { + avefreei = le32_to_cpu(es->s_free_inodes_count) / + sb->u.ext3_sb.s_groups_count; + if (!gdp) { + for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) { + struct buffer_head *temp_buffer; + tmp = ext3_get_group_desc (sb, j, &temp_buffer); + if (tmp && + le16_to_cpu(tmp->bg_free_inodes_count) && + le16_to_cpu(tmp->bg_free_inodes_count) >= + avefreei) { + if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) > + le16_to_cpu(gdp->bg_free_blocks_count))) { + i = j; + gdp = tmp; + bh2 = temp_buffer; + } + } + } + } + } else { + /* + * Try to place the inode in its parent directory + */ + i = dir->u.ext3_i.i_block_group; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && le16_to_cpu(tmp->bg_free_inodes_count)) + gdp = tmp; + else + { + /* + * Use a quadratic hash to find a group with a + * free inode + */ + for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) { + i += j; + if (i >= sb->u.ext3_sb.s_groups_count) + i -= sb->u.ext3_sb.s_groups_count; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && + le16_to_cpu(tmp->bg_free_inodes_count)) { + gdp = tmp; + break; + } + } + } + if (!gdp) { + /* 
+ * That failed: try linear search for a free inode + */ + i = dir->u.ext3_i.i_block_group + 1; + for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) { + if (++i >= sb->u.ext3_sb.s_groups_count) + i = 0; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && + le16_to_cpu(tmp->bg_free_inodes_count)) { + gdp = tmp; + break; + } + } + } + } + + err = -ENOSPC; + if (!gdp) + goto fail; + + err = -EIO; + bitmap_nr = load_inode_bitmap (sb, i); + if (bitmap_nr < 0) + goto fail; + + bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; + + if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data, + EXT3_INODES_PER_GROUP(sb))) < + EXT3_INODES_PER_GROUP(sb)) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) goto fail; + + if (ext3_set_bit (j, bh->b_data)) { + ext3_error (sb, "ext3_new_inode", + "bit already set for inode %d", j); + goto repeat; + } + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) goto fail; + } else { + if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) { + ext3_error (sb, "ext3_new_inode", + "Free inodes count corrupted in group %d", + i); + /* Is it really ENOSPC? */ + err = -ENOSPC; + if (sb->s_flags & MS_RDONLY) + goto fail; + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh2); + if (err) goto fail; + gdp->bg_free_inodes_count = 0; + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + } + goto repeat; + } + j += i * EXT3_INODES_PER_GROUP(sb) + 1; + if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d,inode=%d", i, j); + err = -EIO; + goto fail; + } + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh2); + if (err) goto fail; + gdp->bg_free_inodes_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); + if (S_ISDIR(mode)) + gdp->bg_used_dirs_count = + cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (err) goto fail; + es->s_free_inodes_count = + cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + sb->s_dirt = 1; + if (err) goto fail; + + inode->i_uid = current->fsuid; + if (test_opt (sb, GRPID)) + inode->i_gid = dir->i_gid; + else if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current->fsgid; + inode->i_mode = mode; + + inode->i_ino = j; + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blksize = PAGE_SIZE; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL; + if (S_ISLNK(mode)) + inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FILE_FL|EXT3_IMMUTABLE_LINK_FL|EXT3_APPEND_FL); +#ifdef EXT3_FRAGMENTS + inode->u.ext3_i.i_faddr = 0; + inode->u.ext3_i.i_frag_no = 0; + inode->u.ext3_i.i_frag_size = 0; +#endif + inode->u.ext3_i.i_file_acl = 0; + inode->u.ext3_i.i_dir_acl = 0; + 
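Stepping back from the hunk above: for non-directory inodes, ext3_new_inode() probes the parent directory's group first, then groups at quadratically increasing offsets, and finally falls back to a linear sweep. A small user-space sketch of the resulting probe order (hypothetical helper, mirroring the loops above):

#include <stdio.h>

/* Print the order in which block groups are probed for a new inode
 * whose parent directory lives in group `parent'. */
static void probe_order(int parent, int ngroups)
{
	int i = parent, j;

	printf("%d", i);			/* parent's group first */
	for (j = 1; j < ngroups; j <<= 1) {	/* quadratic hash */
		i += j;
		if (i >= ngroups)
			i -= ngroups;
		printf(" %d", i);
	}
	i = parent + 1;				/* then the linear sweep */
	for (j = 2; j < ngroups; j++) {
		if (++i >= ngroups)
			i = 0;
		printf(" %d", i);
	}
	printf("\n");
}

int main(void)
{
	probe_order(5, 16);	/* 5, then 6 8 12 4 (wrapping), then linear */
	return 0;
}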
inode->u.ext3_i.i_dtime = 0; + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); +#ifdef EXT3_PREALLOCATE + inode->u.ext3_i.i_prealloc_count = 0; +#endif + inode->u.ext3_i.i_block_group = i; + + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (IS_SYNC(inode)) + handle->h_sync = 1; + insert_inode_hash(inode); + inode->i_generation = event++; + + inode->u.ext3_i.i_state = EXT3_STATE_NEW; + err = ext3_mark_inode_dirty(handle, inode); + if (err) goto fail; + + unlock_super (sb); + if(DQUOT_ALLOC_INODE(inode)) { + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + iput(inode); + return ERR_PTR(-EDQUOT); + } + ext3_debug ("allocating inode %lu\n", inode->i_ino); + return inode; + +fail: + unlock_super(sb); + iput(inode); + ext3_std_error(sb, err); + return ERR_PTR(err); +} + +/* Verify that we are loading a valid orphan from disk */ +struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino) +{ + ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); + unsigned long block_group; + int bit; + int bitmap_nr; + struct buffer_head *bh; + struct inode *inode = NULL; + + /* Error cases - e2fsck has already cleaned up for us */ + if (ino > max_ino) { + ext3_warning(sb, __FUNCTION__, + "bad orphan ino %ld! e2fsck was run?\n", ino); + return NULL; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 || + !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) { + ext3_warning(sb, __FUNCTION__, + "inode bitmap error for orphan %ld\n", ino); + return NULL; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) || + is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { + ext3_warning(sb, __FUNCTION__, + "bad orphan inode %ld! 
e2fsck was run?\n", ino); + printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n", + bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%ld\n", max_ino); + } + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode && inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + return NULL; + } + + return inode; +} + +unsigned long ext3_count_free_inodes (struct super_block * sb) +{ +#ifdef EXT3FS_DEBUG + struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + lock_super (sb); + es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + bitmap_nr = load_inode_bitmap (sb, i); + if (bitmap_nr < 0) + continue; + + x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], + EXT3_INODES_PER_GROUP(sb) / 8); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + unlock_super (sb); + return desc_count; +#else + return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count); +#endif +} + +#ifdef CONFIG_EXT3_CHECK +/* Called at mount-time, super-block is locked */ +void ext3_check_inodes_bitmap (struct super_block * sb) +{ + struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + bitmap_nr = load_inode_bitmap (sb, i); + if (bitmap_nr < 0) + continue; + + x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], + EXT3_INODES_PER_GROUP(sb) / 8); + if (le16_to_cpu(gdp->bg_free_inodes_count) != x) + ext3_error (sb, "ext3_check_inodes_bitmap", + "Wrong free inodes count in group %d, " + "stored = %d, counted = %lu", i, + le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count) + ext3_error (sb, "ext3_check_inodes_bitmap", + "Wrong free inodes count in super block, " + "stored = %lu, counted = %lu", + (unsigned long)le32_to_cpu(es->s_free_inodes_count), + bitmap_count); +} +#endif diff -urN linux-2.4.16-reiserfspatches-immutable/fs/ext3/inode.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext3/inode.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/ext3/inode.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext3/inode.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,2676 @@ +/* + * linux/fs/ext3/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 
1993, 1998
+ * Big-endian to little-endian byte-swapping/bitmaps by
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ * 64-bit file support on 64-bit platforms by Jakub Jelinek
+ * (jj@sunsite.ms.mff.cuni.cz)
+ *
+ * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
+ */
+
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/ext3_jbd.h>
+#include <linux/jbd.h>
+#include <linux/locks.h>
+#include <linux/smp_lock.h>
+#include <linux/highuid.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+
+
+/*
+ * SEARCH_FROM_ZERO forces each block allocation to search from the start
+ * of the filesystem. This is to force rapid reallocation of recently-freed
+ * blocks. The file fragmentation is horrendous.
+ */
+#undef SEARCH_FROM_ZERO
+
+/* The ext3 forget function must perform a revoke if we are freeing data
+ * which has been journaled. Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+ *
+ * "bh" may be NULL: a metadata block may have been freed from memory
+ * but there may still be a record of it in the journal, and that record
+ * still needs to be revoked.
+ */
+
+static int ext3_forget(handle_t *handle, int is_metadata,
+               struct inode *inode, struct buffer_head *bh,
+               int blocknr)
+{
+    int err;
+
+    BUFFER_TRACE(bh, "enter");
+
+    jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+          "data mode %lx\n",
+          bh, is_metadata, inode->i_mode,
+          test_opt(inode->i_sb, DATA_FLAGS));
+
+    /* Never use the revoke function if we are doing full data
+     * journaling: there is no need to, and a V1 superblock won't
+     * support it. Otherwise, only skip the revoke on un-journaled
+     * data blocks. */
+
+    if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
+        (!is_metadata && !ext3_should_journal_data(inode))) {
+        if (bh) {
+            BUFFER_TRACE(bh, "call journal_forget");
+            ext3_journal_forget(handle, bh);
+        }
+        return 0;
+    }
+
+    /*
+     * data!=journal && (is_metadata || should_journal_data(inode))
+     */
+    BUFFER_TRACE(bh, "call ext3_journal_revoke");
+    err = ext3_journal_revoke(handle, blocknr, bh);
+    if (err)
+        ext3_abort(inode->i_sb, __FUNCTION__,
+               "error %d when attempting revoke", err);
+    BUFFER_TRACE(bh, "exit");
+    return err;
+}
+
+/*
+ * Truncate transactions can be complex and absolutely huge. So we need to
+ * be able to restart the transaction at a convenient checkpoint to make
+ * sure we don't overflow the journal.
+ *
+ * start_transaction gets us a new handle for a truncate transaction,
+ * and extend_transaction tries to extend the existing one a bit. If
+ * extend fails, we need to propagate the failure up and restart the
+ * transaction in the top-level truncate loop. --sct
+ */
+
+static handle_t *start_transaction(struct inode *inode)
+{
+    long needed;
+    handle_t *result;
+
+    needed = inode->i_blocks;
+    if (needed > EXT3_MAX_TRANS_DATA)
+        needed = EXT3_MAX_TRANS_DATA;
+
+    result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
+    if (!IS_ERR(result))
+        return result;
+
+    ext3_std_error(inode->i_sb, PTR_ERR(result));
+    return result;
+}
+
+/*
+ * Try to extend this transaction for the purposes of truncation.
+ *
+ * Returns 0 if we managed to create more room. If we can't create more
+ * room, and the transaction must be restarted we return 1.
+ */
+static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+{
+    long needed;
+
+    if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
+        return 0;
+    needed = inode->i_blocks;
+    if (needed > EXT3_MAX_TRANS_DATA)
+        needed = EXT3_MAX_TRANS_DATA;
+    if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed))
+        return 0;
+    return 1;
+}
+
+/*
+ * Restart the transaction associated with *handle.
This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ +static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) +{ + long needed = inode->i_blocks; + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + jbd_debug(2, "restarting handle %p\n", handle); + return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed); +} + +/* + * Called at each iput() + */ +void ext3_put_inode (struct inode * inode) +{ + ext3_discard_prealloc (inode); +} + +/* + * Called at the last iput() if i_nlink is zero. + */ +void ext3_delete_inode (struct inode * inode) +{ + handle_t *handle; + + if (is_bad_inode(inode) || + inode->i_ino == EXT3_ACL_IDX_INO || + inode->i_ino == EXT3_ACL_DATA_INO) + goto no_delete; + + lock_kernel(); + handle = start_transaction(inode); + if (IS_ERR(handle)) { + /* If we're going to skip the normal cleanup, we still + * need to make sure that the in-core orphan linked list + * is properly cleaned up. */ + ext3_orphan_del(NULL, inode); + + ext3_std_error(inode->i_sb, PTR_ERR(handle)); + unlock_kernel(); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + if (inode->i_blocks) + ext3_truncate(inode); + /* + * Kill off the orphan record which ext3_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext3_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext3_orphan_del(handle, inode); + inode->u.ext3_i.i_dtime = CURRENT_TIME; + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext3_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + clear_inode(inode); + else + ext3_free_inode(handle, inode); + ext3_journal_stop(handle, inode); + unlock_kernel(); + return; +no_delete: + clear_inode(inode); /* We must guarantee clearing of inode... 
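A note on the transaction helpers introduced above: during truncate, callers first try to stretch the running handle and only restart it at a safe point when the journal cannot hand out more credits. The idiom, as this patch's ext3_clear_blocks() uses it later in this file, looks like the following fragment (a sketch, with `handle', `inode' and `bh' as in the truncate path); everything modified so far must be dirtied against the transaction before the restart, because a restart commits it:

/* Extend-or-restart: a restart commits the current transaction, so all
 * touched metadata must be consistently dirtied against it first. */
if (try_to_extend_transaction(handle, inode)) {
	if (bh)
		ext3_journal_dirty_metadata(handle, bh);
	ext3_mark_inode_dirty(handle, inode);
	ext3_journal_test_restart(handle, inode);   /* commit + new handle */
	ext3_journal_get_write_access(handle, bh);  /* re-acquire access */
}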
*/ +} + +void ext3_discard_prealloc (struct inode * inode) +{ +#ifdef EXT3_PREALLOCATE + lock_kernel(); + /* Writer: ->i_prealloc* */ + if (inode->u.ext3_i.i_prealloc_count) { + unsigned short total = inode->u.ext3_i.i_prealloc_count; + unsigned long block = inode->u.ext3_i.i_prealloc_block; + inode->u.ext3_i.i_prealloc_count = 0; + inode->u.ext3_i.i_prealloc_block = 0; + /* Writer: end */ + ext3_free_blocks (inode, block, total); + } + unlock_kernel(); +#endif +} + +static int ext3_alloc_block (handle_t *handle, + struct inode * inode, unsigned long goal, int *err) +{ +#ifdef EXT3FS_DEBUG + static unsigned long alloc_hits = 0, alloc_attempts = 0; +#endif + unsigned long result; + +#ifdef EXT3_PREALLOCATE + /* Writer: ->i_prealloc* */ + if (inode->u.ext3_i.i_prealloc_count && + (goal == inode->u.ext3_i.i_prealloc_block || + goal + 1 == inode->u.ext3_i.i_prealloc_block)) + { + result = inode->u.ext3_i.i_prealloc_block++; + inode->u.ext3_i.i_prealloc_count--; + /* Writer: end */ + ext3_debug ("preallocation hit (%lu/%lu).\n", + ++alloc_hits, ++alloc_attempts); + } else { + ext3_discard_prealloc (inode); + ext3_debug ("preallocation miss (%lu/%lu).\n", + alloc_hits, ++alloc_attempts); + if (S_ISREG(inode->i_mode)) + result = ext3_new_block (inode, goal, + &inode->u.ext3_i.i_prealloc_count, + &inode->u.ext3_i.i_prealloc_block, err); + else + result = ext3_new_block (inode, goal, 0, 0, err); + /* + * AKPM: this is somewhat sticky. I'm not surprised it was + * disabled in 2.2's ext3. Need to integrate b_committed_data + * guarding with preallocation, if indeed preallocation is + * effective. + */ + } +#else + result = ext3_new_block (handle, inode, goal, 0, 0, err); +#endif + return result; +} + + +typedef struct { + u32 *p; + u32 key; + struct buffer_head *bh; +} Indirect; + +static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v) +{ + p->key = *(p->p = v); + p->bh = bh; +} + +static inline int verify_chain(Indirect *from, Indirect *to) +{ + while (from <= to && from->key == *from->p) + from++; + return (from > to); +} + +/** + * ext3_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * + * To store the locations of file's data ext3 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + +/* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
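To make the block-to-path translation described above concrete before the function itself (next hunk lines), here is a user-space transcription of the same arithmetic for a 1 KiB block size (so ptrs = 256, ptrs_bits = 8), with a couple of worked cases. All names are local to this sketch; EXT3_IND_BLOCK/DIND/TIND are 12, 13 and 14 in the real headers:

#include <assert.h>

#define NDIR      12	/* EXT3_NDIR_BLOCKS: direct pointers in the inode */
#define PTRS      256	/* pointers per 1 KiB indirect block */
#define PTRS_BITS 8

static int block_to_path(long b, int offsets[4])
{
	int n = 0;
	const long dbl = 1L << (PTRS_BITS * 2);	/* blocks under DIND */

	if (b < NDIR) {
		offsets[n++] = b;
	} else if ((b -= NDIR) < PTRS) {
		offsets[n++] = 12;		/* EXT3_IND_BLOCK */
		offsets[n++] = b;
	} else if ((b -= PTRS) < dbl) {
		offsets[n++] = 13;		/* EXT3_DIND_BLOCK */
		offsets[n++] = b >> PTRS_BITS;
		offsets[n++] = b & (PTRS - 1);
	} else {
		b -= dbl;
		offsets[n++] = 14;		/* EXT3_TIND_BLOCK */
		offsets[n++] = b >> (PTRS_BITS * 2);
		offsets[n++] = (b >> PTRS_BITS) & (PTRS - 1);
		offsets[n++] = b & (PTRS - 1);
	}
	return n;
}

int main(void)
{
	int o[4];

	assert(block_to_path(5, o) == 1 && o[0] == 5);	/* direct */
	assert(block_to_path(300, o) == 3 && o[1] == 0 && o[2] == 32);
	assert(block_to_path(70000, o) == 4 && o[2] == 16 && o[3] == 100);
	return 0;
}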
+ */ + +static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4]) +{ + int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT3_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + + if (i_block < 0) { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT3_IND_BLOCK; + offsets[n++] = i_block; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT3_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT3_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + } else { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big"); + } + return n; +} + +/** + * ext3_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + */ +static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, + Indirect chain[4], int *err) +{ + kdev_t dev = inode->i_dev; + int blocksize = inode->i_sb->s_blocksize; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = bread(dev, le32_to_cpu(p->key), blocksize); + if (!bh) + goto failure; + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (u32*)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + +changed: + *err = -EAGAIN; + goto no_block; +failure: + *err = -EIO; +no_block: + return p; +} + +/** + * ext3_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. 
+ *
+ * This function returns the preferred place for block allocation.
+ * It is used when heuristic for sequential allocation fails.
+ * Rules are:
+ *   + if there is a block to the left of our position - allocate near it.
+ *   + if pointer will live in indirect block - allocate near that block.
+ *   + if pointer will live in inode - allocate in the same
+ *     cylinder group.
+ * Caller must make sure that @ind is valid and will stay that way.
+ */
+
+static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+{
+    u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
+    u32 *p;
+
+    /* Try to find previous block */
+    for (p = ind->p - 1; p >= start; p--)
+        if (*p)
+            return le32_to_cpu(*p);
+
+    /* No such thing, so let's try location of indirect block */
+    if (ind->bh)
+        return ind->bh->b_blocknr;
+
+    /*
+     * It is going to be referred from inode itself? OK, just put it into
+     * the same cylinder group then.
+     */
+    return (inode->u.ext3_i.i_block_group *
+        EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+           le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
+}
+
+/**
+ * ext3_find_goal - find a preferred place for allocation.
+ * @inode: owner
+ * @block: block we want
+ * @chain: chain of indirect blocks
+ * @partial: pointer to the last triple within a chain
+ * @goal: place to store the result.
+ *
+ * Normally this function finds the preferred place for block allocation,
+ * stores it in *@goal and returns zero. If the branch had been changed
+ * under us we return -EAGAIN.
+ */
+
+static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
+              Indirect *partial, unsigned long *goal)
+{
+    /* Writer: ->i_next_alloc* */
+    if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
+        inode->u.ext3_i.i_next_alloc_block++;
+        inode->u.ext3_i.i_next_alloc_goal++;
+    }
+#ifdef SEARCH_FROM_ZERO
+    inode->u.ext3_i.i_next_alloc_block = 0;
+    inode->u.ext3_i.i_next_alloc_goal = 0;
+#endif
+    /* Writer: end */
+    /* Reader: pointers, ->i_next_alloc* */
+    if (verify_chain(chain, partial)) {
+        /*
+         * try the heuristic for sequential allocation,
+         * failing that at least try to get decent locality.
+         */
+        if (block == inode->u.ext3_i.i_next_alloc_block)
+            *goal = inode->u.ext3_i.i_next_alloc_goal;
+        if (!*goal)
+            *goal = ext3_find_near(inode, partial);
+#ifdef SEARCH_FROM_ZERO
+        *goal = 0;
+#endif
+        return 0;
+    }
+    /* Reader: end */
+    return -EAGAIN;
+}
+
+/**
+ * ext3_alloc_branch - allocate and set up a chain of blocks.
+ * @inode: owner
+ * @num: depth of the chain (number of blocks to allocate)
+ * @offsets: offsets (in the blocks) to store the pointers to next.
+ * @branch: place to store the chain in.
+ *
+ * This function allocates @num blocks, zeroes out all but the last one,
+ * links them into chain and (if we are synchronous) writes them to disk.
+ * In other words, it prepares a branch that can be spliced onto the
+ * inode. It stores the information about that chain in the branch[], in
+ * the same format as ext3_get_branch() would do. We are calling it after
+ * we had read the existing part of chain and partial points to the last
+ * triple of that (one with zero ->key). Upon the exit we have the same
+ * picture as after the successful ext3_get_block(), except that in one
+ * place chain is disconnected - *branch->p is still zero (we did not
+ * set the last link), but branch->key contains the number that should
+ * be placed into *branch->p to fill that gap.
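The backward scan in ext3_find_near() above reduces to a few lines; here is a user-space model (hypothetical types and names, same three rules: nearest allocated block to our left, else the indirect block itself, else the inode's cylinder group):

/* `p' points at the slot being filled inside the block of pointers that
 * starts at `start'; return a goal block number for the allocator. */
static unsigned long find_near(unsigned int *start, unsigned int *p,
			       unsigned long ind_blocknr,
			       unsigned long group_start)
{
	unsigned int *q;

	for (q = p - 1; q >= start; q--)	/* a block to our left? */
		if (*q)
			return *q;
	if (ind_blocknr)			/* the indirect block itself */
		return ind_blocknr;
	return group_start;			/* inode's cylinder group */
}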
+ * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ + +static int ext3_alloc_branch(handle_t *handle, struct inode *inode, + int num, + unsigned long goal, + int *offsets, + Indirect *branch) +{ + int blocksize = inode->i_sb->s_blocksize; + int n = 0, keys = 0; + int err = 0; + int i; + int parent = ext3_alloc_block(handle, inode, goal, &err); + + branch[0].key = cpu_to_le32(parent); + if (parent) { + for (n = 1; n < num; n++) { + struct buffer_head *bh; + /* Allocate the next block */ + int nr = ext3_alloc_block(handle, inode, parent, &err); + if (!nr) + break; + branch[n].key = cpu_to_le32(nr); + keys = n+1; + + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = getblk(inode->i_dev, parent, blocksize); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + break; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (u32*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + BUFFER_TRACE(bh, "marking uptodate"); + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + break; + + parent = nr; + } + if (IS_SYNC(inode)) + handle->h_sync = 1; + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < keys; i++) { + BUFFER_TRACE(branch[i].bh, "call journal_forget"); + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) + ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); + return err; +} + +/** + * ext3_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext3_alloc_branch) + * @where: location of missing link + * @num: number of blocks we are adding + * + * This function verifies that chain (up to the missing link) had not + * changed, fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. Otherwise (== chain had been changed) + * we free the new blocks (forgetting their buffer_heads, indeed) and + * return -EAGAIN. + */ + +static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, + Indirect chain[4], Indirect *where, int num) +{ + int i; + int err = 0; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. 
+ */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* Verify that place we are splicing to is still there and vacant */ + + /* Writer: pointers, ->i_next_alloc* */ + if (!verify_chain(chain, where-1) || *where->p) + /* Writer: end */ + goto changed; + + /* That's it */ + + *where->p = where->key; + inode->u.ext3_i.i_next_alloc_block = block; + inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key); +#ifdef SEARCH_FROM_ZERO + inode->u.ext3_i.i_next_alloc_block = 0; + inode->u.ext3_i.i_next_alloc_goal = 0; +#endif + /* Writer: end */ + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * akpm: If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + +changed: + /* + * AKPM: if where[i].bh isn't part of the current updating + * transaction then we explode nastily. Test this code path. + */ + jbd_debug(1, "the chain changed: try again\n"); + err = -EAGAIN; + +err_out: + for (i = 1; i < num; i++) { + BUFFER_TRACE(where[i].bh, "call journal_forget"); + ext3_journal_forget(handle, where[i].bh); + } + /* For the normal collision cleanup case, we free up the blocks. + * On genuine filesystem errors we don't even think about doing + * that. */ + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, + le32_to_cpu(where[i].key), 1); + return err; +} + +/* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * akpm: `handle' can be NULL if create == 0. 
+ */ + +static int ext3_get_block_handle(handle_t *handle, struct inode *inode, + long iblock, + struct buffer_head *bh_result, int create) +{ + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + unsigned long goal; + int left; + int depth = ext3_block_to_path(inode, iblock, offsets); + loff_t new_size; + + J_ASSERT(handle != NULL || create == 0); + + if (depth == 0) + goto out; + + lock_kernel(); +reread: + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + bh_result->b_state &= ~(1UL << BH_New); +got_it: + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key); + bh_result->b_state |= (1UL << BH_Mapped); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { +cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); + unlock_kernel(); +out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. + */ + if (err == -EAGAIN) + goto changed; + + if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) + goto changed; + + left = (chain + depth) - partial; + + /* + * Block out ext3_truncate while we alter the tree + */ + down_read(&inode->u.ext3_i.truncate_sem); + err = ext3_alloc_branch(handle, inode, left, goal, + offsets+(partial-chain), partial); + + /* The ext3_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct */ + if (!err) + err = ext3_splice_branch(handle, inode, iblock, chain, + partial, left); + up_read(&inode->u.ext3_i.truncate_sem); + if (err == -EAGAIN) + goto changed; + if (err) + goto cleanup; + + new_size = inode->i_size; + /* + * This is not racy against ext3_truncate's modification of i_disksize + * because VM/VFS ensures that the file cannot be extended while + * truncate is in progress. It is racy between multiple parallel + * instances of get_block, but we have the BKL. 
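The -EAGAIN handling above is an optimistic-concurrency loop: read the indirect chain without locks, and if verify_chain() notices that a pointer changed underneath us, drop what was read and start over. Schematically (a sketch with invented read_chain/release helpers, not the kernel code):

/* Optimistic read of a pointer chain, as in ext3_get_block_handle(). */
for (;;) {
	partial = read_chain(chain, &err);	/* fills chain[], may set err */
	if (err != -EAGAIN)
		break;				/* stable result (or hard error) */
	release(chain, partial);		/* brelse() what we read... */
	/* ...and retry: a concurrent truncate moved the tree under us. */
}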
+ */ + if (new_size > inode->u.ext3_i.i_disksize) + inode->u.ext3_i.i_disksize = new_size; + + bh_result->b_state |= (1UL << BH_New); + goto got_it; + +changed: + while (partial > chain) { + jbd_debug(1, "buffer chain changed, retrying\n"); + BUFFER_TRACE(partial->bh, "brelsing"); + brelse(partial->bh); + partial--; + } + goto reread; +} + +static int ext3_get_block(struct inode *inode, long iblock, + struct buffer_head *bh_result, int create) +{ + handle_t *handle = 0; + int ret; + + if (create) { + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } + ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); + return ret; +} + +/* + * `handle' can be NULL if create is zero + */ +struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, + long block, int create, int * errp) +{ + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + *errp = ext3_get_block_handle(handle, inode, block, &dummy, create); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = getblk(dummy.b_dev, dummy.b_blocknr, + inode->i_sb->s_blocksize); + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != 0); + + /* Now that we do not always journal data, we + should keep in mind whether this should + always journal the new buffer as metadata. + For now, regular file writes use + ext3_get_block instead, so it's not a + problem. */ + lock_kernel(); + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext3_journal_get_create_access(handle, bh); + if (!fatal) { + memset(bh->b_data, 0, + inode->i_sb->s_blocksize); + mark_buffer_uptodate(bh, 1); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) fatal = err; + unlock_kernel(); + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } + return NULL; +} + +struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, + int block, int create, int *err) +{ + struct buffer_head * bh; + int prev_blocks; + + prev_blocks = inode->i_blocks; + + bh = ext3_getblk (handle, inode, block, create, err); + if (!bh) + return bh; +#ifdef EXT3_PREALLOCATE + /* + * If the inode has grown, and this is a directory, then use a few + * more of the preallocated blocks to keep directory fragmentation + * down. The preallocated blocks are guaranteed to be contiguous. 
+ */ + if (create && + S_ISDIR(inode->i_mode) && + inode->i_blocks > prev_blocks && + EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_PREALLOC)) { + int i; + struct buffer_head *tmp_bh; + + for (i = 1; + inode->u.ext3_i.i_prealloc_count && + i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; + i++) { + /* + * ext3_getblk will zero out the contents of the + * directory for us + */ + tmp_bh = ext3_getblk(handle, inode, + block+i, create, err); + if (!tmp_bh) { + brelse (bh); + return 0; + } + brelse (tmp_bh); + } + } +#endif + if (buffer_uptodate(bh)) + return bh; + ll_rw_block (READ, 1, &bh); + wait_on_buffer (bh); + if (buffer_uptodate(bh)) + return bh; + brelse (bh); + *err = -EIO; + return NULL; +} + +static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) +{ + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = bh->b_this_page) + { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; +} + +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext3_get_block() + * and the commit_write(). So doing the journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext3_writepage() -> + * block_write_full_page(). In that case, we *know* that ext3_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext3 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. 
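walk_page_buffers() above is the workhorse of the commit paths that follow: it applies a callback to every buffer of the page that overlaps [from, to). A typical use, sketched here as a fragment (and essentially what do_journal_get_write_access() below packages up):

/* Example callback: take journal write access on one buffer. */
static int get_access_fn(handle_t *handle, struct buffer_head *bh)
{
	return ext3_journal_get_write_access(handle, bh);
}

/* ...then, for the byte range being written within `page': */
err = walk_page_buffers(handle, page->buffers, from, to,
			NULL, get_access_fn);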
+ */ + +static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) +{ + return ext3_journal_get_write_access(handle, bh); +} + +static int ext3_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + handle_t *handle = ext3_journal_current_handle(); + int ret, needed_blocks = ext3_writepage_trans_blocks(inode); + + lock_kernel(); + handle = ext3_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = block_prepare_write(page, from, to, ext3_get_block); + if (ret != 0) + goto prepare_write_failed; + + if (ext3_should_journal_data(inode)) + ret = walk_page_buffers(handle, page->buffers, + from, to, NULL, do_journal_get_write_access); +prepare_write_failed: + if (ret) + ext3_journal_stop(handle, inode); +out: + unlock_kernel(); + return ret; +} + +static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh) +{ + return ext3_journal_dirty_data(handle, bh, 0); +} + +/* + * For ext3_writepage(). We also brelse() the buffer to account for + * the bget() which ext3_writepage() performs. + */ +static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh) +{ + int ret = ext3_journal_dirty_data(handle, bh, 1); + __brelse(bh); + return ret; +} + +/* For commit_write() in data=journal mode */ +static int commit_write_fn(handle_t *handle, struct buffer_head *bh) +{ + set_bit(BH_Uptodate, &bh->b_state); + return ext3_journal_dirty_metadata(handle, bh); +} + +/* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from block_symlink(). + * + * ext3 inode->i_dirty_buffers policy: If we're journalling data we + * definitely don't want them to appear on the inode at all - instead + * we need to manage them at the JBD layer and we need to intercept + * the relevant sync operations and translate them into journal operations. + * + * If we're not journalling data then we can just leave the buffers + * on ->i_dirty_buffers. If someone writes them out for us then thanks. + * Otherwise we'll do it in commit, if we're using ordered data. + */ + +static int ext3_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + + lock_kernel(); + if (ext3_should_journal_data(inode)) { + /* + * Here we duplicate the generic_commit_write() functionality + */ + int partial = 0; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + ret = walk_page_buffers(handle, page->buffers, + from, to, &partial, commit_write_fn); + if (!partial) + SetPageUptodate(page); + kunmap(page); + if (pos > inode->i_size) + inode->i_size = pos; + set_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state); + } else { + if (ext3_should_order_data(inode)) { + ret = walk_page_buffers(handle, page->buffers, + from, to, NULL, journal_dirty_sync_data); + } + /* Be careful here if generic_commit_write becomes a + * required invocation after block_prepare_write. */ + if (ret == 0) + ret = generic_commit_write(file, page, from, to); + } + if (inode->i_size > inode->u.ext3_i.i_disksize) { + inode->u.ext3_i.i_disksize = inode->i_size; + ret2 = ext3_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + ret2 = ext3_journal_stop(handle, inode); + unlock_kernel(); + if (!ret) + ret = ret2; + return ret; +} + +/* + * bmap() is special. 
It gets used by applications such as lilo and by
+ * the swapper to find the on-disk block of a specific piece of data.
+ *
+ * Naturally, this is dangerous if the block concerned is still in the
+ * journal. If somebody makes a swapfile on an ext3 data-journaling
+ * filesystem and enables swap, then they may get a nasty shock when the
+ * data getting swapped to that swapfile suddenly gets overwritten by
+ * the original zero's written out previously to the journal and
+ * awaiting writeback in the kernel's buffer cache.
+ *
+ * So, if we see any bmap calls here on a modified, data-journaled file,
+ * take extra steps to flush any blocks which might be in the cache.
+ */
+static int ext3_bmap(struct address_space *mapping, long block)
+{
+    struct inode *inode = mapping->host;
+    journal_t *journal;
+    int err;
+
+    if (test_and_clear_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state)) {
+        /*
+         * This is a REALLY heavyweight approach, but the use of
+         * bmap on dirty files is expected to be extremely rare:
+         * only if we run lilo or swapon on a freshly made file
+         * do we expect this to happen.
+         *
+         * (bmap requires CAP_SYS_RAWIO so this does not
+         * represent an unprivileged user DOS attack --- we'd be
+         * in trouble if mortal users could trigger this path at
+         * will.)
+         *
+         * NB. EXT3_STATE_JDATA is not set on files other than
+         * regular files. If somebody wants to bmap a directory
+         * or symlink and gets confused because the buffer
+         * hasn't yet been flushed to disk, they deserve
+         * everything they get.
+         */
+
+        journal = EXT3_JOURNAL(inode);
+        journal_lock_updates(journal);
+        err = journal_flush(journal);
+        journal_unlock_updates(journal);
+
+        if (err)
+            return 0;
+    }
+
+    return generic_block_bmap(mapping,block,ext3_get_block);
+}
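From user space, the path above is reached through the FIBMAP ioctl, which needs CAP_SYS_RAWIO as the comment notes. A minimal caller (error handling pared down):

#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIBMAP */

int main(int argc, char **argv)
{
	int fd = open(argv[1], O_RDONLY);
	int block = 0;		/* logical block 0 of the file */

	if (fd < 0 || ioctl(fd, FIBMAP, &block) < 0) {
		perror("FIBMAP");
		return 1;
	}
	printf("block 0 lives in fs block %d\n", block);
	return 0;
}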
+
+static int bget_one(handle_t *handle, struct buffer_head *bh)
+{
+    atomic_inc(&bh->b_count);
+    return 0;
+}
+
+/*
+ * Note that we always start a transaction even if we're not journalling
+ * data. This is to preserve ordering: any hole instantiation within
+ * __block_write_full_page -> ext3_get_block() should be journalled
+ * along with the data so we don't crash and then get metadata which
+ * refers to old data.
+ *
+ * In all journalling modes block_write_full_page() will start the I/O.
+ *
+ * Problem:
+ *
+ * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+ *     ext3_writepage()
+ *
+ * Similar for:
+ *
+ * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
+ *
+ * Same applies to ext3_get_block(). We will deadlock on various things like
+ * lock_journal and i_truncate_sem.
+ *
+ * Setting PF_MEMALLOC here doesn't work - too many internal memory
+ * allocations fail.
+ *
+ * 16May01: If we're reentered then journal_current_handle() will be
+ *     non-zero. We simply *return*.
+ *
+ * 1 July 2001: @@@ FIXME:
+ *     In journalled data mode, a data buffer may be metadata against the
+ *     current transaction. But the same file is part of a shared mapping
+ *     and someone does a writepage() on it.
+ *
+ *     We will move the buffer onto the async_data list, but *after* it has
+ *     been dirtied. So there's a small window where we have dirty data on
+ *     BJ_Metadata.
+ *
+ *     Note that this only applies to the last partial page in the file. The
+ *     bit which block_write_full_page() uses prepare/commit for. (That's
+ *     broken code anyway: it's wrong for msync()).
+ *
+ *     It's a rare case: affects the final partial page, for journalled data
+ *     where the file is subject to both write() and writepage() in the same
+ *     transaction. To fix it we'll need a custom block_write_full_page().
+ * We'll probably need that anyway for journalling writepage() output.
+ *
+ * We don't honour synchronous mounts for writepage(). That would be
+ * disastrous. Any write() or metadata operation will sync the fs for
+ * us.
+ */
+static int ext3_writepage(struct page *page)
+{
+    struct inode *inode = page->mapping->host;
+    struct buffer_head *page_buffers;
+    handle_t *handle = NULL;
+    int ret = 0, err;
+    int needed;
+    int order_data;
+
+    J_ASSERT(PageLocked(page));
+
+    /*
+     * We give up here if we're reentered, because it might be
+     * for a different filesystem. One *could* look for a
+     * nested transaction opportunity.
+     */
+    lock_kernel();
+    if (ext3_journal_current_handle())
+        goto out_fail;
+
+    needed = ext3_writepage_trans_blocks(inode);
+    if (current->flags & PF_MEMALLOC)
+        handle = ext3_journal_try_start(inode, needed);
+    else
+        handle = ext3_journal_start(inode, needed);
+
+    if (IS_ERR(handle)) {
+        ret = PTR_ERR(handle);
+        goto out_fail;
+    }
+
+    order_data = ext3_should_order_data(inode) ||
+            ext3_should_journal_data(inode);
+
+    unlock_kernel();
+
+    page_buffers = NULL;    /* Purely to prevent compiler warning */
+
+    /* bget() all the buffers */
+    if (order_data) {
+        if (!page->buffers)
+            create_empty_buffers(page,
+                inode->i_dev, inode->i_sb->s_blocksize);
+        page_buffers = page->buffers;
+        walk_page_buffers(handle, page_buffers, 0,
+                PAGE_CACHE_SIZE, NULL, bget_one);
+    }
+
+    ret = block_write_full_page(page, ext3_get_block);
+
+    /*
+     * The page can become unlocked at any point now, and
+     * truncate can then come in and change things. So we
+     * can't touch *page from now on. But *page_buffers is
+     * safe due to elevated refcount.
+     */
+
+    handle = ext3_journal_current_handle();
+    lock_kernel();
+
+    /* And attach them to the current transaction */
+    if (order_data) {
+        err = walk_page_buffers(handle, page_buffers,
+            0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
+        if (!ret)
+            ret = err;
+    }
+
+    err = ext3_journal_stop(handle, inode);
+    if (!ret)
+        ret = err;
+    unlock_kernel();
+    return ret;
+
+out_fail:
+
+    unlock_kernel();
+    SetPageDirty(page);
+    UnlockPage(page);
+    return ret;
+}
+
+static int ext3_readpage(struct file *file, struct page *page)
+{
+    return block_read_full_page(page,ext3_get_block);
+}
+
+
+static int ext3_flushpage(struct page *page, unsigned long offset)
+{
+    journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+    return journal_flushpage(journal, page, offset);
+}
+
+static int ext3_releasepage(struct page *page, int wait)
+{
+    journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+    return journal_try_to_free_buffers(journal, page, wait);
+}
+
+
+struct address_space_operations ext3_aops = {
+    readpage:       ext3_readpage,      /* BKL not held. Don't need */
+    writepage:      ext3_writepage,     /* BKL not held. We take it */
+    sync_page:      block_sync_page,
+    prepare_write:  ext3_prepare_write, /* BKL not held. We take it */
+    commit_write:   ext3_commit_write,  /* BKL not held. We take it */
+    bmap:           ext3_bmap,          /* BKL held */
+    flushpage:      ext3_flushpage,     /* BKL not held. Don't need */
+    releasepage:    ext3_releasepage,   /* BKL not held. Don't need */
+};
+
+/*
+ * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
+ * up to the end of the block which corresponds to `from'.
+ * This is required during truncate. We need to physically zero the tail end
+ * of that block so it doesn't yield old data if the file is later grown.
+ */ +static int ext3_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) +{ + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = inode->i_sb->s_blocksize; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + + /* Find the buffer that contains "offset" */ + bh = page->buffers; + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + /* Hole? Nothing to do */ + if (buffer_uptodate(bh)) + goto unlock; + ext3_get_block(inode, iblock, bh, 0); + /* Still unmapped? Nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (Page_Uptodate(page)) + set_bit(BH_Uptodate, &bh->b_state); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (ext3_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + + memset(kmap(page) + offset, 0, length); + flush_dcache_page(page); + kunmap(page); + + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext3_should_journal_data(inode)) { + err = ext3_journal_dirty_metadata(handle, bh); + } else { + if (ext3_should_order_data(inode)) + err = ext3_journal_dirty_data(handle, bh, 0); + __mark_buffer_dirty(bh); + } + +unlock: + UnlockPage(page); + page_cache_release(page); +out: + return err; +} + +/* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ +static inline int all_zeroes(u32 *p, u32 *q) +{ + while (p < q) + if (*p++) + return 0; + return 1; +} + +/** + * ext3_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext3_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext3_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is refered + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext3_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. 
+ * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + +static Indirect *ext3_find_shared(struct inode *inode, + int depth, + int offsets[4], + Indirect chain[4], + u32 *top) +{ + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offest + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext3_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext3. Must leave the tree intact */ +#if 0 + *p->p = 0; +#endif + } + /* Writer: end */ + + while(partial > p) + { + brelse(partial->bh); + partial--; + } +no_top: + return partial; +} + +/* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + */ +static void +ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, + unsigned long block_to_free, unsigned long count, + u32 *first, u32 *last) +{ + u32 *p; + kdev_t dev = inode->i_sb->s_dev; + unsigned long blocksize = inode->i_sb->s_blocksize; + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, bh); + } + ext3_mark_inode_dirty(handle, inode); + ext3_journal_test_restart(handle, inode); + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + } + + /* + * Any buffers which are on the journal will be in memory. We find + * them on the hash table so journal_revoke() will run journal_forget() + * on them. We've already detached each block from the file, so + * bforget() in journal_forget() should be safe. + * + * AKPM: turn on bforget in journal_forget()!!! 
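+ *
+ * For illustration: if the pointer slots first..last hold the block
+ * numbers {100, 0, 101, 102}, ext3_free_data() calls us with
+ * block_to_free = 100 and count = 3.  The hole occupies a slot in
+ * the array but no block on disk, which is why (last - first) may
+ * exceed `count'.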
+ */ + for (p = first; p < last; p++) { + u32 nr = le32_to_cpu(*p); + if (nr) { + struct buffer_head *bh; + + *p = 0; + bh = get_hash_table(dev, nr, blocksize); + ext3_forget(handle, 0, inode, bh, nr); + } + } + + ext3_free_blocks(handle, inode, block_to_free, count); +} + +/** + * ext3_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks refered from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ +static void ext3_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, u32 *first, u32 *last) +{ + unsigned long block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + u32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + unsigned long nr; /* Current block # */ + u32 *p; /* Pointer into inode/ind + for current block */ + int err; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + ext3_clear_blocks(handle, inode, this_bh, + block_to_free, + count, block_to_free_p, p); + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (count > 0) + ext3_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, this_bh); + } +} + +/** + * ext3_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks refered from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately. 
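+ *
+ * For illustration: @depth is 1 for a singly-indirect branch, 2 for
+ * doubly-indirect and 3 for triply-indirect.  Each recursion level
+ * reads one indirect block and recurses on its addr_per_block
+ * pointers; once @depth reaches zero, @first..@last are data block
+ * numbers and go straight to ext3_free_data().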
+ */ +static void ext3_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + u32 *first, u32 *last, int depth) +{ + unsigned long nr; + u32 *p; + + if (is_handle_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + /* Go read the buffer for the next level down */ + bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext3_error(inode->i_sb, "ext3_free_branches", + "Read failure, inode=%ld, block=%ld", + inode->i_ino, nr); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext3_free_branches(handle, inode, bh, (u32*)bh->b_data, + (u32*)bh->b_data + addr_per_block, + depth); + + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. + * + * If this block has already been committed to the + * journal, a revoke record will be written. And + * revoke records must be emitted *before* clearing + * this block's bit in the bitmaps. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + + /* + * Everything below this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (is_handle_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext3_mark_inode_dirty(handle, inode); + ext3_journal_test_restart(handle, inode); + } + + ext3_free_blocks(handle, inode, nr, 1); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext3_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext3_free_data(handle, inode, parent_bh, first, last); + } +} + +/* + * ext3_truncate() + * + * We block out ext3_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext3_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash. 
+ * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext3_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext3 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext3_truncate() run will find them and release them. + */ + +void ext3_truncate(struct inode * inode) +{ + handle_t *handle; + u32 *i_data = inode->u.ext3_i.i_data; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + int nr = 0; + int n; + long last_block; + unsigned blocksize; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE_FILE(inode)) + return; + + ext3_discard_prealloc(inode); + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + blocksize = inode->i_sb->s_blocksize; + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); + + ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size); + + + n = ext3_block_to_path(inode, last_block, offsets); + if (n == 0) + goto out_stop; /* error */ + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext3 *really* writes onto the disk inode. + */ + inode->u.ext3_i.i_disksize = inode->i_size; + + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&inode->u.ext3_i.truncate_sem); + + if (n == 1) { /* direct blocks */ + ext3_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT3_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext3_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. 
+ */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext3_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext3_free_branches(handle, inode, partial->bh, partial->p + 1, + (u32*)partial->bh->b_data + addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse (partial->bh); + partial--; + } +do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT3_IND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 1); + i_data[EXT3_IND_BLOCK] = 0; + } + case EXT3_IND_BLOCK: + nr = i_data[EXT3_DIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 2); + i_data[EXT3_DIND_BLOCK] = 0; + } + case EXT3_DIND_BLOCK: + nr = i_data[EXT3_TIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 3); + i_data[EXT3_TIND_BLOCK] = 0; + } + case EXT3_TIND_BLOCK: + ; + } + up_write(&inode->u.ext3_i.truncate_sem); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous */ + if (IS_SYNC(inode)) + handle->h_sync = 1; +out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + ext3_journal_stop(handle, inode); +} + +/* + * ext3_get_inode_loc returns with an extra refcount against the + * inode's underlying buffer_head on success. 
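+ *
+ * A minimal illustration of the pairing a caller must observe:
+ *
+ *	struct ext3_iloc iloc;
+ *
+ *	if (ext3_get_inode_loc(inode, &iloc) == 0) {
+ *		... read or modify iloc.raw_inode ...
+ *		brelse(iloc.bh);	(drop the extra reference)
+ *	}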
+ */ + +int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) +{ + struct buffer_head *bh = 0; + unsigned long block; + unsigned long block_group; + unsigned long group_desc; + unsigned long desc; + unsigned long offset; + struct ext3_group_desc * gdp; + + if ((inode->i_ino != EXT3_ROOT_INO && + inode->i_ino != EXT3_ACL_IDX_INO && + inode->i_ino != EXT3_ACL_DATA_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu( + inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "bad inode number: %lu", inode->i_ino); + goto bad_inode; + } + block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); + if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "group >= groups count"); + goto bad_inode; + } + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); + desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); + bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc]; + if (!bh) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "Descriptor not loaded"); + goto bad_inode; + } + + gdp = (struct ext3_group_desc *) bh->b_data; + /* + * Figure out the offset within the block group inode table + */ + offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * + EXT3_INODE_SIZE(inode->i_sb); + block = le32_to_cpu(gdp[desc].bg_inode_table) + + (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); + if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, block); + goto bad_inode; + } + offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); + + iloc->bh = bh; + iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); + iloc->block_group = block_group; + + return 0; + + bad_inode: + return -EIO; +} + +void ext3_read_inode(struct inode * inode) +{ + struct ext3_iloc iloc; + struct ext3_inode *raw_inode; + struct buffer_head *bh; + int block; + + if(ext3_get_inode_loc(inode, &iloc)) + goto bad_inode; + bh = iloc.bh; + raw_inode = iloc.raw_inode; + init_rwsem(&inode->u.ext3_i.truncate_sem); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime = le32_to_cpu(raw_inode->i_atime); + inode->i_ctime = le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); + inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. 
*/ + } + inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size + * (for stat), not the fs block + * size */ + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + inode->i_version = ++event; + inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags); +#ifdef EXT3_FRAGMENTS + inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr); + inode->u.ext3_i.i_frag_no = raw_inode->i_frag; + inode->u.ext3_i.i_frag_size = raw_inode->i_fsize; +#endif + inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } + inode->u.ext3_i.i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); +#ifdef EXT3_PREALLOCATE + inode->u.ext3_i.i_prealloc_count = 0; +#endif + inode->u.ext3_i.i_block_group = iloc.block_group; + + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT3_N_BLOCKS; block++) + inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block]; + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); + + brelse (iloc.bh); + + if (inode->i_ino == EXT3_ACL_IDX_INO || + inode->i_ino == EXT3_ACL_DATA_INO) + /* Nothing to do */ ; + else if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (!inode->i_blocks) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + } + } else + init_special_inode(inode, inode->i_mode, + le32_to_cpu(iloc.raw_inode->i_block[0])); + /* inode->i_attr_flags = 0; unused */ + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ + inode->i_flags |= S_SYNC; + } + if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */ + inode->i_flags |= S_APPEND; + } + if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FILE_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE_FILE; unused */ + inode->i_flags |= S_IMMUTABLE_FILE; + } + if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_LINK_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE_FILE; unused */ + inode->i_flags |= S_IMMUTABLE_LINK; + } + if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */ + inode->i_flags |= S_NOATIME; + } + return; + +bad_inode: + make_bad_inode(inode); + return; +} + +/* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. 
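+ *
+ * The usual route into here is the pattern this file uses throughout
+ * (sketched; see ext3_mark_inode_dirty() below):
+ *
+ *	handle = ext3_journal_start(inode, 1);
+ *	err = ext3_reserve_inode_write(handle, inode, &iloc);
+ *	if (!err)
+ *		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ *	ext3_journal_stop(handle, inode);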
+ */ + +static int ext3_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) +{ + struct ext3_inode *raw_inode = iloc->raw_inode; + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + + if (handle) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto out_brelse; + } + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if(!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); +/* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!inode->u.ext3_i.i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime); + raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags); +#ifdef EXT3_FRAGMENTS + raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr); + raw_inode->i_frag = inode->u.ext3_i.i_frag_no; + raw_inode->i_fsize = inode->u.ext3_i.i_frag_size; +#else + /* If we are not tracking these fields in the in-memory inode, + * then preserve them on disk, but still initialise them to zero + * for new inodes. */ + if (inode->u.ext3_i.i_state & EXT3_STATE_NEW) { + raw_inode->i_faddr = 0; + raw_inode->i_frag = 0; + raw_inode->i_fsize = 0; + } +#endif + raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); + } else { + raw_inode->i_size_high = + cpu_to_le32(inode->u.ext3_i.i_disksize >> 32); + if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT3_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT3_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. 
+ */ + err = ext3_journal_get_write_access(handle, + sb->u.ext3_sb.s_sbh); + if (err) + goto out_brelse; + ext3_update_dynamic_rev(sb); + EXT3_SET_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; + handle->h_sync = 1; + err = ext3_journal_dirty_metadata(handle, + sb->u.ext3_sb.s_sbh); + } + } + } + raw_inode->i_generation = cpu_to_le32(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_block[0] = + cpu_to_le32(kdev_t_to_nr(inode->i_rdev)); + else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = inode->u.ext3_i.i_data[block]; + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; + inode->u.ext3_i.i_state &= ~EXT3_STATE_NEW; + +out_brelse: + brelse (bh); + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * ext3_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * transaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if told to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ +void ext3_write_inode(struct inode *inode, int wait) +{ + if (current->flags & PF_MEMALLOC) + return; + + if (ext3_journal_current_handle()) { + jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); + return; + } + + if (!wait) + return; + + ext3_force_commit(inode->i_sb); +} + +/* + * ext3_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * This is only needed for regular files. rmdir() has its own path, and + * we can never truncate a directory except on final unlink (at which + * point i_nlink is zero so recovery is easy.) + * + * Called with the BKL. 
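+ *
+ * The resulting order of events for a shrinking truncate (a sketch
+ * of the code below):
+ *
+ *	1: ext3_journal_start()
+ *	2: ext3_orphan_add()	(protects us across a crash)
+ *	3: i_disksize = attr->ia_size; ext3_mark_inode_dirty()
+ *	4: ext3_journal_stop()
+ *	5: inode_setattr()	(flushes pages, frees the blocks)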
+ */ + +int ext3_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int error, rc; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + handle_t *handle; + + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + + error = ext3_orphan_add(handle, inode); + inode->u.ext3_i.i_disksize = attr->ia_size; + rc = ext3_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext3_journal_stop(handle, inode); + } + + inode_setattr(inode, attr); + + /* If inode_setattr's call to ext3_truncate failed to get a + * transaction handle at all, we need to clean up the in-core + * orphan list manually. */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + +err_out: + ext3_std_error(inode->i_sb, error); + return 0; +} + + +/* + * akpm: how many blocks doth make a writepage()? + * + * With N blocks per page, it may be: + * N data blocks + * 2 indirect block + * 2 dindirect + * 1 tindirect + * N+5 bitmap blocks (from the above) + * N+5 group descriptor summary blocks + * 1 inode block + * 1 superblock. + * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files + * + * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS + * + * With ordered or writeback data it's the same, less the N data blocks. + * + * If the inode's direct blocks can hold an integral number of pages then a + * page cannot straddle two indirect blocks, and we can only touch one indirect + * and dindirect block, and the "5" above becomes "3". + * + * This still overestimates under most circumstances. If we were to pass the + * start and end offsets in here as well we could do block_to_path() on each + * block and work out the exact number of indirects which are touched. Pah. + */ + +int ext3_writepage_trans_blocks(struct inode *inode) +{ + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else + ret = 2 * (bpp + indirects) + 2; + +#ifdef CONFIG_QUOTA + ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; +#endif + + return ret; +} + +int +ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) +{ + int err = 0; + + if (handle) { + /* the do_update_inode consumes one bh->b_count */ + atomic_inc(&iloc->bh->b_count); + err = ext3_do_update_inode(handle, inode, iloc); + /* ext3_do_update_inode() does journal_dirty_metadata */ + brelse(iloc->bh); + } else { + printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n"); + } + return err; +} + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int +ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc) +{ + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + } + ext3_std_error(inode->i_sb, err); + return err; +} + +/* + * akpm: What we do here is to mark the in-core inode as clean + * with respect to inode dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. 
This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ +int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + int err; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (!err) + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + return err; +} + +/* + * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ +void ext3_dirty_inode(struct inode *inode) +{ + handle_t *current_handle = ext3_journal_current_handle(); + handle_t *handle; + + lock_kernel(); + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + goto out; + if (current_handle && + current_handle->h_transaction != handle->h_transaction) { + /* This task has a transaction open against a different fs */ + printk(KERN_EMERG __FUNCTION__": transactions do not match!\n"); + } else { + jbd_debug(5, "marking dirty. outer handle=%p\n", + current_handle); + ext3_mark_inode_dirty(handle, inode); + } + ext3_journal_stop(handle, inode); +out: + unlock_kernel(); +} + +#ifdef AKPM +/* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext3_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ +static inline int +ext3_pin_inode(handle_t *handle, struct inode *inode) +{ + struct ext3_iloc iloc; + + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext3_journal_dirty_metadata(handle, + iloc.bh); + brelse(iloc.bh); + } + } + ext3_std_error(inode->i_sb, err); + return err; +} +#endif + +int ext3_change_inode_journal_flag(struct inode *inode, int val) +{ + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. 
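+ * (Concretely: suppose block B is journalled as data in transaction
+ * T, the flag is then cleared, and B is freed and reallocated as
+ * metadata.  Without a revoke record, replay of T after a crash
+ * would rewrite the stale data image of B over the new metadata.)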
+ * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT3_JOURNAL(inode); + if (is_journal_aborted(journal) || IS_RDONLY(inode)) + return -EROFS; + + journal_lock_updates(journal); + journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL; + else + inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL; + + journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext3_mark_inode_dirty(handle, inode); + handle->h_sync = 1; + ext3_journal_stop(handle, inode); + ext3_std_error(inode->i_sb, err); + + return err; +} + + +/* + * ext3_aops_journal_start(). + * + * + * + * We need to take the inode semaphore *outside* the + * journal_start/journal_stop. Otherwise, a different task could do a + * wait_for_commit() while holding ->i_sem, which deadlocks. The rule + * is: transaction open/closes are considered to be a locking operation + * and they nest *inside* ->i_sem. + * ---------------------------------------------------------------------------- + * Possible problem: + * ext3_file_write() + * -> generic_file_write() + * -> __alloc_pages() + * -> page_launder() + * -> ext3_writepage() + * + * And the writepage can be on a different fs while we have a + * transaction open against this one! Bad. + * + * I tried making the task PF_MEMALLOC here, but that simply results in + * 0-order allocation failures passed back to generic_file_write(). + * Instead, we rely on the reentrancy protection in ext3_writepage(). + * ---------------------------------------------------------------------------- + * When we do the journal_start() here we don't really need to reserve + * any blocks - we won't need any until we hit ext3_prepare_write(), + * which does all the needed journal extending. However! There is a + * problem with quotas: + * + * Thread 1: + * sys_sync + * ->sync_dquots + * ->commit_dquot + * ->lock_dquot + * ->write_dquot + * ->ext3_file_write + * ->journal_start + * ->ext3_prepare_write + * ->journal_extend + * ->journal_start + * Thread 2: + * ext3_create (for example) + * ->ext3_new_inode + * ->dquot_initialize + * ->lock_dquot + * + * Deadlock. Thread 1's journal_start blocks because thread 2 has a + * transaction open. Thread 2's transaction will never close because + * thread 2 is stuck waiting for the dquot lock. + * + * So. We must ensure that thread 1 *never* needs to extend the journal + * for quota writes. We do that by reserving enough journal blocks + * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we + * need to extend" test in ext3_prepare_write() succeeds. 
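+ *
+ * A minimal sketch of such a start routine (illustrative only - the
+ * credit reservation is the point, not the exact shape):
+ *
+ *	static handle_t *ext3_aops_journal_start(struct inode *inode)
+ *	{
+ *		int needed = 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;	(quota)
+ *		return ext3_journal_start(inode, needed);
+ *	}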
+ */ + + +MODULE_LICENSE("GPL"); diff -urN linux-2.4.16-reiserfspatches-immutable/fs/ext3/ioctl.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext3/ioctl.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/ext3/ioctl.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ext3/ioctl.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,174 @@ +/* + * linux/fs/ext3/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + +#include +#include +#include +#include +#include +#include + + +int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) +{ + unsigned int flags; + + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT3_IOC_GETFLAGS: + flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int *) arg); + case EXT3_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err; + struct ext3_iloc iloc; + unsigned int oldflags; + unsigned int jflag; + + if (IS_RDONLY(inode)) + return -EROFS; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (get_user(flags, (int *) arg)) + return -EFAULT; + + oldflags = inode->u.ext3_i.i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FILE_FL | EXT3_IMMUTABLE_LINK_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. 
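+ *
+ * (The test below, (jflag ^ oldflags) & EXT3_JOURNAL_DATA_FL, is
+ * non-zero exactly when the bit is being toggled, so leaving the
+ * flag alone never requires CAP_SYS_RESOURCE.)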
+ */ + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + } + + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + flags = flags & EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; + inode->u.ext3_i.i_flags = flags; + + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (flags & EXT3_IMMUTABLE_FILE_FL) + inode->i_flags |= S_IMMUTABLE_FILE; + else + inode->i_flags &= ~S_IMMUTABLE_FILE; + if (flags & EXT3_IMMUTABLE_LINK_FL) + inode->i_flags |= S_IMMUTABLE_LINK; + else + inode->i_flags &= ~S_IMMUTABLE_LINK; + if (flags & EXT3_NOATIME_FL) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; + inode->i_ctime = CURRENT_TIME; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext3_journal_stop(handle, inode); + if (err) + return err; + + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int *) arg); + case EXT3_IOC_SETVERSION: + case EXT3_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext3_iloc iloc; + __u32 generation; + int err; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + if (IS_RDONLY(inode)) + return -EROFS; + if (get_user(generation, (int *) arg)) + return -EFAULT; + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode->i_generation = generation; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + ext3_journal_stop(handle, inode); + return err; + } +#ifdef CONFIG_JBD_DEBUG + case EXT3_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. 
+ */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); + if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); + return ret; + } +#endif + default: + return -ENOTTY; + } +} diff -urN linux-2.4.16-reiserfspatches-immutable/fs/fat/file.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/fat/file.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/fat/file.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/fat/file.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,136 @@ +/* + * linux/fs/fat/file.c + * + * Written 1992,1993 by Werner Almesberger + * + * regular file handling primitives for fat-based filesystems + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PRINTK(x) +#define Printk(x) printk x + +struct file_operations fat_file_operations = { + llseek: generic_file_llseek, + read: fat_file_read, + write: fat_file_write, + mmap: generic_file_mmap, + fsync: file_fsync, +}; + +struct inode_operations fat_file_inode_operations = { + truncate: fat_truncate, + setattr: fat_notify_change, +}; + +ssize_t fat_file_read( + struct file *filp, + char *buf, + size_t count, + loff_t *ppos) +{ + struct inode *inode = filp->f_dentry->d_inode; + return MSDOS_SB(inode->i_sb)->cvf_format + ->cvf_file_read(filp,buf,count,ppos); +} + + +int fat_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long phys; + + phys = fat_bmap(inode, iblock); + if (phys) { + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = phys; + bh_result->b_state |= (1UL << BH_Mapped); + return 0; + } + if (!create) + return 0; + if (iblock << sb->s_blocksize_bits != MSDOS_I(inode)->mmu_private) { + BUG(); + return -EIO; + } + if (!(iblock % MSDOS_SB(inode->i_sb)->cluster_size)) { + if (fat_add_cluster(inode) < 0) + return -ENOSPC; + } + MSDOS_I(inode)->mmu_private += sb->s_blocksize; + phys = fat_bmap(inode, iblock); + if (!phys) + BUG(); + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = phys; + bh_result->b_state |= (1UL << BH_Mapped); + bh_result->b_state |= (1UL << BH_New); + return 0; +} + +ssize_t fat_file_write( + struct file *filp, + const char *buf, + size_t count, + loff_t *ppos) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct super_block *sb = inode->i_sb; + return MSDOS_SB(sb)->cvf_format + ->cvf_file_write(filp,buf,count,ppos); +} + +ssize_t default_fat_file_write( + struct file *filp, + const char *buf, + size_t count, + loff_t *ppos) +{ + struct inode *inode = filp->f_dentry->d_inode; + int retval; + + retval = generic_file_write(filp, buf, count, ppos); + if (retval > 0) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + mark_inode_dirty(inode); + } + return retval; +} + +void fat_truncate(struct inode *inode) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + int cluster; + + /* Why no return value? Surely the disk could fail... */ + if (IS_RDONLY (inode)) + return /* -EPERM */; + if (IS_IMMUTABLE_FILE(inode)) + return /* -EPERM */; + cluster = 1 << sbi->cluster_bits; + /* + * This protects against truncating a file bigger than it was then + * trying to write into the hole. 
+ */ + if (MSDOS_I(inode)->mmu_private > inode->i_size) + MSDOS_I(inode)->mmu_private = inode->i_size; + + fat_free(inode, (inode->i_size + (cluster - 1)) >> sbi->cluster_bits); + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty(inode); +} diff -urN linux-2.4.16-reiserfspatches-immutable/fs/fat/inode.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/fat/inode.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/fat/inode.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/fat/inode.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,1061 @@ +/* + * linux/fs/fat/inode.c + * + * Written 1992,1993 by Werner Almesberger + * VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner + * Rewritten for the constant inumbers support by Al Viro + * + * Fixes: + * + * Max Cohan: Fixed invalid FSINFO offset when info_sector is 0 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +extern struct cvf_format default_cvf; + +/* #define FAT_PARANOIA 1 */ +#define DEBUG_LEVEL 0 +#ifdef FAT_DEBUG +# define PRINTK(x) printk x +#else +# define PRINTK(x) +#endif +#if (DEBUG_LEVEL >= 1) +# define PRINTK1(x) printk x +#else +# define PRINTK1(x) +#endif + +/* + * New FAT inode stuff. We do the following: + * a) i_ino is constant and has nothing with on-disk location. + * b) FAT manages its own cache of directory entries. + * c) *This* cache is indexed by on-disk location. + * d) inode has an associated directory entry, all right, but + * it may be unhashed. + * e) currently entries are stored within struct inode. That should + * change. + * f) we deal with races in the following way: + * 1. readdir() and lookup() do FAT-dir-cache lookup. + * 2. rename() unhashes the F-d-c entry and rehashes it in + * a new place. + * 3. unlink() and rmdir() unhash F-d-c entry. + * 4. fat_write_inode() checks whether the thing is unhashed. + * If it is we silently return. If it isn't we do bread(), + * check if the location is still valid and retry if it + * isn't. Otherwise we do changes. + * 5. Spinlock is used to protect hash/unhash/location check/lookup + * 6. fat_clear_inode() unhashes the F-d-c entry. + * 7. lookup() and readdir() do igrab() if they find a F-d-c entry + * and consider negative result as cache miss. 
+ */ + +#define FAT_HASH_BITS 8 +#define FAT_HASH_SIZE (1UL << FAT_HASH_BITS) +#define FAT_HASH_MASK (FAT_HASH_SIZE-1) +static struct list_head fat_inode_hashtable[FAT_HASH_SIZE]; +spinlock_t fat_inode_lock = SPIN_LOCK_UNLOCKED; + +void fat_hash_init(void) +{ + int i; + for(i = 0; i < FAT_HASH_SIZE; i++) { + INIT_LIST_HEAD(&fat_inode_hashtable[i]); + } +} + +static inline unsigned long fat_hash(struct super_block *sb, int i_pos) +{ + unsigned long tmp = (unsigned long)i_pos | (unsigned long) sb; + tmp = tmp + (tmp >> FAT_HASH_BITS) + (tmp >> FAT_HASH_BITS * 2); + return tmp & FAT_HASH_MASK; +} + +void fat_attach(struct inode *inode, int i_pos) +{ + spin_lock(&fat_inode_lock); + MSDOS_I(inode)->i_location = i_pos; + list_add(&MSDOS_I(inode)->i_fat_hash, + fat_inode_hashtable + fat_hash(inode->i_sb, i_pos)); + spin_unlock(&fat_inode_lock); +} + +void fat_detach(struct inode *inode) +{ + spin_lock(&fat_inode_lock); + MSDOS_I(inode)->i_location = 0; + list_del(&MSDOS_I(inode)->i_fat_hash); + INIT_LIST_HEAD(&MSDOS_I(inode)->i_fat_hash); + spin_unlock(&fat_inode_lock); +} + +struct inode *fat_iget(struct super_block *sb, int i_pos) +{ + struct list_head *p = fat_inode_hashtable + fat_hash(sb, i_pos); + struct list_head *walk; + struct msdos_inode_info *i; + struct inode *inode = NULL; + + spin_lock(&fat_inode_lock); + list_for_each(walk, p) { + i = list_entry(walk, struct msdos_inode_info, i_fat_hash); + if (i->i_fat_inode->i_sb != sb) + continue; + if (i->i_location != i_pos) + continue; + inode = igrab(i->i_fat_inode); + if (inode) + break; + } + spin_unlock(&fat_inode_lock); + return inode; +} + +static void fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de); + +struct inode *fat_build_inode(struct super_block *sb, + struct msdos_dir_entry *de, int ino, int *res) +{ + struct inode *inode; + *res = 0; + inode = fat_iget(sb, ino); + if (inode) + goto out; + inode = new_inode(sb); + *res = -ENOMEM; + if (!inode) + goto out; + *res = 0; + inode->i_ino = iunique(sb, MSDOS_ROOT_INO); + fat_fill_inode(inode, de); + fat_attach(inode, ino); + insert_inode_hash(inode); +out: + return inode; +} + +void fat_delete_inode(struct inode *inode) +{ + if (!is_bad_inode(inode)) { + lock_kernel(); + inode->i_size = 0; + fat_truncate(inode); + unlock_kernel(); + } + clear_inode(inode); +} + +void fat_clear_inode(struct inode *inode) +{ + if (is_bad_inode(inode)) + return; + lock_kernel(); + spin_lock(&fat_inode_lock); + fat_cache_inval_inode(inode); + list_del(&MSDOS_I(inode)->i_fat_hash); + spin_unlock(&fat_inode_lock); + unlock_kernel(); +} + +void fat_put_super(struct super_block *sb) +{ + if (MSDOS_SB(sb)->cvf_format->cvf_version) { + dec_cvf_format_use_count_by_version(MSDOS_SB(sb)->cvf_format->cvf_version); + MSDOS_SB(sb)->cvf_format->unmount_cvf(sb); + } + if (MSDOS_SB(sb)->fat_bits == 32) { + fat_clusters_flush(sb); + } + fat_cache_inval_dev(sb->s_dev); + set_blocksize (sb->s_dev,BLOCK_SIZE); + if (MSDOS_SB(sb)->nls_disk) { + unload_nls(MSDOS_SB(sb)->nls_disk); + MSDOS_SB(sb)->nls_disk = NULL; + MSDOS_SB(sb)->options.codepage = 0; + } + if (MSDOS_SB(sb)->nls_io) { + unload_nls(MSDOS_SB(sb)->nls_io); + MSDOS_SB(sb)->nls_io = NULL; + } + /* + * Note: the iocharset option might have been specified + * without enabling nls_io, so check for it here. 
+ */ + if (MSDOS_SB(sb)->options.iocharset) { + kfree(MSDOS_SB(sb)->options.iocharset); + MSDOS_SB(sb)->options.iocharset = NULL; + } +} + + +static int parse_options(char *options,int *fat, int *debug, + struct fat_mount_options *opts, + char *cvf_format, char *cvf_options) +{ + char *this_char,*value,save,*savep; + char *p; + int ret = 1, len; + + opts->name_check = 'n'; + opts->conversion = 'b'; + opts->fs_uid = current->uid; + opts->fs_gid = current->gid; + opts->fs_umask = current->fs->umask; + opts->quiet = opts->sys_immutable = opts->dotsOK = opts->showexec = 0; + opts->codepage = 0; + opts->nocase = 0; + opts->shortname = 0; + opts->utf8 = 0; + opts->iocharset = NULL; + *debug = *fat = 0; + + if (!options) + goto out; + save = 0; + savep = NULL; + for (this_char = strtok(options,","); this_char; + this_char = strtok(NULL,",")) { + if ((value = strchr(this_char,'=')) != NULL) { + save = *value; + savep = value; + *value++ = 0; + } + if (!strcmp(this_char,"check") && value) { + if (value[0] && !value[1] && strchr("rns",*value)) + opts->name_check = *value; + else if (!strcmp(value,"relaxed")) + opts->name_check = 'r'; + else if (!strcmp(value,"normal")) + opts->name_check = 'n'; + else if (!strcmp(value,"strict")) + opts->name_check = 's'; + else ret = 0; + } + else if (!strcmp(this_char,"conv") && value) { + if (value[0] && !value[1] && strchr("bta",*value)) + opts->conversion = *value; + else if (!strcmp(value,"binary")) + opts->conversion = 'b'; + else if (!strcmp(value,"text")) + opts->conversion = 't'; + else if (!strcmp(value,"auto")) + opts->conversion = 'a'; + else ret = 0; + } + else if (!strcmp(this_char,"dots")) { + opts->dotsOK = 1; + } + else if (!strcmp(this_char,"nocase")) { + opts->nocase = 1; + } + else if (!strcmp(this_char,"nodots")) { + opts->dotsOK = 0; + } + else if (!strcmp(this_char,"showexec")) { + opts->showexec = 1; + } + else if (!strcmp(this_char,"dotsOK") && value) { + if (!strcmp(value,"yes")) opts->dotsOK = 1; + else if (!strcmp(value,"no")) opts->dotsOK = 0; + else ret = 0; + } + else if (!strcmp(this_char,"uid")) { + if (!value || !*value) ret = 0; + else { + opts->fs_uid = simple_strtoul(value,&value,0); + if (*value) ret = 0; + } + } + else if (!strcmp(this_char,"gid")) { + if (!value || !*value) ret= 0; + else { + opts->fs_gid = simple_strtoul(value,&value,0); + if (*value) ret = 0; + } + } + else if (!strcmp(this_char,"umask")) { + if (!value || !*value) ret = 0; + else { + opts->fs_umask = simple_strtoul(value,&value,8); + if (*value) ret = 0; + } + } + else if (!strcmp(this_char,"debug")) { + if (value) ret = 0; + else *debug = 1; + } + else if (!strcmp(this_char,"fat")) { + if (!value || !*value) ret = 0; + else { + *fat = simple_strtoul(value,&value,0); + if (*value || (*fat != 12 && *fat != 16 && + *fat != 32)) + ret = 0; + } + } + else if (!strcmp(this_char,"quiet")) { + if (value) ret = 0; + else opts->quiet = 1; + } + else if (!strcmp(this_char,"blocksize")) { + printk("FAT: blocksize option is obsolete, " + "not supported now\n"); + } + else if (!strcmp(this_char,"sys_immutable")) { + if (value) ret = 0; + else opts->sys_immutable = 1; + } + else if (!strcmp(this_char,"codepage") && value) { + opts->codepage = simple_strtoul(value,&value,0); + if (*value) ret = 0; + else printk ("MSDOS FS: Using codepage %d\n", + opts->codepage); + } + else if (!strcmp(this_char,"iocharset") && value) { + p = value; + while (*value && *value != ',') + value++; + len = value - p; + if (len) { + char *buffer; + + if (opts->iocharset != NULL) { + 
kfree(opts->iocharset);
+ opts->iocharset = NULL;
+ }
+ buffer = kmalloc(len + 1, GFP_KERNEL);
+ if (buffer != NULL) {
+ opts->iocharset = buffer;
+ memcpy(buffer, p, len);
+ buffer[len] = 0;
+ printk("MSDOS FS: IO charset %s\n", buffer);
+ } else
+ ret = 0;
+ }
+ }
+ else if (!strcmp(this_char,"cvf_format")) {
+ if (!value)
+ return 0;
+ strncpy(cvf_format,value,20);
+ }
+ else if (!strcmp(this_char,"cvf_options")) {
+ if (!value)
+ return 0;
+ strncpy(cvf_options,value,100);
+ }
+
+ if (this_char != options) *(this_char-1) = ',';
+ if (value) *savep = save;
+ if (ret == 0)
+ break;
+ }
+out:
+ return ret;
+}
+
+static void fat_read_root(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ int nr;
+
+ INIT_LIST_HEAD(&MSDOS_I(inode)->i_fat_hash);
+ MSDOS_I(inode)->i_location = 0;
+ MSDOS_I(inode)->i_fat_inode = inode;
+ inode->i_uid = sbi->options.fs_uid;
+ inode->i_gid = sbi->options.fs_gid;
+ inode->i_version = ++event;
+ inode->i_generation = 0;
+ inode->i_mode = (S_IRWXUGO & ~sbi->options.fs_umask) | S_IFDIR;
+ inode->i_op = sbi->dir_ops;
+ inode->i_fop = &fat_dir_operations;
+ if (sbi->fat_bits == 32) {
+ MSDOS_I(inode)->i_start = sbi->root_cluster;
+ if ((nr = MSDOS_I(inode)->i_start) != 0) {
+ while (nr != -1) {
+ inode->i_size += 1 << sbi->cluster_bits;
+ if (!(nr = fat_access(sb, nr, -1))) {
+ printk("Directory %ld: bad FAT\n",
+ inode->i_ino);
+ break;
+ }
+ }
+ }
+ } else {
+ MSDOS_I(inode)->i_start = 0;
+ inode->i_size = sbi->dir_entries * sizeof(struct msdos_dir_entry);
+ }
+ inode->i_blksize = 1 << sbi->cluster_bits;
+ inode->i_blocks = ((inode->i_size + inode->i_blksize - 1)
+ & ~(inode->i_blksize - 1)) / 512;
+ MSDOS_I(inode)->i_logstart = 0;
+ MSDOS_I(inode)->mmu_private = inode->i_size;
+
+ MSDOS_I(inode)->i_attrs = 0;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = 0;
+ MSDOS_I(inode)->i_ctime_ms = 0;
+ inode->i_nlink = fat_subdirs(inode)+2;
+}
+
+/*
+ * a FAT file handle with fhtype 3 is
+ * 0/ i_ino - for fast, reliable lookup if still in the cache
+ * 1/ i_generation - to see if i_ino is still valid
+ * bit 0 == 0 iff directory
+ * 2/ i_location - if ino has changed, but still in cache
+ * 3/ i_logstart - to semi-verify inode found at i_location
+ * 4/ parent->i_logstart - maybe used to hunt for the file on disc
+ *
+ */
+struct dentry *fat_fh_to_dentry(struct super_block *sb, __u32 *fh,
+ int len, int fhtype, int parent)
+{
+ struct inode *inode = NULL;
+ struct list_head *lp;
+ struct dentry *result;
+
+ if (fhtype != 3)
+ return NULL;
+ if (len < 5)
+ return NULL;
+ if (parent)
+ return NULL; /* We cannot find the parent,
+ It better just *be* there */
+
+ inode = iget(sb, fh[0]);
+ if (!inode || is_bad_inode(inode) ||
+ inode->i_generation != fh[1]) {
+ if (inode) iput(inode);
+ inode = NULL;
+ }
+ if (!inode) {
+ /* try 2 - see if i_location is in F-d-c
+ * require i_logstart to be the same
+ * Will fail if you truncate and then re-write
+ */
+
+ inode = fat_iget(sb, fh[2]);
+ if (inode && MSDOS_I(inode)->i_logstart != fh[3]) {
+ iput(inode);
+ inode = NULL;
+ }
+ }
+ if (!inode) {
+ /* For now, do nothing
+ * What we could do is:
+ * follow the file starting at fh[4], and record
+ * the ".." entry, and the name of the fh[2] entry.
+ * Then follow the ".." file, finding the next step up.
+ * This way we build a path to the root of
+ * the tree. If this works, we lookup the path and so
+ * get this inode into the cache. 
+ * Finally try the fat_iget lookup again
+ * If that fails, then we are totally out of luck
+ * But all that is for another day
+ */
+ }
+ if (!inode)
+ return ERR_PTR(-ESTALE);
+
+
+ /* now to find a dentry.
+ * If possible, get a well-connected one
+ *
+ * Given the way that we found the inode, it *MUST* be
+ * well-connected, but it is easiest to just copy the
+ * code.
+ */
+ spin_lock(&dcache_lock);
+ for (lp = inode->i_dentry.next; lp != &inode->i_dentry ; lp=lp->next) {
+ result = list_entry(lp,struct dentry, d_alias);
+ if (! (result->d_flags & DCACHE_NFSD_DISCONNECTED)) {
+ dget_locked(result);
+ result->d_vfs_flags |= DCACHE_REFERENCED;
+ spin_unlock(&dcache_lock);
+ iput(inode);
+ return result;
+ }
+ }
+ spin_unlock(&dcache_lock);
+ result = d_alloc_root(inode);
+ if (result == NULL) {
+ iput(inode);
+ return ERR_PTR(-ENOMEM);
+ }
+ result->d_flags |= DCACHE_NFSD_DISCONNECTED;
+ return result;
+
+
+}
+
+int fat_dentry_to_fh(struct dentry *de, __u32 *fh, int *lenp, int needparent)
+{
+ int len = *lenp;
+ struct inode *inode = de->d_inode;
+
+ if (len < 5)
+ return 255; /* no room */
+ *lenp = 5;
+ fh[0] = inode->i_ino;
+ fh[1] = inode->i_generation;
+ fh[2] = MSDOS_I(inode)->i_location;
+ fh[3] = MSDOS_I(inode)->i_logstart;
+ fh[4] = MSDOS_I(de->d_parent->d_inode)->i_logstart;
+ return 3;
+}
+
+static struct super_operations fat_sops = {
+ write_inode: fat_write_inode,
+ delete_inode: fat_delete_inode,
+ put_super: fat_put_super,
+ statfs: fat_statfs,
+ clear_inode: fat_clear_inode,
+
+ read_inode: make_bad_inode,
+ fh_to_dentry: fat_fh_to_dentry,
+ dentry_to_fh: fat_dentry_to_fh,
+};
+
+/*
+ * Read the super block of an MS-DOS FS.
+ *
+ * Note that this may be called from vfat_read_super
+ * with some fields already initialized.
+ */
+struct super_block *
+fat_read_super(struct super_block *sb, void *data, int silent,
+ struct inode_operations *fs_dir_inode_ops)
+{
+ struct inode *root_inode;
+ struct buffer_head *bh;
+ struct fat_boot_sector *b;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
+ char *p;
+ int logical_sector_size, hard_blksize, fat_clusters = 0;
+ unsigned int total_sectors, rootdir_sectors;
+ int fat32, debug, error, fat, cp;
+ struct fat_mount_options opts;
+ char buf[50];
+ int i;
+ char cvf_format[21];
+ char cvf_options[101];
+
+ cvf_format[0] = '\0';
+ cvf_options[0] = '\0';
+ sbi->cvf_format = NULL;
+ sbi->private_data = NULL;
+
+ sbi->dir_ops = fs_dir_inode_ops;
+
+ sb->s_maxbytes = MAX_NON_LFS;
+ sb->s_op = &fat_sops;
+
+ hard_blksize = get_hardsect_size(sb->s_dev);
+ if (!hard_blksize)
+ hard_blksize = 512;
+
+ opts.isvfat = sbi->options.isvfat;
+ if (!parse_options((char *) data, &fat, &debug, &opts,
+ cvf_format, cvf_options))
+ goto out_fail;
+ /* N.B. we should parse directly into the sb structure */
+ memcpy(&(sbi->options), &opts, sizeof(struct fat_mount_options));
+
+ fat_cache_init();
+
+ sb->s_blocksize = hard_blksize;
+ set_blocksize(sb->s_dev, hard_blksize);
+ bh = bread(sb->s_dev, 0, sb->s_blocksize);
+ if (bh == NULL) {
+ printk("FAT: unable to read boot sector\n");
+ goto out_fail;
+ }
+
+/*
+ * The DOS3 partition size limit is *not* 32M as many people think.
+ * Instead, it is 64K sectors (with the usual sector size being
+ * 512 bytes, leading to a 32M limit).
+ *
+ * DOS 3 partition managers got around this problem by faking a
+ * larger sector size, i.e. treating multiple physical sectors as
+ * a single logical sector. 
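+ *
+ * (For instance, with a faked 2048-byte logical sector the 16-bit
+ * sector count covers 64K * 2048 bytes = 128M instead of 32M.)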
+ * + * We can accommodate this scheme by adjusting our cluster size, + * fat_start, and data_start by an appropriate value. + * + * (by Drew Eckhardt) + */ + + + b = (struct fat_boot_sector *) bh->b_data; + logical_sector_size = + CF_LE_W(get_unaligned((unsigned short *) &b->sector_size)); + if (!logical_sector_size + || (logical_sector_size & (logical_sector_size - 1))) { + printk("FAT: bogus logical sector size %d\n", + logical_sector_size); + brelse(bh); + goto out_invalid; + } + + sbi->cluster_size = b->cluster_size; + if (!sbi->cluster_size + || (sbi->cluster_size & (sbi->cluster_size - 1))) { + printk("FAT: bogus cluster size %d\n", sbi->cluster_size); + brelse(bh); + goto out_invalid; + } + + if (logical_sector_size < hard_blksize) { + printk("FAT: logical sector size too small for device" + " (logical sector size = %d)\n", logical_sector_size); + brelse(bh); + goto out_invalid; + } + + sbi->cluster_bits = ffs(logical_sector_size * sbi->cluster_size) - 1; + sbi->fats = b->fats; + sbi->fat_start = CF_LE_W(b->reserved); + if (!b->fat_length && b->fat32_length) { + struct fat_boot_fsinfo *fsinfo; + struct buffer_head *fsinfo_bh; + int fsinfo_block, fsinfo_offset; + + /* Must be FAT32 */ + fat32 = 1; + sbi->fat_length = CF_LE_L(b->fat32_length); + sbi->root_cluster = CF_LE_L(b->root_cluster); + + sbi->fsinfo_sector = CF_LE_W(b->info_sector); + /* MC - if info_sector is 0, don't multiply by 0 */ + if (sbi->fsinfo_sector == 0) + sbi->fsinfo_sector = 1; + + fsinfo_block = + (sbi->fsinfo_sector * logical_sector_size) / hard_blksize; + fsinfo_offset = + (sbi->fsinfo_sector * logical_sector_size) % hard_blksize; + fsinfo_bh = bh; + if (fsinfo_block != 0) { + fsinfo_bh = bread(sb->s_dev, fsinfo_block, hard_blksize); + if (fsinfo_bh == NULL) { + printk("FAT: bread failed, FSINFO block" + " (blocknr = %d)\n", fsinfo_block); + brelse(bh); + goto out_invalid; + } + } + fsinfo = (struct fat_boot_fsinfo *)&fsinfo_bh->b_data[fsinfo_offset]; + if (!IS_FSINFO(fsinfo)) { + printk("FAT: Did not find valid FSINFO signature.\n" + "Found signature1 0x%x signature2 0x%x sector=%ld.\n", + CF_LE_L(fsinfo->signature1), + CF_LE_L(fsinfo->signature2), + sbi->fsinfo_sector); + } else { + sbi->free_clusters = CF_LE_L(fsinfo->free_clusters); + } + + if (fsinfo_block != 0) + brelse(fsinfo_bh); + } else { + fat32 = 0; + sbi->fat_length = CF_LE_W(b->fat_length); + sbi->root_cluster = 0; + sbi->free_clusters = -1; /* Don't know yet */ + } + + sbi->dir_per_block = logical_sector_size / sizeof(struct msdos_dir_entry); + sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; + + sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; + sbi->dir_entries = + CF_LE_W(get_unaligned((unsigned short *)&b->dir_entries)); + rootdir_sectors = sbi->dir_entries + * sizeof(struct msdos_dir_entry) / logical_sector_size; + sbi->data_start = sbi->dir_start + rootdir_sectors; + total_sectors = CF_LE_W(get_unaligned((unsigned short *)&b->sectors)); + if (total_sectors == 0) + total_sectors = CF_LE_L(b->total_sect); + sbi->clusters = (total_sectors - sbi->data_start) / sbi->cluster_size; + + error = 0; + if (!error) { + sbi->fat_bits = fat32 ? 32 : + (fat ? fat : + (sbi->clusters > MSDOS_FAT12 ? 
16 : 12)); + fat_clusters = + sbi->fat_length * logical_sector_size * 8 / sbi->fat_bits; + error = !sbi->fats || (sbi->dir_entries & (sbi->dir_per_block - 1)) + || sbi->clusters + 2 > fat_clusters + MSDOS_MAX_EXTRA + || logical_sector_size < 512 + || PAGE_CACHE_SIZE < logical_sector_size + || !b->secs_track || !b->heads; + } + brelse(bh); + + if (error) + goto out_invalid; + + sb->s_blocksize = logical_sector_size; + sb->s_blocksize_bits = ffs(logical_sector_size) - 1; + set_blocksize(sb->s_dev, sb->s_blocksize); + sbi->cvf_format = &default_cvf; + if (!strcmp(cvf_format, "none")) + i = -1; + else + i = detect_cvf(sb,cvf_format); + if (i >= 0) + error = cvf_formats[i]->mount_cvf(sb, cvf_options); + if (error || debug) { + /* The MSDOS_CAN_BMAP is obsolete, but left just to remember */ + printk("[MS-DOS FS Rel. 12,FAT %d,check=%c,conv=%c," + "uid=%d,gid=%d,umask=%03o%s]\n", + sbi->fat_bits,opts.name_check, + opts.conversion,opts.fs_uid,opts.fs_gid,opts.fs_umask, + MSDOS_CAN_BMAP(sbi) ? ",bmap" : ""); + printk("[me=0x%x,cs=%d,#f=%d,fs=%d,fl=%ld,ds=%ld,de=%d,data=%ld," + "se=%u,ts=%u,ls=%d,rc=%ld,fc=%u]\n", + b->media, sbi->cluster_size, sbi->fats, + sbi->fat_start, sbi->fat_length, sbi->dir_start, + sbi->dir_entries, sbi->data_start, + CF_LE_W(get_unaligned((unsigned short *)&b->sectors)), + CF_LE_L(b->total_sect), logical_sector_size, + sbi->root_cluster, sbi->free_clusters); + printk ("hard sector size = %d\n", hard_blksize); + } + if (i < 0) + if (sbi->clusters + 2 > fat_clusters) + sbi->clusters = fat_clusters - 2; + if (error) + goto out_invalid; + + sb->s_magic = MSDOS_SUPER_MAGIC; + /* set up enough so that it can read an inode */ + init_MUTEX(&sbi->fat_lock); + sbi->prev_free = 0; + + cp = opts.codepage ? opts.codepage : 437; + sprintf(buf, "cp%d", cp); + sbi->nls_disk = load_nls(buf); + if (! sbi->nls_disk) { + /* Fail only if explicit charset specified */ + if (opts.codepage != 0) + goto out_fail; + sbi->options.codepage = 0; /* already 0?? */ + sbi->nls_disk = load_nls_default(); + } + + sbi->nls_io = NULL; + if (sbi->options.isvfat && !opts.utf8) { + p = opts.iocharset ? opts.iocharset : CONFIG_NLS_DEFAULT; + sbi->nls_io = load_nls(p); + if (! sbi->nls_io) + /* Fail only if explicit charset specified */ + if (opts.iocharset) + goto out_unload_nls; + } + if (! 
sbi->nls_io) + sbi->nls_io = load_nls_default(); + + root_inode = new_inode(sb); + if (!root_inode) + goto out_unload_nls; + root_inode->i_ino = MSDOS_ROOT_INO; + fat_read_root(root_inode); + insert_inode_hash(root_inode); + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto out_no_root; + if(i >= 0) { + sbi->cvf_format = cvf_formats[i]; + ++cvf_format_use_count[i]; + } + return sb; + +out_no_root: + printk("FAT: get root inode failed\n"); + iput(root_inode); + unload_nls(sbi->nls_io); +out_unload_nls: + unload_nls(sbi->nls_disk); + goto out_fail; +out_invalid: + if (!silent) { + printk("VFS: Can't find a valid FAT filesystem on dev %s.\n", + kdevname(sb->s_dev)); + } +out_fail: + if (opts.iocharset) { + printk("FAT: freeing iocharset=%s\n", opts.iocharset); + kfree(opts.iocharset); + } + if(sbi->private_data) + kfree(sbi->private_data); + sbi->private_data = NULL; + + return NULL; +} + +int fat_statfs(struct super_block *sb,struct statfs *buf) +{ + int free,nr; + + if (MSDOS_SB(sb)->cvf_format && + MSDOS_SB(sb)->cvf_format->cvf_statfs) + return MSDOS_SB(sb)->cvf_format->cvf_statfs(sb,buf, + sizeof(struct statfs)); + + lock_fat(sb); + if (MSDOS_SB(sb)->free_clusters != -1) + free = MSDOS_SB(sb)->free_clusters; + else { + free = 0; + for (nr = 2; nr < MSDOS_SB(sb)->clusters+2; nr++) + if (!fat_access(sb,nr,-1)) free++; + MSDOS_SB(sb)->free_clusters = free; + } + unlock_fat(sb); + buf->f_type = sb->s_magic; + buf->f_bsize = 1 << MSDOS_SB(sb)->cluster_bits; + buf->f_blocks = MSDOS_SB(sb)->clusters; + buf->f_bfree = free; + buf->f_bavail = free; + buf->f_namelen = MSDOS_SB(sb)->options.isvfat ? 260 : 12; + return 0; +} + +static int is_exec(char *extension) +{ + char *exe_extensions = "EXECOMBAT", *walk; + + for (walk = exe_extensions; *walk; walk += 3) + if (!strncmp(extension, walk, 3)) + return 1; + return 0; +} + +static int fat_writepage(struct page *page) +{ + return block_write_full_page(page,fat_get_block); +} +static int fat_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,fat_get_block); +} +static int fat_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) +{ + return cont_prepare_write(page,from,to,fat_get_block, + &MSDOS_I(page->mapping->host)->mmu_private); +} +static int _fat_bmap(struct address_space *mapping, long block) +{ + return generic_block_bmap(mapping,block,fat_get_block); +} +static struct address_space_operations fat_aops = { + readpage: fat_readpage, + writepage: fat_writepage, + sync_page: block_sync_page, + prepare_write: fat_prepare_write, + commit_write: generic_commit_write, + bmap: _fat_bmap +}; + +/* doesn't deal with root inode */ +static void fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int nr; + + INIT_LIST_HEAD(&MSDOS_I(inode)->i_fat_hash); + MSDOS_I(inode)->i_location = 0; + MSDOS_I(inode)->i_fat_inode = inode; + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + inode->i_version = ++event; + inode->i_generation = CURRENT_TIME; + + if ((de->attr & ATTR_DIR) && !IS_FREE(de->name)) { + inode->i_generation &= ~1; + inode->i_mode = MSDOS_MKMODE(de->attr,S_IRWXUGO & + ~sbi->options.fs_umask) | S_IFDIR; + inode->i_op = sbi->dir_ops; + inode->i_fop = &fat_dir_operations; + + MSDOS_I(inode)->i_start = CF_LE_W(de->start); + if (sbi->fat_bits == 32) { + MSDOS_I(inode)->i_start |= + (CF_LE_W(de->starthi) << 16); + } + MSDOS_I(inode)->i_logstart = 
MSDOS_I(inode)->i_start; + inode->i_nlink = fat_subdirs(inode); + /* includes .., compensating for "self" */ +#ifdef DEBUG + if (!inode->i_nlink) { + printk("directory %d: i_nlink == 0\n",inode->i_ino); + inode->i_nlink = 1; + } +#endif + if ((nr = MSDOS_I(inode)->i_start) != 0) + while (nr != -1) { + inode->i_size += 1 << sbi->cluster_bits; + if (!(nr = fat_access(sb, nr, -1))) { + printk("Directory %ld: bad FAT\n", + inode->i_ino); + break; + } + } + MSDOS_I(inode)->mmu_private = inode->i_size; + } else { /* not a directory */ + inode->i_generation |= 1; + inode->i_mode = MSDOS_MKMODE(de->attr, + ((sbi->options.showexec && + !is_exec(de->ext)) + ? S_IRUGO|S_IWUGO : S_IRWXUGO) + & ~sbi->options.fs_umask) | S_IFREG; + MSDOS_I(inode)->i_start = CF_LE_W(de->start); + if (sbi->fat_bits == 32) { + MSDOS_I(inode)->i_start |= + (CF_LE_W(de->starthi) << 16); + } + MSDOS_I(inode)->i_logstart = MSDOS_I(inode)->i_start; + inode->i_size = CF_LE_L(de->size); + inode->i_op = &fat_file_inode_operations; + inode->i_fop = &fat_file_operations; + inode->i_mapping->a_ops = &fat_aops; + MSDOS_I(inode)->mmu_private = inode->i_size; + } + if(de->attr & ATTR_SYS) + if (sbi->options.sys_immutable) + inode->i_flags |= S_IMMUTABLE_FILE; + MSDOS_I(inode)->i_attrs = de->attr & ATTR_UNUSED; + /* this is as close to the truth as we can get ... */ + inode->i_blksize = 1 << sbi->cluster_bits; + inode->i_blocks = ((inode->i_size + inode->i_blksize - 1) + & ~(inode->i_blksize - 1)) / 512; + inode->i_mtime = inode->i_atime = + date_dos2unix(CF_LE_W(de->time),CF_LE_W(de->date)); + inode->i_ctime = + MSDOS_SB(sb)->options.isvfat + ? date_dos2unix(CF_LE_W(de->ctime),CF_LE_W(de->cdate)) + : inode->i_mtime; + MSDOS_I(inode)->i_ctime_ms = de->ctime_ms; +} + +void fat_write_inode(struct inode *inode, int wait) +{ + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; + struct msdos_dir_entry *raw_entry; + unsigned int i_pos; + +retry: + i_pos = MSDOS_I(inode)->i_location; + if (inode->i_ino == MSDOS_ROOT_INO || !i_pos) { + return; + } + lock_kernel(); + if (!(bh = fat_bread(sb, i_pos >> MSDOS_SB(sb)->dir_per_block_bits))) { + printk("dev = %s, ino = %d\n", kdevname(inode->i_dev), i_pos); + fat_fs_panic(sb, "msdos_write_inode: unable to read i-node block"); + unlock_kernel(); + return; + } + spin_lock(&fat_inode_lock); + if (i_pos != MSDOS_I(inode)->i_location) { + spin_unlock(&fat_inode_lock); + fat_brelse(sb, bh); + unlock_kernel(); + goto retry; + } + + raw_entry = &((struct msdos_dir_entry *) (bh->b_data)) + [i_pos & (MSDOS_SB(sb)->dir_per_block - 1)]; + if (S_ISDIR(inode->i_mode)) { + raw_entry->attr = ATTR_DIR; + raw_entry->size = 0; + } + else { + raw_entry->attr = ATTR_NONE; + raw_entry->size = CT_LE_L(inode->i_size); + } + raw_entry->attr |= MSDOS_MKATTR(inode->i_mode) | + MSDOS_I(inode)->i_attrs; + raw_entry->start = CT_LE_W(MSDOS_I(inode)->i_logstart); + raw_entry->starthi = CT_LE_W(MSDOS_I(inode)->i_logstart >> 16); + fat_date_unix2dos(inode->i_mtime,&raw_entry->time,&raw_entry->date); + raw_entry->time = CT_LE_W(raw_entry->time); + raw_entry->date = CT_LE_W(raw_entry->date); + if (MSDOS_SB(sb)->options.isvfat) { + fat_date_unix2dos(inode->i_ctime,&raw_entry->ctime,&raw_entry->cdate); + raw_entry->ctime_ms = MSDOS_I(inode)->i_ctime_ms; + raw_entry->ctime = CT_LE_W(raw_entry->ctime); + raw_entry->cdate = CT_LE_W(raw_entry->cdate); + } + spin_unlock(&fat_inode_lock); + fat_mark_buffer_dirty(sb, bh); + fat_brelse(sb, bh); + unlock_kernel(); +} + + +int fat_notify_change(struct dentry * dentry, struct iattr * 
attr) +{ + struct super_block *sb = dentry->d_sb; + struct inode *inode = dentry->d_inode; + int error; + + /* FAT cannot truncate to a longer file */ + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size > inode->i_size) + return -EPERM; + } + + error = inode_change_ok(inode, attr); + if (error) + return MSDOS_SB(sb)->options.quiet ? 0 : error; + + if (((attr->ia_valid & ATTR_UID) && + (attr->ia_uid != MSDOS_SB(sb)->options.fs_uid)) || + ((attr->ia_valid & ATTR_GID) && + (attr->ia_gid != MSDOS_SB(sb)->options.fs_gid)) || + ((attr->ia_valid & ATTR_MODE) && + (attr->ia_mode & ~MSDOS_VALID_MODE))) + error = -EPERM; + + if (error) + return MSDOS_SB(sb)->options.quiet ? 0 : error; + + error = inode_setattr(inode, attr); + if (error) + return error; + + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_IXUGO; + + inode->i_mode = ((inode->i_mode & S_IFMT) | ((((inode->i_mode & S_IRWXU + & ~MSDOS_SB(sb)->options.fs_umask) | S_IRUSR) >> 6)*S_IXUGO)) & + ~MSDOS_SB(sb)->options.fs_umask; + return 0; +} +MODULE_LICENSE("GPL"); diff -urN linux-2.4.16-reiserfspatches-immutable/fs/hpfs/file.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/hpfs/file.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/hpfs/file.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/hpfs/file.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,136 @@ +/* + * linux/fs/hpfs/file.c + * + * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 + * + * file VFS functions + */ + +#include +#include +#include +#include "hpfs_fn.h" + +#define BLOCKS(size) (((size) + 511) >> 9) + +/* HUH? */ +int hpfs_open(struct inode *i, struct file *f) +{ + lock_kernel(); + hpfs_lock_inode(i); + hpfs_unlock_inode(i); /* make sure nobody is deleting the file */ + unlock_kernel(); + if (!i->i_nlink) return -ENOENT; + return 0; +} + +int hpfs_file_release(struct inode *inode, struct file *file) +{ + lock_kernel(); + hpfs_write_if_changed(inode); + unlock_kernel(); + return 0; +} + +int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + /*return file_fsync(file, dentry);*/ + return 0; /* Don't fsync :-) */ +} + +/* + * generic_file_read often calls bmap with non-existing sector, + * so we must ignore such errors. 
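+ * (E.g. reading the last page of a file: block_read_full_page()
+ * asks for blocks past the allocated tail, hpfs_bmap() returns 0,
+ * and the unmapped buffers are simply zero-filled.)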
+ */ + +secno hpfs_bmap(struct inode *inode, unsigned file_secno) +{ + unsigned n, disk_secno; + struct fnode *fnode; + struct buffer_head *bh; + if (BLOCKS(inode->u.hpfs_i.mmu_private) <= file_secno) return 0; + n = file_secno - inode->i_hpfs_file_sec; + if (n < inode->i_hpfs_n_secs) return inode->i_hpfs_disk_sec + n; + if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; + disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); + if (disk_secno == -1) return 0; + if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; + return disk_secno; +} + +void hpfs_truncate(struct inode *i) +{ + if (IS_IMMUTABLE_FILE(i)) return /*-EPERM*/; + i->i_hpfs_n_secs = 0; + i->i_blocks = 1 + ((i->i_size + 511) >> 9); + i->u.hpfs_i.mmu_private = i->i_size; + hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); + hpfs_write_inode(i); +} + +int hpfs_get_block(struct inode *inode, long iblock, struct buffer_head *bh_result, int create) +{ + secno s; + s = hpfs_bmap(inode, iblock); + if (s) { + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = s; + bh_result->b_state |= (1UL << BH_Mapped); + return 0; + } + if (!create) return 0; + if (iblock<<9 != inode->u.hpfs_i.mmu_private) { + BUG(); + return -EIO; + } + if ((s = hpfs_add_sector_to_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1)) == -1) { + hpfs_truncate_btree(inode->i_sb, inode->i_ino, 1, inode->i_blocks - 1); + return -ENOSPC; + } + inode->i_blocks++; + inode->u.hpfs_i.mmu_private += 512; + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = s; + bh_result->b_state |= (1UL << BH_Mapped) | (1UL << BH_New); + return 0; +} + +static int hpfs_writepage(struct page *page) +{ + return block_write_full_page(page,hpfs_get_block); +} +static int hpfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page,hpfs_get_block); +} +static int hpfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) +{ + return cont_prepare_write(page,from,to,hpfs_get_block, + &page->mapping->host->u.hpfs_i.mmu_private); +} +static int _hpfs_bmap(struct address_space *mapping, long block) +{ + return generic_block_bmap(mapping,block,hpfs_get_block); +} +struct address_space_operations hpfs_aops = { + readpage: hpfs_readpage, + writepage: hpfs_writepage, + sync_page: block_sync_page, + prepare_write: hpfs_prepare_write, + commit_write: generic_commit_write, + bmap: _hpfs_bmap +}; + +ssize_t hpfs_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) +{ + ssize_t retval; + + retval = generic_file_write(file, buf, count, ppos); + if (retval > 0) { + struct inode *inode = file->f_dentry->d_inode; + inode->i_mtime = CURRENT_TIME; + inode->i_hpfs_dirty = 1; + } + return retval; +} + diff -urN linux-2.4.16-reiserfspatches-immutable/fs/namei.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/namei.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/namei.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/namei.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,2033 @@ +/* + * linux/fs/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * Some corrections by tytso. + */ + +/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname + * lookup logic. + */ +/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
+
+/* [Feb-1997 T. Schoebel-Theuer]
+ * Fundamental changes in the pathname lookup mechanisms (namei)
+ * were necessary because of omirr. The reason is that omirr needs
+ * to know the _real_ pathname, not the user-supplied one, in case
+ * of symlinks (and also when transname replacements occur).
+ *
+ * The new code replaces the old recursive symlink resolution with
+ * an iterative one (in case of non-nested symlink chains). It does
+ * this with calls to _follow_link().
+ * As a side effect, dir_namei(), _namei() and follow_link() are now
+ * replaced with a single function lookup_dentry() that can handle all
+ * the special cases of the former code.
+ *
+ * With the new dcache, the pathname is stored at each inode, at least as
+ * long as the refcount of the inode is positive. As a side effect, the
+ * size of the dcache depends on the inode cache and thus is dynamic.
+ *
+ * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
+ * resolution to correspond with current state of the code.
+ *
+ * Note that the symlink resolution is not *completely* iterative.
+ * There is still a significant amount of tail- and mid- recursion in
+ * the algorithm. Also, note that _readlink() is not used in
+ * lookup_dentry(): lookup_dentry() on the result of _readlink()
+ * may return different results than _follow_link(). Many virtual
+ * filesystems (including /proc) exhibit this behavior.
+ */
+
+/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
+ * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
+ * and the name already exists in form of a symlink, try to create the new
+ * name indicated by the symlink. The old code always complained that the
+ * name already exists, due to not following the symlink even if its target
+ * is nonexistent. The new semantics also affects mknod() and link() when
+ * the name is a symlink pointing to a non-existent name.
+ *
+ * I don't know which semantics is the right one, since I have no access
+ * to standards. But I found by trial that HP-UX 9.0 has the full "new"
+ * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
+ * "old" one. Personally, I think the new semantics is much more logical.
+ * Note that "ln old new" where "new" is a symlink pointing to a non-existing
+ * file does succeed in both HP-UX and SunOS, but not in Solaris
+ * and in the old Linux semantics.
+ */
+
+/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
+ * semantics. See the comments in "open_namei" and "do_link" below.
+ *
+ * [10-Sep-98 Alan Modra] Another symlink change.
+ */
+
+/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
+ * inside the path - always follow.
+ * in the last component in creation/removal/renaming - never follow.
+ * if LOOKUP_FOLLOW passed - follow.
+ * if the pathname has trailing slashes - follow.
+ * otherwise - don't follow.
+ * (applied in that order).
+ *
+ * [Jun 2000 AV] Inconsistent behaviour of open() in case flags==O_CREAT
+ * restored for 2.4. This is the last surviving part of an old 4.2BSD bug.
+ * During 2.4 we need to fix the userland stuff depending on it -
+ * hopefully we will be able to get rid of that wart in 2.5. So far only
+ * XEmacs seems to be relying on it... 
+ */ + +/* In order to reduce some races, while at the same time doing additional + * checking and hopefully speeding things up, we copy filenames to the + * kernel data space before using them.. + * + * POSIX.1 2.4: an empty pathname is invalid (ENOENT). + */ +static inline int do_getname(const char *filename, char *page) +{ + int retval; + unsigned long len = PATH_MAX + 1; + + if ((unsigned long) filename >= TASK_SIZE) { + if (!segment_eq(get_fs(), KERNEL_DS)) + return -EFAULT; + } else if (TASK_SIZE - (unsigned long) filename < PATH_MAX + 1) + len = TASK_SIZE - (unsigned long) filename; + + retval = strncpy_from_user((char *)page, filename, len); + if (retval > 0) { + if (retval < len) + return 0; + return -ENAMETOOLONG; + } else if (!retval) + retval = -ENOENT; + return retval; +} + +char * getname(const char * filename) +{ + char *tmp, *result; + + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { + int retval = do_getname(filename, tmp); + + result = tmp; + if (retval < 0) { + putname(tmp); + result = ERR_PTR(retval); + } + } + return result; +} + +/* + * vfs_permission() + * + * is used to check for read/write/execute permissions on a file. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things.. + */ +int vfs_permission(struct inode * inode, int mask) +{ + umode_t mode = inode->i_mode; + + if (mask & MAY_WRITE) { + /* + * Nobody gets write access to a read-only fs. + */ + if (IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; + + /* + * Nobody gets write access to an immutable file. + */ + if (IS_IMMUTABLE_FILE(inode)) + return -EACCES; + } + + if (current->fsuid == inode->i_uid) + mode >>= 6; + else if (in_group_p(inode->i_gid)) + mode >>= 3; + + /* + * If the DACs are ok we don't need any capability check. + */ + if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) + return 0; + + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable if at least one exec bit is set. + */ + if ((mask & (MAY_READ|MAY_WRITE)) || (inode->i_mode & S_IXUGO)) + if (capable(CAP_DAC_OVERRIDE)) + return 0; + + /* + * Searching includes executable on directories, else just read. + */ + if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) + if (capable(CAP_DAC_READ_SEARCH)) + return 0; + + return -EACCES; +} + +int permission(struct inode * inode,int mask) +{ + if (inode->i_op && inode->i_op->permission) { + int retval; + lock_kernel(); + retval = inode->i_op->permission(inode, mask); + unlock_kernel(); + return retval; + } + return vfs_permission(inode, mask); +} + +/* + * get_write_access() gets write permission for a file. + * put_write_access() releases this write permission. + * This is used for regular files. + * We cannot support write (and maybe mmap read-write shared) accesses and + * MAP_DENYWRITE mmappings simultaneously. The i_writecount field of an inode + * can have the following values: + * 0: no writers, no VM_DENYWRITE mappings + * < 0: (-i_writecount) vm_area_structs with VM_DENYWRITE set exist + * > 0: (i_writecount) users are writing to the file. + * + * Normally we operate on that counter with atomic_{inc,dec} and it's safe + * except for the cases where we don't hold i_writecount yet. Then we need to + * use {get,deny}_write_access() - these functions check the sign and refuse + * to do the change if sign is wrong. 
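(E.g. mapping a binary with MAP_DENYWRITE, as exec does, fails with
+ * -ETXTBSY while somebody holds the file open for write, and
+ * get_write_access() fails the same way on a running executable.)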
Exclusion between them is provided by + * spinlock (arbitration_lock) and I'll rip the second arsehole to the first + * who will try to move it in struct inode - just leave it here. + */ +static spinlock_t arbitration_lock = SPIN_LOCK_UNLOCKED; +int get_write_access(struct inode * inode) +{ + spin_lock(&arbitration_lock); + if (atomic_read(&inode->i_writecount) < 0) { + spin_unlock(&arbitration_lock); + return -ETXTBSY; + } + atomic_inc(&inode->i_writecount); + spin_unlock(&arbitration_lock); + return 0; +} +int deny_write_access(struct file * file) +{ + spin_lock(&arbitration_lock); + if (atomic_read(&file->f_dentry->d_inode->i_writecount) > 0) { + spin_unlock(&arbitration_lock); + return -ETXTBSY; + } + atomic_dec(&file->f_dentry->d_inode->i_writecount); + spin_unlock(&arbitration_lock); + return 0; +} + +void path_release(struct nameidata *nd) +{ + dput(nd->dentry); + mntput(nd->mnt); +} + +/* + * Internal lookup() using the new generic dcache. + * SMP-safe + */ +static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags) +{ + struct dentry * dentry = d_lookup(parent, name); + + if (dentry && dentry->d_op && dentry->d_op->d_revalidate) { + if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) { + dput(dentry); + dentry = NULL; + } + } + return dentry; +} + +/* + * This is called when everything else fails, and we actually have + * to go to the low-level filesystem to find out what we should do.. + * + * We get the directory semaphore, and after getting that we also + * make sure that nobody added the entry to the dcache in the meantime.. + * SMP-safe + */ +static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags) +{ + struct dentry * result; + struct inode *dir = parent->d_inode; + + down(&dir->i_sem); + /* + * First re-do the cached lookup just in case it was created + * while we waited for the directory semaphore.. + * + * FIXME! This could use version numbering or similar to + * avoid unnecessary cache lookups. + */ + result = d_lookup(parent, name); + if (!result) { + struct dentry * dentry = d_alloc(parent, name); + result = ERR_PTR(-ENOMEM); + if (dentry) { + lock_kernel(); + result = dir->i_op->lookup(dir, dentry); + unlock_kernel(); + if (result) + dput(dentry); + else + result = dentry; + } + up(&dir->i_sem); + return result; + } + + /* + * Uhhuh! Nasty case: the cache was re-populated while + * we waited on the semaphore. Need to revalidate. + */ + up(&dir->i_sem); + if (result->d_op && result->d_op->d_revalidate) { + if (!result->d_op->d_revalidate(result, flags) && !d_invalidate(result)) { + dput(result); + result = ERR_PTR(-ENOENT); + } + } + return result; +} + +/* + * This limits recursive symlink follows to 8, while + * limiting consecutive symlinks to 40. + * + * Without that kind of total limit, nasty chains of consecutive + * symlinks can cause almost arbitrarily long lookups. 
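+ *
+ * (The nesting check below actually trips at 5 levels, not 8;
+ * consecutive links are capped by total_link_count at 40, after
+ * which the lookup fails with -ELOOP.)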
+ */
+static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ int err;
+ if (current->link_count >= 5)
+ goto loop;
+ if (current->total_link_count >= 40)
+ goto loop;
+ if (current->need_resched) {
+ current->state = TASK_RUNNING;
+ schedule();
+ }
+ current->link_count++;
+ current->total_link_count++;
+ UPDATE_ATIME(dentry->d_inode);
+ err = dentry->d_inode->i_op->follow_link(dentry, nd);
+ current->link_count--;
+ return err;
+loop:
+ path_release(nd);
+ return -ELOOP;
+}
+
+static inline int __follow_up(struct vfsmount **mnt, struct dentry **base)
+{
+ struct vfsmount *parent;
+ struct dentry *dentry;
+ spin_lock(&dcache_lock);
+ parent=(*mnt)->mnt_parent;
+ if (parent == *mnt) {
+ spin_unlock(&dcache_lock);
+ return 0;
+ }
+ mntget(parent);
+ dentry=dget((*mnt)->mnt_mountpoint);
+ spin_unlock(&dcache_lock);
+ dput(*base);
+ *base = dentry;
+ mntput(*mnt);
+ *mnt = parent;
+ return 1;
+}
+
+int follow_up(struct vfsmount **mnt, struct dentry **dentry)
+{
+ return __follow_up(mnt, dentry);
+}
+
+static inline int __follow_down(struct vfsmount **mnt, struct dentry **dentry)
+{
+ struct vfsmount *mounted;
+
+ spin_lock(&dcache_lock);
+ mounted = lookup_mnt(*mnt, *dentry);
+ if (mounted) {
+ *mnt = mntget(mounted);
+ spin_unlock(&dcache_lock);
+ dput(*dentry);
+ mntput(mounted->mnt_parent);
+ *dentry = dget(mounted->mnt_root);
+ return 1;
+ }
+ spin_unlock(&dcache_lock);
+ return 0;
+}
+
+int follow_down(struct vfsmount **mnt, struct dentry **dentry)
+{
+ return __follow_down(mnt,dentry);
+}
+
+static inline void follow_dotdot(struct nameidata *nd)
+{
+ while(1) {
+ struct vfsmount *parent;
+ struct dentry *dentry;
+ read_lock(&current->fs->lock);
+ if (nd->dentry == current->fs->root &&
+ nd->mnt == current->fs->rootmnt) {
+ read_unlock(&current->fs->lock);
+ break;
+ }
+ read_unlock(&current->fs->lock);
+ spin_lock(&dcache_lock);
+ if (nd->dentry != nd->mnt->mnt_root) {
+ dentry = dget(nd->dentry->d_parent);
+ spin_unlock(&dcache_lock);
+ dput(nd->dentry);
+ nd->dentry = dentry;
+ break;
+ }
+ parent=nd->mnt->mnt_parent;
+ if (parent == nd->mnt) {
+ spin_unlock(&dcache_lock);
+ break;
+ }
+ mntget(parent);
+ dentry=dget(nd->mnt->mnt_mountpoint);
+ spin_unlock(&dcache_lock);
+ dput(nd->dentry);
+ nd->dentry = dentry;
+ mntput(nd->mnt);
+ nd->mnt = parent;
+ }
+}
+
+/*
+ * Name resolution.
+ *
+ * This is the basic name resolution function, turning a pathname
+ * into the final dentry.
+ *
+ * We expect 'base' to be positive and a directory.
+ */
+int link_path_walk(const char * name, struct nameidata *nd)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+ int err;
+ unsigned int lookup_flags = nd->flags;
+
+ while (*name=='/')
+ name++;
+ if (!*name)
+ goto return_base;
+
+ inode = nd->dentry->d_inode;
+ if (current->link_count)
+ lookup_flags = LOOKUP_FOLLOW;
+
+ /* At this point we know we have a real path component. */
+ for(;;) {
+ unsigned long hash;
+ struct qstr this;
+ unsigned int c;
+
+ err = permission(inode, MAY_EXEC);
+ dentry = ERR_PTR(err);
+ if (err)
+ break;
+
+ this.name = name;
+ c = *(const unsigned char *)name;
+
+ hash = init_name_hash();
+ do {
+ name++;
+ hash = partial_name_hash(c, hash);
+ c = *(const unsigned char *)name;
+ } while (c && (c != '/'));
+ this.len = name - (const char *) this.name;
+ this.hash = end_name_hash(hash);
+
+ /* remove trailing slashes? */
+ if (!c)
+ goto last_component;
+ while (*++name == '/');
+ if (!*name)
+ goto last_with_slashes;
+
+ /*
+ * "." and ".." are special - ".." 
especially so because it has + * to be able to know about the current root directory and + * parent relationships. + */ + if (this.name[0] == '.') switch (this.len) { + default: + break; + case 2: + if (this.name[1] != '.') + break; + follow_dotdot(nd); + inode = nd->dentry->d_inode; + /* fallthrough */ + case 1: + continue; + } + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { + err = nd->dentry->d_op->d_hash(nd->dentry, &this); + if (err < 0) + break; + } + /* This does the actual lookups.. */ + dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE); + if (!dentry) { + dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; + } + /* Check mountpoints.. */ + while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) + ; + + err = -ENOENT; + inode = dentry->d_inode; + if (!inode) + goto out_dput; + err = -ENOTDIR; + if (!inode->i_op) + goto out_dput; + + if (inode->i_op->follow_link) { + err = do_follow_link(dentry, nd); + dput(dentry); + if (err) + goto return_err; + err = -ENOENT; + inode = nd->dentry->d_inode; + if (!inode) + break; + err = -ENOTDIR; + if (!inode->i_op) + break; + } else { + dput(nd->dentry); + nd->dentry = dentry; + } + err = -ENOTDIR; + if (!inode->i_op->lookup) + break; + continue; + /* here ends the main loop */ + +last_with_slashes: + lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; +last_component: + if (lookup_flags & LOOKUP_PARENT) + goto lookup_parent; + if (this.name[0] == '.') switch (this.len) { + default: + break; + case 2: + if (this.name[1] != '.') + break; + follow_dotdot(nd); + inode = nd->dentry->d_inode; + /* fallthrough */ + case 1: + goto return_base; + } + if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { + err = nd->dentry->d_op->d_hash(nd->dentry, &this); + if (err < 0) + break; + } + dentry = cached_lookup(nd->dentry, &this, 0); + if (!dentry) { + dentry = real_lookup(nd->dentry, &this, 0); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + break; + } + while (d_mountpoint(dentry) && __follow_down(&nd->mnt, &dentry)) + ; + inode = dentry->d_inode; + if ((lookup_flags & LOOKUP_FOLLOW) + && inode && inode->i_op && inode->i_op->follow_link) { + err = do_follow_link(dentry, nd); + dput(dentry); + if (err) + goto return_err; + inode = nd->dentry->d_inode; + } else { + dput(nd->dentry); + nd->dentry = dentry; + } + err = -ENOENT; + if (!inode) + goto no_inode; + if (lookup_flags & LOOKUP_DIRECTORY) { + err = -ENOTDIR; + if (!inode->i_op || !inode->i_op->lookup) + break; + } + goto return_base; +no_inode: + err = -ENOENT; + if (lookup_flags & (LOOKUP_POSITIVE|LOOKUP_DIRECTORY)) + break; + goto return_base; +lookup_parent: + nd->last = this; + nd->last_type = LAST_NORM; + if (this.name[0] != '.') + goto return_base; + if (this.len == 1) + nd->last_type = LAST_DOT; + else if (this.len == 2 && this.name[1] == '.') + nd->last_type = LAST_DOTDOT; +return_base: + return 0; +out_dput: + dput(dentry); + break; + } + path_release(nd); +return_err: + return err; +} + +int path_walk(const char * name, struct nameidata *nd) +{ + current->total_link_count = 0; + return link_path_walk(name, nd); +} + +/* SMP-safe */ +/* returns 1 if everything is done */ +static int __emul_lookup_dentry(const char *name, struct nameidata *nd) +{ + if (path_walk(name, nd)) + return 0; /* something went wrong... 
*/
+
+ if (!nd->dentry->d_inode || S_ISDIR(nd->dentry->d_inode->i_mode)) {
+ struct nameidata nd_root;
+ /*
+ * NAME was not found in alternate root or it's a directory. Try to find
+ * it in the normal root:
+ */
+ nd_root.last_type = LAST_ROOT;
+ nd_root.flags = nd->flags;
+ read_lock(&current->fs->lock);
+ nd_root.mnt = mntget(current->fs->rootmnt);
+ nd_root.dentry = dget(current->fs->root);
+ read_unlock(&current->fs->lock);
+ if (path_walk(name, &nd_root))
+ return 1;
+ if (nd_root.dentry->d_inode) {
+ path_release(nd);
+ nd->dentry = nd_root.dentry;
+ nd->mnt = nd_root.mnt;
+ nd->last = nd_root.last;
+ return 1;
+ }
+ path_release(&nd_root);
+ }
+ return 1;
+}
+
+void set_fs_altroot(void)
+{
+ char *emul = __emul_prefix();
+ struct nameidata nd;
+ struct vfsmount *mnt = NULL, *oldmnt;
+ struct dentry *dentry = NULL, *olddentry;
+ if (emul) {
+ read_lock(&current->fs->lock);
+ nd.mnt = mntget(current->fs->rootmnt);
+ nd.dentry = dget(current->fs->root);
+ read_unlock(&current->fs->lock);
+ nd.flags = LOOKUP_FOLLOW|LOOKUP_DIRECTORY|LOOKUP_POSITIVE;
+ if (path_walk(emul,&nd) == 0) {
+ mnt = nd.mnt;
+ dentry = nd.dentry;
+ }
+ }
+ write_lock(&current->fs->lock);
+ oldmnt = current->fs->altrootmnt;
+ olddentry = current->fs->altroot;
+ current->fs->altrootmnt = mnt;
+ current->fs->altroot = dentry;
+ write_unlock(&current->fs->lock);
+ if (olddentry) {
+ dput(olddentry);
+ mntput(oldmnt);
+ }
+}
+
+/* SMP-safe */
+static inline int
+walk_init_root(const char *name, struct nameidata *nd)
+{
+ read_lock(&current->fs->lock);
+ if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) {
+ nd->mnt = mntget(current->fs->altrootmnt);
+ nd->dentry = dget(current->fs->altroot);
+ read_unlock(&current->fs->lock);
+ if (__emul_lookup_dentry(name,nd))
+ return 0;
+ read_lock(&current->fs->lock);
+ }
+ nd->mnt = mntget(current->fs->rootmnt);
+ nd->dentry = dget(current->fs->root);
+ read_unlock(&current->fs->lock);
+ return 1;
+}
+
+/* SMP-safe */
+int path_init(const char *name, unsigned int flags, struct nameidata *nd)
+{
+ nd->last_type = LAST_ROOT; /* if there are only slashes... */
+ nd->flags = flags;
+ if (*name=='/')
+ return walk_init_root(name,nd);
+ read_lock(&current->fs->lock);
+ nd->mnt = mntget(current->fs->pwdmnt);
+ nd->dentry = dget(current->fs->pwd);
+ read_unlock(&current->fs->lock);
+ return 1;
+}
+
+/*
+ * Restricted form of lookup. Doesn't follow links, single-component only,
+ * needs parent already locked. Doesn't follow mounts.
+ * SMP-safe.
+ */
+struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
+{
+ struct dentry * dentry;
+ struct inode *inode;
+ int err;
+
+ inode = base->d_inode;
+ err = permission(inode, MAY_EXEC);
+ dentry = ERR_PTR(err);
+ if (err)
+ goto out;
+
+ /*
+ * See if the low-level filesystem might want
+ * to use its own hash.. 
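+ * (vfat, for instance, supplies a d_hash that folds case, so
+ * "FOO" and "foo" end up in the same dcache chain.)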
+ */
+ if (base->d_op && base->d_op->d_hash) {
+ err = base->d_op->d_hash(base, name);
+ dentry = ERR_PTR(err);
+ if (err < 0)
+ goto out;
+ }
+
+ dentry = cached_lookup(base, name, 0);
+ if (!dentry) {
+ struct dentry *new = d_alloc(base, name);
+ dentry = ERR_PTR(-ENOMEM);
+ if (!new)
+ goto out;
+ lock_kernel();
+ dentry = inode->i_op->lookup(inode, new);
+ unlock_kernel();
+ if (!dentry)
+ dentry = new;
+ else
+ dput(new);
+ }
+out:
+ return dentry;
+}
+
+/* SMP-safe */
+struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
+{
+ unsigned long hash;
+ struct qstr this;
+ unsigned int c;
+
+ this.name = name;
+ this.len = len;
+ if (!len)
+ goto access;
+
+ hash = init_name_hash();
+ while (len--) {
+ c = *(const unsigned char *)name++;
+ if (c == '/' || c == '\0')
+ goto access;
+ hash = partial_name_hash(c, hash);
+ }
+ this.hash = end_name_hash(hash);
+
+ return lookup_hash(&this, base);
+access:
+ return ERR_PTR(-EACCES);
+}
+
+/*
+ * namei()
+ *
+ * is used by most simple commands to get the inode of a specified name.
+ * Open, link etc use their own routines, but this is enough for things
+ * like 'chmod' etc.
+ *
+ * namei exists in two versions: namei/lnamei. The only difference is
+ * that namei follows links, while lnamei does not.
+ * SMP-safe
+ */
+int __user_walk(const char *name, unsigned flags, struct nameidata *nd)
+{
+ char *tmp;
+ int err;
+
+ tmp = getname(name);
+ err = PTR_ERR(tmp);
+ if (!IS_ERR(tmp)) {
+ err = 0;
+ if (path_init(tmp, flags, nd))
+ err = path_walk(tmp, nd);
+ putname(tmp);
+ }
+ return err;
+}
+
+/*
+ * It's inline, so penalty for filesystems that don't use sticky bit is
+ * minimal.
+ */
+static inline int check_sticky(struct inode *dir, struct inode *inode)
+{
+ if (!(dir->i_mode & S_ISVTX))
+ return 0;
+ if (inode->i_uid == current->fsuid)
+ return 0;
+ if (dir->i_uid == current->fsuid)
+ return 0;
+ return !capable(CAP_FOWNER);
+}
+
+/*
+ * Check whether we can remove a link victim from directory dir, check
+ * whether the type of victim is right.
+ * 1. We can't do it if dir is read-only (done in permission())
+ * 2. We should have write and exec permissions on dir
+ * 3. We can't remove anything from append-only dir
+ * 4. We can't do anything with immutable dir (done in permission())
+ * 5. If the sticky bit on dir is set we should either
+ * a. be owner of dir, or
+ * b. be owner of victim, or
+ * c. have CAP_FOWNER capability
+ * 6. If the victim is append-only or immutable we can't do anything with
+ * links pointing to it.
+ * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 9. We can't remove a root or mountpoint.
+ */
+static inline int may_delete(struct inode *dir,struct dentry *victim, int isdir)
+{
+ int error;
+ if (!victim->d_inode || victim->d_parent->d_inode != dir)
+ return -ENOENT;
+ error = permission(dir,MAY_WRITE | MAY_EXEC);
+ if (error)
+ return error;
+ if (IS_APPEND(dir))
+ return -EPERM;
+ if (check_sticky(dir, victim->d_inode)||IS_APPEND(victim->d_inode)||IS_IMMUTABLE_LINK(victim->d_inode))
+ return -EPERM;
+ if (isdir) {
+ if (!S_ISDIR(victim->d_inode->i_mode))
+ return -ENOTDIR;
+ if (IS_ROOT(victim))
+ return -EBUSY;
+ } else if (S_ISDIR(victim->d_inode->i_mode))
+ return -EISDIR;
+ return 0;
+}
+
+/* Check whether we can create an object with dentry child in directory
+ * dir.
+ * 1. 
We can't do it if child already exists (open has special treatment for + * this case, but since we are inlined it's OK) + * 2. We can't do it if dir is read-only (done in permission()) + * 3. We should have write and exec permissions on dir + * 4. We can't do it if dir is immutable (done in permission()) + */ +static inline int may_create(struct inode *dir, struct dentry *child) { + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return permission(dir,MAY_WRITE | MAY_EXEC); +} + +/* + * Special case: O_CREAT|O_EXCL implies O_NOFOLLOW for security + * reasons. + * + * O_DIRECTORY translates into forcing a directory lookup. + */ +static inline int lookup_flags(unsigned int f) +{ + unsigned long retval = LOOKUP_FOLLOW; + + if (f & O_NOFOLLOW) + retval &= ~LOOKUP_FOLLOW; + + if ((f & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) + retval &= ~LOOKUP_FOLLOW; + + if (f & O_DIRECTORY) + retval |= LOOKUP_DIRECTORY; + + return retval; +} + +int vfs_create(struct inode *dir, struct dentry *dentry, int mode) +{ + int error; + + mode &= S_IALLUGO; + mode |= S_IFREG; + + down(&dir->i_zombie); + error = may_create(dir, dentry); + if (error) + goto exit_lock; + + error = -EACCES; /* shouldn't it be ENOSYS? */ + if (!dir->i_op || !dir->i_op->create) + goto exit_lock; + + DQUOT_INIT(dir); + lock_kernel(); + error = dir->i_op->create(dir, dentry, mode); + unlock_kernel(); +exit_lock: + up(&dir->i_zombie); + if (!error) + inode_dir_notify(dir, DN_CREATE); + return error; +} + +/* + * open_namei() + * + * namei for open - this is in fact almost the whole open-routine. + * + * Note that the low bits of "flag" aren't the same as in the open + * system call - they are 00 - no permissions needed + * 01 - read permission needed + * 10 - write permission needed + * 11 - read/write permissions needed + * which is a lot more logical, and also allows the "no perm" needed + * for symlinks (where the permissions are checked later). + * SMP-safe + */ +int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd) +{ + int acc_mode, error = 0; + struct inode *inode; + struct dentry *dentry; + struct dentry *dir; + int count = 0; + + acc_mode = ACC_MODE(flag); + + /* + * The simplest case - just a plain lookup. + */ + if (!(flag & O_CREAT)) { + if (path_init(pathname, lookup_flags(flag), nd)) + error = path_walk(pathname, nd); + if (error) + return error; + dentry = nd->dentry; + goto ok; + } + + /* + * Create - we need to know the parent. + */ + if (path_init(pathname, LOOKUP_PARENT, nd)) + error = path_walk(pathname, nd); + if (error) + return error; + + /* + * We have the parent and last component. First of all, check + * that we are not asked to creat(2) an obvious directory - that + * will not do. + */ + error = -EISDIR; + if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len]) + goto exit; + + dir = nd->dentry; + down(&dir->d_inode->i_sem); + dentry = lookup_hash(&nd->last, nd->dentry); + +do_last: + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) { + up(&dir->d_inode->i_sem); + goto exit; + } + + /* Negative dentry, just create the file */ + if (!dentry->d_inode) { + error = vfs_create(dir->d_inode, dentry, + mode & ~current->fs->umask); + up(&dir->d_inode->i_sem); + dput(nd->dentry); + nd->dentry = dentry; + if (error) + goto exit; + /* Don't check for write permission, don't truncate */ + acc_mode = 0; + flag &= ~O_TRUNC; + goto ok; + } + + /* + * It already exists. 
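+ * With O_CREAT|O_EXCL this becomes the -EEXIST case just below;
+ * otherwise fall through and open the existing object.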
+ */ + up(&dir->d_inode->i_sem); + + error = -EEXIST; + if (flag & O_EXCL) + goto exit_dput; + + if (d_mountpoint(dentry)) { + error = -ELOOP; + if (flag & O_NOFOLLOW) + goto exit_dput; + while (__follow_down(&nd->mnt,&dentry) && d_mountpoint(dentry)); + } + error = -ENOENT; + if (!dentry->d_inode) + goto exit_dput; + if (dentry->d_inode->i_op && dentry->d_inode->i_op->follow_link) + goto do_link; + + dput(nd->dentry); + nd->dentry = dentry; + error = -EISDIR; + if (dentry->d_inode && S_ISDIR(dentry->d_inode->i_mode)) + goto exit; +ok: + error = -ENOENT; + inode = dentry->d_inode; + if (!inode) + goto exit; + + error = -ELOOP; + if (S_ISLNK(inode->i_mode)) + goto exit; + + error = -EISDIR; + if (S_ISDIR(inode->i_mode) && (flag & FMODE_WRITE)) + goto exit; + + error = permission(inode,acc_mode); + if (error) + goto exit; + + /* + * FIFO's, sockets and device files are special: they don't + * actually live on the filesystem itself, and as such you + * can write to them even if the filesystem is read-only. + */ + if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + flag &= ~O_TRUNC; + } else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) { + error = -EACCES; + if (nd->mnt->mnt_flags & MNT_NODEV) + goto exit; + + flag &= ~O_TRUNC; + } else { + error = -EROFS; + if (IS_RDONLY(inode) && (flag & 2)) + goto exit; + } + /* + * An append-only file must be opened in append mode for writing. + */ + error = -EPERM; + if (IS_APPEND(inode)) { + if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) + goto exit; + if (flag & O_TRUNC) + goto exit; + } + + /* + * Ensure there are no outstanding leases on the file. + */ + error = get_lease(inode, flag); + if (error) + goto exit; + + if (flag & O_TRUNC) { + error = get_write_access(inode); + if (error) + goto exit; + + /* + * Refuse to truncate files with mandatory locks held on them. + */ + error = locks_verify_locked(inode); + if (!error) { + DQUOT_INIT(inode); + + error = do_truncate(dentry, 0); + } + put_write_access(inode); + if (error) + goto exit; + } else + if (flag & FMODE_WRITE) + DQUOT_INIT(inode); + + return 0; + +exit_dput: + dput(dentry); +exit: + path_release(nd); + return error; + +do_link: + error = -ELOOP; + if (flag & O_NOFOLLOW) + goto exit_dput; + /* + * This is subtle. Instead of calling do_follow_link() we do the + * thing by hands. The reason is that this way we have zero link_count + * and path_walk() (called from ->follow_link) honoring LOOKUP_PARENT. + * After that we have the parent and last component, i.e. + * we are in the same situation as after the first path_walk(). + * Well, almost - if the last component is normal we get its copy + * stored in nd->last.name and we will have to putname() it when we + * are done. Procfs-like symlinks just set LAST_BIND. 
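+ * (LAST_BIND means ->follow_link bound nd->dentry itself, the way
+ * proc's symlinks do, so there is no pathname copy to free.)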
+ */ + UPDATE_ATIME(dentry->d_inode); + error = dentry->d_inode->i_op->follow_link(dentry, nd); + dput(dentry); + if (error) + return error; + if (nd->last_type == LAST_BIND) { + dentry = nd->dentry; + goto ok; + } + error = -EISDIR; + if (nd->last_type != LAST_NORM) + goto exit; + if (nd->last.name[nd->last.len]) { + putname(nd->last.name); + goto exit; + } + error = -ELOOP; + if (count++==32) { + putname(nd->last.name); + goto exit; + } + dir = nd->dentry; + down(&dir->d_inode->i_sem); + dentry = lookup_hash(&nd->last, nd->dentry); + putname(nd->last.name); + goto do_last; +} + +/* SMP-safe */ +static struct dentry *lookup_create(struct nameidata *nd, int is_dir) +{ + struct dentry *dentry; + + down(&nd->dentry->d_inode->i_sem); + dentry = ERR_PTR(-EEXIST); + if (nd->last_type != LAST_NORM) + goto fail; + dentry = lookup_hash(&nd->last, nd->dentry); + if (IS_ERR(dentry)) + goto fail; + if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode) + goto enoent; + return dentry; +enoent: + dput(dentry); + dentry = ERR_PTR(-ENOENT); +fail: + return dentry; +} + +int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) +{ + int error = -EPERM; + + down(&dir->i_zombie); + if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) + goto exit_lock; + + error = may_create(dir, dentry); + if (error) + goto exit_lock; + + error = -EPERM; + if (!dir->i_op || !dir->i_op->mknod) + goto exit_lock; + + DQUOT_INIT(dir); + lock_kernel(); + error = dir->i_op->mknod(dir, dentry, mode, dev); + unlock_kernel(); +exit_lock: + up(&dir->i_zombie); + if (!error) + inode_dir_notify(dir, DN_CREATE); + return error; +} + +asmlinkage long sys_mknod(const char * filename, int mode, dev_t dev) +{ + int error = 0; + char * tmp; + struct dentry * dentry; + struct nameidata nd; + + if (S_ISDIR(mode)) + return -EPERM; + tmp = getname(filename); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + if (path_init(tmp, LOOKUP_PARENT, &nd)) + error = path_walk(tmp, &nd); + if (error) + goto out; + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + + mode &= ~current->fs->umask; + if (!IS_ERR(dentry)) { + switch (mode & S_IFMT) { + case 0: case S_IFREG: + error = vfs_create(nd.dentry->d_inode,dentry,mode); + break; + case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: + error = vfs_mknod(nd.dentry->d_inode,dentry,mode,dev); + break; + case S_IFDIR: + error = -EPERM; + break; + default: + error = -EINVAL; + } + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); + path_release(&nd); +out: + putname(tmp); + + return error; +} + +int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + int error; + + down(&dir->i_zombie); + error = may_create(dir, dentry); + if (error) + goto exit_lock; + + error = -EPERM; + if (!dir->i_op || !dir->i_op->mkdir) + goto exit_lock; + + DQUOT_INIT(dir); + mode &= (S_IRWXUGO|S_ISVTX); + lock_kernel(); + error = dir->i_op->mkdir(dir, dentry, mode); + unlock_kernel(); + +exit_lock: + up(&dir->i_zombie); + if (!error) + inode_dir_notify(dir, DN_CREATE); + return error; +} + +asmlinkage long sys_mkdir(const char * pathname, int mode) +{ + int error = 0; + char * tmp; + + tmp = getname(pathname); + error = PTR_ERR(tmp); + if (!IS_ERR(tmp)) { + struct dentry *dentry; + struct nameidata nd; + + if (path_init(tmp, LOOKUP_PARENT, &nd)) + error = path_walk(tmp, &nd); + if (error) + goto out; + dentry = lookup_create(&nd, 1); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_mkdir(nd.dentry->d_inode, dentry, + mode & ~current->fs->umask); + 
dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); + path_release(&nd); +out: + putname(tmp); + } + + return error; +} + +/* + * We try to drop the dentry early: we should have + * a usage count of 2 if we're the only user of this + * dentry, and if that is true (possibly after pruning + * the dcache), then we drop the dentry now. + * + * A low-level filesystem can, if it choses, legally + * do a + * + * if (!d_unhashed(dentry)) + * return -EBUSY; + * + * if it cannot handle the case of removing a directory + * that is still in use by something else.. + */ +static void d_unhash(struct dentry *dentry) +{ + dget(dentry); + switch (atomic_read(&dentry->d_count)) { + default: + shrink_dcache_parent(dentry); + if (atomic_read(&dentry->d_count) != 2) + break; + case 2: + d_drop(dentry); + } +} + +int vfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + + error = may_delete(dir, dentry, 1); + if (error) + return error; + + if (!dir->i_op || !dir->i_op->rmdir) + return -EPERM; + + DQUOT_INIT(dir); + + double_down(&dir->i_zombie, &dentry->d_inode->i_zombie); + d_unhash(dentry); + if (IS_DEADDIR(dir)) + error = -ENOENT; + else if (d_mountpoint(dentry)) + error = -EBUSY; + else { + lock_kernel(); + error = dir->i_op->rmdir(dir, dentry); + unlock_kernel(); + if (!error) + dentry->d_inode->i_flags |= S_DEAD; + } + double_up(&dir->i_zombie, &dentry->d_inode->i_zombie); + if (!error) { + inode_dir_notify(dir, DN_DELETE); + d_delete(dentry); + } + dput(dentry); + + return error; +} + +asmlinkage long sys_rmdir(const char * pathname) +{ + int error = 0; + char * name; + struct dentry *dentry; + struct nameidata nd; + + name = getname(pathname); + if(IS_ERR(name)) + return PTR_ERR(name); + + if (path_init(name, LOOKUP_PARENT, &nd)) + error = path_walk(name, &nd); + if (error) + goto exit; + + switch(nd.last_type) { + case LAST_DOTDOT: + error = -ENOTEMPTY; + goto exit1; + case LAST_DOT: + error = -EINVAL; + goto exit1; + case LAST_ROOT: + error = -EBUSY; + goto exit1; + } + down(&nd.dentry->d_inode->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_rmdir(nd.dentry->d_inode, dentry); + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); +exit1: + path_release(&nd); +exit: + putname(name); + return error; +} + +int vfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int error; + + down(&dir->i_zombie); + error = may_delete(dir, dentry, 0); + if (!error) { + error = -EPERM; + if (dir->i_op && dir->i_op->unlink) { + DQUOT_INIT(dir); + if (d_mountpoint(dentry)) + error = -EBUSY; + else { + lock_kernel(); + error = dir->i_op->unlink(dir, dentry); + unlock_kernel(); + if (!error) + d_delete(dentry); + } + } + } + up(&dir->i_zombie); + if (!error) + inode_dir_notify(dir, DN_DELETE); + return error; +} + +asmlinkage long sys_unlink(const char * pathname) +{ + int error = 0; + char * name; + struct dentry *dentry; + struct nameidata nd; + + name = getname(pathname); + if(IS_ERR(name)) + return PTR_ERR(name); + + if (path_init(name, LOOKUP_PARENT, &nd)) + error = path_walk(name, &nd); + if (error) + goto exit; + error = -EISDIR; + if (nd.last_type != LAST_NORM) + goto exit1; + down(&nd.dentry->d_inode->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + /* Why not before? 
Because we want correct error value */ + if (nd.last.name[nd.last.len]) + goto slashes; + error = vfs_unlink(nd.dentry->d_inode, dentry); + exit2: + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); +exit1: + path_release(&nd); +exit: + putname(name); + + return error; + +slashes: + error = !dentry->d_inode ? -ENOENT : + S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR; + goto exit2; +} + +int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +{ + int error; + + down(&dir->i_zombie); + error = may_create(dir, dentry); + if (error) + goto exit_lock; + + error = -EPERM; + if (!dir->i_op || !dir->i_op->symlink) + goto exit_lock; + + DQUOT_INIT(dir); + lock_kernel(); + error = dir->i_op->symlink(dir, dentry, oldname); + unlock_kernel(); + +exit_lock: + up(&dir->i_zombie); + if (!error) + inode_dir_notify(dir, DN_CREATE); + return error; +} + +asmlinkage long sys_symlink(const char * oldname, const char * newname) +{ + int error = 0; + char * from; + char * to; + + from = getname(oldname); + if(IS_ERR(from)) + return PTR_ERR(from); + to = getname(newname); + error = PTR_ERR(to); + if (!IS_ERR(to)) { + struct dentry *dentry; + struct nameidata nd; + + if (path_init(to, LOOKUP_PARENT, &nd)) + error = path_walk(to, &nd); + if (error) + goto out; + dentry = lookup_create(&nd, 0); + error = PTR_ERR(dentry); + if (!IS_ERR(dentry)) { + error = vfs_symlink(nd.dentry->d_inode, dentry, from); + dput(dentry); + } + up(&nd.dentry->d_inode->i_sem); + path_release(&nd); +out: + putname(to); + } + putname(from); + return error; +} + +int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) +{ + struct inode *inode; + int error; + + down(&dir->i_zombie); + error = -ENOENT; + inode = old_dentry->d_inode; + if (!inode) + goto exit_lock; + + error = may_create(dir, new_dentry); + if (error) + goto exit_lock; + + error = -EXDEV; + if (dir->i_dev != inode->i_dev) + goto exit_lock; + + /* + * A link to an append-only or immutable file cannot be created. + */ + error = -EPERM; + if (IS_APPEND(inode) || IS_IMMUTABLE_LINK(inode)) + goto exit_lock; + if (!dir->i_op || !dir->i_op->link) + goto exit_lock; + + DQUOT_INIT(dir); + lock_kernel(); + error = dir->i_op->link(old_dentry, dir, new_dentry); + unlock_kernel(); + +exit_lock: + up(&dir->i_zombie); + if (!error) + inode_dir_notify(dir, DN_CREATE); + return error; +} + +/* + * Hardlinks are often used in delicate situations. We avoid + * security-related surprises by not following symlinks on the + * newname. --KAB + * + * We don't follow them on the oldname either to be compatible + * with linux 2.0, and to avoid hard-linking to directories + * and other special files. 
--ADM + */ +asmlinkage long sys_link(const char * oldname, const char * newname) +{ + int error; + char * from; + char * to; + + from = getname(oldname); + if(IS_ERR(from)) + return PTR_ERR(from); + to = getname(newname); + error = PTR_ERR(to); + if (!IS_ERR(to)) { + struct dentry *new_dentry; + struct nameidata nd, old_nd; + + error = 0; + if (path_init(from, LOOKUP_POSITIVE, &old_nd)) + error = path_walk(from, &old_nd); + if (error) + goto exit; + if (path_init(to, LOOKUP_PARENT, &nd)) + error = path_walk(to, &nd); + if (error) + goto out; + error = -EXDEV; + if (old_nd.mnt != nd.mnt) + goto out_release; + new_dentry = lookup_create(&nd, 0); + error = PTR_ERR(new_dentry); + if (!IS_ERR(new_dentry)) { + error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry); + dput(new_dentry); + } + up(&nd.dentry->d_inode->i_sem); +out_release: + path_release(&nd); +out: + path_release(&old_nd); +exit: + putname(to); + } + putname(from); + + return error; +} + +/* + * The worst of all namespace operations - renaming directory. "Perverted" + * doesn't even start to describe it. Somebody in UCB had a heck of a trip... + * Problems: + * a) we can get into loop creation. Check is done in is_subdir(). + * b) race potential - two innocent renames can create a loop together. + * That's where 4.4 screws up. Current fix: serialization on + * sb->s_vfs_rename_sem. We might be more accurate, but that's another + * story. + * c) we have to lock _three_ objects - parents and victim (if it exists). + * And that - after we got ->i_sem on parents (until then we don't know + * whether the target exists at all, let alone whether it is a directory + * or not). Solution: ->i_zombie. Taken only after ->i_sem. Always taken + * on link creation/removal of any kind. And taken (without ->i_sem) on + * directory that will be removed (both in rmdir() and here). + * d) some filesystems don't support opened-but-unlinked directories, + * either because of layout or because they are not ready to deal with + * all cases correctly. The latter will be fixed (taking this sort of + * stuff into VFS), but the former is not going away. Solution: the same + * trick as in rmdir(). + * e) conversion from fhandle to dentry may come in the wrong moment - when + * we are removing the target. Solution: we will have to grab ->i_zombie + * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on + * ->i_sem on parents, which works but leads to some truely excessive + * locking]. + */ +int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int error; + struct inode *target; + + if (old_dentry->d_inode == new_dentry->d_inode) + return 0; + + error = may_delete(old_dir, old_dentry, 1); + if (error) + return error; + + if (new_dir->i_dev != old_dir->i_dev) + return -EXDEV; + + if (!new_dentry->d_inode) + error = may_create(new_dir, new_dentry); + else + error = may_delete(new_dir, new_dentry, 1); + if (error) + return error; + + if (!old_dir->i_op || !old_dir->i_op->rename) + return -EPERM; + + /* + * If we are going to change the parent - check write permissions, + * we'll need to flip '..'. + */ + if (new_dir != old_dir) { + error = permission(old_dentry->d_inode, MAY_WRITE); + } + if (error) + return error; + + DQUOT_INIT(old_dir); + DQUOT_INIT(new_dir); + down(&old_dir->i_sb->s_vfs_rename_sem); + error = -EINVAL; + if (is_subdir(new_dentry, old_dentry)) + goto out_unlock; + /* Don't eat your daddy, dear... 
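+	 * (i.e. the case where the target is the source's own parent,
+	 *  as in rename("/a/b", "/a") -- hypothetical paths; the
+	 *  is_subdir() check above rejects the opposite direction,
+	 *  moving a directory down into its own subtree)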
*/ + /* This also avoids locking issues */ + if (old_dentry->d_parent == new_dentry) + goto out_unlock; + target = new_dentry->d_inode; + if (target) { /* Hastur! Hastur! Hastur! */ + triple_down(&old_dir->i_zombie, + &new_dir->i_zombie, + &target->i_zombie); + d_unhash(new_dentry); + } else + double_down(&old_dir->i_zombie, + &new_dir->i_zombie); + if (IS_DEADDIR(old_dir)||IS_DEADDIR(new_dir)) + error = -ENOENT; + else if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) + error = -EBUSY; + else + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + if (target) { + if (!error) + target->i_flags |= S_DEAD; + triple_up(&old_dir->i_zombie, + &new_dir->i_zombie, + &target->i_zombie); + if (d_unhashed(new_dentry)) + d_rehash(new_dentry); + dput(new_dentry); + } else + double_up(&old_dir->i_zombie, + &new_dir->i_zombie); + + if (!error) + d_move(old_dentry,new_dentry); +out_unlock: + up(&old_dir->i_sb->s_vfs_rename_sem); + return error; +} + +int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int error; + + if (old_dentry->d_inode == new_dentry->d_inode) + return 0; + + error = may_delete(old_dir, old_dentry, 0); + if (error) + return error; + + if (new_dir->i_dev != old_dir->i_dev) + return -EXDEV; + + if (!new_dentry->d_inode) + error = may_create(new_dir, new_dentry); + else + error = may_delete(new_dir, new_dentry, 0); + if (error) + return error; + + if (!old_dir->i_op || !old_dir->i_op->rename) + return -EPERM; + + DQUOT_INIT(old_dir); + DQUOT_INIT(new_dir); + double_down(&old_dir->i_zombie, &new_dir->i_zombie); + if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) + error = -EBUSY; + else + error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); + double_up(&old_dir->i_zombie, &new_dir->i_zombie); + if (error) + return error; + /* The following d_move() should become unconditional */ + if (!(old_dir->i_sb->s_type->fs_flags & FS_ODD_RENAME)) { + d_move(old_dentry, new_dentry); + } + return 0; +} + +int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + int error; + if (S_ISDIR(old_dentry->d_inode->i_mode)) + error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry); + else + error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry); + if (!error) { + if (old_dir == new_dir) + inode_dir_notify(old_dir, DN_RENAME); + else { + inode_dir_notify(old_dir, DN_DELETE); + inode_dir_notify(new_dir, DN_CREATE); + } + } + return error; +} + +static inline int do_rename(const char * oldname, const char * newname) +{ + int error = 0; + struct dentry * old_dir, * new_dir; + struct dentry * old_dentry, *new_dentry; + struct nameidata oldnd, newnd; + + if (path_init(oldname, LOOKUP_PARENT, &oldnd)) + error = path_walk(oldname, &oldnd); + + if (error) + goto exit; + + if (path_init(newname, LOOKUP_PARENT, &newnd)) + error = path_walk(newname, &newnd); + if (error) + goto exit1; + + error = -EXDEV; + if (oldnd.mnt != newnd.mnt) + goto exit2; + + old_dir = oldnd.dentry; + error = -EBUSY; + if (oldnd.last_type != LAST_NORM) + goto exit2; + + new_dir = newnd.dentry; + if (newnd.last_type != LAST_NORM) + goto exit2; + + double_lock(new_dir, old_dir); + + old_dentry = lookup_hash(&oldnd.last, old_dir); + error = PTR_ERR(old_dentry); + if (IS_ERR(old_dentry)) + goto exit3; + /* source must exist */ + error = -ENOENT; + if (!old_dentry->d_inode) + goto exit4; + /* unless the source is a directory trailing slashes give 
-ENOTDIR */ + if (!S_ISDIR(old_dentry->d_inode->i_mode)) { + error = -ENOTDIR; + if (oldnd.last.name[oldnd.last.len]) + goto exit4; + if (newnd.last.name[newnd.last.len]) + goto exit4; + } + new_dentry = lookup_hash(&newnd.last, new_dir); + error = PTR_ERR(new_dentry); + if (IS_ERR(new_dentry)) + goto exit4; + + lock_kernel(); + error = vfs_rename(old_dir->d_inode, old_dentry, + new_dir->d_inode, new_dentry); + unlock_kernel(); + + dput(new_dentry); +exit4: + dput(old_dentry); +exit3: + double_up(&new_dir->d_inode->i_sem, &old_dir->d_inode->i_sem); +exit2: + path_release(&newnd); +exit1: + path_release(&oldnd); +exit: + return error; +} + +asmlinkage long sys_rename(const char * oldname, const char * newname) +{ + int error; + char * from; + char * to; + + from = getname(oldname); + if(IS_ERR(from)) + return PTR_ERR(from); + to = getname(newname); + error = PTR_ERR(to); + if (!IS_ERR(to)) { + error = do_rename(from,to); + putname(to); + } + putname(from); + return error; +} + +int vfs_readlink(struct dentry *dentry, char *buffer, int buflen, const char *link) +{ + int len; + + len = PTR_ERR(link); + if (IS_ERR(link)) + goto out; + + len = strlen(link); + if (len > (unsigned) buflen) + len = buflen; + if (copy_to_user(buffer, link, len)) + len = -EFAULT; +out: + return len; +} + +static inline int +__vfs_follow_link(struct nameidata *nd, const char *link) +{ + int res = 0; + char *name; + if (IS_ERR(link)) + goto fail; + + if (*link == '/') { + path_release(nd); + if (!walk_init_root(link, nd)) + /* weird __emul_prefix() stuff did it */ + goto out; + } + res = link_path_walk(link, nd); +out: + if (current->link_count || res || nd->last_type!=LAST_NORM) + return res; + /* + * If it is an iterative symlinks resolution in open_namei() we + * have to copy the last component. And all that crap because of + * bloody create() on broken symlinks. Furrfu... 
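+ *
+ * An illustrative case (hypothetical names): open("x", O_CREAT)
+ * where "x" -> "missing". The walk ends on the nonexistent last
+ * component "missing", and open_namei() still needs that name to
+ * create the target -- but it currently points into the symlink
+ * body we are about to drop, hence the copy below.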
+ */ + name = __getname(); + if (!name) + return -ENOMEM; + strcpy(name, nd->last.name); + nd->last.name = name; + return 0; +fail: + path_release(nd); + return PTR_ERR(link); +} + +int vfs_follow_link(struct nameidata *nd, const char *link) +{ + return __vfs_follow_link(nd, link); +} + +/* get the link contents into pagecache */ +static char *page_getlink(struct dentry * dentry, struct page **ppage) +{ + struct page * page; + struct address_space *mapping = dentry->d_inode->i_mapping; + page = read_cache_page(mapping, 0, (filler_t *)mapping->a_ops->readpage, + NULL); + if (IS_ERR(page)) + goto sync_fail; + wait_on_page(page); + if (!Page_Uptodate(page)) + goto async_fail; + *ppage = page; + return kmap(page); + +async_fail: + page_cache_release(page); + return ERR_PTR(-EIO); + +sync_fail: + return (char*)page; +} + +int page_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + struct page *page = NULL; + char *s = page_getlink(dentry, &page); + int res = vfs_readlink(dentry,buffer,buflen,s); + if (page) { + kunmap(page); + page_cache_release(page); + } + return res; +} + +int page_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct page *page = NULL; + char *s = page_getlink(dentry, &page); + int res = __vfs_follow_link(nd, s); + if (page) { + kunmap(page); + page_cache_release(page); + } + return res; +} + +struct inode_operations page_symlink_inode_operations = { + readlink: page_readlink, + follow_link: page_follow_link, +}; diff -urN linux-2.4.16-reiserfspatches-immutable/fs/nfsd/vfs.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/nfsd/vfs.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/nfsd/vfs.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/nfsd/vfs.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,1591 @@ +#define MSNFS /* HACK HACK */ +/* + * linux/fs/nfsd/vfs.c + * + * File operations used by nfsd. Some of these have been ripped from + * other parts of the kernel because they weren't in ksyms.c, others + * are partial duplicates with added or changed functionality. + * + * Note that several functions dget() the dentry upon which they want + * to act, most notably those that create directory entries. Response + * dentry's are dput()'d if necessary in the release callback. + * So if you notice code paths that apparently fail to dput() the + * dentry, don't worry--they have been taken care of. + * + * Copyright (C) 1995-1999 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define __NO_VERSION__ +#include + +#include +#include +#ifdef CONFIG_NFSD_V3 +#include +#include +#endif /* CONFIG_NFSD_V3 */ +#include +#include + +#include + +#define NFSDDBG_FACILITY NFSDDBG_FILEOP +#define NFSD_PARANOIA + + +/* We must ignore files (but only files) which might have mandatory + * locks on them because there is no way to know if the accesser has + * the lock. + */ +#define IS_ISMNDLK(i) (S_ISREG((i)->i_mode) && MANDATORY_LOCK(i)) + +/* + * This is a cache of readahead params that help us choose the proper + * readahead strategy. Initially, we set all readahead parameters to 0 + * and let the VFS handle things. + * If you increase the number of cached files very much, you'll need to + * add a hash table here. 
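+ *
+ * In rough strokes (nfsd_get_raparms() further down is the
+ * authoritative version): scan the list for a matching
+ * (p_dev, p_ino) pair, remember the first entry with p_count == 0
+ * as a victim to recycle on a miss, then splice the winner to the
+ * front of the list so busy files stay cheap to find.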
+ */ +struct raparms { + struct raparms *p_next; + unsigned int p_count; + ino_t p_ino; + dev_t p_dev; + unsigned long p_reada, + p_ramax, + p_raend, + p_ralen, + p_rawin; +}; + +static struct raparms * raparml; +static struct raparms * raparm_cache; + +/* + * Look up one component of a pathname. + * N.B. After this call _both_ fhp and resfh need an fh_put + * + * If the lookup would cross a mountpoint, and the mounted filesystem + * is exported to the client with NFSEXP_CROSSMNT, then the lookup is + * accepted as it stands and the mounted directory is + * returned. Otherwise the covered directory is returned. + * NOTE: this mountpoint crossing is not supported properly by all + * clients and is explicitly disallowed for NFSv3 + * NeilBrown + */ +int +nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, + int len, struct svc_fh *resfh) +{ + struct svc_export *exp; + struct dentry *dparent; + struct dentry *dentry; + int err; + + dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); + + /* Obtain dentry and export. */ + err = fh_verify(rqstp, fhp, S_IFDIR, MAY_EXEC); + if (err) + goto out; + + dparent = fhp->fh_dentry; + exp = fhp->fh_export; + + err = nfserr_acces; + + /* Lookup the name, but don't follow links */ + if (isdotent(name, len)) { + if (len==1) + dentry = dget(dparent); + else { /* must be ".." */ + /* checking mountpoint crossing is very different when stepping up */ + if (dparent == exp->ex_dentry) { + if (!EX_CROSSMNT(exp)) + dentry = dget(dparent); /* .. == . just like at / */ + else + { + struct svc_export *exp2 = NULL; + struct dentry *dp; + struct vfsmount *mnt = mntget(exp->ex_mnt); + dentry = dget(dparent); + while(follow_up(&mnt, &dentry)) + ; + dp = dget(dentry->d_parent); + dput(dentry); + dentry = dp; + for ( ; exp2 == NULL && dp->d_parent != dp; + dp=dp->d_parent) + exp2 = exp_get(exp->ex_client, dp->d_inode->i_dev, dp->d_inode->i_ino); + if (exp2==NULL) { + dput(dentry); + dentry = dget(dparent); + } else { + exp = exp2; + } + mntput(mnt); + } + } else + dentry = dget(dparent->d_parent); + } + } else { + fh_lock(fhp); + dentry = lookup_one_len(name, dparent, len); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_nfserr; + /* + * check if we have crossed a mount point ... + */ + if (d_mountpoint(dentry)) { + struct svc_export *exp2 = NULL; + struct vfsmount *mnt = mntget(exp->ex_mnt); + struct dentry *mounts = dget(dentry); + while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts)) + ; + exp2 = exp_get(rqstp->rq_client, + mounts->d_inode->i_dev, + mounts->d_inode->i_ino); + if (exp2 && EX_CROSSMNT(exp2)) { + /* successfully crossed mount point */ + exp = exp2; + dput(dentry); + dentry = mounts; + } else + dput(mounts); + mntput(mnt); + } + } + /* + * Note: we compose the file handle now, but as the + * dentry may be negative, it may need to be updated. + */ + err = fh_compose(resfh, exp, dentry, fhp); + if (!err && !dentry->d_inode) + err = nfserr_noent; +out: + return err; + +out_nfserr: + err = nfserrno(err); + goto out; +} + +/* + * Set various file attributes. + * N.B. 
After this call fhp needs an fh_put + */ +int +nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, + int check_guard, time_t guardtime) +{ + struct dentry *dentry; + struct inode *inode; + int accmode = MAY_SATTR; + int ftype = 0; + int imode; + int err; + int size_change = 0; + + if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE)) + accmode |= MAY_WRITE|MAY_OWNER_OVERRIDE; + if (iap->ia_valid & ATTR_SIZE) + ftype = S_IFREG; + + /* Get inode */ + err = fh_verify(rqstp, fhp, ftype, accmode); + if (err || !iap->ia_valid) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + err = inode_change_ok(inode, iap); + /* could be a "touch" (utimes) request where the user is not the owner but does + * have write permission. In this case the user should be allowed to set + * both times to the current time. We could just assume any such SETATTR + * is intended to set the times to "now", but we do a couple of simple tests + * to increase our confidence. + */ +#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) +#define MAX_TOUCH_TIME_ERROR (30*60) + if (err + && (iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET + && iap->ia_mtime == iap->ia_atime + ) { + /* looks good. now just make sure time is in the right ballpark. + * solaris, at least, doesn't seem to care what the time request is + */ + time_t delta = iap->ia_atime - CURRENT_TIME; + if (delta<0) delta = -delta; + if (delta < MAX_TOUCH_TIME_ERROR) { + /* turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME + * this will cause notify_change to set these times to "now" + */ + iap->ia_valid &= ~BOTH_TIME_SET; + err = inode_change_ok(inode, iap); + } + } + + if (err) + goto out_nfserr; + + /* The size case is special. It changes the file as well as the attributes. */ + if (iap->ia_valid & ATTR_SIZE) { + if (iap->ia_size < inode->i_size) { + err = nfsd_permission(fhp->fh_export, dentry, MAY_TRUNC|MAY_OWNER_OVERRIDE); + if (err) + goto out; + } + + /* + * If we are changing the size of the file, then + * we need to break all leases. + */ + err = get_lease(inode, FMODE_WRITE); + if (err) + goto out_nfserr; + + err = get_write_access(inode); + if (err) + goto out_nfserr; + + err = locks_verify_truncate(inode, NULL, iap->ia_size); + if (err) { + put_write_access(inode); + goto out_nfserr; + } + DQUOT_INIT(inode); + } + + imode = inode->i_mode; + if (iap->ia_valid & ATTR_MODE) { + iap->ia_mode &= S_IALLUGO; + imode = iap->ia_mode |= (imode & ~S_IALLUGO); + } + + /* Revoke setuid/setgid bit on chown/chgrp */ + if ((iap->ia_valid & ATTR_UID) && (imode & S_ISUID) + && iap->ia_uid != inode->i_uid) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = imode &= ~S_ISUID; + } + if ((iap->ia_valid & ATTR_GID) && (imode & S_ISGID) + && iap->ia_gid != inode->i_gid) { + iap->ia_valid |= ATTR_MODE; + iap->ia_mode = imode &= ~S_ISGID; + } + + /* Change the attributes. 
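+	 * As for the "touch" heuristic above, the shape it accepts is,
+	 * roughly: both ATTR_ATIME_SET and ATTR_MTIME_SET present,
+	 * ia_mtime == ia_atime, and the timestamp within 30 minutes
+	 * (MAX_TOUCH_TIME_ERROR) of our own clock; the _SET bits are
+	 * then dropped so notify_change() stamps "now" instead.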
*/ + + + iap->ia_valid |= ATTR_CTIME; + + if (iap->ia_valid & ATTR_SIZE) { + fh_lock(fhp); + size_change = 1; + } + err = nfserr_notsync; + if (!check_guard || guardtime == inode->i_ctime) { + err = notify_change(dentry, iap); + err = nfserrno(err); + } + if (size_change) { + fh_unlock(fhp); + put_write_access(inode); + } + if (!err) + if (EX_ISSYNC(fhp->fh_export)) + write_inode_now(inode, 1); +out: + return err; + +out_nfserr: + err = nfserrno(err); + goto out; +} + +#ifdef CONFIG_NFSD_V3 +/* + * Check server access rights to a file system object + */ +struct accessmap { + u32 access; + int how; +}; +static struct accessmap nfs3_regaccess[] = { + { NFS3_ACCESS_READ, MAY_READ }, + { NFS3_ACCESS_EXECUTE, MAY_EXEC }, + { NFS3_ACCESS_MODIFY, MAY_WRITE|MAY_TRUNC }, + { NFS3_ACCESS_EXTEND, MAY_WRITE }, + + { 0, 0 } +}; + +static struct accessmap nfs3_diraccess[] = { + { NFS3_ACCESS_READ, MAY_READ }, + { NFS3_ACCESS_LOOKUP, MAY_EXEC }, + { NFS3_ACCESS_MODIFY, MAY_EXEC|MAY_WRITE|MAY_TRUNC }, + { NFS3_ACCESS_EXTEND, MAY_EXEC|MAY_WRITE }, + { NFS3_ACCESS_DELETE, MAY_REMOVE }, + + { 0, 0 } +}; + +static struct accessmap nfs3_anyaccess[] = { + /* Some clients - Solaris 2.6 at least, make an access call + * to the server to check for access for things like /dev/null + * (which really, the server doesn't care about). So + * We provide simple access checking for them, looking + * mainly at mode bits + */ + { NFS3_ACCESS_READ, MAY_READ }, + { NFS3_ACCESS_EXECUTE, MAY_EXEC }, + { NFS3_ACCESS_MODIFY, MAY_WRITE }, + { NFS3_ACCESS_EXTEND, MAY_WRITE }, + + { 0, 0 } +}; + +int +nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access) +{ + struct accessmap *map; + struct svc_export *export; + struct dentry *dentry; + u32 query, result = 0; + unsigned int error; + + error = fh_verify(rqstp, fhp, 0, MAY_NOP); + if (error) + goto out; + + export = fhp->fh_export; + dentry = fhp->fh_dentry; + + if (S_ISREG(dentry->d_inode->i_mode)) + map = nfs3_regaccess; + else if (S_ISDIR(dentry->d_inode->i_mode)) + map = nfs3_diraccess; + else + map = nfs3_anyaccess; + + + query = *access; + for (; map->access; map++) { + if (map->access & query) { + unsigned int err2; + err2 = nfsd_permission(export, dentry, map->how); + switch (err2) { + case nfs_ok: + result |= map->access; + break; + + /* the following error codes just mean the access was not allowed, + * rather than an error occurred */ + case nfserr_rofs: + case nfserr_acces: + case nfserr_perm: + /* simply don't "or" in the access bit. */ + break; + default: + error = err2; + goto out; + } + } + } + *access = result; + + out: + return error; +} +#endif /* CONFIG_NFSD_V3 */ + + + +/* + * Open an existing file or directory. + * The access argument indicates the type of open (read/write/lock) + * N.B. 
After this call fhp needs an fh_put + */ +int +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, + int access, struct file *filp) +{ + struct dentry *dentry; + struct inode *inode; + int err; + + /* If we get here, then the client has already done an "open", and (hopefully) + * checked permission - so allow OWNER_OVERRIDE in case a chmod has now revoked + * permission */ + err = fh_verify(rqstp, fhp, type, access | MAY_OWNER_OVERRIDE); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + /* Disallow access to files with the append-only bit set or + * with mandatory locking enabled + */ + err = nfserr_perm; + if (IS_APPEND(inode) || IS_ISMNDLK(inode)) + goto out; + if (!inode->i_fop) + goto out; + + /* + * Check to see if there are any leases on this file. + * This may block while leases are broken. + */ + err = get_lease(inode, (access & MAY_WRITE) ? FMODE_WRITE : 0); + if (err) + goto out_nfserr; + + if ((access & MAY_WRITE) && (err = get_write_access(inode)) != 0) + goto out_nfserr; + + memset(filp, 0, sizeof(*filp)); + filp->f_op = fops_get(inode->i_fop); + atomic_set(&filp->f_count, 1); + filp->f_dentry = dentry; + filp->f_vfsmnt = fhp->fh_export->ex_mnt; + if (access & MAY_WRITE) { + filp->f_flags = O_WRONLY|O_LARGEFILE; + filp->f_mode = FMODE_WRITE; + DQUOT_INIT(inode); + } else { + filp->f_flags = O_RDONLY|O_LARGEFILE; + filp->f_mode = FMODE_READ; + } + + err = 0; + if (filp->f_op && filp->f_op->open) { + err = filp->f_op->open(inode, filp); + if (err) { + fops_put(filp->f_op); + if (access & MAY_WRITE) + put_write_access(inode); + + /* I nearly added put_filp() call here, but this filp + * is really on callers stack frame. -DaveM + */ + atomic_dec(&filp->f_count); + } + } +out_nfserr: + if (err) + err = nfserrno(err); +out: + return err; +} + +/* + * Close a file. + */ +void +nfsd_close(struct file *filp) +{ + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + + if (filp->f_op && filp->f_op->release) + filp->f_op->release(inode, filp); + fops_put(filp->f_op); + if (filp->f_mode & FMODE_WRITE) + put_write_access(inode); +} + +/* + * Sync a file + * As this calls fsync (not fdatasync) there is no need for a write_inode + * after it. + */ +void +nfsd_sync(struct file *filp) +{ + dprintk("nfsd: sync file %s\n", filp->f_dentry->d_name.name); + down(&filp->f_dentry->d_inode->i_sem); + filp->f_op->fsync(filp, filp->f_dentry, 0); + up(&filp->f_dentry->d_inode->i_sem); +} + +void +nfsd_sync_dir(struct dentry *dp) +{ + struct inode *inode = dp->d_inode; + int (*fsync) (struct file *, struct dentry *, int); + + if (inode->i_fop && (fsync = inode->i_fop->fsync)) { + fsync(NULL, dp, 0); + } +} + +/* + * Obtain the readahead parameters for the file + * specified by (dev, ino). + */ +static inline struct raparms * +nfsd_get_raparms(dev_t dev, ino_t ino) +{ + struct raparms *ra, **rap, **frap = NULL; + int depth = 0; + + for (rap = &raparm_cache; (ra = *rap); rap = &ra->p_next) { + if (ra->p_ino == ino && ra->p_dev == dev) + goto found; + depth++; + if (ra->p_count == 0) + frap = rap; + } + depth = nfsdstats.ra_size*11/10; + if (!frap) + return NULL; + rap = frap; + ra = *frap; + memset(ra, 0, sizeof(*ra)); + ra->p_dev = dev; + ra->p_ino = ino; +found: + if (rap != &raparm_cache) { + *rap = ra->p_next; + ra->p_next = raparm_cache; + raparm_cache = ra; + } + ra->p_count++; + nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; + return ra; +} + +/* + * Read data from a file. 
count must contain the requested read count + * on entry. On return, *count contains the number of bytes actually read. + * N.B. After this call fhp needs an fh_put + */ +int +nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, + char *buf, unsigned long *count) +{ + struct raparms *ra; + mm_segment_t oldfs; + int err; + struct file file; + + err = nfsd_open(rqstp, fhp, S_IFREG, MAY_READ, &file); + if (err) + goto out; + err = nfserr_perm; + if (!file.f_op->read) + goto out_close; +#ifdef MSNFS + if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && + (!lock_may_read(file.f_dentry->d_inode, offset, *count))) + goto out_close; +#endif + + /* Get readahead parameters */ + ra = nfsd_get_raparms(fhp->fh_export->ex_dev, fhp->fh_dentry->d_inode->i_ino); + if (ra) { + file.f_reada = ra->p_reada; + file.f_ramax = ra->p_ramax; + file.f_raend = ra->p_raend; + file.f_ralen = ra->p_ralen; + file.f_rawin = ra->p_rawin; + } + file.f_pos = offset; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = file.f_op->read(&file, buf, *count, &file.f_pos); + set_fs(oldfs); + + /* Write back readahead params */ + if (ra != NULL) { + dprintk("nfsd: raparms %ld %ld %ld %ld %ld\n", + file.f_reada, file.f_ramax, file.f_raend, + file.f_ralen, file.f_rawin); + ra->p_reada = file.f_reada; + ra->p_ramax = file.f_ramax; + ra->p_raend = file.f_raend; + ra->p_ralen = file.f_ralen; + ra->p_rawin = file.f_rawin; + ra->p_count -= 1; + } + + if (err >= 0) { + nfsdstats.io_read += err; + *count = err; + err = 0; + } else + err = nfserrno(err); +out_close: + nfsd_close(&file); +out: + return err; +} + +/* + * Write data to a file. + * The stable flag requests synchronous writes. + * N.B. After this call fhp needs an fh_put + */ +int +nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, + char *buf, unsigned long cnt, int *stablep) +{ + struct svc_export *exp; + struct file file; + struct dentry *dentry; + struct inode *inode; + mm_segment_t oldfs; + int err = 0; + int stable = *stablep; + + err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file); + if (err) + goto out; + if (!cnt) + goto out_close; + err = nfserr_perm; + if (!file.f_op->write) + goto out_close; +#ifdef MSNFS + if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && + (!lock_may_write(file.f_dentry->d_inode, offset, cnt))) + goto out_close; +#endif + + dentry = file.f_dentry; + inode = dentry->d_inode; + exp = fhp->fh_export; + + /* + * Request sync writes if + * - the sync export option has been set, or + * - the client requested O_SYNC behavior (NFSv3 feature). + * - The file system doesn't support fsync(). + * When gathered writes have been configured for this volume, + * flushing the data to disk is handled separately below. + */ + + if (file.f_op->fsync == 0) {/* COMMIT3 cannot work */ + stable = 2; + *stablep = 2; /* FILE_SYNC */ + } + + if (!EX_ISSYNC(exp)) + stable = 0; + if (stable && !EX_WGATHER(exp)) + file.f_flags |= O_SYNC; + + file.f_pos = offset; /* set write offset */ + + /* Write the data. 
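+	 * By now "stable" reflects the policy above: forced to 2
+	 * (FILE_SYNC) when the fs lacks fsync, cleared for async
+	 * exports, and for sync exports without write gathering the
+	 * O_SYNC flag is already set on the file, so the plain write
+	 * below flushes as it goes.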
*/ + oldfs = get_fs(); set_fs(KERNEL_DS); + err = file.f_op->write(&file, buf, cnt, &file.f_pos); + if (err >= 0) + nfsdstats.io_write += cnt; + set_fs(oldfs); + + /* clear setuid/setgid flag after write */ + if (err >= 0 && (inode->i_mode & (S_ISUID | S_ISGID))) { + struct iattr ia; + + ia.ia_valid = ATTR_MODE; + ia.ia_mode = inode->i_mode & ~(S_ISUID | S_ISGID); + notify_change(dentry, &ia); + } + + if (err >= 0 && stable) { + static unsigned long last_ino; + static kdev_t last_dev = NODEV; + + /* + * Gathered writes: If another process is currently + * writing to the file, there's a high chance + * this is another nfsd (triggered by a bulk write + * from a client's biod). Rather than syncing the + * file with each write request, we sleep for 10 msec. + * + * I don't know if this roughly approximates + * C. Juszak's idea of gathered writes, but it's a + * nice and simple solution (IMHO), and it seems to + * work:-) + */ + if (EX_WGATHER(exp)) { + if (atomic_read(&inode->i_writecount) > 1 + || (last_ino == inode->i_ino && last_dev == inode->i_dev)) { + dprintk("nfsd: write defer %d\n", current->pid); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout((HZ+99)/100); + current->state = TASK_RUNNING; + dprintk("nfsd: write resume %d\n", current->pid); + } + + if (inode->i_state & I_DIRTY) { + dprintk("nfsd: write sync %d\n", current->pid); + nfsd_sync(&file); + } +#if 0 + wake_up(&inode->i_wait); +#endif + } + last_ino = inode->i_ino; + last_dev = inode->i_dev; + } + + dprintk("nfsd: write complete err=%d\n", err); + if (err >= 0) + err = 0; + else + err = nfserrno(err); +out_close: + nfsd_close(&file); +out: + return err; +} + + +#ifdef CONFIG_NFSD_V3 +/* + * Commit all pending writes to stable storage. + * Strictly speaking, we could sync just the indicated file region here, + * but there's currently no way we can ask the VFS to do so. + * + * Unfortunately we cannot lock the file to make sure we return full WCC + * data to the client, as locking happens lower down in the filesystem. + */ +int +nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, + off_t offset, unsigned long count) +{ + struct file file; + int err; + + if ((err = nfsd_open(rqstp, fhp, S_IFREG, MAY_WRITE, &file)) != 0) + return err; + if (EX_ISSYNC(fhp->fh_export)) { + if (file.f_op && file.f_op->fsync) { + nfsd_sync(&file); + } else { + err = nfserr_notsupp; + } + } + + nfsd_close(&file); + return err; +} +#endif /* CONFIG_NFSD_V3 */ + +/* + * Create a file (regular, directory, device, fifo); UNIX sockets + * not yet implemented. + * If the response fh has been verified, the parent directory should + * already be locked. Note that the parent directory is left locked. + * + * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp + */ +int +nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + int type, dev_t rdev, struct svc_fh *resfhp) +{ + struct dentry *dentry, *dchild; + struct inode *dirp; + int err; + + err = nfserr_perm; + if (!flen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + + err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + if (err) + goto out; + + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + + err = nfserr_notdir; + if(!dirp->i_op || !dirp->i_op->lookup) + goto out; + /* + * Check whether the response file handle has been verified yet. + * If it has, the parent directory should already be locked. 
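+	 * (Two callers, two shapes: nfsd_proc_create hands us a resfh
+	 * that already carries the child dentry with the parent locked,
+	 * while the mkdir and V3 create paths arrive with an empty
+	 * resfh and the lookup happens below.)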
+ */ + if (!resfhp->fh_dentry) { + /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ + fh_lock(fhp); + dchild = lookup_one_len(fname, dentry, flen); + err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; + } else { + /* called from nfsd_proc_create */ + dchild = resfhp->fh_dentry; + if (!fhp->fh_locked) { + /* not actually possible */ + printk(KERN_ERR + "nfsd_create: parent %s/%s not locked!\n", + dentry->d_parent->d_name.name, + dentry->d_name.name); + err = -EIO; + goto out; + } + } + /* + * Make sure the child dentry is still negative ... + */ + err = nfserr_exist; + if (dchild->d_inode) { + dprintk("nfsd_create: dentry %s/%s not negative!\n", + dentry->d_name.name, dchild->d_name.name); + goto out; + } + + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type; + + /* + * Get the dir op function pointer. + */ + err = nfserr_perm; + switch (type) { + case S_IFREG: + err = vfs_create(dirp, dchild, iap->ia_mode); + break; + case S_IFDIR: + err = vfs_mkdir(dirp, dchild, iap->ia_mode); + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); + break; + default: + printk("nfsd: bad file type %o in nfsd_create\n", type); + err = -EINVAL; + } + if (err < 0) + goto out_nfserr; + + if (EX_ISSYNC(fhp->fh_export)) { + nfsd_sync_dir(dentry); + write_inode_now(dchild->d_inode, 1); + } + + + /* Set file attributes. Mode has already been set and + * setting uid/gid works only for root. Irix appears to + * send along the gid when it tries to implement setgid + * directories via NFS. + */ + err = 0; + if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) + err = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); + /* + * Update the file handle to get the new inode info. + */ + if (!err) + err = fh_update(resfhp); +out: + return err; + +out_nfserr: + err = nfserrno(err); + goto out; +} + +#ifdef CONFIG_NFSD_V3 +/* + * NFSv3 version of nfsd_create + */ +int +nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, struct iattr *iap, + struct svc_fh *resfhp, int createmode, u32 *verifier) +{ + struct dentry *dentry, *dchild; + struct inode *dirp; + int err; + __u32 v_mtime=0, v_atime=0; + int v_mode=0; + + err = nfserr_perm; + if (!flen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + if (!(iap->ia_valid & ATTR_MODE)) + iap->ia_mode = 0; + err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + if (err) + goto out; + + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + + /* Get all the sanity checks out of the way before + * we lock the parent. */ + err = nfserr_notdir; + if(!dirp->i_op || !dirp->i_op->lookup) + goto out; + fh_lock(fhp); + + /* + * Compose the response file handle. 
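+	 * (A worked example for the EXCLUSIVE-create verifier stashing
+	 * a bit further down: verifier = { 0x80000001, 0x80000002 }
+	 * gives v_mtime = 1, v_atime = 2, and the two stolen top bits
+	 * land in v_mode as u+x and u+r, i.e. S_IFREG | 0500.)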
+ */ + dchild = lookup_one_len(fname, dentry, flen); + err = PTR_ERR(dchild); + if (IS_ERR(dchild)) + goto out_nfserr; + + err = fh_compose(resfhp, fhp->fh_export, dchild, fhp); + if (err) + goto out; + + if (createmode == NFS3_CREATE_EXCLUSIVE) { + /* while the verifier would fit in mtime+atime, + * solaris7 gets confused (bugid 4218508) if these have + * the high bit set, so we use the mode as well + */ + v_mtime = verifier[0]&0x7fffffff; + v_atime = verifier[1]&0x7fffffff; + v_mode = S_IFREG + | ((verifier[0]&0x80000000) >> (32-7)) /* u+x */ + | ((verifier[1]&0x80000000) >> (32-9)) /* u+r */ + ; + } + + if (dchild->d_inode) { + err = 0; + + switch (createmode) { + case NFS3_CREATE_UNCHECKED: + if (! S_ISREG(dchild->d_inode->i_mode)) + err = nfserr_exist; + else { + iap->ia_valid &= ATTR_SIZE; + goto set_attr; + } + break; + case NFS3_CREATE_EXCLUSIVE: + if ( dchild->d_inode->i_mtime == v_mtime + && dchild->d_inode->i_atime == v_atime + && dchild->d_inode->i_mode == v_mode + && dchild->d_inode->i_size == 0 ) + break; + /* fallthru */ + case NFS3_CREATE_GUARDED: + err = nfserr_exist; + } + goto out; + } + + err = vfs_create(dirp, dchild, iap->ia_mode); + if (err < 0) + goto out_nfserr; + + if (EX_ISSYNC(fhp->fh_export)) { + nfsd_sync_dir(dentry); + /* setattr will sync the child (or not) */ + } + + /* + * Update the filehandle to get the new inode info. + */ + err = fh_update(resfhp); + if (err) + goto out; + + if (createmode == NFS3_CREATE_EXCLUSIVE) { + /* Cram the verifier into atime/mtime/mode */ + iap->ia_valid = ATTR_MTIME|ATTR_ATIME + | ATTR_MTIME_SET|ATTR_ATIME_SET + | ATTR_MODE; + iap->ia_mtime = v_mtime; + iap->ia_atime = v_atime; + iap->ia_mode = v_mode; + } + + /* Set file attributes. + * Mode has already been set but we might need to reset it + * for CREATE_EXCLUSIVE + * Irix appears to send along the gid when it tries to + * implement setgid directories via NFS. Clear out all that cruft. + */ + set_attr: + if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID)) != 0) + err = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); + + out: + fh_unlock(fhp); + return err; + + out_nfserr: + err = nfserrno(err); + goto out; +} +#endif /* CONFIG_NFSD_V3 */ + +/* + * Read a symlink. On entry, *lenp must contain the maximum path length that + * fits into the buffer. On return, it contains the true length. + * N.B. After this call fhp needs an fh_put + */ +int +nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) +{ + struct dentry *dentry; + struct inode *inode; + mm_segment_t oldfs; + int err; + + err = fh_verify(rqstp, fhp, S_IFLNK, MAY_NOP); + if (err) + goto out; + + dentry = fhp->fh_dentry; + inode = dentry->d_inode; + + err = nfserr_inval; + if (!inode->i_op || !inode->i_op->readlink) + goto out; + + UPDATE_ATIME(inode); + /* N.B. Why does this call need a get_fs()?? + * Remove the set_fs and watch the fireworks:-) --okir + */ + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = inode->i_op->readlink(dentry, buf, *lenp); + set_fs(oldfs); + + if (err < 0) + goto out_nfserr; + *lenp = err; + err = 0; +out: + return err; + +out_nfserr: + err = nfserrno(err); + goto out; +} + +/* + * Create a symlink and look up its inode + * N.B. 
After this call _both_ fhp and resfhp need an fh_put + */ +int +nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *fname, int flen, + char *path, int plen, + struct svc_fh *resfhp, + struct iattr *iap) +{ + struct dentry *dentry, *dnew; + int err, cerr; + + err = nfserr_noent; + if (!flen || !plen) + goto out; + err = nfserr_exist; + if (isdotent(fname, flen)) + goto out; + + err = fh_verify(rqstp, fhp, S_IFDIR, MAY_CREATE); + if (err) + goto out; + fh_lock(fhp); + dentry = fhp->fh_dentry; + dnew = lookup_one_len(fname, dentry, flen); + err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + err = vfs_symlink(dentry->d_inode, dnew, path); + if (!err) { + if (EX_ISSYNC(fhp->fh_export)) + nfsd_sync_dir(dentry); + if (iap) { + iap->ia_valid &= ATTR_MODE /* ~(ATTR_MODE|ATTR_UID|ATTR_GID)*/; + if (iap->ia_valid) { + iap->ia_valid |= ATTR_CTIME; + iap->ia_mode = (iap->ia_mode&S_IALLUGO) + | S_IFLNK; + err = notify_change(dnew, iap); + if (!err && EX_ISSYNC(fhp->fh_export)) + write_inode_now(dentry->d_inode, 1); + } + } + } else + err = nfserrno(err); + fh_unlock(fhp); + + /* Compose the fh so the dentry will be freed ... */ + cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); + if (err==0) err = cerr; +out: + return err; + +out_nfserr: + err = nfserrno(err); + goto out; +} + +/* + * Create a hardlink + * N.B. After this call _both_ ffhp and tfhp need an fh_put + */ +int +nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, + char *name, int len, struct svc_fh *tfhp) +{ + struct dentry *ddir, *dnew, *dold; + struct inode *dirp, *dest; + int err; + + err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_CREATE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, -S_IFDIR, MAY_NOP); + if (err) + goto out; + + err = nfserr_perm; + if (!len) + goto out; + err = nfserr_exist; + if (isdotent(name, len)) + goto out; + + fh_lock(ffhp); + ddir = ffhp->fh_dentry; + dirp = ddir->d_inode; + + dnew = lookup_one_len(name, ddir, len); + err = PTR_ERR(dnew); + if (IS_ERR(dnew)) + goto out_nfserr; + + dold = tfhp->fh_dentry; + dest = dold->d_inode; + + err = vfs_link(dold, dirp, dnew); + if (!err) { + if (EX_ISSYNC(ffhp->fh_export)) { + nfsd_sync_dir(ddir); + write_inode_now(dest, 1); + } + } else { + if (err == -EXDEV && rqstp->rq_vers == 2) + err = nfserr_acces; + else + err = nfserrno(err); + } + + fh_unlock(ffhp); + dput(dnew); +out: + return err; + +out_nfserr: + err = nfserrno(err); + goto out; +} + +/* + * Rename a file + * N.B. After this call _both_ ffhp and tfhp need an fh_put + */ +int +nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, + struct svc_fh *tfhp, char *tname, int tlen) +{ + struct dentry *fdentry, *tdentry, *odentry, *ndentry; + struct inode *fdir, *tdir; + int err; + + err = fh_verify(rqstp, ffhp, S_IFDIR, MAY_REMOVE); + if (err) + goto out; + err = fh_verify(rqstp, tfhp, S_IFDIR, MAY_CREATE); + if (err) + goto out; + + fdentry = ffhp->fh_dentry; + fdir = fdentry->d_inode; + + tdentry = tfhp->fh_dentry; + tdir = tdentry->d_inode; + + err = (rqstp->rq_vers == 2) ? 
nfserr_acces : nfserr_xdev; + if (fdir->i_dev != tdir->i_dev) + goto out; + + err = nfserr_perm; + if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) + goto out; + + /* cannot use fh_lock as we need deadlock protective ordering + * so do it by hand */ + double_down(&tdir->i_sem, &fdir->i_sem); + ffhp->fh_locked = tfhp->fh_locked = 1; + fill_pre_wcc(ffhp); + fill_pre_wcc(tfhp); + + odentry = lookup_one_len(fname, fdentry, flen); + err = PTR_ERR(odentry); + if (IS_ERR(odentry)) + goto out_nfserr; + + err = -ENOENT; + if (!odentry->d_inode) + goto out_dput_old; + + ndentry = lookup_one_len(tname, tdentry, tlen); + err = PTR_ERR(ndentry); + if (IS_ERR(ndentry)) + goto out_dput_old; + + +#ifdef MSNFS + if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) && + ((atomic_read(&odentry->d_count) > 1) + || (atomic_read(&ndentry->d_count) > 1))) { + err = nfserr_perm; + } else +#endif + err = vfs_rename(fdir, odentry, tdir, ndentry); + if (!err && EX_ISSYNC(tfhp->fh_export)) { + nfsd_sync_dir(tdentry); + nfsd_sync_dir(fdentry); + } + dput(ndentry); + + out_dput_old: + dput(odentry); + out_nfserr: + if (err) + err = nfserrno(err); + + /* we cannot reply on fh_unlock on the two filehandles, + * as that would do the wrong thing if the two directories + * were the same, so again we do it by hand + */ + fill_post_wcc(ffhp); + fill_post_wcc(tfhp); + double_up(&tdir->i_sem, &fdir->i_sem); + ffhp->fh_locked = tfhp->fh_locked = 0; + +out: + return err; +} + +/* + * Unlink a file or directory + * N.B. After this call fhp needs an fh_put + */ +int +nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, + char *fname, int flen) +{ + struct dentry *dentry, *rdentry; + struct inode *dirp; + int err; + + err = nfserr_acces; + if (!flen || isdotent(fname, flen)) + goto out; + err = fh_verify(rqstp, fhp, S_IFDIR, MAY_REMOVE); + if (err) + goto out; + + fh_lock(fhp); + dentry = fhp->fh_dentry; + dirp = dentry->d_inode; + + rdentry = lookup_one_len(fname, dentry, flen); + err = PTR_ERR(rdentry); + if (IS_ERR(rdentry)) + goto out_nfserr; + + if (!rdentry->d_inode) { + dput(rdentry); + err = nfserr_noent; + goto out; + } + + if (type != S_IFDIR) { /* It's UNLINK */ +#ifdef MSNFS + if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && + (atomic_read(&rdentry->d_count) > 1)) { + err = nfserr_perm; + } else +#endif + err = vfs_unlink(dirp, rdentry); + } else { /* It's RMDIR */ + err = vfs_rmdir(dirp, rdentry); + } + + dput(rdentry); + + if (err) + goto out_nfserr; + if (EX_ISSYNC(fhp->fh_export)) + nfsd_sync_dir(dentry); + +out: + return err; + +out_nfserr: + err = nfserrno(err); + goto out; +} + +/* + * Read entries from a directory. + * The verifier is an NFSv3 thing we ignore for now. + */ +int +nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, + encode_dent_fn func, u32 *buffer, int *countp, u32 *verf) +{ + struct inode *inode; + u32 *p; + int oldlen, eof, err; + struct file file; + struct readdir_cd cd; + + err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file); + if (err) + goto out; + if (offset > ~(u32) 0) + goto out_close; + + err = nfserr_notdir; + if (!file.f_op->readdir) + goto out_close; + file.f_pos = offset; + + /* Set up the readdir context */ + memset(&cd, 0, sizeof(cd)); + cd.rqstp = rqstp; + cd.buffer = buffer; + cd.buflen = *countp; /* count of words */ + cd.dirfh = fhp; + + /* + * Read the directory entries. This silly loop is necessary because + * readdir() is not guaranteed to fill up the entire buffer, but + * may choose to do less. 
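+	 * A filesystem may stop early -- at the end of a block, say --
+	 * with room still left in the buffer; only a pass that adds
+	 * nothing (oldlen == cd.buflen) or an explicit cd.eob means we
+	 * are really done.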
+ */ + inode = file.f_dentry->d_inode; + down(&inode->i_sem); + while (1) { + oldlen = cd.buflen; + + /* + dprintk("nfsd: f_op->readdir(%x/%ld @ %d) buflen = %d (%d)\n", + file.f_inode->i_dev, file.f_inode->i_ino, + (int) file.f_pos, (int) oldlen, (int) cd.buflen); + */ + err = file.f_op->readdir(&file, &cd, (filldir_t) func); + if (err < 0) + goto out_nfserr; + if (oldlen == cd.buflen) + break; + if (cd.eob) + break; + } + up(&inode->i_sem); + + /* If we didn't fill the buffer completely, we're at EOF */ + eof = !cd.eob; + + if (cd.offset) { + if (rqstp->rq_vers == 3) + (void)xdr_encode_hyper(cd.offset, file.f_pos); + else + *cd.offset = htonl(file.f_pos); + } + + p = cd.buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl(eof); /* end of directory */ + *countp = (caddr_t) p - (caddr_t) buffer; + + dprintk("nfsd: readdir result %d bytes, eof %d offset %d\n", + *countp, eof, + cd.offset? ntohl(*cd.offset) : -1); + err = 0; +out_close: + nfsd_close(&file); +out: + return err; + +out_nfserr: + up(&inode->i_sem); + err = nfserrno(err); + goto out_close; +} + +/* + * Get file system stats + * N.B. After this call fhp needs an fh_put + */ +int +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct statfs *stat) +{ + int err = fh_verify(rqstp, fhp, 0, MAY_NOP); + if (!err && vfs_statfs(fhp->fh_dentry->d_inode->i_sb,stat)) + err = nfserr_io; + return err; +} + +/* + * Check for a user's access permissions to this inode. + */ +int +nfsd_permission(struct svc_export *exp, struct dentry *dentry, int acc) +{ + struct inode *inode = dentry->d_inode; + int err; + + if (acc == MAY_NOP) + return 0; +#if 0 + dprintk("nfsd: permission 0x%x%s%s%s%s%s%s%s mode 0%o%s%s%s%s\n", + acc, + (acc & MAY_READ)? " read" : "", + (acc & MAY_WRITE)? " write" : "", + (acc & MAY_EXEC)? " exec" : "", + (acc & MAY_SATTR)? " sattr" : "", + (acc & MAY_TRUNC)? " trunc" : "", + (acc & MAY_LOCK)? " lock" : "", + (acc & MAY_OWNER_OVERRIDE)? " owneroverride" : "", + inode->i_mode, + IS_IMMUTABLE_FILE(inode)? " immut(F)" : "", + IS_IMMUTABLE_LINK(inode)? " immut(L)" : "", + IS_APPEND(inode)? " append" : "", + IS_RDONLY(inode)? " ro" : ""); + dprintk(" owner %d/%d user %d/%d\n", + inode->i_uid, inode->i_gid, current->fsuid, current->fsgid); +#endif + + /* only care about readonly exports for files and + * directories. links don't have meaningful write access, + * and all else is local to the client + */ + if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)) + if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { + if (EX_RDONLY(exp) || IS_RDONLY(inode)) + return nfserr_rofs; + if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE_FILE(inode)) + return nfserr_perm; + } + if ((acc & MAY_TRUNC) && IS_APPEND(inode)) + return nfserr_perm; + + if (acc & MAY_LOCK) { + /* If we cannot rely on authentication in NLM requests, + * just allow locks, otherwise require read permission, or + * ownership + */ + if (exp->ex_flags & NFSEXP_NOAUTHNLM) + return 0; + else + acc = MAY_READ | MAY_OWNER_OVERRIDE; + } + /* + * The file owner always gets access permission for accesses that + * would normally be checked at open time. This is to make + * file access work even when the client has done a fchmod(fd, 0). + * + * However, `cp foo bar' should fail nevertheless when bar is + * readonly. A sensible way to do this might be to reject all + * attempts to truncate a read-only file, because a creat() call + * always implies file truncation. + * ... but this isn't really fair. 
A process may reasonably call + * ftruncate on an open file descriptor on a file with perm 000. + * We must trust the client to do permission checking - using "ACCESS" + * with NFSv3. + */ + if ((acc & MAY_OWNER_OVERRIDE) && + inode->i_uid == current->fsuid) + return 0; + + acc &= ~ MAY_OWNER_OVERRIDE; /* This bit is no longer needed, + and gets in the way later */ + + err = permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); + + /* Allow read access to binaries even when mode 111 */ + if (err == -EACCES && S_ISREG(inode->i_mode) && acc == MAY_READ) + err = permission(inode, MAY_EXEC); + + return err? nfserrno(err) : 0; +} + +void +nfsd_racache_shutdown(void) +{ + if (!raparm_cache) + return; + dprintk("nfsd: freeing readahead buffers.\n"); + kfree(raparml); + raparm_cache = raparml = NULL; +} +/* + * Initialize readahead param cache + */ +int +nfsd_racache_init(int cache_size) +{ + int i; + + if (raparm_cache) + return 0; + raparml = kmalloc(sizeof(struct raparms) * cache_size, GFP_KERNEL); + + if (raparml != NULL) { + dprintk("nfsd: allocating %d readahead buffers.\n", + cache_size); + memset(raparml, 0, sizeof(struct raparms) * cache_size); + for (i = 0; i < cache_size - 1; i++) { + raparml[i].p_next = raparml + i + 1; + } + raparm_cache = raparml; + } else { + printk(KERN_WARNING + "nfsd: Could not allocate memory read-ahead cache.\n"); + return -ENOMEM; + } + nfsdstats.ra_size = cache_size; + return 0; +} diff -urN linux-2.4.16-reiserfspatches-immutable/fs/open.c linux-2.4.16-reiserfspatches-immutable-ctx4/fs/open.c --- linux-2.4.16-reiserfspatches-immutable/fs/open.c Mon Dec 10 14:28:03 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/open.c Mon Dec 10 15:01:47 2001 @@ -122,7 +122,7 @@ goto dput_and_out; error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + if (IS_IMMUTABLE_FILE(inode) || IS_APPEND(inode)) goto dput_and_out; /* diff -urN linux-2.4.16-reiserfspatches-immutable/fs/open.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/open.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/open.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/open.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,895 @@ +/* + * linux/fs/open.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) + +int vfs_statfs(struct super_block *sb, struct statfs *buf) +{ + int retval = -ENODEV; + + if (sb) { + retval = -ENOSYS; + if (sb->s_op && sb->s_op->statfs) { + memset(buf, 0, sizeof(struct statfs)); + lock_kernel(); + retval = sb->s_op->statfs(sb, buf); + unlock_kernel(); + } + } + return retval; +} + + +asmlinkage long sys_statfs(const char * path, struct statfs * buf) +{ + struct nameidata nd; + int error; + + error = user_path_walk(path, &nd); + if (!error) { + struct statfs tmp; + error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(struct statfs))) + error = -EFAULT; + path_release(&nd); + } + return error; +} + +asmlinkage long sys_fstatfs(unsigned int fd, struct statfs * buf) +{ + struct file * file; + struct statfs tmp; + int error; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp); + if (!error && copy_to_user(buf, &tmp, sizeof(struct statfs))) + error = -EFAULT; + fput(file); +out: + return error; +} + +int do_truncate(struct dentry *dentry, 
loff_t length) +{ + struct inode *inode = dentry->d_inode; + int error; + struct iattr newattrs; + + /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ + if (length < 0) + return -EINVAL; + + down(&inode->i_sem); + newattrs.ia_size = length; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + error = notify_change(dentry, &newattrs); + up(&inode->i_sem); + return error; +} + +static inline long do_sys_truncate(const char * path, loff_t length) +{ + struct nameidata nd; + struct inode * inode; + int error; + + error = -EINVAL; + if (length < 0) /* sorry, but loff_t says... */ + goto out; + + error = user_path_walk(path, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; + + /* For directories it's -EISDIR, for other non-regulars - -EINVAL */ + error = -EISDIR; + if (S_ISDIR(inode->i_mode)) + goto dput_and_out; + + error = -EINVAL; + if (!S_ISREG(inode->i_mode)) + goto dput_and_out; + + error = permission(inode,MAY_WRITE); + if (error) + goto dput_and_out; + + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; + + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto dput_and_out; + + /* + * Make sure that there are no leases. + */ + error = get_lease(inode, FMODE_WRITE); + if (error) + goto dput_and_out; + + error = get_write_access(inode); + if (error) + goto dput_and_out; + + error = locks_verify_truncate(inode, NULL, length); + if (!error) { + DQUOT_INIT(inode); + error = do_truncate(nd.dentry, length); + } + put_write_access(inode); + +dput_and_out: + path_release(&nd); +out: + return error; +} + +asmlinkage long sys_truncate(const char * path, unsigned long length) +{ + /* on 32-bit boxen it will cut the range 2^31--2^32-1 off */ + return do_sys_truncate(path, (long)length); +} + +static inline long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +{ + struct inode * inode; + struct dentry *dentry; + struct file * file; + int error; + + error = -EINVAL; + if (length < 0) + goto out; + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + /* explicitly opened as large or we are on 64-bit box */ + if (file->f_flags & O_LARGEFILE) + small = 0; + + dentry = file->f_dentry; + inode = dentry->d_inode; + error = -EINVAL; + if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE)) + goto out_putf; + + error = -EINVAL; + /* Cannot ftruncate over 2^31 bytes without large file support */ + if (small && length > MAX_NON_LFS) + goto out_putf; + + error = -EPERM; + if (IS_APPEND(inode)) + goto out_putf; + + error = locks_verify_truncate(inode, file, length); + if (!error) + error = do_truncate(dentry, length); +out_putf: + fput(file); +out: + return error; +} + +asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) +{ + return do_sys_ftruncate(fd, length, 1); +} + +/* LFS versions of truncate are only needed on 32 bit machines */ +#if BITS_PER_LONG == 32 +asmlinkage long sys_truncate64(const char * path, loff_t length) +{ + return do_sys_truncate(path, length); +} + +asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) +{ + return do_sys_ftruncate(fd, length, 0); +} +#endif + +#if !(defined(__alpha__) || defined(__ia64__)) + +/* + * sys_utime() can be implemented in user-level using sys_utimes(). + * Is this for backwards compatibility? If so, why not move it + * into the appropriate arch directory (for those architectures that + * need it). + */ + +/* If times==NULL, set access and modification to current time, + * must be owner or have write permission. 
+ * Else, update from *times, must be owner or super user. + */ +asmlinkage long sys_utime(char * filename, struct utimbuf * times) +{ + int error; + struct nameidata nd; + struct inode * inode; + struct iattr newattrs; + + error = user_path_walk(filename, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; + + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; + + /* Don't worry, the checks are done in inode_change_ok() */ + newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; + if (times) { + error = get_user(newattrs.ia_atime, &times->actime); + if (!error) + error = get_user(newattrs.ia_mtime, &times->modtime); + if (error) + goto dput_and_out; + + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; + } else { + if (current->fsuid != inode->i_uid && + (error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; + } + error = notify_change(nd.dentry, &newattrs); +dput_and_out: + path_release(&nd); +out: + return error; +} + +#endif + +/* If times==NULL, set access and modification to current time, + * must be owner or have write permission. + * Else, update from *times, must be owner or super user. + */ +asmlinkage long sys_utimes(char * filename, struct timeval * utimes) +{ + int error; + struct nameidata nd; + struct inode * inode; + struct iattr newattrs; + + error = user_path_walk(filename, &nd); + + if (error) + goto out; + inode = nd.dentry->d_inode; + + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; + + /* Don't worry, the checks are done in inode_change_ok() */ + newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME; + if (utimes) { + struct timeval times[2]; + error = -EFAULT; + if (copy_from_user(&times, utimes, sizeof(times))) + goto dput_and_out; + newattrs.ia_atime = times[0].tv_sec; + newattrs.ia_mtime = times[1].tv_sec; + newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET; + } else { + if ((error = permission(inode,MAY_WRITE)) != 0) + goto dput_and_out; + } + error = notify_change(nd.dentry, &newattrs); +dput_and_out: + path_release(&nd); +out: + return error; +} + +/* + * access() needs to use the real uid/gid, not the effective uid/gid. + * We do this by temporarily clearing all FS-related capabilities and + * switching the fsuid/fsgid around to the real ones. + */ +asmlinkage long sys_access(const char * filename, int mode) +{ + struct nameidata nd; + int old_fsuid, old_fsgid; + kernel_cap_t old_cap; + int res; + + if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK?
*/ + return -EINVAL; + + old_fsuid = current->fsuid; + old_fsgid = current->fsgid; + old_cap = current->cap_effective; + + current->fsuid = current->uid; + current->fsgid = current->gid; + + /* Clear the capabilities if we switch to a non-root user */ + if (current->uid) + cap_clear(current->cap_effective); + else + current->cap_effective = current->cap_permitted; + + res = user_path_walk(filename, &nd); + if (!res) { + res = permission(nd.dentry->d_inode, mode); + /* SuS v2 requires we report a read only fs too */ + if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode) + && !special_file(nd.dentry->d_inode->i_mode)) + res = -EROFS; + path_release(&nd); + } + + current->fsuid = old_fsuid; + current->fsgid = old_fsgid; + current->cap_effective = old_cap; + + return res; +} + +asmlinkage long sys_chdir(const char * filename) +{ + int error; + struct nameidata nd; + char *name; + + name = getname(filename); + error = PTR_ERR(name); + if (IS_ERR(name)) + goto out; + + error = 0; + if (path_init(name,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd)) + error = path_walk(name, &nd); + putname(name); + if (error) + goto out; + + error = permission(nd.dentry->d_inode,MAY_EXEC); + if (error) + goto dput_and_out; + + set_fs_pwd(current->fs, nd.mnt, nd.dentry); + +dput_and_out: + path_release(&nd); +out: + return error; +} + +asmlinkage long sys_fchdir(unsigned int fd) +{ + struct file *file; + struct dentry *dentry; + struct inode *inode; + struct vfsmount *mnt; + int error; + + error = -EBADF; + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + mnt = file->f_vfsmnt; + inode = dentry->d_inode; + + error = -ENOTDIR; + if (!S_ISDIR(inode->i_mode)) + goto out_putf; + + error = permission(inode, MAY_EXEC); + if (!error) + set_fs_pwd(current->fs, mnt, dentry); +out_putf: + fput(file); +out: + return error; +} + +asmlinkage long sys_chroot(const char * filename) +{ + int error; + struct nameidata nd; + char *name; + + name = getname(filename); + error = PTR_ERR(name); + if (IS_ERR(name)) + goto out; + + path_init(name, LOOKUP_POSITIVE | LOOKUP_FOLLOW | + LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd); + error = path_walk(name, &nd); + putname(name); + if (error) + goto out; + + error = permission(nd.dentry->d_inode,MAY_EXEC); + if (error) + goto dput_and_out; + + error = -EPERM; + if (!capable(CAP_SYS_CHROOT)) + goto dput_and_out; + + set_fs_root(current->fs, nd.mnt, nd.dentry); + set_fs_altroot(); + error = 0; +dput_and_out: + path_release(&nd); +out: + return error; +} + +asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) +{ + struct inode * inode; + struct dentry * dentry; + struct file * file; + int err = -EBADF; + struct iattr newattrs; + + file = fget(fd); + if (!file) + goto out; + + dentry = file->f_dentry; + inode = dentry->d_inode; + + err = -EROFS; + if (IS_RDONLY(inode)) + goto out_putf; + err = -EPERM; + if (IS_IMMUTABLE_FILE(inode) || IS_APPEND(inode)) + goto out_putf; + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + err = notify_change(dentry, &newattrs); + +out_putf: + fput(file); +out: + return err; +} + +asmlinkage long sys_chmod(const char * filename, mode_t mode) +{ + struct nameidata nd; + struct inode * inode; + int error; + struct iattr newattrs; + + error = user_path_walk(filename, &nd); + if (error) + goto out; + inode = nd.dentry->d_inode; + + error = -EROFS; + if (IS_RDONLY(inode)) + goto dput_and_out; + + error = -EPERM; + if 
(IS_IMMUTABLE_FILE(inode) || IS_APPEND(inode)) + goto dput_and_out; + + if (mode == (mode_t) -1) + mode = inode->i_mode; + newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; + error = notify_change(nd.dentry, &newattrs); + +dput_and_out: + path_release(&nd); +out: + return error; +} + +static int chown_common(struct dentry * dentry, uid_t user, gid_t group) +{ + struct inode * inode; + int error; + struct iattr newattrs; + + error = -ENOENT; + if (!(inode = dentry->d_inode)) { + printk(KERN_ERR "chown_common: NULL inode\n"); + goto out; + } + error = -EROFS; + if (IS_RDONLY(inode)) + goto out; + error = -EPERM; + if (IS_IMMUTABLE_FILE(inode) || IS_APPEND(inode)) + goto out; + if (user == (uid_t) -1) + user = inode->i_uid; + if (group == (gid_t) -1) + group = inode->i_gid; + newattrs.ia_mode = inode->i_mode; + newattrs.ia_uid = user; + newattrs.ia_gid = group; + newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME; + /* + * If the user or group of a non-directory has been changed by a + * non-root user, remove the setuid bit. + * 19981026 David C Niemi + * + * Changed this to apply to all users, including root, to avoid + * some races. This is the behavior we had in 2.0. The check for + * non-root was definitely wrong for 2.2 anyway, as it should + * have been using CAP_FSETID rather than fsuid -- 19990830 SD. + */ + if ((inode->i_mode & S_ISUID) == S_ISUID && + !S_ISDIR(inode->i_mode)) + { + newattrs.ia_mode &= ~S_ISUID; + newattrs.ia_valid |= ATTR_MODE; + } + /* + * Likewise, if the user or group of a non-directory has been changed + * by a non-root user, remove the setgid bit UNLESS there is no group + * execute bit (this would be a file marked for mandatory locking). + * 19981026 David C Niemi + * + * Removed the fsuid check (see the comment above) -- 19990830 SD. + */ + if (((inode->i_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) + && !S_ISDIR(inode->i_mode)) + { + newattrs.ia_mode &= ~S_ISGID; + newattrs.ia_valid |= ATTR_MODE; + } + error = notify_change(dentry, &newattrs); +out: + return error; +} + +asmlinkage long sys_chown(const char * filename, uid_t user, gid_t group) +{ + struct nameidata nd; + int error; + + error = user_path_walk(filename, &nd); + if (!error) { + error = chown_common(nd.dentry, user, group); + path_release(&nd); + } + return error; +} + +asmlinkage long sys_lchown(const char * filename, uid_t user, gid_t group) +{ + struct nameidata nd; + int error; + + error = user_path_walk_link(filename, &nd); + if (!error) { + error = chown_common(nd.dentry, user, group); + path_release(&nd); + } + return error; +} + + +asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) +{ + struct file * file; + int error = -EBADF; + + file = fget(fd); + if (file) { + error = chown_common(file->f_dentry, user, group); + fput(file); + } + return error; +} + +/* + * Note that while the flag value (low two bits) for sys_open means: + * 00 - read-only + * 01 - write-only + * 10 - read-write + * 11 - special + * it is changed into + * 00 - no permissions needed + * 01 - read-permission + * 10 - write-permission + * 11 - read-write + * for the internal routines (ie open_namei()/follow_link() etc). 00 is + * used by symlinks. 
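+ * As a worked example: O_RDONLY (00) maps to 01 (read permission
+ * wanted), O_WRONLY (01) to 10 (write) and O_RDWR (10) to 11, which
+ * is exactly what the "(namei_flags+1) & O_ACCMODE" increment in
+ * filp_open() below computes; O_TRUNC then forces the write bit in
+ * with "namei_flags |= 2".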
+ */ +struct file *filp_open(const char * filename, int flags, int mode) +{ + int namei_flags, error; + struct nameidata nd; + + namei_flags = flags; + if ((namei_flags+1) & O_ACCMODE) + namei_flags++; + if (namei_flags & O_TRUNC) + namei_flags |= 2; + + error = open_namei(filename, namei_flags, mode, &nd); + if (!error) + return dentry_open(nd.dentry, nd.mnt, flags); + + return ERR_PTR(error); +} + +struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) +{ + struct file * f; + struct inode *inode; + static LIST_HEAD(kill_list); + int error; + + error = -ENFILE; + f = get_empty_filp(); + if (!f) + goto cleanup_dentry; + f->f_flags = flags; + f->f_mode = (flags+1) & O_ACCMODE; + inode = dentry->d_inode; + if (f->f_mode & FMODE_WRITE) { + error = get_write_access(inode); + if (error) + goto cleanup_file; + } + + f->f_dentry = dentry; + f->f_vfsmnt = mnt; + f->f_pos = 0; + f->f_reada = 0; + f->f_op = fops_get(inode->i_fop); + file_move(f, &inode->i_sb->s_files); + + /* preallocate kiobuf for O_DIRECT */ + f->f_iobuf = NULL; + f->f_iobuf_lock = 0; + if (f->f_flags & O_DIRECT) { + error = alloc_kiovec(1, &f->f_iobuf); + if (error) + goto cleanup_all; + } + + if (f->f_op && f->f_op->open) { + error = f->f_op->open(inode,f); + if (error) + goto cleanup_all; + } + f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); + + return f; + +cleanup_all: + if (f->f_iobuf) + free_kiovec(1, &f->f_iobuf); + fops_put(f->f_op); + if (f->f_mode & FMODE_WRITE) + put_write_access(inode); + file_move(f, &kill_list); /* out of the way.. */ + f->f_dentry = NULL; + f->f_vfsmnt = NULL; +cleanup_file: + put_filp(f); +cleanup_dentry: + dput(dentry); + mntput(mnt); + return ERR_PTR(error); +} + +/* + * Find an empty file descriptor entry, and mark it busy. + */ +int get_unused_fd(void) +{ + struct files_struct * files = current->files; + int fd, error; + + error = -EMFILE; + write_lock(&files->file_lock); + +repeat: + fd = find_next_zero_bit(files->open_fds, + files->max_fdset, + files->next_fd); + + /* + * N.B. For clone tasks sharing a files structure, this test + * will limit the total number of files that can be opened. + */ + if (fd >= current->rlim[RLIMIT_NOFILE].rlim_cur) + goto out; + + /* Do we need to expand the fdset array? */ + if (fd >= files->max_fdset) { + error = expand_fdset(files, fd); + if (!error) { + error = -EMFILE; + goto repeat; + } + goto out; + } + + /* + * Check whether we need to expand the fd array. + */ + if (fd >= files->max_fds) { + error = expand_fd_array(files, fd); + if (!error) { + error = -EMFILE; + goto repeat; + } + goto out; + } + + FD_SET(fd, files->open_fds); + FD_CLR(fd, files->close_on_exec); + files->next_fd = fd + 1; +#if 1 + /* Sanity check */ + if (files->fd[fd] != NULL) { + printk(KERN_WARNING "get_unused_fd: slot %d not NULL!\n", fd); + files->fd[fd] = NULL; + } +#endif + error = fd; + +out: + write_unlock(&files->file_lock); + return error; +} + +asmlinkage long sys_open(const char * filename, int flags, int mode) +{ + char * tmp; + int fd, error; + +#if BITS_PER_LONG != 32 + flags |= O_LARGEFILE; +#endif + tmp = getname(filename); + fd = PTR_ERR(tmp); + if (!IS_ERR(tmp)) { + fd = get_unused_fd(); + if (fd >= 0) { + struct file *f = filp_open(tmp, flags, mode); + error = PTR_ERR(f); + if (IS_ERR(f)) + goto out_error; + fd_install(fd, f); + } +out: + putname(tmp); + } + return fd; + +out_error: + put_unused_fd(fd); + fd = error; + goto out; +} + +#ifndef __alpha__ + +/* + * For backward compatibility? 
Maybe this should be moved + * into arch/i386 instead? + */ +asmlinkage long sys_creat(const char * pathname, int mode) +{ + return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); +} + +#endif + +/* + * "id" is the POSIX thread ID. We use the + * files pointer for this.. + */ +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval; + + if (!file_count(filp)) { + printk(KERN_ERR "VFS: Close: file count is 0\n"); + return 0; + } + retval = 0; + if (filp->f_op && filp->f_op->flush) { + lock_kernel(); + retval = filp->f_op->flush(filp); + unlock_kernel(); + } + fcntl_dirnotify(0, filp, 0); + locks_remove_posix(filp, id); + fput(filp); + return retval; +} + +/* + * Careful here! We test whether the file pointer is NULL before + * releasing the fd. This ensures that one clone task can't release + * an fd while another clone is opening it. + */ +asmlinkage long sys_close(unsigned int fd) +{ + struct file * filp; + struct files_struct *files = current->files; + + write_lock(&files->file_lock); + if (fd >= files->max_fds) + goto out_unlock; + filp = files->fd[fd]; + if (!filp) + goto out_unlock; + files->fd[fd] = NULL; + FD_CLR(fd, files->close_on_exec); + __put_unused_fd(files, fd); + write_unlock(&files->file_lock); + return filp_close(filp, files); + +out_unlock: + write_unlock(&files->file_lock); + return -EBADF; +} + +/* + * This routine simulates a hangup on the tty, to arrange that users + * are given clean terminals at login time. + */ +asmlinkage long sys_vhangup(void) +{ + if (capable(CAP_SYS_TTY_CONFIG)) { + tty_vhangup(current->tty); + return 0; + } + return -EPERM; +} + +/* + * Called when an inode is about to be open. + * We use this to disallow opening RW large files on 32bit systems if + * the caller didn't specify O_LARGEFILE. On 64bit systems we force + * on this flag in sys_open. 
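+ * Concretely: on a 32 bit box, a plain open(2) of a file larger than
+ * MAX_NON_LFS (2^31 - 1 bytes) fails here with -EFBIG; the caller
+ * must pass O_LARGEFILE (as glibc's open64() does) to get at it.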
+ */ +int generic_file_open(struct inode * inode, struct file * filp) +{ + if (!(filp->f_flags & O_LARGEFILE) && inode->i_size > MAX_NON_LFS) + return -EFBIG; + return 0; +} + +EXPORT_SYMBOL(generic_file_open); diff -urN linux-2.4.16-reiserfspatches-immutable/fs/proc/array.c linux-2.4.16-reiserfspatches-immutable-ctx4/fs/proc/array.c --- linux-2.4.16-reiserfspatches-immutable/fs/proc/array.c Mon Dec 10 13:12:23 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/proc/array.c Mon Dec 10 15:01:47 2001 @@ -263,10 +263,12 @@ { return buffer + sprintf(buffer, "CapInh:\t%016x\n" "CapPrm:\t%016x\n" - "CapEff:\t%016x\n", + "CapEff:\t%016x\n" + "CapBset:\t%016x\n", cap_t(p->cap_inheritable), cap_t(p->cap_permitted), - cap_t(p->cap_effective)); + cap_t(p->cap_effective), + cap_t(p->cap_bset)); } @@ -288,6 +290,18 @@ } buffer = task_sig(task, buffer); buffer = task_cap(task, buffer); + buffer += sprintf (buffer,"s_context: %d\n",task->s_context); + buffer += sprintf (buffer,"ipv4root: %08lx\n",task->ipv4root); + if (task->s_info != NULL){ + buffer += sprintf (buffer,"ctxticks: %d %d %d\n" + ,atomic_read(&task->s_info->ticks),task->counter + ,task->s_info->refcount); + buffer += sprintf (buffer,"ctxflags: %d\n" + ,task->s_info->flags); + }else{ + buffer += sprintf (buffer,"ctxticks: none\n"); + buffer += sprintf (buffer,"ctxflags: none\n"); + } #if defined(CONFIG_ARCH_S390) buffer = task_show_regs(task, buffer); #endif diff -urN linux-2.4.16-reiserfspatches-immutable/fs/proc/base.c linux-2.4.16-reiserfspatches-immutable-ctx4/fs/proc/base.c --- linux-2.4.16-reiserfspatches-immutable/fs/proc/base.c Mon Dec 10 14:28:03 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/proc/base.c Mon Dec 10 15:01:47 2001 @@ -966,6 +966,11 @@ if (!task) goto out; + if (pid != 1 + && current->s_context != 1 + && task->s_context != current->s_context){ + goto out; + } inode = proc_pid_make_inode(dir->i_sb, task, PROC_PID_INO); free_task_struct(task); @@ -1012,6 +1017,16 @@ int pid = p->pid; if (!pid) continue; + /* Even if pid 1 is not part of the security context, */ + /* we show it anyway. This makes the security box */ + /* more standard (and helps pstree do its job). */ + /* So the current process "knows" pid 1 exists anyway, */ + /* though it still can't send it any signal. */ + + /* A process with security context 1 can see all processes */ + if (pid != 1 + && current->s_context != 1 + && p->s_context != current->s_context) continue; if (--index >= 0) continue; pids[nr_pids] = pid; diff -urN linux-2.4.16-reiserfspatches-immutable/fs/proc/base.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/proc/base.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/proc/base.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/proc/base.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,1055 @@ +/* + * linux/fs/proc/base.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * proc base directory handling functions + * + * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. + * Instead of using magical inumbers to determine the kind of object + * we allocate and fill in-core inodes upon lookup. They don't even + * go into icache. We cache the reference to task_struct upon lookup too. + * Eventually it should become a filesystem in its own. We don't use the + * rest of procfs anymore. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * For hysterical raisins we keep the same inumbers as in the old procfs.
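+ * As an example of the packing, with the macro and enum that follow,
+ * the "status" file of pid 42 gets inode fake_ino(42, PROC_PID_STATUS),
+ * i.e. (42 << 16) | 3 == 0x2a0003.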
+ * Feel free to change the macro below - just keep the range distinct from + * inumbers of the rest of procfs (currently those are in 0x0000--0xffff). + * As soon as we'll get a separate superblock we will be able to forget + * about magical ranges too. + */ + +#define fake_ino(pid,ino) (((pid)<<16)|(ino)) + +ssize_t proc_pid_read_maps(struct task_struct*,struct file*,char*,size_t,loff_t*); +int proc_pid_stat(struct task_struct*,char*); +int proc_pid_status(struct task_struct*,char*); +int proc_pid_statm(struct task_struct*,char*); +int proc_pid_cpu(struct task_struct*,char*); + +static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +{ + if (inode->u.proc_i.file) { + *mnt = mntget(inode->u.proc_i.file->f_vfsmnt); + *dentry = dget(inode->u.proc_i.file->f_dentry); + return 0; + } + return -ENOENT; +} + +static int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +{ + struct mm_struct * mm; + struct vm_area_struct * vma; + int result = -ENOENT; + struct task_struct *task = inode->u.proc_i.task; + + task_lock(task); + mm = task->mm; + if (mm) + atomic_inc(&mm->mm_users); + task_unlock(task); + if (!mm) + goto out; + down_read(&mm->mmap_sem); + vma = mm->mmap; + while (vma) { + if ((vma->vm_flags & VM_EXECUTABLE) && + vma->vm_file) { + *mnt = mntget(vma->vm_file->f_vfsmnt); + *dentry = dget(vma->vm_file->f_dentry); + result = 0; + break; + } + vma = vma->vm_next; + } + up_read(&mm->mmap_sem); + mmput(mm); +out: + return result; +} + +static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +{ + struct fs_struct *fs; + int result = -ENOENT; + task_lock(inode->u.proc_i.task); + fs = inode->u.proc_i.task->fs; + if(fs) + atomic_inc(&fs->count); + task_unlock(inode->u.proc_i.task); + if (fs) { + read_lock(&fs->lock); + *mnt = mntget(fs->pwdmnt); + *dentry = dget(fs->pwd); + read_unlock(&fs->lock); + result = 0; + put_fs_struct(fs); + } + return result; +} + +static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +{ + struct fs_struct *fs; + int result = -ENOENT; + task_lock(inode->u.proc_i.task); + fs = inode->u.proc_i.task->fs; + if(fs) + atomic_inc(&fs->count); + task_unlock(inode->u.proc_i.task); + if (fs) { + read_lock(&fs->lock); + *mnt = mntget(fs->rootmnt); + *dentry = dget(fs->root); + read_unlock(&fs->lock); + result = 0; + put_fs_struct(fs); + } + return result; +} + +static int proc_pid_environ(struct task_struct *task, char * buffer) +{ + struct mm_struct *mm; + int res = 0; + task_lock(task); + mm = task->mm; + if (mm) + atomic_inc(&mm->mm_users); + task_unlock(task); + if (mm) { + int len = mm->env_end - mm->env_start; + if (len > PAGE_SIZE) + len = PAGE_SIZE; + res = access_process_vm(task, mm->env_start, buffer, len, 0); + mmput(mm); + } + return res; +} + +static int proc_pid_cmdline(struct task_struct *task, char * buffer) +{ + struct mm_struct *mm; + int res = 0; + task_lock(task); + mm = task->mm; + if (mm) + atomic_inc(&mm->mm_users); + task_unlock(task); + if (mm) { + int len = mm->arg_end - mm->arg_start; + if (len > PAGE_SIZE) + len = PAGE_SIZE; + res = access_process_vm(task, mm->arg_start, buffer, len, 0); + // If the nul at the end of args has been overwritten, then + // assume application is using setproctitle(3). 
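+ // Either the new title still contains an embedded NUL (the strnlen
+ // below finds it), or it has run straight on into the environment
+ // area, in which case we append the start of the environment and
+ // look for the terminator there.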
+ if ( res > 0 && buffer[res-1] != '\0' ) + { + len = strnlen( buffer, res ); + if ( len < res ) + { + res = len; + } + else + { + len = mm->env_end - mm->env_start; + if (len > PAGE_SIZE - res) + len = PAGE_SIZE - res; + res += access_process_vm(task, mm->env_start, buffer+res, len, 0); + res = strnlen( buffer, res ); + } + } + mmput(mm); + } + return res; +} + +/************************************************************************/ +/* Here the fs part begins */ +/************************************************************************/ + +/* permission checks */ + +static int proc_check_root(struct inode *inode) +{ + struct dentry *de, *base, *root; + struct vfsmount *our_vfsmnt, *vfsmnt, *mnt; + int res = 0; + + if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */ + return -ENOENT; + read_lock(&current->fs->lock); + our_vfsmnt = mntget(current->fs->rootmnt); + base = dget(current->fs->root); + read_unlock(&current->fs->lock); + + spin_lock(&dcache_lock); + de = root; + mnt = vfsmnt; + + while (vfsmnt != our_vfsmnt) { + if (vfsmnt == vfsmnt->mnt_parent) + goto out; + de = vfsmnt->mnt_mountpoint; + vfsmnt = vfsmnt->mnt_parent; + } + + if (!is_subdir(de, base)) + goto out; + spin_unlock(&dcache_lock); + +exit: + dput(base); + mntput(our_vfsmnt); + dput(root); + mntput(mnt); + return res; +out: + spin_unlock(&dcache_lock); + res = -EACCES; + goto exit; +} + +static int proc_permission(struct inode *inode, int mask) +{ + if (vfs_permission(inode, mask) != 0) + return -EACCES; + return proc_check_root(inode); +} + +static ssize_t pid_maps_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_dentry->d_inode; + struct task_struct *task = inode->u.proc_i.task; + ssize_t res; + + res = proc_pid_read_maps(task, file, buf, count, ppos); + return res; +} + +static struct file_operations proc_maps_operations = { + read: pid_maps_read, +}; + +#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ + +static ssize_t proc_info_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_dentry->d_inode; + unsigned long page; + ssize_t length; + ssize_t end; + struct task_struct *task = inode->u.proc_i.task; + + if (count > PROC_BLOCK_SIZE) + count = PROC_BLOCK_SIZE; + if (!(page = __get_free_page(GFP_KERNEL))) + return -ENOMEM; + + length = inode->u.proc_i.op.proc_read(task, (char*)page); + + if (length < 0) { + free_page(page); + return length; + } + /* Static 4kB (or whatever) block capacity */ + if (*ppos >= length) { + free_page(page); + return 0; + } + if (count + *ppos > length) + count = length - *ppos; + end = count + *ppos; + copy_to_user(buf, (char *) page + *ppos, count); + *ppos = end; + free_page(page); + return count; +} + +static struct file_operations proc_info_file_operations = { + read: proc_info_read, +}; + +#define MAY_PTRACE(p) \ +(p==current||(p->p_pptr==current&&(p->ptrace & PT_PTRACED)&&p->state==TASK_STOPPED)) + + +static int mem_open(struct inode* inode, struct file* file) +{ + file->private_data = (void*)((long)current->self_exec_id); + return 0; +} + +static ssize_t mem_read(struct file * file, char * buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = file->f_dentry->d_inode->u.proc_i.task; + char *page; + unsigned long src = *ppos; + int copied = 0; + struct mm_struct *mm; + + + if (!MAY_PTRACE(task)) + return -ESRCH; + + page = (char *)__get_free_page(GFP_USER); + if (!page) + return -ENOMEM; + + task_lock(task); + mm =
task->mm; + if (mm) + atomic_inc(&mm->mm_users); + task_unlock(task); + if (!mm) + return 0; + + if (file->private_data != (void*)((long)current->self_exec_id) ) { + mmput(mm); + return -EIO; + } + + + while (count > 0) { + int this_len, retval; + + this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; + retval = access_process_vm(task, src, page, this_len, 0); + if (!retval) { + if (!copied) + copied = -EIO; + break; + } + if (copy_to_user(buf, page, retval)) { + copied = -EFAULT; + break; + } + copied += retval; + src += retval; + buf += retval; + count -= retval; + } + *ppos = src; + mmput(mm); + free_page((unsigned long) page); + return copied; +} + +#define mem_write NULL + +#ifndef mem_write +/* This is a security hazard */ +static ssize_t mem_write(struct file * file, const char * buf, + size_t count, loff_t *ppos) +{ + int copied = 0; + char *page; + struct task_struct *task = file->f_dentry->d_inode->u.proc_i.task; + unsigned long dst = *ppos; + + if (!MAY_PTRACE(task)) + return -ESRCH; + + page = (char *)__get_free_page(GFP_USER); + if (!page) + return -ENOMEM; + + while (count > 0) { + int this_len, retval; + + this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; + if (copy_from_user(page, buf, this_len)) { + copied = -EFAULT; + break; + } + retval = access_process_vm(task, dst, page, this_len, 1); + if (!retval) { + if (!copied) + copied = -EIO; + break; + } + copied += retval; + buf += retval; + dst += retval; + count -= retval; + } + *ppos = dst; + free_page((unsigned long) page); + return copied; +} +#endif + +static struct file_operations proc_mem_operations = { + read: mem_read, + write: mem_write, + open: mem_open, +}; + +static struct inode_operations proc_mem_inode_operations = { + permission: proc_permission, +}; + +static int proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + int error = -EACCES; + + /* We don't need a base pointer in the /proc filesystem */ + path_release(nd); + + if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) + goto out; + error = proc_check_root(inode); + if (error) + goto out; + + error = inode->u.proc_i.op.proc_get_link(inode, &nd->dentry, &nd->mnt); + nd->last_type = LAST_BIND; +out: + return error; +} + +static int do_proc_readlink(struct dentry *dentry, struct vfsmount *mnt, + char * buffer, int buflen) +{ + struct inode * inode; + char * tmp = (char*)__get_free_page(GFP_KERNEL), *path; + int len; + + if (!tmp) + return -ENOMEM; + + inode = dentry->d_inode; + path = d_path(dentry, mnt, tmp, PAGE_SIZE); + len = tmp + PAGE_SIZE - 1 - path; + + if (len < buflen) + buflen = len; + copy_to_user(buffer, path, buflen); + free_page((unsigned long)tmp); + return buflen; +} + +static int proc_pid_readlink(struct dentry * dentry, char * buffer, int buflen) +{ + int error = -EACCES; + struct inode *inode = dentry->d_inode; + struct dentry *de; + struct vfsmount *mnt = NULL; + + if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) + goto out; + error = proc_check_root(inode); + if (error) + goto out; + + error = inode->u.proc_i.op.proc_get_link(inode, &de, &mnt); + if (error) + goto out; + + error = do_proc_readlink(de, mnt, buffer, buflen); + dput(de); + mntput(mnt); +out: + return error; +} + +static struct inode_operations proc_pid_link_inode_operations = { + readlink: proc_pid_readlink, + follow_link: proc_pid_follow_link +}; + +struct pid_entry { + int type; + int len; + char *name; + mode_t mode; +}; + +enum pid_directory_inos { + PROC_PID_INO = 2, + 
PROC_PID_STATUS, + PROC_PID_MEM, + PROC_PID_CWD, + PROC_PID_ROOT, + PROC_PID_EXE, + PROC_PID_FD, + PROC_PID_ENVIRON, + PROC_PID_CMDLINE, + PROC_PID_STAT, + PROC_PID_STATM, + PROC_PID_MAPS, + PROC_PID_CPU, + PROC_PID_FD_DIR = 0x8000, /* 0x8000-0xffff */ +}; + +#define E(type,name,mode) {(type),sizeof(name)-1,(name),(mode)} +static struct pid_entry base_stuff[] = { + E(PROC_PID_FD, "fd", S_IFDIR|S_IRUSR|S_IXUSR), + E(PROC_PID_ENVIRON, "environ", S_IFREG|S_IRUSR), + E(PROC_PID_STATUS, "status", S_IFREG|S_IRUGO), + E(PROC_PID_CMDLINE, "cmdline", S_IFREG|S_IRUGO), + E(PROC_PID_STAT, "stat", S_IFREG|S_IRUGO), + E(PROC_PID_STATM, "statm", S_IFREG|S_IRUGO), +#ifdef CONFIG_SMP + E(PROC_PID_CPU, "cpu", S_IFREG|S_IRUGO), +#endif + E(PROC_PID_MAPS, "maps", S_IFREG|S_IRUGO), + E(PROC_PID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), + E(PROC_PID_CWD, "cwd", S_IFLNK|S_IRWXUGO), + E(PROC_PID_ROOT, "root", S_IFLNK|S_IRWXUGO), + E(PROC_PID_EXE, "exe", S_IFLNK|S_IRWXUGO), + {0,0,NULL,0} +}; +#undef E + +#define NUMBUF 10 + +static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct task_struct *p = inode->u.proc_i.task; + unsigned int fd, pid, ino; + int retval; + char buf[NUMBUF]; + struct files_struct * files; + + retval = 0; + pid = p->pid; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + case 1: + ino = fake_ino(pid, PROC_PID_INO); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + default: + task_lock(p); + files = p->files; + if (files) + atomic_inc(&files->count); + task_unlock(p); + if (!files) + goto out; + for (fd = filp->f_pos-2; + fd < files->max_fds; + fd++, filp->f_pos++) { + unsigned int i,j; + + if (!fcheck_files(files, fd)) + continue; + + j = NUMBUF; + i = fd; + do { + j--; + buf[j] = '0' + (i % 10); + i /= 10; + } while (i); + + ino = fake_ino(pid, PROC_PID_FD_DIR + fd); + if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) + break; + } + put_files_struct(files); + } +out: + return retval; +} + +static int proc_base_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + int i; + int pid; + struct inode *inode = filp->f_dentry->d_inode; + struct pid_entry *p; + + pid = inode->u.proc_i.task->pid; + if (!pid) + return -ENOENT; + i = filp->f_pos; + switch (i) { + case 0: + if (filldir(dirent, ".", 1, i, inode->i_ino, DT_DIR) < 0) + return 0; + i++; + filp->f_pos++; + /* fall through */ + case 1: + if (filldir(dirent, "..", 2, i, PROC_ROOT_INO, DT_DIR) < 0) + return 0; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i>=sizeof(base_stuff)/sizeof(base_stuff[0])) + return 1; + p = base_stuff + i; + while (p->name) { + if (filldir(dirent, p->name, p->len, filp->f_pos, + fake_ino(pid, p->type), p->mode >> 12) < 0) + return 0; + filp->f_pos++; + p++; + } + } + return 1; +} + +/* building an inode */ + +static int task_dumpable(struct task_struct *task) +{ + int dumpable = 0; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) + dumpable = mm->dumpable; + task_unlock(task); + return dumpable; +} + + +static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, int ino) +{ + struct inode * inode; + + /* We need a new inode */ + + inode = new_inode(sb); + if (!inode) + goto out; + + /* Common stuff */ + + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = fake_ino(task->pid, ino); + + 
if (!task->pid) + goto out_unlock; + + /* + * grab the reference to task. + */ + get_task_struct(task); + inode->u.proc_i.task = task; + inode->i_uid = 0; + inode->i_gid = 0; + if (ino == PROC_PID_INO || task_dumpable(task)) { + inode->i_uid = task->euid; + inode->i_gid = task->egid; + } + +out: + return inode; + +out_unlock: + iput(inode); + return NULL; +} + +/* dentry stuff */ + +static int pid_fd_revalidate(struct dentry * dentry, int flags) +{ + return 0; +} + +/* + * Exceptional case: normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems + * due to the way we treat inodes. + */ +static int pid_base_revalidate(struct dentry * dentry, int flags) +{ + if (dentry->d_inode->u.proc_i.task->pid) + return 1; + d_drop(dentry); + return 0; +} + +static int pid_delete_dentry(struct dentry * dentry) +{ + return 1; +} + +static struct dentry_operations pid_fd_dentry_operations = +{ + d_revalidate: pid_fd_revalidate, + d_delete: pid_delete_dentry, +}; + +static struct dentry_operations pid_dentry_operations = +{ + d_delete: pid_delete_dentry, +}; + +static struct dentry_operations pid_base_dentry_operations = +{ + d_revalidate: pid_base_revalidate, + d_delete: pid_delete_dentry, +}; + +/* Lookups */ +#define MAX_MULBY10 ((~0U-9)/10) + +static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry) +{ + unsigned int fd, c; + struct task_struct *task = dir->u.proc_i.task; + struct file * file; + struct files_struct * files; + struct inode *inode; + const char *name; + int len; + + fd = 0; + len = dentry->d_name.len; + name = dentry->d_name.name; + if (len > 1 && *name == '0') goto out; + while (len-- > 0) { + c = *name - '0'; + name++; + if (c > 9) + goto out; + if (fd >= MAX_MULBY10) + goto out; + fd *= 10; + fd += c; + } + + inode = proc_pid_make_inode(dir->i_sb, task, PROC_PID_FD_DIR+fd); + if (!inode) + goto out; + task_lock(task); + files = task->files; + if (files) + atomic_inc(&files->count); + task_unlock(task); + if (!files) + goto out_unlock; + read_lock(&files->file_lock); + file = inode->u.proc_i.file = fcheck_files(files, fd); + if (!file) + goto out_unlock2; + get_file(file); + read_unlock(&files->file_lock); + put_files_struct(files); + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + inode->u.proc_i.op.proc_get_link = proc_fd_link; + if (file->f_mode & 1) + inode->i_mode |= S_IRUSR | S_IXUSR; + if (file->f_mode & 2) + inode->i_mode |= S_IWUSR | S_IXUSR; + dentry->d_op = &pid_fd_dentry_operations; + d_add(dentry, inode); + return NULL; + +out_unlock2: + put_files_struct(files); + read_unlock(&files->file_lock); +out_unlock: + iput(inode); +out: + return ERR_PTR(-ENOENT); +} + +static struct file_operations proc_fd_operations = { + read: generic_read_dir, + readdir: proc_readfd, +}; + +/* + * proc directories can do almost nothing.. 
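+ * just a lookup() on decimal fd names, plus a permission() check that
+ * (via proc_check_root) also insists the target's root is reachable
+ * from below the caller's own root.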
+ */ +static struct inode_operations proc_fd_inode_operations = { + lookup: proc_lookupfd, + permission: proc_permission, +}; + +static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + int error; + struct task_struct *task = dir->u.proc_i.task; + struct pid_entry *p; + + error = -ENOENT; + inode = NULL; + + for (p = base_stuff; p->name; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (!p->name) + goto out; + + error = -EINVAL; + inode = proc_pid_make_inode(dir->i_sb, task, p->type); + if (!inode) + goto out; + + inode->i_mode = p->mode; + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc// without very good reasons. + */ + switch(p->type) { + case PROC_PID_FD: + inode->i_nlink = 2; + inode->i_op = &proc_fd_inode_operations; + inode->i_fop = &proc_fd_operations; + break; + case PROC_PID_EXE: + inode->i_op = &proc_pid_link_inode_operations; + inode->u.proc_i.op.proc_get_link = proc_exe_link; + break; + case PROC_PID_CWD: + inode->i_op = &proc_pid_link_inode_operations; + inode->u.proc_i.op.proc_get_link = proc_cwd_link; + break; + case PROC_PID_ROOT: + inode->i_op = &proc_pid_link_inode_operations; + inode->u.proc_i.op.proc_get_link = proc_root_link; + break; + case PROC_PID_ENVIRON: + inode->i_fop = &proc_info_file_operations; + inode->u.proc_i.op.proc_read = proc_pid_environ; + break; + case PROC_PID_STATUS: + inode->i_fop = &proc_info_file_operations; + inode->u.proc_i.op.proc_read = proc_pid_status; + break; + case PROC_PID_STAT: + inode->i_fop = &proc_info_file_operations; + inode->u.proc_i.op.proc_read = proc_pid_stat; + break; + case PROC_PID_CMDLINE: + inode->i_fop = &proc_info_file_operations; + inode->u.proc_i.op.proc_read = proc_pid_cmdline; + break; + case PROC_PID_STATM: + inode->i_fop = &proc_info_file_operations; + inode->u.proc_i.op.proc_read = proc_pid_statm; + break; + case PROC_PID_MAPS: + inode->i_fop = &proc_maps_operations; + break; +#ifdef CONFIG_SMP + case PROC_PID_CPU: + inode->i_fop = &proc_info_file_operations; + inode->u.proc_i.op.proc_read = proc_pid_cpu; + break; +#endif + case PROC_PID_MEM: + inode->i_op = &proc_mem_inode_operations; + inode->i_fop = &proc_mem_operations; + break; + default: + printk("procfs: impossible type (%d)",p->type); + iput(inode); + return ERR_PTR(-EINVAL); + } + dentry->d_op = &pid_dentry_operations; + d_add(dentry, inode); + return NULL; + +out: + return ERR_PTR(error); +} + +static struct file_operations proc_base_operations = { + read: generic_read_dir, + readdir: proc_base_readdir, +}; + +static struct inode_operations proc_base_inode_operations = { + lookup: proc_base_lookup, +}; + +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + char tmp[30]; + sprintf(tmp, "%d", current->pid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static int proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char tmp[30]; + sprintf(tmp, "%d", current->pid); + return vfs_follow_link(nd,tmp); +} + +static struct inode_operations proc_self_inode_operations = { + readlink: proc_self_readlink, + follow_link: proc_self_follow_link, +}; + +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry) +{ + unsigned int pid, c; + struct task_struct *task; + const char *name; + struct inode *inode; + int len; + + pid = 0; + name = dentry->d_name.name; + len = dentry->d_name.len; + if (len == 4 && 
!memcmp(name, "self", 4)) { + inode = new_inode(dir->i_sb); + if (!inode) + return ERR_PTR(-ENOMEM); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_ino = fake_ino(0, PROC_PID_INO); + inode->u.proc_i.file = NULL; + inode->u.proc_i.task = NULL; + inode->i_mode = S_IFLNK|S_IRWXUGO; + inode->i_uid = inode->i_gid = 0; + inode->i_size = 64; + inode->i_op = &proc_self_inode_operations; + d_add(dentry, inode); + return NULL; + } + while (len-- > 0) { + c = *name - '0'; + name++; + if (c > 9) + goto out; + if (pid >= MAX_MULBY10) + goto out; + pid *= 10; + pid += c; + if (!pid) + goto out; + } + + read_lock(&tasklist_lock); + task = find_task_by_pid(pid); + if (task) + get_task_struct(task); + read_unlock(&tasklist_lock); + if (!task) + goto out; + + inode = proc_pid_make_inode(dir->i_sb, task, PROC_PID_INO); + + free_task_struct(task); + + if (!inode) + goto out; + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_base_inode_operations; + inode->i_fop = &proc_base_operations; + inode->i_nlink = 3; + inode->i_flags|=S_IMMUTABLE_FILE; + + dentry->d_op = &pid_base_dentry_operations; + d_add(dentry, inode); + return NULL; +out: + return ERR_PTR(-ENOENT); +} + +void proc_pid_delete_inode(struct inode *inode) +{ + if (inode->u.proc_i.file) + fput(inode->u.proc_i.file); + if (inode->u.proc_i.task) + free_task_struct(inode->u.proc_i.task); +} + +#define PROC_NUMBUF 10 +#define PROC_MAXPIDS 20 + +/* + * Get a few pid's to return for filldir - we need to hold the + * tasklist lock while doing this, and we must release it before + * we actually do the filldir itself, so we use a temp buffer.. + */ +static int get_pid_list(int index, unsigned int *pids) +{ + struct task_struct *p; + int nr_pids = 0; + + index--; + read_lock(&tasklist_lock); + for_each_task(p) { + int pid = p->pid; + if (!pid) + continue; + if (--index >= 0) + continue; + pids[nr_pids] = pid; + nr_pids++; + if (nr_pids >= PROC_MAXPIDS) + break; + } + read_unlock(&tasklist_lock); + return nr_pids; +} + +int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + unsigned int pid_array[PROC_MAXPIDS]; + char buf[PROC_NUMBUF]; + unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; + unsigned int nr_pids, i; + + if (!nr) { + ino_t ino = fake_ino(0,PROC_PID_INO); + if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) + return 0; + filp->f_pos++; + nr++; + } + + nr_pids = get_pid_list(nr, pid_array); + + for (i = 0; i < nr_pids; i++) { + int pid = pid_array[i]; + ino_t ino = fake_ino(pid,PROC_PID_INO); + unsigned long j = PROC_NUMBUF; + + do buf[--j] = '0' + (pid % 10); while (pid/=10); + + if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) + break; + filp->f_pos++; + } + return 0; +} diff -urN linux-2.4.16-reiserfspatches-immutable/fs/udf/inode.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/udf/inode.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/udf/inode.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/udf/inode.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,2220 @@ +/* + * inode.c + * + * PURPOSE + * Inode handling routines for the OSTA-UDF(tm) filesystem. + * + * CONTACTS + * E-mail regarding any portion of the Linux UDF file system should be + * directed to the development team mailing list (run by majordomo): + * linux_udf@hpesjro.fc.hp.com + * + * COPYRIGHT + * This file is distributed under the terms of the GNU General Public + * License (GPL). 
Copies of the GPL can be obtained from: + * ftp://prep.ai.mit.edu/pub/gnu/GPL + * Each contributing author retains all rights to their own work. + * + * (C) 1998 Dave Boynton + * (C) 1998-2000 Ben Fennema + * (C) 1999-2000 Stelias Computing Inc + * + * HISTORY + * + * 10/04/98 dgb Added rudimentary directory functions + * 10/07/98 Fully working udf_block_map! It works! + * 11/25/98 bmap altered to better support extents + * 12/06/98 blf partition support in udf_iget, udf_block_map and udf_read_inode + * 12/12/98 rewrote udf_block_map to handle next extents and descs across + * block boundaries (which is not actually allowed) + * 12/20/98 added support for strategy 4096 + * 03/07/99 rewrote udf_block_map (again) + * New funcs, inode_bmap, udf_next_aext + * 04/19/99 Support for writing device EA's for major/minor # + */ + +#include "udfdecl.h" +#include +#include +#include +#include + +#include "udf_i.h" +#include "udf_sb.h" + +MODULE_AUTHOR("Ben Fennema"); +MODULE_DESCRIPTION("Universal Disk Format Filesystem"); +MODULE_LICENSE("GPL"); + +#define EXTENT_MERGE_SIZE 5 + +static mode_t udf_convert_permissions(struct FileEntry *); +static int udf_update_inode(struct inode *, int); +static void udf_fill_inode(struct inode *, struct buffer_head *); +static struct buffer_head *inode_getblk(struct inode *, long, int *, long *, int *); +static void udf_split_extents(struct inode *, int *, int, int, + long_ad [EXTENT_MERGE_SIZE], int *); +static void udf_prealloc_extents(struct inode *, int, int, + long_ad [EXTENT_MERGE_SIZE], int *); +static void udf_merge_extents(struct inode *, + long_ad [EXTENT_MERGE_SIZE], int *); +static void udf_update_extents(struct inode *, + long_ad [EXTENT_MERGE_SIZE], int, int, + lb_addr, Uint32, struct buffer_head **); +static int udf_get_block(struct inode *, long, struct buffer_head *, int); + +/* + * udf_put_inode + * + * PURPOSE + * + * DESCRIPTION + * This routine is called whenever the kernel no longer needs the inode. + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. + * + * Called at each iput() + */ +void udf_put_inode(struct inode * inode) +{ + if (!(inode->i_sb->s_flags & MS_RDONLY)) + { + lock_kernel(); + udf_discard_prealloc(inode); + /* write the root inode on put, if dirty */ + if (!inode->i_sb->s_root && inode->i_state & I_DIRTY) + udf_update_inode(inode, IS_SYNC(inode)); + unlock_kernel(); + } +} + +/* + * udf_delete_inode + * + * PURPOSE + * Clean-up before the specified inode is destroyed. + * + * DESCRIPTION + * This routine is called when the kernel destroys an inode structure + * ie. when iput() finds i_count == 0. + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. + * + * Called at the last iput() if i_nlink is zero. 
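+ * That is also where the on-disk inode goes away: the body below
+ * truncates the file to zero, writes the inode back one last time and
+ * then releases it with udf_free_inode().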
+ */ +void udf_delete_inode(struct inode * inode) +{ + lock_kernel(); + + if (is_bad_inode(inode)) + goto no_delete; + + inode->i_size = 0; + udf_truncate(inode); + udf_update_inode(inode, IS_SYNC(inode)); + udf_free_inode(inode); + + unlock_kernel(); + return; +no_delete: + unlock_kernel(); + clear_inode(inode); +} + +void udf_discard_prealloc(struct inode * inode) +{ + if (inode->i_size && inode->i_size != UDF_I_LENEXTENTS(inode) && + UDF_I_ALLOCTYPE(inode) != ICB_FLAG_AD_IN_ICB) + { + udf_truncate_extents(inode); + } +} + +static int udf_writepage(struct page *page) +{ + return block_write_full_page(page, udf_get_block); +} + +static int udf_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, udf_get_block); +} + +static int udf_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) +{ + return block_prepare_write(page, from, to, udf_get_block); +} + +static int udf_bmap(struct address_space *mapping, long block) +{ + return generic_block_bmap(mapping,block,udf_get_block); +} + +struct address_space_operations udf_aops = { + readpage: udf_readpage, + writepage: udf_writepage, + sync_page: block_sync_page, + prepare_write: udf_prepare_write, + commit_write: generic_commit_write, + bmap: udf_bmap, +}; + +void udf_expand_file_adinicb(struct inode * inode, int newsize, int * err) +{ + struct buffer_head *bh = NULL; + struct page *page; + char *kaddr; + int block; + + /* from now on we have normal address_space methods */ + inode->i_data.a_ops = &udf_aops; + + if (!UDF_I_LENALLOC(inode)) + { + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + UDF_I_ALLOCTYPE(inode) = ICB_FLAG_AD_SHORT; + else + UDF_I_ALLOCTYPE(inode) = ICB_FLAG_AD_LONG; + mark_inode_dirty(inode); + return; + } + + block = udf_get_lb_pblock(inode->i_sb, UDF_I_LOCATION(inode), 0); + bh = udf_tread(inode->i_sb, block, inode->i_sb->s_blocksize); + if (!bh) + return; + page = grab_cache_page(inode->i_mapping, 0); + if (!PageLocked(page)) + PAGE_BUG(page); + if (!Page_Uptodate(page)) + { + kaddr = kmap(page); + memset(kaddr + UDF_I_LENALLOC(inode), 0x00, + PAGE_CACHE_SIZE - UDF_I_LENALLOC(inode)); + memcpy(kaddr, bh->b_data + udf_file_entry_alloc_offset(inode), + UDF_I_LENALLOC(inode)); + flush_dcache_page(page); + SetPageUptodate(page); + kunmap(page); + } + memset(bh->b_data + udf_file_entry_alloc_offset(inode), + 0, UDF_I_LENALLOC(inode)); + UDF_I_LENALLOC(inode) = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + UDF_I_ALLOCTYPE(inode) = ICB_FLAG_AD_SHORT; + else + UDF_I_ALLOCTYPE(inode) = ICB_FLAG_AD_LONG; + mark_buffer_dirty_inode(bh, inode); + udf_release_data(bh); + + inode->i_data.a_ops->writepage(page); + page_cache_release(page); + + mark_inode_dirty(inode); + inode->i_version ++; +} + +struct buffer_head * udf_expand_dir_adinicb(struct inode *inode, int *block, int *err) +{ + int newblock; + struct buffer_head *sbh = NULL, *dbh = NULL; + lb_addr bloc, eloc; + Uint32 elen, extoffset; + + struct udf_fileident_bh sfibh, dfibh; + loff_t f_pos = udf_ext0_offset(inode) >> 2; + int size = (udf_ext0_offset(inode) + inode->i_size) >> 2; + struct FileIdentDesc cfi, *sfi, *dfi; + + if (!inode->i_size) + { + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + UDF_I_ALLOCTYPE(inode) = ICB_FLAG_AD_SHORT; + else + UDF_I_ALLOCTYPE(inode) = ICB_FLAG_AD_LONG; + mark_inode_dirty(inode); + return NULL; + } + + /* alloc block, and copy data to it */ + *block = udf_new_block(inode->i_sb, inode, + UDF_I_LOCATION(inode).partitionReferenceNum, + 
UDF_I_LOCATION(inode).logicalBlockNum, err); + + if (!(*block)) + return NULL; + newblock = udf_get_pblock(inode->i_sb, *block, + UDF_I_LOCATION(inode).partitionReferenceNum, 0); + if (!newblock) + return NULL; + sbh = udf_tread(inode->i_sb, inode->i_ino, inode->i_sb->s_blocksize); + if (!sbh) + return NULL; + dbh = udf_tgetblk(inode->i_sb, newblock, inode->i_sb->s_blocksize); + if (!dbh) + return NULL; + lock_buffer(dbh); + memset(dbh->b_data, 0x00, inode->i_sb->s_blocksize); + mark_buffer_uptodate(dbh, 1); + unlock_buffer(dbh); + mark_buffer_dirty_inode(dbh, inode); + + sfibh.soffset = sfibh.eoffset = (f_pos & ((inode->i_sb->s_blocksize - 1) >> 2)) << 2; + sfibh.sbh = sfibh.ebh = sbh; + dfibh.soffset = dfibh.eoffset = 0; + dfibh.sbh = dfibh.ebh = dbh; + while ( (f_pos < size) ) + { + sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, NULL, NULL, NULL, NULL, NULL); + if (!sfi) + { + udf_release_data(sbh); + udf_release_data(dbh); + return NULL; + } + sfi->descTag.tagLocation = *block; + dfibh.soffset = dfibh.eoffset; + dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); + dfi = (struct FileIdentDesc *)(dbh->b_data + dfibh.soffset); + if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, + sfi->fileIdent + sfi->lengthOfImpUse)) + { + udf_release_data(sbh); + udf_release_data(dbh); + return NULL; + } + } + mark_buffer_dirty_inode(dbh, inode); + + memset(sbh->b_data + udf_file_entry_alloc_offset(inode), + 0, UDF_I_LENALLOC(inode)); + + UDF_I_LENALLOC(inode) = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) + UDF_I_ALLOCTYPE(inode) = ICB_FLAG_AD_SHORT; + else + UDF_I_ALLOCTYPE(inode) = ICB_FLAG_AD_LONG; + bloc = UDF_I_LOCATION(inode); + eloc.logicalBlockNum = *block; + eloc.partitionReferenceNum = UDF_I_LOCATION(inode).partitionReferenceNum; + elen = inode->i_size; + UDF_I_LENEXTENTS(inode) = elen; + extoffset = udf_file_entry_alloc_offset(inode); + udf_add_aext(inode, &bloc, &extoffset, eloc, elen, &sbh, 0); + /* UniqueID stuff */ + + mark_buffer_dirty(sbh); + udf_release_data(sbh); + mark_inode_dirty(inode); + inode->i_version ++; + return dbh; +} + +static int udf_get_block(struct inode *inode, long block, struct buffer_head *bh_result, int create) +{ + int err, new; + struct buffer_head *bh; + unsigned long phys; + + if (!create) + { + phys = udf_block_map(inode, block); + if (phys) + { + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = phys; + bh_result->b_state |= (1UL << BH_Mapped); + } + return 0; + } + + err = -EIO; + new = 0; + bh = NULL; + + lock_kernel(); + + if (block < 0) + goto abort_negative; + + if (block == UDF_I_NEXT_ALLOC_BLOCK(inode) + 1) + { + UDF_I_NEXT_ALLOC_BLOCK(inode) ++; + UDF_I_NEXT_ALLOC_GOAL(inode) ++; + } + + err = 0; + + bh = inode_getblk(inode, block, &err, &phys, &new); + if (bh) + BUG(); + if (err) + goto abort; + if (!phys) + BUG(); + + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = phys; + bh_result->b_state |= (1UL << BH_Mapped); + if (new) + bh_result->b_state |= (1UL << BH_New); +abort: + unlock_kernel(); + return err; + +abort_negative: + udf_warning(inode->i_sb, "udf_get_block", "block < 0"); + goto abort; +} + +struct buffer_head * udf_getblk(struct inode * inode, long block, + int create, int * err) +{ + struct buffer_head dummy; + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + *err = udf_get_block(inode, block, &dummy, create); + if (!*err && buffer_mapped(&dummy)) + { + struct buffer_head *bh; + bh = getblk(dummy.b_dev, dummy.b_blocknr, inode->i_sb->s_blocksize); + if (buffer_new(&dummy)) + { + 
lock_buffer(bh); + memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + } + return bh; + } + return NULL; +} + +static struct buffer_head * inode_getblk(struct inode * inode, long block, + int *err, long *phys, int *new) +{ + struct buffer_head *pbh = NULL, *cbh = NULL, *nbh = NULL, *result = NULL; + long_ad laarr[EXTENT_MERGE_SIZE]; + Uint32 pextoffset = 0, cextoffset = 0, nextoffset = 0; + int count = 0, startnum = 0, endnum = 0; + Uint32 elen = 0; + lb_addr eloc, pbloc, cbloc, nbloc; + int c = 1; + Uint64 lbcount = 0, b_off = 0; + Uint32 newblocknum, newblock, offset = 0; + Sint8 etype; + int goal = 0, pgoal = UDF_I_LOCATION(inode).logicalBlockNum; + char lastblock = 0; + + pextoffset = cextoffset = nextoffset = udf_file_entry_alloc_offset(inode); + b_off = (Uint64)block << inode->i_sb->s_blocksize_bits; + pbloc = cbloc = nbloc = UDF_I_LOCATION(inode); + + /* find the extent which contains the block we are looking for. + alternate between laarr[0] and laarr[1] for locations of the + current extent, and the previous extent */ + do + { + if (pbh != cbh) + { + udf_release_data(pbh); + atomic_inc(&cbh->b_count); + pbh = cbh; + } + if (cbh != nbh) + { + udf_release_data(cbh); + atomic_inc(&nbh->b_count); + cbh = nbh; + } + + lbcount += elen; + + pbloc = cbloc; + cbloc = nbloc; + + pextoffset = cextoffset; + cextoffset = nextoffset; + + if ((etype = udf_next_aext(inode, &nbloc, &nextoffset, &eloc, &elen, &nbh, 1)) == -1) + break; + + c = !c; + + laarr[c].extLength = (etype << 30) | elen; + laarr[c].extLocation = eloc; + + if (etype != EXTENT_NOT_RECORDED_NOT_ALLOCATED) + pgoal = eloc.logicalBlockNum + + ((elen + inode->i_sb->s_blocksize - 1) >> + inode->i_sb->s_blocksize_bits); + + count ++; + } while (lbcount + elen <= b_off); + + b_off -= lbcount; + offset = b_off >> inode->i_sb->s_blocksize_bits; + + /* if the extent is allocated and recorded, return the block + if the extent is not a multiple of the blocksize, round up */ + + if (etype == EXTENT_RECORDED_ALLOCATED) + { + if (elen & (inode->i_sb->s_blocksize - 1)) + { + elen = (EXTENT_RECORDED_ALLOCATED << 30) | + ((elen + inode->i_sb->s_blocksize - 1) & + ~(inode->i_sb->s_blocksize - 1)); + etype = udf_write_aext(inode, nbloc, &cextoffset, eloc, elen, nbh, 1); + } + udf_release_data(pbh); + udf_release_data(cbh); + udf_release_data(nbh); + newblock = udf_get_lb_pblock(inode->i_sb, eloc, offset); + *phys = newblock; + return NULL; + } + + if (etype == -1) + { + endnum = startnum = ((count > 1) ? 1 : count); + if (laarr[c].extLength & (inode->i_sb->s_blocksize - 1)) + { + laarr[c].extLength = + (laarr[c].extLength & UDF_EXTENT_FLAG_MASK) | + (((laarr[c].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) & + ~(inode->i_sb->s_blocksize - 1)); + UDF_I_LENEXTENTS(inode) = + (UDF_I_LENEXTENTS(inode) + inode->i_sb->s_blocksize - 1) & + ~(inode->i_sb->s_blocksize - 1); + } + c = !c; + laarr[c].extLength = (EXTENT_NOT_RECORDED_NOT_ALLOCATED << 30) | + ((offset + 1) << inode->i_sb->s_blocksize_bits); + memset(&laarr[c].extLocation, 0x00, sizeof(lb_addr)); + count ++; + endnum ++; + lastblock = 1; + } + else + endnum = startnum = ((count > 2) ? 
2 : count); + + /* if the current extent is in position 0, swap it with the previous */ + if (!c && count != 1) + { + laarr[2] = laarr[0]; + laarr[0] = laarr[1]; + laarr[1] = laarr[2]; + c = 1; + } + + /* if the current block is located in an extent, read the next extent */ + if (etype != -1) + { + if ((etype = udf_next_aext(inode, &nbloc, &nextoffset, &eloc, &elen, &nbh, 0)) != -1) + { + laarr[c+1].extLength = (etype << 30) | elen; + laarr[c+1].extLocation = eloc; + count ++; + startnum ++; + endnum ++; + } + else + lastblock = 1; + } + udf_release_data(nbh); + if (!pbh) + pbh = cbh; + else + udf_release_data(cbh); + + /* if the current extent is not recorded but allocated, get the + block in the extent corresponding to the requested block */ + if ((laarr[c].extLength >> 30) == EXTENT_NOT_RECORDED_ALLOCATED) + newblocknum = laarr[c].extLocation.logicalBlockNum + offset; + else /* otherwise, allocate a new block */ + { + if (UDF_I_NEXT_ALLOC_BLOCK(inode) == block) + goal = UDF_I_NEXT_ALLOC_GOAL(inode); + + if (!goal) + { + if (!(goal = pgoal)) + goal = UDF_I_LOCATION(inode).logicalBlockNum + 1; + } + + if (!(newblocknum = udf_new_block(inode->i_sb, inode, + UDF_I_LOCATION(inode).partitionReferenceNum, goal, err))) + { + udf_release_data(pbh); + *err = -ENOSPC; + return NULL; + } + UDF_I_LENEXTENTS(inode) += inode->i_sb->s_blocksize; + } + + /* if the extent the requested block is located in contains multiple blocks, + split the extent into at most three extents. blocks prior to requested + block, requested block, and blocks after requested block */ + udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); + +#ifdef UDF_PREALLOCATE + /* preallocate blocks */ + udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); +#endif + + /* merge any continuous blocks in laarr */ + udf_merge_extents(inode, laarr, &endnum); + + /* write back the new extents, inserting new extents if the new number + of extents is greater than the old number, and deleting extents if + the new number of extents is less than the old number */ + udf_update_extents(inode, laarr, startnum, endnum, pbloc, pextoffset, &pbh); + + udf_release_data(pbh); + + if (!(newblock = udf_get_pblock(inode->i_sb, newblocknum, + UDF_I_LOCATION(inode).partitionReferenceNum, 0))) + { + return NULL; + } + *phys = newblock; + *err = 0; + *new = 1; + UDF_I_NEXT_ALLOC_BLOCK(inode) = block; + UDF_I_NEXT_ALLOC_GOAL(inode) = newblocknum; + inode->i_ctime = CURRENT_TIME; + UDF_I_UCTIME(inode) = CURRENT_UTIME; + + if (IS_SYNC(inode)) + udf_sync_inode(inode); + else + mark_inode_dirty(inode); + return result; +} + +static void udf_split_extents(struct inode *inode, int *c, int offset, int newblocknum, + long_ad laarr[EXTENT_MERGE_SIZE], int *endnum) +{ + if ((laarr[*c].extLength >> 30) == EXTENT_NOT_RECORDED_ALLOCATED || + (laarr[*c].extLength >> 30) == EXTENT_NOT_RECORDED_NOT_ALLOCATED) + { + int curr = *c; + int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; + int type = laarr[curr].extLength & ~UDF_EXTENT_LENGTH_MASK; + + if (blen == 1) + ; + else if (!offset || blen == offset + 1) + { + laarr[curr+2] = laarr[curr+1]; + laarr[curr+1] = laarr[curr]; + } + else + { + laarr[curr+3] = laarr[curr+1]; + laarr[curr+2] = laarr[curr+1] = laarr[curr]; + } + + if (offset) + { + if ((type >> 30) == EXTENT_NOT_RECORDED_ALLOCATED) + { + udf_free_blocks(inode->i_sb, inode, laarr[curr].extLocation, 0, offset); + laarr[curr].extLength = (EXTENT_NOT_RECORDED_NOT_ALLOCATED << 30)
| + (offset << inode->i_sb->s_blocksize_bits); + laarr[curr].extLocation.logicalBlockNum = 0; + laarr[curr].extLocation.partitionReferenceNum = 0; + } + else + laarr[curr].extLength = type | + (offset << inode->i_sb->s_blocksize_bits); + curr ++; + (*c) ++; + (*endnum) ++; + } + + laarr[curr].extLocation.logicalBlockNum = newblocknum; + if ((type >> 30) == EXTENT_NOT_RECORDED_NOT_ALLOCATED) + laarr[curr].extLocation.partitionReferenceNum = + UDF_I_LOCATION(inode).partitionReferenceNum; + laarr[curr].extLength = (EXTENT_RECORDED_ALLOCATED << 30) | + inode->i_sb->s_blocksize; + curr ++; + + if (blen != offset + 1) + { + if ((type >> 30) == EXTENT_NOT_RECORDED_ALLOCATED) + laarr[curr].extLocation.logicalBlockNum += (offset + 1); + laarr[curr].extLength = type | + ((blen - (offset + 1)) << inode->i_sb->s_blocksize_bits); + curr ++; + (*endnum) ++; + } + } +} + +static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, + long_ad laarr[EXTENT_MERGE_SIZE], int *endnum) +{ + int start, length = 0, currlength = 0, i; + + if (*endnum >= (c+1)) + { + if (!lastblock) + return; + else + start = c; + } + else + { + if ((laarr[c+1].extLength >> 30) == EXTENT_NOT_RECORDED_ALLOCATED) + { + start = c+1; + length = currlength = (((laarr[c+1].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); + } + else + start = c; + } + + for (i=start+1; i<=*endnum; i++) + { + if (i == *endnum) + { + if (lastblock) + length += UDF_DEFAULT_PREALLOC_BLOCKS; + } + else if ((laarr[i].extLength >> 30) == EXTENT_NOT_RECORDED_NOT_ALLOCATED) + length += (((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); + else + break; + } + + if (length) + { + int next = laarr[start].extLocation.logicalBlockNum + + (((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); + int numalloc = udf_prealloc_blocks(inode->i_sb, inode, + laarr[start].extLocation.partitionReferenceNum, + next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ? 
length : + UDF_DEFAULT_PREALLOC_BLOCKS) - currlength); + + if (numalloc) + { + UDF_I_LENEXTENTS(inode) += numalloc << inode->i_sb->s_blocksize_bits; + if (start == (c+1)) + laarr[start].extLength += + (numalloc << inode->i_sb->s_blocksize_bits); + else + { + memmove(&laarr[c+2], &laarr[c+1], + sizeof(long_ad) * (*endnum - (c+1))); + (*endnum) ++; + laarr[c+1].extLocation.logicalBlockNum = next; + laarr[c+1].extLocation.partitionReferenceNum = + laarr[c].extLocation.partitionReferenceNum; + laarr[c+1].extLength = (EXTENT_NOT_RECORDED_ALLOCATED << 30) | + (numalloc << inode->i_sb->s_blocksize_bits); + start = c+1; + } + + for (i=start+1; numalloc && i<*endnum; i++) + { + int elen = ((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; + + if (elen > numalloc) + { + laarr[c+1].extLength -= + (numalloc << inode->i_sb->s_blocksize_bits); + numalloc = 0; + } + else + { + numalloc -= elen; + if (*endnum > (i+1)) + memmove(&laarr[i], &laarr[i+1], + sizeof(long_ad) * (*endnum - (i+1))); + i --; + (*endnum) --; + } + } + } + } +} + +static void udf_merge_extents(struct inode *inode, + long_ad laarr[EXTENT_MERGE_SIZE], int *endnum) +{ + int i; + + for (i=0; i<(*endnum-1); i++) + { + if ((laarr[i].extLength >> 30) == (laarr[i+1].extLength >> 30)) + { + if (((laarr[i].extLength >> 30) == EXTENT_NOT_RECORDED_NOT_ALLOCATED) || + ((laarr[i+1].extLocation.logicalBlockNum - laarr[i].extLocation.logicalBlockNum) == + (((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits))) + { + if (((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + + (laarr[i+1].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) + { + laarr[i+1].extLength = (laarr[i+1].extLength - + (laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + + UDF_EXTENT_LENGTH_MASK) & ~(inode->i_sb->s_blocksize-1); + laarr[i].extLength = (UDF_EXTENT_LENGTH_MASK + 1) - + inode->i_sb->s_blocksize; + laarr[i+1].extLocation.logicalBlockNum = + laarr[i].extLocation.logicalBlockNum + + ((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) >> + inode->i_sb->s_blocksize_bits); + } + else + { + laarr[i].extLength = laarr[i+1].extLength + + (((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize-1)); + if (*endnum > (i+2)) + memmove(&laarr[i+1], &laarr[i+2], + sizeof(long_ad) * (*endnum - (i+2))); + i --; + (*endnum) --; + } + } + } + } +} + +static void udf_update_extents(struct inode *inode, + long_ad laarr[EXTENT_MERGE_SIZE], int startnum, int endnum, + lb_addr pbloc, Uint32 pextoffset, struct buffer_head **pbh) +{ + int start = 0, i; + lb_addr tmploc; + Uint32 tmplen; + + if (startnum > endnum) + { + for (i=0; i<(startnum-endnum); i++) + { + udf_delete_aext(inode, pbloc, pextoffset, laarr[i].extLocation, + laarr[i].extLength, *pbh); + } + } + else if (startnum < endnum) + { + for (i=0; i<(endnum-startnum); i++) + { + udf_insert_aext(inode, pbloc, pextoffset, laarr[i].extLocation, + laarr[i].extLength, *pbh); + udf_next_aext(inode, &pbloc, &pextoffset, &laarr[i].extLocation, + &laarr[i].extLength, pbh, 1); + start ++; + } + } + + for (i=start; i<endnum; i++) + { + udf_next_aext(inode, &pbloc, &pextoffset, &tmploc, &tmplen, pbh, 0); + udf_write_aext(inode, pbloc, &pextoffset, laarr[i].extLocation, + laarr[i].extLength, *pbh, 1); + } +} + +void udf_truncate(struct inode * inode) +{ + int offset; + struct buffer_head *bh; + int err; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE_FILE(inode)) + return; + + if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_IN_ICB) + { + if (inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) + + inode->i_size)) + { + udf_expand_file_adinicb(inode, 
inode->i_size, &err); + if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_IN_ICB) + { + inode->i_size = UDF_I_LENALLOC(inode); + return; + } + else + udf_truncate_extents(inode); + } + else + { + offset = (inode->i_size & (inode->i_sb->s_blocksize - 1)) + + udf_file_entry_alloc_offset(inode); + + if ((bh = udf_tread(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, UDF_I_LOCATION(inode), 0), + inode->i_sb->s_blocksize))) + { + memset(bh->b_data + offset, 0x00, inode->i_sb->s_blocksize - offset); + mark_buffer_dirty(bh); + udf_release_data(bh); + } + UDF_I_LENALLOC(inode) = inode->i_size; + } + } + else + { + block_truncate_page(inode->i_mapping, inode->i_size, udf_get_block); + udf_truncate_extents(inode); + } + + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + UDF_I_UMTIME(inode) = UDF_I_UCTIME(inode) = CURRENT_UTIME; + if (IS_SYNC(inode)) + udf_sync_inode (inode); + else + mark_inode_dirty(inode); +} + +/* + * udf_read_inode + * + * PURPOSE + * Read an inode. + * + * DESCRIPTION + * This routine is called by iget() [which is called by udf_iget()] + * (clean_inode() will have been called first) + * when an inode is first read into memory. + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. + * + * 12/19/98 dgb Updated to fix size problems. + */ + +void +udf_read_inode(struct inode *inode) +{ + memset(&UDF_I_LOCATION(inode), 0xFF, sizeof(lb_addr)); +} + +void +__udf_read_inode(struct inode *inode) +{ + struct buffer_head *bh = NULL; + struct FileEntry *fe; + Uint16 ident; + + /* + * Set defaults, but the inode is still incomplete! + * Note: get_new_inode() sets the following on a new inode: + * i_sb = sb + * i_dev = sb->s_dev; + * i_no = ino + * i_flags = sb->s_flags + * i_state = 0 + * clean_inode(): zero fills and sets + * i_count = 1 + * i_nlink = 1 + * i_op = NULL; + */ + + inode->i_blksize = PAGE_SIZE; + + bh = udf_read_ptagged(inode->i_sb, UDF_I_LOCATION(inode), 0, &ident); + + if (!bh) + { + printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed !bh\n", + inode->i_ino); + make_bad_inode(inode); + return; + } + + if (ident != TID_FILE_ENTRY && ident != TID_EXTENDED_FILE_ENTRY && + ident != TID_UNALLOCATED_SPACE_ENTRY) + { + printk(KERN_ERR "udf: udf_read_inode(ino %ld) failed ident=%d\n", + inode->i_ino, ident); + udf_release_data(bh); + make_bad_inode(inode); + return; + } + + fe = (struct FileEntry *)bh->b_data; + + if (le16_to_cpu(fe->icbTag.strategyType) == 4096) + { + struct buffer_head *ibh = NULL, *nbh = NULL; + struct IndirectEntry *ie; + + ibh = udf_read_ptagged(inode->i_sb, UDF_I_LOCATION(inode), 1, &ident); + if (ident == TID_INDIRECT_ENTRY) + { + if (ibh) + { + lb_addr loc; + ie = (struct IndirectEntry *)ibh->b_data; + + loc = lelb_to_cpu(ie->indirectICB.extLocation); + + if (ie->indirectICB.extLength && + (nbh = udf_read_ptagged(inode->i_sb, loc, 0, &ident))) + { + if (ident == TID_FILE_ENTRY || + ident == TID_EXTENDED_FILE_ENTRY) + { + memcpy(&UDF_I_LOCATION(inode), &loc, sizeof(lb_addr)); + udf_release_data(bh); + udf_release_data(ibh); + udf_release_data(nbh); + __udf_read_inode(inode); + return; + } + else + { + udf_release_data(nbh); + udf_release_data(ibh); + } + } + else + udf_release_data(ibh); + } + } + else + udf_release_data(ibh); + } + else if (le16_to_cpu(fe->icbTag.strategyType) != 4) + { + printk(KERN_ERR "udf: unsupported strategy type: %d\n", + le16_to_cpu(fe->icbTag.strategyType)); + udf_release_data(bh); + make_bad_inode(inode); + return; + } + udf_fill_inode(inode, bh); + udf_release_data(bh); +} + +static void 
udf_fill_inode(struct inode *inode, struct buffer_head *bh) +{ + struct FileEntry *fe; + struct ExtendedFileEntry *efe; + time_t convtime; + long convtime_usec; + int offset, alen; + + inode->i_version = ++event; + UDF_I_NEW_INODE(inode) = 0; + + fe = (struct FileEntry *)bh->b_data; + efe = (struct ExtendedFileEntry *)bh->b_data; + + if (le16_to_cpu(fe->icbTag.strategyType) == 4) + UDF_I_STRAT4096(inode) = 0; + else /* if (le16_to_cpu(fe->icbTag.strategyType) == 4096) */ + UDF_I_STRAT4096(inode) = 1; + + UDF_I_ALLOCTYPE(inode) = le16_to_cpu(fe->icbTag.flags) & ICB_FLAG_ALLOC_MASK; + if (fe->descTag.tagIdent == TID_EXTENDED_FILE_ENTRY) + UDF_I_EXTENDED_FE(inode) = 1; + else if (fe->descTag.tagIdent == TID_FILE_ENTRY) + UDF_I_EXTENDED_FE(inode) = 0; + else if (fe->descTag.tagIdent == TID_UNALLOCATED_SPACE_ENTRY) + { + UDF_I_LENALLOC(inode) = + le32_to_cpu( + ((struct UnallocatedSpaceEntry *)bh->b_data)->lengthAllocDescs); + return; + } + + inode->i_uid = le32_to_cpu(fe->uid); + if ( inode->i_uid == -1 ) inode->i_uid = UDF_SB(inode->i_sb)->s_uid; + + inode->i_gid = le32_to_cpu(fe->gid); + if ( inode->i_gid == -1 ) inode->i_gid = UDF_SB(inode->i_sb)->s_gid; + + inode->i_nlink = le16_to_cpu(fe->fileLinkCount); + if (!inode->i_nlink) + inode->i_nlink = 1; + + inode->i_size = le64_to_cpu(fe->informationLength); + UDF_I_LENEXTENTS(inode) = inode->i_size; + + inode->i_mode = udf_convert_permissions(fe); + inode->i_mode &= ~UDF_SB(inode->i_sb)->s_umask; + + UDF_I_NEXT_ALLOC_BLOCK(inode) = 0; + UDF_I_NEXT_ALLOC_GOAL(inode) = 0; + + if (UDF_I_EXTENDED_FE(inode) == 0) + { + inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << + (inode->i_sb->s_blocksize_bits - 9); + + if ( udf_stamp_to_time(&convtime, &convtime_usec, + lets_to_cpu(fe->accessTime)) ) + { + inode->i_atime = convtime; + } + else + { + inode->i_atime = UDF_SB_RECORDTIME(inode->i_sb); + } + + if ( udf_stamp_to_time(&convtime, &convtime_usec, + lets_to_cpu(fe->modificationTime)) ) + { + inode->i_mtime = convtime; + UDF_I_UMTIME(inode) = convtime_usec; + } + else + { + inode->i_mtime = UDF_SB_RECORDTIME(inode->i_sb); + UDF_I_UMTIME(inode) = 0; + } + + if ( udf_stamp_to_time(&convtime, &convtime_usec, + lets_to_cpu(fe->attrTime)) ) + { + inode->i_ctime = convtime; + UDF_I_UCTIME(inode) = convtime_usec; + } + else + { + inode->i_ctime = UDF_SB_RECORDTIME(inode->i_sb); + UDF_I_UCTIME(inode) = 0; + } + + UDF_I_UNIQUE(inode) = le64_to_cpu(fe->uniqueID); + UDF_I_LENEATTR(inode) = le32_to_cpu(fe->lengthExtendedAttr); + UDF_I_LENALLOC(inode) = le32_to_cpu(fe->lengthAllocDescs); + offset = sizeof(struct FileEntry) + UDF_I_LENEATTR(inode); + alen = offset + UDF_I_LENALLOC(inode); + } + else + { + inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << + (inode->i_sb->s_blocksize_bits - 9); + + if ( udf_stamp_to_time(&convtime, &convtime_usec, + lets_to_cpu(efe->accessTime)) ) + { + inode->i_atime = convtime; + } + else + { + inode->i_atime = UDF_SB_RECORDTIME(inode->i_sb); + } + + if ( udf_stamp_to_time(&convtime, &convtime_usec, + lets_to_cpu(efe->modificationTime)) ) + { + inode->i_mtime = convtime; + UDF_I_UMTIME(inode) = convtime_usec; + } + else + { + inode->i_mtime = UDF_SB_RECORDTIME(inode->i_sb); + UDF_I_UMTIME(inode) = 0; + } + + if ( udf_stamp_to_time(&convtime, &convtime_usec, + lets_to_cpu(efe->createTime)) ) + { + UDF_I_CRTIME(inode) = convtime; + UDF_I_UCRTIME(inode) = convtime_usec; + } + else + { + UDF_I_CRTIME(inode) = UDF_SB_RECORDTIME(inode->i_sb); + UDF_I_UCRTIME(inode) = 0; + } + + if ( udf_stamp_to_time(&convtime, 
&convtime_usec, + lets_to_cpu(efe->attrTime)) ) + { + inode->i_ctime = convtime; + UDF_I_UCTIME(inode) = convtime_usec; + } + else + { + inode->i_ctime = UDF_SB_RECORDTIME(inode->i_sb); + UDF_I_UCTIME(inode) = 0; + } + + UDF_I_UNIQUE(inode) = le64_to_cpu(efe->uniqueID); + UDF_I_LENEATTR(inode) = le32_to_cpu(efe->lengthExtendedAttr); + UDF_I_LENALLOC(inode) = le32_to_cpu(efe->lengthAllocDescs); + offset = sizeof(struct ExtendedFileEntry) + UDF_I_LENEATTR(inode); + alen = offset + UDF_I_LENALLOC(inode); + } + + switch (fe->icbTag.fileType) + { + case FILE_TYPE_DIRECTORY: + { + inode->i_op = &udf_dir_inode_operations; + inode->i_fop = &udf_dir_operations; + inode->i_mode |= S_IFDIR; + inode->i_nlink ++; + break; + } + case FILE_TYPE_REALTIME: + case FILE_TYPE_REGULAR: + case FILE_TYPE_NONE: + { + if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + inode->i_mode |= S_IFREG; + break; + } + case FILE_TYPE_BLOCK: + { + inode->i_mode |= S_IFBLK; + break; + } + case FILE_TYPE_CHAR: + { + inode->i_mode |= S_IFCHR; + break; + } + case FILE_TYPE_FIFO: + { + init_special_inode(inode, inode->i_mode | S_IFIFO, 0); + break; + } + case FILE_TYPE_SYMLINK: + { + inode->i_data.a_ops = &udf_symlink_aops; + inode->i_op = &page_symlink_inode_operations; + inode->i_mode = S_IFLNK|S_IRWXUGO; + break; + } + default: + { + printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown file type=%d\n", + inode->i_ino, fe->icbTag.fileType); + make_bad_inode(inode); + return; + } + } + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + { + struct buffer_head *tbh = NULL; + struct DeviceSpecificationExtendedAttr *dsea = + (struct DeviceSpecificationExtendedAttr *) + udf_get_extendedattr(inode, 12, 1, &tbh); + + if (dsea) + { + init_special_inode(inode, inode->i_mode, + ((le32_to_cpu(dsea->majorDeviceIdent)) << 8) | + (le32_to_cpu(dsea->minorDeviceIdent) & 0xFF)); + /* Developer ID ??? */ + udf_release_data(tbh); + } + else + { + make_bad_inode(inode); + } + } +} + +static mode_t +udf_convert_permissions(struct FileEntry *fe) +{ + mode_t mode; + Uint32 permissions; + Uint32 flags; + + permissions = le32_to_cpu(fe->permissions); + flags = le16_to_cpu(fe->icbTag.flags); + + mode = (( permissions ) & S_IRWXO) | + (( permissions >> 2 ) & S_IRWXG) | + (( permissions >> 4 ) & S_IRWXU) | + (( flags & ICB_FLAG_SETUID) ? S_ISUID : 0) | + (( flags & ICB_FLAG_SETGID) ? S_ISGID : 0) | + (( flags & ICB_FLAG_STICKY) ? S_ISVTX : 0); + + return mode; +} + +/* + * udf_write_inode + * + * PURPOSE + * Write out the specified inode. + * + * DESCRIPTION + * This routine is called whenever an inode is synced. + * It writes the in-core inode back to the on-disk file entry + * via udf_update_inode(). + * + * HISTORY + * July 1, 1997 - Andrew E. Mileski + * Written, tested, and released. 
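+ * + * LOCKING + * Takes the big kernel lock around the update; udf_sync_inode() + * calls udf_update_inode() directly and leaves serialisation to + * its caller. 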
+ */ + +void udf_write_inode(struct inode * inode, int sync) +{ + lock_kernel(); + udf_update_inode(inode, sync); + unlock_kernel(); +} + +int udf_sync_inode(struct inode * inode) +{ + return udf_update_inode(inode, 1); +} + +static int +udf_update_inode(struct inode *inode, int do_sync) +{ + struct buffer_head *bh = NULL; + struct FileEntry *fe; + struct ExtendedFileEntry *efe; + Uint32 udfperms; + Uint16 icbflags; + Uint16 crclen; + int i; + timestamp cpu_time; + int err = 0; + + bh = udf_tread(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, UDF_I_LOCATION(inode), 0), + inode->i_sb->s_blocksize); + + if (!bh) + { + udf_debug("bread failure\n"); + return -EIO; + } + fe = (struct FileEntry *)bh->b_data; + efe = (struct ExtendedFileEntry *)bh->b_data; + if (UDF_I_NEW_INODE(inode) == 1) + { + if (UDF_I_EXTENDED_FE(inode) == 0) + memset(bh->b_data, 0x00, sizeof(struct FileEntry)); + else + memset(bh->b_data, 0x00, sizeof(struct ExtendedFileEntry)); + memset(bh->b_data + udf_file_entry_alloc_offset(inode) + + UDF_I_LENALLOC(inode), 0x0, inode->i_sb->s_blocksize - + udf_file_entry_alloc_offset(inode) - UDF_I_LENALLOC(inode)); + UDF_I_NEW_INODE(inode) = 0; + } + + if (fe->descTag.tagIdent == TID_UNALLOCATED_SPACE_ENTRY) + { + struct UnallocatedSpaceEntry *use = + (struct UnallocatedSpaceEntry *)bh->b_data; + + use->lengthAllocDescs = cpu_to_le32(UDF_I_LENALLOC(inode)); + crclen = sizeof(struct UnallocatedSpaceEntry) + UDF_I_LENALLOC(inode) - + sizeof(tag); + use->descTag.descCRCLength = cpu_to_le16(crclen); + use->descTag.descCRC = cpu_to_le16(udf_crc((char *)use + sizeof(tag), crclen, 0)); + + use->descTag.tagChecksum = 0; + for (i=0; i<16; i++) + if (i != 4) + use->descTag.tagChecksum += ((Uint8 *)&(use->descTag))[i]; + + mark_buffer_dirty(bh); + udf_release_data(bh); + return err; + } + + if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid) + fe->uid = cpu_to_le32(inode->i_uid); + + if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid) + fe->gid = cpu_to_le32(inode->i_gid); + + udfperms = ((inode->i_mode & S_IRWXO) ) | + ((inode->i_mode & S_IRWXG) << 2) | + ((inode->i_mode & S_IRWXU) << 4); + + udfperms |= (le32_to_cpu(fe->permissions) & + (PERM_O_DELETE | PERM_O_CHATTR | + PERM_G_DELETE | PERM_G_CHATTR | + PERM_U_DELETE | PERM_U_CHATTR)); + fe->permissions = cpu_to_le32(udfperms); + + if (S_ISDIR(inode->i_mode)) + fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); + else + fe->fileLinkCount = cpu_to_le16(inode->i_nlink); + + fe->informationLength = cpu_to_le64(inode->i_size); + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + { + EntityID *eid; + struct buffer_head *tbh = NULL; + struct DeviceSpecificationExtendedAttr *dsea = + (struct DeviceSpecificationExtendedAttr *) + udf_get_extendedattr(inode, 12, 1, &tbh); + + if (!dsea) + { + dsea = (struct DeviceSpecificationExtendedAttr *) + udf_add_extendedattr(inode, + sizeof(struct DeviceSpecificationExtendedAttr) + + sizeof(EntityID), 12, 0x3, &tbh); + dsea->attrType = 12; + dsea->attrSubtype = 1; + dsea->attrLength = sizeof(struct DeviceSpecificationExtendedAttr) + + sizeof(EntityID); + dsea->impUseLength = sizeof(EntityID); + } + eid = (EntityID *)dsea->impUse; + memset(eid, 0, sizeof(EntityID)); + strcpy(eid->ident, UDF_ID_DEVELOPER); + eid->identSuffix[0] = UDF_OS_CLASS_UNIX; + eid->identSuffix[1] = UDF_OS_ID_LINUX; + dsea->majorDeviceIdent = kdev_t_to_nr(inode->i_rdev) >> 8; + dsea->minorDeviceIdent = kdev_t_to_nr(inode->i_rdev) & 0xFF; + mark_buffer_dirty_inode(tbh, inode); + udf_release_data(tbh); + } + + if (UDF_I_EXTENDED_FE(inode) == 0) 
+ { + fe->logicalBlocksRecorded = cpu_to_le64( + (inode->i_blocks + (1 << (inode->i_sb->s_blocksize_bits - 9)) - 1) >> + (inode->i_sb->s_blocksize_bits - 9)); + + if (udf_time_to_stamp(&cpu_time, inode->i_atime, 0)) + fe->accessTime = cpu_to_lets(cpu_time); + if (udf_time_to_stamp(&cpu_time, inode->i_mtime, UDF_I_UMTIME(inode))) + fe->modificationTime = cpu_to_lets(cpu_time); + if (udf_time_to_stamp(&cpu_time, inode->i_ctime, UDF_I_UCTIME(inode))) + fe->attrTime = cpu_to_lets(cpu_time); + memset(&(fe->impIdent), 0, sizeof(EntityID)); + strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); + fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + fe->uniqueID = cpu_to_le64(UDF_I_UNIQUE(inode)); + fe->lengthExtendedAttr = cpu_to_le32(UDF_I_LENEATTR(inode)); + fe->lengthAllocDescs = cpu_to_le32(UDF_I_LENALLOC(inode)); + fe->descTag.tagIdent = le16_to_cpu(TID_FILE_ENTRY); + crclen = sizeof(struct FileEntry); + } + else + { + efe->objectSize = cpu_to_le64(inode->i_size); + efe->logicalBlocksRecorded = cpu_to_le64( + (inode->i_blocks + (1 << (inode->i_sb->s_blocksize_bits - 9)) - 1) >> + (inode->i_sb->s_blocksize_bits - 9)); + + if (UDF_I_CRTIME(inode) >= inode->i_atime) + { + UDF_I_CRTIME(inode) = inode->i_atime; + UDF_I_UCRTIME(inode) = 0; + } + if (UDF_I_CRTIME(inode) > inode->i_mtime || + (UDF_I_CRTIME(inode) == inode->i_mtime && + UDF_I_UCRTIME(inode) > UDF_I_UMTIME(inode))) + { + UDF_I_CRTIME(inode) = inode->i_mtime; + UDF_I_UCRTIME(inode) = UDF_I_UMTIME(inode); + } + if (UDF_I_CRTIME(inode) > inode->i_ctime || + (UDF_I_CRTIME(inode) == inode->i_ctime && + UDF_I_UCRTIME(inode) > UDF_I_UCTIME(inode))) + { + UDF_I_CRTIME(inode) = inode->i_ctime; + UDF_I_UCRTIME(inode) = UDF_I_UCTIME(inode); + } + + if (udf_time_to_stamp(&cpu_time, inode->i_atime, 0)) + efe->accessTime = cpu_to_lets(cpu_time); + if (udf_time_to_stamp(&cpu_time, inode->i_mtime, UDF_I_UMTIME(inode))) + efe->modificationTime = cpu_to_lets(cpu_time); + if (udf_time_to_stamp(&cpu_time, UDF_I_CRTIME(inode), UDF_I_UCRTIME(inode))) + efe->createTime = cpu_to_lets(cpu_time); + if (udf_time_to_stamp(&cpu_time, inode->i_ctime, UDF_I_UCTIME(inode))) + efe->attrTime = cpu_to_lets(cpu_time); + + memset(&(efe->impIdent), 0, sizeof(EntityID)); + strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); + efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; + efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; + efe->uniqueID = cpu_to_le64(UDF_I_UNIQUE(inode)); + efe->lengthExtendedAttr = cpu_to_le32(UDF_I_LENEATTR(inode)); + efe->lengthAllocDescs = cpu_to_le32(UDF_I_LENALLOC(inode)); + efe->descTag.tagIdent = le16_to_cpu(TID_EXTENDED_FILE_ENTRY); + crclen = sizeof(struct ExtendedFileEntry); + } + if (UDF_I_STRAT4096(inode)) + { + fe->icbTag.strategyType = cpu_to_le16(4096); + fe->icbTag.strategyParameter = cpu_to_le16(1); + fe->icbTag.numEntries = cpu_to_le16(2); + } + else + { + fe->icbTag.strategyType = cpu_to_le16(4); + fe->icbTag.numEntries = cpu_to_le16(1); + } + + if (S_ISDIR(inode->i_mode)) + fe->icbTag.fileType = FILE_TYPE_DIRECTORY; + else if (S_ISREG(inode->i_mode)) + fe->icbTag.fileType = FILE_TYPE_REGULAR; + else if (S_ISLNK(inode->i_mode)) + fe->icbTag.fileType = FILE_TYPE_SYMLINK; + else if (S_ISBLK(inode->i_mode)) + fe->icbTag.fileType = FILE_TYPE_BLOCK; + else if (S_ISCHR(inode->i_mode)) + fe->icbTag.fileType = FILE_TYPE_CHAR; + else if (S_ISFIFO(inode->i_mode)) + fe->icbTag.fileType = FILE_TYPE_FIFO; + + icbflags = UDF_I_ALLOCTYPE(inode) | + ((inode->i_mode & S_ISUID) ? 
ICB_FLAG_SETUID : 0) | + ((inode->i_mode & S_ISGID) ? ICB_FLAG_SETGID : 0) | + ((inode->i_mode & S_ISVTX) ? ICB_FLAG_STICKY : 0) | + (le16_to_cpu(fe->icbTag.flags) & + ~(ICB_FLAG_ALLOC_MASK | ICB_FLAG_SETUID | + ICB_FLAG_SETGID | ICB_FLAG_STICKY)); + + fe->icbTag.flags = cpu_to_le16(icbflags); + fe->descTag.descVersion = cpu_to_le16(2); + fe->descTag.tagSerialNum = cpu_to_le16(UDF_SB_SERIALNUM(inode->i_sb)); + fe->descTag.tagLocation = cpu_to_le32(UDF_I_LOCATION(inode).logicalBlockNum); + crclen += UDF_I_LENEATTR(inode) + UDF_I_LENALLOC(inode) - sizeof(tag); + fe->descTag.descCRCLength = cpu_to_le16(crclen); + fe->descTag.descCRC = cpu_to_le16(udf_crc((char *)fe + sizeof(tag), crclen, 0)); + + fe->descTag.tagChecksum = 0; + for (i=0; i<16; i++) + if (i != 4) + fe->descTag.tagChecksum += ((Uint8 *)&(fe->descTag))[i]; + + /* write the data blocks */ + mark_buffer_dirty(bh); + if (do_sync) + { + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + { + printk("IO error syncing udf inode [%s:%08lx]\n", + bdevname(inode->i_dev), inode->i_ino); + err = -EIO; + } + } + udf_release_data(bh); + return err; +} + +/* + * udf_iget + * + * PURPOSE + * Get an inode. + * + * DESCRIPTION + * This routine replaces iget() and read_inode(). + * + * HISTORY + * October 3, 1997 - Andrew E. Mileski + * Written, tested, and released. + * + * 12/19/98 dgb Added semaphore and changed to be a wrapper of iget + */ +struct inode * +udf_iget(struct super_block *sb, lb_addr ino) +{ + struct inode *inode; + unsigned long block; + + block = udf_get_lb_pblock(sb, ino, 0); + + /* Get the inode */ + + inode = iget(sb, block); + /* calls udf_read_inode() ! */ + + if (!inode) + { + printk(KERN_ERR "udf: iget() failed\n"); + return NULL; + } + else if (is_bad_inode(inode)) + { + iput(inode); + return NULL; + } + else if (UDF_I_LOCATION(inode).logicalBlockNum == 0xFFFFFFFF && + UDF_I_LOCATION(inode).partitionReferenceNum == 0xFFFF) + { + memcpy(&UDF_I_LOCATION(inode), &ino, sizeof(lb_addr)); + __udf_read_inode(inode); + if (is_bad_inode(inode)) + { + iput(inode); + return NULL; + } + } + + if ( ino.logicalBlockNum >= UDF_SB_PARTLEN(sb, ino.partitionReferenceNum) ) + { + udf_debug("block=%d, partition=%d out of range\n", + ino.logicalBlockNum, ino.partitionReferenceNum); + make_bad_inode(inode); + iput(inode); + return NULL; + } + + return inode; +} + +Sint8 udf_add_aext(struct inode *inode, lb_addr *bloc, int *extoffset, + lb_addr eloc, Uint32 elen, struct buffer_head **bh, int inc) +{ + int adsize; + short_ad *sad = NULL; + long_ad *lad = NULL; + struct AllocExtDesc *aed; + int ret; + + if (!(*bh)) + { + if (!(*bh = udf_tread(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, *bloc, 0), + inode->i_sb->s_blocksize))) + { + udf_debug("reading block %d failed!\n", + udf_get_lb_pblock(inode->i_sb, *bloc, 0)); + return -1; + } + } + + if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_SHORT) + adsize = sizeof(short_ad); + else if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_LONG) + adsize = sizeof(long_ad); + else + return -1; + + if (*extoffset + (2 * adsize) > inode->i_sb->s_blocksize) + { + char *sptr, *dptr; + struct buffer_head *nbh; + int err, loffset; + lb_addr obloc = *bloc; + + if (!(bloc->logicalBlockNum = udf_new_block(inode->i_sb, inode, + obloc.partitionReferenceNum, obloc.logicalBlockNum, &err))) + { + return -1; + } + if (!(nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, + *bloc, 0), inode->i_sb->s_blocksize))) + { + return -1; + } + lock_buffer(nbh); + memset(nbh->b_data, 0x00, 
inode->i_sb->s_blocksize); + mark_buffer_uptodate(nbh, 1); + unlock_buffer(nbh); + mark_buffer_dirty_inode(nbh, inode); + + aed = (struct AllocExtDesc *)(nbh->b_data); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) + aed->previousAllocExtLocation = cpu_to_le32(obloc.logicalBlockNum); + if (*extoffset + adsize > inode->i_sb->s_blocksize) + { + loffset = *extoffset; + aed->lengthAllocDescs = cpu_to_le32(adsize); + sptr = (*bh)->b_data + *extoffset - adsize; + dptr = nbh->b_data + sizeof(struct AllocExtDesc); + memcpy(dptr, sptr, adsize); + *extoffset = sizeof(struct AllocExtDesc) + adsize; + } + else + { + loffset = *extoffset + adsize; + aed->lengthAllocDescs = cpu_to_le32(0); + sptr = (*bh)->b_data + *extoffset; + *extoffset = sizeof(struct AllocExtDesc); + + if (memcmp(&UDF_I_LOCATION(inode), &obloc, sizeof(lb_addr))) + { + aed = (struct AllocExtDesc *)(*bh)->b_data; + aed->lengthAllocDescs = + cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) + adsize); + } + else + { + UDF_I_LENALLOC(inode) += adsize; + mark_inode_dirty(inode); + } + } + udf_new_tag(nbh->b_data, TID_ALLOC_EXTENT_DESC, 2, 1, + bloc->logicalBlockNum, sizeof(tag)); + switch (UDF_I_ALLOCTYPE(inode)) + { + case ICB_FLAG_AD_SHORT: + { + sad = (short_ad *)sptr; + sad->extLength = cpu_to_le32( + EXTENT_NEXT_EXTENT_ALLOCDECS << 30 | + inode->i_sb->s_blocksize); + sad->extPosition = cpu_to_le32(bloc->logicalBlockNum); + break; + } + case ICB_FLAG_AD_LONG: + { + lad = (long_ad *)sptr; + lad->extLength = cpu_to_le32( + EXTENT_NEXT_EXTENT_ALLOCDECS << 30 | + inode->i_sb->s_blocksize); + lad->extLocation = cpu_to_lelb(*bloc); + memset(lad->impUse, 0x00, sizeof(lad->impUse)); + break; + } + } + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201) + udf_update_tag((*bh)->b_data, loffset); + else + udf_update_tag((*bh)->b_data, sizeof(struct AllocExtDesc)); + mark_buffer_dirty_inode(*bh, inode); + udf_release_data(*bh); + *bh = nbh; + } + + ret = udf_write_aext(inode, *bloc, extoffset, eloc, elen, *bh, inc); + + if (!memcmp(&UDF_I_LOCATION(inode), bloc, sizeof(lb_addr))) + { + UDF_I_LENALLOC(inode) += adsize; + mark_inode_dirty(inode); + } + else + { + aed = (struct AllocExtDesc *)(*bh)->b_data; + aed->lengthAllocDescs = + cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) + adsize); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201) + udf_update_tag((*bh)->b_data, *extoffset + (inc ? 
0 : adsize)); + else + udf_update_tag((*bh)->b_data, sizeof(struct AllocExtDesc)); + mark_buffer_dirty_inode(*bh, inode); + } + + return ret; +} + +Sint8 udf_write_aext(struct inode *inode, lb_addr bloc, int *extoffset, + lb_addr eloc, Uint32 elen, struct buffer_head *bh, int inc) +{ + int adsize; + short_ad *sad = NULL; + long_ad *lad = NULL; + + if (!(bh)) + { + if (!(bh = udf_tread(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, bloc, 0), + inode->i_sb->s_blocksize))) + { + udf_debug("reading block %d failed!\n", + udf_get_lb_pblock(inode->i_sb, bloc, 0)); + return -1; + } + } + else + atomic_inc(&bh->b_count); + + if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_SHORT) + adsize = sizeof(short_ad); + else if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_LONG) + adsize = sizeof(long_ad); + else + return -1; + + switch (UDF_I_ALLOCTYPE(inode)) + { + case ICB_FLAG_AD_SHORT: + { + sad = (short_ad *)((bh)->b_data + *extoffset); + sad->extLength = cpu_to_le32(elen); + sad->extPosition = cpu_to_le32(eloc.logicalBlockNum); + break; + } + case ICB_FLAG_AD_LONG: + { + lad = (long_ad *)((bh)->b_data + *extoffset); + lad->extLength = cpu_to_le32(elen); + lad->extLocation = cpu_to_lelb(eloc); + memset(lad->impUse, 0x00, sizeof(lad->impUse)); + break; + } + } + + if (memcmp(&UDF_I_LOCATION(inode), &bloc, sizeof(lb_addr))) + { + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201) + { + struct AllocExtDesc *aed = (struct AllocExtDesc *)(bh)->b_data; + udf_update_tag((bh)->b_data, + le32_to_cpu(aed->lengthAllocDescs) + sizeof(struct AllocExtDesc)); + } + mark_buffer_dirty_inode(bh, inode); + } + else + { + mark_inode_dirty(inode); + mark_buffer_dirty(bh); + } + + if (inc) + *extoffset += adsize; + udf_release_data(bh); + return (elen >> 30); +} + +Sint8 udf_next_aext(struct inode *inode, lb_addr *bloc, int *extoffset, + lb_addr *eloc, Uint32 *elen, struct buffer_head **bh, int inc) +{ + Uint16 tagIdent; + int pos, alen; + Sint8 etype; + + if (!(*bh)) + { + if (!(*bh = udf_tread(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, *bloc, 0), + inode->i_sb->s_blocksize))) + { + udf_debug("reading block %d failed!\n", + udf_get_lb_pblock(inode->i_sb, *bloc, 0)); + return -1; + } + } + + tagIdent = ((tag *)(*bh)->b_data)->tagIdent; + + if (!memcmp(&UDF_I_LOCATION(inode), bloc, sizeof(lb_addr))) + { + if (tagIdent == TID_FILE_ENTRY || tagIdent == TID_EXTENDED_FILE_ENTRY || + UDF_I_NEW_INODE(inode)) + { + pos = udf_file_entry_alloc_offset(inode); + alen = UDF_I_LENALLOC(inode) + pos; + } + else if (tagIdent == TID_UNALLOCATED_SPACE_ENTRY) + { + pos = sizeof(struct UnallocatedSpaceEntry); + alen = UDF_I_LENALLOC(inode) + pos; + } + else + return -1; + } + else if (tagIdent == TID_ALLOC_EXTENT_DESC) + { + struct AllocExtDesc *aed = (struct AllocExtDesc *)(*bh)->b_data; + + pos = sizeof(struct AllocExtDesc); + alen = le32_to_cpu(aed->lengthAllocDescs) + pos; + } + else + return -1; + + if (!(*extoffset)) + *extoffset = pos; + + switch (UDF_I_ALLOCTYPE(inode)) + { + case ICB_FLAG_AD_SHORT: + { + short_ad *sad; + + if (!(sad = udf_get_fileshortad((*bh)->b_data, alen, extoffset, inc))) + return -1; + + if ((etype = le32_to_cpu(sad->extLength) >> 30) == EXTENT_NEXT_EXTENT_ALLOCDECS) + { + bloc->logicalBlockNum = le32_to_cpu(sad->extPosition); + *extoffset = 0; + udf_release_data(*bh); + *bh = NULL; + return udf_next_aext(inode, bloc, extoffset, eloc, elen, bh, inc); + } + else + { + eloc->logicalBlockNum = le32_to_cpu(sad->extPosition); + eloc->partitionReferenceNum = 
UDF_I_LOCATION(inode).partitionReferenceNum; + *elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK; + } + break; + } + case ICB_FLAG_AD_LONG: + { + long_ad *lad; + + if (!(lad = udf_get_filelongad((*bh)->b_data, alen, extoffset, inc))) + return -1; + + if ((etype = le32_to_cpu(lad->extLength) >> 30) == EXTENT_NEXT_EXTENT_ALLOCDECS) + { + *bloc = lelb_to_cpu(lad->extLocation); + *extoffset = 0; + udf_release_data(*bh); + *bh = NULL; + return udf_next_aext(inode, bloc, extoffset, eloc, elen, bh, inc); + } + else + { + *eloc = lelb_to_cpu(lad->extLocation); + *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; + } + break; + } + case ICB_FLAG_AD_IN_ICB: + { + if (UDF_I_LENALLOC(inode) == 0) + return -1; + etype = EXTENT_RECORDED_ALLOCATED; + *eloc = UDF_I_LOCATION(inode); + *elen = UDF_I_LENALLOC(inode); + break; + } + default: + { + udf_debug("alloc_type = %d unsupported\n", UDF_I_ALLOCTYPE(inode)); + return -1; + } + } + if (*elen) + return etype; + + udf_debug("Empty Extent, inode=%ld, alloctype=%d, eloc=%d, elen=%d, etype=%d, extoffset=%d\n", + inode->i_ino, UDF_I_ALLOCTYPE(inode), eloc->logicalBlockNum, *elen, etype, *extoffset); + if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_SHORT) + *extoffset -= sizeof(short_ad); + else if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_LONG) + *extoffset -= sizeof(long_ad); + return -1; +} + +Sint8 udf_current_aext(struct inode *inode, lb_addr *bloc, int *extoffset, + lb_addr *eloc, Uint32 *elen, struct buffer_head **bh, int inc) +{ + int pos, alen; + Sint8 etype; + + if (!(*bh)) + { + if (!(*bh = udf_tread(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, *bloc, 0), + inode->i_sb->s_blocksize))) + { + udf_debug("reading block %d failed!\n", + udf_get_lb_pblock(inode->i_sb, *bloc, 0)); + return -1; + } + } + + if (!memcmp(&UDF_I_LOCATION(inode), bloc, sizeof(lb_addr))) + { + if (!(UDF_I_EXTENDED_FE(inode))) + pos = sizeof(struct FileEntry) + UDF_I_LENEATTR(inode); + else + pos = sizeof(struct ExtendedFileEntry) + UDF_I_LENEATTR(inode); + alen = UDF_I_LENALLOC(inode) + pos; + } + else + { + struct AllocExtDesc *aed = (struct AllocExtDesc *)(*bh)->b_data; + + pos = sizeof(struct AllocExtDesc); + alen = le32_to_cpu(aed->lengthAllocDescs) + pos; + } + + if (!(*extoffset)) + *extoffset = pos; + + switch (UDF_I_ALLOCTYPE(inode)) + { + case ICB_FLAG_AD_SHORT: + { + short_ad *sad; + + if (!(sad = udf_get_fileshortad((*bh)->b_data, alen, extoffset, inc))) + return -1; + + etype = le32_to_cpu(sad->extLength) >> 30; + eloc->logicalBlockNum = le32_to_cpu(sad->extPosition); + eloc->partitionReferenceNum = UDF_I_LOCATION(inode).partitionReferenceNum; + *elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK; + break; + } + case ICB_FLAG_AD_LONG: + { + long_ad *lad; + + if (!(lad = udf_get_filelongad((*bh)->b_data, alen, extoffset, inc))) + return -1; + + etype = le32_to_cpu(lad->extLength) >> 30; + *eloc = lelb_to_cpu(lad->extLocation); + *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; + break; + } + default: + { + udf_debug("alloc_type = %d unsupported\n", UDF_I_ALLOCTYPE(inode)); + return -1; + } + } + if (*elen) + return etype; + + udf_debug("Empty Extent!\n"); + if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_SHORT) + *extoffset -= sizeof(short_ad); + else if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_LONG) + *extoffset -= sizeof(long_ad); + return -1; +} + +Sint8 udf_insert_aext(struct inode *inode, lb_addr bloc, int extoffset, + lb_addr neloc, Uint32 nelen, struct buffer_head *bh) +{ + lb_addr oeloc; + Uint32 oelen; + Sint8 etype; + + if (!bh) + 
{ + if (!(bh = udf_tread(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, bloc, 0), + inode->i_sb->s_blocksize))) + { + udf_debug("reading block %d failed!\n", + udf_get_lb_pblock(inode->i_sb, bloc, 0)); + return -1; + } + } + else + atomic_inc(&bh->b_count); + + while ((etype = udf_next_aext(inode, &bloc, &extoffset, &oeloc, &oelen, &bh, 0)) != -1) + { + udf_write_aext(inode, bloc, &extoffset, neloc, nelen, bh, 1); + + neloc = oeloc; + nelen = (etype << 30) | oelen; + } + udf_add_aext(inode, &bloc, &extoffset, neloc, nelen, &bh, 1); + udf_release_data(bh); + return (nelen >> 30); +} + +Sint8 udf_delete_aext(struct inode *inode, lb_addr nbloc, int nextoffset, + lb_addr eloc, Uint32 elen, struct buffer_head *nbh) +{ + struct buffer_head *obh; + lb_addr obloc; + int oextoffset, adsize; + Sint8 etype; + struct AllocExtDesc *aed; + + if (!(nbh)) + { + if (!(nbh = udf_tread(inode->i_sb, + udf_get_lb_pblock(inode->i_sb, nbloc, 0), + inode->i_sb->s_blocksize))) + { + udf_debug("reading block %d failed!\n", + udf_get_lb_pblock(inode->i_sb, nbloc, 0)); + return -1; + } + } + else + atomic_inc(&nbh->b_count); + atomic_inc(&nbh->b_count); + + if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_SHORT) + adsize = sizeof(short_ad); + else if (UDF_I_ALLOCTYPE(inode) == ICB_FLAG_AD_LONG) + adsize = sizeof(long_ad); + else + adsize = 0; + + obh = nbh; + obloc = nbloc; + oextoffset = nextoffset; + + if (udf_next_aext(inode, &nbloc, &nextoffset, &eloc, &elen, &nbh, 1) == -1) + return -1; + + while ((etype = udf_next_aext(inode, &nbloc, &nextoffset, &eloc, &elen, &nbh, 1)) != -1) + { + udf_write_aext(inode, obloc, &oextoffset, eloc, (etype << 30) | elen, obh, 1); + if (memcmp(&nbloc, &obloc, sizeof(lb_addr))) + { + obloc = nbloc; + udf_release_data(obh); + atomic_inc(&nbh->b_count); + obh = nbh; + oextoffset = nextoffset - adsize; + } + } + memset(&eloc, 0x00, sizeof(lb_addr)); + elen = 0; + + if (memcmp(&nbloc, &obloc, sizeof(lb_addr))) + { + udf_free_blocks(inode->i_sb, inode, nbloc, 0, 1); + udf_write_aext(inode, obloc, &oextoffset, eloc, elen, obh, 1); + udf_write_aext(inode, obloc, &oextoffset, eloc, elen, obh, 1); + if (!memcmp(&UDF_I_LOCATION(inode), &obloc, sizeof(lb_addr))) + { + UDF_I_LENALLOC(inode) -= (adsize * 2); + mark_inode_dirty(inode); + } + else + { + aed = (struct AllocExtDesc *)(obh)->b_data; + aed->lengthAllocDescs = + cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) - (2*adsize)); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201) + udf_update_tag((obh)->b_data, oextoffset - (2*adsize)); + else + udf_update_tag((obh)->b_data, sizeof(struct AllocExtDesc)); + mark_buffer_dirty_inode(obh, inode); + } + } + else + { + udf_write_aext(inode, obloc, &oextoffset, eloc, elen, obh, 1); + if (!memcmp(&UDF_I_LOCATION(inode), &obloc, sizeof(lb_addr))) + { + UDF_I_LENALLOC(inode) -= adsize; + mark_inode_dirty(inode); + } + else + { + aed = (struct AllocExtDesc *)(obh)->b_data; + aed->lengthAllocDescs = + cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) - adsize); + if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB_UDFREV(inode->i_sb) >= 0x0201) + udf_update_tag((obh)->b_data, oextoffset - adsize); + else + udf_update_tag((obh)->b_data, sizeof(struct AllocExtDesc)); + mark_buffer_dirty_inode(obh, inode); + } + } + + udf_release_data(nbh); + udf_release_data(obh); + return (elen >> 30); +} + +Sint8 inode_bmap(struct inode *inode, int block, lb_addr *bloc, Uint32 *extoffset, + lb_addr *eloc, Uint32 *elen, Uint32 *offset, struct buffer_head **bh) +{ + Uint64 lbcount = 
0, bcount = (Uint64)block << inode->i_sb->s_blocksize_bits; + Sint8 etype; + + if (block < 0) + { + printk(KERN_ERR "udf: inode_bmap: block < 0\n"); + return -1; + } + if (!inode) + { + printk(KERN_ERR "udf: inode_bmap: NULL inode\n"); + return -1; + } + + *extoffset = 0; + *elen = 0; + *bloc = UDF_I_LOCATION(inode); + + do + { + if ((etype = udf_next_aext(inode, bloc, extoffset, eloc, elen, bh, 1)) == -1) + { + *offset = bcount - lbcount; + UDF_I_LENEXTENTS(inode) = lbcount; + return -1; + } + lbcount += *elen; + } while (lbcount <= bcount); + + *offset = bcount + *elen - lbcount; + + return etype; +} + +long udf_block_map(struct inode *inode, long block) +{ + lb_addr eloc, bloc; + Uint32 offset, extoffset, elen; + struct buffer_head *bh = NULL; + int ret; + + lock_kernel(); + + if (inode_bmap(inode, block, &bloc, &extoffset, &eloc, &elen, &offset, &bh) == EXTENT_RECORDED_ALLOCATED) + ret = udf_get_lb_pblock(inode->i_sb, eloc, offset >> inode->i_sb->s_blocksize_bits); + else + ret = 0; + + unlock_kernel(); + + if (bh) + udf_release_data(bh); + + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) + return udf_fixed_to_variable(ret); + else + return ret; +} diff -urN linux-2.4.16-reiserfspatches-immutable/fs/ufs/truncate.c~ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ufs/truncate.c~ --- linux-2.4.16-reiserfspatches-immutable/fs/ufs/truncate.c~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/fs/ufs/truncate.c~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,469 @@ +/* + * linux/fs/ufs/truncate.c + * + * Copyright (C) 1998 + * Daniel Pirkl + * Charles University, Faculty of Mathematics and Physics + * + * from + * + * linux/fs/ext2/truncate.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/truncate.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + +/* + * Real random numbers for secure rm added 94/02/18 + * Idea from Pierre del Perugia + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "swab.h" +#include "util.h" + +#undef UFS_TRUNCATE_DEBUG + +#ifdef UFS_TRUNCATE_DEBUG +#define UFSD(x) printk("(%s, %d), %s: ", __FILE__, __LINE__, __FUNCTION__); printk x; +#else +#define UFSD(x) +#endif + +/* + * Secure deletion currently doesn't work. It interacts very badly + * with buffers shared with memory mappings, and for that reason + * can't be done in the truncate() routines. It should instead be + * done separately in "release()" before calling the truncate routines + * that will release the actual file blocks. 
+ * + * Linus + */ + +#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) +#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) + +#define DATA_BUFFER_USED(bh) \ + (atomic_read(&bh->b_count)>1 || buffer_locked(bh)) + +static int ufs_trunc_direct (struct inode * inode) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct buffer_head * bh; + u32 * p; + unsigned frag1, frag2, frag3, frag4, block1, block2; + unsigned frag_to_free, free_count; + unsigned i, j, tmp; + int retry; + + UFSD(("ENTER\n")) + + sb = inode->i_sb; + uspi = sb->u.ufs_sb.s_uspi; + + frag_to_free = 0; + free_count = 0; + retry = 0; + + frag1 = DIRECT_FRAGMENT; + frag4 = min_t(u32, UFS_NDIR_FRAGMENT, inode->u.ufs_i.i_lastfrag); + frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1); + frag3 = frag4 & ~uspi->s_fpbmask; + block1 = block2 = 0; + if (frag2 > frag3) { + frag2 = frag4; + frag3 = frag4 = 0; + } + else if (frag2 < frag3) { + block1 = ufs_fragstoblks (frag2); + block2 = ufs_fragstoblks (frag3); + } + + UFSD(("frag1 %u, frag2 %u, block1 %u, block2 %u, frag3 %u, frag4 %u\n", frag1, frag2, block1, block2, frag3, frag4)) + + if (frag1 >= frag2) + goto next1; + + /* + * Free first free fragments + */ + p = inode->u.ufs_i.i_u1.i_data + ufs_fragstoblks (frag1); + tmp = fs32_to_cpu(sb, *p); + if (!tmp ) + ufs_panic (sb, "ufs_trunc_direct", "internal error"); + frag1 = ufs_fragnum (frag1); + frag2 = ufs_fragnum (frag2); + for (j = frag1; j < frag2; j++) { + bh = get_hash_table (sb->s_dev, tmp + j, uspi->s_fsize); + if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { + retry = 1; + brelse (bh); + goto next1; + } + bforget (bh); + } + inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift; + mark_inode_dirty(inode); + ufs_free_fragments (inode, tmp + frag1, frag2 - frag1); + frag_to_free = tmp + frag1; + +next1: + /* + * Free whole blocks + */ + for (i = block1 ; i < block2; i++) { + p = inode->u.ufs_i.i_u1.i_data + i; + tmp = fs32_to_cpu(sb, *p); + if (!tmp) + continue; + for (j = 0; j < uspi->s_fpb; j++) { + bh = get_hash_table (sb->s_dev, tmp + j, uspi->s_fsize); + if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { + retry = 1; + brelse (bh); + goto next2; + } + bforget (bh); + } + *p = 0; + inode->i_blocks -= uspi->s_nspb; + mark_inode_dirty(inode); + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } +next2:; + } + + if (free_count > 0) + ufs_free_blocks (inode, frag_to_free, free_count); + + if (frag3 >= frag4) + goto next3; + + /* + * Free last free fragments + */ + p = inode->u.ufs_i.i_u1.i_data + ufs_fragstoblks (frag3); + tmp = fs32_to_cpu(sb, *p); + if (!tmp ) + ufs_panic(sb, "ufs_truncate_direct", "internal error"); + frag4 = ufs_fragnum (frag4); + for (j = 0; j < frag4; j++) { + bh = get_hash_table (sb->s_dev, tmp + j, uspi->s_fsize); + if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { + retry = 1; + brelse (bh); + goto next1; + } + bforget (bh); + } + *p = 0; + inode->i_blocks -= frag4 << uspi->s_nspfshift; + mark_inode_dirty(inode); + ufs_free_fragments (inode, tmp, frag4); + next3: + + UFSD(("EXIT\n")) + return retry; +} + + +static int ufs_trunc_indirect (struct inode * inode, unsigned offset, u32 * p) +{ + struct super_block * sb; + struct 
ufs_sb_private_info * uspi; + struct ufs_buffer_head * ind_ubh; + struct buffer_head * bh; + u32 * ind; + unsigned indirect_block, i, j, tmp; + unsigned frag_to_free, free_count; + int retry; + + UFSD(("ENTER\n")) + + sb = inode->i_sb; + uspi = sb->u.ufs_sb.s_uspi; + + frag_to_free = 0; + free_count = 0; + retry = 0; + + tmp = fs32_to_cpu(sb, *p); + if (!tmp) + return 0; + ind_ubh = ubh_bread (sb->s_dev, tmp, uspi->s_bsize); + if (tmp != fs32_to_cpu(sb, *p)) { + ubh_brelse (ind_ubh); + return 1; + } + if (!ind_ubh) { + *p = 0; + return 0; + } + + indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; + for (i = indirect_block; i < uspi->s_apb; i++) { + ind = ubh_get_addr32 (ind_ubh, i); + tmp = fs32_to_cpu(sb, *ind); + if (!tmp) + continue; + for (j = 0; j < uspi->s_fpb; j++) { + bh = get_hash_table (sb->s_dev, tmp + j, uspi->s_fsize); + if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *ind)) { + retry = 1; + brelse (bh); + goto next; + } + bforget (bh); + } + *ind = 0; + ubh_mark_buffer_dirty(ind_ubh); + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + inode->i_blocks -= uspi->s_nspb; + mark_inode_dirty(inode); +next:; + } + + if (free_count > 0) { + ufs_free_blocks (inode, frag_to_free, free_count); + } + for (i = 0; i < uspi->s_apb; i++) + if (*ubh_get_addr32(ind_ubh,i)) + break; + if (i >= uspi->s_apb) { + if (ubh_max_bcount(ind_ubh) != 1) { + retry = 1; + } + else { + tmp = fs32_to_cpu(sb, *p); + *p = 0; + inode->i_blocks -= uspi->s_nspb; + mark_inode_dirty(inode); + ufs_free_blocks (inode, tmp, uspi->s_fpb); + ubh_bforget(ind_ubh); + ind_ubh = NULL; + } + } + if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) { + ubh_ll_rw_block (WRITE, 1, &ind_ubh); + ubh_wait_on_buffer (ind_ubh); + } + ubh_brelse (ind_ubh); + + UFSD(("EXIT\n")) + + return retry; +} + +static int ufs_trunc_dindirect (struct inode * inode, unsigned offset, u32 * p) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head * dind_bh; + unsigned i, tmp, dindirect_block; + u32 * dind; + int retry = 0; + + UFSD(("ENTER\n")) + + sb = inode->i_sb; + uspi = sb->u.ufs_sb.s_uspi; + + dindirect_block = (DIRECT_BLOCK > offset) + ? 
((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; + retry = 0; + + tmp = fs32_to_cpu(sb, *p); + if (!tmp) + return 0; + dind_bh = ubh_bread (inode->i_dev, tmp, uspi->s_bsize); + if (tmp != fs32_to_cpu(sb, *p)) { + ubh_brelse (dind_bh); + return 1; + } + if (!dind_bh) { + *p = 0; + return 0; + } + + for (i = dindirect_block ; i < uspi->s_apb ; i++) { + dind = ubh_get_addr32 (dind_bh, i); + tmp = fs32_to_cpu(sb, *dind); + if (!tmp) + continue; + retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); + ubh_mark_buffer_dirty(dind_bh); + } + + for (i = 0; i < uspi->s_apb; i++) + if (*ubh_get_addr32 (dind_bh, i)) + break; + if (i >= uspi->s_apb) { + if (ubh_max_bcount(dind_bh) != 1) + retry = 1; + else { + tmp = fs32_to_cpu(sb, *p); + *p = 0; + inode->i_blocks -= uspi->s_nspb; + mark_inode_dirty(inode); + ufs_free_blocks (inode, tmp, uspi->s_fpb); + ubh_bforget(dind_bh); + dind_bh = NULL; + } + } + if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) { + ubh_ll_rw_block (WRITE, 1, &dind_bh); + ubh_wait_on_buffer (dind_bh); + } + ubh_brelse (dind_bh); + + UFSD(("EXIT\n")) + + return retry; +} + +static int ufs_trunc_tindirect (struct inode * inode) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head * tind_bh; + unsigned tindirect_block, tmp, i; + u32 * tind, * p; + int retry; + + UFSD(("ENTER\n")) + + sb = inode->i_sb; + uspi = sb->u.ufs_sb.s_uspi; + retry = 0; + + tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) + ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; + p = inode->u.ufs_i.i_u1.i_data + UFS_TIND_BLOCK; + if (!(tmp = fs32_to_cpu(sb, *p))) + return 0; + tind_bh = ubh_bread (sb->s_dev, tmp, uspi->s_bsize); + if (tmp != fs32_to_cpu(sb, *p)) { + ubh_brelse (tind_bh); + return 1; + } + if (!tind_bh) { + *p = 0; + return 0; + } + + for (i = tindirect_block ; i < uspi->s_apb ; i++) { + tind = ubh_get_addr32 (tind_bh, i); + retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + + uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); + ubh_mark_buffer_dirty(tind_bh); + } + for (i = 0; i < uspi->s_apb; i++) + if (*ubh_get_addr32 (tind_bh, i)) + break; + if (i >= uspi->s_apb) { + if (ubh_max_bcount(tind_bh) != 1) + retry = 1; + else { + tmp = fs32_to_cpu(sb, *p); + *p = 0; + inode->i_blocks -= uspi->s_nspb; + mark_inode_dirty(inode); + ufs_free_blocks (inode, tmp, uspi->s_fpb); + ubh_bforget(tind_bh); + tind_bh = NULL; + } + } + if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) { + ubh_ll_rw_block (WRITE, 1, &tind_bh); + ubh_wait_on_buffer (tind_bh); + } + ubh_brelse (tind_bh); + + UFSD(("EXIT\n")) + return retry; +} + +void ufs_truncate (struct inode * inode) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct buffer_head * bh; + unsigned offset; + int err, retry; + + UFSD(("ENTER\n")) + sb = inode->i_sb; + uspi = sb->u.ufs_sb.s_uspi; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE_FILE(inode)) + return; + while (1) { + retry = ufs_trunc_direct(inode); + retry |= ufs_trunc_indirect (inode, UFS_IND_BLOCK, + (u32 *) &inode->u.ufs_i.i_u1.i_data[UFS_IND_BLOCK]); + retry |= ufs_trunc_dindirect (inode, UFS_IND_BLOCK + uspi->s_apb, + (u32 *) &inode->u.ufs_i.i_u1.i_data[UFS_DIND_BLOCK]); + retry |= ufs_trunc_tindirect (inode); + if (!retry) + break; + if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) + ufs_sync_inode (inode); + run_task_queue(&tq_disk); + 
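 + /* some data buffer in the truncated range was still in use or + locked, so this pass could not free every block: yield the + CPU and retry until a pass completes without contention */ 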
current->policy |= SCHED_YIELD; + schedule (); + + + } + offset = inode->i_size & uspi->s_fshift; + if (offset) { + bh = ufs_bread (inode, inode->i_size >> uspi->s_fshift, 0, &err); + if (bh) { + memset (bh->b_data + offset, 0, uspi->s_fsize - offset); + mark_buffer_dirty (bh); + brelse (bh); + } + } + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->u.ufs_i.i_lastfrag = DIRECT_FRAGMENT; + mark_inode_dirty(inode); + UFSD(("EXIT\n")) +} diff -urN linux-2.4.16-reiserfspatches-immutable/include/asm-i386/unistd.h linux-2.4.16-reiserfspatches-immutable-ctx4/include/asm-i386/unistd.h --- linux-2.4.16-reiserfspatches-immutable/include/asm-i386/unistd.h Mon Dec 10 13:12:23 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/include/asm-i386/unistd.h Mon Dec 10 15:01:49 2001 @@ -230,6 +230,8 @@ #define __NR_security 223 /* syscall for security modules */ #define __NR_gettid 224 #define __NR_readahead 225 +#define __NR_new_s_context 226 +#define __NR_set_ipv4root 227 /* user-visible error numbers are in the range -1 - -124: see */ diff -urN linux-2.4.16-reiserfspatches-immutable/include/linux/capability.h linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/capability.h --- linux-2.4.16-reiserfspatches-immutable/include/linux/capability.h Mon Dec 10 13:12:49 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/capability.h Mon Dec 10 15:01:49 2001 @@ -231,6 +231,7 @@ /* Allow enabling/disabling tagged queuing on SCSI controllers and sending arbitrary SCSI commands */ /* Allow setting encryption key on loopback filesystem */ +/* Allow the selection of a security context */ #define CAP_SYS_ADMIN 21 @@ -278,6 +279,10 @@ /* Allow taking of leases on files */ #define CAP_LEASE 28 + +/* Allow opening special device file */ + +#define CAP_OPENDEV 29 #ifdef __KERNEL__ /* diff -urN linux-2.4.16-reiserfspatches-immutable/include/linux/ext2_fs.h~ linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/ext2_fs.h~ --- linux-2.4.16-reiserfspatches-immutable/include/linux/ext2_fs.h~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/ext2_fs.h~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,633 @@ +/* + * linux/include/linux/ext2_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _LINUX_EXT2_FS_H +#define _LINUX_EXT2_FS_H + +#include + +/* + * The second extended filesystem constants/structures + */ + +/* + * Define EXT2FS_DEBUG to produce debug messages + */ +#undef EXT2FS_DEBUG + +/* + * Define EXT2_PREALLOCATE to preallocate data blocks for expanding files + */ +#define EXT2_PREALLOCATE +#define EXT2_DEFAULT_PREALLOC_BLOCKS 8 + +/* + * The second extended file system version + */ +#define EXT2FS_DATE "95/08/09" +#define EXT2FS_VERSION "0.5b" + +/* + * Debug code + */ +#ifdef EXT2FS_DEBUG +# define ext2_debug(f, a...) { \ + printk ("EXT2-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __FUNCTION__); \ + printk (f, ## a); \ + } +#else +# define ext2_debug(f, a...) 
/**/ +#endif + +/* + * Special inode numbers + */ +#define EXT2_BAD_INO 1 /* Bad blocks inode */ +#define EXT2_ROOT_INO 2 /* Root inode */ +#define EXT2_ACL_IDX_INO 3 /* ACL inode */ +#define EXT2_ACL_DATA_INO 4 /* ACL inode */ +#define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */ + +/* First non-reserved inode for old ext2 filesystems */ +#define EXT2_GOOD_OLD_FIRST_INO 11 + +/* + * The second extended file system magic number + */ +#define EXT2_SUPER_MAGIC 0xEF53 + +/* + * Maximal count of links to a file + */ +#define EXT2_LINK_MAX 32000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT2_MIN_BLOCK_SIZE 1024 +#define EXT2_MAX_BLOCK_SIZE 4096 +#define EXT2_MIN_BLOCK_LOG_SIZE 10 +#ifdef __KERNEL__ +# define EXT2_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT2_BLOCK_SIZE(s) (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry)) +#define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32)) +#ifdef __KERNEL__ +# define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#else +# define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT2_ADDR_PER_BLOCK_BITS(s) ((s)->u.ext2_sb.s_addr_per_block_bits) +#define EXT2_INODE_SIZE(s) ((s)->u.ext2_sb.s_inode_size) +#define EXT2_FIRST_INO(s) ((s)->u.ext2_sb.s_first_ino) +#else +#define EXT2_INODE_SIZE(s) (((s)->s_rev_level == EXT2_GOOD_OLD_REV) ? \ + EXT2_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT2_FIRST_INO(s) (((s)->s_rev_level == EXT2_GOOD_OLD_REV) ? \ + EXT2_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif + +/* + * Macro-instructions used to manage fragments + */ +#define EXT2_MIN_FRAG_SIZE 1024 +#define EXT2_MAX_FRAG_SIZE 4096 +#define EXT2_MIN_FRAG_LOG_SIZE 10 +#ifdef __KERNEL__ +# define EXT2_FRAG_SIZE(s) ((s)->u.ext2_sb.s_frag_size) +# define EXT2_FRAGS_PER_BLOCK(s) ((s)->u.ext2_sb.s_frags_per_block) +#else +# define EXT2_FRAG_SIZE(s) (EXT2_MIN_FRAG_SIZE << (s)->s_log_frag_size) +# define EXT2_FRAGS_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / EXT2_FRAG_SIZE(s)) +#endif + +/* + * ACL structures + */ +struct ext2_acl_header /* Header of Access Control Lists */ +{ + __u32 aclh_size; + __u32 aclh_file_count; + __u32 aclh_acle_count; + __u32 aclh_first_acle; +}; + +struct ext2_acl_entry /* Access Control List Entry */ +{ + __u32 acle_size; + __u16 acle_perms; /* Access permissions */ + __u16 acle_type; /* Type of entry */ + __u16 acle_tag; /* User or group identity */ + __u16 acle_pad1; + __u32 acle_next; /* Pointer on next entry for the */ + /* same inode or on next free entry */ +}; + +/* + * Structure of a blocks group descriptor + */ +struct ext2_group_desc +{ + __u32 bg_block_bitmap; /* Blocks bitmap block */ + __u32 bg_inode_bitmap; /* Inodes bitmap block */ + __u32 bg_inode_table; /* Inodes table block */ + __u16 bg_free_blocks_count; /* Free blocks count */ + __u16 bg_free_inodes_count; /* Free inodes count */ + __u16 bg_used_dirs_count; /* Directories count */ + __u16 bg_pad; + __u32 bg_reserved[3]; +}; + +/* + * Macro-instructions used to manage group descriptors + */ +#ifdef __KERNEL__ +# define EXT2_BLOCKS_PER_GROUP(s) ((s)->u.ext2_sb.s_blocks_per_group) +# define EXT2_DESC_PER_BLOCK(s) ((s)->u.ext2_sb.s_desc_per_block) +# define EXT2_INODES_PER_GROUP(s) ((s)->u.ext2_sb.s_inodes_per_group) +# define EXT2_DESC_PER_BLOCK_BITS(s) ((s)->u.ext2_sb.s_desc_per_block_bits) +#else +# define 
EXT2_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT2_DESC_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_group_desc)) +# define EXT2_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT2_NDIR_BLOCKS 12 +#define EXT2_IND_BLOCK EXT2_NDIR_BLOCKS +#define EXT2_DIND_BLOCK (EXT2_IND_BLOCK + 1) +#define EXT2_TIND_BLOCK (EXT2_DIND_BLOCK + 1) +#define EXT2_N_BLOCKS (EXT2_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT2_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT2_UNRM_FL 0x00000002 /* Undelete */ +#define EXT2_COMPR_FL 0x00000004 /* Compress file */ +#define EXT2_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT2_IMMUTABLE_FILE_FL 0x00000010 /* Immutable file */ +#define EXT2_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT2_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT2_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT2_DIRTY_FL 0x00000100 +#define EXT2_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT2_NOCOMP_FL 0x00000400 /* Don't compress */ +#define EXT2_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT2_BTREE_FL 0x00001000 /* btree format dir */ +#define EXT2_IMMUTABLE_LINK_FL 0x00008000 /* Immutable link */ +#define EXT2_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ + +#define EXT2_FL_USER_VISIBLE 0x00009FFF /* User visible flags */ +#define EXT2_FL_USER_MODIFIABLE 0x000080FF /* User modifiable flags */ + +/* + * ioctl commands + */ +#define EXT2_IOC_GETFLAGS _IOR('f', 1, long) +#define EXT2_IOC_SETFLAGS _IOW('f', 2, long) +#define EXT2_IOC_GETVERSION _IOR('v', 1, long) +#define EXT2_IOC_SETVERSION _IOW('v', 2, long) + +/* + * Structure of an inode on the disk + */ +struct ext2_inode { + __u16 i_mode; /* File mode */ + __u16 i_uid; /* Low 16 bits of Owner Uid */ + __u32 i_size; /* Size in bytes */ + __u32 i_atime; /* Access time */ + __u32 i_ctime; /* Creation time */ + __u32 i_mtime; /* Modification time */ + __u32 i_dtime; /* Deletion Time */ + __u16 i_gid; /* Low 16 bits of Group Id */ + __u16 i_links_count; /* Links count */ + __u32 i_blocks; /* Blocks count */ + __u32 i_flags; /* File flags */ + union { + struct { + __u32 l_i_reserved1; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __u32 i_block[EXT2_N_BLOCKS];/* Pointers to blocks */ + __u32 i_generation; /* File version (for NFS) */ + __u32 i_file_acl; /* File ACL */ + __u32 i_dir_acl; /* Directory ACL */ + __u32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __u16 l_i_uid_high; /* these 2 fields */ + __u16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ +}; + +#define i_size_high i_dir_acl + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_frag osd2.linux2.l_i_frag +#define i_fsize osd2.linux2.l_i_fsize +#define i_uid_low i_uid 
+#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 +#endif + +#ifdef __hurd__ +#define i_translator osd1.hurd1.h_i_translator +#define i_frag osd2.hurd2.h_i_frag; +#define i_fsize osd2.hurd2.h_i_fsize; +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author +#endif + +#ifdef __masix__ +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_frag osd2.masix2.m_i_frag +#define i_fsize osd2.masix2.m_i_fsize +#define i_reserved2 osd2.masix2.m_i_reserved2 +#endif + +/* + * File system states + */ +#define EXT2_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT2_ERROR_FS 0x0002 /* Errors detected */ + +/* + * Mount flags + */ +#define EXT2_MOUNT_CHECK 0x0001 /* Do mount-time checks */ +#define EXT2_MOUNT_GRPID 0x0004 /* Create files with directory's group */ +#define EXT2_MOUNT_DEBUG 0x0008 /* Some debugging messages */ +#define EXT2_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ +#define EXT2_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ +#define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ +#define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ +#define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */ + +#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt +#define set_opt(o, opt) o |= EXT2_MOUNT_##opt +#define test_opt(sb, opt) ((sb)->u.ext2_sb.s_mount_opt & \ + EXT2_MOUNT_##opt) +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT2_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT2_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT2_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT2_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT2_ERRORS_PANIC 3 /* Panic */ +#define EXT2_ERRORS_DEFAULT EXT2_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext2_super_block { + __u32 s_inodes_count; /* Inodes count */ + __u32 s_blocks_count; /* Blocks count */ + __u32 s_r_blocks_count; /* Reserved blocks count */ + __u32 s_free_blocks_count; /* Free blocks count */ + __u32 s_free_inodes_count; /* Free inodes count */ + __u32 s_first_data_block; /* First Data Block */ + __u32 s_log_block_size; /* Block size */ + __s32 s_log_frag_size; /* Fragment size */ + __u32 s_blocks_per_group; /* # Blocks per group */ + __u32 s_frags_per_group; /* # Fragments per group */ + __u32 s_inodes_per_group; /* # Inodes per group */ + __u32 s_mtime; /* Mount time */ + __u32 s_wtime; /* Write time */ + __u16 s_mnt_count; /* Mount count */ + __s16 s_max_mnt_count; /* Maximal mount count */ + __u16 s_magic; /* Magic signature */ + __u16 s_state; /* File system state */ + __u16 s_errors; /* Behaviour when detecting errors */ + __u16 s_minor_rev_level; /* minor revision level */ + __u32 s_lastcheck; /* time of last check */ + __u32 s_checkinterval; /* max. time between checks */ + __u32 s_creator_os; /* OS */ + __u32 s_rev_level; /* Revision level */ + __u16 s_def_resuid; /* Default uid for reserved blocks */ + __u16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT2_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. 
+ * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __u32 s_first_ino; /* First non-reserved inode */ + __u16 s_inode_size; /* size of inode structure */ + __u16 s_block_group_nr; /* block group # of this superblock */ + __u32 s_feature_compat; /* compatible feature set */ + __u32 s_feature_incompat; /* incompatible feature set */ + __u32 s_feature_ro_compat; /* readonly-compatible feature set */ + __u8 s_uuid[16]; /* 128-bit uuid for volume */ + char s_volume_name[16]; /* volume name */ + char s_last_mounted[64]; /* directory where last mounted */ + __u32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT2_COMPAT_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __u16 s_padding1; + __u32 s_reserved[204]; /* Padding to the end of the block */ +}; + +#ifdef __KERNEL__ +#define EXT2_SB(sb) (&((sb)->u.ext2_sb)) +#else +/* Assume that user mode programs are passing in an ext2fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. */ +#define EXT2_SB(sb) (sb) +#endif + +/* + * Codes for operating systems + */ +#define EXT2_OS_LINUX 0 +#define EXT2_OS_HURD 1 +#define EXT2_OS_MASIX 2 +#define EXT2_OS_FREEBSD 3 +#define EXT2_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT2_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT2_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT2_CURRENT_REV EXT2_GOOD_OLD_REV +#define EXT2_MAX_SUPP_REV EXT2_DYNAMIC_REV + +#define EXT2_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT2_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT2_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT2_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT2_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT2_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT2_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT2_SET_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT2_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT2_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT2_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT2_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT2_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT2_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT2_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT2_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT2_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT2_FEATURE_COMPAT_RESIZE_INO 0x0010 +#define EXT2_FEATURE_COMPAT_DIR_INDEX 0x0020 +#define EXT2_FEATURE_COMPAT_ANY 0xffffffff + +#define EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT2_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT2_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT2_FEATURE_RO_COMPAT_ANY 0xffffffff + +#define EXT2_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT2_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define 
EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 +#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 +#define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff + +#define EXT2_FEATURE_COMPAT_SUPP 0 +#define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE +#define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT2_FEATURE_RO_COMPAT_BTREE_DIR) +#define EXT2_FEATURE_RO_COMPAT_UNSUPPORTED ~EXT2_FEATURE_RO_COMPAT_SUPP +#define EXT2_FEATURE_INCOMPAT_UNSUPPORTED ~EXT2_FEATURE_INCOMPAT_SUPP + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT2_DEF_RESUID 0 +#define EXT2_DEF_RESGID 0 + +/* + * Structure of a directory entry + */ +#define EXT2_NAME_LEN 255 + +struct ext2_dir_entry { + __u32 inode; /* Inode number */ + __u16 rec_len; /* Directory entry length */ + __u16 name_len; /* Name length */ + char name[EXT2_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT2 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext2_dir_entry_2 { + __u32 inode; /* Inode number */ + __u16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT2_NAME_LEN]; /* File name */ +}; + +/* + * Ext2 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +enum { + EXT2_FT_UNKNOWN, + EXT2_FT_REG_FILE, + EXT2_FT_DIR, + EXT2_FT_CHRDEV, + EXT2_FT_BLKDEV, + EXT2_FT_FIFO, + EXT2_FT_SOCK, + EXT2_FT_SYMLINK, + EXT2_FT_MAX +}; + +/* + * EXT2_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT2_DIR_PAD 4 +#define EXT2_DIR_ROUND (EXT2_DIR_PAD - 1) +#define EXT2_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT2_DIR_ROUND) & \ + ~EXT2_DIR_ROUND) + +#ifdef __KERNEL__ +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext2 source programs needs to include it so they are duplicated here.
+ */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* balloc.c */ +extern int ext2_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext2_bg_num_gdb(struct super_block *sb, int group); +extern int ext2_new_block (struct inode *, unsigned long, + __u32 *, __u32 *, int *); +extern void ext2_free_blocks (struct inode *, unsigned long, + unsigned long); +extern unsigned long ext2_count_free_blocks (struct super_block *); +extern void ext2_check_blocks_bitmap (struct super_block *); +extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); + +/* dir.c */ +extern int ext2_add_link (struct dentry *, struct inode *); +extern ino_t ext2_inode_by_name(struct inode *, struct dentry *); +extern int ext2_make_empty(struct inode *, struct inode *); +extern struct ext2_dir_entry_2 * ext2_find_entry (struct inode *,struct dentry *, struct page **); +extern int ext2_delete_entry (struct ext2_dir_entry_2 *, struct page *); +extern int ext2_empty_dir (struct inode *); +extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **); +extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *); + +/* fsync.c */ +extern int ext2_sync_file (struct file *, struct dentry *, int); +extern int ext2_fsync_inode (struct inode *, int); + +/* ialloc.c */ +extern struct inode * ext2_new_inode (const struct inode *, int); +extern void ext2_free_inode (struct inode *); +extern unsigned long ext2_count_free_inodes (struct super_block *); +extern void ext2_check_inodes_bitmap (struct super_block *); +extern unsigned long ext2_count_free (struct buffer_head *, unsigned); + +/* inode.c */ +extern void ext2_read_inode (struct inode *); +extern void ext2_write_inode (struct inode *, int); +extern void ext2_put_inode (struct inode *); +extern void ext2_delete_inode (struct inode *); +extern int ext2_sync_inode (struct inode *); +extern void ext2_discard_prealloc (struct inode *); +extern void ext2_truncate (struct inode *); + +/* ioctl.c */ +extern int ext2_ioctl (struct inode *, struct file *, unsigned int, + unsigned long); + +/* super.c */ +extern void ext2_error (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern NORET_TYPE void ext2_panic (struct super_block *, const char *, + const char *, ...) + __attribute__ ((NORET_AND format (printf, 3, 4))); +extern void ext2_warning (struct super_block *, const char *, const char *, ...) 
+ __attribute__ ((format (printf, 3, 4))); +extern void ext2_update_dynamic_rev (struct super_block *sb); +extern void ext2_put_super (struct super_block *); +extern void ext2_write_super (struct super_block *); +extern int ext2_remount (struct super_block *, int *, char *); +extern struct super_block * ext2_read_super (struct super_block *,void *,int); +extern int ext2_statfs (struct super_block *, struct statfs *); + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern struct file_operations ext2_dir_operations; + +/* file.c */ +extern struct inode_operations ext2_file_inode_operations; +extern struct file_operations ext2_file_operations; + +/* inode.c */ +extern struct address_space_operations ext2_aops; + +/* namei.c */ +extern struct inode_operations ext2_dir_inode_operations; + +/* symlink.c */ +extern struct inode_operations ext2_fast_symlink_inode_operations; + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_EXT2_FS_H */ diff -urN linux-2.4.16-reiserfspatches-immutable/include/linux/ext3_fs.h~ linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/ext3_fs.h~ --- linux-2.4.16-reiserfspatches-immutable/include/linux/ext3_fs.h~ Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/ext3_fs.h~ Mon Dec 10 14:28:03 2001 @@ -0,0 +1,716 @@ +/* + * linux/include/linux/ext3_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _LINUX_EXT3_FS_H +#define _LINUX_EXT3_FS_H + +#include <linux/types.h> + +/* + * The second extended filesystem constants/structures + */ + +/* + * Define EXT3FS_DEBUG to produce debug messages + */ +#undef EXT3FS_DEBUG + +/* + * Define EXT3_PREALLOCATE to preallocate data blocks for expanding files + */ +#undef EXT3_PREALLOCATE /* @@@ Fix this! */ +#define EXT3_DEFAULT_PREALLOC_BLOCKS 8 + +/* + * The second extended file system version + */ +#define EXT3FS_DATE "06 Nov 2001" +#define EXT3FS_VERSION "2.4-0.9.15" + +/* + * Debug code + */ +#ifdef EXT3FS_DEBUG +#define ext3_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __FUNCTION__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext3_debug(f, a...)
do {} while (0) +#endif + +/* + * Special inodes numbers + */ +#define EXT3_BAD_INO 1 /* Bad blocks inode */ +#define EXT3_ROOT_INO 2 /* Root inode */ +#define EXT3_ACL_IDX_INO 3 /* ACL inode */ +#define EXT3_ACL_DATA_INO 4 /* ACL inode */ +#define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT3_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext3 filesystems */ +#define EXT3_GOOD_OLD_FIRST_INO 11 + +/* + * The second extended file system magic number + */ +#define EXT3_SUPER_MAGIC 0xEF53 + +/* + * Maximal count of links to a file + */ +#define EXT3_LINK_MAX 32000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT3_MIN_BLOCK_SIZE 1024 +#define EXT3_MAX_BLOCK_SIZE 4096 +#define EXT3_MIN_BLOCK_LOG_SIZE 10 +#ifdef __KERNEL__ +# define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) +#define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) +#ifdef __KERNEL__ +# define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#else +# define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT3_ADDR_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_addr_per_block_bits) +#define EXT3_INODE_SIZE(s) ((s)->u.ext3_sb.s_inode_size) +#define EXT3_FIRST_INO(s) ((s)->u.ext3_sb.s_first_ino) +#else +#define EXT3_INODE_SIZE(s) (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \ + EXT3_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT3_FIRST_INO(s) (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? 
\ + EXT3_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif + +/* + * Macro-instructions used to manage fragments + */ +#define EXT3_MIN_FRAG_SIZE 1024 +#define EXT3_MAX_FRAG_SIZE 4096 +#define EXT3_MIN_FRAG_LOG_SIZE 10 +#ifdef __KERNEL__ +# define EXT3_FRAG_SIZE(s) ((s)->u.ext3_sb.s_frag_size) +# define EXT3_FRAGS_PER_BLOCK(s) ((s)->u.ext3_sb.s_frags_per_block) +#else +# define EXT3_FRAG_SIZE(s) (EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size) +# define EXT3_FRAGS_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s)) +#endif + +/* + * ACL structures + */ +struct ext3_acl_header /* Header of Access Control Lists */ +{ + __u32 aclh_size; + __u32 aclh_file_count; + __u32 aclh_acle_count; + __u32 aclh_first_acle; +}; + +struct ext3_acl_entry /* Access Control List Entry */ +{ + __u32 acle_size; + __u16 acle_perms; /* Access permissions */ + __u16 acle_type; /* Type of entry */ + __u16 acle_tag; /* User or group identity */ + __u16 acle_pad1; + __u32 acle_next; /* Pointer on next entry for the */ + /* same inode or on next free entry */ +}; + +/* + * Structure of a blocks group descriptor + */ +struct ext3_group_desc +{ + __u32 bg_block_bitmap; /* Blocks bitmap block */ + __u32 bg_inode_bitmap; /* Inodes bitmap block */ + __u32 bg_inode_table; /* Inodes table block */ + __u16 bg_free_blocks_count; /* Free blocks count */ + __u16 bg_free_inodes_count; /* Free inodes count */ + __u16 bg_used_dirs_count; /* Directories count */ + __u16 bg_pad; + __u32 bg_reserved[3]; +}; + +/* + * Macro-instructions used to manage group descriptors + */ +#ifdef __KERNEL__ +# define EXT3_BLOCKS_PER_GROUP(s) ((s)->u.ext3_sb.s_blocks_per_group) +# define EXT3_DESC_PER_BLOCK(s) ((s)->u.ext3_sb.s_desc_per_block) +# define EXT3_INODES_PER_GROUP(s) ((s)->u.ext3_sb.s_inodes_per_group) +# define EXT3_DESC_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_desc_per_block_bits) +#else +# define EXT3_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT3_DESC_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc)) +# define EXT3_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT3_NDIR_BLOCKS 12 +#define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS +#define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1) +#define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1) +#define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT3_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT3_UNRM_FL 0x00000002 /* Undelete */ +#define EXT3_COMPR_FL 0x00000004 /* Compress file */ +#define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT3_IMMUTABLE_FILE_FL 0x00000010 /* Immutable file */ +#define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT3_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT3_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... 
*/ +#define EXT3_DIRTY_FL 0x00000100 +#define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT3_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT3_IMMUTABLE_LINK_FL 0x00008000 /* Immutable link */ +#define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + +#define EXT3_FL_USER_VISIBLE 0x0000DFFF /* User visible flags */ +#define EXT3_FL_USER_MODIFIABLE 0x000080FF /* User modifiable flags */ + +/* + * Inode dynamic state flags + */ +#define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ +#define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ + +/* + * ioctl commands + */ +#define EXT3_IOC_GETFLAGS _IOR('f', 1, long) +#define EXT3_IOC_SETFLAGS _IOW('f', 2, long) +#define EXT3_IOC_GETVERSION _IOR('f', 3, long) +#define EXT3_IOC_SETVERSION _IOW('f', 4, long) +#define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) +#define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) +#ifdef CONFIG_JBD_DEBUG +#define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif + +/* + * Structure of an inode on the disk + */ +struct ext3_inode { + __u16 i_mode; /* File mode */ + __u16 i_uid; /* Low 16 bits of Owner Uid */ + __u32 i_size; /* Size in bytes */ + __u32 i_atime; /* Access time */ + __u32 i_ctime; /* Creation time */ + __u32 i_mtime; /* Modification time */ + __u32 i_dtime; /* Deletion Time */ + __u16 i_gid; /* Low 16 bits of Group Id */ + __u16 i_links_count; /* Links count */ + __u32 i_blocks; /* Blocks count */ + __u32 i_flags; /* File flags */ + union { + struct { + __u32 l_i_reserved1; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __u32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */ + __u32 i_generation; /* File version (for NFS) */ + __u32 i_file_acl; /* File ACL */ + __u32 i_dir_acl; /* Directory ACL */ + __u32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __u16 l_i_uid_high; /* these 2 fields */ + __u16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ +}; + +#define i_size_high i_dir_acl + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_frag osd2.linux2.l_i_frag +#define i_fsize osd2.linux2.l_i_fsize +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_frag osd2.hurd2.h_i_frag; +#define i_fsize osd2.hurd2.h_i_fsize; +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 
osd1.masix1.m_i_reserved1 +#define i_frag osd2.masix2.m_i_frag +#define i_fsize osd2.masix2.m_i_fsize +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +/* + * File system states + */ +#define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT3_ERROR_FS 0x0002 /* Errors detected */ +#define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Mount flags + */ +#define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ +#define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ +#define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ +#define EXT3_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ +#define EXT3_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ +#define EXT3_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ +#define EXT3_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ +#define EXT3_MOUNT_NOLOAD 0x0100 /* Don't use existing journal*/ +#define EXT3_MOUNT_ABORT 0x0200 /* Fatal error detected */ +#define EXT3_MOUNT_DATA_FLAGS 0x0C00 /* Mode for data writes: */ + #define EXT3_MOUNT_JOURNAL_DATA 0x0400 /* Write data to journal */ + #define EXT3_MOUNT_ORDERED_DATA 0x0800 /* Flush data before commit */ + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ +#define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ +#define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + +/* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ +#ifndef _LINUX_EXT2_FS_H +#define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt +#define set_opt(o, opt) o |= EXT3_MOUNT_##opt +#define test_opt(sb, opt) ((sb)->u.ext3_sb.s_mount_opt & \ + EXT3_MOUNT_##opt) +#else +#define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD +#define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT +#endif + +#define ext3_set_bit ext2_set_bit +#define ext3_clear_bit ext2_clear_bit +#define ext3_test_bit ext2_test_bit +#define ext3_find_first_zero_bit ext2_find_first_zero_bit +#define ext3_find_next_zero_bit ext2_find_next_zero_bit + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT3_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT3_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT3_ERRORS_PANIC 3 /* Panic */ +#define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext3_super_block { +/*00*/ __u32 s_inodes_count; /* Inodes count */ + __u32 s_blocks_count; /* Blocks count */ + __u32 s_r_blocks_count; /* Reserved blocks count */ + __u32 s_free_blocks_count; /* Free blocks count */ +/*10*/ __u32 s_free_inodes_count; /* Free inodes count */ + __u32 s_first_data_block; /* First Data Block */ + __u32 s_log_block_size; /* Block size */ + __s32 s_log_frag_size; /* Fragment size */ +/*20*/ __u32 s_blocks_per_group; /* # Blocks per group */ + __u32 s_frags_per_group; /* # Fragments per group */ + __u32 s_inodes_per_group; /* # Inodes per group */ + __u32 s_mtime; /* Mount time */ +/*30*/ __u32 s_wtime; /* Write time */ + __u16 s_mnt_count; /* Mount count */ + __s16 s_max_mnt_count; /* Maximal mount count */ + __u16 s_magic; /* Magic signature */ + __u16 s_state; /* File system state */ + __u16 s_errors; /* Behaviour when detecting errors */ + __u16 s_minor_rev_level; /* minor revision level */ +/*40*/ __u32 s_lastcheck; /* time of last check */ + __u32 s_checkinterval; /* max. 
time between checks */ + __u32 s_creator_os; /* OS */ + __u32 s_rev_level; /* Revision level */ +/*50*/ __u16 s_def_resuid; /* Default uid for reserved blocks */ + __u16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT3_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __u32 s_first_ino; /* First non-reserved inode */ + __u16 s_inode_size; /* size of inode structure */ + __u16 s_block_group_nr; /* block group # of this superblock */ + __u32 s_feature_compat; /* compatible feature set */ +/*60*/ __u32 s_feature_incompat; /* incompatible feature set */ + __u32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __u32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __u16 s_padding1; + /* + * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ + +/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ +}; + +#ifdef __KERNEL__ +#define EXT3_SB(sb) (&((sb)->u.ext3_sb)) +#define EXT3_I(inode) (&((inode)->u.ext3_i)) +#else +/* Assume that user mode programs are passing in an ext3fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. 
*/ +#define EXT3_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) (inode)->u.ext3_i.i_dtime + +/* + * Codes for operating systems + */ +#define EXT3_OS_LINUX 0 +#define EXT3_OS_HURD 1 +#define EXT3_OS_MASIX 2 +#define EXT3_OS_FREEBSD 3 +#define EXT3_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV +#define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV + +#define EXT3_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT3_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT3_SET_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 + +#define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + +#define EXT3_FEATURE_COMPAT_SUPP 0 +#define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER) +#define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT3_DEF_RESUID 0 +#define EXT3_DEF_RESGID 0 + +/* + * Structure of a directory entry + */ +#define EXT3_NAME_LEN 255 + +struct ext3_dir_entry { + __u32 inode; /* Inode number */ + __u16 rec_len; /* Directory entry length */ + __u16 name_len; /* Name length */ + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT3 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext3_dir_entry_2 { + __u32 inode; /* Inode number */ + __u16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT3_NAME_LEN]; /* File name */ +}; + +/* + * Ext3 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ +#define EXT3_FT_UNKNOWN 0 +#define EXT3_FT_REG_FILE 1 +#define EXT3_FT_DIR 2 +#define EXT3_FT_CHRDEV 3 +#define EXT3_FT_BLKDEV 4 +#define EXT3_FT_FIFO 5 +#define EXT3_FT_SOCK 6 +#define EXT3_FT_SYMLINK 7 + +#define EXT3_FT_MAX 8 + +/* + * EXT3_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT3_DIR_PAD 4 +#define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) +#define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) + +#ifdef __KERNEL__ + +/* Filesize hard limits for 64-bit file offsets */ +extern long long ext3_max_sizes[]; + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext3_iloc +{ + struct buffer_head *bh; + struct ext3_inode *raw_inode; + unsigned long block_group; +}; + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext3 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* acl.c */ +extern int ext3_permission (struct inode *, int); + +/* balloc.c */ +extern int ext3_bg_has_super(struct super_block *sb, int group); +extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); +extern int ext3_new_block (handle_t *, struct inode *, unsigned long, + __u32 *, __u32 *, int *); +extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, + unsigned long); +extern unsigned long ext3_count_free_blocks (struct super_block *); +extern void ext3_check_blocks_bitmap (struct super_block *); +extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); + +/* bitmap.c */ +extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + +/* dir.c */ +extern int ext3_check_dir_entry(const char *, struct inode *, + struct ext3_dir_entry_2 *, struct buffer_head *, + unsigned long); + +/* file.c */ + +/* fsync.c */ +extern int ext3_sync_file (struct file *, struct dentry *, int); + +/* ialloc.c */ +extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); +extern void ext3_free_inode (handle_t *, struct inode *); +extern struct inode * ext3_orphan_get (struct super_block *, ino_t); +extern unsigned long ext3_count_free_inodes (struct super_block *); +extern void ext3_check_inodes_bitmap (struct super_block *); + +/* inode.c */ + +extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); +extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + +extern int ext3_get_inode_loc (struct inode *, struct ext3_iloc *); +extern void ext3_read_inode (struct inode *); +extern void ext3_write_inode (struct inode *, int); +extern int ext3_setattr (struct dentry *, struct iattr *); +extern void ext3_put_inode (struct inode *); +extern void ext3_delete_inode (struct inode *); +extern int ext3_sync_inode (handle_t *, struct inode *); +extern void ext3_discard_prealloc (struct inode *); +extern void ext3_dirty_inode(struct inode *); +extern int ext3_change_inode_journal_flag(struct inode *, int); + +/* ioctl.c */ +extern int ext3_ioctl (struct inode *, struct file *, unsigned int, + unsigned long); + +/* namei.c */ +extern struct inode_operations ext3_dir_inode_operations; +extern int ext3_orphan_add(handle_t *, struct inode *); +extern int ext3_orphan_del(handle_t *, struct inode *); + +/* super.c */ +extern void ext3_error (struct super_block
*, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void __ext3_std_error (struct super_block *, const char *, int); +extern void ext3_abort (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern NORET_TYPE void ext3_panic (struct super_block *, const char *, + const char *, ...) + __attribute__ ((NORET_AND format (printf, 3, 4))); +extern void ext3_warning (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext3_update_dynamic_rev (struct super_block *sb); +extern void ext3_put_super (struct super_block *); +extern void ext3_write_super (struct super_block *); +extern void ext3_write_super_lockfs (struct super_block *); +extern void ext3_unlockfs (struct super_block *); +extern int ext3_remount (struct super_block *, int *, char *); +extern struct super_block * ext3_read_super (struct super_block *,void *,int); +extern int ext3_statfs (struct super_block *, struct statfs *); + +/* truncate.c */ +extern void ext3_truncate (struct inode *); + +#define ext3_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext3_std_error((sb), __FUNCTION__, (errno)); \ +} while (0) +extern const char *ext3_decode_error(struct super_block *sb, int errno, char nbuf[16]); + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern struct file_operations ext3_dir_operations; + +/* file.c */ +extern struct inode_operations ext3_file_inode_operations; +extern struct file_operations ext3_file_operations; + +/* symlink.c */ +extern struct inode_operations ext3_fast_symlink_inode_operations; + +extern struct address_space_operations ext3_aops; + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_EXT3_FS_H */ diff -urN linux-2.4.16-reiserfspatches-immutable/include/linux/fs.h~ linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/fs.h~ --- linux-2.4.16-reiserfspatches-immutable/include/linux/fs.h~ Mon Dec 10 13:12:58 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/fs.h~ Mon Dec 10 14:28:03 2001 @@ -127,13 +127,14 @@ /* Inode flags - they have nothing to superblock flags now */ -#define S_SYNC 1 /* Writes are synced at once */ -#define S_NOATIME 2 /* Do not update access times */ -#define S_QUOTA 4 /* Quota initialized for file */ -#define S_APPEND 8 /* Append-only file */ -#define S_IMMUTABLE 16 /* Immutable file */ -#define S_DEAD 32 /* removed, but still open directory */ -#define S_NOQUOTA 64 /* Inode is not counted to quota */ +#define S_SYNC 1 /* Writes are synced at once */ +#define S_NOATIME 2 /* Do not update access times */ +#define S_QUOTA 4 /* Quota initialized for file */ +#define S_APPEND 8 /* Append-only file */ +#define S_IMMUTABLE_FILE 16 /* Immutable file */ +#define S_DEAD 32 /* removed, but still open directory */ +#define S_NOQUOTA 64 /* Inode is not counted to quota */ +#define S_IMMUTABLE_LINK 128 /* Immutable links */ /* * Note that nosuid etc flags are inode-specific: setting some file-system @@ -157,7 +158,8 @@ #define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) -#define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) +#define IS_IMMUTABLE_FILE(inode) ((inode)->i_flags & S_IMMUTABLE_FILE) +#define IS_IMMUTABLE_LINK(inode) ((((inode)->i_flags & S_IMMUTABLE_FILE) << 3) ^ ((inode)->i_flags & S_IMMUTABLE_LINK) ) #define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) #define 
IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) @@ -356,11 +358,12 @@ /* * This is the inode attributes flag definitions */ -#define ATTR_FLAG_SYNCRONOUS 1 /* Syncronous write */ -#define ATTR_FLAG_NOATIME 2 /* Don't update atime */ -#define ATTR_FLAG_APPEND 4 /* Append-only file */ -#define ATTR_FLAG_IMMUTABLE 8 /* Immutable file */ -#define ATTR_FLAG_NODIRATIME 16 /* Don't update atime for directory */ +#define ATTR_FLAG_SYNCRONOUS 1 /* Syncronous write */ +#define ATTR_FLAG_NOATIME 2 /* Don't update atime */ +#define ATTR_FLAG_APPEND 4 /* Append-only file */ +#define ATTR_FLAG_IMMUTABLE_FILE 8 /* Immutable file */ +#define ATTR_FLAG_NODIRATIME 16 /* Don't update atime for directory */ +#define ATTR_FLAG_IMMUTABLE_LINK 32 /* Immutable link */ /* * Includes for diskquotas and mount structures. */ @@ -1381,6 +1384,7 @@ extern int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); extern int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, unsigned long *); +extern int generic_cont_expand(struct inode *inode, loff_t size) ; extern int block_commit_write(struct page *page, unsigned from, unsigned to); extern int block_sync_page(struct page *); diff -urN linux-2.4.16-reiserfspatches-immutable/include/linux/sched.h linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/sched.h --- linux-2.4.16-reiserfspatches-immutable/include/linux/sched.h Mon Dec 10 13:12:49 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/include/linux/sched.h Mon Dec 10 15:01:51 2001 @@ -275,6 +275,30 @@ atomic_inc(&__user->__count); \ __user; }) + +/* + We may have a different domainname and nodename for each security + context. By default, a security context shares the same as its + parent, potentially the information in system_utsname +*/ +#define S_CTX_INFO_LOCK 1 /* Can't request a new s_context */ +#define S_CTX_INFO_SCHED 2 /* All processes in the s_context */ + /* contribute to the scheduler */ +#define S_CTX_INFO_NPROC 4 /* Limit number of processes in a context */ +#define S_CTX_INFO_PRIVATE 8 /* No one can join this security context */ + + +struct context_info{ + int refcount; + int s_context; + char nodename[65]; + char domainname[65]; + int flags; /* S_CTX_INFO_xxx */ + atomic_t ticks; /* Number of ticks used by all processes */ + /* in the s_context */ +}; + + extern struct user_struct root_user; #define INIT_USER (&root_user) @@ -399,6 +423,12 @@ unsigned long sas_ss_sp; size_t sas_ss_size; int (*notifier)(void *priv); +/* Fields to make a virtual server running in chroot more isolated */ + int s_context; /* Process can only deal with other processes */ + /* with the same s_context */ + __u32 cap_bset; /* Maximum capability of this process and children */ + unsigned long ipv4root; /* Process can only bind to this IP */ + struct context_info *s_info; void *notifier_data; sigset_t *notifier_mask; @@ -500,6 +530,7 @@ blocked: {{0}}, \ alloc_lock: SPIN_LOCK_UNLOCKED, \ journal_info: NULL, \ + cap_bset: CAP_INIT_EFF_SET, \ } @@ -925,6 +956,11 @@ mntput(rootmnt); return res; } + +/* Manage the reference count of the context_info pointer */ +void sys_release_s_info (struct task_struct *); +void sys_assign_s_info (struct task_struct *); +void sys_alloc_s_info (void); #endif /* __KERNEL__ */ diff -urN linux-2.4.16-reiserfspatches-immutable/include/net/route.h linux-2.4.16-reiserfspatches-immutable-ctx4/include/net/route.h --- linux-2.4.16-reiserfspatches-immutable/include/net/route.h Mon Dec 10 13:12:38 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/include/net/route.h Mon Dec
10 15:01:51 2001 @@ -159,6 +159,13 @@ static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, int oif) { int err; + if (current->ipv4root != 0){ + if (src == 0){ + src = current->ipv4root; + }else if (current->ipv4root != src){ + return -EPERM; + } + } err = ip_route_output(rp, dst, src, tos, oif); if (err || (dst && src)) return err; diff -urN linux-2.4.16-reiserfspatches-immutable/kernel/exit.c linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/exit.c --- linux-2.4.16-reiserfspatches-immutable/kernel/exit.c Mon Dec 10 13:12:57 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/exit.c Mon Dec 10 15:01:51 2001 @@ -65,6 +65,7 @@ current->counter += p->counter; if (current->counter >= MAX_COUNTER) current->counter = MAX_COUNTER; + sys_release_s_info(p); p->pid = 0; free_task_struct(p); } else { diff -urN linux-2.4.16-reiserfspatches-immutable/kernel/fork.c linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/fork.c --- linux-2.4.16-reiserfspatches-immutable/kernel/fork.c Mon Dec 10 13:12:57 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/fork.c Mon Dec 10 15:01:51 2001 @@ -585,8 +585,14 @@ *p = *current; retval = -EAGAIN; + if (p->s_info != NULL && (p->s_info->flags & S_CTX_INFO_NPROC)!=0){ + if (p->s_info->refcount >= p->rlim[RLIMIT_NPROC].rlim_max) + goto bad_fork_free; + } if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur) goto bad_fork_free; + + sys_assign_s_info (p); atomic_inc(&p->user->__count); atomic_inc(&p->user->processes); diff -urN linux-2.4.16-reiserfspatches-immutable/kernel/sched.c linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/sched.c --- linux-2.4.16-reiserfspatches-immutable/kernel/sched.c Mon Dec 10 13:12:57 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/sched.c Mon Dec 10 15:01:51 2001 @@ -165,7 +165,13 @@ * Don't do any other calculations if the time slice is * over.. 
*/ - weight = p->counter; + if (p->s_info != NULL + && (p->s_info->flags & S_CTX_INFO_SCHED)!=0){ + weight = atomic_read (&p->s_info->ticks)/p->s_info->refcount; + weight = (weight+p->counter)>>1; + }else{ + weight = p->counter; + } if (!weight) goto out; @@ -605,8 +611,23 @@ spin_unlock_irq(&runqueue_lock); read_lock(&tasklist_lock); - for_each_task(p) + /* + Reset the s_info->ticks to the sum of all + member processes' p->counter + */ + for_each_task(p){ + if (p->s_info != NULL + && (p->s_info->flags & S_CTX_INFO_SCHED)!=0){ + atomic_set (&p->s_info->ticks,0); + } + } + for_each_task(p){ p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); + if (p->s_info != NULL + && (p->s_info->flags & S_CTX_INFO_SCHED)!=0){ + atomic_add (p->counter,&p->s_info->ticks); + } + } read_unlock(&tasklist_lock); spin_lock_irq(&runqueue_lock); goto repeat_schedule; diff -urN linux-2.4.16-reiserfspatches-immutable/kernel/signal.c linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/signal.c --- linux-2.4.16-reiserfspatches-immutable/kernel/signal.c Mon Dec 10 13:12:57 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/signal.c Mon Dec 10 15:01:51 2001 @@ -592,7 +592,7 @@ retval = -ESRCH; read_lock(&tasklist_lock); for_each_task(p) { - if (p->pgrp == pgrp) { + if (p->pgrp == pgrp && p->s_context == current->s_context) { int err = send_sig_info(sig, info, p); if (retval) retval = err; @@ -639,7 +639,7 @@ read_lock(&tasklist_lock); p = find_task_by_pid(pid); error = -ESRCH; - if (p) + if (p && p->s_context == current->s_context) error = send_sig_info(sig, info, p); read_unlock(&tasklist_lock); return error; @@ -663,7 +663,7 @@ read_lock(&tasklist_lock); for_each_task(p) { - if (p->pid > 1 && p != current) { + if (p->pid > 1 && p != current && p->s_context == current->s_context) { int err = send_sig_info(sig, info, p); ++count; if (err != -EPERM) @@ -1256,3 +1256,100 @@ return ret ? ret : (unsigned long)old_sa.sa.sa_handler; } #endif /* !alpha && !__ia64__ && !defined(__mips__) */ + +/* + Change to a new security context and reduce the capability + bounding set (cap_bset) of the current process +*/ +asmlinkage int +sys_new_s_context(int ctx, __u32 remove_cap, int flags) +{ + #define MAX_S_CONTEXT 65535 /* Arbitrary limit */ + int ret = -EPERM; + if (ctx == -1){ + if (current->s_info == NULL + || (current->s_info->flags & S_CTX_INFO_LOCK) == 0){ + /* Ok we allocate a new context. For now, we just increase */ + /* it. Wrap around is possible, so we loop */ + static int alloc_ctx=1; + static spinlock_t alloc_ctx_lock = SPIN_LOCK_UNLOCKED; + spin_lock(&alloc_ctx_lock); + while (1){ + int found = 0; + struct task_struct *p; + alloc_ctx++; + /* The s_context 1 is special.
It sees all processes */ + if (alloc_ctx == 1){ + alloc_ctx++; + }else if (alloc_ctx > MAX_S_CONTEXT){ + // No need to grow and grow + alloc_ctx = 2; + } + /* Check if in use */ + read_lock(&tasklist_lock); + for_each_task(p) { + if (p->s_context == alloc_ctx){ + found = 1; + break; + } + } + read_unlock(&tasklist_lock); + if (!found) break; + } + current->s_context = alloc_ctx; + current->cap_bset &= (~remove_cap); + ret = alloc_ctx; + sys_alloc_s_info(); + if (current->s_info != NULL){ + current->s_info->flags |= flags; + } + spin_unlock(&alloc_ctx_lock); + } + }else if (ctx == -2){ + /* We keep the same s_context, but lower the capabilities */ + current->cap_bset &= (~remove_cap); + ret = current->s_context; + if (current->s_info != NULL){ + current->s_info->flags |= flags; + } + }else if (ctx <= 0 || ctx > MAX_S_CONTEXT){ + ret = -EINVAL; + }else if (current->s_context == 0 + && capable(CAP_SYS_ADMIN) + && (current->s_info == NULL + ||(current->s_info->flags & S_CTX_INFO_LOCK) == 0)){ + /* The root context can become any context it wants */ + int found = 0; + struct task_struct *p; + /* Check if in use so we reuse the same context_info */ + read_lock(&tasklist_lock); + ret = ctx; + for_each_task(p) { + if (p->s_context == ctx){ + found = 1; + if (p->s_info == NULL + || (p->s_info->flags & S_CTX_INFO_PRIVATE)==0){ + sys_release_s_info(current); + sys_assign_s_info (p); + current->s_info = p->s_info; + }else{ + ret = -EPERM; + } + break; + } + } + read_unlock(&tasklist_lock); + if (ret == ctx){ + current->s_context = ctx; + current->cap_bset &= (~remove_cap); + if (!found){ + sys_alloc_s_info(); + } + if (current->s_info != NULL){ + current->s_info->flags |= flags; + } + } + } + return ret; +} + diff -urN linux-2.4.16-reiserfspatches-immutable/kernel/sys.c linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/sys.c --- linux-2.4.16-reiserfspatches-immutable/kernel/sys.c Mon Dec 10 13:11:17 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/sys.c Mon Dec 10 15:01:51 2001 @@ -1015,17 +1015,80 @@ asmlinkage long sys_newuname(struct new_utsname * name) { int errno = 0; + struct new_utsname tmp,*pttmp; down_read(&uts_sem); - if (copy_to_user(name,&system_utsname,sizeof *name)) + if (current->s_info != NULL){ + tmp = system_utsname; + strcpy (tmp.nodename,current->s_info->nodename); + strcpy (tmp.domainname,current->s_info->domainname); + pttmp = &tmp; + }else{ + pttmp = &system_utsname; + } + if (copy_to_user(name,pttmp,sizeof *name)) errno = -EFAULT; up_read(&uts_sem); return errno; } +/* + Decrease the reference count on the context_info member of a task + Free the struct if the reference count reaches 0. +*/ +void sys_release_s_info (struct task_struct *p) +{ + down_write (&uts_sem); + if (p->s_info != NULL){ + p->s_info->refcount--; + if (p->s_info->refcount == 0){ + // printk ("vfree s_info %d\n",p->pid); + vfree (p->s_info); + p->s_info = NULL; + } + } + up_write (&uts_sem); +} +/* + Increase the reference count on the context_info member of a task +*/ +void sys_assign_s_info (struct task_struct *p) +{ + down_write (&uts_sem); + if (p->s_info != NULL) p->s_info->refcount++; + up_write (&uts_sem); +} + +/* + Allocate a new s_info to the current process and release + the one currently owned by the current process.
+*/ +void sys_alloc_s_info() +{ + struct context_info *s_info = vmalloc(sizeof(struct context_info)); + // printk ("new s_info %d\n",current->pid); + s_info->s_context = current->s_context; + s_info->refcount = 1; + atomic_set (&s_info->ticks,current->counter); + s_info->flags = 0; + down_read (&uts_sem); + if (current->s_info != NULL){ + strcpy (s_info->nodename,current->s_info->nodename); + strcpy (s_info->domainname,current->s_info->domainname); + }else{ + strcpy (s_info->nodename,system_utsname.nodename); + strcpy (s_info->domainname,system_utsname.domainname); + } + up_read (&uts_sem); + sys_release_s_info (current); + current->s_info = s_info; +} + + asmlinkage long sys_sethostname(char *name, int len) { int errno; + char *nodename; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1033,8 +1096,10 @@ return -EINVAL; down_write(&uts_sem); errno = -EFAULT; - if (!copy_from_user(system_utsname.nodename, name, len)) { - system_utsname.nodename[len] = 0; + nodename = system_utsname.nodename; + if (current->s_info) nodename = current->s_info->nodename; + if (!copy_from_user(nodename, name, len)) { + nodename[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1044,15 +1109,18 @@ asmlinkage long sys_gethostname(char *name, int len) { int i, errno; + char *nodename; if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(system_utsname.nodename); + nodename = system_utsname.nodename; + if (current->s_info != NULL) nodename = current->s_info->nodename; + i = 1 + strlen(nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, system_utsname.nodename, i)) + if (copy_to_user(name, nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1065,6 +1133,7 @@ asmlinkage long sys_setdomainname(char *name, int len) { int errno; + char *domainname; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1072,10 +1141,12 @@ return -EINVAL; down_write(&uts_sem); + domainname = system_utsname.domainname; + if (current->s_info) domainname = current->s_info->domainname; errno = -EFAULT; - if (!copy_from_user(system_utsname.domainname, name, len)) { + if (!copy_from_user(domainname, name, len)) { errno = 0; - system_utsname.domainname[len] = 0; + domainname[len] = 0; } up_write(&uts_sem); return errno; } diff -urN linux-2.4.16-reiserfspatches-immutable/kernel/sysctl.c linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/sysctl.c --- linux-2.4.16-reiserfspatches-immutable/kernel/sysctl.c Mon Dec 10 13:12:58 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/sysctl.c Mon Dec 10 15:01:51 2001 @@ -378,6 +378,7 @@ static int test_perm(int mode, int op) { + if (!capable(CAP_SYS_ADMIN)) mode &= ~(0222); if (!current->euid) mode >>= 6; else if (in_egroup_p(0)) @@ -792,7 +793,18 @@ void *buffer, size_t *lenp) { int r; + ctl_table tmp; + /* HACK for per s_context hostname and domainname */ + if (current->s_info != NULL){ + tmp = *table; + table = &tmp; + if (table->data == (void*)&system_utsname.nodename){ + tmp.data = &current->s_info->nodename; + }else if (table->data == (void*)&system_utsname.domainname){ + tmp.data = &current->s_info->domainname; + } + } if (!write) { down_read(&uts_sem); r=proc_dostring(table,0,filp,buffer,lenp); diff -urN linux-2.4.16-reiserfspatches-immutable/kernel/timer.c linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/timer.c --- linux-2.4.16-reiserfspatches-immutable/kernel/timer.c Mon Dec 10 13:12:01 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/kernel/timer.c Mon Dec 10 15:01:51 2001 @@ -583,6 +583,11 @@ update_one_process(p, user_tick, system, cpu); if (p->pid) { +
if (p->s_info != NULL + && (p->s_info->flags & S_CTX_INFO_SCHED)!=0){ + // atomic_sub (ticks*p->s_info->refcount, &p->s_info->ticks); + atomic_dec (&p->s_info->ticks); + } if (--p->counter <= 0) { p->counter = 0; p->need_resched = 1; diff -urN linux-2.4.16-reiserfspatches-immutable/net/ipv4/af_inet.c linux-2.4.16-reiserfspatches-immutable-ctx4/net/ipv4/af_inet.c --- linux-2.4.16-reiserfspatches-immutable/net/ipv4/af_inet.c Mon Dec 10 13:12:38 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/net/ipv4/af_inet.c Mon Dec 10 15:01:51 2001 @@ -477,6 +477,7 @@ unsigned short snum; int chk_addr_ret; int err; + __u32 s_addr; /* If the socket has its own bind function then use it. (RAW) */ if(sk->prot->bind) @@ -485,7 +486,17 @@ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; - chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + s_addr = addr->sin_addr.s_addr; + if (current->ipv4root != 0){ + // printk ("ipv4root0 %08lx %08x\n",current->ipv4root,s_addr); + if (s_addr == 0){ + s_addr = current->ipv4root; + }else if (s_addr != current->ipv4root){ + return -EADDRNOTAVAIL; + } + } + chk_addr_ret = inet_addr_type(s_addr); + // printk ("ipv4root %08lx %08x %d\n",current->ipv4root,s_addr,chk_addr_ret); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since @@ -496,7 +507,7 @@ */ if (sysctl_ip_nonlocal_bind == 0 && sk->protinfo.af_inet.freebind == 0 && - addr->sin_addr.s_addr != INADDR_ANY && + s_addr != INADDR_ANY && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) @@ -521,7 +532,7 @@ (sk->num != 0)) goto out; - sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr; + sk->rcv_saddr = sk->saddr = s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) sk->saddr = 0; /* Use device */ diff -urN linux-2.4.16-reiserfspatches-immutable/net/socket.c linux-2.4.16-reiserfspatches-immutable-ctx4/net/socket.c --- linux-2.4.16-reiserfspatches-immutable/net/socket.c Mon Dec 10 13:12:28 2001 +++ linux-2.4.16-reiserfspatches-immutable-ctx4/net/socket.c Mon Dec 10 15:01:51 2001 @@ -1765,3 +1765,15 @@ len = 0; return len; } + +asmlinkage int sys_set_ipv4root (unsigned long ip) +{ + int ret = -EPERM; + if (current->ipv4root == 0 + || capable(CAP_SYS_ADMIN)){ + ret = 0; + current->ipv4root = ip; + } + return ret; +} +
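
Note on the IS_IMMUTABLE_LINK macro in the fs.h hunk above (an editor's illustration, not part of the patch). S_IMMUTABLE_FILE is bit value 16 and S_IMMUTABLE_LINK is 128, so shifting the file bit left by 3 lands it on the link bit, and the XOR yields "exactly one of the two flags set". As I read it, this is a self-contained restatement in C, with a hypothetical function name:

#define S_IMMUTABLE_FILE 16	/* values from the fs.h hunk */
#define S_IMMUTABLE_LINK 128

/* Nonzero when exactly one of the two immutable bits is set: marking a
 * file immutable also pins its links, while setting S_IMMUTABLE_LINK on
 * top of that cancels it, leaving the data immutable but the directory
 * entry removable. */
static int is_immutable_link(unsigned int i_flags)
{
	return ((i_flags & S_IMMUTABLE_FILE) << 3) ^ (i_flags & S_IMMUTABLE_LINK);
}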
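The S_CTX_INFO_SCHED changes in the kernel/sched.c and kernel/timer.c hunks amount to a simple fair-share rule: at each recalculation, s_info->ticks is reset to the sum of the members' counters, and a member's goodness weight becomes the average of its own counter and the per-member mean of its context. A standalone restatement of that arithmetic (hypothetical helper names, not in the patch; refcount is always at least 1 in the kernel):

/* Same weighting as the kernel/sched.c hunk above.  ctx_ticks stands in
 * for s_info->ticks, ctx_members for s_info->refcount. */
static int ctx_weight(int own_counter, int ctx_ticks, int ctx_members)
{
	int per_member = ctx_ticks / ctx_members; /* context's mean share */
	return (per_member + own_counter) >> 1;   /* halfway to that mean */
}

So a context running many busy processes drags each member toward a shared mean: with s_info->ticks == 60 spread over 6 members, a process holding counter == 30 is weighted at (10 + 30) / 2 = 20 instead of 30.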
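Finally, the two syscalls added by this patch can be driven from user space with plain syscall stubs. The sketch below is an editor's illustration, not part of the patch: it assumes an i386 kernel built with this patch, reuses the numbers 226/227 from the asm-i386/unistd.h hunk, and picks an arbitrary example address.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <arpa/inet.h>

#ifndef __NR_new_s_context
#define __NR_new_s_context 226	/* from the unistd.h hunk above */
#define __NR_set_ipv4root 227
#endif

int main(void)
{
	/* ctx == -1: ask the kernel to allocate a fresh security context.
	   remove_cap == 0: keep the current cap_bset.
	   flags == 0: no S_CTX_INFO_* restriction on the new context. */
	int ctx = syscall(__NR_new_s_context, -1, 0, 0);
	if (ctx < 0) {
		perror("new_s_context");
		return 1;
	}
	printf("now running in security context %d\n", ctx);

	/* Pin all further IPv4 binds and connects to one local address. */
	if (syscall(__NR_set_ipv4root, inet_addr("192.168.1.10")) < 0) {
		perror("set_ipv4root");
		return 1;
	}
	return 0;
}

After these calls, children inherit s_context and ipv4root across fork (task_struct is copied wholesale in kernel/fork.c), kill() only reaches processes in the same context per the kernel/signal.c hunks, and bind() to any other local address fails with EADDRNOTAVAIL per the net/ipv4/af_inet.c hunk.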