diff -rc2P linux/Documentation/Configure.help linux-2.4.13/Documentation/Configure.help
*** linux/Documentation/Configure.help Sat Oct 20 22:17:19 2001
--- linux-2.4.13/Documentation/Configure.help Fri Nov 9 16:58:00 2001
***************
*** 12059,12062 ****
--- 12059,12132 ----
  wants to say Y here.
+ Ext3 journaling file system support (EXPERIMENTAL)
+ CONFIG_EXT3_FS
+   This is the journaling version of the Second extended file system
+   (often called ext3), the de facto standard Linux file system
+   (method to organize files on a storage device) for hard disks.
+
+   The journaling code included in this driver means you do not have
+   to run e2fsck (file system checker) on your file systems after a
+   crash. The journal keeps track of any changes that were being made
+   at the time the system crashed, and can ensure that your file system
+   is consistent without the need for a lengthy check.
+
+   Other than adding the journal to the filesystem, the on-disk format of
+   ext3 is identical to ext2. It is possible to freely switch between
+   using the ext3 driver and the ext2 driver, as long as the filesystem
+   has been cleanly unmounted, or e2fsck is run on the filesystem.
+
+   To add a journal on an existing ext2 filesystem or change the behavior
+   of ext3 file systems, you can use the tune2fs utility ("man tune2fs").
+   To modify attributes of files and directories on ext3 file systems,
+   use chattr ("man chattr"). You need to be using e2fsprogs version
+   1.20 or later in order to create ext3 journals (available at
+   ).
+
+   If you want to compile this file system as a module ( = code which
+   can be inserted in and removed from the running kernel whenever you
+   want), say M here and read Documentation/modules.txt. The module
+   will be called ext3.o. Be aware however that the file system of your
+   root partition (the one containing the directory /) cannot be
+   compiled as a module, and so this may be dangerous.
+
+ Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
+ CONFIG_JBD
+   This is a generic journaling layer for block devices. It is currently
+   used by the ext3 file system, but it could also be used to add journal
+   support to other file systems or block devices such as RAID or LVM.
+
+   If you are using the ext3 filesystem, you need to say Y here. If you
+   are not using ext3 then you will probably want to say N.
+
+   If you want to compile this device as a module ( = code which can be
+   inserted in and removed from the running kernel whenever you want),
+   say M here and read Documentation/modules.txt. The module will be called
+   jbd.o. If you are compiling ext3 into the kernel, you cannot compile
+   this code as a module.
+
+ JBD (ext3) debugging support
+ CONFIG_JBD_DEBUG
+   If you are using the ext3 journaled file system (or potentially any
+   other file system/device using JBD), this option allows you to enable
+   debugging output while the system is running, in order to help track
+   down any problems you are having. By default the debugging output
+   will be turned off.
+
+   If you select Y here, then you will be able to turn on debugging with
+   "echo N > /proc/sys/fs/jbd-debug", where N is a number between 1 and 5,
+   the higher the number, the more debugging output is generated. To turn
+   debugging off again, do "echo 0 > /proc/sys/fs/jbd-debug".
+
+ Buffer Head tracing (DEBUG)
+ CONFIG_BUFFER_DEBUG
+   If you are a kernel developer working with file systems or in the block
+   device layer, this buffer head tracing may help you to track down bugs
+   in your code.
This enables some debugging macros (BUFFER_TRACE, etc) + which allow you to track the state of a buffer through various layers + of code. The debugging code is used primarily by ext3 and JBD code. + + Because this option adds considerably to the size of each buffer, most + people will want to say N here. + BFS file system support (EXPERIMENTAL) CONFIG_BFS_FS diff -rc2P linux/drivers/block/ll_rw_blk.c linux-2.4.13/drivers/block/ll_rw_blk.c *** linux/drivers/block/ll_rw_blk.c Sat Oct 13 13:30:30 2001 --- linux-2.4.13/drivers/block/ll_rw_blk.c Fri Nov 9 16:58:00 2001 *************** *** 672,677 **** down by us so at this point flushpage will block and won't clear the mapped bit under us. */ ! if (!buffer_mapped(bh)) BUG(); /* --- 672,679 ---- down by us so at this point flushpage will block and won't clear the mapped bit under us. */ ! if (!buffer_mapped(bh)) { ! print_buffer_trace(bh); BUG(); + } /* *************** *** 1007,1013 **** switch(rw) { case WRITE: ! if (!atomic_set_buffer_clean(bh)) /* Hmmph! Nothing to write */ goto end_io; __mark_buffer_clean(bh); break; --- 1009,1018 ---- switch(rw) { case WRITE: ! if (!atomic_set_buffer_clean(bh)) { ! BUFFER_TRACE(bh, "already clean"); /* Hmmph! Nothing to write */ goto end_io; + } + BUFFER_TRACE(bh, "set clean, write underway"); __mark_buffer_clean(bh); break; *************** *** 1032,1037 **** sorry: /* Make sure we don't get infinite dirty retries.. */ ! for (i = 0; i < nr; i++) mark_buffer_clean(bhs[i]); } --- 1037,1044 ---- sorry: /* Make sure we don't get infinite dirty retries.. */ ! for (i = 0; i < nr; i++) { ! BUFFER_TRACE(bhs[i], "sorry"); mark_buffer_clean(bhs[i]); + } } *************** *** 1133,1136 **** --- 1140,1144 ---- queue_nr_requests = 128; + /* * Batch frees according to queue length diff -rc2P linux/drivers/block/loop.c linux-2.4.13/drivers/block/loop.c *** linux/drivers/block/loop.c Mon Oct 15 21:53:51 2001 --- linux-2.4.13/drivers/block/loop.c Fri Nov 9 16:58:00 2001 *************** *** 187,190 **** --- 187,192 ---- while (len > 0) { int IV = index * (PAGE_CACHE_SIZE/bsize) + offset/bsize; + int transfer_result; + size = PAGE_CACHE_SIZE - offset; if (size > len) *************** *** 198,205 **** kaddr = page_address(page); flush_dcache_page(page); ! if (lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV)) ! goto write_fail; if (aops->commit_write(file, page, offset, offset+size)) goto unlock; data += size; len -= size; --- 200,216 ---- kaddr = page_address(page); flush_dcache_page(page); ! transfer_result = lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV); ! if (transfer_result) { ! /* ! * The transfer failed, but we still write the data to ! * keep prepare/commit calls balanced. ! */ ! printk(KERN_ERR "loop: transfer error block %ld\n", index); ! memset(kaddr + offset, 0, size); ! 
} if (aops->commit_write(file, page, offset, offset+size)) goto unlock; + if (transfer_result) + goto unlock; data += size; len -= size; *************** *** 213,220 **** return 0; - write_fail: - printk(KERN_ERR "loop: transfer error block %ld\n", index); - ClearPageUptodate(page); - kunmap(page); unlock: UnlockPage(page); --- 224,227 ---- diff -rc2P linux/drivers/ide/ide-disk.c linux-2.4.13/drivers/ide/ide-disk.c *** linux/drivers/ide/ide-disk.c Thu Oct 11 12:14:32 2001 --- linux-2.4.13/drivers/ide/ide-disk.c Fri Nov 9 16:58:00 2001 *************** *** 368,371 **** --- 368,392 ---- static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) { + #ifdef CONFIG_JBD_DEBUG + /* + * Silently stop writing to this disk to simulate a crash. + */ + extern int journal_no_write[2]; + int i; + + if (rq->cmd != WRITE) + goto write_ok; + + for (i = 0; i < 2; i++) { + if ((journal_no_write[i] & 0xdead0000) == 0xdead0000) { + if (rq->rq_dev == (journal_no_write[i] & 0xffff)) { + ide_end_request(1, HWGROUP(drive)); + return ide_stopped; + } + } + } + write_ok: + ; + #endif if (IDE_CONTROL_REG) OUT_BYTE(drive->ctl,IDE_CONTROL_REG); diff -rc2P linux/fs/Config.in linux-2.4.13/fs/Config.in *** linux/fs/Config.in Thu Oct 4 18:13:18 2001 --- linux-2.4.13/fs/Config.in Fri Nov 9 16:57:59 2001 *************** *** 21,24 **** --- 21,32 ---- dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL + tristate 'Ext3 journalling file system support (EXPERIMENTAL)' CONFIG_EXT3_FS + # CONFIG_JBD could be its own option (even modular), but until there are + # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS + # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS + define_bool CONFIG_JBD $CONFIG_EXT3_FS + dep_mbool ' JBD (ext3) debugging support' CONFIG_JBD_DEBUG $CONFIG_JBD + bool 'Buffer Head tracing (DEBUG)' CONFIG_BUFFER_DEBUG + # msdos file systems tristate 'DOS FAT fs support' CONFIG_FAT_FS diff -rc2P linux/fs/Makefile linux-2.4.13/fs/Makefile *** linux/fs/Makefile Thu Oct 4 18:13:18 2001 --- linux-2.4.13/fs/Makefile Fri Nov 9 16:58:00 2001 *************** *** 8,12 **** O_TARGET := fs.o ! export-objs := filesystems.o open.o dcache.o mod-subdirs := nls --- 8,12 ---- O_TARGET := fs.o ! export-objs := filesystems.o open.o dcache.o buffer.o jbd-kernel.o mod-subdirs := nls *************** *** 15,19 **** fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ ! filesystems.o namespace.o ifeq ($(CONFIG_QUOTA),y) --- 15,19 ---- fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ ! filesystems.o namespace.o jbd-kernel.o ifeq ($(CONFIG_QUOTA),y) *************** *** 27,30 **** --- 27,32 ---- # Do not add any filesystems before this line + subdir-$(CONFIG_EXT3_FS) += ext3 # Before ext2 so root fs can be ext3 + subdir-$(CONFIG_JBD) += jbd subdir-$(CONFIG_EXT2_FS) += ext2 subdir-$(CONFIG_CRAMFS) += cramfs diff -rc2P linux/fs/buffer.c linux-2.4.13/fs/buffer.c *** linux/fs/buffer.c Tue Oct 23 20:54:19 2001 --- linux-2.4.13/fs/buffer.c Fri Nov 9 16:57:59 2001 *************** *** 46,49 **** --- 46,51 ---- #include #include + #include + #include #include *************** *** 614,619 **** by the user. ! Thus invalidate_buffers in general usage is not allwowed to trash dirty ! buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved. 
NOTE: In the case where the user removed a removable-media-disk even if --- 616,625 ---- by the user. ! Thus invalidate_buffers in general usage is not allwowed to trash ! dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to ! be preserved. These buffers are simply skipped. ! ! We also skip buffers which are still in use. For example this can ! happen if a userspace program is reading the block device. NOTE: In the case where the user removed a removable-media-disk even if *************** *** 718,721 **** --- 724,728 ---- bh->b_end_io = handler; bh->b_private = private; + buffer_trace_init(&bh->b_history); } *************** *** 727,730 **** --- 734,738 ---- struct page *page; + BUFFER_TRACE(bh, "enter"); mark_buffer_uptodate(bh, uptodate); *************** *** 1093,1096 **** --- 1101,1110 ---- } + void set_buffer_flushtime(struct buffer_head *bh) + { + bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer; + } + EXPORT_SYMBOL(set_buffer_flushtime); + /* * A buffer may need to be moved from one buffer list to another *************** *** 1100,1103 **** --- 1114,1120 ---- { int dispose = BUF_CLEAN; + + BUFFER_TRACE(bh, "enter"); + if (buffer_locked(bh)) dispose = BUF_LOCKED; *************** *** 1111,1114 **** --- 1128,1132 ---- __insert_into_lru_list(bh, dispose); } + BUFFER_TRACE(bh, "exit"); } *************** *** 1125,1128 **** --- 1143,1147 ---- void __brelse(struct buffer_head * buf) { + BUFFER_TRACE(buf, "entry"); if (atomic_read(&buf->b_count)) { put_bh(buf); *************** *** 1138,1141 **** --- 1157,1161 ---- void __bforget(struct buffer_head * buf) { + BUFFER_TRACE(buf, "enter"); mark_buffer_clean(buf); __brelse(buf); *************** *** 1168,1175 **** * Note: the caller should wake up the buffer_wait list if needed. */ ! static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) { if (bh->b_inode) BUG(); if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { kmem_cache_free(bh_cachep, bh); --- 1188,1207 ---- * Note: the caller should wake up the buffer_wait list if needed. */ ! static void __put_unused_buffer_head(struct buffer_head * bh) { if (bh->b_inode) BUG(); + + J_ASSERT_BH(bh, bh->b_prev_free == 0); + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + if (buffer_jbd(bh)) { + J_ASSERT_BH(bh, bh2jh(bh)->b_transaction == 0); + J_ASSERT_BH(bh, bh2jh(bh)->b_next_transaction == 0); + J_ASSERT_BH(bh, bh2jh(bh)->b_frozen_data == 0); + J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data == 0); + } + #endif + buffer_trace_init(&bh->b_history); + if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { kmem_cache_free(bh_cachep, bh); *************** *** 1185,1188 **** --- 1217,1228 ---- } + void put_unused_buffer_head(struct buffer_head *bh) + { + spin_lock(&unused_list_lock); + __put_unused_buffer_head(bh); + spin_unlock(&unused_list_lock); + } + EXPORT_SYMBOL(put_unused_buffer_head); + /* * Reserve NR_RESERVED buffer heads for async IO requests to avoid *************** *** 1190,1194 **** * buffer heads is now handled in create_buffers(). */ ! static struct buffer_head * get_unused_buffer_head(int async) { struct buffer_head * bh; --- 1230,1234 ---- * buffer heads is now handled in create_buffers(). */ ! 
struct buffer_head * get_unused_buffer_head(int async) { struct buffer_head * bh; *************** *** 1211,1214 **** --- 1251,1255 ---- bh->b_blocknr = -1; bh->b_this_page = NULL; + buffer_trace_init(&bh->b_history); return bh; } *************** *** 1224,1227 **** --- 1265,1269 ---- nr_unused_buffer_heads--; spin_unlock(&unused_list_lock); + buffer_trace_init(&bh->b_history); return bh; } *************** *** 1231,1234 **** --- 1273,1277 ---- return NULL; } + EXPORT_SYMBOL(get_unused_buffer_head); void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) *************** *** 1245,1248 **** --- 1288,1292 ---- bh->b_data = page_address(page) + offset; } + EXPORT_SYMBOL(set_bh_page); /* *************** *** 1328,1331 **** --- 1372,1376 ---- { if (buffer_mapped(bh)) { + BUFFER_TRACE(bh, "entry"); mark_buffer_clean(bh); lock_buffer(bh); *************** *** 1338,1341 **** --- 1383,1411 ---- } + /** + * try_to_release_page - release old fs-specific metadata on a page + * + */ + + int try_to_release_page(struct page * page, int gfp_mask) + { + if (!PageLocked(page)) + BUG(); + + if (!page->mapping) + goto try_to_free; + if (!page->mapping->a_ops->releasepage) + goto try_to_free; + if (page->mapping->a_ops->releasepage(page, gfp_mask)) + goto try_to_free; + /* + * We couldn't release buffer metadata; don't even bother trying + * to release buffers. + */ + return 0; + try_to_free: + return try_to_free_buffers(page, gfp_mask); + } + /* * We don't have to release all buffers here, but *************** *** 1381,1385 **** */ if (!offset) { ! if (!try_to_free_buffers(page, 0)) return 0; } --- 1451,1455 ---- */ if (!offset) { ! if (!try_to_release_page(page, 0)) return 0; } *************** *** 1409,1412 **** --- 1479,1483 ---- page_cache_get(page); } + EXPORT_SYMBOL(create_empty_buffers); /* *************** *** 1427,1431 **** --- 1498,1505 ---- old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); + J_ASSERT_BH(bh, old_bh != bh); if (old_bh) { + BUFFER_TRACE(old_bh, "old_bh - entry"); + J_ASSERT_BH(old_bh, !buffer_jlist_eq(old_bh, BJ_Metadata)); mark_buffer_clean(old_bh); wait_on_buffer(old_bh); *************** *** 1449,1454 **** /* ! * block_write_full_page() is SMP-safe - currently it's still ! * being called with the kernel lock held, but the code is ready. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) --- 1523,1527 ---- /* ! * block_write_full_page() is SMP threaded - the kernel lock is not held. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) *************** *** 1484,1489 **** if (err) goto out; ! if (buffer_new(bh)) unmap_underlying_metadata(bh); } bh = bh->b_this_page; --- 1557,1564 ---- if (err) goto out; ! if (buffer_new(bh)) { ! 
BUFFER_TRACE(bh, "new: call unmap_underlying_metadata"); unmap_underlying_metadata(bh); + } } bh = bh->b_this_page; *************** *** 1493,1496 **** --- 1568,1572 ---- /* Stage 2: lock the buffers, mark them clean */ do { + BUFFER_TRACE(bh, "lock it"); lock_buffer(bh); set_buffer_async_io(bh); *************** *** 1549,1554 **** --- 1625,1632 ---- goto out; if (buffer_new(bh)) { + BUFFER_TRACE(bh, "new: call unmap_underlying_metadata"); unmap_underlying_metadata(bh); if (Page_Uptodate(page)) { + BUFFER_TRACE(bh, "setting uptodate"); set_bit(BH_Uptodate, &bh->b_state); continue; *************** *** 1564,1567 **** --- 1642,1646 ---- } if (Page_Uptodate(page)) { + BUFFER_TRACE(bh, "setting uptodate"); set_bit(BH_Uptodate, &bh->b_state); continue; *************** *** 1569,1572 **** --- 1648,1652 ---- if (!buffer_uptodate(bh) && (block_start < from || block_end > to)) { + BUFFER_TRACE(bh, "reading"); ll_rw_block(READ, 1, &bh); *wait_bh++=bh; *************** *** 1607,1610 **** --- 1687,1691 ---- set_bit(BH_Uptodate, &bh->b_state); if (!atomic_set_buffer_dirty(bh)) { + BUFFER_TRACE(bh, "mark dirty"); __mark_dirty(bh); buffer_insert_inode_data_queue(bh, inode); *************** *** 1890,1893 **** --- 1971,1975 ---- kunmap(page); + BUFFER_TRACE(bh, "zeroed end of block"); __mark_buffer_dirty(bh); err = 0; *************** *** 2447,2450 **** --- 2529,2534 ---- return 0; } + EXPORT_SYMBOL(try_to_free_buffers); + EXPORT_SYMBOL(buffermem_pages); /* ================== Debugging =================== */ diff -rc2P linux/fs/ext3/Makefile linux-2.4.13/fs/ext3/Makefile *** linux/fs/ext3/Makefile Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/Makefile Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,16 ---- + # + # Makefile for the linux ext2-filesystem routines. + # + # Note! Dependencies are done automagically by 'make dep', which also + # removes any old dependencies. DON'T put your own dependencies here + # unless it's something special (ie not a .c file). + # + # Note 2! The CFLAGS definitions are now in the main makefile... + + O_TARGET := ext3.o + + obj-y := acl.o balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make diff -rc2P linux/fs/ext3/acl.c linux-2.4.13/fs/ext3/acl.c *** linux/fs/ext3/acl.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/acl.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,17 ---- + /* + * linux/fs/ext3/acl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + + #include + #include + + + /* + * This file will contain the Access Control Lists management for the + * second extended file system. + */ diff -rc2P linux/fs/ext3/balloc.c linux-2.4.13/fs/ext3/balloc.c *** linux/fs/ext3/balloc.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/balloc.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,995 ---- + /* + * linux/fs/ext3/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + + /* + * balloc.c contains the blocks allocation and deallocation routines + */ + + /* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext3_read_super). + */ + + + #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + + struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) + { + unsigned long group_desc; + unsigned long desc; + struct ext3_group_desc * gdp; + + if (block_group >= sb->u.ext3_sb.s_groups_count) { + ext3_error (sb, "ext3_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sb->u.ext3_sb.s_groups_count); + + return NULL; + } + + group_desc = block_group / EXT3_DESC_PER_BLOCK(sb); + desc = block_group % EXT3_DESC_PER_BLOCK(sb); + if (!sb->u.ext3_sb.s_group_desc[group_desc]) { + ext3_error (sb, "ext3_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, desc); + return NULL; + } + + gdp = (struct ext3_group_desc *) + sb->u.ext3_sb.s_group_desc[group_desc]->b_data; + if (bh) + *bh = sb->u.ext3_sb.s_group_desc[group_desc]; + return gdp + desc; + } + + /* + * Read the bitmap for a given block_group, reading into the specified + * slot in the superblock's bitmap cache. + * + * Return >=0 on success or a -ve error code. + */ + + static int read_block_bitmap (struct super_block * sb, + unsigned int block_group, + unsigned long bitmap_nr) + { + struct ext3_group_desc * gdp; + struct buffer_head * bh = NULL; + int retval = -EIO; + + gdp = ext3_get_group_desc (sb, block_group, NULL); + if (!gdp) + goto error_out; + retval = 0; + bh = bread (sb->s_dev, + le32_to_cpu(gdp->bg_block_bitmap), sb->s_blocksize); + if (!bh) { + ext3_error (sb, "read_block_bitmap", + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %lu", + block_group, (unsigned long) gdp->bg_block_bitmap); + retval = -EIO; + } + /* + * On IO error, just leave a zero in the superblock's block pointer for + * this group. The IO will be retried next time. + */ + error_out: + sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group; + sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh; + return retval; + } + + /* + * load_block_bitmap loads the block bitmap for a blocks group + * + * It maintains a cache for the last bitmaps loaded. This cache is managed + * with a LRU algorithm. + * + * Notes: + * 1/ There is one cache per mounted file system. + * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, + * this function reads the bitmap without maintaining a LRU cache. + * + * Return the slot used to store the bitmap, or a -ve error code. 
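
As an aside on ext3_get_group_desc() above: it locates a group descriptor by splitting the group number into a descriptor-block index and a slot within that block. A minimal user-space sketch of the same arithmetic, assuming the classic 32-byte ext3 group descriptor and a 4096-byte block (illustrative numbers, not taken from this patch):

#include <stdio.h>

/* Illustrative geometry: 4 KB blocks, 32-byte group descriptors. */
#define BLOCK_SIZE     4096
#define DESC_SIZE      32
#define DESC_PER_BLOCK (BLOCK_SIZE / DESC_SIZE)  /* 128 */

int main(void)
{
        unsigned int block_group = 300;

        /* Same split as ext3_get_group_desc(). */
        unsigned long group_desc = block_group / DESC_PER_BLOCK;
        unsigned long desc       = block_group % DESC_PER_BLOCK;

        /* Prints: group 300 -> descriptor block 2, slot 44 */
        printf("group %u -> descriptor block %lu, slot %lu\n",
               block_group, group_desc, desc);
        return 0;
}
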
+ */ + static int __load_block_bitmap (struct super_block * sb, + unsigned int block_group) + { + int i, j, retval = 0; + unsigned long block_bitmap_number; + struct buffer_head * block_bitmap; + + if (block_group >= sb->u.ext3_sb.s_groups_count) + ext3_panic (sb, "load_block_bitmap", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sb->u.ext3_sb.s_groups_count); + + if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) { + if (sb->u.ext3_sb.s_block_bitmap[block_group]) { + if (sb->u.ext3_sb.s_block_bitmap_number[block_group] == + block_group) + return block_group; + ext3_error (sb, "__load_block_bitmap", + "block_group != block_bitmap_number"); + } + retval = read_block_bitmap (sb, block_group, block_group); + if (retval < 0) + return retval; + return block_group; + } + + for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps && + sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++) + ; + if (i < sb->u.ext3_sb.s_loaded_block_bitmaps && + sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) { + block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i]; + block_bitmap = sb->u.ext3_sb.s_block_bitmap[i]; + for (j = i; j > 0; j--) { + sb->u.ext3_sb.s_block_bitmap_number[j] = + sb->u.ext3_sb.s_block_bitmap_number[j - 1]; + sb->u.ext3_sb.s_block_bitmap[j] = + sb->u.ext3_sb.s_block_bitmap[j - 1]; + } + sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number; + sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap; + + /* + * There's still one special case here --- if block_bitmap == 0 + * then our last attempt to read the bitmap failed and we have + * just ended up caching that failure. Try again to read it. + */ + if (!block_bitmap) + retval = read_block_bitmap (sb, block_group, 0); + } else { + if (sb->u.ext3_sb.s_loaded_block_bitmapsu.ext3_sb.s_loaded_block_bitmaps++; + else + brelse (sb->u.ext3_sb.s_block_bitmap + [EXT3_MAX_GROUP_LOADED - 1]); + for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1; + j > 0; j--) { + sb->u.ext3_sb.s_block_bitmap_number[j] = + sb->u.ext3_sb.s_block_bitmap_number[j - 1]; + sb->u.ext3_sb.s_block_bitmap[j] = + sb->u.ext3_sb.s_block_bitmap[j - 1]; + } + retval = read_block_bitmap (sb, block_group, 0); + } + return retval; + } + + /* + * Load the block bitmap for a given block group. First of all do a couple + * of fast lookups for common cases and then pass the request onto the guts + * of the bitmap loader. + * + * Return the slot number of the group in the superblock bitmap cache's on + * success, or a -ve error code. + * + * There is still one inconsistency here --- if the number of groups in this + * filesystems is <= EXT3_MAX_GROUP_LOADED, then we have no way of + * differentiating between a group for which we have never performed a bitmap + * IO request, and a group for which the last bitmap read request failed. + */ + static inline int load_block_bitmap (struct super_block * sb, + unsigned int block_group) + { + int slot; + + /* + * Do the lookup for the slot. First of all, check if we're asking + * for the same slot as last time, and did we succeed that last time? + */ + if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 && + sb->u.ext3_sb.s_block_bitmap_number[0] == block_group && + sb->u.ext3_sb.s_block_bitmap[0]) { + return 0; + } + /* + * Or can we do a fast lookup based on a loaded group on a filesystem + * small enough to be mapped directly into the superblock? 
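
The bitmap cache described above and managed by __load_block_bitmap() is a small move-to-front array: a hit is promoted to slot 0 by shifting the earlier entries down, while a miss loads into slot 0 and the least-recently-used entry falls off the end. A stand-alone sketch of just that reordering step, with plain integers standing in for the cached bitmap buffers (not the kernel API):

/*
 * Promote cache[i] to the front, shifting slots 0..i-1 down by one,
 * mirroring the shuffle __load_block_bitmap() performs on the
 * s_block_bitmap_number[]/s_block_bitmap[] pairs.
 */
static void move_to_front(int cache[], int i)
{
        int hit = cache[i];

        while (i > 0) {
                cache[i] = cache[i - 1];
                i--;
        }
        cache[0] = hit;
}

After any lookup the most-recently-used group sits in slot 0, which is what the fast path in load_block_bitmap() checks first.
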
+ */ + else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED && + sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group + && sb->u.ext3_sb.s_block_bitmap[block_group]) { + slot = block_group; + } + /* + * If not, then do a full lookup for this block group. + */ + else { + slot = __load_block_bitmap (sb, block_group); + } + + /* + * <0 means we just got an error + */ + if (slot < 0) + return slot; + + /* + * If it's a valid slot, we may still have cached a previous IO error, + * in which case the bh in the superblock cache will be zero. + */ + if (!sb->u.ext3_sb.s_block_bitmap[slot]) + return -EIO; + + /* + * Must have been read in OK to get this far. + */ + return slot; + } + + /* Free given blocks, update quota and i_blocks field */ + void ext3_free_blocks (handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count) + { + struct buffer_head *bitmap_bh; + struct buffer_head *gd_bh; + unsigned long block_group; + unsigned long bit; + unsigned long i; + int bitmap_nr; + unsigned long overflow; + struct super_block * sb; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + int err = 0, ret; + int dquot_freed_blocks = 0; + + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); + return; + } + lock_super (sb); + es = sb->u.ext3_sb.s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + (block + count) > le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext3_debug ("freeing block %lu\n", block); + + do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. + */ + if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + bitmap_nr = load_block_bitmap (sb, block_group); + if (bitmap_nr < 0) + goto error_return; + + bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; + gdp = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!gdp) + goto error_return; + + if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(gdp->bg_inode_table), + sb->u.ext3_sb.s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), + sb->u.ext3_sb.s_itb_per_group)) + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks in system zones - " + "Block = %lu, count = %lu", + block, count); + + /* + * We are about to start releasing blocks in the bitmap, + * so we need undo access. + */ + /* @@@ check errors */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (err) + goto error_return; + + for (i = 0; i < count; i++) { + /* + * An HJ special. This is expensive... 
+ */ + #ifdef CONFIG_JBD_DEBUG + { + struct buffer_head *debug_bh; + debug_bh = get_hash_table(sb->s_dev, block + i, + sb->s_blocksize); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "Deleted!"); + if (!bh2jh(bitmap_bh)->b_committed_data) + BUFFER_TRACE(debug_bh, + "No commited data in bitmap"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); + __brelse(debug_bh); + } + } + #endif + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) { + ext3_error (sb, __FUNCTION__, + "bit already cleared for block %lu", + block + i); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + dquot_freed_blocks++; + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1); + es->s_free_blocks_count = + cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1); + } + /* @@@ This prevents newly-allocated data from being + * freed and then reallocated within the same + * transaction. + * + * Ideally we would want to allow that to happen, but to + * do so requires making journal_forget() capable of + * revoking the queued write of a data block, which + * implies blocking on the journal lock. *forget() + * cannot block due to truncate races. + * + * Eventually we can fix this by making journal_forget() + * return a status indicating whether or not it was able + * to revoke the buffer. On successful revoke, it is + * safe not to set the allocation bit in the committed + * bitmap, because we know that there is no outstanding + * activity on the buffer any more and so it is safe to + * reallocate it. + */ + BUFFER_TRACE(bitmap_bh, "clear in b_committed_data"); + J_ASSERT_BH(bitmap_bh, + bh2jh(bitmap_bh)->b_committed_data != NULL); + ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data); + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + + /* And the superblock */ + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock"); + ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + if (!err) err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + sb->s_dirt = 1; + error_return: + ext3_std_error(sb, err); + unlock_super(sb); + if (dquot_freed_blocks) + DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); + return; + } + + /* For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes. + */ + static int ext3_test_allocatable(int nr, struct buffer_head *bh) + { + if (ext3_test_bit(nr, bh->b_data)) + return 0; + if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data) + return 1; + return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data); + } + + /* + * Find an allocatable block in a bitmap. 
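
The predicate in ext3_test_allocatable() above encodes the rule spelled out in the preceding comment: a block may be handed out only if it is free in the live bitmap and, when a last-committed copy exists, free there as well. A small stand-alone sketch of the same test over plain byte arrays (bit_set() and the two maps are illustrative stand-ins, not the kernel's bitmap helpers):

/* Illustrative bit test on a plain byte array. */
static int bit_set(const unsigned char *map, int nr)
{
        return (map[nr >> 3] >> (nr & 7)) & 1;
}

/*
 * Mirror of ext3_test_allocatable(): not allocatable if in use now,
 * allocatable if there is no committed copy, otherwise allocatable
 * only if it was also free in the committed copy.
 */
static int allocatable(int nr, const unsigned char *live,
                       const unsigned char *committed)
{
        if (bit_set(live, nr))
                return 0;
        if (!committed)
                return 1;
        return !bit_set(committed, nr);
}

A block freed earlier in the running transaction is clear in the live map but still set in the committed copy, so it stays unallocatable until the commit completes.
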
We honour both the bitmap and + * its last-committed copy (if that exists), and perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; then + * for any free bit in the bitmap. + */ + static int find_next_usable_block(int start, + struct buffer_head *bh, int maxblocks) + { + int here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block within the next XX blocks. + * + * end_goal is more or less random, but it has to be + * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple.. + */ + int end_goal = (start + 63) & ~63; + here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal && ext3_test_allocatable(here, bh)) + return here; + + ext3_debug ("Bit not found near goal\n"); + + } + + here = start; + if (here < 0) + here = 0; + + /* + * There has been no free block found in the near vicinity of + * the goal: do a search forward through the block groups, + * searching in each group first for an entire free byte in the + * bitmap and then for any free bit. + * + * Search first in the remainder of the current group + */ + p = ((char *) bh->b_data) + (here >> 3); + r = memscan(p, 0, (maxblocks - here + 7) >> 3); + next = (r - ((char *) bh->b_data)) << 3; + + if (next < maxblocks && ext3_test_allocatable(next, bh)) + return next; + + /* The bitmap search --- search forward alternately + * through the actual bitmap and the last-committed copy + * until we find a bit free in both. */ + + while (here < maxblocks) { + next = ext3_find_next_zero_bit ((unsigned long *) bh->b_data, + maxblocks, here); + if (next >= maxblocks) + return -1; + if (ext3_test_allocatable(next, bh)) + return next; + + J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data); + here = ext3_find_next_zero_bit + ((unsigned long *) bh2jh(bh)->b_committed_data, + maxblocks, next); + } + return -1; + } + + /* + * ext3_new_block uses a goal block to assist allocation. If the goal is + * free, or there is a free block within 32 blocks of the goal, that block + * is allocated. Otherwise a forward search is made for a free block; within + * each block group the search first looks for an entire free byte in the block + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ + int ext3_new_block (handle_t *handle, struct inode * inode, + unsigned long goal, u32 * prealloc_count, + u32 * prealloc_block, int * errp) + { + struct buffer_head * bh, *bhtmp; + struct buffer_head * bh2; + #if 0 + char * p, * r; + #endif + int i, j, k, tmp, alloctmp; + int bitmap_nr; + int fatal = 0, err; + struct super_block * sb; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + #ifdef EXT3FS_DEBUG + static int goal_hits = 0, goal_attempts = 0; + #endif + *errp = -ENOSPC; + sb = inode->i_sb; + if (!sb) { + printk ("ext3_new_block: nonexistent device"); + return 0; + } + + /* + * Check quota for allocation of this block. + */ + if (DQUOT_ALLOC_BLOCK(inode, 1)) { + *errp = -EDQUOT; + return 0; + } + + lock_super (sb); + es = sb->u.ext3_sb.s_es; + if (le32_to_cpu(es->s_free_blocks_count) <= + le32_to_cpu(es->s_r_blocks_count) && + ((sb->u.ext3_sb.s_resuid != current->fsuid) && + (sb->u.ext3_sb.s_resgid == 0 || + !in_group_p (sb->u.ext3_sb.s_resgid)) && + !capable(CAP_SYS_RESOURCE))) + goto out; + + ext3_debug ("goal=%lu.\n", goal); + + /* + * First, test whether the goal block is free. 
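
One detail worth a worked check: end_goal in find_next_usable_block() above rounds the goal up to the next multiple of 64, keeping the near-goal scan inside a small aligned window. For example (values chosen purely for illustration):

#include <stdio.h>

int main(void)
{
        int start = 100;
        /* Same rounding expression as in find_next_usable_block(). */
        int end_goal = (start + 63) & ~63;

        /* Prints: start 100 -> end_goal 128 */
        printf("start %d -> end_goal %d\n", start, end_goal);
        return 0;
}
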
+ */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + i = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + gdp = ext3_get_group_desc (sb, i, &bh2); + if (!gdp) + goto io_error; + + if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { + j = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); + #ifdef EXT3FS_DEBUG + if (j) + goal_attempts++; + #endif + bitmap_nr = load_block_bitmap (sb, i); + if (bitmap_nr < 0) + goto io_error; + + bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; + + ext3_debug ("goal is at %d:%d.\n", i, j); + + if (ext3_test_allocatable(j, bh)) { + #ifdef EXT3FS_DEBUG + goal_hits++; + ext3_debug ("goal bit allocated.\n"); + #endif + goto got_block; + } + + j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb)); + if (j >= 0) + goto search_back; + } + + ext3_debug ("Bit not found in block group %d.\n", i); + + /* + * Now search the rest of the groups. We assume that + * i and gdp correctly point to the last group visited. + */ + for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) { + i++; + if (i >= sb->u.ext3_sb.s_groups_count) + i = 0; + gdp = ext3_get_group_desc (sb, i, &bh2); + if (!gdp) { + *errp = -EIO; + goto out; + } + if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { + bitmap_nr = load_block_bitmap (sb, i); + if (bitmap_nr < 0) + goto io_error; + + bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; + j = find_next_usable_block(-1, bh, + EXT3_BLOCKS_PER_GROUP(sb)); + if (j >= 0) + goto search_back; + } + } + + /* No space left on the device */ + unlock_super (sb); + return 0; + + search_back: + /* + * We have succeeded in finding a free byte in the block + * bitmap. Now search backwards up to 7 bits to find the + * start of this group of free blocks. + */ + for ( k = 0; + k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh); + k++, j--) + ; + + got_block: + + ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count); + + /* Make sure we use undo access for the bitmap, because it is + critical that we do the frozen_data COW on bitmap buffers in + all cases even if the buffer is in BJ_Forget state in the + committing transaction. */ + BUFFER_TRACE(bh, "get undo access for marking new block"); + fatal = ext3_journal_get_undo_access(handle, bh); + if (fatal) goto out; + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto out; + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (fatal) goto out; + + tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); + + if (tmp == le32_to_cpu(gdp->bg_block_bitmap) || + tmp == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range (tmp, le32_to_cpu(gdp->bg_inode_table), + sb->u.ext3_sb.s_itb_per_group)) + ext3_error (sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", tmp); + + /* The superblock lock should guard against anybody else beating + * us to this point! 
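
Before touching any bitmap, ext3_new_block() above converts the goal block number into a (group, bit) pair. A worked example of that conversion, assuming a 1 KB block size with 8192 blocks per group and first data block 1 (typical defaults, used here only for illustration):

#include <stdio.h>

int main(void)
{
        /* Illustrative geometry for a 1 KB-block filesystem. */
        unsigned long first_data_block = 1;
        unsigned long blocks_per_group = 8192;
        unsigned long goal = 20000;

        /* Same decomposition as ext3_new_block(). */
        unsigned long group = (goal - first_data_block) / blocks_per_group;
        unsigned long bit   = (goal - first_data_block) % blocks_per_group;

        /* Prints: goal 20000 -> group 2, bit 3615 */
        printf("goal %lu -> group %lu, bit %lu\n", goal, group, bit);
        return 0;
}
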
*/ + J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data)); + BUFFER_TRACE(bh, "setting bitmap bit"); + ext3_set_bit(j, bh->b_data); + + #ifdef CONFIG_JBD_DEBUG + { + struct buffer_head *debug_bh; + + /* Record bitmap buffer state in the newly allocated block */ + debug_bh = get_hash_table(sb->s_dev, tmp, sb->s_blocksize); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "state when allocated"); + BUFFER_TRACE2(debug_bh, bh, "bitmap state"); + brelse(debug_bh); + } + } + #endif + if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data) + J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data)); + bhtmp = bh; + alloctmp = j; + + ext3_debug ("found bit %d\n", j); + + /* + * Do block preallocation now if required. + */ + #ifdef EXT3_PREALLOCATE + /* + * akpm: this is not enabled for ext3. Need to use + * ext3_test_allocatable() + */ + /* Writer: ->i_prealloc* */ + if (prealloc_count && !*prealloc_count) { + int prealloc_goal; + unsigned long next_block = tmp + 1; + + prealloc_goal = es->s_prealloc_blocks ? + es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS; + + *prealloc_block = next_block; + /* Writer: end */ + for (k = 1; + k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb); + k++, next_block++) { + if (DQUOT_PREALLOC_BLOCK(inode, 1)) + break; + /* Writer: ->i_prealloc* */ + if (*prealloc_block + *prealloc_count != next_block || + ext3_set_bit (j + k, bh->b_data)) { + /* Writer: end */ + DQUOT_FREE_BLOCK(inode, 1); + break; + } + (*prealloc_count)++; + /* Writer: end */ + } + /* + * As soon as we go for per-group spinlocks we'll need these + * done inside the loop above. + */ + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - + (k - 1)); + es->s_free_blocks_count = + cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - + (k - 1)); + ext3_debug ("Preallocated a further %lu bits.\n", + (k - 1)); + } + #endif + + j = tmp; + + BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) fatal = err; + + if (j >= le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_new_block", + "block(%d) >= blocks count(%d) - " + "block_group = %d, es == %p ",j, + le32_to_cpu(es->s_blocks_count), i, es); + goto out; + } + + /* + * It is up to the caller to add the new buffer to a journal + * list of some description. We don't know in advance whether + * the caller wants to use it as metadata or data. + */ + + ext3_debug ("allocating block %d. 
" + "Goal hits %d of %d.\n", j, goal_hits, goal_attempts); + + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); + es->s_free_blocks_count = + cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1); + + BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + + BUFFER_TRACE(bh, "journal_dirty_metadata for superblock"); + err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + if (!fatal) fatal = err; + + sb->s_dirt = 1; + if (fatal) + goto out; + + unlock_super (sb); + *errp = 0; + return j; + + io_error: + *errp = -EIO; + out: + if (fatal) { + *errp = fatal; + ext3_std_error(sb, fatal); + } + unlock_super (sb); + return 0; + + } + + unsigned long ext3_count_free_blocks (struct super_block * sb) + { + #ifdef EXT3FS_DEBUG + struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + lock_super (sb); + es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + bitmap_nr = load_block_bitmap (sb, i); + if (bitmap_nr < 0) + continue; + + x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr], + sb->s_blocksize); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); + unlock_super (sb); + return bitmap_count; + #else + return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count); + #endif + } + + static inline int block_in_use (unsigned long block, + struct super_block * sb, + unsigned char * map) + { + return ext3_test_bit ((block - + le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb), map); + } + + static inline int test_root(int a, int b) + { + if (a == 0) + return 1; + while (1) { + if (a == 1) + return 1; + if (a % b) + return 0; + a = a / b; + } + } + + int ext3_group_sparse(int group) + { + return (test_root(group, 3) || test_root(group, 5) || + test_root(group, 7)); + } + + /** + * ext3_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1. + */ + int ext3_bg_has_super(struct super_block *sb, int group) + { + if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& + !ext3_group_sparse(group)) + return 0; + return 1; + } + + /** + * ext3_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group. 
+ */ + unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) + { + if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& + !ext3_group_sparse(group)) + return 0; + return EXT3_SB(sb)->s_gdb_count; + } + + #ifdef CONFIG_EXT3_CHECK + /* Called at mount-time, super-block is locked */ + void ext3_check_blocks_bitmap (struct super_block * sb) + { + struct buffer_head * bh; + struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x, j; + unsigned long desc_blocks; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + bitmap_nr = load_block_bitmap (sb, i); + if (bitmap_nr < 0) + continue; + + bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; + + if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data)) + ext3_error(sb, __FUNCTION__, + "Superblock in group %d is marked free", i); + + desc_blocks = ext3_bg_num_gdb(sb, i); + for (j = 0; j < desc_blocks; j++) + if (!ext3_test_bit(j + 1, bh->b_data)) + ext3_error(sb, __FUNCTION__, + "Descriptor block #%ld in group " + "%d is marked free", j, i); + + if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap), + sb, bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Block bitmap for group %d is marked free", + i); + + if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap), + sb, bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Inode bitmap for group %d is marked free", + i); + + for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++) + if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j, + sb, bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Block #%d of the inode table in " + "group %d is marked free", j, i); + + x = ext3_count_free (bh, sb->s_blocksize); + if (le16_to_cpu(gdp->bg_free_blocks_count) != x) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Wrong free blocks count for group %d, " + "stored = %d, counted = %lu", i, + le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Wrong free blocks count in super block, " + "stored = %lu, counted = %lu", + (unsigned long)le32_to_cpu(es->s_free_blocks_count), + bitmap_count); + } + #endif diff -rc2P linux/fs/ext3/bitmap.c linux-2.4.13/fs/ext3/bitmap.c *** linux/fs/ext3/bitmap.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/bitmap.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,26 ---- + /* + * linux/fs/ext3/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + + #include + + + static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; + + unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) + { + unsigned int i; + unsigned long sum = 0; + + if (!map) + return (0); + for (i = 0; i < numchars; i++) + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return (sum); + } diff -rc2P linux/fs/ext3/dir.c linux-2.4.13/fs/ext3/dir.c *** linux/fs/ext3/dir.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/dir.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,190 ---- + /* + * linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 
1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + + #include + #include + #include + + static unsigned char ext3_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK + }; + + static int ext3_readdir(struct file *, void *, filldir_t); + + struct file_operations ext3_dir_operations = { + read: generic_read_dir, + readdir: ext3_readdir, /* BKL held */ + ioctl: ext3_ioctl, /* BKL held */ + fsync: ext3_sync_file, /* BKL held */ + }; + + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, + unsigned long offset) + { + const char * error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + + if (rlen < EXT3_DIR_REC_LEN(1)) + error_msg = "rec_len is smaller than minimal"; + else if (rlen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) + error_msg = "rec_len is too small for name_len"; + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) + error_msg = "directory entry across blocks"; + else if (le32_to_cpu(de->inode) > + le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) + error_msg = "inode out of bounds"; + + if (error_msg != NULL) + ext3_error (dir->i_sb, function, + "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, offset, + (unsigned long) le32_to_cpu(de->inode), + rlen, de->name_len); + return error_msg == NULL ? 1 : 0; + } + + static int ext3_readdir(struct file * filp, + void * dirent, filldir_t filldir) + { + int error = 0; + unsigned long offset, blk; + int i, num, stored; + struct buffer_head * bh, * tmp, * bha[16]; + struct ext3_dir_entry_2 * de; + struct super_block * sb; + int err; + struct inode *inode = filp->f_dentry->d_inode; + + sb = inode->i_sb; + + stored = 0; + bh = NULL; + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < inode->i_size) { + blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb); + bh = ext3_bread (0, inode, blk, 0, &err); + if (!bh) { + ext3_error (sb, "ext3_readdir", + "directory #%lu contains a hole at offset %lu", + inode->i_ino, (unsigned long)filp->f_pos); + filp->f_pos += sb->s_blocksize - offset; + continue; + } + + /* + * Do the readahead + */ + if (!offset) { + for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0; + i > 0; i--) { + tmp = ext3_getblk (NULL, inode, ++blk, 0, &err); + if (tmp && !buffer_uptodate(tmp) && + !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse (tmp); + } + if (num) { + ll_rw_block (READA, num, bha); + for (i = 0; i < num; i++) + brelse (bha[i]); + } + } + + revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext3_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. 
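
The rec_len sanity checks in ext3_check_dir_entry() above compare against EXT3_DIR_REC_LEN(), which (as defined in ext3_fs.h, not shown in this patch) is the 8-byte fixed entry header plus the name length, rounded up to a multiple of 4. A quick sketch of that rounding under that assumption:

#include <stdio.h>

/* Assumed to match EXT3_DIR_REC_LEN() in ext3_fs.h:
 * 8-byte header + name_len, rounded up to a multiple of 4. */
#define DIR_REC_LEN(name_len) (((name_len) + 8 + 3) & ~3)

int main(void)
{
        /* 12: the minimal rec_len used by the "smaller than minimal" check */
        printf("%d\n", DIR_REC_LEN(1));
        /* 16: e.g. an entry whose name is "hello" */
        printf("%d\n", DIR_REC_LEN(5));
        return 0;
}
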
A + * failure will be detected in the + * dirent test below. */ + if (le16_to_cpu(de->rec_len) < + EXT3_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); + if (!ext3_check_dir_entry ("ext3_readdir", inode, de, + bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse (bh); + return stored; + } + offset += le16_to_cpu(de->rec_len); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + unsigned long version = filp->f_version; + unsigned char d_type = DT_UNKNOWN; + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_FILETYPE) + && de->file_type < EXT3_FT_MAX) + d_type = + ext3_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + d_type); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored ++; + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse (bh); + } + UPDATE_ATIME(inode); + return 0; + } diff -rc2P linux/fs/ext3/file.c linux-2.4.13/fs/ext3/file.c *** linux/fs/ext3/file.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/file.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,97 ---- + /* + * linux/fs/ext3/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 fs regular file handling primitives + * + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + + #include + #include + #include + #include + #include + #include + #include + + /* + * Called when an inode is released. Note that this is different + * from ext3_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ + static int ext3_release_file (struct inode * inode, struct file * filp) + { + if (filp->f_mode & FMODE_WRITE) + ext3_discard_prealloc (inode); + return 0; + } + + /* + * Called when an inode is about to be opened. + * We use this to disallow opening RW large files on 32bit systems if + * the caller didn't specify O_LARGEFILE. On 64bit systems we force + * on this flag in sys_open. + */ + static int ext3_open_file (struct inode * inode, struct file * filp) + { + if (!(filp->f_flags & O_LARGEFILE) && + inode->i_size > 0x7FFFFFFFLL) + return -EFBIG; + return 0; + } + + /* + * ext3_file_write(). + * + * Most things are done in ext3_prepare_write() and ext3_commit_write(). 
+ */ + + static ssize_t + ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) + { + int ret; + struct inode *inode = file->f_dentry->d_inode; + + ret = generic_file_write(file, buf, count, ppos); + if ((ret >= 0) && IS_SYNC(inode)) { + if (file->f_flags & O_SYNC) { + /* + * generic_osync_inode() has already done the sync + */ + } else { + int ret2 = ext3_force_commit(inode->i_sb); + if (ret2) + ret = ret2; + } + } + return ret; + } + + struct file_operations ext3_file_operations = { + llseek: generic_file_llseek, /* BKL held */ + read: generic_file_read, /* BKL not held. Don't need */ + write: ext3_file_write, /* BKL not held. Don't need */ + ioctl: ext3_ioctl, /* BKL held */ + mmap: generic_file_mmap, + open: ext3_open_file, /* BKL not held. Don't need */ + release: ext3_release_file, /* BKL not held. Don't need */ + fsync: ext3_sync_file, /* BKL held */ + }; + + struct inode_operations ext3_file_inode_operations = { + truncate: ext3_truncate, /* BKL held */ + setattr: ext3_setattr, /* BKL held */ + }; + diff -rc2P linux/fs/ext3/fsync.c linux-2.4.13/fs/ext3/fsync.c *** linux/fs/ext3/fsync.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/fsync.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,69 ---- + /* + * linux/fs/ext3/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + + #include + #include + #include + #include + #include + #include + #include + + /* + * akpm: A new design for ext3_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. (AKPM: quotas?) + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + * + * Note that there is a serious optimisation we can make here: if the current + * inode is not part of j_running_transaction or j_committing_transaction + * then we have nothing to do. That would require implementation of t_ilist, + * which isn't too hard. + */ + + int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) + { + struct inode *inode = dentry->d_inode; + int ret; + + J_ASSERT(ext3_journal_current_handle() == 0); + + /* + * fsync_inode_buffers() just walks i_dirty_buffers and waits + * on them. It's a no-op for full data journalling because + * i_dirty_buffers will be ampty. + * Really, we only need to start I/O on the dirty buffers - + * we'll end up waiting on them in commit. 
+ */ + ret = fsync_inode_buffers(inode); + + ext3_force_commit(inode->i_sb); + + return ret; + } diff -rc2P linux/fs/ext3/ialloc.c linux-2.4.13/fs/ext3/ialloc.c *** linux/fs/ext3/ialloc.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/ialloc.c Fri Nov 9 17:03:46 2001 *************** *** 0 **** --- 1,664 ---- + /* + * linux/fs/ext3/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + + /* + * ialloc.c contains the inodes allocation and deallocation routines + */ + + /* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext3_read_super). + */ + + + /* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return >=0 on success or a -ve error code. + */ + static int read_inode_bitmap (struct super_block * sb, + unsigned long block_group, + unsigned int bitmap_nr) + { + struct ext3_group_desc * gdp; + struct buffer_head * bh = NULL; + int retval = 0; + + gdp = ext3_get_group_desc (sb, block_group, NULL); + if (!gdp) { + retval = -EIO; + goto error_out; + } + bh = bread (sb->s_dev, + le32_to_cpu(gdp->bg_inode_bitmap), sb->s_blocksize); + if (!bh) { + ext3_error (sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %lu", + block_group, (unsigned long) gdp->bg_inode_bitmap); + retval = -EIO; + } + /* + * On IO error, just leave a zero in the superblock's block pointer for + * this group. The IO will be retried next time. + */ + error_out: + sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group; + sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh; + return retval; + } + + /* + * load_inode_bitmap loads the inode bitmap for a blocks group + * + * It maintains a cache for the last bitmaps loaded. This cache is managed + * with a LRU algorithm. + * + * Notes: + * 1/ There is one cache per mounted file system. + * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, + * this function reads the bitmap without maintaining a LRU cache. + * + * Return the slot used to store the bitmap, or a -ve error code. 
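+ *
+ * The usual calling pattern, as used by ext3_free_inode() and
+ * ext3_new_inode() below, is therefore:
+ *
+ *	bitmap_nr = load_inode_bitmap(sb, block_group);
+ *	if (bitmap_nr < 0)
+ *		goto error;
+ *	bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];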
+ */ + static int load_inode_bitmap (struct super_block * sb, + unsigned int block_group) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long inode_bitmap_number; + struct buffer_head * inode_bitmap; + int i, j, retval = 0; + + if (block_group >= sbi->s_groups_count) + ext3_panic (sb, "load_inode_bitmap", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + if (sbi->s_loaded_inode_bitmaps > 0 && + sbi->s_inode_bitmap_number[0] == block_group && + sbi->s_inode_bitmap[0] != NULL) + return 0; + if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) { + if (sbi->s_inode_bitmap[block_group]) { + if (sbi->s_inode_bitmap_number[block_group] != + block_group) + ext3_panic(sb, "load_inode_bitmap", + "block_group != inode_bitmap_number"); + return block_group; + } + retval = read_inode_bitmap(sb, block_group, block_group); + if (retval < 0) + return retval; + return block_group; + } + + for (i = 0; i < sbi->s_loaded_inode_bitmaps && + sbi->s_inode_bitmap_number[i] != block_group; i++) + /* do nothing */; + if (i < sbi->s_loaded_inode_bitmaps && + sbi->s_inode_bitmap_number[i] == block_group) { + inode_bitmap_number = sbi->s_inode_bitmap_number[i]; + inode_bitmap = sbi->s_inode_bitmap[i]; + for (j = i; j > 0; j--) { + sbi->s_inode_bitmap_number[j] = + sbi->s_inode_bitmap_number[j - 1]; + sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; + } + sbi->s_inode_bitmap_number[0] = inode_bitmap_number; + sbi->s_inode_bitmap[0] = inode_bitmap; + + /* + * There's still one special case here --- if inode_bitmap == 0 + * then our last attempt to read the bitmap failed and we have + * just ended up caching that failure. Try again to read it. + */ + if (!inode_bitmap) + retval = read_inode_bitmap (sb, block_group, 0); + } else { + if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED) + sbi->s_loaded_inode_bitmaps++; + else + brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]); + for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) { + sbi->s_inode_bitmap_number[j] = + sbi->s_inode_bitmap_number[j - 1]; + sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; + } + retval = read_inode_bitmap (sb, block_group, 0); + } + return retval; + } + + /* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk. 
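+ *
+ * The bitmap bookkeeping itself is simple: inode numbers start at 1,
+ * so inode ino lives in block group (ino - 1) / EXT3_INODES_PER_GROUP(sb),
+ * at bit (ino - 1) % EXT3_INODES_PER_GROUP(sb) of that group's inode
+ * bitmap.  For example, with 16384 inodes per group (an illustrative
+ * figure), inode 16385 is bit 0 of group 1.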
+ */ + void ext3_free_inode (handle_t *handle, struct inode * inode) + { + struct super_block * sb = inode->i_sb; + int is_directory; + unsigned long ino; + struct buffer_head * bh; + struct buffer_head * bh2; + unsigned long block_group; + unsigned long bit; + int bitmap_nr; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + int fatal = 0, err; + + if (!inode->i_dev) { + printk ("ext3_free_inode: inode has no device\n"); + return; + } + if (atomic_read(&inode->i_count) > 1) { + printk ("ext3_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); + return; + } + if (inode->i_nlink) { + printk ("ext3_free_inode: inode has nlink=%d\n", + inode->i_nlink); + return; + } + if (!sb) { + printk("ext3_free_inode: inode on nonexistent device\n"); + return; + } + + ino = inode->i_ino; + ext3_debug ("freeing inode %lu\n", ino); + + /* + * Note: we must free any quota before locking the superblock, + * as writing the quota to disk may need the lock as well. + */ + DQUOT_INIT(inode); + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + + is_directory = S_ISDIR(inode->i_mode); + + /* Do this BEFORE marking the inode not in use or returning an error */ + clear_inode (inode); + + lock_super (sb); + es = sb->u.ext3_sb.s_es; + if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_free_inode", + "reserved or nonexistent inode %lu", ino); + goto error_return; + } + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + bitmap_nr = load_inode_bitmap (sb, block_group); + if (bitmap_nr < 0) + goto error_return; + + bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; + + BUFFER_TRACE(bh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh); + if (fatal) + goto error_return; + + /* Ok, now we can actually update the inode bitmaps.. */ + if (!ext3_clear_bit (bit, bh->b_data)) + ext3_error (sb, "ext3_free_inode", + "bit already cleared for inode %lu", ino); + else { + gdp = ext3_get_group_desc (sb, block_group, &bh2); + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto error_return; + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access"); + fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (fatal) goto error_return; + + if (gdp) { + gdp->bg_free_inodes_count = cpu_to_le16( + le16_to_cpu(gdp->bg_free_inodes_count) + 1); + if (is_directory) + gdp->bg_used_dirs_count = cpu_to_le16( + le16_to_cpu(gdp->bg_used_dirs_count) - 1); + } + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + es->s_free_inodes_count = + cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1); + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, + "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + if (!fatal) fatal = err; + } + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) + fatal = err; + sb->s_dirt = 1; + error_return: + ext3_std_error(sb, fatal); + unlock_super(sb); + } + + /* + * There are two policies for allocating an inode. If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of + * the groups with above-average free space, that group with the fewest + * directories already is chosen. 
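+ *
+ * Concretely, the directory case below computes the mean number of free
+ * inodes per group,
+ *
+ *	avefreei = s_free_inodes_count / s_groups_count;
+ *
+ * and then, among the groups that still have at least avefreei free
+ * inodes, picks the one with the most free blocks.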
+ * + * For other inodes, search forward from the parent directory's block + * group to find a free inode. + */ + struct inode * ext3_new_inode (handle_t *handle, + const struct inode * dir, int mode) + { + struct super_block * sb; + struct buffer_head * bh; + struct buffer_head * bh2; + int i, j, avefreei; + struct inode * inode; + int bitmap_nr; + struct ext3_group_desc * gdp; + struct ext3_group_desc * tmp; + struct ext3_super_block * es; + int err = 0; + + /* Cannot create files in a deleted directory */ + if (!dir || !dir->i_nlink) + return ERR_PTR(-EPERM); + + sb = dir->i_sb; + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + init_rwsem(&inode->u.ext3_i.truncate_sem); + + lock_super (sb); + es = sb->u.ext3_sb.s_es; + repeat: + gdp = NULL; + i = 0; + + if (S_ISDIR(mode)) { + avefreei = le32_to_cpu(es->s_free_inodes_count) / + sb->u.ext3_sb.s_groups_count; + if (!gdp) { + for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) { + struct buffer_head *temp_buffer; + tmp = ext3_get_group_desc (sb, j, &temp_buffer); + if (tmp && + le16_to_cpu(tmp->bg_free_inodes_count) && + le16_to_cpu(tmp->bg_free_inodes_count) >= + avefreei) { + if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) > + le16_to_cpu(gdp->bg_free_blocks_count))) { + i = j; + gdp = tmp; + bh2 = temp_buffer; + } + } + } + } + } else { + /* + * Try to place the inode in its parent directory + */ + i = dir->u.ext3_i.i_block_group; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && le16_to_cpu(tmp->bg_free_inodes_count)) + gdp = tmp; + else + { + /* + * Use a quadratic hash to find a group with a + * free inode + */ + for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) { + i += j; + if (i >= sb->u.ext3_sb.s_groups_count) + i -= sb->u.ext3_sb.s_groups_count; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && + le16_to_cpu(tmp->bg_free_inodes_count)) { + gdp = tmp; + break; + } + } + } + if (!gdp) { + /* + * That failed: try linear search for a free inode + */ + i = dir->u.ext3_i.i_block_group + 1; + for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) { + if (++i >= sb->u.ext3_sb.s_groups_count) + i = 0; + tmp = ext3_get_group_desc (sb, i, &bh2); + if (tmp && + le16_to_cpu(tmp->bg_free_inodes_count)) { + gdp = tmp; + break; + } + } + } + } + + err = -ENOSPC; + if (!gdp) + goto fail; + + err = -EIO; + bitmap_nr = load_inode_bitmap (sb, i); + if (bitmap_nr < 0) + goto fail; + + bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr]; + + if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data, + EXT3_INODES_PER_GROUP(sb))) < + EXT3_INODES_PER_GROUP(sb)) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) goto fail; + + if (ext3_set_bit (j, bh->b_data)) { + ext3_error (sb, "ext3_new_inode", + "bit already set for inode %d", j); + goto repeat; + } + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) goto fail; + } else { + if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) { + ext3_error (sb, "ext3_new_inode", + "Free inodes count corrupted in group %d", + i); + /* Is it really ENOSPC? 
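+ *
+ * We only get here when the group descriptor claimed free inodes but
+ * the bitmap search found none.  Treat the descriptor count as corrupt:
+ * report the error, zero the count so that we cannot spin on this group
+ * forever, and retry the whole group search from the top.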
*/ + err = -ENOSPC; + if (sb->s_flags & MS_RDONLY) + goto fail; + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh2); + if (err) goto fail; + gdp->bg_free_inodes_count = 0; + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + } + goto repeat; + } + j += i * EXT3_INODES_PER_GROUP(sb) + 1; + if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) { + ext3_error (sb, "ext3_new_inode", + "reserved inode or inode > inodes count - " + "block_group = %d,inode=%d", i, j); + err = -EIO; + goto fail; + } + + BUFFER_TRACE(bh2, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh2); + if (err) goto fail; + gdp->bg_free_inodes_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); + if (S_ISDIR(mode)) + gdp->bg_used_dirs_count = + cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); + BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (err) goto fail; + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (err) goto fail; + es->s_free_inodes_count = + cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1); + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + sb->s_dirt = 1; + if (err) goto fail; + + inode->i_uid = current->fsuid; + if (test_opt (sb, GRPID)) + inode->i_gid = dir->i_gid; + else if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current->fsgid; + inode->i_mode = mode; + + inode->i_ino = j; + /* This is the optimal IO size (for stat), not the fs block size */ + inode->i_blksize = PAGE_SIZE; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL; + if (S_ISLNK(mode)) + inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FILE_FL | EXT3_IMMUTABLE_LINK_FL | EXT3_APPEND_FL); + #ifdef EXT3_FRAGMENTS + inode->u.ext3_i.i_faddr = 0; + inode->u.ext3_i.i_frag_no = 0; + inode->u.ext3_i.i_frag_size = 0; + #endif + inode->u.ext3_i.i_file_acl = 0; + inode->u.ext3_i.i_dir_acl = 0; + inode->u.ext3_i.i_dtime = 0; + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); + #ifdef EXT3_PREALLOCATE + inode->u.ext3_i.i_prealloc_count = 0; + #endif + inode->u.ext3_i.i_block_group = i; + + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + if (IS_SYNC(inode)) + handle->h_sync = 1; + insert_inode_hash(inode); + inode->i_generation = event++; + + inode->u.ext3_i.i_state = EXT3_STATE_NEW; + err = ext3_mark_inode_dirty(handle, inode); + if (err) goto fail; + + unlock_super (sb); + if(DQUOT_ALLOC_INODE(inode)) { + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; + inode->i_nlink = 0; + iput(inode); + return ERR_PTR(-EDQUOT); + } + ext3_debug ("allocating inode %lu\n", inode->i_ino); + return inode; + + fail: + unlock_super(sb); + iput(inode); + ext3_std_error(sb, err); + return ERR_PTR(err); + } + + /* Verify that we are loading a valid orphan from disk */ + struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino) + { + ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count); + unsigned long block_group; + int bit; + int bitmap_nr; + struct buffer_head *bh; + struct inode *inode = NULL; + + /* Error cases - e2fsck has already cleaned up for us */ + 
if (ino > max_ino) { + ext3_warning(sb, __FUNCTION__, + "bad orphan ino %ld! e2fsck was run?\n", ino); + return NULL; + } + + block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); + bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); + if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 || + !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) { + ext3_warning(sb, __FUNCTION__, + "inode bitmap error for orphan %ld\n", ino); + return NULL; + } + + /* Having the inode bit set should be a 100% indicator that this + * is a valid orphan (no e2fsck run on fs). Orphans also include + * inodes that were being truncated, so we can't check i_nlink==0. + */ + if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) || + is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) { + ext3_warning(sb, __FUNCTION__, + "bad orphan inode %ld! e2fsck was run?\n", ino); + printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n", + bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data)); + printk(KERN_NOTICE "inode=%p\n", inode); + if (inode) { + printk(KERN_NOTICE "is_bad_inode(inode)=%d\n", + is_bad_inode(inode)); + printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n", + NEXT_ORPHAN(inode)); + printk(KERN_NOTICE "max_ino=%ld\n", max_ino); + } + /* Avoid freeing blocks if we got a bad deleted inode */ + if (inode && inode->i_nlink == 0) + inode->i_blocks = 0; + iput(inode); + return NULL; + } + + return inode; + } + + unsigned long ext3_count_free_inodes (struct super_block * sb) + { + #ifdef EXT3FS_DEBUG + struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + lock_super (sb); + es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + bitmap_nr = load_inode_bitmap (sb, i); + if (bitmap_nr < 0) + continue; + + x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], + EXT3_INODES_PER_GROUP(sb) / 8); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + unlock_super (sb); + return desc_count; + #else + return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count); + #endif + } + + #ifdef CONFIG_EXT3_CHECK + /* Called at mount-time, super-block is locked */ + void ext3_check_inodes_bitmap (struct super_block * sb) + { + struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + bitmap_nr = load_inode_bitmap (sb, i); + if (bitmap_nr < 0) + continue; + + x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr], + EXT3_INODES_PER_GROUP(sb) / 8); + if (le16_to_cpu(gdp->bg_free_inodes_count) != x) + ext3_error (sb, "ext3_check_inodes_bitmap", + "Wrong free inodes count in group %d, " + "stored = %d, counted = %lu", i, + le16_to_cpu(gdp->bg_free_inodes_count), x); + bitmap_count += x; + } + if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count) + ext3_error (sb, "ext3_check_inodes_bitmap", + "Wrong free 
inodes count in super block, " + "stored = %lu, counted = %lu", + (unsigned long)le32_to_cpu(es->s_free_inodes_count), + bitmap_count); + } + #endif diff -rc2P linux/fs/ext3/inode.c linux-2.4.13/fs/ext3/inode.c *** linux/fs/ext3/inode.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/inode.c Fri Nov 9 17:03:19 2001 *************** *** 0 **** --- 1,2676 ---- + /* + * linux/fs/ext3/inode.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Goal-directed block allocation by Stephen Tweedie + * (sct@redhat.com), 1993, 1998 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * 64-bit file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + * + * Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + + + /* + * SEARCH_FROM_ZERO forces each block allocation to search from the start + * of the filesystem. This is to force rapid reallocation of recently-freed + * blocks. The file fragmentation is horrendous. + */ + #undef SEARCH_FROM_ZERO + + /* The ext3 forget function must perform a revoke if we are freeing data + * which has been journaled. Metadata (eg. indirect blocks) must be + * revoked in all cases. + * + * "bh" may be NULL: a metadata block may have been freed from memory + * but there may still be a record of it in the journal, and that record + * still needs to be revoked. + */ + + static int ext3_forget(handle_t *handle, int is_metadata, + struct inode *inode, struct buffer_head *bh, + int blocknr) + { + int err; + + BUFFER_TRACE(bh, "enter"); + + jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " + "data mode %lx\n", + bh, is_metadata, inode->i_mode, + test_opt(inode->i_sb, DATA_FLAGS)); + + /* Never use the revoke function if we are doing full data + * journaling: there is no need to, and a V1 superblock won't + * support it. Otherwise, only skip the revoke on un-journaled + * data blocks. */ + + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA || + (!is_metadata && !ext3_should_journal_data(inode))) { + if (bh) { + BUFFER_TRACE(bh, "call journal_forget"); + ext3_journal_forget(handle, bh); + } + return 0; + } + + /* + * data!=journal && (is_metadata || should_journal_data(inode)) + */ + BUFFER_TRACE(bh, "call ext3_journal_revoke"); + err = ext3_journal_revoke(handle, blocknr, bh); + if (err) + ext3_abort(inode->i_sb, __FUNCTION__, + "error %d when attempting revoke", err); + BUFFER_TRACE(bh, "exit"); + return err; + } + + /* + * Truncate transactions can be complex and absolutely huge. So we need to + * be able to restart the transaction at a conventient checkpoint to make + * sure we don't overflow the journal. + * + * start_transaction gets us a new handle for a truncate transaction, + * and extend_transaction tries to extend the existing one a bit. If + * extend fails, we need to propagate the failure up and restart the + * transaction in the top-level truncate loop. 
--sct + */ + + static handle_t *start_transaction(struct inode *inode) + { + long needed; + handle_t *result; + + needed = inode->i_blocks; + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + + result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed); + if (!IS_ERR(result)) + return result; + + ext3_std_error(inode->i_sb, PTR_ERR(result)); + return result; + } + + /* + * Try to extend this transaction for the purposes of truncation. + * + * Returns 0 if we managed to create more room. If we can't create more + * room, and the transaction must be restarted we return 1. + */ + static int try_to_extend_transaction(handle_t *handle, struct inode *inode) + { + long needed; + + if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS) + return 0; + needed = inode->i_blocks; + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed)) + return 0; + return 1; + } + + /* + * Restart the transaction associated with *handle. This does a commit, + * so before we call here everything must be consistently dirtied against + * this transaction. + */ + static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) + { + long needed = inode->i_blocks; + if (needed > EXT3_MAX_TRANS_DATA) + needed = EXT3_MAX_TRANS_DATA; + jbd_debug(2, "restarting handle %p\n", handle); + return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed); + } + + /* + * Called at each iput() + */ + void ext3_put_inode (struct inode * inode) + { + ext3_discard_prealloc (inode); + } + + /* + * Called at the last iput() if i_nlink is zero. + */ + void ext3_delete_inode (struct inode * inode) + { + handle_t *handle; + + if (is_bad_inode(inode) || + inode->i_ino == EXT3_ACL_IDX_INO || + inode->i_ino == EXT3_ACL_DATA_INO) + goto no_delete; + + lock_kernel(); + handle = start_transaction(inode); + if (IS_ERR(handle)) { + /* If we're going to skip the normal cleanup, we still + * need to make sure that the in-core orphan linked list + * is properly cleaned up. */ + ext3_orphan_del(NULL, inode); + + ext3_std_error(inode->i_sb, PTR_ERR(handle)); + unlock_kernel(); + goto no_delete; + } + + if (IS_SYNC(inode)) + handle->h_sync = 1; + inode->i_size = 0; + if (inode->i_blocks) + ext3_truncate(inode); + /* + * Kill off the orphan record which ext3_truncate created. + * AKPM: I think this can be inside the above `if'. + * Note that ext3_orphan_del() has to be able to cope with the + * deletion of a non-existent orphan - this is because we don't + * know if ext3_truncate() actually created an orphan record. + * (Well, we could do this if we need to, but heck - it works) + */ + ext3_orphan_del(handle, inode); + inode->u.ext3_i.i_dtime = CURRENT_TIME; + + /* + * One subtle ordering requirement: if anything has gone wrong + * (transaction abort, IO errors, whatever), then we can still + * do these next steps (the fs will already have been marked as + * having errors), but we can't free the inode if the mark_dirty + * fails. + */ + if (ext3_mark_inode_dirty(handle, inode)) + /* If that failed, just do the required in-core inode clear. */ + clear_inode(inode); + else + ext3_free_inode(handle, inode); + ext3_journal_stop(handle, inode); + unlock_kernel(); + return; + no_delete: + clear_inode(inode); /* We must guarantee clearing of inode... 
*/ + } + + void ext3_discard_prealloc (struct inode * inode) + { + #ifdef EXT3_PREALLOCATE + lock_kernel(); + /* Writer: ->i_prealloc* */ + if (inode->u.ext3_i.i_prealloc_count) { + unsigned short total = inode->u.ext3_i.i_prealloc_count; + unsigned long block = inode->u.ext3_i.i_prealloc_block; + inode->u.ext3_i.i_prealloc_count = 0; + inode->u.ext3_i.i_prealloc_block = 0; + /* Writer: end */ + ext3_free_blocks (inode, block, total); + } + unlock_kernel(); + #endif + } + + static int ext3_alloc_block (handle_t *handle, + struct inode * inode, unsigned long goal, int *err) + { + #ifdef EXT3FS_DEBUG + static unsigned long alloc_hits = 0, alloc_attempts = 0; + #endif + unsigned long result; + + #ifdef EXT3_PREALLOCATE + /* Writer: ->i_prealloc* */ + if (inode->u.ext3_i.i_prealloc_count && + (goal == inode->u.ext3_i.i_prealloc_block || + goal + 1 == inode->u.ext3_i.i_prealloc_block)) + { + result = inode->u.ext3_i.i_prealloc_block++; + inode->u.ext3_i.i_prealloc_count--; + /* Writer: end */ + ext3_debug ("preallocation hit (%lu/%lu).\n", + ++alloc_hits, ++alloc_attempts); + } else { + ext3_discard_prealloc (inode); + ext3_debug ("preallocation miss (%lu/%lu).\n", + alloc_hits, ++alloc_attempts); + if (S_ISREG(inode->i_mode)) + result = ext3_new_block (inode, goal, + &inode->u.ext3_i.i_prealloc_count, + &inode->u.ext3_i.i_prealloc_block, err); + else + result = ext3_new_block (inode, goal, 0, 0, err); + /* + * AKPM: this is somewhat sticky. I'm not surprised it was + * disabled in 2.2's ext3. Need to integrate b_committed_data + * guarding with preallocation, if indeed preallocation is + * effective. + */ + } + #else + result = ext3_new_block (handle, inode, goal, 0, 0, err); + #endif + return result; + } + + + typedef struct { + u32 *p; + u32 key; + struct buffer_head *bh; + } Indirect; + + static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v) + { + p->key = *(p->p = v); + p->bh = bh; + } + + static inline int verify_chain(Indirect *from, Indirect *to) + { + while (from <= to && from->key == *from->p) + from++; + return (from > to); + } + + /** + * ext3_block_to_path - parse the block number into array of offsets + * @inode: inode in question (we are only interested in its superblock) + * @i_block: block number to be parsed + * @offsets: array to store the offsets in + * + * To store the locations of file's data ext3 uses a data structure common + * for UNIX filesystems - tree of pointers anchored in the inode, with + * data blocks at leaves and indirect blocks in intermediate nodes. + * This function translates the block number into path in that tree - + * return value is the path length and @offsets[n] is the offset of + * pointer to (n+1)th node in the nth one. If @block is out of range + * (negative or too large) warning is printed and zero returned. + * + * Note: function doesn't find node addresses, so no IO is needed. All + * we need to know is the capacity of indirect blocks (taken from the + * inode->i_sb). + */ + + /* + * Portability note: the last comparison (check that we fit into triple + * indirect block) is spelled differently, because otherwise on an + * architecture with 32-bit longs and 8Kb pages we might get into trouble + * if our filesystem had 8Kb blocks. We might use long long, but that would + * kill us on x86. Oh, well, at least the sign propagation does not matter - + * i_block would have to be negative in the very beginning, so we would not + * get there at all. 
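+ *
+ * A concrete example: with 4KB blocks an indirect block holds
+ * ptrs = 1024 pointers and there are EXT3_NDIR_BLOCKS = 12 direct
+ * pointers, so (counting file blocks from zero):
+ *
+ *	block 11      -> { 11 }				(direct)
+ *	block 12      -> { EXT3_IND_BLOCK, 0 }		(indirect)
+ *	block 1036    -> { EXT3_DIND_BLOCK, 0, 0 }	(double indirect)
+ *	block 1049612 -> { EXT3_TIND_BLOCK, 0, 0, 0 }	(triple indirect)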
+ */ + + static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4]) + { + int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb); + const long direct_blocks = EXT3_NDIR_BLOCKS, + indirect_blocks = ptrs, + double_blocks = (1 << (ptrs_bits * 2)); + int n = 0; + + if (i_block < 0) { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0"); + } else if (i_block < direct_blocks) { + offsets[n++] = i_block; + } else if ( (i_block -= direct_blocks) < indirect_blocks) { + offsets[n++] = EXT3_IND_BLOCK; + offsets[n++] = i_block; + } else if ((i_block -= indirect_blocks) < double_blocks) { + offsets[n++] = EXT3_DIND_BLOCK; + offsets[n++] = i_block >> ptrs_bits; + offsets[n++] = i_block & (ptrs - 1); + } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) { + offsets[n++] = EXT3_TIND_BLOCK; + offsets[n++] = i_block >> (ptrs_bits * 2); + offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1); + offsets[n++] = i_block & (ptrs - 1); + } else { + ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big"); + } + return n; + } + + /** + * ext3_get_branch - read the chain of indirect blocks leading to data + * @inode: inode in question + * @depth: depth of the chain (1 - direct pointer, etc.) + * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). + */ + static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets, + Indirect chain[4], int *err) + { + kdev_t dev = inode->i_dev; + int blocksize = inode->i_sb->s_blocksize; + Indirect *p = chain; + struct buffer_head *bh; + + *err = 0; + /* i_data is not going away, no lock needed */ + add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets); + if (!p->key) + goto no_block; + while (--depth) { + bh = bread(dev, le32_to_cpu(p->key), blocksize); + if (!bh) + goto failure; + /* Reader: pointers */ + if (!verify_chain(chain, p)) + goto changed; + add_chain(++p, bh, (u32*)bh->b_data + *++offsets); + /* Reader: end */ + if (!p->key) + goto no_block; + } + return NULL; + + changed: + *err = -EAGAIN; + goto no_block; + failure: + *err = -EIO; + no_block: + return p; + } + + /** + * ext3_find_near - find a place for allocation with sufficient locality + * @inode: owner + * @ind: descriptor of indirect block. 
+ * + * This function returns the prefered place for block allocation. + * It is used when heuristic for sequential allocation fails. + * Rules are: + * + if there is a block to the left of our position - allocate near it. + * + if pointer will live in indirect block - allocate near that block. + * + if pointer will live in inode - allocate in the same + * cylinder group. + * Caller must make sure that @ind is valid and will stay that way. + */ + + static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind) + { + u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data; + u32 *p; + + /* Try to find previous block */ + for (p = ind->p - 1; p >= start; p--) + if (*p) + return le32_to_cpu(*p); + + /* No such thing, so let's try location of indirect block */ + if (ind->bh) + return ind->bh->b_blocknr; + + /* + * It is going to be refered from inode itself? OK, just put it into + * the same cylinder group then. + */ + return (inode->u.ext3_i.i_block_group * + EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + + le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block); + } + + /** + * ext3_find_goal - find a prefered place for allocation. + * @inode: owner + * @block: block we want + * @chain: chain of indirect blocks + * @partial: pointer to the last triple within a chain + * @goal: place to store the result. + * + * Normally this function find the prefered place for block allocation, + * stores it in *@goal and returns zero. If the branch had been changed + * under us we return -EAGAIN. + */ + + static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4], + Indirect *partial, unsigned long *goal) + { + /* Writer: ->i_next_alloc* */ + if (block == inode->u.ext3_i.i_next_alloc_block + 1) { + inode->u.ext3_i.i_next_alloc_block++; + inode->u.ext3_i.i_next_alloc_goal++; + } + #ifdef SEARCH_FROM_ZERO + inode->u.ext3_i.i_next_alloc_block = 0; + inode->u.ext3_i.i_next_alloc_goal = 0; + #endif + /* Writer: end */ + /* Reader: pointers, ->i_next_alloc* */ + if (verify_chain(chain, partial)) { + /* + * try the heuristic for sequential allocation, + * failing that at least try to get decent locality. + */ + if (block == inode->u.ext3_i.i_next_alloc_block) + *goal = inode->u.ext3_i.i_next_alloc_goal; + if (!*goal) + *goal = ext3_find_near(inode, partial); + #ifdef SEARCH_FROM_ZERO + *goal = 0; + #endif + return 0; + } + /* Reader: end */ + return -EAGAIN; + } + + /** + * ext3_alloc_branch - allocate and set up a chain of blocks. + * @inode: owner + * @num: depth of the chain (number of blocks to allocate) + * @offsets: offsets (in the blocks) to store the pointers to next. + * @branch: place to store the chain in. + * + * This function allocates @num blocks, zeroes out all but the last one, + * links them into chain and (if we are synchronous) writes them to disk. + * In other words, it prepares a branch that can be spliced onto the + * inode. It stores the information about that chain in the branch[], in + * the same format as ext3_get_branch() would do. We are calling it after + * we had read the existing part of chain and partial points to the last + * triple of that (one with zero ->key). Upon the exit we have the same + * picture as after the successful ext3_get_block(), excpet that in one + * place chain is disconnected - *branch->p is still zero (we did not + * set the last link), but branch->key contains the number that should + * be placed into *branch->p to fill that gap. 
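+ *
+ * For example, if the file's very first double-indirect block is being
+ * instantiated then @num is 3 and we allocate, in order, the double
+ * indirect block, an indirect block and the data block itself.  The
+ * first two are zeroed and chained together; branch[0].key ends up
+ * holding the double indirect block's number, waiting to be spliced
+ * into the inode by ext3_splice_branch().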
+ * + * If allocation fails we free all blocks we've allocated (and forget + * their buffer_heads) and return the error value the from failed + * ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain + * as described above and return 0. + */ + + static int ext3_alloc_branch(handle_t *handle, struct inode *inode, + int num, + unsigned long goal, + int *offsets, + Indirect *branch) + { + int blocksize = inode->i_sb->s_blocksize; + int n = 0, keys = 0; + int err = 0; + int i; + int parent = ext3_alloc_block(handle, inode, goal, &err); + + branch[0].key = cpu_to_le32(parent); + if (parent) { + for (n = 1; n < num; n++) { + struct buffer_head *bh; + /* Allocate the next block */ + int nr = ext3_alloc_block(handle, inode, parent, &err); + if (!nr) + break; + branch[n].key = cpu_to_le32(nr); + keys = n+1; + + /* + * Get buffer_head for parent block, zero it out + * and set the pointer to new one, then send + * parent to disk. + */ + bh = getblk(inode->i_dev, parent, blocksize); + branch[n].bh = bh; + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + err = ext3_journal_get_create_access(handle, bh); + if (err) { + unlock_buffer(bh); + brelse(bh); + break; + } + + memset(bh->b_data, 0, blocksize); + branch[n].p = (u32*) bh->b_data + offsets[n]; + *branch[n].p = branch[n].key; + BUFFER_TRACE(bh, "marking uptodate"); + mark_buffer_uptodate(bh, 1); + unlock_buffer(bh); + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (err) + break; + + parent = nr; + } + if (IS_SYNC(inode)) + handle->h_sync = 1; + } + if (n == num) + return 0; + + /* Allocation failed, free what we already allocated */ + for (i = 1; i < keys; i++) { + BUFFER_TRACE(branch[i].bh, "call journal_forget"); + ext3_journal_forget(handle, branch[i].bh); + } + for (i = 0; i < keys; i++) + ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1); + return err; + } + + /** + * ext3_splice_branch - splice the allocated branch onto inode. + * @inode: owner + * @block: (logical) number of block we are adding + * @chain: chain of indirect blocks (with a missing link - see + * ext3_alloc_branch) + * @where: location of missing link + * @num: number of blocks we are adding + * + * This function verifies that chain (up to the missing link) had not + * changed, fills the missing link and does all housekeeping needed in + * inode (->i_blocks, etc.). In case of success we end up with the full + * chain to new block and return 0. Otherwise (== chain had been changed) + * we free the new blocks (forgetting their buffer_heads, indeed) and + * return -EAGAIN. + */ + + static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block, + Indirect chain[4], Indirect *where, int num) + { + int i; + int err = 0; + + /* + * If we're splicing into a [td]indirect block (as opposed to the + * inode) then we need to get write access to the [td]indirect block + * before the splice. 
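+ *
+ * This follows the standard JBD rule for modifying metadata: get write
+ * access to the buffer under the current handle, modify it, then dirty
+ * it through the journal rather than directly.  Roughly:
+ *
+ *	ext3_journal_get_write_access(handle, bh);
+ *	... modify bh->b_data ...
+ *	ext3_journal_dirty_metadata(handle, bh);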
+ */ + if (where->bh) { + BUFFER_TRACE(where->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, where->bh); + if (err) + goto err_out; + } + /* Verify that place we are splicing to is still there and vacant */ + + /* Writer: pointers, ->i_next_alloc* */ + if (!verify_chain(chain, where-1) || *where->p) + /* Writer: end */ + goto changed; + + /* That's it */ + + *where->p = where->key; + inode->u.ext3_i.i_next_alloc_block = block; + inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key); + #ifdef SEARCH_FROM_ZERO + inode->u.ext3_i.i_next_alloc_block = 0; + inode->u.ext3_i.i_next_alloc_goal = 0; + #endif + /* Writer: end */ + + /* We are done with atomic stuff, now do the rest of housekeeping */ + + inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + + /* had we spliced it onto indirect block? */ + if (where->bh) { + /* + * akpm: If we spliced it onto an indirect block, we haven't + * altered the inode. Note however that if it is being spliced + * onto an indirect block at the very end of the file (the + * file is growing) then we *will* alter the inode to reflect + * the new i_size. But that is not done here - it is done in + * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode. + */ + jbd_debug(5, "splicing indirect only\n"); + BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, where->bh); + if (err) + goto err_out; + } else { + /* + * OK, we spliced it into the inode itself on a direct block. + * Inode was dirtied above. + */ + jbd_debug(5, "splicing direct\n"); + } + return err; + + changed: + /* + * AKPM: if where[i].bh isn't part of the current updating + * transaction then we explode nastily. Test this code path. + */ + jbd_debug(1, "the chain changed: try again\n"); + err = -EAGAIN; + + err_out: + for (i = 1; i < num; i++) { + BUFFER_TRACE(where[i].bh, "call journal_forget"); + ext3_journal_forget(handle, where[i].bh); + } + /* For the normal collision cleanup case, we free up the blocks. + * On genuine filesystem errors we don't even think about doing + * that. */ + if (err == -EAGAIN) + for (i = 0; i < num; i++) + ext3_free_blocks(handle, inode, + le32_to_cpu(where[i].key), 1); + return err; + } + + /* + * Allocation strategy is simple: if we have to allocate something, we will + * have to go the whole way to leaf. So let's do it before attaching anything + * to tree, set linkage between the newborn blocks, write them if sync is + * required, recheck the path, free and repeat if check fails, otherwise + * set the last missing link (that will protect us from any truncate-generated + * removals - all blocks on the path are immune now) and possibly force the + * write on the parent block. + * That has a nice additional property: no special recovery from the failed + * allocations is needed - we simply release blocks and do not touch anything + * reachable from inode. + * + * akpm: `handle' can be NULL if create == 0. 
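+ *
+ * In other words, read-only lookups (create == 0) need no transaction
+ * at all, while callers that may allocate must already have started a
+ * handle with enough credits, as ext3_prepare_write() does:
+ *
+ *	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
+ *	...
+ *	block_prepare_write(page, from, to, ext3_get_block);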
+ */ + + static int ext3_get_block_handle(handle_t *handle, struct inode *inode, + long iblock, + struct buffer_head *bh_result, int create) + { + int err = -EIO; + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + unsigned long goal; + int left; + int depth = ext3_block_to_path(inode, iblock, offsets); + loff_t new_size; + + J_ASSERT(handle != NULL || create == 0); + + if (depth == 0) + goto out; + + lock_kernel(); + reread: + partial = ext3_get_branch(inode, depth, offsets, chain, &err); + + /* Simplest case - block found, no allocation needed */ + if (!partial) { + bh_result->b_state &= ~(1UL << BH_New); + got_it: + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key); + bh_result->b_state |= (1UL << BH_Mapped); + /* Clean up and exit */ + partial = chain+depth-1; /* the whole chain */ + goto cleanup; + } + + /* Next simple case - plain lookup or failed read of indirect block */ + if (!create || err == -EIO) { + cleanup: + while (partial > chain) { + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + BUFFER_TRACE(bh_result, "returned"); + unlock_kernel(); + out: + return err; + } + + /* + * Indirect block might be removed by truncate while we were + * reading it. Handling of that case (forget what we've got and + * reread) is taken out of the main path. + */ + if (err == -EAGAIN) + goto changed; + + if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0) + goto changed; + + left = (chain + depth) - partial; + + /* + * Block out ext3_truncate while we alter the tree + */ + down_read(&inode->u.ext3_i.truncate_sem); + err = ext3_alloc_branch(handle, inode, left, goal, + offsets+(partial-chain), partial); + + /* The ext3_splice_branch call will free and forget any buffers + * on the new chain if there is a failure, but that risks using + * up transaction credits, especially for bitmaps where the + * credits cannot be returned. Can we handle this somehow? We + * may need to return -EAGAIN upwards in the worst case. --sct */ + if (!err) + err = ext3_splice_branch(handle, inode, iblock, chain, + partial, left); + up_read(&inode->u.ext3_i.truncate_sem); + if (err == -EAGAIN) + goto changed; + if (err) + goto cleanup; + + new_size = inode->i_size; + /* + * This is not racy against ext3_truncate's modification of i_disksize + * because VM/VFS ensures that the file cannot be extended while + * truncate is in progress. It is racy between multiple parallel + * instances of get_block, but we have the BKL. 
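+ *
+ * (i_disksize is the size as it should appear in the on-disk inode.
+ * Both here and in ext3_commit_write() it is only pulled up to i_size
+ * after the backing blocks have actually been allocated.)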
+ */ + if (new_size > inode->u.ext3_i.i_disksize) + inode->u.ext3_i.i_disksize = new_size; + + bh_result->b_state |= (1UL << BH_New); + goto got_it; + + changed: + while (partial > chain) { + jbd_debug(1, "buffer chain changed, retrying\n"); + BUFFER_TRACE(partial->bh, "brelsing"); + brelse(partial->bh); + partial--; + } + goto reread; + } + + static int ext3_get_block(struct inode *inode, long iblock, + struct buffer_head *bh_result, int create) + { + handle_t *handle = 0; + int ret; + + if (create) { + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } + ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); + return ret; + } + + /* + * `handle' can be NULL if create is zero + */ + struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode, + long block, int create, int * errp) + { + struct buffer_head dummy; + int fatal = 0, err; + + J_ASSERT(handle != NULL || create == 0); + + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); + *errp = ext3_get_block_handle(handle, inode, block, &dummy, create); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = getblk(dummy.b_dev, dummy.b_blocknr, + inode->i_sb->s_blocksize); + if (buffer_new(&dummy)) { + J_ASSERT(create != 0); + J_ASSERT(handle != 0); + + /* Now that we do not always journal data, we + should keep in mind whether this should + always journal the new buffer as metadata. + For now, regular file writes use + ext3_get_block instead, so it's not a + problem. */ + lock_kernel(); + lock_buffer(bh); + BUFFER_TRACE(bh, "call get_create_access"); + fatal = ext3_journal_get_create_access(handle, bh); + if (!fatal) { + memset(bh->b_data, 0, + inode->i_sb->s_blocksize); + mark_buffer_uptodate(bh, 1); + } + unlock_buffer(bh); + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) fatal = err; + unlock_kernel(); + } else { + BUFFER_TRACE(bh, "not a new buffer"); + } + if (fatal) { + *errp = fatal; + brelse(bh); + bh = NULL; + } + return bh; + } + return NULL; + } + + struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode, + int block, int create, int *err) + { + struct buffer_head * bh; + int prev_blocks; + + prev_blocks = inode->i_blocks; + + bh = ext3_getblk (handle, inode, block, create, err); + if (!bh) + return bh; + #ifdef EXT3_PREALLOCATE + /* + * If the inode has grown, and this is a directory, then use a few + * more of the preallocated blocks to keep directory fragmentation + * down. The preallocated blocks are guaranteed to be contiguous. 
+ */ + if (create && + S_ISDIR(inode->i_mode) && + inode->i_blocks > prev_blocks && + EXT3_HAS_COMPAT_FEATURE(inode->i_sb, + EXT3_FEATURE_COMPAT_DIR_PREALLOC)) { + int i; + struct buffer_head *tmp_bh; + + for (i = 1; + inode->u.ext3_i.i_prealloc_count && + i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks; + i++) { + /* + * ext3_getblk will zero out the contents of the + * directory for us + */ + tmp_bh = ext3_getblk(handle, inode, + block+i, create, err); + if (!tmp_bh) { + brelse (bh); + return 0; + } + brelse (tmp_bh); + } + } + #endif + if (buffer_uptodate(bh)) + return bh; + ll_rw_block (READ, 1, &bh); + wait_on_buffer (bh); + if (buffer_uptodate(bh)) + return bh; + brelse (bh); + *err = -EIO; + return NULL; + } + + static int walk_page_buffers( handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)( handle_t *handle, + struct buffer_head *bh)) + { + struct buffer_head *bh; + unsigned block_start, block_end; + unsigned blocksize = head->b_size; + int err, ret = 0; + + for ( bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = bh->b_this_page) + { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (partial && !buffer_uptodate(bh)) + *partial = 1; + continue; + } + err = (*fn)(handle, bh); + if (!ret) + ret = err; + } + return ret; + } + + /* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext3_get_block() + * and the commit_write(). So doing the journal_start at the start of + * prepare_write() is the right place. + * + * Also, this function can nest inside ext3_writepage() -> + * block_write_full_page(). In that case, we *know* that ext3_writepage() + * has generated enough buffer credits to do the whole page. So we won't + * block on the journal in that case, which is good, because the caller may + * be PF_MEMALLOC. + * + * By accident, ext3 can be reentered when a transaction is open via + * quota file writes. If we were to commit the transaction while thus + * reentered, there can be a deadlock - we would be holding a quota + * lock, and the commit would never complete if another thread had a + * transaction open and was blocking on the quota lock - a ranking + * violation. + * + * So what we do is to rely on the fact that journal_stop/journal_start + * will _not_ run commit under these circumstances because handle->h_ref + * is elevated. We'll still have enough credits for the tiny quotafile + * write. 
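+ *
+ * So the write path brackets the generic code with a transaction, and
+ * the matching journal_stop happens in ext3_commit_write().  In outline:
+ *
+ *	ext3_prepare_write:	handle = ext3_journal_start(inode, needed);
+ *				block_prepare_write(page, from, to, ext3_get_block);
+ *	ext3_commit_write:	generic_commit_write(), or journal the buffers
+ *				in data=journal mode;
+ *				ext3_journal_stop(handle, inode);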
+ */ + + static int do_journal_get_write_access(handle_t *handle, + struct buffer_head *bh) + { + return ext3_journal_get_write_access(handle, bh); + } + + static int ext3_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) + { + struct inode *inode = page->mapping->host; + handle_t *handle = ext3_journal_current_handle(); + int ret, needed_blocks = ext3_writepage_trans_blocks(inode); + + lock_kernel(); + handle = ext3_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = block_prepare_write(page, from, to, ext3_get_block); + if (ret != 0) + goto prepare_write_failed; + + if (ext3_should_journal_data(inode)) + ret = walk_page_buffers(handle, page->buffers, + from, to, NULL, do_journal_get_write_access); + prepare_write_failed: + if (ret) + ext3_journal_stop(handle, inode); + out: + unlock_kernel(); + return ret; + } + + static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh) + { + return ext3_journal_dirty_data(handle, bh, 0); + } + + /* + * For ext3_writepage(). We also brelse() the buffer to account for + * the bget() which ext3_writepage() performs. + */ + static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh) + { + int ret = ext3_journal_dirty_data(handle, bh, 1); + __brelse(bh); + return ret; + } + + /* For commit_write() in data=journal mode */ + static int commit_write_fn(handle_t *handle, struct buffer_head *bh) + { + set_bit(BH_Uptodate, &bh->b_state); + return ext3_journal_dirty_metadata(handle, bh); + } + + /* + * We need to pick up the new inode size which generic_commit_write gave us + * `file' can be NULL - eg, when called from block_symlink(). + * + * ext3 inode->i_dirty_buffers policy: If we're journalling data we + * definitely don't want them to appear on the inode at all - instead + * we need to manage them at the JBD layer and we need to intercept + * the relevant sync operations and translate them into journal operations. + * + * If we're not journalling data then we can just leave the buffers + * on ->i_dirty_buffers. If someone writes them out for us then thanks. + * Otherwise we'll do it in commit, if we're using ordered data. + */ + + static int ext3_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to) + { + handle_t *handle = ext3_journal_current_handle(); + struct inode *inode = page->mapping->host; + int ret = 0, ret2; + + lock_kernel(); + if (ext3_should_journal_data(inode)) { + /* + * Here we duplicate the generic_commit_write() functionality + */ + int partial = 0; + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + ret = walk_page_buffers(handle, page->buffers, + from, to, &partial, commit_write_fn); + if (!partial) + SetPageUptodate(page); + kunmap(page); + if (pos > inode->i_size) + inode->i_size = pos; + set_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state); + } else { + if (ext3_should_order_data(inode)) { + ret = walk_page_buffers(handle, page->buffers, + from, to, NULL, journal_dirty_sync_data); + } + /* Be careful here if generic_commit_write becomes a + * required invocation after block_prepare_write. */ + if (ret == 0) + ret = generic_commit_write(file, page, from, to); + } + if (inode->i_size > inode->u.ext3_i.i_disksize) { + inode->u.ext3_i.i_disksize = inode->i_size; + ret2 = ext3_mark_inode_dirty(handle, inode); + if (!ret) + ret = ret2; + } + ret2 = ext3_journal_stop(handle, inode); + unlock_kernel(); + if (!ret) + ret = ret2; + return ret; + } + + /* + * bmap() is special. 
It gets used by applications such as lilo and by + * the swapper to find the on-disk block of a specific piece of data. + * + * Naturally, this is dangerous if the block concerned is still in the + * journal. If somebody makes a swapfile on an ext3 data-journaling + * filesystem and enables swap, then they may get a nasty shock when the + * data getting swapped to that swapfile suddenly gets overwritten by + * the original zero's written out previously to the journal and + * awaiting writeback in the kernel's buffer cache. + * + * So, if we see any bmap calls here on a modified, data-journaled file, + * take extra steps to flush any blocks which might be in the cache. + */ + static int ext3_bmap(struct address_space *mapping, long block) + { + struct inode *inode = mapping->host; + journal_t *journal; + int err; + + if (test_and_clear_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state)) { + /* + * This is a REALLY heavyweight approach, but the use of + * bmap on dirty files is expected to be extremely rare: + * only if we run lilo or swapon on a freshly made file + * do we expect this to happen. + * + * (bmap requires CAP_SYS_RAWIO so this does not + * represent an unprivileged user DOS attack --- we'd be + * in trouble if mortal users could trigger this path at + * will.) + * + * NB. EXT3_STATE_JDATA is not set on files other than + * regular files. If somebody wants to bmap a directory + * or symlink and gets confused because the buffer + * hasn't yet been flushed to disk, they deserve + * everything they get. + */ + + journal = EXT3_JOURNAL(inode); + journal_lock_updates(journal); + err = journal_flush(journal); + journal_unlock_updates(journal); + + if (err) + return 0; + } + + return generic_block_bmap(mapping,block,ext3_get_block); + } + + static int bget_one(handle_t *handle, struct buffer_head *bh) + { + atomic_inc(&bh->b_count); + return 0; + } + + /* + * Note that we always start a transaction even if we're not journalling + * data. This is to preserve ordering: any hole instantiation within + * __block_write_full_page -> ext3_get_block() should be journalled + * along with the data so we don't crash and then get metadata which + * refers to old data. + * + * In all journalling modes block_write_full_page() will start the I/O. + * + * Problem: + * + * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext3_writepage() + * + * Similar for: + * + * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ... + * + * Same applies to ext3_get_block(). We will deadlock on various things like + * lock_journal and i_truncate_sem. + * + * Setting PF_MEMALLOC here doesn't work - too many internal memory + * allocations fail. + * + * 16May01: If we're reentered then journal_current_handle() will be + * non-zero. We simply *return*. + * + * 1 July 2001: @@@ FIXME: + * In journalled data mode, a data buffer may be metadata against the + * current transaction. But the same file is part of a shared mapping + * and someone does a writepage() on it. + * + * We will move the buffer onto the async_data list, but *after* it has + * been dirtied. So there's a small window where we have dirty data on + * BJ_Metadata. + * + * Note that this only applies to the last partial page in the file. The + * bit which block_write_full_page() uses prepare/commit for. (That's + * broken code anyway: it's wrong for msync()). 
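+ *
+ * For ordered and journalled data the page's buffers are pinned with
+ * bget_one() before block_write_full_page() is called.  The page may be
+ * unlocked (and even truncated) as soon as the I/O has been submitted,
+ * but the elevated b_count keeps the buffer_heads alive long enough for
+ * us to attach them to the transaction afterwards via
+ * journal_dirty_async_data(), which then drops the extra reference.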
+ * + * It's a rare case: affects the final partial page, for journalled data + * where the file is subject to both write() and writepage() in the same + * transaction. To fix it we'll need a custom block_write_full_page(). + * We'll probably need that anyway for journalling writepage() output. + * + * We don't honour synchronous mounts for writepage(). That would be + * disastrous. Any write() or metadata operation will sync the fs for + * us. + */ + static int ext3_writepage(struct page *page) + { + struct inode *inode = page->mapping->host; + struct buffer_head *page_buffers; + handle_t *handle = NULL; + int ret = 0, err; + int needed; + int order_data; + + J_ASSERT(PageLocked(page)); + + /* + * We give up here if we're reentered, because it might be + * for a different filesystem. One *could* look for a + * nested transaction opportunity. + */ + lock_kernel(); + if (ext3_journal_current_handle()) + goto out_fail; + + needed = ext3_writepage_trans_blocks(inode); + if (current->flags & PF_MEMALLOC) + handle = ext3_journal_try_start(inode, needed); + else + handle = ext3_journal_start(inode, needed); + + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_fail; + } + + order_data = ext3_should_order_data(inode) || + ext3_should_journal_data(inode); + + unlock_kernel(); + + page_buffers = NULL; /* Purely to prevent compiler warning */ + + /* bget() all the buffers */ + if (order_data) { + if (!page->buffers) + create_empty_buffers(page, + inode->i_dev, inode->i_sb->s_blocksize); + page_buffers = page->buffers; + walk_page_buffers(handle, page_buffers, 0, + PAGE_CACHE_SIZE, NULL, bget_one); + } + + ret = block_write_full_page(page, ext3_get_block); + + /* + * The page can become unlocked at any point now, and + * truncate can then come in and change things. So we + * can't touch *page from now on. But *page_buffers is + * safe due to elevated refcount. + */ + + handle = ext3_journal_current_handle(); + lock_kernel(); + + /* And attach them to the current transaction */ + if (order_data) { + err = walk_page_buffers(handle, page_buffers, + 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data); + if (!ret) + ret = err; + } + + err = ext3_journal_stop(handle, inode); + if (!ret) + ret = err; + unlock_kernel(); + return ret; + + out_fail: + + unlock_kernel(); + SetPageDirty(page); + UnlockPage(page); + return ret; + } + + static int ext3_readpage(struct file *file, struct page *page) + { + return block_read_full_page(page,ext3_get_block); + } + + + static int ext3_flushpage(struct page *page, unsigned long offset) + { + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + return journal_flushpage(journal, page, offset); + } + + static int ext3_releasepage(struct page *page, int wait) + { + journal_t *journal = EXT3_JOURNAL(page->mapping->host); + return journal_try_to_free_buffers(journal, page, wait); + } + + + struct address_space_operations ext3_aops = { + readpage: ext3_readpage, /* BKL not held. Don't need */ + writepage: ext3_writepage, /* BKL not held. We take it */ + sync_page: block_sync_page, + prepare_write: ext3_prepare_write, /* BKL not held. We take it */ + commit_write: ext3_commit_write, /* BKL not held. We take it */ + bmap: ext3_bmap, /* BKL held */ + flushpage: ext3_flushpage, /* BKL not held. Don't need */ + releasepage: ext3_releasepage, /* BKL not held. Don't need */ + }; + + /* + * ext3_block_truncate_page() zeroes out a mapping from file offset `from' + * up to the end of the block which corresponds to `from'. + * This is required during truncate.
We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. + */ + static int ext3_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize, iblock, length, pos; + struct inode *inode = mapping->host; + struct page *page; + struct buffer_head *bh; + int err; + + blocksize = inode->i_sb->s_blocksize; + length = offset & (blocksize - 1); + + /* Block boundary? Nothing to do */ + if (!length) + return 0; + + length = blocksize - length; + iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + + page = grab_cache_page(mapping, index); + err = -ENOMEM; + if (!page) + goto out; + + if (!page->buffers) + create_empty_buffers(page, inode->i_dev, blocksize); + + /* Find the buffer that contains "offset" */ + bh = page->buffers; + pos = blocksize; + while (offset >= pos) { + bh = bh->b_this_page; + iblock++; + pos += blocksize; + } + + err = 0; + if (!buffer_mapped(bh)) { + /* Hole? Nothing to do */ + if (buffer_uptodate(bh)) + goto unlock; + ext3_get_block(inode, iblock, bh, 0); + /* Still unmapped? Nothing to do */ + if (!buffer_mapped(bh)) + goto unlock; + } + + /* Ok, it's mapped. Make sure it's up-to-date */ + if (Page_Uptodate(page)) + set_bit(BH_Uptodate, &bh->b_state); + + if (!buffer_uptodate(bh)) { + err = -EIO; + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + /* Uhhuh. Read error. Complain and punt. */ + if (!buffer_uptodate(bh)) + goto unlock; + } + + if (ext3_should_journal_data(inode)) { + BUFFER_TRACE(bh, "get write access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto unlock; + } + + memset(kmap(page) + offset, 0, length); + flush_dcache_page(page); + kunmap(page); + + BUFFER_TRACE(bh, "zeroed end of block"); + + err = 0; + if (ext3_should_journal_data(inode)) { + err = ext3_journal_dirty_metadata(handle, bh); + } else { + if (ext3_should_order_data(inode)) + err = ext3_journal_dirty_data(handle, bh, 0); + __mark_buffer_dirty(bh); + } + + unlock: + UnlockPage(page); + page_cache_release(page); + out: + return err; + } + + /* + * Probably it should be a library function... search for first non-zero word + * or memcmp with zero_page, whatever is better for particular architecture. + * Linus? + */ + static inline int all_zeroes(u32 *p, u32 *q) + { + while (p < q) + if (*p++) + return 0; + return 1; + } + + /** + * ext3_find_shared - find the indirect blocks for partial truncation. + * @inode: inode in question + * @depth: depth of the affected branch + * @offsets: offsets of pointers in that branch (see ext3_block_to_path) + * @chain: place to store the pointers to partial indirect blocks + * @top: place to the (detached) top of branch + * + * This is a helper function used by ext3_truncate(). + * + * When we do truncate() we may have to clean the ends of several + * indirect blocks but leave the blocks themselves alive. Block is + * partially truncated if some data below the new i_size is refered + * from it (and it is on the path to the first completely truncated + * data block, indeed). We have to free the top of that path along + * with everything to the right of the path. Since no allocation + * past the truncation point is possible until ext3_truncate() + * finishes, we may safely do the latter, but top of branch may + * require special attention - pageout below the truncation point + * might try to populate it. 
+ * + * We atomically detach the top of branch from the tree, store the + * block number of its root in *@top, pointers to buffer_heads of + * partially truncated blocks - in @chain[].bh and pointers to + * their last elements that should not be removed - in + * @chain[].p. Return value is the pointer to last filled element + * of @chain. + * + * The work left to caller to do the actual freeing of subtrees: + * a) free the subtree starting from *@top + * b) free the subtrees whose roots are stored in + * (@chain[i].p+1 .. end of @chain[i].bh->b_data) + * c) free the subtrees growing from the inode past the @chain[0]. + * (no partially truncated stuff there). */ + + static Indirect *ext3_find_shared(struct inode *inode, + int depth, + int offsets[4], + Indirect chain[4], + u32 *top) + { + Indirect *partial, *p; + int k, err; + + *top = 0; + /* Make k index the deepest non-null offest + 1 */ + for (k = depth; k > 1 && !offsets[k-1]; k--) + ; + partial = ext3_get_branch(inode, k, offsets, chain, &err); + /* Writer: pointers */ + if (!partial) + partial = chain + k-1; + /* + * If the branch acquired continuation since we've looked at it - + * fine, it should all survive and (new) top doesn't belong to us. + */ + if (!partial->key && *partial->p) + /* Writer: end */ + goto no_top; + for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--) + ; + /* + * OK, we've found the last block that must survive. The rest of our + * branch should be detached before unlocking. However, if that rest + * of branch is all ours and does not grow immediately from the inode + * it's easier to cheat and just decrement partial->p. + */ + if (p == chain + k - 1 && p > chain) { + p->p--; + } else { + *top = *p->p; + /* Nope, don't do this in ext3. Must leave the tree intact */ + #if 0 + *p->p = 0; + #endif + } + /* Writer: end */ + + while(partial > p) + { + brelse(partial->bh); + partial--; + } + no_top: + return partial; + } + + /* + * Zero a number of block pointers in either an inode or an indirect block. + * If we restart the transaction we must again get write access to the + * indirect block for further modification. + * + * We release `count' blocks on disk, but (last - first) may be greater + * than `count' because there can be holes in there. + */ + static void + ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, + unsigned long block_to_free, unsigned long count, + u32 *first, u32 *last) + { + u32 *p; + kdev_t dev = inode->i_sb->s_dev; + unsigned long blocksize = inode->i_sb->s_blocksize; + + if (try_to_extend_transaction(handle, inode)) { + if (bh) { + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, bh); + } + ext3_mark_inode_dirty(handle, inode); + ext3_journal_test_restart(handle, inode); + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + } + + /* + * Any buffers which are on the journal will be in memory. We find + * them on the hash table so journal_revoke() will run journal_forget() + * on them. We've already detached each block from the file, so + * bforget() in journal_forget() should be safe. + * + * AKPM: turn on bforget in journal_forget()!!! 
+ */ + for (p = first; p < last; p++) { + u32 nr = le32_to_cpu(*p); + if (nr) { + struct buffer_head *bh; + + *p = 0; + bh = get_hash_table(dev, nr, blocksize); + ext3_forget(handle, 0, inode, bh, nr); + } + } + + ext3_free_blocks(handle, inode, block_to_free, count); + } + + /** + * ext3_free_data - free a list of data blocks + * @handle: handle for this transaction + * @inode: inode we are dealing with + * @this_bh: indirect buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: points immediately past the end of array + * + * We are freeing all blocks referred to from that array (numbers are stored as + * little-endian 32-bit) and updating @inode->i_blocks appropriately. + * + * We accumulate contiguous runs of blocks to free. Conveniently, if these + * blocks are contiguous then releasing them at one time will only affect one + * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't + * actually use a lot of journal space. + * + * @this_bh will be %NULL if @first and @last point into the inode's direct + * block pointers. + */ + static void ext3_free_data(handle_t *handle, struct inode *inode, + struct buffer_head *this_bh, u32 *first, u32 *last) + { + unsigned long block_to_free = 0; /* Starting block # of a run */ + unsigned long count = 0; /* Number of blocks in the run */ + u32 *block_to_free_p = NULL; /* Pointer into inode/ind + corresponding to + block_to_free */ + unsigned long nr; /* Current block # */ + u32 *p; /* Pointer into inode/ind + for current block */ + int err; + + if (this_bh) { /* For indirect block */ + BUFFER_TRACE(this_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, this_bh); + /* Important: if we can't update the indirect pointers + * to the blocks, we can't free them. */ + if (err) + return; + } + + for (p = first; p < last; p++) { + nr = le32_to_cpu(*p); + if (nr) { + /* accumulate blocks to free if they're contiguous */ + if (count == 0) { + block_to_free = nr; + block_to_free_p = p; + count = 1; + } else if (nr == block_to_free + count) { + count++; + } else { + ext3_clear_blocks(handle, inode, this_bh, + block_to_free, + count, block_to_free_p, p); + block_to_free = nr; + block_to_free_p = p; + count = 1; + } + } + } + + if (count > 0) + ext3_clear_blocks(handle, inode, this_bh, block_to_free, + count, block_to_free_p, p); + + if (this_bh) { + BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, this_bh); + } + } + + /** + * ext3_free_branches - free an array of branches + * @handle: JBD handle for this transaction + * @inode: inode we are dealing with + * @parent_bh: the buffer_head which contains *@first and *@last + * @first: array of block numbers + * @last: pointer immediately past the end of array + * @depth: depth of the branches to free + * + * We are freeing all blocks referred to from these branches (numbers are + * stored as little-endian 32-bit) and updating @inode->i_blocks + * appropriately.
+ */ + static void ext3_free_branches(handle_t *handle, struct inode *inode, + struct buffer_head *parent_bh, + u32 *first, u32 *last, int depth) + { + unsigned long nr; + u32 *p; + + if (is_handle_aborted(handle)) + return; + + if (depth--) { + struct buffer_head *bh; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + p = last; + while (--p >= first) { + nr = le32_to_cpu(*p); + if (!nr) + continue; /* A hole */ + + /* Go read the buffer for the next level down */ + bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize); + + /* + * A read failure? Report error and clear slot + * (should be rare). + */ + if (!bh) { + ext3_error(inode->i_sb, "ext3_free_branches", + "Read failure, inode=%ld, block=%ld", + inode->i_ino, nr); + continue; + } + + /* This zaps the entire block. Bottom up. */ + BUFFER_TRACE(bh, "free child branches"); + ext3_free_branches(handle, inode, bh, (u32*)bh->b_data, + (u32*)bh->b_data + addr_per_block, + depth); + + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. + * + * If this block has already been committed to the + * journal, a revoke record will be written. And + * revoke records must be emitted *before* clearing + * this block's bit in the bitmaps. + */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + + /* + * Everything below this pointer has been + * released. Now let this top-of-subtree go. + * + * We want the freeing of this indirect block to be + * atomic in the journal with the updating of the + * bitmap block which owns it. So make some room in + * the journal. + * + * We zero the parent pointer *after* freeing its + * pointee in the bitmaps, so if extend_transaction() + * for some reason fails to put the bitmap changes and + * the release into the same transaction, recovery + * will merely complain about releasing a free block, + * rather than leaking blocks. + */ + if (is_handle_aborted(handle)) + return; + if (try_to_extend_transaction(handle, inode)) { + ext3_mark_inode_dirty(handle, inode); + ext3_journal_test_restart(handle, inode); + } + + ext3_free_blocks(handle, inode, nr, 1); + + if (parent_bh) { + /* + * The block which we have just freed is + * pointed to by an indirect block: journal it + */ + BUFFER_TRACE(parent_bh, "get_write_access"); + if (!ext3_journal_get_write_access(handle, + parent_bh)){ + *p = 0; + BUFFER_TRACE(parent_bh, + "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, + parent_bh); + } + } + } + } else { + /* We have reached the bottom of the tree. */ + BUFFER_TRACE(parent_bh, "free data blocks"); + ext3_free_data(handle, inode, parent_bh, first, last); + } + } + + /* + * ext3_truncate() + * + * We block out ext3_get_block() block instantiations across the entire + * transaction, and VFS/VM ensures that ext3_truncate() cannot run + * simultaneously on behalf of the same inode. + * + * As we work through the truncate and commit bits of it to the journal there + * is one core, guiding principle: the file's tree must always be consistent on + * disk. We must be able to restart the truncate after a crash.
+ * + * The file's tree may be transiently inconsistent in memory (although it + * probably isn't), but whenever we close off and commit a journal transaction, + * the contents of (the filesystem + the journal) must be consistent and + * restartable. It's pretty simple, really: bottom up, right to left (although + * left-to-right works OK too). + * + * Note that at recovery time, journal replay occurs *before* the restart of + * truncate against the orphan inode list. + * + * The committed inode has the new, desired i_size (which is the same as + * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see + * that this inode's truncate did not complete and it will again call + * ext3_truncate() to have another go. So there will be instantiated blocks + * to the right of the truncation point in a crashed ext3 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext3_truncate() run will find them and release them. + */ + + void ext3_truncate(struct inode * inode) + { + handle_t *handle; + u32 *i_data = inode->u.ext3_i.i_data; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + int nr = 0; + int n; + long last_block; + unsigned blocksize; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE_FILE(inode)) + return; + + ext3_discard_prealloc(inode); + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + blocksize = inode->i_sb->s_blocksize; + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); + + ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size); + + + n = ext3_block_to_path(inode, last_block, offsets); + if (n == 0) + goto out_stop; /* error */ + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext3 *really* writes onto the disk inode. + */ + inode->u.ext3_i.i_disksize = inode->i_size; + + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&inode->u.ext3_i.truncate_sem); + + if (n == 1) { /* direct blocks */ + ext3_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT3_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext3_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. 
+ */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext3_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext3_free_branches(handle, inode, partial->bh, partial->p + 1, + (u32*)partial->bh->b_data + addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse (partial->bh); + partial--; + } + do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT3_IND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 1); + i_data[EXT3_IND_BLOCK] = 0; + } + case EXT3_IND_BLOCK: + nr = i_data[EXT3_DIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 2); + i_data[EXT3_DIND_BLOCK] = 0; + } + case EXT3_DIND_BLOCK: + nr = i_data[EXT3_TIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 3); + i_data[EXT3_TIND_BLOCK] = 0; + } + case EXT3_TIND_BLOCK: + ; + } + up_write(&inode->u.ext3_i.truncate_sem); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous */ + if (IS_SYNC(inode)) + handle->h_sync = 1; + out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + ext3_journal_stop(handle, inode); + } + + /* + * ext3_get_inode_loc returns with an extra refcount against the + * inode's underlying buffer_head on success. 
+ */ + + int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) + { + struct buffer_head *bh = 0; + unsigned long block; + unsigned long block_group; + unsigned long group_desc; + unsigned long desc; + unsigned long offset; + struct ext3_group_desc * gdp; + + if ((inode->i_ino != EXT3_ROOT_INO && + inode->i_ino != EXT3_ACL_IDX_INO && + inode->i_ino != EXT3_ACL_DATA_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu( + inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "bad inode number: %lu", inode->i_ino); + goto bad_inode; + } + block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); + if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "group >= groups count"); + goto bad_inode; + } + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); + desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); + bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc]; + if (!bh) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "Descriptor not loaded"); + goto bad_inode; + } + + gdp = (struct ext3_group_desc *) bh->b_data; + /* + * Figure out the offset within the block group inode table + */ + offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * + EXT3_INODE_SIZE(inode->i_sb); + block = le32_to_cpu(gdp[desc].bg_inode_table) + + (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); + if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, block); + goto bad_inode; + } + offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); + + iloc->bh = bh; + iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); + iloc->block_group = block_group; + + return 0; + + bad_inode: + return -EIO; + } + + void ext3_read_inode(struct inode * inode) + { + struct ext3_iloc iloc; + struct ext3_inode *raw_inode; + struct buffer_head *bh; + int block; + + if(ext3_get_inode_loc(inode, &iloc)) + goto bad_inode; + bh = iloc.bh; + raw_inode = iloc.raw_inode; + init_rwsem(&inode->u.ext3_i.truncate_sem); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime = le32_to_cpu(raw_inode->i_atime); + inode->i_ctime = le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); + inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. + * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. 
*/ + } + inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size + * (for stat), not the fs block + * size */ + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + inode->i_version = ++event; + inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags); + #ifdef EXT3_FRAGMENTS + inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr); + inode->u.ext3_i.i_frag_no = raw_inode->i_frag; + inode->u.ext3_i.i_frag_size = raw_inode->i_fsize; + #endif + inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } + inode->u.ext3_i.i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + #ifdef EXT3_PREALLOCATE + inode->u.ext3_i.i_prealloc_count = 0; + #endif + inode->u.ext3_i.i_block_group = iloc.block_group; + + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! + */ + for (block = 0; block < EXT3_N_BLOCKS; block++) + inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block]; + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); + + brelse (iloc.bh); + + if (inode->i_ino == EXT3_ACL_IDX_INO || + inode->i_ino == EXT3_ACL_DATA_INO) + /* Nothing to do */ ; + else if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (!inode->i_blocks) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + } + } else + init_special_inode(inode, inode->i_mode, + le32_to_cpu(iloc.raw_inode->i_block[0])); + /* inode->i_attr_flags = 0; unused */ + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ + inode->i_flags |= S_SYNC; + } + if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */ + inode->i_flags |= S_APPEND; + } + if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FILE_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE; unused */ + inode->i_flags |= S_IMMUTABLE_FILE; + } + if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_LINK_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE_LINK; unused */ + inode->i_flags |= S_IMMUTABLE_LINK; + } + if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */ + inode->i_flags |= S_NOATIME; + } + return; + + bad_inode: + make_bad_inode(inode); + return; + } + + /* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. 
+ */ + + static int ext3_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) + { + struct ext3_inode *raw_inode = iloc->raw_inode; + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + + if (handle) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto out_brelse; + } + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if(!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + /* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!inode->u.ext3_i.i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime); + raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags); + #ifdef EXT3_FRAGMENTS + raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr); + raw_inode->i_frag = inode->u.ext3_i.i_frag_no; + raw_inode->i_fsize = inode->u.ext3_i.i_frag_size; + #else + /* If we are not tracking these fields in the in-memory inode, + * then preserve them on disk, but still initialise them to zero + * for new inodes. */ + if (inode->u.ext3_i.i_state & EXT3_STATE_NEW) { + raw_inode->i_faddr = 0; + raw_inode->i_frag = 0; + raw_inode->i_fsize = 0; + } + #endif + raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); + } else { + raw_inode->i_size_high = + cpu_to_le32(inode->u.ext3_i.i_disksize >> 32); + if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT3_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT3_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. 
+ */ + err = ext3_journal_get_write_access(handle, + sb->u.ext3_sb.s_sbh); + if (err) + goto out_brelse; + ext3_update_dynamic_rev(sb); + EXT3_SET_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; + handle->h_sync = 1; + err = ext3_journal_dirty_metadata(handle, + sb->u.ext3_sb.s_sbh); + } + } + } + raw_inode->i_generation = le32_to_cpu(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_block[0] = + cpu_to_le32(kdev_t_to_nr(inode->i_rdev)); + else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = inode->u.ext3_i.i_data[block]; + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; + inode->u.ext3_i.i_state &= ~EXT3_STATE_NEW; + + out_brelse: + brelse (bh); + ext3_std_error(inode->i_sb, err); + return err; + } + + /* + * ext3_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * transaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if told to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. We can't afford to block kswapd on the + * journal commit. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ + void ext3_write_inode(struct inode *inode, int wait) + { + if (current->flags & PF_MEMALLOC) + return; + + if (ext3_journal_current_handle()) { + jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); + return; + } + + if (!wait) + return; + + ext3_force_commit(inode->i_sb); + } + + /* + * ext3_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * This is only needed for regular files. rmdir() has its own path, and + * we can never truncate a directory except on final unlink (at which + * point i_nlink is zero so recovery is easy.) + * + * Called with the BKL.
+ */ + + int ext3_setattr(struct dentry *dentry, struct iattr *attr) + { + struct inode *inode = dentry->d_inode; + int error, rc; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { + handle_t *handle; + + handle = ext3_journal_start(inode, 3); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + goto err_out; + } + + error = ext3_orphan_add(handle, inode); + inode->u.ext3_i.i_disksize = attr->ia_size; + rc = ext3_mark_inode_dirty(handle, inode); + if (!error) + error = rc; + ext3_journal_stop(handle, inode); + } + + inode_setattr(inode, attr); + + /* If inode_setattr's call to ext3_truncate failed to get a + * transaction handle at all, we need to clean up the in-core + * orphan list manually. */ + if (inode->i_nlink) + ext3_orphan_del(NULL, inode); + + err_out: + ext3_std_error(inode->i_sb, error); + return 0; + } + + + /* + * akpm: how many blocks doth make a writepage()? + * + * With N blocks per page, it may be: + * N data blocks + * 2 indirect block + * 2 dindirect + * 1 tindirect + * N+5 bitmap blocks (from the above) + * N+5 group descriptor summary blocks + * 1 inode block + * 1 superblock. + * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files + * + * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS + * + * With ordered or writeback data it's the same, less the N data blocks. + * + * If the inode's direct blocks can hold an integral number of pages then a + * page cannot straddle two indirect blocks, and we can only touch one indirect + * and dindirect block, and the "5" above becomes "3". + * + * This still overestimates under most circumstances. If we were to pass the + * start and end offsets in here as well we could do block_to_path() on each + * block and work out the exact number of indirects which are touched. Pah. + */ + + int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else + ret = 2 * (bpp + indirects) + 2; + + #ifdef CONFIG_QUOTA + ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; + #endif + + return ret; + } + + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) + { + int err = 0; + + if (handle) { + /* the do_update_inode consumes one bh->b_count */ + atomic_inc(&iloc->bh->b_count); + err = ext3_do_update_inode(handle, inode, iloc); + /* ext3_do_update_inode() does journal_dirty_metadata */ + brelse(iloc->bh); + } else { + printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n"); + } + return err; + } + + /* + * On success, we end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + + int + ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc) + { + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + } + ext3_std_error(inode->i_sb, err); + return err; + } + + /* + * akpm: What we do here is to mark the in-core inode as clean + * with respect to inode dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O.
This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. + * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ + int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) + { + struct ext3_iloc iloc; + int err; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (!err) + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + return err; + } + + /* + * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ + void ext3_dirty_inode(struct inode *inode) + { + handle_t *current_handle = ext3_journal_current_handle(); + handle_t *handle; + + lock_kernel(); + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + goto out; + if (current_handle && + current_handle->h_transaction != handle->h_transaction) { + /* This task has a transaction open against a different fs */ + printk(KERN_EMERG __FUNCTION__": transactions do not match!\n"); + } else { + jbd_debug(5, "marking dirty. outer handle=%p\n", + current_handle); + ext3_mark_inode_dirty(handle, inode); + } + ext3_journal_stop(handle, inode); + out: + unlock_kernel(); + } + + #ifdef AKPM + /* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext3_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ + static inline int + ext3_pin_inode(handle_t *handle, struct inode *inode) + { + struct ext3_iloc iloc; + + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext3_journal_dirty_metadata(handle, + iloc.bh); + brelse(iloc.bh); + } + } + ext3_std_error(inode->i_sb, err); + return err; + } + #endif + + int ext3_change_inode_journal_flag(struct inode *inode, int val) + { + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. 
+ * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT3_JOURNAL(inode); + if (is_journal_aborted(journal) || IS_RDONLY(inode)) + return -EROFS; + + journal_lock_updates(journal); + journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. + */ + + if (val) + inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL; + else + inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL; + + journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext3_mark_inode_dirty(handle, inode); + handle->h_sync = 1; + ext3_journal_stop(handle, inode); + ext3_std_error(inode->i_sb, err); + + return err; + } + + + /* + * ext3_aops_journal_start(). + * + * + * + * We need to take the inode semaphore *outside* the + * journal_start/journal_stop. Otherwise, a different task could do a + * wait_for_commit() while holding ->i_sem, which deadlocks. The rule + * is: transaction open/closes are considered to be a locking operation + * and they nest *inside* ->i_sem. + * ---------------------------------------------------------------------------- + * Possible problem: + * ext3_file_write() + * -> generic_file_write() + * -> __alloc_pages() + * -> page_launder() + * -> ext3_writepage() + * + * And the writepage can be on a different fs while we have a + * transaction open against this one! Bad. + * + * I tried making the task PF_MEMALLOC here, but that simply results in + * 0-order allocation failures passed back to generic_file_write(). + * Instead, we rely on the reentrancy protection in ext3_writepage(). + * ---------------------------------------------------------------------------- + * When we do the journal_start() here we don't really need to reserve + * any blocks - we won't need any until we hit ext3_prepare_write(), + * which does all the needed journal extending. However! There is a + * problem with quotas: + * + * Thread 1: + * sys_sync + * ->sync_dquots + * ->commit_dquot + * ->lock_dquot + * ->write_dquot + * ->ext3_file_write + * ->journal_start + * ->ext3_prepare_write + * ->journal_extend + * ->journal_start + * Thread 2: + * ext3_create (for example) + * ->ext3_new_inode + * ->dquot_initialize + * ->lock_dquot + * + * Deadlock. Thread 1's journal_start blocks because thread 2 has a + * transaction open. Thread 2's transaction will never close because + * thread 2 is stuck waiting for the dquot lock. + * + * So. We must ensure that thread 1 *never* needs to extend the journal + * for quota writes. We do that by reserving enough journal blocks + * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we + * need to extend" test in ext3_prepare_write() succeeds. 
+ */ + + + MODULE_LICENSE("GPL"); diff -rc2P linux/fs/ext3/ioctl.c linux-2.4.13/fs/ext3/ioctl.c *** linux/fs/ext3/ioctl.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/ioctl.c Fri Nov 9 17:03:13 2001 *************** *** 0 **** --- 1,176 ---- + /* + * linux/fs/ext3/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + + #include + #include + #include + #include + #include + #include + + + int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) + { + unsigned int flags; + + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT3_IOC_GETFLAGS: + flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int *) arg); + case EXT3_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err; + struct ext3_iloc iloc; + unsigned int oldflags; + unsigned int jflag; + + if (IS_RDONLY(inode)) + return -EROFS; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (get_user(flags, (int *) arg)) + return -EFAULT; + + oldflags = inode->u.ext3_i.i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FILE_FL | EXT3_IMMUTABLE_LINK_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + } + + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + flags = flags & EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; + inode->u.ext3_i.i_flags = flags; + + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (flags & EXT3_IMMUTABLE_FILE_FL) + inode->i_flags |= S_IMMUTABLE_FILE; + else + inode->i_flags &= ~S_IMMUTABLE_FILE; + + if (flags & EXT3_IMMUTABLE_LINK_FL) + inode->i_flags |= S_IMMUTABLE_LINK; + else + inode->i_flags &= ~S_IMMUTABLE_LINK; + + if (flags & EXT3_NOATIME_FL) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; + inode->i_ctime = CURRENT_TIME; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + flags_err: + ext3_journal_stop(handle, inode); + if (err) + return err; + + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int *) arg); + case EXT3_IOC_SETVERSION: + case EXT3_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext3_iloc iloc; + __u32 generation; + int err; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + if (IS_RDONLY(inode)) + return -EROFS; + if (get_user(generation, (int *) arg)) + return -EFAULT; + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + err = 
ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode->i_generation = generation; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + ext3_journal_stop(handle, inode); + return err; + } + #ifdef CONFIG_JBD_DEBUG + case EXT3_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. + */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); + if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); + return ret; + } + #endif + default: + return -ENOTTY; + } + } diff -rc2P linux/fs/ext3/namei.c linux-2.4.13/fs/ext3/namei.c *** linux/fs/ext3/namei.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/namei.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,1125 ---- + /* + * linux/fs/ext3/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + + /* + * define how far ahead to read directories while searching them. + */ + #define NAMEI_RA_CHUNKS 2 + #define NAMEI_RA_BLOCKS 4 + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + + /* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. + * + * `len <= EXT3_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. 
+ */ + static inline int ext3_match (int len, const char * const name, + struct ext3_dir_entry_2 * de) + { + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); + } + + /* + * Returns 0 if not found, -1 on failure, and 1 on success + */ + static int inline search_dirblock(struct buffer_head * bh, + struct inode *dir, + struct dentry *dentry, + unsigned long offset, + struct ext3_dir_entry_2 ** res_dir) + { + struct ext3_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + + de = (struct ext3_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext3_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ext3_check_dir_entry("ext3_find_entry", + dir, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); + } + return 0; + } + + /* + * ext3_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) + { + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; + struct buffer_head * bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; + + *res_dir = NULL; + sb = dir->i_sb; + + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = dir->u.ext3_i.i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; + restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. 
+ */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext3_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ, 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, dentry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + dir->u.ext3_i.i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + + cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); + return ret; + } + + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); + brelse (bh); + inode = iget(dir->i_sb, ino); + + if (!inode) + return ERR_PTR(-EACCES); + } + d_add(dentry, inode); + return NULL; + } + + #define S_SHIFT 12 + static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] EXT3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] EXT3_FT_DIR, + [S_IFCHR >> S_SHIFT] EXT3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] EXT3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] EXT3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] EXT3_FT_SOCK, + [S_IFLNK >> S_SHIFT] EXT3_FT_SYMLINK, + }; + + static inline void ext3_set_de_type(struct super_block *sb, + struct ext3_dir_entry_2 *de, + umode_t mode) { + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + } + + /* + * ext3_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext3_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. 
+ */ + + /* + * AKPM: the journalling code here looks wrong on the error paths + */ + static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) + { + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset; + unsigned short rec_len; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de, * de1; + struct super_block * sb; + int retval; + + sb = dir->i_sb; + + if (!namelen) + return -EINVAL; + bh = ext3_bread (handle, dir, 0, 0, &retval); + if (!bh) + return retval; + rec_len = EXT3_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ext3_dir_entry_2 *) bh->b_data; + while (1) { + if ((char *)de >= sb->s_blocksize + bh->b_data) { + brelse (bh); + bh = NULL; + bh = ext3_bread (handle, dir, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); + if (!bh) + return retval; + if (dir->i_size <= offset) { + if (dir->i_size == 0) { + brelse(bh); + return -ENOENT; + } + + ext3_debug ("creating next block\n"); + + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = le16_to_cpu(sb->s_blocksize); + dir->u.ext3_i.i_disksize = + dir->i_size = offset + sb->s_blocksize; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + } else { + + ext3_debug ("skipping to next block\n"); + + de = (struct ext3_dir_entry_2 *) bh->b_data; + } + } + if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, + offset)) { + brelse (bh); + return -ENOENT; + } + if (ext3_match (namelen, name, de)) { + brelse (bh); + return -EEXIST; + } + if ((le32_to_cpu(de->inode) == 0 && + le16_to_cpu(de->rec_len) >= rec_len) || + (le16_to_cpu(de->rec_len) >= + EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + /* By now the buffer is marked for journaling */ + offset += le16_to_cpu(de->rec_len); + if (le32_to_cpu(de->inode)) { + de1 = (struct ext3_dir_entry_2 *) ((char *) de + + EXT3_DIR_REC_LEN(de->name_len)); + de1->rec_len = + cpu_to_le16(le16_to_cpu(de->rec_len) - + EXT3_DIR_REC_LEN(de->name_len)); + de->rec_len = cpu_to_le16( + EXT3_DIR_REC_LEN(de->name_len)); + de = de1; + } + de->file_type = EXT3_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext3_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy (de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext3_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. 
+ */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + dir->i_version = ++event; + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + return 0; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + brelse (bh); + return -ENOSPC; + } + + /* + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry + */ + static int ext3_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3_dir_entry_2 * de_del, + struct buffer_head * bh) + { + struct ext3_dir_entry_2 * de, * pde; + int i; + + i = 0; + pde = NULL; + de = (struct ext3_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) + return -EIO; + if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + if (pde) + pde->rec_len = + cpu_to_le16(le16_to_cpu(pde->rec_len) + + le16_to_cpu(de->rec_len)); + else + de->inode = 0; + dir->i_version = ++event; + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, bh); + return 0; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + return -ENOENT; + } + + /* + * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we + * do not perform it in these functions. We perform it at the call site, + * if it is needed. + */ + static inline void ext3_inc_count(handle_t *handle, struct inode *inode) + { + inode->i_nlink++; + } + + static inline void ext3_dec_count(handle_t *handle, struct inode *inode) + { + inode->i_nlink--; + } + + static int ext3_add_nondir(handle_t *handle, + struct dentry *dentry, struct inode *inode) + { + int err = ext3_add_entry(handle, dentry, inode); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + ext3_dec_count(handle, inode); + iput(inode); + return err; + } + + /* + * By the time this is called, we already have created + * the directory cache entry for the new file, but it + * is so far negative - it has no inode. + * + * If the create succeeds, we fill in the inode information + * with d_instantiate(). 
+ */ + static int ext3_create (struct inode * dir, struct dentry * dentry, int mode) + { + handle_t *handle; + struct inode * inode; + int err; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle, dir); + return err; + } + + static int ext3_mknod (struct inode * dir, struct dentry *dentry, + int mode, int rdev) + { + handle_t *handle; + struct inode *inode; + int err; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, mode); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, mode, rdev); + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + } + ext3_journal_stop(handle, dir); + return err; + } + + static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) + { + handle_t *handle; + struct inode * inode; + struct buffer_head * dir_block; + struct ext3_dir_entry_2 * de; + int err; + + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, S_IFDIR); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; + inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? 
*/ + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + BUFFER_TRACE(dir_block, "get_write_access"); + ext3_journal_get_write_access(handle, dir_block); + de = (struct ext3_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len)); + strcpy (de->name, "."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy (de->name, ".."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); + inode->i_mode = S_IFDIR | mode; + if (dir->i_mode & S_ISGID) + inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); + if (err) + goto out_no_entry; + dir->i_nlink++; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + d_instantiate(dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; + + out_no_entry: + inode->i_nlink = 0; + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + + /* + * routine to check that the specified directory is empty (for rmdir) + */ + static int empty_dir (struct inode * inode) + { + unsigned long offset; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de, * de1; + struct super_block * sb; + int err; + + sb = inode->i_sb; + if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || + !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { + ext3_warning (inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de1 = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3_warning (inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse (bh); + return 1; + } + offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); + de = (struct ext3_dir_entry_2 *) + ((char *) de1 + le16_to_cpu(de1->rec_len)); + while (offset < inode->i_size ) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + brelse (bh); + bh = ext3_bread (NULL, inode, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); + if (!bh) { + #if 0 + ext3_error (sb, "empty_dir", + "directory #%lu contains a hole at offset %lu", + inode->i_ino, offset); + #endif + offset += sb->s_blocksize; + continue; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + } + if (!ext3_check_dir_entry ("empty_dir", inode, de, bh, + offset)) { + brelse (bh); + return 1; + } + if (le32_to_cpu(de->inode)) { + brelse (bh); + return 0; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + brelse (bh); + return 1; + } + + /* ext3_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. 
+ * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext3_orphan_cleanup(). + */ + int ext3_orphan_add(handle_t *handle, struct inode *inode) + { + struct super_block *sb = inode->i_sb; + struct ext3_iloc iloc; + int err = 0, rc; + + lock_super(sb); + if (!list_empty(&inode->u.ext3_i.i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * here (on lock_super()), so race with ext3_link() which might bump + * ->i_nlink. For, say it, character device. Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + */ + J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (err) + goto out_unlock; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); + EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + rc = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) + list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); + jbd_debug(4, "orphan inode %ld will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); + out_unlock: + unlock_super(sb); + ext3_std_error(inode->i_sb, err); + return err; + } + + /* + * ext3_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ + int ext3_orphan_del(handle_t *handle, struct inode *inode) + { + struct list_head *prev; + struct ext3_sb_info *sbi; + ino_t ino_next; + struct ext3_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); + if (list_empty(&inode->u.ext3_i.i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); + prev = inode->u.ext3_i.i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); + + list_del(&inode->u.ext3_i.i_orphan); + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. 
*/ + if (!handle) + goto out; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %ld\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext3_iloc iloc2; + struct inode *i_prev = + list_entry(prev, struct inode, u.ext3_i.i_orphan); + + jbd_debug(4, "orphan inode %ld will point to %ld\n", + i_prev->i_ino, ino_next); + err = ext3_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto out_brelse; + + out_err: + ext3_std_error(inode->i_sb, err); + out: + unlock_super(inode->i_sb); + return err; + + out_brelse: + brelse(iloc.bh); + goto out_err; + } + + static int ext3_rmdir (struct inode * dir, struct dentry *dentry) + { + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); + if (!bh) + goto end_rmdir; + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode = dentry->d_inode; + DQUOT_INIT(inode); + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir (inode)) + goto end_rmdir; + + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + ext3_warning (inode->i_sb, "ext3_rmdir", + "empty directory has nlink!=2 (%d)", + inode->i_nlink); + inode->i_version = ++event; + inode->i_nlink = 0; + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. 
*/ + inode->i_size = 0; + ext3_orphan_add(handle, inode); + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + + end_rmdir: + ext3_journal_stop(handle, dir); + brelse (bh); + return retval; + } + + static int ext3_unlink(struct inode * dir, struct dentry *dentry) + { + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + DQUOT_INIT(inode); + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + ext3_mark_inode_dirty(handle, inode); + inode->i_ctime = dir->i_ctime; + retval = 0; + + end_unlink: + ext3_journal_stop(handle, dir); + brelse (bh); + return retval; + } + + static int ext3_symlink (struct inode * dir, + struct dentry *dentry, const char * symname) + { + handle_t *handle; + struct inode * inode; + int l, err; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > sizeof (inode->u.ext3_i.i_data)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* + * block_symlink() calls back into ext3_prepare/commit_write. + * We have a transaction open. All is sweetness. It also sets + * i_size in generic_commit_write(). 
+ */ + err = block_symlink(inode, symname, l); + if (err) + goto out_no_entry; + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; + memcpy((char*)&inode->u.ext3_i.i_data,symname,l); + inode->i_size = l-1; + } + inode->u.ext3_i.i_disksize = inode->i_size; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; + + out_no_entry: + ext3_dec_count(handle, inode); + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err; + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + if (inode->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode->i_ctime = CURRENT_TIME; + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + ext3_journal_stop(handle, dir); + return err; + } + + #define PARENT_INO(buffer) \ + ((struct ext3_dir_entry_2 *) ((char *) buffer + \ + le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode + + /* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. + */ + static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir,struct dentry *new_dentry) + { + handle_t *handle; + struct inode * old_inode, * new_inode; + struct buffer_head * old_bh, * new_bh, * dir_bh; + struct ext3_dir_entry_2 * old_de, * new_de; + int retval; + + old_bh = new_bh = dir_bh = NULL; + + handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(old_dir) || IS_SYNC(new_dir)) + handle->h_sync = 1; + + old_bh = ext3_find_entry (old_dentry, &old_de); + /* + * Check for inode number is _not_ due to possible IO errors. + * We might rmdir the source, keep it as pwd of some process + * and merrily kill the link to whatever was created under the + * same name. 
Goodbye sticky bit ;-< + */ + old_inode = old_dentry->d_inode; + retval = -ENOENT; + if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) + goto end_rename; + + new_inode = new_dentry->d_inode; + new_bh = ext3_find_entry (new_dentry, &new_de); + if (new_bh) { + if (!new_inode) { + brelse (new_bh); + new_bh = NULL; + } else { + DQUOT_INIT(new_inode); + } + } + if (S_ISDIR(old_inode->i_mode)) { + if (new_inode) { + retval = -ENOTEMPTY; + if (!empty_dir (new_inode)) + goto end_rename; + } + retval = -EIO; + dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval); + if (!dir_bh) + goto end_rename; + if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) + goto end_rename; + retval = -EMLINK; + if (!new_inode && new_dir!=old_dir && + new_dir->i_nlink >= EXT3_LINK_MAX) + goto end_rename; + } + if (!new_bh) { + retval = ext3_add_entry (handle, new_dentry, old_inode); + if (retval) + goto end_rename; + } else { + BUFFER_TRACE(new_bh, "get write access"); + BUFFER_TRACE(new_bh, "get_write_access"); + ext3_journal_get_write_access(handle, new_bh); + new_de->inode = le32_to_cpu(old_inode->i_ino); + if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb, + EXT3_FEATURE_INCOMPAT_FILETYPE)) + new_de->file_type = old_de->file_type; + new_dir->i_version = ++event; + BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, new_bh); + brelse(new_bh); + new_bh = NULL; + } + + /* + * Like most other Unix systems, set the ctime for inodes on a + * rename. + */ + old_inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, old_inode); + + /* + * ok, that's it + */ + ext3_delete_entry(handle, old_dir, old_de, old_bh); + + if (new_inode) { + new_inode->i_nlink--; + new_inode->i_ctime = CURRENT_TIME; + } + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + if (dir_bh) { + BUFFER_TRACE(dir_bh, "get_write_access"); + ext3_journal_get_write_access(handle, dir_bh); + PARENT_INO(dir_bh->b_data) = le32_to_cpu(new_dir->i_ino); + BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_bh); + old_dir->i_nlink--; + if (new_inode) { + new_inode->i_nlink--; + } else { + new_dir->i_nlink++; + new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, new_dir); + } + } + ext3_mark_inode_dirty(handle, old_dir); + if (new_inode) { + ext3_mark_inode_dirty(handle, new_inode); + if (!new_inode->i_nlink) + ext3_orphan_add(handle, new_inode); + } + retval = 0; + + end_rename: + brelse (dir_bh); + brelse (old_bh); + brelse (new_bh); + ext3_journal_stop(handle, old_dir); + return retval; + } + + /* + * directories can handle most operations... 
+ */ + struct inode_operations ext3_dir_inode_operations = { + create: ext3_create, /* BKL held */ + lookup: ext3_lookup, /* BKL held */ + link: ext3_link, /* BKL held */ + unlink: ext3_unlink, /* BKL held */ + symlink: ext3_symlink, /* BKL held */ + mkdir: ext3_mkdir, /* BKL held */ + rmdir: ext3_rmdir, /* BKL held */ + mknod: ext3_mknod, /* BKL held */ + rename: ext3_rename, /* BKL held */ + }; diff -rc2P linux/fs/ext3/super.c linux-2.4.13/fs/ext3/super.c *** linux/fs/ext3/super.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/super.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,1743 ---- + /* + * linux/fs/ext3/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #ifdef CONFIG_JBD_DEBUG + static int ext3_ro_after; /* Make fs read-only after this many jiffies */ + #endif + + static int ext3_load_journal(struct super_block *, struct ext3_super_block *); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); + static void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync); + static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es); + static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es); + + #ifdef CONFIG_JBD_DEBUG + /* + * Debug code for turning filesystems "read-only" after a specified + * amount of time. This is for crash/recovery testing. 
+ */ + + static void make_rdonly(kdev_t dev, int *no_write) + { + if (dev) { + printk(KERN_WARNING "Turning device %s read-only\n", + bdevname(dev)); + *no_write = 0xdead0000 + dev; + } + } + + static void turn_fs_readonly(unsigned long arg) + { + struct super_block *sb = (struct super_block *)arg; + + make_rdonly(sb->s_dev, &journal_no_write[0]); + make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]); + wake_up(&EXT3_SB(sb)->ro_wait_queue); + } + + static void setup_ro_after(struct super_block *sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + init_timer(&sbi->turn_ro_timer); + if (ext3_ro_after) { + printk(KERN_DEBUG "fs will go read-only in %d jiffies\n", + ext3_ro_after); + init_waitqueue_head(&sbi->ro_wait_queue); + journal_no_write[0] = 0; + journal_no_write[1] = 0; + sbi->turn_ro_timer.function = turn_fs_readonly; + sbi->turn_ro_timer.data = (unsigned long)sb; + sbi->turn_ro_timer.expires = jiffies + ext3_ro_after; + ext3_ro_after = 0; + add_timer(&sbi->turn_ro_timer); + } + } + + static void clear_ro_after(struct super_block *sb) + { + del_timer_sync(&EXT3_SB(sb)->turn_ro_timer); + journal_no_write[0] = 0; + journal_no_write[1] = 0; + ext3_ro_after = 0; + } + #else + #define setup_ro_after(sb) do {} while (0) + #define clear_ro_after(sb) do {} while (0) + #endif + + + static char error_buf[1024]; + + /* Determine the appropriate response to ext3_error on a given filesystem */ + + static int ext3_error_behaviour(struct super_block *sb) + { + /* First check for mount-time options */ + if (test_opt (sb, ERRORS_PANIC)) + return EXT3_ERRORS_PANIC; + if (test_opt (sb, ERRORS_RO)) + return EXT3_ERRORS_RO; + if (test_opt (sb, ERRORS_CONT)) + return EXT3_ERRORS_CONTINUE; + + /* If no overrides were specified on the mount, then fall back + * to the default behaviour set in the filesystem's superblock + * on disk. */ + switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) { + case EXT3_ERRORS_PANIC: + return EXT3_ERRORS_PANIC; + case EXT3_ERRORS_RO: + return EXT3_ERRORS_RO; + default: + break; + } + return EXT3_ERRORS_CONTINUE; + } + + /* Deal with the reporting of failure conditions on a filesystem such as + * inconsistencies detected or read IO failures. + * + * On ext2, we can store the error state of the filesystem in the + * superblock. That is not possible on ext3, because we may have other + * write ordering constraints on the superblock which prevent us from + * writing it out straight away; and given that the journal is about to + * be aborted, we can't rely on the current, or future, transactions to + * write out the superblock safely. + * + * We'll just use the journal_abort() error code to record an error in + * the journal instead. On recovery, the journal will complain about + * that error until we've noted it down and cleared it.
+ */ + + static void ext3_handle_error(struct super_block *sb) + { + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le32(EXT3_ERROR_FS); + + if (sb->s_flags & MS_RDONLY) + return; + + if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) { + EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT; + journal_abort(EXT3_SB(sb)->s_journal, -EIO); + } + + if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC) + panic ("EXT3-fs (device %s): panic forced after error\n", + bdevname(sb->s_dev)); + + if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) { + printk (KERN_CRIT "Remounting filesystem read-only\n"); + sb->s_flags |= MS_RDONLY; + } + + ext3_commit_super(sb, es, 1); + } + + void ext3_error (struct super_block * sb, const char * function, + const char * fmt, ...) + { + va_list args; + + va_start (args, fmt); + vsprintf (error_buf, fmt, args); + va_end (args); + + printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n", + bdevname(sb->s_dev), function, error_buf); + + ext3_handle_error(sb); + } + + const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]) + { + char *errstr = NULL; + + switch (errno) { + case -EIO: + errstr = "IO failure"; + break; + case -ENOMEM: + errstr = "Out of memory"; + break; + case -EROFS: + if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT) + errstr = "Journal has aborted"; + else + errstr = "Readonly filesystem"; + break; + default: + /* If the caller passed in an extra buffer for unknown + * errors, textualise them now. Else we just return + * NULL. */ + if (nbuf) { + /* Check for truncated error codes... */ + if (snprintf(nbuf, 16, "error %d", -errno) >= 0) + errstr = nbuf; + } + + break; + } + + return errstr; + } + + /* __ext3_std_error decodes expected errors from journaling functions + * automatically and invokes the appropriate error response. */ + + void __ext3_std_error (struct super_block * sb, const char * function, + int errno) + { + char nbuf[16]; + const char *errstr = ext3_decode_error(sb, errno, nbuf); + + printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n", + bdevname(sb->s_dev), function, errstr); + + ext3_handle_error(sb); + } + + /* + * ext3_abort is a much stronger failure handler than ext3_error. The + * abort function may be used to deal with unrecoverable failures such + * as journal IO errors or ENOMEM at a critical moment in log management. + * + * We unconditionally force the filesystem into an ABORT|READONLY state, + * unless the error response on the fs has been set to panic in which + * case we take the easy way out and panic immediately. + */ + + void ext3_abort (struct super_block * sb, const char * function, + const char * fmt, ...) + { + va_list args; + + printk (KERN_CRIT "ext3_abort called.\n"); + + va_start (args, fmt); + vsprintf (error_buf, fmt, args); + va_end (args); + + if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC) + panic ("EXT3-fs panic (device %s): %s: %s\n", + bdevname(sb->s_dev), function, error_buf); + + printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n", + bdevname(sb->s_dev), function, error_buf); + + if (sb->s_flags & MS_RDONLY) + return; + + printk (KERN_CRIT "Remounting filesystem read-only\n"); + sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; + sb->s_flags |= MS_RDONLY; + sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT; + journal_abort(EXT3_SB(sb)->s_journal, -EIO); + } + + /* Deal with the reporting of failure conditions while running, such as + * inconsistencies in operation or invalid system states. 
+ * + * Use ext3_error() for cases of invalid filesystem states, as that will + * record an error on disk and force a filesystem check on the next boot. + */ + NORET_TYPE void ext3_panic (struct super_block * sb, const char * function, + const char * fmt, ...) + { + va_list args; + + va_start (args, fmt); + vsprintf (error_buf, fmt, args); + va_end (args); + + /* this is to prevent panic from syncing this filesystem */ + /* AKPM: is this sufficient? */ + sb->s_flags |= MS_RDONLY; + panic ("EXT3-fs panic (device %s): %s: %s\n", + bdevname(sb->s_dev), function, error_buf); + } + + void ext3_warning (struct super_block * sb, const char * function, + const char * fmt, ...) + { + va_list args; + + va_start (args, fmt); + vsprintf (error_buf, fmt, args); + va_end (args); + printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n", + bdevname(sb->s_dev), function, error_buf); + } + + void ext3_update_dynamic_rev(struct super_block *sb) + { + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + + if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) + return; + + ext3_warning(sb, __FUNCTION__, + "updating to rev %d because of new feature flag, " + "running e2fsck is recommended", + EXT3_DYNAMIC_REV); + + es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO); + es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE); + es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV); + /* leave es->s_feature_*compat flags alone */ + /* es->s_uuid will be set by e2fsck if empty */ + + /* + * The rest of the superblock fields should be zero, and if not it + * means they are likely already in use, so leave them alone. We + * can leave it up to e2fsck to clean up any inconsistencies there. + */ + } + + /* + * Open the external journal device + */ + static struct block_device *ext3_blkdev_get(kdev_t dev) + { + struct block_device *bdev; + int err = -ENODEV; + + bdev = bdget(kdev_t_to_nr(dev)); + if (bdev == NULL) + goto fail; + err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS); + if (err < 0) + goto fail; + return bdev; + + fail: + printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n", + bdevname(dev), err); + return NULL; + } + + /* + * Release the journal device + */ + static int ext3_blkdev_put(struct block_device *bdev) + { + return blkdev_put(bdev, BDEV_FS); + } + + static int ext3_blkdev_remove(struct ext3_sb_info *sbi) + { + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext3_blkdev_put(bdev); + sbi->journal_bdev = 0; + } + return ret; + } + + #define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan) + + static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) + { + struct list_head *l; + + printk(KERN_ERR "sb orphan head is %d\n", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n", + inode->i_dev, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + le32_to_cpu(NEXT_ORPHAN(inode))); + } + } + + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + es->s_state = le16_to_cpu(sbi->s_mount_state); + BUFFER_TRACE(sbi->s_sbh, "marking 
dirty"); + mark_buffer_dirty(sbi->s_sbh); + ext3_commit_super(sb, es, 1); + } + + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) + brelse(sbi->s_inode_bitmap[i]); + for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) + brelse(sbi->s_block_bitmap[i]); + brelse(sbi->s_sbh); + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_buffers(sb->s_dev); + if (j_dev != sb->s_dev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + fsync_no_super(j_dev); + invalidate_buffers(j_dev); + ext3_blkdev_remove(sbi); + } + clear_ro_after(sb); + + return; + } + + static struct super_operations ext3_sops = { + read_inode: ext3_read_inode, /* BKL held */ + write_inode: ext3_write_inode, /* BKL not held. Don't need */ + dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ + put_inode: ext3_put_inode, /* BKL not held. Don't need */ + delete_inode: ext3_delete_inode, /* BKL not held. We take it */ + put_super: ext3_put_super, /* BKL held */ + write_super: ext3_write_super, /* BKL held */ + write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */ + unlockfs: ext3_unlockfs, /* BKL not held. We take it */ + statfs: ext3_statfs, /* BKL held */ + remount_fs: ext3_remount, /* BKL held */ + }; + + static int want_value(char *value, char *option) + { + if (!value || !*value) { + printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n", + option); + return -1; + } + return 0; + } + + static int want_null_value(char *value, char *option) + { + if (*value) { + printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n", + option, value); + return -1; + } + return 0; + } + + static int want_numeric(char *value, char *option, unsigned long *number) + { + if (want_value(value, option)) + return -1; + *number = simple_strtoul(value, &value, 0); + if (want_null_value(value, option)) + return -1; + return 0; + } + + /* + * This function has been shamelessly adapted from the msdos fs + */ + static int parse_options (char * options, unsigned long * sb_block, + struct ext3_sb_info *sbi, + unsigned long * inum, + int is_remount) + { + unsigned long *mount_options = &sbi->s_mount_opt; + uid_t *resuid = &sbi->s_resuid; + gid_t *resgid = &sbi->s_resgid; + char * this_char; + char * value; + + if (!options) + return 1; + for (this_char = strtok (options, ","); + this_char != NULL; + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { + set_opt (*mount_options, NO_UID32); + } + else if (!strcmp (this_char, "abort")) + set_opt (*mount_options, ABORT); + else if (!strcmp (this_char, "check")) { + if (!value || !*value || !strcmp (value, "none")) + clear_opt (*mount_options, CHECK); + else + #ifdef CONFIG_EXT3_CHECK + set_opt (*mount_options, CHECK); + #else + printk(KERN_ERR + "EXT3 Check option not supported\n"); + #endif + } + else if (!strcmp (this_char, "debug")) + set_opt (*mount_options, DEBUG); + else if 
(!strcmp (this_char, "errors")) { + if (want_value(value, "errors")) + return 0; + if (!strcmp (value, "continue")) { + clear_opt (*mount_options, ERRORS_RO); + clear_opt (*mount_options, ERRORS_PANIC); + set_opt (*mount_options, ERRORS_CONT); + } + else if (!strcmp (value, "remount-ro")) { + clear_opt (*mount_options, ERRORS_CONT); + clear_opt (*mount_options, ERRORS_PANIC); + set_opt (*mount_options, ERRORS_RO); + } + else if (!strcmp (value, "panic")) { + clear_opt (*mount_options, ERRORS_CONT); + clear_opt (*mount_options, ERRORS_RO); + set_opt (*mount_options, ERRORS_PANIC); + } + else { + printk (KERN_ERR + "EXT3-fs: Invalid errors option: %s\n", + value); + return 0; + } + } + else if (!strcmp (this_char, "grpid") || + !strcmp (this_char, "bsdgroups")) + set_opt (*mount_options, GRPID); + else if (!strcmp (this_char, "minixdf")) + set_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nocheck")) + clear_opt (*mount_options, CHECK); + else if (!strcmp (this_char, "nogrpid") || + !strcmp (this_char, "sysvgroups")) + clear_opt (*mount_options, GRPID); + else if (!strcmp (this_char, "resgid")) { + unsigned long v; + if (want_numeric(value, "resgid", &v)) + return 0; + *resgid = v; + } + else if (!strcmp (this_char, "resuid")) { + unsigned long v; + if (want_numeric(value, "resuid", &v)) + return 0; + *resuid = v; + } + else if (!strcmp (this_char, "sb")) { + if (want_numeric(value, "sb", sb_block)) + return 0; + } + #ifdef CONFIG_JBD_DEBUG + else if (!strcmp (this_char, "ro-after")) { + unsigned long v; + if (want_numeric(value, "ro-after", &v)) + return 0; + ext3_ro_after = v; + } + #endif + /* Silently ignore the quota options */ + else if (!strcmp (this_char, "grpquota") + || !strcmp (this_char, "noquota") + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; + else if (!strcmp (this_char, "journal")) { + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. 
*/ + if (is_remount) { + printk(KERN_ERR "EXT3-fs: cannot specify " + "journal on remount\n"); + return 0; + } + + if (want_value(value, "journal")) + return 0; + if (!strcmp (value, "update")) + set_opt (*mount_options, UPDATE_JOURNAL); + else if (want_numeric(value, "journal", inum)) + return 0; + } + else if (!strcmp (this_char, "noload")) + set_opt (*mount_options, NOLOAD); + else if (!strcmp (this_char, "data")) { + int data_opt = 0; + + if (want_value(value, "data")) + return 0; + if (!strcmp (value, "journal")) + data_opt = EXT3_MOUNT_JOURNAL_DATA; + else if (!strcmp (value, "ordered")) + data_opt = EXT3_MOUNT_ORDERED_DATA; + else if (!strcmp (value, "writeback")) + data_opt = EXT3_MOUNT_WRITEBACK_DATA; + else { + printk (KERN_ERR + "EXT3-fs: Invalid data option: %s\n", + value); + return 0; + } + if (is_remount) { + if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) != + data_opt) { + printk(KERN_ERR + "EXT3-fs: cannot change data " + "mode on remount\n"); + return 0; + } + } else { + *mount_options &= ~EXT3_MOUNT_DATA_FLAGS; + *mount_options |= data_opt; + } + } else { + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option %s\n", + this_char); + return 0; + } + } + return 1; + } + + static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, + int read_only) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { + printk (KERN_ERR "EXT3-fs warning: revision level too high, " + "forcing read-only mode\n"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT3_VALID_FS)) + printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " + "running e2fsck is recommended\n"); + else if ((sbi->s_mount_state & EXT3_ERROR_FS)) + printk (KERN_WARNING + "EXT3-fs warning: mounting fs with errors, " + "running e2fsck is recommended\n"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) + printk (KERN_WARNING + "EXT3-fs warning: maximal mount count reached, " + "running e2fsck is recommended\n"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME)) + printk (KERN_WARNING + "EXT3-fs warning: checktime reached, " + "running e2fsck is recommended\n"); + #if 0 + /* @@@ We _will_ want to clear the valid bit if we find + inconsistencies, to force a fsck at reboot. But for + a plain journaled filesystem we can keep it set as + valid forever! 
:) */ + es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS); + #endif + if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = + (__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); + es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); + es->s_mtime = cpu_to_le32(CURRENT_TIME); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super (sb, es, 1); + if (test_opt (sb, DEBUG)) + printk (KERN_INFO + "[EXT3 FS %s, %s, bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]\n", + EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize, + sbi->s_groups_count, + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", + bdevname(sb->s_dev)); + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { + printk("external journal on %s\n", + bdevname(EXT3_SB(sb)->s_journal->j_dev)); + } else { + printk("internal journal\n"); + } + #ifdef CONFIG_EXT3_CHECK + if (test_opt (sb, CHECK)) { + ext3_check_blocks_bitmap (sb); + ext3_check_inodes_bitmap (sb); + } + #endif + setup_ro_after(sb); + return res; + } + + static int ext3_check_descriptors (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); + struct ext3_group_desc * gdp = NULL; + int desc_block = 0; + int i; + + ext3_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) + { + if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0) + gdp = (struct ext3_group_desc *) + sbi->s_group_desc[desc_block++]->b_data; + if (le32_to_cpu(gdp->bg_block_bitmap) < block || + le32_to_cpu(gdp->bg_block_bitmap) >= + block + EXT3_BLOCKS_PER_GROUP(sb)) + { + ext3_error (sb, "ext3_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < block || + le32_to_cpu(gdp->bg_inode_bitmap) >= + block + EXT3_BLOCKS_PER_GROUP(sb)) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= + block + EXT3_BLOCKS_PER_GROUP(sb)) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + block += EXT3_BLOCKS_PER_GROUP(sb); + gdp++; + } + return 1; + } + + + /* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext3_free_inode(). 
The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ + static void ext3_orphan_cleanup (struct super_block * sb, + struct ext3_super_block * es) + { + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", + bdevname(sb->s_dev)); + sb->s_flags &= ~MS_RDONLY; + } + + if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + while (es->s_last_orphan) { + struct inode *inode; + + if (!(inode = + ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + if (inode->i_nlink) { + printk(KERN_DEBUG __FUNCTION__ + ": truncating inode %ld to %Ld bytes\n", + inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %ld to %Ld bytes\n", + inode->i_ino, inode->i_size); + ext3_truncate(inode); + nr_truncates++; + } else { + printk(KERN_DEBUG __FUNCTION__ + ": deleting unreferenced inode %ld\n", + inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %ld\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + + #define PLURAL(x) (x), ((x)==1) ? "" : "s" + + if (nr_orphans) + printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", + bdevname(sb->s_dev), PLURAL(nr_orphans)); + if (nr_truncates) + printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", + bdevname(sb->s_dev), PLURAL(nr_truncates)); + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + } + + #define log2(n) ffz(~(n)) + + /* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. + */ + static loff_t ext3_max_size(int bits) + { + loff_t res = EXT3_NDIR_BLOCKS; + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > (512LL << 32) - (1 << bits)) + res = (512LL << 32) - (1 << bits); + return res; + } + + struct super_block * ext3_read_super (struct super_block * sb, void * data, + int silent) + { + struct buffer_head * bh; + struct ext3_super_block *es = 0; + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long sb_block = 1; + unsigned long logic_sb_block = 1; + unsigned long offset = 0; + unsigned long journal_inum = 0; + kdev_t dev = sb->s_dev; + int blocksize; + int hblock; + int db_count; + int i; + int needs_recovery; + + #ifdef CONFIG_JBD_DEBUG + ext3_ro_after = 0; + #endif + /* + * See what the current blocksize for the device is, and + * use that as the blocksize. Otherwise (or if the blocksize + * is smaller than the default) use the default. + * This is important for devices that have a hardware + * sectorsize that is larger than the default. 
+ */ + blocksize = EXT3_MIN_BLOCK_SIZE; + hblock = get_hardsect_size(dev); + if (blocksize < hblock) + blocksize = hblock; + + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; + if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { + sb->s_dev = 0; + goto out_fail; + } + + set_blocksize (dev, blocksize); + + /* + * The ext3 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT3_MIN_BLOCK_SIZE) { + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + } + + if (!(bh = bread (dev, logic_sb_block, blocksize))) { + printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext3 macro-instructions depend on its value + */ + es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT3_SUPER_MAGIC) { + if (!silent) + printk(KERN_ERR + "VFS: Can't find ext3 filesystem on dev %s.\n", + bdevname(dev)); + goto failed_mount; + } + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && + (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) + printk(KERN_WARNING + "EXT3-fs warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended\n"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. + */ + if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) { + printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " + "unsupported optional features (%x).\n", + bdevname(dev), i); + goto failed_mount; + } + if (!(sb->s_flags & MS_RDONLY) && + (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){ + printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " + "unsupported optional features (%x).\n", + bdevname(dev), i); + goto failed_mount; + } + sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10; + sb->s_blocksize = 1 << sb->s_blocksize_bits; + + if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE || + sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) { + printk(KERN_ERR + "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", + blocksize, bdevname(dev)); + goto failed_mount; + } + + sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); + + if (sb->s_blocksize != blocksize) { + blocksize = sb->s_blocksize; + + /* + * Make sure the blocksize for the filesystem is larger + * than the hardware sectorsize for the machine. 
+ */ + if (sb->s_blocksize < hblock) { + printk(KERN_ERR "EXT3-fs: blocksize %d too small for " + "device blocksize %d.\n", blocksize, hblock); + goto failed_mount; + } + + brelse (bh); + set_blocksize (dev, sb->s_blocksize); + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + bh = bread (dev, logic_sb_block, blocksize); + if (!bh) { + printk(KERN_ERR + "EXT3-fs: Can't read superblock on 2nd try.\n"); + return NULL; + } + es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != le16_to_cpu(EXT3_SUPER_MAGIC)) { + printk (KERN_ERR + "EXT3-fs: Magic mismatch, very weird !\n"); + goto failed_mount; + } + } + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { + sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) { + printk (KERN_ERR + "EXT3-fs: unsupported inode size: %d\n", + sbi->s_inode_size); + goto failed_mount; + } + } + sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (blocksize != sbi->s_frag_size) { + printk(KERN_ERR + "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", + sbi->s_frag_size, blocksize); + goto failed_mount; + } + sbi->s_frags_per_block = 1; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); + sbi->s_itb_per_group = sbi->s_inodes_per_group /sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); + sbi->s_sbh = bh; + if (sbi->s_resuid == EXT3_DEF_RESUID) + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + if (sbi->s_resgid == EXT3_DEF_RESGID) + sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3-fs: #blocks per group too big: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3-fs: #fragments per group too big: %lu\n", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3-fs: #inodes per group too big: %lu\n", + sbi->s_inodes_per_group); + goto failed_mount; + } + + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / + EXT3_BLOCKS_PER_GROUP(sb); + db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / + EXT3_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + printk (KERN_ERR "EXT3-fs: not enough memory\n"); + goto failed_mount; + } + for (i = 0; i < db_count; i++) { + sbi->s_group_desc[i] = bread(dev, logic_sb_block + i + 1, + blocksize); + if (!sbi->s_group_desc[i]) { + printk (KERN_ERR "EXT3-fs: " + "can't read group descriptor %d\n", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext3_check_descriptors (sb)) { + printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n"); + goto 
failed_mount2; + } + for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) { + sbi->s_inode_bitmap_number[i] = 0; + sbi->s_inode_bitmap[i] = NULL; + sbi->s_block_bitmap_number[i] = 0; + sbi->s_block_bitmap[i] = NULL; + } + sbi->s_loaded_inode_bitmaps = 0; + sbi->s_loaded_block_bitmaps = 0; + sbi->s_gdb_count = db_count; + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext3_sops; + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + + sb->s_root = 0; + + needs_recovery = (es->s_last_orphan != 0 || + EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_RECOVER)); + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext3_load_journal(sb, es)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) + goto failed_mount2; + } else { + if (!silent) + printk (KERN_ERR + "ext3: No journal on filesystem on %s\n", + bdevname(dev)); + goto failed_mount2; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + capabilities: ORDERED_DATA if the journal can + cope, else JOURNAL_DATA */ + if (journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) + set_opt(sbi->s_mount_opt, ORDERED_DATA); + else + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + break; + + case EXT3_MOUNT_ORDERED_DATA: + case EXT3_MOUNT_WRITEBACK_DATA: + if (!journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { + printk(KERN_ERR "EXT3-fs: Journal does not support " + "requested data journaling mode\n"); + goto failed_mount3; + } + default: + break; + } + + /* + * The journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. + */ + + sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO)); + if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) || + !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) { + if (sb->s_root) { + dput(sb->s_root); + sb->s_root = NULL; + printk(KERN_ERR + "EXT3-fs: corrupt root inode, run e2fsck\n"); + } else + printk(KERN_ERR "EXT3-fs: get root inode failed\n"); + goto failed_mount3; + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock + * in numerous places. Here we just pop the lock - it's relatively + * harmless, because we are now ready to accept write_super() requests, + * and aviro says that's the only reason for hanging onto the + * superblock lock. + */ + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; + unlock_super(sb); /* akpm: sigh */ + ext3_orphan_cleanup(sb, es); + lock_super(sb); + EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; + if (needs_recovery) + printk (KERN_INFO "EXT3-fs: recovery complete.\n"); + ext3_mark_recovery_complete(sb, es); + printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": + "writeback"); + + return sb; + + failed_mount3: + journal_destroy(sbi->s_journal); + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + failed_mount: + ext3_blkdev_remove(sbi); + brelse(bh); + out_fail: + return NULL; + } + + static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) + { + struct inode *journal_inode; + journal_t *journal; + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. */ + + journal_inode = iget(sb, journal_inum); + if (!journal_inode) { + printk(KERN_ERR "EXT3-fs: no journal found.\n"); + return NULL; + } + if (!journal_inode->i_nlink) { + make_bad_inode(journal_inode); + iput(journal_inode); + printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + journal_inode, journal_inode->i_size); + if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) { + printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); + iput(journal_inode); + return NULL; + } + + journal = journal_init_inode(journal_inode); + if (!journal) + iput(journal_inode); + return journal; + } + + static journal_t *ext3_get_dev_journal(struct super_block *sb, + int dev) + { + struct buffer_head * bh; + journal_t *journal; + int start; + int len; + int hblock, blocksize; + unsigned long sb_block; + unsigned long offset; + kdev_t journal_dev = to_kdev_t(dev); + struct ext3_super_block * es; + struct block_device *bdev; + + bdev = ext3_blkdev_get(journal_dev); + if (bdev == NULL) + return NULL; + + blocksize = sb->s_blocksize; + hblock = get_hardsect_size(journal_dev); + if (blocksize < hblock) { + printk(KERN_ERR + "EXT3-fs: blocksize too small for journal device.\n"); + goto out_bdev; + } + + sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; + offset = EXT3_MIN_BLOCK_SIZE % blocksize; + set_blocksize(dev, blocksize); + if (!(bh = bread(dev, sb_block, blocksize))) { + printk(KERN_ERR "EXT3-fs: couldn't read superblock of " + "external journal\n"); + goto out_bdev; + } + + es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { + printk(KERN_ERR "EXT3-fs: external journal has " + "bad superblock\n"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { + printk(KERN_ERR "EXT3-fs: journal UUID does not match\n"); + brelse(bh); + goto out_bdev; + } + + len = le32_to_cpu(es->s_blocks_count); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = journal_init_dev(journal_dev, sb->s_dev, + start, len, blocksize); + if (!journal) { + printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); + goto out_bdev; + } + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); + goto out_journal; + } + if (ntohl(journal->j_superblock->s_nr_users) != 1) { + printk(KERN_ERR "EXT3-fs: External journal has more than one " + "user (unsupported) - %d\n", + ntohl(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT3_SB(sb)->journal_bdev = bdev; + return journal; + out_journal: + journal_destroy(journal); + out_bdev: + ext3_blkdev_put(bdev); + return NULL; + } + + 
static int ext3_load_journal(struct super_block * sb, + struct ext3_super_block * es) + { + journal_t *journal; + int journal_inum = le32_to_cpu(es->s_journal_inum); + int journal_dev = le32_to_cpu(es->s_journal_dev); + int err; + int really_read_only; + + really_read_only = is_read_only(sb->s_dev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. + */ + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT3-fs: INFO: recovery " + "required on readonly filesystem.\n"); + if (really_read_only) { + printk(KERN_ERR "EXT3-fs: write access " + "unavailable, cannot proceed.\n"); + return -EROFS; + } + printk (KERN_INFO "EXT3-fs: write access will " + "be enabled during recovery.\n"); + } + } + + if (journal_inum && journal_dev) { + printk(KERN_ERR "EXT3-fs: filesystem has both journal " + "and inode journals!\n"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext3_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext3_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = journal_update_format(journal); + if (err) { + printk(KERN_ERR "EXT3-fs: error updating journal.\n"); + journal_destroy(journal); + return err; + } + } + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) + journal_wipe(journal, !really_read_only); + + err = journal_load(journal); + if (err) { + printk(KERN_ERR "EXT3-fs: error loading journal.\n"); + journal_destroy(journal); + return err; + } + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); + return 0; + } + + static int ext3_create_journal(struct super_block * sb, + struct ext3_super_block * es, + int journal_inum) + { + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) { + printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " + "create journal.\n"); + return -EROFS; + } + + if (!(journal = ext3_get_journal(sb, journal_inum))) + return -EINVAL; + + printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n", + journal_inum); + + if (journal_create(journal)) { + printk(KERN_ERR "EXT3-fs: error creating journal.\n"); + journal_destroy(journal); + return -EIO; + } + + EXT3_SB(sb)->s_journal = journal; + + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); + + es->s_journal_inum = cpu_to_le32(journal_inum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + + return 0; + } + + static void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync) + { + es->s_wtime = cpu_to_le32(CURRENT_TIME); + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty"); + mark_buffer_dirty(sb->u.ext3_sb.s_sbh); + if (sync) { + ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh); + wait_on_buffer(sb->u.ext3_sb.s_sbh); + } + } + + + /* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. 
+ */ + static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es) + { + journal_flush(EXT3_SB(sb)->s_journal); + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + sb->s_dirt = 0; + ext3_commit_super(sb, es, 1); + } + } + + /* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now. + */ + static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es) + { + journal_t *journal; + int j_errno; + const char *errstr; + + journal = EXT3_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext3_error() or ext3_abort() + */ + + j_errno = journal_errno(journal); + if (j_errno) { + char nbuf[16]; + + errstr = ext3_decode_error(sb, j_errno, nbuf); + ext3_warning(sb, __FUNCTION__, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext3_warning(sb, __FUNCTION__, "Marking fs in need of " + "filesystem check."); + + sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + ext3_commit_super (sb, es, 1); + + journal_clear_err(journal); + } + } + + /* + * Force the running and committing transactions to commit, + * and wait on the commit. + */ + int ext3_force_commit(struct super_block *sb) + { + journal_t *journal; + int ret; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT3_SB(sb)->s_journal; + sb->s_dirt = 0; + lock_kernel(); /* important: lock down j_running_transaction */ + ret = ext3_journal_force_commit(journal); + unlock_kernel(); + return ret; + } + + /* + * Ext3 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this + * point. Just start an async writeback to get the buffers on their way + * to the disk. + * + * This implicitly triggers the writebehind on sync(). + */ + + static int do_sync_supers = 0; + MODULE_PARM(do_sync_supers, "i"); + MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously"); + + void ext3_write_super (struct super_block * sb) + { + tid_t target; + + if (down_trylock(&sb->s_lock) == 0) + BUG(); /* aviro detector */ + sb->s_dirt = 0; + target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + + if (do_sync_supers) { + unlock_super(sb); + log_wait_commit(EXT3_SB(sb)->s_journal, target); + lock_super(sb); + } + } + + /* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. + */ + void ext3_write_super_lockfs(struct super_block *sb) + { + sb->s_dirt = 0; + + lock_kernel(); /* 2.4.5 forgot to do this for us */ + if (!(sb->s_flags & MS_RDONLY)) { + journal_t *journal = EXT3_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + journal_lock_updates(journal); + journal_flush(journal); + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + } + unlock_kernel(); + } + + /* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. 
+ */ + void ext3_unlockfs(struct super_block *sb) + { + if (!(sb->s_flags & MS_RDONLY)) { + lock_kernel(); + lock_super(sb); + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + unlock_super(sb); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + unlock_kernel(); + } + } + + int ext3_remount (struct super_block * sb, int * flags, char * data) + { + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long tmp; + + clear_ro_after(sb); + + /* + * Allow the "check" option to be passed as a remount option. + */ + if (!parse_options(data, &tmp, sbi, &tmp, 1)) + return -EINVAL; + + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + + es = sbi->s_es; + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + return -EROFS; + + if (*flags & MS_RDONLY) { + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && + (sbi->s_mount_state & EXT3_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + ext3_mark_recovery_complete(sb, es); + } else { + int ret; + if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, + ~EXT3_FEATURE_RO_COMPAT_SUPP))) { + printk(KERN_WARNING "EXT3-fs: %s: couldn't " + "remount RDWR because of unsupported " + "optional features (%x).\n", + bdevname(sb->s_dev), ret); + return -EROFS; + } + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + ext3_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if (!ext3_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; + } + } + setup_ro_after(sb); + return 0; + } + + int ext3_statfs (struct super_block * sb, struct statfs * buf) + { + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long overhead; + int i; + + if (test_opt (sb, MINIX_DF)) + overhead = 0; + else { + /* + * Compute the overhead (FS structures) + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors. If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. 
+ */ + overhead += (EXT3_SB(sb)->s_groups_count * + (2 + EXT3_SB(sb)->s_itb_per_group)); + } + + buf->f_type = EXT3_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; + buf->f_bfree = ext3_count_free_blocks (sb); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = ext3_count_free_inodes (sb); + buf->f_namelen = EXT3_NAME_LEN; + return 0; + } + + static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super); + + static int __init init_ext3_fs(void) + { + return register_filesystem(&ext3_fs_type); + } + + static void __exit exit_ext3_fs(void) + { + unregister_filesystem(&ext3_fs_type); + } + + EXPORT_NO_SYMBOLS; + + MODULE_LICENSE("GPL"); + module_init(init_ext3_fs) + module_exit(exit_ext3_fs) diff -rc2P linux/fs/ext3/symlink.c linux-2.4.13/fs/ext3/symlink.c *** linux/fs/ext3/symlink.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/symlink.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,39 ---- + /* + * linux/fs/ext3/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code. AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 symlink handling code + */ + + #include + #include + #include + + static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) + { + char *s = (char *)dentry->d_inode->u.ext3_i.i_data; + return vfs_readlink(dentry, buffer, buflen, s); + } + + static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd) + { + char *s = (char *)dentry->d_inode->u.ext3_i.i_data; + return vfs_follow_link(nd, s); + } + + struct inode_operations ext3_fast_symlink_inode_operations = { + readlink: ext3_readlink, /* BKL not held. Don't need */ + follow_link: ext3_follow_link, /* BKL not held. Don't need */ + }; diff -rc2P linux/fs/inode.c linux-2.4.13/fs/inode.c *** linux/fs/inode.c Fri Sep 28 21:03:48 2001 --- linux-2.4.13/fs/inode.c Fri Nov 9 16:57:59 2001 *************** *** 110,113 **** --- 110,114 ---- sema_init(&inode->i_sem, 1); sema_init(&inode->i_zombie, 1); + init_rwsem(&inode->i_truncate_sem); spin_lock_init(&inode->i_data.i_shared_lock); } diff -rc2P linux/fs/jbd/Makefile linux-2.4.13/fs/jbd/Makefile *** linux/fs/jbd/Makefile Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/Makefile Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,15 ---- + # + # fs/jbd/Makefile + # + # Makefile for the linux journaling routines. + # + + export-objs := journal.o + O_TARGET := jbd.o + + obj-y := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o + + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make + diff -rc2P linux/fs/jbd/checkpoint.c linux-2.4.13/fs/jbd/checkpoint.c *** linux/fs/jbd/checkpoint.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/checkpoint.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,608 ---- + /* + * linux/fs/checkpoint.c + * + * Written by Stephen C. 
Tweedie , 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. + */ + + #include + #include + #include + #include + #include + #include + + extern spinlock_t journal_datalist_lock; + + /* + * Unlink a buffer from a transaction. + * + * Called with journal_datalist_lock held. + */ + + static inline void __buffer_unlink(struct journal_head *jh) + { + transaction_t *transaction; + + transaction = jh->b_cp_transaction; + jh->b_cp_transaction = NULL; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; + } + + /* + * Try to release a checkpointed buffer from its transaction. + * Returns 1 if we released it. + * Requires journal_datalist_lock + */ + static int __try_to_free_cp_buf(struct journal_head *jh) + { + int ret = 0; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __journal_remove_checkpoint(jh); + __journal_remove_journal_head(bh); + BUFFER_TRACE(bh, "release"); + /* BUF_LOCKED -> BUF_CLEAN (fwiw) */ + refile_buffer(bh); + __brelse(bh); + ret = 1; + } + return ret; + } + + /* + * log_wait_for_space: wait until there is space in the journal. + * + * Called with the journal already locked, but it will be unlocked if we have + * to wait for a checkpoint to free up some space in the log. + */ + + void log_wait_for_space(journal_t *journal, int nblocks) + { + while (log_space_left(journal) < nblocks) { + if (journal->j_flags & JFS_ABORT) + return; + unlock_journal(journal); + down(&journal->j_checkpoint_sem); + lock_journal(journal); + + /* Test again, another process may have checkpointed + * while we were waiting for the checkpoint lock */ + if (log_space_left(journal) < nblocks) { + log_do_checkpoint(journal, nblocks); + } + up(&journal->j_checkpoint_sem); + } + } + + /* + * Clean up a transaction's checkpoint list. + * + * We wait for any pending IO to complete and make sure any clean + * buffers are removed from the transaction. + * + * Return 1 if we performed any actions which might have destroyed the + * checkpoint. (journal_remove_checkpoint() deletes the transaction when + * the last checkpoint buffer is cleansed) + * + * Called with the journal locked. + * Called with journal_datalist_lock held. 
+ */ + static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) + { + struct journal_head *jh, *next_jh, *last_jh; + struct buffer_head *bh; + int ret = 0; + + assert_spin_locked(&journal_datalist_lock); + jh = transaction->t_checkpoint_list; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + next_jh = jh; + do { + jh = next_jh; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + goto out_return_1; + } + + if (jh->b_transaction != NULL) { + transaction_t *transaction = jh->b_transaction; + tid_t tid = transaction->t_tid; + + spin_unlock(&journal_datalist_lock); + log_start_commit(journal, transaction); + unlock_journal(journal); + log_wait_commit(journal, tid); + goto out_return_1; + } + + /* + * We used to test for (jh->b_list != BUF_CLEAN) here. + * But unmap_underlying_metadata() can place buffer onto + * BUF_CLEAN. Since refile_buffer() no longer takes buffers + * off checkpoint lists, we cope with it here + */ + /* + * AKPM: I think the buffer_jdirty test is redundant - it + * shouldn't have NULL b_transaction? + */ + next_jh = jh->b_cpnext; + if (!buffer_dirty(bh) && !buffer_jdirty(bh)) { + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + __journal_remove_journal_head(bh); + refile_buffer(bh); + __brelse(bh); + ret = 1; + } + + jh = next_jh; + } while (jh != last_jh); + + return ret; + out_return_1: + lock_journal(journal); + spin_lock(&journal_datalist_lock); + return 1; + } + + #define NR_BATCH 64 + + static void __flush_batch(struct buffer_head **bhs, int *batch_count) + { + int i; + + spin_unlock(&journal_datalist_lock); + ll_rw_block(WRITE, *batch_count, bhs); + run_task_queue(&tq_disk); + spin_lock(&journal_datalist_lock); + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = bhs[i]; + clear_bit(BH_JWrite, &bh->b_state); + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + *batch_count = 0; + } + + /* + * Try to flush one buffer from the checkpoint list to disk. + * + * Return 1 if something happened which requires us to abort the current + * scan of the checkpoint list. + * + * Called with journal_datalist_lock held. + */ + static int __flush_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count, + int *drop_count) + { + struct buffer_head *bh = jh2bh(jh); + int ret = 0; + + if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { + J_ASSERT_JH(jh, jh->b_transaction == NULL); + + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal lock. + * We cannot afford to let the transaction logic start + * messing around with this buffer before we write it to + * disk, as that would break recoverability. + */ + BUFFER_TRACE(bh, "queue"); + atomic_inc(&bh->b_count); + J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state)); + set_bit(BH_JWrite, &bh->b_state); + bhs[*batch_count] = bh; + (*batch_count)++; + if (*batch_count == NR_BATCH) { + __flush_batch(bhs, batch_count); + ret = 1; + } + } else { + int last_buffer = 0; + if (jh->b_cpnext == jh) { + /* We may be about to drop the transaction. Tell the + * caller that the lists have changed. + */ + last_buffer = 1; + } + if (__try_to_free_cp_buf(jh)) { + (*drop_count)++; + ret = last_buffer; + } + } + return ret; + } + + + /* + * Perform an actual checkpoint. 
We don't write out only enough to + * satisfy the current blocked requests: rather we submit a reasonably + * sized chunk of the outstanding data to disk at once for + * efficiency. log_wait_for_space() will retry if we didn't free enough. + * + * However, we _do_ take into account the amount requested so that once + * the IO has been queued, we can return as soon as enough of it has + * completed to disk. + * + * The journal should be locked before calling this function. + */ + + /* @@@ `nblocks' is unused. Should it be used? */ + int log_do_checkpoint (journal_t *journal, int nblocks) + { + transaction_t *transaction, *last_transaction, *next_transaction; + int result; + int target; + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + + jbd_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = cleanup_journal_tail(journal); + jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; + + /* + * OK, we need to start writing disk blocks. Try to free up a + * quarter of the log in a single checkpoint if we can. + */ + /* + * AKPM: check this code. I had a feeling a while back that it + * degenerates into a busy loop at unmount time. + */ + target = (journal->j_last - journal->j_first) / 4; + + spin_lock(&journal_datalist_lock); + repeat: + transaction = journal->j_checkpoint_transactions; + if (transaction == NULL) + goto done; + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + + do { + struct journal_head *jh, *last_jh, *next_jh; + int drop_count = 0; + int cleanup_ret, retry = 0; + + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + jh = transaction->t_checkpoint_list; + last_jh = jh->b_cpprev; + next_jh = jh; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + retry = __flush_buffer(journal, jh, bhs, &batch_count, + &drop_count); + } while (jh != last_jh && !retry); + if (batch_count) { + __flush_batch(bhs, &batch_count); + goto repeat; + } + if (retry) + goto repeat; + /* + * We have walked the whole transaction list without + * finding anything to write to disk. We had better be + * able to make some progress or we are in trouble. + */ + cleanup_ret = __cleanup_transaction(journal, transaction); + J_ASSERT(drop_count != 0 || cleanup_ret != 0); + goto repeat; /* __cleanup may have dropped lock */ + } while (transaction != last_transaction); + + done: + spin_unlock(&journal_datalist_lock); + result = cleanup_journal_tail(journal); + if (result < 0) + return result; + + return 0; + } + + /* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * Called with the journal lock held. + * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts. Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the journal superblock if + * we have an abort error outstanding. 
+ */ + + int cleanup_journal_tail(journal_t *journal) + { + transaction_t * transaction; + tid_t first_tid; + unsigned long blocknr, freed; + + /* OK, work out the oldest transaction remaining in the log, and + * the log block it starts at. + * + * If the log is now empty, we need to work out which is the + * next transaction ID we will write, and where it will + * start. */ + + /* j_checkpoint_transactions needs locking */ + spin_lock(&journal_datalist_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = journal->j_head; + } else { + first_tid = journal->j_transaction_sequence; + blocknr = journal->j_head; + } + spin_unlock(&journal_datalist_lock); + J_ASSERT (blocknr != 0); + + /* If the oldest pinned transaction is at the tail of the log + already then there's not much we can do right now. */ + if (journal->j_tail_sequence == first_tid) + return 1; + + /* OK, update the superblock to recover the freed space. + * Physical blocks come first: have we wrapped beyond the end of + * the log? */ + freed = blocknr - journal->j_tail; + if (blocknr < journal->j_tail) + freed = freed + journal->j_last - journal->j_first; + + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, first_tid, blocknr, freed); + + journal->j_free += freed; + journal->j_tail_sequence = first_tid; + journal->j_tail = blocknr; + if (!(journal->j_flags & JFS_ABORT)) + journal_update_superblock(journal, 1); + return 0; + } + + + /* Checkpoint list management */ + + /* + * journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * + * Called with the journal locked. + * Called with journal_datalist_lock held. + * Returns number of buffers reaped (for debug) + */ + + int __journal_clean_checkpoint_list(journal_t *journal) + { + transaction_t *transaction, *last_transaction, *next_transaction; + int ret = 0; + + transaction = journal->j_checkpoint_transactions; + if (transaction == 0) + goto out; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + struct journal_head *jh; + + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + jh = transaction->t_checkpoint_list; + if (jh) { + struct journal_head *last_jh = jh->b_cpprev; + struct journal_head *next_jh = jh; + do { + struct buffer_head *bh; + + jh = next_jh; + next_jh = jh->b_cpnext; + bh = jh2bh(jh); + ret += __try_to_free_cp_buf(jh); + } while (jh != last_jh); + } + } while (transaction != last_transaction); + out: + return ret; + } + + /* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). + * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk. To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * list until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint list. 
+ * + * This function is called with the journal locked. + * This function is called with journal_datalist_lock held. + */ + + void __journal_remove_checkpoint(struct journal_head *jh) + { + transaction_t *transaction; + journal_t *journal; + + JBUFFER_TRACE(jh, "entry"); + + if ((transaction = jh->b_cp_transaction) == NULL) { + JBUFFER_TRACE(jh, "not on transaction"); + goto out; + } + + journal = transaction->t_journal; + + __buffer_unlink(jh); + + if (transaction->t_checkpoint_list != NULL) + goto out; + JBUFFER_TRACE(jh, "transaction has no more buffers"); + + /* There is one special case to worry about: if we have just + pulled the buffer off a committing transaction's forget list, + then even if the checkpoint list is empty, the transaction + obviously cannot be dropped! */ + + if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "belongs to committing transaction"); + goto out; + } + + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ + + __journal_drop_transaction(journal, transaction); + + /* Just in case anybody was waiting for more transactions to be + checkpointed... */ + wake_up(&journal->j_wait_logspace); + out: + JBUFFER_TRACE(jh, "exit"); + } + + void journal_remove_checkpoint(struct journal_head *jh) + { + spin_lock(&journal_datalist_lock); + __journal_remove_checkpoint(jh); + spin_unlock(&journal_datalist_lock); + } + + /* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with journal_datalist_lock held. + */ + void __journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) + { + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); + + assert_spin_locked(&journal_datalist_lock); + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; + } else { + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; + } + + void journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) + { + spin_lock(&journal_datalist_lock); + __journal_insert_checkpoint(jh, transaction); + spin_unlock(&journal_datalist_lock); + } + + /* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with journal_datalist_lock held. 
+ */ + + void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) + { + assert_spin_locked(&journal_datalist_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT (transaction->t_ilist == NULL); + J_ASSERT (transaction->t_buffers == NULL); + J_ASSERT (transaction->t_sync_datalist == NULL); + J_ASSERT (transaction->t_async_datalist == NULL); + J_ASSERT (transaction->t_forget == NULL); + J_ASSERT (transaction->t_iobuf_list == NULL); + J_ASSERT (transaction->t_shadow_list == NULL); + J_ASSERT (transaction->t_log_list == NULL); + J_ASSERT (transaction->t_checkpoint_list == NULL); + J_ASSERT (transaction->t_updates == 0); + + J_ASSERT (transaction->t_journal->j_committing_transaction != + transaction); + + jbd_debug (1, "Dropping transaction %d, all done\n", + transaction->t_tid); + kfree (transaction); + } + diff -rc2P linux/fs/jbd/commit.c linux-2.4.13/fs/jbd/commit.c *** linux/fs/jbd/commit.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/commit.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,701 ---- + /* + * linux/fs/commit.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + + #include + #include + #include + #include + #include + #include + #include + + extern spinlock_t journal_datalist_lock; + + /* + * Default IO end handler for temporary BJ_IO buffer_heads. + */ + static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) + { + BUFFER_TRACE(bh, ""); + mark_buffer_uptodate(bh, uptodate); + unlock_buffer(bh); + } + + /* + * journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ + void journal_commit_transaction(journal_t *journal) + { + transaction_t *commit_transaction; + struct journal_head *jh, *new_jh, *descriptor; + struct journal_head *next_jh, *last_jh; + struct buffer_head *wbuf[64]; + int bufs; + int flags; + int blocknr; + char *tagp = NULL; + journal_header_t *header; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i; + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. 
+ */ + + lock_journal(journal); /* Protect journal->j_running_transaction */ + + #ifdef COMMIT_STATS + spin_lock(&journal_datalist_lock); + summarise_journal_usage(journal); + spin_unlock(&journal_datalist_lock); + #endif + + lock_kernel(); + + J_ASSERT (journal->j_running_transaction != NULL); + J_ASSERT (journal->j_committing_transaction == NULL); + + commit_transaction = journal->j_running_transaction; + J_ASSERT (commit_transaction->t_state == T_RUNNING); + + jbd_debug (1, "JBD: starting commit of transaction %d\n", + commit_transaction->t_tid); + + commit_transaction->t_state = T_LOCKED; + while (commit_transaction->t_updates != 0) { + unlock_journal(journal); + sleep_on(&journal->j_wait_updates); + lock_journal(journal); + } + + J_ASSERT (commit_transaction->t_outstanding_credits <= + journal->j_max_transaction_buffers); + + /* Do we need to erase the effects of a prior journal_flush? */ + if (journal->j_flags & JFS_FLUSHED) { + jbd_debug(3, "super block updated\n"); + journal_update_superblock(journal, 1); + } else { + jbd_debug(3, "superblock not updated\n"); + } + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple journal_get_write_access() calls to the same + * buffer are perfectly permissable. + */ + + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + journal_refile_buffer(jh); + } + + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal_datalist_lock); + __journal_clean_checkpoint_list(journal); + spin_unlock(&journal_datalist_lock); + + /* First part of the commit: force the revoke list out to disk. + * The revoke code generates its own metadata blocks on disk for this. + * + * It is important that we do this while the transaction is + * still locked. Generating the revoke records should not + * generate any IO stalls, so this should be quick; and doing + * the work while we have the transaction locked means that we + * only ever have to maintain the revoke list for one + * transaction at a time. + */ + + jbd_debug (3, "JBD: commit phase 1\n"); + + journal_write_revoke_records(journal, commit_transaction); + + /* + * Now that we have built the revoke records, we can start + * reusing the revoke list for a new running transaction. 
We + * can now safely start committing the old transaction: time to + * get a new running transaction for incoming filesystem updates + */ + + commit_transaction->t_state = T_FLUSH; + + wake_up(&journal->j_wait_transaction_locked); + + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + + commit_transaction->t_log_start = journal->j_head; + + unlock_kernel(); + + jbd_debug (3, "JBD: commit phase 2\n"); + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ + + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_datalist, so we have to keep looping back to write_out_data + * until we *know* that the list is empty. + */ + write_out_data: + + /* + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + * + * We take journal_datalist_lock to protect the lists from + * journal_try_to_free_buffers(). + */ + spin_lock(&journal_datalist_lock); + + write_out_data_locked: + bufs = 0; + next_jh = commit_transaction->t_sync_datalist; + if (next_jh == NULL) + goto sync_datalist_empty; + last_jh = next_jh->b_tprev; + + do { + struct buffer_head *bh; + + jh = next_jh; + next_jh = jh->b_tnext; + bh = jh2bh(jh); + if (!buffer_locked(bh)) { + if (buffer_dirty(bh)) { + BUFFER_TRACE(bh, "start journal writeout"); + atomic_inc(&bh->b_count); + wbuf[bufs++] = bh; + } else { + BUFFER_TRACE(bh, "writeout complete: unfile"); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + __journal_remove_journal_head(bh); + refile_buffer(bh); + __brelse(bh); + } + } + if (bufs == ARRAY_SIZE(wbuf)) { + /* + * Major speedup: start here on the next scan + */ + J_ASSERT(commit_transaction->t_sync_datalist != 0); + commit_transaction->t_sync_datalist = jh; + break; + } + } while (jh != last_jh); + + if (bufs || current->need_resched) { + jbd_debug(2, "submit %d writes\n", bufs); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + if (bufs) + ll_rw_block(WRITE, bufs, wbuf); + if (current->need_resched) + schedule(); + journal_brelse_array(wbuf, bufs); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + if (bufs) + goto write_out_data_locked; + } + + /* + * Wait for all previously submitted IO on the data list to complete. + */ + jh = commit_transaction->t_sync_datalist; + if (jh == NULL) + goto sync_datalist_empty; + + do { + struct buffer_head *bh; + jh = jh->b_tprev; /* Wait on the last written */ + bh = jh2bh(jh); + if (buffer_locked(bh)) { + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + wait_on_buffer(bh); + /* the journal_head may have been removed now */ + lock_journal(journal); + goto write_out_data; + } else if (buffer_dirty(bh)) { + goto write_out_data_locked; + } + } while (jh != commit_transaction->t_sync_datalist); + goto write_out_data_locked; + + sync_datalist_empty: + /* + * Wait for all the async writepage data. As they become unlocked + * in end_buffer_io_async(), the only place where they can be + * reaped is in try_to_free_buffers(), and we're locked against + * that. 
+ */ + while ((jh = commit_transaction->t_async_datalist)) { + struct buffer_head *bh = jh2bh(jh); + if (buffer_locked(bh)) { + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + wait_on_buffer(bh); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + continue; /* List may have changed */ + } + if (jh->b_next_transaction) { + /* + * For writepage() buffers in journalled data mode: a + * later transaction may want the buffer for "metadata" + */ + __journal_refile_buffer(jh); + } else { + BUFFER_TRACE(bh, "finished async writeout: unfile"); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + __journal_remove_journal_head(bh); + BUFFER_TRACE(bh, "finished async writeout: refile"); + /* It can sometimes be on BUF_LOCKED due to migration + * from syncdata to asyncdata */ + if (bh->b_list != BUF_CLEAN) + refile_buffer(bh); + __brelse(bh); + } + } + spin_unlock(&journal_datalist_lock); + + /* + * If we found any dirty or locked buffers, then we should have + * looped back up to the write_out_data label. If there weren't + * any then journal_clean_data_list should have wiped the list + * clean by now, so check that it is in fact empty. + */ + J_ASSERT (commit_transaction->t_sync_datalist == NULL); + J_ASSERT (commit_transaction->t_async_datalist == NULL); + + jbd_debug (3, "JBD: commit phase 3\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + commit_transaction->t_state = T_COMMIT; + + descriptor = 0; + bufs = 0; + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... */ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it for background writing. */ + + if (is_journal_aborted(journal)) { + JBUFFER_TRACE(jh, "journal is aborting: refile"); + journal_refile_buffer(jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. */ + + if (!descriptor) { + struct buffer_head *bh; + + J_ASSERT (bufs == 0); + + jbd_debug(4, "JBD: get descriptor\n"); + + descriptor = journal_get_descriptor_buffer(journal); + bh = jh2bh(descriptor); + jbd_debug(4, "JBD: got buffer %ld (%p)\n", + bh->b_blocknr, bh->b_data); + header = (journal_header_t *)&bh->b_data[0]; + header->h_magic = htonl(JFS_MAGIC_NUMBER); + header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK); + header->h_sequence = htonl(commit_transaction->t_tid); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + space_left = bh->b_size - sizeof(journal_header_t); + first_tag = 1; + set_bit(BH_JWrite, &bh->b_state); + wbuf[bufs++] = bh; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(bh, "ph3: file as descriptor"); + journal_file_buffer(descriptor, commit_transaction, + BJ_LogCtl); + } + + /* Where is the buffer to be written? */ + + blocknr = journal_next_log_block(journal); + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the BJ_IO/BJ_Shadow pairing of buffers. 
*/ + atomic_inc(&jh2bh(jh)->b_count); + + /* Make a temporary IO buffer with which to write it out + (this will requeue both the metadata buffer and the + temporary IO buffer). new_bh goes on BJ_IO*/ + + set_bit(BH_JWrite, &jh2bh(jh)->b_state); + /* + * akpm: journal_write_metadata_buffer() sets + * new_bh->b_transaction to commit_transaction. + * We need to clean this up before we release new_bh + * (which is of type BJ_IO) + */ + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = journal_write_metadata_buffer(commit_transaction, + jh, &new_jh, blocknr); + set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); + wbuf[bufs++] = jh2bh(new_jh); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (flags & 1) + tag_flag |= JFS_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JFS_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr); + tag->t_flags = htonl(tag_flag); + tagp += sizeof(journal_block_tag_t); + space_left -= sizeof(journal_block_tag_t); + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == ARRAY_SIZE(wbuf) || + commit_transaction->t_buffers == NULL || + space_left < sizeof(journal_block_tag_t) + 16) { + + jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= htonl(JFS_FLAG_LAST_TAG); + + start_journal_io: + unlock_journal(journal); + for (i = 0; i < bufs; i++) { + struct buffer_head *bh = wbuf[i]; + set_bit(BH_Lock, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(WRITE, bh); + } + if (current->need_resched) + schedule(); + lock_journal(journal); + + /* Force a new descriptor to be generated next + time round the loop. */ + descriptor = NULL; + bufs = 0; + } + } + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete. Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + + Wait for the transactions in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd_debug(3, "JBD: commit phase 4\n"); + + /* akpm: these are BJ_IO, and journal_datalist_lock is not needed */ + wait_for_iobuf: + while (commit_transaction->t_iobuf_list != NULL) { + struct buffer_head *bh; + jh = commit_transaction->t_iobuf_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + unlock_journal(journal); + wait_on_buffer(bh); + lock_journal(journal); + goto wait_for_iobuf; + } + + clear_bit(BH_JWrite, &jh2bh(jh)->b_state); + + JBUFFER_TRACE(jh, "ph4: unfile after journal write"); + journal_unfile_buffer(jh); + + /* + * akpm: don't put back a buffer_head with stale pointers + * dangling around. + */ + J_ASSERT_JH(jh, jh->b_transaction != NULL); + jh->b_transaction = NULL; + + /* + * ->t_iobuf_list should contain only dummy buffer_heads + * which were created by journal_write_metadata_buffer(). 
+ */ + bh = jh2bh(jh); + BUFFER_TRACE(bh, "dumping temporary bh"); + journal_unlock_journal_head(jh); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + put_unused_buffer_head(bh); + + /* We also have to unlock and free the corresponding + shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_bit(BH_JWrite, &bh->b_state); + J_ASSERT_BH(bh, buffer_jdirty(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. */ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + journal_file_buffer(jh, commit_transaction, BJ_Forget); + /* Wake up any transactions which were waiting for this + IO to complete */ + wake_up(&bh->b_wait); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd_debug(3, "JBD: commit phase 5\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + wait_for_ctlbuf: + while (commit_transaction->t_log_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_log_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + unlock_journal(journal); + wait_on_buffer(bh); + lock_journal(journal); + goto wait_for_ctlbuf; + } + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_bit(BH_JWrite, &bh->b_state); + journal_unfile_buffer(jh); + jh->b_transaction = NULL; + journal_unlock_journal_head(jh); + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + jbd_debug(3, "JBD: commit phase 6\n"); + + /* Done it all: now write the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. */ + + if (is_journal_aborted(journal)) + goto skip_commit; + + descriptor = journal_get_descriptor_buffer(journal); + + /* AKPM: buglet - add `i' to tmp! */ + for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { + journal_header_t *tmp = + (journal_header_t*)jh2bh(descriptor)->b_data; + tmp->h_magic = htonl(JFS_MAGIC_NUMBER); + tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK); + tmp->h_sequence = htonl(commit_transaction->t_tid); + } + + unlock_journal(journal); + JBUFFER_TRACE(descriptor, "write commit block"); + { + struct buffer_head *bh = jh2bh(descriptor); + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + __brelse(bh); /* One for getblk() */ + journal_unlock_journal_head(descriptor); + } + lock_journal(journal); + + /* End of a transaction! Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. 
*/ + + skip_commit: + + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(commit_transaction->t_async_datalist == NULL); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_iobuf_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + J_ASSERT(commit_transaction->t_log_list == NULL); + + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + + jh = commit_transaction->t_forget; + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || + jh->b_transaction == journal->j_running_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. + */ + if (jh->b_committed_data) { + kfree(jh->b_committed_data); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + } + } else if (jh->b_frozen_data) { + kfree(jh->b_frozen_data); + jh->b_frozen_data = NULL; + } + + spin_lock(&journal_datalist_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + J_ASSERT_JH(jh, commit_transaction != cp_transaction); + __journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + bh = jh2bh(jh); + if (buffer_jdirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __journal_insert_checkpoint(jh, commit_transaction); + JBUFFER_TRACE(jh, "refile for checkpoint writeback"); + __journal_refile_buffer(jh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + __journal_unfile_buffer(jh); + jh->b_transaction = 0; + __journal_remove_journal_head(bh); + __brelse(bh); + } + spin_unlock(&journal_datalist_lock); + } + + /* Done with this transaction! 
*/ + + jbd_debug(3, "JBD: commit phase 8\n"); + + J_ASSERT (commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_FINISHED; + + J_ASSERT (commit_transaction == journal->j_committing_transaction); + journal->j_commit_sequence = commit_transaction->t_tid; + journal->j_committing_transaction = NULL; + + spin_lock(&journal_datalist_lock); + if (commit_transaction->t_checkpoint_list == NULL) { + __journal_drop_transaction(journal, commit_transaction); + } else { + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + } + spin_unlock(&journal_datalist_lock); + + jbd_debug(1, "JBD: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + + unlock_journal(journal); + wake_up(&journal->j_wait_done_commit); + } diff -rc2P linux/fs/jbd/journal.c linux-2.4.13/fs/jbd/journal.c *** linux/fs/jbd/journal.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/journal.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,1716 ---- + /* + * linux/fs/journal.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. + * + * This file manages journals: areas of disk reserved for logging + * transactional updates. This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. + * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). 
+ */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + EXPORT_SYMBOL(journal_start); + EXPORT_SYMBOL(journal_try_start); + EXPORT_SYMBOL(journal_restart); + EXPORT_SYMBOL(journal_extend); + EXPORT_SYMBOL(journal_stop); + EXPORT_SYMBOL(journal_lock_updates); + EXPORT_SYMBOL(journal_unlock_updates); + EXPORT_SYMBOL(journal_get_write_access); + EXPORT_SYMBOL(journal_get_create_access); + EXPORT_SYMBOL(journal_get_undo_access); + EXPORT_SYMBOL(journal_dirty_data); + EXPORT_SYMBOL(journal_dirty_metadata); + #if 0 + EXPORT_SYMBOL(journal_release_buffer); + #endif + EXPORT_SYMBOL(journal_forget); + #if 0 + EXPORT_SYMBOL(journal_sync_buffer); + #endif + EXPORT_SYMBOL(journal_flush); + EXPORT_SYMBOL(journal_revoke); + + EXPORT_SYMBOL(journal_init_dev); + EXPORT_SYMBOL(journal_init_inode); + EXPORT_SYMBOL(journal_update_format); + EXPORT_SYMBOL(journal_check_used_features); + EXPORT_SYMBOL(journal_check_available_features); + EXPORT_SYMBOL(journal_set_features); + EXPORT_SYMBOL(journal_create); + EXPORT_SYMBOL(journal_load); + EXPORT_SYMBOL(journal_destroy); + EXPORT_SYMBOL(journal_recover); + EXPORT_SYMBOL(journal_update_superblock); + EXPORT_SYMBOL(__journal_abort); + EXPORT_SYMBOL(journal_abort); + EXPORT_SYMBOL(journal_errno); + EXPORT_SYMBOL(journal_ack_err); + EXPORT_SYMBOL(journal_clear_err); + EXPORT_SYMBOL(log_wait_commit); + EXPORT_SYMBOL(log_start_commit); + EXPORT_SYMBOL(journal_wipe); + EXPORT_SYMBOL(journal_blocks_per_page); + EXPORT_SYMBOL(journal_flushpage); + EXPORT_SYMBOL(journal_try_to_free_buffers); + EXPORT_SYMBOL(journal_bmap); + EXPORT_SYMBOL(journal_force_commit); + + static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); + + /* + * journal_datalist_lock is used to protect data buffers: + * + * bh->b_transaction + * bh->b_tprev + * bh->b_tnext + * + * journal_free_buffer() is called from journal_try_to_free_buffer(), and is + * async wrt everything else. + * + * It is also used for checkpoint data, also to protect against + * journal_try_to_free_buffer(): + * + * bh->b_cp_transaction + * bh->b_cpnext + * bh->b_cpprev + * transaction->t_checkpoint_list + * transaction->t_cpnext + * transaction->t_cpprev + * journal->j_checkpoint_transactions + * + * It is global at this time rather than per-journal because it's + * impossible for __journal_free_buffer to go from a buffer_head + * back to a journal_t unracily (well, not true. Fix later) + * + * + * The `datalist' and `checkpoint list' functions are quite + * separate and we could use two spinlocks here. + * + * lru_list_lock nests inside journal_datalist_lock. + */ + spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED; + + /* + * List of all journals in the system. Protected by the BKL. + */ + static LIST_HEAD(all_journals); + + /* + * Helper function used to manage commit timeouts + */ + + static void commit_timeout(unsigned long __data) + { + struct task_struct * p = (struct task_struct *) __data; + + wake_up_process(p); + } + + /* Static check for data structure consistency. There's no code + * invoked --- we'll just get a linker failure if things aren't right. + */ + void __journal_internal_check(void) + { + extern void journal_bad_superblock_size(void); + if (sizeof(struct journal_superblock_s) != 1024) + journal_bad_superblock_size(); + } + + /* + * kjournald: The main thread function used to manage a logging device + * journal. 
+ * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. + */ + + journal_t *current_journal; // AKPM: debug + + int kjournald(void *arg) + { + journal_t *journal = (journal_t *) arg; + transaction_t *transaction; + struct timer_list timer; + + current_journal = journal; + + lock_kernel(); + daemonize(); + spin_lock_irq(¤t->sigmask_lock); + sigfillset(¤t->blocked); + recalc_sigpending(current); + spin_unlock_irq(¤t->sigmask_lock); + + sprintf(current->comm, "kjournald"); + + /* Set up an interval timer which can be used to trigger a + commit wakeup after the commit interval expires */ + init_timer(&timer); + timer.data = (unsigned long) current; + timer.function = commit_timeout; + journal->j_commit_timer = &timer; + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", + journal->j_commit_interval / HZ); + list_add(&journal->j_all_journals, &all_journals); + + /* And now, wait forever for commit wakeup events. */ + while (1) { + if (journal->j_flags & JFS_UNMOUNT) + break; + + jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd_debug(1, "OK, requests differ\n"); + if (journal->j_commit_timer_active) { + journal->j_commit_timer_active = 0; + del_timer(journal->j_commit_timer); + } + + journal_commit_transaction(journal); + continue; + } + + wake_up(&journal->j_wait_done_commit); + interruptible_sleep_on(&journal->j_wait_commit); + + jbd_debug(1, "kjournald wakes\n"); + + /* Were we woken up by a commit wakeup event? */ + if ((transaction = journal->j_running_transaction) != NULL && + time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd_debug(1, "woke because of timeout\n"); + } + } + + if (journal->j_commit_timer_active) { + journal->j_commit_timer_active = 0; + del_timer_sync(journal->j_commit_timer); + } + + list_del(&journal->j_all_journals); + + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd_debug(1, "Journal thread exiting.\n"); + return 0; + } + + static void journal_start_thread(journal_t *journal) + { + kernel_thread(kjournald, (void *) journal, + CLONE_VM | CLONE_FS | CLONE_FILES); + while (!journal->j_task) + sleep_on(&journal->j_wait_done_commit); + } + + static void journal_kill_thread(journal_t *journal) + { + journal->j_flags |= JFS_UNMOUNT; + + while (journal->j_task) { + wake_up(&journal->j_wait_commit); + sleep_on(&journal->j_wait_done_commit); + } + } + + #if 0 + + This is no longer needed - we do it in commit quite efficiently. + Note that if this function is resurrected, the loop needs to + be reorganised into the next_jh/last_jh algorithm. + + /* + * journal_clean_data_list: cleanup after data IO. + * + * Once the IO system has finished writing the buffers on the transaction's + * data list, we can remove those buffers from the list. 
This function + * scans the list for such buffers and removes them cleanly. + * + * We assume that the journal is already locked. + * We are called with journal_datalist_lock held. + * + * AKPM: This function looks inefficient. Approximately O(n^2) + * for potentially thousands of buffers. It no longer shows on profiles + * because these buffers are mainly dropped in journal_commit_transaction(). + */ + + void __journal_clean_data_list(transaction_t *transaction) + { + struct journal_head *jh, *next; + + assert_spin_locked(&journal_datalist_lock); + + restart: + jh = transaction->t_sync_datalist; + if (!jh) + goto out; + do { + next = jh->b_tnext; + if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) { + struct buffer_head *bh = jh2bh(jh); + BUFFER_TRACE(bh, "data writeout complete: unfile"); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + __journal_remove_journal_head(bh); + refile_buffer(bh); + __brelse(bh); + goto restart; + } + jh = next; + } while (transaction->t_sync_datalist && + jh != transaction->t_sync_datalist); + out: + return; + } + #endif + + /* + * journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descripter blocks. In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. The missing word can then be restored when the block is read + * during recovery. + * + * If the source buffer has already been modified by a new transaction + * since we took the last commit snapshot, we use the frozen copy of + * that data for IO. If we end up using the existing buffer_head's data + * for the write, then we *have* to lock the buffer to prevent anyone + * else from using and possibly modifying it while the IO is in + * progress. + * + * The function returns a pointer to the buffer_heads to be used for IO. + * + * We assume that the journal has already been locked in this function. + * + * Return value: + * <0: Error + * >=0: Finished OK + * + * On success: + * Bit 0 set == escape performed on the data + * Bit 1 set == buffer copy-out performed (kfree the data after IO) + */ + + static inline unsigned long virt_to_offset(void *p) + {return ((unsigned long) p) & ~PAGE_MASK;} + + int journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct journal_head **jh_out, + int blocknr) + { + int need_copy_out = 0; + int done_copy_out = 0; + int do_escape = 0; + char *mapped_data; + struct buffer_head *new_bh; + struct journal_head * new_jh; + struct page *new_page; + unsigned int new_offset; + + /* + * The buffer really shouldn't be locked: only the current committing + * transaction is allowed to write it, so nobody else is allowed + * to do any IO. + * + * akpm: except if we're journalling data, and write() output is + * also part of a shared mapping, and another thread has + * decided to launch a writepage() against this buffer. 
+ */ + J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in))); + + /* + * If a new transaction has already done a buffer copy-out, then + * we use that version of the data for the commit. + */ + + if (jh_in->b_frozen_data) { + done_copy_out = 1; + new_page = virt_to_page(jh_in->b_frozen_data); + new_offset = virt_to_offset(jh_in->b_frozen_data); + } else { + new_page = jh2bh(jh_in)->b_page; + new_offset = virt_to_offset(jh2bh(jh_in)->b_data); + } + + mapped_data = ((char *) kmap(new_page)) + new_offset; + + /* + * Check for escaping + */ + if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) { + need_copy_out = 1; + do_escape = 1; + } + + /* + * Do we need to do a data copy? + */ + + if (need_copy_out && !done_copy_out) { + char *tmp; + tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS); + + jh_in->b_frozen_data = tmp; + memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size); + + /* If we get to this path, we'll always need the new + address kmapped so that we can clear the escaped + magic number below. */ + kunmap(new_page); + new_page = virt_to_page(tmp); + new_offset = virt_to_offset(tmp); + mapped_data = ((char *) kmap(new_page)) + new_offset; + + done_copy_out = 1; + } + + /* + * Right, time to make up the new buffer_head. + */ + do { + new_bh = get_unused_buffer_head(0); + if (!new_bh) { + printk (KERN_NOTICE __FUNCTION__ + ": ENOMEM at get_unused_buffer_head, " + "trying again.\n"); + current->policy |= SCHED_YIELD; + schedule(); + } + } while (!new_bh); + /* keep subsequent assertions sane */ + new_bh->b_prev_free = 0; + new_bh->b_next_free = 0; + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); + + set_bh_page(new_bh, new_page, new_offset); + + new_jh->b_transaction = NULL; + new_bh->b_size = jh2bh(jh_in)->b_size; + new_bh->b_dev = transaction->t_journal->j_dev; + new_bh->b_blocknr = blocknr; + new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty); + + *jh_out = new_jh; + + /* + * Did we need to do an escaping? Now we've done all the + * copying, we can finally do so. + */ + + if (do_escape) + * ((unsigned int *) mapped_data) = 0; + kunmap(new_page); + + /* + * The to-be-written buffer needs to get moved to the io queue, + * and the original buffer whose contents we are shadowing or + * copying is moved to the transaction's shadow queue. + */ + JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); + journal_file_buffer(jh_in, transaction, BJ_Shadow); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); + journal_file_buffer(new_jh, transaction, BJ_IO); + + return do_escape | (done_copy_out << 1); + } + + /* + * Allocation code for the journal file. Manage the space left in the + * journal, so that we can begin checkpointing when appropriate. + */ + + /* + * log_space_left: Return the number of free blocks left in the journal. + * + * Called with the journal already locked. + */ + + int log_space_left (journal_t *journal) + { + int left = journal->j_free; + + /* Be pessimistic here about the number of those free blocks + * which might be required for log descriptor control blocks. 
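
As an aside on journal_write_metadata_buffer() above: the bit-encoded return value (bit 0 = escaping was performed, bit 1 = a copy-out was done) is presumably decoded by the commit code that writes the descriptor blocks, which is not part of this hunk. A minimal sketch, with tag_flags and write_error as purely illustrative names:

	flags = journal_write_metadata_buffer(transaction, jh, &new_jh, blocknr);
	if (flags < 0)
		goto write_error;		/* hypothetical error path */
	if (flags & 1)
		tag_flags |= JFS_FLAG_ESCAPE;	/* record the escape in the descriptor tag */
	if (flags & 2)
		;	/* jh->b_frozen_data was allocated and must be kfree()d
			 * once the IO against the shadow buffer has completed */
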
*/ + + #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */ + + left -= MIN_LOG_RESERVED_BLOCKS; + + if (left <= 0) + return 0; + left -= (left >> 3); + return left; + } + + /* + * This function must be non-allocating for PF_MEMALLOC tasks + */ + tid_t log_start_commit (journal_t *journal, transaction_t *transaction) + { + tid_t target = journal->j_commit_request; + + lock_kernel(); /* Protect journal->j_running_transaction */ + + /* + * A NULL transaction asks us to commit the currently running + * transaction, if there is one. + */ + if (transaction) + target = transaction->t_tid; + else { + transaction = journal->j_running_transaction; + if (!transaction) + goto out; + target = transaction->t_tid; + } + + /* + * Are we already doing a recent enough commit? + */ + if (tid_geq(journal->j_commit_request, target)) + goto out; + + /* + * We want a new commit: OK, mark the request and wakup the + * commit thread. We do _not_ do the commit ourselves. + */ + + journal->j_commit_request = target; + jbd_debug(1, "JBD: requesting commit %d/%d\n", + journal->j_commit_request, + journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + + out: + unlock_kernel(); + return target; + } + + /* + * Wait for a specified commit to complete. + * The caller may not hold the journal lock. + */ + void log_wait_commit (journal_t *journal, tid_t tid) + { + lock_kernel(); + #ifdef CONFIG_JBD_DEBUG + lock_journal(journal); + if (!tid_geq(journal->j_commit_request, tid)) { + printk(KERN_EMERG __FUNCTION__ + ": error: j_commit_request=%d, tid=%d\n", + journal->j_commit_request, tid); + } + unlock_journal(journal); + #endif + while (tid_gt(tid, journal->j_commit_sequence)) { + jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", + tid, journal->j_commit_sequence); + wake_up(&journal->j_wait_commit); + sleep_on(&journal->j_wait_done_commit); + } + unlock_kernel(); + } + + /* + * Log buffer allocation routines: + */ + + unsigned long journal_next_log_block(journal_t *journal) + { + unsigned long blocknr; + + J_ASSERT(journal->j_free > 1); + + blocknr = journal->j_head; + journal->j_head++; + journal->j_free--; + if (journal->j_head == journal->j_last) + journal->j_head = journal->j_first; + return journal_bmap(journal, blocknr); + } + + /* + * Conversion of logical to physical block numbers for the journal + * + * On external journals the journal blocks are identity-mapped, so + * this is a no-op. If needed, we can use j_blk_offset - everything is + * ready. + */ + unsigned long journal_bmap(journal_t *journal, unsigned long blocknr) + { + unsigned long ret; + + if (journal->j_inode) { + ret = bmap(journal->j_inode, blocknr); + J_ASSERT(ret != 0); + } else { + ret = blocknr; /* +journal->j_blk_offset */ + } + return ret; + } + + /* + * We play buffer_head aliasing tricks to write data/metadata blocks to + * the journal without copying their contents, but for journal + * descriptor blocks we do need to generate bona fide buffers. + */ + + struct journal_head * journal_get_descriptor_buffer(journal_t *journal) + { + struct buffer_head *bh; + unsigned long blocknr = journal_next_log_block(journal); + + bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); + bh->b_state |= (1 << BH_Dirty); + BUFFER_TRACE(bh, "return this buffer"); + return journal_add_journal_head(bh); + } + + /* + * Management for journal control blocks: functions to create and + * destroy journal_t structures, and to initialise and read existing + * journal blocks from disk. 
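
Taken together, log_start_commit() and log_wait_commit() above give callers a simple synchronous-commit idiom, roughly as follows (a sketch; the caller's own locking and error handling are omitted):

	tid_t tid;

	/* ask kjournald to commit the currently running transaction, if any ... */
	tid = log_start_commit(journal, NULL);
	/* ... then block until that transaction ID has reached the log */
	log_wait_commit(journal, tid);
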
*/ + + /* First: create and setup a journal_t object in memory. We initialise + * very few fields yet: that has to wait until we have created the + * journal structures from from scratch, or loaded them from disk. */ + + static journal_t * journal_init_common (void) + { + journal_t *journal; + int err; + + MOD_INC_USE_COUNT; + + journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL); + if (!journal) + goto fail; + memset(journal, 0, sizeof(*journal)); + + init_waitqueue_head(&journal->j_wait_transaction_locked); + init_waitqueue_head(&journal->j_wait_logspace); + init_waitqueue_head(&journal->j_wait_done_commit); + init_waitqueue_head(&journal->j_wait_checkpoint); + init_waitqueue_head(&journal->j_wait_commit); + init_waitqueue_head(&journal->j_wait_updates); + init_MUTEX(&journal->j_barrier); + init_MUTEX(&journal->j_checkpoint_sem); + init_MUTEX(&journal->j_sem); + + journal->j_commit_interval = (HZ * 5); + + /* The journal is marked for error until we succeed with recovery! */ + journal->j_flags = JFS_ABORT; + + /* Set up a default-sized revoke table for the new mount. */ + err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + if (err) { + kfree(journal); + goto fail; + } + return journal; + fail: + MOD_DEC_USE_COUNT; + return NULL; + } + + /* journal_init_dev and journal_init_inode: + * + * Create a journal structure assigned some fixed set of disk blocks to + * the journal. We don't actually touch those disk blocks yet, but we + * need to set up all of the mapping information to tell the journaling + * system where the journal blocks are. + * + * journal_init_dev creates a journal which maps a fixed contiguous + * range of blocks on an arbitrary block device. + * + * journal_init_inode creates a journal which maps an on-disk inode as + * the journal. The inode must exist already, must support bmap() and + * must have all data blocks preallocated. + */ + + journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev, + int start, int len, int blocksize) + { + journal_t *journal = journal_init_common(); + struct buffer_head *bh; + + if (!journal) + return NULL; + + journal->j_dev = dev; + journal->j_fs_dev = fs_dev; + journal->j_blk_offset = start; + journal->j_maxlen = len; + journal->j_blocksize = blocksize; + + bh = getblk(journal->j_dev, start, journal->j_blocksize); + J_ASSERT(bh != NULL); + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; + } + + journal_t * journal_init_inode (struct inode *inode) + { + struct buffer_head *bh; + journal_t *journal = journal_init_common(); + int blocknr; + + if (!journal) + return NULL; + + journal->j_dev = inode->i_dev; + journal->j_fs_dev = inode->i_dev; + journal->j_inode = inode; + jbd_debug(1, + "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", + journal, bdevname(inode->i_dev), inode->i_ino, inode->i_size, + inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); + + journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits; + journal->j_blocksize = inode->i_sb->s_blocksize; + + blocknr = journal_bmap(journal, 0); + bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); + J_ASSERT(bh != NULL); + journal->j_sb_buffer = bh; + journal->j_superblock = (journal_superblock_t *)bh->b_data; + + return journal; + } + + /* + * Given a journal_t structure, initialise the various fields for + * startup of a new journaling session. We use this both when creating + * a journal, and after recovering an old journal to reset it for + * subsequent use. 
+ */ + + static int journal_reset (journal_t *journal) + { + journal_superblock_t *sb = journal->j_superblock; + unsigned int first, last; + + first = ntohl(sb->s_first); + last = ntohl(sb->s_maxlen); + + journal->j_first = first; + journal->j_last = last; + + journal->j_head = first; + journal->j_tail = first; + journal->j_free = last - first; + + journal->j_tail_sequence = journal->j_transaction_sequence; + journal->j_commit_sequence = journal->j_transaction_sequence - 1; + journal->j_commit_request = journal->j_commit_sequence; + + journal->j_max_transaction_buffers = journal->j_maxlen / 4; + + /* Add the dynamic fields and write it to disk. */ + journal_update_superblock(journal, 1); + + lock_journal(journal); + journal_start_thread(journal); + unlock_journal(journal); + + return 0; + } + + /* + * Given a journal_t structure which tells us which disk blocks we can + * use, create a new journal superblock and initialise all of the + * journal fields from scratch. */ + + int journal_create (journal_t *journal) + { + int blocknr; + struct buffer_head *bh; + journal_superblock_t *sb; + int i; + + if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) { + printk (KERN_ERR "Journal length (%d blocks) too short.\n", + journal->j_maxlen); + return -EINVAL; + } + + if (journal->j_inode == NULL) { + /* + * We don't know what block to start at! + */ + printk(KERN_EMERG __FUNCTION__ + ": creation of journal on external device!\n"); + BUG(); + } + + /* Zero out the entire journal on disk. We cannot afford to + have any blocks on disk beginning with JFS_MAGIC_NUMBER. */ + jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); + for (i = 0; i < journal->j_maxlen; i++) { + blocknr = journal_bmap(journal, i); + bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); + wait_on_buffer(bh); + memset (bh->b_data, 0, journal->j_blocksize); + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + BUFFER_TRACE(bh, "marking uptodate"); + mark_buffer_uptodate(bh, 1); + __brelse(bh); + } + sync_dev(journal->j_dev); + jbd_debug(1, "JBD: journal cleared.\n"); + + /* OK, fill in the initial static fields in the new superblock */ + sb = journal->j_superblock; + + sb->s_header.h_magic = htonl(JFS_MAGIC_NUMBER); + sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2); + + sb->s_blocksize = htonl(journal->j_blocksize); + sb->s_maxlen = htonl(journal->j_maxlen); + sb->s_first = htonl(1); + + journal->j_transaction_sequence = 1; + + journal->j_flags &= ~JFS_ABORT; + journal->j_format_version = 2; + + return journal_reset(journal); + } + + /* + * Update a journal's dynamic superblock fields and write it to disk, + * optionally waiting for the IO to complete. + */ + + void journal_update_superblock(journal_t *journal, int wait) + { + journal_superblock_t *sb = journal->j_superblock; + struct buffer_head *bh = journal->j_sb_buffer; + + jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", + journal->j_tail, journal->j_tail_sequence, journal->j_errno); + + sb->s_sequence = htonl(journal->j_tail_sequence); + sb->s_start = htonl(journal->j_tail); + sb->s_errno = htonl(journal->j_errno); + + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + if (wait) + wait_on_buffer(bh); + + /* If we have just flushed the log (by marking s_start==0), then + * any future commit will have to be careful to update the + * superblock again to re-record the true start of the log. 
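
To make the bookkeeping in journal_reset() above concrete, take a hypothetical 8192-block journal created by journal_create() (so s_first is 1):

	journal->j_first = 1;				/* ntohl(sb->s_first) */
	journal->j_last  = 8192;			/* ntohl(sb->s_maxlen) */
	journal->j_free  = 8192 - 1;			/* 8191 usable log blocks */
	journal->j_max_transaction_buffers = 8192 / 4;	/* 2048 buffers per transaction */
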
*/ + + if (sb->s_start) + journal->j_flags &= ~JFS_FLUSHED; + else + journal->j_flags |= JFS_FLUSHED; + } + + + /* + * Read the superblock for a given journal, performing initial + * validation of the format. + */ + + static int journal_get_superblock(journal_t *journal) + { + struct buffer_head *bh; + journal_superblock_t *sb; + + bh = journal->j_sb_buffer; + + J_ASSERT(bh != NULL); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + printk (KERN_ERR + "JBD: IO error reading journal superblock\n"); + return -EIO; + } + } + + sb = journal->j_superblock; + + if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) || + sb->s_blocksize != htonl(journal->j_blocksize)) { + printk(KERN_WARNING "JBD: no valid journal superblock found\n"); + return -EINVAL; + } + + switch(ntohl(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V1: + journal->j_format_version = 1; + break; + case JFS_SUPERBLOCK_V2: + journal->j_format_version = 2; + break; + default: + printk(KERN_WARNING "JBD: unrecognised superblock format ID\n"); + return -EINVAL; + } + + if (ntohl(sb->s_maxlen) < journal->j_maxlen) + journal->j_maxlen = ntohl(sb->s_maxlen); + else if (ntohl(sb->s_maxlen) > journal->j_maxlen) { + printk (KERN_WARNING "JBD: journal file too short\n"); + return -EINVAL; + } + + return 0; + } + + /* + * Load the on-disk journal superblock and read the key fields into the + * journal_t. + */ + + static int load_superblock(journal_t *journal) + { + int err; + journal_superblock_t *sb; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = ntohl(sb->s_sequence); + journal->j_tail = ntohl(sb->s_start); + journal->j_first = ntohl(sb->s_first); + journal->j_last = ntohl(sb->s_maxlen); + journal->j_errno = ntohl(sb->s_errno); + + return 0; + } + + + /* + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. + */ + + int journal_load(journal_t *journal) + { + int err; + + err = load_superblock(journal); + if (err) + return err; + + /* If this is a V2 superblock, then we have to check the + * features flags on it. */ + + if (journal->j_format_version >= 2) { + journal_superblock_t *sb = journal->j_superblock; + + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { + printk (KERN_WARNING + "JBD: Unrecognised features on journal\n"); + return -EINVAL; + } + } + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + if (journal_recover(journal)) + goto recovery_error; + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + if (journal_reset(journal)) + goto recovery_error; + + journal->j_flags &= ~JFS_ABORT; + journal->j_flags |= JFS_LOADED; + return 0; + + recovery_error: + printk (KERN_WARNING "JBD: recovery failed\n"); + return -EIO; + } + + /* + * Release a journal_t structure once it is no longer in use by the + * journaled object. + */ + + void journal_destroy (journal_t *journal) + { + /* Wait for the commit thread to wake up and die. 
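
For orientation, a minimal sketch of how a client filesystem might drive the initialisation API above at mount time (the ext3 side is not part of this hunk; example_mount_journal, journal_inode and the creating flag are illustrative only):

	static int example_mount_journal(struct inode *journal_inode, int creating)
	{
		journal_t *journal;
		int err;

		journal = journal_init_inode(journal_inode);
		if (!journal)
			return -ENOMEM;
		if (creating)
			err = journal_create(journal);	/* zero the log area, write a fresh superblock */
		else
			err = journal_load(journal);	/* read the superblock, recover if needed, start kjournald */
		if (err)
			journal_destroy(journal);	/* also the call that releases the journal at unmount */
		return err;
	}
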
*/ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + lock_journal(journal); + while (journal->j_checkpoint_transactions != NULL) + log_do_checkpoint(journal, 1); + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + + /* We can now mark the journal as empty. */ + journal->j_tail = 0; + journal->j_tail_sequence = ++journal->j_transaction_sequence; + journal_update_superblock(journal, 1); + + if (journal->j_inode) + iput(journal->j_inode); + if (journal->j_revoke) + journal_destroy_revoke(journal); + + unlock_journal(journal); + brelse(journal->j_sb_buffer); + kfree(journal); + MOD_DEC_USE_COUNT; + } + + + /* Published API: Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. */ + + int journal_check_used_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) + { + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + if (journal->j_format_version == 1) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; + } + + /* Published API: Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + + int journal_check_available_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) + { + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + + sb = journal->j_superblock; + + /* We can support any known requested features iff the + * superblock is in version 2. Otherwise we fail to support any + * extended sb features. */ + + if (journal->j_format_version != 2) + return 0; + + if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && + (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; + } + + /* Published API: Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. */ + + int journal_set_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) + { + journal_superblock_t *sb; + + if (journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + + return 1; + } + + + /* + * Published API: + * Given an initialised but unloaded journal struct, poke about in the + * on-disk structure to update it to the most recent supported version. 
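
A short sketch of how the three feature calls above are meant to be combined by a client before it starts depending on an on-disk format extension (JFS_FEATURE_EXAMPLE is a placeholder bit, not a real constant):

	#define JFS_FEATURE_EXAMPLE 0x0001		/* illustrative incompat feature bit */

	if (!journal_check_used_features(journal, 0, 0, JFS_FEATURE_EXAMPLE)) {
		if (!journal_check_available_features(journal, 0, 0, JFS_FEATURE_EXAMPLE))
			return -EINVAL;			/* this journal cannot support it */
		journal_set_features(journal, 0, 0, JFS_FEATURE_EXAMPLE);
		/* the updated bits reach disk the next time the journal superblock
		 * is written out, e.g. via journal_update_superblock() */
	}
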
+ */ + + int journal_update_format (journal_t *journal) + { + journal_superblock_t *sb; + int err; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + switch (ntohl(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V2: + return 0; + case JFS_SUPERBLOCK_V1: + return journal_convert_superblock_v1(journal, sb); + default: + break; + } + return -EINVAL; + } + + static int journal_convert_superblock_v1(journal_t *journal, + journal_superblock_t *sb) + { + int offset, blocksize; + struct buffer_head *bh; + + printk(KERN_WARNING + "JBD: Converting superblock from version 1 to 2.\n"); + + /* Pre-initialise new fields to zero */ + offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); + blocksize = ntohl(sb->s_blocksize); + memset(&sb->s_feature_compat, 0, blocksize-offset); + + sb->s_nr_users = cpu_to_be32(1); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + journal->j_format_version = 2; + + bh = journal->j_sb_buffer; + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + return 0; + } + + + /* + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. + */ + + int journal_flush (journal_t *journal) + { + int err = 0; + transaction_t *transaction = NULL; + unsigned long old_tail; + + lock_kernel(); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + log_start_commit(journal, transaction); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) + log_wait_commit(journal, transaction->t_tid); + + /* ...and flush everything in the log out to disk. */ + lock_journal(journal); + while (!err && journal->j_checkpoint_transactions != NULL) + err = log_do_checkpoint(journal, journal->j_maxlen); + cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. Any future + * commits of data to the journal will restore the current + * s_start value. */ + old_tail = journal->j_tail; + journal->j_tail = 0; + journal_update_superblock(journal, 1); + journal->j_tail = old_tail; + + unlock_journal(journal); + + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + + unlock_kernel(); + + return err; + } + + /* + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. + * Must be called between journal_init_*() and journal_load(). + * + * If (write) is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + + int journal_wipe (journal_t *journal, int write) + { + journal_superblock_t *sb; + int err = 0; + + J_ASSERT (!(journal->j_flags & JFS_LOADED)); + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + if (!journal->j_tail) + goto no_recovery; + + printk (KERN_WARNING "JBD: %s recovery information on journal\n", + write ? 
"Clearing" : "Ignoring"); + + err = journal_skip_recovery(journal); + if (write) + journal_update_superblock(journal, 1); + + no_recovery: + return err; + } + + /* + * journal_dev_name: format a character string to describe on what + * device this journal is present. + */ + + const char * journal_dev_name(journal_t *journal) + { + kdev_t dev; + + if (journal->j_inode) + dev = journal->j_inode->i_dev; + else + dev = journal->j_dev; + + return bdevname(dev); + } + + /* + * journal_abort: perform a complete, immediate shutdown of the ENTIRE + * journal (not of a single transaction). This operation cannot be + * undone without closing and reopening the journal. + * + * The journal_abort function is intended to support higher level error + * recovery mechanisms such as the ext2/ext3 remount-readonly error + * mode. + * + * Journal abort has very specific semantics. Any existing dirty, + * unjournaled buffers in the main filesystem will still be written to + * disk by bdflush, but the journaling mechanism will be suspended + * immediately and no further transaction commits will be honoured. + * + * Any dirty, journaled buffers will be written back to disk without + * hitting the journal. Atomicity cannot be guaranteed on an aborted + * filesystem, but we _do_ attempt to leave as much data as possible + * behind for fsck to use for cleanup. + * + * Any attempt to get a new transaction handle on a journal which is in + * ABORT state will just result in an -EROFS error return. A + * journal_stop on an existing handle will return -EIO if we have + * entered abort state during the update. + * + * Recursive transactions are not disturbed by journal abort until the + * final journal_stop, which will receive the -EIO error. + * + * Finally, the journal_abort call allows the caller to supply an errno + * which will be recored (if possible) in the journal superblock. This + * allows a client to record failure conditions in the middle of a + * transaction without having to complete the transaction to record the + * failure to disk. ext3_error, for example, now uses this + * functionality. + * + * Errors which originate from within the journaling layer will NOT + * supply an errno; a null errno implies that absolutely no further + * writes are done to the journal (unless there are any already in + * progress). 
+ */ + + /* Quick version for internal journal use (doesn't lock the journal) */ + void __journal_abort (journal_t *journal) + { + transaction_t *transaction; + + printk (KERN_ERR "Aborting journal on device %s.\n", + journal_dev_name(journal)); + + journal->j_flags |= JFS_ABORT; + transaction = journal->j_running_transaction; + if (transaction) + log_start_commit(journal, transaction); + } + + /* Full version for external use */ + void journal_abort (journal_t *journal, int errno) + { + lock_journal(journal); + + if (journal->j_flags & JFS_ABORT) + goto out; + + if (!journal->j_errno) + journal->j_errno = errno; + + __journal_abort(journal); + + if (errno) + journal_update_superblock(journal, 1); + + out: + unlock_journal(journal); + } + + int journal_errno (journal_t *journal) + { + int err; + + lock_journal(journal); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + err = journal->j_errno; + unlock_journal(journal); + return err; + } + + int journal_clear_err (journal_t *journal) + { + int err = 0; + + lock_journal(journal); + if (journal->j_flags & JFS_ABORT) + err = -EROFS; + else + journal->j_errno = 0; + unlock_journal(journal); + return err; + } + + void journal_ack_err (journal_t *journal) + { + lock_journal(journal); + if (journal->j_errno) + journal->j_flags |= JFS_ACK_ERR; + unlock_journal(journal); + } + + int journal_blocks_per_page(struct inode *inode) + { + return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + } + + /* + * shrink_journal_memory(). + * Called when we're under memory pressure. Free up all the written-back + * checkpointed metadata buffers. + */ + void shrink_journal_memory(void) + { + struct list_head *list; + + lock_kernel(); + list_for_each(list, &all_journals) { + journal_t *journal = + list_entry(list, journal_t, j_all_journals); + spin_lock(&journal_datalist_lock); + __journal_clean_checkpoint_list(journal); + spin_unlock(&journal_datalist_lock); + } + unlock_kernel(); + } + + /* + * Simple support for retying memory allocations. Introduced to help to + * debug different VM deadlock avoidance strategies. + */ + /* + * Simple support for retying memory allocations. Introduced to help to + * debug different VM deadlock avoidance strategies. + */ + void * __jbd_kmalloc (char *where, size_t size, int flags, int retry) + { + void *p; + static unsigned long last_warning; + + while (1) { + p = kmalloc(size, flags); + if (p) + return p; + if (!retry) + return NULL; + /* Log every retry for debugging. Also log them to the + * syslog, but do rate-limiting on the non-debugging + * messages. 
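
The contract described above reduces to a small idiom on the filesystem side; a sketch (how the caller reacts, e.g. by remounting read-only, is its own business):

	/* on detecting corruption or an IO failure inside a transaction: */
	journal_abort(journal, -EIO);		/* record the errno, refuse further commits */

	/* later, in an error handler or at the next convenient point: */
	err = journal_errno(journal);		/* -EROFS once the journal has been aborted */
	if (err) {
		/* react here, then mark the error as seen (sets JFS_ACK_ERR) */
		journal_ack_err(journal);
	}
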
*/ + jbd_debug(1, "ENOMEM in %s, retrying.\n", where); + + if (time_after(jiffies, last_warning + 5*HZ)) { + printk(KERN_NOTICE + "ENOMEM in %s, retrying.\n", where); + last_warning = jiffies; + } + + current->policy |= SCHED_YIELD; + schedule(); + } + } + + /* + * Journal_head storage management + */ + static kmem_cache_t *journal_head_cache; + #ifdef CONFIG_JBD_DEBUG + static atomic_t nr_journal_heads = ATOMIC_INIT(0); + #endif + + static int journal_init_journal_head_cache(void) + { + int retval; + + J_ASSERT(journal_head_cache == 0); + journal_head_cache = kmem_cache_create("journal_head", + sizeof(struct journal_head), + 0, /* offset */ + 0, /* flags */ + NULL, /* ctor */ + NULL); /* dtor */ + retval = 0; + if (journal_head_cache == 0) { + retval = -ENOMEM; + printk(KERN_EMERG "JBD: no memory for journal_head cache\n"); + } + return retval; + } + + static void journal_destroy_journal_head_cache(void) + { + J_ASSERT(journal_head_cache != NULL); + kmem_cache_destroy(journal_head_cache); + journal_head_cache = 0; + } + + /* + * journal_head splicing and dicing + */ + static struct journal_head *journal_alloc_journal_head(void) + { + struct journal_head *ret; + static unsigned long last_warning; + + #ifdef CONFIG_JBD_DEBUG + atomic_inc(&nr_journal_heads); + #endif + ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + if (ret == 0) { + jbd_debug(1, "out of memory for journal_head\n"); + if (time_after(jiffies, last_warning + 5*HZ)) { + printk(KERN_NOTICE "ENOMEM in " __FUNCTION__ + ", retrying.\n"); + last_warning = jiffies; + } + while (ret == 0) { + current->policy |= SCHED_YIELD; + schedule(); + ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS); + } + } + return ret; + } + + static void journal_free_journal_head(struct journal_head *jh) + { + #ifdef CONFIG_JBD_DEBUG + atomic_dec(&nr_journal_heads); + memset(jh, 0x5b, sizeof(*jh)); + #endif + kmem_cache_free(journal_head_cache, jh); + } + + /* + * A journal_head is attached to a buffer_head whenever JBD has an + * interest in the buffer. + * + * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit + * is set. This bit is tested in core kernel code where we need to take + * JBD-specific actions. Testing the zeroness of ->b_private is not reliable + * there. + * + * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one. + * + * When a buffer has its BH_JBD bit set it is immune from being released by + * core kernel code, mainly via ->b_count. + * + * A journal_head may be detached from its buffer_head when the journal_head's + * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL. + * Various places in JBD call journal_remove_journal_head() to indicate that the + * journal_head can be dropped if needed. + * + * Various places in the kernel want to attach a journal_head to a buffer_head + * _before_ attaching the journal_head to a transaction. To protect the + * journal_head in this situation, journal_add_journal_head elevates the + * journal_head's b_jcount refcount by one. The caller must call + * journal_unlock_journal_head() to undo this. + * + * So the typical usage would be: + * + * (Attach a journal_head if needed. Increments b_jcount) + * struct journal_head *jh = journal_add_journal_head(bh); + * ... + * jh->b_transaction = xxx; + * journal_unlock_journal_head(jh); + * + * Now, the journal_head's b_jcount is zero, but it is safe from being released + * because it has a non-zero b_transaction. + */ + + /* + * Give a buffer_head a journal_head. 
+ * + * Doesn't need the journal lock. + * May sleep. + * Cannot be called with journal_datalist_lock held. + */ + struct journal_head *journal_add_journal_head(struct buffer_head *bh) + { + struct journal_head *jh; + + spin_lock(&journal_datalist_lock); + if (buffer_jbd(bh)) { + jh = bh2jh(bh); + } else { + J_ASSERT_BH(bh, + (atomic_read(&bh->b_count) > 0) || + (bh->b_page && bh->b_page->mapping)); + spin_unlock(&journal_datalist_lock); + jh = journal_alloc_journal_head(); + memset(jh, 0, sizeof(*jh)); + spin_lock(&journal_datalist_lock); + + if (buffer_jbd(bh)) { + /* Someone did it for us! */ + J_ASSERT_BH(bh, bh->b_private != NULL); + journal_free_journal_head(jh); + jh = bh->b_private; + } else { + /* + * We actually don't need jh_splice_lock when + * adding a journal_head - only on removal. + */ + spin_lock(&jh_splice_lock); + set_bit(BH_JBD, &bh->b_state); + bh->b_private = jh; + jh->b_bh = bh; + atomic_inc(&bh->b_count); + spin_unlock(&jh_splice_lock); + BUFFER_TRACE(bh, "added journal_head"); + } + } + jh->b_jcount++; + spin_unlock(&journal_datalist_lock); + return bh->b_private; + } + + /* + * journal_remove_journal_head(): if the buffer isn't attached to a transaction + * and has a zero b_jcount then remove and release its journal_head. If we did + * see that the buffer is not used by any transaction we also "logically" + * decrement ->b_count. + * + * We in fact take an additional increment on ->b_count as a convenience, + * because the caller usually wants to do additional things with the bh + * after calling here. + * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some + * time. Once the caller has run __brelse(), the buffer is eligible for + * reaping by try_to_free_buffers(). + * + * Requires journal_datalist_lock. 
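
The refcounting rule described above is easy to get wrong, so here is the minimal correct pairing for a caller that has finished with a buffer's journal_head:

	journal_remove_journal_head(bh);	/* may detach and free the journal_head,
						 * and always leaves an extra b_count reference */
	__brelse(bh);				/* the caller must drop that reference; only then
						 * can try_to_free_buffers() reap the buffer */
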
+ */ + void __journal_remove_journal_head(struct buffer_head *bh) + { + struct journal_head *jh = bh2jh(bh); + + assert_spin_locked(&journal_datalist_lock); + J_ASSERT_JH(jh, jh->b_jcount >= 0); + atomic_inc(&bh->b_count); + if (jh->b_jcount == 0) { + if (jh->b_transaction == NULL && + jh->b_next_transaction == NULL && + jh->b_cp_transaction == NULL) { + J_ASSERT_BH(bh, buffer_jbd(bh)); + J_ASSERT_BH(bh, jh2bh(jh) == bh); + BUFFER_TRACE(bh, "remove journal_head"); + spin_lock(&jh_splice_lock); + bh->b_private = NULL; + jh->b_bh = NULL; /* debug, really */ + clear_bit(BH_JBD, &bh->b_state); + __brelse(bh); + spin_unlock(&jh_splice_lock); + journal_free_journal_head(jh); + } else { + BUFFER_TRACE(bh, "journal_head was locked"); + } + } + } + + void journal_unlock_journal_head(struct journal_head *jh) + { + spin_lock(&journal_datalist_lock); + J_ASSERT_JH(jh, jh->b_jcount > 0); + --jh->b_jcount; + if (!jh->b_jcount && !jh->b_transaction) { + struct buffer_head *bh; + bh = jh2bh(jh); + __journal_remove_journal_head(bh); + __brelse(bh); + } + + spin_unlock(&journal_datalist_lock); + } + + void journal_remove_journal_head(struct buffer_head *bh) + { + spin_lock(&journal_datalist_lock); + __journal_remove_journal_head(bh); + spin_unlock(&journal_datalist_lock); + } + + /* + * Module startup and shutdown + */ + + static int __init journal_init_caches(void) + { + int ret; + + ret = journal_init_revoke_caches(); + if (ret == 0) + ret = journal_init_journal_head_cache(); + return ret; + } + + static void journal_destroy_caches(void) + { + journal_destroy_revoke_caches(); + journal_destroy_journal_head_cache(); + } + + static int __init journal_init(void) + { + int ret; + + printk(KERN_INFO "Journalled Block Device driver loaded\n"); + ret = journal_init_caches(); + if (ret != 0) + journal_destroy_caches(); + return ret; + } + + static void __exit journal_exit(void) + { + #ifdef CONFIG_JBD_DEBUG + int n = atomic_read(&nr_journal_heads); + if (n) + printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n); + #endif + journal_destroy_caches(); + } + + MODULE_LICENSE("GPL"); + module_init(journal_init); + module_exit(journal_exit); + diff -rc2P linux/fs/jbd/recovery.c linux-2.4.13/fs/jbd/recovery.c *** linux/fs/jbd/recovery.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/recovery.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,586 ---- + /* + * linux/fs/recovery.c + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1999-2000 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal recovery routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + */ + + #ifndef __KERNEL__ + #include "jfs_user.h" + #else + #include + #include + #include + #include + #include + #include + #endif + + /* + * Maintain information about the progress of the recovery job, so that + * the different passes can carry information between them. 
+ */ + struct recovery_info + { + tid_t start_transaction; + tid_t end_transaction; + + int nr_replays; + int nr_revokes; + int nr_revoke_hits; + }; + + enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; + static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass); + static int scan_revoke_records(journal_t *, struct buffer_head *, + tid_t, struct recovery_info *); + + #ifdef __KERNEL__ + + /* Release readahead buffers after use */ + void journal_brelse_array(struct buffer_head *b[], int n) + { + while (--n >= 0) + brelse (b[n]); + } + + + /* + * When reading from the journal, we are going through the block device + * layer directly and so there is no readahead being done for us. We + * need to implement any readahead ourselves if we want it to happen at + * all. Recovery is basically one long sequential read, so make sure we + * do the IO in reasonably large chunks. + * + * This is not so critical that we need to be enormously clever about + * the readahead size, though. 128K is a purely arbitrary, good-enough + * fixed value. + */ + + #define MAXBUF 8 + static int do_readahead(journal_t *journal, unsigned int start) + { + int err; + unsigned int max, nbufs, next, blocknr; + struct buffer_head *bh; + + struct buffer_head * bufs[MAXBUF]; + + /* Do up to 128K of readahead */ + max = start + (128 * 1024 / journal->j_blocksize); + if (max > journal->j_maxlen) + max = journal->j_maxlen; + + /* Do the readahead itself. We'll submit MAXBUF buffer_heads at + * a time to the block device IO layer. */ + + nbufs = 0; + + for (next = start; next < max; next++) { + blocknr = journal_bmap(journal, next); + + if (!blocknr) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + next); + err = -EIO; + goto failed; + } + + bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) { + err = -ENOMEM; + goto failed; + } + + if (!buffer_uptodate(bh) && !buffer_locked(bh)) { + bufs[nbufs++] = bh; + if (nbufs == MAXBUF) { + ll_rw_block(READ, nbufs, bufs); + journal_brelse_array(bufs, nbufs); + nbufs = 0; + } + } else + brelse(bh); + } + + if (nbufs) + ll_rw_block(READ, nbufs, bufs); + err = 0; + + failed: + if (nbufs) + journal_brelse_array(bufs, nbufs); + return err; + } + + #endif /* __KERNEL__ */ + + + /* + * Read a block from the journal + */ + + static int jread(struct buffer_head **bhp, journal_t *journal, + unsigned int offset) + { + unsigned int blocknr; + struct buffer_head *bh; + + *bhp = NULL; + + J_ASSERT (offset < journal->j_maxlen); + + blocknr = journal_bmap(journal, offset); + + if (!blocknr) { + printk (KERN_ERR "JBD: bad block at offset %u\n", + offset); + return -EIO; + } + + bh = getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return -ENOMEM; + + if (!buffer_uptodate(bh)) { + /* If this is a brand new buffer, start readahead. + Otherwise, we assume we are already reading it. */ + if (!buffer_req(bh)) + do_readahead(journal, offset); + wait_on_buffer(bh); + } + + if (!buffer_uptodate(bh)) { + printk (KERN_ERR "JBD: Failed to read block at offset %u\n", + offset); + brelse(bh); + return -EIO; + } + + *bhp = bh; + return 0; + } + + + /* + * Count the number of in-use tags in a journal descriptor block. 
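
To put numbers on the readahead in do_readahead() above, assume an illustrative 4096-byte journal block size:

	max = start + (128 * 1024 / 4096);	/* readahead window of 32 log blocks */
	/* blocks that are not already uptodate or locked are collected in bufs[]
	 * and handed to ll_rw_block(READ, nbufs, bufs) whenever nbufs reaches
	 * MAXBUF (8), so a full window costs at most 32 / 8 = 4 submissions */
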
+ */ + + static int count_tags(struct buffer_head *bh, int size) + { + char * tagp; + journal_block_tag_t * tag; + int nr = 0; + + tagp = &bh->b_data[sizeof(journal_header_t)]; + + while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { + tag = (journal_block_tag_t *) tagp; + + nr++; + tagp += sizeof(journal_block_tag_t); + if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID))) + tagp += 16; + + if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG)) + break; + } + + return nr; + } + + + /* Make sure we wrap around the log correctly! */ + #define wrap(journal, var) \ + do { \ + if (var >= (journal)->j_last) \ + var -= ((journal)->j_last - (journal)->j_first); \ + } while (0) + + /* + * journal_recover + * + * The primary function for recovering the log contents when mounting a + * journaled device. + * + * Recovery is done in three passes. In the first pass, we look for the + * end of the log. In the second, we assemble the list of revoke + * blocks. In the third and final pass, we replay any un-revoked blocks + * in the log. + */ + + int journal_recover(journal_t *journal) + { + int err; + journal_superblock_t * sb; + + struct recovery_info info; + + memset(&info, 0, sizeof(info)); + sb = journal->j_superblock; + + /* + * The journal superblock's s_start field (the current log head) + * is always zero if, and only if, the journal was cleanly + * unmounted. + */ + + if (!sb->s_start) { + jbd_debug(1, "No recovery required, last transaction %d\n", + ntohl(sb->s_sequence)); + journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1; + return 0; + } + + + err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = do_one_pass(journal, &info, PASS_REVOKE); + if (!err) + err = do_one_pass(journal, &info, PASS_REPLAY); + + jbd_debug(0, "JBD: recovery, exit status %d, " + "recovered transactions %u to %u\n", + err, info.start_transaction, info.end_transaction); + jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", + info.nr_replays, info.nr_revoke_hits, info.nr_revokes); + + /* Restart the log at the next transaction ID, thus invalidating + * any existing commit records in the log. */ + journal->j_transaction_sequence = ++info.end_transaction; + + journal_clear_revoke(journal); + fsync_no_super(journal->j_fs_dev); + return err; + } + + /* + * journal_skip_recovery + * + * Locate any valid recovery information from the journal and set up the + * journal structures in memory to ignore it (presumably because the + * caller has evidence that it is out of date). + * + * We perform one pass over the journal to allow us to tell the user how + * much recovery information is being erased, and to let us initialise + * the journal transaction sequence numbers to the next unused ID. + */ + + int journal_skip_recovery(journal_t *journal) + { + int err; + journal_superblock_t * sb; + + struct recovery_info info; + + memset (&info, 0, sizeof(info)); + sb = journal->j_superblock; + + err = do_one_pass(journal, &info, PASS_SCAN); + + if (err) { + printk(KERN_ERR "JBD: error %d scanning journal\n", err); + ++journal->j_transaction_sequence; + } else { + #ifdef CONFIG_JBD_DEBUG + int dropped = info.end_transaction - ntohl(sb->s_sequence); + #endif + + jbd_debug(0, + "JBD: ignoring %d transaction%s from the journal.\n", + dropped, (dropped == 1) ? 
"" : "s"); + journal->j_transaction_sequence = ++info.end_transaction; + } + + journal->j_tail = 0; + + return err; + } + + static int do_one_pass(journal_t *journal, + struct recovery_info *info, enum passtype pass) + { + + unsigned int first_commit_ID, next_commit_ID; + unsigned long next_log_block; + int err, success = 0; + journal_superblock_t * sb; + journal_header_t * tmp; + struct buffer_head * bh; + unsigned int sequence; + int blocktype; + + /* Precompute the maximum metadata descriptors in a descriptor block */ + int MAX_BLOCKS_PER_DESC; + MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) + / sizeof(journal_block_tag_t)); + + /* + * First thing is to establish what we expect to find in the log + * (in terms of transaction IDs), and where (in terms of log + * block offsets): query the superblock. + */ + + sb = journal->j_superblock; + next_commit_ID = ntohl(sb->s_sequence); + next_log_block = ntohl(sb->s_start); + + first_commit_ID = next_commit_ID; + if (pass == PASS_SCAN) + info->start_transaction = first_commit_ID; + + jbd_debug(1, "Starting recovery pass %d\n", pass); + + /* + * Now we walk through the log, transaction by transaction, + * making sure that each transaction has a commit block in the + * expected place. Each complete transaction gets replayed back + * into the main filesystem. + */ + + while (1) { + int flags; + char * tagp; + journal_block_tag_t * tag; + struct buffer_head * obh; + struct buffer_head * nbh; + + /* If we already know where to stop the log traversal, + * check right now that we haven't gone past the end of + * the log. */ + + if (pass != PASS_SCAN) + if (tid_geq(next_commit_ID, info->end_transaction)) + break; + + jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + next_commit_ID, next_log_block, journal->j_last); + + /* Skip over each chunk of the transaction looking + * either the next descriptor block or the final commit + * record. */ + + jbd_debug(3, "JBD: checking block %ld\n", next_log_block); + err = jread(&bh, journal, next_log_block); + if (err) + goto failed; + + next_log_block++; + wrap(journal, next_log_block); + + /* What kind of buffer is it? + * + * If it is a descriptor block, check that it has the + * expected sequence number. Otherwise, we're all done + * here. */ + + tmp = (journal_header_t *)bh->b_data; + + if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) { + brelse(bh); + break; + } + + blocktype = ntohl(tmp->h_blocktype); + sequence = ntohl(tmp->h_sequence); + jbd_debug(3, "Found magic %d, sequence %d\n", + blocktype, sequence); + + if (sequence != next_commit_ID) { + brelse(bh); + break; + } + + /* OK, we have a valid descriptor block which matches + * all of the sequence number checks. What are we going + * to do with it? That depends on the pass... */ + + switch(blocktype) { + case JFS_DESCRIPTOR_BLOCK: + /* If it is a valid descriptor block, replay it + * in pass REPLAY; otherwise, just skip over the + * blocks it describes. */ + if (pass != PASS_REPLAY) { + next_log_block += + count_tags(bh, journal->j_blocksize); + wrap(journal, next_log_block); + brelse(bh); + continue; + } + + /* A descriptor block: we can now write all of + * the data blocks. Yay, useful work is finally + * getting done here! 
*/ + + tagp = &bh->b_data[sizeof(journal_header_t)]; + while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) + <= journal->j_blocksize) { + unsigned long io_block; + + tag = (journal_block_tag_t *) tagp; + flags = ntohl(tag->t_flags); + + io_block = next_log_block++; + wrap(journal, next_log_block); + err = jread(&obh, journal, io_block); + if (err) { + /* Recover what we can, but + * report failure at the end. */ + success = err; + printk (KERN_ERR + "JBD: IO error %d recovering " + "block %ld in log\n", + err, io_block); + } else { + unsigned long blocknr; + + J_ASSERT(obh != NULL); + blocknr = ntohl(tag->t_blocknr); + + /* If the block has been + * revoked, then we're all done + * here. */ + if (journal_test_revoke + (journal, blocknr, + next_commit_ID)) { + brelse(obh); + ++info->nr_revoke_hits; + goto skip_write; + } + + /* Find a buffer for the new + * data being restored */ + nbh = getblk(journal->j_fs_dev, blocknr, + journal->j_blocksize); + if (nbh == NULL) { + printk(KERN_ERR + "JBD: Out of memory " + "during recovery.\n"); + err = -ENOMEM; + brelse(bh); + brelse(obh); + goto failed; + } + + memcpy(nbh->b_data, obh->b_data, + journal->j_blocksize); + if (flags & JFS_FLAG_ESCAPE) { + *((unsigned int *)bh->b_data) = + htonl(JFS_MAGIC_NUMBER); + } + + BUFFER_TRACE(nbh, "marking dirty"); + mark_buffer_dirty(nbh); + BUFFER_TRACE(nbh, "marking uptodate"); + mark_buffer_uptodate(nbh, 1); + ++info->nr_replays; + /* ll_rw_block(WRITE, 1, &nbh); */ + brelse(obh); + brelse(nbh); + } + + skip_write: + tagp += sizeof(journal_block_tag_t); + if (!(flags & JFS_FLAG_SAME_UUID)) + tagp += 16; + + if (flags & JFS_FLAG_LAST_TAG) + break; + } + + brelse(bh); + continue; + + case JFS_COMMIT_BLOCK: + /* Found an expected commit block: not much to + * do other than move on to the next sequence + * number. */ + brelse(bh); + next_commit_ID++; + continue; + + case JFS_REVOKE_BLOCK: + /* If we aren't in the REVOKE pass, then we can + * just skip over this block. */ + if (pass != PASS_REVOKE) { + brelse(bh); + continue; + } + + err = scan_revoke_records(journal, bh, + next_commit_ID, info); + brelse(bh); + if (err) + goto failed; + continue; + + default: + jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + blocktype); + goto done; + } + } + + done: + /* + * We broke out of the log scan loop: either we came to the + * known end of the log or we found an unexpected block in the + * log. If the latter happened, then we know that the "current" + * transaction marks the end of the valid log. + */ + + if (pass == PASS_SCAN) + info->end_transaction = next_commit_ID; + else { + /* It's really bad news if different passes end up at + * different places (but possible due to IO errors). */ + if (info->end_transaction != next_commit_ID) { + printk (KERN_ERR "JBD: recovery pass %d ended at " + "transaction %u, expected %u\n", + pass, next_commit_ID, info->end_transaction); + if (!success) + success = -EIO; + } + } + + return success; + + failed: + return err; + } + + + /* Scan a revoke record, marking all blocks mentioned as revoked. 
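
For reference, the on-disk descriptor-block layout that count_tags() and the replay loop above walk; the field widths follow from the ntohl() accesses in the code:

	/*
	 * descriptor block:
	 *   journal_header_t		h_magic, h_blocktype, h_sequence
	 *   journal_block_tag_t	t_blocknr, t_flags	(32-bit, big-endian)
	 *   [16 bytes of UUID]		present only when JFS_FLAG_SAME_UUID is clear
	 *   journal_block_tag_t ...	repeated; JFS_FLAG_LAST_TAG marks the final tag
	 *
	 * Each tag gives the home location of the next logged block following the
	 * descriptor, and JFS_FLAG_ESCAPE means the logged copy had its leading
	 * magic number zeroed and must be restored on replay.
	 */
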
*/ + + static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, + tid_t sequence, struct recovery_info *info) + { + journal_revoke_header_t *header; + int offset, max; + + header = (journal_revoke_header_t *) bh->b_data; + offset = sizeof(journal_revoke_header_t); + max = ntohl(header->r_count); + + while (offset < max) { + unsigned long blocknr; + int err; + + blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset))); + offset += 4; + err = journal_set_revoke(journal, blocknr, sequence); + if (err) + return err; + ++info->nr_revokes; + } + return 0; + } diff -rc2P linux/fs/jbd/revoke.c linux-2.4.13/fs/jbd/revoke.c *** linux/fs/jbd/revoke.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/revoke.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,631 ---- + /* + * linux/fs/revoke.c + * + * Written by Stephen C. Tweedie , 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. + * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. We still need to prevent old log records from + * overwriting the new data. We don't even need to clear the revoke + * bit here. + * + * Revoke information on buffers is a tri-state value: + * + * RevokeValid clear: no cached revoke status, need to look it up + * RevokeValid set, Revoked clear: + * buffer has not been revoked, and cancel_revoke + * need do nothing. + * RevokeValid set, Revoked set: + * buffer has been revoked. + */ + + #ifndef __KERNEL__ + #include "jfs_user.h" + #else + #include + #include + #include + #include + #include + #include + #include + #include + #include + #endif + + static kmem_cache_t *revoke_record_cache; + static kmem_cache_t *revoke_table_cache; + + /* Each revoke record represents one single revoked block. 
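The RevokeValid/Revoked tri-state described in the header comment above maps onto two buffer state bits that this file tests and sets throughout. A compact way to read that mapping (the helper name is illustrative, not part of the patch):

/* Sketch: interpreting the revoke tri-state carried on a buffer_head. */
#include <linux/jbd.h>

enum revoke_status { REVOKE_UNKNOWN, REVOKE_CLEAR, REVOKE_SET };

static enum revoke_status buffer_revoke_status(struct buffer_head *bh)
{
        if (!test_bit(BH_RevokeValid, &bh->b_state))
                return REVOKE_UNKNOWN;  /* no cached status: consult the hash table */
        if (test_bit(BH_Revoked, &bh->b_state))
                return REVOKE_SET;      /* revoked in the running transaction */
        return REVOKE_CLEAR;            /* known not to be revoked */
}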
During + journal replay, this involves recording the transaction ID of the + last transaction to revoke this block. */ + + struct jbd_revoke_record_s + { + struct list_head hash; + tid_t sequence; /* Used for recovery only */ + unsigned long blocknr; + }; + + + /* The revoke table is just a simple hash table of revoke records. */ + struct jbd_revoke_table_s + { + /* It is conceivable that we might want a larger hash table + * for recovery. Must be a power of two. */ + int hash_size; + int hash_shift; + struct list_head *hash_table; + }; + + + #ifdef __KERNEL__ + static void write_one_revoke_record(journal_t *, transaction_t *, + struct journal_head **, int *, + struct jbd_revoke_record_s *); + static void flush_descriptor(journal_t *, struct journal_head *, int); + #endif + + /* Utility functions to maintain the revoke table */ + + /* Borrowed from buffer.c: this is a tried and tested block hash function */ + static inline int hash(journal_t *journal, unsigned long block) + { + struct jbd_revoke_table_s *table = journal->j_revoke; + int hash_shift = table->hash_shift; + + return ((block << (hash_shift - 6)) ^ + (block >> 13) ^ + (block << (hash_shift - 12))) & (table->hash_size - 1); + } + + int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq) + { + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + + repeat: + record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS); + if (!record) + goto oom; + + record->sequence = seq; + record->blocknr = blocknr; + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + list_add(&record->hash, hash_list); + return 0; + + oom: + if (!journal_oom_retry) + return -ENOMEM; + jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n"); + current->policy |= SCHED_YIELD; + schedule(); + goto repeat; + } + + /* Find a revoke record in the journal's hash table. */ + + static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, + unsigned long blocknr) + { + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + + hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; + + record = (struct jbd_revoke_record_s *) hash_list->next; + while (&(record->hash) != hash_list) { + if (record->blocknr == blocknr) + return record; + record = (struct jbd_revoke_record_s *) record->hash.next; + } + return NULL; + } + + int __init journal_init_revoke_caches(void) + { + revoke_record_cache = kmem_cache_create("revoke_record", + sizeof(struct jbd_revoke_record_s), + 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + if (revoke_record_cache == 0) + return -ENOMEM; + + revoke_table_cache = kmem_cache_create("revoke_table", + sizeof(struct jbd_revoke_table_s), + 0, 0, NULL, NULL); + if (revoke_table_cache == 0) { + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = NULL; + return -ENOMEM; + } + return 0; + } + + void journal_destroy_revoke_caches(void) + { + kmem_cache_destroy(revoke_record_cache); + revoke_record_cache = 0; + kmem_cache_destroy(revoke_table_cache); + revoke_table_cache = 0; + } + + /* Initialise the revoke table for a given journal to a given size. 
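journal_init_revoke(), which follows, requires hash_size to be a power of two because hash() above selects a bucket by masking with (hash_size - 1), and it derives hash_shift as log2(hash_size). A typical call, assuming the JOURNAL_REVOKE_DEFAULT_HASH constant defined elsewhere in this patch (a power of two; assumed here, not visible in this hunk):

/* Illustrative caller; the constant is an assumption from elsewhere in the patch. */
#include <linux/jbd.h>

static int setup_revoke_table(journal_t *journal)
{
        /* hash_size must be a power of two: hash() masks with (hash_size - 1). */
        return journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
}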
*/
+ 
+ int journal_init_revoke(journal_t *journal, int hash_size)
+ {
+         int shift, tmp;
+ 
+         J_ASSERT (journal->j_revoke == NULL);
+ 
+         journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+         if (!journal->j_revoke)
+                 return -ENOMEM;
+ 
+         /* Check that the hash_size is a power of two */
+         J_ASSERT ((hash_size & (hash_size-1)) == 0);
+ 
+         journal->j_revoke->hash_size = hash_size;
+ 
+         shift = 0;
+         tmp = hash_size;
+         while((tmp >>= 1UL) != 0UL)
+                 shift++;
+         journal->j_revoke->hash_shift = shift;
+ 
+         journal->j_revoke->hash_table =
+                 kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+         if (!journal->j_revoke->hash_table) {
+                 kmem_cache_free(revoke_table_cache, journal->j_revoke);
+                 journal->j_revoke = NULL;
+                 return -ENOMEM;
+         }
+ 
+         for (tmp = 0; tmp < hash_size; tmp++)
+                 INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+ 
+         return 0;
+ }
+ 
+ /* Destroy a journal's revoke table.  The table must already be empty! */
+ 
+ void journal_destroy_revoke(journal_t *journal)
+ {
+         struct jbd_revoke_table_s *table;
+         struct list_head *hash_list;
+         int i;
+ 
+         table = journal->j_revoke;
+         if (!table)
+                 return;
+ 
+         for (i = 0; i < table->hash_size; i++) {
+                 hash_list = &table->hash_table[i];
+                 J_ASSERT (list_empty(hash_list));
+         }
+ 
+         kfree(table->hash_table);
+         kmem_cache_free(revoke_table_cache, table);
+         journal->j_revoke = NULL;
+ }
+ 
+ 
+ #ifdef __KERNEL__
+ 
+ /*
+  * journal_revoke: revoke a given buffer_head from the journal.  This
+  * prevents the block from being replayed during recovery if we take a
+  * crash after this current transaction commits.  Any subsequent
+  * metadata writes of the buffer in this transaction cancel the
+  * revoke.
+  *
+  * Note that this call may block --- it is up to the caller to make
+  * sure that there are no further calls to journal_write_metadata
+  * before the revoke is complete.  In ext3, this implies calling the
+  * revoke before clearing the block bitmap when we are deleting
+  * metadata.
+  *
+  * Revoke performs a journal_forget on any buffer_head passed in as a
+  * parameter, but does _not_ forget the buffer_head if the bh was only
+  * found implicitly.
+  *
+  * bh_in may not be a journalled buffer - it may have come off
+  * the hash tables without an attached journal_head.
+  *
+  * If bh_in is non-zero, journal_revoke() will decrement its b_count
+  * by one.
+  */
+ 
+ int journal_revoke(handle_t *handle, unsigned long blocknr,
+                    struct buffer_head *bh_in)
+ {
+         struct buffer_head *bh = NULL;
+         journal_t *journal;
+         kdev_t dev;
+         int err;
+ 
+         if (bh_in)
+                 BUFFER_TRACE(bh_in, "enter");
+ 
+         journal = handle->h_transaction->t_journal;
+         if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
+                 J_ASSERT (!"Cannot set revoke feature!");
+                 return -EINVAL;
+         }
+ 
+         dev = journal->j_fs_dev;
+         bh = bh_in;
+ 
+         if (!bh) {
+                 bh = get_hash_table(dev, blocknr, journal->j_blocksize);
+                 if (bh)
+                         BUFFER_TRACE(bh, "found on hash");
+         }
+ #ifdef JBD_EXPENSIVE_CHECKING
+         else {
+                 struct buffer_head *bh2;
+ 
+                 /* If there is a different buffer_head lying around in
+                  * memory anywhere... */
+                 bh2 = get_hash_table(dev, blocknr, journal->j_blocksize);
+                 if (bh2) {
+                         /* ... and it has RevokeValid status... */
+                         if ((bh2 != bh) &&
+                             test_bit(BH_RevokeValid, &bh2->b_state))
+                                 /* ...then it better be revoked too,
+                                  * since it's illegal to create a revoke
+                                  * record against a buffer_head which is
+                                  * not marked revoked --- that would
+                                  * risk missing a subsequent revoke
+                                  * cancel.
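journal_revoke() above is intended to be called by the filesystem before it returns a freed metadata block to the allocator. A hypothetical caller, sketching how ext3-style code would use it (names are illustrative, not taken from the patch):

/* Hypothetical caller; function and parameter names are illustrative. */
#include <linux/jbd.h>

static int forget_freed_metadata(handle_t *handle, struct buffer_head *bh,
                                 unsigned long blocknr)
{
        /*
         * Revoke before the block bitmap is updated: once the revoke
         * record is part of this transaction, a stale copy of the block
         * sitting in an older, already-committed transaction's log will
         * not be replayed over whatever the block is later reused for.
         * journal_revoke() also performs the journal_forget() on bh.
         */
        return journal_revoke(handle, blocknr, bh);
}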
*/ + J_ASSERT_BH(bh2, test_bit(BH_Revoked, & + bh2->b_state)); + __brelse(bh2); + } + } + #endif + + /* We really ought not ever to revoke twice in a row without + first having the revoke cancelled: it's illegal to free a + block twice without allocating it in between! */ + if (bh) { + J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state)); + set_bit(BH_Revoked, &bh->b_state); + set_bit(BH_RevokeValid, &bh->b_state); + if (bh_in) { + BUFFER_TRACE(bh_in, "call journal_forget"); + journal_forget(handle, bh_in); + } else { + BUFFER_TRACE(bh, "call brelse"); + __brelse(bh); + } + } + + lock_journal(journal); + jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); + err = insert_revoke_hash(journal, blocknr, + handle->h_transaction->t_tid); + unlock_journal(journal); + BUFFER_TRACE(bh_in, "exit"); + return err; + } + + /* + * Cancel an outstanding revoke. For use only internally by the + * journaling code (called from journal_get_write_access). + * + * We trust the BH_Revoked bit on the buffer if the buffer is already + * being journaled: if there is no revoke pending on the buffer, then we + * don't do anything here. + * + * This would break if it were possible for a buffer to be revoked and + * discarded, and then reallocated within the same transaction. In such + * a case we would have lost the revoked bit, but when we arrived here + * the second time we would still have a pending revoke to cancel. So, + * do not trust the Revoked bit on buffers unless RevokeValid is also + * set. + * + * The caller must have the journal locked. + */ + int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) + { + struct jbd_revoke_record_s *record; + journal_t *journal = handle->h_transaction->t_journal; + int need_cancel; + int did_revoke = 0; /* akpm: debug */ + struct buffer_head *bh = jh2bh(jh); + + jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); + + /* Is the existing Revoke bit valid? If so, we trust it, and + * only perform the full cancel if the revoke bit is set. If + * not, we can't trust the revoke bit, and we need to do the + * full search for a revoke record. */ + if (test_and_set_bit(BH_RevokeValid, &bh->b_state)) + need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state)); + else { + need_cancel = 1; + clear_bit(BH_Revoked, &bh->b_state); + } + + if (need_cancel) { + record = find_revoke_record(journal, bh->b_blocknr); + if (record) { + jbd_debug(4, "cancelled existing revoke on " + "blocknr %lu\n", bh->b_blocknr); + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + did_revoke = 1; + } + } + + #ifdef JBD_EXPENSIVE_CHECKING + /* There better not be one left behind by now! */ + record = find_revoke_record(journal, bh->b_blocknr); + J_ASSERT_JH(jh, record == NULL); + #endif + + /* Finally, have we just cleared revoke on an unhashed + * buffer_head? If so, we'd better make sure we clear the + * revoked status on any hashed alias too, otherwise the revoke + * state machine will get very upset later on. */ + if (need_cancel && !bh->b_pprev) { + struct buffer_head *bh2; + bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); + if (bh2) { + clear_bit(BH_Revoked, &bh2->b_state); + __brelse(bh2); + } + } + + return did_revoke; + } + + + /* + * Write revoke records to the journal for all entries in the current + * revoke hash, deleting the entries as we go. + * + * Called with the journal lock held. 
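write_one_revoke_record() and flush_descriptor(), which follow, emit the on-disk revoke blocks that scan_revoke_records() (earlier in this patch) parses during recovery. A compact picture of the format they produce, assuming the journal_revoke_header_t layout used by this code, plus a small illustrative reader:

/* Sketch of the revoke block layout written below; for illustration only.
 *
 *   offset 0                          journal_header_t:
 *                                       h_magic, h_blocktype = JFS_REVOKE_BLOCK,
 *                                       h_sequence = committing transaction tid
 *   ... r_count field                 total bytes used in this block,
 *                                       including the revoke header itself
 *   offset sizeof(journal_revoke_header_t)
 *                                     array of be32 block numbers, one per
 *                                       revoked block
 *   offset r_count                    end of the record list
 */
#include <linux/jbd.h>

static int revoke_records_in_block(struct buffer_head *bh)
{
        journal_revoke_header_t *hdr = (journal_revoke_header_t *) bh->b_data;

        return (ntohl(hdr->r_count) - sizeof(journal_revoke_header_t)) / 4;
}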
+ */ + + void journal_write_revoke_records(journal_t *journal, + transaction_t *transaction) + { + struct journal_head *descriptor; + struct jbd_revoke_record_s *record; + struct jbd_revoke_table_s *revoke; + struct list_head *hash_list; + int i, offset, count; + + descriptor = NULL; + offset = 0; + count = 0; + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + + while (!list_empty(hash_list)) { + record = (struct jbd_revoke_record_s *) + hash_list->next; + write_one_revoke_record(journal, transaction, + &descriptor, &offset, + record); + count++; + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + } + } + if (descriptor) + flush_descriptor(journal, descriptor, offset); + jbd_debug(1, "Wrote %d revoke records\n", count); + } + + /* + * Write out one revoke record. We need to create a new descriptor + * block if the old one is full or if we have not already created one. + */ + + static void write_one_revoke_record(journal_t *journal, + transaction_t *transaction, + struct journal_head **descriptorp, + int *offsetp, + struct jbd_revoke_record_s *record) + { + struct journal_head *descriptor; + int offset; + journal_header_t *header; + + /* If we are already aborting, this all becomes a noop. We + still need to go round the loop in + journal_write_revoke_records in order to free all of the + revoke records: only the IO to the journal is omitted. */ + if (is_journal_aborted(journal)) + return; + + descriptor = *descriptorp; + offset = *offsetp; + + /* Make sure we have a descriptor with space left for the record */ + if (descriptor) { + if (offset == journal->j_blocksize) { + flush_descriptor(journal, descriptor, offset); + descriptor = NULL; + } + } + + if (!descriptor) { + descriptor = journal_get_descriptor_buffer(journal); + header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; + header->h_magic = htonl(JFS_MAGIC_NUMBER); + header->h_blocktype = htonl(JFS_REVOKE_BLOCK); + header->h_sequence = htonl(transaction->t_tid); + + /* Record it so that we can wait for IO completion later */ + JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); + journal_file_buffer(descriptor, transaction, BJ_LogCtl); + + offset = sizeof(journal_revoke_header_t); + *descriptorp = descriptor; + } + + * ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) = + htonl(record->blocknr); + offset += 4; + *offsetp = offset; + } + + /* + * Flush a revoke descriptor out to the journal. If we are aborting, + * this is a noop; otherwise we are generating a buffer which needs to + * be waited for during commit, so it has to go onto the appropriate + * journal buffer list. + */ + + static void flush_descriptor(journal_t *journal, + struct journal_head *descriptor, + int offset) + { + journal_revoke_header_t *header; + + if (is_journal_aborted(journal)) { + JBUFFER_TRACE(descriptor, "brelse"); + __brelse(jh2bh(descriptor)); + return; + } + + header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; + header->r_count = htonl(offset); + set_bit(BH_JWrite, &jh2bh(descriptor)->b_state); + { + struct buffer_head *bh = jh2bh(descriptor); + BUFFER_TRACE(bh, "write"); + ll_rw_block (WRITE, 1, &bh); + } + } + + #endif + + /* + * Revoke support for recovery. + * + * Recovery needs to be able to: + * + * record all revoke records, including the tid of the latest instance + * of each revoke in the journal + * + * check whether a given block in a given transaction should be replayed + * (ie. 
has not been revoked by a revoke record in that or a subsequent + * transaction) + * + * empty the revoke table after recovery. + */ + + /* + * First, setting revoke records. We create a new revoke record for + * every block ever revoked in the log as we scan it for recovery, and + * we update the existing records if we find multiple revokes for a + * single block. + */ + + int journal_set_revoke(journal_t *journal, + unsigned long blocknr, + tid_t sequence) + { + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (record) { + /* If we have multiple occurences, only record the + * latest sequence number in the hashed record */ + if (tid_gt(sequence, record->sequence)) + record->sequence = sequence; + return 0; + } + return insert_revoke_hash(journal, blocknr, sequence); + } + + /* + * Test revoke records. For a given block referenced in the log, has + * that block been revoked? A revoke record with a given transaction + * sequence number revokes all blocks in that transaction and earlier + * ones, but later transactions still need replayed. + */ + + int journal_test_revoke(journal_t *journal, + unsigned long blocknr, + tid_t sequence) + { + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) + return 0; + return 1; + } + + /* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. + */ + + void journal_clear_revoke(journal_t *journal) + { + int i; + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + struct jbd_revoke_table_s *revoke; + + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + while (!list_empty(hash_list)) { + record = (struct jbd_revoke_record_s*) hash_list->next; + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + } + } + } + diff -rc2P linux/fs/jbd/transaction.c linux-2.4.13/fs/jbd/transaction.c *** linux/fs/jbd/transaction.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/transaction.c Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,2078 ---- + /* + * linux/fs/transaction.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem transaction handling code; part of the ext2fs + * journaling system. + * + * This file manages transactions (compound commits managed by the + * journaling code) and handles (individual atomic operations by the + * filesystem). + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include /* Uggh... needed for buffermem_pages */ + + + extern spinlock_t journal_datalist_lock; + + /* + * get_transaction: obtain a new transaction_t object. + * + * Simply allocate and initialise a new transaction. Create it in + * RUNNING state and add it to the current journal (which should not + * have an existing running transaction: we only make a new transaction + * once we have started to commit the old one). + * + * Preconditions: + * The journal MUST be locked. 
We don't perform atomic mallocs on the + * new transaction and we can't block without protecting against other + * processes trying to touch the journal while it is in transition. + */ + + static transaction_t * get_transaction (journal_t * journal, int is_try) + { + transaction_t * transaction; + + transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS); + if (!transaction) + return NULL; + + memset (transaction, 0, sizeof (transaction_t)); + + transaction->t_journal = journal; + transaction->t_state = T_RUNNING; + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; + + /* Set up the commit timer for the new transaction. */ + J_ASSERT (!journal->j_commit_timer_active); + journal->j_commit_timer_active = 1; + journal->j_commit_timer->expires = transaction->t_expires; + add_timer(journal->j_commit_timer); + + J_ASSERT (journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; + + return transaction; + } + + /* + * Handle management. + * + * A handle_t is an object which represents a single atomic update to a + * filesystem, and which tracks all of the modifications which form part + * of that one update. + */ + + /* + * start_this_handle: Given a handle, deal with any locking or stalling + * needed to make sure that there is enough journal space for the handle + * to begin. Attach the handle to a transaction and set up the + * transaction's buffer credits. + */ + + static int start_this_handle(journal_t *journal, handle_t *handle) + { + transaction_t *transaction; + int needed; + int nblocks = handle->h_buffer_credits; + + jbd_debug(3, "New handle %p going live.\n", handle); + + repeat: + + lock_journal(journal); + + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { + unlock_journal(journal); + return -EROFS; + } + + /* Wait on the journal's transaction barrier if necessary */ + if (journal->j_barrier_count) { + unlock_journal(journal); + sleep_on(&journal->j_wait_transaction_locked); + goto repeat; + } + + repeat_locked: + if (!journal->j_running_transaction) + get_transaction(journal, 0); + /* @@@ Error? */ + J_ASSERT(journal->j_running_transaction); + + transaction = journal->j_running_transaction; + + /* If the current transaction is locked down for commit, wait + * for the lock to be released. */ + + if (transaction->t_state == T_LOCKED) { + unlock_journal(journal); + jbd_debug(3, "Handle %p stalling...\n", handle); + sleep_on(&journal->j_wait_transaction_locked); + goto repeat; + } + + /* If there is not enough space left in the log to write all + * potential buffers requested by this operation, we need to + * stall pending a log checkpoint to free some more log + * space. */ + + needed = transaction->t_outstanding_credits + nblocks; + + if (needed > journal->j_max_transaction_buffers) { + /* If the current transaction is already too large, then + * start to commit it: we can then go back and attach + * this handle to a new transaction. */ + + jbd_debug(2, "Handle %p starting new commit...\n", handle); + log_start_commit(journal, transaction); + unlock_journal(journal); + sleep_on(&journal->j_wait_transaction_locked); + lock_journal(journal); + goto repeat_locked; + } + + /* + * The commit code assumes that it can get enough log space + * without forcing a checkpoint. 
This is *critical* for + * correctness: a checkpoint of a buffer which is also + * associated with a committing transaction creates a deadlock, + * so commit simply cannot force through checkpoints. + * + * We must therefore ensure the necessary space in the journal + * *before* starting to dirty potentially checkpointed buffers + * in the new transaction. + * + * The worst part is, any transaction currently committing can + * reduce the free space arbitrarily. Be careful to account for + * those buffers when checkpointing. + */ + + /* + * @@@ AKPM: This seems rather over-defensive. We're giving commit + * a _lot_ of headroom: 1/4 of the journal plus the size of + * the committing transaction. Really, we only need to give it + * committing_transaction->t_outstanding_credits plus "enough" for + * the log control blocks. + * Also, this test is inconsitent with the matching one in + * journal_extend(). + */ + needed = journal->j_max_transaction_buffers; + if (journal->j_committing_transaction) + needed += journal->j_committing_transaction-> + t_outstanding_credits; + + if (log_space_left(journal) < needed) { + jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); + log_wait_for_space(journal, needed); + goto repeat_locked; + } + + /* OK, account for the buffers that this operation expects to + * use and add the handle to the running transaction. */ + + handle->h_transaction = transaction; + transaction->t_outstanding_credits += nblocks; + transaction->t_updates++; + transaction->t_handle_count++; + jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", + handle, nblocks, transaction->t_outstanding_credits, + log_space_left(journal)); + + unlock_journal(journal); + + return 0; + } + + /* + * Obtain a new handle. + * + * We make sure that the transaction can guarantee at least nblocks of + * modified buffers in the log. We block until the log can guarantee + * that much space. + * + * This function is visible to journal users (like ext2fs), so is not + * called with the journal already locked. 
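journal_start(), whose definition follows, is the entry point a filesystem uses for every update. A typical caller brackets its buffer modifications as in the sketch below; this is illustrative of the intended calling convention, not code from the patch, and the function name and credit count are invented.

/* Illustrative caller of the handle API added by this patch. */
#include <linux/fs.h>
#include <linux/jbd.h>

static int update_one_metadata_block(journal_t *journal,
                                     struct buffer_head *bh)
{
        handle_t *handle;
        int err, err2;

        /* Reserve credits for every block this update may dirty. */
        handle = journal_start(journal, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        /* Declare intent to modify, make the change, then log it. */
        err = journal_get_write_access(handle, bh);
        if (!err) {
                bh->b_data[0] = 0;              /* the actual modification */
                err = journal_dirty_metadata(handle, bh);
        }

        err2 = journal_stop(handle);            /* -EIO if the journal aborted */
        return err ? err : err2;
}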
+ * + * Return a pointer to a newly allocated handle, or NULL on failure + */ + + handle_t *journal_start(journal_t *journal, int nblocks) + { + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + J_ASSERT(handle->h_transaction->t_journal == journal); + handle->h_ref++; + return handle; + } + + handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); + if (!handle) + return ERR_PTR(-ENOMEM); + memset (handle, 0, sizeof (handle_t)); + + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + current->journal_info = handle; + + err = start_this_handle(journal, handle); + if (err < 0) { + kfree(handle); + current->journal_info = NULL; + return ERR_PTR(err); + } + + return handle; + } + + /* + * Return zero on success + */ + static int try_start_this_handle(journal_t *journal, handle_t *handle) + { + transaction_t *transaction; + int needed; + int nblocks = handle->h_buffer_credits; + int ret = 0; + + jbd_debug(3, "New handle %p maybe going live.\n", handle); + + lock_journal(journal); + + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { + ret = -EROFS; + goto fail_unlock; + } + + if (journal->j_barrier_count) + goto fail_unlock; + + if (!journal->j_running_transaction && get_transaction(journal, 1) == 0) + goto fail_unlock; + + transaction = journal->j_running_transaction; + if (transaction->t_state == T_LOCKED) + goto fail_unlock; + + needed = transaction->t_outstanding_credits + nblocks; + /* We could run log_start_commit here */ + if (needed > journal->j_max_transaction_buffers) + goto fail_unlock; + + needed = journal->j_max_transaction_buffers; + if (journal->j_committing_transaction) + needed += journal->j_committing_transaction-> + t_outstanding_credits; + + if (log_space_left(journal) < needed) + goto fail_unlock; + + handle->h_transaction = transaction; + transaction->t_outstanding_credits += nblocks; + transaction->t_updates++; + jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", + handle, nblocks, transaction->t_outstanding_credits, + log_space_left(journal)); + unlock_journal(journal); + return 0; + + fail_unlock: + unlock_journal(journal); + if (ret >= 0) + ret = -1; + return ret; + } + + /* + * Try to start a handle, but non-blockingly. If we weren't able + * to, return an ERR_PTR value. + */ + handle_t *journal_try_start(journal_t *journal, int nblocks) + { + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + jbd_debug(4, "h_ref %d -> %d\n", + handle->h_ref, + handle->h_ref + 1); + J_ASSERT(handle->h_transaction->t_journal == journal); + if (is_handle_aborted(handle)) + return ERR_PTR(-EIO); + handle->h_ref++; + return handle; + } else { + jbd_debug(4, "no current transaction\n"); + } + + if (is_journal_aborted(journal)) + return ERR_PTR(-EIO); + + handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); + if (!handle) + return ERR_PTR(-ENOMEM); + memset (handle, 0, sizeof (handle_t)); + + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + current->journal_info = handle; + + err = try_start_this_handle(journal, handle); + if (err < 0) { + kfree(handle); + current->journal_info = NULL; + return ERR_PTR(err); + } + + return handle; + } + + /* + * journal_extend: extend buffer credits. + * + * Some transactions, such as large extends and truncates, can be done + * atomically all at once or in several stages. 
The operation requests + * a credit for a number of buffer modications in advance, but can + * extend its credit if it needs more. + * + * journal_extend tries to give the running handle more buffer credits. + * It does not guarantee that allocation: this is a best-effort only. + * The calling process MUST be able to deal cleanly with a failure to + * extend here. + * + * Return 0 on success, non-zero on failure. + * + * return code < 0 implies an error + * return code > 0 implies normal transaction-full status. + */ + + int journal_extend (handle_t *handle, int nblocks) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int result; + int wanted; + + lock_journal (journal); + + result = -EIO; + if (is_handle_aborted(handle)) + goto error_out; + + result = 1; + + /* Don't extend a locked-down transaction! */ + if (handle->h_transaction->t_state != T_RUNNING) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction not running\n", handle, nblocks); + goto error_out; + } + + wanted = transaction->t_outstanding_credits + nblocks; + + if (wanted > journal->j_max_transaction_buffers) { + jbd_debug(3, "denied handle %p %d blocks: " + "transaction too large\n", handle, nblocks); + goto error_out; + } + + if (wanted > log_space_left(journal)) { + jbd_debug(3, "denied handle %p %d blocks: " + "insufficient log space\n", handle, nblocks); + goto error_out; + } + + handle->h_buffer_credits += nblocks; + transaction->t_outstanding_credits += nblocks; + result = 0; + + jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); + + error_out: + unlock_journal (journal); + return result; + } + + + /* + * journal_restart: restart a handle for a multi-transaction filesystem + * operation. + * + * If the journal_extend() call above fails to grant new buffer credits + * to a running handle, a call to journal_restart will commit the + * handle's transaction so far and reattach the handle to a new + * transaction capabable of guaranteeing the requested number of + * credits. + */ + + int journal_restart(handle_t *handle, int nblocks) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int ret; + + /* If we've had an abort of any type, don't even think about + * actually doing the restart! */ + if (is_handle_aborted(handle)) + return 0; + + /* First unlink the handle from its current transaction, and + * start the commit on that. */ + + J_ASSERT (transaction->t_updates > 0); + J_ASSERT (journal_current_handle() == handle); + + transaction->t_outstanding_credits -= handle->h_buffer_credits; + transaction->t_updates--; + + if (!transaction->t_updates) + wake_up(&journal->j_wait_updates); + + jbd_debug(2, "restarting handle %p\n", handle); + log_start_commit(journal, transaction); + + handle->h_buffer_credits = nblocks; + ret = start_this_handle(journal, handle); + return ret; + } + + + /* + * Barrier operation: establish a transaction barrier. + * + * This locks out any further updates from being started, and blocks + * until all existing updates have completed, returning only once the + * journal is in a quiescent state with no updates running. + * + * The journal lock should not be held on entry. 
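journal_extend() and journal_restart() above are meant to be used together by long operations such as truncate: try to stretch the current handle, and if the transaction is already too full, commit it and reattach the handle to a fresh one. A sketch of that pattern (illustrative; the helper name is invented and ext3's real truncate path is more involved):

/* Illustrative extend-or-restart loop. */
#include <linux/jbd.h>

static int ensure_credits(handle_t *handle, int needed)
{
        int err;

        if (handle->h_buffer_credits >= needed)
                return 0;

        /* Best effort: may be refused if the transaction is already large. */
        err = journal_extend(handle, needed - handle->h_buffer_credits);
        if (err <= 0)
                return err;             /* 0: extended; < 0: hard error */

        /*
         * journal_extend() returned > 0: transaction full.  Commit what we
         * have and attach the handle to a new transaction.  Any write
         * access the caller obtained earlier must be re-obtained with
         * journal_get_write_access() after this returns.
         */
        return journal_restart(handle, needed);
}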
+ */ + + void journal_lock_updates (journal_t *journal) + { + lock_journal(journal); + ++journal->j_barrier_count; + + /* Wait until there are no running updates */ + while (1) { + transaction_t *transaction = journal->j_running_transaction; + if (!transaction) + break; + if (!transaction->t_updates) + break; + + unlock_journal(journal); + sleep_on(&journal->j_wait_updates); + lock_journal(journal); + } + + unlock_journal(journal); + + /* We have now established a barrier against other normal + * updates, but we also need to barrier against other + * journal_lock_updates() calls to make sure that we serialise + * special journal-locked operations too. */ + down(&journal->j_barrier); + } + + /* + * Release a transaction barrier obtained with journal_lock_updates(). + * + * Should be called without the journal lock held. + */ + + void journal_unlock_updates (journal_t *journal) + { + lock_journal(journal); + + J_ASSERT (journal->j_barrier_count != 0); + + up(&journal->j_barrier); + --journal->j_barrier_count; + wake_up(&journal->j_wait_transaction_locked); + unlock_journal(journal); + } + + /* + * journal_get_write_access: notify intent to modify a buffer for metadata + * (not data) update. + * + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. If it is already part of a prior + * transaction which we are still committing to disk, then we need to + * make sure that we do not overwrite the old copy: we do copy-out to + * preserve the copy going to disk. We also account the buffer against + * the handle's metadata buffer credits (unless the buffer is already + * part of the transaction, that is). + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + + static int + do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int error; + char *frozen_buffer = NULL; + int need_copy = 0; + + jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); + repeat: + /* @@@ Need to check for errors here at some point. */ + + /* + * AKPM: neither bdflush nor kupdate run with the BKL. There's + * nothing we can do to prevent them from starting writeout of a + * BUF_DIRTY buffer at any time. And checkpointing buffers are on + * BUF_DIRTY. So. We no longer assert that the buffer is unlocked. + * + * However. It is very wrong for us to allow ext3 to start directly + * altering the ->b_data of buffers which may at that very time be + * undergoing writeout to the client filesystem. This can leave + * the filesystem in an inconsistent, transient state if we crash. + * So what we do is to steal the buffer if it is in checkpoint + * mode and dirty. The journal lock will keep out checkpoint-mode + * state transitions within journal_remove_checkpoint() and the buffer + * is locked to keep bdflush/kupdate/whoever away from it as well. + * + * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a + * simple lock_journal(). This code here will care for locked buffers. + */ + /* + * The buffer_locked() || buffer_dirty() tests here are simply an + * optimisation tweak. If anyone else in the system decides to + * lock this buffer later on, we'll blow up. There doesn't seem + * to be a good reason why they should do this. 
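journal_lock_updates() and journal_unlock_updates() above form the transaction barrier used by operations that need the journal quiescent, such as a full journal flush or a filesystem freeze. A minimal usage sketch (the wrapper name is hypothetical):

/* Illustrative use of the update barrier. */
#include <linux/jbd.h>

static void with_quiescent_journal(journal_t *journal,
                                   void (*fn)(journal_t *))
{
        journal_lock_updates(journal);  /* blocks until t_updates reaches 0 */
        fn(journal);                    /* no new handles can start here */
        journal_unlock_updates(journal);
}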
+ */ + if (jh->b_cp_transaction && + (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) { + unlock_journal(journal); + lock_buffer(jh2bh(jh)); + spin_lock(&journal_datalist_lock); + if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) { + /* OK, we need to steal it */ + JBUFFER_TRACE(jh, "stealing from checkpoint mode"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + + J_ASSERT(handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + + /* This will clear BH_Dirty and set BH_JBDDirty. */ + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __journal_file_buffer(jh, transaction, BJ_Reserved); + + /* And pull it off BUF_DIRTY, onto BUF_CLEAN */ + refile_buffer(jh2bh(jh)); + + /* + * The buffer is now hidden from bdflush. It is + * metadata against the current transaction. + */ + JBUFFER_TRACE(jh, "steal from cp mode is complete"); + } + spin_unlock(&journal_datalist_lock); + unlock_buffer(jh2bh(jh)); + lock_journal(journal); + } + + J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh))); + + error = -EROFS; + if (is_handle_aborted(handle)) + goto out_unlocked; + error = 0; + + spin_lock(&journal_datalist_lock); + + /* The buffer is already part of this transaction if + * b_transaction or b_next_transaction points to it. */ + + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) + goto done_locked; + + /* If there is already a copy-out version of this buffer, then + * we don't need to make another one. */ + + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + jh->b_next_transaction = transaction; + + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + goto done_locked; + } + + /* Is there data here we need to preserve? */ + + if (jh->b_transaction && jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* There is one case we have to be very careful about. + * If the committing transaction is currently writing + * this buffer out to disk and has NOT made a copy-out, + * then we cannot modify the buffer contents at all + * right now. The essence of copy-out is that it is the + * extra copy, not the primary copy, which gets + * journaled. If the primary copy is already going to + * disk then we cannot do copy-out here. */ + + if (jh->b_jlist == BJ_Shadow) { + JBUFFER_TRACE(jh, "on shadow: sleep"); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + /* commit wakes up all shadow buffers after IO */ + sleep_on(&jh2bh(jh)->b_wait); + lock_journal(journal); + goto repeat; + } + + /* Only do the copy if the currently-owning transaction + * still needs it. If it is on the Forget list, the + * committing transaction is past that stage. The + * buffer had better remain locked during the kmalloc, + * but that should be true --- we hold the journal lock + * still and the buffer is already on the BUF_JOURNAL + * list so won't be flushed. + * + * Subtle point, though: if this is a get_undo_access, + * then we will be relying on the frozen_data to contain + * the new value of the committed_data record after the + * transaction, so we HAVE to force the frozen_data copy + * in that case. 
*/ + + if (jh->b_jlist != BJ_Forget || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, + GFP_NOFS); + lock_journal(journal); + if (!frozen_buffer) { + printk(KERN_EMERG __FUNCTION__ + "OOM for frozen_buffer\n"); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + spin_lock(&journal_datalist_lock); + goto done_locked; + } + goto repeat; + } + + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + need_copy = 1; + } + jh->b_next_transaction = transaction; + } + + J_ASSERT(handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + + /* Finally, if the buffer is not journaled right now, we need to + * make sure it doesn't get written to disk before the caller + * actually commits the new data. */ + + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + jh->b_transaction = transaction; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __journal_file_buffer(jh, transaction, BJ_Reserved); + } + + done_locked: + spin_unlock(&journal_datalist_lock); + if (need_copy) { + struct page *page; + int offset; + char *source; + + J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh))); + page = jh2bh(jh)->b_page; + offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; + source = kmap(page); + memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); + kunmap(page); + } + + + /* If we are about to journal a buffer, then any revoke pending + on it is no longer valid. */ + journal_cancel_revoke(handle, jh); + + out_unlocked: + if (frozen_buffer) + kfree(frozen_buffer); + + JBUFFER_TRACE(jh, "exit"); + return error; + } + + int journal_get_write_access (handle_t *handle, struct buffer_head *bh) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = journal_add_journal_head(bh); + int rc; + + /* We do not want to get caught playing with fields which the + * log thread also manipulates. Make sure that the buffer + * completes any outstanding IO before proceeding. */ + lock_journal(journal); + rc = do_get_write_access(handle, jh, 0); + journal_unlock_journal_head(jh); + unlock_journal(journal); + return rc; + } + + + /* + * When the user wants to journal a newly created buffer_head + * (ie. getblk() returned a new buffer and we are going to populate it + * manually rather than reading off disk), then we need to keep the + * buffer_head locked until it has been completely filled with new + * data. In this case, we should be able to make the assertion that + * the bh is not already part of an existing transaction. + * + * The buffer should already be locked by the caller by this point. + * There is no lock ranking violation: it was a newly created, + * unlocked buffer beforehand. */ + + int journal_get_create_access (handle_t *handle, struct buffer_head *bh) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = journal_add_journal_head(bh); + int err; + + jbd_debug(5, "journal_head %p\n", jh); + lock_journal(journal); + err = -EROFS; + if (is_handle_aborted(handle)) + goto out; + err = 0; + + JBUFFER_TRACE(jh, "entry"); + /* The buffer may already belong to this transaction due to + * pre-zeroing in the filesystem's new_block code. 
It may also + * be on the previous, committing transaction's lists, but it + * HAS to be in Forget state in that case: the transaction must + * have deleted the buffer for it to be reused here. */ + J_ASSERT_JH(jh, (jh->b_transaction == transaction || + jh->b_transaction == NULL || + (jh->b_transaction == journal->j_committing_transaction && + jh->b_jlist == BJ_Forget))); + + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); + + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + + spin_lock(&journal_datalist_lock); + if (jh->b_transaction == NULL) { + jh->b_transaction = transaction; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __journal_file_buffer(jh, transaction, BJ_Reserved); + JBUFFER_TRACE(jh, "refile"); + refile_buffer(jh2bh(jh)); + } else if (jh->b_transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "set next transaction"); + jh->b_next_transaction = transaction; + } + spin_unlock(&journal_datalist_lock); + + /* + * akpm: I added this. ext3_alloc_branch can pick up new indirect + * blocks which contain freed but then revoked metadata. We need + * to cancel the revoke in case we end up freeing it yet again + * and the reallocating as data - this would cause a second revoke, + * which hits an assertion error. + */ + JBUFFER_TRACE(jh, "cancelling revoke"); + journal_cancel_revoke(handle, jh); + journal_unlock_journal_head(jh); + out: + unlock_journal(journal); + return err; + } + + + + /* + * journal_get_undo_access: Notify intent to modify metadata with non- + * rewindable consequences + * + * Sometimes there is a need to distinguish between metadata which has + * been committed to disk and that which has not. The ext3fs code uses + * this for freeing and allocating space: we have to make sure that we + * do not reuse freed space until the deallocation has been committed, + * since if we overwrote that space we would make the delete + * un-rewindable in case of a crash. + * + * To deal with that, journal_get_undo_access requests write access to a + * buffer for parts of non-rewindable operations such as delete + * operations on the bitmaps. The journaling code must keep a copy of + * the buffer's contents prior to the undo_access call until such time + * as we know that the buffer has definitely been committed to disk. + * + * We never need to know which transaction the committed data is part + * of: buffers touched here are guaranteed to be dirtied later and so + * will be committed to a new transaction in due course, at which point + * we can discard the old committed data pointer. + * + * Returns error number or 0 on success. + */ + + int journal_get_undo_access (handle_t *handle, struct buffer_head *bh) + { + journal_t *journal = handle->h_transaction->t_journal; + int err; + struct journal_head *jh = journal_add_journal_head(bh); + + JBUFFER_TRACE(jh, "entry"); + lock_journal(journal); + + /* Do this first --- it can drop the journal lock, so we want to + * make sure that obtaining the committed_data is done + * atomically wrt. completion of any outstanding commits. */ + err = do_get_write_access (handle, jh, 1); + if (err) + goto out; + + if (!jh->b_committed_data) { + /* Copy out the current buffer contents into the + * preserved, committed copy. 
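journal_get_undo_access(), whose body continues below, is what the bitmap-deallocation case in the comment above is about: the committed copy of the bitmap has to survive until the transaction that clears the bit commits, so that blocks freed in this transaction are not handed out again prematurely. A hypothetical caller (ext3's real block-freeing code is more involved):

/* Hypothetical caller sketching the bitmap-free case described above. */
#include <linux/jbd.h>

static int clear_bit_in_bitmap(handle_t *handle,
                               struct buffer_head *bitmap_bh, int bit)
{
        int err;

        /*
         * Preserve the committed copy first: until this transaction
         * commits, allocation decisions must consult b_committed_data so
         * the block freed here cannot be reused and overwritten before
         * the free is safely on disk.
         */
        err = journal_get_undo_access(handle, bitmap_bh);
        if (err)
                return err;

        clear_bit(bit, bitmap_bh->b_data);
        return journal_dirty_metadata(handle, bitmap_bh);
}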
*/ + JBUFFER_TRACE(jh, "generate b_committed data"); + jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size, + GFP_NOFS); + if (!jh->b_committed_data) { + printk(KERN_EMERG __FUNCTION__ + ": No memory for committed data!\n"); + err = -ENOMEM; + goto out; + } + + memcpy (jh->b_committed_data, jh2bh(jh)->b_data, + jh2bh(jh)->b_size); + } + + out: + if (!err) + J_ASSERT_JH(jh, jh->b_committed_data); + journal_unlock_journal_head(jh); + unlock_journal(journal); + return err; + } + + /* + * journal_dirty_data: mark a buffer as containing dirty data which + * needs to be flushed before we can commit the current transaction. + * + * The buffer is placed on the transaction's data list and is marked as + * belonging to the transaction. + * + * If `async' is set then the writebask will be initiated by the caller + * using submit_bh -> end_buffer_io_async. We put the buffer onto + * t_async_datalist. + * + * Returns error number or 0 on success. + * + * journal_dirty_data() can be called via page_launder->ext3_writepage + * by kswapd. So it cannot block. Happily, there's nothing here + * which needs lock_journal if `async' is set. + * + * When the buffer is on the current transaction we freely move it + * between BJ_AsyncData and BJ_SyncData according to who tried to + * change its state last. + */ + + int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async) + { + journal_t *journal = handle->h_transaction->t_journal; + int need_brelse = 0; + int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData; + struct journal_head *jh; + + if (is_handle_aborted(handle)) + return 0; + + jh = journal_add_journal_head(bh); + JBUFFER_TRACE(jh, "entry"); + + /* + * The buffer could *already* be dirty. Writeout can start + * at any time. + */ + jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); + + /* + * What if the buffer is already part of a running transaction? + * + * There are two cases: + * 1) It is part of the current running transaction. Refile it, + * just in case we have allocated it as metadata, deallocated + * it, then reallocated it as data. + * 2) It is part of the previous, still-committing transaction. + * If all we want to do is to guarantee that the buffer will be + * written to disk before this new transaction commits, then + * being sure that the *previous* transaction has this same + * property is sufficient for us! Just leave it on its old + * transaction. + * + * In case (2), the buffer must not already exist as metadata + * --- that would violate write ordering (a transaction is free + * to write its data at any point, even before the previous + * committing transaction has committed). The caller must + * never, ever allow this to happen: there's nothing we can do + * about it in this layer. + */ + spin_lock(&journal_datalist_lock); + if (jh->b_transaction) { + JBUFFER_TRACE(jh, "has transaction"); + if (jh->b_transaction != handle->h_transaction) { + JBUFFER_TRACE(jh, "belongs to older transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* @@@ IS THIS TRUE ? */ + /* + * Not any more. Scenario: someone does a write() + * in data=journal mode. The buffer's transaction has + * moved into commit. Then someone does another + * write() to the file. We do the frozen data copyout + * and set b_next_transaction to point to j_running_t. + * And while we're in that state, someone does a + * writepage() in an attempt to pageout the same area + * of the file via a shared mapping. 
At present that + * calls journal_dirty_data(), and we get right here. + * It may be too late to journal the data. Simply + * falling through to the next test will suffice: the + * data will be dirty and wil be checkpointed. The + * ordering comments in the next comment block still + * apply. + */ + //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + + /* + * If we're journalling data, and this buffer was + * subject to a write(), it could be metadata, forget + * or shadow against the committing transaction. Now, + * someone has dirtied the same darn page via a mapping + * and it is being writepage()'d. + * We *could* just steal the page from commit, with some + * fancy locking there. Instead, we just skip it - + * don't tie the page's buffers to the new transaction + * at all. + * Implication: if we crash before the writepage() data + * is written into the filesystem, recovery will replay + * the write() data. + */ + if (jh->b_jlist != BJ_None && + jh->b_jlist != BJ_SyncData && + jh->b_jlist != BJ_AsyncData) { + JBUFFER_TRACE(jh, "Not stealing"); + goto no_journal; + } + + /* + * This buffer may be undergoing writeout in commit. We + * can't return from here and let the caller dirty it + * again because that can cause the write-out loop in + * commit to never terminate. + */ + if (!async && buffer_dirty(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&journal_datalist_lock); + need_brelse = 1; + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + spin_lock(&journal_datalist_lock); + /* The buffer may become locked again at any + time if it is redirtied */ + } + + /* journal_clean_data_list() may have got there first */ + if (jh->b_transaction != NULL) { + JBUFFER_TRACE(jh, "unfile from commit"); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + } + /* The buffer will be refiled below */ + + } + /* + * Special case --- the buffer might actually have been + * allocated and then immediately deallocated in the previous, + * committing transaction, so might still be left on that + * transaction's metadata lists. + */ + if (jh->b_jlist != wanted_jlist) { + JBUFFER_TRACE(jh, "not on correct data list: unfile"); + J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + JBUFFER_TRACE(jh, "file as data"); + __journal_file_buffer(jh, handle->h_transaction, + wanted_jlist); + } + } else { + JBUFFER_TRACE(jh, "not on a transaction"); + __journal_file_buffer(jh, handle->h_transaction, wanted_jlist); + } + /* + * We need to mark the buffer dirty and refile it inside the lock to + * protect it from release by journal_try_to_free_buffer() + * + * We set ->b_flushtime to something small enough to typically keep + * kupdate away from the buffer. + * + * We don't need to do a balance_dirty() - __block_commit_write() + * does that. + */ + if (!async && !atomic_set_buffer_dirty(jh2bh(jh))) { + jh2bh(jh)->b_flushtime = + jiffies + journal->j_commit_interval + 1 * HZ; + refile_buffer(jh2bh(jh)); + } + no_journal: + spin_unlock(&journal_datalist_lock); + if (need_brelse) { + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + JBUFFER_TRACE(jh, "exit"); + journal_unlock_journal_head(jh); + return 0; + } + + /* + * journal_dirty_metadata: mark a buffer as containing dirty metadata + * which needs to be journaled as part of the current transaction. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. 
+ * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + * + * Returns error number or 0 on success. + */ + + int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = bh2jh(bh); + + jbd_debug(5, "journal_head %p\n", jh); + JBUFFER_TRACE(jh, "entry"); + lock_journal(journal); + if (is_handle_aborted(handle)) + goto out_unlock; + + spin_lock(&journal_datalist_lock); + set_bit(BH_JBDDirty, &bh->b_state); + set_buffer_flushtime(bh); + + J_ASSERT_JH(jh, jh->b_transaction != NULL); + + /* + * Metadata already on the current transaction list doesn't + * need to be filed. Metadata on another transaction's list must + * be committing, and will be refiled once the commit completes: + * leave it alone for now. + */ + + if (jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "already on other transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + /* And this case is illegal: we can't reuse another + * transaction's data buffer, ever. */ + /* FIXME: writepage() should be journalled */ + J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData); + goto done_locked; + } + + /* That test should have eliminated the following case: */ + J_ASSERT_JH(jh, jh->b_frozen_data == 0); + + JBUFFER_TRACE(jh, "file as BJ_Metadata"); + __journal_file_buffer(jh, handle->h_transaction, BJ_Metadata); + + done_locked: + spin_unlock(&journal_datalist_lock); + JBUFFER_TRACE(jh, "exit"); + out_unlock: + unlock_journal(journal); + return 0; + } + + #if 0 + /* + * journal_release_buffer: undo a get_write_access without any buffer + * updates, if the update decided in the end that it didn't need access. + * + * journal_get_write_access() can block, so it is quite possible for a + * journaling component to decide after the write access is returned + * that global state has changed and the update is no longer required. */ + + void journal_release_buffer (handle_t *handle, struct buffer_head *bh) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = bh2jh(bh); + + lock_journal(journal); + JBUFFER_TRACE(jh, "entry"); + + /* If the buffer is reserved but not modified by this + * transaction, then it is safe to release it. In all other + * cases, just leave the buffer as it is. */ + + spin_lock(&journal_datalist_lock); + if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction && + !buffer_jdirty(jh2bh(jh))) { + JBUFFER_TRACE(jh, "unused: refiling it"); + handle->h_buffer_credits++; + __journal_refile_buffer(jh); + } + spin_unlock(&journal_datalist_lock); + + JBUFFER_TRACE(jh, "exit"); + unlock_journal(journal); + } + #endif + + /* + * journal_forget: bforget() for potentially-journaled buffers. We can + * only do the bforget if there are no commits pending against the + * buffer. If the buffer is dirty in the current running transaction we + * can safely unlink it. + * + * bh may not be a journalled buffer at all - it may be a non-JBD + * buffer which came off the hashtable. Check for this. + * + * Decrements bh->b_count by one. 
+ * + * Allow this call even if the handle has aborted --- it may be part of + * the caller's cleanup after an abort. + */ + + void journal_forget (handle_t *handle, struct buffer_head *bh) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh; + + BUFFER_TRACE(bh, "entry"); + + lock_journal(journal); + spin_lock(&journal_datalist_lock); + + if (!buffer_jbd(bh)) + goto not_jbd; + jh = bh2jh(bh); + + if (jh->b_transaction == handle->h_transaction) { + J_ASSERT_JH(jh, !jh->b_frozen_data); + + /* If we are forgetting a buffer which is already part + * of this transaction, then we can just drop it from + * the transaction immediately. */ + clear_bit(BH_Dirty, &bh->b_state); + clear_bit(BH_JBDDirty, &bh->b_state); + + JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); + J_ASSERT_JH(jh, !jh->b_committed_data); + + __journal_unfile_buffer(jh); + jh->b_transaction = 0; + + /* + * We are no longer going to journal this buffer. + * However, the commit of this transaction is still + * important to the buffer: the delete that we are now + * processing might obsolete an old log entry, so by + * committing, we can satisfy the buffer's checkpoint. + * + * So, if we have a checkpoint on the buffer, we should + * now refile the buffer on our BJ_Forget list so that + * we know to remove the checkpoint after we commit. + */ + + if (jh->b_cp_transaction) { + __journal_file_buffer(jh, transaction, BJ_Forget); + } else { + __journal_remove_journal_head(bh); + __brelse(bh); + if (!buffer_jbd(bh)) { + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + __bforget(bh); + return; + } + } + + } else if (jh->b_transaction) { + J_ASSERT_JH(jh, (jh->b_transaction == + journal->j_committing_transaction)); + /* However, if the buffer is still owned by a prior + * (committing) transaction, we can't drop it yet... */ + JBUFFER_TRACE(jh, "belongs to older transaction"); + /* ... but we CAN drop it from the new transaction if we + * have also modified it since the original commit. */ + + if (jh->b_next_transaction) { + J_ASSERT(jh->b_next_transaction == transaction); + jh->b_next_transaction = NULL; + } + } + + not_jbd: + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + __brelse(bh); + return; + } + + #if 0 /* Unused */ + /* + * journal_sync_buffer: flush a potentially-journaled buffer to disk. + * + * Used for O_SYNC filesystem operations. If the buffer is journaled, + * we need to complete the O_SYNC by waiting for the transaction to + * complete. It is an error to call journal_sync_buffer before + * journal_stop! + */ + + void journal_sync_buffer(struct buffer_head *bh) + { + transaction_t *transaction; + journal_t *journal; + long sequence; + struct journal_head *jh; + + /* If the buffer isn't journaled, this is easy: just sync it to + * disk. */ + BUFFER_TRACE(bh, "entry"); + + spin_lock(&journal_datalist_lock); + if (!buffer_jbd(bh)) { + spin_unlock(&journal_datalist_lock); + return; + } + jh = bh2jh(bh); + if (jh->b_transaction == NULL) { + /* If the buffer has already been journaled, then this + * is a noop. */ + if (jh->b_cp_transaction == NULL) { + spin_unlock(&journal_datalist_lock); + return; + } + atomic_inc(&bh->b_count); + spin_unlock(&journal_datalist_lock); + ll_rw_block (WRITE, 1, &bh); + wait_on_buffer(bh); + __brelse(bh); + goto out; + } + + /* Otherwise, just wait until the transaction is synced to disk. 
*/ + transaction = jh->b_transaction; + journal = transaction->t_journal; + sequence = transaction->t_tid; + spin_unlock(&journal_datalist_lock); + + jbd_debug(2, "requesting commit for jh %p\n", jh); + log_start_commit (journal, transaction); + + while (tid_gt(sequence, journal->j_commit_sequence)) { + wake_up(&journal->j_wait_done_commit); + sleep_on(&journal->j_wait_done_commit); + } + JBUFFER_TRACE(jh, "exit"); + out: + return; + } + #endif + + /* + * All done for a particular handle. + * + * There is not much action needed here. We just return any remaining + * buffer credits to the transaction and remove the handle. The only + * complication is that we need to start a commit operation if the + * filesystem is marked for synchronous update. + * + * journal_stop itself will not usually return an error, but it may + * do so in unusual circumstances. In particular, expect it to + * return -EIO if a journal_abort has been executed since the + * transaction began. + */ + + int journal_stop(handle_t *handle) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int old_handle_count, err; + + if (!handle) + return 0; + + J_ASSERT (transaction->t_updates > 0); + J_ASSERT (journal_current_handle() == handle); + + if (is_handle_aborted(handle)) + err = -EIO; + else + err = 0; + + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } + + jbd_debug(4, "Handle %p going down\n", handle); + + /* + * Implement synchronous transaction batching. If the handle + * was synchronous, don't force a commit immediately. Let's + * yield and let another thread piggyback onto this transaction. + * Keep doing that while new threads continue to arrive. + * It doesn't cost much - we're about to run a commit and sleep + * on IO anyway. Speeds up many-threaded, many-dir operations + * by 30x or more... + */ + if (handle->h_sync) { + do { + old_handle_count = transaction->t_handle_count; + set_current_state(TASK_RUNNING); + current->policy |= SCHED_YIELD; + schedule(); + } while (old_handle_count != transaction->t_handle_count); + } + + current->journal_info = NULL; + transaction->t_outstanding_credits -= handle->h_buffer_credits; + transaction->t_updates--; + if (!transaction->t_updates) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + /* + * If the handle is marked SYNC, we need to set another commit + * going! We also want to force a commit if the current + * transaction is occupying too much of the log, or if the + * transaction is too old now. + */ + if (handle->h_sync || + transaction->t_outstanding_credits > + journal->j_max_transaction_buffers || + time_after_eq(jiffies, transaction->t_expires)) { + /* Do this even for aborted journals: an abort still + * completes the commit thread, it just doesn't write + * anything to disk. */ + tid_t tid = transaction->t_tid; + + jbd_debug(2, "transaction too old, requesting commit for " + "handle %p\n", handle); + /* This is non-blocking */ + log_start_commit(journal, transaction); + + /* + * Special case: JFS_SYNC synchronous updates require us + * to wait for the commit to complete. + */ + if (handle->h_sync && !(current->flags & PF_MEMALLOC)) + log_wait_commit(journal, tid); + } + kfree(handle); + return err; + } + + /* + * For synchronous operations: force any uncommitted trasnactions + * to disk. 
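journal_stop() closes the handle opened by journal_start(); together they bracket every journaled operation. A minimal lifecycle sketch, modelled on journal_force_commit() below (the helper name and the single-credit reservation are illustrative):

    /*
     * Sketch only: the usual shape of a journaled update.
     * journal_start() reserves buffer credits against the running
     * transaction; journal_stop() returns unused credits and may kick
     * a commit (h_sync set, log nearly full, or transaction expired).
     */
    static int example_journaled_update(journal_t *journal,
                                        struct buffer_head *bh)
    {
            handle_t *handle;
            int err, err2;

            handle = journal_start(journal, 1);     /* one buffer credit */
            if (IS_ERR(handle))
                    return PTR_ERR(handle);

            err = journal_get_write_access(handle, bh);
            if (!err) {
                    /* ... modify bh->b_data ... */
                    err = journal_dirty_metadata(handle, bh);
            }

            /* For synchronous semantics set handle->h_sync = 1 here,
             * and journal_stop() will wait for the commit. */
            err2 = journal_stop(handle);
            return err ? err : err2;
    }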
May seem kludgy, but it reuses all the handle batching + * code in a very simple manner. + */ + int journal_force_commit(journal_t *journal) + { + handle_t *handle; + int ret = 0; + + lock_kernel(); + handle = journal_start(journal, 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + handle->h_sync = 1; + journal_stop(handle); + out: + unlock_kernel(); + return ret; + } + + /* + * + * List management code snippets: various functions for manipulating the + * transaction buffer lists. + * + */ + + /* + * Append a buffer to a transaction list, given the transaction's list head + * pointer. + * journal_datalist_lock is held. + */ + + static inline void + __blist_add_buffer(struct journal_head **list, struct journal_head *jh) + { + if (!*list) { + jh->b_tnext = jh->b_tprev = jh; + *list = jh; + } else { + /* Insert at the tail of the list to preserve order */ + struct journal_head *first = *list, *last = first->b_tprev; + jh->b_tprev = last; + jh->b_tnext = first; + last->b_tnext = first->b_tprev = jh; + } + } + + /* + * Remove a buffer from a transaction list, given the transaction's list + * head pointer. + * + * Called with journal_datalist_lock held, and the journal may not + * be locked. + */ + + static inline void + __blist_del_buffer(struct journal_head **list, struct journal_head *jh) + { + if (*list == jh) { + *list = jh->b_tnext; + if (*list == jh) + *list = 0; + } + jh->b_tprev->b_tnext = jh->b_tnext; + jh->b_tnext->b_tprev = jh->b_tprev; + } + + /* + * Remove a buffer from the appropriate transaction list. + * + * Note that this function can *change* the value of + * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget, + * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller + * is holding onto a copy of one of thee pointers, it could go bad. + * Generally the caller needs to re-read the pointer from the transaction_t. + * + * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called + * via journal_try_to_free_buffer() or journal_clean_data_list(). In that + * case, journal_datalist_lock will be held, and the journal may not be locked. 
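The __blist_*() helpers above keep each transaction list as a circular, doubly linked ring threaded through b_tnext/b_tprev, with the list head pointing at the oldest element. The same algorithm can be exercised in isolation; this is a small user-space mirror (types and names changed, purely illustrative):

    /* Stand-alone mirror of the b_tnext/b_tprev ring; build with
     * "cc -o blist blist.c" and run it to check the invariants. */
    #include <assert.h>
    #include <stddef.h>

    struct node { struct node *tnext, *tprev; };

    static void ring_add(struct node **list, struct node *n)
    {
            if (!*list) {
                    n->tnext = n->tprev = n;        /* singleton ring */
                    *list = n;
            } else {                /* insert at tail to preserve order */
                    struct node *first = *list, *last = first->tprev;
                    n->tprev = last;
                    n->tnext = first;
                    last->tnext = first->tprev = n;
            }
    }

    static void ring_del(struct node **list, struct node *n)
    {
            if (*list == n) {
                    *list = n->tnext;
                    if (*list == n)
                            *list = NULL;           /* last element gone */
            }
            n->tprev->tnext = n->tnext;
            n->tnext->tprev = n->tprev;
    }

    int main(void)
    {
            struct node a, b, *head = NULL;

            ring_add(&head, &a);
            ring_add(&head, &b);
            assert(head == &a && a.tnext == &b && b.tnext == &a);
            ring_del(&head, &a);
            assert(head == &b && b.tnext == &b && b.tprev == &b);
            ring_del(&head, &b);
            assert(head == NULL);
            return 0;
    }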
+ */ + void __journal_unfile_buffer(struct journal_head *jh) + { + struct journal_head **list = 0; + transaction_t * transaction; + + assert_spin_locked(&journal_datalist_lock); + transaction = jh->b_transaction; + + #ifdef __SMP__ + J_ASSERT (current->lock_depth >= 0); + #endif + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + + if (jh->b_jlist != BJ_None) + J_ASSERT_JH(jh, transaction != 0); + + switch (jh->b_jlist) { + case BJ_None: + return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; + case BJ_AsyncData: + list = &transaction->t_async_datalist; + break; + case BJ_Metadata: + transaction->t_nr_buffers--; + J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_del_buffer(list, jh); + jh->b_jlist = BJ_None; + if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) { + set_bit(BH_Dirty, &jh2bh(jh)->b_state); + } + } + + void journal_unfile_buffer(struct journal_head *jh) + { + spin_lock(&journal_datalist_lock); + __journal_unfile_buffer(jh); + spin_unlock(&journal_datalist_lock); + } + + /* + * Called from journal_try_to_free_buffers(). The journal is not + * locked. lru_list_lock is not held. + * + * Here we see why journal_datalist_lock is global and not per-journal. + * We cannot get back to this buffer's journal pointer without locking + * out journal_clean_data_list() in some manner. + * + * One could use journal_datalist_lock to get unracy access to a + * per-journal lock. + * + * Called with journal_datalist_lock held. + * + * Returns non-zero iff we were able to free the journal_head. + */ + static int __journal_try_to_free_buffer(struct buffer_head *bh, + int *locked_or_dirty) + { + struct journal_head *jh; + + assert_spin_locked(&journal_datalist_lock); + + if (!buffer_jbd(bh)) + return 1; + jh = bh2jh(bh); + + if (buffer_locked(bh) || buffer_dirty(bh)) { + *locked_or_dirty = 1; + goto out; + } + + if (!buffer_uptodate(bh)) + goto out; + + if (jh->b_next_transaction != 0) + goto out; + + if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) { + if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) { + /* A written-back ordered data buffer */ + JBUFFER_TRACE(jh, "release data"); + __journal_unfile_buffer(jh); + jh->b_transaction = 0; + __journal_remove_journal_head(bh); + __brelse(bh); + } + } + else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { + /* written-back checkpointed metadata buffer */ + if (jh->b_jlist == BJ_None) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __journal_remove_checkpoint(jh); + __journal_remove_journal_head(bh); + __brelse(bh); + } + } + return !buffer_jbd(bh); + + out: + return 0; + } + + /* + * journal_try_to_free_buffers(). For all the buffers on this page, + * if they are fully written out ordered data, move them onto BUF_CLEAN + * so try_to_free_buffers() can reap them. Called with lru_list_lock + * not held. Does its own locking. + * + * This complicates JBD locking somewhat. We aren't protected by the + * BKL here. We wish to remove the buffer from its committing or + * running transaction's ->t_datalist via __journal_unfile_buffer. 
+ * + * This may *change* the value of transaction_t->t_datalist, so anyone + * who looks at t_datalist needs to lock against this function. + * + * Even worse, someone may be doing a journal_dirty_data on this + * buffer. So we need to lock against that. journal_dirty_data() + * will come out of the lock with the buffer dirty, which makes it + * ineligible for release here. + * + * Who else is affected by this? hmm... Really the only contender + * is do_get_write_access() - it could be looking at the buffer while + * journal_try_to_free_buffer() is changing its state. But that + * cannot happen because we never reallocate freed data as metadata + * while the data is part of a transaction. Yes? + * + * This function returns non-zero if we wish try_to_free_buffers() + * to be called. We do this is the page is releasable by try_to_free_buffers(). + * We also do it if the page has locked or dirty buffers and the caller wants + * us to perform sync or async writeout. + */ + int journal_try_to_free_buffers(journal_t *journal, + struct page *page, int gfp_mask) + { + struct buffer_head *bh; + struct buffer_head *tmp; + int locked_or_dirty = 0; + int call_ttfb = 1; + + J_ASSERT(PageLocked(page)); + + bh = page->buffers; + tmp = bh; + spin_lock(&journal_datalist_lock); + do { + struct buffer_head *p = tmp; + + tmp = tmp->b_this_page; + if (buffer_jbd(p)) + if (!__journal_try_to_free_buffer(p, &locked_or_dirty)) + call_ttfb = 0; + } while (tmp != bh); + spin_unlock(&journal_datalist_lock); + + if (!(gfp_mask & (__GFP_IO|__GFP_WAIT))) + goto out; + if (!locked_or_dirty) + goto out; + /* + * The VM wants us to do writeout, or to block on IO, or both. + * So we allow try_to_free_buffers to be called even if the page + * still has journalled buffers. + */ + call_ttfb = 1; + out: + return call_ttfb; + } + + /* + * This buffer is no longer needed. If it is on an older transaction's + * checkpoint list we need to record it on this transaction's forget list + * to pin this buffer (and hence its checkpointing transaction) down until + * this transaction commits. If the buffer isn't on a checkpoint list, we + * release it. + * Returns non-zero if JBD no longer has an interest in the buffer. + */ + static int dispose_buffer(struct journal_head *jh, + transaction_t *transaction) + { + int may_free = 1; + struct buffer_head *bh = jh2bh(jh); + + spin_lock(&journal_datalist_lock); + __journal_unfile_buffer(jh); + jh->b_transaction = 0; + + if (jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "on running+cp transaction"); + __journal_file_buffer(jh, transaction, BJ_Forget); + clear_bit(BH_JBDDirty, &bh->b_state); + may_free = 0; + } else { + JBUFFER_TRACE(jh, "on running transaction"); + __journal_remove_journal_head(bh); + __brelse(bh); + } + spin_unlock(&journal_datalist_lock); + return may_free; + } + + /* + * journal_flushpage + * + * This code is tricky. It has a number of cases to deal with. + * + * There are two invariants which this code relies on: + * + * i_size must be updated on disk before we start calling flushpage on the + * data. + * + * This is done in ext3 by defining an ext3_setattr method which + * updates i_size before truncate gets going. By maintaining this + * invariant, we can be sure that it is safe to throw away any buffers + * attached to the current transaction: once the transaction commits, + * we know that the data will not be needed. + * + * Note however that we can *not* throw away data belonging to the + * previous, committing transaction! 
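journal_try_to_free_buffers() only reports whether try_to_free_buffers() is worth calling; it never frees the page itself. A hedged sketch of how a release path might consume the return value (hypothetical caller, not code from this patch):

    /*
     * Sketch only: non-zero from journal_try_to_free_buffers() means
     * "let try_to_free_buffers() have a go", either because every
     * buffer is now free of the journal or because the caller asked
     * for writeout/blocking via gfp_mask.
     */
    static int example_release_page(journal_t *journal, struct page *page,
                                    int gfp_mask)
    {
            if (!journal_try_to_free_buffers(journal, page, gfp_mask))
                    return 0;       /* journal still owns some buffers */
            return try_to_free_buffers(page, gfp_mask);
    }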
+ * + * Any disk blocks which *are* part of the previous, committing + * transaction (and which therefore cannot be discarded immediately) are + * not going to be reused in the new running transaction + * + * The bitmap committed_data images guarantee this: any block which is + * allocated in one transaction and removed in the next will be marked + * as in-use in the committed_data bitmap, so cannot be reused until + * the next transaction to delete the block commits. This means that + * leaving committing buffers dirty is quite safe: the disk blocks + * cannot be reallocated to a different file and so buffer aliasing is + * not possible. + * + * + * The above applies mainly to ordered data mode. In writeback mode we + * don't make guarantees about the order in which data hits disk --- in + * particular we don't guarantee that new dirty data is flushed before + * transaction commit --- so it is always safe just to discard data + * immediately in that mode. --sct + */ + + /* + * The journal_unmap_buffer helper function returns zero if the buffer + * concerned remains pinned as an anonymous buffer belonging to an older + * transaction. + * + * We're outside-transaction here. Either or both of j_running_transaction + * and j_committing_transaction may be NULL. + */ + static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) + { + transaction_t *transaction; + struct journal_head *jh; + int may_free = 1; + + BUFFER_TRACE(bh, "entry"); + + if (!buffer_mapped(bh)) + return 1; + + /* It is safe to proceed here without the + * journal_datalist_spinlock because the buffers cannot be + * stolen by try_to_free_buffers as long as we are holding the + * page lock. --sct */ + + if (!buffer_jbd(bh)) + goto zap_buffer; + + jh = bh2jh(bh); + transaction = jh->b_transaction; + if (transaction == NULL) { + /* First case: not on any transaction. If it + * has no checkpoint link, then we can zap it: + * it's a writeback-mode buffer so we don't care + * if it hits disk safely. */ + if (!jh->b_cp_transaction) { + JBUFFER_TRACE(jh, "not on any transaction: zap"); + goto zap_buffer; + } + + if (!buffer_dirty(bh)) { + /* bdflush has written it. We can drop it now */ + goto zap_buffer; + } + + /* OK, it must be in the journal but still not + * written fully to disk: it's metadata or + * journaled data... */ + + if (journal->j_running_transaction) { + /* ... and once the current transaction has + * committed, the buffer won't be needed any + * longer. */ + JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget"); + return dispose_buffer(jh, + journal->j_running_transaction); + } else { + /* There is no currently-running transaction. So the + * orphan record which we wrote for this file must have + * passed into commit. We must attach this buffer to + * the committing transaction, if it exists. */ + if (journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "give to committing trans"); + return dispose_buffer(jh, + journal->j_committing_transaction); + } else { + /* The orphan record's transaction has + * committed. We can cleanse this buffer */ + clear_bit(BH_JBDDirty, &bh->b_state); + goto zap_buffer; + } + } + } else if (transaction == journal->j_committing_transaction) { + /* If it is committing, we simply cannot touch it. We + * can remove it's next_transaction pointer from the + * running transaction if that is set, but nothing + * else. 
*/ + JBUFFER_TRACE(jh, "on committing transaction"); + if (jh->b_next_transaction) { + J_ASSERT(jh->b_next_transaction == + journal->j_running_transaction); + jh->b_next_transaction = NULL; + } + return 0; + } else { + /* Good, the buffer belongs to the running transaction. + * We are writing our own transaction's data, not any + * previous one's, so it is safe to throw it away + * (remember that we expect the filesystem to have set + * i_size already for this truncate so recovery will not + * expose the disk blocks we are discarding here.) */ + J_ASSERT_JH(jh, transaction == journal->j_running_transaction); + may_free = dispose_buffer(jh, transaction); + } + + zap_buffer: + if (buffer_dirty(bh)) + mark_buffer_clean(bh); + J_ASSERT_BH(bh, !buffer_jdirty(bh)); + clear_bit(BH_Uptodate, &bh->b_state); + clear_bit(BH_Mapped, &bh->b_state); + clear_bit(BH_Req, &bh->b_state); + clear_bit(BH_New, &bh->b_state); + return may_free; + } + + /* + * Return non-zero if the page's buffers were successfully reaped + */ + int journal_flushpage(journal_t *journal, + struct page *page, + unsigned long offset) + { + struct buffer_head *head, *bh, *next; + unsigned int curr_off = 0; + int may_free = 1; + + if (!PageLocked(page)) + BUG(); + if (!page->buffers) + return 1; + + /* We will potentially be playing with lists other than just the + * data lists (especially for journaled data mode), so be + * cautious in our locking. */ + lock_journal(journal); + + head = bh = page->buffers; + do { + unsigned int next_off = curr_off + bh->b_size; + next = bh->b_this_page; + + /* AKPM: doing lock_buffer here may be overly paranoid */ + if (offset <= curr_off) { + /* This block is wholly outside the truncation point */ + lock_buffer(bh); + may_free &= journal_unmap_buffer(journal, bh); + unlock_buffer(bh); + } + curr_off = next_off; + bh = next; + + } while (bh != head); + + unlock_journal(journal); + + if (!offset) { + if (!may_free || !try_to_free_buffers(page, 0)) { + atomic_inc(&buffermem_pages); + return 0; + } + J_ASSERT(page->buffers == NULL); + } + + return 1; + } + + + + /* + * File a buffer on the given transaction list. 
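journal_flushpage() is the truncate-side entry point, and the i_size ordering invariant described above is what makes its handling of running-transaction data safe. A hedged sketch of that ordering (hypothetical helper, not the patch's ext3_setattr; the inode is assumed to be journaled within the caller's handle):

    /*
     * Sketch only: the shrunken i_size must reach the journal before
     * any pages beyond it are thrown away, so a crash between the two
     * steps can never replay stale blocks past the new end of file.
     */
    static void example_truncate_order(struct inode *inode, loff_t new_size)
    {
            inode->i_size = new_size;
            /* 1: journal the inode with its new size (filesystem
             *    specific "dirty inode in handle" step). */
            /* 2: only now drop the page cache past new_size; for a
             *    journaled mapping this ends up in journal_flushpage(). */
            vmtruncate(inode, new_size);
    }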
+ */ + + void __journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) + { + struct journal_head **list = 0; + + assert_spin_locked(&journal_datalist_lock); + + #ifdef __SMP__ + J_ASSERT (current->lock_depth >= 0); + #endif + J_ASSERT_JH(jh, jh->b_jlist < BJ_Types); + J_ASSERT_JH(jh, jh->b_transaction == transaction || + jh->b_transaction == 0); + + if (jh->b_transaction) { + if (jh->b_jlist == jlist) + return; + __journal_unfile_buffer(jh); + } else { + jh->b_transaction = transaction; + } + + switch (jlist) { + case BJ_None: + J_ASSERT_JH(jh, !jh->b_committed_data); + J_ASSERT_JH(jh, !jh->b_frozen_data); + return; + case BJ_SyncData: + list = &transaction->t_sync_datalist; + break; + case BJ_AsyncData: + list = &transaction->t_async_datalist; + break; + case BJ_Metadata: + transaction->t_nr_buffers++; + list = &transaction->t_buffers; + break; + case BJ_Forget: + list = &transaction->t_forget; + break; + case BJ_IO: + list = &transaction->t_iobuf_list; + break; + case BJ_Shadow: + list = &transaction->t_shadow_list; + break; + case BJ_LogCtl: + list = &transaction->t_log_list; + break; + case BJ_Reserved: + list = &transaction->t_reserved_list; + break; + } + + __blist_add_buffer(list, jh); + jh->b_jlist = jlist; + + if (jlist == BJ_Metadata || jlist == BJ_Reserved || + jlist == BJ_Shadow || jlist == BJ_Forget) { + if (atomic_set_buffer_clean(jh2bh(jh))) { + set_bit(BH_JBDDirty, &jh2bh(jh)->b_state); + } + } + } + + void journal_file_buffer(struct journal_head *jh, + transaction_t *transaction, int jlist) + { + spin_lock(&journal_datalist_lock); + __journal_file_buffer(jh, transaction, jlist); + spin_unlock(&journal_datalist_lock); + } + + /* + * Remove a buffer from its current buffer list in preparation for + * dropping it from its current transaction entirely. If the buffer has + * already started to be used by a subsequent transaction, refile the + * buffer on that transaction's metadata list. + */ + + void __journal_refile_buffer(struct journal_head *jh) + { + assert_spin_locked(&journal_datalist_lock); + #ifdef __SMP__ + J_ASSERT_JH(jh, current->lock_depth >= 0); + #endif + __journal_unfile_buffer(jh); + + /* If the buffer is now unused, just drop it. If it has been + modified by a later transaction, add it to the new + transaction's metadata list. */ + + jh->b_transaction = jh->b_next_transaction; + jh->b_next_transaction = NULL; + + if (jh->b_transaction != NULL) { + __journal_file_buffer(jh, jh->b_transaction, BJ_Metadata); + J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); + } else { + /* Onto BUF_DIRTY for writeback */ + refile_buffer(jh2bh(jh)); + } + } + + /* + * For the unlocked version of this call, also make sure that any + * hanging journal_head is cleaned up if necessary. + * + * __journal_refile_buffer is usually called as part of a single locked + * operation on a buffer_head, in which the caller is probably going to + * be hooking the journal_head onto other lists. In that case it is up + * to the caller to remove the journal_head if necessary. For the + * unlocked journal_refile_buffer call, the caller isn't going to be + * doing anything else to the buffer so we need to do the cleanup + * ourselves to avoid a jh leak. + * + * *** The journal_head may be freed by this call! 
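Filing a buffer on one of the journaled lists also trades the VM's dirty bit for BH_JBDDirty, and unfiling trades it back; that is what keeps kupdate/bdflush from writing journaled metadata behind JBD's back. The handoff, written out as stand-alone helpers (illustrative only, mirroring __journal_file_buffer() and __journal_unfile_buffer() above):

    /* Sketch of the dirty-bit handoff, not part of the patch. */
    static void example_hand_dirt_to_jbd(struct buffer_head *bh)
    {
            /* Filing on BJ_Metadata/BJ_Shadow/BJ_Forget/BJ_Reserved: */
            if (atomic_set_buffer_clean(bh))        /* was BH_Dirty */
                    set_bit(BH_JBDDirty, &bh->b_state);
    }

    static void example_hand_dirt_back_to_vm(struct buffer_head *bh)
    {
            /* Unfiling (back to BJ_None): */
            if (test_and_clear_bit(BH_JBDDirty, &bh->b_state))
                    set_bit(BH_Dirty, &bh->b_state);
    }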
*** + */ + void journal_refile_buffer(struct journal_head *jh) + { + struct buffer_head *bh; + + spin_lock(&journal_datalist_lock); + bh = jh2bh(jh); + + __journal_refile_buffer(jh); + __journal_remove_journal_head(bh); + + spin_unlock(&journal_datalist_lock); + __brelse(bh); + } diff -rc2P linux/fs/jbd-kernel.c linux-2.4.13/fs/jbd-kernel.c *** linux/fs/jbd-kernel.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd-kernel.c Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,336 ---- + /* + * fs/jbd-kernel.c + * + * Support code for the Journalling Block Device layer. + * This file contains things which have to be in-kernel when + * JBD is a module. + * + * 15 May 2001 Andrew Morton + * Created + */ + + #include + #include + #include + #include + #include + + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + + /* + * jh_splice_lock needs explantion. + * + * In a number of places we want to do things like: + * + * if (buffer_jbd(bh) && bh2jh(bh)->foo) + * + * This is racy on SMP, because another CPU could remove the journal_head + * in the middle of this expression. We need locking. + * + * But we can greatly optimise the locking cost by testing BH_JBD + * outside the lock. So, effectively: + * + * ret = 0; + * if (buffer_jbd(bh)) { + * spin_lock(&jh_splice_lock); + * if (buffer_jbd(bh)) { (* Still there? *) + * ret = bh2jh(bh)->foo; + * } + * spin_unlock(&jh_splice_lock); + * } + * return ret; + * + * Now, that protects us from races where another CPU can remove the + * journal_head. But it doesn't defend us from the situation where another + * CPU can *add* a journal_head. This is a correctness issue. But it's not + * a problem because a) the calling code was *already* racy and b) it often + * can't happen at the call site and c) the places where we add journal_heads + * tend to be under external locking. + */ + spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED; + EXPORT_SYMBOL(jh_splice_lock); + + #ifdef CONFIG_JBD_DEBUG + /* + * Some sanity testing which is called from mark_buffer_clean(), + * and must be present in the main kernel. + */ + + void jbd_preclean_buffer_check(struct buffer_head *bh) + { + if (buffer_jbd(bh)) { + struct journal_head *jh = bh2jh(bh); + + transaction_t *transaction = jh->b_transaction; + journal_t *journal; + + if (jh->b_jlist == 0 && transaction == NULL) + return; + + J_ASSERT_JH(jh, (jh->b_jlist == 0 || + jh->b_jlist == BJ_LogCtl || + jh->b_jlist == BJ_IO || + jh->b_jlist == BJ_Forget || + buffer_jbd_data(bh))); + J_ASSERT_JH(jh, transaction != NULL); + /* The kernel may be unmapping old data. We expect it + * to be dirty in that case, unless the buffer has + * already been forgotten by a transaction. 
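The jh_splice_lock pattern described above, made concrete as a helper (illustrative; reading b_jlist stands in for the comment's "foo" field):

    /*
     * Sketch only: cheap unlocked BH_JBD test first, then re-test
     * under jh_splice_lock before touching the journal_head, so a
     * concurrent journal_head removal cannot be raced against.
     */
    static int example_read_jh_jlist(struct buffer_head *bh)
    {
            int ret = 0;

            if (buffer_jbd(bh)) {                   /* unlocked fast path */
                    spin_lock(&jh_splice_lock);
                    if (buffer_jbd(bh))             /* still there? */
                            ret = bh2jh(bh)->b_jlist;
                    spin_unlock(&jh_splice_lock);
            }
            return ret;
    }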
*/ + if (jh->b_jlist != BJ_Forget) { + #if 1 + if (!buffer_dirty(bh)) { + printk(__FUNCTION__": clean of clean buffer\n"); + print_buffer_trace(bh); + return; + } + #endif + J_ASSERT_BH(bh, buffer_dirty(bh)); + if (!buffer_jbd_data(bh)) { + J_ASSERT_JH(jh, + test_bit(BH_JWrite, + &jh2bh(jh)->b_state)); + } + } + + journal = transaction->t_journal; + J_ASSERT_JH(jh, + transaction == journal->j_running_transaction || + transaction == journal->j_committing_transaction); + } + } + EXPORT_SYMBOL(jbd_preclean_buffer_check); + #endif /* CONFIG_JBD_DEBUG */ + + /* + * Entries in /proc/sys/fs + */ + + int journal_oom_retry = 1; + EXPORT_SYMBOL(journal_oom_retry); + #if defined(CONFIG_JBD_DEBUG) + int journal_enable_debug; + int journal_no_write[2]; + EXPORT_SYMBOL(journal_enable_debug); + EXPORT_SYMBOL(journal_no_write); + #endif + + #endif /* defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) */ + + /* + * Support functions for BUFFER_TRACE() + */ + #ifdef CONFIG_BUFFER_DEBUG + + static spinlock_t trace_lock = SPIN_LOCK_UNLOCKED; + + void buffer_trace(struct buffer_head *dest, + struct buffer_head *src, char *info) + { + struct buffer_history_item *bhist_i; + unsigned long flags; + + if (dest == 0 || src == 0) + return; + + spin_lock_irqsave(&trace_lock, flags); + + /* + * Sometimes we don't initialise the ring pointers. (locally declared + * temp buffer_heads). Feebly attempt to detect and correct that here. + */ + if ((dest->b_history.b_history_head - dest->b_history.b_history_tail > + BUFFER_HISTORY_SIZE)) { + dest->b_history.b_history_head = 0; + dest->b_history.b_history_tail = 0; + } + bhist_i = dest->b_history.b + + (dest->b_history.b_history_head & (BUFFER_HISTORY_SIZE - 1)); + bhist_i->info = info; + bhist_i->b_state = src->b_state; + bhist_i->b_list = src->b_list; + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + bhist_i->b_trans_is_running = 0; + bhist_i->b_trans_is_committing = 0; + bhist_i->b_blocknr = src->b_blocknr; + if (buffer_jbd(src)) { + struct journal_head *jh; + journal_t *journal; + transaction_t *transaction; + + /* Footwork to avoid racing with journal_remove_journal_head */ + jh = src->b_private; + if (jh == 0) + goto raced; + transaction = jh->b_transaction; + if (src->b_private == 0) + goto raced; + bhist_i->b_jcount = jh->b_jcount; + bhist_i->b_jbd = 1; + bhist_i->b_jlist = jh->b_jlist; + bhist_i->b_frozen_data = jh->b_frozen_data; + bhist_i->b_committed_data = jh->b_committed_data; + bhist_i->b_transaction = !!jh->b_transaction; + bhist_i->b_next_transaction = !!jh->b_next_transaction; + bhist_i->b_cp_transaction = !!jh->b_cp_transaction; + + if (transaction) { + journal = transaction->t_journal; + bhist_i->b_trans_is_running = transaction == + journal->j_running_transaction; + bhist_i->b_trans_is_committing = transaction == + journal->j_committing_transaction; + } + } else { + raced: + bhist_i->b_jcount = 0; + bhist_i->b_jbd = 0; + bhist_i->b_jlist = 0; + bhist_i->b_frozen_data = 0; + bhist_i->b_committed_data = 0; + bhist_i->b_transaction = 0; + bhist_i->b_next_transaction = 0; + bhist_i->b_cp_transaction = 0; + } + #endif /* defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) */ + + bhist_i->on_lru = (src->b_prev_free != 0 && src->b_next_free != 0); + bhist_i->on_hash = (src->b_pprev != 0); + bhist_i->cpu = smp_processor_id(); + bhist_i->b_count = atomic_read(&src->b_count); + + dest->b_history.b_history_head++; + if (dest->b_history.b_history_head - dest->b_history.b_history_tail > + BUFFER_HISTORY_SIZE) + dest->b_history.b_history_tail = + 
dest->b_history.b_history_head - BUFFER_HISTORY_SIZE; + + spin_unlock_irqrestore(&trace_lock, flags); + } + + static const char *b_list_to_string(unsigned int b_list) + { + switch (b_list) { + case BUF_CLEAN: return "BUF_CLEAN"; + case BUF_LOCKED: return "BUF_LOCKED"; + case BUF_DIRTY: return "BUF_DIRTY"; + default: return "Bad b_list"; + } + } + + static const char *b_jlist_to_string(unsigned int b_list) + { + switch (b_list) { + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + case BJ_None: return "BJ_None"; + case BJ_SyncData: return "BJ_SyncData"; + case BJ_AsyncData: return "BJ_AsyncData"; + case BJ_Metadata: return "BJ_Metadata"; + case BJ_Forget: return "BJ_Forget"; + case BJ_IO: return "BJ_IO"; + case BJ_Shadow: return "BJ_Shadow"; + case BJ_LogCtl: return "BJ_LogCtl"; + case BJ_Reserved: return "BJ_Reserved"; + #endif + default: return "Bad b_jlist"; + } + } + + static void print_one_hist(struct buffer_history_item *bhist_i) + { + printk(" %s\n", bhist_i->info); + printk(" b_state:0x%lx b_list:%s b_jlist:%s on_lru:%d\n", + bhist_i->b_state, + b_list_to_string(bhist_i->b_list), + b_jlist_to_string(bhist_i->b_jlist), + bhist_i->on_lru); + printk(" cpu:%d on_hash:%d b_count:%d b_blocknr:%lu\n", + bhist_i->cpu, + bhist_i->on_hash, + bhist_i->b_count, + bhist_i->b_blocknr); + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + printk(" b_jbd:%u b_frozen_data:%p b_committed_data:%p\n", + bhist_i->b_jbd, + bhist_i->b_frozen_data, + bhist_i->b_committed_data); + printk(" b_transaction:%u b_next_transaction:%u " + "b_cp_transaction:%u b_trans_is_running:%u\n", + bhist_i->b_transaction, + bhist_i->b_next_transaction, + bhist_i->b_cp_transaction, + bhist_i->b_trans_is_running); + printk(" b_trans_is_comitting:%u b_jcount:%u ", + bhist_i->b_trans_is_committing, + bhist_i->b_jcount); + #endif + printk("\n"); + } + + void print_buffer_fields(struct buffer_head *bh) + { + printk("b_next:%p, b_blocknr:%lu b_count:%d b_flushtime:%lu\n", + bh->b_next, bh->b_blocknr, atomic_read(&bh->b_count), + bh->b_flushtime); + printk("b_next_free:%p b_prev_free:%p b_this_page:%p b_reqnext:%p\n", + bh->b_next_free, bh->b_prev_free, bh->b_this_page, + bh->b_reqnext); + printk("b_pprev:%p b_data:%p b_page:%p b_inode:%p b_list:%d\n", + bh->b_pprev, bh->b_data, bh->b_page, bh->b_inode, bh->b_list); + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + if (buffer_jbd(bh)) { + struct journal_head *jh = bh2jh(bh); + + printk("b_jlist:%u b_frozen_data:%p b_committed_data:%p\n", + jh->b_jlist, jh->b_frozen_data, jh->b_committed_data); + printk(" b_transaction:%p b_next_transaction:%p " + "b_cp_transaction:%p\n", + jh->b_transaction, jh->b_next_transaction, + jh->b_cp_transaction); + printk("b_cpnext:%p b_cpprev:%p\n", + jh->b_cpnext, jh->b_cpprev); + } + #endif + } + + void print_buffer_trace(struct buffer_head *bh) + { + #ifdef CONFIG_X86 + extern void show_stack(unsigned long * esp); + #endif + + unsigned long idx, count; + unsigned long flags; + + printk("buffer trace for buffer at 0x%p (I am CPU %d)\n", + bh, smp_processor_id()); + BUFFER_TRACE(bh, ""); /* Record state now */ + + spin_lock_irqsave(&trace_lock, flags); + for ( idx = bh->b_history.b_history_tail, count = 0; + idx < bh->b_history.b_history_head && + count < BUFFER_HISTORY_SIZE; + idx++, count++) + print_one_hist(bh->b_history.b + + (idx & (BUFFER_HISTORY_SIZE - 1))); + + print_buffer_fields(bh); + spin_unlock_irqrestore(&trace_lock, flags); + #ifdef CONFIG_X86 + show_stack(NULL); + #endif + printk("\n"); + } + + static struct buffer_head 
*failed_buffer_head; /* For access with debuggers */ + + void buffer_assertion_failure(struct buffer_head *bh) + { + failed_buffer_head = bh; + print_buffer_trace(bh); + } + EXPORT_SYMBOL(buffer_trace); + EXPORT_SYMBOL(print_buffer_trace); + EXPORT_SYMBOL(buffer_assertion_failure); + EXPORT_SYMBOL(print_buffer_fields); + #endif /* CONFIG_BUFFER_DEBUG */ + diff -rc2P linux/fs/open.c linux-2.4.13/fs/open.c *** linux/fs/open.c Fri Nov 9 16:15:08 2001 --- linux-2.4.13/fs/open.c Fri Nov 9 16:57:59 2001 *************** *** 72,75 **** --- 72,81 ---- } + /* + * i_sem is taken outside i_truncate_sem because that is the + * order in which these locks are taken on the path + * generic_file_write->copy_from_user->handle_mm_fault->do_no_page + */ + int do_truncate(struct dentry *dentry, loff_t length) { *************** *** 83,89 **** --- 89,97 ---- down(&inode->i_sem); + down_write(&inode->i_truncate_sem); newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; error = notify_change(dentry, &newattrs); + up_write(&inode->i_truncate_sem); up(&inode->i_sem); return error; diff -rc2P linux/include/linux/buffer-trace.h linux-2.4.13/include/linux/buffer-trace.h *** linux/include/linux/buffer-trace.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/buffer-trace.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,84 ---- + /* + * include/linux/buffer-trace.h + * + * Debugging support for recording buffer_head state transitions + * + * May 2001, akpm + * Created + */ + + #ifndef BUFFER_TRACE_H_INCLUDED + #define BUFFER_TRACE_H_INCLUDED + + #include + + #ifdef CONFIG_BUFFER_DEBUG + + /* The number of records per buffer_head. Must be a power of two */ + #define BUFFER_HISTORY_SIZE 32 + + struct buffer_head; + + /* This gets embedded in struct buffer_head */ + struct buffer_history { + struct buffer_history_item { + char *info; + unsigned long b_state; + unsigned b_list:3; + unsigned b_jlist:4; + unsigned on_lru:1; + unsigned on_hash:1; + unsigned cpu:3; + unsigned b_count:8; + unsigned long b_blocknr; /* For src != dest */ + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + unsigned b_jcount:4; + unsigned b_jbd:1; + unsigned b_transaction:1; + unsigned b_next_transaction:1; + unsigned b_cp_transaction:1; + unsigned b_trans_is_running:1; + unsigned b_trans_is_committing:1; + void *b_frozen_data; + void *b_committed_data; + #endif + } b[BUFFER_HISTORY_SIZE]; + unsigned long b_history_head; /* Next place to write */ + unsigned long b_history_tail; /* Oldest valid entry */ + }; + + static inline void buffer_trace_init(struct buffer_history *bhist) + { + bhist->b_history_head = 0; + bhist->b_history_tail = 0; + } + extern void buffer_trace(struct buffer_head *dest, + struct buffer_head *src, char *info); + extern void print_buffer_fields(struct buffer_head *bh); + extern void print_buffer_trace(struct buffer_head *bh); + + #define BUFFER_STRINGIFY2(X) #X + #define BUFFER_STRINGIFY(X) BUFFER_STRINGIFY2(X) + + #define BUFFER_TRACE2(dest, src, info) \ + do { \ + buffer_trace((dest), (src), \ + __FUNCTION__"() ["__FILE__":" \ + BUFFER_STRINGIFY(__LINE__)"] " info); \ + } while (0) + + #define BUFFER_TRACE(bh, info) BUFFER_TRACE2(bh, bh, info) + #define JBUFFER_TRACE(jh, info) BUFFER_TRACE(jh2bh(jh), info) + + #else /* CONFIG_BUFFER_DEBUG */ + + #define buffer_trace_init(bh) do {} while (0) + #define print_buffer_fields(bh) do {} while (0) + #define print_buffer_trace(bh) do {} while (0) + #define BUFFER_TRACE(bh, info) do {} while (0) + #define BUFFER_TRACE2(bh, bh2, info) do {} while 
(0) + #define JBUFFER_TRACE(jh, info) do {} while (0) + + #endif /* CONFIG_BUFFER_DEBUG */ + + #endif /* BUFFER_TRACE_H_INCLUDED */ diff -rc2P linux/include/linux/capability.h linux-2.4.13/include/linux/capability.h *** linux/include/linux/capability.h Fri Nov 9 16:15:08 2001 --- linux-2.4.13/include/linux/capability.h Fri Nov 9 16:58:00 2001 *************** *** 251,254 **** --- 251,256 ---- /* Override quota limits. */ /* Override reserved space on ext2 filesystem */ + /* Modify data journaling mode on ext3 filesystem (uses journaling + resources) */ /* NOTE: ext2 honors fsuid when checking for resource overrides, so you can override using fsuid too */ diff -rc2P linux/include/linux/capability.h.orig linux-2.4.13/include/linux/capability.h.orig *** linux/include/linux/capability.h.orig Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/capability.h.orig Fri Nov 9 16:15:08 2001 *************** *** 0 **** --- 1,356 ---- + /* + * This is + * + * Andrew G. Morgan + * Alexander Kjeldaas + * with help from Aleph1, Roland Buresund and Andrew Main. + * + * See here for the libcap library ("POSIX draft" compliance): + * + * ftp://linux.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.2/ + */ + + #ifndef _LINUX_CAPABILITY_H + #define _LINUX_CAPABILITY_H + + #include + #include + + /* User-level do most of the mapping between kernel and user + capabilities based on the version tag given by the kernel. The + kernel might be somewhat backwards compatible, but don't bet on + it. */ + + /* XXX - Note, cap_t, is defined by POSIX to be an "opaque" pointer to + a set of three capability sets. The transposition of 3*the + following structure to such a composite is better handled in a user + library since the draft standard requires the use of malloc/free + etc.. */ + + #define _LINUX_CAPABILITY_VERSION 0x19980330 + + typedef struct __user_cap_header_struct { + __u32 version; + int pid; + } *cap_user_header_t; + + typedef struct __user_cap_data_struct { + __u32 effective; + __u32 permitted; + __u32 inheritable; + } *cap_user_data_t; + + #ifdef __KERNEL__ + + /* #define STRICT_CAP_T_TYPECHECKS */ + + #ifdef STRICT_CAP_T_TYPECHECKS + + typedef struct kernel_cap_struct { + __u32 cap; + } kernel_cap_t; + + #else + + typedef __u32 kernel_cap_t; + + #endif + + #define _USER_CAP_HEADER_SIZE (2*sizeof(__u32)) + #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) + + #endif + + + /** + ** POSIX-draft defined capabilities. + **/ + + /* In a system with the [_POSIX_CHOWN_RESTRICTED] option defined, this + overrides the restriction of changing file ownership and group + ownership. */ + + #define CAP_CHOWN 0 + + /* Override all DAC access, including ACL execute access if + [_POSIX_ACL] is defined. Excluding DAC access covered by + CAP_LINUX_IMMUTABLE. */ + + #define CAP_DAC_OVERRIDE 1 + + /* Overrides all DAC restrictions regarding read and search on files + and directories, including ACL restrictions if [_POSIX_ACL] is + defined. Excluding DAC access covered by CAP_LINUX_IMMUTABLE. */ + + #define CAP_DAC_READ_SEARCH 2 + + /* Overrides all restrictions about allowed operations on files, where + file owner ID must be equal to the user ID, except where CAP_FSETID + is applicable. It doesn't override MAC and DAC restrictions. 
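The new note under CAP_SYS_RESOURCE records that switching a file into data-journaling mode consumes journal resources, so it is treated as a privileged attribute change. A hedged sketch of how a flag-setting path might enforce that (hypothetical check, not code from this hunk; EXT3_JOURNAL_DATA_FL is defined later in this patch):

    /* Sketch only: require CAP_SYS_RESOURCE before letting a user
     * toggle the per-file data-journaling flag. */
    static int example_check_journal_data_flag(unsigned int old_flags,
                                               unsigned int new_flags)
    {
            if (((old_flags ^ new_flags) & EXT3_JOURNAL_DATA_FL) &&
                !capable(CAP_SYS_RESOURCE))
                    return -EPERM;
            return 0;
    }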
*/ + + #define CAP_FOWNER 3 + + /* Overrides the following restrictions that the effective user ID + shall match the file owner ID when setting the S_ISUID and S_ISGID + bits on that file; that the effective group ID (or one of the + supplementary group IDs) shall match the file owner ID when setting + the S_ISGID bit on that file; that the S_ISUID and S_ISGID bits are + cleared on successful return from chown(2) (not implemented). */ + + #define CAP_FSETID 4 + + /* Used to decide between falling back on the old suser() or fsuser(). */ + + #define CAP_FS_MASK 0x1f + + /* Overrides the restriction that the real or effective user ID of a + process sending a signal must match the real or effective user ID + of the process receiving the signal. */ + + #define CAP_KILL 5 + + /* Allows setgid(2) manipulation */ + /* Allows setgroups(2) */ + /* Allows forged gids on socket credentials passing. */ + + #define CAP_SETGID 6 + + /* Allows set*uid(2) manipulation (including fsuid). */ + /* Allows forged pids on socket credentials passing. */ + + #define CAP_SETUID 7 + + + /** + ** Linux-specific capabilities + **/ + + /* Transfer any capability in your permitted set to any pid, + remove any capability in your permitted set from any pid */ + + #define CAP_SETPCAP 8 + + /* Allow modification of S_IMMUTABLE and S_APPEND file attributes */ + + #define CAP_LINUX_IMMUTABLE 9 + + /* Allows binding to TCP/UDP sockets below 1024 */ + /* Allows binding to ATM VCIs below 32 */ + + #define CAP_NET_BIND_SERVICE 10 + + /* Allow broadcasting, listen to multicast */ + + #define CAP_NET_BROADCAST 11 + + /* Allow interface configuration */ + /* Allow administration of IP firewall, masquerading and accounting */ + /* Allow setting debug option on sockets */ + /* Allow modification of routing tables */ + /* Allow setting arbitrary process / process group ownership on + sockets */ + /* Allow binding to any address for transparent proxying */ + /* Allow setting TOS (type of service) */ + /* Allow setting promiscuous mode */ + /* Allow clearing driver statistics */ + /* Allow multicasting */ + /* Allow read/write of device-specific registers */ + /* Allow activation of ATM control sockets */ + + #define CAP_NET_ADMIN 12 + + /* Allow use of RAW sockets */ + /* Allow use of PACKET sockets */ + + #define CAP_NET_RAW 13 + + /* Allow locking of shared memory segments */ + /* Allow mlock and mlockall (which doesn't really have anything to do + with IPC) */ + + #define CAP_IPC_LOCK 14 + + /* Override IPC ownership checks */ + + #define CAP_IPC_OWNER 15 + + /* Insert and remove kernel modules - modify kernel without limit */ + /* Modify cap_bset */ + #define CAP_SYS_MODULE 16 + + /* Allow ioperm/iopl access */ + /* Allow sending USB messages to any device via /proc/bus/usb */ + + #define CAP_SYS_RAWIO 17 + + /* Allow use of chroot() */ + + #define CAP_SYS_CHROOT 18 + + /* Allow ptrace() of any process */ + + #define CAP_SYS_PTRACE 19 + + /* Allow configuration of process accounting */ + + #define CAP_SYS_PACCT 20 + + /* Allow configuration of the secure attention key */ + /* Allow administration of the random device */ + /* Allow examination and configuration of disk quotas */ + /* Allow configuring the kernel's syslog (printk behaviour) */ + /* Allow setting the domainname */ + /* Allow setting the hostname */ + /* Allow calling bdflush() */ + /* Allow mount() and umount(), setting up new smb connection */ + /* Allow some autofs root ioctls */ + /* Allow nfsservctl */ + /* Allow VM86_REQUEST_IRQ */ + /* Allow to read/write pci 
config on alpha */ + /* Allow irix_prctl on mips (setstacksize) */ + /* Allow flushing all cache on m68k (sys_cacheflush) */ + /* Allow removing semaphores */ + /* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores + and shared memory */ + /* Allow locking/unlocking of shared memory segment */ + /* Allow turning swap on/off */ + /* Allow forged pids on socket credentials passing */ + /* Allow setting readahead and flushing buffers on block devices */ + /* Allow setting geometry in floppy driver */ + /* Allow turning DMA on/off in xd driver */ + /* Allow administration of md devices (mostly the above, but some + extra ioctls) */ + /* Allow tuning the ide driver */ + /* Allow access to the nvram device */ + /* Allow administration of apm_bios, serial and bttv (TV) device */ + /* Allow manufacturer commands in isdn CAPI support driver */ + /* Allow reading non-standardized portions of pci configuration space */ + /* Allow DDI debug ioctl on sbpcd driver */ + /* Allow setting up serial ports */ + /* Allow sending raw qic-117 commands */ + /* Allow enabling/disabling tagged queuing on SCSI controllers and sending + arbitrary SCSI commands */ + /* Allow setting encryption key on loopback filesystem */ + /* Allow the selection of a security context */ + + #define CAP_SYS_ADMIN 21 + + /* Allow use of reboot() */ + + #define CAP_SYS_BOOT 22 + + /* Allow raising priority and setting priority on other (different + UID) processes */ + /* Allow use of FIFO and round-robin (realtime) scheduling on own + processes and setting the scheduling algorithm used by another + process. */ + + #define CAP_SYS_NICE 23 + + /* Override resource limits. Set resource limits. */ + /* Override quota limits. */ + /* Override reserved space on ext2 filesystem */ + /* NOTE: ext2 honors fsuid when checking for resource overrides, so + you can override using fsuid too */ + /* Override size restrictions on IPC message queues */ + /* Allow more than 64hz interrupts from the real-time clock */ + /* Override max number of consoles on console allocation */ + /* Override max number of keymaps */ + + #define CAP_SYS_RESOURCE 24 + + /* Allow manipulation of system clock */ + /* Allow irix_stime on mips */ + /* Allow setting the real-time clock */ + + #define CAP_SYS_TIME 25 + + /* Allow configuration of tty devices */ + /* Allow vhangup() of tty */ + + #define CAP_SYS_TTY_CONFIG 26 + + /* Allow the privileged aspects of mknod() */ + + #define CAP_MKNOD 27 + + /* Allow taking of leases on files */ + + #define CAP_LEASE 28 + + /* Allow opening special device file */ + + #define CAP_OPENDEV 29 + + #ifdef __KERNEL__ + /* + * Bounding set + */ + extern kernel_cap_t cap_bset; + + /* + * Internal kernel functions only + */ + + #ifdef STRICT_CAP_T_TYPECHECKS + + #define to_cap_t(x) { x } + #define cap_t(x) (x).cap + + #else + + #define to_cap_t(x) (x) + #define cap_t(x) (x) + + #endif + + #define CAP_EMPTY_SET to_cap_t(0) + #define CAP_FULL_SET to_cap_t(~0) + #define CAP_INIT_EFF_SET to_cap_t(~0 & ~CAP_TO_MASK(CAP_SETPCAP)) + #define CAP_INIT_INH_SET to_cap_t(0) + + #define CAP_TO_MASK(x) (1 << (x)) + #define cap_raise(c, flag) (cap_t(c) |= CAP_TO_MASK(flag)) + #define cap_lower(c, flag) (cap_t(c) &= ~CAP_TO_MASK(flag)) + #define cap_raised(c, flag) (cap_t(c) & CAP_TO_MASK(flag)) + + static inline kernel_cap_t cap_combine(kernel_cap_t a, kernel_cap_t b) + { + kernel_cap_t dest; + cap_t(dest) = cap_t(a) | cap_t(b); + return dest; + } + + static inline kernel_cap_t cap_intersect(kernel_cap_t a, kernel_cap_t b) + { + 
kernel_cap_t dest; + cap_t(dest) = cap_t(a) & cap_t(b); + return dest; + } + + static inline kernel_cap_t cap_drop(kernel_cap_t a, kernel_cap_t drop) + { + kernel_cap_t dest; + cap_t(dest) = cap_t(a) & ~cap_t(drop); + return dest; + } + + static inline kernel_cap_t cap_invert(kernel_cap_t c) + { + kernel_cap_t dest; + cap_t(dest) = ~cap_t(c); + return dest; + } + + #define cap_isclear(c) (!cap_t(c)) + #define cap_issubset(a,set) (!(cap_t(a) & ~cap_t(set))) + + #define cap_clear(c) do { cap_t(c) = 0; } while(0) + #define cap_set_full(c) do { cap_t(c) = ~0; } while(0) + #define cap_mask(c,mask) do { cap_t(c) &= cap_t(mask); } while(0) + + #define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK) + + #endif /* __KERNEL__ */ + + #endif /* !_LINUX_CAPABILITY_H */ diff -rc2P linux/include/linux/ext3_fs.h linux-2.4.13/include/linux/ext3_fs.h *** linux/include/linux/ext3_fs.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/ext3_fs.h Fri Nov 9 17:05:34 2001 *************** *** 0 **** --- 1,716 ---- + /* + * linux/include/linux/ext3_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + + #ifndef _LINUX_EXT3_FS_H + #define _LINUX_EXT3_FS_H + + #include + + /* + * The second extended filesystem constants/structures + */ + + /* + * Define EXT3FS_DEBUG to produce debug messages + */ + #undef EXT3FS_DEBUG + + /* + * Define EXT3_PREALLOCATE to preallocate data blocks for expanding files + */ + #undef EXT3_PREALLOCATE /* @@@ Fix this! */ + #define EXT3_DEFAULT_PREALLOC_BLOCKS 8 + + /* + * The second extended file system version + */ + #define EXT3FS_DATE "21 Oct 2001" + #define EXT3FS_VERSION "2.4-0.9.13" + + /* + * Debug code + */ + #ifdef EXT3FS_DEBUG + #define ext3_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __FUNCTION__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) + #else + #define ext3_debug(f, a...) 
do {} while (0) + #endif + + /* + * Special inodes numbers + */ + #define EXT3_BAD_INO 1 /* Bad blocks inode */ + #define EXT3_ROOT_INO 2 /* Root inode */ + #define EXT3_ACL_IDX_INO 3 /* ACL inode */ + #define EXT3_ACL_DATA_INO 4 /* ACL inode */ + #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ + #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ + #define EXT3_JOURNAL_INO 8 /* Journal inode */ + + /* First non-reserved inode for old ext3 filesystems */ + #define EXT3_GOOD_OLD_FIRST_INO 11 + + /* + * The second extended file system magic number + */ + #define EXT3_SUPER_MAGIC 0xEF53 + + /* + * Maximal count of links to a file + */ + #define EXT3_LINK_MAX 32000 + + /* + * Macro-instructions used to manage several block sizes + */ + #define EXT3_MIN_BLOCK_SIZE 1024 + #define EXT3_MAX_BLOCK_SIZE 4096 + #define EXT3_MIN_BLOCK_LOG_SIZE 10 + #ifdef __KERNEL__ + # define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) + #else + # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif + #define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) + #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) + #else + # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) + #endif + #ifdef __KERNEL__ + #define EXT3_ADDR_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_addr_per_block_bits) + #define EXT3_INODE_SIZE(s) ((s)->u.ext3_sb.s_inode_size) + #define EXT3_FIRST_INO(s) ((s)->u.ext3_sb.s_first_ino) + #else + #define EXT3_INODE_SIZE(s) (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \ + EXT3_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) + #define EXT3_FIRST_INO(s) (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? 
\ + EXT3_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) + #endif + + /* + * Macro-instructions used to manage fragments + */ + #define EXT3_MIN_FRAG_SIZE 1024 + #define EXT3_MAX_FRAG_SIZE 4096 + #define EXT3_MIN_FRAG_LOG_SIZE 10 + #ifdef __KERNEL__ + # define EXT3_FRAG_SIZE(s) ((s)->u.ext3_sb.s_frag_size) + # define EXT3_FRAGS_PER_BLOCK(s) ((s)->u.ext3_sb.s_frags_per_block) + #else + # define EXT3_FRAG_SIZE(s) (EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size) + # define EXT3_FRAGS_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s)) + #endif + + /* + * ACL structures + */ + struct ext3_acl_header /* Header of Access Control Lists */ + { + __u32 aclh_size; + __u32 aclh_file_count; + __u32 aclh_acle_count; + __u32 aclh_first_acle; + }; + + struct ext3_acl_entry /* Access Control List Entry */ + { + __u32 acle_size; + __u16 acle_perms; /* Access permissions */ + __u16 acle_type; /* Type of entry */ + __u16 acle_tag; /* User or group identity */ + __u16 acle_pad1; + __u32 acle_next; /* Pointer on next entry for the */ + /* same inode or on next free entry */ + }; + + /* + * Structure of a blocks group descriptor + */ + struct ext3_group_desc + { + __u32 bg_block_bitmap; /* Blocks bitmap block */ + __u32 bg_inode_bitmap; /* Inodes bitmap block */ + __u32 bg_inode_table; /* Inodes table block */ + __u16 bg_free_blocks_count; /* Free blocks count */ + __u16 bg_free_inodes_count; /* Free inodes count */ + __u16 bg_used_dirs_count; /* Directories count */ + __u16 bg_pad; + __u32 bg_reserved[3]; + }; + + /* + * Macro-instructions used to manage group descriptors + */ + #ifdef __KERNEL__ + # define EXT3_BLOCKS_PER_GROUP(s) ((s)->u.ext3_sb.s_blocks_per_group) + # define EXT3_DESC_PER_BLOCK(s) ((s)->u.ext3_sb.s_desc_per_block) + # define EXT3_INODES_PER_GROUP(s) ((s)->u.ext3_sb.s_inodes_per_group) + # define EXT3_DESC_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_desc_per_block_bits) + #else + # define EXT3_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) + # define EXT3_DESC_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc)) + # define EXT3_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) + #endif + + /* + * Constants relative to the data blocks + */ + #define EXT3_NDIR_BLOCKS 12 + #define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS + #define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1) + #define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1) + #define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1) + + /* + * Inode flags + */ + #define EXT3_SECRM_FL 0x00000001 /* Secure deletion */ + #define EXT3_UNRM_FL 0x00000002 /* Undelete */ + #define EXT3_COMPR_FL 0x00000004 /* Compress file */ + #define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */ + #define EXT3_IMMUTABLE_FILE_FL 0x00000010 /* Immutable file */ + #define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */ + #define EXT3_NODUMP_FL 0x00000040 /* do not dump file */ + #define EXT3_NOATIME_FL 0x00000080 /* do not update atime */ + /* Reserved for compression usage... 
*/ + #define EXT3_DIRTY_FL 0x00000100 + #define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ + #define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */ + #define EXT3_ECOMPR_FL 0x00000800 /* Compression error */ + /* End compression flags --- maybe not all used */ + #define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ + #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ + #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ + #define EXT3_IMMUTABLE_LINK_FL 0x00008000 /* Immutable link */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + + #define EXT3_FL_USER_VISIBLE 0x00009FFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000080FF /* User modifiable flags */ + + /* + * Inode dynamic state flags + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ + + /* + * ioctl commands + */ + #define EXT3_IOC_GETFLAGS _IOR('f', 1, long) + #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) + #define EXT3_IOC_GETVERSION _IOR('f', 3, long) + #define EXT3_IOC_SETVERSION _IOW('f', 4, long) + #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) + #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) + #ifdef CONFIG_JBD_DEBUG + #define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) + #endif + + /* + * Structure of an inode on the disk + */ + struct ext3_inode { + __u16 i_mode; /* File mode */ + __u16 i_uid; /* Low 16 bits of Owner Uid */ + __u32 i_size; /* Size in bytes */ + __u32 i_atime; /* Access time */ + __u32 i_ctime; /* Creation time */ + __u32 i_mtime; /* Modification time */ + __u32 i_dtime; /* Deletion Time */ + __u16 i_gid; /* Low 16 bits of Group Id */ + __u16 i_links_count; /* Links count */ + __u32 i_blocks; /* Blocks count */ + __u32 i_flags; /* File flags */ + union { + struct { + __u32 l_i_reserved1; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __u32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */ + __u32 i_generation; /* File version (for NFS) */ + __u32 i_file_acl; /* File ACL */ + __u32 i_dir_acl; /* Directory ACL */ + __u32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __u16 l_i_uid_high; /* these 2 fields */ + __u16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + }; + + #define i_size_high i_dir_acl + + #if defined(__KERNEL__) || defined(__linux__) + #define i_reserved1 osd1.linux1.l_i_reserved1 + #define i_frag osd2.linux2.l_i_frag + #define i_fsize osd2.linux2.l_i_fsize + #define i_uid_low i_uid + #define i_gid_low i_gid + #define i_uid_high osd2.linux2.l_i_uid_high + #define i_gid_high osd2.linux2.l_i_gid_high + #define i_reserved2 osd2.linux2.l_i_reserved2 + + #elif defined(__GNU__) + + #define i_translator osd1.hurd1.h_i_translator + #define i_frag osd2.hurd2.h_i_frag; + #define i_fsize osd2.hurd2.h_i_fsize; + #define i_uid_high osd2.hurd2.h_i_uid_high + #define i_gid_high osd2.hurd2.h_i_gid_high + #define i_author osd2.hurd2.h_i_author + + #elif 
defined(__masix__) + + #define i_reserved1 osd1.masix1.m_i_reserved1 + #define i_frag osd2.masix2.m_i_frag + #define i_fsize osd2.masix2.m_i_fsize + #define i_reserved2 osd2.masix2.m_i_reserved2 + + #endif /* defined(__KERNEL__) || defined(__linux__) */ + + /* + * File system states + */ + #define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */ + #define EXT3_ERROR_FS 0x0002 /* Errors detected */ + #define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ + + /* + * Mount flags + */ + #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ + #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ + #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ + #define EXT3_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ + #define EXT3_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ + #define EXT3_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ + #define EXT3_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ + #define EXT3_MOUNT_NOLOAD 0x0100 /* Don't use existing journal*/ + #define EXT3_MOUNT_ABORT 0x0200 /* Fatal error detected */ + #define EXT3_MOUNT_DATA_FLAGS 0x0C00 /* Mode for data writes: */ + #define EXT3_MOUNT_JOURNAL_DATA 0x0400 /* Write data to journal */ + #define EXT3_MOUNT_ORDERED_DATA 0x0800 /* Flush data before commit */ + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H + #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt + #define set_opt(o, opt) o |= EXT3_MOUNT_##opt + #define test_opt(sb, opt) ((sb)->u.ext3_sb.s_mount_opt & \ + EXT3_MOUNT_##opt) + #else + #define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD + #define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT + #endif + + #define ext3_set_bit ext2_set_bit + #define ext3_clear_bit ext2_clear_bit + #define ext3_test_bit ext2_test_bit + #define ext3_find_first_zero_bit ext2_find_first_zero_bit + #define ext3_find_next_zero_bit ext2_find_next_zero_bit + + /* + * Maximal mount counts between two filesystem checks + */ + #define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ + #define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + + /* + * Behaviour when detecting errors + */ + #define EXT3_ERRORS_CONTINUE 1 /* Continue execution */ + #define EXT3_ERRORS_RO 2 /* Remount fs read-only */ + #define EXT3_ERRORS_PANIC 3 /* Panic */ + #define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE + + /* + * Structure of the super block + */ + struct ext3_super_block { + /*00*/ __u32 s_inodes_count; /* Inodes count */ + __u32 s_blocks_count; /* Blocks count */ + __u32 s_r_blocks_count; /* Reserved blocks count */ + __u32 s_free_blocks_count; /* Free blocks count */ + /*10*/ __u32 s_free_inodes_count; /* Free inodes count */ + __u32 s_first_data_block; /* First Data Block */ + __u32 s_log_block_size; /* Block size */ + __s32 s_log_frag_size; /* Fragment size */ + /*20*/ __u32 s_blocks_per_group; /* # Blocks per group */ + __u32 s_frags_per_group; /* # Fragments per group */ + __u32 s_inodes_per_group; /* # Inodes per group */ + __u32 s_mtime; /* Mount time */ + /*30*/ __u32 s_wtime; /* Write time */ + __u16 s_mnt_count; /* Mount count */ + __s16 s_max_mnt_count; /* Maximal mount count */ + __u16 s_magic; /* Magic signature */ + __u16 s_state; /* File system state */ + __u16 s_errors; /* Behaviour when detecting errors */ + __u16 s_minor_rev_level; /* minor revision 
level */ + /*40*/ __u32 s_lastcheck; /* time of last check */ + __u32 s_checkinterval; /* max. time between checks */ + __u32 s_creator_os; /* OS */ + __u32 s_rev_level; /* Revision level */ + /*50*/ __u16 s_def_resuid; /* Default uid for reserved blocks */ + __u16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT3_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __u32 s_first_ino; /* First non-reserved inode */ + __u16 s_inode_size; /* size of inode structure */ + __u16 s_block_group_nr; /* block group # of this superblock */ + __u32 s_feature_compat; /* compatible feature set */ + /*60*/ __u32 s_feature_incompat; /* incompatible feature set */ + __u32 s_feature_ro_compat; /* readonly-compatible feature set */ + /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ + /*78*/ char s_volume_name[16]; /* volume name */ + /*88*/ char s_last_mounted[64]; /* directory where last mounted */ + /*C8*/ __u32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __u16 s_padding1; + /* + * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. + */ + /*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ + /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ + + /*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ + #define EXT3_SB(sb) (&((sb)->u.ext3_sb)) + #define EXT3_I(inode) (&((inode)->u.ext3_i)) + #else + /* Assume that user mode programs are passing in an ext3fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. 
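+ * + * Illustrative usage only (not code from this patch): the intent is that a user-space tool can check for a journal with EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL), using the feature macros defined further down, without needing a kernel struct super_block.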
*/ + #define EXT3_SB(sb) (sb) + #endif + + #define NEXT_ORPHAN(inode) (inode)->u.ext3_i.i_dtime + + /* + * Codes for operating systems + */ + #define EXT3_OS_LINUX 0 + #define EXT3_OS_HURD 1 + #define EXT3_OS_MASIX 2 + #define EXT3_OS_FREEBSD 3 + #define EXT3_OS_LITES 4 + + /* + * Revision levels + */ + #define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */ + #define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + + #define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV + #define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV + + #define EXT3_GOOD_OLD_INODE_SIZE 128 + + /* + * Feature set definitions + */ + + #define EXT3_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) + #define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) + #define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) + #define EXT3_SET_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) + #define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) + #define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) + #define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) + #define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) + #define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + + #define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001 + #define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002 + #define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 + #define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008 + #define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010 + #define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020 + + #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 + #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 + #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 + + #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 + #define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + + #define EXT3_FEATURE_COMPAT_SUPP 0 + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) + + /* + * Default values for user and/or group using reserved blocks + */ + #define EXT3_DEF_RESUID 0 + #define EXT3_DEF_RESGID 0 + + /* + * Structure of a directory entry + */ + #define EXT3_NAME_LEN 255 + + struct ext3_dir_entry { + __u32 inode; /* Inode number */ + __u16 rec_len; /* Directory entry length */ + __u16 name_len; /* Name length */ + char name[EXT3_NAME_LEN]; /* File name */ + }; + + /* + * The new version of the directory entry. Since EXT3 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ + struct ext3_dir_entry_2 { + __u32 inode; /* Inode number */ + __u16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT3_NAME_LEN]; /* File name */ + }; + + /* + * Ext3 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
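+ * + * As an illustrative sketch only (not code from this header; bh and sb stand for the directory block's buffer_head and super_block), a directory block is walked by stepping rec_len bytes per entry and reading the low 3 bits of file_type: + * + *	struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) bh->b_data; + *	char *top = bh->b_data + sb->s_blocksize; + *	while ((char *) de < top && le16_to_cpu(de->rec_len) != 0) { + *		unsigned type = de->file_type & 7;	-- one of the EXT3_FT_* values below + *		de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len)); + *	}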
+ */ + #define EXT3_FT_UNKNOWN 0 + #define EXT3_FT_REG_FILE 1 + #define EXT3_FT_DIR 2 + #define EXT3_FT_CHRDEV 3 + #define EXT3_FT_BLKDEV 4 + #define EXT3_FT_FIFO 5 + #define EXT3_FT_SOCK 6 + #define EXT3_FT_SYMLINK 7 + + #define EXT3_FT_MAX 8 + + /* + * EXT3_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ + #define EXT3_DIR_PAD 4 + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) + + #ifdef __KERNEL__ + + /* Filesize hard limits for 64-bit file offsets */ + extern long long ext3_max_sizes[]; + + /* + * Describe an inode's exact location on disk and in memory + */ + struct ext3_iloc + { + struct buffer_head *bh; + struct ext3_inode *raw_inode; + unsigned long block_group; + }; + + /* + * Function prototypes + */ + + /* + * Ok, these declarations are also in but none of the + * ext3 source programs needs to include it so they are duplicated here. + */ + # define NORET_TYPE /**/ + # define ATTRIB_NORET __attribute__((noreturn)) + # define NORET_AND noreturn, + + /* acl.c */ + extern int ext3_permission (struct inode *, int); + + /* balloc.c */ + extern int ext3_bg_has_super(struct super_block *sb, int group); + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, + __u32 *, __u32 *, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, + unsigned long); + extern unsigned long ext3_count_free_blocks (struct super_block *); + extern void ext3_check_blocks_bitmap (struct super_block *); + extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); + + /* bitmap.c */ + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* dir.c */ + extern int ext3_check_dir_entry(const char *, struct inode *, + struct ext3_dir_entry_2 *, struct buffer_head *, + unsigned long); + + /* file.c */ + + /* fsync.c */ + extern int ext3_sync_file (struct file *, struct dentry *, int); + + /* ialloc.c */ + extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); + extern void ext3_free_inode (handle_t *, struct inode *); + extern struct inode * ext3_orphan_get (struct super_block *, ino_t); + extern unsigned long ext3_count_free_inodes (struct super_block *); + extern void ext3_check_inodes_bitmap (struct super_block *); + + /* inode.c */ + + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + + extern int ext3_get_inode_loc (struct inode *, struct ext3_iloc *); + extern void ext3_read_inode (struct inode *); + extern void ext3_write_inode (struct inode *, int); + extern int ext3_setattr (struct dentry *, struct iattr *); + extern void ext3_put_inode (struct inode *); + extern void ext3_delete_inode (struct inode *); + extern int ext3_sync_inode (handle_t *, struct inode *); + extern void ext3_discard_prealloc (struct inode *); + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + + /* ioctl.c */ + extern int ext3_ioctl (struct inode *, struct file *, unsigned int, + unsigned long); + + /* namei.c */ + extern struct inode_operations ext3_dir_inode_operations; + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct 
inode *); + + /* super.c */ + extern void ext3_error (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); + extern void __ext3_std_error (struct super_block *, const char *, int); + extern void ext3_abort (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); + extern NORET_TYPE void ext3_panic (struct super_block *, const char *, + const char *, ...) + __attribute__ ((NORET_AND format (printf, 3, 4))); + extern void ext3_warning (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); + extern void ext3_update_dynamic_rev (struct super_block *sb); + extern void ext3_put_super (struct super_block *); + extern void ext3_write_super (struct super_block *); + extern void ext3_write_super_lockfs (struct super_block *); + extern void ext3_unlockfs (struct super_block *); + extern int ext3_remount (struct super_block *, int *, char *); + extern struct super_block * ext3_read_super (struct super_block *,void *,int); + extern int ext3_statfs (struct super_block *, struct statfs *); + + /* truncate.c */ + extern void ext3_truncate (struct inode *); + + #define ext3_std_error(sb, errno) \ + do { \ + if ((errno)) \ + __ext3_std_error((sb), __FUNCTION__, (errno)); \ + } while (0) + extern const char *ext3_decode_error(struct super_block *sb, int errno, char nbuf[16]); + + /* + * Inodes and files operations + */ + + /* dir.c */ + extern struct file_operations ext3_dir_operations; + + /* file.c */ + extern struct inode_operations ext3_file_inode_operations; + extern struct file_operations ext3_file_operations; + + /* symlink.c */ + extern struct inode_operations ext3_fast_symlink_inode_operations; + + extern struct address_space_operations ext3_aops; + + #endif /* __KERNEL__ */ + + #endif /* _LINUX_EXT3_FS_H */ diff -rc2P linux/include/linux/ext3_fs_i.h linux-2.4.13/include/linux/ext3_fs_i.h *** linux/include/linux/ext3_fs_i.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/ext3_fs_i.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,78 ---- + /* + * linux/include/linux/ext3_fs_i.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_i.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + + #ifndef _LINUX_EXT3_FS_I + #define _LINUX_EXT3_FS_I + + #include + + /* + * second extended file system inode data in memory + */ + struct ext3_inode_info { + __u32 i_data[15]; + __u32 i_flags; + #ifdef EXT3_FRAGMENTS + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; + __u16 unused; /* formerly i_osync */ + #endif + __u32 i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + __u32 i_block_group; + __u32 i_state; /* Dynamic state flags for ext3 */ + __u32 i_next_alloc_block; + __u32 i_next_alloc_goal; + #ifdef EXT3_PREALLOCATE + __u32 i_prealloc_block; + __u32 i_prealloc_count; + #endif + __u32 i_dir_start_lookup; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext3_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. 
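+ * (For example, when a file is truncated from 100kB down to 10kB, i_size becomes 10kB as soon as the VFS calls in, while i_disksize is only brought down as the blocks beyond the new size are actually freed under the journal's control.)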
This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext3_get_block (growth) and ext3_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * truncate_sem is for serialising ext3_truncate() against + * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext3 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have truncate_sem. + */ + struct rw_semaphore truncate_sem; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff -rc2P linux/include/linux/ext3_fs_sb.h linux-2.4.13/include/linux/ext3_fs_sb.h *** linux/include/linux/ext3_fs_sb.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/ext3_fs_sb.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,77 ---- + /* + * linux/include/linux/ext3_fs_sb.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_sb.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + + #ifndef _LINUX_EXT3_FS_SB + #define _LINUX_EXT3_FS_SB + + #ifdef __KERNEL__ + #include + #include + #endif + + /* + * The following is not needed anymore since the descriptors buffer + * heads are now dynamically allocated + */ + /* #define EXT3_MAX_GROUP_DESC 8 */ + + #define EXT3_MAX_GROUP_LOADED 8 + + /* + * third extended-fs super-block data in memory + */ + struct ext3_sb_info { + unsigned long s_frag_size; /* Size of a fragment in bytes */ + unsigned long s_frags_per_block;/* Number of fragments per block */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_frags_per_group;/* Number of fragments in a group */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + unsigned long s_groups_count; /* Number of groups in the fs */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned short s_loaded_inode_bitmaps; + unsigned short s_loaded_block_bitmaps; + unsigned long s_inode_bitmap_number[EXT3_MAX_GROUP_LOADED]; + struct buffer_head * s_inode_bitmap[EXT3_MAX_GROUP_LOADED]; + unsigned long s_block_bitmap_number[EXT3_MAX_GROUP_LOADED]; + struct buffer_head * s_block_bitmap[EXT3_MAX_GROUP_LOADED]; + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + + /* Journaling */ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; + unsigned 
long s_commit_interval; + struct block_device *journal_bdev; + #ifdef CONFIG_JBD_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ diff -rc2P linux/include/linux/ext3_jbd.h linux-2.4.13/include/linux/ext3_jbd.h *** linux/include/linux/ext3_jbd.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/ext3_jbd.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,290 ---- + /* + * linux/include/linux/ext3_jbd.h + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Ext3-specific journaling extensions. + */ + + #ifndef _LINUX_EXT3_JBD_H + #define _LINUX_EXT3_JBD_H + + #include + #include + #include + + #define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal) + + /* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. */ + + #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 + + /* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + + #define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) + + extern int ext3_writepage_trans_blocks(struct inode *inode); + + /* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + + #define EXT3_DELETE_TRANS_BLOCKS (2 * EXT3_DATA_TRANS_BLOCKS + 64) + + /* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + + #define EXT3_MAX_TRANS_DATA 64 + + /* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + + #define EXT3_RESERVE_TRANS_BLOCKS 12 + + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc); + + /* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + + int ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc); + + int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode); + + /* + * Wrapper functions with which ext3 calls into JBD. 
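+ * (The __ext3_journal_*() wrappers below forward to the matching journal_* call and, on failure, report the error through ext3_journal_abort_handle(), so callers get uniform error handling.)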
The intent here is + * to allow these to be turned into appropriate stubs so ext3 can control + * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't + * been done yet. + */ + + static inline void ext3_journal_abort_handle(const char *caller, + const char *err_fn, + struct buffer_head *bh, + handle_t *handle, + int err) + { + char nbuf[16]; + const char *errstr = ext3_decode_error(NULL, err, nbuf); + + printk(KERN_ERR "%s: aborting transaction: %s in %s", + caller, errstr, err_fn); + + if (bh) + BUFFER_TRACE(bh, "abort"); + journal_abort_handle(handle); + if (!handle->h_err) + handle->h_err = err; + } + + static inline int + __ext3_journal_get_undo_access(const char *where, + handle_t *handle, struct buffer_head *bh) + { + int err = journal_get_undo_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline int + __ext3_journal_get_write_access(const char *where, + handle_t *handle, struct buffer_head *bh) + { + int err = journal_get_write_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline int + __ext3_journal_dirty_data(const char *where, + handle_t *handle, struct buffer_head *bh, int async) + { + int err = journal_dirty_data(handle, bh, async); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline void + ext3_journal_forget(handle_t *handle, struct buffer_head *bh) + { + journal_forget(handle, bh); + } + + static inline int + __ext3_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh) + { + int err = journal_revoke(handle, blocknr, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline int + __ext3_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh) + { + int err = journal_get_create_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline int + __ext3_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh) + { + int err = journal_dirty_metadata(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + + #define ext3_journal_get_undo_access(handle, bh) \ + __ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh)) + #define ext3_journal_get_write_access(handle, bh) \ + __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh)) + #define ext3_journal_dirty_data(handle, bh, async) \ + __ext3_journal_dirty_data(__FUNCTION__, (handle), (bh), (async)) + #define ext3_journal_revoke(handle, blocknr, bh) \ + __ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) + #define ext3_journal_get_create_access(handle, bh) \ + __ext3_journal_get_create_access(__FUNCTION__, (handle), (bh)) + #define ext3_journal_dirty_metadata(handle, bh) \ + __ext3_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) + + + + /* + * Wrappers for journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. 
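+ * + * A typical caller is expected to bracket its metadata updates roughly as in the following sketch (illustrative only: the credit count of 3 is arbitrary and error handling is trimmed): + * + *	handle_t *handle = ext3_journal_start(inode, 3); + *	if (IS_ERR(handle)) + *		return PTR_ERR(handle); + *	err = ext3_journal_get_write_access(handle, bh); + *	if (!err) { + *		... modify bh->b_data ... + *		err = ext3_journal_dirty_metadata(handle, bh); + *	} + *	ext3_journal_stop(handle, inode);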
+ */ + static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks) + { + if (inode->i_sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + return journal_start(EXT3_JOURNAL(inode), nblocks); + } + + static inline handle_t * + ext3_journal_try_start(struct inode *inode, int nblocks) + { + if (inode->i_sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + return journal_try_start(EXT3_JOURNAL(inode), nblocks); + } + + /* + * The only special thing we need to do here is to make sure that all + * journal_stop calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. + */ + static inline int __ext3_journal_stop(const char *where, + handle_t *handle, struct inode *inode) + { + int err = handle->h_err; + int rc = journal_stop(handle); + + inode->i_sb->s_dirt = 1; + if (!err) + err = rc; + if (err) + __ext3_std_error(inode->i_sb, where, err); + return err; + } + #define ext3_journal_stop(handle, inode) \ + __ext3_journal_stop(__FUNCTION__, (handle), (inode)) + + static inline handle_t *ext3_journal_current_handle(void) + { + return journal_current_handle(); + } + + static inline void + ext3_log_start_commit(journal_t *journal, transaction_t *transaction) + { + log_start_commit(journal, transaction); + } + + static inline void ext3_log_wait_commit(journal_t *journal, tid_t tid) + { + log_wait_commit(journal, tid); + } + + static inline int ext3_journal_extend(handle_t *handle, int nblocks) + { + return journal_extend(handle, nblocks); + } + + static inline int ext3_journal_restart(handle_t *handle, int nblocks) + { + return journal_restart(handle, nblocks); + } + + static inline int ext3_journal_blocks_per_page(struct inode *inode) + { + return journal_blocks_per_page(inode); + } + + static inline int ext3_journal_force_commit(journal_t *journal) + { + return journal_force_commit(journal); + } + + /* super.c */ + int ext3_force_commit(struct super_block *sb); + + static inline int ext3_should_journal_data(struct inode *inode) + { + if (!S_ISREG(inode->i_mode)) + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) + return 1; + if (inode->u.ext3_i.i_flags & EXT3_JOURNAL_DATA_FL) + return 1; + return 0; + } + + static inline int ext3_should_order_data(struct inode *inode) + { + return (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA); + } + + + #endif /* _LINUX_EXT3_JBD_H */ diff -rc2P linux/include/linux/fs.h linux-2.4.13/include/linux/fs.h *** linux/include/linux/fs.h Fri Nov 9 16:15:08 2001 --- linux-2.4.13/include/linux/fs.h Fri Nov 9 16:58:00 2001 *************** *** 22,25 **** --- 22,26 ---- #include #include + #include #include *************** *** 219,222 **** --- 220,224 ---- BH_Wait_IO, /* 1 if we should write out this buffer */ BH_launder, /* 1 if we should throttle on this buffer */ + BH_JBD, /* 1 if it has an attached journal_head */ BH_PrivateStart,/* not a state bit, but the first bit available *************** *** 265,268 **** --- 267,274 ---- struct inode * b_inode; struct list_head b_inode_buffers; /* doubly linked list of inode dirty buffers */ + + #ifdef CONFIG_BUFFER_DEBUG + struct buffer_history b_history; + #endif }; *************** *** 290,293 **** --- 296,300 ---- #include #include + #include #include #include *************** *** 380,387 **** --- 387,400 ---- int (*readpage)(struct file *, struct page *); int (*sync_page)(struct page *); + /* + * ext3 requires that a successful prepare_write() call be followed + * by a commit_write() call - they 
must be balanced + */ int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ int (*bmap)(struct address_space *, long); + int (*flushpage) (struct page *, unsigned long); + int (*releasepage) (struct page *, int); #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); *************** *** 445,448 **** --- 458,462 ---- unsigned long i_version; struct semaphore i_sem; + struct rw_semaphore i_truncate_sem; /* Nests inside i_sem */ struct semaphore i_zombie; struct inode_operations *i_op; *************** *** 474,477 **** --- 488,492 ---- struct minix_inode_info minix_i; struct ext2_inode_info ext2_i; + struct ext3_inode_info ext3_i; struct hpfs_inode_info hpfs_i; struct ntfs_inode_info ntfs_i; *************** *** 662,665 **** --- 677,681 ---- #include #include + #include #include #include *************** *** 718,721 **** --- 734,738 ---- struct minix_sb_info minix_sb; struct ext2_sb_info ext2_sb; + struct ext3_sb_info ext3_sb; struct hpfs_sb_info hpfs_sb; struct ntfs_sb_info ntfs_sb; *************** *** 1091,1094 **** --- 1108,1112 ---- extern int try_to_free_buffers(struct page *, unsigned int); extern void refile_buffer(struct buffer_head * buf); + extern void create_empty_buffers(struct page *, kdev_t, unsigned long); extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate); *************** *** 1132,1135 **** --- 1150,1157 ---- static inline void mark_buffer_clean(struct buffer_head * bh) { + #if defined(CONFIG_JBD_DEBUG) + extern void jbd_preclean_buffer_check(struct buffer_head *); + jbd_preclean_buffer_check(bh); /* @@@ Expensive debugging */ + #endif if (atomic_set_buffer_clean(bh)) __mark_buffer_clean(bh); *************** *** 1173,1176 **** --- 1195,1199 ---- } + extern void set_buffer_flushtime(struct buffer_head *); extern void balance_dirty(void); extern int check_disk_change(kdev_t); *************** *** 1352,1355 **** --- 1375,1380 ---- extern struct buffer_head * bread(kdev_t, int, int); extern void wakeup_bdflush(void); + extern void put_unused_buffer_head(struct buffer_head * bh); + extern struct buffer_head * get_unused_buffer_head(int async); extern int brw_page(int, struct page *, kdev_t, int [], int); *************** *** 1358,1361 **** --- 1383,1387 ---- /* Generic buffer handling for block filesystems.. */ + extern int try_to_release_page(struct page * page, int gfp_mask); extern int discard_bh_page(struct page *, unsigned long, int); #define block_flushpage(page, offset) discard_bh_page(page, offset, 1) diff -rc2P linux/include/linux/fs.h.orig linux-2.4.13/include/linux/fs.h.orig *** linux/include/linux/fs.h.orig Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/fs.h.orig Fri Nov 9 16:15:08 2001 *************** *** 0 **** --- 1,1569 ---- + #ifndef _LINUX_FS_H + #define _LINUX_FS_H + + /* + * This file has definitions for some important file table + * structures etc. 
+ */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + + struct poll_table_struct; + + + /* + * It's silly to have NR_OPEN bigger than NR_FILE, but you can change + * the file limit at runtime and only root can increase the per-process + * nr_file rlimit, so it's safe to set up a ridiculously high absolute + * upper limit on files-per-process. + * + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. + */ + + /* Fixed constants first: */ + #undef NR_OPEN + #define NR_OPEN (1024*1024) /* Absolute upper limit on fd num */ + #define INR_OPEN 1024 /* Initial setting for nfile rlimits */ + + #define BLOCK_SIZE_BITS 10 + #define BLOCK_SIZE (1<i_sb->s_flags & (flg)) + + #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY) + #define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC)) + #define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK) + + #define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA) + #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) + #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) + #define IS_IMMUTABLE_FILE(inode) ((inode)->i_flags & S_IMMUTABLE_FILE) + #define IS_IMMUTABLE_LINK(inode) ((((inode)->i_flags & S_IMMUTABLE_FILE) << 3) ^ ((inode)->i_flags & S_IMMUTABLE_LINK) ) + #define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME)) + #define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME) + + #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) + + /* the read-only stuff doesn't really belong here, but any other place is + probably as bad and I don't want to create yet another include file. */ + + #define BLKROSET _IO(0x12,93) /* set device read-only (0 = read-write) */ + #define BLKROGET _IO(0x12,94) /* get read-only status (0 = read_write) */ + #define BLKRRPART _IO(0x12,95) /* re-read partition table */ + #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ + #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ + #define BLKRASET _IO(0x12,98) /* Set read ahead for block device */ + #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ + #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ + #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ + #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ + #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ + #define BLKSSZGET _IO(0x12,104)/* get block device sector size */ + #if 0 + #define BLKPG _IO(0x12,105)/* See blkpg.h */ + #define BLKELVGET _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))/* elevator get */ + #define BLKELVSET _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))/* elevator set */ + /* This was here just to show that the number is taken - + probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */ + #endif + /* A jump here: 108-111 have been used for various private purposes. 
*/ + #define BLKBSZGET _IOR(0x12,112,sizeof(int)) + #define BLKBSZSET _IOW(0x12,113,sizeof(int)) + #define BLKGETSIZE64 _IOR(0x12,114,sizeof(u64)) /* return device size in bytes (u64 *arg) */ + + #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ + #define FIBMAP _IO(0x00,1) /* bmap access */ + #define FIGETBSZ _IO(0x00,2) /* get the block size used for bmap */ + + #ifdef __KERNEL__ + + #include + #include + + extern void update_atime (struct inode *); + #define UPDATE_ATIME(inode) update_atime (inode) + + extern void buffer_init(unsigned long); + extern void inode_init(unsigned long); + extern void mnt_init(unsigned long); + + /* bh state bits */ + enum bh_state_bits { + BH_Uptodate, /* 1 if the buffer contains valid data */ + BH_Dirty, /* 1 if the buffer is dirty */ + BH_Lock, /* 1 if the buffer is locked */ + BH_Req, /* 0 if the buffer has been invalidated */ + BH_Mapped, /* 1 if the buffer has a disk mapping */ + BH_New, /* 1 if the buffer is new and not yet written out */ + BH_Async, /* 1 if the buffer is under end_buffer_io_async I/O */ + BH_Wait_IO, /* 1 if we should write out this buffer */ + BH_launder, /* 1 if we should throttle on this buffer */ + + BH_PrivateStart,/* not a state bit, but the first bit available + * for private allocation by other entities + */ + }; + + /* + * Try to keep the most commonly used fields in single cache lines (16 + * bytes) to improve performance. This ordering should be + * particularly beneficial on 32-bit processors. + * + * We use the first 16 bytes for the data which is used in searches + * over the block hash lists (ie. getblk() and friends). + * + * The second 16 bytes we use for lru buffer scans, as used by + * sync_buffers() and refill_freelist(). -- sct + */ + struct buffer_head { + /* First cache line: */ + struct buffer_head *b_next; /* Hash queue list */ + unsigned long b_blocknr; /* block number */ + unsigned short b_size; /* block size */ + unsigned short b_list; /* List that this buffer appears */ + kdev_t b_dev; /* device (B_FREE = free) */ + + atomic_t b_count; /* users using this block */ + kdev_t b_rdev; /* Real device */ + unsigned long b_state; /* buffer state bitmap (see above) */ + unsigned long b_flushtime; /* Time when (dirty) buffer should be written */ + + struct buffer_head *b_next_free;/* lru/free list linkage */ + struct buffer_head *b_prev_free;/* doubly linked list of buffers */ + struct buffer_head *b_this_page;/* circular list of buffers in one page */ + struct buffer_head *b_reqnext; /* request queue */ + + struct buffer_head **b_pprev; /* doubly linked list of hash-queue */ + char * b_data; /* pointer to data block */ + struct page *b_page; /* the page this bh is mapped to */ + void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ + void *b_private; /* reserved for b_end_io */ + + unsigned long b_rsector; /* Real buffer location on disk */ + wait_queue_head_t b_wait; + + struct inode * b_inode; + struct list_head b_inode_buffers; /* doubly linked list of inode dirty buffers */ + }; + + typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); + void init_buffer(struct buffer_head *, bh_end_io_t *, void *); + + #define __buffer_state(bh, state) (((bh)->b_state & (1UL << BH_##state)) != 0) + + #define buffer_uptodate(bh) __buffer_state(bh,Uptodate) + #define buffer_dirty(bh) __buffer_state(bh,Dirty) + #define buffer_locked(bh) __buffer_state(bh,Lock) + #define buffer_req(bh) __buffer_state(bh,Req) + #define buffer_mapped(bh) __buffer_state(bh,Mapped) + #define 
buffer_new(bh) __buffer_state(bh,New) + #define buffer_async(bh) __buffer_state(bh,Async) + + #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) + + extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset); + + #define touch_buffer(bh) mark_page_accessed(bh->b_page) + + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + /* + * Attribute flags. These should be or-ed together to figure out what + * has been changed! + */ + #define ATTR_MODE 1 + #define ATTR_UID 2 + #define ATTR_GID 4 + #define ATTR_SIZE 8 + #define ATTR_ATIME 16 + #define ATTR_MTIME 32 + #define ATTR_CTIME 64 + #define ATTR_ATIME_SET 128 + #define ATTR_MTIME_SET 256 + #define ATTR_FORCE 512 /* Not a change, but a change it */ + #define ATTR_ATTR_FLAG 1024 + + /* + * This is the Inode Attributes structure, used for notify_change(). It + * uses the above definitions as flags, to know which values have changed. + * Also, in this manner, a Filesystem can look at only the values it cares + * about. Basically, these are the attributes that the VFS layer can + * request to change from the FS layer. + * + * Derek Atkins 94-10-20 + */ + struct iattr { + unsigned int ia_valid; + umode_t ia_mode; + uid_t ia_uid; + gid_t ia_gid; + loff_t ia_size; + time_t ia_atime; + time_t ia_mtime; + time_t ia_ctime; + unsigned int ia_attr_flags; + }; + + /* + * This is the inode attributes flag definitions + */ + #define ATTR_FLAG_SYNCRONOUS 1 /* Syncronous write */ + #define ATTR_FLAG_NOATIME 2 /* Don't update atime */ + #define ATTR_FLAG_APPEND 4 /* Append-only file */ + #define ATTR_FLAG_IMMUTABLE_FILE 8 /* Immutable file */ + #define ATTR_FLAG_NODIRATIME 16 /* Don't update atime for directory */ + #define ATTR_FLAG_IMMUTABLE_LINK 32 /* Immutable file */ + + /* + * Includes for diskquotas and mount structures. + */ + #include + #include + + /* + * oh the beauties of C type declarations. + */ + struct page; + struct address_space; + struct kiobuf; + + struct address_space_operations { + int (*writepage)(struct page *); + int (*readpage)(struct file *, struct page *); + int (*sync_page)(struct page *); + int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); + int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + /* Unfortunately this kludge is needed for FIBMAP. 
Don't use it */ + int (*bmap)(struct address_space *, long); + #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ + int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); + }; + + struct address_space { + struct list_head clean_pages; /* list of clean pages */ + struct list_head dirty_pages; /* list of dirty pages */ + struct list_head locked_pages; /* list of locked pages */ + unsigned long nrpages; /* number of total pages */ + struct address_space_operations *a_ops; /* methods */ + struct inode *host; /* owner: inode, block_device */ + struct vm_area_struct *i_mmap; /* list of private mappings */ + struct vm_area_struct *i_mmap_shared; /* list of shared mappings */ + spinlock_t i_shared_lock; /* and spinlock protecting it */ + int gfp_mask; /* how to allocate the pages */ + }; + + struct char_device { + struct list_head hash; + atomic_t count; + dev_t dev; + atomic_t openers; + struct semaphore sem; + }; + + struct block_device { + struct list_head bd_hash; + atomic_t bd_count; + struct inode * bd_inode; + dev_t bd_dev; /* not a kdev_t - it's a search key */ + int bd_openers; + const struct block_device_operations *bd_op; + struct semaphore bd_sem; /* open/close mutex */ + struct list_head bd_inodes; + }; + + struct inode { + struct list_head i_hash; + struct list_head i_list; + struct list_head i_dentry; + + struct list_head i_dirty_buffers; + struct list_head i_dirty_data_buffers; + + unsigned long i_ino; + atomic_t i_count; + kdev_t i_dev; + umode_t i_mode; + nlink_t i_nlink; + uid_t i_uid; + gid_t i_gid; + kdev_t i_rdev; + loff_t i_size; + time_t i_atime; + time_t i_mtime; + time_t i_ctime; + unsigned int i_blkbits; + unsigned long i_blksize; + unsigned long i_blocks; + unsigned long i_version; + struct semaphore i_sem; + struct semaphore i_zombie; + struct inode_operations *i_op; + struct file_operations *i_fop; /* former ->i_op->default_file_ops */ + struct super_block *i_sb; + wait_queue_head_t i_wait; + struct file_lock *i_flock; + struct address_space *i_mapping; + struct address_space i_data; + struct dquot *i_dquot[MAXQUOTAS]; + /* These three should probably be a union */ + struct list_head i_devices; + struct pipe_inode_info *i_pipe; + struct block_device *i_bdev; + struct char_device *i_cdev; + + unsigned long i_dnotify_mask; /* Directory notify events */ + struct dnotify_struct *i_dnotify; /* for directory notifications */ + + unsigned long i_state; + + unsigned int i_flags; + unsigned char i_sock; + + atomic_t i_writecount; + unsigned int i_attr_flags; + __u32 i_generation; + union { + struct minix_inode_info minix_i; + struct ext2_inode_info ext2_i; + struct hpfs_inode_info hpfs_i; + struct ntfs_inode_info ntfs_i; + struct msdos_inode_info msdos_i; + struct umsdos_inode_info umsdos_i; + struct iso_inode_info isofs_i; + struct nfs_inode_info nfs_i; + struct sysv_inode_info sysv_i; + struct affs_inode_info affs_i; + struct ufs_inode_info ufs_i; + struct efs_inode_info efs_i; + struct romfs_inode_info romfs_i; + struct shmem_inode_info shmem_i; + struct coda_inode_info coda_i; + struct smb_inode_info smbfs_i; + struct hfs_inode_info hfs_i; + struct adfs_inode_info adfs_i; + struct qnx4_inode_info qnx4_i; + struct reiserfs_inode_info reiserfs_i; + struct bfs_inode_info bfs_i; + struct udf_inode_info udf_i; + struct ncp_inode_info ncpfs_i; + struct proc_inode_info proc_i; + struct socket socket_i; + struct usbdev_inode_info usbdev_i; + struct jffs2_inode_info jffs2_i; + void *generic_ip; + } u; + }; + + struct fown_struct { + int 
pid; /* pid or -pgrp where SIGIO should be sent */ + uid_t uid, euid; /* uid/euid of process setting the owner */ + int signum; /* posix.1b rt signal to be delivered on IO */ + }; + + struct file { + struct list_head f_list; + struct dentry *f_dentry; + struct vfsmount *f_vfsmnt; + struct file_operations *f_op; + atomic_t f_count; + unsigned int f_flags; + mode_t f_mode; + loff_t f_pos; + unsigned long f_reada, f_ramax, f_raend, f_ralen, f_rawin; + struct fown_struct f_owner; + unsigned int f_uid, f_gid; + int f_error; + + unsigned long f_version; + + /* needed for tty driver, and maybe others */ + void *private_data; + + /* preallocated helper kiobuf to speedup O_DIRECT */ + struct kiobuf *f_iobuf; + long f_iobuf_lock; + }; + extern spinlock_t files_lock; + #define file_list_lock() spin_lock(&files_lock); + #define file_list_unlock() spin_unlock(&files_lock); + + #define get_file(x) atomic_inc(&(x)->f_count) + #define file_count(x) atomic_read(&(x)->f_count) + + extern int init_private_file(struct file *, struct dentry *, int); + + #define MAX_NON_LFS ((1UL<<31) - 1) + + #define FL_POSIX 1 + #define FL_FLOCK 2 + #define FL_BROKEN 4 /* broken flock() emulation */ + #define FL_ACCESS 8 /* for processes suspended by mandatory locking */ + #define FL_LOCKD 16 /* lock held by rpc.lockd */ + #define FL_LEASE 32 /* lease held on this file */ + + /* + * The POSIX file lock owner is determined by + * the "struct files_struct" in the thread group + * (or NULL for no owner - BSD locks). + * + * Lockd stuffs a "host" pointer into this. + */ + typedef struct files_struct *fl_owner_t; + + struct file_lock { + struct file_lock *fl_next; /* singly linked list for this inode */ + struct list_head fl_link; /* doubly linked list of all locks */ + struct list_head fl_block; /* circular list of blocked processes */ + fl_owner_t fl_owner; + unsigned int fl_pid; + wait_queue_head_t fl_wait; + struct file *fl_file; + unsigned char fl_flags; + unsigned char fl_type; + loff_t fl_start; + loff_t fl_end; + + void (*fl_notify)(struct file_lock *); /* unblock callback */ + void (*fl_insert)(struct file_lock *); /* lock insertion callback */ + void (*fl_remove)(struct file_lock *); /* lock removal callback */ + + struct fasync_struct * fl_fasync; /* for lease break notifications */ + + union { + struct nfs_lock_info nfs_fl; + } fl_u; + }; + + /* The following constant reflects the upper bound of the file/locking space */ + #ifndef OFFSET_MAX + #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) + #define OFFSET_MAX INT_LIMIT(loff_t) + #define OFFT_OFFSET_MAX INT_LIMIT(off_t) + #endif + + extern struct list_head file_lock_list; + + #include + + extern int fcntl_getlk(unsigned int, struct flock *); + extern int fcntl_setlk(unsigned int, unsigned int, struct flock *); + + extern int fcntl_getlk64(unsigned int, struct flock64 *); + extern int fcntl_setlk64(unsigned int, unsigned int, struct flock64 *); + + /* fs/locks.c */ + extern void locks_init_lock(struct file_lock *); + extern void locks_copy_lock(struct file_lock *, struct file_lock *); + extern void locks_remove_posix(struct file *, fl_owner_t); + extern void locks_remove_flock(struct file *); + extern struct file_lock *posix_test_lock(struct file *, struct file_lock *); + extern int posix_lock_file(struct file *, struct file_lock *, unsigned int); + extern void posix_block_lock(struct file_lock *, struct file_lock *); + extern void posix_unblock_lock(struct file_lock *); + extern int posix_locks_deadlock(struct file_lock *, struct file_lock *); + extern int 
__get_lease(struct inode *inode, unsigned int flags); + extern time_t lease_get_mtime(struct inode *); + extern int lock_may_read(struct inode *, loff_t start, unsigned long count); + extern int lock_may_write(struct inode *, loff_t start, unsigned long count); + + struct fasync_struct { + int magic; + int fa_fd; + struct fasync_struct *fa_next; /* singly linked list */ + struct file *fa_file; + }; + + #define FASYNC_MAGIC 0x4601 + + /* SMP safe fasync helpers: */ + extern int fasync_helper(int, struct file *, int, struct fasync_struct **); + /* can be called from interrupts */ + extern void kill_fasync(struct fasync_struct **, int, int); + /* only for net: no internal synchronization */ + extern void __kill_fasync(struct fasync_struct *, int, int); + + struct nameidata { + struct dentry *dentry; + struct vfsmount *mnt; + struct qstr last; + unsigned int flags; + int last_type; + }; + + #define DQUOT_USR_ENABLED 0x01 /* User diskquotas enabled */ + #define DQUOT_GRP_ENABLED 0x02 /* Group diskquotas enabled */ + + struct quota_mount_options + { + unsigned int flags; /* Flags for diskquotas on this device */ + struct semaphore dqio_sem; /* lock device while I/O in progress */ + struct semaphore dqoff_sem; /* serialize quota_off() and quota_on() on device */ + struct file *files[MAXQUOTAS]; /* fp's to quotafiles */ + time_t inode_expire[MAXQUOTAS]; /* expiretime for inode-quota */ + time_t block_expire[MAXQUOTAS]; /* expiretime for block-quota */ + char rsquash[MAXQUOTAS]; /* for quotas threat root as any other user */ + }; + + /* + * Umount options + */ + + #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ + #define MNT_DETACH 0x00000002 /* Just detach from the tree */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + extern struct list_head super_blocks; + extern spinlock_t sb_lock; + + #define sb_entry(list) list_entry((list), struct super_block, s_list) + #define S_BIAS (1<<30) + struct super_block { + struct list_head s_list; /* Keep this first */ + kdev_t s_dev; + unsigned long s_blocksize; + unsigned char s_blocksize_bits; + unsigned char s_dirt; + unsigned long long s_maxbytes; /* Max file size */ + struct file_system_type *s_type; + struct super_operations *s_op; + struct dquot_operations *dq_op; + unsigned long s_flags; + unsigned long s_magic; + struct dentry *s_root; + struct rw_semaphore s_umount; + struct semaphore s_lock; + int s_count; + atomic_t s_active; + + struct list_head s_dirty; /* dirty inodes */ + struct list_head s_locked_inodes;/* inodes being synced */ + struct list_head s_files; + + struct block_device *s_bdev; + struct list_head s_instances; + struct quota_mount_options s_dquot; /* Diskquota specific options */ + + union { + struct minix_sb_info minix_sb; + struct ext2_sb_info ext2_sb; + struct hpfs_sb_info hpfs_sb; + struct ntfs_sb_info ntfs_sb; + struct msdos_sb_info msdos_sb; + struct isofs_sb_info isofs_sb; + struct nfs_sb_info nfs_sb; + struct sysv_sb_info sysv_sb; + struct affs_sb_info affs_sb; + struct ufs_sb_info ufs_sb; + struct efs_sb_info efs_sb; + struct shmem_sb_info shmem_sb; + struct romfs_sb_info romfs_sb; + struct smb_sb_info smbfs_sb; + struct hfs_sb_info hfs_sb; + struct adfs_sb_info adfs_sb; + struct qnx4_sb_info qnx4_sb; + struct reiserfs_sb_info reiserfs_sb; + struct bfs_sb_info bfs_sb; + struct udf_sb_info udf_sb; + struct 
ncp_sb_info ncpfs_sb; + struct usbdev_sb_info usbdevfs_sb; + struct jffs2_sb_info jffs2_sb; + struct cramfs_sb_info cramfs_sb; + void *generic_sbp; + } u; + /* + * The next field is for VFS *only*. No filesystems have any business + * even looking at it. You had been warned. + */ + struct semaphore s_vfs_rename_sem; /* Kludge */ + + /* The next field is used by knfsd when converting a (inode number based) + * file handle into a dentry. As it builds a path in the dcache tree from + * the bottom up, there may for a time be a subpath of dentrys which is not + * connected to the main tree. This semaphore ensure that there is only ever + * one such free path per filesystem. Note that unconnected files (or other + * non-directories) are allowed, but not unconnected diretories. + */ + struct semaphore s_nfsd_free_path_sem; + }; + + /* + * VFS helper functions.. + */ + extern int vfs_create(struct inode *, struct dentry *, int); + extern int vfs_mkdir(struct inode *, struct dentry *, int); + extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t); + extern int vfs_symlink(struct inode *, struct dentry *, const char *); + extern int vfs_link(struct dentry *, struct inode *, struct dentry *); + extern int vfs_rmdir(struct inode *, struct dentry *); + extern int vfs_unlink(struct inode *, struct dentry *); + extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *); + + /* + * File types + */ + #define DT_UNKNOWN 0 + #define DT_FIFO 1 + #define DT_CHR 2 + #define DT_DIR 4 + #define DT_BLK 6 + #define DT_REG 8 + #define DT_LNK 10 + #define DT_SOCK 12 + #define DT_WHT 14 + + /* + * This is the "filldir" function type, used by readdir() to let + * the kernel specify what kind of dirent layout it wants to have. + * This allows the kernel to read directories into kernel space or + * to have different dirent layouts depending on the binary type. + */ + typedef int (*filldir_t)(void *, const char *, int, loff_t, ino_t, unsigned); + + struct block_device_operations { + int (*open) (struct inode *, struct file *); + int (*release) (struct inode *, struct file *); + int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long); + int (*check_media_change) (kdev_t); + int (*revalidate) (kdev_t); + }; + + /* + * NOTE: + * read, write, poll, fsync, readv, writev can be called + * without the big kernel lock held in all filesystems. 
+ */ + struct file_operations { + struct module *owner; + loff_t (*llseek) (struct file *, loff_t, int); + ssize_t (*read) (struct file *, char *, size_t, loff_t *); + ssize_t (*write) (struct file *, const char *, size_t, loff_t *); + int (*readdir) (struct file *, void *, filldir_t); + unsigned int (*poll) (struct file *, struct poll_table_struct *); + int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long); + int (*mmap) (struct file *, struct vm_area_struct *); + int (*open) (struct inode *, struct file *); + int (*flush) (struct file *); + int (*release) (struct inode *, struct file *); + int (*fsync) (struct file *, struct dentry *, int datasync); + int (*fasync) (int, struct file *, int); + int (*lock) (struct file *, int, struct file_lock *); + ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *); + ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *); + ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int); + unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + }; + + struct inode_operations { + int (*create) (struct inode *,struct dentry *,int); + struct dentry * (*lookup) (struct inode *,struct dentry *); + int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*unlink) (struct inode *,struct dentry *); + int (*symlink) (struct inode *,struct dentry *,const char *); + int (*mkdir) (struct inode *,struct dentry *,int); + int (*rmdir) (struct inode *,struct dentry *); + int (*mknod) (struct inode *,struct dentry *,int,int); + int (*rename) (struct inode *, struct dentry *, + struct inode *, struct dentry *); + int (*readlink) (struct dentry *, char *,int); + int (*follow_link) (struct dentry *, struct nameidata *); + void (*truncate) (struct inode *); + int (*permission) (struct inode *, int); + int (*revalidate) (struct dentry *); + int (*setattr) (struct dentry *, struct iattr *); + int (*getattr) (struct dentry *, struct iattr *); + }; + + /* + * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called + * without the big kernel lock held in all filesystems. + */ + struct super_operations { + void (*read_inode) (struct inode *); + + /* reiserfs kludge. reiserfs needs 64 bits of information to + ** find an inode. We are using the read_inode2 call to get + ** that information. We don't like this, and are waiting on some + ** VFS changes for the real solution. + ** iget4 calls read_inode2, iff it is defined + */ + void (*read_inode2) (struct inode *, void *) ; + void (*dirty_inode) (struct inode *); + void (*write_inode) (struct inode *, int); + void (*put_inode) (struct inode *); + void (*delete_inode) (struct inode *); + void (*put_super) (struct super_block *); + void (*write_super) (struct super_block *); + void (*write_super_lockfs) (struct super_block *); + void (*unlockfs) (struct super_block *); + int (*statfs) (struct super_block *, struct statfs *); + int (*remount_fs) (struct super_block *, int *, char *); + void (*clear_inode) (struct inode *); + void (*umount_begin) (struct super_block *); + + /* Following are for knfsd to interact with "interesting" filesystems + * Currently just reiserfs, but possibly FAT and others later + * + * fh_to_dentry is given a filehandle fragement with length, and a type flag + * and must return a dentry for the referenced object or, if "parent" is + * set, a dentry for the parent of the object. 
+ * If a dentry cannot be found, a "root" dentry should be created and + * flaged as DCACHE_NFSD_DISCONNECTED. nfsd_iget is an example implementation. + * + * dentry_to_fh is given a dentry and must generate the filesys specific + * part of the file handle. Available length is passed in *lenp and used + * length should be returned therein. + * If need_parent is set, then dentry_to_fh should encode sufficient information + * to find the (current) parent. + * dentry_to_fh should return a 1byte "type" which will be passed back in + * the fhtype arguement to fh_to_dentry. Type of 0 is reserved. + * If filesystem was exportable before the introduction of fh_to_dentry, + * types 1 and 2 should be used is that same way as the generic code. + * Type 255 means error. + * + * Lengths are in units of 4bytes, not bytes. + */ + struct dentry * (*fh_to_dentry)(struct super_block *sb, __u32 *fh, int len, int fhtype, int parent); + int (*dentry_to_fh)(struct dentry *, __u32 *fh, int *lenp, int need_parent); + }; + + /* Inode state bits.. */ + #define I_DIRTY_SYNC 1 /* Not dirty enough for O_DATASYNC */ + #define I_DIRTY_DATASYNC 2 /* Data-related inode changes pending */ + #define I_DIRTY_PAGES 4 /* Data-related inode changes pending */ + #define I_LOCK 8 + #define I_FREEING 16 + #define I_CLEAR 32 + + #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) + + extern void __mark_inode_dirty(struct inode *, int); + static inline void mark_inode_dirty(struct inode *inode) + { + __mark_inode_dirty(inode, I_DIRTY); + } + + static inline void mark_inode_dirty_sync(struct inode *inode) + { + __mark_inode_dirty(inode, I_DIRTY_SYNC); + } + + static inline void mark_inode_dirty_pages(struct inode *inode) + { + __mark_inode_dirty(inode, I_DIRTY_PAGES); + } + + struct dquot_operations { + void (*initialize) (struct inode *, short); + void (*drop) (struct inode *); + int (*alloc_block) (struct inode *, unsigned long, char); + int (*alloc_inode) (const struct inode *, unsigned long); + void (*free_block) (struct inode *, unsigned long); + void (*free_inode) (const struct inode *, unsigned long); + int (*transfer) (struct inode *, struct iattr *); + }; + + struct file_system_type { + const char *name; + int fs_flags; + struct super_block *(*read_super) (struct super_block *, void *, int); + struct module *owner; + struct file_system_type * next; + struct list_head fs_supers; + }; + + #define DECLARE_FSTYPE(var,type,read,flags) \ + struct file_system_type var = { \ + name: type, \ + read_super: read, \ + fs_flags: flags, \ + owner: THIS_MODULE, \ + } + + #define DECLARE_FSTYPE_DEV(var,type,read) \ + DECLARE_FSTYPE(var,type,read,FS_REQUIRES_DEV) + + /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ + #define fops_get(fops) \ + (((fops) && (fops)->owner) \ + ? ( try_inc_mod_count((fops)->owner) ? 
(fops) : NULL ) \ + : (fops)) + + #define fops_put(fops) \ + do { \ + if ((fops) && (fops)->owner) \ + __MOD_DEC_USE_COUNT((fops)->owner); \ + } while(0) + + extern int register_filesystem(struct file_system_type *); + extern int unregister_filesystem(struct file_system_type *); + extern struct vfsmount *kern_mount(struct file_system_type *); + extern int may_umount(struct vfsmount *); + extern long do_mount(char *, char *, char *, unsigned long, void *); + + #define kern_umount mntput + + extern int vfs_statfs(struct super_block *, struct statfs *); + + /* Return value for VFS lock functions - tells locks.c to lock conventionally + * REALLY kosha for root NFS and nfs_lock + */ + #define LOCK_USE_CLNT 1 + + #define FLOCK_VERIFY_READ 1 + #define FLOCK_VERIFY_WRITE 2 + + extern int locks_mandatory_locked(struct inode *); + extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t); + + /* + * Candidates for mandatory locking have the setgid bit set + * but no group execute bit - an otherwise meaningless combination. + */ + #define MANDATORY_LOCK(inode) \ + (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) + + static inline int locks_verify_locked(struct inode *inode) + { + if (MANDATORY_LOCK(inode)) + return locks_mandatory_locked(inode); + return 0; + } + + static inline int locks_verify_area(int read_write, struct inode *inode, + struct file *filp, loff_t offset, + size_t count) + { + if (inode->i_flock && MANDATORY_LOCK(inode)) + return locks_mandatory_area(read_write, inode, filp, offset, count); + return 0; + } + + static inline int locks_verify_truncate(struct inode *inode, + struct file *filp, + loff_t size) + { + if (inode->i_flock && MANDATORY_LOCK(inode)) + return locks_mandatory_area( + FLOCK_VERIFY_WRITE, inode, filp, + size < inode->i_size ? size : inode->i_size, + (size < inode->i_size ? 
inode->i_size - size + : size - inode->i_size) + ); + return 0; + } + + static inline int get_lease(struct inode *inode, unsigned int mode) + { + if (inode->i_flock && (inode->i_flock->fl_flags & FL_LEASE)) + return __get_lease(inode, mode); + return 0; + } + + /* fs/open.c */ + + asmlinkage long sys_open(const char *, int, int); + asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */ + extern int do_truncate(struct dentry *, loff_t start); + + extern struct file *filp_open(const char *, int, int); + extern struct file * dentry_open(struct dentry *, struct vfsmount *, int); + extern int filp_close(struct file *, fl_owner_t id); + extern char * getname(const char *); + + /* fs/dcache.c */ + extern void vfs_caches_init(unsigned long); + + #define __getname() kmem_cache_alloc(names_cachep, SLAB_KERNEL) + #define putname(name) kmem_cache_free(names_cachep, (void *)(name)) + + enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW}; + extern int register_blkdev(unsigned int, const char *, struct block_device_operations *); + extern int unregister_blkdev(unsigned int, const char *); + extern struct block_device *bdget(dev_t); + extern int bd_acquire(struct inode *inode); + extern void bd_forget(struct inode *inode); + extern void bdput(struct block_device *); + extern struct char_device *cdget(dev_t); + extern void cdput(struct char_device *); + extern int blkdev_open(struct inode *, struct file *); + extern int blkdev_close(struct inode *, struct file *); + extern struct file_operations def_blk_fops; + extern struct address_space_operations def_blk_aops; + extern struct file_operations def_fifo_fops; + extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long); + extern int blkdev_get(struct block_device *, mode_t, unsigned, int); + extern int blkdev_put(struct block_device *, int); + + /* fs/devices.c */ + extern const struct block_device_operations *get_blkfops(unsigned int); + extern int register_chrdev(unsigned int, const char *, struct file_operations *); + extern int unregister_chrdev(unsigned int, const char *); + extern int chrdev_open(struct inode *, struct file *); + extern const char * bdevname(kdev_t); + extern const char * cdevname(kdev_t); + extern const char * kdevname(kdev_t); + extern void init_special_inode(struct inode *, umode_t, int); + + /* Invalid inode operations -- fs/bad_inode.c */ + extern void make_bad_inode(struct inode *); + extern int is_bad_inode(struct inode *); + + extern struct file_operations read_fifo_fops; + extern struct file_operations write_fifo_fops; + extern struct file_operations rdwr_fifo_fops; + extern struct file_operations read_pipe_fops; + extern struct file_operations write_pipe_fops; + extern struct file_operations rdwr_pipe_fops; + + extern int fs_may_remount_ro(struct super_block *); + + extern int try_to_free_buffers(struct page *, unsigned int); + extern void refile_buffer(struct buffer_head * buf); + extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate); + + /* reiserfs_writepage needs this */ + extern void set_buffer_async_io(struct buffer_head *bh) ; + + #define BUF_CLEAN 0 + #define BUF_LOCKED 1 /* Buffers scheduled for write */ + #define BUF_DIRTY 2 /* Dirty buffers, not yet scheduled for write */ + #define NR_LIST 3 + + static inline void get_bh(struct buffer_head * bh) + { + atomic_inc(&(bh)->b_count); + } + + static inline void put_bh(struct buffer_head *bh) + { + smp_mb__before_atomic_dec(); + atomic_dec(&bh->b_count); + } + + /* + * This is called by bh->b_end_io() handlers when I/O has 
completed. + */ + static inline void mark_buffer_uptodate(struct buffer_head * bh, int on) + { + if (on) + set_bit(BH_Uptodate, &bh->b_state); + else + clear_bit(BH_Uptodate, &bh->b_state); + } + + #define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state) + + static inline void __mark_buffer_clean(struct buffer_head *bh) + { + refile_buffer(bh); + } + + static inline void mark_buffer_clean(struct buffer_head * bh) + { + if (atomic_set_buffer_clean(bh)) + __mark_buffer_clean(bh); + } + + extern void FASTCALL(__mark_dirty(struct buffer_head *bh)); + extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh)); + extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh)); + extern void FASTCALL(buffer_insert_inode_data_queue(struct buffer_head *, struct inode *)); + + #define atomic_set_buffer_dirty(bh) test_and_set_bit(BH_Dirty, &(bh)->b_state) + + static inline void mark_buffer_async(struct buffer_head * bh, int on) + { + if (on) + set_bit(BH_Async, &bh->b_state); + else + clear_bit(BH_Async, &bh->b_state); + } + + /* + * If an error happens during the make_request, this function + * has to be recalled. It marks the buffer as clean and not + * uptodate, and it notifys the upper layer about the end + * of the I/O. + */ + static inline void buffer_IO_error(struct buffer_head * bh) + { + mark_buffer_clean(bh); + /* + * b_end_io has to clear the BH_Uptodate bitflag in the error case! + */ + bh->b_end_io(bh, 0); + } + + extern void buffer_insert_inode_queue(struct buffer_head *, struct inode *); + static inline void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) + { + mark_buffer_dirty(bh); + buffer_insert_inode_queue(bh, inode); + } + + extern void balance_dirty(void); + extern int check_disk_change(kdev_t); + extern int invalidate_inodes(struct super_block *); + extern int invalidate_device(kdev_t, int); + extern void invalidate_inode_pages(struct inode *); + extern void invalidate_inode_pages2(struct address_space *); + extern void invalidate_inode_buffers(struct inode *); + #define invalidate_buffers(dev) __invalidate_buffers((dev), 0) + #define destroy_buffers(dev) __invalidate_buffers((dev), 1) + extern void invalidate_bdev(struct block_device *, int); + extern void __invalidate_buffers(kdev_t dev, int); + extern void sync_inodes(kdev_t); + extern void sync_unlocked_inodes(void); + extern void write_inode_now(struct inode *, int); + extern int sync_buffers(kdev_t, int); + extern void sync_dev(kdev_t); + extern int fsync_dev(kdev_t); + extern int fsync_super(struct super_block *); + extern int fsync_no_super(kdev_t); + extern void sync_inodes_sb(struct super_block *); + extern int osync_inode_buffers(struct inode *); + extern int osync_inode_data_buffers(struct inode *); + extern int fsync_inode_buffers(struct inode *); + extern int fsync_inode_data_buffers(struct inode *); + extern int inode_has_buffers(struct inode *); + extern void filemap_fdatasync(struct address_space *); + extern void filemap_fdatawait(struct address_space *); + extern void sync_supers(kdev_t); + extern int bmap(struct inode *, int); + extern int notify_change(struct dentry *, struct iattr *); + extern int permission(struct inode *, int); + extern int vfs_permission(struct inode *, int); + extern int get_write_access(struct inode *); + extern int deny_write_access(struct file *); + static inline void put_write_access(struct inode * inode) + { + atomic_dec(&inode->i_writecount); + } + static inline void allow_write_access(struct file *file) + { + if (file) + 
atomic_inc(&file->f_dentry->d_inode->i_writecount); + } + extern int do_pipe(int *); + + extern int open_namei(const char *, int, int, struct nameidata *); + + extern int kernel_read(struct file *, unsigned long, char *, unsigned long); + extern struct file * open_exec(const char *); + + /* fs/dcache.c -- generic fs support functions */ + extern int is_subdir(struct dentry *, struct dentry *); + extern ino_t find_inode_number(struct dentry *, struct qstr *); + + /* + * Kernel pointers have redundant information, so we can use a + * scheme where we can return either an error code or a dentry + * pointer with the same return value. + * + * This should be a per-architecture thing, to allow different + * error and pointer decisions. + */ + static inline void *ERR_PTR(long error) + { + return (void *) error; + } + + static inline long PTR_ERR(const void *ptr) + { + return (long) ptr; + } + + static inline long IS_ERR(const void *ptr) + { + return (unsigned long)ptr > (unsigned long)-1000L; + } + + /* + * The bitmask for a lookup event: + * - follow links at the end + * - require a directory + * - ending slashes ok even for nonexistent files + * - internal "there are more path compnents" flag + */ + #define LOOKUP_FOLLOW (1) + #define LOOKUP_DIRECTORY (2) + #define LOOKUP_CONTINUE (4) + #define LOOKUP_POSITIVE (8) + #define LOOKUP_PARENT (16) + #define LOOKUP_NOALT (32) + /* + * Type of the last component on LOOKUP_PARENT + */ + enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; + + /* + * "descriptor" for what we're up to with a read for sendfile(). + * This allows us to use the same read code yet + * have multiple different users of the data that + * we read from a file. + * + * The simplest case just copies the data to user + * mode. + */ + typedef struct { + size_t written; + size_t count; + char * buf; + int error; + } read_descriptor_t; + + typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); + + /* needed for stackable file system support */ + extern loff_t default_llseek(struct file *file, loff_t offset, int origin); + + extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *)); + extern int FASTCALL(path_walk(const char *, struct nameidata *)); + extern int FASTCALL(link_path_walk(const char *, struct nameidata *)); + extern void path_release(struct nameidata *); + extern int follow_down(struct vfsmount **, struct dentry **); + extern int follow_up(struct vfsmount **, struct dentry **); + extern struct dentry * lookup_one_len(const char *, struct dentry *, int); + extern struct dentry * lookup_hash(struct qstr *, struct dentry *); + #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd) + #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd) + + extern void iput(struct inode *); + extern void force_delete(struct inode *); + extern struct inode * igrab(struct inode *); + extern ino_t iunique(struct super_block *, ino_t); + + typedef int (*find_inode_t)(struct inode *, unsigned long, void *); + extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *); + static inline struct inode *iget(struct super_block *sb, unsigned long ino) + { + return iget4(sb, ino, NULL, NULL); + } + + extern void clear_inode(struct inode *); + extern struct inode * get_empty_inode(void); + + static inline struct inode * new_inode(struct super_block *sb) + { + struct inode *inode = 
get_empty_inode(); + if (inode) { + inode->i_sb = sb; + inode->i_dev = sb->s_dev; + inode->i_blkbits = sb->s_blocksize_bits; + } + return inode; + } + extern void remove_suid(struct inode *inode); + + extern void insert_inode_hash(struct inode *); + extern void remove_inode_hash(struct inode *); + extern struct file * get_empty_filp(void); + extern void file_move(struct file *f, struct list_head *list); + extern struct buffer_head * get_hash_table(kdev_t, int, int); + extern struct buffer_head * getblk(kdev_t, int, int); + extern void ll_rw_block(int, int, struct buffer_head * bh[]); + extern void submit_bh(int, struct buffer_head *); + extern int is_read_only(kdev_t); + extern void __brelse(struct buffer_head *); + static inline void brelse(struct buffer_head *buf) + { + if (buf) + __brelse(buf); + } + extern void __bforget(struct buffer_head *); + static inline void bforget(struct buffer_head *buf) + { + if (buf) + __bforget(buf); + } + extern int set_blocksize(kdev_t, int); + extern struct buffer_head * bread(kdev_t, int, int); + extern void wakeup_bdflush(void); + + extern int brw_page(int, struct page *, kdev_t, int [], int); + + typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int); + + /* Generic buffer handling for block filesystems.. */ + extern int discard_bh_page(struct page *, unsigned long, int); + #define block_flushpage(page, offset) discard_bh_page(page, offset, 1) + #define block_invalidate_page(page) discard_bh_page(page, 0, 0) + extern int block_symlink(struct inode *, const char *, int); + extern int block_write_full_page(struct page*, get_block_t*); + extern int block_read_full_page(struct page*, get_block_t*); + extern int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); + extern int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, + unsigned long *); + extern int block_commit_write(struct page *page, unsigned from, unsigned to); + extern int block_sync_page(struct page *); + + int generic_block_bmap(struct address_space *, long, get_block_t *); + int generic_commit_write(struct file *, struct page *, unsigned, unsigned); + int block_truncate_page(struct address_space *, loff_t, get_block_t *); + extern void create_empty_buffers(struct page *, kdev_t, unsigned long); + + extern int waitfor_one_page(struct page*); + extern int generic_file_mmap(struct file *, struct vm_area_struct *); + extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); + extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *); + extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *); + extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t); + extern loff_t no_llseek(struct file *file, loff_t offset, int origin); + extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin); + extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *); + extern int generic_file_open(struct inode * inode, struct file * filp); + + extern struct file_operations generic_ro_fops; + + extern int vfs_readlink(struct dentry *, char *, int, const char *); + extern int vfs_follow_link(struct nameidata *, const char *); + extern int page_readlink(struct dentry *, char *, int); + extern int page_follow_link(struct dentry *, struct nameidata *); + extern struct inode_operations page_symlink_inode_operations; + + extern int vfs_readdir(struct file *, filldir_t, void *); + extern int 
dcache_readdir(struct file *, void *, filldir_t); + + extern struct file_system_type *get_fs_type(const char *name); + extern struct super_block *get_super(kdev_t); + extern void drop_super(struct super_block *sb); + static inline int is_mounted(kdev_t dev) + { + struct super_block *sb = get_super(dev); + if (sb) { + drop_super(sb); + return 1; + } + return 0; + } + unsigned long generate_cluster(kdev_t, int b[], int); + unsigned long generate_cluster_swab32(kdev_t, int b[], int); + extern kdev_t ROOT_DEV; + extern char root_device_name[]; + + + extern void show_buffers(void); + extern void mount_root(void); + + #ifdef CONFIG_BLK_DEV_INITRD + extern kdev_t real_root_dev; + extern int change_root(kdev_t, const char *); + #endif + + extern ssize_t char_read(struct file *, char *, size_t, loff_t *); + extern ssize_t block_read(struct file *, char *, size_t, loff_t *); + extern int read_ahead[]; + + extern ssize_t char_write(struct file *, const char *, size_t, loff_t *); + extern ssize_t block_write(struct file *, const char *, size_t, loff_t *); + + extern int file_fsync(struct file *, struct dentry *, int); + extern int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx); + extern int generic_osync_inode(struct inode *, int); + #define OSYNC_METADATA (1<<0) + #define OSYNC_DATA (1<<1) + #define OSYNC_INODE (1<<2) + + extern int inode_change_ok(struct inode *, struct iattr *); + extern int inode_setattr(struct inode *, struct iattr *); + + /* + * Common dentry functions for inclusion in the VFS + * or in other stackable file systems. Some of these + * functions were in linux/fs/ C (VFS) files. + * + */ + + /* + * Locking the parent is needed to: + * - serialize directory operations + * - make sure the parent doesn't change from + * under us in the middle of an operation. + * + * NOTE! Right now we'd rather use a "struct inode" + * for this, but as I expect things to move toward + * using dentries instead for most things it is + * probably better to start with the conceptually + * better interface of relying on a path of dentries. + */ + static inline struct dentry *lock_parent(struct dentry *dentry) + { + struct dentry *dir = dget(dentry->d_parent); + + down(&dir->d_inode->i_sem); + return dir; + } + + static inline struct dentry *get_parent(struct dentry *dentry) + { + return dget(dentry->d_parent); + } + + static inline void unlock_dir(struct dentry *dir) + { + up(&dir->d_inode->i_sem); + dput(dir); + } + + /* + * Whee.. Deadlock country. Happily there are only two VFS + * operations that does this.. + */ + static inline void double_down(struct semaphore *s1, struct semaphore *s2) + { + if (s1 != s2) { + if ((unsigned long) s1 < (unsigned long) s2) { + struct semaphore *tmp = s2; + s2 = s1; s1 = tmp; + } + down(s1); + } + down(s2); + } + + /* + * Ewwwwwwww... _triple_ lock. We are guaranteed that the 3rd argument is + * not equal to 1st and not equal to 2nd - the first case (target is parent of + * source) would be already caught, the second is plain impossible (target is + * its own parent and that case would be caught even earlier). Very messy. + * I _think_ that it works, but no warranties - please, look it through. + * Pox on bloody lusers who mandated overwriting rename() for directories... 
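+ * (For the record: however the three semaphores are passed in, the code + * below always ends up taking them in decreasing address order - with + * s1 < s2 < s3 by address the downs happen as s3, s2, s1 - and it is that + * consistent ordering which keeps two concurrent renames from deadlocking + * against each other.)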
+ */ + + static inline void triple_down(struct semaphore *s1, + struct semaphore *s2, + struct semaphore *s3) + { + if (s1 != s2) { + if ((unsigned long) s1 < (unsigned long) s2) { + if ((unsigned long) s1 < (unsigned long) s3) { + struct semaphore *tmp = s3; + s3 = s1; s1 = tmp; + } + if ((unsigned long) s1 < (unsigned long) s2) { + struct semaphore *tmp = s2; + s2 = s1; s1 = tmp; + } + } else { + if ((unsigned long) s1 < (unsigned long) s3) { + struct semaphore *tmp = s3; + s3 = s1; s1 = tmp; + } + if ((unsigned long) s2 < (unsigned long) s3) { + struct semaphore *tmp = s3; + s3 = s2; s2 = tmp; + } + } + down(s1); + } else if ((unsigned long) s2 < (unsigned long) s3) { + struct semaphore *tmp = s3; + s3 = s2; s2 = tmp; + } + down(s2); + down(s3); + } + + static inline void double_up(struct semaphore *s1, struct semaphore *s2) + { + up(s1); + if (s1 != s2) + up(s2); + } + + static inline void triple_up(struct semaphore *s1, + struct semaphore *s2, + struct semaphore *s3) + { + up(s1); + if (s1 != s2) + up(s2); + up(s3); + } + + static inline void double_lock(struct dentry *d1, struct dentry *d2) + { + double_down(&d1->d_inode->i_sem, &d2->d_inode->i_sem); + } + + static inline void double_unlock(struct dentry *d1, struct dentry *d2) + { + double_up(&d1->d_inode->i_sem,&d2->d_inode->i_sem); + dput(d1); + dput(d2); + } + + #endif /* __KERNEL__ */ + + #endif /* _LINUX_FS_H */ diff -rc2P linux/include/linux/jbd.h linux-2.4.13/include/linux/jbd.h *** linux/include/linux/jbd.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/jbd.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,878 ---- + /* + * linux/include/linux/jbd.h + * + * Written by Stephen C. Tweedie + * + * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Definitions for transaction data structures for the buffer cache + * filesystem journaling support. + */ + + #ifndef _LINUX_JBD_H + #define _LINUX_JBD_H + + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || !defined(__KERNEL__) + + /* Allow this file to be included directly into e2fsprogs */ + #ifndef __KERNEL__ + #include "jfs_compat.h" + #define JFS_DEBUG + #define jfs_debug jbd_debug + #else + + #include + #include + #include + #endif + + extern int journal_oom_retry; + + #ifdef CONFIG_JBD_DEBUG + /* + * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal + * consistency checks. By default we don't do this unless + * CONFIG_JBD_DEBUG is on. + */ + #define JBD_EXPENSIVE_CHECKING + + extern int journal_enable_debug; + extern int journal_no_write[2]; + + #define jbd_debug(n, f, a...) \ + do { \ + if ((n) <= journal_enable_debug) { \ + printk (KERN_DEBUG "(%s, %d): %s: ", \ + __FILE__, __LINE__, __FUNCTION__); \ + printk (f, ## a); \ + } \ + } while (0) + #else + #define jbd_debug(f, a...) 
/**/ + #endif + + extern void * __jbd_kmalloc (char *where, size_t size, int flags, int retry); + #define jbd_kmalloc(size, flags) \ + __jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry) + #define jbd_rep_kmalloc(size, flags) \ + __jbd_kmalloc(__FUNCTION__, (size), (flags), 1) + + #define JFS_MIN_JOURNAL_BLOCKS 1024 + + #ifdef __KERNEL__ + typedef struct handle_s handle_t; /* Atomic operation type */ + typedef struct journal_s journal_t; /* Journal control structure */ + #endif + + /* + * Internal structures used by the logging mechanism: + */ + + #define JFS_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */ + + /* + * On-disk structures + */ + + /* + * Descriptor block types: + */ + + #define JFS_DESCRIPTOR_BLOCK 1 + #define JFS_COMMIT_BLOCK 2 + #define JFS_SUPERBLOCK_V1 3 + #define JFS_SUPERBLOCK_V2 4 + #define JFS_REVOKE_BLOCK 5 + + /* + * Standard header for all descriptor blocks: + */ + typedef struct journal_header_s + { + __u32 h_magic; + __u32 h_blocktype; + __u32 h_sequence; + } journal_header_t; + + + /* + * The block tag: used to describe a single buffer in the journal + */ + typedef struct journal_block_tag_s + { + __u32 t_blocknr; /* The on-disk block number */ + __u32 t_flags; /* See below */ + } journal_block_tag_t; + + /* + * The revoke descriptor: used on disk to describe a series of blocks to + * be revoked from the log + */ + typedef struct journal_revoke_header_s + { + journal_header_t r_header; + int r_count; /* Count of bytes used in the block */ + } journal_revoke_header_t; + + + /* Definitions for the journal tag flags word: */ + #define JFS_FLAG_ESCAPE 1 /* on-disk block is escaped */ + #define JFS_FLAG_SAME_UUID 2 /* block has same uuid as previous */ + #define JFS_FLAG_DELETED 4 /* block deleted by this transaction */ + #define JFS_FLAG_LAST_TAG 8 /* last tag in this descriptor block */ + + + /* + * The journal superblock. All fields are in big-endian byte order. + */ + typedef struct journal_superblock_s + { + /* 0x0000 */ + journal_header_t s_header; + + /* 0x000C */ + /* Static information describing the journal */ + __u32 s_blocksize; /* journal device blocksize */ + __u32 s_maxlen; /* total blocks in journal file */ + __u32 s_first; /* first block of log information */ + + /* 0x0018 */ + /* Dynamic information describing the current state of the log */ + __u32 s_sequence; /* first commit ID expected in log */ + __u32 s_start; /* blocknr of start of log */ + + /* 0x0020 */ + /* Error value, as set by journal_abort(). */ + __s32 s_errno; + + /* 0x0024 */ + /* Remaining fields are only valid in a version-2 superblock */ + __u32 s_feature_compat; /* compatible feature set */ + __u32 s_feature_incompat; /* incompatible feature set */ + __u32 s_feature_ro_compat; /* readonly-compatible feature set */ + /* 0x0030 */ + __u8 s_uuid[16]; /* 128-bit uuid for journal */ + + /* 0x0040 */ + __u32 s_nr_users; /* Nr of filesystems sharing log */ + + __u32 s_dynsuper; /* Blocknr of dynamic superblock copy*/ + + /* 0x0048 */ + __u32 s_max_transaction; /* Limit of journal blocks per trans.*/ + __u32 s_max_trans_data; /* Limit of data blocks per trans. 
*/ + + /* 0x0050 */ + __u32 s_padding[44]; + + /* 0x0100 */ + __u8 s_users[16*48]; /* ids of all fs'es sharing the log */ + /* 0x0400 */ + } journal_superblock_t; + + #define JFS_HAS_COMPAT_FEATURE(j,mask) \ + ((j)->j_format_version >= 2 && \ + ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask)))) + #define JFS_HAS_RO_COMPAT_FEATURE(j,mask) \ + ((j)->j_format_version >= 2 && \ + ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask)))) + #define JFS_HAS_INCOMPAT_FEATURE(j,mask) \ + ((j)->j_format_version >= 2 && \ + ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask)))) + + #define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001 + + /* Features known to this kernel version: */ + #define JFS_KNOWN_COMPAT_FEATURES 0 + #define JFS_KNOWN_ROCOMPAT_FEATURES 0 + #define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE + + #ifdef __KERNEL__ + + #include + #include + + #define JBD_ASSERTIONS + #ifdef JBD_ASSERTIONS + #define J_ASSERT(assert) \ + do { \ + if (!(assert)) { \ + printk (KERN_EMERG \ + "Assertion failure in %s() at %s:%d: \"%s\"\n", \ + __FUNCTION__, __FILE__, __LINE__, # assert); \ + BUG(); \ + } \ + } while (0) + + #if defined(CONFIG_BUFFER_DEBUG) + void buffer_assertion_failure(struct buffer_head *bh); + #define J_ASSERT_BH(bh, expr) \ + do { \ + if (!(expr)) \ + buffer_assertion_failure(bh); \ + J_ASSERT(expr); \ + } while (0) + #define J_ASSERT_JH(jh, expr) J_ASSERT_BH(jh2bh(jh), expr) + #else + #define J_ASSERT_BH(bh, expr) J_ASSERT(expr) + #define J_ASSERT_JH(jh, expr) J_ASSERT(expr) + #endif + + #else + #define J_ASSERT(assert) + #endif /* JBD_ASSERTIONS */ + + enum jbd_state_bits { + BH_JWrite + = BH_PrivateStart, /* 1 if being written to log (@@@ DEBUGGING) */ + BH_Freed, /* 1 if buffer has been freed (truncated) */ + BH_Revoked, /* 1 if buffer has been revoked from the log */ + BH_RevokeValid, /* 1 if buffer revoked flag is valid */ + BH_JBDDirty, /* 1 if buffer is dirty but journaled */ + }; + + /* Return true if the buffer is one which JBD is managing */ + static inline int buffer_jbd(struct buffer_head *bh) + { + return __buffer_state(bh, JBD); + } + + static inline struct buffer_head *jh2bh(struct journal_head *jh) + { + return jh->b_bh; + } + + static inline struct journal_head *bh2jh(struct buffer_head *bh) + { + return bh->b_private; + } + + struct jbd_revoke_table_s; + + /* The handle_t type represents a single atomic update being performed + * by some process. All filesystem modifications made by the process go + * through this handle. Recursive operations (such as quota operations) + * are gathered into a single update. + * + * The buffer credits field is used to account for journaled buffers + * being modified by the running process. To ensure that there is + * enough log space for all outstanding operations, we need to limit the + * number of outstanding buffers possible at any time. When the + * operation completes, any buffer credits not used are credited back to + * the transaction, so that at all times we know how many buffers the + * outstanding updates on a transaction might possibly touch. */ + + struct handle_s + { + /* Which compound transaction is this update a part of? 
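+ * (It is the transaction that was running when journal_start() built the + * handle. Very roughly, a typical update looks like: + * handle = journal_start(journal, credits); + * err = journal_get_write_access(handle, bh); + * ... modify the buffer ... + * err = journal_dirty_metadata(handle, bh); + * err = journal_stop(handle); + * with "credits" chosen to cover every buffer the update might dirty.)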
*/ + transaction_t * h_transaction; + + /* Number of remaining buffers we are allowed to dirty: */ + int h_buffer_credits; + + /* Reference count on this handle */ + int h_ref; + + /* Field for caller's use to track errors through large fs + operations */ + int h_err; + + /* Flags */ + unsigned int h_sync: 1; /* sync-on-close */ + unsigned int h_jdata: 1; /* force data journaling */ + unsigned int h_aborted: 1; /* fatal error on handle */ + }; + + + /* The transaction_t type is the guts of the journaling mechanism. It + * tracks a compound transaction through its various states: + * + * RUNNING: accepting new updates + * LOCKED: Updates still running but we don't accept new ones + * RUNDOWN: Updates are tidying up but have finished requesting + * new buffers to modify (state not used for now) + * FLUSH: All updates complete, but we are still writing to disk + * COMMIT: All data on disk, writing commit record + * FINISHED: We still have to keep the transaction for checkpointing. + * + * The transaction keeps track of all of the buffers modified by a + * running transaction, and all of the buffers committed but not yet + * flushed to home for finished transactions. + */ + + struct transaction_s + { + /* Pointer to the journal for this transaction. */ + journal_t * t_journal; + + /* Sequence number for this transaction */ + tid_t t_tid; + + /* Transaction's current state */ + enum { + T_RUNNING, + T_LOCKED, + T_RUNDOWN, + T_FLUSH, + T_COMMIT, + T_FINISHED + } t_state; + + /* Where in the log does this transaction's commit start? */ + unsigned long t_log_start; + + /* Doubly-linked circular list of all inodes owned by this + transaction */ /* AKPM: unused */ + struct inode * t_ilist; + + /* Number of buffers on the t_buffers list */ + int t_nr_buffers; + + /* Doubly-linked circular list of all buffers reserved but not + yet modified by this transaction */ + struct journal_head * t_reserved_list; + + /* Doubly-linked circular list of all metadata buffers owned by this + transaction */ + struct journal_head * t_buffers; + + /* + * Doubly-linked circular list of all data buffers still to be + * flushed before this transaction can be committed. + * Protected by journal_datalist_lock. + */ + struct journal_head * t_sync_datalist; + + /* + * Doubly-linked circular list of all writepage data buffers + * still to be written before this transaction can be committed. + * Protected by journal_datalist_lock. + */ + struct journal_head * t_async_datalist; + + /* Doubly-linked circular list of all forget buffers (superceded + buffers which we can un-checkpoint once this transaction + commits) */ + struct journal_head * t_forget; + + /* + * Doubly-linked circular list of all buffers still to be + * flushed before this transaction can be checkpointed. + */ + /* Protected by journal_datalist_lock */ + struct journal_head * t_checkpoint_list; + + /* Doubly-linked circular list of temporary buffers currently + undergoing IO in the log */ + struct journal_head * t_iobuf_list; + + /* Doubly-linked circular list of metadata buffers being + shadowed by log IO. The IO buffers on the iobuf list and the + shadow buffers on this list match each other one for one at + all times. */ + struct journal_head * t_shadow_list; + + /* Doubly-linked circular list of control buffers being written + to the log. 
*/ + struct journal_head * t_log_list; + + /* Number of outstanding updates running on this transaction */ + int t_updates; + + /* Number of buffers reserved for use by all handles in this + * transaction handle but not yet modified. */ + int t_outstanding_credits; + + /* + * Forward and backward links for the circular list of all + * transactions awaiting checkpoint. + */ + /* Protected by journal_datalist_lock */ + transaction_t *t_cpnext, *t_cpprev; + + /* When will the transaction expire (become due for commit), in + * jiffies ? */ + unsigned long t_expires; + + /* How many handles used this transaction? */ + int t_handle_count; + }; + + + /* The journal_t maintains all of the journaling state information for a + * single filesystem. It is linked to from the fs superblock structure. + * + * We use the journal_t to keep track of all outstanding transaction + * activity on the filesystem, and to manage the state of the log + * writing process. */ + + struct journal_s + { + /* General journaling state flags */ + unsigned long j_flags; + + /* Is there an outstanding uncleared error on the journal (from + * a prior abort)? */ + int j_errno; + + /* The superblock buffer */ + struct buffer_head * j_sb_buffer; + journal_superblock_t * j_superblock; + + /* Version of the superblock format */ + int j_format_version; + + /* Number of processes waiting to create a barrier lock */ + int j_barrier_count; + + /* The barrier lock itself */ + struct semaphore j_barrier; + + /* Transactions: The current running transaction... */ + transaction_t * j_running_transaction; + + /* ... the transaction we are pushing to disk ... */ + transaction_t * j_committing_transaction; + + /* ... and a linked circular list of all transactions waiting + * for checkpointing. */ + /* Protected by journal_datalist_lock */ + transaction_t * j_checkpoint_transactions; + + /* Wait queue for waiting for a locked transaction to start + committing, or for a barrier lock to be released */ + wait_queue_head_t j_wait_transaction_locked; + + /* Wait queue for waiting for checkpointing to complete */ + wait_queue_head_t j_wait_logspace; + + /* Wait queue for waiting for commit to complete */ + wait_queue_head_t j_wait_done_commit; + + /* Wait queue to trigger checkpointing */ + wait_queue_head_t j_wait_checkpoint; + + /* Wait queue to trigger commit */ + wait_queue_head_t j_wait_commit; + + /* Wait queue to wait for updates to complete */ + wait_queue_head_t j_wait_updates; + + /* Semaphore for locking against concurrent checkpoints */ + struct semaphore j_checkpoint_sem; + + /* The main journal lock, used by lock_journal() */ + struct semaphore j_sem; + + /* Journal head: identifies the first unused block in the journal. */ + unsigned long j_head; + + /* Journal tail: identifies the oldest still-used block in the + * journal. */ + unsigned long j_tail; + + /* Journal free: how many free blocks are there in the journal? */ + unsigned long j_free; + + /* Journal start and end: the block numbers of the first usable + * block and one beyond the last usable block in the journal. */ + unsigned long j_first, j_last; + + /* Device, blocksize and starting block offset for the location + * where we store the journal. */ + kdev_t j_dev; + int j_blocksize; + unsigned int j_blk_offset; + + /* Device which holds the client fs. For internal journal this + * will be equal to j_dev. */ + kdev_t j_fs_dev; + + /* Total maximum capacity of the journal region on disk. */ + unsigned int j_maxlen; + + /* Optional inode where we store the journal. 
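+ * (Typically an internal journal, created with journal_init_inode(), is + * described this way; a journal on its own device, created with + * journal_init_dev(), is addressed through j_dev and j_blk_offset instead.)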
If present, all + * journal block numbers are mapped into this inode via + * bmap(). */ + struct inode * j_inode; + + /* Sequence number of the oldest transaction in the log */ + tid_t j_tail_sequence; + /* Sequence number of the next transaction to grant */ + tid_t j_transaction_sequence; + /* Sequence number of the most recently committed transaction */ + tid_t j_commit_sequence; + /* Sequence number of the most recent transaction wanting commit */ + tid_t j_commit_request; + + /* Journal uuid: identifies the object (filesystem, LVM volume + * etc) backed by this journal. This will eventually be + * replaced by an array of uuids, allowing us to index multiple + * devices within a single journal and to perform atomic updates + * across them. */ + + __u8 j_uuid[16]; + + /* Pointer to the current commit thread for this journal */ + struct task_struct * j_task; + + /* Maximum number of metadata buffers to allow in a single + * compound commit transaction */ + int j_max_transaction_buffers; + + /* What is the maximum transaction lifetime before we begin a + * commit? */ + unsigned long j_commit_interval; + + /* The timer used to wakeup the commit thread: */ + struct timer_list * j_commit_timer; + int j_commit_timer_active; + + /* Link all journals together - system-wide */ + struct list_head j_all_journals; + + /* The revoke table: maintains the list of revoked blocks in the + current transaction. */ + struct jbd_revoke_table_s *j_revoke; + }; + + /* + * Journal flag definitions + */ + #define JFS_UNMOUNT 0x001 /* Journal thread is being destroyed */ + #define JFS_ABORT 0x002 /* Journaling has been aborted for errors. */ + #define JFS_ACK_ERR 0x004 /* The errno in the sb has been acked */ + #define JFS_FLUSHED 0x008 /* The journal superblock has been flushed */ + #define JFS_LOADED 0x010 /* The journal superblock has been loaded */ + + /* + * Function declarations for the journaling transaction and buffer + * management + */ + + /* Filing buffers */ + extern void __journal_unfile_buffer(struct journal_head *); + extern void journal_unfile_buffer(struct journal_head *); + extern void __journal_refile_buffer(struct journal_head *); + extern void journal_refile_buffer(struct journal_head *); + extern void __journal_file_buffer(struct journal_head *, transaction_t *, int); + extern void __journal_free_buffer(struct journal_head *bh); + extern void journal_file_buffer(struct journal_head *, transaction_t *, int); + extern void __journal_clean_data_list(transaction_t *transaction); + + /* Log buffer allocation */ + extern struct journal_head * journal_get_descriptor_buffer(journal_t *); + extern unsigned long journal_next_log_block(journal_t *); + + /* Commit management */ + extern void journal_commit_transaction(journal_t *); + + /* Checkpoint list management */ + int __journal_clean_checkpoint_list(journal_t *journal); + extern void journal_remove_checkpoint(struct journal_head *); + extern void __journal_remove_checkpoint(struct journal_head *); + extern void journal_insert_checkpoint(struct journal_head *, transaction_t *); + extern void __journal_insert_checkpoint(struct journal_head *,transaction_t *); + + /* Buffer IO */ + extern int + journal_write_metadata_buffer(transaction_t *transaction, + struct journal_head *jh_in, + struct journal_head **jh_out, + int blocknr); + + /* Transaction locking */ + extern void __wait_on_journal (journal_t *); + + /* + * Journal locking. 
+ * + * We need to lock the journal during transaction state changes so that + * nobody ever tries to take a handle on the running transaction while + * we are in the middle of moving it to the commit phase. + * + * Note that the locking is completely interrupt unsafe. We never touch + * journal structures from interrupts. + * + * In 2.2, the BKL was required for lock_journal. This is no longer + * the case. + */ + + static inline void lock_journal(journal_t *journal) + { + down(&journal->j_sem); + } + + /* This returns zero if we acquired the semaphore */ + static inline int try_lock_journal(journal_t * journal) + { + return down_trylock(&journal->j_sem); + } + + static inline void unlock_journal(journal_t * journal) + { + up(&journal->j_sem); + } + + + static inline handle_t *journal_current_handle(void) + { + return current->journal_info; + } + + /* The journaling code user interface: + * + * Create and destroy handles + * Register buffer modifications against the current transaction. + */ + + extern handle_t *journal_start(journal_t *, int nblocks); + extern handle_t *journal_try_start(journal_t *, int nblocks); + extern int journal_restart (handle_t *, int nblocks); + extern int journal_extend (handle_t *, int nblocks); + extern int journal_get_write_access (handle_t *, struct buffer_head *); + extern int journal_get_create_access (handle_t *, struct buffer_head *); + extern int journal_get_undo_access (handle_t *, struct buffer_head *); + extern int journal_dirty_data (handle_t *, + struct buffer_head *, int async); + extern int journal_dirty_metadata (handle_t *, struct buffer_head *); + extern void journal_release_buffer (handle_t *, struct buffer_head *); + extern void journal_forget (handle_t *, struct buffer_head *); + extern void journal_sync_buffer (struct buffer_head *); + extern int journal_flushpage(journal_t *, struct page *, unsigned long); + extern int journal_try_to_free_buffers(journal_t *, struct page *, int); + extern int journal_stop(handle_t *); + extern int journal_flush (journal_t *); + + extern void journal_lock_updates (journal_t *); + extern void journal_unlock_updates (journal_t *); + + extern journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev, + int start, int len, int bsize); + extern journal_t * journal_init_inode (struct inode *); + extern int journal_update_format (journal_t *); + extern int journal_check_used_features + (journal_t *, unsigned long, unsigned long, unsigned long); + extern int journal_check_available_features + (journal_t *, unsigned long, unsigned long, unsigned long); + extern int journal_set_features + (journal_t *, unsigned long, unsigned long, unsigned long); + extern int journal_create (journal_t *); + extern int journal_load (journal_t *journal); + extern void journal_destroy (journal_t *); + extern int journal_recover (journal_t *journal); + extern int journal_wipe (journal_t *, int); + extern int journal_skip_recovery (journal_t *); + extern void journal_update_superblock (journal_t *, int); + extern void __journal_abort (journal_t *); + extern void journal_abort (journal_t *, int); + extern int journal_errno (journal_t *); + extern void journal_ack_err (journal_t *); + extern int journal_clear_err (journal_t *); + extern unsigned long journal_bmap(journal_t *journal, unsigned long blocknr); + extern int journal_force_commit(journal_t *journal); + + /* + * journal_head management + */ + extern struct journal_head + *journal_add_journal_head(struct buffer_head *bh); + extern void journal_remove_journal_head(struct buffer_head 
*bh); + extern void __journal_remove_journal_head(struct buffer_head *bh); + extern void journal_unlock_journal_head(struct journal_head *jh); + + /* Primary revoke support */ + #define JOURNAL_REVOKE_DEFAULT_HASH 256 + extern int journal_init_revoke(journal_t *, int); + extern void journal_destroy_revoke_caches(void); + extern int journal_init_revoke_caches(void); + + extern void journal_destroy_revoke(journal_t *); + extern int journal_revoke (handle_t *, + unsigned long, struct buffer_head *); + extern int journal_cancel_revoke(handle_t *, struct journal_head *); + extern void journal_write_revoke_records(journal_t *, transaction_t *); + + /* Recovery revoke support */ + extern int journal_set_revoke(journal_t *, unsigned long, tid_t); + extern int journal_test_revoke(journal_t *, unsigned long, tid_t); + extern void journal_clear_revoke(journal_t *); + extern void journal_brelse_array(struct buffer_head *b[], int n); + + /* The log thread user interface: + * + * Request space in the current transaction, and force transaction commit + * transitions on demand. + */ + + extern int log_space_left (journal_t *); /* Called with journal locked */ + extern tid_t log_start_commit (journal_t *, transaction_t *); + extern void log_wait_commit (journal_t *, tid_t); + extern int log_do_checkpoint (journal_t *, int); + + extern void log_wait_for_space(journal_t *, int nblocks); + extern void __journal_drop_transaction(journal_t *, transaction_t *); + extern int cleanup_journal_tail(journal_t *); + + /* Reduce journal memory usage by flushing */ + extern void shrink_journal_memory(void); + + /* Debugging code only: */ + + #define jbd_ENOSYS() \ + do { \ + printk (KERN_ERR "JBD unimplemented function " __FUNCTION__); \ + current->state = TASK_UNINTERRUPTIBLE; \ + schedule(); \ + } while (1) + + /* + * is_journal_abort + * + * Simple test wrapper function to test the JFS_ABORT state flag. This + * bit, when set, indicates that we have had a fatal error somewhere, + * either inside the journaling layer or indicated to us by the client + * (eg. ext3), and that we and should not commit any further + * transactions. + */ + + static inline int is_journal_aborted(journal_t *journal) + { + return journal->j_flags & JFS_ABORT; + } + + static inline int is_handle_aborted(handle_t *handle) + { + if (handle->h_aborted) + return 1; + return is_journal_aborted(handle->h_transaction->t_journal); + } + + static inline void journal_abort_handle(handle_t *handle) + { + handle->h_aborted = 1; + } + + /* Not all architectures define BUG() */ + #ifndef BUG + #define BUG() do { \ + printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \ + * ((char *) 0) = 0; \ + } while (0) + #endif /* BUG */ + + #endif /* __KERNEL__ */ + + /* Comparison functions for transaction IDs: perform comparisons using + * modulo arithmetic so that they work over sequence number wraps. 
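+ * For example, just after the sequence counter wraps, tid_gt(1, 0xfffffff0) + * is still true: the unsigned difference 1 - 0xfffffff0 is 0x11, which is + * positive when interpreted as a signed int, so tid 1 is correctly treated + * as the newer transaction.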
*/ + + static inline int tid_gt(tid_t x, tid_t y) + { + int difference = (x - y); + return (difference > 0); + } + + static inline int tid_geq(tid_t x, tid_t y) + { + int difference = (x - y); + return (difference >= 0); + } + + extern int journal_blocks_per_page(struct inode *inode); + + /* + * Definitions which augment the buffer_head layer + */ + + /* JBD additions */ + + /* journaling buffer types */ + #define BJ_None 0 /* Not journaled */ + #define BJ_SyncData 1 /* Normal data: flush before commit */ + #define BJ_AsyncData 2 /* writepage data: wait on it before commit */ + #define BJ_Metadata 3 /* Normal journaled metadata */ + #define BJ_Forget 4 /* Buffer superceded by this transaction */ + #define BJ_IO 5 /* Buffer is for temporary IO use */ + #define BJ_Shadow 6 /* Buffer contents being shadowed to the log */ + #define BJ_LogCtl 7 /* Buffer contains log descriptors */ + #define BJ_Reserved 8 /* Buffer is reserved for access by journal */ + #define BJ_Types 9 + + extern int jbd_blocks_per_page(struct inode *inode); + + #ifdef __KERNEL__ + + extern spinlock_t jh_splice_lock; + /* + * Once `expr1' has been found true, take jh_splice_lock + * and then reevaluate everything. + */ + #define SPLICE_LOCK(expr1, expr2) \ + ({ \ + int ret = (expr1); \ + if (ret) { \ + spin_lock(&jh_splice_lock); \ + ret = (expr1) && (expr2); \ + spin_unlock(&jh_splice_lock); \ + } \ + ret; \ + }) + + /* + * A number of buffer state predicates. They test for + * buffer_jbd() because they are used in core kernel code. + * + * These will be racy on SMP unless we're *sure* that the + * buffer won't be detached from the journalling system + * in parallel. + */ + + /* Return true if the buffer is on journal list `list' */ + static inline int buffer_jlist_eq(struct buffer_head *bh, int list) + { + return SPLICE_LOCK(buffer_jbd(bh), bh2jh(bh)->b_jlist == list); + } + + /* Return true if this bufer is dirty wrt the journal */ + static inline int buffer_jdirty(struct buffer_head *bh) + { + return buffer_jbd(bh) && __buffer_state(bh, JBDDirty); + } + + /* Return true if it's a data buffer which journalling is managing */ + static inline int buffer_jbd_data(struct buffer_head *bh) + { + return SPLICE_LOCK(buffer_jbd(bh), + bh2jh(bh)->b_jlist == BJ_SyncData || + bh2jh(bh)->b_jlist == BJ_AsyncData); + } + + #ifdef CONFIG_SMP + #define assert_spin_locked(lock) J_ASSERT(spin_is_locked(lock)) + #else + #define assert_spin_locked(lock) do {} while(0) + #endif + + #endif /* __KERNEL__ */ + + #endif /* CONFIG_JBD || CONFIG_JBD_MODULE || !__KERNEL__ */ + + /* + * Compatibility no-ops which allow the kernel to compile without CONFIG_JBD + * go here. 
+ */ + + #if defined(__KERNEL__) && !(defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)) + + #define J_ASSERT(expr) do {} while (0) + #define J_ASSERT_BH(bh, expr) do {} while (0) + #define buffer_jbd(bh) 0 + #define buffer_jlist_eq(bh, val) 0 + #define journal_buffer_journal_lru(bh) 0 + + #endif /* defined(__KERNEL__) && !defined(CONFIG_JBD) */ + #endif /* _LINUX_JBD_H */ diff -rc2P linux/include/linux/journal-head.h linux-2.4.13/include/linux/journal-head.h *** linux/include/linux/journal-head.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/journal-head.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,70 ---- + /* + * include/linux/journal-head.h + * + * buffer_head fields for JBD + * + * 27 May 2001 ANdrew Morton + * Created - pulled out of fs.h + */ + + #ifndef JOURNAL_HEAD_H_INCLUDED + #define JOURNAL_HEAD_H_INCLUDED + + typedef unsigned int tid_t; /* Unique transaction ID */ + typedef struct transaction_s transaction_t; /* Compound transaction type */ + struct buffer_head; + + struct journal_head { + #ifndef CONFIG_JBD_UNIFIED_BUFFERS + /* Points back to our buffer_head. */ + struct buffer_head *b_bh; + #endif + + /* Reference count - see description in journal.c */ + int b_jcount; + + /* Journaling list for this buffer */ + unsigned b_jlist; + + /* Copy of the buffer data frozen for writing to the log. */ + char * b_frozen_data; + + /* Pointer to a saved copy of the buffer containing no + uncommitted deallocation references, so that allocations can + avoid overwriting uncommitted deletes. */ + char * b_committed_data; + + /* Pointer to the compound transaction which owns this buffer's + metadata: either the running transaction or the committing + transaction (if there is one). Only applies to buffers on a + transaction's data or metadata journaling list. */ + /* Protected by journal_datalist_lock */ + transaction_t * b_transaction; + + /* Pointer to the running compound transaction which is + currently modifying the buffer's metadata, if there was + already a transaction committing it when the new transaction + touched it. */ + transaction_t * b_next_transaction; + + /* Doubly-linked list of buffers on a transaction's data, + metadata or forget queue. */ + /* Protected by journal_datalist_lock */ + struct journal_head *b_tnext, *b_tprev; + + /* + * Pointer to the compound transaction against which this buffer + * is checkpointed. Only dirty buffers can be checkpointed. + */ + /* Protected by journal_datalist_lock */ + transaction_t * b_cp_transaction; + + /* + * Doubly-linked list of buffers still remaining to be flushed + * before an old transaction can be checkpointed. 
+ */ + /* Protected by journal_datalist_lock */ + struct journal_head *b_cpnext, *b_cpprev; + }; + + #endif /* JOURNAL_HEAD_H_INCLUDED */ diff -rc2P linux/include/linux/sched.h linux-2.4.13/include/linux/sched.h *** linux/include/linux/sched.h Fri Nov 9 16:15:08 2001 --- linux-2.4.13/include/linux/sched.h Fri Nov 9 16:58:32 2001 *************** *** 420,423 **** --- 420,425 ---- /* Protection of (de-)allocation: mm, files, fs, tty */ spinlock_t alloc_lock; + /* journalling filesystem info */ + void *journal_info; /* Field to make virtual server running in chroot more isolated */ int s_context; /* Process can only deal with other processes */ *************** *** 513,516 **** --- 515,519 ---- blocked: {{0}}, \ alloc_lock: SPIN_LOCK_UNLOCKED, \ + journal_info: NULL, \ cap_bset: CAP_INIT_EFF_SET, \ } diff -rc2P linux/include/linux/sched.h.orig linux-2.4.13/include/linux/sched.h.orig *** linux/include/linux/sched.h.orig Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/sched.h.orig Fri Nov 9 16:15:08 2001 *************** *** 0 **** --- 1,936 ---- + #ifndef _LINUX_SCHED_H + #define _LINUX_SCHED_H + + #include /* for HZ */ + + extern unsigned long event; + + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + #include + + #include + #include + #include + #include + #include + #include + + struct exec_domain; + + /* + * cloning flags: + */ + #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ + #define CLONE_VM 0x00000100 /* set if VM shared between processes */ + #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ + #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ + #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ + #define CLONE_PID 0x00001000 /* set if pid shared */ + #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ + #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ + #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ + #define CLONE_THREAD 0x00010000 /* Same thread group? */ + + #define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) + + /* + * These are the constant used to fake the fixed-point load-average + * counting. Some notes: + * - 11 bit fractions expand to 22 bits by the multiplies: this gives + * a load-average precision of 10 bits integer + 11 bits fractional + * - if you want to count load-averages more often, you need more + * precision, or rounding will get you. With 2-second counting freq, + * the EXP_n values would be 1981, 2034 and 2043 if still using only + * 11 bit fractions. 
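+ * (Where the constants come from: each EXP_n is the per-sample decay factor + * exp(-sampling interval / averaging window) scaled by FIXED_1 = 2^11 = 2048. + * At the 5 second sampling interval used below, 2048*exp(-5/60) is roughly + * 1884, 2048*exp(-5/300) roughly 2014 and 2048*exp(-5/900) roughly 2037, + * which are the EXP_1, EXP_5 and EXP_15 values.)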
+ */ + extern unsigned long avenrun[]; /* Load averages */ + + #define FSHIFT 11 /* nr of bits of precision */ + #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ + #define LOAD_FREQ (5*HZ) /* 5 sec intervals */ + #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ + #define EXP_5 2014 /* 1/exp(5sec/5min) */ + #define EXP_15 2037 /* 1/exp(5sec/15min) */ + + #define CALC_LOAD(load,exp,n) \ + load *= exp; \ + load += n*(FIXED_1-exp); \ + load >>= FSHIFT; + + #define CT_TO_SECS(x) ((x) / HZ) + #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) + + extern int nr_running, nr_threads; + extern int last_pid; + + #include + #include + #include + #include + #include + + #include + + #define TASK_RUNNING 0 + #define TASK_INTERRUPTIBLE 1 + #define TASK_UNINTERRUPTIBLE 2 + #define TASK_ZOMBIE 4 + #define TASK_STOPPED 8 + + #define __set_task_state(tsk, state_value) \ + do { (tsk)->state = (state_value); } while (0) + #ifdef CONFIG_SMP + #define set_task_state(tsk, state_value) \ + set_mb((tsk)->state, (state_value)) + #else + #define set_task_state(tsk, state_value) \ + __set_task_state((tsk), (state_value)) + #endif + + #define __set_current_state(state_value) \ + do { current->state = (state_value); } while (0) + #ifdef CONFIG_SMP + #define set_current_state(state_value) \ + set_mb(current->state, (state_value)) + #else + #define set_current_state(state_value) \ + __set_current_state(state_value) + #endif + + /* + * Scheduling policies + */ + #define SCHED_OTHER 0 + #define SCHED_FIFO 1 + #define SCHED_RR 2 + + /* + * This is an additional bit set when we want to + * yield the CPU for one re-schedule.. + */ + #define SCHED_YIELD 0x10 + + struct sched_param { + int sched_priority; + }; + + struct completion; + + #ifdef __KERNEL__ + + #include + + /* + * This serializes "schedule()" and also protects + * the run-queue from deletions/modifications (but + * _adding_ to the beginning of the run-queue has + * a separate lock). + */ + extern rwlock_t tasklist_lock; + extern spinlock_t runqueue_lock; + extern spinlock_t mmlist_lock; + + extern void sched_init(void); + extern void init_idle(void); + extern void show_state(void); + extern void cpu_init (void); + extern void trap_init(void); + extern void update_process_times(int user); + extern void update_one_process(struct task_struct *p, unsigned long user, + unsigned long system, int cpu); + + #define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern signed long FASTCALL(schedule_timeout(signed long timeout)); + asmlinkage void schedule(void); + + extern int schedule_task(struct tq_struct *task); + extern void flush_scheduled_tasks(void); + extern int start_context_thread(void); + extern int current_is_keventd(void); + + /* + * The default fd array needs to be at least BITS_PER_LONG, + * as this is the granularity returned by copy_fdset(). + */ + #define NR_OPEN_DEFAULT BITS_PER_LONG + + /* + * Open file table structure + */ + struct files_struct { + atomic_t count; + rwlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */ + int max_fds; + int max_fdset; + int next_fd; + struct file ** fd; /* current fd array */ + fd_set *close_on_exec; + fd_set *open_fds; + fd_set close_on_exec_init; + fd_set open_fds_init; + struct file * fd_array[NR_OPEN_DEFAULT]; + }; + + #define INIT_FILES \ + { \ + count: ATOMIC_INIT(1), \ + file_lock: RW_LOCK_UNLOCKED, \ + max_fds: NR_OPEN_DEFAULT, \ + max_fdset: __FD_SETSIZE, \ + next_fd: 0, \ + fd: &init_files.fd_array[0], \ + close_on_exec: &init_files.close_on_exec_init, \ + open_fds: &init_files.open_fds_init, \ + close_on_exec_init: { { 0, } }, \ + open_fds_init: { { 0, } }, \ + fd_array: { NULL, } \ + } + + /* Maximum number of active map areas.. 
This is a random (large) number */ + #define MAX_MAP_COUNT (65536) + + struct mm_struct { + struct vm_area_struct * mmap; /* list of VMAs */ + rb_root_t mm_rb; + struct vm_area_struct * mmap_cache; /* last find_vma result */ + pgd_t * pgd; + atomic_t mm_users; /* How many users with user space? */ + atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + int map_count; /* number of VMAs */ + struct rw_semaphore mmap_sem; + spinlock_t page_table_lock; /* Protects task page tables and mm->rss */ + + struct list_head mmlist; /* List of all active mm's. These are globally strung + * together off init_mm.mmlist, and are protected + * by mmlist_lock + */ + + unsigned long start_code, end_code, start_data, end_data; + unsigned long start_brk, brk, start_stack; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long rss, total_vm, locked_vm; + unsigned long def_flags; + unsigned long cpu_vm_mask; + unsigned long swap_address; + + unsigned dumpable:1; + + /* Architecture-specific MM context */ + mm_context_t context; + }; + + extern int mmlist_nr; + + #define INIT_MM(name) \ + { \ + mm_rb: RB_ROOT, \ + pgd: swapper_pg_dir, \ + mm_users: ATOMIC_INIT(2), \ + mm_count: ATOMIC_INIT(1), \ + mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ + page_table_lock: SPIN_LOCK_UNLOCKED, \ + mmlist: LIST_HEAD_INIT(name.mmlist), \ + } + + struct signal_struct { + atomic_t count; + struct k_sigaction action[_NSIG]; + spinlock_t siglock; + }; + + + #define INIT_SIGNALS { \ + count: ATOMIC_INIT(1), \ + action: { {{0,}}, }, \ + siglock: SPIN_LOCK_UNLOCKED \ + } + + /* + * Some day this will be a full-fledged user tracking system.. + */ + struct user_struct { + atomic_t __count; /* reference count */ + atomic_t processes; /* How many processes does this user have? */ + atomic_t files; /* How many open files does this user have? */ + + /* Hash table maintenance information */ + struct user_struct *next, **pprev; + uid_t uid; + }; + + #define get_current_user() ({ \ + struct user_struct *__user = current->user; \ + atomic_inc(&__user->__count); \ + __user; }) + + + /* + We may have a different domainname and nodename for each security + context. By default, a security context share the same as its + parent, potentially the information in system_utsname + */ + #define S_CTX_INFO_LOCK 1 /* Can't request a new s_context */ + #define S_CTX_INFO_SCHED 2 /* All process in the s_context */ + /* Contribute to the schedular */ + struct context_info{ + int refcount; + int s_context; + char nodename[65]; + char domainname[65]; + int flags; /* S_CTX_INFO_xxx */ + atomic_t ticks; /* Number of ticks used by all process */ + /* in the s_context */ + }; + + + extern struct user_struct root_user; + #define INIT_USER (&root_user) + + struct task_struct { + /* + * offsets of these are hardcoded elsewhere - touch with care + */ + volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ + unsigned long flags; /* per process flags, defined below */ + int sigpending; + mm_segment_t addr_limit; /* thread address space: + 0-0xBFFFFFFF for user-thead + 0-0xFFFFFFFF for kernel-thread + */ + struct exec_domain *exec_domain; + volatile long need_resched; + unsigned long ptrace; + + int lock_depth; /* Lock depth */ + + /* + * offset 32 begins here on 32-bit platforms. We keep + * all fields in a single cacheline that are needed for + * the goodness() loop in schedule(). 
+ */ + long counter; + long nice; + unsigned long policy; + struct mm_struct *mm; + int has_cpu, processor; + unsigned long cpus_allowed; + /* + * (only the 'next' pointer fits into the cacheline, but + * that's just fine.) + */ + struct list_head run_list; + unsigned long sleep_time; + + struct task_struct *next_task, *prev_task; + struct mm_struct *active_mm; + struct list_head local_pages; + unsigned int allocation_order, nr_local_pages; + + /* task state */ + struct linux_binfmt *binfmt; + int exit_code, exit_signal; + int pdeath_signal; /* The signal sent when the parent dies */ + /* ??? */ + unsigned long personality; + int did_exec:1; + pid_t pid; + pid_t pgrp; + pid_t tty_old_pgrp; + pid_t session; + pid_t tgid; + /* boolean value for session group leader */ + int leader; + /* + * pointers to (original) parent process, youngest child, younger sibling, + * older sibling, respectively. (p->father can be replaced with + * p->p_pptr->pid) + */ + struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; + struct list_head thread_group; + + /* PID hash table linkage. */ + struct task_struct *pidhash_next; + struct task_struct **pidhash_pprev; + + wait_queue_head_t wait_chldexit; /* for wait4() */ + struct completion *vfork_done; /* for vfork() */ + unsigned long rt_priority; + unsigned long it_real_value, it_prof_value, it_virt_value; + unsigned long it_real_incr, it_prof_incr, it_virt_incr; + struct timer_list real_timer; + struct tms times; + unsigned long start_time; + long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; + /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ + unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; + int swappable:1; + /* process credentials */ + uid_t uid,euid,suid,fsuid; + gid_t gid,egid,sgid,fsgid; + int ngroups; + gid_t groups[NGROUPS]; + kernel_cap_t cap_effective, cap_inheritable, cap_permitted; + int keep_capabilities:1; + struct user_struct *user; + /* limits */ + struct rlimit rlim[RLIM_NLIMITS]; + unsigned short used_math; + char comm[16]; + /* file system info */ + int link_count, total_link_count; + struct tty_struct *tty; /* NULL if no tty */ + unsigned int locks; /* How many file locks are being held */ + /* ipc stuff */ + struct sem_undo *semundo; + struct sem_queue *semsleeping; + /* CPU-specific state of this task */ + struct thread_struct thread; + /* filesystem information */ + struct fs_struct *fs; + /* open file information */ + struct files_struct *files; + /* signal handlers */ + spinlock_t sigmask_lock; /* Protects signal and blocked */ + struct signal_struct *sig; + + sigset_t blocked; + struct sigpending pending; + + unsigned long sas_ss_sp; + size_t sas_ss_size; + int (*notifier)(void *priv); + void *notifier_data; + sigset_t *notifier_mask; + + /* Thread group tracking */ + u32 parent_exec_id; + u32 self_exec_id; + /* Protection of (de-)allocation: mm, files, fs, tty */ + spinlock_t alloc_lock; + /* Field to make virtual server running in chroot more isolated */ + int s_context; /* Process can only deal with other processes */ + /* with the same s_context */ + __u32 cap_bset; /* Maximum capability of this process and children */ + unsigned long ipv4root; /* Process can only bind to this iP */ + struct context_info *s_info; + }; + + /* + * Per process flags + */ + #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ + /* Not implemented yet, only for 486*/ + #define PF_STARTING 0x00000002 /* being created */ + #define PF_EXITING 0x00000004 /* 
getting shut down */ + #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ + #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ + #define PF_DUMPCORE 0x00000200 /* dumped core */ + #define PF_SIGNALED 0x00000400 /* killed by a signal */ + #define PF_MEMALLOC 0x00000800 /* Allocating memory */ + #define PF_FREE_PAGES 0x00002000 /* per process page freeing */ + + #define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ + + /* + * Ptrace flags + */ + + #define PT_PTRACED 0x00000001 + #define PT_TRACESYS 0x00000002 + #define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */ + #define PT_TRACESYSGOOD 0x00000008 + #define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */ + + /* + * Limit the stack by to some sane default: root can always + * increase this limit if needed.. 8MB seems reasonable. + */ + #define _STK_LIM (8*1024*1024) + + #define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ + #define MAX_COUNTER (20*HZ/100) + #define DEF_NICE (0) + + + /* + * The default (Linux) execution domain. + */ + extern struct exec_domain default_exec_domain; + + /* + * INIT_TASK is used to set up the first task table, touch at + * your own risk!. Base=0, limit=0x1fffff (=2MB) + */ + #define INIT_TASK(tsk) \ + { \ + state: 0, \ + flags: 0, \ + sigpending: 0, \ + addr_limit: KERNEL_DS, \ + exec_domain: &default_exec_domain, \ + lock_depth: -1, \ + counter: DEF_COUNTER, \ + nice: DEF_NICE, \ + policy: SCHED_OTHER, \ + mm: NULL, \ + active_mm: &init_mm, \ + cpus_allowed: -1, \ + run_list: LIST_HEAD_INIT(tsk.run_list), \ + next_task: &tsk, \ + prev_task: &tsk, \ + p_opptr: &tsk, \ + p_pptr: &tsk, \ + thread_group: LIST_HEAD_INIT(tsk.thread_group), \ + wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ + real_timer: { \ + function: it_real_fn \ + }, \ + cap_effective: CAP_INIT_EFF_SET, \ + cap_inheritable: CAP_INIT_INH_SET, \ + cap_permitted: CAP_FULL_SET, \ + keep_capabilities: 0, \ + rlim: INIT_RLIMITS, \ + user: INIT_USER, \ + comm: "swapper", \ + thread: INIT_THREAD, \ + fs: &init_fs, \ + files: &init_files, \ + sigmask_lock: SPIN_LOCK_UNLOCKED, \ + sig: &init_signals, \ + pending: { NULL, &tsk.pending.head, {{0}}}, \ + blocked: {{0}}, \ + alloc_lock: SPIN_LOCK_UNLOCKED, \ + cap_bset: CAP_INIT_EFF_SET, \ + } + + + #ifndef INIT_TASK_SIZE + # define INIT_TASK_SIZE 2048*sizeof(long) + #endif + + union task_union { + struct task_struct task; + unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; + }; + + extern union task_union init_task_union; + + extern struct mm_struct init_mm; + extern struct task_struct *init_tasks[NR_CPUS]; + + /* PID hashing. (shouldnt this be dynamic?) */ + #define PIDHASH_SZ (4096 >> 2) + extern struct task_struct *pidhash[PIDHASH_SZ]; + + #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) + + static inline void hash_pid(struct task_struct *p) + { + struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; + + if((p->pidhash_next = *htable) != NULL) + (*htable)->pidhash_pprev = &p->pidhash_next; + *htable = p; + p->pidhash_pprev = htable; + } + + static inline void unhash_pid(struct task_struct *p) + { + if(p->pidhash_next) + p->pidhash_next->pidhash_pprev = p->pidhash_pprev; + *p->pidhash_pprev = p->pidhash_next; + } + + static inline struct task_struct *find_task_by_pid(int pid) + { + struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; + + for(p = *htable; p && p->pid != pid; p = p->pidhash_next) + ; + + return p; + } + + /* per-UID process charging. 
*/ + extern struct user_struct * alloc_uid(uid_t); + extern void free_uid(struct user_struct *); + + #include + + extern unsigned long volatile jiffies; + extern unsigned long itimer_ticks; + extern unsigned long itimer_next; + extern struct timeval xtime; + extern void do_timer(struct pt_regs *); + + extern unsigned int * prof_buffer; + extern unsigned long prof_len; + extern unsigned long prof_shift; + + #define CURRENT_TIME (xtime.tv_sec) + + extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr)); + extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); + extern void FASTCALL(sleep_on(wait_queue_head_t *q)); + extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); + extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); + extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, + signed long timeout)); + extern int FASTCALL(wake_up_process(struct task_struct * tsk)); + + #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) + #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) + #define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) + #define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) + #define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) + #define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) + #define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) + #define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) + #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) + #define wake_up_interruptible_sync_nr(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) + asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); + + extern int in_group_p(gid_t); + extern int in_egroup_p(gid_t); + + extern void proc_caches_init(void); + extern void flush_signals(struct task_struct *); + extern void flush_signal_handlers(struct task_struct *); + extern int dequeue_signal(sigset_t *, siginfo_t *); + extern void block_all_signals(int (*notifier)(void *priv), void *priv, + sigset_t *mask); + extern void unblock_all_signals(void); + extern int send_sig_info(int, struct siginfo *, struct task_struct *); + extern int force_sig_info(int, struct siginfo *, struct task_struct *); + extern int kill_pg_info(int, struct siginfo *, pid_t); + extern int kill_sl_info(int, struct siginfo *, pid_t); + extern int kill_proc_info(int, struct siginfo *, pid_t); + extern void notify_parent(struct task_struct *, int); + extern void do_notify_parent(struct task_struct *, int); + extern void force_sig(int, struct task_struct *); + extern int send_sig(int, struct task_struct *, int); + extern int kill_pg(pid_t, int, int); + extern int kill_sl(pid_t, int, int); + extern int kill_proc(pid_t, int, int); + extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); + extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); + + static inline int signal_pending(struct task_struct *p) + { + return (p->sigpending != 0); + } + + /* + * Re-calculate pending state from the set of locally pending + * signals, globally pending signals, and blocked signals. 
+ */ + static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) + { + unsigned long ready; + long i; + + switch (_NSIG_WORDS) { + default: + for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) + ready |= signal->sig[i] &~ blocked->sig[i]; + break; + + case 4: ready = signal->sig[3] &~ blocked->sig[3]; + ready |= signal->sig[2] &~ blocked->sig[2]; + ready |= signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 2: ready = signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 1: ready = signal->sig[0] &~ blocked->sig[0]; + } + return ready != 0; + } + + /* Reevaluate whether the task has signals pending delivery. + This is required every time the blocked sigset_t changes. + All callers should have t->sigmask_lock. */ + + static inline void recalc_sigpending(struct task_struct *t) + { + t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); + } + + /* True if we are on the alternate signal stack. */ + + static inline int on_sig_stack(unsigned long sp) + { + return (sp - current->sas_ss_sp < current->sas_ss_size); + } + + static inline int sas_ss_flags(unsigned long sp) + { + return (current->sas_ss_size == 0 ? SS_DISABLE + : on_sig_stack(sp) ? SS_ONSTACK : 0); + } + + extern int request_irq(unsigned int, + void (*handler)(int, void *, struct pt_regs *), + unsigned long, const char *, void *); + extern void free_irq(unsigned int, void *); + + /* + * This has now become a routine instead of a macro, it sets a flag if + * it returns true (to do BSD-style accounting where the process is flagged + * if it uses root privs). The implication of this is that you should do + * normal permissions checks first, and check suser() last. + * + * [Dec 1997 -- Chris Evans] + * For correctness, the above considerations need to be extended to + * fsuser(). This is done, along with moving fsuser() checks to be + * last. + * + * These will be removed, but in the mean time, when the SECURE_NOROOT + * flag is set, uids don't grant privilege. + */ + static inline int suser(void) + { + if (!issecure(SECURE_NOROOT) && current->euid == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; + } + + static inline int fsuser(void) + { + if (!issecure(SECURE_NOROOT) && current->fsuid == 0) { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; + } + + /* + * capable() checks for a particular capability. + * New privilege checks should use this interface, rather than suser() or + * fsuser(). See include/linux/capability.h for defined capabilities. + */ + + static inline int capable(int cap) + { + #if 1 /* ok now */ + if (cap_raised(current->cap_effective, cap)) + #else + if (cap_is_fs_cap(cap) ? 
current->fsuid == 0 : current->euid == 0) + #endif + { + current->flags |= PF_SUPERPRIV; + return 1; + } + return 0; + } + + /* + * Routines for handling mm_structs + */ + extern struct mm_struct * mm_alloc(void); + + extern struct mm_struct * start_lazy_tlb(void); + extern void end_lazy_tlb(struct mm_struct *mm); + + /* mmdrop drops the mm and the page tables */ + extern inline void FASTCALL(__mmdrop(struct mm_struct *)); + static inline void mmdrop(struct mm_struct * mm) + { + if (atomic_dec_and_test(&mm->mm_count)) + __mmdrop(mm); + } + + /* mmput gets rid of the mappings and all user-space */ + extern void mmput(struct mm_struct *); + /* Remove the current tasks stale references to the old mm_struct */ + extern void mm_release(void); + + /* + * Routines for handling the fd arrays + */ + extern struct file ** alloc_fd_array(int); + extern int expand_fd_array(struct files_struct *, int nr); + extern void free_fd_array(struct file **, int); + + extern fd_set *alloc_fdset(int); + extern int expand_fdset(struct files_struct *, int nr); + extern void free_fdset(fd_set *, int); + + extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); + extern void flush_thread(void); + extern void exit_thread(void); + + extern void exit_mm(struct task_struct *); + extern void exit_files(struct task_struct *); + extern void exit_sighand(struct task_struct *); + + extern void reparent_to_init(void); + extern void daemonize(void); + + extern int do_execve(char *, char **, char **, struct pt_regs *); + extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); + + extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); + extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); + extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); + + #define __wait_event(wq, condition) \ + do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + schedule(); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ + } while (0) + + #define wait_event(wq, condition) \ + do { \ + if (condition) \ + break; \ + __wait_event(wq, condition); \ + } while (0) + + #define __wait_event_interruptible(wq, condition, ret) \ + do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (!signal_pending(current)) { \ + schedule(); \ + continue; \ + } \ + ret = -ERESTARTSYS; \ + break; \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ + } while (0) + + #define wait_event_interruptible(wq, condition) \ + ({ \ + int __ret = 0; \ + if (!(condition)) \ + __wait_event_interruptible(wq, condition, __ret); \ + __ret; \ + }) + + #define REMOVE_LINKS(p) do { \ + (p)->next_task->prev_task = (p)->prev_task; \ + (p)->prev_task->next_task = (p)->next_task; \ + if ((p)->p_osptr) \ + (p)->p_osptr->p_ysptr = (p)->p_ysptr; \ + if ((p)->p_ysptr) \ + (p)->p_ysptr->p_osptr = (p)->p_osptr; \ + else \ + (p)->p_pptr->p_cptr = (p)->p_osptr; \ + } while (0) + + #define SET_LINKS(p) do { \ + (p)->next_task = &init_task; \ + (p)->prev_task = init_task.prev_task; \ + init_task.prev_task->next_task = (p); \ + init_task.prev_task = (p); \ + (p)->p_ysptr = NULL; 
diff -rc2P linux/kernel/sysctl.c linux-2.4.13/kernel/sysctl.c
*** linux/kernel/sysctl.c	Fri Nov 9 16:15:08 2001
--- linux-2.4.13/kernel/sysctl.c	Fri Nov 9 16:58:00 2001
***************
*** 30,33 ****
--- 30,35 ----
  #include
  #include
+ #include
+ #include
  #include
***************
*** 303,306 ****
--- 305,316 ----
  	{FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int),
  	 0644, NULL, &proc_dointvec},
+ #ifdef CONFIG_JBD_DEBUG
+ 	{FS_LEASE_TIME+1, "jbd-debug", &journal_enable_debug, sizeof (int),
+ 	 0644, NULL, &proc_dointvec},
+ #endif
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ 	{FS_LEASE_TIME+2, "jbd-oom-retry", &journal_oom_retry, sizeof (int),
+ 	 0644, NULL, &proc_dointvec},
+ #endif
  	{0}
  };
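The sysctl.c hunk registers journal_enable_debug and journal_oom_retry as plain integer sysctls next to lease-break-time. Assuming they therefore appear as /proc/sys/fs/jbd-debug and /proc/sys/fs/jbd-oom-retry, a small userspace helper such as the sketch below can set the debug level; the file path and the helper itself are illustrative and are not part of the patch.

/*
 * Minimal userspace sketch: set the JBD debug level via procfs.
 * Assumes the entry registered above ends up as /proc/sys/fs/jbd-debug
 * (it is added to the same sysctl table as "lease-break-time").
 * Build: cc -o jbd-debug jbd-debug.c ; run as root: ./jbd-debug 3
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *path = "/proc/sys/fs/jbd-debug";
	int level = (argc > 1) ? atoi(argv[1]) : 0;	/* 0 turns debugging off */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "%d\n", level);
	fclose(f);
	return 0;
}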
diff -rc2P linux/mm/filemap.c linux-2.4.13/mm/filemap.c
*** linux/mm/filemap.c	Tue Oct 23 20:52:48 2001
--- linux-2.4.13/mm/filemap.c	Fri Nov 9 16:58:00 2001
***************
*** 201,211 ****
  }
  
  static inline void truncate_partial_page(struct page *page, unsigned partial)
  {
  	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
- 
  	if (page->buffers)
! 		block_flushpage(page, partial);
! }
--- 201,218 ----
  }
  
+ static int do_flushpage(struct page *page, unsigned long offset)
+ {
+ 	int (*flushpage) (struct page *, unsigned long);
+ 	flushpage = page->mapping->a_ops->flushpage;
+ 	if (flushpage)
+ 		return (*flushpage)(page, offset);
+ 	return block_flushpage(page, offset);
+ }
+ 
  static inline void truncate_partial_page(struct page *page, unsigned partial)
  {
  	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
  	if (page->buffers)
! 		do_flushpage(page, partial);
  }
***************
*** 213,217 ****
  {
  	/* Leave it on the LRU if it gets converted into anonymous buffers */
! 	if (!page->buffers || block_flushpage(page, 0))
  		lru_cache_del(page);
--- 220,224 ----
  {
  	/* Leave it on the LRU if it gets converted into anonymous buffers */
! 	if (!page->buffers || do_flushpage(page, 0))
  		lru_cache_del(page);
***************
*** 1119,1122 ****
--- 1126,1130 ----
  }
  
+ 
  /*
   * Mark a page as having seen activity.
***************
*** 2817,2821 ****
  	err = written ? written : status;
  out:
- 	up(&inode->i_sem);
  	return err;
--- 2825,2828 ----
diff -rc2P linux/mm/memory.c linux-2.4.13/mm/memory.c
*** linux/mm/memory.c	Mon Oct 15 15:09:50 2001
--- linux-2.4.13/mm/memory.c	Fri Nov 9 16:58:00 2001
***************
*** 1243,1250 ****
  	struct page * new_page;
  	pte_t entry;
! 	if (!vma->vm_ops || !vma->vm_ops->nopage)
  		return do_anonymous_page(mm, vma, page_table, write_access, address);
  	spin_unlock(&mm->page_table_lock);
  	/*
--- 1243,1256 ----
  	struct page * new_page;
  	pte_t entry;
! 	int ret;
! 	struct inode *inode = NULL;
! 	if (!vma->vm_ops || !vma->vm_ops->nopage)
  		return do_anonymous_page(mm, vma, page_table, write_access, address);
  	spin_unlock(&mm->page_table_lock);
+ 	if (vma->vm_file && vma->vm_file->f_dentry)
+ 		inode = vma->vm_file->f_dentry->d_inode;
+ 	if (inode)
+ 		down_read(&inode->i_truncate_sem);
  	/*
***************
*** 1256,1263 ****
  	spin_lock(&mm->page_table_lock);
! 	if (new_page == NULL)	/* no page was available -- SIGBUS */
! 		return 0;
! 	if (new_page == NOPAGE_OOM)
! 		return -1;
  	/*
  	 * This silly early PAGE_DIRTY setting removes a race
--- 1262,1275 ----
  	spin_lock(&mm->page_table_lock);
! 	if (new_page == NULL) {	/* no page was available -- SIGBUS */
! 		ret = 0;
! 		goto out;
! 	}
! 
! 	if (new_page == NOPAGE_OOM) {
! 		ret = -1;
! 		goto out;
! 	}
! 
  	/*
  	 * This silly early PAGE_DIRTY setting removes a race
***************
*** 1285,1294 ****
  		/* One of our sibling threads was faster, back out. */
  		page_cache_release(new_page);
! 		return 1;
  	}
  
  	/* no need to invalidate: a not-present page shouldn't be cached */
  	update_mmu_cache(vma, address, entry);
! 	return 2;	/* Major fault */
  }
--- 1297,1311 ----
  		/* One of our sibling threads was faster, back out. */
  		page_cache_release(new_page);
! 		ret = 1;
! 		goto out;
  	}
  
  	/* no need to invalidate: a not-present page shouldn't be cached */
  	update_mmu_cache(vma, address, entry);
! 	ret = 2;	/* Major fault */
! out:
! 	if (inode)
! 		up_read(&inode->i_truncate_sem);
! 	return ret;
  }
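The mm/memory.c hunks make do_no_page() take inode->i_truncate_sem for read around the ->nopage() call, so a page fault cannot race with a truncate that is tearing down the same pages. The i_truncate_sem field and the write-side locking in the truncate path are added elsewhere in the patch and are not shown in these hunks; the sketch below only illustrates the intended reader/writer pattern with invented types, it is not the patch's code.

/*
 * Sketch of the intended serialisation.  The fault side, as in the
 * do_no_page() hunk above, takes the semaphore for read; the truncate
 * side is assumed to take it for write so no new pages can be faulted
 * in while the file is being shrunk.
 */
#include <linux/rwsem.h>

struct example_inode {
	struct rw_semaphore i_truncate_sem;	/* init_rwsem() at inode setup */
	unsigned long i_size;
};

/* Fault path: many faulting threads may hold the semaphore concurrently. */
static void example_fault(struct example_inode *inode)
{
	down_read(&inode->i_truncate_sem);
	/* ... call ->nopage() and install the pte here ... */
	up_read(&inode->i_truncate_sem);
}

/* Truncate path (assumed): exclude all faulting threads. */
static void example_truncate(struct example_inode *inode, unsigned long size)
{
	down_write(&inode->i_truncate_sem);
	inode->i_size = size;
	/* ... drop pages and buffers beyond the new size here ... */
	up_write(&inode->i_truncate_sem);
}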
diff -rc2P linux/mm/vmscan.c linux-2.4.13/mm/vmscan.c
*** linux/mm/vmscan.c	Wed Oct 24 00:48:55 2001
--- linux-2.4.13/mm/vmscan.c	Fri Nov 9 16:58:00 2001
***************
*** 8,12 ****
   *  Removed kswapd_ctl limits, and swap out as many pages as needed
   *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
-  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
   *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
   *  Multiqueue VM started 5.8.00, Rik van Riel.
--- 8,11 ----
***************
*** 415,419 ****
  		page_cache_get(page);
  
! 		if (try_to_free_buffers(page, gfp_mask)) {
  			if (!page->mapping) {
  				/*
--- 414,418 ----
  		page_cache_get(page);
  
! 		if (try_to_release_page(page, gfp_mask)) {
  			if (!page->mapping) {
  				/*
***************
*** 436,440 ****
  			/*
  			 * The page is still in pagecache so undo the stuff
! 			 * before the try_to_free_buffers since we've not
  			 * finished and we can now try the next step.
  			 */
--- 435,439 ----
  			/*
  			 * The page is still in pagecache so undo the stuff
! 			 * before the try_to_release_page since we've not
  			 * finished and we can now try the next step.
  			 */
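The vmscan.c hunk replaces the direct try_to_free_buffers() call with try_to_release_page(), which gives the filesystem a chance to refuse or handle the release of buffers the journal still needs. The helper's implementation is not part of the hunks shown here; the sketch below is a guess at the dispatch pattern, mirroring the do_flushpage() helper added to mm/filemap.c above, with ->releasepage assumed to be a new address_space operation introduced elsewhere in the patch.

/*
 * Sketch of the dispatch try_to_release_page() is expected to perform.
 * This is an illustration of the pattern, not the patch's actual code;
 * the ->releasepage hook is an assumption.
 */
#include <linux/mm.h>
#include <linux/fs.h>

static int example_release_page(struct page *page, int gfp_mask)
{
	int (*releasepage)(struct page *, int);

	if (!page->mapping)			/* anonymous buffers */
		return try_to_free_buffers(page, gfp_mask);

	releasepage = page->mapping->a_ops->releasepage;
	if (releasepage)			/* let the fs (e.g. ext3) decide */
		return (*releasepage)(page, gfp_mask);

	return try_to_free_buffers(page, gfp_mask);	/* default behaviour */
}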