diff -rc2P linux/Documentation/Configure.help linux-2.4.13/Documentation/Configure.help *** linux/Documentation/Configure.help Sat Oct 20 22:17:19 2001 --- linux-2.4.13/Documentation/Configure.help Fri Nov 9 16:58:00 2001 *************** *** 12059,12062 **** --- 12059,12132 ---- wants to say Y here. + Ext3 journaling file system support (EXPERIMENTAL) + CONFIG_EXT3_FS + This is the journaling version of the Second extended file system + (often called ext3), the de facto standard Linux file system + (method to organize files on a storage device) for hard disks. + + The journaling code included in this driver means you do not have + to run e2fsck (file system checker) on your file systems after a + crash. The journal keeps track of any changes that were being made + at the time the system crashed, and can ensure that your file system + is consistent without the need for a lengthy check. + + Other than adding the journal to the filesystem, the on-disk format of + ext3 is identical to ext2. It is possible to freely switch between + using the ext3 driver and the ext2 driver, as long as the filesystem + has been cleanly unmounted, or e2fsck is run on the filesystem. + + To add a journal on an existing ext2 filesystem or change the behavior + of ext3 file systems, you can use the tune2fs utility ("man tune2fs"). + To modify attributes of files and directories on ext3 file systems, + use chattr ("man chattr"). You need to be using e2fsprogs version + 1.20 or later in order to create ext3 journals (available at + <http://sourceforge.net/projects/e2fsprogs/>). + + If you want to compile this file system as a module ( = code which + can be inserted in and removed from the running kernel whenever you + want), say M here and read Documentation/modules.txt. The module + will be called ext3.o. Be aware however that the file system of your + root partition (the one containing the directory /) cannot be + compiled as a module, and so this may be dangerous. 
+ + Journal Block Device support (JBD for ext3) (EXPERIMENTAL) + CONFIG_JBD + This is a generic journaling layer for block devices. It is currently + used by the ext3 file system, but it could also be used to add journal + support to other file systems or block devices such as RAID or LVM. + + If you are using the ext3 filesystem, you need to say Y here. If you + are not using ext3 then you will probably want to say N. + + If you want to compile this device as a module ( = code which can be + inserted in and removed from the running kernel whenever you want), + say M here and read Documentation/modules.txt. The module will be called + jbd.o. If you are compiling ext3 into the kernel, you cannot compile + this code as a module. + + JBD (ext3) debugging support + CONFIG_JBD_DEBUG + If you are using the ext3 journaled file system (or potentially any + other file system/device using JBD), this option allows you to enable + debugging output while the system is running, in order to help track + down any problems you are having. By default the debugging output + will be turned off. + + If you select Y here, then you will be able to turn on debugging with + "echo N > /proc/sys/fs/jbd-debug", where N is a number between 1 and 5, + the higher the number, the more debugging output is generated. To turn + debugging off again, do "echo 0 > /proc/sys/fs/jbd-debug". + + Buffer Head tracing (DEBUG) + CONFIG_BUFFER_DEBUG + If you are a kernel developer working with file systems or in the block + device layer, this buffer head tracing may help you to track down bugs + in your code. This enables some debugging macros (BUFFER_TRACE, etc) + which allow you to track the state of a buffer through various layers + of code. The debugging code is used primarily by ext3 and JBD code. + + Because this option adds considerably to the size of each buffer, most + people will want to say N here. 
+ BFS file system support (EXPERIMENTAL) CONFIG_BFS_FS diff -rc2P linux/drivers/block/ll_rw_blk.c linux-2.4.13/drivers/block/ll_rw_blk.c *** linux/drivers/block/ll_rw_blk.c Sat Oct 13 13:30:30 2001 --- linux-2.4.13/drivers/block/ll_rw_blk.c Fri Nov 9 16:58:00 2001 *************** *** 672,677 **** down by us so at this point flushpage will block and won't clear the mapped bit under us. */ ! if (!buffer_mapped(bh)) BUG(); /* --- 672,679 ---- down by us so at this point flushpage will block and won't clear the mapped bit under us. */ ! if (!buffer_mapped(bh)) { ! print_buffer_trace(bh); BUG(); + } /* *************** *** 1007,1013 **** switch(rw) { case WRITE: ! if (!atomic_set_buffer_clean(bh)) /* Hmmph! Nothing to write */ goto end_io; __mark_buffer_clean(bh); break; --- 1009,1018 ---- switch(rw) { case WRITE: ! if (!atomic_set_buffer_clean(bh)) { ! BUFFER_TRACE(bh, "already clean"); /* Hmmph! Nothing to write */ goto end_io; + } + BUFFER_TRACE(bh, "set clean, write underway"); __mark_buffer_clean(bh); break; *************** *** 1032,1037 **** sorry: /* Make sure we don't get infinite dirty retries.. */ ! for (i = 0; i < nr; i++) mark_buffer_clean(bhs[i]); } --- 1037,1044 ---- sorry: /* Make sure we don't get infinite dirty retries.. */ ! for (i = 0; i < nr; i++) { ! BUFFER_TRACE(bhs[i], "sorry"); mark_buffer_clean(bhs[i]); + } } *************** *** 1133,1136 **** --- 1140,1144 ---- queue_nr_requests = 128; + /* * Batch frees according to queue length diff -rc2P linux/drivers/block/loop.c linux-2.4.13/drivers/block/loop.c *** linux/drivers/block/loop.c Mon Oct 15 21:53:51 2001 --- linux-2.4.13/drivers/block/loop.c Fri Nov 9 16:58:00 2001 *************** *** 187,190 **** --- 187,192 ---- while (len > 0) { int IV = index * (PAGE_CACHE_SIZE/bsize) + offset/bsize; + int transfer_result; + size = PAGE_CACHE_SIZE - offset; if (size > len) *************** *** 198,205 **** kaddr = page_address(page); flush_dcache_page(page); ! 
if (lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV)) ! goto write_fail; if (aops->commit_write(file, page, offset, offset+size)) goto unlock; data += size; len -= size; --- 200,216 ---- kaddr = page_address(page); flush_dcache_page(page); ! transfer_result = lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV); ! if (transfer_result) { ! /* ! * The transfer failed, but we still write the data to ! * keep prepare/commit calls balanced. ! */ ! printk(KERN_ERR "loop: transfer error block %ld\n", index); ! memset(kaddr + offset, 0, size); ! } if (aops->commit_write(file, page, offset, offset+size)) goto unlock; + if (transfer_result) + goto unlock; data += size; len -= size; *************** *** 213,220 **** return 0; - write_fail: - printk(KERN_ERR "loop: transfer error block %ld\n", index); - ClearPageUptodate(page); - kunmap(page); unlock: UnlockPage(page); --- 224,227 ---- diff -rc2P linux/drivers/ide/ide-disk.c linux-2.4.13/drivers/ide/ide-disk.c *** linux/drivers/ide/ide-disk.c Thu Oct 11 12:14:32 2001 --- linux-2.4.13/drivers/ide/ide-disk.c Fri Nov 9 16:58:00 2001 *************** *** 368,371 **** --- 368,392 ---- static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) { + #ifdef CONFIG_JBD_DEBUG + /* + * Silently stop writing to this disk to simulate a crash. 
+ */ + extern int journal_no_write[2]; + int i; + + if (rq->cmd != WRITE) + goto write_ok; + + for (i = 0; i < 2; i++) { + if ((journal_no_write[i] & 0xdead0000) == 0xdead0000) { + if (rq->rq_dev == (journal_no_write[i] & 0xffff)) { + ide_end_request(1, HWGROUP(drive)); + return ide_stopped; + } + } + } + write_ok: + ; + #endif if (IDE_CONTROL_REG) OUT_BYTE(drive->ctl,IDE_CONTROL_REG); diff -rc2P linux/fs/Config.in linux-2.4.13/fs/Config.in *** linux/fs/Config.in Thu Oct 4 18:13:18 2001 --- linux-2.4.13/fs/Config.in Fri Nov 9 16:57:59 2001 *************** *** 21,24 **** --- 21,32 ---- dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL + tristate 'Ext3 journalling file system support (EXPERIMENTAL)' CONFIG_EXT3_FS + # CONFIG_JBD could be its own option (even modular), but until there are + # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS + # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS + define_bool CONFIG_JBD $CONFIG_EXT3_FS + dep_mbool ' JBD (ext3) debugging support' CONFIG_JBD_DEBUG $CONFIG_JBD + bool 'Buffer Head tracing (DEBUG)' CONFIG_BUFFER_DEBUG + # msdos file systems tristate 'DOS FAT fs support' CONFIG_FAT_FS diff -rc2P linux/fs/Makefile linux-2.4.13/fs/Makefile *** linux/fs/Makefile Thu Oct 4 18:13:18 2001 --- linux-2.4.13/fs/Makefile Fri Nov 9 16:58:00 2001 *************** *** 8,12 **** O_TARGET := fs.o ! export-objs := filesystems.o open.o dcache.o mod-subdirs := nls --- 8,12 ---- O_TARGET := fs.o ! export-objs := filesystems.o open.o dcache.o buffer.o jbd-kernel.o mod-subdirs := nls *************** *** 15,19 **** fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ ! filesystems.o namespace.o ifeq ($(CONFIG_QUOTA),y) --- 15,19 ---- fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \ ! 
filesystems.o namespace.o jbd-kernel.o ifeq ($(CONFIG_QUOTA),y) *************** *** 27,30 **** --- 27,32 ---- # Do not add any filesystems before this line + subdir-$(CONFIG_EXT3_FS) += ext3 # Before ext2 so root fs can be ext3 + subdir-$(CONFIG_JBD) += jbd subdir-$(CONFIG_EXT2_FS) += ext2 subdir-$(CONFIG_CRAMFS) += cramfs diff -rc2P linux/fs/buffer.c linux-2.4.13/fs/buffer.c *** linux/fs/buffer.c Tue Oct 23 20:54:19 2001 --- linux-2.4.13/fs/buffer.c Fri Nov 9 16:57:59 2001 *************** *** 46,49 **** --- 46,51 ---- #include #include + #include + #include #include *************** *** 614,619 **** by the user. ! Thus invalidate_buffers in general usage is not allwowed to trash dirty ! buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved. NOTE: In the case where the user removed a removable-media-disk even if --- 616,625 ---- by the user. ! Thus invalidate_buffers in general usage is not allwowed to trash ! dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to ! be preserved. These buffers are simply skipped. ! ! We also skip buffers which are still in use. For example this can ! happen if a userspace program is reading the block device. 
NOTE: In the case where the user removed a removable-media-disk even if *************** *** 718,721 **** --- 724,728 ---- bh->b_end_io = handler; bh->b_private = private; + buffer_trace_init(&bh->b_history); } *************** *** 727,730 **** --- 734,738 ---- struct page *page; + BUFFER_TRACE(bh, "enter"); mark_buffer_uptodate(bh, uptodate); *************** *** 1093,1096 **** --- 1101,1110 ---- } + void set_buffer_flushtime(struct buffer_head *bh) + { + bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer; + } + EXPORT_SYMBOL(set_buffer_flushtime); + /* * A buffer may need to be moved from one buffer list to another *************** *** 1100,1103 **** --- 1114,1120 ---- { int dispose = BUF_CLEAN; + + BUFFER_TRACE(bh, "enter"); + if (buffer_locked(bh)) dispose = BUF_LOCKED; *************** *** 1111,1114 **** --- 1128,1132 ---- __insert_into_lru_list(bh, dispose); } + BUFFER_TRACE(bh, "exit"); } *************** *** 1125,1128 **** --- 1143,1147 ---- void __brelse(struct buffer_head * buf) { + BUFFER_TRACE(buf, "entry"); if (atomic_read(&buf->b_count)) { put_bh(buf); *************** *** 1138,1141 **** --- 1157,1161 ---- void __bforget(struct buffer_head * buf) { + BUFFER_TRACE(buf, "enter"); mark_buffer_clean(buf); __brelse(buf); *************** *** 1168,1175 **** * Note: the caller should wake up the buffer_wait list if needed. */ ! static __inline__ void __put_unused_buffer_head(struct buffer_head * bh) { if (bh->b_inode) BUG(); if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { kmem_cache_free(bh_cachep, bh); --- 1188,1207 ---- * Note: the caller should wake up the buffer_wait list if needed. */ ! 
static void __put_unused_buffer_head(struct buffer_head * bh) { if (bh->b_inode) BUG(); + + J_ASSERT_BH(bh, bh->b_prev_free == 0); + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + if (buffer_jbd(bh)) { + J_ASSERT_BH(bh, bh2jh(bh)->b_transaction == 0); + J_ASSERT_BH(bh, bh2jh(bh)->b_next_transaction == 0); + J_ASSERT_BH(bh, bh2jh(bh)->b_frozen_data == 0); + J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data == 0); + } + #endif + buffer_trace_init(&bh->b_history); + if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { kmem_cache_free(bh_cachep, bh); *************** *** 1185,1188 **** --- 1217,1228 ---- } + void put_unused_buffer_head(struct buffer_head *bh) + { + spin_lock(&unused_list_lock); + __put_unused_buffer_head(bh); + spin_unlock(&unused_list_lock); + } + EXPORT_SYMBOL(put_unused_buffer_head); + /* * Reserve NR_RESERVED buffer heads for async IO requests to avoid *************** *** 1190,1194 **** * buffer heads is now handled in create_buffers(). */ ! static struct buffer_head * get_unused_buffer_head(int async) { struct buffer_head * bh; --- 1230,1234 ---- * buffer heads is now handled in create_buffers(). */ ! 
struct buffer_head * get_unused_buffer_head(int async) { struct buffer_head * bh; *************** *** 1211,1214 **** --- 1251,1255 ---- bh->b_blocknr = -1; bh->b_this_page = NULL; + buffer_trace_init(&bh->b_history); return bh; } *************** *** 1224,1227 **** --- 1265,1269 ---- nr_unused_buffer_heads--; spin_unlock(&unused_list_lock); + buffer_trace_init(&bh->b_history); return bh; } *************** *** 1231,1234 **** --- 1273,1277 ---- return NULL; } + EXPORT_SYMBOL(get_unused_buffer_head); void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset) *************** *** 1245,1248 **** --- 1288,1292 ---- bh->b_data = page_address(page) + offset; } + EXPORT_SYMBOL(set_bh_page); /* *************** *** 1328,1331 **** --- 1372,1376 ---- { if (buffer_mapped(bh)) { + BUFFER_TRACE(bh, "entry"); mark_buffer_clean(bh); lock_buffer(bh); *************** *** 1338,1341 **** --- 1383,1411 ---- } + /** + * try_to_release_page - release old fs-specific metadata on a page + * + */ + + int try_to_release_page(struct page * page, int gfp_mask) + { + if (!PageLocked(page)) + BUG(); + + if (!page->mapping) + goto try_to_free; + if (!page->mapping->a_ops->releasepage) + goto try_to_free; + if (page->mapping->a_ops->releasepage(page, gfp_mask)) + goto try_to_free; + /* + * We couldn't release buffer metadata; don't even bother trying + * to release buffers. + */ + return 0; + try_to_free: + return try_to_free_buffers(page, gfp_mask); + } + /* * We don't have to release all buffers here, but *************** *** 1381,1385 **** */ if (!offset) { ! if (!try_to_free_buffers(page, 0)) return 0; } --- 1451,1455 ---- */ if (!offset) { ! 
if (!try_to_release_page(page, 0)) return 0; } *************** *** 1409,1412 **** --- 1479,1483 ---- page_cache_get(page); } + EXPORT_SYMBOL(create_empty_buffers); /* *************** *** 1427,1431 **** --- 1498,1505 ---- old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size); + J_ASSERT_BH(bh, old_bh != bh); if (old_bh) { + BUFFER_TRACE(old_bh, "old_bh - entry"); + J_ASSERT_BH(old_bh, !buffer_jlist_eq(old_bh, BJ_Metadata)); mark_buffer_clean(old_bh); wait_on_buffer(old_bh); *************** *** 1449,1454 **** /* ! * block_write_full_page() is SMP-safe - currently it's still ! * being called with the kernel lock held, but the code is ready. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) --- 1523,1527 ---- /* ! * block_write_full_page() is SMP threaded - the kernel lock is not held. */ static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block) *************** *** 1484,1489 **** if (err) goto out; ! if (buffer_new(bh)) unmap_underlying_metadata(bh); } bh = bh->b_this_page; --- 1557,1564 ---- if (err) goto out; ! if (buffer_new(bh)) { ! 
BUFFER_TRACE(bh, "new: call unmap_underlying_metadata"); unmap_underlying_metadata(bh); + } } bh = bh->b_this_page; *************** *** 1493,1496 **** --- 1568,1572 ---- /* Stage 2: lock the buffers, mark them clean */ do { + BUFFER_TRACE(bh, "lock it"); lock_buffer(bh); set_buffer_async_io(bh); *************** *** 1549,1554 **** --- 1625,1632 ---- goto out; if (buffer_new(bh)) { + BUFFER_TRACE(bh, "new: call unmap_underlying_metadata"); unmap_underlying_metadata(bh); if (Page_Uptodate(page)) { + BUFFER_TRACE(bh, "setting uptodate"); set_bit(BH_Uptodate, &bh->b_state); continue; *************** *** 1564,1567 **** --- 1642,1646 ---- } if (Page_Uptodate(page)) { + BUFFER_TRACE(bh, "setting uptodate"); set_bit(BH_Uptodate, &bh->b_state); continue; *************** *** 1569,1572 **** --- 1648,1652 ---- if (!buffer_uptodate(bh) && (block_start < from || block_end > to)) { + BUFFER_TRACE(bh, "reading"); ll_rw_block(READ, 1, &bh); *wait_bh++=bh; *************** *** 1607,1610 **** --- 1687,1691 ---- set_bit(BH_Uptodate, &bh->b_state); if (!atomic_set_buffer_dirty(bh)) { + BUFFER_TRACE(bh, "mark dirty"); __mark_dirty(bh); buffer_insert_inode_data_queue(bh, inode); *************** *** 1890,1893 **** --- 1971,1975 ---- kunmap(page); + BUFFER_TRACE(bh, "zeroed end of block"); __mark_buffer_dirty(bh); err = 0; *************** *** 2447,2450 **** --- 2529,2534 ---- return 0; } + EXPORT_SYMBOL(try_to_free_buffers); + EXPORT_SYMBOL(buffermem_pages); /* ================== Debugging =================== */ diff -rc2P linux/fs/ext3/Makefile linux-2.4.13/fs/ext3/Makefile *** linux/fs/ext3/Makefile Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/Makefile Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,16 ---- + # + # Makefile for the linux ext2-filesystem routines. + # + # Note! Dependencies are done automagically by 'make dep', which also + # removes any old dependencies. DON'T put your own dependencies here + # unless it's something special (ie not a .c file). + # + # Note 2! 
The CFLAGS definitions are now in the main makefile... + + O_TARGET := ext3.o + + obj-y := acl.o balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + ioctl.o namei.o super.o symlink.o + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make diff -rc2P linux/fs/ext3/acl.c linux-2.4.13/fs/ext3/acl.c *** linux/fs/ext3/acl.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/acl.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,17 ---- + /* + * linux/fs/ext3/acl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + + #include + #include + + + /* + * This file will contain the Access Control Lists management for the + * second extended file system. + */ diff -rc2P linux/fs/ext3/balloc.c linux-2.4.13/fs/ext3/balloc.c *** linux/fs/ext3/balloc.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/balloc.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,995 ---- + /* + * linux/fs/ext3/balloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + + /* + * balloc.c contains the blocks allocation and deallocation routines + */ + + /* + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. 
The descriptors are loaded in memory + * when a file system is mounted (see ext3_read_super). + */ + + + #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + + struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh) + { + unsigned long group_desc; + unsigned long desc; + struct ext3_group_desc * gdp; + + if (block_group >= sb->u.ext3_sb.s_groups_count) { + ext3_error (sb, "ext3_get_group_desc", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sb->u.ext3_sb.s_groups_count); + + return NULL; + } + + group_desc = block_group / EXT3_DESC_PER_BLOCK(sb); + desc = block_group % EXT3_DESC_PER_BLOCK(sb); + if (!sb->u.ext3_sb.s_group_desc[group_desc]) { + ext3_error (sb, "ext3_get_group_desc", + "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, desc); + return NULL; + } + + gdp = (struct ext3_group_desc *) + sb->u.ext3_sb.s_group_desc[group_desc]->b_data; + if (bh) + *bh = sb->u.ext3_sb.s_group_desc[group_desc]; + return gdp + desc; + } + + /* + * Read the bitmap for a given block_group, reading into the specified + * slot in the superblock's bitmap cache. + * + * Return >=0 on success or a -ve error code. 
+ */ + + static int read_block_bitmap (struct super_block * sb, + unsigned int block_group, + unsigned long bitmap_nr) + { + struct ext3_group_desc * gdp; + struct buffer_head * bh = NULL; + int retval = -EIO; + + gdp = ext3_get_group_desc (sb, block_group, NULL); + if (!gdp) + goto error_out; + retval = 0; + bh = bread (sb->s_dev, + le32_to_cpu(gdp->bg_block_bitmap), sb->s_blocksize); + if (!bh) { + ext3_error (sb, "read_block_bitmap", + "Cannot read block bitmap - " + "block_group = %d, block_bitmap = %lu", + block_group, (unsigned long) gdp->bg_block_bitmap); + retval = -EIO; + } + /* + * On IO error, just leave a zero in the superblock's block pointer for + * this group. The IO will be retried next time. + */ + error_out: + sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group; + sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh; + return retval; + } + + /* + * load_block_bitmap loads the block bitmap for a blocks group + * + * It maintains a cache for the last bitmaps loaded. This cache is managed + * with a LRU algorithm. + * + * Notes: + * 1/ There is one cache per mounted file system. + * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, + * this function reads the bitmap without maintaining a LRU cache. + * + * Return the slot used to store the bitmap, or a -ve error code. 
+ */ + static int __load_block_bitmap (struct super_block * sb, + unsigned int block_group) + { + int i, j, retval = 0; + unsigned long block_bitmap_number; + struct buffer_head * block_bitmap; + + if (block_group >= sb->u.ext3_sb.s_groups_count) + ext3_panic (sb, "load_block_bitmap", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sb->u.ext3_sb.s_groups_count); + + if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) { + if (sb->u.ext3_sb.s_block_bitmap[block_group]) { + if (sb->u.ext3_sb.s_block_bitmap_number[block_group] == + block_group) + return block_group; + ext3_error (sb, "__load_block_bitmap", + "block_group != block_bitmap_number"); + } + retval = read_block_bitmap (sb, block_group, block_group); + if (retval < 0) + return retval; + return block_group; + } + + for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps && + sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++) + ; + if (i < sb->u.ext3_sb.s_loaded_block_bitmaps && + sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) { + block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i]; + block_bitmap = sb->u.ext3_sb.s_block_bitmap[i]; + for (j = i; j > 0; j--) { + sb->u.ext3_sb.s_block_bitmap_number[j] = + sb->u.ext3_sb.s_block_bitmap_number[j - 1]; + sb->u.ext3_sb.s_block_bitmap[j] = + sb->u.ext3_sb.s_block_bitmap[j - 1]; + } + sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number; + sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap; + + /* + * There's still one special case here --- if block_bitmap == 0 + * then our last attempt to read the bitmap failed and we have + * just ended up caching that failure. Try again to read it. 
+ */ + if (!block_bitmap) + retval = read_block_bitmap (sb, block_group, 0); + } else { + if (sb->u.ext3_sb.s_loaded_block_bitmaps < EXT3_MAX_GROUP_LOADED) + sb->u.ext3_sb.s_loaded_block_bitmaps++; + else + brelse (sb->u.ext3_sb.s_block_bitmap + [EXT3_MAX_GROUP_LOADED - 1]); + for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1; + j > 0; j--) { + sb->u.ext3_sb.s_block_bitmap_number[j] = + sb->u.ext3_sb.s_block_bitmap_number[j - 1]; + sb->u.ext3_sb.s_block_bitmap[j] = + sb->u.ext3_sb.s_block_bitmap[j - 1]; + } + retval = read_block_bitmap (sb, block_group, 0); + } + return retval; + } + + /* + * Load the block bitmap for a given block group. First of all do a couple + * of fast lookups for common cases and then pass the request onto the guts + * of the bitmap loader. + * + * Return the slot number of the group in the superblock's bitmap cache on + * success, or a -ve error code. + * + * There is still one inconsistency here --- if the number of groups in this + * filesystem is <= EXT3_MAX_GROUP_LOADED, then we have no way of + * differentiating between a group for which we have never performed a bitmap + * IO request, and a group for which the last bitmap read request failed. + */ + static inline int load_block_bitmap (struct super_block * sb, + unsigned int block_group) + { + int slot; + + /* + * Do the lookup for the slot. First of all, check if we're asking + * for the same slot as last time, and did we succeed that last time? + */ + if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 && + sb->u.ext3_sb.s_block_bitmap_number[0] == block_group && + sb->u.ext3_sb.s_block_bitmap[0]) { + return 0; + } + /* + * Or can we do a fast lookup based on a loaded group on a filesystem + * small enough to be mapped directly into the superblock? + */ + else if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED && + sb->u.ext3_sb.s_block_bitmap_number[block_group]==block_group + && sb->u.ext3_sb.s_block_bitmap[block_group]) { + slot = block_group; + } + /* + * If not, then do a full lookup for this block group. 
+ */ + else { + slot = __load_block_bitmap (sb, block_group); + } + + /* + * <0 means we just got an error + */ + if (slot < 0) + return slot; + + /* + * If it's a valid slot, we may still have cached a previous IO error, + * in which case the bh in the superblock cache will be zero. + */ + if (!sb->u.ext3_sb.s_block_bitmap[slot]) + return -EIO; + + /* + * Must have been read in OK to get this far. + */ + return slot; + } + + /* Free given blocks, update quota and i_blocks field */ + void ext3_free_blocks (handle_t *handle, struct inode * inode, + unsigned long block, unsigned long count) + { + struct buffer_head *bitmap_bh; + struct buffer_head *gd_bh; + unsigned long block_group; + unsigned long bit; + unsigned long i; + int bitmap_nr; + unsigned long overflow; + struct super_block * sb; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + int err = 0, ret; + int dquot_freed_blocks = 0; + + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); + return; + } + lock_super (sb); + es = sb->u.ext3_sb.s_es; + if (block < le32_to_cpu(es->s_first_data_block) || + (block + count) > le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks not in datazone - " + "block = %lu, count = %lu", block, count); + goto error_return; + } + + ext3_debug ("freeing block %lu\n", block); + + do_more: + overflow = 0; + block_group = (block - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + bit = (block - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + /* + * Check to see if we are freeing blocks across a group + * boundary. 
+ */ + if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) { + overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb); + count -= overflow; + } + bitmap_nr = load_block_bitmap (sb, block_group); + if (bitmap_nr < 0) + goto error_return; + + bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; + gdp = ext3_get_group_desc (sb, block_group, &gd_bh); + if (!gdp) + goto error_return; + + if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) || + in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) || + in_range (block, le32_to_cpu(gdp->bg_inode_table), + sb->u.ext3_sb.s_itb_per_group) || + in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table), + sb->u.ext3_sb.s_itb_per_group)) + ext3_error (sb, "ext3_free_blocks", + "Freeing blocks in system zones - " + "Block = %lu, count = %lu", + block, count); + + /* + * We are about to start releasing blocks in the bitmap, + * so we need undo access. + */ + /* @@@ check errors */ + BUFFER_TRACE(bitmap_bh, "getting undo access"); + err = ext3_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto error_return; + + /* + * We are about to modify some metadata. Call the journal APIs + * to unshare ->b_data if a currently-committing transaction is + * using it + */ + BUFFER_TRACE(gd_bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, gd_bh); + if (err) + goto error_return; + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (err) + goto error_return; + + for (i = 0; i < count; i++) { + /* + * An HJ special. This is expensive... 
+ */ + #ifdef CONFIG_JBD_DEBUG + { + struct buffer_head *debug_bh; + debug_bh = get_hash_table(sb->s_dev, block + i, + sb->s_blocksize); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "Deleted!"); + if (!bh2jh(bitmap_bh)->b_committed_data) + BUFFER_TRACE(debug_bh, + "No commited data in bitmap"); + BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); + __brelse(debug_bh); + } + } + #endif + BUFFER_TRACE(bitmap_bh, "clear bit"); + if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) { + ext3_error (sb, __FUNCTION__, + "bit already cleared for block %lu", + block + i); + BUFFER_TRACE(bitmap_bh, "bit already cleared"); + } else { + dquot_freed_blocks++; + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1); + es->s_free_blocks_count = + cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1); + } + /* @@@ This prevents newly-allocated data from being + * freed and then reallocated within the same + * transaction. + * + * Ideally we would want to allow that to happen, but to + * do so requires making journal_forget() capable of + * revoking the queued write of a data block, which + * implies blocking on the journal lock. *forget() + * cannot block due to truncate races. + * + * Eventually we can fix this by making journal_forget() + * return a status indicating whether or not it was able + * to revoke the buffer. On successful revoke, it is + * safe not to set the allocation bit in the committed + * bitmap, because we know that there is no outstanding + * activity on the buffer any more and so it is safe to + * reallocate it. 
+ */ + BUFFER_TRACE(bitmap_bh, "clear in b_committed_data"); + J_ASSERT_BH(bitmap_bh, + bh2jh(bitmap_bh)->b_committed_data != NULL); + ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data); + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext3_journal_dirty_metadata(handle, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext3_journal_dirty_metadata(handle, gd_bh); + if (!err) err = ret; + + /* And the superblock */ + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock"); + ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + if (!err) err = ret; + + if (overflow && !err) { + block += count; + count = overflow; + goto do_more; + } + sb->s_dirt = 1; + error_return: + ext3_std_error(sb, err); + unlock_super(sb); + if (dquot_freed_blocks) + DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); + return; + } + + /* For ext3 allocations, we must not reuse any blocks which are + * allocated in the bitmap buffer's "last committed data" copy. This + * prevents deletes from freeing up the page for reuse until we have + * committed the delete transaction. + * + * If we didn't do this, then deleting something and reallocating it as + * data would allow the old block to be overwritten before the + * transaction committed (because we force data to disk before commit). + * This would lead to corruption if we crashed between overwriting the + * data and committing the delete. + * + * @@@ We may want to make this allocation behaviour conditional on + * data-writes at some point, and disable it for metadata allocations or + * sync-data inodes. Returns 1 if bit nr may be allocated, 0 if it may not. + */ + static int ext3_test_allocatable(int nr, struct buffer_head *bh) + { + if (ext3_test_bit(nr, bh->b_data)) /* in use in the live bitmap */ + return 0; + if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data) /* no committed copy to consult */ + return 1; + return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data); /* must be free in the committed copy too */ + } + + /* + * Find an allocatable block in a bitmap.
We honour both the bitmap and + * its last-committed copy (if that exists), and perform the "most + * appropriate allocation" algorithm of looking for a free block near + * the initial goal; then for a free byte somewhere in the bitmap; then + * for any free bit in the bitmap. Returns the bit number found, or -1 if the bitmap search finds none. + */ + static int find_next_usable_block(int start, + struct buffer_head *bh, int maxblocks) + { + int here, next; + char *p, *r; + + if (start > 0) { + /* + * The goal was occupied; search forward for a free + * block between the goal and end_goal (at most 63 bits further on). + * + * end_goal is more or less random, but it has to be + * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the + * next 64-bit boundary is simple. + */ + int end_goal = (start + 63) & ~63; + here = ext3_find_next_zero_bit(bh->b_data, end_goal, start); + if (here < end_goal && ext3_test_allocatable(here, bh)) + return here; + + ext3_debug ("Bit not found near goal\n"); + + } + + here = start; + if (here < 0) /* a negative start means "no goal": scan from bit 0 */ + here = 0; + + /* + * There has been no free block found in the near vicinity of + * the goal: do a search forward through the block groups, + * searching in each group first for an entire free byte in the + * bitmap and then for any free bit. + * + * Search first in the remainder of the current group + */ + p = ((char *) bh->b_data) + (here >> 3); + r = memscan(p, 0, (maxblocks - here + 7) >> 3); + next = (r - ((char *) bh->b_data)) << 3; /* byte position back to a bit number */ + + if (next < maxblocks && ext3_test_allocatable(next, bh)) + return next; + + /* The bitmap search --- search forward alternately + * through the actual bitmap and the last-committed copy + * until we find a bit free in both.
*/ + + while (here < maxblocks) { + next = ext3_find_next_zero_bit ((unsigned long *) bh->b_data, + maxblocks, here); + if (next >= maxblocks) + return -1; + if (ext3_test_allocatable(next, bh)) + return next; + + J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data); + here = ext3_find_next_zero_bit + ((unsigned long *) bh2jh(bh)->b_committed_data, + maxblocks, next); + } + return -1; + } + + /* + * ext3_new_block uses a goal block to assist allocation. If the goal is + * free, or there is a free block within 32 blocks of the goal, that block + * is allocated. Otherwise a forward search is made for a free block; within + * each block group the search first looks for an entire free byte in the block + * bitmap, and then for any free bit if that fails. + * This function also updates quota and i_blocks field. + */ + int ext3_new_block (handle_t *handle, struct inode * inode, + unsigned long goal, u32 * prealloc_count, + u32 * prealloc_block, int * errp) + { + struct buffer_head * bh, *bhtmp; + struct buffer_head * bh2; + #if 0 + char * p, * r; + #endif + int i, j, k, tmp, alloctmp; + int bitmap_nr; + int fatal = 0, err; + struct super_block * sb; + struct ext3_group_desc * gdp; + struct ext3_super_block * es; + #ifdef EXT3FS_DEBUG + static int goal_hits = 0, goal_attempts = 0; + #endif + *errp = -ENOSPC; + sb = inode->i_sb; + if (!sb) { + printk ("ext3_new_block: nonexistent device"); + return 0; + } + + /* + * Check quota for allocation of this block. + */ + if (DQUOT_ALLOC_BLOCK(inode, 1)) { + *errp = -EDQUOT; + return 0; + } + + lock_super (sb); + es = sb->u.ext3_sb.s_es; + if (le32_to_cpu(es->s_free_blocks_count) <= + le32_to_cpu(es->s_r_blocks_count) && + ((sb->u.ext3_sb.s_resuid != current->fsuid) && + (sb->u.ext3_sb.s_resgid == 0 || + !in_group_p (sb->u.ext3_sb.s_resgid)) && + !capable(CAP_SYS_RESOURCE))) + goto out; + + ext3_debug ("goal=%lu.\n", goal); + + /* + * First, test whether the goal block is free. 
+ */ + if (goal < le32_to_cpu(es->s_first_data_block) || + goal >= le32_to_cpu(es->s_blocks_count)) + goal = le32_to_cpu(es->s_first_data_block); + i = (goal - le32_to_cpu(es->s_first_data_block)) / + EXT3_BLOCKS_PER_GROUP(sb); + gdp = ext3_get_group_desc (sb, i, &bh2); + if (!gdp) + goto io_error; + + if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { + j = ((goal - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)); + #ifdef EXT3FS_DEBUG + if (j) + goal_attempts++; + #endif + bitmap_nr = load_block_bitmap (sb, i); + if (bitmap_nr < 0) + goto io_error; + + bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; + + ext3_debug ("goal is at %d:%d.\n", i, j); + + if (ext3_test_allocatable(j, bh)) { + #ifdef EXT3FS_DEBUG + goal_hits++; + ext3_debug ("goal bit allocated.\n"); + #endif + goto got_block; + } + + j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb)); + if (j >= 0) + goto search_back; + } + + ext3_debug ("Bit not found in block group %d.\n", i); + + /* + * Now search the rest of the groups. We assume that + * i and gdp correctly point to the last group visited. + */ + for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) { + i++; + if (i >= sb->u.ext3_sb.s_groups_count) + i = 0; + gdp = ext3_get_group_desc (sb, i, &bh2); + if (!gdp) { + *errp = -EIO; + goto out; + } + if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) { + bitmap_nr = load_block_bitmap (sb, i); + if (bitmap_nr < 0) + goto io_error; + + bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr]; + j = find_next_usable_block(-1, bh, + EXT3_BLOCKS_PER_GROUP(sb)); + if (j >= 0) + goto search_back; + } + } + + /* No space left on the device */ + unlock_super (sb); + return 0; + + search_back: + /* + * We have succeeded in finding a free byte in the block + * bitmap. Now search backwards up to 7 bits to find the + * start of this group of free blocks. 
+ */ + for ( k = 0; + k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh); + k++, j--) + ; + + got_block: + + ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count); + + /* Make sure we use undo access for the bitmap, because it is + critical that we do the frozen_data COW on bitmap buffers in + all cases even if the buffer is in BJ_Forget state in the + committing transaction. */ + BUFFER_TRACE(bh, "get undo access for marking new block"); + fatal = ext3_journal_get_undo_access(handle, bh); + if (fatal) goto out; + + BUFFER_TRACE(bh2, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, bh2); + if (fatal) goto out; + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); + fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (fatal) goto out; + + tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(es->s_first_data_block); + + if (tmp == le32_to_cpu(gdp->bg_block_bitmap) || + tmp == le32_to_cpu(gdp->bg_inode_bitmap) || + in_range (tmp, le32_to_cpu(gdp->bg_inode_table), + sb->u.ext3_sb.s_itb_per_group)) + ext3_error (sb, "ext3_new_block", + "Allocating block in system zone - " + "block = %u", tmp); + + /* The superblock lock should guard against anybody else beating + * us to this point! */ + J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data)); + BUFFER_TRACE(bh, "setting bitmap bit"); + ext3_set_bit(j, bh->b_data); + + #ifdef CONFIG_JBD_DEBUG + { + struct buffer_head *debug_bh; + + /* Record bitmap buffer state in the newly allocated block */ + debug_bh = get_hash_table(sb->s_dev, tmp, sb->s_blocksize); + if (debug_bh) { + BUFFER_TRACE(debug_bh, "state when allocated"); + BUFFER_TRACE2(debug_bh, bh, "bitmap state"); + brelse(debug_bh); + } + } + #endif + if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data) + J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data)); + bhtmp = bh; + alloctmp = j; + + ext3_debug ("found bit %d\n", j); + + /* + * Do block preallocation now if required. 
+ */ + #ifdef EXT3_PREALLOCATE + /* + * akpm: this is not enabled for ext3. Need to use + * ext3_test_allocatable() + */ + /* Writer: ->i_prealloc* */ + if (prealloc_count && !*prealloc_count) { + int prealloc_goal; + unsigned long next_block = tmp + 1; + + prealloc_goal = es->s_prealloc_blocks ? + es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS; + + *prealloc_block = next_block; + /* Writer: end */ + for (k = 1; + k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb); + k++, next_block++) { + if (DQUOT_PREALLOC_BLOCK(inode, 1)) + break; + /* Writer: ->i_prealloc* */ + if (*prealloc_block + *prealloc_count != next_block || + ext3_set_bit (j + k, bh->b_data)) { + /* Writer: end */ + DQUOT_FREE_BLOCK(inode, 1); + break; + } + (*prealloc_count)++; + /* Writer: end */ + } + /* + * As soon as we go for per-group spinlocks we'll need these + * done inside the loop above. + */ + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - + (k - 1)); + es->s_free_blocks_count = + cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - + (k - 1)); + ext3_debug ("Preallocated a further %lu bits.\n", + (k - 1)); + } + #endif + + j = tmp; + + BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block"); + err = ext3_journal_dirty_metadata(handle, bh); + if (!fatal) fatal = err; + + if (j >= le32_to_cpu(es->s_blocks_count)) { + ext3_error (sb, "ext3_new_block", + "block(%d) >= blocks count(%d) - " + "block_group = %d, es == %p ",j, + le32_to_cpu(es->s_blocks_count), i, es); + goto out; + } + + /* + * It is up to the caller to add the new buffer to a journal + * list of some description. We don't know in advance whether + * the caller wants to use it as metadata or data. + */ + + ext3_debug ("allocating block %d. 
" + "Goal hits %d of %d.\n", j, goal_hits, goal_attempts); + + gdp->bg_free_blocks_count = + cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); + es->s_free_blocks_count = + cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1); + + BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor"); + err = ext3_journal_dirty_metadata(handle, bh2); + if (!fatal) fatal = err; + + BUFFER_TRACE(bh, "journal_dirty_metadata for superblock"); + err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + if (!fatal) fatal = err; + + sb->s_dirt = 1; + if (fatal) + goto out; + + unlock_super (sb); + *errp = 0; + return j; + + io_error: + *errp = -EIO; + out: + if (fatal) { + *errp = fatal; + ext3_std_error(sb, fatal); + } + unlock_super (sb); + return 0; + + } + + unsigned long ext3_count_free_blocks (struct super_block * sb) + { + #ifdef EXT3FS_DEBUG + struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + lock_super (sb); + es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + bitmap_nr = load_block_bitmap (sb, i); + if (bitmap_nr < 0) + continue; + + x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr], + sb->s_blocksize); + printk ("group %d: stored = %d, counted = %lu\n", + i, le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_blocks_count), desc_count, bitmap_count); + unlock_super (sb); + return bitmap_count; + #else + return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count); + #endif + } + + static inline int block_in_use (unsigned long block, + struct super_block * sb, + unsigned char * map) + { + return ext3_test_bit ((block - + 
le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb), map); + } + + static inline int test_root(int a, int b) /* 1 if a is 0 or a power of b (b^0 == 1 included), else 0; called here with b = 3, 5, 7 */ + { + if (a == 0) + return 1; + while (1) { + if (a == 1) /* divided all the way down: a was a power of b */ + return 1; + if (a % b) /* a remainder means a is not a power of b */ + return 0; + a = a / b; + } + } + + int ext3_group_sparse(int group) /* nonzero for groups 0 and 1 and for powers of 3, 5 and 7 */ + { + return (test_root(group, 3) || test_root(group, 5) || + test_root(group, 7)); + } + + /** + * ext3_bg_has_super - number of blocks used by the superblock in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the superblock (primary or backup) + * in this group. Currently this will be only 0 or 1: 0 only when the SPARSE_SUPER feature is set and ext3_group_sparse() rejects the group. + */ + int ext3_bg_has_super(struct super_block *sb, int group) + { + if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& + !ext3_group_sparse(group)) + return 0; + return 1; + } + + /** + * ext3_bg_num_gdb - number of blocks used by the group table in group + * @sb: superblock for filesystem + * @group: group number to check + * + * Return the number of blocks used by the group descriptor table + * (primary or backup) in this group. In the future there may be a + * different number of descriptor blocks in each group.
+ */ + unsigned long ext3_bg_num_gdb(struct super_block *sb, int group) + { + if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&& + !ext3_group_sparse(group)) + return 0; + return EXT3_SB(sb)->s_gdb_count; + } + + #ifdef CONFIG_EXT3_CHECK + /* Called at mount-time, super-block is locked. Cross-checks every group's block bitmap against its descriptor and reports mismatches through ext3_error(). */ + void ext3_check_blocks_bitmap (struct super_block * sb) + { + struct buffer_head * bh; + struct ext3_super_block * es; + unsigned long desc_count, bitmap_count, x, j; + unsigned long desc_blocks; + int bitmap_nr; + struct ext3_group_desc * gdp; + int i; + + es = sb->u.ext3_sb.s_es; + desc_count = 0; + bitmap_count = 0; + gdp = NULL; + for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) { + gdp = ext3_get_group_desc (sb, i, NULL); + if (!gdp) + continue; + desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + bitmap_nr = load_block_bitmap (sb, i); + if (bitmap_nr < 0) + continue; + + bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr]; + + if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data)) + ext3_error(sb, __FUNCTION__, + "Superblock in group %d is marked free", i); + + desc_blocks = ext3_bg_num_gdb(sb, i); + for (j = 0; j < desc_blocks; j++) /* j is unsigned long, hence %lu below */ + if (!ext3_test_bit(j + 1, bh->b_data)) + ext3_error(sb, __FUNCTION__, + "Descriptor block #%lu in group " + "%d is marked free", j, i); + + if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap), + sb, bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Block bitmap for group %d is marked free", + i); + + if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap), + sb, bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Inode bitmap for group %d is marked free", + i); + + for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++) + if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j, + sb, bh->b_data)) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Block #%lu of the inode table in " + "group %d is marked free", j, i); + + x = ext3_count_free (bh, sb->s_blocksize); + if
(le16_to_cpu(gdp->bg_free_blocks_count) != x) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Wrong free blocks count for group %d, " + "stored = %d, counted = %lu", i, + le16_to_cpu(gdp->bg_free_blocks_count), x); + bitmap_count += x; + } + if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count) + ext3_error (sb, "ext3_check_blocks_bitmap", + "Wrong free blocks count in super block, " + "stored = %lu, counted = %lu", + (unsigned long)le32_to_cpu(es->s_free_blocks_count), + bitmap_count); + } + #endif diff -rc2P linux/fs/ext3/bitmap.c linux-2.4.13/fs/ext3/bitmap.c *** linux/fs/ext3/bitmap.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/bitmap.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,26 ---- + /* + * linux/fs/ext3/bitmap.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + + #include + + + static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; /* nibblemap[v] = number of zero bits in the 4-bit value v */ + + unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars) /* counts the zero bits in the first numchars bytes of map->b_data; 0 if map is NULL */ + { + unsigned int i; + unsigned long sum = 0; + + if (!map) + return (0); + for (i = 0; i < numchars; i++) /* one table lookup per nibble: low half, then high half */ + sum += nibblemap[map->b_data[i] & 0xf] + + nibblemap[(map->b_data[i] >> 4) & 0xf]; + return (sum); + } diff -rc2P linux/fs/ext3/dir.c linux-2.4.13/fs/ext3/dir.c *** linux/fs/ext3/dir.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/dir.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,190 ---- + /* + * linux/fs/ext3/dir.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/dir.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 directory handling functions + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S.
Miller (davem@caip.rutgers.edu), 1995 + */ + + #include + #include + #include + + static unsigned char ext3_filetype_table[] = { /* indexed by the on-disk de->file_type; yields the DT_* value handed to filldir */ + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK + }; + + static int ext3_readdir(struct file *, void *, filldir_t); + + struct file_operations ext3_dir_operations = { + read: generic_read_dir, + readdir: ext3_readdir, /* BKL held */ + ioctl: ext3_ioctl, /* BKL held */ + fsync: ext3_sync_file, /* BKL held */ + }; + + int ext3_check_dir_entry (const char * function, struct inode * dir, + struct ext3_dir_entry_2 * de, + struct buffer_head * bh, + unsigned long offset) + { /* Sanity-checks one directory entry; returns 1 if it passes, 0 after reporting the first failed check via ext3_error() under the name in "function" */ + const char * error_msg = NULL; + const int rlen = le16_to_cpu(de->rec_len); + + if (rlen < EXT3_DIR_REC_LEN(1)) + error_msg = "rec_len is smaller than minimal"; + else if (rlen % 4 != 0) + error_msg = "rec_len % 4 != 0"; + else if (rlen < EXT3_DIR_REC_LEN(de->name_len)) + error_msg = "rec_len is too small for name_len"; + else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) /* entry would run past the end of its block */ + error_msg = "directory entry across blocks"; + else if (le32_to_cpu(de->inode) > + le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count)) + error_msg = "inode out of bounds"; + + if (error_msg != NULL) + ext3_error (dir->i_sb, function, + "bad entry in directory #%lu: %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error_msg, offset, + (unsigned long) le32_to_cpu(de->inode), + rlen, de->name_len); + return error_msg == NULL ?
1 : 0; + } + + static int ext3_readdir(struct file * filp, + void * dirent, filldir_t filldir) + { + int error = 0; + unsigned long offset, blk; + int i, num, stored; + struct buffer_head * bh, * tmp, * bha[16]; + struct ext3_dir_entry_2 * de; + struct super_block * sb; + int err; + struct inode *inode = filp->f_dentry->d_inode; + + sb = inode->i_sb; + + stored = 0; + bh = NULL; + offset = filp->f_pos & (sb->s_blocksize - 1); + + while (!error && !stored && filp->f_pos < inode->i_size) { + blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb); + bh = ext3_bread (0, inode, blk, 0, &err); + if (!bh) { + ext3_error (sb, "ext3_readdir", + "directory #%lu contains a hole at offset %lu", + inode->i_ino, (unsigned long)filp->f_pos); + filp->f_pos += sb->s_blocksize - offset; + continue; + } + + /* + * Do the readahead + */ + if (!offset) { + for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0; + i > 0; i--) { + tmp = ext3_getblk (NULL, inode, ++blk, 0, &err); + if (tmp && !buffer_uptodate(tmp) && + !buffer_locked(tmp)) + bha[num++] = tmp; + else + brelse (tmp); + } + if (num) { + ll_rw_block (READA, num, bha); + for (i = 0; i < num; i++) + brelse (bha[i]); + } + } + + revalidate: + /* If the dir block has changed since the last call to + * readdir(2), then we might be pointing to an invalid + * dirent right now. Scan from the start of the block + * to make sure. */ + if (filp->f_version != inode->i_version) { + for (i = 0; i < sb->s_blocksize && i < offset; ) { + de = (struct ext3_dir_entry_2 *) + (bh->b_data + i); + /* It's too expensive to do a full + * dirent test each time round this + * loop, but we do have to test at + * least that it is non-zero. A + * failure will be detected in the + * dirent test below. 
*/ + if (le16_to_cpu(de->rec_len) < + EXT3_DIR_REC_LEN(1)) + break; + i += le16_to_cpu(de->rec_len); + } + offset = i; + filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1)) + | offset; + filp->f_version = inode->i_version; + } + + while (!error && filp->f_pos < inode->i_size + && offset < sb->s_blocksize) { + de = (struct ext3_dir_entry_2 *) (bh->b_data + offset); + if (!ext3_check_dir_entry ("ext3_readdir", inode, de, + bh, offset)) { + /* On error, skip the f_pos to the + next block. */ + filp->f_pos = (filp->f_pos | + (sb->s_blocksize - 1)) + 1; + brelse (bh); + return stored; + } + offset += le16_to_cpu(de->rec_len); + if (le32_to_cpu(de->inode)) { + /* We might block in the next section + * if the data destination is + * currently swapped out. So, use a + * version stamp to detect whether or + * not the directory has been modified + * during the copy operation. + */ + unsigned long version = filp->f_version; + unsigned char d_type = DT_UNKNOWN; + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_FILETYPE) + && de->file_type < EXT3_FT_MAX) + d_type = + ext3_filetype_table[de->file_type]; + error = filldir(dirent, de->name, + de->name_len, + filp->f_pos, + le32_to_cpu(de->inode), + d_type); + if (error) + break; + if (version != filp->f_version) + goto revalidate; + stored ++; + } + filp->f_pos += le16_to_cpu(de->rec_len); + } + offset = 0; + brelse (bh); + } + UPDATE_ATIME(inode); + return 0; + } diff -rc2P linux/fs/ext3/file.c linux-2.4.13/fs/ext3/file.c *** linux/fs/ext3/file.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/file.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,97 ---- + /* + * linux/fs/ext3/file.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/file.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 fs regular file handling primitives + * + * 64-bit 
file support on 64-bit platforms by Jakub Jelinek + * (jj@sunsite.ms.mff.cuni.cz) + */ + + #include + #include + #include + #include + #include + #include + #include + + /* + * Called when an inode is released. Note that this is different + * from ext3_file_open: open gets called at every open, but release + * gets called only when /all/ the files are closed. + */ + static int ext3_release_file (struct inode * inode, struct file * filp) + { + if (filp->f_mode & FMODE_WRITE) + ext3_discard_prealloc (inode); + return 0; + } + + /* + * Called when an inode is about to be opened. + * We use this to disallow opening RW large files on 32bit systems if + * the caller didn't specify O_LARGEFILE. On 64bit systems we force + * on this flag in sys_open. + */ + static int ext3_open_file (struct inode * inode, struct file * filp) + { + if (!(filp->f_flags & O_LARGEFILE) && + inode->i_size > 0x7FFFFFFFLL) + return -EFBIG; + return 0; + } + + /* + * ext3_file_write(). + * + * Most things are done in ext3_prepare_write() and ext3_commit_write(). + */ + + static ssize_t + ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) + { + int ret; + struct inode *inode = file->f_dentry->d_inode; + + ret = generic_file_write(file, buf, count, ppos); + if ((ret >= 0) && IS_SYNC(inode)) { + if (file->f_flags & O_SYNC) { + /* + * generic_osync_inode() has already done the sync + */ + } else { + int ret2 = ext3_force_commit(inode->i_sb); + if (ret2) + ret = ret2; + } + } + return ret; + } + + struct file_operations ext3_file_operations = { + llseek: generic_file_llseek, /* BKL held */ + read: generic_file_read, /* BKL not held. Don't need */ + write: ext3_file_write, /* BKL not held. Don't need */ + ioctl: ext3_ioctl, /* BKL held */ + mmap: generic_file_mmap, + open: ext3_open_file, /* BKL not held. Don't need */ + release: ext3_release_file, /* BKL not held. 
Don't need */ + fsync: ext3_sync_file, /* BKL held */ + }; + + struct inode_operations ext3_file_inode_operations = { + truncate: ext3_truncate, /* BKL held */ + setattr: ext3_setattr, /* BKL held */ + }; + diff -rc2P linux/fs/ext3/fsync.c linux-2.4.13/fs/ext3/fsync.c *** linux/fs/ext3/fsync.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/fsync.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,69 ---- + /* + * linux/fs/ext3/fsync.c + * + * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) + * from + * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * from + * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3fs fsync primitive + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + * + * Removed unnecessary code duplication for little endian machines + * and excessive __inline__s. + * Andi Kleen, 1997 + * + * Major simplications and cleanup - we only need to do the metadata, because + * we can depend on generic_block_fdatasync() to sync the data blocks. + */ + + #include + #include + #include + #include + #include + #include + #include + + /* + * akpm: A new design for ext3_sync_file(). + * + * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). + * There cannot be a transaction open by this task. (AKPM: quotas?) + * Another task could have dirtied this inode. Its data can be in any + * state in the journalling system. + * + * What we do is just kick off a commit and wait on it. This will snapshot the + * inode to disk. + * + * Note that there is a serious optimisation we can make here: if the current + * inode is not part of j_running_transaction or j_committing_transaction + * then we have nothing to do. That would require implementation of t_ilist, + * which isn't too hard. 
+ */ + + int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) + { + struct inode *inode = dentry->d_inode; + int ret, commit_err; + + J_ASSERT(ext3_journal_current_handle() == 0); + + /* + * fsync_inode_buffers() just walks i_dirty_buffers and waits + * on them. It's a no-op for full data journalling because + * i_dirty_buffers will be empty. + * Really, we only need to start I/O on the dirty buffers - + * we'll end up waiting on them in commit. + */ + ret = fsync_inode_buffers(inode); + + commit_err = ext3_force_commit(inode->i_sb); + if (!ret) ret = commit_err; /* report a failed commit unless the buffer sync already failed */ + return ret; + } diff -rc2P linux/fs/ext3/ialloc.c linux-2.4.13/fs/ext3/ialloc.c *** linux/fs/ext3/ialloc.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/ialloc.c Fri Nov 9 17:03:46 2001 *************** *** 0 **** --- 1,664 ---- + /* + * linux/fs/ext3/ialloc.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * BSD ufs-inspired inode and directory allocation by + * Stephen Tweedie (sct@redhat.com), 1993 + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + + /* + * ialloc.c contains the inodes allocation and deallocation routines + */ + + /* + * The free inodes are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap + * block for inodes, N blocks for the inode table and data blocks. + * + * The file system contains group descriptors which are located after the + * super block. Each descriptor contains the number of the bitmap block and + * the free blocks count in the block. The descriptors are loaded in memory + * when a file system is mounted (see ext3_read_super).
+ */ + + + /* + * Read the inode allocation bitmap for a given block_group, reading + * into the specified slot in the superblock's bitmap cache. + * + * Return >=0 on success or a -ve error code. + */ + static int read_inode_bitmap (struct super_block * sb, + unsigned long block_group, + unsigned int bitmap_nr) + { + struct ext3_group_desc * gdp; + struct buffer_head * bh = NULL; + int retval = 0; + + gdp = ext3_get_group_desc (sb, block_group, NULL); + if (!gdp) { + retval = -EIO; + goto error_out; + } + bh = bread (sb->s_dev, + le32_to_cpu(gdp->bg_inode_bitmap), sb->s_blocksize); + if (!bh) { + ext3_error (sb, "read_inode_bitmap", + "Cannot read inode bitmap - " + "block_group = %lu, inode_bitmap = %lu", + block_group, (unsigned long) gdp->bg_inode_bitmap); + retval = -EIO; + } + /* + * On IO error, just leave a zero in the superblock's block pointer for + * this group. The IO will be retried next time. + */ + error_out: + sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group; + sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh; + return retval; + } + + /* + * load_inode_bitmap loads the inode bitmap for a blocks group + * + * It maintains a cache for the last bitmaps loaded. This cache is managed + * with a LRU algorithm. + * + * Notes: + * 1/ There is one cache per mounted file system. + * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups, + * this function reads the bitmap without maintaining a LRU cache. + * + * Return the slot used to store the bitmap, or a -ve error code. 
+ */ + static int load_inode_bitmap (struct super_block * sb, + unsigned int block_group) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long inode_bitmap_number; + struct buffer_head * inode_bitmap; + int i, j, retval = 0; + + if (block_group >= sbi->s_groups_count) + ext3_panic (sb, "load_inode_bitmap", + "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); + if (sbi->s_loaded_inode_bitmaps > 0 && + sbi->s_inode_bitmap_number[0] == block_group && + sbi->s_inode_bitmap[0] != NULL) + return 0; /* fast path: the wanted bitmap is already in slot 0 */ + if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) { /* few groups: slot number == group number, no LRU shuffling */ + if (sbi->s_inode_bitmap[block_group]) { + if (sbi->s_inode_bitmap_number[block_group] != + block_group) + ext3_panic(sb, "load_inode_bitmap", + "block_group != inode_bitmap_number"); + return block_group; + } + retval = read_inode_bitmap(sb, block_group, block_group); + if (retval < 0) + return retval; + return block_group; + } + + for (i = 0; i < sbi->s_loaded_inode_bitmaps && + sbi->s_inode_bitmap_number[i] != block_group; i++) + /* do nothing */; + if (i < sbi->s_loaded_inode_bitmaps && + sbi->s_inode_bitmap_number[i] == block_group) { /* hit in slot i: move it to slot 0, sliding slots 0..i-1 down by one */ + inode_bitmap_number = sbi->s_inode_bitmap_number[i]; + inode_bitmap = sbi->s_inode_bitmap[i]; + for (j = i; j > 0; j--) { + sbi->s_inode_bitmap_number[j] = + sbi->s_inode_bitmap_number[j - 1]; + sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; + } + sbi->s_inode_bitmap_number[0] = inode_bitmap_number; + sbi->s_inode_bitmap[0] = inode_bitmap; + + /* + * There's still one special case here --- if inode_bitmap == 0 + * then our last attempt to read the bitmap failed and we have + * just ended up caching that failure. Try again to read it.
+ */ + if (!inode_bitmap) + retval = read_inode_bitmap (sb, block_group, 0); + } else { /* miss: grow the cache or release its last slot, slide everything down, then read into slot 0 */ + if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED) + sbi->s_loaded_inode_bitmaps++; + else + brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]); + for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) { + sbi->s_inode_bitmap_number[j] = + sbi->s_inode_bitmap_number[j - 1]; + sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1]; + } + retval = read_inode_bitmap (sb, block_group, 0); + } + return retval; /* 0 (slot 0) unless read_inode_bitmap() reported an error */ + } + + /* + * NOTE! When we get the inode, we're the only people + * that have access to it, and as such there are no + * race conditions we have to worry about. The inode + * is not on the hash-lists, and it cannot be reached + * through the filesystem because the directory entry + * has been deleted earlier. + * + * HOWEVER: we must make sure that we get no aliases, + * which means that we have to call "clear_inode()" + * _before_ we mark the inode not in use in the inode + * bitmaps. Otherwise a newly created file might use + * the same inode number (not actually the same pointer + * though), and then we'd have two inodes sharing the + * same inode number and space on the harddisk.
+  */
+ /*
+  * ext3_free_inode - drop an in-core inode and mark its number free on disk.
+  * @handle: journal handle under which the bitmap, group descriptor and
+  *          superblock buffers are modified
+  * @inode:  inode being released; it must have no device-less state, a
+  *          reference count of at most one and a zero link count, otherwise
+  *          a message is printed and nothing is done
+  *
+  * The quota is released and the in-core inode is cleared before the
+  * superblock is locked.  Under the superblock lock the inode's bit is
+  * cleared in its group's inode bitmap and the free-inode counters in the
+  * group descriptor and the superblock are incremented (the used-directory
+  * counter is decremented for directories).  Errors are reported through
+  * ext3_std_error(); the function itself returns nothing.
+  */
+ void ext3_free_inode (handle_t *handle, struct inode * inode)
+ {
+ 	struct super_block * sb = inode->i_sb;
+ 	int is_directory;
+ 	unsigned long ino;
+ 	struct buffer_head * bh;
+ 	struct buffer_head * bh2;
+ 	unsigned long block_group;
+ 	unsigned long bit;
+ 	int bitmap_nr;
+ 	struct ext3_group_desc * gdp;
+ 	struct ext3_super_block * es;
+ 	int fatal = 0, err;
+ 
+ 	if (!inode->i_dev) {
+ 		printk ("ext3_free_inode: inode has no device\n");
+ 		return;
+ 	}
+ 	if (atomic_read(&inode->i_count) > 1) {
+ 		printk ("ext3_free_inode: inode has count=%d\n",
+ 					atomic_read(&inode->i_count));
+ 		return;
+ 	}
+ 	if (inode->i_nlink) {
+ 		printk ("ext3_free_inode: inode has nlink=%d\n",
+ 			inode->i_nlink);
+ 		return;
+ 	}
+ 	if (!sb) {
+ 		printk("ext3_free_inode: inode on nonexistent device\n");
+ 		return;
+ 	}
+ 
+ 	ino = inode->i_ino;
+ 	ext3_debug ("freeing inode %lu\n", ino);
+ 
+ 	/*
+ 	 * Note: we must free any quota before locking the superblock,
+ 	 * as writing the quota to disk may need the lock as well.
+ 	 */
+ 	DQUOT_INIT(inode);
+ 	DQUOT_FREE_INODE(inode);
+ 	DQUOT_DROP(inode);
+ 
+ 	/* Remember the type now: the inode is cleared on the next line. */
+ 	is_directory = S_ISDIR(inode->i_mode);
+ 
+ 	/* Do this BEFORE marking the inode not in use or returning an error */
+ 	clear_inode (inode);
+ 
+ 	lock_super (sb);
+ 	es = sb->u.ext3_sb.s_es;
+ 	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+ 		ext3_error (sb, "ext3_free_inode",
+ 			    "reserved or nonexistent inode %lu", ino);
+ 		goto error_return;
+ 	}
+ 	/* Inode numbers start at 1, bitmap bits at 0. */
+ 	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+ 	bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+ 	bitmap_nr = load_inode_bitmap (sb, block_group);
+ 	if (bitmap_nr < 0)
+ 		goto error_return;
+ 
+ 	bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
+ 
+ 	BUFFER_TRACE(bh, "get_write_access");
+ 	fatal = ext3_journal_get_write_access(handle, bh);
+ 	if (fatal)
+ 		goto error_return;
+ 
+ 	/* Ok, now we can actually update the inode bitmaps.. */
+ 	if (!ext3_clear_bit (bit, bh->b_data))
+ 		ext3_error (sb, "ext3_free_inode",
+ 			      "bit already cleared for inode %lu", ino);
+ 	else {
+ 		gdp = ext3_get_group_desc (sb, block_group, &bh2);
+ 
+ 		/*
+ 		 * bh2 has no initial value, so it may only be handed to the
+ 		 * journal when a descriptor was actually returned
+ 		 * (presumably ext3_get_group_desc() fills *bh only on
+ 		 * success - confirm against its definition).
+ 		 */
+ 		if (gdp) {
+ 			BUFFER_TRACE(bh2, "get_write_access");
+ 			fatal = ext3_journal_get_write_access(handle, bh2);
+ 			if (fatal) goto error_return;
+ 		}
+ 
+ 		BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access");
+ 		fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+ 		if (fatal) goto error_return;
+ 
+ 		if (gdp) {
+ 			gdp->bg_free_inodes_count = cpu_to_le16(
+ 				le16_to_cpu(gdp->bg_free_inodes_count) + 1);
+ 			if (is_directory)
+ 				gdp->bg_used_dirs_count = cpu_to_le16(
+ 				  le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+ 			BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+ 			err = ext3_journal_dirty_metadata(handle, bh2);
+ 			if (!fatal) fatal = err;
+ 		}
+ 		es->s_free_inodes_count =
+ 			cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
+ 		BUFFER_TRACE(sb->u.ext3_sb.s_sbh,
+ 					"call ext3_journal_dirty_metadata");
+ 		err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+ 		if (!fatal) fatal = err;
+ 	}
+ 	/* The bitmap buffer is dirtied even when the bit was already clear. */
+ 	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 	err = ext3_journal_dirty_metadata(handle, bh);
+ 	if (!fatal)
+ 		fatal = err;
+ 	sb->s_dirt = 1;
+ error_return:
+ 	ext3_std_error(sb, fatal);
+ 	unlock_super(sb);
+ }
+ 
+ /*
+  * There are two policies for allocating an inode.  If the new inode is
+  * a directory, then a forward search is made for a block group with both
+  * free space and a low directory-to-inode ratio; if that fails, then of
+  * the groups with above-average free space, that group with the fewest
+  * directories already is chosen.
+  *
+  * For other inodes, search forward from the parent directory's block
+  * group to find a free inode.
+  */
+ /*
+  * ext3_new_inode - allocate a new inode near @dir.
+  * @handle: journal handle used for every metadata buffer touched here
+  * @dir:    parent directory; must be non-NULL and still linked
+  * @mode:   file mode of the new inode (S_ISGID may be added for
+  *          directories created in a setgid parent)
+  *
+  * Returns the new in-core inode, or an ERR_PTR() value: -EPERM for a
+  * missing or deleted parent, -ENOMEM when no in-core inode is available,
+  * -ENOSPC when no group has a free inode, -EIO on bitmap problems,
+  * -EDQUOT when the quota allocation fails, or the error returned by a
+  * journal call.  The superblock lock is taken and released internally.
+  */
+ struct inode * ext3_new_inode (handle_t *handle,
+ 				const struct inode * dir, int mode)
+ {
+ 	struct super_block * sb;
+ 	struct buffer_head * bh;
+ 	struct buffer_head * bh2;
+ 	int i, j, avefreei;
+ 	struct inode * inode;
+ 	int bitmap_nr;
+ 	struct ext3_group_desc * gdp;
+ 	struct ext3_group_desc * tmp;
+ 	struct ext3_super_block * es;
+ 	int err = 0;
+ 
+ 	/* Cannot create files in a deleted directory */
+ 	if (!dir || !dir->i_nlink)
+ 		return ERR_PTR(-EPERM);
+ 
+ 	sb = dir->i_sb;
+ 	inode = new_inode(sb);
+ 	if (!inode)
+ 		return ERR_PTR(-ENOMEM);
+ 	init_rwsem(&inode->u.ext3_i.truncate_sem);
+ 
+ 	lock_super (sb);
+ 	es = sb->u.ext3_sb.s_es;
+ repeat:
+ 	gdp = NULL;
+ 	i = 0;
+ 
+ 	if (S_ISDIR(mode)) {
+ 		avefreei = le32_to_cpu(es->s_free_inodes_count) /
+ 			sb->u.ext3_sb.s_groups_count;
+ 		/*
+ 		 * Among groups with at least the average number of free
+ 		 * inodes, pick the one with the most free blocks.
+ 		 */
+ 		if (!gdp) {
+ 			for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) {
+ 				struct buffer_head *temp_buffer;
+ 				tmp = ext3_get_group_desc (sb, j, &temp_buffer);
+ 				if (tmp &&
+ 				    le16_to_cpu(tmp->bg_free_inodes_count) &&
+ 				    le16_to_cpu(tmp->bg_free_inodes_count) >=
+ 							avefreei) {
+ 					if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) >
+ 						le16_to_cpu(gdp->bg_free_blocks_count))) {
+ 						i = j;
+ 						gdp = tmp;
+ 						bh2 = temp_buffer;
+ 					}
+ 				}
+ 			}
+ 		}
+ 	} else {
+ 		/*
+ 		 * Try to place the inode in its parent directory
+ 		 */
+ 		i = dir->u.ext3_i.i_block_group;
+ 		tmp = ext3_get_group_desc (sb, i, &bh2);
+ 		if (tmp && le16_to_cpu(tmp->bg_free_inodes_count))
+ 			gdp = tmp;
+ 		else
+ 		{
+ 			/*
+ 			 * Use a quadratic hash to find a group with a
+ 			 * free inode
+ 			 */
+ 			for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) {
+ 				i += j;
+ 				if (i >= sb->u.ext3_sb.s_groups_count)
+ 					i -= sb->u.ext3_sb.s_groups_count;
+ 				tmp = ext3_get_group_desc (sb, i, &bh2);
+ 				if (tmp &&
+ 				    le16_to_cpu(tmp->bg_free_inodes_count)) {
+ 					gdp = tmp;
+ 					break;
+ 				}
+ 			}
+ 		}
+ 		if (!gdp) {
+ 			/*
+ 			 * That failed: try linear search for a free inode
+ 			 */
+ 			i = dir->u.ext3_i.i_block_group + 1;
+ 			for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) {
+ 				if (++i >= sb->u.ext3_sb.s_groups_count)
+ 					i = 0;
+ 				tmp = ext3_get_group_desc (sb, i, &bh2);
+ 				if (tmp &&
+ 				    le16_to_cpu(tmp->bg_free_inodes_count)) {
+ 					gdp = tmp;
+ 					break;
+ 				}
+ 			}
+ 		}
+ 	}
+ 
+ 	err = -ENOSPC;
+ 	if (!gdp)
+ 		goto fail;
+ 
+ 	err = -EIO;
+ 	bitmap_nr = load_inode_bitmap (sb, i);
+ 	if (bitmap_nr < 0)
+ 		goto fail;
+ 
+ 	bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
+ 
+ 	if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data,
+ 				      EXT3_INODES_PER_GROUP(sb))) <
+ 	    EXT3_INODES_PER_GROUP(sb)) {
+ 		BUFFER_TRACE(bh, "get_write_access");
+ 		err = ext3_journal_get_write_access(handle, bh);
+ 		if (err) goto fail;
+ 
+ 		if (ext3_set_bit (j, bh->b_data)) {
+ 			ext3_error (sb, "ext3_new_inode",
+ 				      "bit already set for inode %d", j);
+ 			goto repeat;
+ 		}
+ 		BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 		err = ext3_journal_dirty_metadata(handle, bh);
+ 		if (err) goto fail;
+ 	} else {
+ 		/*
+ 		 * The bitmap is full although the descriptor claimed free
+ 		 * inodes: zero the stale count and search again.
+ 		 */
+ 		if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) {
+ 			ext3_error (sb, "ext3_new_inode",
+ 				    "Free inodes count corrupted in group %d",
+ 				    i);
+ 			/* Is it really ENOSPC? */
+ 			err = -ENOSPC;
+ 			if (sb->s_flags & MS_RDONLY)
+ 				goto fail;
+ 
+ 			BUFFER_TRACE(bh2, "get_write_access");
+ 			err = ext3_journal_get_write_access(handle, bh2);
+ 			if (err) goto fail;
+ 			gdp->bg_free_inodes_count = 0;
+ 			BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+ 			err = ext3_journal_dirty_metadata(handle, bh2);
+ 			if (err) goto fail;
+ 		}
+ 		goto repeat;
+ 	}
+ 	/* Convert the bit index within group i into an inode number. */
+ 	j += i * EXT3_INODES_PER_GROUP(sb) + 1;
+ 	if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) {
+ 		ext3_error (sb, "ext3_new_inode",
+ 			    "reserved inode or inode > inodes count - "
+ 			    "block_group = %d,inode=%d", i, j);
+ 		err = -EIO;
+ 		goto fail;
+ 	}
+ 
+ 	BUFFER_TRACE(bh2, "get_write_access");
+ 	err = ext3_journal_get_write_access(handle, bh2);
+ 	if (err) goto fail;
+ 	gdp->bg_free_inodes_count =
+ 		cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
+ 	if (S_ISDIR(mode))
+ 		gdp->bg_used_dirs_count =
+ 			cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
+ 	BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+ 	err = ext3_journal_dirty_metadata(handle, bh2);
+ 	if (err) goto fail;
+ 
+ 	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+ 	err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+ 	if (err) goto fail;
+ 	es->s_free_inodes_count =
+ 		cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
+ 	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata");
+ 	err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+ 	sb->s_dirt = 1;
+ 	if (err) goto fail;
+ 
+ 	inode->i_uid = current->fsuid;
+ 	if (test_opt (sb, GRPID))
+ 		inode->i_gid = dir->i_gid;
+ 	else if (dir->i_mode & S_ISGID) {
+ 		inode->i_gid = dir->i_gid;
+ 		if (S_ISDIR(mode))
+ 			mode |= S_ISGID;
+ 	} else
+ 		inode->i_gid = current->fsgid;
+ 	inode->i_mode = mode;
+ 
+ 	inode->i_ino = j;
+ 	/* This is the optimal IO size (for stat), not the fs block size */
+ 	inode->i_blksize = PAGE_SIZE;
+ 	inode->i_blocks = 0;
+ 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ 	inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL;
+ 	if (S_ISLNK(mode))
+ 		inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FILE_FL | EXT3_IMMUTABLE_LINK_FL | EXT3_APPEND_FL);
+ #ifdef EXT3_FRAGMENTS
+ 	inode->u.ext3_i.i_faddr = 0;
+ 	inode->u.ext3_i.i_frag_no = 0;
+ 	inode->u.ext3_i.i_frag_size = 0;
+ #endif
+ 	inode->u.ext3_i.i_file_acl = 0;
+ 	inode->u.ext3_i.i_dir_acl = 0;
+ 	inode->u.ext3_i.i_dtime = 0;
+ 	INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+ #ifdef EXT3_PREALLOCATE
+ 	inode->u.ext3_i.i_prealloc_count = 0;
+ #endif
+ 	inode->u.ext3_i.i_block_group = i;
+ 
+ 	if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
+ 		inode->i_flags |= S_SYNC;
+ 	if (IS_SYNC(inode))
+ 		handle->h_sync = 1;
+ 	insert_inode_hash(inode);
+ 	inode->i_generation = event++;
+ 
+ 	inode->u.ext3_i.i_state = EXT3_STATE_NEW;
+ 	err = ext3_mark_inode_dirty(handle, inode);
+ 	if (err) goto fail;
+ 
+ 	/* Quota is charged only after the superblock lock is dropped. */
+ 	unlock_super (sb);
+ 	if(DQUOT_ALLOC_INODE(inode)) {
+ 		DQUOT_DROP(inode);
+ 		inode->i_flags |= S_NOQUOTA;
+ 		inode->i_nlink = 0;
+ 		iput(inode);
+ 		return ERR_PTR(-EDQUOT);
+ 	}
+ 	ext3_debug ("allocating inode %lu\n", inode->i_ino);
+ 	return inode;
+ 
+ fail:
+ 	unlock_super(sb);
+ 	iput(inode);
+ 	ext3_std_error(sb, err);
+ 	return ERR_PTR(err);
+ }
+ 
+ /*
+  * Verify that we are loading a valid orphan from disk.
+  *
+  * Returns the inode for orphan number @ino, or NULL (after a warning) when
+  * the number is out of range, its bitmap cannot be loaded, its bitmap bit
+  * is clear, or the inode read back is bad or links to an out-of-range
+  * next orphan.
+  */
+ struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino)
+ {
+ 	ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
+ 	unsigned long block_group;
+ 	int bit;
+ 	int bitmap_nr;
+ 	struct buffer_head *bh;
+ 	struct inode *inode = NULL;
+ 
+ 	/*
+ 	 * Error cases - e2fsck has already cleaned up for us.
+ 	 *
+ 	 * The lower bound matters as much as the upper one: ino == 0 would
+ 	 * make (ino - 1) wrap around below, and a corrupted orphan list must
+ 	 * not be able to hand us a reserved inode other than the root.
+ 	 */
+ 	if ((ino < EXT3_FIRST_INO(sb) && ino != EXT3_ROOT_INO) ||
+ 	    ino > max_ino) {
+ 		ext3_warning(sb, __FUNCTION__,
+ 			     "bad orphan ino %ld! e2fsck was run?\n", ino);
+ 		return NULL;
+ 	}
+ 
+ 	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+ 	bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+ 	if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 ||
+ 	    !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) {
+ 		ext3_warning(sb, __FUNCTION__,
+ 			     "inode bitmap error for orphan %ld\n", ino);
+ 		return NULL;
+ 	}
+ 
+ 	/* Having the inode bit set should be a 100% indicator that this
+ 	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
+ 	 * inodes that were being truncated, so we can't check i_nlink==0.
+ 	 */
+ 	if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) ||
+ 	    is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) {
+ 		ext3_warning(sb, __FUNCTION__,
+ 			     "bad orphan inode %ld! e2fsck was run?\n", ino);
+ 		printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n",
+ 		       bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data));
+ 		printk(KERN_NOTICE "inode=%p\n", inode);
+ 		if (inode) {
+ 			printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+ 			       is_bad_inode(inode));
+ 			printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%d\n",
+ 			       NEXT_ORPHAN(inode));
+ 			printk(KERN_NOTICE "max_ino=%ld\n", max_ino);
+ 		}
+ 		/* Avoid freeing blocks if we got a bad deleted inode */
+ 		if (inode && inode->i_nlink == 0)
+ 			inode->i_blocks = 0;
+ 		iput(inode);
+ 		return NULL;
+ 	}
+ 
+ 	return inode;
+ }
+ 
+ /*
+  * Return the number of free inodes.  Without EXT3FS_DEBUG this is simply
+  * the superblock counter; with it, the per-group descriptor counts are
+  * summed (and returned) while the bitmap-derived counts are printed for
+  * comparison.
+  */
+ unsigned long ext3_count_free_inodes (struct super_block * sb)
+ {
+ #ifdef EXT3FS_DEBUG
+ 	struct ext3_super_block * es;
+ 	unsigned long desc_count, bitmap_count, x;
+ 	int bitmap_nr;
+ 	struct ext3_group_desc * gdp;
+ 	int i;
+ 
+ 	lock_super (sb);
+ 	es = sb->u.ext3_sb.s_es;
+ 	desc_count = 0;
+ 	bitmap_count = 0;
+ 	gdp = NULL;
+ 	for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+ 		gdp = ext3_get_group_desc (sb, i, NULL);
+ 		if (!gdp)
+ 			continue;
+ 		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ 		bitmap_nr = load_inode_bitmap (sb, i);
+ 		if (bitmap_nr < 0)
+ 			continue;
+ 
+ 		x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
+ 				     EXT3_INODES_PER_GROUP(sb) / 8);
+ 		printk ("group %d: stored = %d, counted = %lu\n",
+ 			i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+ 		bitmap_count += x;
+ 	}
+ 	/* Cast as ext3_check_inodes_bitmap() does: %lu needs unsigned long. */
+ 	printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
+ 		(unsigned long)le32_to_cpu(es->s_free_inodes_count),
+ 		desc_count, bitmap_count);
+ 	unlock_super (sb);
+ 	return desc_count;
+ #else
+ 	return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count);
+ #endif
+ }
+ 
+ #ifdef CONFIG_EXT3_CHECK
+ /*
+  * Called at mount-time, super-block is locked.
+  *
+  * Compares each group's stored free-inode count with the count derived
+  * from its bitmap, and the superblock total with the sum of the bitmap
+  * counts; every mismatch is reported through ext3_error().
+  */
+ void ext3_check_inodes_bitmap (struct super_block * sb)
+ {
+ 	struct ext3_super_block * es;
+ 	unsigned long desc_count, bitmap_count, x;
+ 	int bitmap_nr;
+ 	struct ext3_group_desc * gdp;
+ 	int i;
+ 
+ 	es = sb->u.ext3_sb.s_es;
+ 	desc_count = 0;
+ 	bitmap_count = 0;
+ 	gdp = NULL;
+ 	for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+ 		gdp = ext3_get_group_desc (sb, i, NULL);
+ 		if (!gdp)
+ 			continue;
+ 		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ 		bitmap_nr = load_inode_bitmap (sb, i);
+ 		if (bitmap_nr < 0)
+ 			continue;
+ 
+ 		x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
+ 				     EXT3_INODES_PER_GROUP(sb) / 8);
+ 		if (le16_to_cpu(gdp->bg_free_inodes_count) != x)
+ 			ext3_error (sb, "ext3_check_inodes_bitmap",
+ 				    "Wrong free inodes count in group %d, "
+ 				    "stored = %d, counted = %lu", i,
+ 				    le16_to_cpu(gdp->bg_free_inodes_count), x);
+ 		bitmap_count += x;
+ 	}
+ 	if (le32_to_cpu(es->s_free_inodes_count) != bitmap_count)
+ 		ext3_error (sb, "ext3_check_inodes_bitmap",
+ 			    "Wrong free inodes count in super block, "
+ 			    "stored = %lu, counted = %lu",
+ 			    (unsigned long)le32_to_cpu(es->s_free_inodes_count),
+ 			    bitmap_count);
+ }
+ #endif
diff -rc2P linux/fs/ext3/inode.c linux-2.4.13/fs/ext3/inode.c
*** linux/fs/ext3/inode.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/inode.c	Fri Nov  9 17:03:19 2001
***************
*** 0 ****
--- 1,2676 ----
+ /*
+  *  linux/fs/ext3/inode.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - 
Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/inode.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Goal-directed block allocation by Stephen Tweedie
+  * 	(sct@redhat.com), 1993, 1998
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  *  64-bit file support on 64-bit platforms by Jakub Jelinek
+  * 	(jj@sunsite.ms.mff.cuni.cz)
+  *
+  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
+  */
+ 
+ #include	/* NOTE(review): the header names of these nine #include lines are missing from this copy of the patch - restore them from the original before applying */
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ 
+ 
+ /*
+  * SEARCH_FROM_ZERO forces each block allocation to search from the start
+  * of the filesystem.  This is to force rapid reallocation of recently-freed
+  * blocks.  The file fragmentation is horrendous.
+  */
+ #undef SEARCH_FROM_ZERO
+ 
+ /* The ext3 forget function must perform a revoke if we are freeing data
+  * which has been journaled.  Metadata (eg. indirect blocks) must be
+  * revoked in all cases.
+  *
+  * "bh" may be NULL: a metadata block may have been freed from memory
+  * but there may still be a record of it in the journal, and that record
+  * still needs to be revoked.
+  *
+  * @handle:      journal handle the forget/revoke is issued under
+  * @is_metadata: non-zero when the block holds metadata
+  * @inode:       owner of the block; its superblock supplies the mount options
+  * @bh:          buffer for the block, or NULL (see above)
+  * @blocknr:     block number passed to the revoke
+  *
+  * Returns 0 when only a forget was needed, otherwise the result of
+  * ext3_journal_revoke(); a revoke failure also aborts the filesystem
+  * through ext3_abort().
+  */
+ 
+ static int ext3_forget(handle_t *handle, int is_metadata,
+ 		       struct inode *inode, struct buffer_head *bh,
+ 		       int blocknr)
+ {
+ 	int err;
+ 
+ 	BUFFER_TRACE(bh, "enter");
+ 
+ 	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+ 		  "data mode %lx\n",
+ 		  bh, is_metadata, inode->i_mode,
+ 		  test_opt(inode->i_sb, DATA_FLAGS));
+ 
+ 	/* Never use the revoke function if we are doing full data
+ 	 * journaling: there is no need to, and a V1 superblock won't
+ 	 * support it.  Otherwise, only skip the revoke on un-journaled
+ 	 * data blocks. */
+ 
+ 	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
+ 	    (!is_metadata && !ext3_should_journal_data(inode))) {
+ 		if (bh) {	/* nothing to forget when no buffer is in memory */
+ 			BUFFER_TRACE(bh, "call journal_forget");
+ 			ext3_journal_forget(handle, bh);
+ 		}
+ 		return 0;
+ 	}
+ 
+ 	/*
+ 	 * data!=journal && (is_metadata || should_journal_data(inode))
+ 	 */
+ 	BUFFER_TRACE(bh, "call ext3_journal_revoke");
+ 	err = ext3_journal_revoke(handle, blocknr, bh);
+ 	if (err)
+ 		ext3_abort(inode->i_sb, __FUNCTION__,
+ 			   "error %d when attempting revoke", err);
+ 	BUFFER_TRACE(bh, "exit");
+ 	return err;
+ }
+ 
+ /*
+  * Truncate transactions can be complex and absolutely huge.  So we need to
+  * be able to restart the transaction at a convenient checkpoint to make
+  * sure we don't overflow the journal.
+  *
+  * start_transaction gets us a new handle for a truncate transaction,
+  * and extend_transaction tries to extend the existing one a bit.  If
+  * extend fails, we need to propagate the failure up and restart the
+  * transaction in the top-level truncate loop. --sct
+  *
+  * The handle is sized from inode->i_blocks, capped at EXT3_MAX_TRANS_DATA,
+  * plus EXT3_DATA_TRANS_BLOCKS.  On failure the error is reported through
+  * ext3_std_error() and the ERR_PTR value from ext3_journal_start() is
+  * returned unchanged, so callers must test the result with IS_ERR().
+  */
+ 
+ static handle_t *start_transaction(struct inode *inode)
+ {
+ 	long needed;
+ 	handle_t *result;
+ 
+ 	needed = inode->i_blocks;
+ 	if (needed > EXT3_MAX_TRANS_DATA)
+ 		needed = EXT3_MAX_TRANS_DATA;
+ 
+ 	result = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + needed);
+ 	if (!IS_ERR(result))
+ 		return result;
+ 
+ 	ext3_std_error(inode->i_sb, PTR_ERR(result));
+ 	return result;
+ }
+ 
+ /*
+  * Try to extend this transaction for the purposes of truncation.
+  *
+  * Returns 0 if we managed to create more room.  If we can't create more
+  * room, and the transaction must be restarted we return 1.
+  */
+ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+ {
+ 	long needed;
+ 
+ 	/* Enough credits left: nothing to do. */
+ 	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
+ 		return 0;
+ 	needed = inode->i_blocks;
+ 	if (needed > EXT3_MAX_TRANS_DATA)
+ 		needed = EXT3_MAX_TRANS_DATA;
+ 	if (!ext3_journal_extend(handle, EXT3_RESERVE_TRANS_BLOCKS + needed))
+ 		return 0;
+ 	return 1;
+ }
+ 
+ /*
+  * Restart the transaction associated with *handle.  This does a commit,
+  * so before we call here everything must be consistently dirtied against
+  * this transaction.
+  *
+  * The restarted handle is sized like the one from start_transaction():
+  * inode->i_blocks capped at EXT3_MAX_TRANS_DATA, plus
+  * EXT3_DATA_TRANS_BLOCKS.  Returns the result of ext3_journal_restart().
+  */
+ static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+ {
+ 	long needed = inode->i_blocks;
+ 	if (needed > EXT3_MAX_TRANS_DATA)
+ 		needed = EXT3_MAX_TRANS_DATA;
+ 	jbd_debug(2, "restarting handle %p\n", handle);
+ 	return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + needed);
+ }
+ 
+ /*
+  * Called at each iput()
+  */
+ void ext3_put_inode (struct inode * inode)
+ {
+ 	ext3_discard_prealloc (inode);
+ }
+ 
+ /*
+  * Called at the last iput() if i_nlink is zero.
+  *
+  * Truncates the file, removes its orphan record, stamps the deletion time
+  * and frees the inode, all under one journal handle and the big kernel
+  * lock.  Whatever happens, the in-core inode ends up cleared: by
+  * ext3_free_inode() on the normal path, or by clear_inode() otherwise.
+  */
+ void ext3_delete_inode (struct inode * inode)
+ {
+ 	handle_t *handle;
+ 
+ 	if (is_bad_inode(inode) ||
+ 	    inode->i_ino == EXT3_ACL_IDX_INO ||
+ 	    inode->i_ino == EXT3_ACL_DATA_INO)
+ 		goto no_delete;
+ 
+ 	lock_kernel();
+ 	handle = start_transaction(inode);
+ 	if (IS_ERR(handle)) {
+ 		/* If we're going to skip the normal cleanup, we still
+ 		 * need to make sure that the in-core orphan linked list
+ 		 * is properly cleaned up. */
+ 		ext3_orphan_del(NULL, inode);
+ 
+ 		ext3_std_error(inode->i_sb, PTR_ERR(handle));
+ 		unlock_kernel();
+ 		goto no_delete;
+ 	}
+ 
+ 	if (IS_SYNC(inode))
+ 		handle->h_sync = 1;
+ 	inode->i_size = 0;
+ 	if (inode->i_blocks)
+ 		ext3_truncate(inode);
+ 	/*
+ 	 * Kill off the orphan record which ext3_truncate created.
+ 	 * AKPM: I think this can be inside the above `if'.
+ 	 * Note that ext3_orphan_del() has to be able to cope with the
+ 	 * deletion of a non-existent orphan - this is because we don't
+ 	 * know if ext3_truncate() actually created an orphan record.
+ 	 * (Well, we could do this if we need to, but heck - it works)
+ 	 */
+ 	ext3_orphan_del(handle, inode);
+ 	inode->u.ext3_i.i_dtime = CURRENT_TIME;
+ 
+ 	/*
+ 	 * One subtle ordering requirement: if anything has gone wrong
+ 	 * (transaction abort, IO errors, whatever), then we can still
+ 	 * do these next steps (the fs will already have been marked as
+ 	 * having errors), but we can't free the inode if the mark_dirty
+ 	 * fails.
+ 	 */
+ 	if (ext3_mark_inode_dirty(handle, inode))
+ 		/* If that failed, just do the required in-core inode clear. */
+ 		clear_inode(inode);
+ 	else
+ 		ext3_free_inode(handle, inode);
+ 	ext3_journal_stop(handle, inode);
+ 	unlock_kernel();
+ 	return;
+ no_delete:
+ 	clear_inode(inode);	/* We must guarantee clearing of inode... */
+ }
+ 
+ /*
+  * Give back any blocks preallocated for @inode.  This is a no-op unless
+  * EXT3_PREALLOCATE is defined.
+  *
+  * NOTE(review): the ext3_free_blocks() call below passes no journal
+  * handle, unlike the other ext3_free_blocks() callers in this file, and
+  * this function has no handle to pass.  The EXT3_PREALLOCATE path needs
+  * an interface change before it can be enabled.
+  */
+ void ext3_discard_prealloc (struct inode * inode)
+ {
+ #ifdef EXT3_PREALLOCATE
+ 	lock_kernel();
+ 	/* Writer: ->i_prealloc* */
+ 	if (inode->u.ext3_i.i_prealloc_count) {
+ 		unsigned short total = inode->u.ext3_i.i_prealloc_count;
+ 		unsigned long block = inode->u.ext3_i.i_prealloc_block;
+ 		inode->u.ext3_i.i_prealloc_count = 0;
+ 		inode->u.ext3_i.i_prealloc_block = 0;
+ 		/* Writer: end */
+ 		ext3_free_blocks (inode, block, total);
+ 	}
+ 	unlock_kernel();
+ #endif
+ }
+ 
+ /*
+  * Allocate one block for @inode, preferably at @goal.
+  *
+  * Returns the block number handed back by ext3_new_block(), or a block
+  * taken from the inode's preallocation window when EXT3_PREALLOCATE is
+  * defined and the goal matches it.  @err is passed through to
+  * ext3_new_block().
+  */
+ static int ext3_alloc_block (handle_t *handle,
+ 			struct inode * inode, unsigned long goal, int *err)
+ {
+ #ifdef EXT3FS_DEBUG
+ 	static unsigned long alloc_hits = 0, alloc_attempts = 0;
+ #endif
+ 	unsigned long result;
+ 
+ #ifdef EXT3_PREALLOCATE
+ 	/* Writer: ->i_prealloc* */
+ 	if (inode->u.ext3_i.i_prealloc_count &&
+ 	    (goal == inode->u.ext3_i.i_prealloc_block ||
+ 	     goal + 1 == inode->u.ext3_i.i_prealloc_block))
+ 	{
+ 		result = inode->u.ext3_i.i_prealloc_block++;
+ 		inode->u.ext3_i.i_prealloc_count--;
+ 		/* Writer: end */
+ 		ext3_debug ("preallocation hit (%lu/%lu).\n",
+ 			    ++alloc_hits, ++alloc_attempts);
+ 	} else {
+ 		ext3_discard_prealloc (inode);
+ 		ext3_debug ("preallocation miss (%lu/%lu).\n",
+ 			    alloc_hits, ++alloc_attempts);
+ 		/*
+ 		 * Pass the journal handle, exactly as the
+ 		 * non-preallocating call below does.
+ 		 */
+ 		if (S_ISREG(inode->i_mode))
+ 			result = ext3_new_block (handle, inode, goal,
+ 				 &inode->u.ext3_i.i_prealloc_count,
+ 				 &inode->u.ext3_i.i_prealloc_block, err);
+ 		else
+ 			result = ext3_new_block (handle, inode, goal, 0, 0, err);
+ 		/*
+ 		 * AKPM: this is somewhat sticky.  I'm not surprised it was
+ 		 * disabled in 2.2's ext3.  Need to integrate b_committed_data
+ 		 * guarding with preallocation, if indeed preallocation is
+ 		 * effective.
+ 		 */
+ 	}
+ #else
+ 	result = ext3_new_block (handle, inode, goal, 0, 0, err);
+ #endif
+ 	return result;
+ }
+ 
+ 
+ /*
+  * One step of a lookup path: @p is where the block number was read from,
+  * @key is the value found there, and @bh is the buffer that @p points
+  * into (NULL when @p points into the inode itself).
+  */
+ typedef struct {
+ 	u32	*p;
+ 	u32	key;
+ 	struct buffer_head *bh;
+ } Indirect;
+ 
+ /* Record pointer location @v, the value currently stored there, and @bh. */
+ static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
+ {
+ 	p->key = *(p->p = v);
+ 	p->bh = bh;
+ }
+ 
+ /*
+  * Return non-zero when every step in [from, to] still holds the value
+  * that was recorded for it, i.e. the path has not changed underneath us.
+  */
+ static inline int verify_chain(Indirect *from, Indirect *to)
+ {
+ 	while (from <= to && from->key == *from->p)
+ 		from++;
+ 	return (from > to);
+ }
+ 
+ /**
+  *	ext3_block_to_path - parse the block number into array of offsets
+  *	@inode: inode in question (we are only interested in its superblock)
+  *	@i_block: block number to be parsed
+  *	@offsets: array to store the offsets in
+  *
+  *	To store the locations of file's data ext3 uses a data structure common
+  *	for UNIX filesystems - tree of pointers anchored in the inode, with
+  *	data blocks at leaves and indirect blocks in intermediate nodes.
+  *	This function translates the block number into path in that tree -
+  *	return value is the path length and @offsets[n] is the offset of
+  *	pointer to (n+1)th node in the nth one. If @i_block is out of range
+  *	(negative or too large) warning is printed and zero returned.
+  *
+  *	Note: function doesn't find node addresses, so no IO is needed. All
+  *	we need to know is the capacity of indirect blocks (taken from the
+  *	inode->i_sb).
+  */
+ 
+ /*
+  * Portability note: the last comparison (check that we fit into triple
+  * indirect block) is spelled differently, because otherwise on an
+  * architecture with 32-bit longs and 8Kb pages we might get into trouble
+  * if our filesystem had 8Kb blocks. We might use long long, but that would
+  * kill us on x86. Oh, well, at least the sign propagation does not matter -
+  * i_block would have to be negative in the very beginning, so we would not
+  * get there at all.
+  */
+ 
+ static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4])
+ {
+ 	int per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+ 	int per_block_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
+ 	const long nr_direct = EXT3_NDIR_BLOCKS,
+ 		nr_single = per_block,
+ 		nr_double = (1 << (per_block_bits * 2));
+ 	int depth = 0;
+ 
+ 	/* Each range is tried in turn; the first that fits wins. */
+ 	if (i_block < 0) {
+ 		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
+ 		return depth;
+ 	}
+ 
+ 	/* Direct pointer held in the inode itself. */
+ 	if (i_block < nr_direct) {
+ 		offsets[depth++] = i_block;
+ 		return depth;
+ 	}
+ 	i_block -= nr_direct;
+ 
+ 	/* Single indirect block. */
+ 	if (i_block < nr_single) {
+ 		offsets[depth++] = EXT3_IND_BLOCK;
+ 		offsets[depth++] = i_block;
+ 		return depth;
+ 	}
+ 	i_block -= nr_single;
+ 
+ 	/* Double indirect block. */
+ 	if (i_block < nr_double) {
+ 		offsets[depth++] = EXT3_DIND_BLOCK;
+ 		offsets[depth++] = i_block >> per_block_bits;
+ 		offsets[depth++] = i_block & (per_block - 1);
+ 		return depth;
+ 	}
+ 	i_block -= nr_double;
+ 
+ 	/* Triple indirect block; see the portability note above. */
+ 	if ((i_block >> (per_block_bits * 2)) < per_block) {
+ 		offsets[depth++] = EXT3_TIND_BLOCK;
+ 		offsets[depth++] = i_block >> (per_block_bits * 2);
+ 		offsets[depth++] = (i_block >> per_block_bits) & (per_block - 1);
+ 		offsets[depth++] = i_block & (per_block - 1);
+ 		return depth;
+ 	}
+ 
+ 	ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
+ 	return depth;
+ }
+ 
+ /**
+  *	ext3_get_branch - read the chain of indirect blocks leading to data
+  *	@inode: inode in question
+  *	@depth: depth of the chain (1 - direct pointer, etc.)
+ * @offsets: offsets of pointers in inode/indirect blocks + * @chain: place to store the result + * @err: here we store the error value + * + * Function fills the array of triples and returns %NULL + * if everything went OK or the pointer to the last filled triple + * (incomplete one) otherwise. Upon the return chain[i].key contains + * the number of (i+1)-th block in the chain (as it is stored in memory, + * i.e. little-endian 32-bit), chain[i].p contains the address of that + * number (it points into struct inode for i==0 and into the bh->b_data + * for i>0) and chain[i].bh points to the buffer_head of i-th indirect + * block for i>0 and NULL for i==0. In other words, it holds the block + * numbers of the chain, addresses they were taken from (and where we can + * verify that chain did not change) and buffer_heads hosting these + * numbers. + * + * Function stops when it stumbles upon zero pointer (absent block) + * (pointer to last triple returned, *@err == 0) + * or when it gets an IO error reading an indirect block + * (ditto, *@err == -EIO) + * or when it notices that chain had been changed while it was reading + * (ditto, *@err == -EAGAIN) + * or when it reads all @depth-1 indirect blocks successfully and finds + * the whole chain, all way to the data (returns %NULL, *err == 0). 
+  */
+ static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
+ 				 Indirect chain[4], int *err)
+ {
+ 	kdev_t dev = inode->i_dev;
+ 	int blocksize = inode->i_sb->s_blocksize;
+ 	Indirect *p = chain;
+ 	struct buffer_head *bh;
+ 
+ 	*err = 0;
+ 	/* i_data is not going away, no lock needed */
+ 	add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
+ 	if (!p->key)
+ 		goto no_block;
+ 	while (--depth) {
+ 		bh = bread(dev, le32_to_cpu(p->key), blocksize);
+ 		if (!bh)
+ 			goto failure;
+ 		/* Reader: pointers */
+ 		if (!verify_chain(chain, p))
+ 			goto changed;
+ 		add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+ 		/* Reader: end */
+ 		if (!p->key)
+ 			goto no_block;
+ 	}
+ 	return NULL;
+ 
+ changed:
+ 	/*
+ 	 * bh was read but never entered into the chain, so the caller's
+ 	 * cleanup (which walks the chain) cannot release it - do it here.
+ 	 */
+ 	brelse(bh);
+ 	*err = -EAGAIN;
+ 	goto no_block;
+ failure:
+ 	*err = -EIO;
+ no_block:
+ 	return p;
+ }
+ 
+ /**
+  *	ext3_find_near - find a place for allocation with sufficient locality
+  *	@inode: owner
+  *	@ind: descriptor of indirect block.
+  *
+  *	This function returns the preferred place for block allocation.
+  *	It is used when heuristic for sequential allocation fails.
+  *	Rules are:
+  *	  + if there is a block to the left of our position - allocate near it.
+  *	  + if pointer will live in indirect block - allocate near that block.
+  *	  + if pointer will live in inode - allocate in the same
+  *	    cylinder group.
+  *	Caller must make sure that @ind is valid and will stay that way.
+  */
+ 
+ static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+ {
+ 	u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;
+ 	u32 *p;
+ 
+ 	/* Try to find previous block */
+ 	for (p = ind->p - 1; p >= start; p--)
+ 		if (*p)
+ 			return le32_to_cpu(*p);
+ 
+ 	/* No such thing, so let's try location of indirect block */
+ 	if (ind->bh)
+ 		return ind->bh->b_blocknr;
+ 
+ 	/*
+ 	 * It is going to be referred from inode itself? OK, just put it into
+ 	 * the same cylinder group then.
+ 	 */
+ 	return (inode->u.ext3_i.i_block_group *
+ 		EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+ 	       le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
+ }
+ 
+ /**
+  *	ext3_find_goal - find a preferred place for allocation.
+  *	@inode: owner
+  *	@block:  block we want
+  *	@chain:  chain of indirect blocks
+  *	@partial: pointer to the last triple within a chain
+  *	@goal:	place to store the result.
+  *
+  *	Normally this function finds the preferred place for block allocation,
+  *	stores it in *@goal and returns zero. If the branch had been changed
+  *	under us we return -EAGAIN.  *@goal is only written, never read: the
+  *	caller does not have to initialise it.
+  */
+ 
+ static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
+ 			  Indirect *partial, unsigned long *goal)
+ {
+ 	/* Writer: ->i_next_alloc* */
+ 	if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
+ 		inode->u.ext3_i.i_next_alloc_block++;
+ 		inode->u.ext3_i.i_next_alloc_goal++;
+ 	}
+ #ifdef SEARCH_FROM_ZERO
+ 	inode->u.ext3_i.i_next_alloc_block = 0;
+ 	inode->u.ext3_i.i_next_alloc_goal = 0;
+ #endif
+ 	/* Writer: end */
+ 	/* Reader: pointers, ->i_next_alloc* */
+ 	if (verify_chain(chain, partial)) {
+ 		/*
+ 		 * Start from "no hint" rather than from whatever the
+ 		 * caller's variable happened to contain.
+ 		 */
+ 		unsigned long hint = 0;
+ 
+ 		/*
+ 		 * try the heuristic for sequential allocation,
+ 		 * failing that at least try to get decent locality.
+ 		 */
+ 		if (block == inode->u.ext3_i.i_next_alloc_block)
+ 			hint = inode->u.ext3_i.i_next_alloc_goal;
+ 		if (!hint)
+ 			hint = ext3_find_near(inode, partial);
+ #ifdef SEARCH_FROM_ZERO
+ 		hint = 0;
+ #endif
+ 		*goal = hint;
+ 		return 0;
+ 	}
+ 	/* Reader: end */
+ 	return -EAGAIN;
+ }
+ 
+ /**
+  *	ext3_alloc_branch - allocate and set up a chain of blocks.
+  *	@inode: owner
+  *	@num: depth of the chain (number of blocks to allocate)
+  *	@offsets: offsets (in the blocks) to store the pointers to next.
+  *	@branch: place to store the chain in.
+  *
+  *	This function allocates @num blocks, zeroes out all but the last one,
+  *	links them into chain and (if we are synchronous) writes them to disk.
+  *	In other words, it prepares a branch that can be spliced onto the
+  *	inode. 
It stores the information about that chain in the branch[], in
+  *	the same format as ext3_get_branch() would do. We are calling it after
+  *	we had read the existing part of chain and partial points to the last
+  *	triple of that (one with zero ->key). Upon the exit we have the same
+  *	picture as after the successful ext3_get_block(), except that in one
+  *	place chain is disconnected - *branch->p is still zero (we did not
+  *	set the last link), but branch->key contains the number that should
+  *	be placed into *branch->p to fill that gap.
+  *
+  *	If allocation fails we free all blocks we've allocated (and forget
+  *	their buffer_heads) and return the error value from the failed
+  *	ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+  *	as described above and return 0.
+  */
+ 
+ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
+ 			     int num,
+ 			     unsigned long goal,
+ 			     int *offsets,
+ 			     Indirect *branch)
+ {
+ 	int blocksize = inode->i_sb->s_blocksize;
+ 	/* n: blocks fully set up so far; keys: blocks allocated so far */
+ 	int n = 0, keys = 0;
+ 	int err = 0;
+ 	int i;
+ 	int parent = ext3_alloc_block(handle, inode, goal, &err);
+ 
+ 	branch[0].key = cpu_to_le32(parent);
+ 	if (parent) {
+ 		/*
+ 		 * The first block is ours from here on.  Count it now so
+ 		 * that it is freed below if a later allocation fails.
+ 		 */
+ 		keys = 1;
+ 		for (n = 1; n < num; n++) {
+ 			struct buffer_head *bh;
+ 			/* Allocate the next block */
+ 			int nr = ext3_alloc_block(handle, inode, parent, &err);
+ 			if (!nr)
+ 				break;
+ 			branch[n].key = cpu_to_le32(nr);
+ 			keys = n+1;
+ 
+ 			/*
+ 			 * Get buffer_head for parent block, zero it out
+ 			 * and set the pointer to new one, then send
+ 			 * parent to disk.
+ 			 */
+ 			bh = getblk(inode->i_dev, parent, blocksize);
+ 			branch[n].bh = bh;
+ 			lock_buffer(bh);
+ 			BUFFER_TRACE(bh, "call get_create_access");
+ 			err = ext3_journal_get_create_access(handle, bh);
+ 			if (err) {
+ 				/*
+ 				 * NOTE(review): bh is released here and then
+ 				 * passed to ext3_journal_forget() by the
+ 				 * cleanup loop below (keys already counts
+ 				 * it) - confirm that this is not a double
+ 				 * release.
+ 				 */
+ 				unlock_buffer(bh);
+ 				brelse(bh);
+ 				break;
+ 			}
+ 
+ 			memset(bh->b_data, 0, blocksize);
+ 			branch[n].p = (u32*) bh->b_data + offsets[n];
+ 			*branch[n].p = branch[n].key;
+ 			BUFFER_TRACE(bh, "marking uptodate");
+ 			mark_buffer_uptodate(bh, 1);
+ 			unlock_buffer(bh);
+ 
+ 			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 			err = ext3_journal_dirty_metadata(handle, bh);
+ 			if (err)
+ 				break;
+ 
+ 			parent = nr;
+ 		}
+ 		if (IS_SYNC(inode))
+ 			handle->h_sync = 1;
+ 	}
+ 	if (n == num)
+ 		return 0;
+ 
+ 	/* Allocation failed, free what we already allocated */
+ 	for (i = 1; i < keys; i++) {
+ 		BUFFER_TRACE(branch[i].bh, "call journal_forget");
+ 		ext3_journal_forget(handle, branch[i].bh);
+ 	}
+ 	for (i = 0; i < keys; i++)
+ 		ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
+ 	return err;
+ }
+ 
+ /**
+  *	ext3_splice_branch - splice the allocated branch onto inode.
+  *	@inode: owner
+  *	@block: (logical) number of block we are adding
+  *	@chain: chain of indirect blocks (with a missing link - see
+  *		ext3_alloc_branch)
+  *	@where: location of missing link
+  *	@num:   number of blocks we are adding
+  *
+  *	This function verifies that chain (up to the missing link) had not
+  *	changed, fills the missing link and does all housekeeping needed in
+  *	inode (->i_blocks, etc.). In case of success we end up with the full
+  *	chain to new block and return 0. Otherwise (== chain had been changed)
+  *	we free the new blocks (forgetting their buffer_heads, indeed) and
+  *	return -EAGAIN.
+  */
+ 
+ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
+ 			      Indirect chain[4], Indirect *where, int num)
+ {
+ 	int i;
+ 	int err = 0;
+ 
+ 	/*
+ 	 * If we're splicing into a [td]indirect block (as opposed to the
+ 	 * inode) then we need to get write access to the [td]indirect block
+ 	 * before the splice.  where->bh is non-NULL exactly in that case.
+ 	 */
+ 	if (where->bh) {
+ 		BUFFER_TRACE(where->bh, "get_write_access");
+ 		err = ext3_journal_get_write_access(handle, where->bh);
+ 		if (err)
+ 			goto err_out;
+ 	}
+ 	/* Verify that place we are splicing to is still there and vacant */
+ 
+ 	/* Writer: pointers, ->i_next_alloc* */
+ 	if (!verify_chain(chain, where-1) || *where->p)
+ 		/* Writer: end */
+ 		goto changed;
+ 
+ 	/* That's it: fill the missing link and remember where we allocated */
+ 
+ 	*where->p = where->key;
+ 	inode->u.ext3_i.i_next_alloc_block = block;
+ 	inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);
+ #ifdef SEARCH_FROM_ZERO
+ 	inode->u.ext3_i.i_next_alloc_block = 0;
+ 	inode->u.ext3_i.i_next_alloc_goal = 0;
+ #endif
+ 	/* Writer: end */
+ 
+ 	/* We are done with atomic stuff, now do the rest of housekeeping */
+ 
+ 	inode->i_ctime = CURRENT_TIME;
+ 	ext3_mark_inode_dirty(handle, inode);	/* NOTE(review): return value is not checked here */
+ 
+ 	/* had we spliced it onto indirect block? */
+ 	if (where->bh) {
+ 		/*
+ 		 * akpm: If we spliced it onto an indirect block, we haven't
+ 		 * altered the inode.  Note however that if it is being spliced
+ 		 * onto an indirect block at the very end of the file (the
+ 		 * file is growing) then we *will* alter the inode to reflect
+ 		 * the new i_size.  But that is not done here - it is done in
+ 		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
+ 		 */
+ 		jbd_debug(5, "splicing indirect only\n");
+ 		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
+ 		err = ext3_journal_dirty_metadata(handle, where->bh);
+ 		if (err)
+ 			goto err_out;
+ 	} else {
+ 		/*
+ 		 * OK, we spliced it into the inode itself on a direct block.
+ 		 * Inode was dirtied above.
+ 		 */
+ 		jbd_debug(5, "splicing direct\n");
+ 	}
+ 	return err;
+ 
+ changed:
+ 	/*
+ 	 * AKPM: if where[i].bh isn't part of the current updating
+ 	 * transaction then we explode nastily.  Test this code path.
+ 	 */
+ 	jbd_debug(1, "the chain changed: try again\n");
+ 	err = -EAGAIN;
+ 
+ err_out:
+ 	/*
+ 	 * where[0].bh is not forgotten here - only the buffers at
+ 	 * where[1..num-1] are.
+ 	 */
+ 	for (i = 1; i < num; i++) {
+ 		BUFFER_TRACE(where[i].bh, "call journal_forget");
+ 		ext3_journal_forget(handle, where[i].bh);
+ 	}
+ 	/* For the normal collision cleanup case, we free up the blocks.
+ 	 * On genuine filesystem errors we don't even think about doing
+ 	 * that. */
+ 	if (err == -EAGAIN)
+ 		for (i = 0; i < num; i++)
+ 			ext3_free_blocks(handle, inode,
+ 					 le32_to_cpu(where[i].key), 1);
+ 	return err;
+ }
+ 
+ /*
+  * Allocation strategy is simple: if we have to allocate something, we will
+  * have to go the whole way to leaf. So let's do it before attaching anything
+  * to tree, set linkage between the newborn blocks, write them if sync is
+  * required, recheck the path, free and repeat if check fails, otherwise
+  * set the last missing link (that will protect us from any truncate-generated
+  * removals - all blocks on the path are immune now) and possibly force the
+  * write on the parent block.
+  * That has a nice additional property: no special recovery from the failed
+  * allocations is needed - we simply release blocks and do not touch anything
+  * reachable from inode.
+  *
+  * akpm: `handle' can be NULL if create == 0.
+  */
+ /*
+  * ext3_get_block_handle - map logical block `iblock' of `inode' into
+  * `bh_result', allocating the missing part of the branch when `create'
+  * is set.
+  *
+  * When the block is found or allocated, b_dev, b_blocknr and BH_Mapped are
+  * set in `bh_result'; BH_New is cleared if the block already existed and
+  * set if it was allocated here.  Returns -EIO when ext3_block_to_path()
+  * reports depth 0; otherwise returns the err value produced by
+  * ext3_get_branch(), ext3_alloc_branch() or ext3_splice_branch().
+  * With `create' set, an -EAGAIN result or a negative return from
+  * ext3_find_goal() releases the chain and retries from `reread'.
+  * The BKL is taken before the first lookup and dropped at `cleanup'.
+  */
+ static int ext3_get_block_handle(handle_t *handle, struct inode *inode,
+ 				 long iblock,
+ 				 struct buffer_head *bh_result, int create)
+ {
+ 	int err = -EIO;
+ 	int offsets[4];
+ 	Indirect chain[4];
+ 	Indirect *partial;
+ 	unsigned long goal;
+ 	int left;
+ 	int depth = ext3_block_to_path(inode, iblock, offsets);
+ 	loff_t new_size;
+
+ 	J_ASSERT(handle != NULL || create == 0);
+
+ 	if (depth == 0)
+ 		goto out;	/* returns the initial -EIO without taking the BKL */
+
+ 	lock_kernel();
+ reread:
+ 	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+
+ 	/* Simplest case - block found, no allocation needed */
+ 	if (!partial) {
+ 		bh_result->b_state &= ~(1UL << BH_New);
+ got_it:
+ 		bh_result->b_dev = inode->i_dev;
+ 		bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
+ 		bh_result->b_state |= (1UL << BH_Mapped);
+ 		/* Clean up and exit */
+ 		partial = chain+depth-1; /* the whole chain */
+ 		goto cleanup;
+ 	}
+
+ 	/* Next simple case - plain lookup or failed read of indirect block */
+ 	if (!create || err == -EIO) {
+ cleanup:
+ 		/* release every buffer held in chain[1..partial] */
+ 		while (partial > chain) {
+ 			BUFFER_TRACE(partial->bh, "call brelse");
+ 			brelse(partial->bh);
+ 			partial--;
+ 		}
+ 		BUFFER_TRACE(bh_result, "returned");
+ 		unlock_kernel();
+ out:
+ 		return err;
+ 	}
+
+ 	/*
+ 	 * Indirect block might be removed by truncate while we were
+ 	 * reading it. Handling of that case (forget what we've got and
+ 	 * reread) is taken out of the main path.
+ 	 */
+ 	if (err == -EAGAIN)
+ 		goto changed;
+
+ 	if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
+ 		goto changed;
+
+ 	left = (chain + depth) - partial;	/* number of chain entries still missing */
+
+ 	/*
+ 	 * Block out ext3_truncate while we alter the tree
+ 	 */
+ 	down_read(&inode->u.ext3_i.truncate_sem);
+ 	err = ext3_alloc_branch(handle, inode, left, goal,
+ 					offsets+(partial-chain), partial);
+
+ 	/* The ext3_splice_branch call will free and forget any buffers
+ 	 * on the new chain if there is a failure, but that risks using
+ 	 * up transaction credits, especially for bitmaps where the
+ 	 * credits cannot be returned. Can we handle this somehow? We
+ 	 * may need to return -EAGAIN upwards in the worst case. --sct */
+ 	if (!err)
+ 		err = ext3_splice_branch(handle, inode, iblock, chain,
+ 					 partial, left);
+ 	up_read(&inode->u.ext3_i.truncate_sem);
+ 	if (err == -EAGAIN)
+ 		goto changed;
+ 	if (err)
+ 		goto cleanup;
+
+ 	new_size = inode->i_size;
+ 	/*
+ 	 * This is not racy against ext3_truncate's modification of i_disksize
+ 	 * because VM/VFS ensures that the file cannot be extended while
+ 	 * truncate is in progress. It is racy between multiple parallel
+ 	 * instances of get_block, but we have the BKL.
+ 	 */
+ 	if (new_size > inode->u.ext3_i.i_disksize)
+ 		inode->u.ext3_i.i_disksize = new_size;
+
+ 	bh_result->b_state |= (1UL << BH_New);
+ 	goto got_it;
+
+ changed:
+ 	/* drop the buffers we hold, then look the branch up again */
+ 	while (partial > chain) {
+ 		jbd_debug(1, "buffer chain changed, retrying\n");
+ 		BUFFER_TRACE(partial->bh, "brelsing");
+ 		brelse(partial->bh);
+ 		partial--;
+ 	}
+ 	goto reread;
+ }
+ /*
+  * Wrapper around ext3_get_block_handle() with the signature this file
+  * passes as the block-mapping callback (e.g. to block_prepare_write()).
+  * When `create' is set the handle is taken from
+  * ext3_journal_current_handle() and asserted to be non-zero; otherwise a
+  * zero handle is passed down.
+  */
+ static int ext3_get_block(struct inode *inode, long iblock,
+ 			struct buffer_head *bh_result, int create)
+ {
+ 	handle_t *handle = 0;
+ 	int ret;
+
+ 	if (create) {
+ 		handle = ext3_journal_current_handle();
+ 		J_ASSERT(handle != 0);
+ 	}
+ 	ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
+ 	return ret;
+ }
+
+ /*
+  * `handle' can be NULL if create is zero
+  *
+  * Maps `block' through ext3_get_block_handle() into an on-stack dummy
+  * buffer_head and, if the result is mapped, returns the getblk() buffer
+  * for it.  A block reported as new is zeroed, marked uptodate and
+  * journalled as metadata while holding the BKL.
+  *
+  * *errp receives the mapping result, or the first journalling error (in
+  * which case the buffer is released and NULL is returned).  NULL is also
+  * returned when the mapping fails or the block is not mapped.
+  */
+ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
+ 				long block, int create, int * errp)
+ {
+ 	struct buffer_head dummy;
+ 	int fatal = 0, err;
+
+ 	J_ASSERT(handle != NULL || create == 0);
+
+ 	dummy.b_state = 0;
+ 	dummy.b_blocknr = -1000;
+ 	buffer_trace_init(&dummy.b_history);
+ 	*errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
+ 	if (!*errp && buffer_mapped(&dummy)) {
+ 		struct buffer_head *bh;
+ 		bh = getblk(dummy.b_dev, dummy.b_blocknr,
+ 					inode->i_sb->s_blocksize);
+ 		if (buffer_new(&dummy)) {
+ 			J_ASSERT(create != 0);
+ 			J_ASSERT(handle != 0);
+
+ 			/* Now that we do not always journal data, we
+ 			   should keep in mind whether this should
+ 			   always journal the new buffer as metadata.
+ 			   For now, regular file writes use
+ 			   ext3_get_block instead, so it's not a
+ 			   problem. */
+ 			lock_kernel();
+ 			lock_buffer(bh);
+ 			BUFFER_TRACE(bh, "call get_create_access");
+ 			fatal = ext3_journal_get_create_access(handle, bh);
+ 			if (!fatal) {
+ 				memset(bh->b_data, 0,
+ 						inode->i_sb->s_blocksize);
+ 				mark_buffer_uptodate(bh, 1);
+ 			}
+ 			unlock_buffer(bh);
+ 			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 			/* called even if get_create_access failed; the first error wins */
+ 			err = ext3_journal_dirty_metadata(handle, bh);
+ 			if (!fatal) fatal = err;
+ 			unlock_kernel();
+ 		} else {
+ 			BUFFER_TRACE(bh, "not a new buffer");
+ 		}
+ 		if (fatal) {
+ 			*errp = fatal;
+ 			brelse(bh);
+ 			bh = NULL;
+ 		}
+ 		return bh;
+ 	}
+ 	return NULL;
+ }
+ /*
+  * ext3_getblk() followed by a read.  Returns the buffer if it is uptodate,
+  * or becomes uptodate after ll_rw_block(READ) and wait_on_buffer();
+  * otherwise releases it, sets *err to -EIO and returns NULL.  If
+  * ext3_getblk() yields no buffer, NULL is returned with *err as that call
+  * left it.
+  *
+  * NOTE(review): `block' is declared `int' here but `long' in ext3_getblk().
+  */
+ struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
+ 			       int block, int create, int *err)
+ {
+ 	struct buffer_head * bh;
+ 	int prev_blocks;
+
+ 	prev_blocks = inode->i_blocks;
+
+ 	bh = ext3_getblk (handle, inode, block, create, err);
+ 	if (!bh)
+ 		return bh;
+ #ifdef EXT3_PREALLOCATE
+ 	/*
+ 	 * If the inode has grown, and this is a directory, then use a few
+ 	 * more of the preallocated blocks to keep directory fragmentation
+ 	 * down. The preallocated blocks are guaranteed to be contiguous.
+ 	 */
+ 	if (create &&
+ 	    S_ISDIR(inode->i_mode) &&
+ 	    inode->i_blocks > prev_blocks &&
+ 	    EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+ 				    EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
+ 		int i;
+ 		struct buffer_head *tmp_bh;
+
+ 		for (i = 1;
+ 		     inode->u.ext3_i.i_prealloc_count &&
+ 		     i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
+ 		     i++) {
+ 			/*
+ 			 * ext3_getblk will zero out the contents of the
+ 			 * directory for us
+ 			 */
+ 			tmp_bh = ext3_getblk(handle, inode,
+ 						block+i, create, err);
+ 			if (!tmp_bh) {
+ 				brelse (bh);
+ 				return 0;
+ 			}
+ 			brelse (tmp_bh);
+ 		}
+ 	}
+ #endif
+ 	if (buffer_uptodate(bh))
+ 		return bh;
+ 	ll_rw_block (READ, 1, &bh);
+ 	wait_on_buffer (bh);
+ 	if (buffer_uptodate(bh))
+ 		return bh;
+ 	brelse (bh);
+ 	*err = -EIO;
+ 	return NULL;
+ }
+ /*
+  * Call `fn' on each buffer of the b_this_page list starting at `head'
+  * whose byte range [block_start, block_end) overlaps [from, to); the walk
+  * ends when the list returns to `head'.  Buffers outside the range are
+  * skipped; if such a buffer is not uptodate and `partial' is non-NULL,
+  * *partial is set to 1.  The walk stops at the first non-zero return
+  * from `fn', and that value is returned (0 otherwise).
+  */
+ static int walk_page_buffers(	handle_t *handle,
+ 				struct buffer_head *head,
+ 				unsigned from,
+ 				unsigned to,
+ 				int *partial,
+ 				int (*fn)(	handle_t *handle,
+ 						struct buffer_head *bh))
+ {
+ 	struct buffer_head *bh;
+ 	unsigned block_start, block_end;
+ 	unsigned blocksize = head->b_size;
+ 	int err, ret = 0;
+
+ 	for (	bh = head, block_start = 0;
+ 		ret == 0 && (bh != head || !block_start);
+ 	    	block_start = block_end, bh = bh->b_this_page)
+ 	{
+ 		block_end = block_start + blocksize;
+ 		if (block_end <= from || block_start >= to) {
+ 			if (partial && !buffer_uptodate(bh))
+ 				*partial = 1;
+ 			continue;
+ 		}
+ 		err = (*fn)(handle, bh);
+ 		if (!ret)
+ 			ret = err;
+ 	}
+ 	return ret;
+ }
+
+ /*
+  * To preserve ordering, it is essential that the hole instantiation and
+  * the data write be encapsulated in a single transaction. We cannot
+  * close off a transaction and start a new one between the ext3_get_block()
+  * and the commit_write(). So doing the journal_start at the start of
+  * prepare_write() is the right place.
+  *
+  * Also, this function can nest inside ext3_writepage() ->
+  * block_write_full_page(). In that case, we *know* that ext3_writepage()
+  * has generated enough buffer credits to do the whole page.
So we won't
+  * block on the journal in that case, which is good, because the caller may
+  * be PF_MEMALLOC.
+  *
+  * By accident, ext3 can be reentered when a transaction is open via
+  * quota file writes. If we were to commit the transaction while thus
+  * reentered, there can be a deadlock - we would be holding a quota
+  * lock, and the commit would never complete if another thread had a
+  * transaction open and was blocking on the quota lock - a ranking
+  * violation.
+  *
+  * So what we do is to rely on the fact that journal_stop/journal_start
+  * will _not_ run commit under these circumstances because handle->h_ref
+  * is elevated. We'll still have enough credits for the tiny quotafile
+  * write.
+  */
+ /*
+  * walk_page_buffers() callback: take journal write access on `bh'.
+  */
+ static int do_journal_get_write_access(handle_t *handle,
+ 				       struct buffer_head *bh)
+ {
+ 	return ext3_journal_get_write_access(handle, bh);
+ }
+ /*
+  * Starts a handle sized by ext3_writepage_trans_blocks(), runs
+  * block_prepare_write() with ext3_get_block and, when data is journalled,
+  * takes write access on the buffers overlapping [from, to).
+  *
+  * Returns 0 with the handle still running, or a non-zero error after
+  * stopping the handle here (or without a handle if the start itself
+  * failed).  The BKL is held for the duration of the call.
+  */
+ static int ext3_prepare_write(struct file *file, struct page *page,
+ 			      unsigned from, unsigned to)
+ {
+ 	struct inode *inode = page->mapping->host;
+ 	/* NOTE(review): this initial value is overwritten by ext3_journal_start() below before it is read */
+ 	handle_t *handle = ext3_journal_current_handle();
+ 	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+
+ 	lock_kernel();
+ 	handle = ext3_journal_start(inode, needed_blocks);
+ 	if (IS_ERR(handle)) {
+ 		ret = PTR_ERR(handle);
+ 		goto out;
+ 	}
+ 	ret = block_prepare_write(page, from, to, ext3_get_block);
+ 	if (ret != 0)
+ 		goto prepare_write_failed;
+
+ 	if (ext3_should_journal_data(inode))
+ 		ret = walk_page_buffers(handle, page->buffers,
+ 				from, to, NULL, do_journal_get_write_access);
+ prepare_write_failed:
+ 	if (ret)	/* only a failed prepare stops the handle here */
+ 		ext3_journal_stop(handle, inode);
+ out:
+ 	unlock_kernel();
+ 	return ret;
+ }
+ /*
+  * walk_page_buffers() callback: ext3_journal_dirty_data() on `bh' with a
+  * last argument of 0 (journal_dirty_async_data() passes 1; presumably this
+  * is an async flag - confirm against ext3_journal_dirty_data()).
+  */
+ static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
+ {
+ 	return ext3_journal_dirty_data(handle, bh, 0);
+ }
+
+ /*
+  * For ext3_writepage().  We also brelse() the buffer to account for
+  * the bget() which ext3_writepage() performs.
+  */
+ static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
+ {
+ 	int ret = ext3_journal_dirty_data(handle, bh, 1);
+ 	__brelse(bh);	/* released whether or not the call above failed */
+ 	return ret;
+ }
+
+ /*
+  * For commit_write() in data=journal mode: walk_page_buffers() callback
+  * that sets BH_Uptodate on `bh' and journals it as metadata.
+  */
+ static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+ {
+ 	set_bit(BH_Uptodate, &bh->b_state);
+ 	return ext3_journal_dirty_metadata(handle, bh);
+ }
+
+ /*
+  * We need to pick up the new inode size which generic_commit_write gave us
+  * `file' can be NULL - eg, when called from block_symlink().
+  *
+  * ext3 inode->i_dirty_buffers policy:  If we're journalling data we
+  * definitely don't want them to appear on the inode at all - instead
+  * we need to manage them at the JBD layer and we need to intercept
+  * the relevant sync operations and translate them into journal operations.
+  *
+  * If we're not journalling data then we can just leave the buffers
+  * on ->i_dirty_buffers.  If someone writes them out for us then thanks.
+  * Otherwise we'll do it in commit, if we're using ordered data.
+  *
+  * The handle is taken from ext3_journal_current_handle() and is always
+  * stopped before returning.  The return value is the first error seen
+  * among the buffer walk / generic_commit_write(), ext3_mark_inode_dirty()
+  * and ext3_journal_stop(), or 0.
+  */
+
+ static int ext3_commit_write(struct file *file, struct page *page,
+ 			     unsigned from, unsigned to)
+ {
+ 	handle_t *handle = ext3_journal_current_handle();
+ 	struct inode *inode = page->mapping->host;
+ 	int ret = 0, ret2;
+
+ 	lock_kernel();
+ 	if (ext3_should_journal_data(inode)) {
+ 		/*
+ 		 * Here we duplicate the generic_commit_write() functionality
+ 		 */
+ 		int partial = 0;
+ 		loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+ 		ret = walk_page_buffers(handle, page->buffers,
+ 			from, to, &partial, commit_write_fn);
+ 		if (!partial)
+ 			SetPageUptodate(page);
+ 		kunmap(page);	/* NOTE(review): no kmap() in this function; presumably pairs with one done during prepare_write - confirm */
+ 		if (pos > inode->i_size)
+ 			inode->i_size = pos;
+ 		/* remembered so that ext3_bmap() flushes the journal first */
+ 		set_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state);
+ 	} else {
+ 		if (ext3_should_order_data(inode)) {
+ 			ret = walk_page_buffers(handle, page->buffers,
+ 				from, to, NULL, journal_dirty_sync_data);
+ 		}
+ 		/* Be careful here if generic_commit_write becomes a
+ 		 * required invocation after block_prepare_write. */
+ 		if (ret == 0)
+ 			ret = generic_commit_write(file, page, from, to);
+ 	}
+ 	if (inode->i_size > inode->u.ext3_i.i_disksize) {
+ 		inode->u.ext3_i.i_disksize = inode->i_size;
+ 		ret2 = ext3_mark_inode_dirty(handle, inode);
+ 		if (!ret)
+ 			ret = ret2;
+ 	}
+ 	ret2 = ext3_journal_stop(handle, inode);
+ 	unlock_kernel();
+ 	if (!ret)
+ 		ret = ret2;
+ 	return ret;
+ }
+
+ /*
+  * bmap() is special.  It gets used by applications such as lilo and by
+  * the swapper to find the on-disk block of a specific piece of data.
+  *
+  * Naturally, this is dangerous if the block concerned is still in the
+  * journal.  If somebody makes a swapfile on an ext3 data-journaling
+  * filesystem and enables swap, then they may get a nasty shock when the
+  * data getting swapped to that swapfile suddenly gets overwritten by
+  * the original zero's written out previously to the journal and
+  * awaiting writeback in the kernel's buffer cache.
+  *
+  * So, if we see any bmap calls here on a modified, data-journaled file,
+  * take extra steps to flush any blocks which might be in the cache.
+  *
+  * Returns the result of generic_block_bmap(), or 0 if the journal flush
+  * fails.
+  */
+ static int ext3_bmap(struct address_space *mapping, long block)
+ {
+ 	struct inode *inode = mapping->host;
+ 	journal_t *journal;
+ 	int err;
+
+ 	if (test_and_clear_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state)) {
+ 		/*
+ 		 * This is a REALLY heavyweight approach, but the use of
+ 		 * bmap on dirty files is expected to be extremely rare:
+ 		 * only if we run lilo or swapon on a freshly made file
+ 		 * do we expect this to happen.
+ 		 *
+ 		 * (bmap requires CAP_SYS_RAWIO so this does not
+ 		 * represent an unprivileged user DOS attack --- we'd be
+ 		 * in trouble if mortal users could trigger this path at
+ 		 * will.)
+ 		 *
+ 		 * NB. EXT3_STATE_JDATA is not set on files other than
+ 		 * regular files.  If somebody wants to bmap a directory
+ 		 * or symlink and gets confused because the buffer
+ 		 * hasn't yet been flushed to disk, they deserve
+ 		 * everything they get.
+ 		 */
+
+ 		journal = EXT3_JOURNAL(inode);
+ 		journal_lock_updates(journal);
+ 		err = journal_flush(journal);
+ 		journal_unlock_updates(journal);
+
+ 		if (err)
+ 			return 0;	/* flush failed: no block number is reported */
+ 	}
+
+ 	return generic_block_bmap(mapping,block,ext3_get_block);
+ }
+ /*
+  * walk_page_buffers() callback: take an extra reference on `bh' by
+  * incrementing b_count.  Always returns 0.
+  */
+ static int bget_one(handle_t *handle, struct buffer_head *bh)
+ {
+ 	atomic_inc(&bh->b_count);
+ 	return 0;
+ }
+
+ /*
+  * Note that we always start a transaction even if we're not journalling
+  * data.  This is to preserve ordering: any hole instantiation within
+  * __block_write_full_page -> ext3_get_block() should be journalled
+  * along with the data so we don't crash and then get metadata which
+  * refers to old data.
+  *
+  * In all journalling modes block_write_full_page() will start the I/O.
+  *
+  * Problem:
+  *
+  *	ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+  *		ext3_writepage()
+  *
+  * Similar for:
+  *
+  *	ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
+  *
+  * Same applies to ext3_get_block().  We will deadlock on various things like
+  * lock_journal and i_truncate_sem.
+  *
+  * Setting PF_MEMALLOC here doesn't work - too many internal memory
+  * allocations fail.
+  *
+  * 16May01: If we're reentered then journal_current_handle() will be
+  *	    non-zero. We simply *return*.
+  *
+  * 1 July 2001: @@@ FIXME:
+  *   In journalled data mode, a data buffer may be metadata against the
+  *   current transaction.  But the same file is part of a shared mapping
+  *   and someone does a writepage() on it.
+  *
+  *   We will move the buffer onto the async_data list, but *after* it has
+  *   been dirtied. So there's a small window where we have dirty data on
+  *   BJ_Metadata.
+  *
+  *   Note that this only applies to the last partial page in the file.  The
+  *   bit which block_write_full_page() uses prepare/commit for.  (That's
+  *   broken code anyway: it's wrong for msync()).
+  *
+  *   It's a rare case: affects the final partial page, for journalled data
+  *   where the file is subject to bith write() and writepage() in the same
+  *   transction.  To fix it we'll need a custom block_write_full_page().
+  *   We'll probably need that anyway for journalling writepage() output.
+  *
+  * We don't honour synchronous mounts for writepage().  That would be
+  * disastrous.  Any write() or metadata operation will sync the fs for
+  * us.
+  */
+ static int ext3_writepage(struct page *page)
+ {
+ 	struct inode *inode = page->mapping->host;
+ 	struct buffer_head *page_buffers;
+ 	handle_t *handle = NULL;
+ 	int ret = 0, err;
+ 	int needed;
+ 	int order_data;
+
+ 	J_ASSERT(PageLocked(page));
+
+ 	/*
+ 	 * We give up here if we're reentered, because it might be
+ 	 * for a different filesystem.  One *could* look for a
+ 	 * nested transaction opportunity.
+ 	 */
+ 	lock_kernel();
+ 	if (ext3_journal_current_handle())
+ 		goto out_fail;
+
+ 	needed = ext3_writepage_trans_blocks(inode);
+ 	/* under PF_MEMALLOC the try_start variant is used instead of a plain start */
+ 	if (current->flags & PF_MEMALLOC)
+ 		handle = ext3_journal_try_start(inode, needed);
+ 	else
+ 		handle = ext3_journal_start(inode, needed);
+
+ 	if (IS_ERR(handle)) {
+ 		ret = PTR_ERR(handle);
+ 		goto out_fail;
+ 	}
+
+ 	/* true for both ordered and journalled data modes */
+ 	order_data = ext3_should_order_data(inode) ||
+ 			ext3_should_journal_data(inode);
+
+ 	unlock_kernel();
+
+ 	page_buffers = NULL;	/* Purely to prevent compiler warning */
+
+ 	/* bget() all the buffers */
+ 	if (order_data) {
+ 		if (!page->buffers)
+ 			create_empty_buffers(page,
+ 				inode->i_dev, inode->i_sb->s_blocksize);
+ 		page_buffers = page->buffers;
+ 		/* the return value is ignored; bget_one() always returns 0 */
+ 		walk_page_buffers(handle, page_buffers, 0,
+ 				PAGE_CACHE_SIZE, NULL, bget_one);
+ 	}
+
+ 	ret = block_write_full_page(page, ext3_get_block);
+
+ 	/*
+ 	 * The page can become unlocked at any point now, and
+ 	 * truncate can then come in and change things.  So we
+ 	 * can't touch *page from now on.  But *page_buffers is
+ 	 * safe due to elevated refcount.
+ 	 */
+
+ 	handle = ext3_journal_current_handle();
+ 	lock_kernel();
+
+ 	/* And attach them to the current transaction */
+ 	if (order_data) {
+ 		/* journal_dirty_async_data() also drops the bget_one() reference */
+ 		err = walk_page_buffers(handle, page_buffers,
+ 			0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);
+ 		if (!ret)
+ 			ret = err;
+ 	}
+
+ 	err = ext3_journal_stop(handle, inode);
+ 	if (!ret)
+ 		ret = err;
+ 	unlock_kernel();
+ 	return ret;
+
+ out_fail:
+ 	/*
+ 	 * No transaction was started: redirty and unlock the page.  ret is
+ 	 * still 0 if we were reentered, or PTR_ERR(handle) if the start failed.
+ 	 */
+ 	unlock_kernel();
+ 	SetPageDirty(page);
+ 	UnlockPage(page);
+ 	return ret;
+ }
+ /*
+  * readpage: block_read_full_page() using ext3_get_block for the mapping.
+  */
+ static int ext3_readpage(struct file *file, struct page *page)
+ {
+ 	return block_read_full_page(page,ext3_get_block);
+ }
+
+ /*
+  * flushpage: forward to journal_flushpage() on the journal of the inode
+  * that owns the page's mapping.
+  */
+ static int ext3_flushpage(struct page *page, unsigned long offset)
+ {
+ 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+ 	return journal_flushpage(journal, page, offset);
+ }
+ /*
+  * releasepage: forward to journal_try_to_free_buffers() on the journal of
+  * the inode that owns the page's mapping.
+  */
+ static int ext3_releasepage(struct page *page, int wait)
+ {
+ 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
+ 	return journal_try_to_free_buffers(journal, page, wait);
+ }
+
+ /*
+  * Address-space operations for ext3.  The per-entry comments record
+  * whether the caller holds the BKL and whether the method takes it itself.
+  */
+ struct address_space_operations ext3_aops = {
+ 	readpage:	ext3_readpage,		/* BKL not held.  Don't need */
+ 	writepage:	ext3_writepage,		/* BKL not held.  We take it */
+ 	sync_page:	block_sync_page,
+ 	prepare_write:	ext3_prepare_write,	/* BKL not held.  We take it */
+ 	commit_write:	ext3_commit_write,	/* BKL not held.  We take it */
+ 	bmap:		ext3_bmap,		/* BKL held */
+ 	flushpage:	ext3_flushpage,		/* BKL not held.  Don't need */
+ 	releasepage:	ext3_releasepage,	/* BKL not held.  Don't need */
+ };
+
+ /*
+  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
+  * up to the end of the block which corresponds to `from'.
+  * This required during truncate. We need to physically zero the tail end
+  * of that block so it doesn't yield old data if the file is later grown.
+  */
+ static int ext3_block_truncate_page(handle_t *handle,
+ 		struct address_space *mapping, loff_t from)
+ {
+ 	unsigned long index = from >> PAGE_CACHE_SHIFT;	/* page index containing `from' */
+ 	unsigned offset = from & (PAGE_CACHE_SIZE-1);	/* byte offset of `from' within that page */
+ 	unsigned blocksize, iblock, length, pos;
+ 	struct inode *inode = mapping->host;
+ 	struct page *page;
+ 	struct buffer_head *bh;
+ 	int err;	/* returned: 0, -ENOMEM (no page), -EIO (read failed) or a journalling error */
+
+ 	blocksize = inode->i_sb->s_blocksize;
+ 	length = offset & (blocksize - 1);
+
+ 	/* Block boundary? Nothing to do */
+ 	if (!length)
+ 		return 0;
+
+ 	length = blocksize - length;	/* bytes from `from' to the end of its block */
+ 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+
+ 	page = grab_cache_page(mapping, index);
+ 	err = -ENOMEM;
+ 	if (!page)
+ 		goto out;
+
+ 	if (!page->buffers)
+ 		create_empty_buffers(page, inode->i_dev, blocksize);
+
+ 	/* Find the buffer that contains "offset" */
+ 	bh = page->buffers;
+ 	pos = blocksize;
+ 	while (offset >= pos) {
+ 		bh = bh->b_this_page;
+ 		iblock++;
+ 		pos += blocksize;
+ 	}
+
+ 	err = 0;
+ 	if (!buffer_mapped(bh)) {
+ 		/* Hole? Nothing to do */
+ 		if (buffer_uptodate(bh))
+ 			goto unlock;
+ 		/* lookup only (create == 0); the return value is ignored and buffer_mapped() is checked instead */
+ 		ext3_get_block(inode, iblock, bh, 0);
+ 		/* Still unmapped? Nothing to do */
+ 		if (!buffer_mapped(bh))
+ 			goto unlock;
+ 	}
+
+ 	/* Ok, it's mapped. Make sure it's up-to-date */
+ 	if (Page_Uptodate(page))
+ 		set_bit(BH_Uptodate, &bh->b_state);
+
+ 	if (!buffer_uptodate(bh)) {
+ 		err = -EIO;
+ 		ll_rw_block(READ, 1, &bh);
+ 		wait_on_buffer(bh);
+ 		/* Uhhuh. Read error. Complain and punt. */
+ 		if (!buffer_uptodate(bh))
+ 			goto unlock;
+ 	}
+
+ 	if (ext3_should_journal_data(inode)) {
+ 		BUFFER_TRACE(bh, "get write access");
+ 		err = ext3_journal_get_write_access(handle, bh);
+ 		if (err)
+ 			goto unlock;
+ 	}
+
+ 	memset(kmap(page) + offset, 0, length);
+ 	flush_dcache_page(page);
+ 	kunmap(page);
+
+ 	BUFFER_TRACE(bh, "zeroed end of block");
+
+ 	err = 0;
+ 	if (ext3_should_journal_data(inode)) {
+ 		err = ext3_journal_dirty_metadata(handle, bh);
+ 	} else {
+ 		if (ext3_should_order_data(inode))
+ 			err = ext3_journal_dirty_data(handle, bh, 0);
+ 		__mark_buffer_dirty(bh);
+ 	}
+
+ unlock:
+ 	UnlockPage(page);
+ 	page_cache_release(page);
+ out:
+ 	return err;
+ }
+
+ /*
+  * Probably it should be a library function... search for first non-zero word
+  * or memcmp with zero_page, whatever is better for particular architecture.
+  * Linus?
+  *
+  * Returns 1 if every u32 in [p, q) is zero (including the empty range),
+  * 0 as soon as a non-zero word is found.
+  */
+ static inline int all_zeroes(u32 *p, u32 *q)
+ {
+ 	while (p < q)
+ 		if (*p++)
+ 			return 0;
+ 	return 1;
+ }
+
+ /**
+  *	ext3_find_shared - find the indirect blocks for partial truncation.
+  *	@inode:	  inode in question
+  *	@depth:	  depth of the affected branch
+  *	@offsets: offsets of pointers in that branch (see ext3_block_to_path)
+  *	@chain:	  place to store the pointers to partial indirect blocks
+  *	@top:	  place to the (detached) top of branch
+  *
+  *	This is a helper function used by ext3_truncate().
+  *
+  *	When we do truncate() we may have to clean the ends of several
+  *	indirect blocks but leave the blocks themselves alive. Block is
+  *	partially truncated if some data below the new i_size is refered
+  *	from it (and it is on the path to the first completely truncated
+  *	data block, indeed).  We have to free the top of that path along
+  *	with everything to the right of the path. Since no allocation
+  *	past the truncation point is possible until ext3_truncate()
+  *	finishes, we may safely do the latter, but top of branch may
+  *	require special attention - pageout below the truncation point
+  *	might try to populate it.
+  *
+  *	We atomically detach the top of branch from the tree, store the
+  *	block number of its root in *@top, pointers to buffer_heads of
+  *	partially truncated blocks - in @chain[].bh and pointers to
+  *	their last elements that should not be removed - in
+  *	@chain[].p. Return value is the pointer to last filled element
+  *	of @chain.
+  *
+  *	The work left to caller to do the actual freeing of subtrees:
+  *		a) free the subtree starting from *@top
+  *		b) free the subtrees whose roots are stored in
+  *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
+  *		c) free the subtrees growing from the inode past the @chain[0].
+  *			(no partially truncated stuff there).  */
+
+ static Indirect *ext3_find_shared(struct inode *inode,
+ 				int depth,
+ 				int offsets[4],
+ 				Indirect chain[4],
+ 				u32 *top)
+ {
+ 	Indirect *partial, *p;
+ 	int k, err;
+
+ 	*top = 0;
+ 	/* Make k index the deepest non-null offset + 1 */
+ 	for (k = depth; k > 1 && !offsets[k-1]; k--)
+ 		;
+ 	partial = ext3_get_branch(inode, k, offsets, chain, &err);
+ 	/* Writer: pointers */
+ 	if (!partial)
+ 		partial = chain + k-1;
+ 	/*
+ 	 * If the branch acquired continuation since we've looked at it -
+ 	 * fine, it should all survive and (new) top doesn't belong to us.
+ 	 */
+ 	if (!partial->key && *partial->p)
+ 		/* Writer: end */
+ 		goto no_top;
+ 	/* walk up while everything to the left of p->p in p's block is zero */
+ 	for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
+ 		;
+ 	/*
+ 	 * OK, we've found the last block that must survive. The rest of our
+ 	 * branch should be detached before unlocking. However, if that rest
+ 	 * of branch is all ours and does not grow immediately from the inode
+ 	 * it's easier to cheat and just decrement partial->p.
+ 	 */
+ 	if (p == chain + k - 1 && p > chain) {
+ 		p->p--;
+ 	} else {
+ 		*top = *p->p;
+ 		/* Nope, don't do this in ext3. Must leave the tree intact */
+ #if 0
+ 		*p->p = 0;
+ #endif
+ 	}
+ 	/* Writer: end */
+
+ 	/* release the buffers below the surviving block p */
+ 	while(partial > p)
+ 	{
+ 		brelse(partial->bh);
+ 		partial--;
+ 	}
+ no_top:
+ 	return partial;
+ }
+
+ /*
+  * Zero a number of block pointers in either an inode or an indirect block.
+  * If we restart the transaction we must again get write access to the
+  * indirect block for further modification.
+  *
+  * We release `count' blocks on disk, but (last - first) may be greater
+  * than `count' because there can be holes in there.
+  *
+  * `bh' is the indirect block holding [first, last), or NULL when the
+  * pointers live in the inode itself (see ext3_free_data()).  `block_to_free'
+  * is the first block number of the contiguous run being released.
+  */
+ static void
+ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
+ 		unsigned long block_to_free, unsigned long count,
+ 		u32 *first, u32 *last)
+ {
+ 	u32 *p;
+ 	kdev_t dev = inode->i_sb->s_dev;
+ 	unsigned long blocksize = inode->i_sb->s_blocksize;
+
+ 	if (try_to_extend_transaction(handle, inode)) {
+ 		if (bh) {
+ 			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 			ext3_journal_dirty_metadata(handle, bh);
+ 		}
+ 		ext3_mark_inode_dirty(handle, inode);
+ 		ext3_journal_test_restart(handle, inode);
+ 		/*
+ 		 * Retake write access only when there is an indirect block:
+ 		 * for pointers stored in the inode `bh' is NULL and must not
+ 		 * be handed to the journal.
+ 		 */
+ 		if (bh) {
+ 			BUFFER_TRACE(bh, "get_write_access");
+ 			ext3_journal_get_write_access(handle, bh);
+ 		}
+ 	}
+
+ 	/*
+ 	 * Any buffers which are on the journal will be in memory. We find
+ 	 * them on the hash table so journal_revoke() will run journal_forget()
+ 	 * on them.  We've already detached each block from the file, so
+ 	 * bforget() in journal_forget() should be safe.
+ 	 *
+ 	 * AKPM: turn on bforget in journal_forget()!!!
+ 	 */
+ 	for (p = first; p < last; p++) {
+ 		u32 nr = le32_to_cpu(*p);
+ 		if (nr) {
+ 			/* note: this local shadows the `bh' parameter */
+ 			struct buffer_head *bh;
+
+ 			*p = 0;
+ 			bh = get_hash_table(dev, nr, blocksize);
+ 			ext3_forget(handle, 0, inode, bh, nr);
+ 		}
+ 	}
+
+ 	ext3_free_blocks(handle, inode, block_to_free, count);
+ }
+
+ /**
+  * ext3_free_data - free a list of data blocks
+  * @handle:	handle for this transaction
+  * @inode:	inode we are dealing with
+  * @this_bh:	indirect buffer_head which contains *@first and *@last
+  * @first:	array of block numbers
+  * @last:	points immediately past the end of array
+  *
+  * We are freeing all blocks referred from that array (numbers are stored as
+  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+  *
+  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
+  * blocks are contiguous then releasing them at one time will only affect one
+  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+  * actually use a lot of journal space.
+  *
+  * @this_bh will be %NULL if @first and @last point into the inode's direct
+  * block pointers.
+  */
+ static void ext3_free_data(handle_t *handle, struct inode *inode,
+ 			   struct buffer_head *this_bh, u32 *first, u32 *last)
+ {
+ 	unsigned long block_to_free = 0;    /* Starting block # of a run */
+ 	unsigned long count = 0;	    /* Number of blocks in the run */
+ 	u32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
+ 					       corresponding to
+ 					       block_to_free */
+ 	unsigned long nr;		    /* Current block # */
+ 	u32 *p;				    /* Pointer into inode/ind
+ 					       for current block */
+ 	int err;
+
+ 	if (this_bh) {				/* For indirect block */
+ 		BUFFER_TRACE(this_bh, "get_write_access");
+ 		err = ext3_journal_get_write_access(handle, this_bh);
+ 		/* Important: if we can't update the indirect pointers
+ 		 * to the blocks, we can't free them. */
+ 		if (err)
+ 			return;
+ 	}
+
+ 	for (p = first; p < last; p++) {
+ 		nr = le32_to_cpu(*p);
+ 		if (nr) {
+ 			/* accumulate blocks to free if they're contiguous */
+ 			if (count == 0) {
+ 				block_to_free = nr;
+ 				block_to_free_p = p;
+ 				count = 1;
+ 			} else if (nr == block_to_free + count) {
+ 				count++;
+ 			} else {
+ 				/* run broken: release it and start a new one at nr */
+ 				ext3_clear_blocks(handle, inode, this_bh,
+ 						  block_to_free,
+ 						  count, block_to_free_p, p);
+ 				block_to_free = nr;
+ 				block_to_free_p = p;
+ 				count = 1;
+ 			}
+ 		}
+ 	}
+
+ 	/* release the final run, if any (p == last here) */
+ 	if (count > 0)
+ 		ext3_clear_blocks(handle, inode, this_bh, block_to_free,
+ 				  count, block_to_free_p, p);
+
+ 	if (this_bh) {
+ 		BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
+ 		ext3_journal_dirty_metadata(handle, this_bh);
+ 	}
+ }
+
+ /**
+  *	ext3_free_branches - free an array of branches
+  *	@handle: JBD handle for this transaction
+  *	@inode:	inode we are dealing with
+  *	@parent_bh: the buffer_head which contains *@first and *@last
+  *	@first:	array of block numbers
+  *	@last:	pointer immediately past the end of array
+  *	@depth:	depth of the branches to free
+  *
+  *	We are freeing all blocks referred from these branches (numbers are
+  *	stored as little-endian 32-bit) and updating @inode->i_blocks
+  *	appropriately.
+  *
+  *	The array is walked from @last-1 down to @first.  The function
+  *	returns early, without freeing anything further, once
+  *	is_handle_aborted() reports true.
+  */
+ static void ext3_free_branches(handle_t *handle, struct inode *inode,
+ 			       struct buffer_head *parent_bh,
+ 			       u32 *first, u32 *last, int depth)
+ {
+ 	unsigned long nr;
+ 	u32 *p;
+
+ 	if (is_handle_aborted(handle))
+ 		return;
+
+ 	if (depth--) {
+ 		struct buffer_head *bh;
+ 		int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+ 		p = last;
+ 		while (--p >= first) {
+ 			nr = le32_to_cpu(*p);
+ 			if (!nr)
+ 				continue;		/* A hole */
+
+ 			/* Go read the buffer for the next level down */
+ 			bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize);
+
+ 			/*
+ 			 * A read failure? Report error and clear slot
+ 			 * (should be rare).
+ 			 *
+ 			 * NOTE(review): the slot is not actually cleared on
+ 			 * this path; the entry is only skipped.
+ 			 */
+ 			if (!bh) {
+ 				ext3_error(inode->i_sb, "ext3_free_branches",
+ 					   "Read failure, inode=%ld, block=%ld",
+ 					   inode->i_ino, nr);
+ 				continue;
+ 			}
+
+ 			/* This zaps the entire block. Bottom up. */
+ 			BUFFER_TRACE(bh, "free child branches");
+ 			ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
+ 					   (u32*)bh->b_data + addr_per_block,
+ 					   depth);
+
+ 			/*
+ 			 * We've probably journalled the indirect block several
+ 			 * times during the truncate.  But it's no longer
+ 			 * needed and we now drop it from the transaction via
+ 			 * journal_revoke().
+ 			 *
+ 			 * That's easy if it's exclusively part of this
+ 			 * transaction.  But if it's part of the committing
+ 			 * transaction then journal_forget() will simply
+ 			 * brelse() it.  That means that if the underlying
+ 			 * block is reallocated in ext3_get_block(),
+ 			 * unmap_underlying_metadata() will find this block
+ 			 * and will try to get rid of it.  damn, damn.
+ 			 *
+ 			 * If this block has already been committed to the
+ 			 * journal, a revoke record will be written.  And
+ 			 * revoke records must be emitted *before* clearing
+ 			 * this block's bit in the bitmaps.
+ 			 */
+ 			ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+
+ 			/*
+ 			 * Everything below this pointer has been
+ 			 * released.  Now let this top-of-subtree go.
+ 			 *
+ 			 * We want the freeing of this indirect block to be
+ 			 * atomic in the journal with the updating of the
+ 			 * bitmap block which owns it.  So make some room in
+ 			 * the journal.
+ 			 *
+ 			 * We zero the parent pointer *after* freeing its
+ 			 * pointee in the bitmaps, so if extend_transaction()
+ 			 * for some reason fails to put the bitmap changes and
+ 			 * the release into the same transaction, recovery
+ 			 * will merely complain about releasing a free block,
+ 			 * rather than leaking blocks.
+ 			 */
+ 			if (is_handle_aborted(handle))
+ 				return;
+ 			if (try_to_extend_transaction(handle, inode)) {
+ 				ext3_mark_inode_dirty(handle, inode);
+ 				ext3_journal_test_restart(handle, inode);
+ 			}
+
+ 			ext3_free_blocks(handle, inode, nr, 1);
+
+ 			if (parent_bh) {
+ 				/*
+ 				 * The block which we have just freed is
+ 				 * pointed to by an indirect block: journal it
+ 				 */
+ 				BUFFER_TRACE(parent_bh, "get_write_access");
+ 				if (!ext3_journal_get_write_access(handle,
+ 								   parent_bh)){
+ 					*p = 0;
+ 					BUFFER_TRACE(parent_bh,
+ 					"call ext3_journal_dirty_metadata");
+ 					ext3_journal_dirty_metadata(handle,
+ 								    parent_bh);
+ 				}
+ 			}
+ 		}
+ 	} else {
+ 		/* We have reached the bottom of the tree. */
+ 		BUFFER_TRACE(parent_bh, "free data blocks");
+ 		ext3_free_data(handle, inode, parent_bh, first, last);
+ 	}
+ }
+
+ /*
+  * ext3_truncate()
+  *
+  * We block out ext3_get_block() block instantiations across the entire
+  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
+  * simultaneously on behalf of the same inode.
+  *
+  * As we work through the truncate and commmit bits of it to the journal there
+  * is one core, guiding principle: the file's tree must always be consistent on
+  * disk.  We must be able to restart the truncate after a crash.
+  *
+  * The file's tree may be transiently inconsistent in memory (although it
+  * probably isn't), but whenever we close off and commit a journal transaction,
+  * the contents of (the filesystem + the journal) must be consistent and
+  * restartable.  It's pretty simple, really: bottom up, right to left (although
+  * left-to-right works OK too).
+  *
+  * Note that at recovery time, journal replay occurs *before* the restart of
+  * truncate against the orphan inode list.
+  *
+  * The committed inode has the new, desired i_size (which is the same as
+  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
+  * that this inode's truncate did not complete and it will again call
+  * ext3_truncate() to have another go.
So there will be instantiated blocks + * to the right of the truncation point in a crashed ext3 filesystem. But + * that's fine - as long as they are linked from the inode, the post-crash + * ext3_truncate() run will find them and release them. + */ + + void ext3_truncate(struct inode * inode) + { + handle_t *handle; + u32 *i_data = inode->u.ext3_i.i_data; + int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); + int offsets[4]; + Indirect chain[4]; + Indirect *partial; + int nr = 0; + int n; + long last_block; + unsigned blocksize; + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE_FILE(inode)) + return; + + ext3_discard_prealloc(inode); + + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ + + blocksize = inode->i_sb->s_blocksize; + last_block = (inode->i_size + blocksize-1) + >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); + + ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size); + + + n = ext3_block_to_path(inode, last_block, offsets); + if (n == 0) + goto out_stop; /* error */ + + /* + * OK. This truncate is going to happen. We add the inode to the + * orphan list, so that if this truncate spans multiple transactions, + * and we crash, we will resume the truncate when the filesystem + * recovers. It also marks the inode dirty, to catch the new size. + * + * Implication: the file must always be in a sane, consistent + * truncatable state while each transaction commits. + */ + if (ext3_orphan_add(handle, inode)) + goto out_stop; + + /* + * The orphan list entry will now protect us from any crash which + * occurs before the truncate completes, so it is now safe to propagate + * the new, shorter inode size (held for now in i_size) into the + * on-disk inode. We do this via i_disksize, which is the value which + * ext3 *really* writes onto the disk inode. 
+ */ + inode->u.ext3_i.i_disksize = inode->i_size; + + /* + * From here we block out all ext3_get_block() callers who want to + * modify the block allocation tree. + */ + down_write(&inode->u.ext3_i.truncate_sem); + + if (n == 1) { /* direct blocks */ + ext3_free_data(handle, inode, NULL, i_data+offsets[0], + i_data + EXT3_NDIR_BLOCKS); + goto do_indirects; + } + + partial = ext3_find_shared(inode, n, offsets, chain, &nr); + /* Kill the top of shared branch (not detached) */ + if (nr) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, (chain+n-1) - partial); + *partial->p = 0; + /* + * We mark the inode dirty prior to restart, + * and prior to stop. No need for it here. + */ + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); + ext3_free_branches(handle, inode, partial->bh, + partial->p, + partial->p+1, (chain+n-1) - partial); + } + } + /* Clear the ends of indirect blocks on the shared branch */ + while (partial > chain) { + ext3_free_branches(handle, inode, partial->bh, partial->p + 1, + (u32*)partial->bh->b_data + addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse (partial->bh); + partial--; + } + do_indirects: + /* Kill the remaining (whole) subtrees */ + switch (offsets[0]) { + default: + nr = i_data[EXT3_IND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 1); + i_data[EXT3_IND_BLOCK] = 0; + } + case EXT3_IND_BLOCK: + nr = i_data[EXT3_DIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 2); + i_data[EXT3_DIND_BLOCK] = 0; + } + case EXT3_DIND_BLOCK: + nr = i_data[EXT3_TIND_BLOCK]; + if (nr) { + ext3_free_branches(handle, inode, NULL, + &nr, &nr+1, 3); + i_data[EXT3_TIND_BLOCK] = 0; + } + case EXT3_TIND_BLOCK: + ; + } + up_write(&inode->u.ext3_i.truncate_sem); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + 
ext3_mark_inode_dirty(handle, inode); + + /* In a multi-transaction truncate, we only make the final + * transaction synchronous */ + if (IS_SYNC(inode)) + handle->h_sync = 1; + out_stop: + /* + * If this was a simple ftruncate(), and the file will remain alive + * then we need to clear up the orphan record which we created above. + * However, if this was a real unlink then we were called by + * ext3_delete_inode(), and we allow that function to clean up the + * orphan info for us. + */ + if (inode->i_nlink) + ext3_orphan_del(handle, inode); + + ext3_journal_stop(handle, inode); + } + + /* + * ext3_get_inode_loc returns with an extra refcount against the + * inode's underlying buffer_head on success. + */ + + int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc) + { + struct buffer_head *bh = 0; + unsigned long block; + unsigned long block_group; + unsigned long group_desc; + unsigned long desc; + unsigned long offset; + struct ext3_group_desc * gdp; + + if ((inode->i_ino != EXT3_ROOT_INO && + inode->i_ino != EXT3_ACL_IDX_INO && + inode->i_ino != EXT3_ACL_DATA_INO && + inode->i_ino != EXT3_JOURNAL_INO && + inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) || + inode->i_ino > le32_to_cpu( + inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "bad inode number: %lu", inode->i_ino); + goto bad_inode; + } + block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb); + if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "group >= groups count"); + goto bad_inode; + } + group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb); + desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1); + bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc]; + if (!bh) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "Descriptor not loaded"); + goto bad_inode; + } + + gdp = (struct ext3_group_desc *) bh->b_data; + /* + * Figure out the offset within 
the block group inode table + */ + offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) * + EXT3_INODE_SIZE(inode->i_sb); + block = le32_to_cpu(gdp[desc].bg_inode_table) + + (offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb)); + if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) { + ext3_error (inode->i_sb, "ext3_get_inode_loc", + "unable to read inode block - " + "inode=%lu, block=%lu", inode->i_ino, block); + goto bad_inode; + } + offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1); + + iloc->bh = bh; + iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset); + iloc->block_group = block_group; + + return 0; + + bad_inode: + return -EIO; + } + + void ext3_read_inode(struct inode * inode) + { + struct ext3_iloc iloc; + struct ext3_inode *raw_inode; + struct buffer_head *bh; + int block; + + if(ext3_get_inode_loc(inode, &iloc)) + goto bad_inode; + bh = iloc.bh; + raw_inode = iloc.raw_inode; + init_rwsem(&inode->u.ext3_i.truncate_sem); + inode->i_mode = le16_to_cpu(raw_inode->i_mode); + inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); + inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if(!(test_opt (inode->i_sb, NO_UID32))) { + inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; + inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; + } + inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); + inode->i_size = le32_to_cpu(raw_inode->i_size); + inode->i_atime = le32_to_cpu(raw_inode->i_atime); + inode->i_ctime = le32_to_cpu(raw_inode->i_ctime); + inode->i_mtime = le32_to_cpu(raw_inode->i_mtime); + inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime); + /* We now have enough fields to check if the inode was active or not. 
+ * This is needed because nfsd might try to access dead inodes + * the test is that same one that e2fsck uses + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { + if (inode->i_mode == 0 || + !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) { + /* this inode is deleted */ + brelse (bh); + goto bad_inode; + } + /* The only unlinked inodes we let through here have + * valid i_mode and are being read by the orphan + * recovery code: that's fine, we're about to complete + * the process of deleting those. */ + } + inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size + * (for stat), not the fs block + * size */ + inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); + inode->i_version = ++event; + inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags); + #ifdef EXT3_FRAGMENTS + inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr); + inode->u.ext3_i.i_frag_no = raw_inode->i_frag; + inode->u.ext3_i.i_frag_size = raw_inode->i_fsize; + #endif + inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl); + if (!S_ISREG(inode->i_mode)) { + inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + } else { + inode->i_size |= + ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; + } + inode->u.ext3_i.i_disksize = inode->i_size; + inode->i_generation = le32_to_cpu(raw_inode->i_generation); + #ifdef EXT3_PREALLOCATE + inode->u.ext3_i.i_prealloc_count = 0; + #endif + inode->u.ext3_i.i_block_group = iloc.block_group; + + /* + * NOTE! The in-memory inode i_data array is in little-endian order + * even on big-endian machines: we do NOT byteswap the block numbers! 
+ */ + for (block = 0; block < EXT3_N_BLOCKS; block++) + inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block]; + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); + + brelse (iloc.bh); + + if (inode->i_ino == EXT3_ACL_IDX_INO || + inode->i_ino == EXT3_ACL_DATA_INO) + /* Nothing to do */ ; + else if (S_ISREG(inode->i_mode)) { + inode->i_op = &ext3_file_inode_operations; + inode->i_fop = &ext3_file_operations; + inode->i_mapping->a_ops = &ext3_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (!inode->i_blocks) + inode->i_op = &ext3_fast_symlink_inode_operations; + else { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + } + } else + init_special_inode(inode, inode->i_mode, + le32_to_cpu(iloc.raw_inode->i_block[0])); + /* inode->i_attr_flags = 0; unused */ + if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */ + inode->i_flags |= S_SYNC; + } + if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_APPEND; unused */ + inode->i_flags |= S_APPEND; + } + if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FILE_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE; unused */ + inode->i_flags |= S_IMMUTABLE_FILE; + } + if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_LINK_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE_LINK; unused */ + inode->i_flags |= S_IMMUTABLE_LINK; + } + if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) { + /* inode->i_attr_flags |= ATTR_FLAG_NOATIME; unused */ + inode->i_flags |= S_NOATIME; + } + return; + + bad_inode: + make_bad_inode(inode); + return; + } + + /* + * Post the struct inode info into an on-disk inode location in the + * buffer-cache. This gobbles the caller's reference to the + * buffer_head in the inode location struct. 
+ */ + + static int ext3_do_update_inode(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) + { + struct ext3_inode *raw_inode = iloc->raw_inode; + struct buffer_head *bh = iloc->bh; + int err = 0, rc, block; + + if (handle) { + BUFFER_TRACE(bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, bh); + if (err) + goto out_brelse; + } + raw_inode->i_mode = cpu_to_le16(inode->i_mode); + if(!(test_opt(inode->i_sb, NO_UID32))) { + raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); + raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); + /* + * Fix up interoperability with old kernels. Otherwise, old inodes get + * re-used with the upper 16 bits of the uid/gid intact + */ + if(!inode->u.ext3_i.i_dtime) { + raw_inode->i_uid_high = + cpu_to_le16(high_16_bits(inode->i_uid)); + raw_inode->i_gid_high = + cpu_to_le16(high_16_bits(inode->i_gid)); + } else { + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + } else { + raw_inode->i_uid_low = + cpu_to_le16(fs_high2lowuid(inode->i_uid)); + raw_inode->i_gid_low = + cpu_to_le16(fs_high2lowgid(inode->i_gid)); + raw_inode->i_uid_high = 0; + raw_inode->i_gid_high = 0; + } + raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); + raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize); + raw_inode->i_atime = cpu_to_le32(inode->i_atime); + raw_inode->i_ctime = cpu_to_le32(inode->i_ctime); + raw_inode->i_mtime = cpu_to_le32(inode->i_mtime); + raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); + raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime); + raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags); + #ifdef EXT3_FRAGMENTS + raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr); + raw_inode->i_frag = inode->u.ext3_i.i_frag_no; + raw_inode->i_fsize = inode->u.ext3_i.i_frag_size; + #else + /* If we are not tracking these fields in the in-memory inode, + * then preserve them on disk, but still initialise them to zero + * for new inodes. 
*/ + if (inode->u.ext3_i.i_state & EXT3_STATE_NEW) { + raw_inode->i_faddr = 0; + raw_inode->i_frag = 0; + raw_inode->i_fsize = 0; + } + #endif + raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl); + if (!S_ISREG(inode->i_mode)) { + raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl); + } else { + raw_inode->i_size_high = + cpu_to_le32(inode->u.ext3_i.i_disksize >> 32); + if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) { + struct super_block *sb = inode->i_sb; + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || + EXT3_SB(sb)->s_es->s_rev_level == + cpu_to_le32(EXT3_GOOD_OLD_REV)) { + /* If this is the first large file + * created, add a flag to the superblock. + */ + err = ext3_journal_get_write_access(handle, + sb->u.ext3_sb.s_sbh); + if (err) + goto out_brelse; + ext3_update_dynamic_rev(sb); + EXT3_SET_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_LARGE_FILE); + sb->s_dirt = 1; + handle->h_sync = 1; + err = ext3_journal_dirty_metadata(handle, + sb->u.ext3_sb.s_sbh); + } + } + } + raw_inode->i_generation = le32_to_cpu(inode->i_generation); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + raw_inode->i_block[0] = + cpu_to_le32(kdev_t_to_nr(inode->i_rdev)); + else for (block = 0; block < EXT3_N_BLOCKS; block++) + raw_inode->i_block[block] = inode->u.ext3_i.i_data[block]; + + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + rc = ext3_journal_dirty_metadata(handle, bh); + if (!err) + err = rc; + inode->u.ext3_i.i_state &= ~EXT3_STATE_NEW; + + out_brelse: + brelse (bh); + ext3_std_error(inode->i_sb, err); + return err; + } + + /* + * ext3_write_inode() + * + * We are called from a few places: + * + * - Within generic_file_write() for O_SYNC files. + * Here, there will be no transaction running. We wait for any running + * trasnaction to commit. + * + * - Within sys_sync(), kupdate and such. + * We wait on commit, if tol to. + * + * - Within prune_icache() (PF_MEMALLOC == true) + * Here we simply return. 
We can't afford to block kswapd on the + * journal commit. + * + * In all cases it is actually safe for us to return without doing anything, + * because the inode has been copied into a raw inode buffer in + * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for + * knfsd. + * + * Note that we are absolutely dependent upon all inode dirtiers doing the + * right thing: they *must* call mark_inode_dirty() after dirtying info in + * which we are interested. + * + * It would be a bug for them to not do this. The code: + * + * mark_inode_dirty(inode) + * stuff(); + * inode->i_size = expr; + * + * is in error because a kswapd-driven write_inode() could occur while + * `stuff()' is running, and the new i_size will be lost. Plus the inode + * will no longer be on the superblock's dirty inode list. + */ + void ext3_write_inode(struct inode *inode, int wait) + { + if (current->flags & PF_MEMALLOC) + return; + + if (ext3_journal_current_handle()) { + jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); + return; + } + + if (!wait) + return; + + ext3_force_commit(inode->i_sb); + } + + /* + * ext3_setattr() + * + * Called from notify_change. + * + * We want to trap VFS attempts to truncate the file as soon as + * possible. In particular, we want to make sure that when the VFS + * shrinks i_size, we put the inode on the orphan list and modify + * i_disksize immediately, so that during the subsequent flushing of + * dirty pages and freeing of disk blocks, we can guarantee that any + * commit will leave the blocks being flushed in an unused state on + * disk. (On recovery, the inode will get truncated and the blocks will + * be freed, so we have a strong guarantee that no future commit will + * leave these blocks visible to the user.) + * + * This is only needed for regular files. rmdir() has its own path, and + * we can never truncate a direcory except on final unlink (at which + * point i_nlink is zero so recovery is easy.) + * + * Called with the BKL. 
+  */
+
+ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
+ {
+ 	struct inode *inode = dentry->d_inode;
+ 	int error, rc = 0;	/* error: journalling status, rc: nested calls */
+
+ 	error = inode_change_ok(inode, attr);
+ 	if (error)
+ 		return error;
+
+ 	if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+ 		handle_t *handle;
+
+ 		handle = ext3_journal_start(inode, 3);
+ 		if (IS_ERR(handle)) {
+ 			error = PTR_ERR(handle);
+ 			goto err_out;
+ 		}
+
+ 		error = ext3_orphan_add(handle, inode);
+ 		inode->u.ext3_i.i_disksize = attr->ia_size;
+ 		rc = ext3_mark_inode_dirty(handle, inode);
+ 		if (!error)
+ 			error = rc;
+ 		ext3_journal_stop(handle, inode);
+ 	}
+
+ 	rc = inode_setattr(inode, attr);	/* keep its status for the caller */
+
+ 	/* If inode_setattr's call to ext3_truncate failed to get a
+ 	 * transaction handle at all, we need to clean up the in-core
+ 	 * orphan list manually. */
+ 	if (inode->i_nlink)
+ 		ext3_orphan_del(NULL, inode);
+
+ err_out:
+ 	ext3_std_error(inode->i_sb, error);
+ 	return error ? error : rc;	/* a journalling failure takes precedence */
+ }
+
+
+ /*
+  * akpm: how many blocks doth make a writepage()?
+  *
+  * With N blocks per page, it may be:
+  *	N data blocks
+  *	2 indirect block
+  *	2 dindirect
+  *	1 tindirect
+  *	N+5 bitmap blocks (from the above)
+  *	N+5 group descriptor summary blocks
+  *	1 inode block
+  *	1 superblock.
+  *	2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
+  *
+  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
+  *
+  * With ordered or writeback data it's the same, less the N data blocks.
+  *
+  * If the inode's direct blocks can hold an integral number of pages then a
+  * page cannot straddle two indirect blocks, and we can only touch one indirect
+  * and dindirect block, and the "5" above becomes "3".
+  *
+  * This still overestimates under most circumstances. If we were to pass the
+  * start and end offsets in here as well we could do block_to_path() on each
+  * block and work out the exact number of indirects which are touched. Pah.
+ */ + + int ext3_writepage_trans_blocks(struct inode *inode) + { + int bpp = ext3_journal_blocks_per_page(inode); + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else + ret = 2 * (bpp + indirects) + 2; + + #ifdef CONFIG_QUOTA + ret += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; + #endif + + return ret; + } + + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc) + { + int err = 0; + + if (handle) { + /* the do_update_inode consumes one bh->b_count */ + atomic_inc(&iloc->bh->b_count); + err = ext3_do_update_inode(handle, inode, iloc); + /* ext3_do_update_inode() does journal_dirty_metadata */ + brelse(iloc->bh); + } else { + printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n"); + } + return err; + } + + /* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + + int + ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc) + { + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext3_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; + } + } + } + ext3_std_error(inode->i_sb, err); + return err; + } + + /* + * akpm: What we do here is to mark the in-core inode as clean + * with respect to inode dirtiness (it may still be data-dirty). + * This means that the in-core inode may be reaped by prune_icache + * without having to perform any I/O. This is a very good thing, + * because *any* task may call prune_icache - even ones which + * have a transaction open against a different journal. + * + * Is this cheating? Not really. Sure, we haven't written the + * inode out, but prune_icache isn't a user-visible syncing function. 
+ * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) + * we start and wait on commits. + * + * Is this efficient/effective? Well, we're being nice to the system + * by cleaning up our inodes proactively so they can be reaped + * without I/O. But we are potentially leaving up to five seconds' + * worth of inodes floating about which prune_icache wants us to + * write out. One way to fix that would be to get prune_icache() + * to do a write_super() to free up some memory. It has the desired + * effect. + */ + int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) + { + struct ext3_iloc iloc; + int err; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (!err) + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + return err; + } + + /* + * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() + * + * We're really interested in the case where a file is being extended. + * i_size has been changed by generic_commit_write() and we thus need + * to include the updated inode in the current transaction. + * + * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks + * are allocated to the file. + * + * If the inode is marked synchronous, we don't honour that here - doing + * so would cause a commit on atime updates, which we don't bother doing. + * We handle synchronous inodes at the highest possible level. + */ + void ext3_dirty_inode(struct inode *inode) + { + handle_t *current_handle = ext3_journal_current_handle(); + handle_t *handle; + + lock_kernel(); + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + goto out; + if (current_handle && + current_handle->h_transaction != handle->h_transaction) { + /* This task has a transaction open against a different fs */ + printk(KERN_EMERG __FUNCTION__": transactions do not match!\n"); + } else { + jbd_debug(5, "marking dirty. 
outer handle=%p\n", + current_handle); + ext3_mark_inode_dirty(handle, inode); + } + ext3_journal_stop(handle, inode); + out: + unlock_kernel(); + } + + #ifdef AKPM + /* + * Bind an inode's backing buffer_head into this transaction, to prevent + * it from being flushed to disk early. Unlike + * ext3_reserve_inode_write, this leaves behind no bh reference and + * returns no iloc structure, so the caller needs to repeat the iloc + * lookup to mark the inode dirty later. + */ + static inline int + ext3_pin_inode(handle_t *handle, struct inode *inode) + { + struct ext3_iloc iloc; + + int err = 0; + if (handle) { + err = ext3_get_inode_loc(inode, &iloc); + if (!err) { + BUFFER_TRACE(iloc.bh, "get_write_access"); + err = journal_get_write_access(handle, iloc.bh); + if (!err) + err = ext3_journal_dirty_metadata(handle, + iloc.bh); + brelse(iloc.bh); + } + } + ext3_std_error(inode->i_sb, err); + return err; + } + #endif + + int ext3_change_inode_journal_flag(struct inode *inode, int val) + { + journal_t *journal; + handle_t *handle; + int err; + + /* + * We have to be very careful here: changing a data block's + * journaling status dynamically is dangerous. If we write a + * data block to the journal, change the status and then delete + * that block, we risk forgetting to revoke the old log record + * from the journal and so a subsequent replay can corrupt data. + * So, first we make sure that the journal is empty and that + * nobody is changing anything. + */ + + journal = EXT3_JOURNAL(inode); + if (is_journal_aborted(journal) || IS_RDONLY(inode)) + return -EROFS; + + journal_lock_updates(journal); + journal_flush(journal); + + /* + * OK, there are no updates running now, and all cached data is + * synced to disk. We are now in a completely consistent state + * which doesn't have anything in the journal, and we know that + * no filesystem updates are running, so it is safe to modify + * the inode's in-core data-journaling state flag now. 
+ */ + + if (val) + inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL; + else + inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL; + + journal_unlock_updates(journal); + + /* Finally we can mark the inode as dirty. */ + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + err = ext3_mark_inode_dirty(handle, inode); + handle->h_sync = 1; + ext3_journal_stop(handle, inode); + ext3_std_error(inode->i_sb, err); + + return err; + } + + + /* + * ext3_aops_journal_start(). + * + * + * + * We need to take the inode semaphore *outside* the + * journal_start/journal_stop. Otherwise, a different task could do a + * wait_for_commit() while holding ->i_sem, which deadlocks. The rule + * is: transaction open/closes are considered to be a locking operation + * and they nest *inside* ->i_sem. + * ---------------------------------------------------------------------------- + * Possible problem: + * ext3_file_write() + * -> generic_file_write() + * -> __alloc_pages() + * -> page_launder() + * -> ext3_writepage() + * + * And the writepage can be on a different fs while we have a + * transaction open against this one! Bad. + * + * I tried making the task PF_MEMALLOC here, but that simply results in + * 0-order allocation failures passed back to generic_file_write(). + * Instead, we rely on the reentrancy protection in ext3_writepage(). + * ---------------------------------------------------------------------------- + * When we do the journal_start() here we don't really need to reserve + * any blocks - we won't need any until we hit ext3_prepare_write(), + * which does all the needed journal extending. However! 
There is a + * problem with quotas: + * + * Thread 1: + * sys_sync + * ->sync_dquots + * ->commit_dquot + * ->lock_dquot + * ->write_dquot + * ->ext3_file_write + * ->journal_start + * ->ext3_prepare_write + * ->journal_extend + * ->journal_start + * Thread 2: + * ext3_create (for example) + * ->ext3_new_inode + * ->dquot_initialize + * ->lock_dquot + * + * Deadlock. Thread 1's journal_start blocks because thread 2 has a + * transaction open. Thread 2's transaction will never close because + * thread 2 is stuck waiting for the dquot lock. + * + * So. We must ensure that thread 1 *never* needs to extend the journal + * for quota writes. We do that by reserving enough journal blocks + * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we + * need to extend" test in ext3_prepare_write() succeeds. + */ + + + MODULE_LICENSE("GPL"); diff -rc2P linux/fs/ext3/ioctl.c linux-2.4.13/fs/ext3/ioctl.c *** linux/fs/ext3/ioctl.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/ioctl.c Fri Nov 9 17:03:13 2001 *************** *** 0 **** --- 1,176 ---- + /* + * linux/fs/ext3/ioctl.c + * + * Copyright (C) 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + */ + + #include + #include + #include + #include + #include + #include + + + int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) + { + unsigned int flags; + + ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { + case EXT3_IOC_GETFLAGS: + flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE; + return put_user(flags, (int *) arg); + case EXT3_IOC_SETFLAGS: { + handle_t *handle = NULL; + int err; + struct ext3_iloc iloc; + unsigned int oldflags; + unsigned int jflag; + + if (IS_RDONLY(inode)) + return -EROFS; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (get_user(flags, (int *) arg)) + return -EFAULT; + + 
oldflags = inode->u.ext3_i.i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT3_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FILE_FL | EXT3_IMMUTABLE_LINK_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + } + + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + if (IS_SYNC(inode)) + handle->h_sync = 1; + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + flags = flags & EXT3_FL_USER_MODIFIABLE; + flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE; + inode->u.ext3_i.i_flags = flags; + + if (flags & EXT3_SYNC_FL) + inode->i_flags |= S_SYNC; + else + inode->i_flags &= ~S_SYNC; + if (flags & EXT3_APPEND_FL) + inode->i_flags |= S_APPEND; + else + inode->i_flags &= ~S_APPEND; + if (flags & EXT3_IMMUTABLE_FILE_FL) + inode->i_flags |= S_IMMUTABLE_FILE; + else + inode->i_flags &= ~S_IMMUTABLE_FILE; + + if (flags & EXT3_IMMUTABLE_LINK_FL) + inode->i_flags |= S_IMMUTABLE_LINK; + else + inode->i_flags &= ~S_IMMUTABLE_LINK; + + if (flags & EXT3_NOATIME_FL) + inode->i_flags |= S_NOATIME; + else + inode->i_flags &= ~S_NOATIME; + inode->i_ctime = CURRENT_TIME; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + flags_err: + ext3_journal_stop(handle, inode); + if (err) + return err; + + if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) + err = ext3_change_inode_journal_flag(inode, jflag); + return err; + } + case EXT3_IOC_GETVERSION: + case EXT3_IOC_GETVERSION_OLD: + return put_user(inode->i_generation, (int *) arg); + case EXT3_IOC_SETVERSION: + case 
EXT3_IOC_SETVERSION_OLD: { + handle_t *handle; + struct ext3_iloc iloc; + __u32 generation; + int err; + + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + if (IS_RDONLY(inode)) + return -EROFS; + if (get_user(generation, (int *) arg)) + return -EFAULT; + + handle = ext3_journal_start(inode, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + return err; + + inode->i_ctime = CURRENT_TIME; + inode->i_generation = generation; + + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + ext3_journal_stop(handle, inode); + return err; + } + #ifdef CONFIG_JBD_DEBUG + case EXT3_IOC_WAIT_FOR_READONLY: + /* + * This is racy - by the time we're woken up and running, + * the superblock could be released. And the module could + * have been unloaded. So sue me. + * + * Returns 1 if it slept, else zero. + */ + { + struct super_block *sb = inode->i_sb; + DECLARE_WAITQUEUE(wait, current); + int ret = 0; + + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); + if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) { + schedule(); + ret = 1; + } + remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait); + return ret; + } + #endif + default: + return -ENOTTY; + } + } diff -rc2P linux/fs/ext3/namei.c linux-2.4.13/fs/ext3/namei.c *** linux/fs/ext3/namei.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/namei.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,1125 ---- + /* + * linux/fs/ext3/namei.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/namei.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. 
Miller (davem@caip.rutgers.edu), 1995 + * Directory entry file type support and forward compatibility hooks + * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + + /* + * define how far ahead to read directories while searching them. + */ + #define NAMEI_RA_CHUNKS 2 + #define NAMEI_RA_BLOCKS 4 + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b)) + + /* + * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure. + * + * `len <= EXT3_NAME_LEN' is guaranteed by caller. + * `de != NULL' is guaranteed by caller. + */ + static inline int ext3_match (int len, const char * const name, + struct ext3_dir_entry_2 * de) + { + if (len != de->name_len) + return 0; + if (!de->inode) + return 0; + return !memcmp(name, de->name, len); + } + + /* + * Returns 0 if not found, -1 on failure, and 1 on success + */ + static int inline search_dirblock(struct buffer_head * bh, + struct inode *dir, + struct dentry *dentry, + unsigned long offset, + struct ext3_dir_entry_2 ** res_dir) + { + struct ext3_dir_entry_2 * de; + char * dlimit; + int de_len; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + + de = (struct ext3_dir_entry_2 *) bh->b_data; + dlimit = bh->b_data + dir->i_sb->s_blocksize; + while ((char *) de < dlimit) { + /* this code is executed quadratically often */ + /* do minimal checking `by hand' */ + + if ((char *) de + namelen <= dlimit && + ext3_match (namelen, name, de)) { + /* found a match - just to be sure, do a full check */ + if (!ext3_check_dir_entry("ext3_find_entry", + dir, de, bh, offset)) + return -1; + *res_dir = de; + return 1; + } + /* prevent looping on a bad block */ + de_len = le16_to_cpu(de->rec_len); + if (de_len <= 0) + return -1; + offset += de_len; + de = (struct ext3_dir_entry_2 *) ((char *) de + de_len); + } + 
return 0; + } + + /* + * ext3_find_entry() + * + * finds an entry in the specified directory with the wanted name. It + * returns the cache buffer in which the entry was found, and the entry + * itself (as a parameter - res_dir). It does NOT read the inode of the + * entry - you'll have to do that yourself if you want to. + * + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ + static struct buffer_head * ext3_find_entry (struct dentry *dentry, + struct ext3_dir_entry_2 ** res_dir) + { + struct super_block * sb; + struct buffer_head * bh_use[NAMEI_RA_SIZE]; + struct buffer_head * bh, *ret = NULL; + unsigned long start, block, b; + int ra_max = 0; /* Number of bh's in the readahead + buffer, bh_use[] */ + int ra_ptr = 0; /* Current index into readahead + buffer */ + int num = 0; + int nblocks, i, err; + struct inode *dir = dentry->d_parent->d_inode; + + *res_dir = NULL; + sb = dir->i_sb; + + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + start = dir->u.ext3_i.i_dir_start_lookup; + if (start >= nblocks) + start = 0; + block = start; + restart: + do { + /* + * We deal with the read-ahead logic here. + */ + if (ra_ptr >= ra_max) { + /* Refill the readahead buffer */ + ra_ptr = 0; + b = block; + for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) { + /* + * Terminate if we reach the end of the + * directory and must wrap, or if our + * search has finished at this block. 
+ */ + if (b >= nblocks || (num && block == start)) { + bh_use[ra_max] = NULL; + break; + } + num++; + bh = ext3_getblk(NULL, dir, b++, 0, &err); + bh_use[ra_max] = bh; + if (bh) + ll_rw_block(READ, 1, &bh); + } + } + if ((bh = bh_use[ra_ptr++]) == NULL) + goto next; + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* read error, skip block & hope for the best */ + brelse(bh); + goto next; + } + i = search_dirblock(bh, dir, dentry, + block << EXT3_BLOCK_SIZE_BITS(sb), res_dir); + if (i == 1) { + dir->u.ext3_i.i_dir_start_lookup = block; + ret = bh; + goto cleanup_and_exit; + } else { + brelse(bh); + if (i < 0) + goto cleanup_and_exit; + } + next: + if (++block >= nblocks) + block = 0; + } while (block != start); + + /* + * If the directory has grown while we were searching, then + * search the last part of the directory before giving up. + */ + block = nblocks; + nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb); + if (block < nblocks) { + start = 0; + goto restart; + } + + cleanup_and_exit: + /* Clean up the read-ahead blocks */ + for (; ra_ptr < ra_max; ra_ptr++) + brelse (bh_use[ra_ptr]); + return ret; + } + + static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry) + { + struct inode * inode; + struct ext3_dir_entry_2 * de; + struct buffer_head * bh; + + if (dentry->d_name.len > EXT3_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + bh = ext3_find_entry(dentry, &de); + inode = NULL; + if (bh) { + unsigned long ino = le32_to_cpu(de->inode); + brelse (bh); + inode = iget(dir->i_sb, ino); + + if (!inode) + return ERR_PTR(-EACCES); + } + d_add(dentry, inode); + return NULL; + } + + #define S_SHIFT 12 + static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] EXT3_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] EXT3_FT_DIR, + [S_IFCHR >> S_SHIFT] EXT3_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] EXT3_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] EXT3_FT_FIFO, + [S_IFSOCK >> S_SHIFT] EXT3_FT_SOCK, + [S_IFLNK >> S_SHIFT] EXT3_FT_SYMLINK, + }; + + static 
inline void ext3_set_de_type(struct super_block *sb, + struct ext3_dir_entry_2 *de, + umode_t mode) { + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE)) + de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + } + + /* + * ext3_add_entry() + * + * adds a file entry to the specified directory, using the same + * semantics as ext3_find_entry(). It returns NULL if it failed. + * + * NOTE!! The inode part of 'de' is left at 0 - which means you + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ + + /* + * AKPM: the journalling code here looks wrong on the error paths + */ + static int ext3_add_entry (handle_t *handle, struct dentry *dentry, + struct inode *inode) + { + struct inode *dir = dentry->d_parent->d_inode; + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + unsigned long offset; + unsigned short rec_len; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de, * de1; + struct super_block * sb; + int retval; + + sb = dir->i_sb; + + if (!namelen) + return -EINVAL; + bh = ext3_bread (handle, dir, 0, 0, &retval); + if (!bh) + return retval; + rec_len = EXT3_DIR_REC_LEN(namelen); + offset = 0; + de = (struct ext3_dir_entry_2 *) bh->b_data; + while (1) { + if ((char *)de >= sb->s_blocksize + bh->b_data) { + brelse (bh); + bh = NULL; + bh = ext3_bread (handle, dir, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval); + if (!bh) + return retval; + if (dir->i_size <= offset) { + if (dir->i_size == 0) { + brelse(bh); + return -ENOENT; + } + + ext3_debug ("creating next block\n"); + + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + de = (struct ext3_dir_entry_2 *) bh->b_data; + de->inode = 0; + de->rec_len = le16_to_cpu(sb->s_blocksize); + dir->u.ext3_i.i_disksize = + dir->i_size = offset + sb->s_blocksize; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + } else { 
+ + ext3_debug ("skipping to next block\n"); + + de = (struct ext3_dir_entry_2 *) bh->b_data; + } + } + if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh, + offset)) { + brelse (bh); + return -ENOENT; + } + if (ext3_match (namelen, name, de)) { + brelse (bh); + return -EEXIST; + } + if ((le32_to_cpu(de->inode) == 0 && + le16_to_cpu(de->rec_len) >= rec_len) || + (le16_to_cpu(de->rec_len) >= + EXT3_DIR_REC_LEN(de->name_len) + rec_len)) { + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + /* By now the buffer is marked for journaling */ + offset += le16_to_cpu(de->rec_len); + if (le32_to_cpu(de->inode)) { + de1 = (struct ext3_dir_entry_2 *) ((char *) de + + EXT3_DIR_REC_LEN(de->name_len)); + de1->rec_len = + cpu_to_le16(le16_to_cpu(de->rec_len) - + EXT3_DIR_REC_LEN(de->name_len)); + de->rec_len = cpu_to_le16( + EXT3_DIR_REC_LEN(de->name_len)); + de = de1; + } + de->file_type = EXT3_FT_UNKNOWN; + if (inode) { + de->inode = cpu_to_le32(inode->i_ino); + ext3_set_de_type(dir->i_sb, de, inode->i_mode); + } else + de->inode = 0; + de->name_len = namelen; + memcpy (de->name, name, namelen); + /* + * XXX shouldn't update any times until successful + * completion of syscall, but too many callers depend + * on this. + * + * XXX similarly, too many callers depend on + * ext3_new_inode() setting the times, but error + * recovery deletes the inode, so the worst that can + * happen is that the times are slightly out of date + * and/or different from the directory change time. 
+ */ + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + dir->i_version = ++event; + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + return 0; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + brelse (bh); + return -ENOSPC; + } + + /* + * ext3_delete_entry deletes a directory entry by merging it with the + * previous entry + */ + static int ext3_delete_entry (handle_t *handle, + struct inode * dir, + struct ext3_dir_entry_2 * de_del, + struct buffer_head * bh) + { + struct ext3_dir_entry_2 * de, * pde; + int i; + + i = 0; + pde = NULL; + de = (struct ext3_dir_entry_2 *) bh->b_data; + while (i < bh->b_size) { + if (!ext3_check_dir_entry("ext3_delete_entry", dir, de, bh, i)) + return -EIO; + if (de == de_del) { + BUFFER_TRACE(bh, "get_write_access"); + ext3_journal_get_write_access(handle, bh); + if (pde) + pde->rec_len = + cpu_to_le16(le16_to_cpu(pde->rec_len) + + le16_to_cpu(de->rec_len)); + else + de->inode = 0; + dir->i_version = ++event; + BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, bh); + return 0; + } + i += le16_to_cpu(de->rec_len); + pde = de; + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + return -ENOENT; + } + + /* + * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we + * do not perform it in these functions. We perform it at the call site, + * if it is needed. 
+  */
+ static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
+ {
+ 	/* only the in-core link count changes; handle is not used */
+ 	inode->i_nlink += 1;
+ }
+
+ static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
+ {
+ 	/* only the in-core link count changes; handle is not used */
+ 	inode->i_nlink -= 1;
+ }
+
+ /*
+  * Enter `inode' into the parent directory of `dentry' and bind the two.
+  *
+  * On failure one link is dropped from `inode', the inode reference is
+  * released with iput(), and the error from ext3_add_entry() is returned.
+  */
+ static int ext3_add_nondir(handle_t *handle,
+ 		struct dentry *dentry, struct inode *inode)
+ {
+ 	int err;
+
+ 	err = ext3_add_entry(handle, dentry, inode);
+ 	if (err) {
+ 		ext3_dec_count(handle, inode);
+ 		iput(inode);
+ 		return err;
+ 	}
+
+ 	d_instantiate(dentry, inode);
+ 	return 0;
+ }
+
+ /*
+  * The directory cache entry for the new file already exists when we
+  * get here, but it is still negative, i.e. it has no inode attached.
+  *
+  * A successful create attaches the freshly made inode to it through
+  * d_instantiate() (done in ext3_add_nondir()).
+  */
+ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode)
+ {
+ 	struct inode *new_inode;
+ 	handle_t *handle;
+ 	int err;
+
+ 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+ 	if (IS_ERR(handle))
+ 		return PTR_ERR(handle);
+
+ 	if (IS_SYNC(dir))
+ 		handle->h_sync = 1;
+
+ 	new_inode = ext3_new_inode (handle, dir, mode);
+ 	if (IS_ERR(new_inode)) {
+ 		err = PTR_ERR(new_inode);
+ 		goto out_stop;
+ 	}
+
+ 	/* regular file: hook up the file operation tables */
+ 	new_inode->i_op = &ext3_file_inode_operations;
+ 	new_inode->i_fop = &ext3_file_operations;
+ 	new_inode->i_mapping->a_ops = &ext3_aops;
+ 	ext3_mark_inode_dirty(handle, new_inode);
+ 	err = ext3_add_nondir(handle, dentry, new_inode);
+
+ out_stop:
+ 	ext3_journal_stop(handle, dir);
+ 	return err;
+ }
+
+ /*
+  * Create a special file (device node, FIFO, socket) named by `dentry'
+  * in `dir'.  Same flow as ext3_create(), except that the inode is set
+  * up by init_special_inode().
+  */
+ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
+ 			int mode, int rdev)
+ {
+ 	struct inode *new_inode;
+ 	handle_t *handle;
+ 	int err;
+
+ 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+ 	if (IS_ERR(handle))
+ 		return PTR_ERR(handle);
+
+ 	if (IS_SYNC(dir))
+ 		handle->h_sync = 1;
+
+ 	new_inode = ext3_new_inode (handle, dir, mode);
+ 	if (IS_ERR(new_inode)) {
+ 		err = PTR_ERR(new_inode);
+ 		goto out_stop;
+ 	}
+
+ 	init_special_inode(new_inode, mode, rdev);
+ 	ext3_mark_inode_dirty(handle, new_inode);
+ 	err = ext3_add_nondir(handle, dentry, new_inode);
+
+ out_stop:
+ 	ext3_journal_stop(handle, dir);
+ 	return err;
+ }
+
+ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) + { + handle_t *handle; + struct inode * inode; + struct buffer_head * dir_block; + struct ext3_dir_entry_2 * de; + int err; + + if (dir->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, S_IFDIR); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + inode->i_op = &ext3_dir_inode_operations; + inode->i_fop = &ext3_dir_operations; + inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize; + inode->i_blocks = 0; + dir_block = ext3_bread (handle, inode, 0, 1, &err); + if (!dir_block) { + inode->i_nlink--; /* is this nlink == 0? */ + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + BUFFER_TRACE(dir_block, "get_write_access"); + ext3_journal_get_write_access(handle, dir_block); + de = (struct ext3_dir_entry_2 *) dir_block->b_data; + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; + de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len)); + strcpy (de->name, "."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + de->inode = cpu_to_le32(dir->i_ino); + de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1)); + de->name_len = 2; + strcpy (de->name, ".."); + ext3_set_de_type(dir->i_sb, de, S_IFDIR); + inode->i_nlink = 2; + BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata"); + ext3_journal_dirty_metadata(handle, dir_block); + brelse (dir_block); + inode->i_mode = S_IFDIR | mode; + if (dir->i_mode & S_ISGID) + inode->i_mode |= S_ISGID; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_entry (handle, dentry, inode); + if (err) + goto out_no_entry; + dir->i_nlink++; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, 
dir); + d_instantiate(dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; + + out_no_entry: + inode->i_nlink = 0; + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + + /* + * routine to check that the specified directory is empty (for rmdir) + */ + static int empty_dir (struct inode * inode) + { + unsigned long offset; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de, * de1; + struct super_block * sb; + int err; + + sb = inode->i_sb; + if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || + !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { + ext3_warning (inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no data block", + inode->i_ino); + return 1; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + de1 = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (le32_to_cpu(de->inode) != inode->i_ino || + !le32_to_cpu(de1->inode) || + strcmp (".", de->name) || + strcmp ("..", de1->name)) { + ext3_warning (inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' 
or `..'", + inode->i_ino); + brelse (bh); + return 1; + } + offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len); + de = (struct ext3_dir_entry_2 *) + ((char *) de1 + le16_to_cpu(de1->rec_len)); + while (offset < inode->i_size ) { + if (!bh || + (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { + brelse (bh); + bh = ext3_bread (NULL, inode, + offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); + if (!bh) { + #if 0 + ext3_error (sb, "empty_dir", + "directory #%lu contains a hole at offset %lu", + inode->i_ino, offset); + #endif + offset += sb->s_blocksize; + continue; + } + de = (struct ext3_dir_entry_2 *) bh->b_data; + } + if (!ext3_check_dir_entry ("empty_dir", inode, de, bh, + offset)) { + brelse (bh); + return 1; + } + if (le32_to_cpu(de->inode)) { + brelse (bh); + return 0; + } + offset += le16_to_cpu(de->rec_len); + de = (struct ext3_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + } + brelse (bh); + return 1; + } + + /* ext3_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext3_orphan_cleanup(). + */ + int ext3_orphan_add(handle_t *handle, struct inode *inode) + { + struct super_block *sb = inode->i_sb; + struct ext3_iloc iloc; + int err = 0, rc; + + lock_super(sb); + if (!list_empty(&inode->u.ext3_i.i_orphan)) + goto out_unlock; + + /* Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. */ + + /* @@@ FIXME: Observation from aviro: + * I think I can trigger J_ASSERT in ext3_orphan_add(). We block + * here (on lock_super()), so race with ext3_link() which might bump + * ->i_nlink. For, say it, character device. 
Not a regular file, + * not a directory, not a symlink and ->i_nlink > 0. + */ + J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh); + if (err) + goto out_unlock; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_unlock; + + /* Insert this inode at the head of the on-disk orphan list... */ + NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan); + EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh); + rc = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + + /* Only add to the head of the in-memory list if all the + * previous operations succeeded. If the orphan_add is going to + * fail (possibly taking the journal offline), we can't risk + * leaving the inode on the orphan list: stray orphan-list + * entries can cause panics at unmount time. + * + * This is safe: on error we're going to ignore the orphan list + * anyway on the next recovery. */ + if (!err) + list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan); + + jbd_debug(4, "superblock will point to %ld\n", inode->i_ino); + jbd_debug(4, "orphan inode %ld will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); + out_unlock: + unlock_super(sb); + ext3_std_error(inode->i_sb, err); + return err; + } + + /* + * ext3_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. 
+ */ + int ext3_orphan_del(handle_t *handle, struct inode *inode) + { + struct list_head *prev; + struct ext3_sb_info *sbi; + ino_t ino_next; + struct ext3_iloc iloc; + int err = 0; + + lock_super(inode->i_sb); + if (list_empty(&inode->u.ext3_i.i_orphan)) { + unlock_super(inode->i_sb); + return 0; + } + + ino_next = NEXT_ORPHAN(inode); + prev = inode->u.ext3_i.i_orphan.prev; + sbi = EXT3_SB(inode->i_sb); + + jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino); + + list_del(&inode->u.ext3_i.i_orphan); + INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. */ + if (!handle) + goto out; + + err = ext3_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_err; + + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %ld\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext3_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto out_brelse; + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + err = ext3_journal_dirty_metadata(handle, sbi->s_sbh); + } else { + struct ext3_iloc iloc2; + struct inode *i_prev = + list_entry(prev, struct inode, u.ext3_i.i_orphan); + + jbd_debug(4, "orphan inode %ld will point to %ld\n", + i_prev->i_ino, ino_next); + err = ext3_reserve_inode_write(handle, i_prev, &iloc2); + if (err) + goto out_brelse; + NEXT_ORPHAN(i_prev) = ino_next; + err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext3_mark_iloc_dirty(handle, inode, &iloc); + if (err) + goto out_brelse; + + out_err: + ext3_std_error(inode->i_sb, err); + out: + unlock_super(inode->i_sb); + return err; + + out_brelse: + brelse(iloc.bh); + goto out_err; + } + + static int ext3_rmdir (struct inode * dir, struct dentry *dentry) + { + int retval; + struct inode 
* inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); + if (!bh) + goto end_rmdir; + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode = dentry->d_inode; + DQUOT_INIT(inode); + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_rmdir; + + retval = -ENOTEMPTY; + if (!empty_dir (inode)) + goto end_rmdir; + + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_rmdir; + if (inode->i_nlink != 2) + ext3_warning (inode->i_sb, "ext3_rmdir", + "empty directory has nlink!=2 (%d)", + inode->i_nlink); + inode->i_version = ++event; + inode->i_nlink = 0; + /* There's no need to set i_disksize: the fact that i_nlink is + * zero will ensure that the right thing happens during any + * recovery. */ + inode->i_size = 0; + ext3_orphan_add(handle, inode); + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + + end_rmdir: + ext3_journal_stop(handle, dir); + brelse (bh); + return retval; + } + + static int ext3_unlink(struct inode * dir, struct dentry *dentry) + { + int retval; + struct inode * inode; + struct buffer_head * bh; + struct ext3_dir_entry_2 * de; + handle_t *handle; + + handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + retval = -ENOENT; + bh = ext3_find_entry (dentry, &de); + if (!bh) + goto end_unlink; + + inode = dentry->d_inode; + DQUOT_INIT(inode); + + retval = -EIO; + if (le32_to_cpu(de->inode) != inode->i_ino) + goto end_unlink; + + if (!inode->i_nlink) { + ext3_warning (inode->i_sb, "ext3_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, 
inode->i_nlink); + inode->i_nlink = 1; + } + retval = ext3_delete_entry(handle, dir, de, bh); + if (retval) + goto end_unlink; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL; + ext3_mark_inode_dirty(handle, dir); + inode->i_nlink--; + if (!inode->i_nlink) + ext3_orphan_add(handle, inode); + ext3_mark_inode_dirty(handle, inode); + inode->i_ctime = dir->i_ctime; + retval = 0; + + end_unlink: + ext3_journal_stop(handle, dir); + brelse (bh); + return retval; + } + + static int ext3_symlink (struct inode * dir, + struct dentry *dentry, const char * symname) + { + handle_t *handle; + struct inode * inode; + int l, err; + + l = strlen(symname)+1; + if (l > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; + + if (l > sizeof (inode->u.ext3_i.i_data)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &ext3_aops; + /* + * block_symlink() calls back into ext3_prepare/commit_write. + * We have a transaction open. All is sweetness. It also sets + * i_size in generic_commit_write(). 
+ */ + err = block_symlink(inode, symname, l); + if (err) + goto out_no_entry; + } else { + inode->i_op = &ext3_fast_symlink_inode_operations; + memcpy((char*)&inode->u.ext3_i.i_data,symname,l); + inode->i_size = l-1; + } + inode->u.ext3_i.i_disksize = inode->i_size; + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + out_stop: + ext3_journal_stop(handle, dir); + return err; + + out_no_entry: + ext3_dec_count(handle, inode); + ext3_mark_inode_dirty(handle, inode); + iput (inode); + goto out_stop; + } + + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { + handle_t *handle; + struct inode *inode = old_dentry->d_inode; + int err; + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + if (inode->i_nlink >= EXT3_LINK_MAX) + return -EMLINK; + + handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (IS_SYNC(dir)) + handle->h_sync = 1; + + inode->i_ctime = CURRENT_TIME; + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + + ext3_mark_inode_dirty(handle, inode); + err = ext3_add_nondir(handle, dentry, inode); + ext3_journal_stop(handle, dir); + return err; + } + + #define PARENT_INO(buffer) \ + ((struct ext3_dir_entry_2 *) ((char *) buffer + \ + le16_to_cpu(((struct ext3_dir_entry_2 *) buffer)->rec_len)))->inode + + /* + * Anybody can rename anything with this: the permission checks are left to the + * higher-level routines. 
+  */
+ /*
+  * ext3_rename() moves the name old_dentry (in old_dir) to new_dentry
+  * (in new_dir) under a single journal handle.
+  *
+  * If the target name has no directory entry one is added for the source
+  * inode; if it has one, that entry is re-pointed at the source inode.
+  * An existing target inode loses one link (two when a directory is
+  * being moved over it) and is put on the orphan list once its link
+  * count reaches zero.  When the source is a directory its ".." entry is
+  * rewritten to name new_dir and the link counts of both parents are
+  * adjusted.
+  *
+  * Returns 0 on success or a negative errno: -ENOENT if the source entry
+  * is missing or stale, -ENOTEMPTY if a directory would replace a
+  * non-empty one, -EIO if the source directory's first block cannot be
+  * read or its ".." does not name old_dir, -EMLINK if new_dir cannot
+  * take another link, or the error from ext3_journal_start() /
+  * ext3_add_entry().
+  */
+ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
+ 			   struct inode * new_dir,struct dentry *new_dentry)
+ {
+ 	handle_t *handle;
+ 	struct inode * old_inode, * new_inode;
+ 	struct buffer_head * old_bh, * new_bh, * dir_bh;
+ 	struct ext3_dir_entry_2 * old_de, * new_de;
+ 	int retval;
+
+ 	old_bh = new_bh = dir_bh = NULL;
+
+ 	handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+ 	if (IS_ERR(handle))
+ 		return PTR_ERR(handle);
+
+ 	if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+ 		handle->h_sync = 1;
+
+ 	old_bh = ext3_find_entry (old_dentry, &old_de);
+ 	/*
+ 	 *  Check for inode number is _not_ due to possible IO errors.
+ 	 *  We might rmdir the source, keep it as pwd of some process
+ 	 *  and merrily kill the link to whatever was created under the
+ 	 *  same name. Goodbye sticky bit ;-<
+ 	 */
+ 	old_inode = old_dentry->d_inode;
+ 	retval = -ENOENT;
+ 	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+ 		goto end_rename;
+
+ 	new_inode = new_dentry->d_inode;
+ 	new_bh = ext3_find_entry (new_dentry, &new_de);
+ 	if (new_bh) {
+ 		if (!new_inode) {
+ 			/* entry on disk but negative dentry: add a new one */
+ 			brelse (new_bh);
+ 			new_bh = NULL;
+ 		} else {
+ 			DQUOT_INIT(new_inode);
+ 		}
+ 	}
+ 	if (S_ISDIR(old_inode->i_mode)) {
+ 		if (new_inode) {
+ 			retval = -ENOTEMPTY;
+ 			if (!empty_dir (new_inode))
+ 				goto end_rename;
+ 		}
+ 		retval = -EIO;
+ 		dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
+ 		if (!dir_bh) {
+ 			/* keep ext3_bread()'s error, but never report success */
+ 			if (!retval)
+ 				retval = -EIO;
+ 			goto end_rename;
+ 		}
+ 		/*
+ 		 * ext3_bread() was handed &retval and may have overwritten
+ 		 * it, so set the error again for the ".." check below.
+ 		 */
+ 		retval = -EIO;
+ 		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+ 			goto end_rename;
+ 		retval = -EMLINK;
+ 		if (!new_inode && new_dir!=old_dir &&
+ 				new_dir->i_nlink >= EXT3_LINK_MAX)
+ 			goto end_rename;
+ 	}
+ 	if (!new_bh) {
+ 		retval = ext3_add_entry (handle, new_dentry, old_inode);
+ 		if (retval)
+ 			goto end_rename;
+ 	} else {
+ 		BUFFER_TRACE(new_bh, "get_write_access");
+ 		ext3_journal_get_write_access(handle, new_bh);
+ 		/* CPU value going to disk: cpu_to_le32 */
+ 		new_de->inode = cpu_to_le32(old_inode->i_ino);
+ 		if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
+ 					      EXT3_FEATURE_INCOMPAT_FILETYPE))
+ 			new_de->file_type = old_de->file_type;
+ 		new_dir->i_version = ++event;
+ 		BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
+ 		ext3_journal_dirty_metadata(handle, new_bh);
+ 		brelse(new_bh);
+ 		new_bh = NULL;
+ 	}
+
+ 	/*
+ 	 * Like most other Unix systems, set the ctime for inodes on a
+ 	 * rename.
+ 	 */
+ 	old_inode->i_ctime = CURRENT_TIME;
+ 	ext3_mark_inode_dirty(handle, old_inode);
+
+ 	/*
+ 	 * ok, that's it
+ 	 *
+ 	 * The new name is already in place, so a failure to remove the
+ 	 * old one is reported but does not abort the rename.
+ 	 */
+ 	retval = ext3_delete_entry(handle, old_dir, old_de, old_bh);
+ 	if (retval)
+ 		ext3_warning (old_dir->i_sb, "ext3_rename",
+ 			      "Deleting old file (%lu), %d, error=%d",
+ 			      old_dir->i_ino, old_dir->i_nlink, retval);
+
+ 	if (new_inode) {
+ 		new_inode->i_nlink--;
+ 		new_inode->i_ctime = CURRENT_TIME;
+ 	}
+ 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+ 	old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+ 	if (dir_bh) {
+ 		/* moved a directory: point its ".." at the new parent */
+ 		BUFFER_TRACE(dir_bh, "get_write_access");
+ 		ext3_journal_get_write_access(handle, dir_bh);
+ 		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+ 		BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
+ 		ext3_journal_dirty_metadata(handle, dir_bh);
+ 		old_dir->i_nlink--;
+ 		if (new_inode) {
+ 			new_inode->i_nlink--;
+ 		} else {
+ 			new_dir->i_nlink++;
+ 			new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+ 			ext3_mark_inode_dirty(handle, new_dir);
+ 		}
+ 	}
+ 	ext3_mark_inode_dirty(handle, old_dir);
+ 	if (new_inode) {
+ 		ext3_mark_inode_dirty(handle, new_inode);
+ 		if (!new_inode->i_nlink)
+ 			ext3_orphan_add(handle, new_inode);
+ 	}
+ 	retval = 0;
+
+ end_rename:
+ 	brelse (dir_bh);
+ 	brelse (old_bh);
+ 	brelse (new_bh);
+ 	ext3_journal_stop(handle, old_dir);
+ 	return retval;
+ }
+
+ /*
+  * directories can handle most operations...
+ */ + struct inode_operations ext3_dir_inode_operations = { + create: ext3_create, /* BKL held */ + lookup: ext3_lookup, /* BKL held */ + link: ext3_link, /* BKL held */ + unlink: ext3_unlink, /* BKL held */ + symlink: ext3_symlink, /* BKL held */ + mkdir: ext3_mkdir, /* BKL held */ + rmdir: ext3_rmdir, /* BKL held */ + mknod: ext3_mknod, /* BKL held */ + rename: ext3_rename, /* BKL held */ + }; diff -rc2P linux/fs/ext3/super.c linux-2.4.13/fs/ext3/super.c *** linux/fs/ext3/super.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/super.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,1743 ---- + /* + * linux/fs/ext3/super.c + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/inode.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Big-endian to little-endian byte-swapping/bitmaps by + * David S. Miller (davem@caip.rutgers.edu), 1995 + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #ifdef CONFIG_JBD_DEBUG + static int ext3_ro_after; /* Make fs read-only after this many jiffies */ + #endif + + static int ext3_load_journal(struct super_block *, struct ext3_super_block *); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, + int); + static void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync); + static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es); + static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es); + + #ifdef CONFIG_JBD_DEBUG + /* + * Debug code for turning filesystems "read-only" after a specified + * amount of time. This is for crash/recovery testing. 
+ */ + + static void make_rdonly(kdev_t dev, int *no_write) + { + if (dev) { + printk(KERN_WARNING "Turning device %s read-only\n", + bdevname(dev)); + *no_write = 0xdead0000 + dev; + } + } + + static void turn_fs_readonly(unsigned long arg) + { + struct super_block *sb = (struct super_block *)arg; + + make_rdonly(sb->s_dev, &journal_no_write[0]); + make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]); + wake_up(&EXT3_SB(sb)->ro_wait_queue); + } + + static void setup_ro_after(struct super_block *sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + init_timer(&sbi->turn_ro_timer); + if (ext3_ro_after) { + printk(KERN_DEBUG "fs will go read-only in %d jiffies\n", + ext3_ro_after); + init_waitqueue_head(&sbi->ro_wait_queue); + journal_no_write[0] = 0; + journal_no_write[1] = 0; + sbi->turn_ro_timer.function = turn_fs_readonly; + sbi->turn_ro_timer.data = (unsigned long)sb; + sbi->turn_ro_timer.expires = jiffies + ext3_ro_after; + ext3_ro_after = 0; + add_timer(&sbi->turn_ro_timer); + } + } + + static void clear_ro_after(struct super_block *sb) + { + del_timer_sync(&EXT3_SB(sb)->turn_ro_timer); + journal_no_write[0] = 0; + journal_no_write[1] = 0; + ext3_ro_after = 0; + } + #else + #define setup_ro_after(sb) do {} while (0) + #define clear_ro_after(sb) do {} while (0) + #endif + + + static char error_buf[1024]; + + /* Determine the appropriate response to ext3_error on a given filesystem */ + + static int ext3_error_behaviour(struct super_block *sb) + { + /* First check for mount-time options */ + if (test_opt (sb, ERRORS_PANIC)) + return EXT3_ERRORS_PANIC; + if (test_opt (sb, ERRORS_RO)) + return EXT3_ERRORS_RO; + if (test_opt (sb, ERRORS_CONT)) + return EXT3_ERRORS_CONTINUE; + + /* If no overrides were specified on the mount, then fall back + * to the default behaviour set in the filesystem's superblock + * on disk. 
*/
+ 	switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) {
+ 	case EXT3_ERRORS_PANIC:
+ 		return EXT3_ERRORS_PANIC;
+ 	case EXT3_ERRORS_RO:
+ 		return EXT3_ERRORS_RO;
+ 	default:
+ 		break;
+ 	}
+ 	return EXT3_ERRORS_CONTINUE;
+ }
+
+ /* Deal with the reporting of failure conditions on a filesystem such as
+  * inconsistencies detected or read IO failures.
+  *
+  * On ext2, we can store the error state of the filesystem in the
+  * superblock. That is not possible on ext3, because we may have other
+  * write ordering constraints on the superblock which prevent us from
+  * writing it out straight away; and given that the journal is about to
+  * be aborted, we can't rely on the current, or future, transactions to
+  * write out the superblock safely.
+  *
+  * We'll just use the journal_abort() error code to record an error in
+  * the journal instead. On recovery, the journal will complain about
+  * that error until we've noted it down and cleared it.
+  *
+  * The in-memory mount state and the superblock copy are always flagged
+  * with EXT3_ERROR_FS. On a read-write mount the configured error
+  * behaviour then decides whether the journal is aborted, the kernel
+  * panics, or the filesystem is flipped to read-only, and the superblock
+  * is committed synchronously.
+  */
+
+ static void ext3_handle_error(struct super_block *sb)
+ {
+ 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+ 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+ 	/* s_state is handled as a 16-bit little-endian field everywhere
+ 	 * else in this file; a 32-bit conversion would move the flag out
+ 	 * of the low 16 bits on big-endian hosts and it would be lost. */
+ 	es->s_state |= cpu_to_le16(EXT3_ERROR_FS);
+
+ 	if (sb->s_flags & MS_RDONLY)
+ 		return;
+
+ 	if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) {
+ 		EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+ 		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+ 	}
+
+ 	if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
+ 		panic ("EXT3-fs (device %s): panic forced after error\n",
+ 			bdevname(sb->s_dev));
+
+ 	if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) {
+ 		printk (KERN_CRIT "Remounting filesystem read-only\n");
+ 		sb->s_flags |= MS_RDONLY;
+ 	}
+
+ 	ext3_commit_super(sb, es, 1);
+ }
+
+ void ext3_error (struct super_block * sb, const char * function,
+ 		 const char * fmt, ...)
+ {
+ 	va_list args;
+
+ 	/* error_buf is a fixed-size static array; bound the formatted
+ 	 * text so an over-long message cannot write past its end. */
+ 	va_start (args, fmt);
+ 	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
+ 	va_end (args);
+
+ 	printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n",
+ 		bdevname(sb->s_dev), function, error_buf);
+
+ 	ext3_handle_error(sb);
+ }
+
+ /* ext3_decode_error turns a negative error code into a printable string.
+  * -EIO, -ENOMEM and -EROFS map to constant strings (for -EROFS the text
+  * depends on whether sb is NULL or its journal carries JFS_ABORT). Any
+  * other code is formatted as "error N" into nbuf when the caller
+  * supplied one; without nbuf the result is NULL. */
+ const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16])
+ {
+ 	char *errstr = NULL;
+
+ 	switch (errno) {
+ 	case -EIO:
+ 		errstr = "IO failure";
+ 		break;
+ 	case -ENOMEM:
+ 		errstr = "Out of memory";
+ 		break;
+ 	case -EROFS:
+ 		if (!sb || EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT)
+ 			errstr = "Journal has aborted";
+ 		else
+ 			errstr = "Readonly filesystem";
+ 		break;
+ 	default:
+ 		/* If the caller passed in an extra buffer for unknown
+ 		 * errors, textualise them now. Else we just return
+ 		 * NULL. */
+ 		if (nbuf) {
+ 			/* Check for truncated error codes... */
+ 			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+ 				errstr = nbuf;
+ 		}
+
+ 		break;
+ 	}
+
+ 	return errstr;
+ }
+
+ /* __ext3_std_error decodes expected errors from journaling functions
+  * automatically and invokes the appropriate error response. */
+
+ void __ext3_std_error (struct super_block * sb, const char * function,
+ 		       int errno)
+ {
+ 	char nbuf[16];
+ 	const char *errstr = ext3_decode_error(sb, errno, nbuf);
+
+ 	printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
+ 		bdevname(sb->s_dev), function, errstr);
+
+ 	ext3_handle_error(sb);
+ }
+
+ /*
+  * ext3_abort is a much stronger failure handler than ext3_error. The
+  * abort function may be used to deal with unrecoverable failures such
+  * as journal IO errors or ENOMEM at a critical moment in log management.
+  *
+  * We unconditionally force the filesystem into an ABORT|READONLY state,
+  * unless the error response on the fs has been set to panic in which
+  * case we take the easy way out and panic immediately.
+  */
+
+ void ext3_abort (struct super_block * sb, const char * function,
+ 		 const char * fmt, ...)
+ {
+ 	va_list args;
+
+ 	printk (KERN_CRIT "ext3_abort called.\n");
+
+ 	/* Bound the write: error_buf is a fixed-size static array. */
+ 	va_start (args, fmt);
+ 	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
+ 	va_end (args);
+
+ 	if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
+ 		panic ("EXT3-fs panic (device %s): %s: %s\n",
+ 		       bdevname(sb->s_dev), function, error_buf);
+
+ 	printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n",
+ 		bdevname(sb->s_dev), function, error_buf);
+
+ 	/* Already read-only: nothing further to force. */
+ 	if (sb->s_flags & MS_RDONLY)
+ 		return;
+
+ 	printk (KERN_CRIT "Remounting filesystem read-only\n");
+ 	sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
+ 	sb->s_flags |= MS_RDONLY;
+ 	sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT;
+ 	journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+ }
+
+ /* Deal with the reporting of failure conditions while running, such as
+  * inconsistencies in operation or invalid system states.
+  *
+  * Use ext3_error() for cases of invalid filesystem states, as that will
+  * record an error on disk and force a filesystem check on the next boot.
+  */
+ NORET_TYPE void ext3_panic (struct super_block * sb, const char * function,
+ 			    const char * fmt, ...)
+ {
+ 	va_list args;
+
+ 	/* Bound the write: error_buf is a fixed-size static array. */
+ 	va_start (args, fmt);
+ 	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
+ 	va_end (args);
+
+ 	/* this is to prevent panic from syncing this filesystem */
+ 	/* AKPM: is this sufficient? */
+ 	sb->s_flags |= MS_RDONLY;
+ 	panic ("EXT3-fs panic (device %s): %s: %s\n",
+ 	       bdevname(sb->s_dev), function, error_buf);
+ }
+
+ void ext3_warning (struct super_block * sb, const char * function,
+ 		   const char * fmt, ...)
+ {
+ 	va_list args;
+
+ 	/* Format into the static error_buf, bounded by its size so a long
+ 	 * message cannot overrun it, then log at warning level. */
+ 	va_start (args, fmt);
+ 	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
+ 	va_end (args);
+ 	printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n",
+ 		bdevname(sb->s_dev), function, error_buf);
+ }
+
+ /* Upgrade a "good old" revision superblock in memory to the dynamic
+  * revision: warn, then fill in the first-inode number, inode size and
+  * revision level. Does nothing if the revision is already newer. */
+ void ext3_update_dynamic_rev(struct super_block *sb)
+ {
+ 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+
+ 	if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
+ 		return;
+
+ 	ext3_warning(sb, __FUNCTION__,
+ 		     "updating to rev %d because of new feature flag, "
+ 		     "running e2fsck is recommended",
+ 		     EXT3_DYNAMIC_REV);
+
+ 	es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
+ 	es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
+ 	es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
+ 	/* leave es->s_feature_*compat flags alone */
+ 	/* es->s_uuid will be set by e2fsck if empty */
+
+ 	/*
+ 	 * The rest of the superblock fields should be zero, and if not it
+ 	 * means they are likely already in use, so leave them alone. We
+ 	 * can leave it up to e2fsck to clean up any inconsistencies there.
+ */ + } + + /* + * Open the external journal device + */ + static struct block_device *ext3_blkdev_get(kdev_t dev) + { + struct block_device *bdev; + int err = -ENODEV; + + bdev = bdget(kdev_t_to_nr(dev)); + if (bdev == NULL) + goto fail; + err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS); + if (err < 0) + goto fail; + return bdev; + + fail: + printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n", + bdevname(dev), err); + return NULL; + } + + /* + * Release the journal device + */ + static int ext3_blkdev_put(struct block_device *bdev) + { + return blkdev_put(bdev, BDEV_FS); + } + + static int ext3_blkdev_remove(struct ext3_sb_info *sbi) + { + struct block_device *bdev; + int ret = -ENODEV; + + bdev = sbi->journal_bdev; + if (bdev) { + ret = ext3_blkdev_put(bdev); + sbi->journal_bdev = 0; + } + return ret; + } + + #define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan) + + static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi) + { + struct list_head *l; + + printk(KERN_ERR "sb orphan head is %d\n", + le32_to_cpu(sbi->s_es->s_last_orphan)); + + printk(KERN_ERR "sb_info orphan list:\n"); + list_for_each(l, &sbi->s_orphan) { + struct inode *inode = orphan_list_entry(l); + printk(KERN_ERR " " + "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n", + inode->i_dev, inode->i_ino, inode, + inode->i_mode, inode->i_nlink, + le32_to_cpu(NEXT_ORPHAN(inode))); + } + } + + void ext3_put_super (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + kdev_t j_dev = sbi->s_journal->j_dev; + int i; + + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + es->s_state = le16_to_cpu(sbi->s_mount_state); + BUFFER_TRACE(sbi->s_sbh, "marking dirty"); + mark_buffer_dirty(sbi->s_sbh); + ext3_commit_super(sb, es, 1); + } + + for (i = 0; i < sbi->s_gdb_count; i++) + 
brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) + brelse(sbi->s_inode_bitmap[i]); + for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) + brelse(sbi->s_block_bitmap[i]); + brelse(sbi->s_sbh); + + /* Debugging code just in case the in-memory inode orphan list + * isn't empty. The on-disk one can be non-empty if we've + * detected an error and taken the fs readonly, but the + * in-memory list had better be clean by this point. */ + if (!list_empty(&sbi->s_orphan)) + dump_orphan_list(sb, sbi); + J_ASSERT(list_empty(&sbi->s_orphan)); + + invalidate_buffers(sb->s_dev); + if (j_dev != sb->s_dev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ + fsync_no_super(j_dev); + invalidate_buffers(j_dev); + ext3_blkdev_remove(sbi); + } + clear_ro_after(sb); + + return; + } + + static struct super_operations ext3_sops = { + read_inode: ext3_read_inode, /* BKL held */ + write_inode: ext3_write_inode, /* BKL not held. Don't need */ + dirty_inode: ext3_dirty_inode, /* BKL not held. We take it */ + put_inode: ext3_put_inode, /* BKL not held. Don't need */ + delete_inode: ext3_delete_inode, /* BKL not held. We take it */ + put_super: ext3_put_super, /* BKL held */ + write_super: ext3_write_super, /* BKL held */ + write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */ + unlockfs: ext3_unlockfs, /* BKL not held. 
We take it */ + statfs: ext3_statfs, /* BKL held */ + remount_fs: ext3_remount, /* BKL held */ + }; + + static int want_value(char *value, char *option) + { + if (!value || !*value) { + printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n", + option); + return -1; + } + return 0; + } + + static int want_null_value(char *value, char *option) + { + if (*value) { + printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n", + option, value); + return -1; + } + return 0; + } + + static int want_numeric(char *value, char *option, unsigned long *number) + { + if (want_value(value, option)) + return -1; + *number = simple_strtoul(value, &value, 0); + if (want_null_value(value, option)) + return -1; + return 0; + } + + /* + * This function has been shamelessly adapted from the msdos fs + */ + static int parse_options (char * options, unsigned long * sb_block, + struct ext3_sb_info *sbi, + unsigned long * inum, + int is_remount) + { + unsigned long *mount_options = &sbi->s_mount_opt; + uid_t *resuid = &sbi->s_resuid; + gid_t *resgid = &sbi->s_resgid; + char * this_char; + char * value; + + if (!options) + return 1; + for (this_char = strtok (options, ","); + this_char != NULL; + this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; + if (!strcmp (this_char, "bsddf")) + clear_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nouid32")) { + set_opt (*mount_options, NO_UID32); + } + else if (!strcmp (this_char, "abort")) + set_opt (*mount_options, ABORT); + else if (!strcmp (this_char, "check")) { + if (!value || !*value || !strcmp (value, "none")) + clear_opt (*mount_options, CHECK); + else + #ifdef CONFIG_EXT3_CHECK + set_opt (*mount_options, CHECK); + #else + printk(KERN_ERR + "EXT3 Check option not supported\n"); + #endif + } + else if (!strcmp (this_char, "debug")) + set_opt (*mount_options, DEBUG); + else if (!strcmp (this_char, "errors")) { + if (want_value(value, "errors")) + return 0; + if (!strcmp 
(value, "continue")) { + clear_opt (*mount_options, ERRORS_RO); + clear_opt (*mount_options, ERRORS_PANIC); + set_opt (*mount_options, ERRORS_CONT); + } + else if (!strcmp (value, "remount-ro")) { + clear_opt (*mount_options, ERRORS_CONT); + clear_opt (*mount_options, ERRORS_PANIC); + set_opt (*mount_options, ERRORS_RO); + } + else if (!strcmp (value, "panic")) { + clear_opt (*mount_options, ERRORS_CONT); + clear_opt (*mount_options, ERRORS_RO); + set_opt (*mount_options, ERRORS_PANIC); + } + else { + printk (KERN_ERR + "EXT3-fs: Invalid errors option: %s\n", + value); + return 0; + } + } + else if (!strcmp (this_char, "grpid") || + !strcmp (this_char, "bsdgroups")) + set_opt (*mount_options, GRPID); + else if (!strcmp (this_char, "minixdf")) + set_opt (*mount_options, MINIX_DF); + else if (!strcmp (this_char, "nocheck")) + clear_opt (*mount_options, CHECK); + else if (!strcmp (this_char, "nogrpid") || + !strcmp (this_char, "sysvgroups")) + clear_opt (*mount_options, GRPID); + else if (!strcmp (this_char, "resgid")) { + unsigned long v; + if (want_numeric(value, "resgid", &v)) + return 0; + *resgid = v; + } + else if (!strcmp (this_char, "resuid")) { + unsigned long v; + if (want_numeric(value, "resuid", &v)) + return 0; + *resuid = v; + } + else if (!strcmp (this_char, "sb")) { + if (want_numeric(value, "sb", sb_block)) + return 0; + } + #ifdef CONFIG_JBD_DEBUG + else if (!strcmp (this_char, "ro-after")) { + unsigned long v; + if (want_numeric(value, "ro-after", &v)) + return 0; + ext3_ro_after = v; + } + #endif + /* Silently ignore the quota options */ + else if (!strcmp (this_char, "grpquota") + || !strcmp (this_char, "noquota") + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; + else if (!strcmp (this_char, "journal")) { + /* @@@ FIXME */ + /* Eventually we will want to be able to create + a journal file here. For now, only allow the + user to specify an existing inode to be the + journal file. 
*/ + if (is_remount) { + printk(KERN_ERR "EXT3-fs: cannot specify " + "journal on remount\n"); + return 0; + } + + if (want_value(value, "journal")) + return 0; + if (!strcmp (value, "update")) + set_opt (*mount_options, UPDATE_JOURNAL); + else if (want_numeric(value, "journal", inum)) + return 0; + } + else if (!strcmp (this_char, "noload")) + set_opt (*mount_options, NOLOAD); + else if (!strcmp (this_char, "data")) { + int data_opt = 0; + + if (want_value(value, "data")) + return 0; + if (!strcmp (value, "journal")) + data_opt = EXT3_MOUNT_JOURNAL_DATA; + else if (!strcmp (value, "ordered")) + data_opt = EXT3_MOUNT_ORDERED_DATA; + else if (!strcmp (value, "writeback")) + data_opt = EXT3_MOUNT_WRITEBACK_DATA; + else { + printk (KERN_ERR + "EXT3-fs: Invalid data option: %s\n", + value); + return 0; + } + if (is_remount) { + if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) != + data_opt) { + printk(KERN_ERR + "EXT3-fs: cannot change data " + "mode on remount\n"); + return 0; + } + } else { + *mount_options &= ~EXT3_MOUNT_DATA_FLAGS; + *mount_options |= data_opt; + } + } else { + printk (KERN_ERR + "EXT3-fs: Unrecognized mount option %s\n", + this_char); + return 0; + } + } + return 1; + } + + static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, + int read_only) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + int res = 0; + + if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) { + printk (KERN_ERR "EXT3-fs warning: revision level too high, " + "forcing read-only mode\n"); + res = MS_RDONLY; + } + if (read_only) + return res; + if (!(sbi->s_mount_state & EXT3_VALID_FS)) + printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, " + "running e2fsck is recommended\n"); + else if ((sbi->s_mount_state & EXT3_ERROR_FS)) + printk (KERN_WARNING + "EXT3-fs warning: mounting fs with errors, " + "running e2fsck is recommended\n"); + else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && + le16_to_cpu(es->s_mnt_count) >= + (unsigned short) 
(__s16) le16_to_cpu(es->s_max_mnt_count)) + printk (KERN_WARNING + "EXT3-fs warning: maximal mount count reached, " + "running e2fsck is recommended\n"); + else if (le32_to_cpu(es->s_checkinterval) && + (le32_to_cpu(es->s_lastcheck) + + le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME)) + printk (KERN_WARNING + "EXT3-fs warning: checktime reached, " + "running e2fsck is recommended\n"); + #if 0 + /* @@@ We _will_ want to clear the valid bit if we find + inconsistencies, to force a fsck at reboot. But for + a plain journaled filesystem we can keep it set as + valid forever! :) */ + es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS); + #endif + if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) + es->s_max_mnt_count = + (__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); + es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); + es->s_mtime = cpu_to_le32(CURRENT_TIME); + ext3_update_dynamic_rev(sb); + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super (sb, es, 1); + if (test_opt (sb, DEBUG)) + printk (KERN_INFO + "[EXT3 FS %s, %s, bs=%lu, gc=%lu, " + "bpg=%lu, ipg=%lu, mo=%04lx]\n", + EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize, + sbi->s_groups_count, + EXT3_BLOCKS_PER_GROUP(sb), + EXT3_INODES_PER_GROUP(sb), + sbi->s_mount_opt); + printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ", + bdevname(sb->s_dev)); + if (EXT3_SB(sb)->s_journal->j_inode == NULL) { + printk("external journal on %s\n", + bdevname(EXT3_SB(sb)->s_journal->j_dev)); + } else { + printk("internal journal\n"); + } + #ifdef CONFIG_EXT3_CHECK + if (test_opt (sb, CHECK)) { + ext3_check_blocks_bitmap (sb); + ext3_check_inodes_bitmap (sb); + } + #endif + setup_ro_after(sb); + return res; + } + + static int ext3_check_descriptors (struct super_block * sb) + { + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block); + struct ext3_group_desc * gdp = NULL; + int desc_block = 0; + int i; + + 
ext3_debug ("Checking group descriptors"); + + for (i = 0; i < sbi->s_groups_count; i++) + { + if ((i % EXT3_DESC_PER_BLOCK(sb)) == 0) + gdp = (struct ext3_group_desc *) + sbi->s_group_desc[desc_block++]->b_data; + if (le32_to_cpu(gdp->bg_block_bitmap) < block || + le32_to_cpu(gdp->bg_block_bitmap) >= + block + EXT3_BLOCKS_PER_GROUP(sb)) + { + ext3_error (sb, "ext3_check_descriptors", + "Block bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_block_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_bitmap) < block || + le32_to_cpu(gdp->bg_inode_bitmap) >= + block + EXT3_BLOCKS_PER_GROUP(sb)) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode bitmap for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_bitmap)); + return 0; + } + if (le32_to_cpu(gdp->bg_inode_table) < block || + le32_to_cpu(gdp->bg_inode_table) + sbi->s_itb_per_group >= + block + EXT3_BLOCKS_PER_GROUP(sb)) + { + ext3_error (sb, "ext3_check_descriptors", + "Inode table for group %d" + " not in group (block %lu)!", + i, (unsigned long) + le32_to_cpu(gdp->bg_inode_table)); + return 0; + } + block += EXT3_BLOCKS_PER_GROUP(sb); + gdp++; + } + return 1; + } + + + /* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. 
The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext3_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. + */ + static void ext3_orphan_cleanup (struct super_block * sb, + struct ext3_super_block * es) + { + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; + if (!es->s_last_orphan) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n", + bdevname(sb->s_dev)); + sb->s_flags &= ~MS_RDONLY; + } + + if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) { + if (es->s_last_orphan) + jbd_debug(1, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + while (es->s_last_orphan) { + struct inode *inode; + + if (!(inode = + ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan); + if (inode->i_nlink) { + printk(KERN_DEBUG __FUNCTION__ + ": truncating inode %ld to %Ld bytes\n", + inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %ld to %Ld bytes\n", + inode->i_ino, inode->i_size); + ext3_truncate(inode); + nr_truncates++; + } else { + printk(KERN_DEBUG __FUNCTION__ + ": deleting unreferenced inode %ld\n", + inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %ld\n", + inode->i_ino); + nr_orphans++; + } + iput(inode); /* The delete magic happens here! */ + } + + #define PLURAL(x) (x), ((x)==1) ? 
"" : "s" + + if (nr_orphans) + printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n", + bdevname(sb->s_dev), PLURAL(nr_orphans)); + if (nr_truncates) + printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n", + bdevname(sb->s_dev), PLURAL(nr_truncates)); + sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + } + + #define log2(n) ffz(~(n)) + + /* + * Maximal file size. There is a direct, and {,double-,triple-}indirect + * block limit, and also a limit of (2^32 - 1) 512-byte sectors in i_blocks. + * We need to be 1 filesystem block less than the 2^32 sector limit. + */ + static loff_t ext3_max_size(int bits) + { + loff_t res = EXT3_NDIR_BLOCKS; + res += 1LL << (bits-2); + res += 1LL << (2*(bits-2)); + res += 1LL << (3*(bits-2)); + res <<= bits; + if (res > (512LL << 32) - (1 << bits)) + res = (512LL << 32) - (1 << bits); + return res; + } + + struct super_block * ext3_read_super (struct super_block * sb, void * data, + int silent) + { + struct buffer_head * bh; + struct ext3_super_block *es = 0; + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long sb_block = 1; + unsigned long logic_sb_block = 1; + unsigned long offset = 0; + unsigned long journal_inum = 0; + kdev_t dev = sb->s_dev; + int blocksize; + int hblock; + int db_count; + int i; + int needs_recovery; + + #ifdef CONFIG_JBD_DEBUG + ext3_ro_after = 0; + #endif + /* + * See what the current blocksize for the device is, and + * use that as the blocksize. Otherwise (or if the blocksize + * is smaller than the default) use the default. + * This is important for devices that have a hardware + * sectorsize that is larger than the default. 
+ */ + blocksize = EXT3_MIN_BLOCK_SIZE; + hblock = get_hardsect_size(dev); + if (blocksize < hblock) + blocksize = hblock; + + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; + if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) { + sb->s_dev = 0; + goto out_fail; + } + + set_blocksize (dev, blocksize); + + /* + * The ext3 superblock will not be buffer aligned for other than 1kB + * block sizes. We need to calculate the offset from buffer start. + */ + if (blocksize != EXT3_MIN_BLOCK_SIZE) { + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + } + + if (!(bh = bread (dev, logic_sb_block, blocksize))) { + printk (KERN_ERR "EXT3-fs: unable to read superblock\n"); + goto out_fail; + } + /* + * Note: s_es must be initialized as soon as possible because + * some ext3 macro-instructions depend on its value + */ + es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + sbi->s_es = es; + sb->s_magic = le16_to_cpu(es->s_magic); + if (sb->s_magic != EXT3_SUPER_MAGIC) { + if (!silent) + printk(KERN_ERR + "VFS: Can't find ext3 filesystem on dev %s.\n", + bdevname(dev)); + goto failed_mount; + } + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV && + (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) || + EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U))) + printk(KERN_WARNING + "EXT3-fs warning: feature flags set on rev 0 fs, " + "running e2fsck is recommended\n"); + /* + * Check feature flags regardless of the revision level, since we + * previously didn't change the revision level when setting the flags, + * so there is a chance incompat flags are set on a rev 0 filesystem. 
+ */ + if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) { + printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of " + "unsupported optional features (%x).\n", + bdevname(dev), i); + goto failed_mount; + } + if (!(sb->s_flags & MS_RDONLY) && + (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){ + printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of " + "unsupported optional features (%x).\n", + bdevname(dev), i); + goto failed_mount; + } + sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10; + sb->s_blocksize = 1 << sb->s_blocksize_bits; + + if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE || + sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) { + printk(KERN_ERR + "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n", + blocksize, bdevname(dev)); + goto failed_mount; + } + + sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits); + + if (sb->s_blocksize != blocksize) { + blocksize = sb->s_blocksize; + + /* + * Make sure the blocksize for the filesystem is larger + * than the hardware sectorsize for the machine. 
+ */ + if (sb->s_blocksize < hblock) { + printk(KERN_ERR "EXT3-fs: blocksize %d too small for " + "device blocksize %d.\n", blocksize, hblock); + goto failed_mount; + } + + brelse (bh); + set_blocksize (dev, sb->s_blocksize); + logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize; + offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize; + bh = bread (dev, logic_sb_block, blocksize); + if (!bh) { + printk(KERN_ERR + "EXT3-fs: Can't read superblock on 2nd try.\n"); + return NULL; + } + es = (struct ext3_super_block *)(((char *)bh->b_data) + offset); + sbi->s_es = es; + if (es->s_magic != le16_to_cpu(EXT3_SUPER_MAGIC)) { + printk (KERN_ERR + "EXT3-fs: Magic mismatch, very weird !\n"); + goto failed_mount; + } + } + + if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) { + sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE; + sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO; + } else { + sbi->s_inode_size = le16_to_cpu(es->s_inode_size); + sbi->s_first_ino = le32_to_cpu(es->s_first_ino); + if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) { + printk (KERN_ERR + "EXT3-fs: unsupported inode size: %d\n", + sbi->s_inode_size); + goto failed_mount; + } + } + sbi->s_frag_size = EXT3_MIN_FRAG_SIZE << + le32_to_cpu(es->s_log_frag_size); + if (blocksize != sbi->s_frag_size) { + printk(KERN_ERR + "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n", + sbi->s_frag_size, blocksize); + goto failed_mount; + } + sbi->s_frags_per_block = 1; + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb); + sbi->s_itb_per_group = sbi->s_inodes_per_group /sbi->s_inodes_per_block; + sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc); + sbi->s_sbh = bh; + if (sbi->s_resuid == EXT3_DEF_RESUID) + sbi->s_resuid = le16_to_cpu(es->s_def_resuid); + if (sbi->s_resgid == EXT3_DEF_RESGID) + 
sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb)); + + if (sbi->s_blocks_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3-fs: #blocks per group too big: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } + if (sbi->s_frags_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3-fs: #fragments per group too big: %lu\n", + sbi->s_frags_per_group); + goto failed_mount; + } + if (sbi->s_inodes_per_group > blocksize * 8) { + printk (KERN_ERR + "EXT3-fs: #inodes per group too big: %lu\n", + sbi->s_inodes_per_group); + goto failed_mount; + } + + sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) - + le32_to_cpu(es->s_first_data_block) + + EXT3_BLOCKS_PER_GROUP(sb) - 1) / + EXT3_BLOCKS_PER_GROUP(sb); + db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) / + EXT3_DESC_PER_BLOCK(sb); + sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), + GFP_KERNEL); + if (sbi->s_group_desc == NULL) { + printk (KERN_ERR "EXT3-fs: not enough memory\n"); + goto failed_mount; + } + for (i = 0; i < db_count; i++) { + sbi->s_group_desc[i] = bread(dev, logic_sb_block + i + 1, + blocksize); + if (!sbi->s_group_desc[i]) { + printk (KERN_ERR "EXT3-fs: " + "can't read group descriptor %d\n", i); + db_count = i; + goto failed_mount2; + } + } + if (!ext3_check_descriptors (sb)) { + printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n"); + goto failed_mount2; + } + for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) { + sbi->s_inode_bitmap_number[i] = 0; + sbi->s_inode_bitmap[i] = NULL; + sbi->s_block_bitmap_number[i] = 0; + sbi->s_block_bitmap[i] = NULL; + } + sbi->s_loaded_inode_bitmaps = 0; + sbi->s_loaded_block_bitmaps = 0; + sbi->s_gdb_count = db_count; + /* + * set up enough so that it can read an inode + */ + sb->s_op = &ext3_sops; + INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but 
open files */ + + sb->s_root = 0; + + needs_recovery = (es->s_last_orphan != 0 || + EXT3_HAS_INCOMPAT_FEATURE(sb, + EXT3_FEATURE_INCOMPAT_RECOVER)); + + /* + * The first inode we look at is the journal inode. Don't try + * root first: it may be modified in the journal! + */ + if (!test_opt(sb, NOLOAD) && + EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) { + if (ext3_load_journal(sb, es)) + goto failed_mount2; + } else if (journal_inum) { + if (ext3_create_journal(sb, es, journal_inum)) + goto failed_mount2; + } else { + if (!silent) + printk (KERN_ERR + "ext3: No journal on filesystem on %s\n", + bdevname(dev)); + goto failed_mount2; + } + + /* We have now updated the journal if required, so we can + * validate the data journaling mode. */ + switch (test_opt(sb, DATA_FLAGS)) { + case 0: + /* No mode set, assume a default based on the journal + capabilities: ORDERED_DATA if the journal can + cope, else JOURNAL_DATA */ + if (journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) + set_opt(sbi->s_mount_opt, ORDERED_DATA); + else + set_opt(sbi->s_mount_opt, JOURNAL_DATA); + break; + + case EXT3_MOUNT_ORDERED_DATA: + case EXT3_MOUNT_WRITEBACK_DATA: + if (!journal_check_available_features + (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) { + printk(KERN_ERR "EXT3-fs: Journal does not support " + "requested data journaling mode\n"); + goto failed_mount3; + } + default: + break; + } + + /* + * The journal_load will have done any necessary log recovery, + * so we can safely mount the rest of the filesystem now. 
+ */ + + sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO)); + if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) || + !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) { + if (sb->s_root) { + dput(sb->s_root); + sb->s_root = NULL; + printk(KERN_ERR + "EXT3-fs: corrupt root inode, run e2fsck\n"); + } else + printk(KERN_ERR "EXT3-fs: get root inode failed\n"); + goto failed_mount3; + } + + ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); + /* + * akpm: core read_super() calls in here with the superblock locked. + * That deadlocks, because orphan cleanup needs to lock the superblock + * in numerous places. Here we just pop the lock - it's relatively + * harmless, because we are now ready to accept write_super() requests, + * and aviro says that's the only reason for hanging onto the + * superblock lock. + */ + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; + unlock_super(sb); /* akpm: sigh */ + ext3_orphan_cleanup(sb, es); + lock_super(sb); + EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; + if (needs_recovery) + printk (KERN_INFO "EXT3-fs: recovery complete.\n"); + ext3_mark_recovery_complete(sb, es); + printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n", + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + + return sb; + + failed_mount3: + journal_destroy(sbi->s_journal); + failed_mount2: + for (i = 0; i < db_count; i++) + brelse(sbi->s_group_desc[i]); + kfree(sbi->s_group_desc); + failed_mount: + ext3_blkdev_remove(sbi); + brelse(bh); + out_fail: + return NULL; + } + + static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) + { + struct inode *journal_inode; + journal_t *journal; + + /* First, test for the existence of a valid inode on disk. Bad + * things happen if we iget() an unused inode, as the subsequent + * iput() will try to delete it. 
*/ + + journal_inode = iget(sb, journal_inum); + if (!journal_inode) { + printk(KERN_ERR "EXT3-fs: no journal found.\n"); + return NULL; + } + if (!journal_inode->i_nlink) { /* link count of zero: the inode has been deleted */ + make_bad_inode(journal_inode); /* presumably keeps the iput() below from trying to delete it - confirm */ + iput(journal_inode); + printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n"); + return NULL; + } + + jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + journal_inode, journal_inode->i_size); + if (is_bad_inode(journal_inode) || !S_ISREG(journal_inode->i_mode)) { /* only a regular file is accepted as the journal */ + printk(KERN_ERR "EXT3-fs: invalid journal inode.\n"); + iput(journal_inode); + return NULL; + } + + journal = journal_init_inode(journal_inode); + if (!journal) + iput(journal_inode); /* the inode reference is dropped only when no journal was created */ + return journal; + } + + static journal_t *ext3_get_dev_journal(struct super_block *sb, + int dev) + { /* Set up the journal held on the separate block device `dev'; returns NULL on any failure. */ + struct buffer_head * bh; + journal_t *journal; + int start; + int len; + int hblock, blocksize; + unsigned long sb_block; + unsigned long offset; + kdev_t journal_dev = to_kdev_t(dev); + struct ext3_super_block * es; + struct block_device *bdev; + + bdev = ext3_blkdev_get(journal_dev); + if (bdev == NULL) + return NULL; + + blocksize = sb->s_blocksize; + hblock = get_hardsect_size(journal_dev); + if (blocksize < hblock) { /* fs block size is smaller than the device's hard sector size */ + printk(KERN_ERR + "EXT3-fs: blocksize too small for journal device.\n"); + goto out_bdev; + } + + sb_block = EXT3_MIN_BLOCK_SIZE / blocksize; + offset = EXT3_MIN_BLOCK_SIZE % blocksize; /* block, and offset within it, of byte EXT3_MIN_BLOCK_SIZE on the device */ + set_blocksize(dev, blocksize); + if (!(bh = bread(dev, sb_block, blocksize))) { + printk(KERN_ERR "EXT3-fs: couldn't read superblock of " + "external journal\n"); + goto out_bdev; + } + + es = (struct ext3_super_block *) (((char *)bh->b_data) + offset); + if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) || + !(le32_to_cpu(es->s_feature_incompat) & + EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) { + printk(KERN_ERR "EXT3-fs: external journal has " + "bad superblock\n"); + brelse(bh); + goto out_bdev; + } + + if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { /* device UUID must equal the journal UUID recorded in this filesystem's superblock */ + printk(KERN_ERR "EXT3-fs: journal UUID
does not match\n"); + brelse(bh); + goto out_bdev; + } + + len = le32_to_cpu(es->s_blocks_count); + start = sb_block + 1; + brelse(bh); /* we're done with the superblock */ + + journal = journal_init_dev(journal_dev, sb->s_dev, + start, len, blocksize); + if (!journal) { + printk(KERN_ERR "EXT3-fs: failed to create device journal\n"); + goto out_bdev; + } + ll_rw_block(READ, 1, &journal->j_sb_buffer); + wait_on_buffer(journal->j_sb_buffer); + if (!buffer_uptodate(journal->j_sb_buffer)) { + printk(KERN_ERR "EXT3-fs: I/O error on journal device\n"); + goto out_journal; + } + if (ntohl(journal->j_superblock->s_nr_users) != 1) { + printk(KERN_ERR "EXT3-fs: External journal has more than one " + "user (unsupported) - %d\n", + ntohl(journal->j_superblock->s_nr_users)); + goto out_journal; + } + EXT3_SB(sb)->journal_bdev = bdev; + return journal; + out_journal: + journal_destroy(journal); + out_bdev: + ext3_blkdev_put(bdev); + return NULL; + } + + static int ext3_load_journal(struct super_block * sb, + struct ext3_super_block * es) + { + journal_t *journal; + int journal_inum = le32_to_cpu(es->s_journal_inum); + int journal_dev = le32_to_cpu(es->s_journal_dev); + int err; + int really_read_only; + + really_read_only = is_read_only(sb->s_dev); + + /* + * Are we loading a blank journal or performing recovery after a + * crash? For recovery, we need to check in advance whether we + * can get read-write access to the device. 
+ */ + + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) { + if (sb->s_flags & MS_RDONLY) { + printk(KERN_INFO "EXT3-fs: INFO: recovery " + "required on readonly filesystem.\n"); + if (really_read_only) { + printk(KERN_ERR "EXT3-fs: write access " + "unavailable, cannot proceed.\n"); + return -EROFS; + } + printk (KERN_INFO "EXT3-fs: write access will " + "be enabled during recovery.\n"); + } + } + + if (journal_inum && journal_dev) { + printk(KERN_ERR "EXT3-fs: filesystem has both journal " + "and inode journals!\n"); + return -EINVAL; + } + + if (journal_inum) { + if (!(journal = ext3_get_journal(sb, journal_inum))) + return -EINVAL; + } else { + if (!(journal = ext3_get_dev_journal(sb, journal_dev))) + return -EINVAL; + } + + + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { + err = journal_update_format(journal); + if (err) { + printk(KERN_ERR "EXT3-fs: error updating journal.\n"); + journal_destroy(journal); + return err; + } + } + + if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) + journal_wipe(journal, !really_read_only); + + err = journal_load(journal); + if (err) { + printk(KERN_ERR "EXT3-fs: error loading journal.\n"); + journal_destroy(journal); + return err; + } + + EXT3_SB(sb)->s_journal = journal; + ext3_clear_journal_err(sb, es); + return 0; + } + + static int ext3_create_journal(struct super_block * sb, + struct ext3_super_block * es, + int journal_inum) + { + journal_t *journal; + + if (sb->s_flags & MS_RDONLY) { + printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to " + "create journal.\n"); + return -EROFS; + } + + if (!(journal = ext3_get_journal(sb, journal_inum))) + return -EINVAL; + + printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n", + journal_inum); + + if (journal_create(journal)) { + printk(KERN_ERR "EXT3-fs: error creating journal.\n"); + journal_destroy(journal); + return -EIO; + } + + EXT3_SB(sb)->s_journal = journal; + + ext3_update_dynamic_rev(sb); + 
EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL); + + es->s_journal_inum = cpu_to_le32(journal_inum); + sb->s_dirt = 1; + + /* Make sure we flush the recovery flag to disk. */ + ext3_commit_super(sb, es, 1); + + return 0; + } + + static void ext3_commit_super (struct super_block * sb, + struct ext3_super_block * es, + int sync) + { /* Stamp es->s_wtime and mark the superblock buffer dirty; when `sync' is non-zero also submit the write and wait for it. */ + es->s_wtime = cpu_to_le32(CURRENT_TIME); + BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty"); + mark_buffer_dirty(sb->u.ext3_sb.s_sbh); + if (sync) { + ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh); + wait_on_buffer(sb->u.ext3_sb.s_sbh); /* the outcome of the write is not examined here */ + } + } + + + /* + * Have we just finished recovery? If so, and if we are mounting (or + * remounting) the filesystem readonly, then we will end up with a + * consistent fs on disk. Record that fact. + */ + static void ext3_mark_recovery_complete(struct super_block * sb, + struct ext3_super_block * es) + { + journal_flush(EXT3_SB(sb)->s_journal); /* runs unconditionally; its return value is not checked */ + if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && + sb->s_flags & MS_RDONLY) { /* '&' binds tighter than '&&', so this tests the MS_RDONLY bit */ + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + sb->s_dirt = 0; + ext3_commit_super(sb, es, 1); /* synchronous write of the cleared RECOVER flag */ + } + } + + /* + * If we are mounting (or read-write remounting) a filesystem whose journal + * has recorded an error from a previous lifetime, move that error to the + * main filesystem now.
+ */ + static void ext3_clear_journal_err(struct super_block * sb, + struct ext3_super_block * es) + { + journal_t *journal; + int j_errno; + const char *errstr; + + journal = EXT3_SB(sb)->s_journal; + + /* + * Now check for any error status which may have been recorded in the + * journal by a prior ext3_error() or ext3_abort() + */ + + j_errno = journal_errno(journal); + if (j_errno) { + char nbuf[16]; /* scratch buffer handed to ext3_decode_error() */ + + errstr = ext3_decode_error(sb, j_errno, nbuf); + ext3_warning(sb, __FUNCTION__, "Filesystem error recorded " + "from previous mount: %s", errstr); + ext3_warning(sb, __FUNCTION__, "Marking fs in need of " + "filesystem check."); + + sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS; /* flag the error in memory and in the on-disk superblock image */ + es->s_state |= cpu_to_le16(EXT3_ERROR_FS); + ext3_commit_super (sb, es, 1); /* written synchronously before the journal's own error record is cleared */ + + journal_clear_err(journal); + } + } + + /* + * Force the running and committing transactions to commit, + * and wait on the commit. Returns 0 without doing anything on a read-only mount, otherwise the result of ext3_journal_force_commit(). + */ + int ext3_force_commit(struct super_block *sb) + { + journal_t *journal; + int ret; + + if (sb->s_flags & MS_RDONLY) + return 0; + + journal = EXT3_SB(sb)->s_journal; + sb->s_dirt = 0; + lock_kernel(); /* important: lock down j_running_transaction */ + ret = ext3_journal_force_commit(journal); + unlock_kernel(); + return ret; + } + + /* + * Ext3 always journals updates to the superblock itself, so we don't + * have to propagate any other updates to the superblock on disk at this + * point. Just start an async writeback to get the buffers on their way + * to the disk. + * + * This implicitly triggers the writebehind on sync().
+ */ + + static int do_sync_supers = 0; /* when non-zero, ext3_write_super() waits for the commit it starts */ + MODULE_PARM(do_sync_supers, "i"); + MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously"); + + void ext3_write_super (struct super_block * sb) + { + tid_t target; + + if (down_trylock(&sb->s_lock) == 0) + BUG(); /* aviro detector: being able to take s_lock here means the caller did not hold it */ + sb->s_dirt = 0; + target = log_start_commit(EXT3_SB(sb)->s_journal, NULL); + + if (do_sync_supers) { + unlock_super(sb); /* s_lock is dropped for the duration of the wait and retaken afterwards */ + log_wait_commit(EXT3_SB(sb)->s_journal, target); + lock_super(sb); + } + } + + /* + * LVM calls this function before a (read-only) snapshot is created. This + * gives us a chance to flush the journal completely and mark the fs clean. The update barrier taken here is not released in this function; ext3_unlockfs() calls journal_unlock_updates(). + */ + void ext3_write_super_lockfs(struct super_block *sb) + { + sb->s_dirt = 0; + + lock_kernel(); /* 2.4.5 forgot to do this for us */ + if (!(sb->s_flags & MS_RDONLY)) { + journal_t *journal = EXT3_SB(sb)->s_journal; + + /* Now we set up the journal barrier. */ + journal_lock_updates(journal); + journal_flush(journal); + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + } + unlock_kernel(); + } + + /* + * Called by LVM after the snapshot is done. We need to reset the RECOVER + * flag here, even though the filesystem is not technically dirty yet. + */ + void ext3_unlockfs(struct super_block *sb) + { + if (!(sb->s_flags & MS_RDONLY)) { + lock_kernel(); + lock_super(sb); + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); + ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1); + unlock_super(sb); + journal_unlock_updates(EXT3_SB(sb)->s_journal); /* releases the barrier set up by journal_lock_updates() in ext3_write_super_lockfs() */ + unlock_kernel(); + } + } + + int ext3_remount (struct super_block * sb, int * flags, char * data) + { + struct ext3_super_block * es; + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long tmp; + + clear_ro_after(sb); + + /* + * Allow the "check" option to be passed as a remount option.
+ */ + if (!parse_options(data, &tmp, sbi, &tmp, 1)) + return -EINVAL; /* tmp is a throwaway: it is not read again in this function */ + + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + + es = sbi->s_es; + + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { /* the read-only state is being changed by this remount */ + if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) + return -EROFS; + + if (*flags & MS_RDONLY) { + /* + * First of all, the unconditional stuff we have to do + * to disable replay of the journal when we next remount + */ + sb->s_flags |= MS_RDONLY; + + /* + * OK, test if we are remounting a valid rw partition + * readonly, and if so set the rdonly flag and then + * mark the partition as valid again. + */ + if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) && + (sbi->s_mount_state & EXT3_VALID_FS)) + es->s_state = cpu_to_le16(sbi->s_mount_state); + + ext3_mark_recovery_complete(sb, es); + } else { + int ret; + if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, + ~EXT3_FEATURE_RO_COMPAT_SUPP))) { + printk(KERN_WARNING "EXT3-fs: %s: couldn't " + "remount RDWR because of unsupported " + "optional features (%x).\n", + bdevname(sb->s_dev), ret); + return -EROFS; + } + /* + * Mounting a RDONLY partition read-write, so reread + * and store the current valid flag. (It may have + * been changed by e2fsck since we originally mounted + * the partition.) + */ + ext3_clear_journal_err(sb, es); + sbi->s_mount_state = le16_to_cpu(es->s_state); + if (!ext3_setup_super (sb, es, 0)) + sb->s_flags &= ~MS_RDONLY; /* MS_RDONLY is cleared only when ext3_setup_super() returns 0 */ + } + } + setup_ro_after(sb); + return 0; + } + + int ext3_statfs (struct super_block * sb, struct statfs * buf) + { /* Fill *buf with filesystem statistics taken from the superblock and the free block/inode counters; always returns 0. */ + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long overhead; + int i; + + if (test_opt (sb, MINIX_DF)) + overhead = 0; /* with MINIX_DF nothing is subtracted from the block total */ + else { + /* + * Compute the overhead (FS structures) + */ + + /* + * All of the blocks before first_data_block are + * overhead + */ + overhead = le32_to_cpu(es->s_first_data_block); + + /* + * Add the overhead attributed to the superblock and + * block group descriptors.
If the sparse superblocks + * feature is turned on, then not all groups have this. + */ + for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + overhead += ext3_bg_has_super(sb, i) + + ext3_bg_num_gdb(sb, i); + + /* + * Every block group has an inode bitmap, a block + * bitmap, and an inode table. + */ + overhead += (EXT3_SB(sb)->s_groups_count * + (2 + EXT3_SB(sb)->s_itb_per_group)); + } + + buf->f_type = EXT3_SUPER_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead; + buf->f_bfree = ext3_count_free_blocks (sb); + buf->f_bavail = buf->f_bfree - le32_to_cpu(es->s_r_blocks_count); + if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count)) + buf->f_bavail = 0; /* fewer free blocks than the reserved count: report none available */ + buf->f_files = le32_to_cpu(es->s_inodes_count); + buf->f_ffree = ext3_count_free_inodes (sb); + buf->f_namelen = EXT3_NAME_LEN; + return 0; + } + + static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super); + + static int __init init_ext3_fs(void) + { /* registered as the module init hook by module_init() below */ + return register_filesystem(&ext3_fs_type); + } + + static void __exit exit_ext3_fs(void) + { /* registered as the module exit hook by module_exit() below */ + unregister_filesystem(&ext3_fs_type); + } + + EXPORT_NO_SYMBOLS; + + MODULE_LICENSE("GPL"); + module_init(init_ext3_fs) + module_exit(exit_ext3_fs) diff -rc2P linux/fs/ext3/symlink.c linux-2.4.13/fs/ext3/symlink.c *** linux/fs/ext3/symlink.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/ext3/symlink.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,39 ---- + /* + * linux/fs/ext3/symlink.c + * + * Only fast symlinks left here - the rest is done by generic code.
AV, 1999 + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/fs/minix/symlink.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * ext3 symlink handling code + */ + + #include <linux/fs.h> + #include <linux/jbd.h> + #include <linux/ext3_fs.h> + + static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen) + { /* hands the link text kept in the inode to vfs_readlink() and returns its result */ + char *s = (char *)dentry->d_inode->u.ext3_i.i_data; /* i_data is treated as the link target string */ + return vfs_readlink(dentry, buffer, buflen, s); + } + + static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd) + { /* hands the link text kept in the inode to vfs_follow_link() and returns its result */ + char *s = (char *)dentry->d_inode->u.ext3_i.i_data; + return vfs_follow_link(nd, s); + } + + struct inode_operations ext3_fast_symlink_inode_operations = { + readlink: ext3_readlink, /* BKL not held. Don't need */ + follow_link: ext3_follow_link, /* BKL not held. Don't need */ + }; diff -rc2P linux/fs/inode.c linux-2.4.13/fs/inode.c *** linux/fs/inode.c Fri Sep 28 21:03:48 2001 --- linux-2.4.13/fs/inode.c Fri Nov 9 16:57:59 2001 *************** *** 110,113 **** --- 110,114 ---- sema_init(&inode->i_sem, 1); sema_init(&inode->i_zombie, 1); + init_rwsem(&inode->i_truncate_sem); spin_lock_init(&inode->i_data.i_shared_lock); } diff -rc2P linux/fs/jbd/Makefile linux-2.4.13/fs/jbd/Makefile *** linux/fs/jbd/Makefile Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/Makefile Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,15 ---- + # + # fs/jbd/Makefile + # + # Makefile for the linux journaling routines.
+ # + + export-objs := journal.o + O_TARGET := jbd.o + + obj-y := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o + + obj-m := $(O_TARGET) + + include $(TOPDIR)/Rules.make + diff -rc2P linux/fs/jbd/checkpoint.c linux-2.4.13/fs/jbd/checkpoint.c *** linux/fs/jbd/checkpoint.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/checkpoint.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,608 ---- + /* + * linux/fs/jbd/checkpoint.c + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1999 Red Hat Software --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Checkpoint routines for the generic filesystem journaling code. + * Part of the ext2fs journaling system. + * + * Checkpointing is the process of ensuring that a section of the log is + * committed fully to disk, so that that portion of the log can be + * reused. + */ + + #include <linux/sched.h> + #include <linux/fs.h> + #include <linux/jbd.h> + #include <linux/errno.h> + #include <linux/slab.h> + #include <linux/locks.h> + + extern spinlock_t journal_datalist_lock; + + /* + * Unlink a buffer from a transaction. + * + * Called with journal_datalist_lock held. + */ + + static inline void __buffer_unlink(struct journal_head *jh) + { + transaction_t *transaction; + + transaction = jh->b_cp_transaction; + jh->b_cp_transaction = NULL; + + jh->b_cpnext->b_cpprev = jh->b_cpprev; /* splice jh out of the circular b_cpnext/b_cpprev list */ + jh->b_cpprev->b_cpnext = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) /* jh was the list head: advance the head */ + transaction->t_checkpoint_list = jh->b_cpnext; + if (transaction->t_checkpoint_list == jh) /* head is still jh, so jh was the only entry: the list is now empty */ + transaction->t_checkpoint_list = NULL; + } + + /* + * Try to release a checkpointed buffer from its transaction. + * Returns 1 if we released it.
+ * Requires journal_datalist_lock. Returns 0 and leaves the buffer alone if it is on a journal list, locked or dirty. + */ + static int __try_to_free_cp_buf(struct journal_head *jh) + { + int ret = 0; + struct buffer_head *bh = jh2bh(jh); + + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { + JBUFFER_TRACE(jh, "remove from checkpoint list"); + __journal_remove_checkpoint(jh); + __journal_remove_journal_head(bh); + BUFFER_TRACE(bh, "release"); + /* BUF_LOCKED -> BUF_CLEAN (fwiw) */ + refile_buffer(bh); + __brelse(bh); + ret = 1; + } + return ret; + } + + /* + * log_wait_for_space: wait until there is space in the journal. + * + * Called with the journal already locked, but it will be unlocked if we have + * to wait for a checkpoint to free up some space in the log. + */ + + void log_wait_for_space(journal_t *journal, int nblocks) + { + while (log_space_left(journal) < nblocks) { + if (journal->j_flags & JFS_ABORT) + return; /* aborted journal: stop waiting even though space is still short */ + unlock_journal(journal); /* the journal lock is released while sleeping on j_checkpoint_sem, then retaken */ + down(&journal->j_checkpoint_sem); + lock_journal(journal); + + /* Test again, another process may have checkpointed + * while we were waiting for the checkpoint lock */ + if (log_space_left(journal) < nblocks) { + log_do_checkpoint(journal, nblocks); /* result ignored: the loop condition re-checks the space */ + } + up(&journal->j_checkpoint_sem); + } + } + + /* + * Clean up a transaction's checkpoint list. + * + * We wait for any pending IO to complete and make sure any clean + * buffers are removed from the transaction. + * + * Return 1 if we performed any actions which might have destroyed the + * checkpoint. (journal_remove_checkpoint() deletes the transaction when + * the last checkpoint buffer is cleansed) + * + * Called with the journal locked. + * Called with journal_datalist_lock held.
+ */ + static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) + { + struct journal_head *jh, *next_jh, *last_jh; + struct buffer_head *bh; + int ret = 0; + + assert_spin_locked(&journal_datalist_lock); + jh = transaction->t_checkpoint_list; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + next_jh = jh; + do { + jh = next_jh; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + atomic_inc(&bh->b_count); /* hold a reference across the wait; dropped by __brelse() below */ + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + goto out_return_1; + } + + if (jh->b_transaction != NULL) { + transaction_t *transaction = jh->b_transaction; /* NOTE: shadows the function parameter of the same name */ + tid_t tid = transaction->t_tid; /* tid is saved before the locks are dropped; only tid is used for the wait */ + + spin_unlock(&journal_datalist_lock); + log_start_commit(journal, transaction); + unlock_journal(journal); + log_wait_commit(journal, tid); + goto out_return_1; + } + + /* + * We used to test for (jh->b_list != BUF_CLEAN) here. + * But unmap_underlying_metadata() can place buffer onto + * BUF_CLEAN. Since refile_buffer() no longer takes buffers + * off checkpoint lists, we cope with it here + */ + /* + * AKPM: I think the buffer_jdirty test is redundant - it + * shouldn't have NULL b_transaction?
+ */ + next_jh = jh->b_cpnext; /* read the successor before jh may be taken off the list */ + if (!buffer_dirty(bh) && !buffer_jdirty(bh)) { + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + __journal_remove_journal_head(bh); + refile_buffer(bh); + __brelse(bh); + ret = 1; + } + + jh = next_jh; /* NOTE(review): jh is advanced before the test below, so the walk stops on reaching last_jh without examining it - confirm this is intended */ + } while (jh != last_jh); + + return ret; + out_return_1: + lock_journal(journal); /* both locks were dropped before jumping here; retake the journal lock, then the datalist lock */ + spin_lock(&journal_datalist_lock); + return 1; + } + + #define NR_BATCH 64 /* __flush_buffer() flushes the batch once this many buffers are queued */ + + static void __flush_batch(struct buffer_head **bhs, int *batch_count) + { /* Submit the queued buffers for writing with the datalist lock dropped, then clear BH_JWrite on each, release them and reset the count. */ + int i; + + spin_unlock(&journal_datalist_lock); + ll_rw_block(WRITE, *batch_count, bhs); + run_task_queue(&tq_disk); + spin_lock(&journal_datalist_lock); + for (i = 0; i < *batch_count; i++) { + struct buffer_head *bh = bhs[i]; + clear_bit(BH_JWrite, &bh->b_state); + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); /* drops the reference taken when the buffer was queued in __flush_buffer() */ + } + *batch_count = 0; + } + + /* + * Try to flush one buffer from the checkpoint list to disk. + * + * Return 1 if something happened which requires us to abort the current + * scan of the checkpoint list. + * + * Called with journal_datalist_lock held. + */ + static int __flush_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count, + int *drop_count) + { + struct buffer_head *bh = jh2bh(jh); + int ret = 0; + + if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { + J_ASSERT_JH(jh, jh->b_transaction == NULL); + + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal lock. + * We cannot afford to let the transaction logic start + * messing around with this buffer before we write it to + * disk, as that would break recoverability.
+ */ + BUFFER_TRACE(bh, "queue"); + atomic_inc(&bh->b_count); /* reference held while the buffer sits in bhs[]; released in __flush_batch() */ + J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state)); + set_bit(BH_JWrite, &bh->b_state); + bhs[*batch_count] = bh; + (*batch_count)++; + if (*batch_count == NR_BATCH) { + __flush_batch(bhs, batch_count); + ret = 1; /* __flush_batch() dropped the datalist lock, so the caller's scan must restart */ + } + } else { + int last_buffer = 0; + if (jh->b_cpnext == jh) { + /* We may be about to drop the transaction. Tell the + * caller that the lists have changed. + */ + last_buffer = 1; + } + if (__try_to_free_cp_buf(jh)) { + (*drop_count)++; + ret = last_buffer; /* restart is requested only if jh was the sole entry on its list */ + } + } + return ret; + } + + + /* + * Perform an actual checkpoint. We don't write out only enough to + * satisfy the current blocked requests: rather we submit a reasonably + * sized chunk of the outstanding data to disk at once for + * efficiency. log_wait_for_space() will retry if we didn't free enough. + * + * However, we _do_ take into account the amount requested so that once + * the IO has been queued, we can return as soon as enough of it has + * completed to disk. + * + * The journal should be locked before calling this function. Returns a negative value if cleanup_journal_tail() fails, otherwise 0. + */ + + /* @@@ `nblocks' is unused. Should it be used? */ + int log_do_checkpoint (journal_t *journal, int nblocks) + { + transaction_t *transaction, *last_transaction, *next_transaction; + int result; + int target; + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; + + jbd_debug(1, "Start checkpoint\n"); + + /* + * First thing: if there are any transactions in the log which + * don't need checkpointing, just eliminate them from the + * journal straight away. + */ + result = cleanup_journal_tail(journal); + jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + if (result <= 0) + return result; /* 0: the tail was advanced; negative: error. Only a result of 1 continues below */ + + /* + * OK, we need to start writing disk blocks. Try to free up a + * quarter of the log in a single checkpoint if we can. + */ + /* + * AKPM: check this code. I had a feeling a while back that it + * degenerates into a busy loop at unmount time.
+ */ + target = (journal->j_last - journal->j_first) / 4; /* NOTE: target is computed but not read again in this function */ + + spin_lock(&journal_datalist_lock); + repeat: + transaction = journal->j_checkpoint_transactions; + if (transaction == NULL) + goto done; + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + + do { + struct journal_head *jh, *last_jh, *next_jh; + int drop_count = 0; + int cleanup_ret, retry = 0; + + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + jh = transaction->t_checkpoint_list; + last_jh = jh->b_cpprev; /* assumes t_checkpoint_list is non-NULL here - TODO confirm */ + next_jh = jh; + do { + jh = next_jh; + next_jh = jh->b_cpnext; + retry = __flush_buffer(journal, jh, bhs, &batch_count, + &drop_count); + } while (jh != last_jh && !retry); + if (batch_count) { + __flush_batch(bhs, &batch_count); /* write out the partial batch, then rescan from the top */ + goto repeat; + } + if (retry) + goto repeat; + /* + * We have walked the whole transaction list without + * finding anything to write to disk. We had better be + * able to make some progress or we are in trouble. + */ + cleanup_ret = __cleanup_transaction(journal, transaction); + J_ASSERT(drop_count != 0 || cleanup_ret != 0); + goto repeat; /* __cleanup may have dropped lock */ + } while (transaction != last_transaction); /* not reached: every path through the loop body ends in goto repeat */ + + done: + spin_unlock(&journal_datalist_lock); + result = cleanup_journal_tail(journal); + if (result < 0) + return result; + + return 0; + } + + /* + * Check the list of checkpoint transactions for the journal to see if + * we have already got rid of any since the last update of the log tail + * in the journal superblock. If so, we can instantly roll the + * superblock forward to remove those transactions from the log. + * + * Return <0 on error, 0 on success, 1 if there was nothing to clean up. + * + * Called with the journal lock held. + * + * This is the only part of the journaling code which really needs to be + * aware of transaction aborts.
Checkpointing involves writing to the + * main filesystem area rather than to the journal, so it can proceed + * even in abort state, but we must not update the journal superblock if + * we have an abort error outstanding. + */ + + int cleanup_journal_tail(journal_t *journal) + { + transaction_t * transaction; + tid_t first_tid; + unsigned long blocknr, freed; + + /* OK, work out the oldest transaction remaining in the log, and + * the log block it starts at. + * + * If the log is now empty, we need to work out which is the + * next transaction ID we will write, and where it will + * start. */ + + /* j_checkpoint_transactions needs locking */ + spin_lock(&journal_datalist_lock); + transaction = journal->j_checkpoint_transactions; + if (transaction) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_committing_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = transaction->t_log_start; + } else if ((transaction = journal->j_running_transaction) != NULL) { + first_tid = transaction->t_tid; + blocknr = journal->j_head; + } else { + first_tid = journal->j_transaction_sequence; + blocknr = journal->j_head; + } + spin_unlock(&journal_datalist_lock); + J_ASSERT (blocknr != 0); + + /* If the oldest pinned transaction is at the tail of the log + already then there's not much we can do right now. */ + if (journal->j_tail_sequence == first_tid) + return 1; + + /* OK, update the superblock to recover the freed space. + * Physical blocks come first: have we wrapped beyond the end of + * the log?
*/ + freed = blocknr - journal->j_tail; + if (blocknr < journal->j_tail) /* unsigned difference wrapped: correct it by the length of the log */ + freed = freed + journal->j_last - journal->j_first; + + jbd_debug(1, + "Cleaning journal tail from %d to %d (offset %lu), " + "freeing %lu\n", + journal->j_tail_sequence, first_tid, blocknr, freed); + + journal->j_free += freed; + journal->j_tail_sequence = first_tid; + journal->j_tail = blocknr; + if (!(journal->j_flags & JFS_ABORT)) /* the in-memory tail is advanced regardless; only the superblock write is skipped on abort */ + journal_update_superblock(journal, 1); + return 0; + } + + + /* Checkpoint list management */ + + /* + * __journal_clean_checkpoint_list + * + * Find all the written-back checkpoint buffers in the journal and release them. + * + * Called with the journal locked. + * Called with journal_datalist_lock held. + * Returns number of buffers reaped (for debug) + */ + + int __journal_clean_checkpoint_list(journal_t *journal) + { + transaction_t *transaction, *last_transaction, *next_transaction; + int ret = 0; + + transaction = journal->j_checkpoint_transactions; + if (transaction == NULL) + goto out; + + last_transaction = transaction->t_cpprev; + next_transaction = transaction; + do { + struct journal_head *jh; + + transaction = next_transaction; + next_transaction = transaction->t_cpnext; + jh = transaction->t_checkpoint_list; + if (jh) { + struct journal_head *last_jh = jh->b_cpprev; + struct journal_head *next_jh = jh; + do { + /* Sample the successor first: a successful free unlinks jh. */ + + jh = next_jh; + next_jh = jh->b_cpnext; + /* __try_to_free_cp_buf() returns 1 if it released the buffer. */ + ret += __try_to_free_cp_buf(jh); + } while (jh != last_jh); + } + } while (transaction != last_transaction); + out: + return ret; + } + + /* + * journal_remove_checkpoint: called after a buffer has been committed + * to disk (either by being write-back flushed to disk, or being + * committed to the log). + * + * We cannot safely clean a transaction out of the log until all of the + * buffer updates committed in that transaction have safely been stored + * elsewhere on disk.
To achieve this, all of the buffers in a + * transaction need to be maintained on the transaction's checkpoint + * list until they have been rewritten, at which point this function is + * called to remove the buffer from the existing transaction's + * checkpoint list. + * + * This function is called with the journal locked. + * This function is called with journal_datalist_lock held. + */ + + void __journal_remove_checkpoint(struct journal_head *jh) + { + transaction_t *transaction; + journal_t *journal; + + JBUFFER_TRACE(jh, "entry"); + + if ((transaction = jh->b_cp_transaction) == NULL) { + JBUFFER_TRACE(jh, "not on transaction"); + goto out; + } + + journal = transaction->t_journal; + + __buffer_unlink(jh); + + if (transaction->t_checkpoint_list != NULL) + goto out; /* other buffers are still on this transaction's checkpoint list */ + JBUFFER_TRACE(jh, "transaction has no more buffers"); + + /* There is one special case to worry about: if we have just + pulled the buffer off a committing transaction's forget list, + then even if the checkpoint list is empty, the transaction + obviously cannot be dropped! */ + + if (transaction == journal->j_committing_transaction) { + JBUFFER_TRACE(jh, "belongs to committing transaction"); + goto out; + } + + /* OK, that was the last buffer for the transaction: we can now + safely remove this transaction from the log */ + + __journal_drop_transaction(journal, transaction); /* the transaction is kfree()d in there and must not be touched afterwards */ + + /* Just in case anybody was waiting for more transactions to be + checkpointed... */ + wake_up(&journal->j_wait_logspace); + out: + JBUFFER_TRACE(jh, "exit"); + } + + void journal_remove_checkpoint(struct journal_head *jh) + { /* locked wrapper: takes journal_datalist_lock around __journal_remove_checkpoint() */ + spin_lock(&journal_datalist_lock); + __journal_remove_checkpoint(jh); + spin_unlock(&journal_datalist_lock); + } + + /* + * journal_insert_checkpoint: put a committed buffer onto a checkpoint + * list so that we know when it is safe to clean the transaction out of + * the log. + * + * Called with the journal locked. + * Called with journal_datalist_lock held.
+ */ + void __journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) + { + JBUFFER_TRACE(jh, "entry"); + J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh))); + J_ASSERT_JH(jh, jh->b_cp_transaction == NULL); /* a buffer may be on at most one checkpoint list */ + + assert_spin_locked(&journal_datalist_lock); + jh->b_cp_transaction = transaction; + + if (!transaction->t_checkpoint_list) { + jh->b_cpnext = jh->b_cpprev = jh; /* first entry: a one-element circular list */ + } else { /* link jh in between the old tail and the current head */ + jh->b_cpnext = transaction->t_checkpoint_list; + jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev; + jh->b_cpprev->b_cpnext = jh; + jh->b_cpnext->b_cpprev = jh; + } + transaction->t_checkpoint_list = jh; /* the newly inserted buffer becomes the list head */ + } + + void journal_insert_checkpoint(struct journal_head *jh, + transaction_t *transaction) + { /* locked wrapper: takes journal_datalist_lock around __journal_insert_checkpoint() */ + spin_lock(&journal_datalist_lock); + __journal_insert_checkpoint(jh, transaction); + spin_unlock(&journal_datalist_lock); + } + + /* + * We've finished with this transaction structure: adios... + * + * The transaction must have no links except for the checkpoint by this + * point. + * + * Called with the journal locked. + * Called with journal_datalist_lock held.
+ */ + + void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) + { + assert_spin_locked(&journal_datalist_lock); + if (transaction->t_cpnext) { + transaction->t_cpnext->t_cpprev = transaction->t_cpprev; + transaction->t_cpprev->t_cpnext = transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = + transaction->t_cpnext; + if (journal->j_checkpoint_transactions == transaction) + journal->j_checkpoint_transactions = NULL; + } + + J_ASSERT (transaction->t_ilist == NULL); + J_ASSERT (transaction->t_buffers == NULL); + J_ASSERT (transaction->t_sync_datalist == NULL); + J_ASSERT (transaction->t_async_datalist == NULL); + J_ASSERT (transaction->t_forget == NULL); + J_ASSERT (transaction->t_iobuf_list == NULL); + J_ASSERT (transaction->t_shadow_list == NULL); + J_ASSERT (transaction->t_log_list == NULL); + J_ASSERT (transaction->t_checkpoint_list == NULL); + J_ASSERT (transaction->t_updates == 0); + + J_ASSERT (transaction->t_journal->j_committing_transaction != + transaction); + + jbd_debug (1, "Dropping transaction %d, all done\n", + transaction->t_tid); + kfree (transaction); + } + diff -rc2P linux/fs/jbd/commit.c linux-2.4.13/fs/jbd/commit.c *** linux/fs/jbd/commit.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/commit.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,701 ---- + /* + * linux/fs/commit.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal commit routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. 
+ */ + + #include <linux/sched.h> + #include <linux/fs.h> + #include <linux/jbd.h> + #include <linux/errno.h> + #include <linux/slab.h> + #include <linux/locks.h> + #include <linux/smp_lock.h> + + extern spinlock_t journal_datalist_lock; + + /* + * Default IO end handler for temporary BJ_IO buffer_heads. + */ + static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) + { + BUFFER_TRACE(bh, ""); + mark_buffer_uptodate(bh, uptodate); + unlock_buffer(bh); + } + + /* + * journal_commit_transaction + * + * The primary function for committing a transaction to the log. This + * function is called by the journal thread to begin a complete commit. + */ + void journal_commit_transaction(journal_t *journal) + { + transaction_t *commit_transaction; + struct journal_head *jh, *new_jh, *descriptor; + struct journal_head *next_jh, *last_jh; + struct buffer_head *wbuf[64]; + int bufs; + int flags; + int blocknr; + char *tagp = NULL; + journal_header_t *header; + journal_block_tag_t *tag = NULL; + int space_left = 0; + int first_tag = 0; + int tag_flag; + int i; + + /* + * First job: lock down the current transaction and wait for + * all outstanding updates to complete. + */ + + lock_journal(journal); /* Protect journal->j_running_transaction */ + + #ifdef COMMIT_STATS + spin_lock(&journal_datalist_lock); + summarise_journal_usage(journal); + spin_unlock(&journal_datalist_lock); + #endif + + lock_kernel(); + + J_ASSERT (journal->j_running_transaction != NULL); + J_ASSERT (journal->j_committing_transaction == NULL); + + commit_transaction = journal->j_running_transaction; + J_ASSERT (commit_transaction->t_state == T_RUNNING); + + jbd_debug (1, "JBD: starting commit of transaction %d\n", + commit_transaction->t_tid); + + commit_transaction->t_state = T_LOCKED; + while (commit_transaction->t_updates != 0) { + unlock_journal(journal); + sleep_on(&journal->j_wait_updates); + lock_journal(journal); + } + + J_ASSERT (commit_transaction->t_outstanding_credits <= + journal->j_max_transaction_buffers); + + /* Do we need to erase the effects of a prior journal_flush?
*/ + if (journal->j_flags & JFS_FLUSHED) { + jbd_debug(3, "super block updated\n"); + journal_update_superblock(journal, 1); + } else { + jbd_debug(3, "superblock not updated\n"); + } + + /* + * First thing we are allowed to do is to discard any remaining + * BJ_Reserved buffers. Note, it is _not_ permissible to assume + * that there are no such buffers: if a large filesystem + * operation like a truncate needs to split itself over multiple + * transactions, then it may try to do a journal_restart() while + * there are still BJ_Reserved buffers outstanding. These must + * be released cleanly from the current transaction. + * + * In this case, the filesystem must still reserve write access + * again before modifying the buffer in the new transaction, but + * we do not require it to remember exactly which old buffers it + * has reserved. This is consistent with the existing behaviour + * that multiple journal_get_write_access() calls to the same + * buffer are perfectly permissible. + */ + + while (commit_transaction->t_reserved_list) { + jh = commit_transaction->t_reserved_list; + JBUFFER_TRACE(jh, "reserved, unused: refile"); + journal_refile_buffer(jh); + } + + /* + * Now try to drop any written-back buffers from the journal's + * checkpoint lists. We do this *before* commit because it potentially + * frees some memory + */ + spin_lock(&journal_datalist_lock); + __journal_clean_checkpoint_list(journal); + spin_unlock(&journal_datalist_lock); + + /* First part of the commit: force the revoke list out to disk. + * The revoke code generates its own metadata blocks on disk for this. + * + * It is important that we do this while the transaction is + * still locked. Generating the revoke records should not + * generate any IO stalls, so this should be quick; and doing + * the work while we have the transaction locked means that we + * only ever have to maintain the revoke list for one + * transaction at a time.
+ */ + + jbd_debug (3, "JBD: commit phase 1\n"); + + journal_write_revoke_records(journal, commit_transaction); + + /* + * Now that we have built the revoke records, we can start + * reusing the revoke list for a new running transaction. We + * can now safely start committing the old transaction: time to + * get a new running transaction for incoming filesystem updates + */ + + commit_transaction->t_state = T_FLUSH; + + wake_up(&journal->j_wait_transaction_locked); + + journal->j_committing_transaction = commit_transaction; + journal->j_running_transaction = NULL; + + commit_transaction->t_log_start = journal->j_head; + + unlock_kernel(); + + jbd_debug (3, "JBD: commit phase 2\n"); + + /* + * Now start flushing things to disk, in the order they appear + * on the transaction lists. Data blocks go first. + */ + + /* + * Whenever we unlock the journal and sleep, things can get added + * onto ->t_datalist, so we have to keep looping back to write_out_data + * until we *know* that the list is empty. + */ + write_out_data: + + /* + * Cleanup any flushed data buffers from the data list. Even in + * abort mode, we want to flush this out as soon as possible. + * + * We take journal_datalist_lock to protect the lists from + * journal_try_to_free_buffers(). 
+ */ + spin_lock(&journal_datalist_lock); + + write_out_data_locked: + bufs = 0; + next_jh = commit_transaction->t_sync_datalist; + if (next_jh == NULL) + goto sync_datalist_empty; + last_jh = next_jh->b_tprev; + + do { + struct buffer_head *bh; + + jh = next_jh; + next_jh = jh->b_tnext; + bh = jh2bh(jh); + if (!buffer_locked(bh)) { + if (buffer_dirty(bh)) { + BUFFER_TRACE(bh, "start journal writeout"); + atomic_inc(&bh->b_count); + wbuf[bufs++] = bh; + } else { + BUFFER_TRACE(bh, "writeout complete: unfile"); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + __journal_remove_journal_head(bh); + refile_buffer(bh); + __brelse(bh); + } + } + if (bufs == ARRAY_SIZE(wbuf)) { + /* + * Major speedup: start here on the next scan + */ + J_ASSERT(commit_transaction->t_sync_datalist != 0); + commit_transaction->t_sync_datalist = jh; + break; + } + } while (jh != last_jh); + + if (bufs || current->need_resched) { + jbd_debug(2, "submit %d writes\n", bufs); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + if (bufs) + ll_rw_block(WRITE, bufs, wbuf); + if (current->need_resched) + schedule(); + journal_brelse_array(wbuf, bufs); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + if (bufs) + goto write_out_data_locked; + } + + /* + * Wait for all previously submitted IO on the data list to complete. + */ + jh = commit_transaction->t_sync_datalist; + if (jh == NULL) + goto sync_datalist_empty; + + do { + struct buffer_head *bh; + jh = jh->b_tprev; /* Wait on the last written */ + bh = jh2bh(jh); + if (buffer_locked(bh)) { + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + wait_on_buffer(bh); + /* the journal_head may have been removed now */ + lock_journal(journal); + goto write_out_data; + } else if (buffer_dirty(bh)) { + goto write_out_data_locked; + } + } while (jh != commit_transaction->t_sync_datalist); + goto write_out_data_locked; + + sync_datalist_empty: + /* + * Wait for all the async writepage data. 
As they become unlocked + * in end_buffer_io_async(), the only place where they can be + * reaped is in try_to_free_buffers(), and we're locked against + * that. + */ + while ((jh = commit_transaction->t_async_datalist)) { + struct buffer_head *bh = jh2bh(jh); + if (buffer_locked(bh)) { + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + wait_on_buffer(bh); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + continue; /* List may have changed */ + } + if (jh->b_next_transaction) { + /* + * For writepage() buffers in journalled data mode: a + * later transaction may want the buffer for "metadata" + */ + __journal_refile_buffer(jh); + } else { + BUFFER_TRACE(bh, "finished async writeout: unfile"); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + __journal_remove_journal_head(bh); + BUFFER_TRACE(bh, "finished async writeout: refile"); + /* It can sometimes be on BUF_LOCKED due to migration + * from syncdata to asyncdata */ + if (bh->b_list != BUF_CLEAN) + refile_buffer(bh); + __brelse(bh); + } + } + spin_unlock(&journal_datalist_lock); + + /* + * If we found any dirty or locked buffers, then we should have + * looped back up to the write_out_data label. If there weren't + * any then journal_clean_data_list should have wiped the list + * clean by now, so check that it is in fact empty. + */ + J_ASSERT (commit_transaction->t_sync_datalist == NULL); + J_ASSERT (commit_transaction->t_async_datalist == NULL); + + jbd_debug (3, "JBD: commit phase 3\n"); + + /* + * Way to go: we have now written out all of the data for a + * transaction! Now comes the tricky part: we need to write out + * metadata. Loop over the transaction's entire buffer list: + */ + commit_transaction->t_state = T_COMMIT; + + descriptor = 0; + bufs = 0; + while (commit_transaction->t_buffers) { + + /* Find the next buffer to be journaled... 
*/ + + jh = commit_transaction->t_buffers; + + /* If we're in abort mode, we just un-journal the buffer and + release it for background writing. */ + + if (is_journal_aborted(journal)) { + JBUFFER_TRACE(jh, "journal is aborting: refile"); + journal_refile_buffer(jh); + /* If that was the last one, we need to clean up + * any descriptor buffers which may have been + * already allocated, even if we are now + * aborting. */ + if (!commit_transaction->t_buffers) + goto start_journal_io; + continue; + } + + /* Make sure we have a descriptor block in which to + record the metadata buffer. */ + + if (!descriptor) { + struct buffer_head *bh; + + J_ASSERT (bufs == 0); + + jbd_debug(4, "JBD: get descriptor\n"); + + descriptor = journal_get_descriptor_buffer(journal); + bh = jh2bh(descriptor); + jbd_debug(4, "JBD: got buffer %ld (%p)\n", + bh->b_blocknr, bh->b_data); + header = (journal_header_t *)&bh->b_data[0]; + header->h_magic = htonl(JFS_MAGIC_NUMBER); + header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK); + header->h_sequence = htonl(commit_transaction->t_tid); + + tagp = &bh->b_data[sizeof(journal_header_t)]; + space_left = bh->b_size - sizeof(journal_header_t); + first_tag = 1; + set_bit(BH_JWrite, &bh->b_state); + wbuf[bufs++] = bh; + + /* Record it so that we can wait for IO + completion later */ + BUFFER_TRACE(bh, "ph3: file as descriptor"); + journal_file_buffer(descriptor, commit_transaction, + BJ_LogCtl); + } + + /* Where is the buffer to be written? */ + + blocknr = journal_next_log_block(journal); + + /* Bump b_count to prevent truncate from stumbling over + the shadowed buffer! @@@ This can go if we ever get + rid of the BJ_IO/BJ_Shadow pairing of buffers. */ + atomic_inc(&jh2bh(jh)->b_count); + + /* Make a temporary IO buffer with which to write it out + (this will requeue both the metadata buffer and the + temporary IO buffer). 
new_bh goes on BJ_IO*/ + + set_bit(BH_JWrite, &jh2bh(jh)->b_state); + /* + * akpm: journal_write_metadata_buffer() sets + * new_bh->b_transaction to commit_transaction. + * We need to clean this up before we release new_bh + * (which is of type BJ_IO) + */ + JBUFFER_TRACE(jh, "ph3: write metadata"); + flags = journal_write_metadata_buffer(commit_transaction, + jh, &new_jh, blocknr); + set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); + wbuf[bufs++] = jh2bh(new_jh); + + /* Record the new block's tag in the current descriptor + buffer */ + + tag_flag = 0; + if (flags & 1) + tag_flag |= JFS_FLAG_ESCAPE; + if (!first_tag) + tag_flag |= JFS_FLAG_SAME_UUID; + + tag = (journal_block_tag_t *) tagp; + tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr); + tag->t_flags = htonl(tag_flag); + tagp += sizeof(journal_block_tag_t); + space_left -= sizeof(journal_block_tag_t); + + if (first_tag) { + memcpy (tagp, journal->j_uuid, 16); + tagp += 16; + space_left -= 16; + first_tag = 0; + } + + /* If there's no more to do, or if the descriptor is full, + let the IO rip! */ + + if (bufs == ARRAY_SIZE(wbuf) || + commit_transaction->t_buffers == NULL || + space_left < sizeof(journal_block_tag_t) + 16) { + + jbd_debug(4, "JBD: Submit %d IOs\n", bufs); + + /* Write an end-of-descriptor marker before + submitting the IOs. "tag" still points to + the last tag we set up. */ + + tag->t_flags |= htonl(JFS_FLAG_LAST_TAG); + + start_journal_io: + unlock_journal(journal); + for (i=0; i<bufs; i++) { + struct buffer_head *bh = wbuf[i]; + set_bit(BH_Lock, &bh->b_state); + clear_bit(BH_Dirty, &bh->b_state); + bh->b_end_io = journal_end_buffer_io_sync; + submit_bh(WRITE, bh); + } + if (current->need_resched) + schedule(); + lock_journal(journal); + + /* Force a new descriptor to be generated next + time round the loop. */ + descriptor = NULL; + bufs = 0; + } + } + + /* Lo and behold: we have just managed to send a transaction to + the log. Before we can commit it, wait for the IO so far to + complete.
Control buffers being written are on the + transaction's t_log_list queue, and metadata buffers are on + the t_iobuf_list queue. + + Wait for the transactions in reverse order. That way we are + less likely to be woken up until all IOs have completed, and + so we incur less scheduling load. + */ + + jbd_debug(3, "JBD: commit phase 4\n"); + + /* akpm: these are BJ_IO, and journal_datalist_lock is not needed */ + wait_for_iobuf: + while (commit_transaction->t_iobuf_list != NULL) { + struct buffer_head *bh; + jh = commit_transaction->t_iobuf_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + unlock_journal(journal); + wait_on_buffer(bh); + lock_journal(journal); + goto wait_for_iobuf; + } + + clear_bit(BH_JWrite, &jh2bh(jh)->b_state); + + JBUFFER_TRACE(jh, "ph4: unfile after journal write"); + journal_unfile_buffer(jh); + + /* + * akpm: don't put back a buffer_head with stale pointers + * dangling around. + */ + J_ASSERT_JH(jh, jh->b_transaction != NULL); + jh->b_transaction = NULL; + + /* + * ->t_iobuf_list should contain only dummy buffer_heads + * which were created by journal_write_metadata_buffer(). + */ + bh = jh2bh(jh); + BUFFER_TRACE(bh, "dumping temporary bh"); + journal_unlock_journal_head(jh); + __brelse(bh); + J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0); + put_unused_buffer_head(bh); + + /* We also have to unlock and free the corresponding + shadowed buffer */ + jh = commit_transaction->t_shadow_list->b_tprev; + bh = jh2bh(jh); + clear_bit(BH_JWrite, &bh->b_state); + J_ASSERT_BH(bh, buffer_jdirty(bh)); + + /* The metadata is now released for reuse, but we need + to remember it against this transaction so that when + we finally commit, we can do any checkpointing + required. 
*/ + JBUFFER_TRACE(jh, "file as BJ_Forget"); + journal_file_buffer(jh, commit_transaction, BJ_Forget); + /* Wake up any transactions which were waiting for this + IO to complete */ + wake_up(&bh->b_wait); + JBUFFER_TRACE(jh, "brelse shadowed buffer"); + __brelse(bh); + } + + J_ASSERT (commit_transaction->t_shadow_list == NULL); + + jbd_debug(3, "JBD: commit phase 5\n"); + + /* Here we wait for the revoke record and descriptor record buffers */ + wait_for_ctlbuf: + while (commit_transaction->t_log_list != NULL) { + struct buffer_head *bh; + + jh = commit_transaction->t_log_list->b_tprev; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + unlock_journal(journal); + wait_on_buffer(bh); + lock_journal(journal); + goto wait_for_ctlbuf; + } + + BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile"); + clear_bit(BH_JWrite, &bh->b_state); + journal_unfile_buffer(jh); + jh->b_transaction = NULL; + journal_unlock_journal_head(jh); + __brelse(bh); /* One for getblk */ + /* AKPM: bforget here */ + } + + jbd_debug(3, "JBD: commit phase 6\n"); + + /* Done it all: now write the commit record. We should have + * cleaned up our previous buffers by now, so if we are in abort + * mode we can now just skip the rest of the journal write + * entirely. */ + + if (is_journal_aborted(journal)) + goto skip_commit; + + descriptor = journal_get_descriptor_buffer(journal); + + /* AKPM: buglet - add `i' to tmp! 
*/ + for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) { + journal_header_t *tmp = + (journal_header_t*)jh2bh(descriptor)->b_data; + tmp->h_magic = htonl(JFS_MAGIC_NUMBER); + tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK); + tmp->h_sequence = htonl(commit_transaction->t_tid); + } + + unlock_journal(journal); + JBUFFER_TRACE(descriptor, "write commit block"); + { + struct buffer_head *bh = jh2bh(descriptor); + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + __brelse(bh); /* One for getblk() */ + journal_unlock_journal_head(descriptor); + } + lock_journal(journal); + + /* End of a transaction! Finally, we can do checkpoint + processing: any buffers committed as a result of this + transaction can be removed from any checkpoint list it was on + before. */ + + skip_commit: + + jbd_debug(3, "JBD: commit phase 7\n"); + + J_ASSERT(commit_transaction->t_sync_datalist == NULL); + J_ASSERT(commit_transaction->t_async_datalist == NULL); + J_ASSERT(commit_transaction->t_buffers == NULL); + J_ASSERT(commit_transaction->t_checkpoint_list == NULL); + J_ASSERT(commit_transaction->t_iobuf_list == NULL); + J_ASSERT(commit_transaction->t_shadow_list == NULL); + J_ASSERT(commit_transaction->t_log_list == NULL); + + while (commit_transaction->t_forget) { + transaction_t *cp_transaction; + struct buffer_head *bh; + + jh = commit_transaction->t_forget; + J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || + jh->b_transaction == journal->j_running_transaction); + + /* + * If there is undo-protected committed data against + * this buffer, then we can remove it now. If it is a + * buffer needing such protection, the old frozen_data + * field now points to a committed version of the + * buffer, so rotate that field to the new committed + * data. + * + * Otherwise, we can just throw away the frozen data now. 
+ */ + if (jh->b_committed_data) { + kfree(jh->b_committed_data); + jh->b_committed_data = NULL; + if (jh->b_frozen_data) { + jh->b_committed_data = jh->b_frozen_data; + jh->b_frozen_data = NULL; + } + } else if (jh->b_frozen_data) { + kfree(jh->b_frozen_data); + jh->b_frozen_data = NULL; + } + + spin_lock(&journal_datalist_lock); + cp_transaction = jh->b_cp_transaction; + if (cp_transaction) { + JBUFFER_TRACE(jh, "remove from old cp transaction"); + J_ASSERT_JH(jh, commit_transaction != cp_transaction); + __journal_remove_checkpoint(jh); + } + + /* Only re-checkpoint the buffer_head if it is marked + * dirty. If the buffer was added to the BJ_Forget list + * by journal_forget, it may no longer be dirty and + * there's no point in keeping a checkpoint record for + * it. */ + bh = jh2bh(jh); + if (buffer_jdirty(bh)) { + JBUFFER_TRACE(jh, "add to new checkpointing trans"); + __journal_insert_checkpoint(jh, commit_transaction); + JBUFFER_TRACE(jh, "refile for checkpoint writeback"); + __journal_refile_buffer(jh); + } else { + J_ASSERT_BH(bh, !buffer_dirty(bh)); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + __journal_unfile_buffer(jh); + jh->b_transaction = 0; + __journal_remove_journal_head(bh); + __brelse(bh); + } + spin_unlock(&journal_datalist_lock); + } + + /* Done with this transaction! 
*/ + + jbd_debug(3, "JBD: commit phase 8\n"); + + J_ASSERT (commit_transaction->t_state == T_COMMIT); + commit_transaction->t_state = T_FINISHED; + + J_ASSERT (commit_transaction == journal->j_committing_transaction); + journal->j_commit_sequence = commit_transaction->t_tid; + journal->j_committing_transaction = NULL; + + spin_lock(&journal_datalist_lock); + if (commit_transaction->t_checkpoint_list == NULL) { + __journal_drop_transaction(journal, commit_transaction); + } else { + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + } + spin_unlock(&journal_datalist_lock); + + jbd_debug(1, "JBD: commit %d complete, head %d\n", + journal->j_commit_sequence, journal->j_tail_sequence); + + unlock_journal(journal); + wake_up(&journal->j_wait_done_commit); + } diff -rc2P linux/fs/jbd/journal.c linux-2.4.13/fs/jbd/journal.c *** linux/fs/jbd/journal.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/journal.c Fri Nov 9 16:57:59 2001 *************** *** 0 **** --- 1,1716 ---- + /* + * linux/fs/journal.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem journal-writing code; part of the ext2fs + * journaling system. + * + * This file manages journals: areas of disk reserved for logging + * transactional updates. 
This includes the kernel journaling thread + * which is responsible for scheduling updates to the log. + * + * We do not actually manage the physical storage of the journal in this + * file: that is left to a per-journal policy function, which allows us + * to store the journal within a filesystem-specified area for ext2 + * journaling (ext2 can use a reserved inode for storing the log). + */ + + #include <linux/module.h> + #include <linux/sched.h> + #include <linux/fs.h> + #include <linux/jbd.h> + #include <linux/errno.h> + #include <linux/slab.h> + #include <linux/locks.h> + #include <linux/smp_lock.h> + #include <linux/sched.h> + #include <linux/init.h> + #include <linux/mm.h> + #include <linux/slab.h> + + EXPORT_SYMBOL(journal_start); + EXPORT_SYMBOL(journal_try_start); + EXPORT_SYMBOL(journal_restart); + EXPORT_SYMBOL(journal_extend); + EXPORT_SYMBOL(journal_stop); + EXPORT_SYMBOL(journal_lock_updates); + EXPORT_SYMBOL(journal_unlock_updates); + EXPORT_SYMBOL(journal_get_write_access); + EXPORT_SYMBOL(journal_get_create_access); + EXPORT_SYMBOL(journal_get_undo_access); + EXPORT_SYMBOL(journal_dirty_data); + EXPORT_SYMBOL(journal_dirty_metadata); + #if 0 + EXPORT_SYMBOL(journal_release_buffer); + #endif + EXPORT_SYMBOL(journal_forget); + #if 0 + EXPORT_SYMBOL(journal_sync_buffer); + #endif + EXPORT_SYMBOL(journal_flush); + EXPORT_SYMBOL(journal_revoke); + + EXPORT_SYMBOL(journal_init_dev); + EXPORT_SYMBOL(journal_init_inode); + EXPORT_SYMBOL(journal_update_format); + EXPORT_SYMBOL(journal_check_used_features); + EXPORT_SYMBOL(journal_check_available_features); + EXPORT_SYMBOL(journal_set_features); + EXPORT_SYMBOL(journal_create); + EXPORT_SYMBOL(journal_load); + EXPORT_SYMBOL(journal_destroy); + EXPORT_SYMBOL(journal_recover); + EXPORT_SYMBOL(journal_update_superblock); + EXPORT_SYMBOL(__journal_abort); + EXPORT_SYMBOL(journal_abort); + EXPORT_SYMBOL(journal_errno); + EXPORT_SYMBOL(journal_ack_err); + EXPORT_SYMBOL(journal_clear_err); + EXPORT_SYMBOL(log_wait_commit); + EXPORT_SYMBOL(log_start_commit); + EXPORT_SYMBOL(journal_wipe); + EXPORT_SYMBOL(journal_blocks_per_page); + EXPORT_SYMBOL(journal_flushpage); +
EXPORT_SYMBOL(journal_try_to_free_buffers); + EXPORT_SYMBOL(journal_bmap); + EXPORT_SYMBOL(journal_force_commit); + + static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); + + /* + * journal_datalist_lock is used to protect data buffers: + * + * bh->b_transaction + * bh->b_tprev + * bh->b_tnext + * + * journal_free_buffer() is called from journal_try_to_free_buffer(), and is + * async wrt everything else. + * + * It is also used for checkpoint data, also to protect against + * journal_try_to_free_buffer(): + * + * bh->b_cp_transaction + * bh->b_cpnext + * bh->b_cpprev + * transaction->t_checkpoint_list + * transaction->t_cpnext + * transaction->t_cpprev + * journal->j_checkpoint_transactions + * + * It is global at this time rather than per-journal because it's + * impossible for __journal_free_buffer to go from a buffer_head + * back to a journal_t unracily (well, not true. Fix later) + * + * + * The `datalist' and `checkpoint list' functions are quite + * separate and we could use two spinlocks here. + * + * lru_list_lock nests inside journal_datalist_lock. + */ + spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED; + + /* + * List of all journals in the system. Protected by the BKL. + */ + static LIST_HEAD(all_journals); + + /* + * Helper function used to manage commit timeouts + */ + + static void commit_timeout(unsigned long __data) + { + struct task_struct * p = (struct task_struct *) __data; + + wake_up_process(p); + } + + /* Static check for data structure consistency. There's no code + * invoked --- we'll just get a linker failure if things aren't right. + */ + void __journal_internal_check(void) + { + extern void journal_bad_superblock_size(void); + if (sizeof(struct journal_superblock_s) != 1024) + journal_bad_superblock_size(); + } + + /* + * kjournald: The main thread function used to manage a logging device + * journal. 
+ * + * This kernel thread is responsible for two things: + * + * 1) COMMIT: Every so often we need to commit the current state of the + * filesystem to disk. The journal thread is responsible for writing + * all of the metadata buffers to disk. + * + * 2) CHECKPOINT: We cannot reuse a used section of the log file until all + * of the data in that part of the log has been rewritten elsewhere on + * the disk. Flushing these old buffers to reclaim space in the log is + * known as checkpointing, and this thread is responsible for that job. + */ + + journal_t *current_journal; // AKPM: debug + + int kjournald(void *arg) + { + journal_t *journal = (journal_t *) arg; + transaction_t *transaction; + struct timer_list timer; + + current_journal = journal; + + lock_kernel(); + daemonize(); + spin_lock_irq(&current->sigmask_lock); + sigfillset(&current->blocked); + recalc_sigpending(current); + spin_unlock_irq(&current->sigmask_lock); + + sprintf(current->comm, "kjournald"); + + /* Set up an interval timer which can be used to trigger a + commit wakeup after the commit interval expires */ + init_timer(&timer); + timer.data = (unsigned long) current; + timer.function = commit_timeout; + journal->j_commit_timer = &timer; + + /* Record that the journal thread is running */ + journal->j_task = current; + wake_up(&journal->j_wait_done_commit); + + printk(KERN_INFO "kjournald starting. Commit interval %ld seconds\n", + journal->j_commit_interval / HZ); + list_add(&journal->j_all_journals, &all_journals); + + /* And now, wait forever for commit wakeup events.
*/ + while (1) { + if (journal->j_flags & JFS_UNMOUNT) + break; + + jbd_debug(1, "commit_sequence=%d, commit_request=%d\n", + journal->j_commit_sequence, journal->j_commit_request); + + if (journal->j_commit_sequence != journal->j_commit_request) { + jbd_debug(1, "OK, requests differ\n"); + if (journal->j_commit_timer_active) { + journal->j_commit_timer_active = 0; + del_timer(journal->j_commit_timer); + } + + journal_commit_transaction(journal); + continue; + } + + wake_up(&journal->j_wait_done_commit); + interruptible_sleep_on(&journal->j_wait_commit); + + jbd_debug(1, "kjournald wakes\n"); + + /* Were we woken up by a commit wakeup event? */ + if ((transaction = journal->j_running_transaction) != NULL && + time_after_eq(jiffies, transaction->t_expires)) { + journal->j_commit_request = transaction->t_tid; + jbd_debug(1, "woke because of timeout\n"); + } + } + + if (journal->j_commit_timer_active) { + journal->j_commit_timer_active = 0; + del_timer_sync(journal->j_commit_timer); + } + + list_del(&journal->j_all_journals); + + journal->j_task = NULL; + wake_up(&journal->j_wait_done_commit); + jbd_debug(1, "Journal thread exiting.\n"); + return 0; + } + + static void journal_start_thread(journal_t *journal) + { + kernel_thread(kjournald, (void *) journal, + CLONE_VM | CLONE_FS | CLONE_FILES); + while (!journal->j_task) + sleep_on(&journal->j_wait_done_commit); + } + + static void journal_kill_thread(journal_t *journal) + { + journal->j_flags |= JFS_UNMOUNT; + + while (journal->j_task) { + wake_up(&journal->j_wait_commit); + sleep_on(&journal->j_wait_done_commit); + } + } + + #if 0 + + This is no longer needed - we do it in commit quite efficiently. + Note that if this function is resurrected, the loop needs to + be reorganised into the next_jh/last_jh algorithm. + + /* + * journal_clean_data_list: cleanup after data IO. + * + * Once the IO system has finished writing the buffers on the transaction's + * data list, we can remove those buffers from the list. 
This function + * scans the list for such buffers and removes them cleanly. + * + * We assume that the journal is already locked. + * We are called with journal_datalist_lock held. + * + * AKPM: This function looks inefficient. Approximately O(n^2) + * for potentially thousands of buffers. It no longer shows on profiles + * because these buffers are mainly dropped in journal_commit_transaction(). + */ + + void __journal_clean_data_list(transaction_t *transaction) + { + struct journal_head *jh, *next; + + assert_spin_locked(&journal_datalist_lock); + + restart: + jh = transaction->t_sync_datalist; + if (!jh) + goto out; + do { + next = jh->b_tnext; + if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) { + struct buffer_head *bh = jh2bh(jh); + BUFFER_TRACE(bh, "data writeout complete: unfile"); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + __journal_remove_journal_head(bh); + refile_buffer(bh); + __brelse(bh); + goto restart; + } + jh = next; + } while (transaction->t_sync_datalist && + jh != transaction->t_sync_datalist); + out: + return; + } + #endif + + /* + * journal_write_metadata_buffer: write a metadata buffer to the journal. + * + * Writes a metadata buffer to a given disk block. The actual IO is not + * performed but a new buffer_head is constructed which labels the data + * to be written with the correct destination disk block. + * + * Any magic-number escaping which needs to be done will cause a + * copy-out here. If the buffer happens to start with the + * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the + * magic number is only written to the log for descriptor blocks. In + * this case, we copy the data and replace the first word with 0, and we + * return a result code which indicates that this buffer needs to be + * marked as an escaped buffer in the corresponding log descriptor + * block. The missing word can then be restored when the block is read + * during recovery.
+  *
+  * If the source buffer has already been modified by a new transaction
+  * since we took the last commit snapshot, we use the frozen copy of
+  * that data for IO. If we end up using the existing buffer_head's data
+  * for the write, then we *have* to lock the buffer to prevent anyone
+  * else from using and possibly modifying it while the IO is in
+  * progress.
+  *
+  * The function returns a pointer to the buffer_heads to be used for IO.
+  *
+  * We assume that the journal has already been locked in this function.
+  *
+  * Return value:
+  * <0: Error
+  * >=0: Finished OK
+  *
+  * On success:
+  * Bit 0 set == escape performed on the data
+  * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+  */
+ /* virt_to_offset: offset of p within its page, i.e. p & ~PAGE_MASK. */
+ static inline unsigned long virt_to_offset(void *p)
+ {return ((unsigned long) p) & ~PAGE_MASK;}
+
+ int journal_write_metadata_buffer(transaction_t *transaction,
+ 				  struct journal_head *jh_in,
+ 				  struct journal_head **jh_out,
+ 				  int blocknr)
+ {
+ 	int need_copy_out = 0;
+ 	int done_copy_out = 0;
+ 	int do_escape = 0;
+ 	char *mapped_data;
+ 	struct buffer_head *new_bh;
+ 	struct journal_head * new_jh;
+ 	struct page *new_page;
+ 	unsigned int new_offset;
+
+ 	/*
+ 	 * The buffer really shouldn't be locked: only the current committing
+ 	 * transaction is allowed to write it, so nobody else is allowed
+ 	 * to do any IO.
+ 	 *
+ 	 * akpm: except if we're journalling data, and write() output is
+ 	 * also part of a shared mapping, and another thread has
+ 	 * decided to launch a writepage() against this buffer.
+ 	 */
+ 	J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in)));
+
+ 	/*
+ 	 * If a new transaction has already done a buffer copy-out, then
+ 	 * we use that version of the data for the commit.
+ 	 */
+
+ 	if (jh_in->b_frozen_data) {
+ 		done_copy_out = 1;
+ 		new_page = virt_to_page(jh_in->b_frozen_data);
+ 		new_offset = virt_to_offset(jh_in->b_frozen_data);
+ 	} else {
+ 		new_page = jh2bh(jh_in)->b_page;
+ 		new_offset = virt_to_offset(jh2bh(jh_in)->b_data);
+ 	}
+
+ 	mapped_data = ((char *) kmap(new_page)) + new_offset;
+
+ 	/*
+ 	 * Check for escaping: a block whose first word equals the journal
+ 	 * magic number must be copied out and escaped.
+ 	 */
+ 	if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) {
+ 		need_copy_out = 1;
+ 		do_escape = 1;
+ 	}
+
+ 	/*
+ 	 * Do we need to do a data copy?  Only when escaping is required
+ 	 * and no frozen copy exists yet; the copy becomes b_frozen_data.
+ 	 */
+
+ 	if (need_copy_out && !done_copy_out) {
+ 		char *tmp;
+ 		tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS);
+
+ 		jh_in->b_frozen_data = tmp;
+ 		memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size);
+
+ 		/* If we get to this path, we'll always need the new
+ 		   address kmapped so that we can clear the escaped
+ 		   magic number below. */
+ 		kunmap(new_page);
+ 		new_page = virt_to_page(tmp);
+ 		new_offset = virt_to_offset(tmp);
+ 		mapped_data = ((char *) kmap(new_page)) + new_offset;
+
+ 		done_copy_out = 1;
+ 	}
+
+ 	/*
+ 	 * Right, time to make up the new buffer_head.  Allocation is
+ 	 * retried (yielding the CPU in between) until it succeeds.
+ 	 */
+ 	do {
+ 		new_bh = get_unused_buffer_head(0);
+ 		if (!new_bh) {
+ 			printk (KERN_NOTICE __FUNCTION__
+ 				": ENOMEM at get_unused_buffer_head, "
+ 				"trying again.\n");
+ 			current->policy |= SCHED_YIELD;
+ 			schedule();
+ 		}
+ 	} while (!new_bh);
+ 	/* keep subsequent assertions sane */
+ 	new_bh->b_prev_free = 0;
+ 	new_bh->b_next_free = 0;
+ 	new_bh->b_state = 0;
+ 	init_buffer(new_bh, NULL, NULL);
+ 	atomic_set(&new_bh->b_count, 1);
+ 	new_jh = journal_add_journal_head(new_bh);
+
+ 	set_bh_page(new_bh, new_page, new_offset);
+
+ 	new_jh->b_transaction = NULL;
+ 	new_bh->b_size = jh2bh(jh_in)->b_size;
+ 	new_bh->b_dev = transaction->t_journal->j_dev;
+ 	new_bh->b_blocknr = blocknr;
+ 	new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty);
+
+ 	*jh_out = new_jh;
+
+ 	/*
+ 	 * Did we need to do an escaping? Now we've done all the
+ 	 * copying, we can finally do so.
+ 	 */
+
+ 	if (do_escape)
+ 		* ((unsigned int *) mapped_data) = 0;
+ 	kunmap(new_page);
+
+ 	/*
+ 	 * The to-be-written buffer needs to get moved to the io queue,
+ 	 * and the original buffer whose contents we are shadowing or
+ 	 * copying is moved to the transaction's shadow queue.
+ 	 */
+ 	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
+ 	journal_file_buffer(jh_in, transaction, BJ_Shadow);
+ 	JBUFFER_TRACE(new_jh, "file as BJ_IO");
+ 	journal_file_buffer(new_jh, transaction, BJ_IO);
+
+ 	return do_escape | (done_copy_out << 1);
+ }
+
+ /*
+  * Allocation code for the journal file. Manage the space left in the
+  * journal, so that we can begin checkpointing when appropriate.
+  */
+
+ /*
+  * log_space_left: Return the number of free blocks left in the journal.
+  *
+  * Called with the journal already locked.
+  *
+  * The result is deliberately conservative: MIN_LOG_RESERVED_BLOCKS is
+  * subtracted from j_free, and one eighth of what remains is then held
+  * back as well.  Returns 0 when nothing is left after the reservation.
+  */
+
+ int log_space_left (journal_t *journal)
+ {
+ 	int left = journal->j_free;
+
+ 	/* Be pessimistic here about the number of those free blocks
+ 	 * which might be required for log descriptor control blocks. */
+
+ #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
+
+ 	left -= MIN_LOG_RESERVED_BLOCKS;
+
+ 	if (left <= 0)
+ 		return 0;
+ 	left -= (left >> 3);
+ 	return left;
+ }
+
+ /*
+  * log_start_commit: ask the commit thread to commit `transaction', or
+  * the currently running transaction when `transaction' is NULL.
+  *
+  * Returns the tid that was targeted.  When no commit is requested
+  * because there is no running transaction, the current
+  * j_commit_request is returned unchanged.
+  *
+  * This function must be non-allocating for PF_MEMALLOC tasks
+  */
+ tid_t log_start_commit (journal_t *journal, transaction_t *transaction)
+ {
+ 	tid_t target = journal->j_commit_request;
+
+ 	lock_kernel(); /* Protect journal->j_running_transaction */
+
+ 	/*
+ 	 * A NULL transaction asks us to commit the currently running
+ 	 * transaction, if there is one.
+ 	 */
+ 	if (transaction)
+ 		target = transaction->t_tid;
+ 	else {
+ 		transaction = journal->j_running_transaction;
+ 		if (!transaction)
+ 			goto out;
+ 		target = transaction->t_tid;
+ 	}
+
+ 	/*
+ 	 * Are we already doing a recent enough commit?
+ 	 */
+ 	if (tid_geq(journal->j_commit_request, target))
+ 		goto out;
+
+ 	/*
+ 	 * We want a new commit: OK, mark the request and wake up the
+ 	 * commit thread. We do _not_ do the commit ourselves.
+ 	 */
+
+ 	journal->j_commit_request = target;
+ 	jbd_debug(1, "JBD: requesting commit %d/%d\n",
+ 		  journal->j_commit_request,
+ 		  journal->j_commit_sequence);
+ 	wake_up(&journal->j_wait_commit);
+
+ out:
+ 	unlock_kernel();
+ 	return target;
+ }
+
+ /*
+  * Wait for a specified commit to complete.
+  * The caller may not hold the journal lock.
+  *
+  * Loops, waking the commit thread and sleeping on j_wait_done_commit,
+  * until j_commit_sequence has caught up with tid.  With
+  * CONFIG_JBD_DEBUG a message is printed if no commit covering tid has
+  * been requested yet.
+  */
+ void log_wait_commit (journal_t *journal, tid_t tid)
+ {
+ 	lock_kernel();
+ #ifdef CONFIG_JBD_DEBUG
+ 	lock_journal(journal);
+ 	if (!tid_geq(journal->j_commit_request, tid)) {
+ 		printk(KERN_EMERG __FUNCTION__
+ 			": error: j_commit_request=%d, tid=%d\n",
+ 			journal->j_commit_request, tid);
+ 	}
+ 	unlock_journal(journal);
+ #endif
+ 	while (tid_gt(tid, journal->j_commit_sequence)) {
+ 		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
+ 			  tid, journal->j_commit_sequence);
+ 		wake_up(&journal->j_wait_commit);
+ 		sleep_on(&journal->j_wait_done_commit);
+ 	}
+ 	unlock_kernel();
+ }
+
+ /*
+  * Log buffer allocation routines:
+  *
+  * journal_next_log_block takes the block at j_head, advances j_head
+  * (wrapping from j_last back to j_first), decrements j_free, and
+  * returns the block number translated by journal_bmap().
+  */
+
+ unsigned long journal_next_log_block(journal_t *journal)
+ {
+ 	unsigned long blocknr;
+
+ 	J_ASSERT(journal->j_free > 1);
+
+ 	blocknr = journal->j_head;
+ 	journal->j_head++;
+ 	journal->j_free--;
+ 	if (journal->j_head == journal->j_last)
+ 		journal->j_head = journal->j_first;
+ 	return journal_bmap(journal, blocknr);
+ }
+
+ /*
+  * Conversion of logical to physical block numbers for the journal
+  *
+  * On external journals the journal blocks are identity-mapped, so
+  * this is a no-op. If needed, we can use j_blk_offset - everything is
+  * ready.
+  *
+  * When the journal lives in an inode, the block is looked up with
+  * bmap() and a zero result trips an assertion.
+  */
+ unsigned long journal_bmap(journal_t *journal, unsigned long blocknr)
+ {
+ 	unsigned long ret;
+
+ 	if (journal->j_inode) {
+ 		ret = bmap(journal->j_inode, blocknr);
+ 		J_ASSERT(ret != 0);
+ 	} else {
+ 		ret = blocknr; /* +journal->j_blk_offset */
+ 	}
+ 	return ret;
+ }
+
+ /*
+  * We play buffer_head aliasing tricks to write data/metadata blocks to
+  * the journal without copying their contents, but for journal
+  * descriptor blocks we do need to generate bona fide buffers.
+  */
+
+ struct journal_head * journal_get_descriptor_buffer(journal_t *journal)
+ {
+ 	struct buffer_head *bh;
+ 	unsigned long blocknr = journal_next_log_block(journal);
+ 	/* Get the buffer for that log block, flag it dirty, and hand
+ 	 * back the journal_head attached to it. */
+ 	bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 	bh->b_state |= (1 << BH_Dirty);
+ 	BUFFER_TRACE(bh, "return this buffer");
+ 	return journal_add_journal_head(bh);
+ }
+
+ /*
+  * Management for journal control blocks: functions to create and
+  * destroy journal_t structures, and to initialise and read existing
+  * journal blocks from disk. */
+
+ /* First: create and setup a journal_t object in memory. We initialise
+  * very few fields yet: that has to wait until we have created the
+  * journal structures from scratch, or loaded them from disk.
+  *
+  * Takes a module reference (MOD_INC_USE_COUNT) which is dropped again
+  * on failure.  Returns the new journal, or NULL if either the
+  * allocation or journal_init_revoke() fails. */
+
+ static journal_t * journal_init_common (void)
+ {
+ 	journal_t *journal;
+ 	int err;
+
+ 	MOD_INC_USE_COUNT;
+
+ 	journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
+ 	if (!journal)
+ 		goto fail;
+ 	memset(journal, 0, sizeof(*journal));
+
+ 	init_waitqueue_head(&journal->j_wait_transaction_locked);
+ 	init_waitqueue_head(&journal->j_wait_logspace);
+ 	init_waitqueue_head(&journal->j_wait_done_commit);
+ 	init_waitqueue_head(&journal->j_wait_checkpoint);
+ 	init_waitqueue_head(&journal->j_wait_commit);
+ 	init_waitqueue_head(&journal->j_wait_updates);
+ 	init_MUTEX(&journal->j_barrier);
+ 	init_MUTEX(&journal->j_checkpoint_sem);
+ 	init_MUTEX(&journal->j_sem);
+
+ 	journal->j_commit_interval = (HZ * 5);
+
+ 	/* The journal is marked for error until we succeed with recovery! */
+ 	journal->j_flags = JFS_ABORT;
+
+ 	/* Set up a default-sized revoke table for the new mount. */
+ 	err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+ 	if (err) {
+ 		kfree(journal);
+ 		goto fail;
+ 	}
+ 	return journal;
+ fail:
+ 	MOD_DEC_USE_COUNT;
+ 	return NULL;
+ }
+
+ /* journal_init_dev and journal_init_inode:
+  *
+  * Create a journal structure assigned some fixed set of disk blocks to
+  * the journal.
We don't actually touch those disk blocks yet, but we
+  * need to set up all of the mapping information to tell the journaling
+  * system where the journal blocks are.
+  *
+  * journal_init_dev creates a journal which maps a fixed contiguous
+  * range of blocks on an arbitrary block device.
+  *
+  * journal_init_inode creates a journal which maps an on-disk inode as
+  * the journal. The inode must exist already, must support bmap() and
+  * must have all data blocks preallocated.
+  */
+
+ journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
+ 			int start, int len, int blocksize)
+ {
+ 	journal_t *journal = journal_init_common();
+ 	struct buffer_head *bh;
+
+ 	if (!journal)
+ 		return NULL;
+ 	/* Record the device, start block, length and block size given
+ 	 * by the caller. */
+ 	journal->j_dev = dev;
+ 	journal->j_fs_dev = fs_dev;
+ 	journal->j_blk_offset = start;
+ 	journal->j_maxlen = len;
+ 	journal->j_blocksize = blocksize;
+ 	/* The journal superblock is the block at `start'. */
+ 	bh = getblk(journal->j_dev, start, journal->j_blocksize);
+ 	J_ASSERT(bh != NULL);
+ 	journal->j_sb_buffer = bh;
+ 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+ 	return journal;
+ }
+ /* journal_init_inode: see the description above journal_init_dev.
+  * Length and block size are derived from the inode and its superblock;
+  * the journal superblock is the inode's logical block 0. */
+ journal_t * journal_init_inode (struct inode *inode)
+ {
+ 	struct buffer_head *bh;
+ 	journal_t *journal = journal_init_common();
+ 	int blocknr;
+
+ 	if (!journal)
+ 		return NULL;
+
+ 	journal->j_dev = inode->i_dev;
+ 	journal->j_fs_dev = inode->i_dev;
+ 	journal->j_inode = inode;
+ 	jbd_debug(1,
+ 		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
+ 		  journal, bdevname(inode->i_dev), inode->i_ino, inode->i_size,
+ 		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
+
+ 	journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ 	journal->j_blocksize = inode->i_sb->s_blocksize;
+
+ 	blocknr = journal_bmap(journal, 0);
+ 	bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 	J_ASSERT(bh != NULL);
+ 	journal->j_sb_buffer = bh;
+ 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+
+ 	return journal;
+ }
+
+ /*
+  * Given a journal_t structure, initialise the various fields for
+  * startup of a new journaling
session. We use this both when creating
+  * a journal, and after recovering an old journal to reset it for
+  * subsequent use.
+  */
+
+ static int journal_reset (journal_t *journal)
+ {
+ 	journal_superblock_t *sb = journal->j_superblock;
+ 	unsigned int first, last;
+
+ 	first = ntohl(sb->s_first);
+ 	last = ntohl(sb->s_maxlen);
+
+ 	journal->j_first = first;
+ 	journal->j_last = last;
+ 	/* Start with an empty log: head and tail both at the first block. */
+ 	journal->j_head = first;
+ 	journal->j_tail = first;
+ 	journal->j_free = last - first;
+
+ 	journal->j_tail_sequence = journal->j_transaction_sequence;
+ 	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
+ 	journal->j_commit_request = journal->j_commit_sequence;
+
+ 	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+
+ 	/* Add the dynamic fields and write it to disk. */
+ 	journal_update_superblock(journal, 1);
+
+ 	lock_journal(journal);
+ 	journal_start_thread(journal);
+ 	unlock_journal(journal);
+
+ 	return 0;
+ }
+
+ /*
+  * Given a journal_t structure which tells us which disk blocks we can
+  * use, create a new journal superblock and initialise all of the
+  * journal fields from scratch.
+  *
+  * Returns -EINVAL if the journal is shorter than
+  * JFS_MIN_JOURNAL_BLOCKS, otherwise the result of journal_reset().
+  * A journal without an inode (external device) hits BUG() here. */
+
+ int journal_create (journal_t *journal)
+ {
+ 	int blocknr;
+ 	struct buffer_head *bh;
+ 	journal_superblock_t *sb;
+ 	int i;
+
+ 	if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
+ 		printk (KERN_ERR "Journal length (%d blocks) too short.\n",
+ 			journal->j_maxlen);
+ 		return -EINVAL;
+ 	}
+
+ 	if (journal->j_inode == NULL) {
+ 		/*
+ 		 * We don't know what block to start at!
+ 		 */
+ 		printk(KERN_EMERG __FUNCTION__
+ 			": creation of journal on external device!\n");
+ 		BUG();
+ 	}
+
+ 	/* Zero out the entire journal on disk. We cannot afford to
+ 	   have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
+ 	jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
+ 	for (i = 0; i < journal->j_maxlen; i++) {
+ 		blocknr = journal_bmap(journal, i);
+ 		bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 		wait_on_buffer(bh);
+ 		memset (bh->b_data, 0, journal->j_blocksize);
+ 		BUFFER_TRACE(bh, "marking dirty");
+ 		mark_buffer_dirty(bh);
+ 		BUFFER_TRACE(bh, "marking uptodate");
+ 		mark_buffer_uptodate(bh, 1);
+ 		__brelse(bh);
+ 	}
+ 	sync_dev(journal->j_dev);
+ 	jbd_debug(1, "JBD: journal cleared.\n");
+
+ 	/* OK, fill in the initial static fields in the new superblock */
+ 	sb = journal->j_superblock;
+
+ 	sb->s_header.h_magic = htonl(JFS_MAGIC_NUMBER);
+ 	sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2);
+
+ 	sb->s_blocksize = htonl(journal->j_blocksize);
+ 	sb->s_maxlen = htonl(journal->j_maxlen);
+ 	sb->s_first = htonl(1);
+
+ 	journal->j_transaction_sequence = 1;
+
+ 	journal->j_flags &= ~JFS_ABORT;
+ 	journal->j_format_version = 2;
+
+ 	return journal_reset(journal);
+ }
+
+ /*
+  * Update a journal's dynamic superblock fields and write it to disk,
+  * optionally waiting for the IO to complete.
+  *
+  * The fields written are s_sequence, s_start and s_errno, taken from
+  * j_tail_sequence, j_tail and j_errno.
+  */
+
+ void journal_update_superblock(journal_t *journal, int wait)
+ {
+ 	journal_superblock_t *sb = journal->j_superblock;
+ 	struct buffer_head *bh = journal->j_sb_buffer;
+
+ 	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+ 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+
+ 	sb->s_sequence = htonl(journal->j_tail_sequence);
+ 	sb->s_start = htonl(journal->j_tail);
+ 	sb->s_errno = htonl(journal->j_errno);
+
+ 	BUFFER_TRACE(bh, "marking dirty");
+ 	mark_buffer_dirty(bh);
+ 	ll_rw_block(WRITE, 1, &bh);
+ 	if (wait)
+ 		wait_on_buffer(bh);
+
+ 	/* If we have just flushed the log (by marking s_start==0), then
+ 	 * any future commit will have to be careful to update the
+ 	 * superblock again to re-record the true start of the log. */
+
+ 	if (sb->s_start)
+ 		journal->j_flags &= ~JFS_FLUSHED;
+ 	else
+ 		journal->j_flags |= JFS_FLUSHED;
+ }
+
+
+ /*
+  * Read the superblock for a given journal, performing initial
+  * validation of the format.
+  *
+  * Returns 0 on success, -EIO if the block cannot be read, and -EINVAL
+  * if the magic number, block size, superblock type or length check
+  * fails.  Sets j_format_version, and may lower j_maxlen to the length
+  * recorded on disk.
+  */
+
+ static int journal_get_superblock(journal_t *journal)
+ {
+ 	struct buffer_head *bh;
+ 	journal_superblock_t *sb;
+
+ 	bh = journal->j_sb_buffer;
+
+ 	J_ASSERT(bh != NULL);
+ 	if (!buffer_uptodate(bh)) {
+ 		ll_rw_block(READ, 1, &bh);
+ 		wait_on_buffer(bh);
+ 		if (!buffer_uptodate(bh)) {
+ 			printk (KERN_ERR
+ 				"JBD: IO error reading journal superblock\n");
+ 			return -EIO;
+ 		}
+ 	}
+
+ 	sb = journal->j_superblock;
+
+ 	if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) ||
+ 	    sb->s_blocksize != htonl(journal->j_blocksize)) {
+ 		printk(KERN_WARNING "JBD: no valid journal superblock found\n");
+ 		return -EINVAL;
+ 	}
+
+ 	switch(ntohl(sb->s_header.h_blocktype)) {
+ 	case JFS_SUPERBLOCK_V1:
+ 		journal->j_format_version = 1;
+ 		break;
+ 	case JFS_SUPERBLOCK_V2:
+ 		journal->j_format_version = 2;
+ 		break;
+ 	default:
+ 		printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
+ 		return -EINVAL;
+ 	}
+
+ 	if (ntohl(sb->s_maxlen) < journal->j_maxlen)
+ 		journal->j_maxlen = ntohl(sb->s_maxlen);
+ 	else if (ntohl(sb->s_maxlen) > journal->j_maxlen) {
+ 		printk (KERN_WARNING "JBD: journal file too short\n");
+ 		return -EINVAL;
+ 	}
+
+ 	return 0;
+ }
+
+ /*
+  * Load the on-disk journal superblock and read the key fields into the
+  * journal_t.
+ */ + + static int load_superblock(journal_t *journal) + { + int err; + journal_superblock_t *sb; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + journal->j_tail_sequence = ntohl(sb->s_sequence); + journal->j_tail = ntohl(sb->s_start); + journal->j_first = ntohl(sb->s_first); + journal->j_last = ntohl(sb->s_maxlen); + journal->j_errno = ntohl(sb->s_errno); + + return 0; + } + + + /* + * Given a journal_t structure which tells us which disk blocks contain + * a journal, read the journal from disk to initialise the in-memory + * structures. + */ + + int journal_load(journal_t *journal) + { + int err; + + err = load_superblock(journal); + if (err) + return err; + + /* If this is a V2 superblock, then we have to check the + * features flags on it. */ + + if (journal->j_format_version >= 2) { + journal_superblock_t *sb = journal->j_superblock; + + if ((sb->s_feature_ro_compat & + ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || + (sb->s_feature_incompat & + ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) { + printk (KERN_WARNING + "JBD: Unrecognised features on journal\n"); + return -EINVAL; + } + } + + /* Let the recovery code check whether it needs to recover any + * data from the journal. */ + if (journal_recover(journal)) + goto recovery_error; + + /* OK, we've finished with the dynamic journal bits: + * reinitialise the dynamic contents of the superblock in memory + * and reset them on disk. */ + if (journal_reset(journal)) + goto recovery_error; + + journal->j_flags &= ~JFS_ABORT; + journal->j_flags |= JFS_LOADED; + return 0; + + recovery_error: + printk (KERN_WARNING "JBD: recovery failed\n"); + return -EIO; + } + + /* + * Release a journal_t structure once it is no longer in use by the + * journaled object. + */ + + void journal_destroy (journal_t *journal) + { + /* Wait for the commit thread to wake up and die. 
*/ + journal_kill_thread(journal); + + /* Force a final log commit */ + if (journal->j_running_transaction) + journal_commit_transaction(journal); + + /* Force any old transactions to disk */ + lock_journal(journal); + while (journal->j_checkpoint_transactions != NULL) + log_do_checkpoint(journal, 1); + + J_ASSERT(journal->j_running_transaction == NULL); + J_ASSERT(journal->j_committing_transaction == NULL); + J_ASSERT(journal->j_checkpoint_transactions == NULL); + + /* We can now mark the journal as empty. */ + journal->j_tail = 0; + journal->j_tail_sequence = ++journal->j_transaction_sequence; + journal_update_superblock(journal, 1); + + if (journal->j_inode) + iput(journal->j_inode); + if (journal->j_revoke) + journal_destroy_revoke(journal); + + unlock_journal(journal); + brelse(journal->j_sb_buffer); + kfree(journal); + MOD_DEC_USE_COUNT; + } + + + /* Published API: Check whether the journal uses all of a given set of + * features. Return true (non-zero) if it does. */ + + int journal_check_used_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) + { + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + if (journal->j_format_version == 1) + return 0; + + sb = journal->j_superblock; + + if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) && + ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) && + ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat)) + return 1; + + return 0; + } + + /* Published API: Check whether the journaling code supports the use of + * all of a given set of features on this journal. Return true + * (non-zero) if it can. */ + + int journal_check_available_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) + { + journal_superblock_t *sb; + + if (!compat && !ro && !incompat) + return 1; + + sb = journal->j_superblock; + + /* We can support any known requested features iff the + * superblock is in version 2. 
Otherwise we fail to support any + * extended sb features. */ + + if (journal->j_format_version != 2) + return 0; + + if ((compat & JFS_KNOWN_COMPAT_FEATURES) == compat && + (ro & JFS_KNOWN_ROCOMPAT_FEATURES) == ro && + (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat) + return 1; + + return 0; + } + + /* Published API: Mark a given journal feature as present on the + * superblock. Returns true if the requested features could be set. */ + + int journal_set_features (journal_t *journal, unsigned long compat, + unsigned long ro, unsigned long incompat) + { + journal_superblock_t *sb; + + if (journal_check_used_features(journal, compat, ro, incompat)) + return 1; + + if (!journal_check_available_features(journal, compat, ro, incompat)) + return 0; + + jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + compat, ro, incompat); + + sb = journal->j_superblock; + + sb->s_feature_compat |= cpu_to_be32(compat); + sb->s_feature_ro_compat |= cpu_to_be32(ro); + sb->s_feature_incompat |= cpu_to_be32(incompat); + + return 1; + } + + + /* + * Published API: + * Given an initialised but unloaded journal struct, poke about in the + * on-disk structure to update it to the most recent supported version. 
+ */ + + int journal_update_format (journal_t *journal) + { + journal_superblock_t *sb; + int err; + + err = journal_get_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + switch (ntohl(sb->s_header.h_blocktype)) { + case JFS_SUPERBLOCK_V2: + return 0; + case JFS_SUPERBLOCK_V1: + return journal_convert_superblock_v1(journal, sb); + default: + break; + } + return -EINVAL; + } + + static int journal_convert_superblock_v1(journal_t *journal, + journal_superblock_t *sb) + { + int offset, blocksize; + struct buffer_head *bh; + + printk(KERN_WARNING + "JBD: Converting superblock from version 1 to 2.\n"); + + /* Pre-initialise new fields to zero */ + offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb); + blocksize = ntohl(sb->s_blocksize); + memset(&sb->s_feature_compat, 0, blocksize-offset); + + sb->s_nr_users = cpu_to_be32(1); + sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2); + journal->j_format_version = 2; + + bh = journal->j_sb_buffer; + BUFFER_TRACE(bh, "marking dirty"); + mark_buffer_dirty(bh); + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + return 0; + } + + + /* + * Flush all data for a given journal to disk and empty the journal. + * Filesystems can use this when remounting readonly to ensure that + * recovery does not need to happen on remount. + */ + + int journal_flush (journal_t *journal) + { + int err = 0; + transaction_t *transaction = NULL; + unsigned long old_tail; + + lock_kernel(); + + /* Force everything buffered to the log... */ + if (journal->j_running_transaction) { + transaction = journal->j_running_transaction; + log_start_commit(journal, transaction); + } else if (journal->j_committing_transaction) + transaction = journal->j_committing_transaction; + + /* Wait for the log commit to complete... */ + if (transaction) + log_wait_commit(journal, transaction->t_tid); + + /* ...and flush everything in the log out to disk. 
*/ + lock_journal(journal); + while (!err && journal->j_checkpoint_transactions != NULL) + err = log_do_checkpoint(journal, journal->j_maxlen); + cleanup_journal_tail(journal); + + /* Finally, mark the journal as really needing no recovery. + * This sets s_start==0 in the underlying superblock, which is + * the magic code for a fully-recovered superblock. Any future + * commits of data to the journal will restore the current + * s_start value. */ + old_tail = journal->j_tail; + journal->j_tail = 0; + journal_update_superblock(journal, 1); + journal->j_tail = old_tail; + + unlock_journal(journal); + + J_ASSERT(!journal->j_running_transaction); + J_ASSERT(!journal->j_committing_transaction); + J_ASSERT(!journal->j_checkpoint_transactions); + J_ASSERT(journal->j_head == journal->j_tail); + J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); + + unlock_kernel(); + + return err; + } + + /* + * Wipe out all of the contents of a journal, safely. This will produce + * a warning if the journal contains any valid recovery information. + * Must be called between journal_init_*() and journal_load(). + * + * If (write) is non-zero, then we wipe out the journal on disk; otherwise + * we merely suppress recovery. + */ + + int journal_wipe (journal_t *journal, int write) + { + journal_superblock_t *sb; + int err = 0; + + J_ASSERT (!(journal->j_flags & JFS_LOADED)); + + err = load_superblock(journal); + if (err) + return err; + + sb = journal->j_superblock; + + if (!journal->j_tail) + goto no_recovery; + + printk (KERN_WARNING "JBD: %s recovery information on journal\n", + write ? "Clearing" : "Ignoring"); + + err = journal_skip_recovery(journal); + if (write) + journal_update_superblock(journal, 1); + + no_recovery: + return err; + } + + /* + * journal_dev_name: format a character string to describe on what + * device this journal is present. 
+  */
+
+ const char * journal_dev_name(journal_t *journal)
+ {
+ 	kdev_t dev;
+ 	/* Use the device of the journal inode when there is one,
+ 	 * otherwise the journal's own block device. */
+ 	if (journal->j_inode)
+ 		dev = journal->j_inode->i_dev;
+ 	else
+ 		dev = journal->j_dev;
+
+ 	return bdevname(dev);
+ }
+
+ /*
+  * journal_abort: perform a complete, immediate shutdown of the ENTIRE
+  * journal (not of a single transaction). This operation cannot be
+  * undone without closing and reopening the journal.
+  *
+  * The journal_abort function is intended to support higher level error
+  * recovery mechanisms such as the ext2/ext3 remount-readonly error
+  * mode.
+  *
+  * Journal abort has very specific semantics. Any existing dirty,
+  * unjournaled buffers in the main filesystem will still be written to
+  * disk by bdflush, but the journaling mechanism will be suspended
+  * immediately and no further transaction commits will be honoured.
+  *
+  * Any dirty, journaled buffers will be written back to disk without
+  * hitting the journal. Atomicity cannot be guaranteed on an aborted
+  * filesystem, but we _do_ attempt to leave as much data as possible
+  * behind for fsck to use for cleanup.
+  *
+  * Any attempt to get a new transaction handle on a journal which is in
+  * ABORT state will just result in an -EROFS error return. A
+  * journal_stop on an existing handle will return -EIO if we have
+  * entered abort state during the update.
+  *
+  * Recursive transactions are not disturbed by journal abort until the
+  * final journal_stop, which will receive the -EIO error.
+  *
+  * Finally, the journal_abort call allows the caller to supply an errno
+  * which will be recored (if possible) in the journal superblock. This
+  * allows a client to record failure conditions in the middle of a
+  * transaction without having to complete the transaction to record the
+  * failure to disk. ext3_error, for example, now uses this
+  * functionality.
+  *
+  * Errors which originate from within the journaling layer will NOT
+  * supply an errno; a null errno implies that absolutely no further
+  * writes are done to the journal (unless there are any already in
+  * progress).
+  */
+
+ /* Quick version for internal journal use (doesn't lock the journal).
+  * Logs the abort, sets JFS_ABORT, and requests a commit of the running
+  * transaction if there is one. */
+ void __journal_abort (journal_t *journal)
+ {
+ 	transaction_t *transaction;
+
+ 	printk (KERN_ERR "Aborting journal on device %s.\n",
+ 		journal_dev_name(journal));
+
+ 	journal->j_flags |= JFS_ABORT;
+ 	transaction = journal->j_running_transaction;
+ 	if (transaction)
+ 		log_start_commit(journal, transaction);
+ }
+
+ /* Full version for external use.  Does nothing if the journal is
+  * already aborted.  The first non-zero errno is kept in j_errno, and
+  * the superblock is rewritten only when errno is non-zero. */
+ void journal_abort (journal_t *journal, int errno)
+ {
+ 	lock_journal(journal);
+
+ 	if (journal->j_flags & JFS_ABORT)
+ 		goto out;
+
+ 	if (!journal->j_errno)
+ 		journal->j_errno = errno;
+
+ 	__journal_abort(journal);
+
+ 	if (errno)
+ 		journal_update_superblock(journal, 1);
+
+ out:
+ 	unlock_journal(journal);
+ }
+ /* journal_errno: return -EROFS if the journal is aborted, otherwise
+  * the recorded j_errno. */
+ int journal_errno (journal_t *journal)
+ {
+ 	int err;
+
+ 	lock_journal(journal);
+ 	if (journal->j_flags & JFS_ABORT)
+ 		err = -EROFS;
+ 	else
+ 		err = journal->j_errno;
+ 	unlock_journal(journal);
+ 	return err;
+ }
+ /* journal_clear_err: reset j_errno to 0 and return 0, unless the
+  * journal is aborted, in which case return -EROFS and change nothing. */
+ int journal_clear_err (journal_t *journal)
+ {
+ 	int err = 0;
+
+ 	lock_journal(journal);
+ 	if (journal->j_flags & JFS_ABORT)
+ 		err = -EROFS;
+ 	else
+ 		journal->j_errno = 0;
+ 	unlock_journal(journal);
+ 	return err;
+ }
+ /* journal_ack_err: set JFS_ACK_ERR if an error is recorded in j_errno. */
+ void journal_ack_err (journal_t *journal)
+ {
+ 	lock_journal(journal);
+ 	if (journal->j_errno)
+ 		journal->j_flags |= JFS_ACK_ERR;
+ 	unlock_journal(journal);
+ }
+ /* journal_blocks_per_page: number of filesystem blocks in one page
+  * cache page, computed as 1 << (PAGE_CACHE_SHIFT - s_blocksize_bits). */
+ int journal_blocks_per_page(struct inode *inode)
+ {
+ 	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ }
+
+ /*
+  * shrink_journal_memory().
+  * Called when we're under memory pressure. Free up all the written-back
+  * checkpointed metadata buffers.
+  */
+ void shrink_journal_memory(void)
+ {
+ 	struct list_head *list;
+
+ 	lock_kernel();
+ 	list_for_each(list, &all_journals) {
+ 		journal_t *journal =
+ 			list_entry(list, journal_t, j_all_journals);
+ 		spin_lock(&journal_datalist_lock);
+ 		__journal_clean_checkpoint_list(journal);
+ 		spin_unlock(&journal_datalist_lock);
+ 	}
+ 	unlock_kernel();
+ }
+
+
+ /*
+  * Simple support for retrying memory allocations. Introduced to help to
+  * debug different VM deadlock avoidance strategies.
+  *
+  * `where' names the caller for the log messages.  With `retry' zero a
+  * failed kmalloc() returns NULL at once; otherwise the allocation is
+  * repeated, yielding the CPU between attempts, until it succeeds.
+  */
+ void * __jbd_kmalloc (char *where, size_t size, int flags, int retry)
+ {
+ 	void *p;
+ 	static unsigned long last_warning;
+
+ 	while (1) {
+ 		p = kmalloc(size, flags);
+ 		if (p)
+ 			return p;
+ 		if (!retry)
+ 			return NULL;
+ 		/* Log every retry for debugging. Also log them to the
+ 		 * syslog, but do rate-limiting on the non-debugging
+ 		 * messages. */
+ 		jbd_debug(1, "ENOMEM in %s, retrying.\n", where);
+
+ 		if (time_after(jiffies, last_warning + 5*HZ)) {
+ 			printk(KERN_NOTICE
+ 			       "ENOMEM in %s, retrying.\n", where);
+ 			last_warning = jiffies;
+ 		}
+
+ 		current->policy |= SCHED_YIELD;
+ 		schedule();
+ 	}
+ }
+
+ /*
+  * Journal_head storage management
+  *
+  * journal_heads come from a dedicated slab cache.  With
+  * CONFIG_JBD_DEBUG the number outstanding is tracked in
+  * nr_journal_heads.
+  */
+ static kmem_cache_t *journal_head_cache;
+ #ifdef CONFIG_JBD_DEBUG
+ static atomic_t nr_journal_heads = ATOMIC_INIT(0);
+ #endif
+ /* Create the journal_head slab cache.  Returns 0, or -ENOMEM if the
+  * cache could not be created. */
+ static int journal_init_journal_head_cache(void)
+ {
+ 	int retval;
+
+ 	J_ASSERT(journal_head_cache == 0);
+ 	journal_head_cache = kmem_cache_create("journal_head",
+ 				sizeof(struct journal_head),
+ 				0,		/* offset */
+ 				0,		/* flags */
+ 				NULL,		/* ctor */
+ 				NULL);		/* dtor */
+ 	retval = 0;
+ 	if (journal_head_cache == 0) {
+ 		retval = -ENOMEM;
+ 		printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
+ 	}
+ 	return retval;
+ }
+ /* Destroy the journal_head slab cache and clear the pointer. */
+ static void journal_destroy_journal_head_cache(void)
+ {
+ 	J_ASSERT(journal_head_cache != NULL);
+ 	kmem_cache_destroy(journal_head_cache);
+ 	journal_head_cache = 0;
+ }
+
+ /*
+  * journal_head splicing and dicing
+  *
+  * journal_alloc_journal_head never returns NULL: if the slab
+  * allocation fails it yields the CPU and retries until one succeeds,
+  * printing a notice at most once every five seconds.
+  */
+ static struct journal_head *journal_alloc_journal_head(void)
+ {
+ 	struct journal_head *ret;
+ 	static unsigned long last_warning;
+
+ #ifdef CONFIG_JBD_DEBUG
+ 	atomic_inc(&nr_journal_heads);
+ #endif
+ 	ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+ 	if (ret == 0) {
+ 		jbd_debug(1, "out of memory for journal_head\n");
+ 		if (time_after(jiffies, last_warning + 5*HZ)) {
+ 			printk(KERN_NOTICE "ENOMEM in " __FUNCTION__
+ 			       ", retrying.\n");
+ 			last_warning = jiffies;
+ 		}
+ 		while (ret == 0) {
+ 			current->policy |= SCHED_YIELD;
+ 			schedule();
+ 			ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+ 		}
+ 	}
+ 	return ret;
+ }
+ /* Return a journal_head to the slab cache.  With CONFIG_JBD_DEBUG the
+  * structure is first filled with 0x5b bytes and the counter is
+  * decremented. */
+ static void journal_free_journal_head(struct journal_head *jh)
+ {
+ #ifdef CONFIG_JBD_DEBUG
+ 	atomic_dec(&nr_journal_heads);
+ 	memset(jh, 0x5b, sizeof(*jh));
+ #endif
+ 	kmem_cache_free(journal_head_cache, jh);
+ }
+
+ /*
+  * A journal_head is attached to a buffer_head whenever JBD has an
+  * interest in the buffer.
+  *
+  * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
+  * is set. This bit is tested in core kernel code where we need to take
+  * JBD-specific actions. Testing the zeroness of ->b_private is not reliable
+  * there.
+  *
+  * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
+  *
+  * When a buffer has its BH_JBD bit set it is immune from being released by
+  * core kernel code, mainly via ->b_count.
+  *
+  * A journal_head may be detached from its buffer_head when the journal_head's
+  * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
+  * Various places in JBD call journal_remove_journal_head() to indicate that the
+  * journal_head can be dropped if needed.
+  *
+  * Various places in the kernel want to attach a journal_head to a buffer_head
+  * _before_ attaching the journal_head to a transaction.
To protect the
+  * journal_head in this situation, journal_add_journal_head elevates the
+  * journal_head's b_jcount refcount by one.  The caller must call
+  * journal_unlock_journal_head() to undo this.
+  *
+  * So the typical usage would be:
+  *
+  *	(Attach a journal_head if needed.  Increments b_jcount)
+  *	struct journal_head *jh = journal_add_journal_head(bh);
+  *	...
+  *	jh->b_transaction = xxx;
+  *	journal_unlock_journal_head(jh);
+  *
+  * Now, the journal_head's b_jcount is zero, but it is safe from being released
+  * because it has a non-zero b_transaction.
+  */
+
+ /*
+  * Give a buffer_head a journal_head.
+  *
+  * Doesn't need the journal lock.
+  * May sleep.
+  * Cannot be called with journal_datalist_lock held.
+  *
+  * If bh already carries a journal_head (BH_JBD set) that one is reused.
+  * Otherwise a zeroed journal_head is allocated with the lock dropped and,
+  * unless somebody else attached one in the meantime, it is hooked up to
+  * bh and an extra reference is taken on bh->b_count.  In every case the
+  * journal_head's b_jcount is incremented before returning.
+  *
+  * Returns bh->b_private, i.e. the journal_head now attached to bh.
+  */
+ struct journal_head *journal_add_journal_head(struct buffer_head *bh)
+ {
+ 	struct journal_head *jh;
+
+ 	spin_lock(&journal_datalist_lock);
+ 	if (buffer_jbd(bh)) {
+ 		jh = bh2jh(bh);
+ 	} else {
+ 		J_ASSERT_BH(bh,
+ 			(atomic_read(&bh->b_count) > 0) ||
+ 			(bh->b_page && bh->b_page->mapping));
+ 		spin_unlock(&journal_datalist_lock);	/* the allocation may sleep */
+ 		jh = journal_alloc_journal_head();
+ 		memset(jh, 0, sizeof(*jh));
+ 		spin_lock(&journal_datalist_lock);
+
+ 		if (buffer_jbd(bh)) {	/* re-check: the lock was dropped above */
+ 			/* Someone did it for us! */
+ 			J_ASSERT_BH(bh, bh->b_private != NULL);
+ 			journal_free_journal_head(jh);
+ 			jh = bh->b_private;
+ 		} else {
+ 			/*
+ 			 * We actually don't need jh_splice_lock when
+ 			 * adding a journal_head - only on removal.
+ 			 */
+ 			spin_lock(&jh_splice_lock);
+ 			set_bit(BH_JBD, &bh->b_state);
+ 			bh->b_private = jh;
+ 			jh->b_bh = bh;
+ 			atomic_inc(&bh->b_count);	/* reference held while BH_JBD is set */
+ 			spin_unlock(&jh_splice_lock);
+ 			BUFFER_TRACE(bh, "added journal_head");
+ 		}
+ 	}
+ 	jh->b_jcount++;
+ 	spin_unlock(&journal_datalist_lock);
+ 	return bh->b_private;
+ }
+
+ /*
+  * journal_remove_journal_head(): if the buffer isn't attached to a transaction
+  * and has a zero b_jcount then remove and release its journal_head.   If we did
+  * see that the buffer is not used by any transaction we also "logically"
+  * decrement ->b_count.
+  *
+  * We in fact take an additional increment on ->b_count as a convenience,
+  * because the caller usually wants to do additional things with the bh
+  * after calling here.
+  * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
+  * time.  Once the caller has run __brelse(), the buffer is eligible for
+  * reaping by try_to_free_buffers().
+  *
+  * Requires journal_datalist_lock.
+  */
+ void __journal_remove_journal_head(struct buffer_head *bh)
+ {
+ 	struct journal_head *jh = bh2jh(bh);
+
+ 	assert_spin_locked(&journal_datalist_lock);
+ 	J_ASSERT_JH(jh, jh->b_jcount >= 0);
+ 	atomic_inc(&bh->b_count);	/* the "additional increment" handed to the caller */
+ 	if (jh->b_jcount == 0) {
+ 		if (jh->b_transaction == NULL &&
+ 				jh->b_next_transaction == NULL &&
+ 				jh->b_cp_transaction == NULL) {
+ 			J_ASSERT_BH(bh, buffer_jbd(bh));
+ 			J_ASSERT_BH(bh, jh2bh(jh) == bh);
+ 			BUFFER_TRACE(bh, "remove journal_head");
+ 			spin_lock(&jh_splice_lock);
+ 			bh->b_private = NULL;
+ 			jh->b_bh = NULL;	/* debug, really */
+ 			clear_bit(BH_JBD, &bh->b_state);
+ 			__brelse(bh);	/* the "logical" ->b_count decrement */
+ 			spin_unlock(&jh_splice_lock);
+ 			journal_free_journal_head(jh);
+ 		} else {
+ 			BUFFER_TRACE(bh, "journal_head was locked");
+ 		}
+ 	}
+ }
+ /*
+  * Drop one b_jcount reference on a journal_head.  When the count reaches
+  * zero and the journal_head has no b_transaction, try to detach it from
+  * its buffer_head; the __brelse() here balances the extra ->b_count
+  * reference that __journal_remove_journal_head() always takes.
+  */
+ void journal_unlock_journal_head(struct journal_head *jh)
+ {
+ 	spin_lock(&journal_datalist_lock);
+ 	J_ASSERT_JH(jh, jh->b_jcount > 0);
+ 	--jh->b_jcount;
+ 	if (!jh->b_jcount && !jh->b_transaction) {
+ 		struct buffer_head *bh;
+ 		bh = jh2bh(jh);
+ 		__journal_remove_journal_head(bh);
+ 		__brelse(bh);
+ 	}
+ 	
+ 	spin_unlock(&journal_datalist_lock);
+ }
+ /*
+  * Locked wrapper: takes journal_datalist_lock around
+  * __journal_remove_journal_head().  No __brelse() is done here, so the
+  * extra ->b_count reference is left for the caller to drop.
+  */
+ void journal_remove_journal_head(struct buffer_head *bh)
+ {
+ 	spin_lock(&journal_datalist_lock);
+ 	__journal_remove_journal_head(bh);
+ 	spin_unlock(&journal_datalist_lock);
+ }
+
+ /*
+  * Module startup and shutdown
+  */
+ /*
+  * Create the revoke caches and then the journal_head cache.  Returns 0,
+  * or the error from the first step that failed (later steps are skipped).
+  */
+ static int __init journal_init_caches(void)
+ {
+ 	int ret;
+
+ 	ret = journal_init_revoke_caches();
+ 	if (ret == 0)
+ 		ret = journal_init_journal_head_cache();
+ 	return ret;
+ }
+ /*
+  * Tear down the revoke caches and the journal_head cache.
+  * NOTE(review): journal_init() calls this even when only some (or none)
+  * of the caches were created, so the destroy helpers have to cope with
+  * caches that do not exist.
+  */
+ static void journal_destroy_caches(void)
+ {
+ 	journal_destroy_revoke_caches();
+ 	journal_destroy_journal_head_cache();
+ }
+ /*
+  * Module entry point: announce the driver and set up the slab caches.
+  * On failure the caches are torn down again and the error is returned.
+  */
+ static int __init journal_init(void)
+ {
+ 	int ret;
+
+ 	printk(KERN_INFO "Journalled Block Device driver loaded\n");
+ 	ret = journal_init_caches();
+ 	if (ret != 0)
+ 		journal_destroy_caches();
+ 	return ret;
+ }
+ /*
+  * Module exit: with CONFIG_JBD_DEBUG, complain if any journal_heads are
+  * still accounted for, then destroy the caches.
+  */
+ static void __exit journal_exit(void)
+ {
+ #ifdef CONFIG_JBD_DEBUG
+ 	int n = atomic_read(&nr_journal_heads);
+ 	if (n)
+ 		printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
+ #endif
+ 	journal_destroy_caches();
+ }
+
+ MODULE_LICENSE("GPL");
+ module_init(journal_init);
+ module_exit(journal_exit);
+
diff -rc2P linux/fs/jbd/recovery.c linux-2.4.13/fs/jbd/recovery.c
*** linux/fs/jbd/recovery.c Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/recovery.c Fri Nov 9 16:57:59 2001
***************
*** 0 ****
--- 1,586 ----
+ /*
+  * linux/fs/jbd/recovery.c
+  *
+  * Written by Stephen C. Tweedie , 1999
+  *
+  * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal recovery routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.
+  */
+
+ #ifndef __KERNEL__
+ #include "jfs_user.h"
+ #else	/* NOTE(review): the header names of the #include lines below are missing in this copy of the patch */
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #endif
+
+ /*
+  * Maintain information about the progress of the recovery job, so that
+  * the different passes can carry information between them.
+  */
+ struct recovery_info
+ {
+ 	tid_t		start_transaction;	/* superblock sequence, set by PASS_SCAN */
+ 	tid_t		end_transaction;	/* where PASS_SCAN stopped; later passes stop here */
+
+ 	int		nr_replays;		/* blocks copied back during PASS_REPLAY */
+ 	int		nr_revokes;		/* revoke entries seen by scan_revoke_records() */
+ 	int		nr_revoke_hits;		/* replays skipped because the block was revoked */
+ };
+
+ enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
+ static int do_one_pass(journal_t *journal,
+ 				struct recovery_info *info, enum passtype pass);
+ static int scan_revoke_records(journal_t *, struct buffer_head *,
+ 				tid_t, struct recovery_info *);
+
+ #ifdef __KERNEL__
+
+ /* Release readahead buffers after use: brelse() b[n-1] down to b[0]. */
+ void journal_brelse_array(struct buffer_head *b[], int n)
+ {
+ 	while (--n >= 0)
+ 		brelse (b[n]);
+ }
+
+
+ /*
+  * When reading from the journal, we are going through the block device
+  * layer directly and so there is no readahead being done for us.  We
+  * need to implement any readahead ourselves if we want it to happen at
+  * all.  Recovery is basically one long sequential read, so make sure we
+  * do the IO in reasonably large chunks.
+  *
+  * This is not so critical that we need to be enormously clever about
+  * the readahead size, though.  128K is a purely arbitrary, good-enough
+  * fixed value.
+  *
+  * do_readahead() starts reads for log blocks [start, start + 128K worth),
+  * clipped to journal->j_maxlen.  It returns 0 on success, -EIO if
+  * journal_bmap() yields no block for an offset, or -ENOMEM if getblk()
+  * fails.  It does not wait for the reads to complete.
+  */
+
+ #define MAXBUF 8
+ static int do_readahead(journal_t *journal, unsigned int start)
+ {
+ 	int err;
+ 	unsigned int max, nbufs, next, blocknr;
+ 	struct buffer_head *bh;
+
+ 	struct buffer_head * bufs[MAXBUF];	/* batch handed to ll_rw_block() */
+
+ 	/* Do up to 128K of readahead */
+ 	max = start + (128 * 1024 / journal->j_blocksize);
+ 	if (max > journal->j_maxlen)
+ 		max = journal->j_maxlen;
+
+ 	/* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
+ 	 * a time to the block device IO layer.
*/
+
+ 	nbufs = 0;
+
+ 	for (next = start; next < max; next++) {
+ 		blocknr = journal_bmap(journal, next);
+
+ 		if (!blocknr) {
+ 			printk (KERN_ERR "JBD: bad block at offset %u\n",
+ 				next);
+ 			err = -EIO;
+ 			goto failed;
+ 		}
+
+ 		bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 		if (!bh) {
+ 			err = -ENOMEM;
+ 			goto failed;
+ 		}
+
+ 		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
+ 			bufs[nbufs++] = bh;
+ 			if (nbufs == MAXBUF) {	/* batch full: submit and start over */
+ 				ll_rw_block(READ, nbufs, bufs);
+ 				journal_brelse_array(bufs, nbufs);
+ 				nbufs = 0;
+ 			}
+ 		} else	/* already up to date, or locked: nothing to submit */
+ 			brelse(bh);
+ 	}
+
+ 	if (nbufs)
+ 		ll_rw_block(READ, nbufs, bufs);
+ 	err = 0;
+
+ failed:
+ 	if (nbufs)	/* drop the references of a partial batch (success path too) */
+ 		journal_brelse_array(bufs, nbufs);
+ 	return err;
+ }
+
+ #endif /* __KERNEL__ */
+
+
+ /*
+  * Read a block from the journal
+  *
+  * Maps log offset `offset' through journal_bmap() and gets the buffer.
+  * If the buffer is not up to date, readahead is started (only when no
+  * read has been requested for it yet) and the buffer is waited for.
+  *
+  * On success returns 0 and stores the buffer in *bhp; the reference is
+  * passed to the caller.  On failure *bhp is NULL and -EIO or -ENOMEM is
+  * returned.  The return value of do_readahead() is not checked; a failed
+  * read shows up as a buffer that is still not up to date.
+  */
+
+ static int jread(struct buffer_head **bhp, journal_t *journal,
+ 		 unsigned int offset)
+ {
+ 	unsigned int blocknr;
+ 	struct buffer_head *bh;
+
+ 	*bhp = NULL;
+
+ 	J_ASSERT (offset < journal->j_maxlen);
+
+ 	blocknr = journal_bmap(journal, offset);
+
+ 	if (!blocknr) {
+ 		printk (KERN_ERR "JBD: bad block at offset %u\n",
+ 			offset);
+ 		return -EIO;
+ 	}
+
+ 	bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 	if (!bh)
+ 		return -ENOMEM;
+
+ 	if (!buffer_uptodate(bh)) {
+ 		/* If this is a brand new buffer, start readahead.
+                    Otherwise, we assume we are already reading it.  */
+ 		if (!buffer_req(bh))
+ 			do_readahead(journal, offset);
+ 		wait_on_buffer(bh);
+ 	}
+
+ 	if (!buffer_uptodate(bh)) {
+ 		printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+ 			offset);
+ 		brelse(bh);
+ 		return -EIO;
+ 	}
+
+ 	*bhp = bh;
+ 	return 0;
+ }
+
+
+ /*
+  * Count the number of in-use tags in a journal descriptor block.
+  */
+
+ static int count_tags(struct buffer_head *bh, int size)
+ {
+ 	char *			tagp;
+ 	journal_block_tag_t *	tag;
+ 	int			nr = 0;
+
+ 	tagp = &bh->b_data[sizeof(journal_header_t)];	/* tags start right after the header */
+
+ 	while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) {
+ 		tag = (journal_block_tag_t *) tagp;
+
+ 		nr++;
+ 		tagp += sizeof(journal_block_tag_t);
+ 		if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID)))
+ 			tagp += 16;	/* tag is followed by 16 more bytes (presumably a UUID) */
+
+ 		if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG))
+ 			break;
+ 	}
+
+ 	return nr;
+ }
+
+
+ /* Make sure we wrap around the log correctly!  Once var has reached
+  * j_last it is pulled back by the size of the log area (j_last - j_first).
+  * Note that var is evaluated more than once. */
+ #define wrap(journal, var)						\
+ do {									\
+ 	if (var >= (journal)->j_last)					\
+ 		var -= ((journal)->j_last - (journal)->j_first);	\
+ } while (0)
+
+ /*
+  * journal_recover
+  *
+  * The primary function for recovering the log contents when mounting a
+  * journaled device.
+  *
+  * Recovery is done in three passes.  In the first pass, we look for the
+  * end of the log.  In the second, we assemble the list of revoke
+  * blocks.  In the third and final pass, we replay any un-revoked blocks
+  * in the log.
+  *
+  * Returns 0 if no recovery was needed or every pass succeeded, otherwise
+  * the error of the first pass that failed (later passes are skipped).
+  */
+
+ int journal_recover(journal_t *journal)
+ {
+ 	int			err;
+ 	journal_superblock_t *	sb;
+
+ 	struct recovery_info	info;
+
+ 	memset(&info, 0, sizeof(info));
+ 	sb = journal->j_superblock;
+
+ 	/*
+ 	 * The journal superblock's s_start field (the current log head)
+ 	 * is always zero if, and only if, the journal was cleanly
+ 	 * unmounted.
+ 	 */
+
+ 	if (!sb->s_start) {
+ 		jbd_debug(1, "No recovery required, last transaction %d\n",
+ 			  ntohl(sb->s_sequence));
+ 		journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1;
+ 		return 0;
+ 	}
+
+
+ 	err = do_one_pass(journal, &info, PASS_SCAN);
+ 	if (!err)
+ 		err = do_one_pass(journal, &info, PASS_REVOKE);
+ 	if (!err)
+ 		err = do_one_pass(journal, &info, PASS_REPLAY);
+
+ 	jbd_debug(0, "JBD: recovery, exit status %d, "
+ 		  "recovered transactions %u to %u\n",
+ 		  err, info.start_transaction, info.end_transaction);
+ 	jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n",
+ 		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+
+ 	/* Restart the log at the next transaction ID, thus invalidating
+ 	 * any existing commit records in the log. */
+ 	journal->j_transaction_sequence = ++info.end_transaction;
+
+ 	journal_clear_revoke(journal);
+ 	fsync_no_super(journal->j_fs_dev);
+ 	return err;
+ }
+
+ /*
+  * journal_skip_recovery
+  *
+  * Locate any valid recovery information from the journal and set up the
+  * journal structures in memory to ignore it (presumably because the
+  * caller has evidence that it is out of date).
+  *
+  * We perform one pass over the journal to allow us to tell the user how
+  * much recovery information is being erased, and to let us initialise
+  * the journal transaction sequence numbers to the next unused ID.
+  *
+  * Returns the result of the scan pass.  If the scan failed, the
+  * journal's transaction sequence is simply bumped by one.  j_tail is
+  * reset to 0 in either case.
+  */
+
+ int journal_skip_recovery(journal_t *journal)
+ {
+ 	int			err;
+ 	journal_superblock_t *	sb;
+
+ 	struct recovery_info	info;
+
+ 	memset (&info, 0, sizeof(info));
+ 	sb = journal->j_superblock;
+
+ 	err = do_one_pass(journal, &info, PASS_SCAN);
+
+ 	if (err) {
+ 		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+ 		++journal->j_transaction_sequence;
+ 	} else {
+ #ifdef CONFIG_JBD_DEBUG
+ 		int dropped = info.end_transaction - ntohl(sb->s_sequence);
+ #endif
+
+ 		jbd_debug(0,
+ 			  "JBD: ignoring %d transaction%s from the journal.\n",
+ 			  dropped, (dropped == 1) ? "" : "s");
+ 		journal->j_transaction_sequence = ++info.end_transaction;
+ 	}
+
+ 	journal->j_tail = 0;
+
+ 	return err;
+ }
+ /*
+  * Walk the log once, starting from the superblock's s_start / s_sequence.
+  * What is done with each block depends on `pass':
+  *
+  *   PASS_SCAN    find where the log ends; fills in info->start_transaction
+  *                and info->end_transaction
+  *   PASS_REVOKE  hand revoke blocks to scan_revoke_records()
+  *   PASS_REPLAY  copy logged blocks that have not been revoked back to
+  *                the filesystem device
+  *
+  * Returns 0 on success or a negative errno.
+  */
+ static int do_one_pass(journal_t *journal,
+ 			struct recovery_info *info, enum passtype pass)
+ {
+
+ 	unsigned int		first_commit_ID, next_commit_ID;
+ 	unsigned long		next_log_block;
+ 	int			err, success = 0;
+ 	journal_superblock_t *	sb;
+ 	journal_header_t * 	tmp;
+ 	struct buffer_head *	bh;
+ 	unsigned int		sequence;
+ 	int			blocktype;
+
+ 	/* Precompute the maximum metadata descriptors in a descriptor block */
+ 	int			MAX_BLOCKS_PER_DESC;	/* NOTE(review): computed but not referenced again in this function */
+ 	MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t))
+ 			       / sizeof(journal_block_tag_t));
+
+ 	/*
+ 	 * First thing is to establish what we expect to find in the log
+ 	 * (in terms of transaction IDs), and where (in terms of log
+ 	 * block offsets): query the superblock.
+ 	 */
+
+ 	sb = journal->j_superblock;
+ 	next_commit_ID = ntohl(sb->s_sequence);
+ 	next_log_block = ntohl(sb->s_start);
+
+ 	first_commit_ID = next_commit_ID;
+ 	if (pass == PASS_SCAN)
+ 		info->start_transaction = first_commit_ID;
+
+ 	jbd_debug(1, "Starting recovery pass %d\n", pass);
+
+ 	/*
+ 	 * Now we walk through the log, transaction by transaction,
+ 	 * making sure that each transaction has a commit block in the
+ 	 * expected place.  Each complete transaction gets replayed back
+ 	 * into the main filesystem.
+ 	 */
+
+ 	while (1) {
+ 		int			flags;
+ 		char *			tagp;
+ 		journal_block_tag_t *	tag;
+ 		struct buffer_head *	obh;	/* block as read from the log */
+ 		struct buffer_head *	nbh;	/* block on the filesystem device */
+
+ 		/* If we already know where to stop the log traversal,
+ 		 * check right now that we haven't gone past the end of
+ 		 * the log. */
+
+ 		if (pass != PASS_SCAN)
+ 			if (tid_geq(next_commit_ID, info->end_transaction))
+ 				break;
+
+ 		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ 			  next_commit_ID, next_log_block, journal->j_last);
+
+ 		/* Skip over each chunk of the transaction looking
+ 		 * either the next descriptor block or the final commit
+ 		 * record.
*/
+
+ 		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+ 		err = jread(&bh, journal, next_log_block);
+ 		if (err)
+ 			goto failed;
+
+ 		next_log_block++;
+ 		wrap(journal, next_log_block);
+
+ 		/* What kind of buffer is it?
+ 		 *
+ 		 * If it is a descriptor block, check that it has the
+ 		 * expected sequence number.  Otherwise, we're all done
+ 		 * here. */
+
+ 		tmp = (journal_header_t *)bh->b_data;
+
+ 		if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) {
+ 			brelse(bh);
+ 			break;	/* no journal header here: end of the log */
+ 		}
+
+ 		blocktype = ntohl(tmp->h_blocktype);
+ 		sequence = ntohl(tmp->h_sequence);
+ 		jbd_debug(3, "Found magic %d, sequence %d\n",
+ 			  blocktype, sequence);
+
+ 		if (sequence != next_commit_ID) {
+ 			brelse(bh);
+ 			break;	/* not part of the transaction we expect: end of the log */
+ 		}
+
+ 		/* OK, we have a valid descriptor block which matches
+ 		 * all of the sequence number checks.  What are we going
+ 		 * to do with it?  That depends on the pass... */
+
+ 		switch(blocktype) {
+ 		case JFS_DESCRIPTOR_BLOCK:
+ 			/* If it is a valid descriptor block, replay it
+ 			 * in pass REPLAY; otherwise, just skip over the
+ 			 * blocks it describes. */
+ 			if (pass != PASS_REPLAY) {
+ 				next_log_block +=	/* one log block per tag */
+ 					count_tags(bh, journal->j_blocksize);
+ 				wrap(journal, next_log_block);
+ 				brelse(bh);
+ 				continue;
+ 			}
+
+ 			/* A descriptor block: we can now write all of
+ 			 * the data blocks.  Yay, useful work is finally
+ 			 * getting done here! */
+
+ 			tagp = &bh->b_data[sizeof(journal_header_t)];
+ 			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
+ 			       <= journal->j_blocksize) {
+ 				unsigned long io_block;	/* log offset of the data for this tag */
+
+ 				tag = (journal_block_tag_t *) tagp;
+ 				flags = ntohl(tag->t_flags);
+
+ 				io_block = next_log_block++;
+ 				wrap(journal, next_log_block);
+ 				err = jread(&obh, journal, io_block);
+ 				if (err) {
+ 					/* Recover what we can, but
+ 					 * report failure at the end.
*/
+ 					success = err;
+ 					printk (KERN_ERR
+ 						"JBD: IO error %d recovering "
+ 						"block %ld in log\n",
+ 						err, io_block);
+ 				} else {
+ 					unsigned long blocknr;	/* home location on j_fs_dev */
+
+ 					J_ASSERT(obh != NULL);
+ 					blocknr = ntohl(tag->t_blocknr);
+
+ 					/* If the block has been
+ 					 * revoked, then we're all done
+ 					 * here. */
+ 					if (journal_test_revoke
+ 					    (journal, blocknr,
+ 					     next_commit_ID)) {
+ 						brelse(obh);
+ 						++info->nr_revoke_hits;
+ 						goto skip_write;
+ 					}
+
+ 					/* Find a buffer for the new
+ 					 * data being restored */
+ 					nbh = getblk(journal->j_fs_dev, blocknr,
+ 						     journal->j_blocksize);
+ 					if (nbh == NULL) {
+ 						printk(KERN_ERR
+ 						       "JBD: Out of memory "
+ 						       "during recovery.\n");
+ 						err = -ENOMEM;
+ 						brelse(bh);
+ 						brelse(obh);
+ 						goto failed;
+ 					}
+
+ 					memcpy(nbh->b_data, obh->b_data,
+ 							journal->j_blocksize);
+ 					/*
+ 					 * For an escaped block, put the magic
+ 					 * number back into the first word of
+ 					 * the block being restored.  This has
+ 					 * to hit nbh: bh is the descriptor
+ 					 * block, whose first word already is
+ 					 * the magic number, so patching bh
+ 					 * would leave the replayed data
+ 					 * unrestored.
+ 					 */
+ 					if (flags & JFS_FLAG_ESCAPE) {
+ 						*((unsigned int *)nbh->b_data) =
+ 							htonl(JFS_MAGIC_NUMBER);
+ 					}
+
+ 					BUFFER_TRACE(nbh, "marking dirty");
+ 					mark_buffer_dirty(nbh);
+ 					BUFFER_TRACE(nbh, "marking uptodate");
+ 					mark_buffer_uptodate(nbh, 1);
+ 					++info->nr_replays;
+ 					/* ll_rw_block(WRITE, 1, &nbh); */
+ 					brelse(obh);
+ 					brelse(nbh);
+ 				}
+
+ 			skip_write:
+ 				/* Step to the next tag; a tag without
+ 				 * SAME_UUID is followed by 16 extra bytes. */
+ 				tagp += sizeof(journal_block_tag_t);
+ 				if (!(flags & JFS_FLAG_SAME_UUID))
+ 					tagp += 16;
+
+ 				if (flags & JFS_FLAG_LAST_TAG)
+ 					break;
+ 			}
+
+ 			brelse(bh);
+ 			continue;
+
+ 		case JFS_COMMIT_BLOCK:
+ 			/* Found an expected commit block: not much to
+ 			 * do other than move on to the next sequence
+ 			 * number. */
+ 			brelse(bh);
+ 			next_commit_ID++;
+ 			continue;
+
+ 		case JFS_REVOKE_BLOCK:
+ 			/* If we aren't in the REVOKE pass, then we can
+ 			 * just skip over this block.
*/
+ 			if (pass != PASS_REVOKE) {
+ 				brelse(bh);
+ 				continue;
+ 			}
+
+ 			err = scan_revoke_records(journal, bh,
+ 						  next_commit_ID, info);
+ 			brelse(bh);
+ 			if (err)
+ 				goto failed;
+ 			continue;
+
+ 		default:
+ 			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+ 				  blocktype);
+ 			/* Every other way out of the loop releases bh;
+ 			 * do the same here so the buffer is not leaked. */
+ 			brelse(bh);
+ 			goto done;
+ 		}
+ 	}
+
+  done:
+ 	/*
+ 	 * We broke out of the log scan loop: either we came to the
+ 	 * known end of the log or we found an unexpected block in the
+ 	 * log.  If the latter happened, then we know that the "current"
+ 	 * transaction marks the end of the valid log.
+ 	 */
+
+ 	if (pass == PASS_SCAN)
+ 		info->end_transaction = next_commit_ID;
+ 	else {
+ 		/* It's really bad news if different passes end up at
+ 		 * different places (but possible due to IO errors). */
+ 		if (info->end_transaction != next_commit_ID) {
+ 			printk (KERN_ERR "JBD: recovery pass %d ended at "
+ 				"transaction %u, expected %u\n",
+ 				pass, next_commit_ID, info->end_transaction);
+ 			if (!success)
+ 				success = -EIO;
+ 		}
+ 	}
+
+ 	return success;
+
+  failed:
+ 	return err;
+ }
+
+
+ /*
+  * Scan a revoke record, marking all blocks mentioned as revoked.
+  *
+  * bh holds one revoke block: a journal_revoke_header_t followed by 4-byte
+  * big-endian block numbers.  header->r_count is the number of bytes in
+  * use, header included.  Each block number is registered with
+  * journal_set_revoke() under transaction `sequence' and counted in
+  * info->nr_revokes.
+  *
+  * Returns 0 on success, the error from journal_set_revoke(), or -EIO if
+  * r_count claims more bytes than a journal block holds.  The writer never
+  * produces such a count (it flushes a revoke block once the offset
+  * reaches j_blocksize), so this only rejects a corrupt block instead of
+  * reading past the end of the buffer.
+  */
+
+ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
+ 			       tid_t sequence, struct recovery_info *info)
+ {
+ 	journal_revoke_header_t *header;
+ 	int offset, max;
+
+ 	header = (journal_revoke_header_t *) bh->b_data;
+ 	offset = sizeof(journal_revoke_header_t);
+ 	max = ntohl(header->r_count);
+
+ 	if (max > journal->j_blocksize) {
+ 		printk(KERN_ERR "JBD: bad revoke block, r_count %d\n", max);
+ 		return -EIO;
+ 	}
+
+ 	while (offset < max) {
+ 		unsigned long blocknr;
+ 		int err;
+
+ 		blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset)));
+ 		offset += 4;
+ 		err = journal_set_revoke(journal, blocknr, sequence);
+ 		if (err)
+ 			return err;
+ 		++info->nr_revokes;
+ 	}
+ 	return 0;
+ }
diff -rc2P linux/fs/jbd/revoke.c linux-2.4.13/fs/jbd/revoke.c
*** linux/fs/jbd/revoke.c Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/revoke.c Fri Nov 9 16:57:59 2001
***************
*** 0 ****
--- 1,631 ----
+ /*
+  * linux/fs/revoke.c
+  *
+  * Written by Stephen C.
Tweedie , 2000 + * + * Copyright 2000 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Journal revoke routines for the generic filesystem journaling code; + * part of the ext2fs journaling system. + * + * Revoke is the mechanism used to prevent old log records for deleted + * metadata from being replayed on top of newer data using the same + * blocks. The revoke mechanism is used in two separate places: + * + * + Commit: during commit we write the entire list of the current + * transaction's revoked blocks to the journal + * + * + Recovery: during recovery we record the transaction ID of all + * revoked blocks. If there are multiple revoke records in the log + * for a single block, only the last one counts, and if there is a log + * entry for a block beyond the last revoke, then that log entry still + * gets replayed. + * + * We can get interactions between revokes and new log data within a + * single transaction: + * + * Block is revoked and then journaled: + * The desired end result is the journaling of the new block, so we + * cancel the revoke before the transaction commits. + * + * Block is journaled and then revoked: + * The revoke must take precedence over the write of the block, so we + * need either to cancel the journal entry or to write the revoke + * later in the log than the log block. In this case, we choose the + * latter: journaling a block cancels any revoke record for that block + * in the current transaction, so any revoke for that block in the + * transaction must have happened after the block was journaled and so + * the revoke must take precedence. + * + * Block is revoked and then written as data: + * The data write is allowed to succeed, but the revoke is _not_ + * cancelled. 
We still need to prevent old log records from
+  * overwriting the new data.  We don't even need to clear the revoke
+  * bit here.
+  *
+  * Revoke information on buffers is a tri-state value:
+  *
+  * RevokeValid clear:	no cached revoke status, need to look it up
+  * RevokeValid set, Revoked clear:
+  *			buffer has not been revoked, and cancel_revoke
+  *			need do nothing.
+  * RevokeValid set, Revoked set:
+  *			buffer has been revoked.
+  */
+
+ #ifndef __KERNEL__
+ #include "jfs_user.h"
+ #else	/* NOTE(review): the header names of the #include lines below are missing in this copy of the patch */
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #endif
+
+ static kmem_cache_t *revoke_record_cache;	/* slab for struct jbd_revoke_record_s */
+ static kmem_cache_t *revoke_table_cache;	/* slab for struct jbd_revoke_table_s */
+
+ /* Each revoke record represents one single revoked block.  During
+    journal replay, this involves recording the transaction ID of the
+    last transaction to revoke this block. */
+
+ struct jbd_revoke_record_s
+ {
+ 	struct list_head  hash;		/* hash chain link; must stay first: find_revoke_record() and journal_write_revoke_records() cast list pointers straight to records */
+ 	tid_t		  sequence;	/* Used for recovery only */
+ 	unsigned long	  blocknr;	/* the revoked block */
+ };
+
+
+ /* The revoke table is just a simple hash table of revoke records. */
+ struct jbd_revoke_table_s
+ {
+ 	/* It is conceivable that we might want a larger hash table
+ 	 * for recovery.  Must be a power of two.
*/
+ 	int		  hash_size;	/* number of buckets */
+ 	int		  hash_shift;	/* log2(hash_size), see journal_init_revoke() */
+ 	struct list_head *hash_table;	/* hash_size list heads */
+ };
+
+
+ #ifdef __KERNEL__
+ static void write_one_revoke_record(journal_t *, transaction_t *,
+ 				    struct journal_head **, int *,
+ 				    struct jbd_revoke_record_s *);
+ static void flush_descriptor(journal_t *, struct journal_head *, int);
+ #endif
+
+ /* Utility functions to maintain the revoke table */
+
+ /* Borrowed from buffer.c: this is a tried and tested block hash function
+  *
+  * Maps a block number to a bucket index in [0, hash_size).
+  *
+  * NOTE(review): the left shifts are by (hash_shift - 6) and
+  * (hash_shift - 12).  For a table with fewer than 4096 buckets at least
+  * one of these counts is negative, which C leaves undefined -- confirm
+  * the table sizes callers actually pass to journal_init_revoke(). */
+ static inline int hash(journal_t *journal, unsigned long block)
+ {
+ 	struct jbd_revoke_table_s *table = journal->j_revoke;
+ 	int hash_shift = table->hash_shift;
+
+ 	return ((block << (hash_shift - 6)) ^
+ 		(block >> 13) ^
+ 		(block << (hash_shift - 12))) & (table->hash_size - 1);
+ }
+ /*
+  * Allocate a revoke record for (blocknr, seq) and add it to the head of
+  * its hash chain.  No check is made for an existing record for blocknr.
+  *
+  * Returns 0 on success.  On allocation failure it returns -ENOMEM when
+  * journal_oom_retry is zero; otherwise it yields the CPU and retries
+  * until the allocation succeeds.
+  */
+ int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
+ {
+ 	struct list_head *hash_list;
+ 	struct jbd_revoke_record_s *record;
+
+ repeat:
+ 	record = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
+ 	if (!record)
+ 		goto oom;
+
+ 	record->sequence = seq;
+ 	record->blocknr = blocknr;
+ 	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+ 	list_add(&record->hash, hash_list);
+ 	return 0;
+
+ oom:
+ 	if (!journal_oom_retry)
+ 		return -ENOMEM;
+ 	jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
+ 	current->policy |= SCHED_YIELD;
+ 	schedule();
+ 	goto repeat;
+ }
+
+ /* Find a revoke record in the journal's hash table.
*/
+
+ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
+ 						      unsigned long blocknr)
+ {
+ 	struct list_head *hash_list;
+ 	struct list_head *pos;
+ 	struct jbd_revoke_record_s *record;
+
+ 	hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+
+ 	/*
+ 	 * Walk the chain with list_entry() rather than casting the
+ 	 * list_head pointer, so the lookup does not depend on `hash'
+ 	 * being the first member of the record.
+ 	 */
+ 	for (pos = hash_list->next; pos != hash_list; pos = pos->next) {
+ 		record = list_entry(pos, struct jbd_revoke_record_s, hash);
+ 		if (record->blocknr == blocknr)
+ 			return record;
+ 	}
+ 	return NULL;
+ }
+
+ /*
+  * Create the two slab caches used by the revoke code.  Returns 0 on
+  * success.  On failure returns -ENOMEM with neither cache left behind
+  * (both pointers are NULL).
+  */
+ int __init journal_init_revoke_caches(void)
+ {
+ 	revoke_record_cache = kmem_cache_create("revoke_record",
+ 					   sizeof(struct jbd_revoke_record_s),
+ 					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+ 	if (revoke_record_cache == 0)
+ 		return -ENOMEM;
+
+ 	revoke_table_cache = kmem_cache_create("revoke_table",
+ 					   sizeof(struct jbd_revoke_table_s),
+ 					   0, 0, NULL, NULL);
+ 	if (revoke_table_cache == 0) {
+ 		kmem_cache_destroy(revoke_record_cache);
+ 		revoke_record_cache = NULL;
+ 		return -ENOMEM;
+ 	}
+ 	return 0;
+ }
+
+ /*
+  * Destroy whichever of the revoke caches exist.
+  *
+  * journal_init() calls journal_destroy_caches(), and so this function,
+  * after journal_init_revoke_caches() has failed.  At that point both
+  * cache pointers are NULL, so each one is checked before it is handed
+  * to kmem_cache_destroy().
+  */
+ void journal_destroy_revoke_caches(void)
+ {
+ 	if (revoke_record_cache) {
+ 		kmem_cache_destroy(revoke_record_cache);
+ 		revoke_record_cache = NULL;
+ 	}
+ 	if (revoke_table_cache) {
+ 		kmem_cache_destroy(revoke_table_cache);
+ 		revoke_table_cache = NULL;
+ 	}
+ }
+
+ /* Initialise the revoke table for a given journal to a given size.
*/
+
+ int journal_init_revoke(journal_t *journal, int hash_size)
+ {
+ 	int shift, tmp;
+
+ 	J_ASSERT (journal->j_revoke == NULL);
+
+ 	journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+ 	if (!journal->j_revoke)
+ 		return -ENOMEM;
+
+ 	/* Check that the hash_size is a power of two */
+ 	J_ASSERT ((hash_size & (hash_size-1)) == 0);
+
+ 	journal->j_revoke->hash_size = hash_size;
+
+ 	/* hash_shift = log2(hash_size) */
+ 	shift = 0;
+ 	tmp = hash_size;
+ 	while((tmp >>= 1UL) != 0UL)
+ 		shift++;
+ 	journal->j_revoke->hash_shift = shift;
+
+ 	journal->j_revoke->hash_table =
+ 		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+ 	if (!journal->j_revoke->hash_table) {
+ 		kmem_cache_free(revoke_table_cache, journal->j_revoke);
+ 		journal->j_revoke = NULL;
+ 		return -ENOMEM;
+ 	}
+
+ 	for (tmp = 0; tmp < hash_size; tmp++)
+ 		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+
+ 	return 0;
+ }
+
+ /* Destroy a journal's revoke table.  The table must already be empty!
+  *
+  * Does nothing if the journal has no revoke table.  Otherwise every
+  * bucket is asserted to be empty, then the bucket array and the table
+  * are freed and journal->j_revoke is reset to NULL. */
+
+ void journal_destroy_revoke(journal_t *journal)
+ {
+ 	struct jbd_revoke_table_s *table;
+ 	struct list_head *hash_list;
+ 	int i;
+
+ 	table = journal->j_revoke;
+ 	if (!table)
+ 		return;
+
+ 	for (i = 0; i < table->hash_size; i++) {
+ 		hash_list = &table->hash_table[i];
+ 		J_ASSERT (list_empty(hash_list));
+ 	}
+
+ 	kfree(table->hash_table);
+ 	kmem_cache_free(revoke_table_cache, table);
+ 	journal->j_revoke = NULL;
+ }
+
+
+ #ifdef __KERNEL__
+
+ /*
+  * journal_revoke: revoke a given buffer_head from the journal.  This
+  * prevents the block from being replayed during recovery if we take a
+  * crash after this current transaction commits.  Any subsequent
+  * metadata writes of the buffer in this transaction cancel the
+  * revoke.
+  *
+  * Note that this call may block --- it is up to the caller to make
+  * sure that there are no further calls to journal_write_metadata
+  * before the revoke is complete.  In ext3, this implies calling the
+  * revoke before clearing the block bitmap when we are deleting
+  * metadata.
+  *
+  * Revoke performs a journal_forget on any buffer_head passed in as a
+  * parameter, but does _not_ forget the buffer_head if the bh was only
+  * found implicitly.
+  *
+  * bh_in may not be a journalled buffer - it may have come off
+  * the hash tables without an attached journal_head.
+  *
+  * If bh_in is non-zero, journal_revoke() will decrement its b_count
+  * by one.
+  */
+
+ int journal_revoke(handle_t *handle, unsigned long blocknr,
+ 		   struct buffer_head *bh_in)
+ {
+ 	struct buffer_head *bh = NULL;
+ 	journal_t *journal;
+ 	kdev_t dev;
+ 	int err;
+
+ 	if (bh_in)
+ 		BUFFER_TRACE(bh_in, "enter");
+
+ 	journal = handle->h_transaction->t_journal;
+ 	if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
+ 		J_ASSERT (!"Cannot set revoke feature!");
+ 		return -EINVAL;
+ 	}
+
+ 	dev = journal->j_fs_dev;
+ 	bh = bh_in;
+
+ 	if (!bh) {	/* no buffer supplied: see whether one is hashed for this block */
+ 		bh = get_hash_table(dev, blocknr, journal->j_blocksize);
+ 		if (bh)
+ 			BUFFER_TRACE(bh, "found on hash");
+ 	}
+ #ifdef JBD_EXPENSIVE_CHECKING
+ 	else {
+ 		struct buffer_head *bh2;
+
+ 		/* If there is a different buffer_head lying around in
+ 		 * memory anywhere... */
+ 		bh2 = get_hash_table(dev, blocknr, journal->j_blocksize);
+ 		if (bh2) {
+ 			/* ... and it has RevokeValid status... */
+ 			if ((bh2 != bh) &&
+ 			    test_bit(BH_RevokeValid, &bh2->b_state))
+ 				/* ...then it better be revoked too,
+ 				 * since it's illegal to create a revoke
+ 				 * record against a buffer_head which is
+ 				 * not marked revoked --- that would
+ 				 * risk missing a subsequent revoke
+ 				 * cancel. */
+ 				J_ASSERT_BH(bh2, test_bit(BH_Revoked, &
+ 				    bh2->b_state));
+ 			__brelse(bh2);
+ 		}
+ 	}
+ #endif
+
+ 	/* We really ought not ever to revoke twice in a row without
+            first having the revoke cancelled: it's illegal to free a
+            block twice without allocating it in between!
*/
+ 	if (bh) {
+ 		J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state));
+ 		set_bit(BH_Revoked, &bh->b_state);
+ 		set_bit(BH_RevokeValid, &bh->b_state);
+ 		if (bh_in) {
+ 			BUFFER_TRACE(bh_in, "call journal_forget");
+ 			journal_forget(handle, bh_in);
+ 		} else {
+ 			/* bh came from get_hash_table(): just drop
+ 			 * the reference taken by the lookup. */
+ 			BUFFER_TRACE(bh, "call brelse");
+ 			__brelse(bh);
+ 		}
+ 	}
+
+ 	lock_journal(journal);
+ 	jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+ 	err = insert_revoke_hash(journal, blocknr,
+ 				handle->h_transaction->t_tid);
+ 	unlock_journal(journal);
+ 	/* bh_in is optional; guard the trace exactly as on entry. */
+ 	if (bh_in)
+ 		BUFFER_TRACE(bh_in, "exit");
+ 	return err;
+ }
+
+ /*
+  * Cancel an outstanding revoke.  For use only internally by the
+  * journaling code (called from journal_get_write_access).
+  *
+  * We trust the BH_Revoked bit on the buffer if the buffer is already
+  * being journaled: if there is no revoke pending on the buffer, then we
+  * don't do anything here.
+  *
+  * This would break if it were possible for a buffer to be revoked and
+  * discarded, and then reallocated within the same transaction.  In such
+  * a case we would have lost the revoked bit, but when we arrived here
+  * the second time we would still have a pending revoke to cancel.  So,
+  * do not trust the Revoked bit on buffers unless RevokeValid is also
+  * set.
+  *
+  * The caller must have the journal locked.
+  *
+  * Returns 1 if a revoke record for the buffer's block was found and
+  * removed, 0 otherwise.
+  */
+ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+ {
+ 	struct jbd_revoke_record_s *record;
+ 	journal_t *journal = handle->h_transaction->t_journal;
+ 	int need_cancel;
+ 	int did_revoke = 0;	/* akpm: debug */
+ 	struct buffer_head *bh = jh2bh(jh);
+
+ 	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+
+ 	/* Is the existing Revoke bit valid?  If so, we trust it, and
+ 	 * only perform the full cancel if the revoke bit is set.  If
+ 	 * not, we can't trust the revoke bit, and we need to do the
+ 	 * full search for a revoke record.
*/
+ 	if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
+ 		need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
+ 	else {
+ 		need_cancel = 1;	/* no cached status: have to search the table */
+ 		clear_bit(BH_Revoked, &bh->b_state);
+ 	}
+
+ 	if (need_cancel) {
+ 		record = find_revoke_record(journal, bh->b_blocknr);
+ 		if (record) {
+ 			jbd_debug(4, "cancelled existing revoke on "
+ 				  "blocknr %lu\n", bh->b_blocknr);
+ 			list_del(&record->hash);
+ 			kmem_cache_free(revoke_record_cache, record);
+ 			did_revoke = 1;
+ 		}
+ 	}
+
+ #ifdef JBD_EXPENSIVE_CHECKING
+ 	/* There better not be one left behind by now! */
+ 	record = find_revoke_record(journal, bh->b_blocknr);
+ 	J_ASSERT_JH(jh, record == NULL);
+ #endif
+
+ 	/* Finally, have we just cleared revoke on an unhashed
+ 	 * buffer_head?  If so, we'd better make sure we clear the
+ 	 * revoked status on any hashed alias too, otherwise the revoke
+ 	 * state machine will get very upset later on. */
+ 	if (need_cancel && !bh->b_pprev) {
+ 		struct buffer_head *bh2;
+ 		bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+ 		if (bh2) {
+ 			clear_bit(BH_Revoked, &bh2->b_state);
+ 			__brelse(bh2);	/* drop the reference from the lookup */
+ 		}
+ 	}
+
+ 	return did_revoke;
+ }
+
+
+ /*
+  * Write revoke records to the journal for all entries in the current
+  * revoke hash, deleting the entries as we go.
+  *
+  * Called with the journal lock held.
+  */
+
+ void journal_write_revoke_records(journal_t *journal,
+ 				  transaction_t *transaction)
+ {
+ 	struct journal_head *descriptor;	/* revoke block being filled, if any */
+ 	struct jbd_revoke_record_s *record;
+ 	struct jbd_revoke_table_s *revoke;
+ 	struct list_head *hash_list;
+ 	int i, offset, count;	/* offset: next free byte in the descriptor */
+
+ 	descriptor = NULL;
+ 	offset = 0;
+ 	count = 0;
+ 	revoke = journal->j_revoke;
+
+ 	for (i = 0; i < revoke->hash_size; i++) {
+ 		hash_list = &revoke->hash_table[i];
+
+ 		while (!list_empty(hash_list)) {
+ 			record = (struct jbd_revoke_record_s *)	/* relies on `hash' being the first member */
+ 				hash_list->next;
+ 			write_one_revoke_record(journal, transaction,
+ 						&descriptor, &offset,
+ 						record);
+ 			count++;
+ 			list_del(&record->hash);
+ 			kmem_cache_free(revoke_record_cache, record);
+ 		}
+ 	}
+ 	if (descriptor)	/* write out the last, partially filled block */
+ 		flush_descriptor(journal, descriptor, offset);
+ 	jbd_debug(1, "Wrote %d revoke records\n", count);
+ }
+
+ /*
+  * Write out one revoke record.  We need to create a new descriptor
+  * block if the old one is full or if we have not already created one.
+  *
+  * *descriptorp and *offsetp carry the current revoke block and the next
+  * free byte offset in it from one call to the next; both are updated
+  * here.  The record itself is not freed by this function.
+  */
+
+ static void write_one_revoke_record(journal_t *journal,
+ 				    transaction_t *transaction,
+ 				    struct journal_head **descriptorp,
+ 				    int *offsetp,
+ 				    struct jbd_revoke_record_s *record)
+ {
+ 	struct journal_head *descriptor;
+ 	int offset;
+ 	journal_header_t *header;
+
+ 	/* If we are already aborting, this all becomes a noop.  We
+            still need to go round the loop in
+            journal_write_revoke_records in order to free all of the
+            revoke records: only the IO to the journal is omitted.
*/ + if (is_journal_aborted(journal)) + return; + + descriptor = *descriptorp; + offset = *offsetp; + + /* Make sure we have a descriptor with space left for the record */ + if (descriptor) { + if (offset == journal->j_blocksize) { + flush_descriptor(journal, descriptor, offset); + descriptor = NULL; + } + } + + if (!descriptor) { + descriptor = journal_get_descriptor_buffer(journal); + header = (journal_header_t *) &jh2bh(descriptor)->b_data[0]; + header->h_magic = htonl(JFS_MAGIC_NUMBER); + header->h_blocktype = htonl(JFS_REVOKE_BLOCK); + header->h_sequence = htonl(transaction->t_tid); + + /* Record it so that we can wait for IO completion later */ + JBUFFER_TRACE(descriptor, "file as BJ_LogCtl"); + journal_file_buffer(descriptor, transaction, BJ_LogCtl); + + offset = sizeof(journal_revoke_header_t); + *descriptorp = descriptor; + } + + * ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) = + htonl(record->blocknr); + offset += 4; + *offsetp = offset; + } + + /* + * Flush a revoke descriptor out to the journal. If we are aborting, + * this is a noop; otherwise we are generating a buffer which needs to + * be waited for during commit, so it has to go onto the appropriate + * journal buffer list. + */ + + static void flush_descriptor(journal_t *journal, + struct journal_head *descriptor, + int offset) + { + journal_revoke_header_t *header; + + if (is_journal_aborted(journal)) { + JBUFFER_TRACE(descriptor, "brelse"); + __brelse(jh2bh(descriptor)); + return; + } + + header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data; + header->r_count = htonl(offset); + set_bit(BH_JWrite, &jh2bh(descriptor)->b_state); + { + struct buffer_head *bh = jh2bh(descriptor); + BUFFER_TRACE(bh, "write"); + ll_rw_block (WRITE, 1, &bh); + } + } + + #endif + + /* + * Revoke support for recovery. 
+ * + * Recovery needs to be able to: + * + * record all revoke records, including the tid of the latest instance + * of each revoke in the journal + * + * check whether a given block in a given transaction should be replayed + * (ie. has not been revoked by a revoke record in that or a subsequent + * transaction) + * + * empty the revoke table after recovery. + */ + + /* + * First, setting revoke records. We create a new revoke record for + * every block ever revoked in the log as we scan it for recovery, and + * we update the existing records if we find multiple revokes for a + * single block. + */ + + int journal_set_revoke(journal_t *journal, + unsigned long blocknr, + tid_t sequence) + { + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (record) { + /* If we have multiple occurences, only record the + * latest sequence number in the hashed record */ + if (tid_gt(sequence, record->sequence)) + record->sequence = sequence; + return 0; + } + return insert_revoke_hash(journal, blocknr, sequence); + } + + /* + * Test revoke records. For a given block referenced in the log, has + * that block been revoked? A revoke record with a given transaction + * sequence number revokes all blocks in that transaction and earlier + * ones, but later transactions still need replayed. + */ + + int journal_test_revoke(journal_t *journal, + unsigned long blocknr, + tid_t sequence) + { + struct jbd_revoke_record_s *record; + + record = find_revoke_record(journal, blocknr); + if (!record) + return 0; + if (tid_gt(sequence, record->sequence)) + return 0; + return 1; + } + + /* + * Finally, once recovery is over, we need to clear the revoke table so + * that it can be reused by the running filesystem. 
+ */ + + void journal_clear_revoke(journal_t *journal) + { + int i; + struct list_head *hash_list; + struct jbd_revoke_record_s *record; + struct jbd_revoke_table_s *revoke; + + revoke = journal->j_revoke; + + for (i = 0; i < revoke->hash_size; i++) { + hash_list = &revoke->hash_table[i]; + while (!list_empty(hash_list)) { + record = (struct jbd_revoke_record_s*) hash_list->next; + list_del(&record->hash); + kmem_cache_free(revoke_record_cache, record); + } + } + } + diff -rc2P linux/fs/jbd/transaction.c linux-2.4.13/fs/jbd/transaction.c *** linux/fs/jbd/transaction.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd/transaction.c Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,2078 ---- + /* + * linux/fs/transaction.c + * + * Written by Stephen C. Tweedie , 1998 + * + * Copyright 1998 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Generic filesystem transaction handling code; part of the ext2fs + * journaling system. + * + * This file manages transactions (compound commits managed by the + * journaling code) and handles (individual atomic operations by the + * filesystem). + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include /* Uggh... needed for buffermem_pages */ + + + extern spinlock_t journal_datalist_lock; + + /* + * get_transaction: obtain a new transaction_t object. + * + * Simply allocate and initialise a new transaction. Create it in + * RUNNING state and add it to the current journal (which should not + * have an existing running transaction: we only make a new transaction + * once we have started to commit the old one). + * + * Preconditions: + * The journal MUST be locked. 
We don't perform atomic mallocs on the + * new transaction and we can't block without protecting against other + * processes trying to touch the journal while it is in transition. + */ + + static transaction_t * get_transaction (journal_t * journal, int is_try) + { + transaction_t * transaction; + + transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS); + if (!transaction) + return NULL; + + memset (transaction, 0, sizeof (transaction_t)); + + transaction->t_journal = journal; + transaction->t_state = T_RUNNING; + transaction->t_tid = journal->j_transaction_sequence++; + transaction->t_expires = jiffies + journal->j_commit_interval; + + /* Set up the commit timer for the new transaction. */ + J_ASSERT (!journal->j_commit_timer_active); + journal->j_commit_timer_active = 1; + journal->j_commit_timer->expires = transaction->t_expires; + add_timer(journal->j_commit_timer); + + J_ASSERT (journal->j_running_transaction == NULL); + journal->j_running_transaction = transaction; + + return transaction; + } + + /* + * Handle management. + * + * A handle_t is an object which represents a single atomic update to a + * filesystem, and which tracks all of the modifications which form part + * of that one update. + */ + + /* + * start_this_handle: Given a handle, deal with any locking or stalling + * needed to make sure that there is enough journal space for the handle + * to begin. Attach the handle to a transaction and set up the + * transaction's buffer credits. 
+  */
+ 
+ /*
+  * Takes and releases the journal lock itself, and may sleep on
+  * j_wait_transaction_locked or in log_wait_for_space() any number of
+  * times before succeeding.
+  *
+  * Returns 0 once the handle is attached to the running transaction, or
+  * -EROFS if the journal has aborted or carries an error that has not
+  * been acknowledged (JFS_ACK_ERR clear).
+  */
+ 
+ static int start_this_handle(journal_t *journal, handle_t *handle)
+ {
+ 	transaction_t *transaction;
+ 	int needed;
+ 	int nblocks = handle->h_buffer_credits;
+ 
+ 	jbd_debug(3, "New handle %p going live.\n", handle);
+ 
+ repeat:
+ 
+ 	lock_journal(journal);
+ 
+ 	if (is_journal_aborted(journal) ||
+ 	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
+ 		unlock_journal(journal);
+ 		return -EROFS;
+ 	}
+ 
+ 	/* Wait on the journal's transaction barrier if necessary */
+ 	if (journal->j_barrier_count) {
+ 		unlock_journal(journal);
+ 		sleep_on(&journal->j_wait_transaction_locked);
+ 		goto repeat;
+ 	}
+ 
+ repeat_locked:
+ 	if (!journal->j_running_transaction)
+ 		get_transaction(journal, 0);
+ 	/* @@@ Error? */
+ 	/* NB: a NULL return from get_transaction() is not handled; it
+ 	 * trips the assertion below. */
+ 	J_ASSERT(journal->j_running_transaction);
+ 
+ 	transaction = journal->j_running_transaction;
+ 
+ 	/* If the current transaction is locked down for commit, wait
+ 	 * for the lock to be released. */
+ 
+ 	if (transaction->t_state == T_LOCKED) {
+ 		unlock_journal(journal);
+ 		jbd_debug(3, "Handle %p stalling...\n", handle);
+ 		sleep_on(&journal->j_wait_transaction_locked);
+ 		goto repeat;
+ 	}
+ 
+ 	/* If there is not enough space left in the log to write all
+ 	 * potential buffers requested by this operation, we need to
+ 	 * stall pending a log checkpoint to free some more log
+ 	 * space. */
+ 
+ 	needed = transaction->t_outstanding_credits + nblocks;
+ 
+ 	if (needed > journal->j_max_transaction_buffers) {
+ 		/* If the current transaction is already too large, then
+ 		 * start to commit it: we can then go back and attach
+ 		 * this handle to a new transaction. */
+ 
+ 		jbd_debug(2, "Handle %p starting new commit...\n", handle);
+ 		log_start_commit(journal, transaction);
+ 		unlock_journal(journal);
+ 		sleep_on(&journal->j_wait_transaction_locked);
+ 		lock_journal(journal);
+ 		goto repeat_locked;
+ 	}
+ 
+ 	/*
+ 	 * The commit code assumes that it can get enough log space
+ 	 * without forcing a checkpoint.  This is *critical* for
+ 	 * correctness: a checkpoint of a buffer which is also
+ 	 * associated with a committing transaction creates a deadlock,
+ 	 * so commit simply cannot force through checkpoints.
+ 	 *
+ 	 * We must therefore ensure the necessary space in the journal
+ 	 * *before* starting to dirty potentially checkpointed buffers
+ 	 * in the new transaction.
+ 	 *
+ 	 * The worst part is, any transaction currently committing can
+ 	 * reduce the free space arbitrarily.  Be careful to account for
+ 	 * those buffers when checkpointing.
+ 	 */
+ 
+ 	/*
+ 	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
+ 	 * a _lot_ of headroom: 1/4 of the journal plus the size of
+ 	 * the committing transaction.  Really, we only need to give it
+ 	 * committing_transaction->t_outstanding_credits plus "enough" for
+ 	 * the log control blocks.
+ 	 * Also, this test is inconsistent with the matching one in
+ 	 * journal_extend().
+ 	 */
+ 	needed = journal->j_max_transaction_buffers;
+ 	if (journal->j_committing_transaction)
+ 		needed += journal->j_committing_transaction->
+ 					t_outstanding_credits;
+ 
+ 	if (log_space_left(journal) < needed) {
+ 		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
+ 		log_wait_for_space(journal, needed);
+ 		goto repeat_locked;
+ 	}
+ 
+ 	/* OK, account for the buffers that this operation expects to
+ 	 * use and add the handle to the running transaction. */
+ 
+ 	handle->h_transaction = transaction;
+ 	transaction->t_outstanding_credits += nblocks;
+ 	transaction->t_updates++;
+ 	transaction->t_handle_count++;
+ 	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+ 		  handle, nblocks, transaction->t_outstanding_credits,
+ 		  log_space_left(journal));
+ 
+ 	unlock_journal(journal);
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Obtain a new handle.
+  *
+  * We make sure that the transaction can guarantee at least nblocks of
+  * modified buffers in the log.  We block until the log can guarantee
+  * that much space.
+ * + * This function is visible to journal users (like ext2fs), so is not + * called with the journal already locked. + * + * Return a pointer to a newly allocated handle, or NULL on failure + */ + + handle_t *journal_start(journal_t *journal, int nblocks) + { + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + J_ASSERT(handle->h_transaction->t_journal == journal); + handle->h_ref++; + return handle; + } + + handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); + if (!handle) + return ERR_PTR(-ENOMEM); + memset (handle, 0, sizeof (handle_t)); + + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + current->journal_info = handle; + + err = start_this_handle(journal, handle); + if (err < 0) { + kfree(handle); + current->journal_info = NULL; + return ERR_PTR(err); + } + + return handle; + } + + /* + * Return zero on success + */ + static int try_start_this_handle(journal_t *journal, handle_t *handle) + { + transaction_t *transaction; + int needed; + int nblocks = handle->h_buffer_credits; + int ret = 0; + + jbd_debug(3, "New handle %p maybe going live.\n", handle); + + lock_journal(journal); + + if (is_journal_aborted(journal) || + (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) { + ret = -EROFS; + goto fail_unlock; + } + + if (journal->j_barrier_count) + goto fail_unlock; + + if (!journal->j_running_transaction && get_transaction(journal, 1) == 0) + goto fail_unlock; + + transaction = journal->j_running_transaction; + if (transaction->t_state == T_LOCKED) + goto fail_unlock; + + needed = transaction->t_outstanding_credits + nblocks; + /* We could run log_start_commit here */ + if (needed > journal->j_max_transaction_buffers) + goto fail_unlock; + + needed = journal->j_max_transaction_buffers; + if (journal->j_committing_transaction) + needed += journal->j_committing_transaction-> + t_outstanding_credits; + + if (log_space_left(journal) < needed) + goto fail_unlock; + + 
handle->h_transaction = transaction; + transaction->t_outstanding_credits += nblocks; + transaction->t_updates++; + jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", + handle, nblocks, transaction->t_outstanding_credits, + log_space_left(journal)); + unlock_journal(journal); + return 0; + + fail_unlock: + unlock_journal(journal); + if (ret >= 0) + ret = -1; + return ret; + } + + /* + * Try to start a handle, but non-blockingly. If we weren't able + * to, return an ERR_PTR value. + */ + handle_t *journal_try_start(journal_t *journal, int nblocks) + { + handle_t *handle = journal_current_handle(); + int err; + + if (!journal) + return ERR_PTR(-EROFS); + + if (handle) { + jbd_debug(4, "h_ref %d -> %d\n", + handle->h_ref, + handle->h_ref + 1); + J_ASSERT(handle->h_transaction->t_journal == journal); + if (is_handle_aborted(handle)) + return ERR_PTR(-EIO); + handle->h_ref++; + return handle; + } else { + jbd_debug(4, "no current transaction\n"); + } + + if (is_journal_aborted(journal)) + return ERR_PTR(-EIO); + + handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS); + if (!handle) + return ERR_PTR(-ENOMEM); + memset (handle, 0, sizeof (handle_t)); + + handle->h_buffer_credits = nblocks; + handle->h_ref = 1; + current->journal_info = handle; + + err = try_start_this_handle(journal, handle); + if (err < 0) { + kfree(handle); + current->journal_info = NULL; + return ERR_PTR(err); + } + + return handle; + } + + /* + * journal_extend: extend buffer credits. + * + * Some transactions, such as large extends and truncates, can be done + * atomically all at once or in several stages. The operation requests + * a credit for a number of buffer modications in advance, but can + * extend its credit if it needs more. + * + * journal_extend tries to give the running handle more buffer credits. + * It does not guarantee that allocation: this is a best-effort only. + * The calling process MUST be able to deal cleanly with a failure to + * extend here. 
+  *
+  * Return 0 on success, non-zero on failure.
+  *
+  * return code < 0 implies an error
+  * return code > 0 implies normal transaction-full status.
+  */
+ 
+ int journal_extend (handle_t *handle, int nblocks)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	int result;
+ 	int wanted;
+ 
+ 	lock_journal (journal);
+ 
+ 	/* An aborted handle is the only case reported as an error (< 0). */
+ 	result = -EIO;
+ 	if (is_handle_aborted(handle))
+ 		goto error_out;
+ 
+ 	/* From here on, any refusal is the "transaction full" status. */
+ 	result = 1;
+ 
+ 	/* Don't extend a locked-down transaction! */
+ 	if (handle->h_transaction->t_state != T_RUNNING) {
+ 		jbd_debug(3, "denied handle %p %d blocks: "
+ 			  "transaction not running\n", handle, nblocks);
+ 		goto error_out;
+ 	}
+ 
+ 	wanted = transaction->t_outstanding_credits + nblocks;
+ 
+ 	if (wanted > journal->j_max_transaction_buffers) {
+ 		jbd_debug(3, "denied handle %p %d blocks: "
+ 			  "transaction too large\n", handle, nblocks);
+ 		goto error_out;
+ 	}
+ 
+ 	if (wanted > log_space_left(journal)) {
+ 		jbd_debug(3, "denied handle %p %d blocks: "
+ 			  "insufficient log space\n", handle, nblocks);
+ 		goto error_out;
+ 	}
+ 
+ 	/* Granted: charge both the handle and its transaction. */
+ 	handle->h_buffer_credits += nblocks;
+ 	transaction->t_outstanding_credits += nblocks;
+ 	result = 0;
+ 
+ 	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+ 
+ error_out:
+ 	unlock_journal (journal);
+ 	return result;
+ }
+ 
+ 
+ /*
+  * journal_restart: restart a handle for a multi-transaction filesystem
+  * operation.
+  *
+  * If the journal_extend() call above fails to grant new buffer credits
+  * to a running handle, a call to journal_restart will commit the
+  * handle's transaction so far and reattach the handle to a new
+  * transaction capable of guaranteeing the requested number of
+  * credits.
+  *
+  * Returns 0 without doing anything if the handle has aborted, otherwise
+  * the result of start_this_handle().
+  */
+ 
+ int journal_restart(handle_t *handle, int nblocks)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	int ret;
+ 
+ 	/* If we've had an abort of any type, don't even think about
+ 	 * actually doing the restart! */
+ 	if (is_handle_aborted(handle))
+ 		return 0;
+ 
+ 	/* First unlink the handle from its current transaction, and
+ 	 * start the commit on that. */
+ 
+ 	J_ASSERT (transaction->t_updates > 0);
+ 	J_ASSERT (journal_current_handle() == handle);
+ 
+ 	/* Hand back whatever credits the handle still holds. */
+ 	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+ 	transaction->t_updates--;
+ 
+ 	/* Last update gone: wake anyone sleeping on j_wait_updates. */
+ 	if (!transaction->t_updates)
+ 		wake_up(&journal->j_wait_updates);
+ 
+ 	jbd_debug(2, "restarting handle %p\n", handle);
+ 	log_start_commit(journal, transaction);
+ 
+ 	handle->h_buffer_credits = nblocks;
+ 	ret = start_this_handle(journal, handle);
+ 	return ret;
+ }
+ 
+ 
+ /*
+  * Barrier operation: establish a transaction barrier.
+  *
+  * This locks out any further updates from being started, and blocks
+  * until all existing updates have completed, returning only once the
+  * journal is in a quiescent state with no updates running.
+  *
+  * The journal lock should not be held on entry.
+  */
+ 
+ void journal_lock_updates (journal_t *journal)
+ {
+ 	lock_journal(journal);
+ 	/* A non-zero count makes start_this_handle() sleep and
+ 	 * try_start_this_handle() fail. */
+ 	++journal->j_barrier_count;
+ 
+ 	/* Wait until there are no running updates */
+ 	while (1) {
+ 		transaction_t *transaction = journal->j_running_transaction;
+ 		if (!transaction)
+ 			break;
+ 		if (!transaction->t_updates)
+ 			break;
+ 
+ 		unlock_journal(journal);
+ 		sleep_on(&journal->j_wait_updates);
+ 		lock_journal(journal);
+ 	}
+ 
+ 	unlock_journal(journal);
+ 
+ 	/* We have now established a barrier against other normal
+ 	 * updates, but we also need to barrier against other
+ 	 * journal_lock_updates() calls to make sure that we serialise
+ 	 * special journal-locked operations too. */
+ 	down(&journal->j_barrier);
+ }
+ 
+ /*
+  * Release a transaction barrier obtained with journal_lock_updates().
+  *
+  * Should be called without the journal lock held.
+ */ + + void journal_unlock_updates (journal_t *journal) + { + lock_journal(journal); + + J_ASSERT (journal->j_barrier_count != 0); + + up(&journal->j_barrier); + --journal->j_barrier_count; + wake_up(&journal->j_wait_transaction_locked); + unlock_journal(journal); + } + + /* + * journal_get_write_access: notify intent to modify a buffer for metadata + * (not data) update. + * + * If the buffer is already part of the current transaction, then there + * is nothing we need to do. If it is already part of a prior + * transaction which we are still committing to disk, then we need to + * make sure that we do not overwrite the old copy: we do copy-out to + * preserve the copy going to disk. We also account the buffer against + * the handle's metadata buffer credits (unless the buffer is already + * part of the transaction, that is). + * + * Returns an error code or 0 on success. + * + * In full data journalling mode the buffer may be of type BJ_AsyncData, + * because we're write()ing a buffer which is also part of a shared mapping. + */ + + static int + do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + int error; + char *frozen_buffer = NULL; + int need_copy = 0; + + jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy); + + JBUFFER_TRACE(jh, "entry"); + repeat: + /* @@@ Need to check for errors here at some point. */ + + /* + * AKPM: neither bdflush nor kupdate run with the BKL. There's + * nothing we can do to prevent them from starting writeout of a + * BUF_DIRTY buffer at any time. And checkpointing buffers are on + * BUF_DIRTY. So. We no longer assert that the buffer is unlocked. + * + * However. It is very wrong for us to allow ext3 to start directly + * altering the ->b_data of buffers which may at that very time be + * undergoing writeout to the client filesystem. 
This can leave + * the filesystem in an inconsistent, transient state if we crash. + * So what we do is to steal the buffer if it is in checkpoint + * mode and dirty. The journal lock will keep out checkpoint-mode + * state transitions within journal_remove_checkpoint() and the buffer + * is locked to keep bdflush/kupdate/whoever away from it as well. + * + * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a + * simple lock_journal(). This code here will care for locked buffers. + */ + /* + * The buffer_locked() || buffer_dirty() tests here are simply an + * optimisation tweak. If anyone else in the system decides to + * lock this buffer later on, we'll blow up. There doesn't seem + * to be a good reason why they should do this. + */ + if (jh->b_cp_transaction && + (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) { + unlock_journal(journal); + lock_buffer(jh2bh(jh)); + spin_lock(&journal_datalist_lock); + if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) { + /* OK, we need to steal it */ + JBUFFER_TRACE(jh, "stealing from checkpoint mode"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_frozen_data == NULL); + + J_ASSERT(handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + + /* This will clear BH_Dirty and set BH_JBDDirty. */ + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __journal_file_buffer(jh, transaction, BJ_Reserved); + + /* And pull it off BUF_DIRTY, onto BUF_CLEAN */ + refile_buffer(jh2bh(jh)); + + /* + * The buffer is now hidden from bdflush. It is + * metadata against the current transaction. 
+ */ + JBUFFER_TRACE(jh, "steal from cp mode is complete"); + } + spin_unlock(&journal_datalist_lock); + unlock_buffer(jh2bh(jh)); + lock_journal(journal); + } + + J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh))); + + error = -EROFS; + if (is_handle_aborted(handle)) + goto out_unlocked; + error = 0; + + spin_lock(&journal_datalist_lock); + + /* The buffer is already part of this transaction if + * b_transaction or b_next_transaction points to it. */ + + if (jh->b_transaction == transaction || + jh->b_next_transaction == transaction) + goto done_locked; + + /* If there is already a copy-out version of this buffer, then + * we don't need to make another one. */ + + if (jh->b_frozen_data) { + JBUFFER_TRACE(jh, "has frozen data"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + jh->b_next_transaction = transaction; + + J_ASSERT_JH(jh, handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + goto done_locked; + } + + /* Is there data here we need to preserve? */ + + if (jh->b_transaction && jh->b_transaction != transaction) { + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* There is one case we have to be very careful about. + * If the committing transaction is currently writing + * this buffer out to disk and has NOT made a copy-out, + * then we cannot modify the buffer contents at all + * right now. The essence of copy-out is that it is the + * extra copy, not the primary copy, which gets + * journaled. If the primary copy is already going to + * disk then we cannot do copy-out here. 
*/ + + if (jh->b_jlist == BJ_Shadow) { + JBUFFER_TRACE(jh, "on shadow: sleep"); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + /* commit wakes up all shadow buffers after IO */ + sleep_on(&jh2bh(jh)->b_wait); + lock_journal(journal); + goto repeat; + } + + /* Only do the copy if the currently-owning transaction + * still needs it. If it is on the Forget list, the + * committing transaction is past that stage. The + * buffer had better remain locked during the kmalloc, + * but that should be true --- we hold the journal lock + * still and the buffer is already on the BUF_JOURNAL + * list so won't be flushed. + * + * Subtle point, though: if this is a get_undo_access, + * then we will be relying on the frozen_data to contain + * the new value of the committed_data record after the + * transaction, so we HAVE to force the frozen_data copy + * in that case. */ + + if (jh->b_jlist != BJ_Forget || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, + GFP_NOFS); + lock_journal(journal); + if (!frozen_buffer) { + printk(KERN_EMERG __FUNCTION__ + "OOM for frozen_buffer\n"); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + spin_lock(&journal_datalist_lock); + goto done_locked; + } + goto repeat; + } + + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + need_copy = 1; + } + jh->b_next_transaction = transaction; + } + + J_ASSERT(handle->h_buffer_credits > 0); + handle->h_buffer_credits--; + + /* Finally, if the buffer is not journaled right now, we need to + * make sure it doesn't get written to disk before the caller + * actually commits the new data. 
*/ + + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + jh->b_transaction = transaction; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + __journal_file_buffer(jh, transaction, BJ_Reserved); + } + + done_locked: + spin_unlock(&journal_datalist_lock); + if (need_copy) { + struct page *page; + int offset; + char *source; + + J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh))); + page = jh2bh(jh)->b_page; + offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK; + source = kmap(page); + memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); + kunmap(page); + } + + + /* If we are about to journal a buffer, then any revoke pending + on it is no longer valid. */ + journal_cancel_revoke(handle, jh); + + out_unlocked: + if (frozen_buffer) + kfree(frozen_buffer); + + JBUFFER_TRACE(jh, "exit"); + return error; + } + + int journal_get_write_access (handle_t *handle, struct buffer_head *bh) + { + transaction_t *transaction = handle->h_transaction; + journal_t *journal = transaction->t_journal; + struct journal_head *jh = journal_add_journal_head(bh); + int rc; + + /* We do not want to get caught playing with fields which the + * log thread also manipulates. Make sure that the buffer + * completes any outstanding IO before proceeding. */ + lock_journal(journal); + rc = do_get_write_access(handle, jh, 0); + journal_unlock_journal_head(jh); + unlock_journal(journal); + return rc; + } + + + /* + * When the user wants to journal a newly created buffer_head + * (ie. getblk() returned a new buffer and we are going to populate it + * manually rather than reading off disk), then we need to keep the + * buffer_head locked until it has been completely filled with new + * data. In this case, we should be able to make the assertion that + * the bh is not already part of an existing transaction. + * + * The buffer should already be locked by the caller by this point. 
+  * There is no lock ranking violation: it was a newly created,
+  * unlocked buffer beforehand.
+  *
+  * Returns 0 on success or -EROFS if the handle has aborted.  The
+  * journal_head reference taken on entry is released on every path.
+  */
+ 
+ int journal_get_create_access (handle_t *handle, struct buffer_head *bh)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	struct journal_head *jh = journal_add_journal_head(bh);
+ 	int err;
+ 
+ 	jbd_debug(5, "journal_head %p\n", jh);
+ 	lock_journal(journal);
+ 	err = -EROFS;
+ 	if (is_handle_aborted(handle))
+ 		goto out;
+ 	err = 0;
+ 
+ 	JBUFFER_TRACE(jh, "entry");
+ 	/* The buffer may already belong to this transaction due to
+ 	 * pre-zeroing in the filesystem's new_block code.  It may also
+ 	 * be on the previous, committing transaction's lists, but it
+ 	 * HAS to be in Forget state in that case: the transaction must
+ 	 * have deleted the buffer for it to be reused here. */
+ 	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+ 			 jh->b_transaction == NULL ||
+ 			 (jh->b_transaction == journal->j_committing_transaction &&
+ 			  jh->b_jlist == BJ_Forget)));
+ 
+ 	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ 	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+ 
+ 	J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+ 	handle->h_buffer_credits--;
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	if (jh->b_transaction == NULL) {
+ 		jh->b_transaction = transaction;
+ 		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ 		__journal_file_buffer(jh, transaction, BJ_Reserved);
+ 		JBUFFER_TRACE(jh, "refile");
+ 		refile_buffer(jh2bh(jh));
+ 	} else if (jh->b_transaction == journal->j_committing_transaction) {
+ 		JBUFFER_TRACE(jh, "set next transaction");
+ 		jh->b_next_transaction = transaction;
+ 	}
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	/*
+ 	 * akpm: I added this.  ext3_alloc_branch can pick up new indirect
+ 	 * blocks which contain freed but then revoked metadata.  We need
+ 	 * to cancel the revoke in case we end up freeing it yet again
+ 	 * and then reallocating as data - this would cause a second revoke,
+ 	 * which hits an assertion error.
+ 	 */
+ 	JBUFFER_TRACE(jh, "cancelling revoke");
+ 	journal_cancel_revoke(handle, jh);
+ out:
+ 	/* Reached on the aborted-handle path too, so that the reference
+ 	 * taken by journal_add_journal_head() is always dropped. */
+ 	journal_unlock_journal_head(jh);
+ 	unlock_journal(journal);
+ 	return err;
+ }
+ 
+ 
+ 
+ /*
+  * journal_get_undo_access: Notify intent to modify metadata with non-
+  * rewindable consequences
+  *
+  * Sometimes there is a need to distinguish between metadata which has
+  * been committed to disk and that which has not.  The ext3fs code uses
+  * this for freeing and allocating space: we have to make sure that we
+  * do not reuse freed space until the deallocation has been committed,
+  * since if we overwrote that space we would make the delete
+  * un-rewindable in case of a crash.
+  *
+  * To deal with that, journal_get_undo_access requests write access to a
+  * buffer for parts of non-rewindable operations such as delete
+  * operations on the bitmaps.  The journaling code must keep a copy of
+  * the buffer's contents prior to the undo_access call until such time
+  * as we know that the buffer has definitely been committed to disk.
+  *
+  * We never need to know which transaction the committed data is part
+  * of: buffers touched here are guaranteed to be dirtied later and so
+  * will be committed to a new transaction in due course, at which point
+  * we can discard the old committed data pointer.
+  *
+  * Returns error number or 0 on success.
+  */
+ 
+ int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
+ {
+ 	journal_t *journal = handle->h_transaction->t_journal;
+ 	int err;
+ 	struct journal_head *jh = journal_add_journal_head(bh);
+ 
+ 	JBUFFER_TRACE(jh, "entry");
+ 	lock_journal(journal);
+ 
+ 	/* Do this first --- it can drop the journal lock, so we want to
+ 	 * make sure that obtaining the committed_data is done
+ 	 * atomically wrt. completion of any outstanding commits. */
+ 	err = do_get_write_access (handle, jh, 1);
+ 	if (err)
+ 		goto out;
+ 
+ 	if (!jh->b_committed_data) {
+ 		/* Copy out the current buffer contents into the
+ 		 * preserved, committed copy. */
+ 		JBUFFER_TRACE(jh, "generate b_committed data");
+ 		jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size,
+ 						   GFP_NOFS);
+ 		if (!jh->b_committed_data) {
+ 			printk(KERN_EMERG __FUNCTION__
+ 				": No memory for committed data!\n");
+ 			err = -ENOMEM;
+ 			goto out;
+ 		}
+ 
+ 		memcpy (jh->b_committed_data, jh2bh(jh)->b_data,
+ 				jh2bh(jh)->b_size);
+ 	}
+ 
+ out:
+ 	if (!err)
+ 		J_ASSERT_JH(jh, jh->b_committed_data);
+ 	journal_unlock_journal_head(jh);
+ 	unlock_journal(journal);
+ 	return err;
+ }
+ 
+ /*
+  * journal_dirty_data: mark a buffer as containing dirty data which
+  * needs to be flushed before we can commit the current transaction.
+  *
+  * The buffer is placed on the transaction's data list and is marked as
+  * belonging to the transaction.
+  *
+  * If `async' is set then the writeback will be initiated by the caller
+  * using submit_bh -> end_buffer_io_async.  We put the buffer onto
+  * t_async_datalist.
+  *
+  * Returns error number or 0 on success.
+  *
+  * journal_dirty_data() can be called via page_launder->ext3_writepage
+  * by kswapd.  So it cannot block.  Happily, there's nothing here
+  * which needs lock_journal if `async' is set.
+  *
+  * When the buffer is on the current transaction we freely move it
+  * between BJ_AsyncData and BJ_SyncData according to who tried to
+  * change its state last.
+  */
+ 
+ int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
+ {
+ 	journal_t *journal = handle->h_transaction->t_journal;
+ 	int need_brelse = 0;
+ 	int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData;
+ 	struct journal_head *jh;
+ 
+ 	if (is_handle_aborted(handle))
+ 		return 0;
+ 
+ 	jh = journal_add_journal_head(bh);
+ 	JBUFFER_TRACE(jh, "entry");
+ 
+ 	/*
+ 	 * The buffer could *already* be dirty.  Writeout can start
+ 	 * at any time.
+ 	 */
+ 	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
+ 
+ 	/*
+ 	 * What if the buffer is already part of a running transaction?
+ 	 *
+ 	 * There are two cases:
+ 	 * 1) It is part of the current running transaction.
Refile it, + * just in case we have allocated it as metadata, deallocated + * it, then reallocated it as data. + * 2) It is part of the previous, still-committing transaction. + * If all we want to do is to guarantee that the buffer will be + * written to disk before this new transaction commits, then + * being sure that the *previous* transaction has this same + * property is sufficient for us! Just leave it on its old + * transaction. + * + * In case (2), the buffer must not already exist as metadata + * --- that would violate write ordering (a transaction is free + * to write its data at any point, even before the previous + * committing transaction has committed). The caller must + * never, ever allow this to happen: there's nothing we can do + * about it in this layer. + */ + spin_lock(&journal_datalist_lock); + if (jh->b_transaction) { + JBUFFER_TRACE(jh, "has transaction"); + if (jh->b_transaction != handle->h_transaction) { + JBUFFER_TRACE(jh, "belongs to older transaction"); + J_ASSERT_JH(jh, jh->b_transaction == + journal->j_committing_transaction); + + /* @@@ IS THIS TRUE ? */ + /* + * Not any more. Scenario: someone does a write() + * in data=journal mode. The buffer's transaction has + * moved into commit. Then someone does another + * write() to the file. We do the frozen data copyout + * and set b_next_transaction to point to j_running_t. + * And while we're in that state, someone does a + * writepage() in an attempt to pageout the same area + * of the file via a shared mapping. At present that + * calls journal_dirty_data(), and we get right here. + * It may be too late to journal the data. Simply + * falling through to the next test will suffice: the + * data will be dirty and wil be checkpointed. The + * ordering comments in the next comment block still + * apply. 
+ */ + //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + + /* + * If we're journalling data, and this buffer was + * subject to a write(), it could be metadata, forget + * or shadow against the committing transaction. Now, + * someone has dirtied the same darn page via a mapping + * and it is being writepage()'d. + * We *could* just steal the page from commit, with some + * fancy locking there. Instead, we just skip it - + * don't tie the page's buffers to the new transaction + * at all. + * Implication: if we crash before the writepage() data + * is written into the filesystem, recovery will replay + * the write() data. + */ + if (jh->b_jlist != BJ_None && + jh->b_jlist != BJ_SyncData && + jh->b_jlist != BJ_AsyncData) { + JBUFFER_TRACE(jh, "Not stealing"); + goto no_journal; + } + + /* + * This buffer may be undergoing writeout in commit. We + * can't return from here and let the caller dirty it + * again because that can cause the write-out loop in + * commit to never terminate. + */ + if (!async && buffer_dirty(bh)) { + atomic_inc(&bh->b_count); + spin_unlock(&journal_datalist_lock); + need_brelse = 1; + ll_rw_block(WRITE, 1, &bh); + wait_on_buffer(bh); + spin_lock(&journal_datalist_lock); + /* The buffer may become locked again at any + time if it is redirtied */ + } + + /* journal_clean_data_list() may have got there first */ + if (jh->b_transaction != NULL) { + JBUFFER_TRACE(jh, "unfile from commit"); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + } + /* The buffer will be refiled below */ + + } + /* + * Special case --- the buffer might actually have been + * allocated and then immediately deallocated in the previous, + * committing transaction, so might still be left on that + * transaction's metadata lists. 
+ */ + if (jh->b_jlist != wanted_jlist) { + JBUFFER_TRACE(jh, "not on correct data list: unfile"); + J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); + __journal_unfile_buffer(jh); + jh->b_transaction = NULL; + JBUFFER_TRACE(jh, "file as data"); + __journal_file_buffer(jh, handle->h_transaction, + wanted_jlist); + } + } else { + JBUFFER_TRACE(jh, "not on a transaction"); + __journal_file_buffer(jh, handle->h_transaction, wanted_jlist); + } + /* + * We need to mark the buffer dirty and refile it inside the lock to + * protect it from release by journal_try_to_free_buffer() + * + * We set ->b_flushtime to something small enough to typically keep + * kupdate away from the buffer. + * + * We don't need to do a balance_dirty() - __block_commit_write() + * does that. + */ + if (!async && !atomic_set_buffer_dirty(jh2bh(jh))) { + jh2bh(jh)->b_flushtime = + jiffies + journal->j_commit_interval + 1 * HZ; + refile_buffer(jh2bh(jh)); + } + no_journal: + spin_unlock(&journal_datalist_lock); + if (need_brelse) { + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + } + JBUFFER_TRACE(jh, "exit"); + journal_unlock_journal_head(jh); + return 0; + } + + /* + * journal_dirty_metadata: mark a buffer as containing dirty metadata + * which needs to be journaled as part of the current transaction. + * + * The buffer is placed on the transaction's metadata list and is marked + * as belonging to the transaction. + * + * Special care needs to be taken if the buffer already belongs to the + * current committing transaction (in which case we should have frozen + * data present for that commit). In that case, we don't relink the + * buffer: that only gets done when the old transaction finally + * completes its commit. + * + * Returns error number or 0 on success. 
+  */
+ 
+ int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	struct journal_head *jh = bh2jh(bh);
+ 
+ 	jbd_debug(5, "journal_head %p\n", jh);
+ 	JBUFFER_TRACE(jh, "entry");
+ 	lock_journal(journal);
+ 	if (is_handle_aborted(handle))
+ 		goto out_unlock;	/* buffer is left untouched */
+ 	
+ 	spin_lock(&journal_datalist_lock);
+ 	set_bit(BH_JBDDirty, &bh->b_state);
+ 	set_buffer_flushtime(bh);
+ 
+ 	J_ASSERT_JH(jh, jh->b_transaction != NULL);
+ 	
+ 	/* 
+ 	 * Metadata already on the current transaction list doesn't
+ 	 * need to be filed.  Metadata on another transaction's list must
+ 	 * be committing, and will be refiled once the commit completes:
+ 	 * leave it alone for now. 
+ 	 */
+ 
+ 	if (jh->b_transaction != transaction) {
+ 		JBUFFER_TRACE(jh, "already on other transaction");
+ 		J_ASSERT_JH(jh, jh->b_transaction ==
+ 					journal->j_committing_transaction);
+ 		J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+ 		/* And this case is illegal: we can't reuse another
+ 		 * transaction's data buffer, ever. */
+ 		/* FIXME: writepage() should be journalled */
+ 		J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData);
+ 		goto done_locked;
+ 	}
+ 
+ 	/* That test should have eliminated the following case: */
+ 	J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+ 
+ 	JBUFFER_TRACE(jh, "file as BJ_Metadata");
+ 	__journal_file_buffer(jh, handle->h_transaction, BJ_Metadata);
+ 
+ done_locked:
+ 	spin_unlock(&journal_datalist_lock);
+ 	JBUFFER_TRACE(jh, "exit");
+ out_unlock:
+ 	unlock_journal(journal);
+ 	return 0;	/* every path, including an aborted handle */
+ }
+ 
+ #if 0
+ /* 
+  * journal_release_buffer: undo a get_write_access without any buffer
+  * updates, if the update decided in the end that it didn't need access.
+  *
+  * journal_get_write_access() can block, so it is quite possible for a
+  * journaling component to decide after the write access is returned
+  * that global state has changed and the update is no longer required.  */
+ 
+ void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	struct journal_head *jh = bh2jh(bh);
+ 
+ 	lock_journal(journal);
+ 	JBUFFER_TRACE(jh, "entry");
+ 
+ 	/* If the buffer is reserved but not modified by this
+ 	 * transaction, then it is safe to release it.  In all other
+ 	 * cases, just leave the buffer as it is. */
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction &&
+ 	    !buffer_jdirty(jh2bh(jh))) {
+ 		JBUFFER_TRACE(jh, "unused: refiling it");
+ 		handle->h_buffer_credits++;	/* hand the credit back */
+ 		__journal_refile_buffer(jh);
+ 	}
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	JBUFFER_TRACE(jh, "exit");
+ 	unlock_journal(journal);
+ }
+ #endif
+ 
+ /* 
+  * journal_forget: bforget() for potentially-journaled buffers.  We can
+  * only do the bforget if there are no commits pending against the
+  * buffer.  If the buffer is dirty in the current running transaction we
+  * can safely unlink it. 
+  *
+  * bh may not be a journalled buffer at all - it may be a non-JBD
+  * buffer which came off the hashtable.  Check for this.
+  *
+  * Decrements bh->b_count by one.
+  * 
+  * Allow this call even if the handle has aborted --- it may be part of
+  * the caller's cleanup after an abort.
+  */
+ 
+ void journal_forget (handle_t *handle, struct buffer_head *bh)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	struct journal_head *jh;
+ 
+ 	BUFFER_TRACE(bh, "entry");
+ 
+ 	lock_journal(journal);
+ 	spin_lock(&journal_datalist_lock);
+ 
+ 	if (!buffer_jbd(bh))
+ 		goto not_jbd;	/* plain buffer: just drop the reference */
+ 	jh = bh2jh(bh);
+ 
+ 	if (jh->b_transaction == handle->h_transaction) {
+ 		J_ASSERT_JH(jh, !jh->b_frozen_data);
+ 
+ 		/* If we are forgetting a buffer which is already part
+ 		 * of this transaction, then we can just drop it from
+ 		 * the transaction immediately. */
+ 		clear_bit(BH_Dirty, &bh->b_state);
+ 		clear_bit(BH_JBDDirty, &bh->b_state);
+ 
+ 		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
+ 		J_ASSERT_JH(jh, !jh->b_committed_data);
+ 
+ 		__journal_unfile_buffer(jh);
+ 		jh->b_transaction = 0;
+ 
+ 		/* 
+ 		 * We are no longer going to journal this buffer.
+ 		 * However, the commit of this transaction is still
+ 		 * important to the buffer: the delete that we are now
+ 		 * processing might obsolete an old log entry, so by
+ 		 * committing, we can satisfy the buffer's checkpoint.
+ 		 *
+ 		 * So, if we have a checkpoint on the buffer, we should
+ 		 * now refile the buffer on our BJ_Forget list so that
+ 		 * we know to remove the checkpoint after we commit. 
+ 		 */
+ 
+ 		if (jh->b_cp_transaction) {
+ 			__journal_file_buffer(jh, transaction, BJ_Forget);
+ 		} else {
+ 			__journal_remove_journal_head(bh);
+ 			__brelse(bh);
+ 			if (!buffer_jbd(bh)) {	/* journal_head is gone */
+ 				spin_unlock(&journal_datalist_lock);
+ 				unlock_journal(journal);
+ 				__bforget(bh);
+ 				return;
+ 			}
+ 		}
+ 		
+ 	} else if (jh->b_transaction) {
+ 		J_ASSERT_JH(jh, (jh->b_transaction == 
+ 				 journal->j_committing_transaction));
+ 		/* However, if the buffer is still owned by a prior
+ 		 * (committing) transaction, we can't drop it yet... */
+ 		JBUFFER_TRACE(jh, "belongs to older transaction");
+ 		/* ... but we CAN drop it from the new transaction if we
+ 		 * have also modified it since the original commit. */
+ 
+ 		if (jh->b_next_transaction) {
+ 			J_ASSERT(jh->b_next_transaction == transaction);
+ 			jh->b_next_transaction = NULL;
+ 		}
+ 	}
+ 
+ not_jbd:
+ 	spin_unlock(&journal_datalist_lock);
+ 	unlock_journal(journal);
+ 	__brelse(bh);
+ 	return;
+ }
+ 
+ #if 0	/* Unused */
+ /*
+  * journal_sync_buffer: flush a potentially-journaled buffer to disk.
+  *
+  * Used for O_SYNC filesystem operations.  If the buffer is journaled,
+  * we need to complete the O_SYNC by waiting for the transaction to
+  * complete.  It is an error to call journal_sync_buffer before
+  * journal_stop!
+  */
+ 
+ void journal_sync_buffer(struct buffer_head *bh)
+ {
+ 	transaction_t *transaction;
+ 	journal_t *journal;
+ 	long sequence;
+ 	struct journal_head *jh;
+ 
+ 	/* If the buffer isn't journaled there is nothing to wait for
+ 	 * here: we simply return.  */
+ 	BUFFER_TRACE(bh, "entry");
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	if (!buffer_jbd(bh)) {
+ 		spin_unlock(&journal_datalist_lock);
+ 		return;
+ 	}
+ 	jh = bh2jh(bh);
+ 	if (jh->b_transaction == NULL) {
+ 		/* If the buffer has already been journaled, then this
+ 		 * is a noop. */
+ 		if (jh->b_cp_transaction == NULL) {
+ 			spin_unlock(&journal_datalist_lock);
+ 			return;
+ 		}
+ 		atomic_inc(&bh->b_count);	/* hold bh across the IO */
+ 		spin_unlock(&journal_datalist_lock);
+ 		ll_rw_block (WRITE, 1, &bh);
+ 		wait_on_buffer(bh);
+ 		__brelse(bh);
+ 		goto out;
+ 	}
+ 	
+ 	/* Otherwise, just wait until the transaction is synced to disk. */
+ 	transaction = jh->b_transaction;
+ 	journal = transaction->t_journal;
+ 	sequence = transaction->t_tid;	/* sampled under the lock */
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	jbd_debug(2, "requesting commit for jh %p\n", jh);
+ 	log_start_commit (journal, transaction);
+ 	
+ 	while (tid_gt(sequence, journal->j_commit_sequence)) {
+ 		wake_up(&journal->j_wait_done_commit);
+ 		sleep_on(&journal->j_wait_done_commit);
+ 	}
+ 	JBUFFER_TRACE(jh, "exit");
+ out:
+ 	return;
+ }
+ #endif
+ 
+ /*
+  * All done for a particular handle.
+  *
+  * There is not much action needed here.  We just return any remaining
+  * buffer credits to the transaction and remove the handle.  The only
+  * complication is that we need to start a commit operation if the
+  * filesystem is marked for synchronous update.
+  *
+  * journal_stop itself will not usually return an error, but it may
+  * do so in unusual circumstances.  In particular, expect it to 
+  * return -EIO if a journal_abort has been executed since the
+  * transaction began.
+  */
+ 
+ int journal_stop(handle_t *handle)
+ {
+ 	transaction_t *transaction;
+ 	journal_t *journal;
+ 	int old_handle_count, err;
+ 
+ 	if (!handle)
+ 		return 0;
+ 
+ 	/* Only look inside the handle once we know it is not NULL */
+ 	transaction = handle->h_transaction;
+ 	journal = transaction->t_journal;
+ 	J_ASSERT (transaction->t_updates > 0);
+ 	J_ASSERT (journal_current_handle() == handle);
+ 
+ 	err = is_handle_aborted(handle) ? -EIO : 0;
+ 
+ 	if (--handle->h_ref > 0) {	/* still referenced: keep it */
+ 		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ 			  handle->h_ref);
+ 		return err;
+ 	}
+ 
+ 	jbd_debug(4, "Handle %p going down\n", handle);
+ 
+ 	/*
+ 	 * Implement synchronous transaction batching.  If the handle
+ 	 * was synchronous, don't force a commit immediately.  Let's
+ 	 * yield and let another thread piggyback onto this transaction.
+ 	 * Keep doing that while new threads continue to arrive.
+ 	 * It doesn't cost much - we're about to run a commit and sleep
+ 	 * on IO anyway.  Speeds up many-threaded, many-dir operations
+ 	 * by 30x or more...
+ 	 */
+ 	if (handle->h_sync) {
+ 		do {
+ 			old_handle_count = transaction->t_handle_count;
+ 			set_current_state(TASK_RUNNING);
+ 			current->policy |= SCHED_YIELD;
+ 			schedule();
+ 		} while (old_handle_count != transaction->t_handle_count);
+ 	}
+ 
+ 	current->journal_info = NULL;
+ 	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+ 	transaction->t_updates--;
+ 	if (!transaction->t_updates) {
+ 		wake_up(&journal->j_wait_updates);
+ 		if (journal->j_barrier_count)
+ 			wake_up(&journal->j_wait_transaction_locked);
+ 	}
+ 
+ 	/* 
+ 	 * If the handle is marked SYNC, we need to set another commit
+ 	 * going!  We also want to force a commit if the current
+ 	 * transaction is occupying too much of the log, or if the
+ 	 * transaction is too old now.
+ 	 */
+ 	if (handle->h_sync ||
+ 			transaction->t_outstanding_credits >
+ 				journal->j_max_transaction_buffers ||
+ 	    		time_after_eq(jiffies, transaction->t_expires)) {
+ 		/* Do this even for aborted journals: an abort still
+ 		 * completes the commit thread, it just doesn't write
+ 		 * anything to disk. */
+ 		tid_t tid = transaction->t_tid;
+ 		
+ 		jbd_debug(2, "transaction too old, requesting commit for "
+ 					"handle %p\n", handle);
+ 		/* This is non-blocking */
+ 		log_start_commit(journal, transaction);
+ 		
+ 		/*
+ 		 * Special case: JFS_SYNC synchronous updates require us
+ 		 * to wait for the commit to complete.  
+ 		 */
+ 		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
+ 			log_wait_commit(journal, tid);
+ 	}
+ 	kfree(handle);
+ 	return err;
+ }
+ 
+ /*
+  * For synchronous operations: force any uncommitted transactions
+  * to disk.  May seem kludgy, but it reuses all the handle batching
+  * code in a very simple manner.
+  */
+ int journal_force_commit(journal_t *journal)
+ {
+ 	handle_t *handle;
+ 	int ret = 0;
+ 
+ 	lock_kernel();
+ 	handle = journal_start(journal, 1);
+ 	if (IS_ERR(handle)) {
+ 		ret = PTR_ERR(handle);
+ 		goto out;
+ 	}
+ 	handle->h_sync = 1;	/* makes journal_stop() wait for commit */
+ 	ret = journal_stop(handle);	/* -EIO if the handle aborted */
+ out:
+ 	unlock_kernel();
+ 	return ret;
+ }
+ 
+ /*
+  *
+  * List management code snippets: various functions for manipulating the
+  * transaction buffer lists.
+  *
+  */
+ 
+ /*
+  * Append a buffer to a transaction list, given the transaction's list head
+  * pointer.
+  * journal_datalist_lock is held.
+  */
+ 
+ static inline void 
+ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
+ {
+ 	if (!*list) {
+ 		jh->b_tnext = jh->b_tprev = jh;	/* one-entry ring */
+ 		*list = jh;
+ 	} else {
+ 		/* Insert at the tail of the list to preserve order */
+ 		struct journal_head *first = *list, *last = first->b_tprev;
+ 		jh->b_tprev = last;
+ 		jh->b_tnext = first;
+ 		last->b_tnext = first->b_tprev = jh;
+ 	}
+ }
+ 
+ /* 
+  * Remove a buffer from a transaction list, given the transaction's list
+  * head pointer.
+  *
+  * Called with journal_datalist_lock held, and the journal may not
+  * be locked.
+  */
+ 
+ static inline void 
+ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
+ {
+ 	if (*list == jh) {
+ 		*list = jh->b_tnext;
+ 		if (*list == jh)	/* jh was the only entry */
+ 			*list = 0;
+ 	}
+ 	jh->b_tprev->b_tnext = jh->b_tnext;
+ 	jh->b_tnext->b_tprev = jh->b_tprev;
+ }
+ 
+ /* 
+  * Remove a buffer from the appropriate transaction list.
+  *
+  * Note that this function can *change* the value of
+  * jh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget,
+  * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
+  * is holding onto a copy of one of these pointers, it could go bad.
+  * Generally the caller needs to re-read the pointer from the transaction_t.
+  *
+  * If jh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called
+  * via journal_try_to_free_buffer() or journal_clean_data_list().  In that
+  * case, journal_datalist_lock will be held, and the journal may not be locked.
+  */
+ void __journal_unfile_buffer(struct journal_head *jh)
+ {
+ 	struct journal_head **list = 0;
+ 	transaction_t * transaction;
+ 
+ 	assert_spin_locked(&journal_datalist_lock);
+ 	transaction = jh->b_transaction;
+ 
+ #ifdef __SMP__
+ 	J_ASSERT (current->lock_depth >= 0);
+ #endif
+ 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+ 
+ 	if (jh->b_jlist != BJ_None)
+ 		J_ASSERT_JH(jh, transaction != 0);
+ 
+ 	switch (jh->b_jlist) {
+ 	case BJ_None:
+ 		return;		/* not on any list: nothing to unlink */
+ 	case BJ_SyncData:
+ 		list = &transaction->t_sync_datalist;
+ 		break;
+ 	case BJ_AsyncData:
+ 		list = &transaction->t_async_datalist;
+ 		break;
+ 	case BJ_Metadata:
+ 		transaction->t_nr_buffers--;
+ 		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
+ 		list = &transaction->t_buffers;
+ 		break;
+ 	case BJ_Forget:
+ 		list = &transaction->t_forget;
+ 		break;
+ 	case BJ_IO:
+ 		list = &transaction->t_iobuf_list;
+ 		break;
+ 	case BJ_Shadow:
+ 		list = &transaction->t_shadow_list;
+ 		break;
+ 	case BJ_LogCtl:
+ 		list = &transaction->t_log_list;
+ 		break;
+ 	case BJ_Reserved:
+ 		list = &transaction->t_reserved_list;
+ 		break;
+ 	}
+ 	
+ 	__blist_del_buffer(list, jh);
+ 	jh->b_jlist = BJ_None;
+ 	if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) {
+ 		set_bit(BH_Dirty, &jh2bh(jh)->b_state);	/* JBDDirty -> Dirty */
+ 	}
+ }
+ 
+ void journal_unfile_buffer(struct journal_head *jh)
+ {
+ 	spin_lock(&journal_datalist_lock);	/* locked wrapper */
+ 	__journal_unfile_buffer(jh);
+ 	spin_unlock(&journal_datalist_lock);
+ }
+ 
+ /*
+  * Called from journal_try_to_free_buffers().  The journal is not
+  * locked. lru_list_lock is not held.
+  *
+  * Here we see why journal_datalist_lock is global and not per-journal.
+  * We cannot get back to this buffer's journal pointer without locking
+  * out journal_clean_data_list() in some manner.
+  *
+  * One could use journal_datalist_lock to get unracy access to a
+  * per-journal lock.
+  *
+  * Called with journal_datalist_lock held.
+  *
+  * Returns non-zero iff we were able to free the journal_head.
+  */
+ static int __journal_try_to_free_buffer(struct buffer_head *bh,
+ 					int *locked_or_dirty)
+ {
+ 	struct journal_head *jh;
+ 
+ 	assert_spin_locked(&journal_datalist_lock);
+ 
+ 	if (!buffer_jbd(bh))
+ 		return 1;
+ 	jh = bh2jh(bh);
+ 
+ 	if (buffer_locked(bh) || buffer_dirty(bh)) {
+ 		*locked_or_dirty = 1;	/* only ever set, never cleared */
+ 		goto out;
+ 	}
+ 
+ 	if (!buffer_uptodate(bh))
+ 		goto out;
+ 
+ 	if (jh->b_next_transaction != 0)
+ 		goto out;
+ 
+ 	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+ 		if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) {
+ 			/* A written-back ordered data buffer */
+ 			JBUFFER_TRACE(jh, "release data");
+ 			__journal_unfile_buffer(jh);
+ 			jh->b_transaction = 0;
+ 			__journal_remove_journal_head(bh);
+ 			__brelse(bh);
+ 		}
+ 	}
+ 	else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+ 		/* written-back checkpointed metadata buffer */
+ 		if (jh->b_jlist == BJ_None) {
+ 			JBUFFER_TRACE(jh, "remove from checkpoint list");
+ 			__journal_remove_checkpoint(jh);
+ 			__journal_remove_journal_head(bh);
+ 			__brelse(bh);
+ 		}
+ 	}
+ 	return !buffer_jbd(bh);	/* non-zero only if BH_JBD is now clear */
+ 
+ out:
+ 	return 0;
+ }
+ 
+ /*
+  * journal_try_to_free_buffers().  For all the buffers on this page,
+  * if they are fully written out ordered data, move them onto BUF_CLEAN
+  * so try_to_free_buffers() can reap them.  Called with lru_list_lock
+  * not held.  Does its own locking.
+  *
+  * This complicates JBD locking somewhat.  We aren't protected by the
+  * BKL here.  We wish to remove the buffer from its committing or
+  * running transaction's ->t_datalist via __journal_unfile_buffer.
+  *
+  * This may *change* the value of transaction_t->t_datalist, so anyone
+  * who looks at t_datalist needs to lock against this function.
+  *
+  * Even worse, someone may be doing a journal_dirty_data on this
+  * buffer.  So we need to lock against that.  journal_dirty_data()
+  * will come out of the lock with the buffer dirty, which makes it
+  * ineligible for release here.
+  *
+  * Who else is affected by this?  hmm...  Really the only contender
+  * is do_get_write_access() - it could be looking at the buffer while
+  * journal_try_to_free_buffer() is changing its state.  But that
+  * cannot happen because we never reallocate freed data as metadata
+  * while the data is part of a transaction.  Yes?
+  *
+  * This function returns non-zero if we wish try_to_free_buffers()
+  * to be called. We do this if the page is releasable by try_to_free_buffers().
+  * We also do it if the page has locked or dirty buffers and the caller wants
+  * us to perform sync or async writeout.
+  */
+ int journal_try_to_free_buffers(journal_t *journal, 
+ 				struct page *page, int gfp_mask)
+ {
+ 	struct buffer_head *bh;
+ 	struct buffer_head *tmp;
+ 	int locked_or_dirty = 0;
+ 	int call_ttfb = 1;
+ 
+ 	J_ASSERT(PageLocked(page));
+ 
+ 	bh = page->buffers;
+ 	tmp = bh;
+ 	spin_lock(&journal_datalist_lock);
+ 	do {
+ 		struct buffer_head *p = tmp;
+ 
+ 		tmp = tmp->b_this_page;	/* advance before p is touched */
+ 		if (buffer_jbd(p))
+ 			if (!__journal_try_to_free_buffer(p, &locked_or_dirty))
+ 				call_ttfb = 0;
+ 	} while (tmp != bh);
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	if (!(gfp_mask & (__GFP_IO|__GFP_WAIT)))
+ 		goto out;
+ 	if (!locked_or_dirty)
+ 		goto out;
+ 	/*
+ 	 * The VM wants us to do writeout, or to block on IO, or both.
+ 	 * So we allow try_to_free_buffers to be called even if the page
+ 	 * still has journalled buffers.
+ 	 */
+ 	call_ttfb = 1;
+ out:
+ 	return call_ttfb;
+ }
+ 
+ /*
+  * This buffer is no longer needed.  If it is on an older transaction's
+  * checkpoint list we need to record it on this transaction's forget list
+  * to pin this buffer (and hence its checkpointing transaction) down until
+  * this transaction commits.  If the buffer isn't on a checkpoint list, we
+  * release it.
+  * Returns non-zero if JBD no longer has an interest in the buffer.
+  */
+ static int dispose_buffer(struct journal_head *jh,
+ 		transaction_t *transaction)
+ {
+ 	int may_free = 1;
+ 	struct buffer_head *bh = jh2bh(jh);
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	__journal_unfile_buffer(jh);
+ 	jh->b_transaction = 0;
+ 
+ 	if (jh->b_cp_transaction) {
+ 		JBUFFER_TRACE(jh, "on running+cp transaction");
+ 		__journal_file_buffer(jh, transaction, BJ_Forget);
+ 		clear_bit(BH_JBDDirty, &bh->b_state);
+ 		may_free = 0;	/* still pinned by the checkpoint */
+ 	} else {
+ 		JBUFFER_TRACE(jh, "on running transaction");
+ 		__journal_remove_journal_head(bh);
+ 		__brelse(bh);
+ 	}
+ 	spin_unlock(&journal_datalist_lock);
+ 	return may_free;
+ }
+ 
+ /*
+  * journal_flushpage 
+  *
+  * This code is tricky.  It has a number of cases to deal with.
+  *
+  * There are two invariants which this code relies on:
+  *
+  * i_size must be updated on disk before we start calling flushpage on the
+  * data.
+  * 
+  *  This is done in ext3 by defining an ext3_setattr method which
+  *  updates i_size before truncate gets going.  By maintaining this
+  *  invariant, we can be sure that it is safe to throw away any buffers
+  *  attached to the current transaction: once the transaction commits,
+  *  we know that the data will not be needed.
+  * 
+  *  Note however that we can *not* throw away data belonging to the
+  *  previous, committing transaction!  
+  *
+  * Any disk blocks which *are* part of the previous, committing
+  * transaction (and which therefore cannot be discarded immediately) are
+  * not going to be reused in the new running transaction.
+  *
+  *  The bitmap committed_data images guarantee this: any block which is
+  *  allocated in one transaction and removed in the next will be marked
+  *  as in-use in the committed_data bitmap, so cannot be reused until
+  *  the next transaction to delete the block commits.  This means that
+  *  leaving committing buffers dirty is quite safe: the disk blocks
+  *  cannot be reallocated to a different file and so buffer aliasing is
+  *  not possible.
+  *
+  *
+  * The above applies mainly to ordered data mode.  In writeback mode we
+  * don't make guarantees about the order in which data hits disk --- in
+  * particular we don't guarantee that new dirty data is flushed before
+  * transaction commit --- so it is always safe just to discard data
+  * immediately in that mode.  --sct 
+  */
+ 
+ /*
+  * The journal_unmap_buffer helper function returns zero if the buffer
+  * concerned remains pinned as an anonymous buffer belonging to an older
+  * transaction.
+  *
+  * We're outside-transaction here.  Either or both of j_running_transaction
+  * and j_committing_transaction may be NULL.
+  */
+ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+ {
+ 	transaction_t *transaction;
+ 	struct journal_head *jh;
+ 	int may_free = 1;
+ 
+ 	BUFFER_TRACE(bh, "entry");
+ 
+ 	if (!buffer_mapped(bh))
+ 		return 1;
+ 
+ 	/* It is safe to proceed here without the
+ 	 * journal_datalist_spinlock because the buffers cannot be
+ 	 * stolen by try_to_free_buffers as long as we are holding the
+ 	 * page lock. --sct */
+ 
+ 	if (!buffer_jbd(bh))
+ 		goto zap_buffer;
+ 
+ 	jh = bh2jh(bh);
+ 	transaction = jh->b_transaction;
+ 	if (transaction == NULL) {
+ 		/* First case: not on any transaction.  If it
+ 		 * has no checkpoint link, then we can zap it:
+ 		 * it's a writeback-mode buffer so we don't care
+ 		 * if it hits disk safely. */
+ 		if (!jh->b_cp_transaction) {
+ 			JBUFFER_TRACE(jh, "not on any transaction: zap");
+ 			goto zap_buffer;
+ 		}
+ 		
+ 		if (!buffer_dirty(bh)) {
+ 			/* bdflush has written it.  We can drop it now */
+ 			goto zap_buffer;
+ 		}
+ 
+ 		/* OK, it must be in the journal but still not
+ 		 * written fully to disk: it's metadata or
+ 		 * journaled data... */
+ 
+ 		if (journal->j_running_transaction) {
+ 			/* ... and once the current transaction has
+ 			 * committed, the buffer won't be needed any
+ 			 * longer. */
+ 			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
+ 			return dispose_buffer(jh,
+ 					journal->j_running_transaction);
+ 		} else {
+ 			/* There is no currently-running transaction. So the
+ 			 * orphan record which we wrote for this file must have
+ 			 * passed into commit.  We must attach this buffer to
+ 			 * the committing transaction, if it exists. */
+ 			if (journal->j_committing_transaction) {
+ 				JBUFFER_TRACE(jh, "give to committing trans");
+ 				return dispose_buffer(jh,
+ 					journal->j_committing_transaction);
+ 			} else {
+ 				/* The orphan record's transaction has
+ 				 * committed.  We can cleanse this buffer */
+ 				clear_bit(BH_JBDDirty, &bh->b_state);
+ 				goto zap_buffer;
+ 			}
+ 		}
+ 	} else if (transaction == journal->j_committing_transaction) {
+ 		/* If it is committing, we simply cannot touch it.  We
+ 		 * can remove its next_transaction pointer from the
+ 		 * running transaction if that is set, but nothing
+ 		 * else. */
+ 		JBUFFER_TRACE(jh, "on committing transaction");
+ 		if (jh->b_next_transaction) {
+ 			J_ASSERT(jh->b_next_transaction ==
+ 					journal->j_running_transaction);
+ 			jh->b_next_transaction = NULL;
+ 		}
+ 		return 0;	/* still pinned: state bits are not zapped */
+ 	} else {
+ 		/* Good, the buffer belongs to the running transaction.
+ 		 * We are writing our own transaction's data, not any
+ 		 * previous one's, so it is safe to throw it away
+ 		 * (remember that we expect the filesystem to have set
+ 		 * i_size already for this truncate so recovery will not
+ 		 * expose the disk blocks we are discarding here.) */
+ 		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+ 		may_free = dispose_buffer(jh, transaction);
+ 	}
+ 
+ zap_buffer:	
+ 	if (buffer_dirty(bh))
+ 		mark_buffer_clean(bh);
+ 	J_ASSERT_BH(bh, !buffer_jdirty(bh));
+ 	clear_bit(BH_Uptodate, &bh->b_state);
+ 	clear_bit(BH_Mapped, &bh->b_state);
+ 	clear_bit(BH_Req, &bh->b_state);
+ 	clear_bit(BH_New, &bh->b_state);
+ 	return may_free;
+ }
+ 
+ /*
+  * Return non-zero if the page's buffers were successfully reaped
+  */
+ int journal_flushpage(journal_t *journal, 
+ 		      struct page *page, 
+ 		      unsigned long offset)
+ {
+ 	struct buffer_head *head, *bh, *next;
+ 	unsigned int curr_off = 0;
+ 	int may_free = 1;
+ 		
+ 	if (!PageLocked(page))
+ 		BUG();
+ 	if (!page->buffers)
+ 		return 1;
+ 
+ 	/* We will potentially be playing with lists other than just the
+ 	 * data lists (especially for journaled data mode), so be
+ 	 * cautious in our locking. */
+ 	lock_journal(journal);
+ 
+ 	head = bh = page->buffers;
+ 	do {
+ 		unsigned int next_off = curr_off + bh->b_size;
+ 		next = bh->b_this_page;
+ 
+ 		/* AKPM: doing lock_buffer here may be overly paranoid */
+ 		if (offset <= curr_off) {
+ 		 	/* This block is wholly outside the truncation point */
+ 			lock_buffer(bh);
+ 			may_free &= journal_unmap_buffer(journal, bh);
+ 			unlock_buffer(bh);
+ 		}
+ 		curr_off = next_off;
+ 		bh = next;
+ 
+ 	} while (bh != head);
+ 
+ 	unlock_journal(journal);
+ 
+ 	if (!offset) {	/* offset 0: every buffer was visited above */
+ 		if (!may_free || !try_to_free_buffers(page, 0)) {
+ 			atomic_inc(&buffermem_pages);
+ 			return 0;
+ 		}
+ 		J_ASSERT(page->buffers == NULL);
+ 	}
+ 
+ 	return 1;
+ }
+ 
+ 
+ 
+ /* 
+  * File a buffer on the given transaction list. 
+  */
+ 
+ void __journal_file_buffer(struct journal_head *jh,
+ 			transaction_t *transaction, int jlist)
+ {
+ 	struct journal_head **list = 0;
+ 
+ 	assert_spin_locked(&journal_datalist_lock);
+ 	
+ #ifdef __SMP__
+ 	J_ASSERT (current->lock_depth >= 0);
+ #endif
+ 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+ 	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+ 				jh->b_transaction == 0);
+ 
+ 	if (jh->b_transaction) {
+ 		if (jh->b_jlist == jlist)
+ 			return;		/* already on the requested list */
+ 		__journal_unfile_buffer(jh);
+ 	} else {
+ 		jh->b_transaction = transaction;
+ 	}
+ 
+ 	switch (jlist) {
+ 	case BJ_None:
+ 		J_ASSERT_JH(jh, !jh->b_committed_data);
+ 		J_ASSERT_JH(jh, !jh->b_frozen_data);
+ 		return;
+ 	case BJ_SyncData:
+ 		list = &transaction->t_sync_datalist;
+ 		break;
+ 	case BJ_AsyncData:
+ 		list = &transaction->t_async_datalist;
+ 		break;
+ 	case BJ_Metadata:
+ 		transaction->t_nr_buffers++;
+ 		list = &transaction->t_buffers;
+ 		break;
+ 	case BJ_Forget:
+ 		list = &transaction->t_forget;
+ 		break;
+ 	case BJ_IO:
+ 		list = &transaction->t_iobuf_list;
+ 		break;
+ 	case BJ_Shadow:
+ 		list = &transaction->t_shadow_list;
+ 		break;
+ 	case BJ_LogCtl:
+ 		list = &transaction->t_log_list;
+ 		break;
+ 	case BJ_Reserved:
+ 		list = &transaction->t_reserved_list;
+ 		break;
+ 	}
+ 
+ 	__blist_add_buffer(list, jh);
+ 	jh->b_jlist = jlist;
+ 
+ 	if (jlist == BJ_Metadata || jlist == BJ_Reserved || 
+ 	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+ 		if (atomic_set_buffer_clean(jh2bh(jh))) {
+ 			set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
+ 		}
+ 	}
+ }
+ 
+ void journal_file_buffer(struct journal_head *jh,
+ 				transaction_t *transaction, int jlist)
+ {
+ 	spin_lock(&journal_datalist_lock);	/* locked wrapper */
+ 	__journal_file_buffer(jh, transaction, jlist);
+ 	spin_unlock(&journal_datalist_lock);
+ }
+ 
+ /* 
+  * Remove a buffer from its current buffer list in preparation for
+  * dropping it from its current transaction entirely.  If the buffer has
+  * already started to be used by a subsequent transaction, refile the
+  * buffer on that transaction's metadata list.
+  */
+ 
+ void __journal_refile_buffer(struct journal_head *jh)
+ {
+ 	assert_spin_locked(&journal_datalist_lock);
+ #ifdef __SMP__
+ 	J_ASSERT_JH(jh, current->lock_depth >= 0);
+ #endif
+ 	__journal_unfile_buffer(jh);
+ 
+ 	/* If the buffer is now unused, just drop it.  If it has been
+ 	   modified by a later transaction, add it to the new
+ 	   transaction's metadata list. */
+ 
+ 	jh->b_transaction = jh->b_next_transaction;
+ 	jh->b_next_transaction = NULL;
+ 
+ 	if (jh->b_transaction != NULL) {
+ 		__journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
+ 		J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+ 	} else {
+ 		/* Onto BUF_DIRTY for writeback */
+ 		refile_buffer(jh2bh(jh));
+ 	}
+ }
+ 
+ /*
+  * For the unlocked version of this call, also make sure that any
+  * hanging journal_head is cleaned up if necessary.
+  *
+  * __journal_refile_buffer is usually called as part of a single locked
+  * operation on a buffer_head, in which the caller is probably going to
+  * be hooking the journal_head onto other lists.  In that case it is up
+  * to the caller to remove the journal_head if necessary.  For the
+  * unlocked journal_refile_buffer call, the caller isn't going to be
+  * doing anything else to the buffer so we need to do the cleanup
+  * ourselves to avoid a jh leak. 
+ * + * *** The journal_head may be freed by this call! *** + */ + void journal_refile_buffer(struct journal_head *jh) + { + struct buffer_head *bh; + + spin_lock(&journal_datalist_lock); + bh = jh2bh(jh); /* fetch bh first: jh may be freed below */ + + __journal_refile_buffer(jh); + __journal_remove_journal_head(bh); + + spin_unlock(&journal_datalist_lock); + __brelse(bh); + } diff -rc2P linux/fs/jbd-kernel.c linux-2.4.13/fs/jbd-kernel.c *** linux/fs/jbd-kernel.c Wed Dec 31 19:00:00 1969 --- linux-2.4.13/fs/jbd-kernel.c Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,336 ---- + /* + * fs/jbd-kernel.c + * + * Support code for the Journalling Block Device layer. + * This file contains things which have to be in-kernel when + * JBD is a module. + * + * 15 May 2001 Andrew Morton + * Created + */ + + #include + #include + #include + #include + #include + + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + + /* + * jh_splice_lock needs explanation. + * + * In a number of places we want to do things like: + * + * if (buffer_jbd(bh) && bh2jh(bh)->foo) + * + * This is racy on SMP, because another CPU could remove the journal_head + * in the middle of this expression. We need locking. + * + * But we can greatly optimise the locking cost by testing BH_JBD + * outside the lock. So, effectively: + * + * ret = 0; + * if (buffer_jbd(bh)) { + * spin_lock(&jh_splice_lock); + * if (buffer_jbd(bh)) { (* Still there? *) + * ret = bh2jh(bh)->foo; + * } + * spin_unlock(&jh_splice_lock); + * } + * return ret; + * + * Now, that protects us from races where another CPU can remove the + * journal_head. But it doesn't defend us from the situation where another + * CPU can *add* a journal_head. This is a correctness issue. But it's not + * a problem because a) the calling code was *already* racy and b) it often + * can't happen at the call site and c) the places where we add journal_heads + * tend to be under external locking. 
+ */ + spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED; + EXPORT_SYMBOL(jh_splice_lock); + + #ifdef CONFIG_JBD_DEBUG + /* + * Some sanity testing which is called from mark_buffer_clean(), + * and must be present in the main kernel. + */ + + void jbd_preclean_buffer_check(struct buffer_head *bh) + { + if (buffer_jbd(bh)) { + struct journal_head *jh = bh2jh(bh); + + transaction_t *transaction = jh->b_transaction; + journal_t *journal; + + if (jh->b_jlist == 0 && transaction == NULL) + return; + + J_ASSERT_JH(jh, (jh->b_jlist == 0 || + jh->b_jlist == BJ_LogCtl || + jh->b_jlist == BJ_IO || + jh->b_jlist == BJ_Forget || + buffer_jbd_data(bh))); + J_ASSERT_JH(jh, transaction != NULL); + /* The kernel may be unmapping old data. We expect it + * to be dirty in that case, unless the buffer has + * already been forgotten by a transaction. */ + if (jh->b_jlist != BJ_Forget) { + #if 1 + if (!buffer_dirty(bh)) { + printk(__FUNCTION__": clean of clean buffer\n"); + print_buffer_trace(bh); + return; + } + #endif + J_ASSERT_BH(bh, buffer_dirty(bh)); + if (!buffer_jbd_data(bh)) { + J_ASSERT_JH(jh, + test_bit(BH_JWrite, + &jh2bh(jh)->b_state)); + } + } + + journal = transaction->t_journal; + J_ASSERT_JH(jh, + transaction == journal->j_running_transaction || + transaction == journal->j_committing_transaction); + } + } + EXPORT_SYMBOL(jbd_preclean_buffer_check); + #endif /* CONFIG_JBD_DEBUG */ + + /* + * Entries in /proc/sys/fs + */ + + int journal_oom_retry = 1; + EXPORT_SYMBOL(journal_oom_retry); + #if defined(CONFIG_JBD_DEBUG) + int journal_enable_debug; + int journal_no_write[2]; + EXPORT_SYMBOL(journal_enable_debug); + EXPORT_SYMBOL(journal_no_write); + #endif + + #endif /* defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) */ + + /* + * Support functions for BUFFER_TRACE() + */ + #ifdef CONFIG_BUFFER_DEBUG + + static spinlock_t trace_lock = SPIN_LOCK_UNLOCKED; + + void buffer_trace(struct buffer_head *dest, + struct buffer_head *src, char *info) + { + struct buffer_history_item 
*bhist_i; + unsigned long flags; + + if (dest == 0 || src == 0) + return; + + spin_lock_irqsave(&trace_lock, flags); + + /* + * Sometimes we don't initialise the ring pointers. (locally declared + * temp buffer_heads). Feebly attempt to detect and correct that here. + */ + if ((dest->b_history.b_history_head - dest->b_history.b_history_tail > + BUFFER_HISTORY_SIZE)) { + dest->b_history.b_history_head = 0; + dest->b_history.b_history_tail = 0; + } + bhist_i = dest->b_history.b + + (dest->b_history.b_history_head & (BUFFER_HISTORY_SIZE - 1)); + bhist_i->info = info; + bhist_i->b_state = src->b_state; + bhist_i->b_list = src->b_list; + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + bhist_i->b_trans_is_running = 0; + bhist_i->b_trans_is_committing = 0; + bhist_i->b_blocknr = src->b_blocknr; + if (buffer_jbd(src)) { + struct journal_head *jh; + journal_t *journal; + transaction_t *transaction; + + /* Footwork to avoid racing with journal_remove_journal_head */ + jh = src->b_private; + if (jh == 0) + goto raced; + transaction = jh->b_transaction; + if (src->b_private == 0) + goto raced; + bhist_i->b_jcount = jh->b_jcount; + bhist_i->b_jbd = 1; + bhist_i->b_jlist = jh->b_jlist; + bhist_i->b_frozen_data = jh->b_frozen_data; + bhist_i->b_committed_data = jh->b_committed_data; + bhist_i->b_transaction = !!jh->b_transaction; + bhist_i->b_next_transaction = !!jh->b_next_transaction; + bhist_i->b_cp_transaction = !!jh->b_cp_transaction; + + if (transaction) { + journal = transaction->t_journal; + bhist_i->b_trans_is_running = transaction == + journal->j_running_transaction; + bhist_i->b_trans_is_committing = transaction == + journal->j_committing_transaction; + } + } else { + raced: + bhist_i->b_jcount = 0; + bhist_i->b_jbd = 0; + bhist_i->b_jlist = 0; + bhist_i->b_frozen_data = 0; + bhist_i->b_committed_data = 0; + bhist_i->b_transaction = 0; + bhist_i->b_next_transaction = 0; + bhist_i->b_cp_transaction = 0; + } + #endif /* defined(CONFIG_JBD) || 
defined(CONFIG_JBD_MODULE) */ + + bhist_i->on_lru = (src->b_prev_free != 0 && src->b_next_free != 0); + bhist_i->on_hash = (src->b_pprev != 0); + bhist_i->cpu = smp_processor_id(); + bhist_i->b_count = atomic_read(&src->b_count); + + dest->b_history.b_history_head++; + if (dest->b_history.b_history_head - dest->b_history.b_history_tail > + BUFFER_HISTORY_SIZE) + dest->b_history.b_history_tail = + dest->b_history.b_history_head - BUFFER_HISTORY_SIZE; + + spin_unlock_irqrestore(&trace_lock, flags); + } + + static const char *b_list_to_string(unsigned int b_list) + { + switch (b_list) { + case BUF_CLEAN: return "BUF_CLEAN"; + case BUF_LOCKED: return "BUF_LOCKED"; + case BUF_DIRTY: return "BUF_DIRTY"; + default: return "Bad b_list"; + } + } + + static const char *b_jlist_to_string(unsigned int b_list) + { + switch (b_list) { + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + case BJ_None: return "BJ_None"; + case BJ_SyncData: return "BJ_SyncData"; + case BJ_AsyncData: return "BJ_AsyncData"; + case BJ_Metadata: return "BJ_Metadata"; + case BJ_Forget: return "BJ_Forget"; + case BJ_IO: return "BJ_IO"; + case BJ_Shadow: return "BJ_Shadow"; + case BJ_LogCtl: return "BJ_LogCtl"; + case BJ_Reserved: return "BJ_Reserved"; + #endif + default: return "Bad b_jlist"; + } + } + + static void print_one_hist(struct buffer_history_item *bhist_i) + { + printk(" %s\n", bhist_i->info); + printk(" b_state:0x%lx b_list:%s b_jlist:%s on_lru:%d\n", + bhist_i->b_state, + b_list_to_string(bhist_i->b_list), + b_jlist_to_string(bhist_i->b_jlist), + bhist_i->on_lru); + printk(" cpu:%d on_hash:%d b_count:%d b_blocknr:%lu\n", + bhist_i->cpu, + bhist_i->on_hash, + bhist_i->b_count, + bhist_i->b_blocknr); + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + printk(" b_jbd:%u b_frozen_data:%p b_committed_data:%p\n", + bhist_i->b_jbd, + bhist_i->b_frozen_data, + bhist_i->b_committed_data); + printk(" b_transaction:%u b_next_transaction:%u " + "b_cp_transaction:%u b_trans_is_running:%u\n", + 
bhist_i->b_transaction, + bhist_i->b_next_transaction, + bhist_i->b_cp_transaction, + bhist_i->b_trans_is_running); + printk(" b_trans_is_comitting:%u b_jcount:%u ", + bhist_i->b_trans_is_committing, + bhist_i->b_jcount); + #endif + printk("\n"); + } + + void print_buffer_fields(struct buffer_head *bh) + { + printk("b_next:%p, b_blocknr:%lu b_count:%d b_flushtime:%lu\n", + bh->b_next, bh->b_blocknr, atomic_read(&bh->b_count), + bh->b_flushtime); + printk("b_next_free:%p b_prev_free:%p b_this_page:%p b_reqnext:%p\n", + bh->b_next_free, bh->b_prev_free, bh->b_this_page, + bh->b_reqnext); + printk("b_pprev:%p b_data:%p b_page:%p b_inode:%p b_list:%d\n", + bh->b_pprev, bh->b_data, bh->b_page, bh->b_inode, bh->b_list); + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + if (buffer_jbd(bh)) { + struct journal_head *jh = bh2jh(bh); + + printk("b_jlist:%u b_frozen_data:%p b_committed_data:%p\n", + jh->b_jlist, jh->b_frozen_data, jh->b_committed_data); + printk(" b_transaction:%p b_next_transaction:%p " + "b_cp_transaction:%p\n", + jh->b_transaction, jh->b_next_transaction, + jh->b_cp_transaction); + printk("b_cpnext:%p b_cpprev:%p\n", + jh->b_cpnext, jh->b_cpprev); + } + #endif + } + + void print_buffer_trace(struct buffer_head *bh) + { + #ifdef CONFIG_X86 + extern void show_stack(unsigned long * esp); + #endif + + unsigned long idx, count; + unsigned long flags; + + printk("buffer trace for buffer at 0x%p (I am CPU %d)\n", + bh, smp_processor_id()); + BUFFER_TRACE(bh, ""); /* Record state now */ + + spin_lock_irqsave(&trace_lock, flags); + for ( idx = bh->b_history.b_history_tail, count = 0; + idx < bh->b_history.b_history_head && + count < BUFFER_HISTORY_SIZE; + idx++, count++) + print_one_hist(bh->b_history.b + + (idx & (BUFFER_HISTORY_SIZE - 1))); + + print_buffer_fields(bh); + spin_unlock_irqrestore(&trace_lock, flags); + #ifdef CONFIG_X86 + show_stack(NULL); + #endif + printk("\n"); + } + + static struct buffer_head *failed_buffer_head; /* For access with 
debuggers */ + + void buffer_assertion_failure(struct buffer_head *bh) + { + failed_buffer_head = bh; + print_buffer_trace(bh); + } + EXPORT_SYMBOL(buffer_trace); + EXPORT_SYMBOL(print_buffer_trace); + EXPORT_SYMBOL(buffer_assertion_failure); + EXPORT_SYMBOL(print_buffer_fields); + #endif /* CONFIG_BUFFER_DEBUG */ + diff -rc2P linux/fs/open.c linux-2.4.13/fs/open.c *** linux/fs/open.c Fri Nov 9 16:15:08 2001 --- linux-2.4.13/fs/open.c Fri Nov 9 16:57:59 2001 *************** *** 72,75 **** --- 72,81 ---- } + /* + * i_sem is taken outside i_truncate_sem because that is the + * order in which these locks are taken on the path + * generic_file_write->copy_from_user->handle_mm_fault->do_no_page + */ + int do_truncate(struct dentry *dentry, loff_t length) { *************** *** 83,89 **** --- 89,97 ---- down(&inode->i_sem); + down_write(&inode->i_truncate_sem); newattrs.ia_size = length; newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; error = notify_change(dentry, &newattrs); + up_write(&inode->i_truncate_sem); up(&inode->i_sem); return error; diff -rc2P linux/include/linux/buffer-trace.h linux-2.4.13/include/linux/buffer-trace.h *** linux/include/linux/buffer-trace.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/buffer-trace.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,84 ---- + /* + * include/linux/buffer-trace.h + * + * Debugging support for recording buffer_head state transitions + * + * May 2001, akpm + * Created + */ + + #ifndef BUFFER_TRACE_H_INCLUDED + #define BUFFER_TRACE_H_INCLUDED + + #include + + #ifdef CONFIG_BUFFER_DEBUG + + /* The number of records per buffer_head. 
Must be a power of two */ + #define BUFFER_HISTORY_SIZE 32 + + struct buffer_head; + + /* This gets embedded in struct buffer_head */ + struct buffer_history { + struct buffer_history_item { + char *info; + unsigned long b_state; + unsigned b_list:3; + unsigned b_jlist:4; + unsigned on_lru:1; + unsigned on_hash:1; + unsigned cpu:3; + unsigned b_count:8; + unsigned long b_blocknr; /* For src != dest */ + #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) + unsigned b_jcount:4; + unsigned b_jbd:1; + unsigned b_transaction:1; + unsigned b_next_transaction:1; + unsigned b_cp_transaction:1; + unsigned b_trans_is_running:1; + unsigned b_trans_is_committing:1; + void *b_frozen_data; + void *b_committed_data; + #endif + } b[BUFFER_HISTORY_SIZE]; + unsigned long b_history_head; /* Next place to write */ + unsigned long b_history_tail; /* Oldest valid entry */ + }; + + static inline void buffer_trace_init(struct buffer_history *bhist) + { + bhist->b_history_head = 0; + bhist->b_history_tail = 0; + } + extern void buffer_trace(struct buffer_head *dest, + struct buffer_head *src, char *info); + extern void print_buffer_fields(struct buffer_head *bh); + extern void print_buffer_trace(struct buffer_head *bh); + + #define BUFFER_STRINGIFY2(X) #X + #define BUFFER_STRINGIFY(X) BUFFER_STRINGIFY2(X) + + #define BUFFER_TRACE2(dest, src, info) \ + do { \ + buffer_trace((dest), (src), \ + __FUNCTION__"() ["__FILE__":" \ + BUFFER_STRINGIFY(__LINE__)"] " info); \ + } while (0) + + #define BUFFER_TRACE(bh, info) BUFFER_TRACE2(bh, bh, info) + #define JBUFFER_TRACE(jh, info) BUFFER_TRACE(jh2bh(jh), info) + + #else /* CONFIG_BUFFER_DEBUG */ + + #define buffer_trace_init(bh) do {} while (0) + #define print_buffer_fields(bh) do {} while (0) + #define print_buffer_trace(bh) do {} while (0) + #define BUFFER_TRACE(bh, info) do {} while (0) + #define BUFFER_TRACE2(bh, bh2, info) do {} while (0) + #define JBUFFER_TRACE(jh, info) do {} while (0) + + #endif /* CONFIG_BUFFER_DEBUG */ + + #endif /* 
BUFFER_TRACE_H_INCLUDED */ diff -rc2P linux/include/linux/capability.h linux-2.4.13/include/linux/capability.h *** linux/include/linux/capability.h Fri Nov 9 16:15:08 2001 --- linux-2.4.13/include/linux/capability.h Fri Nov 9 16:58:00 2001 *************** *** 251,254 **** --- 251,256 ---- /* Override quota limits. */ /* Override reserved space on ext2 filesystem */ + /* Modify data journaling mode on ext3 filesystem (uses journaling + resources) */ /* NOTE: ext2 honors fsuid when checking for resource overrides, so you can override using fsuid too */ diff -rc2P linux/include/linux/capability.h.orig linux-2.4.13/include/linux/capability.h.orig *** linux/include/linux/capability.h.orig Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/capability.h.orig Fri Nov 9 16:15:08 2001 *************** *** 0 **** --- 1,356 ---- + /* + * This is + * + * Andrew G. Morgan + * Alexander Kjeldaas + * with help from Aleph1, Roland Buresund and Andrew Main. + * + * See here for the libcap library ("POSIX draft" compliance): + * + * ftp://linux.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.2/ + */ + + #ifndef _LINUX_CAPABILITY_H + #define _LINUX_CAPABILITY_H + + #include + #include + + /* User-level do most of the mapping between kernel and user + capabilities based on the version tag given by the kernel. The + kernel might be somewhat backwards compatible, but don't bet on + it. */ + + /* XXX - Note, cap_t, is defined by POSIX to be an "opaque" pointer to + a set of three capability sets. The transposition of 3*the + following structure to such a composite is better handled in a user + library since the draft standard requires the use of malloc/free + etc.. 
*/ + + #define _LINUX_CAPABILITY_VERSION 0x19980330 + + typedef struct __user_cap_header_struct { + __u32 version; + int pid; + } *cap_user_header_t; + + typedef struct __user_cap_data_struct { + __u32 effective; + __u32 permitted; + __u32 inheritable; + } *cap_user_data_t; + + #ifdef __KERNEL__ + + /* #define STRICT_CAP_T_TYPECHECKS */ + + #ifdef STRICT_CAP_T_TYPECHECKS + + typedef struct kernel_cap_struct { + __u32 cap; + } kernel_cap_t; + + #else + + typedef __u32 kernel_cap_t; + + #endif + + #define _USER_CAP_HEADER_SIZE (2*sizeof(__u32)) + #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) + + #endif + + + /** + ** POSIX-draft defined capabilities. + **/ + + /* In a system with the [_POSIX_CHOWN_RESTRICTED] option defined, this + overrides the restriction of changing file ownership and group + ownership. */ + + #define CAP_CHOWN 0 + + /* Override all DAC access, including ACL execute access if + [_POSIX_ACL] is defined. Excluding DAC access covered by + CAP_LINUX_IMMUTABLE. */ + + #define CAP_DAC_OVERRIDE 1 + + /* Overrides all DAC restrictions regarding read and search on files + and directories, including ACL restrictions if [_POSIX_ACL] is + defined. Excluding DAC access covered by CAP_LINUX_IMMUTABLE. */ + + #define CAP_DAC_READ_SEARCH 2 + + /* Overrides all restrictions about allowed operations on files, where + file owner ID must be equal to the user ID, except where CAP_FSETID + is applicable. It doesn't override MAC and DAC restrictions. */ + + #define CAP_FOWNER 3 + + /* Overrides the following restrictions that the effective user ID + shall match the file owner ID when setting the S_ISUID and S_ISGID + bits on that file; that the effective group ID (or one of the + supplementary group IDs) shall match the file owner ID when setting + the S_ISGID bit on that file; that the S_ISUID and S_ISGID bits are + cleared on successful return from chown(2) (not implemented). 
*/ + + #define CAP_FSETID 4 + + /* Used to decide between falling back on the old suser() or fsuser(). */ + + #define CAP_FS_MASK 0x1f + + /* Overrides the restriction that the real or effective user ID of a + process sending a signal must match the real or effective user ID + of the process receiving the signal. */ + + #define CAP_KILL 5 + + /* Allows setgid(2) manipulation */ + /* Allows setgroups(2) */ + /* Allows forged gids on socket credentials passing. */ + + #define CAP_SETGID 6 + + /* Allows set*uid(2) manipulation (including fsuid). */ + /* Allows forged pids on socket credentials passing. */ + + #define CAP_SETUID 7 + + + /** + ** Linux-specific capabilities + **/ + + /* Transfer any capability in your permitted set to any pid, + remove any capability in your permitted set from any pid */ + + #define CAP_SETPCAP 8 + + /* Allow modification of S_IMMUTABLE and S_APPEND file attributes */ + + #define CAP_LINUX_IMMUTABLE 9 + + /* Allows binding to TCP/UDP sockets below 1024 */ + /* Allows binding to ATM VCIs below 32 */ + + #define CAP_NET_BIND_SERVICE 10 + + /* Allow broadcasting, listen to multicast */ + + #define CAP_NET_BROADCAST 11 + + /* Allow interface configuration */ + /* Allow administration of IP firewall, masquerading and accounting */ + /* Allow setting debug option on sockets */ + /* Allow modification of routing tables */ + /* Allow setting arbitrary process / process group ownership on + sockets */ + /* Allow binding to any address for transparent proxying */ + /* Allow setting TOS (type of service) */ + /* Allow setting promiscuous mode */ + /* Allow clearing driver statistics */ + /* Allow multicasting */ + /* Allow read/write of device-specific registers */ + /* Allow activation of ATM control sockets */ + + #define CAP_NET_ADMIN 12 + + /* Allow use of RAW sockets */ + /* Allow use of PACKET sockets */ + + #define CAP_NET_RAW 13 + + /* Allow locking of shared memory segments */ + /* Allow mlock and mlockall (which doesn't really have 
anything to do + with IPC) */ + + #define CAP_IPC_LOCK 14 + + /* Override IPC ownership checks */ + + #define CAP_IPC_OWNER 15 + + /* Insert and remove kernel modules - modify kernel without limit */ + /* Modify cap_bset */ + #define CAP_SYS_MODULE 16 + + /* Allow ioperm/iopl access */ + /* Allow sending USB messages to any device via /proc/bus/usb */ + + #define CAP_SYS_RAWIO 17 + + /* Allow use of chroot() */ + + #define CAP_SYS_CHROOT 18 + + /* Allow ptrace() of any process */ + + #define CAP_SYS_PTRACE 19 + + /* Allow configuration of process accounting */ + + #define CAP_SYS_PACCT 20 + + /* Allow configuration of the secure attention key */ + /* Allow administration of the random device */ + /* Allow examination and configuration of disk quotas */ + /* Allow configuring the kernel's syslog (printk behaviour) */ + /* Allow setting the domainname */ + /* Allow setting the hostname */ + /* Allow calling bdflush() */ + /* Allow mount() and umount(), setting up new smb connection */ + /* Allow some autofs root ioctls */ + /* Allow nfsservctl */ + /* Allow VM86_REQUEST_IRQ */ + /* Allow to read/write pci config on alpha */ + /* Allow irix_prctl on mips (setstacksize) */ + /* Allow flushing all cache on m68k (sys_cacheflush) */ + /* Allow removing semaphores */ + /* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores + and shared memory */ + /* Allow locking/unlocking of shared memory segment */ + /* Allow turning swap on/off */ + /* Allow forged pids on socket credentials passing */ + /* Allow setting readahead and flushing buffers on block devices */ + /* Allow setting geometry in floppy driver */ + /* Allow turning DMA on/off in xd driver */ + /* Allow administration of md devices (mostly the above, but some + extra ioctls) */ + /* Allow tuning the ide driver */ + /* Allow access to the nvram device */ + /* Allow administration of apm_bios, serial and bttv (TV) device */ + /* Allow manufacturer commands in isdn CAPI support driver */ + /* Allow 
reading non-standardized portions of pci configuration space */ + /* Allow DDI debug ioctl on sbpcd driver */ + /* Allow setting up serial ports */ + /* Allow sending raw qic-117 commands */ + /* Allow enabling/disabling tagged queuing on SCSI controllers and sending + arbitrary SCSI commands */ + /* Allow setting encryption key on loopback filesystem */ + /* Allow the selection of a security context */ + + #define CAP_SYS_ADMIN 21 + + /* Allow use of reboot() */ + + #define CAP_SYS_BOOT 22 + + /* Allow raising priority and setting priority on other (different + UID) processes */ + /* Allow use of FIFO and round-robin (realtime) scheduling on own + processes and setting the scheduling algorithm used by another + process. */ + + #define CAP_SYS_NICE 23 + + /* Override resource limits. Set resource limits. */ + /* Override quota limits. */ + /* Override reserved space on ext2 filesystem */ + /* NOTE: ext2 honors fsuid when checking for resource overrides, so + you can override using fsuid too */ + /* Override size restrictions on IPC message queues */ + /* Allow more than 64hz interrupts from the real-time clock */ + /* Override max number of consoles on console allocation */ + /* Override max number of keymaps */ + + #define CAP_SYS_RESOURCE 24 + + /* Allow manipulation of system clock */ + /* Allow irix_stime on mips */ + /* Allow setting the real-time clock */ + + #define CAP_SYS_TIME 25 + + /* Allow configuration of tty devices */ + /* Allow vhangup() of tty */ + + #define CAP_SYS_TTY_CONFIG 26 + + /* Allow the privileged aspects of mknod() */ + + #define CAP_MKNOD 27 + + /* Allow taking of leases on files */ + + #define CAP_LEASE 28 + + /* Allow opening special device file */ + + #define CAP_OPENDEV 29 + + #ifdef __KERNEL__ + /* + * Bounding set + */ + extern kernel_cap_t cap_bset; + + /* + * Internal kernel functions only + */ + + #ifdef STRICT_CAP_T_TYPECHECKS + + #define to_cap_t(x) { x } + #define cap_t(x) (x).cap + + #else + + #define to_cap_t(x) (x) + 
#define cap_t(x) (x) + + #endif + + #define CAP_EMPTY_SET to_cap_t(0) + #define CAP_FULL_SET to_cap_t(~0) + #define CAP_INIT_EFF_SET to_cap_t(~0 & ~CAP_TO_MASK(CAP_SETPCAP)) + #define CAP_INIT_INH_SET to_cap_t(0) + + #define CAP_TO_MASK(x) (1 << (x)) + #define cap_raise(c, flag) (cap_t(c) |= CAP_TO_MASK(flag)) + #define cap_lower(c, flag) (cap_t(c) &= ~CAP_TO_MASK(flag)) + #define cap_raised(c, flag) (cap_t(c) & CAP_TO_MASK(flag)) + + static inline kernel_cap_t cap_combine(kernel_cap_t a, kernel_cap_t b) + { + kernel_cap_t dest; + cap_t(dest) = cap_t(a) | cap_t(b); + return dest; + } + + static inline kernel_cap_t cap_intersect(kernel_cap_t a, kernel_cap_t b) + { + kernel_cap_t dest; + cap_t(dest) = cap_t(a) & cap_t(b); + return dest; + } + + static inline kernel_cap_t cap_drop(kernel_cap_t a, kernel_cap_t drop) + { + kernel_cap_t dest; + cap_t(dest) = cap_t(a) & ~cap_t(drop); + return dest; + } + + static inline kernel_cap_t cap_invert(kernel_cap_t c) + { + kernel_cap_t dest; + cap_t(dest) = ~cap_t(c); + return dest; + } + + #define cap_isclear(c) (!cap_t(c)) + #define cap_issubset(a,set) (!(cap_t(a) & ~cap_t(set))) + + #define cap_clear(c) do { cap_t(c) = 0; } while(0) + #define cap_set_full(c) do { cap_t(c) = ~0; } while(0) + #define cap_mask(c,mask) do { cap_t(c) &= cap_t(mask); } while(0) + + #define cap_is_fs_cap(c) (CAP_TO_MASK(c) & CAP_FS_MASK) + + #endif /* __KERNEL__ */ + + #endif /* !_LINUX_CAPABILITY_H */ diff -rc2P linux/include/linux/ext3_fs.h linux-2.4.13/include/linux/ext3_fs.h *** linux/include/linux/ext3_fs.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/ext3_fs.h Fri Nov 9 17:05:34 2001 *************** *** 0 **** --- 1,716 ---- + /* + * linux/include/linux/ext3_fs.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + 
*/ + + #ifndef _LINUX_EXT3_FS_H + #define _LINUX_EXT3_FS_H + + #include + + /* + * The second extended filesystem constants/structures + */ + + /* + * Define EXT3FS_DEBUG to produce debug messages + */ + #undef EXT3FS_DEBUG + + /* + * Define EXT3_PREALLOCATE to preallocate data blocks for expanding files + */ + #undef EXT3_PREALLOCATE /* @@@ Fix this! */ + #define EXT3_DEFAULT_PREALLOC_BLOCKS 8 + + /* + * The second extended file system version + */ + #define EXT3FS_DATE "21 Oct 2001" + #define EXT3FS_VERSION "2.4-0.9.13" + + /* + * Debug code + */ + #ifdef EXT3FS_DEBUG + #define ext3_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __FUNCTION__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) + #else + #define ext3_debug(f, a...) do {} while (0) + #endif + + /* + * Special inodes numbers + */ + #define EXT3_BAD_INO 1 /* Bad blocks inode */ + #define EXT3_ROOT_INO 2 /* Root inode */ + #define EXT3_ACL_IDX_INO 3 /* ACL inode */ + #define EXT3_ACL_DATA_INO 4 /* ACL inode */ + #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */ + #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */ + #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */ + #define EXT3_JOURNAL_INO 8 /* Journal inode */ + + /* First non-reserved inode for old ext3 filesystems */ + #define EXT3_GOOD_OLD_FIRST_INO 11 + + /* + * The second extended file system magic number + */ + #define EXT3_SUPER_MAGIC 0xEF53 + + /* + * Maximal count of links to a file + */ + #define EXT3_LINK_MAX 32000 + + /* + * Macro-instructions used to manage several block sizes + */ + #define EXT3_MIN_BLOCK_SIZE 1024 + #define EXT3_MAX_BLOCK_SIZE 4096 + #define EXT3_MIN_BLOCK_LOG_SIZE 10 + #ifdef __KERNEL__ + # define EXT3_BLOCK_SIZE(s) ((s)->s_blocksize) + #else + # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size) + #endif + #define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry)) + #define 
EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32)) + #ifdef __KERNEL__ + # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) + #else + # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) + #endif + #ifdef __KERNEL__ + #define EXT3_ADDR_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_addr_per_block_bits) + #define EXT3_INODE_SIZE(s) ((s)->u.ext3_sb.s_inode_size) + #define EXT3_FIRST_INO(s) ((s)->u.ext3_sb.s_first_ino) + #else + #define EXT3_INODE_SIZE(s) (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \ + EXT3_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) + #define EXT3_FIRST_INO(s) (((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \ + EXT3_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) + #endif + + /* + * Macro-instructions used to manage fragments + */ + #define EXT3_MIN_FRAG_SIZE 1024 + #define EXT3_MAX_FRAG_SIZE 4096 + #define EXT3_MIN_FRAG_LOG_SIZE 10 + #ifdef __KERNEL__ + # define EXT3_FRAG_SIZE(s) ((s)->u.ext3_sb.s_frag_size) + # define EXT3_FRAGS_PER_BLOCK(s) ((s)->u.ext3_sb.s_frags_per_block) + #else + # define EXT3_FRAG_SIZE(s) (EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size) + # define EXT3_FRAGS_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s)) + #endif + + /* + * ACL structures + */ + struct ext3_acl_header /* Header of Access Control Lists */ + { + __u32 aclh_size; + __u32 aclh_file_count; + __u32 aclh_acle_count; + __u32 aclh_first_acle; + }; + + struct ext3_acl_entry /* Access Control List Entry */ + { + __u32 acle_size; + __u16 acle_perms; /* Access permissions */ + __u16 acle_type; /* Type of entry */ + __u16 acle_tag; /* User or group identity */ + __u16 acle_pad1; + __u32 acle_next; /* Pointer on next entry for the */ + /* same inode or on next free entry */ + }; + + /* + * Structure of a blocks group descriptor + */ + struct ext3_group_desc + { + __u32 bg_block_bitmap; /* Blocks bitmap block */ + __u32 bg_inode_bitmap; /* Inodes bitmap block */ + __u32 bg_inode_table; /* Inodes table block */ + __u16 bg_free_blocks_count; /* Free blocks count */ + 
__u16 bg_free_inodes_count; /* Free inodes count */ + __u16 bg_used_dirs_count; /* Directories count */ + __u16 bg_pad; + __u32 bg_reserved[3]; + }; + + /* + * Macro-instructions used to manage group descriptors + */ + #ifdef __KERNEL__ + # define EXT3_BLOCKS_PER_GROUP(s) ((s)->u.ext3_sb.s_blocks_per_group) + # define EXT3_DESC_PER_BLOCK(s) ((s)->u.ext3_sb.s_desc_per_block) + # define EXT3_INODES_PER_GROUP(s) ((s)->u.ext3_sb.s_inodes_per_group) + # define EXT3_DESC_PER_BLOCK_BITS(s) ((s)->u.ext3_sb.s_desc_per_block_bits) + #else + # define EXT3_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) + # define EXT3_DESC_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc)) + # define EXT3_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) + #endif + + /* + * Constants relative to the data blocks + */ + #define EXT3_NDIR_BLOCKS 12 + #define EXT3_IND_BLOCK EXT3_NDIR_BLOCKS + #define EXT3_DIND_BLOCK (EXT3_IND_BLOCK + 1) + #define EXT3_TIND_BLOCK (EXT3_DIND_BLOCK + 1) + #define EXT3_N_BLOCKS (EXT3_TIND_BLOCK + 1) + + /* + * Inode flags + */ + #define EXT3_SECRM_FL 0x00000001 /* Secure deletion */ + #define EXT3_UNRM_FL 0x00000002 /* Undelete */ + #define EXT3_COMPR_FL 0x00000004 /* Compress file */ + #define EXT3_SYNC_FL 0x00000008 /* Synchronous updates */ + #define EXT3_IMMUTABLE_FILE_FL 0x00000010 /* Immutable file */ + #define EXT3_APPEND_FL 0x00000020 /* writes to file may only append */ + #define EXT3_NODUMP_FL 0x00000040 /* do not dump file */ + #define EXT3_NOATIME_FL 0x00000080 /* do not update atime */ + /* Reserved for compression usage... 
*/ + #define EXT3_DIRTY_FL 0x00000100 + #define EXT3_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ + #define EXT3_NOCOMPR_FL 0x00000400 /* Don't compress */ + #define EXT3_ECOMPR_FL 0x00000800 /* Compression error */ + /* End compression flags --- maybe not all used */ + #define EXT3_INDEX_FL 0x00001000 /* hash-indexed directory */ + #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ + #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ + #define EXT3_IMMUTABLE_LINK_FL 0x00008000 /* Immutable link */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ + + #define EXT3_FL_USER_VISIBLE 0x00009FFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000080FF /* User modifiable flags */ + + /* + * Inode dynamic state flags + */ + #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ + #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ + + /* + * ioctl commands + */ + #define EXT3_IOC_GETFLAGS _IOR('f', 1, long) + #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) + #define EXT3_IOC_GETVERSION _IOR('f', 3, long) + #define EXT3_IOC_SETVERSION _IOW('f', 4, long) + #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) + #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) + #ifdef CONFIG_JBD_DEBUG + #define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) + #endif + + /* + * Structure of an inode on the disk + */ + struct ext3_inode { + __u16 i_mode; /* File mode */ + __u16 i_uid; /* Low 16 bits of Owner Uid */ + __u32 i_size; /* Size in bytes */ + __u32 i_atime; /* Access time */ + __u32 i_ctime; /* Creation time */ + __u32 i_mtime; /* Modification time */ + __u32 i_dtime; /* Deletion Time */ + __u16 i_gid; /* Low 16 bits of Group Id */ + __u16 i_links_count; /* Links count */ + __u32 i_blocks; /* Blocks count */ + __u32 i_flags; /* File flags */ + union { + struct { + __u32 l_i_reserved1; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } 
osd1; /* OS dependent 1 */ + __u32 i_block[EXT3_N_BLOCKS];/* Pointers to blocks */ + __u32 i_generation; /* File version (for NFS) */ + __u32 i_file_acl; /* File ACL */ + __u32 i_dir_acl; /* Directory ACL */ + __u32 i_faddr; /* Fragment address */ + union { + struct { + __u8 l_i_frag; /* Fragment number */ + __u8 l_i_fsize; /* Fragment size */ + __u16 i_pad1; + __u16 l_i_uid_high; /* these 2 fields */ + __u16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __u8 h_i_frag; /* Fragment number */ + __u8 h_i_fsize; /* Fragment size */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __u8 m_i_frag; /* Fragment number */ + __u8 m_i_fsize; /* Fragment size */ + __u16 m_pad1; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + }; + + #define i_size_high i_dir_acl + + #if defined(__KERNEL__) || defined(__linux__) + #define i_reserved1 osd1.linux1.l_i_reserved1 + #define i_frag osd2.linux2.l_i_frag + #define i_fsize osd2.linux2.l_i_fsize + #define i_uid_low i_uid + #define i_gid_low i_gid + #define i_uid_high osd2.linux2.l_i_uid_high + #define i_gid_high osd2.linux2.l_i_gid_high + #define i_reserved2 osd2.linux2.l_i_reserved2 + + #elif defined(__GNU__) + + #define i_translator osd1.hurd1.h_i_translator + #define i_frag osd2.hurd2.h_i_frag /* no trailing ';': expands to a plain member access, like the linux and masix variants */ + #define i_fsize osd2.hurd2.h_i_fsize /* likewise, no trailing ';' */ + #define i_uid_high osd2.hurd2.h_i_uid_high + #define i_gid_high osd2.hurd2.h_i_gid_high + #define i_author osd2.hurd2.h_i_author + + #elif defined(__masix__) + + #define i_reserved1 osd1.masix1.m_i_reserved1 + #define i_frag osd2.masix2.m_i_frag + #define i_fsize osd2.masix2.m_i_fsize + #define i_reserved2 osd2.masix2.m_i_reserved2 + + #endif /* defined(__KERNEL__) || defined(__linux__) */ + + /* + * File system states + */ + #define EXT3_VALID_FS 0x0001 /* Unmounted cleanly */ + #define EXT3_ERROR_FS 0x0002 /* Errors detected */ + #define EXT3_ORPHAN_FS 0x0004 /* Orphans being
recovered */ + + /* + * Mount flags + */ + #define EXT3_MOUNT_CHECK 0x0001 /* Do mount-time checks */ + #define EXT3_MOUNT_GRPID 0x0004 /* Create files with directory's group */ + #define EXT3_MOUNT_DEBUG 0x0008 /* Some debugging messages */ + #define EXT3_MOUNT_ERRORS_CONT 0x0010 /* Continue on errors */ + #define EXT3_MOUNT_ERRORS_RO 0x0020 /* Remount fs ro on errors */ + #define EXT3_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */ + #define EXT3_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */ + #define EXT3_MOUNT_NOLOAD 0x0100 /* Don't use existing journal*/ + #define EXT3_MOUNT_ABORT 0x0200 /* Fatal error detected */ + #define EXT3_MOUNT_DATA_FLAGS 0x0C00 /* Mode for data writes: */ + #define EXT3_MOUNT_JOURNAL_DATA 0x0400 /* Write data to journal */ + #define EXT3_MOUNT_ORDERED_DATA 0x0800 /* Flush data before commit */ + #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ + #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ + #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H + #define clear_opt(o, opt) o &= ~EXT3_MOUNT_##opt + #define set_opt(o, opt) o |= EXT3_MOUNT_##opt + #define test_opt(sb, opt) ((sb)->u.ext3_sb.s_mount_opt & \ + EXT3_MOUNT_##opt) + #else + #define EXT2_MOUNT_NOLOAD EXT3_MOUNT_NOLOAD + #define EXT2_MOUNT_ABORT EXT3_MOUNT_ABORT + #endif + + #define ext3_set_bit ext2_set_bit + #define ext3_clear_bit ext2_clear_bit + #define ext3_test_bit ext2_test_bit + #define ext3_find_first_zero_bit ext2_find_first_zero_bit + #define ext3_find_next_zero_bit ext2_find_next_zero_bit + + /* + * Maximal mount counts between two filesystem checks + */ + #define EXT3_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ + #define EXT3_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + + /* + * Behaviour when detecting errors + */ + #define EXT3_ERRORS_CONTINUE 1 /* Continue execution */ + #define EXT3_ERRORS_RO 2 
/* Remount fs read-only */ + #define EXT3_ERRORS_PANIC 3 /* Panic */ + #define EXT3_ERRORS_DEFAULT EXT3_ERRORS_CONTINUE + + /* + * Structure of the super block + */ + struct ext3_super_block { + /*00*/ __u32 s_inodes_count; /* Inodes count */ + __u32 s_blocks_count; /* Blocks count */ + __u32 s_r_blocks_count; /* Reserved blocks count */ + __u32 s_free_blocks_count; /* Free blocks count */ + /*10*/ __u32 s_free_inodes_count; /* Free inodes count */ + __u32 s_first_data_block; /* First Data Block */ + __u32 s_log_block_size; /* Block size */ + __s32 s_log_frag_size; /* Fragment size */ + /*20*/ __u32 s_blocks_per_group; /* # Blocks per group */ + __u32 s_frags_per_group; /* # Fragments per group */ + __u32 s_inodes_per_group; /* # Inodes per group */ + __u32 s_mtime; /* Mount time */ + /*30*/ __u32 s_wtime; /* Write time */ + __u16 s_mnt_count; /* Mount count */ + __s16 s_max_mnt_count; /* Maximal mount count */ + __u16 s_magic; /* Magic signature */ + __u16 s_state; /* File system state */ + __u16 s_errors; /* Behaviour when detecting errors */ + __u16 s_minor_rev_level; /* minor revision level */ + /*40*/ __u32 s_lastcheck; /* time of last check */ + __u32 s_checkinterval; /* max. time between checks */ + __u32 s_creator_os; /* OS */ + __u32 s_rev_level; /* Revision level */ + /*50*/ __u16 s_def_resuid; /* Default uid for reserved blocks */ + __u16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT3_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... 
+ */ + __u32 s_first_ino; /* First non-reserved inode */ + __u16 s_inode_size; /* size of inode structure */ + __u16 s_block_group_nr; /* block group # of this superblock */ + __u32 s_feature_compat; /* compatible feature set */ + /*60*/ __u32 s_feature_incompat; /* incompatible feature set */ + __u32 s_feature_ro_compat; /* readonly-compatible feature set */ + /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ + /*78*/ char s_volume_name[16]; /* volume name */ + /*88*/ char s_last_mounted[64]; /* directory where last mounted */ + /*C8*/ __u32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __u16 s_padding1; + /* + * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. + */ + /*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ + /*E0*/ __u32 s_journal_inum; /* inode number of journal file */ + __u32 s_journal_dev; /* device number of journal file */ + __u32 s_last_orphan; /* start of list of inodes to delete */ + + /*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */ + }; + + #ifdef __KERNEL__ + #define EXT3_SB(sb) (&((sb)->u.ext3_sb)) + #define EXT3_I(inode) (&((inode)->u.ext3_i)) + #else + /* Assume that user mode programs are passing in an ext3fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. 
*/ + #define EXT3_SB(sb) (sb) + #endif + + #define NEXT_ORPHAN(inode) ((inode)->u.ext3_i.i_dtime) /* parenthesized member access; still usable as an lvalue */ + + /* + * Codes for operating systems + */ + #define EXT3_OS_LINUX 0 + #define EXT3_OS_HURD 1 + #define EXT3_OS_MASIX 2 + #define EXT3_OS_FREEBSD 3 + #define EXT3_OS_LITES 4 + + /* + * Revision levels + */ + #define EXT3_GOOD_OLD_REV 0 /* The good old (original) format */ + #define EXT3_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + + #define EXT3_CURRENT_REV EXT3_GOOD_OLD_REV + #define EXT3_MAX_SUPP_REV EXT3_DYNAMIC_REV + + #define EXT3_GOOD_OLD_INODE_SIZE 128 + + /* + * Feature set definitions: HAS tests, SET sets and CLEAR clears the bits of mask in the s_es feature words; every expansion is one parenthesized expression + */ + + #define EXT3_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) + #define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) + #define EXT3_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) + #define EXT3_SET_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) ) + #define EXT3_SET_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) ) + #define EXT3_SET_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) ) + #define EXT3_CLEAR_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) ) + #define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) ) + #define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + ( EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) ) + + #define EXT3_FEATURE_COMPAT_DIR_PREALLOC 0x0001 + #define EXT3_FEATURE_COMPAT_IMAGIC_INODES 0x0002 + #define EXT3_FEATURE_COMPAT_HAS_JOURNAL 0x0004 + #define EXT3_FEATURE_COMPAT_EXT_ATTR 0x0008 + #define EXT3_FEATURE_COMPAT_RESIZE_INODE 0x0010 + #define EXT3_FEATURE_COMPAT_DIR_INDEX 0x0020 + + #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 + #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +
#define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 + + #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 + #define EXT3_FEATURE_INCOMPAT_FILETYPE 0x0002 + #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ + #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ + + #define EXT3_FEATURE_COMPAT_SUPP 0 + #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \ + EXT3_FEATURE_INCOMPAT_RECOVER) + #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT3_FEATURE_RO_COMPAT_BTREE_DIR) + + /* + * Default values for user and/or group using reserved blocks + */ + #define EXT3_DEF_RESUID 0 + #define EXT3_DEF_RESGID 0 + + /* + * Structure of a directory entry + */ + #define EXT3_NAME_LEN 255 + + struct ext3_dir_entry { + __u32 inode; /* Inode number */ + __u16 rec_len; /* Directory entry length */ + __u16 name_len; /* Name length */ + char name[EXT3_NAME_LEN]; /* File name */ + }; + + /* + * The new version of the directory entry. Since EXT3 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ + struct ext3_dir_entry_2 { + __u32 inode; /* Inode number */ + __u16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT3_NAME_LEN]; /* File name */ + }; + + /* + * Ext3 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. 
+ */ + #define EXT3_FT_UNKNOWN 0 + #define EXT3_FT_REG_FILE 1 + #define EXT3_FT_DIR 2 + #define EXT3_FT_CHRDEV 3 + #define EXT3_FT_BLKDEV 4 + #define EXT3_FT_FIFO 5 + #define EXT3_FT_SOCK 6 + #define EXT3_FT_SYMLINK 7 + + #define EXT3_FT_MAX 8 + + /* + * EXT3_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ + #define EXT3_DIR_PAD 4 + #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1) + #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \ + ~EXT3_DIR_ROUND) + + #ifdef __KERNEL__ + + /* Filesize hard limits for 64-bit file offsets */ + extern long long ext3_max_sizes[]; + + /* + * Describe an inode's exact location on disk and in memory + */ + struct ext3_iloc + { + struct buffer_head *bh; + struct ext3_inode *raw_inode; + unsigned long block_group; + }; + + /* + * Function prototypes + */ + + /* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext3 source programs needs to include it so they are duplicated here. + */ + # define NORET_TYPE /**/ + # define ATTRIB_NORET __attribute__((noreturn)) + # define NORET_AND noreturn, + + /* acl.c */ + extern int ext3_permission (struct inode *, int); + + /* balloc.c */ + extern int ext3_bg_has_super(struct super_block *sb, int group); + extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); + extern int ext3_new_block (handle_t *, struct inode *, unsigned long, + __u32 *, __u32 *, int *); + extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, + unsigned long); + extern unsigned long ext3_count_free_blocks (struct super_block *); + extern void ext3_check_blocks_bitmap (struct super_block *); + extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, + unsigned int block_group, + struct buffer_head ** bh); + + /* bitmap.c */ + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* dir.c */ + extern int ext3_check_dir_entry(const char *, struct inode *, + struct ext3_dir_entry_2 *,
struct buffer_head *, + unsigned long); + + /* file.c */ + + /* fsync.c */ + extern int ext3_sync_file (struct file *, struct dentry *, int); + + /* ialloc.c */ + extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int); + extern void ext3_free_inode (handle_t *, struct inode *); + extern struct inode * ext3_orphan_get (struct super_block *, ino_t); + extern unsigned long ext3_count_free_inodes (struct super_block *); + extern void ext3_check_inodes_bitmap (struct super_block *); + + /* inode.c */ + + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); + + extern int ext3_get_inode_loc (struct inode *, struct ext3_iloc *); + extern void ext3_read_inode (struct inode *); + extern void ext3_write_inode (struct inode *, int); + extern int ext3_setattr (struct dentry *, struct iattr *); + extern void ext3_put_inode (struct inode *); + extern void ext3_delete_inode (struct inode *); + extern int ext3_sync_inode (handle_t *, struct inode *); + extern void ext3_discard_prealloc (struct inode *); + extern void ext3_dirty_inode(struct inode *); + extern int ext3_change_inode_journal_flag(struct inode *, int); + + /* ioctl.c */ + extern int ext3_ioctl (struct inode *, struct file *, unsigned int, + unsigned long); + + /* namei.c */ + extern struct inode_operations ext3_dir_inode_operations; + extern int ext3_orphan_add(handle_t *, struct inode *); + extern int ext3_orphan_del(handle_t *, struct inode *); + + /* super.c */ + extern void ext3_error (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); + extern void __ext3_std_error (struct super_block *, const char *, int); + extern void ext3_abort (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); + extern NORET_TYPE void ext3_panic (struct super_block *, const char *, + const char *, ...) 
+ __attribute__ ((NORET_AND format (printf, 3, 4))); + extern void ext3_warning (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); + extern void ext3_update_dynamic_rev (struct super_block *sb); + extern void ext3_put_super (struct super_block *); + extern void ext3_write_super (struct super_block *); + extern void ext3_write_super_lockfs (struct super_block *); + extern void ext3_unlockfs (struct super_block *); + extern int ext3_remount (struct super_block *, int *, char *); + extern struct super_block * ext3_read_super (struct super_block *,void *,int); + extern int ext3_statfs (struct super_block *, struct statfs *); + + /* truncate.c */ + extern void ext3_truncate (struct inode *); + + #define ext3_std_error(sb, errno) \ + do { \ + if ((errno)) \ + __ext3_std_error((sb), __FUNCTION__, (errno)); \ + } while (0) + extern const char *ext3_decode_error(struct super_block *sb, int errno, char nbuf[16]); + + /* + * Inodes and files operations + */ + + /* dir.c */ + extern struct file_operations ext3_dir_operations; + + /* file.c */ + extern struct inode_operations ext3_file_inode_operations; + extern struct file_operations ext3_file_operations; + + /* symlink.c */ + extern struct inode_operations ext3_fast_symlink_inode_operations; + + extern struct address_space_operations ext3_aops; + + #endif /* __KERNEL__ */ + + #endif /* _LINUX_EXT3_FS_H */ diff -rc2P linux/include/linux/ext3_fs_i.h linux-2.4.13/include/linux/ext3_fs_i.h *** linux/include/linux/ext3_fs_i.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/ext3_fs_i.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,78 ---- + /* + * linux/include/linux/ext3_fs_i.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_i.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + + #ifndef 
_LINUX_EXT3_FS_I + #define _LINUX_EXT3_FS_I + + #include <linux/rwsem.h> + + /* + * third extended file system inode data in memory + */ + struct ext3_inode_info { + __u32 i_data[15]; + __u32 i_flags; + #ifdef EXT3_FRAGMENTS + __u32 i_faddr; + __u8 i_frag_no; + __u8 i_frag_size; + __u16 unused; /* formerly i_osync */ + #endif + __u32 i_file_acl; + __u32 i_dir_acl; + __u32 i_dtime; + __u32 i_block_group; + __u32 i_state; /* Dynamic state flags for ext3 */ + __u32 i_next_alloc_block; + __u32 i_next_alloc_goal; + #ifdef EXT3_PREALLOCATE + __u32 i_prealloc_block; + __u32 i_prealloc_count; + #endif + __u32 i_dir_start_lookup; + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext3_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext3_get_block (growth) and ext3_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * truncate_sem is for serialising ext3_truncate() against + * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext3 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have truncate_sem.
+ */ + struct rw_semaphore truncate_sem; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff -rc2P linux/include/linux/ext3_fs_sb.h linux-2.4.13/include/linux/ext3_fs_sb.h *** linux/include/linux/ext3_fs_sb.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/ext3_fs_sb.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,77 ---- + /* + * linux/include/linux/ext3_fs_sb.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_sb.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + + #ifndef _LINUX_EXT3_FS_SB + #define _LINUX_EXT3_FS_SB + + #ifdef __KERNEL__ + #include <linux/timer.h> + #include <linux/wait.h> + #endif + + /* + * The following is not needed anymore since the descriptors buffer + * heads are now dynamically allocated + */ + /* #define EXT3_MAX_GROUP_DESC 8 */ + + #define EXT3_MAX_GROUP_LOADED 8 + + /* + * third extended-fs super-block data in memory + */ + struct ext3_sb_info { + unsigned long s_frag_size; /* Size of a fragment in bytes */ + unsigned long s_frags_per_block;/* Number of fragments per block */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_frags_per_group;/* Number of fragments in a group */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + unsigned long s_groups_count; /* Number of groups in the fs */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext3_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned short s_loaded_inode_bitmaps; + unsigned short
s_loaded_block_bitmaps; + unsigned long s_inode_bitmap_number[EXT3_MAX_GROUP_LOADED]; + struct buffer_head * s_inode_bitmap[EXT3_MAX_GROUP_LOADED]; + unsigned long s_block_bitmap_number[EXT3_MAX_GROUP_LOADED]; + struct buffer_head * s_block_bitmap[EXT3_MAX_GROUP_LOADED]; + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + + /* Journaling */ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; + unsigned long s_commit_interval; + struct block_device *journal_bdev; + #ifdef CONFIG_JBD_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ + #endif + }; + + #endif /* _LINUX_EXT3_FS_SB */ diff -rc2P linux/include/linux/ext3_jbd.h linux-2.4.13/include/linux/ext3_jbd.h *** linux/include/linux/ext3_jbd.h Wed Dec 31 19:00:00 1969 --- linux-2.4.13/include/linux/ext3_jbd.h Fri Nov 9 16:58:00 2001 *************** *** 0 **** --- 1,290 ---- + /* + * linux/include/linux/ext3_jbd.h + * + * Written by Stephen C. Tweedie , 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Ext3-specific journaling extensions. + */ + + #ifndef _LINUX_EXT3_JBD_H + #define _LINUX_EXT3_JBD_H + + #include <linux/fs.h> + #include <linux/jbd.h> + #include <linux/ext3_fs.h> + + #define EXT3_JOURNAL(inode) (EXT3_SB((inode)->i_sb)->s_journal) + + /* Define the number of blocks we need to account to a transaction to + * modify one block of data.
+ * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. */ + + #define EXT3_SINGLEDATA_TRANS_BLOCKS 8 + + /* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + + #define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2) + + extern int ext3_writepage_trans_blocks(struct inode *inode); + + /* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + + #define EXT3_DELETE_TRANS_BLOCKS (2 * EXT3_DATA_TRANS_BLOCKS + 64) + + /* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + + #define EXT3_MAX_TRANS_DATA 64 + + /* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + + #define EXT3_RESERVE_TRANS_BLOCKS 12 + + int + ext3_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext3_iloc *iloc); + + /* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. 
+ */ + + int ext3_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext3_iloc *iloc); + + int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode); + + /* + * Wrapper functions with which ext3 calls into JBD. The intent here is + * to allow these to be turned into appropriate stubs so ext3 can control + * ext2 filesystems, so ext2+ext3 systems only nee one fs. This work hasn't + * been done yet. + */ + + static inline void ext3_journal_abort_handle(const char *caller, + const char *err_fn, + struct buffer_head *bh, + handle_t *handle, + int err) + { + char nbuf[16]; + const char *errstr = ext3_decode_error(NULL, err, nbuf); + + printk(KERN_ERR "%s: aborting transaction: %s in %s", + caller, errstr, err_fn); + + if (bh) + BUFFER_TRACE(bh, "abort"); + journal_abort_handle(handle); + if (!handle->h_err) + handle->h_err = err; + } + + static inline int + __ext3_journal_get_undo_access(const char *where, + handle_t *handle, struct buffer_head *bh) + { + int err = journal_get_undo_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline int + __ext3_journal_get_write_access(const char *where, + handle_t *handle, struct buffer_head *bh) + { + int err = journal_get_write_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline int + __ext3_journal_dirty_data(const char *where, + handle_t *handle, struct buffer_head *bh, int async) + { + int err = journal_dirty_data(handle, bh, async); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline void + ext3_journal_forget(handle_t *handle, struct buffer_head *bh) + { + journal_forget(handle, bh); + } + + static inline int + __ext3_journal_revoke(const char *where, handle_t *handle, + unsigned long blocknr, struct buffer_head *bh) + { + int err = journal_revoke(handle, blocknr, bh); + if (err) + 
ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline int + __ext3_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh) + { + int err = journal_get_create_access(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + static inline int + __ext3_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh) + { + int err = journal_dirty_metadata(handle, bh); + if (err) + ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + return err; + } + + + #define ext3_journal_get_undo_access(handle, bh) \ + __ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh)) + #define ext3_journal_get_write_access(handle, bh) \ + __ext3_journal_get_write_access(__FUNCTION__, (handle), (bh)) + #define ext3_journal_dirty_data(handle, bh, async) \ + __ext3_journal_dirty_data(__FUNCTION__, (handle), (bh), (async)) + #define ext3_journal_revoke(handle, blocknr, bh) \ + __ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) + #define ext3_journal_get_create_access(handle, bh) \ + __ext3_journal_get_create_access(__FUNCTION__, (handle), (bh)) + #define ext3_journal_dirty_metadata(handle, bh) \ + __ext3_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) + + + + /* + * Wrappers for journal_start/end. + * + * The only special thing we need to do here is to make sure that all + * journal_end calls result in the superblock being marked dirty, so + * that sync() will call the filesystem's write_super callback if + * appropriate. 
+ */ + static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks) + { /* returns ERR_PTR(-EROFS) when the inode's superblock has MS_RDONLY set; otherwise the result of journal_start() on the inode's journal with nblocks */ + if (inode->i_sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + return journal_start(EXT3_JOURNAL(inode), nblocks); + } + + static inline handle_t * + ext3_journal_try_start(struct inode *inode, int nblocks) + { /* same MS_RDONLY check as ext3_journal_start(), but hands off to journal_try_start() */ + if (inode->i_sb->s_flags & MS_RDONLY) + return ERR_PTR(-EROFS); + return journal_try_start(EXT3_JOURNAL(inode), nblocks); + } + + /* + * The only special thing we need to do here is to make sure