diff -rc2P linux/Documentation/Configure.help linux-2.4.13/Documentation/Configure.help
*** linux/Documentation/Configure.help	Sat Oct 20 22:17:19 2001
--- linux-2.4.13/Documentation/Configure.help	Fri Nov  9 16:58:00 2001
***************
*** 12059,12062 ****
--- 12059,12132 ----
    wants to say Y here.
  
+ Ext3 journaling file system support (EXPERIMENTAL)
+ CONFIG_EXT3_FS
+   This is the journaling version of the Second extended file system
+   (often called ext3), the de facto standard Linux file system
+   (method to organize files on a storage device) for hard disks. 
+ 
+   The journaling code included in this driver means you do not have
+   to run e2fsck (file system checker) on your file systems after a
+   crash.  The journal keeps track of any changes that were being made
+   at the time the system crashed, and can ensure that your file system
+   is consistent without the need for a lengthy check.
+ 
+   Other than adding the journal to the filesystem, the on-disk format of
+   ext3 is identical to ext2.  It is possible to freely switch between
+   using the ext3 driver and the ext2 driver, as long as the filesystem
+   has been cleanly unmounted, or e2fsck is run on the filesystem.
+ 
+   To add a journal on an existing ext2 filesystem or change the behavior
+   of ext3 file systems, you can use the tune2fs utility ("man tune2fs").
+   To modify attributes of files and directories on ext3 file systems,
+   use chattr ("man chattr").  You need to be using e2fsprogs version
+   1.20 or later in order to create ext3 journals (available at
+   <http://sourceforge.net/projects/e2fsprogs/>).
+   
+   If you want to compile this file system as a module ( = code which
+   can be inserted in and removed from the running kernel whenever you
+   want), say M here and read Documentation/modules.txt. The module
+   will be called ext3.o. Be aware however that the file system of your
+   root partition (the one containing the directory /) cannot be
+   compiled as a module, and so this may be dangerous.
+ 
+ Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
+ CONFIG_JBD
+   This is a generic journaling layer for block devices. It is currently
+   used by the ext3 file system, but it could also be used to add journal
+   support to other file systems or block devices such as RAID or LVM.
+ 
+   If you are using the ext3 filesystem, you need to say Y here. If you
+   are not using ext3 then you will probably want to say N.
+ 
+   If you want to compile this device as a module ( = code which can be
+   inserted in and removed from the running kernel whenever you want),
+   say M here and read Documentation/modules.txt. The module will be called
+   jbd.o. If you are compiling ext3 into the kernel, you cannot compile
+   this code as a module.
+ 
+ JBD (ext3) debugging support
+ CONFIG_JBD_DEBUG
+   If you are using the ext3 journaled file system (or potentially any
+   other file system/device using JBD), this option allows you to enable
+   debugging output while the system is running, in order to help track
+   down any problems you are having.  By default the debugging output
+   will be turned off.
+ 
+   If you select Y here, then you will be able to turn on debugging with
+   "echo N > /proc/sys/fs/jbd-debug", where N is a number between 1 and 5;
+   the higher the number, the more debugging output is generated.  To turn
+   debugging off again, do "echo 0 > /proc/sys/fs/jbd-debug".
+ 
+ Buffer Head tracing (DEBUG)
+ CONFIG_BUFFER_DEBUG
+   If you are a kernel developer working with file systems or in the block
+   device layer, this buffer head tracing may help you to track down bugs
+   in your code.  This enables some debugging macros (BUFFER_TRACE, etc)
+   which allow you to track the state of a buffer through various layers
+   of code.  The debugging code is used primarily by ext3 and JBD code.
+ 
+   Because this option adds considerably to the size of each buffer, most
+   people will want to say N here.
+ 
  BFS file system support (EXPERIMENTAL)
  CONFIG_BFS_FS
diff -rc2P linux/drivers/block/ll_rw_blk.c linux-2.4.13/drivers/block/ll_rw_blk.c
*** linux/drivers/block/ll_rw_blk.c	Sat Oct 13 13:30:30 2001
--- linux-2.4.13/drivers/block/ll_rw_blk.c	Fri Nov  9 16:58:00 2001
***************
*** 672,677 ****
  	   down by us so at this point flushpage will block and
  	   won't clear the mapped bit under us. */
! 	if (!buffer_mapped(bh))
  		BUG();
  
  	/*
--- 672,679 ----
  	   down by us so at this point flushpage will block and
  	   won't clear the mapped bit under us. */
! 	if (!buffer_mapped(bh)) {
! 		print_buffer_trace(bh);
  		BUG();
+ 	}
  
  	/*
***************
*** 1007,1013 ****
  		switch(rw) {
  		case WRITE:
! 			if (!atomic_set_buffer_clean(bh))
  				/* Hmmph! Nothing to write */
  				goto end_io;
  			__mark_buffer_clean(bh);
  			break;
--- 1009,1018 ----
  		switch(rw) {
  		case WRITE:
! 			if (!atomic_set_buffer_clean(bh)) {
! 				BUFFER_TRACE(bh, "already clean");
  				/* Hmmph! Nothing to write */
  				goto end_io;
+ 			}
+ 			BUFFER_TRACE(bh, "set clean, write underway");
  			__mark_buffer_clean(bh);
  			break;
***************
*** 1032,1037 ****
  sorry:
  	/* Make sure we don't get infinite dirty retries.. */
! 	for (i = 0; i < nr; i++)
  		mark_buffer_clean(bhs[i]);
  }
  
--- 1037,1044 ----
  sorry:
  	/* Make sure we don't get infinite dirty retries.. */
! 	for (i = 0; i < nr; i++) {
! 		BUFFER_TRACE(bhs[i], "sorry");
  		mark_buffer_clean(bhs[i]);
+ 	}
  }
  
***************
*** 1133,1136 ****
--- 1140,1144 ----
  		queue_nr_requests = 128;
  
+ 
  	/*
  	 * Batch frees according to queue length
diff -rc2P linux/drivers/block/loop.c linux-2.4.13/drivers/block/loop.c
*** linux/drivers/block/loop.c	Mon Oct 15 21:53:51 2001
--- linux-2.4.13/drivers/block/loop.c	Fri Nov  9 16:58:00 2001
***************
*** 187,190 ****
--- 187,192 ----
  	while (len > 0) {
  		int IV = index * (PAGE_CACHE_SIZE/bsize) + offset/bsize;
+ 		int transfer_result;
+ 
  		size = PAGE_CACHE_SIZE - offset;
  		if (size > len)
***************
*** 198,205 ****
  		kaddr = page_address(page);
  		flush_dcache_page(page);
! 		if (lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV))
! 			goto write_fail;
  		if (aops->commit_write(file, page, offset, offset+size))
  			goto unlock;
  		data += size;
  		len -= size;
--- 200,216 ----
  		kaddr = page_address(page);
  		flush_dcache_page(page);
! 		transfer_result = lo_do_transfer(lo, WRITE, kaddr + offset, data, size, IV);
! 		if (transfer_result) {
! 			/*
! 			 * The transfer failed, but we still write the data to
! 			 * keep prepare/commit calls balanced.
! 			 */
! 			printk(KERN_ERR "loop: transfer error block %ld\n", index);
! 			memset(kaddr + offset, 0, size);
! 		}
  		if (aops->commit_write(file, page, offset, offset+size))
  			goto unlock;
+ 		if (transfer_result)
+ 			goto unlock;
  		data += size;
  		len -= size;
***************
*** 213,220 ****
  	return 0;
  
- write_fail:
- 	printk(KERN_ERR "loop: transfer error block %ld\n", index);
- 	ClearPageUptodate(page);
- 	kunmap(page);
  unlock:
  	UnlockPage(page);
--- 224,227 ----
diff -rc2P linux/drivers/ide/ide-disk.c linux-2.4.13/drivers/ide/ide-disk.c
*** linux/drivers/ide/ide-disk.c	Thu Oct 11 12:14:32 2001
--- linux-2.4.13/drivers/ide/ide-disk.c	Fri Nov  9 16:58:00 2001
***************
*** 368,371 ****
--- 368,392 ----
  static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block)
  {
+ #ifdef CONFIG_JBD_DEBUG
+ 	/*
+ 	 * Silently stop writing to this disk to simulate a crash.
+ 	 */
+ 	extern int journal_no_write[2];
+ 	int i;
+ 
+ 	if (rq->cmd != WRITE)
+ 		goto write_ok;
+ 
+ 	for (i = 0; i < 2; i++) {
+ 		if ((journal_no_write[i] & 0xdead0000) == 0xdead0000) {
+ 			if (rq->rq_dev == (journal_no_write[i] & 0xffff)) {
+ 				ide_end_request(1, HWGROUP(drive));
+ 				return ide_stopped;
+ 			}
+ 		}
+ 	}
+ write_ok:
+ 	;
+ #endif
  	if (IDE_CONTROL_REG)
  		OUT_BYTE(drive->ctl,IDE_CONTROL_REG);
diff -rc2P linux/fs/Config.in linux-2.4.13/fs/Config.in
*** linux/fs/Config.in	Thu Oct  4 18:13:18 2001
--- linux-2.4.13/fs/Config.in	Fri Nov  9 16:57:59 2001
***************
*** 21,24 ****
--- 21,32 ----
  dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL
  
+ tristate 'Ext3 journalling file system support (EXPERIMENTAL)' CONFIG_EXT3_FS
+ # CONFIG_JBD could be its own option (even modular), but until there are
+ # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
+ # dep_tristate '  Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
+ define_bool CONFIG_JBD $CONFIG_EXT3_FS
+ dep_mbool '  JBD (ext3) debugging support' CONFIG_JBD_DEBUG $CONFIG_JBD
+ bool 'Buffer Head tracing (DEBUG)' CONFIG_BUFFER_DEBUG
+ 
  # msdos file systems
  tristate 'DOS FAT fs support' CONFIG_FAT_FS
diff -rc2P linux/fs/Makefile linux-2.4.13/fs/Makefile
*** linux/fs/Makefile	Thu Oct  4 18:13:18 2001
--- linux-2.4.13/fs/Makefile	Fri Nov  9 16:58:00 2001
***************
*** 8,12 ****
  O_TARGET := fs.o
  
! export-objs :=	filesystems.o open.o dcache.o
  mod-subdirs :=	nls
  
--- 8,12 ----
  O_TARGET := fs.o
  
! export-objs :=	filesystems.o open.o dcache.o buffer.o jbd-kernel.o
  mod-subdirs :=	nls
  
***************
*** 15,19 ****
  		fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
  		dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
! 		filesystems.o namespace.o
  
  ifeq ($(CONFIG_QUOTA),y)
--- 15,19 ----
  		fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
  		dcache.o inode.o attr.o bad_inode.o file.o iobuf.o dnotify.o \
! 		filesystems.o namespace.o jbd-kernel.o
  
  ifeq ($(CONFIG_QUOTA),y)
***************
*** 27,30 ****
--- 27,32 ----
  
  # Do not add any filesystems before this line
+ subdir-$(CONFIG_EXT3_FS)	+= ext3    # Before ext2 so root fs can be ext3
+ subdir-$(CONFIG_JBD)		+= jbd
  subdir-$(CONFIG_EXT2_FS)	+= ext2
  subdir-$(CONFIG_CRAMFS)		+= cramfs
diff -rc2P linux/fs/buffer.c linux-2.4.13/fs/buffer.c
*** linux/fs/buffer.c	Tue Oct 23 20:54:19 2001
--- linux-2.4.13/fs/buffer.c	Fri Nov  9 16:57:59 2001
***************
*** 46,49 ****
--- 46,51 ----
  #include <linux/iobuf.h>
  #include <linux/highmem.h>
+ #include <linux/jbd.h>
+ #include <linux/module.h>
  #include <linux/completion.h>
  
***************
*** 614,619 ****
     by the user.
  
!    Thus invalidate_buffers in general usage is not allwowed to trash dirty
!    buffers. For example ioctl(FLSBLKBUF) expects dirty data to be preserved.
  
     NOTE: In the case where the user removed a removable-media-disk even if
--- 616,625 ----
     by the user.
  
!    Thus invalidate_buffers in general usage is not allowed to trash
!    dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
!    be preserved.  These buffers are simply skipped.
!   
!    We also skip buffers which are still in use.  For example this can
!    happen if a userspace program is reading the block device.
  
     NOTE: In the case where the user removed a removable-media-disk even if
***************
*** 718,721 ****
--- 724,728 ----
  	bh->b_end_io = handler;
  	bh->b_private = private;
+ 	buffer_trace_init(&bh->b_history);
  }
  
***************
*** 727,730 ****
--- 734,738 ----
  	struct page *page;
  
+ 	BUFFER_TRACE(bh, "enter");
  	mark_buffer_uptodate(bh, uptodate);
  
***************
*** 1093,1096 ****
--- 1101,1110 ----
  }
  
+ void set_buffer_flushtime(struct buffer_head *bh)
+ {
+ 	bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
+ }
+ EXPORT_SYMBOL(set_buffer_flushtime);
+ 
  /*
   * A buffer may need to be moved from one buffer list to another
***************
*** 1100,1103 ****
--- 1114,1120 ----
  {
  	int dispose = BUF_CLEAN;
+ 
+ 	BUFFER_TRACE(bh, "enter");
+ 
  	if (buffer_locked(bh))
  		dispose = BUF_LOCKED;
***************
*** 1111,1114 ****
--- 1128,1132 ----
  		__insert_into_lru_list(bh, dispose);
  	}
+ 	BUFFER_TRACE(bh, "exit");
  }
  
***************
*** 1125,1128 ****
--- 1143,1147 ----
  void __brelse(struct buffer_head * buf)
  {
+ 	BUFFER_TRACE(buf, "entry");
  	if (atomic_read(&buf->b_count)) {
  		put_bh(buf);
***************
*** 1138,1141 ****
--- 1157,1161 ----
  void __bforget(struct buffer_head * buf)
  {
+ 	BUFFER_TRACE(buf, "enter");
  	mark_buffer_clean(buf);
  	__brelse(buf);
***************
*** 1168,1175 ****
   * Note: the caller should wake up the buffer_wait list if needed.
   */
! static __inline__ void __put_unused_buffer_head(struct buffer_head * bh)
  {
  	if (bh->b_inode)
  		BUG();
  	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
  		kmem_cache_free(bh_cachep, bh);
--- 1188,1207 ----
   * Note: the caller should wake up the buffer_wait list if needed.
   */
! static void __put_unused_buffer_head(struct buffer_head * bh)
  {
  	if (bh->b_inode)
  		BUG();
+ 
+ 	J_ASSERT_BH(bh, bh->b_prev_free == 0);
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ 	if (buffer_jbd(bh)) {
+ 		J_ASSERT_BH(bh, bh2jh(bh)->b_transaction == 0);
+ 		J_ASSERT_BH(bh, bh2jh(bh)->b_next_transaction == 0);
+ 		J_ASSERT_BH(bh, bh2jh(bh)->b_frozen_data == 0);
+ 		J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data == 0);
+ 	}
+ #endif
+ 	buffer_trace_init(&bh->b_history);
+ 
  	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
  		kmem_cache_free(bh_cachep, bh);
***************
*** 1185,1188 ****
--- 1217,1228 ----
  }
  
+ void put_unused_buffer_head(struct buffer_head *bh)
+ {
+ 	spin_lock(&unused_list_lock);
+ 	__put_unused_buffer_head(bh);
+ 	spin_unlock(&unused_list_lock);
+ }
+ EXPORT_SYMBOL(put_unused_buffer_head);
+ 
  /*
   * Reserve NR_RESERVED buffer heads for async IO requests to avoid
***************
*** 1190,1194 ****
   * buffer heads is now handled in create_buffers().
   */ 
! static struct buffer_head * get_unused_buffer_head(int async)
  {
  	struct buffer_head * bh;
--- 1230,1234 ----
   * buffer heads is now handled in create_buffers().
   */ 
! struct buffer_head * get_unused_buffer_head(int async)
  {
  	struct buffer_head * bh;
***************
*** 1211,1214 ****
--- 1251,1255 ----
  		bh->b_blocknr = -1;
  		bh->b_this_page = NULL;
+ 		buffer_trace_init(&bh->b_history);
  		return bh;
  	}
***************
*** 1224,1227 ****
--- 1265,1269 ----
  			nr_unused_buffer_heads--;
  			spin_unlock(&unused_list_lock);
+ 			buffer_trace_init(&bh->b_history);
  			return bh;
  		}
***************
*** 1231,1234 ****
--- 1273,1277 ----
  	return NULL;
  }
+ EXPORT_SYMBOL(get_unused_buffer_head);
  
  void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
***************
*** 1245,1248 ****
--- 1288,1292 ----
  		bh->b_data = page_address(page) + offset;
  }
+ EXPORT_SYMBOL(set_bh_page);
  
  /*
***************
*** 1328,1331 ****
--- 1372,1376 ----
  {
  	if (buffer_mapped(bh)) {
+ 		BUFFER_TRACE(bh, "entry");
  		mark_buffer_clean(bh);
  		lock_buffer(bh);
***************
*** 1338,1341 ****
--- 1383,1411 ----
  }
  
+ /**
+  * try_to_release_page - drop fs-private buffer metadata, then the buffers
+  * @page: page to strip; must be locked, otherwise we BUG()
+  * @gfp_mask: handed unchanged to ->releasepage() and try_to_free_buffers()
+  *
+  * If the page has a mapping whose a_ops provide releasepage(), that hook
+  * is consulted first; a zero return from it means the metadata could not
+  * be released, so we give up and return 0 without touching the buffers.
+  * Otherwise the result is that of try_to_free_buffers().
+  */
+ int try_to_release_page(struct page * page, int gfp_mask)
+ {
+ 	if (!PageLocked(page))
+ 		BUG();
+ 
+ 	/*
+ 	 * Without a mapping, or without a releasepage() hook, there is
+ 	 * nobody to ask: go straight to the buffers.
+ 	 */
+ 	if (page->mapping && page->mapping->a_ops->releasepage &&
+ 	    !page->mapping->a_ops->releasepage(page, gfp_mask))
+ 		return 0;
+ 	return try_to_free_buffers(page, gfp_mask);
+ }
+ 
  /*
   * We don't have to release all buffers here, but
***************
*** 1381,1385 ****
  	 */
  	if (!offset) {
! 		if (!try_to_free_buffers(page, 0))
  			return 0;
  	}
--- 1451,1455 ----
  	 */
  	if (!offset) {
! 		if (!try_to_release_page(page, 0))
  			return 0;
  	}
***************
*** 1409,1412 ****
--- 1479,1483 ----
  	page_cache_get(page);
  }
+ EXPORT_SYMBOL(create_empty_buffers);
  
  /*
***************
*** 1427,1431 ****
--- 1498,1505 ----
  
  	old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+ 	J_ASSERT_BH(bh, old_bh != bh);
  	if (old_bh) {
+ 		BUFFER_TRACE(old_bh, "old_bh - entry");
+ 		J_ASSERT_BH(old_bh, !buffer_jlist_eq(old_bh, BJ_Metadata));
  		mark_buffer_clean(old_bh);
  		wait_on_buffer(old_bh);
***************
*** 1449,1454 ****
  
  /*
!  * block_write_full_page() is SMP-safe - currently it's still
!  * being called with the kernel lock held, but the code is ready.
   */
  static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
--- 1523,1527 ----
  
  /*
!  * block_write_full_page() is SMP threaded - the kernel lock is not held.
   */
  static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
***************
*** 1484,1489 ****
  			if (err)
  				goto out;
! 			if (buffer_new(bh))
  				unmap_underlying_metadata(bh);
  		}
  		bh = bh->b_this_page;
--- 1557,1564 ----
  			if (err)
  				goto out;
! 			if (buffer_new(bh)) {
! 				BUFFER_TRACE(bh, "new: call unmap_underlying_metadata");
  				unmap_underlying_metadata(bh);
+ 			}
  		}
  		bh = bh->b_this_page;
***************
*** 1493,1496 ****
--- 1568,1572 ----
  	/* Stage 2: lock the buffers, mark them clean */
  	do {
+ 		BUFFER_TRACE(bh, "lock it");
  		lock_buffer(bh);
  		set_buffer_async_io(bh);
***************
*** 1549,1554 ****
--- 1625,1632 ----
  				goto out;
  			if (buffer_new(bh)) {
+ 				BUFFER_TRACE(bh, "new: call unmap_underlying_metadata");
  				unmap_underlying_metadata(bh);
  				if (Page_Uptodate(page)) {
+ 					BUFFER_TRACE(bh, "setting uptodate");
  					set_bit(BH_Uptodate, &bh->b_state);
  					continue;
***************
*** 1564,1567 ****
--- 1642,1646 ----
  		}
  		if (Page_Uptodate(page)) {
+ 			BUFFER_TRACE(bh, "setting uptodate");
  			set_bit(BH_Uptodate, &bh->b_state);
  			continue; 
***************
*** 1569,1572 ****
--- 1648,1652 ----
  		if (!buffer_uptodate(bh) &&
  		     (block_start < from || block_end > to)) {
+ 			BUFFER_TRACE(bh, "reading");
  			ll_rw_block(READ, 1, &bh);
  			*wait_bh++=bh;
***************
*** 1607,1610 ****
--- 1687,1691 ----
  			set_bit(BH_Uptodate, &bh->b_state);
  			if (!atomic_set_buffer_dirty(bh)) {
+ 				BUFFER_TRACE(bh, "mark dirty");
  				__mark_dirty(bh);
  				buffer_insert_inode_data_queue(bh, inode);
***************
*** 1890,1893 ****
--- 1971,1975 ----
  	kunmap(page);
  
+ 	BUFFER_TRACE(bh, "zeroed end of block");
  	__mark_buffer_dirty(bh);
  	err = 0;
***************
*** 2447,2450 ****
--- 2529,2534 ----
  	return 0;
  }
+ EXPORT_SYMBOL(try_to_free_buffers);
+ EXPORT_SYMBOL(buffermem_pages);
  
  /* ================== Debugging =================== */
diff -rc2P linux/fs/ext3/Makefile linux-2.4.13/fs/ext3/Makefile
*** linux/fs/ext3/Makefile	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/Makefile	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,16 ----
+ #
+ # Makefile for the linux ext3-filesystem routines.
+ #
+ # Note! Dependencies are done automagically by 'make dep', which also
+ # removes any old dependencies. DON'T put your own dependencies here
+ # unless it's something special (ie not a .c file).
+ #
+ # Note 2! The CFLAGS definitions are now in the main makefile...
+ 
+ O_TARGET := ext3.o
+ 
+ obj-y    := acl.o balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+ 		ioctl.o namei.o super.o symlink.o
+ obj-m    := $(O_TARGET)
+ 
+ include $(TOPDIR)/Rules.make
diff -rc2P linux/fs/ext3/acl.c linux-2.4.13/fs/ext3/acl.c
*** linux/fs/ext3/acl.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/acl.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,17 ----
+ /*
+  * linux/fs/ext3/acl.c
+  *
+  * Copyright (C) 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  */
+ 
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+ 
+ 
+ /*
+  * This file will contain the Access Control Lists management for the
+  * second extended file system.
+  */
diff -rc2P linux/fs/ext3/balloc.c linux-2.4.13/fs/ext3/balloc.c
*** linux/fs/ext3/balloc.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/balloc.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,995 ----
+ /*
+  *  linux/fs/ext3/balloc.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  */
+ 
+ #include <linux/config.h>
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+ 
+ /*
+  * balloc.c contains the blocks allocation and deallocation routines
+  */
+ 
+ /*
+  * The free blocks are managed by bitmaps.  A file system contains several
+  * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+  * block for inodes, N blocks for the inode table and data blocks.
+  *
+  * The file system contains group descriptors which are located after the
+  * super block.  Each descriptor contains the number of the bitmap block and
+  * the free blocks count in the block.  The descriptors are loaded in memory
+  * when a file system is mounted (see ext3_read_super).
+  */
+ 
+ /* True iff first <= b <= first + len - 1; b and first may be evaluated twice */
+ #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
+ 
+ /* Return the descriptor of group @block_group, or NULL after ext3_error();
+  * if @bh is non-NULL, *bh receives the buffer holding that descriptor. */
+ struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+ 					     unsigned int block_group,
+ 					     struct buffer_head ** bh)
+ {
+ 	struct buffer_head * desc_bh;	/* block holding the descriptor */
+ 	unsigned long group_desc;	/* which descriptor block */
+ 	unsigned long desc;		/* slot inside that block */
+ 
+ 	if (block_group >= sb->u.ext3_sb.s_groups_count) {
+ 		ext3_error (sb, "ext3_get_group_desc",
+ 			    "block_group >= groups_count - "
+ 			    "block_group = %u, groups_count = %lu",
+ 			    block_group, sb->u.ext3_sb.s_groups_count);
+ 		return NULL;
+ 	}
+ 
+ 	group_desc = block_group / EXT3_DESC_PER_BLOCK(sb);
+ 	desc = block_group % EXT3_DESC_PER_BLOCK(sb);
+ 	desc_bh = sb->u.ext3_sb.s_group_desc[group_desc];
+ 	if (!desc_bh) {
+ 		ext3_error (sb, "ext3_get_group_desc",
+ 			    "Group descriptor not loaded - "
+ 			    "block_group = %u, group_desc = %lu, desc = %lu",
+ 			    block_group, group_desc, desc);
+ 		return NULL;
+ 	}
+ 
+ 	if (bh)
+ 		*bh = desc_bh;
+ 	return (struct ext3_group_desc *) desc_bh->b_data + desc;
+ }
+ 
+ /*
+  * Read the bitmap for a given block_group into slot bitmap_nr of the
+  * superblock's bitmap cache.  The slot is rewritten on every call; its
+  * buffer pointer is left NULL when the descriptor lookup or the read fails.
+  *
+  * Return 0 on success or -EIO on failure.
+  */
+ static int read_block_bitmap (struct super_block * sb,
+ 			       unsigned int block_group,
+ 			       unsigned long bitmap_nr)
+ {
+ 	struct ext3_group_desc * gdp;
+ 	struct buffer_head * bh = NULL;
+ 	unsigned long bitmap_blk;
+ 	int retval = -EIO;
+ 
+ 	gdp = ext3_get_group_desc (sb, block_group, NULL);
+ 	if (!gdp)
+ 		goto error_out;
+ 	retval = 0;
+ 	/* The descriptor is little-endian on disk: convert it exactly once. */
+ 	bitmap_blk = le32_to_cpu(gdp->bg_block_bitmap);
+ 	bh = bread (sb->s_dev, bitmap_blk, sb->s_blocksize);
+ 	if (!bh) {
+ 		ext3_error (sb, "read_block_bitmap",
+ 			    "Cannot read block bitmap - "
+ 			    "block_group = %u, block_bitmap = %lu",
+ 			    block_group, bitmap_blk);
+ 		retval = -EIO;
+ 	}
+ 	/* On IO error, just leave a zero in the superblock's block pointer for
+ 	 * this group.  The IO will be retried next time. */
+ error_out:
+ 	sb->u.ext3_sb.s_block_bitmap_number[bitmap_nr] = block_group;
+ 	sb->u.ext3_sb.s_block_bitmap[bitmap_nr] = bh;
+ 	return retval;
+ }
+ 
+ /*
+  * __load_block_bitmap loads the block bitmap for a block group
+  *
+  * It maintains a cache for the last bitmaps loaded.  This cache is managed
+  * with a LRU algorithm that moves the requested group to slot 0.
+  *
+  * Notes:
+  * 1/ There is one cache per mounted file system.
+  * 2/ If the file system has no more than EXT3_MAX_GROUP_LOADED groups,
+  *    the slot index is the group number and no LRU order is maintained.
+  *
+  * Return the slot used to store the bitmap, or a -ve error code.
+  */
+ static int __load_block_bitmap (struct super_block * sb,
+ 			        unsigned int block_group)
+ {
+ 	int i, j, retval = 0;
+ 	unsigned long block_bitmap_number;
+ 	struct buffer_head * block_bitmap;
+ 
+ 	if (block_group >= sb->u.ext3_sb.s_groups_count)
+ 		ext3_panic (sb, "load_block_bitmap",
+ 			    "block_group >= groups_count - "
+ 			    "block_group = %d, groups_count = %lu",
+ 			    block_group, sb->u.ext3_sb.s_groups_count);
+ 	/* Few enough groups: slot index == group number, no LRU shuffling */
+ 	if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED) {
+ 		if (sb->u.ext3_sb.s_block_bitmap[block_group]) {
+ 			if (sb->u.ext3_sb.s_block_bitmap_number[block_group] ==
+ 			    block_group)
+ 				return block_group;
+ 			ext3_error (sb, "__load_block_bitmap",
+ 				    "block_group != block_bitmap_number");
+ 		}
+ 		retval = read_block_bitmap (sb, block_group, block_group);
+ 		if (retval < 0)
+ 			return retval;
+ 		return block_group;
+ 	}
+ 	/* Otherwise scan the loaded slots for this group */
+ 	for (i = 0; i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
+ 		    sb->u.ext3_sb.s_block_bitmap_number[i] != block_group; i++)
+ 		;
+ 	if (i < sb->u.ext3_sb.s_loaded_block_bitmaps &&
+   	    sb->u.ext3_sb.s_block_bitmap_number[i] == block_group) {
+ 		block_bitmap_number = sb->u.ext3_sb.s_block_bitmap_number[i];
+ 		block_bitmap = sb->u.ext3_sb.s_block_bitmap[i];
+ 		for (j = i; j > 0; j--) {	/* slide slots 0..i-1 down one */
+ 			sb->u.ext3_sb.s_block_bitmap_number[j] =
+ 				sb->u.ext3_sb.s_block_bitmap_number[j - 1];
+ 			sb->u.ext3_sb.s_block_bitmap[j] =
+ 				sb->u.ext3_sb.s_block_bitmap[j - 1];
+ 		}
+ 		sb->u.ext3_sb.s_block_bitmap_number[0] = block_bitmap_number;
+ 		sb->u.ext3_sb.s_block_bitmap[0] = block_bitmap;
+ 
+ 		/*
+ 		 * There's still one special case here --- if block_bitmap == 0
+ 		 * then our last attempt to read the bitmap failed and we have
+ 		 * just ended up caching that failure.  Try again to read it.
+ 		 */
+ 		if (!block_bitmap)
+ 			retval = read_block_bitmap (sb, block_group, 0);
+ 	} else {	/* not cached: free up slot 0 and read the bitmap into it */
+ 		if (sb->u.ext3_sb.s_loaded_block_bitmaps<EXT3_MAX_GROUP_LOADED)
+ 			sb->u.ext3_sb.s_loaded_block_bitmaps++;
+ 		else
+ 			brelse (sb->u.ext3_sb.s_block_bitmap
+ 					[EXT3_MAX_GROUP_LOADED - 1]);
+ 		for (j = sb->u.ext3_sb.s_loaded_block_bitmaps - 1;
+ 					j > 0;  j--) {
+ 			sb->u.ext3_sb.s_block_bitmap_number[j] =
+ 				sb->u.ext3_sb.s_block_bitmap_number[j - 1];
+ 			sb->u.ext3_sb.s_block_bitmap[j] =
+ 				sb->u.ext3_sb.s_block_bitmap[j - 1];
+ 		}
+ 		retval = read_block_bitmap (sb, block_group, 0);
+ 	}
+ 	return retval;
+ }
+ 
+ /*
+  * load_block_bitmap - find, or read in, the bitmap of one block group
+  *
+  * Two cheap checks are tried before handing the request to
+  * __load_block_bitmap():
+  *   - slot 0 of the superblock's bitmap cache already holds this group;
+  *   - the filesystem has no more than EXT3_MAX_GROUP_LOADED groups and
+  *     the slot whose index equals the group number holds it.
+  * Both checks also require a non-zero buffer pointer in the slot, i.e.
+  * a bitmap that is actually in memory.
+  *
+  * Return the slot number of the group in the superblock's bitmap cache on
+  * success, or a -ve error code.
+  *
+  * There is still one inconsistency here --- if the number of groups in
+  * this filesystem is <= EXT3_MAX_GROUP_LOADED, we cannot tell a group for
+  * which no bitmap IO was ever requested from one whose last bitmap read
+  * failed.
+  */
+ static inline int load_block_bitmap (struct super_block * sb,
+ 				     unsigned int block_group)
+ {
+ 	int cache_slot;
+ 
+ 	/*
+ 	 * Same group as the previous lookup, and that lookup left a
+ 	 * buffer behind?  Then slot 0 is the answer.
+ 	 */
+ 	if (sb->u.ext3_sb.s_loaded_block_bitmaps > 0 &&
+ 	    sb->u.ext3_sb.s_block_bitmap_number[0] == block_group &&
+ 	    sb->u.ext3_sb.s_block_bitmap[0])
+ 		return 0;
+ 
+ 	/*
+ 	 * On a filesystem small enough to be mapped directly into the
+ 	 * superblock the group number doubles as the slot index; anything
+ 	 * else needs the full lookup.
+ 	 */
+ 	if (sb->u.ext3_sb.s_groups_count <= EXT3_MAX_GROUP_LOADED &&
+ 	    sb->u.ext3_sb.s_block_bitmap_number[block_group] == block_group &&
+ 	    sb->u.ext3_sb.s_block_bitmap[block_group])
+ 		cache_slot = block_group;
+ 	else
+ 		cache_slot = __load_block_bitmap (sb, block_group);
+ 
+ 	/* A negative value is an error code, not a slot. */
+ 	if (cache_slot < 0)
+ 		return cache_slot;
+ 
+ 	/*
+ 	 * Even a valid slot may be caching an earlier IO error, in which
+ 	 * case the bh in the superblock cache is zero.
+ 	 */
+ 	if (!sb->u.ext3_sb.s_block_bitmap[cache_slot])
+ 		return -EIO;
+ 
+ 	/*
+ 	 * Must have been read in OK to get this far.
+ 	 */
+ 	return cache_slot;
+ }
+ 
+ /* Free given blocks, update quota and i_blocks field */
+ void ext3_free_blocks (handle_t *handle, struct inode * inode,
+ 			unsigned long block, unsigned long count)
+ {
+ 	struct buffer_head *bitmap_bh;
+ 	struct buffer_head *gd_bh;
+ 	unsigned long block_group;
+ 	unsigned long bit;
+ 	unsigned long i;
+ 	int bitmap_nr;
+ 	unsigned long overflow;
+ 	struct super_block * sb;
+ 	struct ext3_group_desc * gdp;
+ 	struct ext3_super_block * es;
+ 	int err = 0, ret;
+ 	int dquot_freed_blocks = 0;
+ 
+ 	sb = inode->i_sb;
+ 	if (!sb) {
+ 		printk ("ext3_free_blocks: nonexistent device");
+ 		return;
+ 	}
+ 	lock_super (sb);
+ 	es = sb->u.ext3_sb.s_es;
+ 	if (block < le32_to_cpu(es->s_first_data_block) || 
+ 	    (block + count) > le32_to_cpu(es->s_blocks_count)) {
+ 		ext3_error (sb, "ext3_free_blocks",
+ 			    "Freeing blocks not in datazone - "
+ 			    "block = %lu, count = %lu", block, count);
+ 		goto error_return;
+ 	}
+ 
+ 	ext3_debug ("freeing block %lu\n", block);
+ 
+ do_more:
+ 	overflow = 0;
+ 	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
+ 		      EXT3_BLOCKS_PER_GROUP(sb);
+ 	bit = (block - le32_to_cpu(es->s_first_data_block)) %
+ 		      EXT3_BLOCKS_PER_GROUP(sb);
+ 	/*
+ 	 * Check to see if we are freeing blocks across a group
+ 	 * boundary.
+ 	 */
+ 	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
+ 		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
+ 		count -= overflow;
+ 	}
+ 	bitmap_nr = load_block_bitmap (sb, block_group);
+ 	if (bitmap_nr < 0)
+ 		goto error_return;
+ 	
+ 	bitmap_bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
+ 	gdp = ext3_get_group_desc (sb, block_group, &gd_bh);
+ 	if (!gdp)
+ 		goto error_return;
+ 
+ 	if (in_range (le32_to_cpu(gdp->bg_block_bitmap), block, count) ||
+ 	    in_range (le32_to_cpu(gdp->bg_inode_bitmap), block, count) ||
+ 	    in_range (block, le32_to_cpu(gdp->bg_inode_table),
+ 		      sb->u.ext3_sb.s_itb_per_group) ||
+ 	    in_range (block + count - 1, le32_to_cpu(gdp->bg_inode_table),
+ 		      sb->u.ext3_sb.s_itb_per_group))
+ 		ext3_error (sb, "ext3_free_blocks",
+ 			    "Freeing blocks in system zones - "
+ 			    "Block = %lu, count = %lu",
+ 			    block, count);
+ 
+ 	/*
+ 	 * We are about to start releasing blocks in the bitmap,
+ 	 * so we need undo access.
+ 	 */
+ 	/* @@@ check errors */
+ 	BUFFER_TRACE(bitmap_bh, "getting undo access");
+ 	err = ext3_journal_get_undo_access(handle, bitmap_bh);
+ 	if (err)
+ 		goto error_return;
+ 	
+ 	/*
+ 	 * We are about to modify some metadata.  Call the journal APIs
+ 	 * to unshare ->b_data if a currently-committing transaction is
+ 	 * using it
+ 	 */
+ 	BUFFER_TRACE(gd_bh, "get_write_access");
+ 	err = ext3_journal_get_write_access(handle, gd_bh);	
+ 	if (err)
+ 		goto error_return;
+ 
+ 	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+ 	err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+ 	if (err)
+ 		goto error_return;
+ 
+ 	for (i = 0; i < count; i++) {
+ 		/*
+ 		 * An HJ special.  This is expensive...
+ 		 */
+ #ifdef CONFIG_JBD_DEBUG
+ 		{
+ 			struct buffer_head *debug_bh;
+ 			debug_bh = get_hash_table(sb->s_dev, block + i,
+ 							sb->s_blocksize);
+ 			if (debug_bh) {
+ 				BUFFER_TRACE(debug_bh, "Deleted!");
+ 				if (!bh2jh(bitmap_bh)->b_committed_data)
+ 					BUFFER_TRACE(debug_bh,
+ 						"No commited data in bitmap");
+ 				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
+ 				__brelse(debug_bh);
+ 			}
+ 		}
+ #endif
+ 		BUFFER_TRACE(bitmap_bh, "clear bit");
+ 		if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) {
+ 			ext3_error (sb, __FUNCTION__,
+ 				      "bit already cleared for block %lu", 
+ 				      block + i);
+ 			BUFFER_TRACE(bitmap_bh, "bit already cleared");
+ 		} else {
+ 			dquot_freed_blocks++;
+ 			gdp->bg_free_blocks_count =
+ 			  cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)+1);
+ 			es->s_free_blocks_count =
+ 			  cpu_to_le32(le32_to_cpu(es->s_free_blocks_count)+1);
+ 		}
+ 		/* @@@ This prevents newly-allocated data from being
+ 		 * freed and then reallocated within the same
+ 		 * transaction. 
+ 		 * 
+ 		 * Ideally we would want to allow that to happen, but to
+ 		 * do so requires making journal_forget() capable of
+ 		 * revoking the queued write of a data block, which
+ 		 * implies blocking on the journal lock.  *forget()
+ 		 * cannot block due to truncate races.
+ 		 *
+ 		 * Eventually we can fix this by making journal_forget()
+ 		 * return a status indicating whether or not it was able
+ 		 * to revoke the buffer.  On successful revoke, it is
+ 		 * safe not to set the allocation bit in the committed
+ 		 * bitmap, because we know that there is no outstanding
+ 		 * activity on the buffer any more and so it is safe to
+ 		 * reallocate it.  
+ 		 */
+ 		BUFFER_TRACE(bitmap_bh, "clear in b_committed_data");
+ 		J_ASSERT_BH(bitmap_bh,
+ 				bh2jh(bitmap_bh)->b_committed_data != NULL);
+ 		ext3_set_bit(bit + i, bh2jh(bitmap_bh)->b_committed_data);
+ 	}
+ 
+ 	/* We dirtied the bitmap block */
+ 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+ 	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+ 
+ 	/* And the group descriptor block */
+ 	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
+ 	ret = ext3_journal_dirty_metadata(handle, gd_bh);
+ 	if (!err) err = ret;
+ 
+ 	/* And the superblock */
+ 	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "dirtied superblock");
+ 	ret = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+ 	if (!err) err = ret;
+ 
+ 	if (overflow && !err) {
+ 		block += count;
+ 		count = overflow;
+ 		goto do_more;
+ 	}
+ 	sb->s_dirt = 1;
+ error_return:
+ 	ext3_std_error(sb, err);
+ 	unlock_super(sb);
+ 	if (dquot_freed_blocks)
+ 		DQUOT_FREE_BLOCK(inode, dquot_freed_blocks);
+ 	return;
+ }
+ 
+ /* Return 1 if bit nr is allocatable in bitmap buffer bh, 0 otherwise.
+  *
+  * For ext3 allocations, we must not reuse any blocks which are allocated in
+  * the bitmap buffer's "last committed data" copy.  This prevents deletes from
+  * freeing up the block for reuse until we have committed the delete transaction.
+  *
+  * If we didn't do this, then deleting something and reallocating it as data
+  * would allow the old block to be overwritten before the transaction committed
+  * (because we force data to disk before commit).  This would lead to corruption
+  * if we crashed between overwriting the data and committing the delete.
+  *
+  * @@@ We may want to make this allocation behaviour conditional on
+  * data-writes at some point, and disable it for metadata allocations or
+  * sync-data inodes.
+  */
+ static int ext3_test_allocatable(int nr, struct buffer_head *bh)
+ {
+ 	if (ext3_test_bit(nr, bh->b_data))	/* set in the live bitmap */
+ 		return 0;
+ 	if (!buffer_jbd(bh) || !bh2jh(bh)->b_committed_data)
+ 		return 1;	/* no committed copy to consult */
+ 	return !ext3_test_bit(nr, bh2jh(bh)->b_committed_data);
+ }
+ 
+ /*
+  * Find an allocatable block in bitmap bh.  We honour both the bitmap and its
+  * last-committed copy (if that exists), looking first for a free block near
+  * the goal bit `start' (only when start > 0); then for a free byte somewhere
+  * in the bitmap; then for any free bit in the bitmap, with the byte and bit
+  * searches bounded by maxblocks.  Returns the bit number, or -1 if none.
+  */
+ static int find_next_usable_block(int start,
+ 			struct buffer_head *bh, int maxblocks)
+ {
+ 	int here, next;
+ 	char *p, *r;
+ 	
+ 	if (start > 0) {
+ 		/*
+ 		 * The goal was occupied; search forward for a free
+ 		 * block between start and end_goal.  Only the first zero
+ 		 * bit found there is checked against the committed copy.
+ 		 * end_goal is more or less random, but it has to be
+ 		 * less than EXT3_BLOCKS_PER_GROUP. Aligning up to the
+ 		 * next 64-bit boundary is simple..
+ 		 */
+ 		int end_goal = (start + 63) & ~63;
+ 		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
+ 		if (here < end_goal && ext3_test_allocatable(here, bh))
+ 			return here;
+ 		
+ 		ext3_debug ("Bit not found near goal\n");
+ 		
+ 	}
+ 	
+ 	here = start;
+ 	if (here < 0)
+ 		here = 0;
+ 	
+ 	/*
+ 	 * There has been no free block found in the near vicinity of
+ 	 * the goal: search forward through the rest of this bitmap,
+ 	 * first for an entire free byte and then for any free bit.
+ 	 *
+ 	 * memscan() looks for a zero byte from `here' onwards; only
+ 	 * the first bit of that byte is tested for allocatability.
+ 	 */
+ 	p = ((char *) bh->b_data) + (here >> 3);
+ 	r = memscan(p, 0, (maxblocks - here + 7) >> 3);
+ 	next = (r - ((char *) bh->b_data)) << 3;	/* bit index of the byte at r */
+ 	
+ 	if (next < maxblocks && ext3_test_allocatable(next, bh))
+ 		return next;
+ 	
+ 	/* The bitmap search --- search forward alternately
+ 	 * through the actual bitmap and the last-committed copy
+ 	 * until we find a bit free in both. */
+ 
+ 	while (here < maxblocks) {
+ 		next  = ext3_find_next_zero_bit ((unsigned long *) bh->b_data, 
+ 						 maxblocks, here);
+ 		if (next >= maxblocks)
+ 			return -1;	/* no zero bit left in the live bitmap */
+ 		if (ext3_test_allocatable(next, bh))
+ 			return next;
+ 
+ 		J_ASSERT_BH(bh, bh2jh(bh)->b_committed_data);	/* next was rejected, so a committed copy exists */
+ 		here = ext3_find_next_zero_bit
+ 			((unsigned long *) bh2jh(bh)->b_committed_data, 
+ 			 maxblocks, next);
+ 	}
+ 	return -1;
+ }
+ 
+ /*
+  * ext3_new_block uses a goal block to assist allocation.  If the goal is
+  * free, or find_next_usable_block() finds a free block near it, that
+  * block is allocated.  Otherwise a forward search is made through the block
+  * groups; within each group the search first looks for an entire free byte
+  * in the bitmap, and then for any free bit.  Returns the block number with
+  * *errp = 0, or 0 with *errp set.  Also updates quota and i_blocks field.
+  */
+ int ext3_new_block (handle_t *handle, struct inode * inode,
+ 		unsigned long goal, u32 * prealloc_count,
+ 		u32 * prealloc_block, int * errp)
+ {
+ 	struct buffer_head * bh, *bhtmp;	/* bh: block bitmap buffer */
+ 	struct buffer_head * bh2;		/* group descriptor buffer */
+ #if 0
+ 	char * p, * r;
+ #endif
+ 	int i, j, k, tmp, alloctmp;	/* i: group, j: bit in group (later the block number) */
+ 	int bitmap_nr;
+ 	int fatal = 0, err;
+ 	struct super_block * sb;
+ 	struct ext3_group_desc * gdp;
+ 	struct ext3_super_block * es;
+ #ifdef EXT3FS_DEBUG
+ 	static int goal_hits = 0, goal_attempts = 0;
+ #endif
+ 	*errp = -ENOSPC;	/* default result for every failure path that sets nothing else */
+ 	sb = inode->i_sb;
+ 	if (!sb) {
+ 		printk ("ext3_new_block: nonexistent device");
+ 		return 0;
+ 	}
+ 
+ 	/* Charge quota for this block before taking the lock.  NOTE(review):
+ 	 * no DQUOT_FREE_BLOCK() is visible on the failure paths below, so the
+ 	 * charge appears to stay if allocation then fails -- confirm. */
+ 	if (DQUOT_ALLOC_BLOCK(inode, 1)) {
+ 		*errp = -EDQUOT;
+ 		return 0;
+ 	}
+ 
+ 	lock_super (sb);
+ 	es = sb->u.ext3_sb.s_es;
+ 	if (le32_to_cpu(es->s_free_blocks_count) <=
+ 			le32_to_cpu(es->s_r_blocks_count) &&
+ 	    ((sb->u.ext3_sb.s_resuid != current->fsuid) &&
+ 	     (sb->u.ext3_sb.s_resgid == 0 ||
+ 	      !in_group_p (sb->u.ext3_sb.s_resgid)) && 
+ 	     !capable(CAP_SYS_RESOURCE)))
+ 		goto out;	/* only reserved blocks remain and the caller may not use them */
+ 
+ 	ext3_debug ("goal=%lu.\n", goal);
+ 
+ 	/*
+ 	 * First, test whether the goal block is free (an out-of-range goal is reset).
+ 	 */
+ 	if (goal < le32_to_cpu(es->s_first_data_block) ||
+ 	    goal >= le32_to_cpu(es->s_blocks_count))
+ 		goal = le32_to_cpu(es->s_first_data_block);
+ 	i = (goal - le32_to_cpu(es->s_first_data_block)) /
+ 			EXT3_BLOCKS_PER_GROUP(sb);
+ 	gdp = ext3_get_group_desc (sb, i, &bh2);
+ 	if (!gdp)
+ 		goto io_error;
+ 
+ 	if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
+ 		j = ((goal - le32_to_cpu(es->s_first_data_block)) %
+ 				EXT3_BLOCKS_PER_GROUP(sb));
+ #ifdef EXT3FS_DEBUG
+ 		if (j)
+ 			goal_attempts++;
+ #endif
+ 		bitmap_nr = load_block_bitmap (sb, i);
+ 		if (bitmap_nr < 0)
+ 			goto io_error;
+ 		
+ 		bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
+ 
+ 		ext3_debug ("goal is at %d:%d.\n", i, j);
+ 
+ 		if (ext3_test_allocatable(j, bh)) {
+ #ifdef EXT3FS_DEBUG
+ 			goal_hits++;
+ 			ext3_debug ("goal bit allocated.\n");
+ #endif
+ 			goto got_block;	/* exact goal: no backwards search */
+ 		}
+ 
+ 		j = find_next_usable_block(j, bh, EXT3_BLOCKS_PER_GROUP(sb));
+ 		if (j >= 0)
+ 			goto search_back;
+ 	}
+ 
+ 	ext3_debug ("Bit not found in block group %d.\n", i);
+ 
+ 	/*
+ 	 * Now search the rest of the groups, wrapping to group 0.  We assume
+ 	 * that i and gdp correctly point to the last group visited.
+ 	 */
+ 	for (k = 0; k < sb->u.ext3_sb.s_groups_count; k++) {
+ 		i++;
+ 		if (i >= sb->u.ext3_sb.s_groups_count)
+ 			i = 0;
+ 		gdp = ext3_get_group_desc (sb, i, &bh2);
+ 		if (!gdp) {
+ 			*errp = -EIO;
+ 			goto out;
+ 		}
+ 		if (le16_to_cpu(gdp->bg_free_blocks_count) > 0) {
+ 			bitmap_nr = load_block_bitmap (sb, i);
+ 			if (bitmap_nr < 0)
+ 				goto io_error;
+ 	
+ 			bh = sb->u.ext3_sb.s_block_bitmap[bitmap_nr];
+ 			j = find_next_usable_block(-1, bh, 
+ 						   EXT3_BLOCKS_PER_GROUP(sb));
+ 			if (j >= 0) 
+ 				goto search_back;
+ 		}
+ 	}
+ 
+ 	/* No space left on the device; *errp is still -ENOSPC */
+ 	unlock_super (sb);
+ 	return 0;
+ 
+ search_back:
+ 	/* 
+ 	 * We have succeeded in finding a free byte in the block
+ 	 * bitmap.  Now search backwards up to 7 bits to find the
+ 	 * start of this group of free blocks.
+ 	 */
+ 	for (	k = 0;
+ 		k < 7 && j > 0 && ext3_test_allocatable(j - 1, bh);
+ 		k++, j--)
+ 		;
+ 	
+ got_block:
+ 
+ 	ext3_debug ("using block group %d(%d)\n", i, gdp->bg_free_blocks_count);
+ 
+ 	/* Make sure we use undo access for the bitmap, because it is
+            critical that we do the frozen_data COW on bitmap buffers in
+            all cases even if the buffer is in BJ_Forget state in the
+            committing transaction.  */
+ 	BUFFER_TRACE(bh, "get undo access for marking new block");
+ 	fatal = ext3_journal_get_undo_access(handle, bh);
+ 	if (fatal) goto out;
+ 	
+ 	BUFFER_TRACE(bh2, "get_write_access");
+ 	fatal = ext3_journal_get_write_access(handle, bh2);
+ 	if (fatal) goto out;
+ 
+ 	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+ 	fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+ 	if (fatal) goto out;
+ 
+ 	tmp = j + i * EXT3_BLOCKS_PER_GROUP(sb)
+ 				+ le32_to_cpu(es->s_first_data_block);	/* absolute block number */
+ 
+ 	if (tmp == le32_to_cpu(gdp->bg_block_bitmap) ||
+ 	    tmp == le32_to_cpu(gdp->bg_inode_bitmap) ||
+ 	    in_range (tmp, le32_to_cpu(gdp->bg_inode_table),
+ 		      sb->u.ext3_sb.s_itb_per_group))
+ 		ext3_error (sb, "ext3_new_block",
+ 			    "Allocating block in system zone - "
+ 			    "block = %u", tmp);
+ 
+ 	/* The superblock lock should guard against anybody else beating
+ 	 * us to this point! */
+ 	J_ASSERT_BH(bh, !ext3_test_bit(j, bh->b_data));
+ 	BUFFER_TRACE(bh, "setting bitmap bit");
+ 	ext3_set_bit(j, bh->b_data);
+ 
+ #ifdef CONFIG_JBD_DEBUG
+ 	{
+ 		struct buffer_head *debug_bh;
+ 
+ 		/* Record bitmap buffer state in the newly allocated block */
+ 		debug_bh = get_hash_table(sb->s_dev, tmp, sb->s_blocksize);
+ 		if (debug_bh) {
+ 			BUFFER_TRACE(debug_bh, "state when allocated");
+ 			BUFFER_TRACE2(debug_bh, bh, "bitmap state");
+ 			brelse(debug_bh);
+ 		}
+ 	}
+ #endif
+ 	if (buffer_jbd(bh) && bh2jh(bh)->b_committed_data)
+ 		J_ASSERT_BH(bh, !ext3_test_bit(j, bh2jh(bh)->b_committed_data));
+ 	bhtmp = bh;	/* NOTE(review): bhtmp and alloctmp are not read again in this function */
+ 	alloctmp = j;
+ 
+ 	ext3_debug ("found bit %d\n", j);
+ 
+ 	/*
+ 	 * Do block preallocation now if required.
+ 	 */
+ #ifdef EXT3_PREALLOCATE
+ 	/*
+ 	 * akpm: this is not enabled for ext3.  Need to use
+ 	 * ext3_test_allocatable()
+ 	 */
+ 	/* Writer: ->i_prealloc* */
+ 	if (prealloc_count && !*prealloc_count) {
+ 		int	prealloc_goal;
+ 		unsigned long next_block = tmp + 1;
+ 
+ 		prealloc_goal = es->s_prealloc_blocks ?
+ 			es->s_prealloc_blocks : EXT3_DEFAULT_PREALLOC_BLOCKS;
+ 
+ 		*prealloc_block = next_block;
+ 		/* Writer: end */
+ 		for (k = 1;
+ 		     k < prealloc_goal && (j + k) < EXT3_BLOCKS_PER_GROUP(sb);
+ 		     k++, next_block++) {
+ 			if (DQUOT_PREALLOC_BLOCK(inode, 1))
+ 				break;
+ 			/* Writer: ->i_prealloc* */
+ 			if (*prealloc_block + *prealloc_count != next_block ||
+ 			    ext3_set_bit (j + k, bh->b_data)) {
+ 				/* Writer: end */
+ 				DQUOT_FREE_BLOCK(inode, 1);
+  				break;
+ 			}
+ 			(*prealloc_count)++;
+ 			/* Writer: end */
+ 		}	
+ 		/*
+ 		 * As soon as we go for per-group spinlocks we'll need these
+ 		 * done inside the loop above.
+ 		 */
+ 		gdp->bg_free_blocks_count =
+ 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) -
+ 			       (k - 1));
+ 		es->s_free_blocks_count =
+ 			cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) -
+ 			       (k - 1));
+ 		ext3_debug ("Preallocated a further %lu bits.\n",
+ 			       (k - 1));
+ 	}
+ #endif
+ 
+ 	j = tmp;	/* from here on j is the absolute block number */
+ 
+ 	BUFFER_TRACE(bh, "journal_dirty_metadata for bitmap block");
+ 	err = ext3_journal_dirty_metadata(handle, bh);
+ 	if (!fatal) fatal = err;
+ 	
+ 	if (j >= le32_to_cpu(es->s_blocks_count)) {
+ 		ext3_error (sb, "ext3_new_block",
+ 			    "block(%d) >= blocks count(%d) - "
+ 			    "block_group = %d, es == %p ",j,
+ 			le32_to_cpu(es->s_blocks_count), i, es);
+ 		goto out;	/* the bitmap bit set above is not cleared on this path */
+ 	}
+ 
+ 	/*
+ 	 * It is up to the caller to add the new buffer to a journal
+ 	 * list of some description.  We don't know in advance whether
+ 	 * the caller wants to use it as metadata or data.
+ 	 */
+ 
+ 	ext3_debug ("allocating block %d. "
+ 		    "Goal hits %d of %d.\n", j, goal_hits, goal_attempts);
+ 
+ 	gdp->bg_free_blocks_count =
+ 			cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
+ 	es->s_free_blocks_count =
+ 			cpu_to_le32(le32_to_cpu(es->s_free_blocks_count) - 1);
+ 
+ 	BUFFER_TRACE(bh2, "journal_dirty_metadata for group descriptor");
+ 	err = ext3_journal_dirty_metadata(handle, bh2);
+ 	if (!fatal) fatal = err;
+ 	
+ 	BUFFER_TRACE(bh, "journal_dirty_metadata for superblock");	/* NOTE(review): traces bh, not s_sbh -- confirm */
+ 	err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+ 	if (!fatal) fatal = err;	/* keep the first error seen */
+ 
+ 	sb->s_dirt = 1;
+ 	if (fatal)
+ 		goto out;
+ 
+ 	unlock_super (sb);
+ 	*errp = 0;
+ 	return j;
+ 	
+ io_error:
+ 	*errp = -EIO;
+ out:
+ 	if (fatal) {	/* a journal error overrides whatever *errp held */
+ 		*errp = fatal;
+ 		ext3_std_error(sb, fatal);
+ 	}
+ 	unlock_super (sb);
+ 	return 0;
+ 	
+ }
+ /* Free block count from the superblock; EXT3FS_DEBUG recounts the bitmaps. */
+ unsigned long ext3_count_free_blocks (struct super_block * sb)
+ {
+ #ifdef EXT3FS_DEBUG
+ 	struct ext3_super_block * es;
+ 	unsigned long desc_count, bitmap_count, x;
+ 	int bitmap_nr;
+ 	struct ext3_group_desc * gdp;
+ 	int i;
+ 	
+ 	lock_super (sb);
+ 	es = sb->u.ext3_sb.s_es;
+ 	desc_count = 0;		/* sum of the per-descriptor free counts */
+ 	bitmap_count = 0;	/* sum of the zero bits counted in the bitmaps */
+ 	gdp = NULL;
+ 	for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+ 		gdp = ext3_get_group_desc (sb, i, NULL);
+ 		if (!gdp)
+ 			continue;
+ 		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ 		bitmap_nr = load_block_bitmap (sb, i);
+ 		if (bitmap_nr < 0)
+ 			continue;	/* bitmap unavailable: group not counted */
+ 		x = ext3_count_free (sb->u.ext3_sb.s_block_bitmap[bitmap_nr],
+ 				     sb->s_blocksize);
+ 		printk ("group %d: stored = %d, counted = %lu\n",
+ 			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
+ 		bitmap_count += x;
+ 	}
+ 	printk("ext3_count_free_blocks: stored = %lu, computed = %lu, %lu\n",
+ 	       (unsigned long)le32_to_cpu(es->s_free_blocks_count), /* match %lu */
+ 	       desc_count, bitmap_count);
+ 	unlock_super (sb);
+ 	return bitmap_count;
+ #else
+ 	return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_blocks_count);
+ #endif
+ }
+ /* Test block's bit (its offset within its block group) in bitmap map. */
+ static inline int block_in_use (unsigned long block,
+ 				struct super_block * sb,
+ 				unsigned char * map)
+ {
+ 	return ext3_test_bit ((block -
+ 		le32_to_cpu(sb->u.ext3_sb.s_es->s_first_data_block)) %
+ 			 EXT3_BLOCKS_PER_GROUP(sb), map);
+ }
+ /* Return 1 if a is 0, 1 or an exact power of b, else 0 (assumes a >= 0, b > 1). */
+ static inline int test_root(int a, int b)
+ {
+ 	if (a == 0)
+ 		return 1;
+ 	while (1) {
+ 		if (a == 1)	/* divided all the way down: a was b^k */
+ 			return 1;
+ 		if (a % b)	/* a factor other than b remains */
+ 			return 0;
+ 		a = a / b;
+ 	}
+ }
+ /* Non-zero if group is 0, 1 or a power of 3, 5 or 7. */
+ int ext3_group_sparse(int group)
+ {
+ 	return (test_root(group, 3) || test_root(group, 5) ||
+ 		test_root(group, 7));
+ }
+ 
+ /**
+  *	ext3_bg_has_super - number of blocks used by the superblock in group
+  *	@sb: superblock for filesystem
+  *	@group: group number to check
+  *
+  *	Return the number of blocks used by the superblock (primary or backup)
+  *	in this group: 0 if SPARSE_SUPER is set and ext3_group_sparse() is false, else 1.
+  */
+ int ext3_bg_has_super(struct super_block *sb, int group)
+ {
+ 	if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
+ 	    !ext3_group_sparse(group))
+ 		return 0;	/* sparse layout and this group carries no copy */
+ 	return 1;
+ }
+ 
+ /**
+  *	ext3_bg_num_gdb - number of blocks used by the group table in group
+  *	@sb: superblock for filesystem
+  *	@group: group number to check
+  *
+  *	Return the number of blocks used by the group descriptor table
+  *	(primary or backup) in this group: currently 0 or s_gdb_count.  In the
+  *	future there may be a different number of descriptor blocks in each group.
+  */
+ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
+ {
+ 	if (EXT3_HAS_RO_COMPAT_FEATURE(sb,EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)&&
+ 	    !ext3_group_sparse(group))
+ 		return 0;	/* same test as ext3_bg_has_super(): no copy in this group */
+ 	return EXT3_SB(sb)->s_gdb_count;
+ }
+ 
+ #ifdef CONFIG_EXT3_CHECK
+ /* Called at mount-time, super-block is locked.  Verifies that each group's
+  * metadata blocks are marked in use and that the free counts in the group
+  * descriptors and superblock match the bitmaps; reports via ext3_error(). */
+ void ext3_check_blocks_bitmap (struct super_block * sb)
+ {
+ 	struct buffer_head * bh;
+ 	struct ext3_super_block * es;
+ 	unsigned long desc_count, bitmap_count, x, j;
+ 	unsigned long desc_blocks;
+ 	int bitmap_nr;
+ 	struct ext3_group_desc * gdp;
+ 	int i;
+ 
+ 	es = sb->u.ext3_sb.s_es;
+ 	desc_count = 0;
+ 	bitmap_count = 0;
+ 	gdp = NULL;
+ 	for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+ 		gdp = ext3_get_group_desc (sb, i, NULL);
+ 		if (!gdp)
+ 			continue;
+ 		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
+ 		bitmap_nr = load_block_bitmap (sb, i);
+ 		if (bitmap_nr < 0)
+ 			continue;	/* bitmap unavailable: group is not checked */
+ 
+ 		bh = EXT3_SB(sb)->s_block_bitmap[bitmap_nr];
+ 
+ 		if (ext3_bg_has_super(sb, i) && !ext3_test_bit(0, bh->b_data))
+ 			ext3_error(sb, __FUNCTION__,
+ 				   "Superblock in group %d is marked free", i);
+ 
+ 		desc_blocks = ext3_bg_num_gdb(sb, i);
+ 		for (j = 0; j < desc_blocks; j++)
+ 			if (!ext3_test_bit(j + 1, bh->b_data))
+ 				ext3_error(sb, __FUNCTION__,
+ 					   "Descriptor block #%lu in group "
+ 					   "%d is marked free", j, i);	/* j is unsigned long */
+ 
+ 		if (!block_in_use (le32_to_cpu(gdp->bg_block_bitmap),
+ 						sb, bh->b_data))
+ 			ext3_error (sb, "ext3_check_blocks_bitmap",
+ 				    "Block bitmap for group %d is marked free", i);
+ 
+ 		if (!block_in_use (le32_to_cpu(gdp->bg_inode_bitmap),
+ 						sb, bh->b_data))
+ 			ext3_error (sb, "ext3_check_blocks_bitmap",
+ 				    "Inode bitmap for group %d is marked free", i);
+ 
+ 		for (j = 0; j < sb->u.ext3_sb.s_itb_per_group; j++)
+ 			if (!block_in_use (le32_to_cpu(gdp->bg_inode_table) + j,
+ 							sb, bh->b_data))
+ 				ext3_error (sb, "ext3_check_blocks_bitmap",
+ 					    "Block #%lu of the inode table in "
+ 					    "group %d is marked free", j, i);	/* j is unsigned long */
+ 
+ 		x = ext3_count_free (bh, sb->s_blocksize);
+ 		if (le16_to_cpu(gdp->bg_free_blocks_count) != x)
+ 			ext3_error (sb, "ext3_check_blocks_bitmap",
+ 				    "Wrong free blocks count for group %d, "
+ 				    "stored = %d, counted = %lu", i,
+ 				    le16_to_cpu(gdp->bg_free_blocks_count), x);
+ 		bitmap_count += x;
+ 	}
+ 	if (le32_to_cpu(es->s_free_blocks_count) != bitmap_count)
+ 		ext3_error (sb, "ext3_check_blocks_bitmap",
+ 			"Wrong free blocks count in super block, "
+ 			"stored = %lu, counted = %lu",
+ 			(unsigned long)le32_to_cpu(es->s_free_blocks_count),
+ 			bitmap_count);
+ }
+ #endif
diff -rc2P linux/fs/ext3/bitmap.c linux-2.4.13/fs/ext3/bitmap.c
*** linux/fs/ext3/bitmap.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/bitmap.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,26 ----
+ /*
+  *  linux/fs/ext3/bitmap.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  */
+ 
+ #include <linux/fs.h>
+ 
+ /* nibblemap[v] is the number of zero bits in the 4-bit value v. */
+ static int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0};
+ /* Count the zero bits in the first numchars bytes of map->b_data (0 if map is NULL). */
+ unsigned long ext3_count_free (struct buffer_head * map, unsigned int numchars)
+ {
+ 	unsigned int i;
+ 	unsigned long sum = 0;
+ 	
+ 	if (!map) 
+ 		return (0);
+ 	for (i = 0; i < numchars; i++)
+ 		sum += nibblemap[map->b_data[i] & 0xf] +	/* low nibble */
+ 			nibblemap[(map->b_data[i] >> 4) & 0xf];	/* high nibble */
+ 	return (sum);
+ }
diff -rc2P linux/fs/ext3/dir.c linux-2.4.13/fs/ext3/dir.c
*** linux/fs/ext3/dir.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/dir.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,190 ----
+ /*
+  *  linux/fs/ext3/dir.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/dir.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3 directory handling functions
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  */
+ 
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ /* DT_* value for each on-disk file_type code; indexed by de->file_type in ext3_readdir(). */
+ static unsigned char ext3_filetype_table[] = {
+ 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
+ 
+ static int ext3_readdir(struct file *, void *, filldir_t);
+ /* File operations for ext3 directories. */
+ struct file_operations ext3_dir_operations = {
+ 	read:		generic_read_dir,
+ 	readdir:	ext3_readdir,		/* BKL held */
+ 	ioctl:		ext3_ioctl,		/* BKL held */
+ 	fsync:		ext3_sync_file,		/* BKL held */
+ };
+ /* Validate dirent de (which lies in bh): 1 if sane, 0 after reporting via ext3_error(). */
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+ 			  struct ext3_dir_entry_2 * de,
+ 			  struct buffer_head * bh,
+ 			  unsigned long offset)
+ {
+ 	const char * error_msg = NULL;
+  	const int rlen = le16_to_cpu(de->rec_len);
+ 
+ 	if (rlen < EXT3_DIR_REC_LEN(1))
+ 		error_msg = "rec_len is smaller than minimal";
+ 	else if (rlen % 4 != 0)
+ 		error_msg = "rec_len % 4 != 0";
+ 	else if (rlen < EXT3_DIR_REC_LEN(de->name_len))
+ 		error_msg = "rec_len is too small for name_len";
+ 	else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize)
+ 		error_msg = "directory entry across blocks";
+ 	else if (le32_to_cpu(de->inode) >
+ 			le32_to_cpu(dir->i_sb->u.ext3_sb.s_es->s_inodes_count))
+ 		error_msg = "inode out of bounds";
+ 
+ 	if (error_msg != NULL)	/* function and offset are used only in this report */
+ 		ext3_error (dir->i_sb, function,
+ 			"bad entry in directory #%lu: %s - "
+ 			"offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+ 			dir->i_ino, error_msg, offset,
+ 			(unsigned long) le32_to_cpu(de->inode),
+ 			rlen, de->name_len);
+ 	return error_msg == NULL ? 1 : 0;
+ }
+ /* readdir: hand entries from filp->f_pos on to filldir; ends after the first block yielding any. */
+ static int ext3_readdir(struct file * filp,
+ 			 void * dirent, filldir_t filldir)
+ {
+ 	int error = 0;
+ 	unsigned long offset, blk;
+ 	int i, num, stored;
+ 	struct buffer_head * bh, * tmp, * bha[16];
+ 	struct ext3_dir_entry_2 * de;
+ 	struct super_block * sb;
+ 	int err;	/* filled in by ext3_bread()/ext3_getblk() but never examined here */
+ 	struct inode *inode = filp->f_dentry->d_inode;
+ 
+ 	sb = inode->i_sb;
+ 
+ 	stored = 0;
+ 	bh = NULL;
+ 	offset = filp->f_pos & (sb->s_blocksize - 1);	/* byte offset within the current block */
+ 
+ 	while (!error && !stored && filp->f_pos < inode->i_size) {
+ 		blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb);
+ 		bh = ext3_bread (0, inode, blk, 0, &err);
+ 		if (!bh) {
+ 			ext3_error (sb, "ext3_readdir",
+ 				"directory #%lu contains a hole at offset %lu",
+ 				inode->i_ino, (unsigned long)filp->f_pos);
+ 			filp->f_pos += sb->s_blocksize - offset;
+ 			continue;	/* NOTE(review): offset is not reset to 0 on this path -- confirm */
+ 		}
+ 
+ 		/* Do the readahead, only when starting on a block boundary:
+ 		 * collect following blocks that are neither uptodate nor
+ 		 * locked and submit them with READA. */
+ 		if (!offset) {
+ 			for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0;
+ 			     i > 0; i--) {
+ 				tmp = ext3_getblk (NULL, inode, ++blk, 0, &err);
+ 				if (tmp && !buffer_uptodate(tmp) &&
+ 						!buffer_locked(tmp))
+ 					bha[num++] = tmp;
+ 				else
+ 					brelse (tmp);
+ 			}
+ 			if (num) {
+ 				ll_rw_block (READA, num, bha);
+ 				for (i = 0; i < num; i++)
+ 					brelse (bha[i]);
+ 			}
+ 		}
+ 		
+ revalidate:
+ 		/* If the dir block has changed since the last call to
+ 		 * readdir(2), then we might be pointing to an invalid
+ 		 * dirent right now.  Scan from the start of the block
+ 		 * to make sure. */
+ 		if (filp->f_version != inode->i_version) {
+ 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
+ 				de = (struct ext3_dir_entry_2 *) 
+ 					(bh->b_data + i);
+ 				/* It's too expensive to do a full
+ 				 * dirent test each time round this
+ 				 * loop, but we do have to test at
+ 				 * least that it is non-zero.  A
+ 				 * failure will be detected in the
+ 				 * dirent test below. */
+ 				if (le16_to_cpu(de->rec_len) <
+ 						EXT3_DIR_REC_LEN(1))
+ 					break;
+ 				i += le16_to_cpu(de->rec_len);
+ 			}
+ 			offset = i;	/* first entry boundary at or past the old offset */
+ 			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+ 				| offset;
+ 			filp->f_version = inode->i_version;
+ 		}
+ 		
+ 		while (!error && filp->f_pos < inode->i_size 
+ 		       && offset < sb->s_blocksize) {
+ 			de = (struct ext3_dir_entry_2 *) (bh->b_data + offset);
+ 			if (!ext3_check_dir_entry ("ext3_readdir", inode, de,
+ 						   bh, offset)) {
+ 				/* On error, skip the f_pos to the
+                                    next block. */
+ 				filp->f_pos = (filp->f_pos |
+ 						(sb->s_blocksize - 1)) + 1;
+ 				brelse (bh);
+ 				return stored;	/* NOTE(review): entry count here, but 0 at the end -- confirm */
+ 			}
+ 			offset += le16_to_cpu(de->rec_len);
+ 			if (le32_to_cpu(de->inode)) {	/* entries with inode 0 are skipped */
+ 				/* We might block in the next section
+ 				 * if the data destination is
+ 				 * currently swapped out.  So, use a
+ 				 * version stamp to detect whether or
+ 				 * not the directory has been modified
+ 				 * during the copy operation.
+ 				 */
+ 				unsigned long version = filp->f_version;
+ 				unsigned char d_type = DT_UNKNOWN;
+ 
+ 				if (EXT3_HAS_INCOMPAT_FEATURE(sb,
+ 						EXT3_FEATURE_INCOMPAT_FILETYPE)
+ 						&& de->file_type < EXT3_FT_MAX)
+ 					d_type =
+ 					  ext3_filetype_table[de->file_type];
+ 				error = filldir(dirent, de->name,
+ 						de->name_len,
+ 						filp->f_pos,
+ 						le32_to_cpu(de->inode),
+ 						d_type);
+ 				if (error)
+ 					break;	/* f_pos is left at this entry */
+ 				if (version != filp->f_version)
+ 					goto revalidate;
+ 				stored ++;
+ 			}
+ 			filp->f_pos += le16_to_cpu(de->rec_len);
+ 		}
+ 		offset = 0;	/* the next block is read from its start */
+ 		brelse (bh);
+ 	}
+ 	UPDATE_ATIME(inode);
+ 	return 0;
+ }
diff -rc2P linux/fs/ext3/file.c linux-2.4.13/fs/ext3/file.c
*** linux/fs/ext3/file.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/file.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,97 ----
+ /*
+  *  linux/fs/ext3/file.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/file.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3 fs regular file handling primitives
+  *
+  *  64-bit file support on 64-bit platforms by Jakub Jelinek
+  *	(jj@sunsite.ms.mff.cuni.cz)
+  */
+ 
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/locks.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+ 
+ /*
+  * Called when an inode is released. Note that this is different from
+  * ext3_open_file: open gets called at every open, but release gets called
+  * only when /all/ the files are closed.
+  */
+ static int ext3_release_file (struct inode * inode, struct file * filp)
+ {
+ 	if (filp->f_mode & FMODE_WRITE)	/* only files opened for writing */
+ 		ext3_discard_prealloc (inode);
+ 	return 0;
+ }
+ 
+ /*
+  * Called when an inode is about to be opened.
+  * We use this to refuse (-EFBIG) opening files larger than 0x7FFFFFFF
+  * bytes if the caller didn't specify O_LARGEFILE.  On 64bit systems we
+  * force on this flag in sys_open.
+  */
+ static int ext3_open_file (struct inode * inode, struct file * filp)
+ {
+ 	if (!(filp->f_flags & O_LARGEFILE) &&
+ 	    inode->i_size > 0x7FFFFFFFLL)	/* the open mode is not examined */
+ 		return -EFBIG;
+ 	return 0;
+ }
+ 
+ /*
+  * ext3_file_write().
+  *
+  * Most things are done in ext3_prepare_write() and ext3_commit_write().
+  * For IS_SYNC inodes written without O_SYNC, ext3_force_commit() is called here.
+  */
+ static ssize_t
+ ext3_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
+ {
+ 	int ret;
+ 	struct inode *inode = file->f_dentry->d_inode;
+ 
+ 	ret = generic_file_write(file, buf, count, ppos);
+ 	if ((ret >= 0) && IS_SYNC(inode)) {
+ 		if (file->f_flags & O_SYNC) {
+ 			/*
+ 			 * generic_osync_inode() has already done the sync
+ 			 */
+ 		} else {
+ 			int ret2 = ext3_force_commit(inode->i_sb);
+ 			if (ret2)
+ 				ret = ret2;	/* a commit error replaces the write result */
+ 		}
+ 	}
+ 	return ret;
+ }
+ /* File operations for regular ext3 files. */
+ struct file_operations ext3_file_operations = {
+ 	llseek:		generic_file_llseek,	/* BKL held */
+ 	read:		generic_file_read,	/* BKL not held.  Don't need */
+ 	write:		ext3_file_write,	/* BKL not held.  Don't need */
+ 	ioctl:		ext3_ioctl,		/* BKL held */
+ 	mmap:		generic_file_mmap,
+ 	open:		ext3_open_file,		/* BKL not held.  Don't need */
+ 	release:	ext3_release_file,	/* BKL not held.  Don't need */
+ 	fsync:		ext3_sync_file,		/* BKL held */
+ };
+ /* Inode operations for regular ext3 files. */
+ struct inode_operations ext3_file_inode_operations = {
+ 	truncate:	ext3_truncate,		/* BKL held */
+ 	setattr:	ext3_setattr,		/* BKL held */
+ };
+ 
diff -rc2P linux/fs/ext3/fsync.c linux-2.4.13/fs/ext3/fsync.c
*** linux/fs/ext3/fsync.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/fsync.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,69 ----
+ /*
+  *  linux/fs/ext3/fsync.c
+  *
+  *  Copyright (C) 1993  Stephen Tweedie (sct@redhat.com)
+  *  from
+  *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
+  *                      Laboratoire MASI - Institut Blaise Pascal
+  *                      Universite Pierre et Marie Curie (Paris VI)
+  *  from
+  *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
+  * 
+  *  ext3fs fsync primitive
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  * 
+  *  Removed unnecessary code duplication for little endian machines
+  *  and excessive __inline__s. 
+  *        Andi Kleen, 1997
+  *
+  * Major simplifications and cleanup - we only need to do the metadata, because
+  * we can depend on generic_block_fdatasync() to sync the data blocks.
+  */
+ 
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/jbd.h>
+ #include <linux/smp_lock.h>
+ 
+ /*
+  * akpm: A new design for ext3_sync_file().
+  *
+  * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
+  * There cannot be a transaction open by this task. (AKPM: quotas?)
+  * Another task could have dirtied this inode.  Its data can be in any
+  * state in the journalling system.
+  *
+  * What we do is just kick off a commit and wait on it.  This will snapshot the
+  * inode to disk.  Returns the first error from either step, or 0.
+  *
+  * Note that there is a serious optimisation we can make here: if the current
+  * inode is not part of j_running_transaction or j_committing_transaction
+  * then we have nothing to do.  That would require implementation of t_ilist,
+  * which isn't too hard.
+  */
+ 
+ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
+ {
+ 	struct inode *inode = dentry->d_inode;
+ 	int ret, err;
+ 
+ 	J_ASSERT(ext3_journal_current_handle() == 0);
+ 
+ 	/*
+ 	 * fsync_inode_buffers() just walks i_dirty_buffers and waits
+ 	 * on them.  It's a no-op for full data journalling because
+ 	 * i_dirty_buffers will be empty.
+ 	 * Really, we only need to start I/O on the dirty buffers -
+ 	 * we'll end up waiting on them in commit.
+ 	 */
+ 	ret = fsync_inode_buffers(inode);
+ 	/* Don't drop a commit failure: report it, as ext3_file_write() does */
+ 	err = ext3_force_commit(inode->i_sb);
+ 	if (!ret) ret = err;
+ 	return ret;
+ }
diff -rc2P linux/fs/ext3/ialloc.c linux-2.4.13/fs/ext3/ialloc.c
*** linux/fs/ext3/ialloc.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/ialloc.c	Fri Nov  9 17:03:46 2001
***************
*** 0 ****
--- 1,664 ----
+ /*
+  *  linux/fs/ext3/ialloc.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  BSD ufs-inspired inode and directory allocation by
+  *  Stephen Tweedie (sct@redhat.com), 1993
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  */
+ 
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+ 
+ #include <asm/bitops.h>
+ #include <asm/byteorder.h>
+ 
+ /*
+  * ialloc.c contains the inodes allocation and deallocation routines
+  */
+ 
+ /*
+  * The free inodes are managed by bitmaps.  A file system contains several
+  * blocks groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
+  * block for inodes, N blocks for the inode table and data blocks.
+  *
+  * The file system contains group descriptors which are located after the
+  * super block.  Each descriptor contains the number of the bitmap block and
+  * the free blocks count in the block.  The descriptors are loaded in memory
+  * when a file system is mounted (see ext3_read_super).
+  */
+ 
+ 
+ /*
+  * Read the inode allocation bitmap for a given block_group, reading
+  * into the specified slot in the superblock's bitmap cache.
+  *
+  * Return >=0 on success or a -ve error code.
+  */
+ static int read_inode_bitmap (struct super_block * sb,
+ 			       unsigned long block_group,
+ 			       unsigned int bitmap_nr)
+ {
+ 	struct ext3_group_desc * gdp;
+ 	struct buffer_head * bh = NULL;
+ 	int retval = 0;
+ 
+ 	gdp = ext3_get_group_desc (sb, block_group, NULL);
+ 	if (!gdp) {
+ 		retval = -EIO;
+ 		goto error_out;
+ 	}
+ 	bh = bread (sb->s_dev,
+ 			le32_to_cpu(gdp->bg_inode_bitmap), sb->s_blocksize);
+ 	if (!bh) {
+ 		/* bg_inode_bitmap is little-endian on disk: convert to print */
+ 		ext3_error (sb, "read_inode_bitmap",
+ 			    "Cannot read inode bitmap - "
+ 			    "block_group = %lu, inode_bitmap = %lu",
+ 			    block_group,
+ 			    (unsigned long) le32_to_cpu(gdp->bg_inode_bitmap));
+ 		retval = -EIO;
+ 	}
+ 	/* On IO error, just leave a zero in the superblock's block pointer
+ 	 * for this group.  The IO will be retried next time. */
+ error_out:
+ 	sb->u.ext3_sb.s_inode_bitmap_number[bitmap_nr] = block_group;
+ 	sb->u.ext3_sb.s_inode_bitmap[bitmap_nr] = bh;
+ 	return retval;
+ }
+ 
+ /*
+  * load_inode_bitmap loads the inode bitmap for a blocks group
+  *
+  * It maintains a cache for the last bitmaps loaded.  This cache is managed
+  * with a LRU algorithm.
+  *
+  * Notes:
+  * 1/ There is one cache per mounted file system.
+  * 2/ If the file system contains less than EXT3_MAX_GROUP_LOADED groups,
+  *    this function reads the bitmap without maintaining a LRU cache.
+  *
+  * Return the slot used to store the bitmap, or a -ve error code.
+  */
+ static int load_inode_bitmap (struct super_block * sb,
+ 			      unsigned int block_group)
+ {
+ 	struct ext3_sb_info *sbi = EXT3_SB(sb);
+ 	unsigned long inode_bitmap_number;
+ 	struct buffer_head * inode_bitmap;
+ 	int i, j, retval = 0;
+ 
+ 	if (block_group >= sbi->s_groups_count)
+ 		ext3_panic (sb, "load_inode_bitmap",
+ 			    "block_group >= groups_count - "
+ 			    "block_group = %d, groups_count = %lu",
+ 			    block_group, sbi->s_groups_count);
+ 	if (sbi->s_loaded_inode_bitmaps > 0 &&
+ 	    sbi->s_inode_bitmap_number[0] == block_group &&
+ 	    sbi->s_inode_bitmap[0] != NULL)
+ 		return 0;	/* fast path: slot 0 already holds this group */
+ 	if (sbi->s_groups_count <= EXT3_MAX_GROUP_LOADED) {	/* no LRU */
+ 		if (sbi->s_inode_bitmap[block_group]) {	/* slot == group */
+ 			if (sbi->s_inode_bitmap_number[block_group] !=
+ 						block_group)
+ 				ext3_panic(sb, "load_inode_bitmap",
+ 					"block_group != inode_bitmap_number");
+ 			return block_group;
+ 		}
+ 		retval = read_inode_bitmap(sb, block_group, block_group);
+ 		if (retval < 0)
+ 			return retval;
+ 		return block_group;
+ 	}
+ 
+ 	for (i = 0; i < sbi->s_loaded_inode_bitmaps &&
+ 		    sbi->s_inode_bitmap_number[i] != block_group; i++)
+ 		/* search the loaded slots for this group */;
+ 	if (i < sbi->s_loaded_inode_bitmaps &&
+ 	    sbi->s_inode_bitmap_number[i] == block_group) {
+ 		/* Hit in slot i: slide slots 0..i-1 down and move it to 0. */
+ 		inode_bitmap_number = sbi->s_inode_bitmap_number[i];
+ 		inode_bitmap = sbi->s_inode_bitmap[i];
+ 		for (j = i; j > 0; j--) {
+ 			sbi->s_inode_bitmap_number[j] =
+ 				sbi->s_inode_bitmap_number[j - 1];
+ 			sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
+ 		}
+ 		sbi->s_inode_bitmap_number[0] = inode_bitmap_number;
+ 		sbi->s_inode_bitmap[0] = inode_bitmap;
+ 
+ 		/*
+ 		 * There's still one special case here --- if inode_bitmap == 0
+ 		 * then our last attempt to read the bitmap failed and we have
+ 		 * just ended up caching that failure.  Try again to read it.
+ 		 */
+ 		if (!inode_bitmap)
+ 			retval = read_inode_bitmap (sb, block_group, 0);
+ 	} else {
+ 		/* Miss: use one more slot, or release the last one if full. */
+ 		if (sbi->s_loaded_inode_bitmaps < EXT3_MAX_GROUP_LOADED)
+ 			sbi->s_loaded_inode_bitmaps++;
+ 		else
+ 			brelse(sbi->s_inode_bitmap[EXT3_MAX_GROUP_LOADED - 1]);
+ 		for (j = sbi->s_loaded_inode_bitmaps - 1; j > 0; j--) {
+ 			sbi->s_inode_bitmap_number[j] =
+ 				sbi->s_inode_bitmap_number[j - 1];
+ 			sbi->s_inode_bitmap[j] = sbi->s_inode_bitmap[j - 1];
+ 		}
+ 		retval = read_inode_bitmap (sb, block_group, 0);
+ 	}
+ 	return retval;	/* 0 (the bitmap is in slot 0) or a -ve error code */
+ }
+ 
+ /*
+  * NOTE! When we get the inode, we're the only people
+  * that have access to it, and as such there are no
+  * race conditions we have to worry about. The inode
+  * is not on the hash-lists, and it cannot be reached
+  * through the filesystem because the directory entry
+  * has been deleted earlier.
+  *
+  * HOWEVER: we must make sure that we get no aliases,
+  * which means that we have to call "clear_inode()"
+  * _before_ we mark the inode not in use in the inode
+  * bitmaps. Otherwise a newly created file might use
+  * the same inode number (not actually the same pointer
+  * though), and then we'd have two inodes sharing the
+  * same inode number and space on the harddisk.
+  */
+ void ext3_free_inode (handle_t *handle, struct inode * inode)
+ {
+ 	struct super_block * sb = inode->i_sb;
+ 	int is_directory;
+ 	unsigned long ino;
+ 	struct buffer_head * bh;	/* inode bitmap of the group */
+ 	struct buffer_head * bh2;	/* buffer returned along with gdp */
+ 	unsigned long block_group;
+ 	unsigned long bit;
+ 	int bitmap_nr;
+ 	struct ext3_group_desc * gdp;
+ 	struct ext3_super_block * es;
+ 	int fatal = 0, err;
+ 
+ 	if (!inode->i_dev) {
+ 		printk ("ext3_free_inode: inode has no device\n");
+ 		return;
+ 	}
+ 	if (atomic_read(&inode->i_count) > 1) {
+ 		printk ("ext3_free_inode: inode has count=%d\n",
+ 					atomic_read(&inode->i_count));
+ 		return;
+ 	}
+ 	if (inode->i_nlink) {
+ 		printk ("ext3_free_inode: inode has nlink=%d\n", inode->i_nlink);
+ 		return;
+ 	}
+ 	if (!sb) {
+ 		printk("ext3_free_inode: inode on nonexistent device\n");
+ 		return;
+ 	}
+ 
+ 	ino = inode->i_ino;
+ 	ext3_debug ("freeing inode %lu\n", ino);
+ 
+ 	/*
+ 	 * Note: we must free any quota before locking the superblock,
+ 	 * as writing the quota to disk may need the lock as well.
+ 	 */
+ 	DQUOT_INIT(inode);
+ 	DQUOT_FREE_INODE(inode);
+ 	DQUOT_DROP(inode);
+ 
+ 	is_directory = S_ISDIR(inode->i_mode);
+ 
+ 	/* Do this BEFORE marking the inode not in use or returning an error */
+ 	clear_inode (inode);
+ 
+ 	lock_super (sb);
+ 	es = sb->u.ext3_sb.s_es;
+ 	if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+ 		ext3_error (sb, "ext3_free_inode",
+ 			    "reserved or nonexistent inode %lu", ino);
+ 		goto error_return;
+ 	}
+ 	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+ 	bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+ 	bitmap_nr = load_inode_bitmap (sb, block_group);
+ 	if (bitmap_nr < 0)
+ 		goto error_return;
+ 
+ 	bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
+ 
+ 	BUFFER_TRACE(bh, "get_write_access");
+ 	fatal = ext3_journal_get_write_access(handle, bh);
+ 	if (fatal)
+ 		goto error_return;
+ 
+ 	/* Ok, now we can actually update the inode bitmaps.. */
+ 	if (!ext3_clear_bit (bit, bh->b_data))
+ 		ext3_error (sb, "ext3_free_inode",
+ 			      "bit already cleared for inode %lu", ino);
+ 	else {
+ 		gdp = ext3_get_group_desc (sb, block_group, &bh2);
+ 		if (!gdp) {	/* without a descriptor bh2 must not be used */
+ 			fatal = -EIO;
+ 			goto error_return;
+ 		}
+ 
+ 		BUFFER_TRACE(bh2, "get_write_access");
+ 		fatal = ext3_journal_get_write_access(handle, bh2);
+ 		if (fatal) goto error_return;
+ 
+ 		BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get write access");
+ 		fatal = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+ 		if (fatal) goto error_return;
+ 
+ 		gdp->bg_free_inodes_count = cpu_to_le16(
+ 			le16_to_cpu(gdp->bg_free_inodes_count) + 1);
+ 		if (is_directory)
+ 			gdp->bg_used_dirs_count = cpu_to_le16(
+ 				le16_to_cpu(gdp->bg_used_dirs_count) - 1);
+ 		BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+ 		err = ext3_journal_dirty_metadata(handle, bh2);
+ 		if (!fatal) fatal = err;
+ 		es->s_free_inodes_count =
+ 			cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) + 1);
+ 		BUFFER_TRACE(sb->u.ext3_sb.s_sbh,
+ 					"call ext3_journal_dirty_metadata");
+ 		err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+ 		if (!fatal) fatal = err;
+ 	}
+ 	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 	err = ext3_journal_dirty_metadata(handle, bh);
+ 	if (!fatal) fatal = err;	/* keep the first error seen */
+ 	sb->s_dirt = 1;
+ error_return:
+ 	ext3_std_error(sb, fatal);
+ 	unlock_super(sb);
+ }
+ 
+ /*
+  * There are two policies for allocating an inode.  If the new inode is
+  * a directory, every block group is scanned and, of the groups whose
+  * free inode count is non-zero and at least the filesystem-wide average,
+  * the one with the most free blocks is chosen.
+  *
+  * For other inodes, the parent directory's block group is tried first,
+  * then a quadratic hash from that group, and finally a linear search
+  * forward for any group with a free inode.
+  */
+ struct inode * ext3_new_inode (handle_t *handle,
+ 				const struct inode * dir, int mode)
+ {
+ 	struct super_block * sb;
+ 	struct buffer_head * bh;	/* inode bitmap of the chosen group */
+ 	struct buffer_head * bh2;	/* buffer returned along with gdp */
+ 	int i, j, avefreei;	/* i: group; j: scratch, bit, then inode no. */
+ 	struct inode * inode;
+ 	int bitmap_nr;
+ 	struct ext3_group_desc * gdp;	/* descriptor of the chosen group */
+ 	struct ext3_group_desc * tmp;	/* candidate during the search */
+ 	struct ext3_super_block * es;
+ 	int err = 0;
+ 
+ 	/* Cannot create files in a deleted directory */
+ 	if (!dir || !dir->i_nlink)
+ 		return ERR_PTR(-EPERM);
+ 
+ 	sb = dir->i_sb;
+ 	inode = new_inode(sb);
+ 	if (!inode)
+ 		return ERR_PTR(-ENOMEM);
+ 	init_rwsem(&inode->u.ext3_i.truncate_sem);
+ 
+ 	lock_super (sb);
+ 	es = sb->u.ext3_sb.s_es;
+ repeat:		/* every retry redoes the whole group search */
+ 	gdp = NULL;
+ 	i = 0;
+ 
+ 	if (S_ISDIR(mode)) {
+ 		avefreei = le32_to_cpu(es->s_free_inodes_count) /
+ 			sb->u.ext3_sb.s_groups_count;
+ 		if (!gdp) {	/* always true here: gdp was just reset */
+ 			for (j = 0; j < sb->u.ext3_sb.s_groups_count; j++) {
+ 				struct buffer_head *temp_buffer;
+ 				tmp = ext3_get_group_desc (sb, j, &temp_buffer);
+ 				if (tmp &&
+ 				    le16_to_cpu(tmp->bg_free_inodes_count) &&
+ 				    le16_to_cpu(tmp->bg_free_inodes_count) >=
+ 							avefreei) {
+ 					if (!gdp || (le16_to_cpu(tmp->bg_free_blocks_count) >
+ 						le16_to_cpu(gdp->bg_free_blocks_count))) {
+ 						i = j;
+ 						gdp = tmp;
+ 						bh2 = temp_buffer;
+ 					}
+ 				}
+ 			}
+ 		}
+ 	} else {
+ 		/*
+ 		 * Try to place the inode in its parent directory
+ 		 */
+ 		i = dir->u.ext3_i.i_block_group;
+ 		tmp = ext3_get_group_desc (sb, i, &bh2);
+ 		if (tmp && le16_to_cpu(tmp->bg_free_inodes_count))
+ 			gdp = tmp;
+ 		else
+ 		{
+ 			/*
+ 			 * Use a quadratic hash to find a group with a
+ 			 * free inode
+ 			 */
+ 			for (j = 1; j < sb->u.ext3_sb.s_groups_count; j <<= 1) {
+ 				i += j;
+ 				if (i >= sb->u.ext3_sb.s_groups_count)
+ 					i -= sb->u.ext3_sb.s_groups_count;
+ 				tmp = ext3_get_group_desc (sb, i, &bh2);
+ 				if (tmp &&
+ 				    le16_to_cpu(tmp->bg_free_inodes_count)) {
+ 					gdp = tmp;
+ 					break;
+ 				}
+ 			}
+ 		}
+ 		if (!gdp) {
+ 			/*
+ 			 * That failed: try linear search for a free inode
+ 			 */
+ 			i = dir->u.ext3_i.i_block_group + 1;	/* ++i'd below */
+ 			for (j = 2; j < sb->u.ext3_sb.s_groups_count; j++) {
+ 				if (++i >= sb->u.ext3_sb.s_groups_count)
+ 					i = 0;
+ 				tmp = ext3_get_group_desc (sb, i, &bh2);
+ 				if (tmp &&
+ 				    le16_to_cpu(tmp->bg_free_inodes_count)) {
+ 					gdp = tmp;
+ 					break;
+ 				}
+ 			}
+ 		}
+ 	}
+ 
+ 	err = -ENOSPC;
+ 	if (!gdp)
+ 		goto fail;
+ 
+ 	err = -EIO;
+ 	bitmap_nr = load_inode_bitmap (sb, i);
+ 	if (bitmap_nr < 0)
+ 		goto fail;
+ 
+ 	bh = sb->u.ext3_sb.s_inode_bitmap[bitmap_nr];
+ 
+ 	if ((j = ext3_find_first_zero_bit ((unsigned long *) bh->b_data,
+ 				      EXT3_INODES_PER_GROUP(sb))) <
+ 	    EXT3_INODES_PER_GROUP(sb)) {
+ 		BUFFER_TRACE(bh, "get_write_access");
+ 		err = ext3_journal_get_write_access(handle, bh);
+ 		if (err) goto fail;
+ 		
+ 		if (ext3_set_bit (j, bh->b_data)) {
+ 			ext3_error (sb, "ext3_new_inode",
+ 				      "bit already set for inode %d", j);
+ 			goto repeat;
+ 		}
+ 		BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 		err = ext3_journal_dirty_metadata(handle, bh);
+ 		if (err) goto fail;
+ 	} else {
+ 		/* No clear bit: the descriptor's free count must be stale. */
+ 		if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) {
+ 			ext3_error (sb, "ext3_new_inode",
+ 				    "Free inodes count corrupted in group %d",
+ 				    i);
+ 			/* Is it really ENOSPC? */
+ 			err = -ENOSPC;
+ 			if (sb->s_flags & MS_RDONLY)
+ 				goto fail;
+ 
+ 			BUFFER_TRACE(bh2, "get_write_access");
+ 			err = ext3_journal_get_write_access(handle, bh2);
+ 			if (err) goto fail;
+ 			gdp->bg_free_inodes_count = 0;
+ 			BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+ 			err = ext3_journal_dirty_metadata(handle, bh2);
+ 			if (err) goto fail;
+ 		}
+ 		goto repeat;
+ 	}
+ 	j += i * EXT3_INODES_PER_GROUP(sb) + 1;	/* bit index -> inode number */
+ 	if (j < EXT3_FIRST_INO(sb) || j > le32_to_cpu(es->s_inodes_count)) {
+ 		ext3_error (sb, "ext3_new_inode",
+ 			    "reserved inode or inode > inodes count - "
+ 			    "block_group = %d,inode=%d", i, j);
+ 		err = -EIO;
+ 		goto fail;
+ 	}
+ 
+ 	BUFFER_TRACE(bh2, "get_write_access");
+ 	err = ext3_journal_get_write_access(handle, bh2);
+ 	if (err) goto fail;
+ 	gdp->bg_free_inodes_count =
+ 		cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
+ 	if (S_ISDIR(mode))
+ 		gdp->bg_used_dirs_count =
+ 			cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
+ 	BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
+ 	err = ext3_journal_dirty_metadata(handle, bh2);
+ 	if (err) goto fail;
+ 	
+ 	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+ 	err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+ 	if (err) goto fail;
+ 	es->s_free_inodes_count =
+ 		cpu_to_le32(le32_to_cpu(es->s_free_inodes_count) - 1);
+ 	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "call ext3_journal_dirty_metadata");
+ 	err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+ 	sb->s_dirt = 1;
+ 	if (err) goto fail;
+ 
+ 	/* Ownership: gid comes from the directory for GRPID or setgid dirs. */
+ 	inode->i_uid = current->fsuid;
+ 	if (test_opt (sb, GRPID))
+ 		inode->i_gid = dir->i_gid;
+ 	else if (dir->i_mode & S_ISGID) {
+ 		inode->i_gid = dir->i_gid;
+ 		if (S_ISDIR(mode))
+ 			mode |= S_ISGID;
+ 	} else
+ 		inode->i_gid = current->fsgid;
+ 	inode->i_mode = mode;
+ 
+ 	inode->i_ino = j;
+ 	/* This is the optimal IO size (for stat), not the fs block size */
+ 	inode->i_blksize = PAGE_SIZE;
+ 	inode->i_blocks = 0;
+ 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ 	inode->u.ext3_i.i_flags = dir->u.ext3_i.i_flags & ~EXT3_INDEX_FL;
+ 	if (S_ISLNK(mode))
+ 		inode->u.ext3_i.i_flags &= ~(EXT3_IMMUTABLE_FILE_FL | EXT3_IMMUTABLE_LINK_FL | EXT3_APPEND_FL);
+ #ifdef EXT3_FRAGMENTS
+ 	inode->u.ext3_i.i_faddr = 0;
+ 	inode->u.ext3_i.i_frag_no = 0;
+ 	inode->u.ext3_i.i_frag_size = 0;
+ #endif
+ 	inode->u.ext3_i.i_file_acl = 0;
+ 	inode->u.ext3_i.i_dir_acl = 0;
+ 	inode->u.ext3_i.i_dtime = 0;
+ 	INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+ #ifdef EXT3_PREALLOCATE
+ 	inode->u.ext3_i.i_prealloc_count = 0;
+ #endif
+ 	inode->u.ext3_i.i_block_group = i;
+ 	
+ 	if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL)
+ 		inode->i_flags |= S_SYNC;
+ 	if (IS_SYNC(inode))
+ 		handle->h_sync = 1;
+ 	insert_inode_hash(inode);
+ 	inode->i_generation = event++;
+ 
+ 	inode->u.ext3_i.i_state = EXT3_STATE_NEW;
+ 	err = ext3_mark_inode_dirty(handle, inode);
+ 	if (err) goto fail;
+ 	
+ 	unlock_super (sb);	/* the quota charge is made without the lock */
+ 	if(DQUOT_ALLOC_INODE(inode)) {
+ 		DQUOT_DROP(inode);
+ 		inode->i_flags |= S_NOQUOTA;
+ 		inode->i_nlink = 0;	/* presumably so iput() deletes it */
+ 		iput(inode);
+ 		return ERR_PTR(-EDQUOT);
+ 	}
+ 	ext3_debug ("allocating inode %lu\n", inode->i_ino);
+ 	return inode;
+ 
+ fail:		/* common exit for every error once the super lock is held */
+ 	unlock_super(sb);
+ 	iput(inode);
+ 	ext3_std_error(sb, err);
+ 	return ERR_PTR(err);
+ }
+ 
+ /* Verify that we are loading a valid orphan from disk */
+ struct inode *ext3_orphan_get (struct super_block * sb, ino_t ino)
+ {
+ 	ino_t max_ino = le32_to_cpu(EXT3_SB(sb)->s_es->s_inodes_count);
+ 	unsigned long block_group;
+ 	int bit;
+ 	int bitmap_nr;
+ 	struct buffer_head *bh;
+ 	struct inode *inode = NULL;
+ 	
+ 	/* Error cases (0 would wrap "ino - 1" below) - e2fsck cleaned up */
+ 	if (ino == 0 || ino > max_ino) {
+ 		ext3_warning(sb, __FUNCTION__,
+ 			     "bad orphan ino %lu!  e2fsck was run?\n", ino);
+ 		return NULL;
+ 	}
+ 
+ 	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
+ 	bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb);
+ 	if ((bitmap_nr = load_inode_bitmap(sb, block_group)) < 0 ||
+ 	    !(bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr])) {
+ 		ext3_warning(sb, __FUNCTION__,
+ 			     "inode bitmap error for orphan %lu\n", ino);
+ 		return NULL;
+ 	}
+ 
+ 	/* Having the inode bit set should be a 100% indicator that this
+ 	 * is a valid orphan (no e2fsck run on fs).  Orphans also include
+ 	 * inodes that were being truncated, so we can't check i_nlink==0.
+ 	 */
+ 	if (!ext3_test_bit(bit, bh->b_data) || !(inode = iget(sb, ino)) ||
+ 	    is_bad_inode(inode) || NEXT_ORPHAN(inode) > max_ino) {
+ 		/* Inode numbers are unsigned: print them with %lu / %u. */
+ 		ext3_warning(sb, __FUNCTION__,
+ 			     "bad orphan inode %lu!  e2fsck was run?\n", ino);
+ 		printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%ld) = %d\n",
+ 		       bit, bh->b_blocknr, ext3_test_bit(bit, bh->b_data));
+ 		printk(KERN_NOTICE "inode=%p\n", inode);
+ 		if (inode) {
+ 			printk(KERN_NOTICE "is_bad_inode(inode)=%d\n",
+ 			       is_bad_inode(inode));
+ 			printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
+ 			       NEXT_ORPHAN(inode));
+ 			printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ 		}
+ 		/* Avoid freeing blocks if we got a bad deleted inode */
+ 		if (inode && inode->i_nlink == 0)
+ 			inode->i_blocks = 0;
+ 		iput(inode);
+ 		return NULL;
+ 	}
+ 	return inode;
+ }
+ 
+ unsigned long ext3_count_free_inodes (struct super_block * sb)
+ {
+ #ifdef EXT3FS_DEBUG
+ 	/* Debug build: recount from descriptors and bitmaps, and log both. */
+ 	struct ext3_super_block * es;
+ 	unsigned long desc_count = 0, bitmap_count = 0, x;
+ 	int bitmap_nr;
+ 	struct ext3_group_desc * gdp;
+ 	int i;
+ 
+ 	lock_super (sb);
+ 	es = sb->u.ext3_sb.s_es;
+ 	for (i = 0; i < sb->u.ext3_sb.s_groups_count; i++) {
+ 		gdp = ext3_get_group_desc (sb, i, NULL);
+ 		if (!gdp)
+ 			continue;
+ 		desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
+ 		bitmap_nr = load_inode_bitmap (sb, i);
+ 		if (bitmap_nr < 0)
+ 			continue;
+ 
+ 		x = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[bitmap_nr],
+ 				     EXT3_INODES_PER_GROUP(sb) / 8);
+ 		printk ("group %d: stored = %d, counted = %lu\n",
+ 			i, le16_to_cpu(gdp->bg_free_inodes_count), x);
+ 		bitmap_count += x;
+ 	}
+ 	/* Cast: the superblock field is 32 bits wide but is printed as %lu. */
+ 	printk("ext3_count_free_inodes: stored = %lu, computed = %lu, %lu\n",
+ 		(unsigned long)le32_to_cpu(es->s_free_inodes_count),
+ 		desc_count, bitmap_count);
+ 	unlock_super (sb);
+ 	return desc_count;
+ #else	/* !EXT3FS_DEBUG: just report the count kept in the superblock */
+ 	return le32_to_cpu(sb->u.ext3_sb.s_es->s_free_inodes_count);
+ #endif
+ }
+ 
+ #ifdef CONFIG_EXT3_CHECK
+ /* Called at mount-time, super-block is locked */
+ void ext3_check_inodes_bitmap (struct super_block * sb)
+ {
+ 	struct ext3_super_block * es = sb->u.ext3_sb.s_es;
+ 	unsigned long from_descs = 0;	/* total of the stored group counts */
+ 	unsigned long from_bitmaps = 0;	/* total actually counted free */
+ 	int group;
+ 
+ 	for (group = 0; group < sb->u.ext3_sb.s_groups_count; group++) {
+ 		struct ext3_group_desc * desc;
+ 		unsigned long counted;
+ 		int slot;
+ 
+ 		desc = ext3_get_group_desc (sb, group, NULL);
+ 		if (!desc)
+ 			continue;
+ 		from_descs += le16_to_cpu(desc->bg_free_inodes_count);
+ 		/* A group whose bitmap cannot be loaded is left unchecked. */
+ 		slot = load_inode_bitmap (sb, group);
+ 		if (slot < 0)
+ 			continue;
+ 		counted = ext3_count_free (sb->u.ext3_sb.s_inode_bitmap[slot],
+ 					   EXT3_INODES_PER_GROUP(sb) / 8);
+ 		from_bitmaps += counted;
+ 		if (le16_to_cpu(desc->bg_free_inodes_count) == counted)
+ 			continue;
+ 		ext3_error (sb, "ext3_check_inodes_bitmap",
+ 			    "Wrong free inodes count in group %d, "
+ 			    "stored = %d, counted = %lu", group,
+ 			    le16_to_cpu(desc->bg_free_inodes_count), counted);
+ 	}
+ 	if (le32_to_cpu(es->s_free_inodes_count) != from_bitmaps)
+ 		ext3_error (sb, "ext3_check_inodes_bitmap",
+ 			    "Wrong free inodes count in super block, "
+ 			    "stored = %lu, counted = %lu",
+ 			    (unsigned long)le32_to_cpu(es->s_free_inodes_count),
+ 			    from_bitmaps);
+ }
+ #endif
diff -rc2P linux/fs/ext3/inode.c linux-2.4.13/fs/ext3/inode.c
*** linux/fs/ext3/inode.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/inode.c	Fri Nov  9 17:03:19 2001
***************
*** 0 ****
--- 1,2676 ----
+ /*
+  *  linux/fs/ext3/inode.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/inode.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Goal-directed block allocation by Stephen Tweedie
+  * 	(sct@redhat.com), 1993, 1998
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  *  64-bit file support on 64-bit platforms by Jakub Jelinek
+  * 	(jj@sunsite.ms.mff.cuni.cz)
+  *
+  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
+  */
+ 
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/jbd.h>
+ #include <linux/locks.h>
+ #include <linux/smp_lock.h>
+ #include <linux/highuid.h>
+ #include <linux/quotaops.h>
+ #include <linux/module.h>
+ 
+ 
+ /*
+  * SEARCH_FROM_ZERO forces each block allocation to search from the start
+  * of the filesystem.  This is to force rapid reallocation of recently-freed
+  * blocks.  The file fragmentation is horrendous.
+  */
+ #undef SEARCH_FROM_ZERO
+ 
+ /* The ext3 forget function must perform a revoke if we are freeing data
+  * which has been journaled.  Metadata (eg. indirect blocks) must be
+  * revoked in all cases. 
+  *
+  * "bh" may be NULL: a metadata block may have been freed from memory
+  * but there may still be a record of it in the journal, and that record
+  * still needs to be revoked.
+  */
+ 
+ static int ext3_forget(handle_t *handle, int is_metadata,
+ 		       struct inode *inode, struct buffer_head *bh,
+ 		       int blocknr)
+ {
+ 	int err;
+ 
+ 	BUFFER_TRACE(bh, "enter");
+ 
+ 	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
+ 		  "data mode %lx\n",
+ 		  bh, is_metadata, inode->i_mode,
+ 		  test_opt(inode->i_sb, DATA_FLAGS));
+ 
+ 	/*
+ 	 * A revoke is issued only when the filesystem is not doing full
+ 	 * data journaling (there is no need to, and a V1 superblock won't
+ 	 * support it) and the block is metadata or journaled data.
+ 	 */
+ 	if (test_opt(inode->i_sb, DATA_FLAGS) != EXT3_MOUNT_JOURNAL_DATA &&
+ 	    (is_metadata || ext3_should_journal_data(inode))) {
+ 		BUFFER_TRACE(bh, "call ext3_journal_revoke");
+ 		err = ext3_journal_revoke(handle, blocknr, bh);
+ 		if (err)
+ 			ext3_abort(inode->i_sb, __FUNCTION__,
+ 				   "error %d when attempting revoke", err);
+ 		BUFFER_TRACE(bh, "exit");
+ 		return err;
+ 	}
+ 
+ 	/* data==journal || (!is_metadata && !should_journal_data(inode)):
+ 	 * just forget the buffer, provided it is still in memory. */
+ 	if (bh) {
+ 		BUFFER_TRACE(bh, "call journal_forget");
+ 		ext3_journal_forget(handle, bh);
+ 	}
+ 	return 0;
+ }
+ 
+ /* 
+  * Truncate transactions can be complex and absolutely huge.  So we need to
+  * be able to restart the transaction at a convenient checkpoint to make
+  * sure we don't overflow the journal.
+  *
+  * start_transaction gets us a new handle for a truncate transaction,
+  * and extend_transaction tries to extend the existing one a bit.  If
+  * extend fails, we need to propagate the failure up and restart the
+  * transaction in the top-level truncate loop. --sct 
+  */
+ 
+ static handle_t *start_transaction(struct inode *inode)
+ {
+ 	/* Size the handle from the inode's block count, up to a fixed cap. */
+ 	long nblocks = inode->i_blocks;
+ 	handle_t *handle;
+ 
+ 	if (nblocks > EXT3_MAX_TRANS_DATA)
+ 		nblocks = EXT3_MAX_TRANS_DATA;
+ 	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS + nblocks);
+ 
+ 	/* On failure, report the error here; the ERR_PTR value is still
+ 	 * handed back so that the caller can test it with IS_ERR(). */
+ 	if (IS_ERR(handle))
+ 		ext3_std_error(inode->i_sb, PTR_ERR(handle));
+ 	return handle;
+ }
+ 
+ /*
+  * Try to extend this transaction for the purposes of truncation.
+  *
+  * Returns 0 if we managed to create more room.  If we can't create more
+  * room, and the transaction must be restarted we return 1.
+  */
+ static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
+ {
+ 	long extra = inode->i_blocks;
+ 
+ 	/* More than the reserve is still available: nothing to do. */
+ 	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
+ 		return 0;
+ 	if (extra > EXT3_MAX_TRANS_DATA)
+ 		extra = EXT3_MAX_TRANS_DATA;
+ 	/* A non-zero result from the extend means "restart needed" (1). */
+ 	return ext3_journal_extend(handle,
+ 				   EXT3_RESERVE_TRANS_BLOCKS + extra) != 0;
+ }
+ 
+ /*
+  * Restart the transaction associated with *handle.  This does a commit,
+  * so before we call here everything must be consistently dirtied against
+  * this transaction.
+  */
+ static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
+ {
+ 	long nblocks = inode->i_blocks;	/* capped at EXT3_MAX_TRANS_DATA */
+ 
+ 	nblocks = nblocks > EXT3_MAX_TRANS_DATA ? EXT3_MAX_TRANS_DATA : nblocks;
+ 	jbd_debug(2, "restarting handle %p\n", handle);
+ 	return ext3_journal_restart(handle, EXT3_DATA_TRANS_BLOCKS + nblocks);
+ }
+ 
+ /*
+  * Called at each iput()
+  */
+ void ext3_put_inode (struct inode * inode)
+ {
+ 	ext3_discard_prealloc (inode);	/* give back any preallocated blocks */
+ }
+ 
+ /*
+  * Called at the last iput() if i_nlink is zero.
+  */
+ void ext3_delete_inode (struct inode * inode)
+ {
+ 	handle_t *handle;
+ 	
+ 	if (is_bad_inode(inode) ||
+ 	    inode->i_ino == EXT3_ACL_IDX_INO ||
+ 	    inode->i_ino == EXT3_ACL_DATA_INO)
+ 		goto no_delete;
+ 
+ 	lock_kernel();
+ 	handle = start_transaction(inode);
+ 	if (IS_ERR(handle)) {
+ 		/* If we're going to skip the normal cleanup, we still
+ 		 * need to make sure that the in-core orphan linked list
+ 		 * is properly cleaned up.  The failure itself has already
+ 		 * been reported by start_transaction(), so it is not
+ 		 * passed to ext3_std_error() a second time here. */
+ 		ext3_orphan_del(NULL, inode);
+ 		unlock_kernel();
+ 		goto no_delete;
+ 	}
+ 	
+ 	if (IS_SYNC(inode))
+ 		handle->h_sync = 1;
+ 	inode->i_size = 0;
+ 	if (inode->i_blocks)
+ 		ext3_truncate(inode);
+ 	/*
+ 	 * Kill off the orphan record which ext3_truncate created.
+ 	 * AKPM: I think this can be inside the above `if'.
+ 	 * Note that ext3_orphan_del() has to be able to cope with the
+ 	 * deletion of a non-existent orphan - this is because we don't
+ 	 * know if ext3_truncate() actually created an orphan record.
+ 	 * (Well, we could do this if we need to, but heck - it works)
+ 	 */
+ 	ext3_orphan_del(handle, inode);
+ 	inode->u.ext3_i.i_dtime	= CURRENT_TIME;
+ 
+ 	/* 
+ 	 * One subtle ordering requirement: if anything has gone wrong
+ 	 * (transaction abort, IO errors, whatever), then we can still
+ 	 * do these next steps (the fs will already have been marked as
+ 	 * having errors), but we can't free the inode if the mark_dirty
+ 	 * fails.  
+ 	 */
+ 	if (ext3_mark_inode_dirty(handle, inode))
+ 		/* If that failed, just do the required in-core inode clear. */
+ 		clear_inode(inode);
+ 	else
+ 		ext3_free_inode(handle, inode);
+ 	ext3_journal_stop(handle, inode);
+ 	unlock_kernel();
+ 	return;
+ no_delete:
+ 	clear_inode(inode);	/* We must guarantee clearing of inode... */
+ }
+ 
+ void ext3_discard_prealloc (struct inode * inode)
+ {
+ #ifdef EXT3_PREALLOCATE	/* without it this function does nothing */
+ 	lock_kernel();
+ 	/* Writer: ->i_prealloc* */
+ 	if (inode->u.ext3_i.i_prealloc_count) {
+ 		unsigned short total = inode->u.ext3_i.i_prealloc_count;
+ 		unsigned long block = inode->u.ext3_i.i_prealloc_block;
+ 		inode->u.ext3_i.i_prealloc_count = 0;
+ 		inode->u.ext3_i.i_prealloc_block = 0;
+ 		/* Writer: end - the range is cleared before it is freed */
+ 		/* NOTE(review): no handle is passed below - confirm signature */
+ 		ext3_free_blocks (inode, block, total);
+ 	}
+ 	unlock_kernel();
+ #endif
+ }
+ 
+ static int ext3_alloc_block (handle_t *handle,
+ 			struct inode * inode, unsigned long goal, int *err)
+ {
+ #ifdef EXT3FS_DEBUG
+ 	static unsigned long alloc_hits = 0, alloc_attempts = 0;
+ #endif
+ 	unsigned long result;	/* block from ext3_new_block or prealloc */
+ 
+ #ifdef EXT3_PREALLOCATE
+ 	/* Writer: ->i_prealloc* */
+ 	if (inode->u.ext3_i.i_prealloc_count &&
+ 	    (goal == inode->u.ext3_i.i_prealloc_block ||
+ 	     goal + 1 == inode->u.ext3_i.i_prealloc_block)) {
+ 		/* The goal matches the preallocated range: take from it. */
+ 		result = inode->u.ext3_i.i_prealloc_block++;
+ 		inode->u.ext3_i.i_prealloc_count--;
+ 		/* Writer: end */
+ 		ext3_debug ("preallocation hit (%lu/%lu).\n",
+ 			    ++alloc_hits, ++alloc_attempts);
+ 	} else {
+ 		ext3_discard_prealloc (inode);
+ 		ext3_debug ("preallocation miss (%lu/%lu).\n",
+ 			    alloc_hits, ++alloc_attempts);
+ 		if (S_ISREG(inode->i_mode))	/* handle first, as in #else */
+ 			result = ext3_new_block (handle, inode, goal,
+ 				 &inode->u.ext3_i.i_prealloc_count,
+ 				 &inode->u.ext3_i.i_prealloc_block, err);
+ 		else
+ 			result = ext3_new_block (handle, inode, goal, 0, 0, err);
+ 		/*
+ 		 * AKPM: this is somewhat sticky.  I'm not surprised it was
+ 		 * disabled in 2.2's ext3.  Need to integrate b_committed_data
+ 		 * guarding with preallocation, if indeed preallocation is
+ 		 * effective.
+ 		 */
+ 	}
+ #else
+ 	result = ext3_new_block (handle, inode, goal, 0, 0, err);
+ #endif
+ 	return result;
+ }
+ 
+ 
+ typedef struct {	/* one step of a block-pointer chain */
+ 	u32	*p;	/* address the block number was read from */
+ 	u32	key;	/* the block number as it was found at *p */
+ 	struct buffer_head *bh;	/* buffer hosting *p (see ext3_get_branch) */
+ } Indirect;
+ 
+ static inline void add_chain(Indirect *p, struct buffer_head *bh, u32 *v)
+ {
+ 	p->key = *(p->p = v);	/* remember the address v and the value at it */
+ 	p->bh = bh;
+ }
+ 
+ static inline int verify_chain(Indirect *from, Indirect *to)
+ {
+ 	for (; from <= to && from->key == *from->p; from++)
+ 		/* this link still holds the value recorded for it */;
+ 	return from > to;
+ }
+ 
+ /**
+  *	ext3_block_to_path - parse the block number into array of offsets
+  *	@inode: inode in question (we are only interested in its superblock)
+  *	@i_block: block number to be parsed
+  *	@offsets: array to store the offsets in
+  *
+  *	To store the locations of file's data ext3 uses a data structure common
+  *	for UNIX filesystems - tree of pointers anchored in the inode, with
+  *	data blocks at leaves and indirect blocks in intermediate nodes.
+  *	This function translates the block number into path in that tree -
+  *	return value is the path length and @offsets[n] is the offset of
+  *	pointer to (n+1)th node in the nth one. If @i_block is out of range
+  *	(negative or too large) warning is printed and zero returned.
+  *
+  *	Note: function doesn't find node addresses, so no IO is needed. All
+  *	we need to know is the capacity of indirect blocks (taken from the
+  *	inode->i_sb).
+  */
+ 
+ /*
+  * Portability note: the last comparison (check that we fit into triple
+  * indirect block) is spelled differently, because otherwise on an
+  * architecture with 32-bit longs and 8Kb pages we might get into trouble
+  * if our filesystem had 8Kb blocks. We might use long long, but that would
+  * kill us on x86. Oh, well, at least the sign propagation does not matter -
+  * i_block would have to be negative in the very beginning, so we would not
+  * get there at all.
+  */
+ 
+ static int ext3_block_to_path(struct inode *inode, long i_block, int offsets[4])
+ {
+ 	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+ 	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
+ 	const long direct_blocks = EXT3_NDIR_BLOCKS,
+ 		indirect_blocks = ptrs,
+ 		double_blocks = (1 << (ptrs_bits * 2));
+ 	int n = 0;	/* path length so far; stays 0 if i_block is invalid */
+ 
+ 	if (i_block < 0) {
+ 		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
+ 	} else if (i_block < direct_blocks) {	/* direct: one offset */
+ 		offsets[n++] = i_block;
+ 	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
+ 		offsets[n++] = EXT3_IND_BLOCK;	/* single indirect */
+ 		offsets[n++] = i_block;
+ 	} else if ((i_block -= indirect_blocks) < double_blocks) {
+ 		offsets[n++] = EXT3_DIND_BLOCK;	/* double indirect */
+ 		offsets[n++] = i_block >> ptrs_bits;
+ 		offsets[n++] = i_block & (ptrs - 1);
+ 	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
+ 		offsets[n++] = EXT3_TIND_BLOCK;	/* triple indirect */
+ 		offsets[n++] = i_block >> (ptrs_bits * 2);
+ 		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
+ 		offsets[n++] = i_block & (ptrs - 1);
+ 	} else {
+ 		ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
+ 	}
+ 	return n;	/* number of entries stored in offsets[] */
+ }
+ 
+ /**
+  *	ext3_get_branch - read the chain of indirect blocks leading to data
+  *	@inode: inode in question
+  *	@depth: depth of the chain (1 - direct pointer, etc.)
+  *	@offsets: offsets of pointers in inode/indirect blocks
+  *	@chain: place to store the result
+  *	@err: here we store the error value
+  *
+  *	Function fills the array of triples <key, p, bh> and returns %NULL
+  *	if everything went OK or the pointer to the last filled triple
+  *	(incomplete one) otherwise. Upon the return chain[i].key contains
+  *	the number of (i+1)-th block in the chain (as it is stored in memory,
+  *	i.e. little-endian 32-bit), chain[i].p contains the address of that
+  *	number (it points into struct inode for i==0 and into the bh->b_data
+  *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
+  *	block for i>0 and NULL for i==0. In other words, it holds the block
+  *	numbers of the chain, addresses they were taken from (and where we can
+  *	verify that chain did not change) and buffer_heads hosting them.
+  *
+  *	Function stops when it stumbles upon zero pointer (absent block)
+  *		(pointer to last triple returned, *@err == 0)
+  *	or when it gets an IO error reading an indirect block
+  *		(ditto, *@err == -EIO)
+  *	or when it notices that chain had been changed while it was reading
+  *		(ditto, *@err == -EAGAIN; the block just read is released)
+  *	or when it reads all @depth-1 indirect blocks successfully and finds
+  *	the whole chain, all way to the data (returns %NULL, *err == 0).
+  */
+ static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
+ 				 Indirect chain[4], int *err)
+ {
+ 	kdev_t dev = inode->i_dev;
+ 	int blocksize = inode->i_sb->s_blocksize;
+ 	Indirect *p = chain;	/* last triple filled so far */
+ 	struct buffer_head *bh;
+ 
+ 	*err = 0;
+ 	/* i_data is not going away, no lock needed */
+ 	add_chain (chain, NULL, inode->u.ext3_i.i_data + *offsets);
+ 	if (!p->key)
+ 		goto no_block;
+ 	while (--depth) {	/* one pass per indirect block on the path */
+ 		bh = bread(dev, le32_to_cpu(p->key), blocksize);
+ 		if (!bh)
+ 			goto failure;
+ 		/* Reader: pointers */
+ 		if (!verify_chain(chain, p))
+ 			goto changed;
+ 		add_chain(++p, bh, (u32*)bh->b_data + *++offsets);
+ 		/* Reader: end */
+ 		if (!p->key)
+ 			goto no_block;
+ 	}
+ 	return NULL;
+ 
+ changed:
+ 	brelse(bh);	/* bh never made it into chain[], so the caller cannot release it */
+ 	*err = -EAGAIN;
+ 	goto no_block;
+ failure:
+ 	*err = -EIO;
+ no_block:
+ 	return p;
+ }
+ 
+ /**
+  *	ext3_find_near - find a place for allocation with sufficient locality
+  *	@inode: owner
+  *	@ind: descriptor of indirect block.
+  *
+  *	This function returns the preferred place for block allocation.
+  *	It is used when heuristic for sequential allocation fails.
+  *	Rules are:
+  *	  + if there is a block to the left of our position - allocate near it.
+  *	  + if pointer will live in indirect block - allocate near that block.
+  *	  + if pointer will live in inode - allocate in the same
+  *	    cylinder group.
+  *	Caller must make sure that @ind is valid and will stay that way.
+  */
+ 
+ static inline unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
+ {
+ 	u32 *start = ind->bh ? (u32*) ind->bh->b_data : inode->u.ext3_i.i_data;	/* first slot of the pointer array holding ind->p */
+ 	u32 *p;
+ 
+ 	/* Try to find previous block */
+ 	for (p = ind->p - 1; p >= start; p--)
+ 		if (*p)
+ 			return le32_to_cpu(*p);
+ 
+ 	/* No such thing, so let's try location of indirect block */
+ 	if (ind->bh)
+ 		return ind->bh->b_blocknr;
+ 
+ 	/*
+ 	 * It is going to be referred from inode itself? OK, just put it into
+ 	 * the same cylinder group then.
+ 	 */
+ 	return (inode->u.ext3_i.i_block_group * 
+ 		EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
+ 	       le32_to_cpu(inode->i_sb->u.ext3_sb.s_es->s_first_data_block);
+ }
+ 
+ /**
+  *	ext3_find_goal - find a preferred place for allocation.
+  *	@inode: owner
+  *	@block:  block we want
+  *	@chain:  chain of indirect blocks
+  *	@partial: pointer to the last triple within a chain
+  *	@goal:	place to store the result; its incoming value is read, see below.
+  *
+  *	Normally this function finds the preferred place for block allocation,
+  *	stores it in *@goal and returns zero. If the branch had been changed
+  *	under us we return -EAGAIN.
+  */
+ 
+ static int ext3_find_goal(struct inode *inode, long block, Indirect chain[4],
+ 			  Indirect *partial, unsigned long *goal)
+ {
+ 	/* Writer: ->i_next_alloc* */
+ 	if (block == inode->u.ext3_i.i_next_alloc_block + 1) {
+ 		inode->u.ext3_i.i_next_alloc_block++;
+ 		inode->u.ext3_i.i_next_alloc_goal++;
+ 	}
+ #ifdef SEARCH_FROM_ZERO
+ 	inode->u.ext3_i.i_next_alloc_block = 0;
+ 	inode->u.ext3_i.i_next_alloc_goal = 0;
+ #endif
+ 	/* Writer: end */
+ 	/* Reader: pointers, ->i_next_alloc* */
+ 	if (verify_chain(chain, partial)) {
+ 		/*
+ 		 * try the heuristic for sequential allocation,
+ 		 * failing that at least try to get decent locality.
+ 		 */
+ 		if (block == inode->u.ext3_i.i_next_alloc_block)
+ 			*goal = inode->u.ext3_i.i_next_alloc_goal;
+ 		if (!*goal)	/* reads the caller-supplied value when the branch above was not taken */
+ 			*goal = ext3_find_near(inode, partial);
+ #ifdef SEARCH_FROM_ZERO
+ 		*goal = 0;
+ #endif
+ 		return 0;
+ 	}
+ 	/* Reader: end */
+ 	return -EAGAIN;
+ }
+ 
+ /**
+  *	ext3_alloc_branch - allocate and set up a chain of blocks.
+  *	@inode: owner
+  *	@num: depth of the chain (number of blocks to allocate)
+  *	@offsets: offsets (in the blocks) to store the pointers to next.
+  *	@branch: place to store the chain in.
+  *
+  *	This function allocates @num blocks, zeroes out all but the last one,
+  *	links them into chain and (if we are synchronous) writes them to disk.
+  *	In other words, it prepares a branch that can be spliced onto the
+  *	inode. It stores the information about that chain in the branch[], in
+  *	the same format as ext3_get_branch() would do. We are calling it after
+  *	we had read the existing part of chain and partial points to the last
+  *	triple of that (one with zero ->key). Upon the exit we have the same
+  *	picture as after the successful ext3_get_block(), except that in one
+  *	place chain is disconnected - *branch->p is still zero (we did not
+  *	set the last link), but branch->key contains the number that should
+  *	be placed into *branch->p to fill that gap.
+  *
+  *	If allocation fails we free all blocks we've allocated (and forget
+  *	their buffer_heads) and return the error value from the failed
+  *	ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
+  *	as described above and return 0.
+  */
+ 
+ static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
+ 			     int num,
+ 			     unsigned long goal,
+ 			     int *offsets,
+ 			     Indirect *branch)
+ {
+ 	int blocksize = inode->i_sb->s_blocksize;
+ 	int n = 0, keys = 0;	/* keys: how many branch[].key entries hold an allocated block */
+ 	int err = 0;
+ 	int i;
+ 	int parent = ext3_alloc_block(handle, inode, goal, &err);
+ 
+ 	branch[0].key = cpu_to_le32(parent);
+ 	if (parent) {
+ 		for (n = 1; n < num; n++) {
+ 			struct buffer_head *bh;
+ 			/* Allocate the next block */
+ 			int nr = ext3_alloc_block(handle, inode, parent, &err);
+ 			if (!nr)
+ 				break;
+ 			branch[n].key = cpu_to_le32(nr);
+ 			keys = n+1;
+ 			
+ 			/*
+ 			 * Get buffer_head for parent block, zero it out
+ 			 * and set the pointer to new one, then send
+ 			 * parent to disk.
+ 			 */
+ 			bh = getblk(inode->i_dev, parent, blocksize);
+ 			branch[n].bh = bh;
+ 			lock_buffer(bh);
+ 			BUFFER_TRACE(bh, "call get_create_access");
+ 			err = ext3_journal_get_create_access(handle, bh);
+ 			if (err) {
+ 				unlock_buffer(bh);
+ 				brelse(bh);	/* NOTE(review): cleanup below still passes branch[n].bh to ext3_journal_forget() - confirm refcount */
+ 				break;
+ 			}
+ 
+ 			memset(bh->b_data, 0, blocksize);
+ 			branch[n].p = (u32*) bh->b_data + offsets[n];
+ 			*branch[n].p = branch[n].key;
+ 			BUFFER_TRACE(bh, "marking uptodate");
+ 			mark_buffer_uptodate(bh, 1);
+ 			unlock_buffer(bh);
+ 
+ 			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 			err = ext3_journal_dirty_metadata(handle, bh);
+ 			if (err)
+ 				break;
+ 			
+ 			parent = nr;
+ 		}
+ 		if (IS_SYNC(inode))
+ 			handle->h_sync = 1;
+ 	}
+ 	if (n == num)	/* loop ran to completion: every block was allocated and linked */
+ 		return 0;
+ 
+ 	/* Allocation failed, free what we already allocated */
+ 	for (i = 1; i < keys; i++) {	/* branch[0].bh is not set by this function, so start at 1 */
+ 		BUFFER_TRACE(branch[i].bh, "call journal_forget");
+ 		ext3_journal_forget(handle, branch[i].bh);
+ 	}
+ 	for (i = 0; i < keys; i++)
+ 		ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
+ 	return err;
+ }
+ 
+ /**
+  *	ext3_splice_branch - splice the allocated branch onto inode.
+  *	@inode: owner
+  *	@block: (logical) number of block we are adding
+  *	@chain: chain of indirect blocks (with a missing link - see
+  *		ext3_alloc_branch)
+  *	@where: location of missing link
+  *	@num:   number of blocks we are adding
+  *
+  *	This function verifies that chain (up to the missing link) had not
+  *	changed, fills the missing link and does all housekeeping needed in
+  *	inode (->i_blocks, etc.). In case of success we end up with the full
+  *	chain to new block and return 0. Otherwise (== chain had been changed)
+  *	we free the new blocks (forgetting their buffer_heads, indeed) and
+  *	return -EAGAIN.
+  */
+ 
+ static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
+ 			      Indirect chain[4], Indirect *where, int num)
+ {
+ 	int i;
+ 	int err = 0;
+ 
+ 	/*
+ 	 * If we're splicing into a [td]indirect block (as opposed to the
+ 	 * inode) then we need to get write access to the [td]indirect block
+ 	 * before the splice.
+ 	 */
+ 	if (where->bh) {
+ 		BUFFER_TRACE(where->bh, "get_write_access");
+ 		err = ext3_journal_get_write_access(handle, where->bh);
+ 		if (err)
+ 			goto err_out;
+ 	}
+ 	/* Verify that place we are splicing to is still there and vacant */
+ 
+ 	/* Writer: pointers, ->i_next_alloc* */
+ 	if (!verify_chain(chain, where-1) || *where->p)
+ 		/* Writer: end */
+ 		goto changed;
+ 
+ 	/* That's it */
+ 
+ 	*where->p = where->key;	/* fill in the missing link */
+ 	inode->u.ext3_i.i_next_alloc_block = block;
+ 	inode->u.ext3_i.i_next_alloc_goal = le32_to_cpu(where[num-1].key);	/* key of the last of the @num new blocks */
+ #ifdef SEARCH_FROM_ZERO
+ 	inode->u.ext3_i.i_next_alloc_block = 0;
+ 	inode->u.ext3_i.i_next_alloc_goal = 0;
+ #endif
+ 	/* Writer: end */
+ 
+ 	/* We are done with atomic stuff, now do the rest of housekeeping */
+ 
+ 	inode->i_ctime = CURRENT_TIME;
+ 	ext3_mark_inode_dirty(handle, inode);	/* return value is not checked here */
+ 
+ 	/* had we spliced it onto indirect block? */
+ 	if (where->bh) {
+ 		/*
+ 		 * akpm: If we spliced it onto an indirect block, we haven't
+ 		 * altered the inode.  Note however that if it is being spliced
+ 		 * onto an indirect block at the very end of the file (the
+ 		 * file is growing) then we *will* alter the inode to reflect
+ 		 * the new i_size.  But that is not done here - it is done in
+ 		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
+ 		 */
+ 		jbd_debug(5, "splicing indirect only\n");
+ 		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
+ 		err = ext3_journal_dirty_metadata(handle, where->bh);
+ 		if (err) 
+ 			goto err_out;
+ 	} else {
+ 		/*
+ 		 * OK, we spliced it into the inode itself on a direct block.
+ 		 * Inode was dirtied above.
+ 		 */
+ 		jbd_debug(5, "splicing direct\n");
+ 	}
+ 	return err;
+ 
+ changed:
+ 	/*
+ 	 * AKPM: if where[i].bh isn't part of the current updating
+ 	 * transaction then we explode nastily.  Test this code path.
+ 	 */
+ 	jbd_debug(1, "the chain changed: try again\n");
+ 	err = -EAGAIN;
+ 	
+ err_out:
+ 	for (i = 1; i < num; i++) {	/* where[0].bh is the block being spliced into, not a new one */
+ 		BUFFER_TRACE(where[i].bh, "call journal_forget");
+ 		ext3_journal_forget(handle, where[i].bh);
+ 	}
+ 	/* For the normal collision cleanup case, we free up the blocks.
+ 	 * On genuine filesystem errors we don't even think about doing
+ 	 * that. */
+ 	if (err == -EAGAIN)
+ 		for (i = 0; i < num; i++)
+ 			ext3_free_blocks(handle, inode, 
+ 					 le32_to_cpu(where[i].key), 1);
+ 	return err;
+ }
+ 
+ /*
+  * Allocation strategy is simple: if we have to allocate something, we will
+  * have to go the whole way to leaf. So let's do it before attaching anything
+  * to tree, set linkage between the newborn blocks, write them if sync is
+  * required, recheck the path, free and repeat if check fails, otherwise
+  * set the last missing link (that will protect us from any truncate-generated
+  * removals - all blocks on the path are immune now) and possibly force the
+  * write on the parent block.
+  * That has a nice additional property: no special recovery from the failed
+  * allocations is needed - we simply release blocks and do not touch anything
+  * reachable from inode.
+  * Returns 0 (BH_Mapped set in bh_result if a block exists) or a negative errno.
+  * akpm: `handle' can be NULL if create == 0.
+  */
+ 
+ static int ext3_get_block_handle(handle_t *handle, struct inode *inode,
+ 				 long iblock,
+ 				 struct buffer_head *bh_result, int create)
+ {
+ 	int err = -EIO;	/* returned as-is when ext3_block_to_path() rejects iblock */
+ 	int offsets[4];
+ 	Indirect chain[4];
+ 	Indirect *partial;
+ 	unsigned long goal;
+ 	int left;
+ 	int depth = ext3_block_to_path(inode, iblock, offsets);
+ 	loff_t new_size;
+ 
+ 	J_ASSERT(handle != NULL || create == 0);
+ 
+ 	if (depth == 0)
+ 		goto out;
+ 
+ 	lock_kernel();
+ reread:
+ 	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
+ 
+ 	/* Simplest case - block found, no allocation needed */
+ 	if (!partial) {
+ 		bh_result->b_state &= ~(1UL << BH_New);
+ got_it:
+ 		bh_result->b_dev = inode->i_dev;
+ 		bh_result->b_blocknr = le32_to_cpu(chain[depth-1].key);
+ 		bh_result->b_state |= (1UL << BH_Mapped);
+ 		/* Clean up and exit */
+ 		partial = chain+depth-1; /* the whole chain */
+ 		goto cleanup;
+ 	}
+ 
+ 	/* Next simple case - plain lookup or failed read of indirect block */
+ 	if (!create || err == -EIO) {
+ cleanup:
+ 		while (partial > chain) {	/* chain[0] has no buffer_head to release */
+ 			BUFFER_TRACE(partial->bh, "call brelse");
+ 			brelse(partial->bh);
+ 			partial--;
+ 		}
+ 		BUFFER_TRACE(bh_result, "returned");
+ 		unlock_kernel();
+ out:
+ 		return err;
+ 	}
+ 
+ 	/*
+ 	 * Indirect block might be removed by truncate while we were
+ 	 * reading it. Handling of that case (forget what we've got and
+ 	 * reread) is taken out of the main path.
+ 	 */
+ 	if (err == -EAGAIN)
+ 		goto changed;
+ 	goal = 0;	/* ext3_find_goal() reads *goal: never hand it a stale or uninitialised value */
+ 	if (ext3_find_goal(inode, iblock, chain, partial, &goal) < 0)
+ 		goto changed;
+ 
+ 	left = (chain + depth) - partial;	/* number of blocks still missing on the path */
+ 
+ 	/*
+ 	 * Block out ext3_truncate while we alter the tree
+ 	 */
+ 	down_read(&inode->u.ext3_i.truncate_sem);
+ 	err = ext3_alloc_branch(handle, inode, left, goal,
+ 					offsets+(partial-chain), partial);
+ 
+ 	/* The ext3_splice_branch call will free and forget any buffers
+ 	 * on the new chain if there is a failure, but that risks using
+ 	 * up transaction credits, especially for bitmaps where the
+ 	 * credits cannot be returned.  Can we handle this somehow?  We
+ 	 * may need to return -EAGAIN upwards in the worst case.  --sct */
+ 	if (!err)
+ 		err = ext3_splice_branch(handle, inode, iblock, chain,
+ 					 partial, left);
+ 	up_read(&inode->u.ext3_i.truncate_sem);
+ 	if (err == -EAGAIN)
+ 		goto changed;
+ 	if (err)
+ 		goto cleanup;
+ 
+ 	new_size = inode->i_size;
+ 	/*
+ 	 * This is not racy against ext3_truncate's modification of i_disksize
+ 	 * because VM/VFS ensures that the file cannot be extended while
+ 	 * truncate is in progress.  It is racy between multiple parallel
+ 	 * instances of get_block, but we have the BKL.
+ 	 */
+ 	if (new_size > inode->u.ext3_i.i_disksize)
+ 		inode->u.ext3_i.i_disksize = new_size;
+ 
+ 	bh_result->b_state |= (1UL << BH_New);
+ 	goto got_it;
+ 
+ changed:
+ 	while (partial > chain) {
+ 		jbd_debug(1, "buffer chain changed, retrying\n");
+ 		BUFFER_TRACE(partial->bh, "brelsing");
+ 		brelse(partial->bh);
+ 		partial--;
+ 	}
+ 	goto reread;
+ }
+ 
+ static int ext3_get_block(struct inode *inode, long iblock,
+ 			struct buffer_head *bh_result, int create)
+ {
+ 	handle_t *handle = 0;	/* stays NULL for plain lookups (create == 0) */
+ 	int ret;
+ 
+ 	if (create) {
+ 		handle = ext3_journal_current_handle();	/* caller must already have a handle open */
+ 		J_ASSERT(handle != 0);
+ 	}
+ 	ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create);
+ 	return ret;
+ }
+ 
+ /*
+  * Map (and with @create, allocate) @block and return its buffer_head, or NULL with the status in *@errp; `handle' can be NULL if create is zero
+  */
+ struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
+ 				long block, int create, int * errp)
+ {
+ 	struct buffer_head dummy;	/* on-stack stand-in that only receives the mapping */
+ 	int fatal = 0, err;
+ 	
+ 	J_ASSERT(handle != NULL || create == 0);
+ 
+ 	dummy.b_state = 0;
+ 	dummy.b_blocknr = -1000;	/* presumably a recognisable "not mapped" marker */
+ 	buffer_trace_init(&dummy.b_history);
+ 	*errp = ext3_get_block_handle(handle, inode, block, &dummy, create);
+ 	if (!*errp && buffer_mapped(&dummy)) {
+ 		struct buffer_head *bh;
+ 		bh = getblk(dummy.b_dev, dummy.b_blocknr,
+ 					inode->i_sb->s_blocksize);
+ 		if (buffer_new(&dummy)) {
+ 			J_ASSERT(create != 0);
+ 			J_ASSERT(handle != 0);
+ 
+ 			/* Now that we do not always journal data, we
+ 			   should keep in mind whether this should
+ 			   always journal the new buffer as metadata.
+ 			   For now, regular file writes use
+ 			   ext3_get_block instead, so it's not a
+ 			   problem. */
+ 			lock_kernel();
+ 			lock_buffer(bh);
+ 			BUFFER_TRACE(bh, "call get_create_access");
+ 			fatal = ext3_journal_get_create_access(handle, bh);
+ 			if (!fatal) {
+ 				memset(bh->b_data, 0,
+ 				       inode->i_sb->s_blocksize);
+ 				mark_buffer_uptodate(bh, 1);
+ 			}
+ 			unlock_buffer(bh);
+ 			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 			err = ext3_journal_dirty_metadata(handle, bh);	/* called even when get_create_access failed */
+ 			if (!fatal) fatal = err;	/* the first error wins */
+ 			unlock_kernel();
+ 		} else {
+ 			BUFFER_TRACE(bh, "not a new buffer");
+ 		}
+ 		if (fatal) {
+ 			*errp = fatal;
+ 			brelse(bh);
+ 			bh = NULL;
+ 		}
+ 		return bh;
+ 	}
+ 	return NULL;	/* lookup error, or a hole: the two are told apart by *errp */
+ }
+ 
+ struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
+ 			       int block, int create, int *err)	/* NOTE(review): block is int here, long in ext3_getblk() */
+ {
+ 	struct buffer_head * bh;
+ 	int prev_blocks;
+ 
+ 	prev_blocks = inode->i_blocks;	/* snapshot; only consulted under EXT3_PREALLOCATE */
+ 
+ 	bh = ext3_getblk (handle, inode, block, create, err);
+ 	if (!bh)
+ 		return bh;
+ #ifdef EXT3_PREALLOCATE
+ 	/*
+ 	 * If the inode has grown, and this is a directory, then use a few
+ 	 * more of the preallocated blocks to keep directory fragmentation
+ 	 * down.  The preallocated blocks are guaranteed to be contiguous.
+ 	 */
+ 	if (create &&
+ 	    S_ISDIR(inode->i_mode) &&
+ 	    inode->i_blocks > prev_blocks &&
+ 	    EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
+ 				    EXT3_FEATURE_COMPAT_DIR_PREALLOC)) {
+ 		int i;
+ 		struct buffer_head *tmp_bh;
+ 
+ 		for (i = 1;
+ 		     inode->u.ext3_i.i_prealloc_count &&
+ 		     i < EXT3_SB(inode->i_sb)->s_es->s_prealloc_dir_blocks;
+ 		     i++) {
+ 			/*
+ 			 * ext3_getblk will zero out the contents of the
+ 			 * directory for us
+ 			 */
+ 			tmp_bh = ext3_getblk(handle, inode,
+ 						block+i, create, err);
+ 			if (!tmp_bh) {
+ 				brelse (bh);
+ 				return 0;
+ 			}
+ 			brelse (tmp_bh);
+ 		}
+ 	}
+ #endif
+ 	if (buffer_uptodate(bh))
+ 		return bh;
+ 	ll_rw_block (READ, 1, &bh);	/* not up to date: read it in and wait */
+ 	wait_on_buffer (bh);
+ 	if (buffer_uptodate(bh))
+ 		return bh;
+ 	brelse (bh);
+ 	*err = -EIO;
+ 	return NULL;
+ }
+ 
+ static int walk_page_buffers(	handle_t *handle,	/* call fn on each buffer overlapping [from, to) */
+ 				struct buffer_head *head,
+ 				unsigned from,
+ 				unsigned to,
+ 				int *partial,	/* may be NULL; set to 1 if a skipped buffer is not uptodate */
+ 				int (*fn)(	handle_t *handle,
+ 						struct buffer_head *bh))
+ {
+ 	struct buffer_head *bh;
+ 	unsigned block_start, block_end;
+ 	unsigned blocksize = head->b_size;
+ 	int err, ret = 0;
+ 
+ 	for (	bh = head, block_start = 0;
+ 		ret == 0 && (bh != head || !block_start);	/* stop on first error or when b_this_page leads back to head */
+ 	    	block_start = block_end, bh = bh->b_this_page)
+ 	{
+ 		block_end = block_start + blocksize;
+ 		if (block_end <= from || block_start >= to) {
+ 			if (partial && !buffer_uptodate(bh))
+ 				*partial = 1;
+ 			continue;
+ 		}
+ 		err = (*fn)(handle, bh);
+ 		if (!ret)
+ 			ret = err;
+ 	}
+ 	return ret;	/* 0, or the first non-zero value fn returned */
+ }
+ 
+ /*
+  * To preserve ordering, it is essential that the hole instantiation and
+  * the data write be encapsulated in a single transaction.  We cannot
+  * close off a transaction and start a new one between the ext3_get_block()
+  * and the commit_write().  So doing the journal_start at the start of
+  * prepare_write() is the right place.
+  *
+  * Also, this function can nest inside ext3_writepage() ->
+  * block_write_full_page(). In that case, we *know* that ext3_writepage()
+  * has generated enough buffer credits to do the whole page.  So we won't
+  * block on the journal in that case, which is good, because the caller may
+  * be PF_MEMALLOC.
+  *
+  * By accident, ext3 can be reentered when a transaction is open via
+  * quota file writes.  If we were to commit the transaction while thus
+  * reentered, there can be a deadlock - we would be holding a quota
+  * lock, and the commit would never complete if another thread had a
+  * transaction open and was blocking on the quota lock - a ranking
+  * violation.
+  *
+  * So what we do is to rely on the fact that journal_stop/journal_start
+  * will _not_ run commit under these circumstances because handle->h_ref
+  * is elevated.  We'll still have enough credits for the tiny quotafile
+  * write.  
+  */
+ 
+ static int do_journal_get_write_access(handle_t *handle, 
+ 				       struct buffer_head *bh)
+ {
+ 	return ext3_journal_get_write_access(handle, bh);	/* adapter with the fn signature walk_page_buffers() takes */
+ }
+ 
+ static int ext3_prepare_write(struct file *file, struct page *page,
+ 			      unsigned from, unsigned to)
+ {
+ 	struct inode *inode = page->mapping->host;
+ 	handle_t *handle = ext3_journal_current_handle();	/* NOTE(review): overwritten by ext3_journal_start() below */
+ 	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
+ 
+ 	lock_kernel();
+ 	handle = ext3_journal_start(inode, needed_blocks);
+ 	if (IS_ERR(handle)) {
+ 		ret = PTR_ERR(handle);
+ 		goto out;
+ 	}
+ 	ret = block_prepare_write(page, from, to, ext3_get_block);
+ 	if (ret != 0)
+ 		goto prepare_write_failed;
+ 
+ 	if (ext3_should_journal_data(inode))
+ 		ret = walk_page_buffers(handle, page->buffers,
+ 				from, to, NULL, do_journal_get_write_access);
+ prepare_write_failed:
+ 	if (ret)	/* on success the handle is left open; ext3_commit_write() stops it */
+ 		ext3_journal_stop(handle, inode);
+ out:
+ 	unlock_kernel();
+ 	return ret;
+ }
+ 
+ static int journal_dirty_sync_data(handle_t *handle, struct buffer_head *bh)
+ {
+ 	return ext3_journal_dirty_data(handle, bh, 0);	/* third argument 0; the async variant below passes 1 */
+ }
+ 
+ /*
+  * For ext3_writepage().  We also brelse() the buffer to account for
+  * the bget() which ext3_writepage() performs.
+  */
+ static int journal_dirty_async_data(handle_t *handle, struct buffer_head *bh)
+ {
+ 	int ret = ext3_journal_dirty_data(handle, bh, 1);
+ 	__brelse(bh);	/* dropped whether or not the call above failed */
+ 	return ret;
+ }
+ 
+ /* For commit_write() in data=journal mode: walk_page_buffers() callback */
+ static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
+ {
+ 	set_bit(BH_Uptodate, &bh->b_state);	/* mark the buffer up to date before journalling it */
+ 	return ext3_journal_dirty_metadata(handle, bh);
+ }
+ 
+ /*
+  * We need to pick up the new inode size which generic_commit_write gave us
+  * `file' can be NULL - eg, when called from block_symlink().
+  *
+  * ext3 inode->i_dirty_buffers policy:  If we're journalling data we
+  * definitely don't want them to appear on the inode at all - instead
+  * we need to manage them at the JBD layer and we need to intercept
+  * the relevant sync operations and translate them into journal operations.
+  *
+  * If we're not journalling data then we can just leave the buffers
+  * on ->i_dirty_buffers.  If someone writes them out for us then thanks.
+  * Otherwise we'll do it in commit, if we're using ordered data.
+  */
+ 
+ static int ext3_commit_write(struct file *file, struct page *page,
+ 			     unsigned from, unsigned to)
+ {
+ 	handle_t *handle = ext3_journal_current_handle();	/* presumably the handle ext3_prepare_write() started */
+ 	struct inode *inode = page->mapping->host;
+ 	int ret = 0, ret2;
+ 
+ 	lock_kernel();
+ 	if (ext3_should_journal_data(inode)) {
+ 		/*
+ 		 * Here we duplicate the generic_commit_write() functionality
+ 		 */
+ 		int partial = 0;
+ 		loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;	/* file offset just past the written range */
+ 
+ 		ret = walk_page_buffers(handle, page->buffers,
+ 			from, to, &partial, commit_write_fn);
+ 		if (!partial)
+ 			SetPageUptodate(page);
+ 		kunmap(page);
+ 		if (pos > inode->i_size)
+ 			inode->i_size = pos;
+ 		set_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state);	/* tested and cleared by ext3_bmap() */
+ 	} else {
+ 		if (ext3_should_order_data(inode)) {
+ 			ret = walk_page_buffers(handle, page->buffers,
+ 				from, to, NULL, journal_dirty_sync_data);
+ 		}
+ 		/* Be careful here if generic_commit_write becomes a
+ 		 * required invocation after block_prepare_write. */
+ 		if (ret == 0)
+ 			ret = generic_commit_write(file, page, from, to);
+ 	}
+ 	if (inode->i_size > inode->u.ext3_i.i_disksize) {
+ 		inode->u.ext3_i.i_disksize = inode->i_size;
+ 		ret2 = ext3_mark_inode_dirty(handle, inode);
+ 		if (!ret) 
+ 			ret = ret2;
+ 	}
+ 	ret2 = ext3_journal_stop(handle, inode);	/* always stop the handle, even after an earlier error */
+ 	unlock_kernel();
+ 	if (!ret)
+ 		ret = ret2;
+ 	return ret;	/* the first error encountered, or 0 */
+ }
+ 
+ /* 
+  * bmap() is special.  It gets used by applications such as lilo and by
+  * the swapper to find the on-disk block of a specific piece of data.
+  *
+  * Naturally, this is dangerous if the block concerned is still in the
+  * journal.  If somebody makes a swapfile on an ext3 data-journaling
+  * filesystem and enables swap, then they may get a nasty shock when the
+  * data getting swapped to that swapfile suddenly gets overwritten by
+  * the original zero's written out previously to the journal and
+  * awaiting writeback in the kernel's buffer cache. 
+  *
+  * So, if we see any bmap calls here on a modified, data-journaled file,
+  * take extra steps to flush any blocks which might be in the cache. 
+  */
+ static int ext3_bmap(struct address_space *mapping, long block)
+ {
+ 	struct inode *inode = mapping->host;
+ 	journal_t *journal;
+ 	int err;
+ 	
+ 	if (test_and_clear_bit(EXT3_STATE_JDATA, &inode->u.ext3_i.i_state)) {
+ 		/* 
+ 		 * This is a REALLY heavyweight approach, but the use of
+ 		 * bmap on dirty files is expected to be extremely rare:
+ 		 * only if we run lilo or swapon on a freshly made file
+ 		 * do we expect this to happen. 
+ 		 *
+ 		 * (bmap requires CAP_SYS_RAWIO so this does not
+ 		 * represent an unprivileged user DOS attack --- we'd be
+ 		 * in trouble if mortal users could trigger this path at
+ 		 * will.) 
+ 		 *
+ 		 * NB. EXT3_STATE_JDATA is not set on files other than
+ 		 * regular files.  If somebody wants to bmap a directory
+ 		 * or symlink and gets confused because the buffer
+ 		 * hasn't yet been flushed to disk, they deserve
+ 		 * everything they get.
+ 		 */
+ 		
+ 		journal = EXT3_JOURNAL(inode);
+ 		journal_lock_updates(journal);
+ 		err = journal_flush(journal);
+ 		journal_unlock_updates(journal);
+ 		
+ 		if (err)
+ 			return 0;	/* flush failed: report block 0 rather than a mapping */
+ 	}
+ 	
+ 	return generic_block_bmap(mapping,block,ext3_get_block);
+ }
+ 
+ static int bget_one(handle_t *handle, struct buffer_head *bh)	/* walk_page_buffers() callback; handle is unused */
+ {
+ 	atomic_inc(&bh->b_count);	/* take one extra reference on the buffer */
+ 	return 0;
+ }
+ 
+ /*
+  * Note that we always start a transaction even if we're not journalling
+  * data.  This is to preserve ordering: any hole instantiation within
+  * __block_write_full_page -> ext3_get_block() should be journalled
+  * along with the data so we don't crash and then get metadata which
+  * refers to old data.
+  *
+  * In all journalling modes block_write_full_page() will start the I/O.
+  *
+  * Problem:
+  *
+  *	ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
+  *		ext3_writepage()
+  *
+  * Similar for:
+  *
+  *	ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
+  *
+  * Same applies to ext3_get_block().  We will deadlock on various things like
+  * lock_journal and i_truncate_sem.
+  *
+  * Setting PF_MEMALLOC here doesn't work - too many internal memory
+  * allocations fail.
+  *
+  * 16May01: If we're reentered then journal_current_handle() will be
+  *	    non-zero. We simply *return*.
+  *
+  * 1 July 2001: @@@ FIXME:
+  *   In journalled data mode, a data buffer may be metadata against the
+  *   current transaction.  But the same file is part of a shared mapping
+  *   and someone does a writepage() on it.
+  *
+  *   We will move the buffer onto the async_data list, but *after* it has
+  *   been dirtied. So there's a small window where we have dirty data on
+  *   BJ_Metadata.
+  *
+  *   Note that this only applies to the last partial page in the file.  The
+  *   bit which block_write_full_page() uses prepare/commit for.  (That's
+  *   broken code anyway: it's wrong for msync()).
+  *
+  *   It's a rare case: affects the final partial page, for journalled data
+  *   where the file is subject to both write() and writepage() in the same
+  *   transaction.  To fix it we'll need a custom block_write_full_page().
+  *   We'll probably need that anyway for journalling writepage() output.
+  *
+  * We don't honour synchronous mounts for writepage().  That would be
+  * disastrous.  Any write() or metadata operation will sync the fs for
+  * us.
+  */
+ static int ext3_writepage(struct page *page)
+ {
+ 	struct inode *inode = page->mapping->host;
+ 	struct buffer_head *page_buffers;
+ 	handle_t *handle = NULL;
+ 	int ret = 0, err;
+ 	int needed;
+ 	int order_data;	/* nonzero in ordered or journalled data mode */
+ 
+ 	J_ASSERT(PageLocked(page));
+ 	
+ 	/*
+ 	 * We give up here if we're reentered, because it might be
+ 	 * for a different filesystem.  One *could* look for a
+ 	 * nested transaction opportunity.
+ 	 */
+ 	lock_kernel();
+ 	if (ext3_journal_current_handle())
+ 		goto out_fail;	/* ret is still 0 here: the page is just redirtied */
+ 
+ 	needed = ext3_writepage_trans_blocks(inode);
+ 	if (current->flags & PF_MEMALLOC)
+ 		handle = ext3_journal_try_start(inode, needed);
+ 	else
+ 		handle = ext3_journal_start(inode, needed);
+ 				
+ 	if (IS_ERR(handle)) {
+ 		ret = PTR_ERR(handle);
+ 		goto out_fail;
+ 	}
+ 
+ 	order_data = ext3_should_order_data(inode) ||
+ 			ext3_should_journal_data(inode);
+ 
+ 	unlock_kernel();
+ 
+ 	page_buffers = NULL;	/* Purely to prevent compiler warning */
+ 
+ 	/* bget() all the buffers */
+ 	if (order_data) {
+ 		if (!page->buffers)
+ 			create_empty_buffers(page,
+ 				inode->i_dev, inode->i_sb->s_blocksize);
+ 		page_buffers = page->buffers;
+ 		walk_page_buffers(handle, page_buffers, 0,
+ 				PAGE_CACHE_SIZE, NULL, bget_one);
+ 	}
+ 
+ 	ret = block_write_full_page(page, ext3_get_block);
+ 
+ 	/*
+ 	 * The page can become unlocked at any point now, and
+ 	 * truncate can then come in and change things.  So we
+ 	 * can't touch *page from now on.  But *page_buffers is
+ 	 * safe due to elevated refcount.
+ 	 */
+ 
+ 	handle = ext3_journal_current_handle();
+ 	lock_kernel();
+ 
+ 	/* And attach them to the current transaction */
+ 	if (order_data) {
+ 		err = walk_page_buffers(handle, page_buffers,
+ 			0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data);	/* also drops the bget_one() references */
+ 		if (!ret)
+ 			ret = err;
+ 	}
+ 
+ 	err = ext3_journal_stop(handle, inode);
+ 	if (!ret)
+ 		ret = err;
+ 	unlock_kernel();
+ 	return ret;
+ 
+ out_fail:
+ 	
+ 	unlock_kernel();
+ 	SetPageDirty(page);	/* nothing was written: keep the page dirty and unlock it */
+ 	UnlockPage(page);
+ 	return ret;
+ }
+ 
+ static int ext3_readpage(struct file *file, struct page *page)	/* file is unused */
+ {
+ 	return block_read_full_page(page,ext3_get_block);
+ }
+ 
+ 
+ static int ext3_flushpage(struct page *page, unsigned long offset)
+ {
+ 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);	/* journal of the inode owning this page */
+ 	return journal_flushpage(journal, page, offset);
+ }
+ 
+ static int ext3_releasepage(struct page *page, int wait)
+ {
+ 	journal_t *journal = EXT3_JOURNAL(page->mapping->host);	/* journal of the inode owning this page */
+ 	return journal_try_to_free_buffers(journal, page, wait);
+ }
+ 
+ 
+ struct address_space_operations ext3_aops = {	/* comments record BKL state at entry to each method */
+ 	readpage:	ext3_readpage,		/* BKL not held.  Don't need */
+ 	writepage:	ext3_writepage,		/* BKL not held.  We take it */
+ 	sync_page:	block_sync_page,	/* generic helper, no ext3 wrapper */
+ 	prepare_write:	ext3_prepare_write,	/* BKL not held.  We take it */
+ 	commit_write:	ext3_commit_write,	/* BKL not held.  We take it */
+ 	bmap:		ext3_bmap,		/* BKL held */
+ 	flushpage:	ext3_flushpage,		/* BKL not held.  Don't need */
+ 	releasepage:	ext3_releasepage,	/* BKL not held.  Don't need */
+ };
+ 
+ /*
+  * ext3_block_truncate_page() zeroes the mapping from file offset `from' up to
+  * the end of the block containing `from'.  This is required during truncate so
+  * that the block's tail doesn't yield old data if the file is later grown.
+  * Returns 0 (incl. nothing to zero), -ENOMEM, -EIO or a journalling error.
+  */
+ static int ext3_block_truncate_page(handle_t *handle,
+ 		struct address_space *mapping, loff_t from)
+ {
+ 	unsigned long index = from >> PAGE_CACHE_SHIFT;	/* page holding `from' */
+ 	unsigned offset = from & (PAGE_CACHE_SIZE-1);	/* offset in that page */
+ 	unsigned blocksize, iblock, length, pos;
+ 	struct inode *inode = mapping->host;
+ 	struct page *page;
+ 	struct buffer_head *bh;
+ 	int err;
+ 
+ 	blocksize = inode->i_sb->s_blocksize;
+ 	length = offset & (blocksize - 1);	/* offset within the block */
+ 
+ 	/* Block boundary? Nothing to do */
+ 	if (!length)
+ 		return 0;
+ 
+ 	length = blocksize - length;		/* bytes left to zero */
+ 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ 
+ 	page = grab_cache_page(mapping, index);
+ 	err = -ENOMEM;
+ 	if (!page)
+ 		goto out;
+ 
+ 	if (!page->buffers)
+ 		create_empty_buffers(page, inode->i_dev, blocksize);
+ 
+ 	/* Find the buffer that contains "offset", counting iblock along */
+ 	bh = page->buffers;
+ 	pos = blocksize;
+ 	while (offset >= pos) {
+ 		bh = bh->b_this_page;
+ 		iblock++;
+ 		pos += blocksize;
+ 	}
+ 
+ 	err = 0;
+ 	if (!buffer_mapped(bh)) {
+ 		/* Hole? Nothing to do */
+ 		if (buffer_uptodate(bh))
+ 			goto unlock;
+ 		ext3_get_block(inode, iblock, bh, 0);
+ 		/* Still unmapped? Nothing to do */
+ 		if (!buffer_mapped(bh))
+ 			goto unlock;
+ 	}
+ 
+ 	/* Ok, it's mapped. Make sure it's up-to-date */
+ 	if (Page_Uptodate(page))
+ 		set_bit(BH_Uptodate, &bh->b_state);
+ 
+ 	if (!buffer_uptodate(bh)) {
+ 		err = -EIO;
+ 		ll_rw_block(READ, 1, &bh);
+ 		wait_on_buffer(bh);
+ 		/* Uhhuh. Read error. Punt, returning -EIO. */
+ 		if (!buffer_uptodate(bh))
+ 			goto unlock;
+ 	}
+ 
+ 	if (ext3_should_journal_data(inode)) {
+ 		BUFFER_TRACE(bh, "get write access");
+ 		err = ext3_journal_get_write_access(handle, bh);
+ 		if (err)
+ 			goto unlock;
+ 	}
+ 	
+ 	memset(kmap(page) + offset, 0, length);
+ 	flush_dcache_page(page);
+ 	kunmap(page);
+ 
+ 	BUFFER_TRACE(bh, "zeroed end of block");
+ 
+ 	err = 0;
+ 	if (ext3_should_journal_data(inode)) {
+ 		err = ext3_journal_dirty_metadata(handle, bh);
+ 	} else {
+ 		if (ext3_should_order_data(inode))
+ 			err = ext3_journal_dirty_data(handle, bh, 0);
+ 		__mark_buffer_dirty(bh);
+ 	}
+ 
+ unlock:
+ 	UnlockPage(page);
+ 	page_cache_release(page);
+ out:
+ 	return err;
+ }
+ 
+ /*
+  * Return 1 if every 32-bit word in [p, q) is zero, 0 otherwise.
+  * Probably it should be a library function... search for first non-zero word
+  * or memcmp with zero_page, whatever is better for particular architecture.
+  */
+ static inline int all_zeroes(u32 *p, u32 *q)
+ {
+ 	for (; p < q; p++)
+ 		if (*p)
+ 			return 0;
+ 	return 1;
+ }
+ 
+ /**
+  *	ext3_find_shared - find the indirect blocks for partial truncation.
+  *	@inode:	  inode in question
+  *	@depth:	  depth of the affected branch
+  *	@offsets: offsets of pointers in that branch (see ext3_block_to_path)
+  *	@chain:	  place to store the pointers to partial indirect blocks
+  *	@top:	  place to store the (detached) top of branch
+  *
+  *	This is a helper function used by ext3_truncate().
+  *
+  *	When we do truncate() we may have to clean the ends of several
+  *	indirect blocks but leave the blocks themselves alive. Block is
+  *	partially truncated if some data below the new i_size is referred
+  *	from it (and it is on the path to the first completely truncated
+  *	data block, indeed).  We have to free the top of that path along
+  *	with everything to the right of the path. Since no allocation
+  *	past the truncation point is possible until ext3_truncate()
+  *	finishes, we may safely do the latter, but top of branch may
+  *	require special attention - pageout below the truncation point
+  *	might try to populate it.
+  *
+  *	We atomically detach the top of branch from the tree, store the
+  *	block number of its root in *@top, pointers to buffer_heads of
+  *	partially truncated blocks - in @chain[].bh and pointers to
+  *	their last elements that should not be removed - in
+  *	@chain[].p. Return value is the pointer to last filled element
+  *	of @chain.
+  *
+  *	The work left to caller to do the actual freeing of subtrees:
+  *		a) free the subtree starting from *@top
+  *		b) free the subtrees whose roots are stored in
+  *			(@chain[i].p+1 .. end of @chain[i].bh->b_data)
+  *		c) free the subtrees growing from the inode past the @chain[0].
+  *			(no partially truncated stuff there).  */
+ 
+ static Indirect *ext3_find_shared(struct inode *inode,
+ 				int depth,
+ 				int offsets[4],
+ 				Indirect chain[4],
+ 				u32 *top)
+ {
+ 	Indirect *partial, *p;
+ 	int k, err;
+ 
+ 	*top = 0;
+ 	/* Make k index the deepest non-null offset + 1 */
+ 	for (k = depth; k > 1 && !offsets[k-1]; k--)
+ 		;
+ 	partial = ext3_get_branch(inode, k, offsets, chain, &err);
+ 	/* Writer: pointers */
+ 	if (!partial)
+ 		partial = chain + k-1;
+ 	/*
+ 	 * If the branch acquired continuation since we've looked at it -
+ 	 * fine, it should all survive and (new) top doesn't belong to us.
+ 	 */
+ 	if (!partial->key && *partial->p)
+ 		/* Writer: end */
+ 		goto no_top;
+ 	for (p=partial; p>chain && all_zeroes((u32*)p->bh->b_data,p->p); p--)
+ 		;
+ 	/*
+ 	 * OK, we've found the last block that must survive. The rest of our
+ 	 * branch should be detached before unlocking. However, if that rest
+ 	 * of branch is all ours and does not grow immediately from the inode
+ 	 * it's easier to cheat and just decrement partial->p.
+ 	 */
+ 	if (p == chain + k - 1 && p > chain) {
+ 		p->p--;
+ 	} else {
+ 		*top = *p->p;
+ 		/* Nope, don't do this in ext3.  Must leave the tree intact */
+ #if 0
+ 		*p->p = 0;
+ #endif
+ 	}
+ 	/* Writer: end */
+ 
+ 	while(partial > p)
+ 	{
+ 		brelse(partial->bh);
+ 		partial--;
+ 	}
+ no_top:
+ 	return partial;
+ }
+ 
+ /*
+  * Zero a number of block pointers in either an inode or an indirect block,
+  * then release `count' blocks on disk starting at `block_to_free'.  (last -
+  * first) may be greater than `count' because there can be holes in there.
+  * @bh is the indirect block holding the pointers, or NULL when they live in
+  * the inode itself.  If we restart the transaction we must again get write
+  * access to the indirect block (if there is one) for further modification.
+  */
+ static void
+ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
+ 		unsigned long block_to_free, unsigned long count,
+ 		u32 *first, u32 *last)
+ {
+ 	u32 *p;
+ 	kdev_t dev = inode->i_sb->s_dev;
+ 	unsigned long blocksize = inode->i_sb->s_blocksize;
+ 
+ 	if (try_to_extend_transaction(handle, inode)) {
+ 		if (bh) {
+ 			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 			ext3_journal_dirty_metadata(handle, bh);
+ 		}
+ 		ext3_mark_inode_dirty(handle, inode);
+ 		ext3_journal_test_restart(handle, inode);
+ 		if (bh) {	/* no buffer to re-take for in-inode pointers */
+ 			BUFFER_TRACE(bh, "get_write_access");
+ 			ext3_journal_get_write_access(handle, bh);
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * Any buffers which are on the journal will be in memory. We find
+ 	 * them on the hash table so journal_revoke() will run journal_forget()
+ 	 * on them.  We've already detached each block from the file, so
+ 	 * bforget() in journal_forget() should be safe.
+ 	 * AKPM: turn on bforget in journal_forget()!!!
+ 	 */
+ 	for (p = first; p < last; p++) {
+ 		u32 nr = le32_to_cpu(*p);
+ 		if (nr) {
+ 			struct buffer_head *victim;	/* may be NULL: not cached */
+ 			*p = 0;
+ 			victim = get_hash_table(dev, nr, blocksize);
+ 			ext3_forget(handle, 0, inode, victim, nr);
+ 		}
+ 	}
+ 
+ 	ext3_free_blocks(handle, inode, block_to_free, count);
+ }
+ 
+ /**
+  * ext3_free_data - free a list of data blocks
+  * @handle:	handle for this transaction
+  * @inode:	inode we are dealing with
+  * @this_bh:	indirect buffer_head which contains *@first and *@last
+  * @first:	array of block numbers
+  * @last:	points immediately past the end of array
+  *
+  * We are freeing all blocks referred from that array (numbers are stored as
+  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
+  *
+  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
+  * blocks are contiguous then releasing them at one time will only affect one
+  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
+  * actually use a lot of journal space.
+  *
+  * @this_bh will be %NULL if @first and @last point into the inode's direct
+  * block pointers.
+  */
+ static void ext3_free_data(handle_t *handle, struct inode *inode,
+ 			   struct buffer_head *this_bh, u32 *first, u32 *last)
+ {
+ 	unsigned long block_to_free = 0;    /* Starting block # of a run */
+ 	unsigned long count = 0;	    /* Number of blocks in the run */ 
+ 	u32 *block_to_free_p = NULL;	    /* Pointer into inode/ind
+ 					       corresponding to
+ 					       block_to_free */
+ 	unsigned long nr;		    /* Current block # */
+ 	u32 *p;				    /* Pointer into inode/ind
+ 					       for current block */
+ 	int err;
+ 
+ 	if (this_bh) {				/* For indirect block */
+ 		BUFFER_TRACE(this_bh, "get_write_access");
+ 		err = ext3_journal_get_write_access(handle, this_bh);
+ 		/* Important: if we can't update the indirect pointers
+ 		 * to the blocks, we can't free them. */
+ 		if (err)
+ 			return;
+ 	}
+ 
+ 	for (p = first; p < last; p++) {
+ 		nr = le32_to_cpu(*p);
+ 		if (nr) {
+ 			/* accumulate blocks to free if they're contiguous */
+ 			if (count == 0) {
+ 				block_to_free = nr;
+ 				block_to_free_p = p;
+ 				count = 1;
+ 			} else if (nr == block_to_free + count) {
+ 				count++;
+ 			} else {
+ 				ext3_clear_blocks(handle, inode, this_bh, 
+ 						  block_to_free,
+ 						  count, block_to_free_p, p);
+ 				block_to_free = nr;
+ 				block_to_free_p = p;
+ 				count = 1;
+ 			}
+ 		}
+ 	}
+ 
+ 	if (count > 0)	/* flush the final run; p == last here */
+ 		ext3_clear_blocks(handle, inode, this_bh, block_to_free,
+ 				  count, block_to_free_p, p);
+ 
+ 	if (this_bh) {
+ 		BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
+ 		ext3_journal_dirty_metadata(handle, this_bh);
+ 	}
+ }
+ 
+ /**
+  *	ext3_free_branches - free an array of branches
+  *	@handle: JBD handle for this transaction
+  *	@inode:	inode we are dealing with
+  *	@parent_bh: the buffer_head which contains *@first and *@last
+  *	@first:	array of block numbers
+  *	@last:	pointer immediately past the end of array
+  *	@depth:	depth of the branches to free
+  *
+  *	We are freeing all blocks referred from these branches (numbers are
+  *	stored as little-endian 32-bit) and updating @inode->i_blocks
+  *	appropriately.
+  */
+ static void ext3_free_branches(handle_t *handle, struct inode *inode,
+ 			       struct buffer_head *parent_bh,
+ 			       u32 *first, u32 *last, int depth)
+ {
+ 	unsigned long nr;
+ 	u32 *p;
+ 
+ 	if (is_handle_aborted(handle))
+ 		return;
+ 	
+ 	if (depth--) {
+ 		struct buffer_head *bh;
+ 		int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+ 		p = last;
+ 		while (--p >= first) {	/* walk the array right to left */
+ 			nr = le32_to_cpu(*p);
+ 			if (!nr)
+ 				continue;		/* A hole */
+ 
+ 			/* Go read the buffer for the next level down */
+ 			bh = bread(inode->i_dev, nr, inode->i_sb->s_blocksize);
+ 
+ 			/*
+ 			 * A read failure? Report the error and skip this slot,
+ 			 * leaving its pointer in place (should be rare).
+ 			 */
+ 			if (!bh) {
+ 				ext3_error(inode->i_sb, "ext3_free_branches",
+ 					   "Read failure, inode=%ld, block=%ld",
+ 					   inode->i_ino, nr);
+ 				continue;
+ 			}
+ 
+ 			/* This zaps the entire block.  Bottom up. */
+ 			BUFFER_TRACE(bh, "free child branches");
+ 			ext3_free_branches(handle, inode, bh, (u32*)bh->b_data,
+ 					   (u32*)bh->b_data + addr_per_block,
+ 					   depth);
+ 
+ 			/*
+ 			 * We've probably journalled the indirect block several
+ 			 * times during the truncate.  But it's no longer
+ 			 * needed and we now drop it from the transaction via
+ 			 * journal_revoke().
+ 			 *
+ 			 * That's easy if it's exclusively part of this
+ 			 * transaction.  But if it's part of the committing
+ 			 * transaction then journal_forget() will simply
+ 			 * brelse() it.  That means that if the underlying
+ 			 * block is reallocated in ext3_get_block(),
+ 			 * unmap_underlying_metadata() will find this block
+ 			 * and will try to get rid of it.  damn, damn.
+ 			 *
+ 			 * If this block has already been committed to the
+ 			 * journal, a revoke record will be written.  And
+ 			 * revoke records must be emitted *before* clearing
+ 			 * this block's bit in the bitmaps.
+ 			 */
+ 			ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
+ 
+ 			/*
+ 			 * Everything below this pointer has been
+ 			 * released.  Now let this top-of-subtree go.
+ 			 *
+ 			 * We want the freeing of this indirect block to be
+ 			 * atomic in the journal with the updating of the
+ 			 * bitmap block which owns it.  So make some room in
+ 			 * the journal.
+ 			 *
+ 			 * We zero the parent pointer *after* freeing its
+ 			 * pointee in the bitmaps, so if extend_transaction()
+ 			 * for some reason fails to put the bitmap changes and
+ 			 * the release into the same transaction, recovery
+ 			 * will merely complain about releasing a free block,
+ 			 * rather than leaking blocks.
+ 			 */
+ 			if (is_handle_aborted(handle))
+ 				return;
+ 			if (try_to_extend_transaction(handle, inode)) {
+ 				ext3_mark_inode_dirty(handle, inode);
+ 				ext3_journal_test_restart(handle, inode);
+ 			}
+ 
+ 			ext3_free_blocks(handle, inode, nr, 1);
+ 
+ 			if (parent_bh) {
+ 				/*
+ 				 * The block which we have just freed is
+ 				 * pointed to by an indirect block: journal it
+ 				 */
+ 				BUFFER_TRACE(parent_bh, "get_write_access");
+ 				if (!ext3_journal_get_write_access(handle,
+ 								   parent_bh)){
+ 					*p = 0;
+ 					BUFFER_TRACE(parent_bh,
+ 					"call ext3_journal_dirty_metadata");
+ 					ext3_journal_dirty_metadata(handle, 
+ 								    parent_bh);
+ 				}
+ 			}
+ 		}
+ 	} else {
+ 		/* We have reached the bottom of the tree. */
+ 		BUFFER_TRACE(parent_bh, "free data blocks");
+ 		ext3_free_data(handle, inode, parent_bh, first, last);
+ 	}
+ }
+ 
+ /*
+  * ext3_truncate()
+  *
+  * We block out ext3_get_block() block instantiations across the entire
+  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
+  * simultaneously on behalf of the same inode.
+  *
+  * As we work through the truncate and commit bits of it to the journal there
+  * is one core, guiding principle: the file's tree must always be consistent on
+  * disk.  We must be able to restart the truncate after a crash.
+  *
+  * The file's tree may be transiently inconsistent in memory (although it
+  * probably isn't), but whenever we close off and commit a journal transaction,
+  * the contents of (the filesystem + the journal) must be consistent and
+  * restartable.  It's pretty simple, really: bottom up, right to left (although
+  * left-to-right works OK too).
+  *
+  * Note that at recovery time, journal replay occurs *before* the restart of
+  * truncate against the orphan inode list.
+  *
+  * The committed inode has the new, desired i_size (which is the same as
+  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
+  * that this inode's truncate did not complete and it will again call
+  * ext3_truncate() to have another go.  So there will be instantiated blocks
+  * to the right of the truncation point in a crashed ext3 filesystem.  But
+  * that's fine - as long as they are linked from the inode, the post-crash
+  * ext3_truncate() run will find them and release them.
+  */
+ 
+ void ext3_truncate(struct inode * inode)
+ {
+ 	handle_t *handle;
+ 	u32 *i_data = inode->u.ext3_i.i_data;
+ 	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
+ 	int offsets[4];
+ 	Indirect chain[4];
+ 	Indirect *partial;
+ 	int nr = 0;		/* raw block pointer of the shared top, or 0 */
+ 	int n;			/* depth of the path to last_block; 0 on error */
+ 	long last_block;	/* first block past the new i_size */
+ 	unsigned blocksize;
+ 
+ 	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ 	    S_ISLNK(inode->i_mode)))
+ 		return;
+ 	if (IS_APPEND(inode) || IS_IMMUTABLE_FILE(inode))
+ 		return;
+ 
+ 	ext3_discard_prealloc(inode);
+ 
+ 	handle = start_transaction(inode);
+ 	if (IS_ERR(handle))
+ 		return;		/* AKPM: return what? */
+ 
+ 	blocksize = inode->i_sb->s_blocksize;
+ 	last_block = (inode->i_size + blocksize-1)
+ 					>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
+ 
+ 	ext3_block_truncate_page(handle, inode->i_mapping, inode->i_size);
+ 		
+ 
+ 	n = ext3_block_to_path(inode, last_block, offsets);
+ 	if (n == 0)
+ 		goto out_stop;	/* error */
+ 
+ 	/*
+ 	 * OK.  This truncate is going to happen.  We add the inode to the
+ 	 * orphan list, so that if this truncate spans multiple transactions,
+ 	 * and we crash, we will resume the truncate when the filesystem
+ 	 * recovers.  It also marks the inode dirty, to catch the new size.
+ 	 *
+ 	 * Implication: the file must always be in a sane, consistent
+ 	 * truncatable state while each transaction commits.
+ 	 */
+ 	if (ext3_orphan_add(handle, inode))
+ 		goto out_stop;
+ 
+ 	/*
+ 	 * The orphan list entry will now protect us from any crash which
+ 	 * occurs before the truncate completes, so it is now safe to propagate
+ 	 * the new, shorter inode size (held for now in i_size) into the
+ 	 * on-disk inode. We do this via i_disksize, which is the value which
+ 	 * ext3 *really* writes onto the disk inode.
+ 	 */
+ 	inode->u.ext3_i.i_disksize = inode->i_size;
+ 
+ 	/*
+ 	 * From here we block out all ext3_get_block() callers who want to
+ 	 * modify the block allocation tree.
+ 	 */
+ 	down_write(&inode->u.ext3_i.truncate_sem);
+ 
+ 	if (n == 1) {		/* direct blocks */
+ 		ext3_free_data(handle, inode, NULL, i_data+offsets[0],
+ 			       i_data + EXT3_NDIR_BLOCKS);
+ 		goto do_indirects;
+ 	}
+ 
+ 	partial = ext3_find_shared(inode, n, offsets, chain, &nr);
+ 	/* Kill the top of shared branch (not detached) */
+ 	if (nr) {
+ 		if (partial == chain) {
+ 			/* Shared branch grows from the inode */
+ 			ext3_free_branches(handle, inode, NULL,
+ 					   &nr, &nr+1, (chain+n-1) - partial);
+ 			*partial->p = 0;
+ 			/*
+ 			 * We mark the inode dirty prior to restart,
+ 			 * and prior to stop.  No need for it here.
+ 			 */
+ 		} else {
+ 			/* Shared branch grows from an indirect block */
+ 			BUFFER_TRACE(partial->bh, "get_write_access");
+ 			ext3_free_branches(handle, inode, partial->bh,
+ 					partial->p,
+ 					partial->p+1, (chain+n-1) - partial);
+ 		}
+ 	}
+ 	/* Clear the ends of indirect blocks on the shared branch */
+ 	while (partial > chain) {
+ 		ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
+ 				   (u32*)partial->bh->b_data + addr_per_block,
+ 				   (chain+n-1) - partial);
+ 		BUFFER_TRACE(partial->bh, "call brelse");
+ 		brelse (partial->bh);
+ 		partial--;
+ 	}
+ do_indirects:
+ 	/* Kill the remaining (whole) subtrees; each case falls through */
+ 	switch (offsets[0]) {
+ 		default:
+ 			nr = i_data[EXT3_IND_BLOCK];
+ 			if (nr) {
+ 				ext3_free_branches(handle, inode, NULL,
+ 						   &nr, &nr+1, 1);
+ 				i_data[EXT3_IND_BLOCK] = 0;
+ 			}	/* fall through */
+ 		case EXT3_IND_BLOCK:
+ 			nr = i_data[EXT3_DIND_BLOCK];
+ 			if (nr) {
+ 				ext3_free_branches(handle, inode, NULL,
+ 						   &nr, &nr+1, 2);
+ 				i_data[EXT3_DIND_BLOCK] = 0;
+ 			}	/* fall through */
+ 		case EXT3_DIND_BLOCK:
+ 			nr = i_data[EXT3_TIND_BLOCK];
+ 			if (nr) {
+ 				ext3_free_branches(handle, inode, NULL,
+ 						   &nr, &nr+1, 3);
+ 				i_data[EXT3_TIND_BLOCK] = 0;
+ 			}	/* fall through */
+ 		case EXT3_TIND_BLOCK:
+ 			;
+ 	}
+ 	up_write(&inode->u.ext3_i.truncate_sem);
+ 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ 	ext3_mark_inode_dirty(handle, inode);
+ 
+ 	/* In a multi-transaction truncate, we only make the final
+ 	 * transaction synchronous */
+ 	if (IS_SYNC(inode))
+ 		handle->h_sync = 1;
+ out_stop:
+ 	/*
+ 	 * If this was a simple ftruncate(), and the file will remain alive
+ 	 * then we need to clear up the orphan record which we created above.
+ 	 * However, if this was a real unlink then we were called by
+ 	 * ext3_delete_inode(), and we allow that function to clean up the
+ 	 * orphan info for us.
+ 	 */
+ 	if (inode->i_nlink)
+ 		ext3_orphan_del(handle, inode);
+ 
+ 	ext3_journal_stop(handle, inode);
+ }
+ 
+ /*
+  * ext3_get_inode_loc fills in *iloc and returns 0 with an extra refcount
+  * against the inode's underlying buffer_head; any failure returns -EIO.
+  */
+ 
+ int ext3_get_inode_loc (struct inode *inode, struct ext3_iloc *iloc)
+ {
+ 	struct buffer_head *bh = 0;
+ 	unsigned long block;
+ 	unsigned long block_group;
+ 	unsigned long group_desc;	/* which group descriptor block */
+ 	unsigned long desc;		/* index within that block */
+ 	unsigned long offset;
+ 	struct ext3_group_desc * gdp;
+ 		
+ 	if ((inode->i_ino != EXT3_ROOT_INO &&
+ 		inode->i_ino != EXT3_ACL_IDX_INO &&
+ 		inode->i_ino != EXT3_ACL_DATA_INO &&
+ 		inode->i_ino != EXT3_JOURNAL_INO &&
+ 		inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+ 		inode->i_ino > le32_to_cpu(
+ 			inode->i_sb->u.ext3_sb.s_es->s_inodes_count)) {
+ 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
+ 			    "bad inode number: %lu", inode->i_ino);
+ 		goto bad_inode;
+ 	}
+ 	block_group = (inode->i_ino - 1) / EXT3_INODES_PER_GROUP(inode->i_sb);
+ 	if (block_group >= inode->i_sb->u.ext3_sb.s_groups_count) {
+ 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
+ 			    "group >= groups count");
+ 		goto bad_inode;
+ 	}
+ 	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(inode->i_sb);
+ 	desc = block_group & (EXT3_DESC_PER_BLOCK(inode->i_sb) - 1);
+ 	bh = inode->i_sb->u.ext3_sb.s_group_desc[group_desc];
+ 	if (!bh) {
+ 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
+ 			    "Descriptor not loaded");
+ 		goto bad_inode;
+ 	}
+ 
+ 	gdp = (struct ext3_group_desc *) bh->b_data;
+ 	/*
+ 	 * Figure out the byte offset within the block group inode table
+ 	 */
+ 	offset = ((inode->i_ino - 1) % EXT3_INODES_PER_GROUP(inode->i_sb)) *
+ 		EXT3_INODE_SIZE(inode->i_sb);
+ 	block = le32_to_cpu(gdp[desc].bg_inode_table) +
+ 		(offset >> EXT3_BLOCK_SIZE_BITS(inode->i_sb));
+ 	if (!(bh = bread (inode->i_dev, block, inode->i_sb->s_blocksize))) {
+ 		ext3_error (inode->i_sb, "ext3_get_inode_loc",
+ 			    "unable to read inode block - "
+ 			    "inode=%lu, block=%lu", inode->i_ino, block);
+ 		goto bad_inode;
+ 	}
+ 	offset &= (EXT3_BLOCK_SIZE(inode->i_sb) - 1);	/* now within block */
+ 
+ 	iloc->bh = bh;
+ 	iloc->raw_inode = (struct ext3_inode *) (bh->b_data + offset);
+ 	iloc->block_group = block_group;
+ 	
+ 	return 0;
+ 	
+  bad_inode:
+ 	return -EIO;
+ }
+ 
+ /* Fill in the in-core inode from its on-disk copy.  On failure (bad location
+  * or deleted inode) the inode is marked bad via make_bad_inode(). */
+ void ext3_read_inode(struct inode * inode)
+ {
+ 	struct ext3_iloc iloc;
+ 	struct ext3_inode *raw_inode;
+ 	struct buffer_head *bh;
+ 	int block;
+ 	
+ 	if(ext3_get_inode_loc(inode, &iloc))
+ 		goto bad_inode;
+ 	bh = iloc.bh;
+ 	raw_inode = iloc.raw_inode;
+ 	init_rwsem(&inode->u.ext3_i.truncate_sem);
+ 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
+ 	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
+ 	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+ 	if(!(test_opt (inode->i_sb, NO_UID32))) {
+ 		inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
+ 		inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
+ 	}
+ 	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
+ 	inode->i_size = le32_to_cpu(raw_inode->i_size);
+ 	inode->i_atime = le32_to_cpu(raw_inode->i_atime);
+ 	inode->i_ctime = le32_to_cpu(raw_inode->i_ctime);
+ 	inode->i_mtime = le32_to_cpu(raw_inode->i_mtime);
+ 	inode->u.ext3_i.i_dtime = le32_to_cpu(raw_inode->i_dtime);
+ 	/* We now have enough fields to check if the inode was active or not.
+ 	 * This is needed because nfsd might try to access dead inodes
+ 	 * the test is that same one that e2fsck uses
+ 	 * NeilBrown 1999oct15
+ 	 */
+ 	if (inode->i_nlink == 0) {
+ 		if (inode->i_mode == 0 ||
+ 		    !(inode->i_sb->u.ext3_sb.s_mount_state & EXT3_ORPHAN_FS)) {
+ 			/* this inode is deleted */
+ 			brelse (bh);
+ 			goto bad_inode;
+ 		}
+ 		/* The only unlinked inodes we let through here have
+ 		 * valid i_mode and are being read by the orphan
+ 		 * recovery code: that's fine, we're about to complete
+ 		 * the process of deleting those. */
+ 	}
+ 	inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size (for
+ 					 * stat), not the fs block size */
+ 	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
+ 	inode->i_version = ++event;
+ 	inode->u.ext3_i.i_flags = le32_to_cpu(raw_inode->i_flags);
+ #ifdef EXT3_FRAGMENTS
+ 	inode->u.ext3_i.i_faddr = le32_to_cpu(raw_inode->i_faddr);
+ 	inode->u.ext3_i.i_frag_no = raw_inode->i_frag;
+ 	inode->u.ext3_i.i_frag_size = raw_inode->i_fsize;
+ #endif
+ 	inode->u.ext3_i.i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
+ 	if (!S_ISREG(inode->i_mode)) {
+ 		inode->u.ext3_i.i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
+ 	} else {
+ 		inode->i_size |=
+ 			((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
+ 	}
+ 	inode->u.ext3_i.i_disksize = inode->i_size;
+ 	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
+ #ifdef EXT3_PREALLOCATE
+ 	inode->u.ext3_i.i_prealloc_count = 0;
+ #endif
+ 	inode->u.ext3_i.i_block_group = iloc.block_group;
+ 
+ 	/*
+ 	 * NOTE! The in-memory inode i_data array is in little-endian order
+ 	 * even on big-endian machines: we do NOT byteswap the block numbers!
+ 	 */
+ 	for (block = 0; block < EXT3_N_BLOCKS; block++)
+ 		inode->u.ext3_i.i_data[block] = raw_inode->i_block[block];
+ 	INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+ 
+ 	brelse (bh);	/* raw_inode must not be dereferenced past here */
+ 
+ 	if (inode->i_ino == EXT3_ACL_IDX_INO ||
+ 	    inode->i_ino == EXT3_ACL_DATA_INO)
+ 		/* Nothing to do */ ;
+ 	else if (S_ISREG(inode->i_mode)) {
+ 		inode->i_op = &ext3_file_inode_operations;
+ 		inode->i_fop = &ext3_file_operations;
+ 		inode->i_mapping->a_ops = &ext3_aops;
+ 	} else if (S_ISDIR(inode->i_mode)) {
+ 		inode->i_op = &ext3_dir_inode_operations;
+ 		inode->i_fop = &ext3_dir_operations;
+ 	} else if (S_ISLNK(inode->i_mode)) {
+ 		if (!inode->i_blocks)
+ 			inode->i_op = &ext3_fast_symlink_inode_operations;
+ 		else {
+ 			inode->i_op = &page_symlink_inode_operations;
+ 			inode->i_mapping->a_ops = &ext3_aops;
+ 		}
+ 	} else	/* buffer already released: use the i_data[] copy of i_block */
+ 		init_special_inode(inode, inode->i_mode,
+ 				   le32_to_cpu(inode->u.ext3_i.i_data[0]));
+ 	/* inode->i_attr_flags = 0;				unused */
+ 	if (inode->u.ext3_i.i_flags & EXT3_SYNC_FL) {
+ 		/* inode->i_attr_flags |= ATTR_FLAG_SYNCRONOUS; unused */
+ 		inode->i_flags |= S_SYNC;
+ 	}
+ 	if (inode->u.ext3_i.i_flags & EXT3_APPEND_FL) {
+ 		/* inode->i_attr_flags |= ATTR_FLAG_APPEND;	unused */
+ 		inode->i_flags |= S_APPEND;
+ 	}
+ 	if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_FILE_FL) {
+ 		/* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE;	unused */
+ 		inode->i_flags |= S_IMMUTABLE_FILE;
+ 	}
+ 	if (inode->u.ext3_i.i_flags & EXT3_IMMUTABLE_LINK_FL) {
+ 		/* inode->i_attr_flags |= ATTR_FLAG_IMMUTABLE_LINK; unused */
+ 		inode->i_flags |= S_IMMUTABLE_LINK;
+ 	}
+ 	if (inode->u.ext3_i.i_flags & EXT3_NOATIME_FL) {
+ 		/* inode->i_attr_flags |= ATTR_FLAG_NOATIME;	unused */
+ 		inode->i_flags |= S_NOATIME;
+ 	}
+ 	return;
+ 	
+ bad_inode:
+ 	make_bad_inode(inode);
+ }
+ 
+ /*
+  * Post the struct inode info into an on-disk inode location in the
+  * buffer-cache.  This gobbles the caller's reference to the buffer_head
+  * in the inode location struct.  Returns 0 or the first journal error.
+  */
+ 
+ static int ext3_do_update_inode(handle_t *handle, 
+ 				struct inode *inode, 
+ 				struct ext3_iloc *iloc)
+ {
+ 	struct ext3_inode *raw_inode = iloc->raw_inode;
+ 	struct buffer_head *bh = iloc->bh;
+ 	int err = 0, rc, block;
+ 
+ 	if (handle) {
+ 		BUFFER_TRACE(bh, "get_write_access");
+ 		err = ext3_journal_get_write_access(handle, bh);
+ 		if (err)
+ 			goto out_brelse;
+ 	}
+ 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
+ 	if(!(test_opt(inode->i_sb, NO_UID32))) {
+ 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
+ 		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
+ 		/*
+ 		 * Fix up interoperability with old kernels. Otherwise, old
+ 		 * inodes get re-used with the upper 16 uid/gid bits intact
+ 		 */
+ 		if(!inode->u.ext3_i.i_dtime) {
+ 			raw_inode->i_uid_high =
+ 				cpu_to_le16(high_16_bits(inode->i_uid));
+ 			raw_inode->i_gid_high =
+ 				cpu_to_le16(high_16_bits(inode->i_gid));
+ 		} else {
+ 			raw_inode->i_uid_high = 0;
+ 			raw_inode->i_gid_high = 0;
+ 		}
+ 	} else {
+ 		raw_inode->i_uid_low =
+ 			cpu_to_le16(fs_high2lowuid(inode->i_uid));
+ 		raw_inode->i_gid_low =
+ 			cpu_to_le16(fs_high2lowgid(inode->i_gid));
+ 		raw_inode->i_uid_high = 0;
+ 		raw_inode->i_gid_high = 0;
+ 	}
+ 	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
+ 	raw_inode->i_size = cpu_to_le32(inode->u.ext3_i.i_disksize);
+ 	raw_inode->i_atime = cpu_to_le32(inode->i_atime);
+ 	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime);
+ 	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime);
+ 	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
+ 	raw_inode->i_dtime = cpu_to_le32(inode->u.ext3_i.i_dtime);
+ 	raw_inode->i_flags = cpu_to_le32(inode->u.ext3_i.i_flags);
+ #ifdef EXT3_FRAGMENTS
+ 	raw_inode->i_faddr = cpu_to_le32(inode->u.ext3_i.i_faddr);
+ 	raw_inode->i_frag = inode->u.ext3_i.i_frag_no;
+ 	raw_inode->i_fsize = inode->u.ext3_i.i_frag_size;
+ #else
+ 	/* If we are not tracking these fields in the in-memory inode,
+ 	 * then preserve them on disk, but still initialise them to zero
+ 	 * for new inodes. */
+ 	if (inode->u.ext3_i.i_state & EXT3_STATE_NEW) {
+ 		raw_inode->i_faddr = 0;
+ 		raw_inode->i_frag = 0;
+ 		raw_inode->i_fsize = 0;
+ 	}
+ #endif
+ 	raw_inode->i_file_acl = cpu_to_le32(inode->u.ext3_i.i_file_acl);
+ 	if (!S_ISREG(inode->i_mode)) {
+ 		raw_inode->i_dir_acl = cpu_to_le32(inode->u.ext3_i.i_dir_acl);
+ 	} else {
+ 		raw_inode->i_size_high =
+ 			cpu_to_le32(inode->u.ext3_i.i_disksize >> 32);
+ 		if (inode->u.ext3_i.i_disksize > 0x7fffffffULL) {
+ 			struct super_block *sb = inode->i_sb;
+ 			if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
+ 					EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
+ 			    EXT3_SB(sb)->s_es->s_rev_level ==
+ 					cpu_to_le32(EXT3_GOOD_OLD_REV)) {
+ 			       /* If this is the first large file
+ 				* created, add a flag to the superblock.
+ 				*/
+ 				err = ext3_journal_get_write_access(handle,
+ 						sb->u.ext3_sb.s_sbh);
+ 				if (err)
+ 					goto out_brelse;
+ 				ext3_update_dynamic_rev(sb);
+ 				EXT3_SET_RO_COMPAT_FEATURE(sb,
+ 					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
+ 				sb->s_dirt = 1;
+ 				handle->h_sync = 1;
+ 				err = ext3_journal_dirty_metadata(handle,
+ 						sb->u.ext3_sb.s_sbh);
+ 			}
+ 		}
+ 	}
+ 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
+ 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+ 		raw_inode->i_block[0] =
+ 			cpu_to_le32(kdev_t_to_nr(inode->i_rdev));
+ 	else for (block = 0; block < EXT3_N_BLOCKS; block++)
+ 		raw_inode->i_block[block] = inode->u.ext3_i.i_data[block];
+ 
+ 	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 	rc = ext3_journal_dirty_metadata(handle, bh);
+ 	if (!err)	/* keep an earlier error from the superblock update */
+ 		err = rc;
+ 	inode->u.ext3_i.i_state &= ~EXT3_STATE_NEW;
+ 
+ out_brelse:
+ 	brelse (bh);
+ 	ext3_std_error(inode->i_sb, err);
+ 	return err;
+ }
+ 
+ /*
+  * ext3_write_inode()
+  *
+  * We are called from a few places:
+  *
+  * - Within generic_file_write() for O_SYNC files.
+  *   Here, there will be no transaction running. We wait for any running
+  *   trasnaction to commit.
+  *
+  * - Within sys_sync(), kupdate and such.
+  *   We wait on commit, if tol to.
+  *
+  * - Within prune_icache() (PF_MEMALLOC == true)
+  *   Here we simply return.  We can't afford to block kswapd on the
+  *   journal commit.
+  *
+  * In all cases it is actually safe for us to return without doing anything,
+  * because the inode has been copied into a raw inode buffer in
+  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
+  * knfsd.
+  *
+  * Note that we are absolutely dependent upon all inode dirtiers doing the
+  * right thing: they *must* call mark_inode_dirty() after dirtying info in
+  * which we are interested.
+  *
+  * It would be a bug for them to not do this.  The code:
+  *
+  *	mark_inode_dirty(inode)
+  *	stuff();
+  *	inode->i_size = expr;
+  *
+  * is in error because a kswapd-driven write_inode() could occur while
+  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
+  * will no longer be on the superblock's dirty inode list.
+  */
+ void ext3_write_inode(struct inode *inode, int wait)
+ {
+ 	/* Called from memory reclaim: never block it on a journal commit. */
+ 	if (current->flags & PF_MEMALLOC)
+ 		return;
+ 	/* This task already has a handle open; just log that and bail out. */
+ 	if (ext3_journal_current_handle()) {
+ 		jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
+ 		return;
+ 	}
+ 
+ 	/* Only a caller that asked to wait gets a forced commit. */
+ 	if (wait)
+ 		ext3_force_commit(inode->i_sb);
+ }
+ 
+ /*
+  * ext3_setattr()
+  *
+  * Called from notify_change.
+  *
+  * We want to trap VFS attempts to truncate the file as soon as
+  * possible.  In particular, we want to make sure that when the VFS
+  * shrinks i_size, we put the inode on the orphan list and modify
+  * i_disksize immediately, so that during the subsequent flushing of
+  * dirty pages and freeing of disk blocks, we can guarantee that any
+  * commit will leave the blocks being flushed in an unused state on
+  * disk.  (On recovery, the inode will get truncated and the blocks will
+  * be freed, so we have a strong guarantee that no future commit will
+  * leave these blocks visible to the user.)  
+  *
+  * This is only needed for regular files.  rmdir() has its own path, and
+  * we can never truncate a directory except on final unlink (at which
+  * point i_nlink is zero so recovery is easy.)
+  *
+  * Called with the BKL.  
+  */
+ 
+ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
+ {
+ 	struct inode *inode = dentry->d_inode;
+ 	int error, rc = 0;
+ 
+ 	error = inode_change_ok(inode, attr);
+ 	if (error)
+ 		return error;
+ 
+ 	if (attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
+ 		handle_t *handle;
+ 
+ 		handle = ext3_journal_start(inode, 3);
+ 		if (IS_ERR(handle)) {
+ 			error = PTR_ERR(handle);
+ 			goto err_out;
+ 		}
+ 		/* Orphan the inode and shrink i_disksize before the VFS truncates. */
+ 		error = ext3_orphan_add(handle, inode);
+ 		inode->u.ext3_i.i_disksize = attr->ia_size;
+ 		rc = ext3_mark_inode_dirty(handle, inode);
+ 		if (!error)
+ 			error = rc;
+ 		ext3_journal_stop(handle, inode);
+ 	}
+ 
+ 	rc = inode_setattr(inode, attr);	/* keep its status for the caller */
+ 	/* If inode_setattr's call to ext3_truncate failed to get a
+ 	 * transaction handle at all, we need to clean up the in-core
+ 	 * orphan list manually. */
+ 	if (inode->i_nlink)
+ 		ext3_orphan_del(NULL, inode);
+ err_out:
+ 	ext3_std_error(inode->i_sb, error);
+ 	if (!error)	/* only ext3's own errors go through ext3_std_error() */
+ 		error = rc;
+ 	return error;	/* report failure instead of always returning 0 */
+ }
+ 
+ 
+ /*
+  * akpm: how many blocks doth make a writepage()?
+  *
+  * With N blocks per page, it may be:
+  * N data blocks
+  * 2 indirect block
+  * 2 dindirect
+  * 1 tindirect
+  * N+5 bitmap blocks (from the above)
+  * N+5 group descriptor summary blocks
+  * 1 inode block
+  * 1 superblock.
+  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
+  *
+  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
+  *
+  * With ordered or writeback data it's the same, less the N data blocks.
+  *
+  * If the inode's direct blocks can hold an integral number of pages then a
+  * page cannot straddle two indirect blocks, and we can only touch one indirect
+  * and dindirect block, and the "5" above becomes "3".
+  *
+  * This still overestimates under most circumstances.  If we were to pass the
+  * start and end offsets in here as well we could do block_to_path() on each
+  * block and work out the exact number of indirects which are touched.  Pah.
+  */
+ 
+ int ext3_writepage_trans_blocks(struct inode *inode)
+ {
+ 	int blocks_per_page = ext3_journal_blocks_per_page(inode);
+ 	/* 3 if the direct blocks hold a whole number of pages, else 5 */
+ 	int nr_indirect = (EXT3_NDIR_BLOCKS % blocks_per_page) ? 5 : 3;
+ 	/* the multiplier is 3 when file data is journaled, otherwise 2 */
+ 	int factor = ext3_should_journal_data(inode) ? 3 : 2;
+ 	int credits;
+ 
+ 	credits = factor * (blocks_per_page + nr_indirect) + 2;
+ 
+ #ifdef CONFIG_QUOTA
+ 	/* plus the blocks needed to update the two quota files */
+ 	credits += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
+ #endif
+ 	return credits;
+ }
+ 
+ int
+ ext3_mark_iloc_dirty(handle_t *handle,
+ 		     struct inode *inode,
+ 		     struct ext3_iloc *iloc)
+ {
+ 	int status;
+ 
+ 	if (!handle) {
+ 		printk(KERN_EMERG __FUNCTION__ ": called with no handle!\n");
+ 		return 0;
+ 	}
+ 	/* ext3_do_update_inode() consumes one bh->b_count: take an extra one */
+ 	atomic_inc(&iloc->bh->b_count);
+ 	/* ext3_do_update_inode() does journal_dirty_metadata */
+ 	status = ext3_do_update_inode(handle, inode, iloc);
+ 	brelse(iloc->bh);
+ 	return status;
+ }
+ 
+ /* 
+  * On success, We end up with an outstanding reference count against
+  * iloc->bh.  This _must_ be cleaned up later. 
+  */
+ 
+ int
+ ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
+ 			 struct ext3_iloc *iloc)
+ {
+ 	int rc = 0;
+ 
+ 	/* With no handle nothing is looked up and 0 is returned. */
+ 	if (handle && (rc = ext3_get_inode_loc(inode, iloc)) == 0) {
+ 		BUFFER_TRACE(iloc->bh, "get_write_access");
+ 		rc = ext3_journal_get_write_access(handle, iloc->bh);
+ 		if (rc) {
+ 			/* on failure the caller gets no buffer reference */
+ 			brelse(iloc->bh);
+ 			iloc->bh = NULL;
+ 		}
+ 	}
+ 	ext3_std_error(inode->i_sb, rc);
+ 	return rc;
+ }
+ 
+ /*
+  * akpm: What we do here is to mark the in-core inode as clean
+  * with respect to inode dirtiness (it may still be data-dirty).
+  * This means that the in-core inode may be reaped by prune_icache
+  * without having to perform any I/O.  This is a very good thing,
+  * because *any* task may call prune_icache - even ones which
+  * have a transaction open against a different journal.
+  *
+  * Is this cheating?  Not really.  Sure, we haven't written the
+  * inode out, but prune_icache isn't a user-visible syncing function.
+  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
+  * we start and wait on commits.
+  *
+  * Is this efficient/effective?  Well, we're being nice to the system
+  * by cleaning up our inodes proactively so they can be reaped
+  * without I/O.  But we are potentially leaving up to five seconds'
+  * worth of inodes floating about which prune_icache wants us to
+  * write out.  One way to fix that would be to get prune_icache()
+  * to do a write_super() to free up some memory.  It has the desired
+  * effect.
+  */
+ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
+ {
+ 	struct ext3_iloc where;
+ 	int rc = ext3_reserve_inode_write(handle, inode, &where);
+ 
+ 	/* stop here if the reservation step reported an error */
+ 	if (rc)
+ 		return rc;
+ 	return ext3_mark_iloc_dirty(handle, inode, &where);
+ }
+ 
+ /*
+  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
+  *
+  * We're really interested in the case where a file is being extended.
+  * i_size has been changed by generic_commit_write() and we thus need
+  * to include the updated inode in the current transaction.
+  *
+  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
+  * are allocated to the file.
+  *
+  * If the inode is marked synchronous, we don't honour that here - doing
+  * so would cause a commit on atime updates, which we don't bother doing.
+  * We handle synchronous inodes at the highest possible level.
+  */
+ void ext3_dirty_inode(struct inode *inode)
+ {
+ 	/* sampled before we start our own handle below */
+ 	handle_t *outer = ext3_journal_current_handle();
+ 	handle_t *handle;
+ 
+ 	lock_kernel();
+ 	handle = ext3_journal_start(inode, 1);
+ 	if (!IS_ERR(handle)) {
+ 		if (!outer ||
+ 		    outer->h_transaction == handle->h_transaction) {
+ 			jbd_debug(5, "marking dirty.  outer handle=%p\n",
+ 					outer);
+ 			ext3_mark_inode_dirty(handle, inode);
+ 		} else {
+ 			/* This task has a transaction open against a different fs */
+ 			printk(KERN_EMERG __FUNCTION__": transactions do not match!\n");
+ 		}
+ 		ext3_journal_stop(handle, inode);
+ 	}
+ 	unlock_kernel();
+ }
+ 
+ #ifdef AKPM
+ /* 
+  * Bind an inode's backing buffer_head into this transaction, to prevent
+  * it from being flushed to disk early.  Unlike
+  * ext3_reserve_inode_write, this leaves behind no bh reference and
+  * returns no iloc structure, so the caller needs to repeat the iloc
+  * lookup to mark the inode dirty later.
+  */
+ static inline int
+ ext3_pin_inode(handle_t *handle, struct inode *inode)
+ {
+ 	struct ext3_iloc loc;
+ 	int rc = 0;
+ 	if (!handle)
+ 		goto done;
+ 	rc = ext3_get_inode_loc(inode, &loc);
+ 	if (rc)
+ 		goto done;
+ 	BUFFER_TRACE(loc.bh, "get_write_access");
+ 	rc = journal_get_write_access(handle, loc.bh);
+ 	if (rc == 0)
+ 		rc = ext3_journal_dirty_metadata(handle, loc.bh);
+ 	/* the lookup's buffer reference is dropped whether or not that worked */
+ 	brelse(loc.bh);
+ done:
+ 	ext3_std_error(inode->i_sb, rc);
+ 	return rc;
+ }
+ #endif
+ 
+ int ext3_change_inode_journal_flag(struct inode *inode, int val)
+ {
+ 	journal_t *jnl = EXT3_JOURNAL(inode);
+ 	handle_t *h;
+ 	int rc;
+ 
+ 	/*
+ 	 * Changing a data block's journaling status on the fly is
+ 	 * dangerous.  A block could be written to the journal, have its
+ 	 * status changed and then be deleted; if the old log record is
+ 	 * never revoked, a later replay can corrupt data.
+ 	 *
+ 	 * So the switch is refused outright on an aborted journal or a
+ 	 * read-only inode ...
+ 	 */
+ 	if (is_journal_aborted(jnl) || IS_RDONLY(inode))
+ 		return -EROFS;
+ 
+ 	/*
+ 	 * ... and otherwise made only after new updates have been locked
+ 	 * out and the journal has been flushed.
+ 	 */
+ 	journal_lock_updates(jnl);
+ 	journal_flush(jnl);
+ 
+ 	/*
+ 	 * Per the reasoning above, nothing is running against the journal
+ 	 * and nothing is left in it at this point, so the in-core
+ 	 * data-journaling flag can be flipped.
+ 	 */
+ 	if (val)
+ 		inode->u.ext3_i.i_flags |= EXT3_JOURNAL_DATA_FL;
+ 	else
+ 		inode->u.ext3_i.i_flags &= ~EXT3_JOURNAL_DATA_FL;
+ 
+ 	journal_unlock_updates(jnl);
+ 
+ 	/*
+ 	 * Finally mark the inode dirty under a fresh handle.
+ 	 * h_sync is set on that handle before it is stopped.
+ 	 */
+ 	h = ext3_journal_start(inode, 1);
+ 	if (IS_ERR(h))
+ 		return PTR_ERR(h);
+ 
+ 	rc = ext3_mark_inode_dirty(h, inode);
+ 	h->h_sync = 1;
+ 	ext3_journal_stop(h, inode);
+ 	ext3_std_error(inode->i_sb, rc);
+ 	return rc;
+ }
+ 
+ 
+ /*
+  * ext3_aops_journal_start().
+  *
+  * <This function died, but the comment lives on>
+  *
+  * We need to take the inode semaphore *outside* the
+  * journal_start/journal_stop.  Otherwise, a different task could do a
+  * wait_for_commit() while holding ->i_sem, which deadlocks.  The rule
+  * is: transaction open/closes are considered to be a locking operation
+  * and they nest *inside* ->i_sem.
+  * ----------------------------------------------------------------------------
+  * Possible problem:
+  *	ext3_file_write()
+  *	-> generic_file_write()
+  *	   -> __alloc_pages()
+  *	      -> page_launder()
+  *		 -> ext3_writepage()
+  *
+  * And the writepage can be on a different fs while we have a
+  * transaction open against this one!  Bad.
+  *
+  * I tried making the task PF_MEMALLOC here, but that simply results in
+  * 0-order allocation failures passed back to generic_file_write().
+  * Instead, we rely on the reentrancy protection in ext3_writepage().
+  * ----------------------------------------------------------------------------
+  * When we do the journal_start() here we don't really need to reserve
+  * any blocks - we won't need any until we hit ext3_prepare_write(),
+  * which does all the needed journal extending.  However!  There is a
+  * problem with quotas:
+  *
+  * Thread 1:
+  * sys_sync
+  * ->sync_dquots
+  *   ->commit_dquot
+  *     ->lock_dquot
+  *     ->write_dquot
+  *       ->ext3_file_write
+  *         ->journal_start
+  *         ->ext3_prepare_write
+  *           ->journal_extend
+  *           ->journal_start
+  * Thread 2:
+  * ext3_create		(for example)
+  * ->ext3_new_inode
+  *   ->dquot_initialize
+  *     ->lock_dquot
+  *
+  * Deadlock.  Thread 1's journal_start blocks because thread 2 has a
+  * transaction open.  Thread 2's transaction will never close because
+  * thread 2 is stuck waiting for the dquot lock.
+  *
+  * So.  We must ensure that thread 1 *never* needs to extend the journal
+  * for quota writes.  We do that by reserving enough journal blocks
+  * here, in ext3_aops_journal_start() to ensure that the forthcoming "see if we
+  * need to extend" test in ext3_prepare_write() succeeds.  
+  */
+ 
+ 
+ MODULE_LICENSE("GPL");
diff -rc2P linux/fs/ext3/ioctl.c linux-2.4.13/fs/ext3/ioctl.c
*** linux/fs/ext3/ioctl.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/ioctl.c	Fri Nov  9 17:03:13 2001
***************
*** 0 ****
--- 1,176 ----
+ /*
+  * linux/fs/ext3/ioctl.c
+  *
+  * Copyright (C) 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  */
+ 
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/sched.h>
+ #include <asm/uaccess.h>
+ 
+ 
+ /*
+  * ext3_ioctl(): get or set an inode's ext3 flags or generation number
+  * (plus a debug-only wait).  Unknown commands return -ENOTTY.
+  */
+ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
+ 		unsigned long arg)
+ {
+ 	unsigned int flags;
+ 
+ 	ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+ 
+ 	switch (cmd) {
+ 	case EXT3_IOC_GETFLAGS:
+ 		flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE;
+ 		return put_user(flags, (int *) arg);
+ 	case EXT3_IOC_SETFLAGS: {
+ 		handle_t *handle = NULL;
+ 		int err;
+ 		struct ext3_iloc iloc;
+ 		unsigned int oldflags;
+ 		unsigned int jflag;
+ 
+ 		if (IS_RDONLY(inode))
+ 			return -EROFS;
+ 
+ 		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ 			return -EPERM;
+ 
+ 		if (get_user(flags, (int *) arg))
+ 			return -EFAULT;
+ 
+ 		oldflags = inode->u.ext3_i.i_flags;
+ 
+ 		/* The JOURNAL_DATA flag is modifiable only by root */
+ 		jflag = flags & EXT3_JOURNAL_DATA_FL;
+ 
+ 		/*
+ 		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+ 		 * the relevant capability.
+ 		 *
+ 		 * This test looks nicer. Thanks to Pauline Middelink
+ 		 */
+ 		if ((flags ^ oldflags) & (EXT3_APPEND_FL |  EXT3_IMMUTABLE_FILE_FL | EXT3_IMMUTABLE_LINK_FL)) {
+ 			if (!capable(CAP_LINUX_IMMUTABLE))
+ 				return -EPERM;
+ 		}
+ 
+ 		/*
+ 		 * The JOURNAL_DATA flag can only be changed by
+ 		 * the relevant capability.
+ 		 */
+ 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
+ 			if (!capable(CAP_SYS_RESOURCE))
+ 				return -EPERM;
+ 		}
+ 
+ 		handle = ext3_journal_start(inode, 1);
+ 		if (IS_ERR(handle))
+ 			return PTR_ERR(handle);
+ 		if (IS_SYNC(inode))
+ 			handle->h_sync = 1;
+ 		err = ext3_reserve_inode_write(handle, inode, &iloc);
+ 		if (err)
+ 			goto flags_err;
+ 
+ 		flags = flags & EXT3_FL_USER_MODIFIABLE;
+ 		flags |= oldflags & ~EXT3_FL_USER_MODIFIABLE;
+ 		inode->u.ext3_i.i_flags = flags;
+ 
+ 		if (flags & EXT3_SYNC_FL)
+ 			inode->i_flags |= S_SYNC;
+ 		else
+ 			inode->i_flags &= ~S_SYNC;
+ 		if (flags & EXT3_APPEND_FL)
+ 			inode->i_flags |= S_APPEND;
+ 		else
+ 			inode->i_flags &= ~S_APPEND;
+ 		if (flags & EXT3_IMMUTABLE_FILE_FL)
+ 			inode->i_flags |= S_IMMUTABLE_FILE;
+ 		else
+ 			inode->i_flags &= ~S_IMMUTABLE_FILE;
+ 		if (flags & EXT3_IMMUTABLE_LINK_FL)
+ 			inode->i_flags |= S_IMMUTABLE_LINK;
+ 		else
+ 			inode->i_flags &= ~S_IMMUTABLE_LINK;
+ 		if (flags & EXT3_NOATIME_FL)
+ 			inode->i_flags |= S_NOATIME;
+ 		else
+ 			inode->i_flags &= ~S_NOATIME;
+ 		inode->i_ctime = CURRENT_TIME;
+ 
+ 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ flags_err:
+ 		ext3_journal_stop(handle, inode);
+ 		if (err)
+ 			return err;
+ 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
+ 			err = ext3_change_inode_journal_flag(inode, jflag);
+ 		return err;
+ 	}
+ 	case EXT3_IOC_GETVERSION:
+ 	case EXT3_IOC_GETVERSION_OLD:
+ 		return put_user(inode->i_generation, (int *) arg);
+ 	case EXT3_IOC_SETVERSION:
+ 	case EXT3_IOC_SETVERSION_OLD: {
+ 		handle_t *handle;
+ 		struct ext3_iloc iloc;
+ 		__u32 generation;
+ 		int err;
+ 
+ 		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
+ 			return -EPERM;
+ 		if (IS_RDONLY(inode))
+ 			return -EROFS;
+ 		if (get_user(generation, (int *) arg))
+ 			return -EFAULT;
+ 
+ 		handle = ext3_journal_start(inode, 1);
+ 		if (IS_ERR(handle))
+ 			return PTR_ERR(handle);
+ 		err = ext3_reserve_inode_write(handle, inode, &iloc);
+ 		if (!err) {
+ 			inode->i_ctime = CURRENT_TIME;
+ 			inode->i_generation = generation;
+ 			err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ 		}
+ 		/* The handle must be stopped on the error path too, or it
+ 		 * would stay open after we return. */
+ 		ext3_journal_stop(handle, inode);
+ 		return err;
+ 	}
+ #ifdef CONFIG_JBD_DEBUG
+ 	case EXT3_IOC_WAIT_FOR_READONLY:
+ 		/*
+ 		 * This is racy - by the time we're woken up and running,
+ 		 * the superblock could be released.  And the module could
+ 		 * have been unloaded.  So sue me.
+ 		 * Returns 1 if it slept, else zero.
+ 		 */
+ 		{
+ 			struct super_block *sb = inode->i_sb;
+ 			DECLARE_WAITQUEUE(wait, current);
+ 			int ret = 0;
+ 
+ 			set_current_state(TASK_INTERRUPTIBLE);
+ 			add_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
+ 			if (timer_pending(&sb->u.ext3_sb.turn_ro_timer)) {
+ 				schedule();
+ 				ret = 1;
+ 			}
+ 			set_current_state(TASK_RUNNING); /* we may not have slept */
+ 			remove_wait_queue(&sb->u.ext3_sb.ro_wait_queue, &wait);
+ 			return ret;
+ 		}
+ #endif
+ 	default:
+ 		return -ENOTTY;
+ 	}
+ }
diff -rc2P linux/fs/ext3/namei.c linux-2.4.13/fs/ext3/namei.c
*** linux/fs/ext3/namei.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/namei.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,1125 ----
+ /*
+  *  linux/fs/ext3/namei.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/namei.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  *  Directory entry file type support and forward compatibility hooks
+  *  	for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
+  */
+ 
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+ 
+ 
+ /*
+  * How far ahead ext3_find_entry() reads a directory: NAMEI_RA_SIZE blocks.
+  */
+ #define NAMEI_RA_CHUNKS  2
+ #define NAMEI_RA_BLOCKS  4
+ #define NAMEI_RA_SIZE        (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b)  (((c) * NAMEI_RA_BLOCKS) + (b))
+ 
+ /*
+  * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+  *
+  * `len <= EXT3_NAME_LEN' is guaranteed by caller.
+  * `de != NULL' is guaranteed by caller.
+  */
+ static inline int ext3_match (int len, const char * const name,
+ 			      struct ext3_dir_entry_2 * de)
+ {
+ 	/* a different length or an unused slot (inode 0) never matches */
+ 	if (len != de->name_len || !de->inode)
+ 		return 0;
+ 
+ 	/* same length, live entry: compare the name bytes */
+ 	return memcmp(name, de->name, len) == 0;
+ }
+ 
+ /*
+  * Returns 0 if not found, -1 on failure, and 1 on success
+  */
+ static int inline search_dirblock(struct buffer_head * bh,
+ 				  struct inode *dir,
+ 				  struct dentry *dentry,
+ 				  unsigned long offset,
+ 				  struct ext3_dir_entry_2 ** res_dir)
+ {
+ 	const char *name = dentry->d_name.name;
+ 	int namelen = dentry->d_name.len;
+ 	char *pos = bh->b_data;
+ 	char *end = pos + dir->i_sb->s_blocksize;
+ 	struct ext3_dir_entry_2 *ent;
+ 	int step;
+ 
+ 	/* walk the entries; pos and offset advance together by rec_len */
+ 	for (; pos < end; pos += step, offset += step) {
+ 		ent = (struct ext3_dir_entry_2 *) pos;
+ 
+ 		/* this code is executed quadratically often */
+ 		/* do minimal checking `by hand' */
+ 		if (pos + namelen <= end &&
+ 		    ext3_match (namelen, name, ent)) {
+ 			/* found a match - just to be sure, do a full check */
+ 			if (!ext3_check_dir_entry("ext3_find_entry",
+ 						  dir, ent, bh, offset))
+ 				return -1;
+ 			*res_dir = ent;
+ 			return 1;
+ 		}
+ 
+ 		/* prevent looping on a bad block */
+ 		step = le16_to_cpu(ent->rec_len);
+ 		if (step <= 0)
+ 			return -1;
+ 	}
+ 	return 0;
+ }
+ 
+ /*
+  *	ext3_find_entry()
+  *
+  * finds an entry in the specified directory with the wanted name. It
+  * returns the cache buffer in which the entry was found, and the entry
+  * itself (as a parameter - res_dir). It does NOT read the inode of the
+  * entry - you'll have to do that yourself if you want to.
+  *
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
+  * to brelse() it when appropriate.
+  */
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+ 					struct ext3_dir_entry_2 ** res_dir)
+ {
+ 	struct super_block * sb;
+ 	struct buffer_head * bh_use[NAMEI_RA_SIZE];
+ 	struct buffer_head * bh, *ret = NULL;
+ 	unsigned long start, block, b;
+ 	int ra_max = 0;		/* Number of bh's in the readahead
+ 				   buffer, bh_use[] */
+ 	int ra_ptr = 0;		/* Current index into readahead
+ 				   buffer */
+ 	int num = 0;		/* blocks requested from ext3_getblk so far */
+ 	int nblocks, i, err;
+ 	struct inode *dir = dentry->d_parent->d_inode;
+ 
+ 	*res_dir = NULL;
+ 	sb = dir->i_sb;
+ 
+ 	nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+ 	start = dir->u.ext3_i.i_dir_start_lookup; /* block of the last hit */
+ 	if (start >= nblocks)
+ 		start = 0;
+ 	block = start;
+ restart:
+ 	do {
+ 		/*
+ 		 * We deal with the read-ahead logic here.
+ 		 */
+ 		if (ra_ptr >= ra_max) {
+ 			/* Refill the readahead buffer */
+ 			ra_ptr = 0;
+ 			b = block;
+ 			for (ra_max = 0; ra_max < NAMEI_RA_SIZE; ra_max++) {
+ 				/*
+ 				 * Terminate if we reach the end of the
+ 				 * directory and must wrap, or if our
+ 				 * search has finished at this block.
+ 				 */
+ 				if (b >= nblocks || (num && block == start)) {
+ 					bh_use[ra_max] = NULL;
+ 					break;
+ 				}
+ 				num++;
+ 				bh = ext3_getblk(NULL, dir, b++, 0, &err);
+ 				bh_use[ra_max] = bh;
+ 				if (bh)
+ 					ll_rw_block(READ, 1, &bh);
+ 			}
+ 		}
+ 		/* NULL slot: ext3_getblk gave no buffer, or the end marker */
+ 		if ((bh = bh_use[ra_ptr++]) == NULL)
+ 			goto next;
+ 		wait_on_buffer(bh);
+ 		if (!buffer_uptodate(bh)) {
+ 			/* read error, skip block & hope for the best */
+ 			brelse(bh);
+ 			goto next;
+ 		}
+ 		i = search_dirblock(bh, dir, dentry,
+ 			    block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+ 		if (i == 1) {	/* found: bh is returned still referenced */
+ 			dir->u.ext3_i.i_dir_start_lookup = block;
+ 			ret = bh;
+ 			goto cleanup_and_exit;
+ 		} else {
+ 			brelse(bh);
+ 			if (i < 0)	/* bad entry in this block: give up */
+ 				goto cleanup_and_exit;
+ 		}
+ 	next:
+ 		if (++block >= nblocks)	/* wrap around to the first block */
+ 			block = 0;
+ 	} while (block != start);
+ 
+ 	/*
+ 	 * If the directory has grown while we were searching, then
+ 	 * search the last part of the directory before giving up.
+ 	 */
+ 	block = nblocks;
+ 	nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+ 	if (block < nblocks) {
+ 		start = 0;
+ 		goto restart;
+ 	}
+ 		
+ cleanup_and_exit:
+ 	/* Clean up the read-ahead blocks */
+ 	for (; ra_ptr < ra_max; ra_ptr++)
+ 		brelse (bh_use[ra_ptr]);
+ 	return ret;
+ }
+ 
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+ 	struct inode *found = NULL;
+ 	struct ext3_dir_entry_2 *entry;
+ 	struct buffer_head *dirbuf;
+ 	unsigned long ino;
+ 
+ 	if (dentry->d_name.len > EXT3_NAME_LEN)
+ 		return ERR_PTR(-ENAMETOOLONG);
+ 	dirbuf = ext3_find_entry(dentry, &entry);
+ 	if (dirbuf != NULL) {
+ 		/* copy the inode number out before releasing the buffer */
+ 		ino = le32_to_cpu(entry->inode);
+ 		brelse (dirbuf);
+ 		found = iget(dir->i_sb, ino);
+ 		if (found == NULL)
+ 			return ERR_PTR(-EACCES);
+ 	}
+ 	/* when the name was not found the dentry is added with no inode */
+ 	d_add(dentry, found);
+ 	return NULL;
+ }
+ 
+ #define S_SHIFT 12	/* table index = (mode & S_IFMT) >> S_SHIFT */
+ static unsigned char ext3_type_by_mode[S_IFMT >> S_SHIFT] = {
+ 	[S_IFIFO >> S_SHIFT]	= EXT3_FT_FIFO,
+ 	[S_IFCHR >> S_SHIFT]	= EXT3_FT_CHRDEV,
+ 	[S_IFDIR >> S_SHIFT]	= EXT3_FT_DIR,
+ 	[S_IFBLK >> S_SHIFT]	= EXT3_FT_BLKDEV,
+ 	[S_IFREG >> S_SHIFT]	= EXT3_FT_REG_FILE,
+ 	[S_IFLNK >> S_SHIFT]	= EXT3_FT_SYMLINK,
+ 	[S_IFSOCK >> S_SHIFT]	= EXT3_FT_SOCK,
+ };
+ /* Fill in de's type byte, but only when the FILETYPE feature is on. */
+ static inline void ext3_set_de_type(struct super_block *sb,
+ 		struct ext3_dir_entry_2 *de, umode_t mode)
+ {
+ 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE))
+ 		de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
+ 
+ /*
+  *	ext3_add_entry()
+  *
+  * adds a file entry to the specified directory, using the same
+  * semantics as ext3_find_entry(). It returns 0 or a negative errno.
+  *
+  * NOTE!! The inode part of 'de' is left at 0 - which means you
+  * may not sleep between calling this and putting something into
+  * the entry, as someone else might have used it while you slept.
+  */
+ 
+ /*
+  * AKPM: the journalling code here looks wrong on the error paths
+  */
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+ 	struct inode *inode)
+ {
+ 	struct inode *dir = dentry->d_parent->d_inode;
+ 	const char *name = dentry->d_name.name;
+ 	int namelen = dentry->d_name.len;
+ 	unsigned long offset;
+ 	unsigned short rec_len;
+ 	struct buffer_head * bh;
+ 	struct ext3_dir_entry_2 * de, * de1;
+ 	struct super_block * sb;
+ 	int	retval;
+ 
+ 	sb = dir->i_sb;
+ 
+ 	if (!namelen)
+ 		return -EINVAL;
+ 	bh = ext3_bread (handle, dir, 0, 0, &retval);
+ 	if (!bh)
+ 		return retval;
+ 	rec_len = EXT3_DIR_REC_LEN(namelen);	/* space the new entry needs */
+ 	offset = 0;
+ 	de = (struct ext3_dir_entry_2 *) bh->b_data;
+ 	while (1) {
+ 		if ((char *)de >= sb->s_blocksize + bh->b_data) {
+ 			brelse (bh);
+ 			bh = NULL;
+ 			bh = ext3_bread (handle, dir,
+ 				offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+ 			if (!bh)
+ 				return retval;
+ 			if (dir->i_size <= offset) {	/* past the last block */
+ 				if (dir->i_size == 0) {
+ 					brelse(bh);
+ 					return -ENOENT;
+ 				}
+ 
+ 				ext3_debug ("creating next block\n");
+ 
+ 				BUFFER_TRACE(bh, "get_write_access");
+ 				ext3_journal_get_write_access(handle, bh);
+ 				de = (struct ext3_dir_entry_2 *) bh->b_data;
+ 				de->inode = 0;
+ 				de->rec_len = cpu_to_le16(sb->s_blocksize);
+ 				dir->u.ext3_i.i_disksize =
+ 					dir->i_size = offset + sb->s_blocksize;
+ 				dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+ 				ext3_mark_inode_dirty(handle, dir);
+ 			} else {
+ 
+ 				ext3_debug ("skipping to next block\n");
+ 
+ 				de = (struct ext3_dir_entry_2 *) bh->b_data;
+ 			}
+ 		}
+ 		if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+ 					   offset)) {
+ 			brelse (bh);
+ 			return -ENOENT;
+ 		}
+ 		if (ext3_match (namelen, name, de)) {
+ 			brelse (bh);
+ 			return -EEXIST;
+ 		}
+ 		if ((le32_to_cpu(de->inode) == 0 &&
+ 				le16_to_cpu(de->rec_len) >= rec_len) ||
+ 		    (le16_to_cpu(de->rec_len) >=
+ 				EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+ 			BUFFER_TRACE(bh, "get_write_access");
+ 			ext3_journal_get_write_access(handle, bh);
+ 			/* By now the buffer is marked for journaling */
+ 			offset += le16_to_cpu(de->rec_len);
+ 			if (le32_to_cpu(de->inode)) {	/* in use: split off its tail */
+ 				de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+ 					EXT3_DIR_REC_LEN(de->name_len));
+ 				de1->rec_len =
+ 					cpu_to_le16(le16_to_cpu(de->rec_len) -
+ 					EXT3_DIR_REC_LEN(de->name_len));
+ 				de->rec_len = cpu_to_le16(
+ 						EXT3_DIR_REC_LEN(de->name_len));
+ 				de = de1;
+ 			}
+ 			de->file_type = EXT3_FT_UNKNOWN;
+ 			if (inode) {
+ 				de->inode = cpu_to_le32(inode->i_ino);
+ 				ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+ 			} else
+ 				de->inode = 0;
+ 			de->name_len = namelen;
+ 			memcpy (de->name, name, namelen);
+ 			/*
+ 			 * XXX shouldn't update any times until successful
+ 			 * completion of syscall, but too many callers depend
+ 			 * on this.
+ 			 *
+ 			 * XXX similarly, too many callers depend on
+ 			 * ext3_new_inode() setting the times, but error
+ 			 * recovery deletes the inode, so the worst that can
+ 			 * happen is that the times are slightly out of date
+ 			 * and/or different from the directory change time.
+ 			 */
+ 			dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+ 			dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+ 			ext3_mark_inode_dirty(handle, dir);
+ 			dir->i_version = ++event;
+ 			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 			ext3_journal_dirty_metadata(handle, bh);
+ 			brelse(bh);
+ 			return 0;
+ 		}
+ 		offset += le16_to_cpu(de->rec_len);
+ 		de = (struct ext3_dir_entry_2 *)
+ 			((char *) de + le16_to_cpu(de->rec_len));
+ 	}
+ 	brelse (bh);	/* not reached: the loop above only exits by return */
+ 	return -ENOSPC;
+ }
+ 
+ /*
+  * ext3_delete_entry deletes a directory entry by merging it with the
+  * previous entry
+  */
+ static int ext3_delete_entry (handle_t *handle, 
+ 			      struct inode * dir,
+ 			      struct ext3_dir_entry_2 * de_del,
+ 			      struct buffer_head * bh)
+ {
+ 	struct ext3_dir_entry_2 *cur, *prev = NULL;
+ 	int pos = 0;
+ 
+ 	cur = (struct ext3_dir_entry_2 *) bh->b_data;
+ 	/* locate de_del, remembering the entry just before it */
+ 	for (;;) {
+ 		if (pos >= bh->b_size)
+ 			return -ENOENT;
+ 		if (!ext3_check_dir_entry("ext3_delete_entry", dir, cur, bh, pos))
+ 			return -EIO;
+ 		if (cur == de_del)
+ 			break;
+ 		pos += le16_to_cpu(cur->rec_len);
+ 		prev = cur;
+ 		cur = (struct ext3_dir_entry_2 *)
+ 			((char *) cur + le16_to_cpu(cur->rec_len));
+ 	}
+ 
+ 	BUFFER_TRACE(bh, "get_write_access");
+ 	ext3_journal_get_write_access(handle, bh);
+ 	if (prev)	/* fold the deleted entry's space into its predecessor */
+ 		prev->rec_len = cpu_to_le16(le16_to_cpu(prev->rec_len) +
+ 					    le16_to_cpu(cur->rec_len));
+ 	else		/* first entry in the block: just mark it unused */
+ 		cur->inode = 0;
+ 	dir->i_version = ++event;
+ 	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+ 	ext3_journal_dirty_metadata(handle, bh);
+ 	return 0;
+ }
+ 
+ /*
+  * ext3_mark_inode_dirty is somewhat expensive, so unlike ext2 we
+  * do not perform it in these functions.  We perform it at the call site,
+  * if it is needed.
+  */
+ static inline void ext3_inc_count(handle_t *handle, struct inode *inode)
+ {
+ 	++inode->i_nlink;	/* in-core only; the call site dirties the inode */
+ }
+ 
+ static inline void ext3_dec_count(handle_t *handle, struct inode *inode)
+ {
+ 	--inode->i_nlink;	/* in-core only; the call site dirties the inode */
+ }
+ 
+ static int ext3_add_nondir(handle_t *handle,
+ 		struct dentry *dentry, struct inode *inode)
+ {
+ 	int rc = ext3_add_entry(handle, dentry, inode);
+ 	if (rc) {	/* no entry added: undo the link count, drop the inode */
+ 		ext3_dec_count(handle, inode);
+ 		iput(inode);
+ 	} else {
+ 		d_instantiate(dentry, inode);
+ 	}
+ 	return rc;
+ }
+ 
+ /*
+  * By the time this is called, we already have created
+  * the directory cache entry for the new file, but it
+  * is so far negative - it has no inode.
+  *
+  * If the create succeeds, we fill in the inode information
+  * with d_instantiate(). 
+  */
+ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode)
+ {
+ 	handle_t *h;
+ 	struct inode *child;
+ 	int rc;
+ 
+ 	h = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+ 	if (IS_ERR(h))
+ 		return PTR_ERR(h);
+ 	if (IS_SYNC(dir))
+ 		h->h_sync = 1;
+ 	child = ext3_new_inode (h, dir, mode);
+ 	if (IS_ERR(child)) {
+ 		rc = PTR_ERR(child);
+ 		goto stop;
+ 	}
+ 	child->i_op = &ext3_file_inode_operations;
+ 	child->i_fop = &ext3_file_operations;
+ 	child->i_mapping->a_ops = &ext3_aops;
+ 	ext3_mark_inode_dirty(h, child);
+ 	rc = ext3_add_nondir(h, dentry, child);
+ stop:
+ 	ext3_journal_stop(h, dir);
+ 	return rc;
+ }
+ 
+ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
+ 			int mode, int rdev)
+ {
+ 	handle_t *h;
+ 	struct inode *node;
+ 	int rc;
+ 
+ 	h = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+ 	if (IS_ERR(h))
+ 		return PTR_ERR(h);
+ 	if (IS_SYNC(dir))
+ 		h->h_sync = 1;
+ 	node = ext3_new_inode (h, dir, mode);
+ 	if (IS_ERR(node)) {
+ 		rc = PTR_ERR(node);
+ 		goto stop;
+ 	}
+ 	init_special_inode(node, mode, rdev);
+ 	ext3_mark_inode_dirty(h, node);
+ 	rc = ext3_add_nondir(h, dentry, node);
+ stop:
+ 	ext3_journal_stop(h, dir);
+ 	return rc;
+ }
+ 
+ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
+ {
+ 	handle_t *handle;
+ 	struct inode * inode;
+ 	struct buffer_head * dir_block;
+ 	struct ext3_dir_entry_2 * de;
+ 	int err;
+ 
+ 	if (dir->i_nlink >= EXT3_LINK_MAX)
+ 		return -EMLINK;
+ 
+ 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+ 	if (IS_ERR(handle))
+ 		return PTR_ERR(handle);
+ 
+ 	if (IS_SYNC(dir))
+ 		handle->h_sync = 1;
+ 
+ 	inode = ext3_new_inode (handle, dir, S_IFDIR);
+ 	err = PTR_ERR(inode);
+ 	if (IS_ERR(inode))
+ 		goto out_stop;
+ 
+ 	inode->i_op = &ext3_dir_inode_operations;
+ 	inode->i_fop = &ext3_dir_operations;
+ 	/* the new directory starts out exactly one block long */
+ 	inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
+ 	inode->i_blocks = 0;	
+ 	dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ 	if (!dir_block) {
+ 		inode->i_nlink--; /* is this nlink == 0? */
+ 		ext3_mark_inode_dirty(handle, inode);
+ 		iput (inode);
+ 		goto out_stop;
+ 	}
+ 	BUFFER_TRACE(dir_block, "get_write_access");
+ 	ext3_journal_get_write_access(handle, dir_block);
+ 	/* first entry: "." pointing at the new directory itself */
+ 	de = (struct ext3_dir_entry_2 *) dir_block->b_data;
+ 	de->inode = cpu_to_le32(inode->i_ino);
+ 	de->name_len = 1;
+ 	de->rec_len = cpu_to_le16(EXT3_DIR_REC_LEN(de->name_len));
+ 	strcpy (de->name, ".");
+ 	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
+ 	/* second entry: ".." pointing at the parent, spanning the rest of the block */
+ 	de = (struct ext3_dir_entry_2 *)
+ 			((char *) de + le16_to_cpu(de->rec_len));
+ 	de->inode = cpu_to_le32(dir->i_ino);
+ 	de->rec_len = cpu_to_le16(inode->i_sb->s_blocksize-EXT3_DIR_REC_LEN(1));
+ 	de->name_len = 2;
+ 	strcpy (de->name, "..");
+ 	ext3_set_de_type(dir->i_sb, de, S_IFDIR);
+ 	inode->i_nlink = 2;
+ 	BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+ 	ext3_journal_dirty_metadata(handle, dir_block);
+ 	brelse (dir_block);
+ 	inode->i_mode = S_IFDIR | mode;
+ 	if (dir->i_mode & S_ISGID)
+ 		inode->i_mode |= S_ISGID;
+ 	ext3_mark_inode_dirty(handle, inode);
+ 	err = ext3_add_entry (handle, dentry, inode);
+ 	if (err)
+ 		goto out_no_entry;
+ 	dir->i_nlink++;
+ 	dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+ 	ext3_mark_inode_dirty(handle, dir);
+ 	d_instantiate(dentry, inode);
+ out_stop:
+ 	ext3_journal_stop(handle, dir);
+ 	return err;
+ out_no_entry:	/* no entry was added in the parent: zero nlink, drop inode */
+ 	inode->i_nlink = 0;
+ 	ext3_mark_inode_dirty(handle, inode);
+ 	iput (inode);
+ 	goto out_stop;
+ }
+ 
+ /*
+  * Check that the specified directory is empty (for rmdir and rename).
+  * Returns 0 if an in-use entry follows "." and "..", and 1 otherwise. */
+ static int empty_dir (struct inode * inode)
+ {
+ 	unsigned long offset;
+ 	struct buffer_head * bh;
+ 	struct ext3_dir_entry_2 * de, * de1;
+ 	struct super_block * sb;
+ 	int err;
+ 
+ 	sb = inode->i_sb;
+ 	if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) ||
+ 	    !(bh = ext3_bread (NULL, inode, 0, 0, &err))) {
+ 	    	ext3_warning (inode->i_sb, "empty_dir",
+ 			      "bad directory (dir #%lu) - no data block",
+ 			      inode->i_ino);
+ 		return 1;	/* a damaged directory is reported as empty */
+ 	}
+ 	de = (struct ext3_dir_entry_2 *) bh->b_data;	/* expected: "." */
+ 	de1 = (struct ext3_dir_entry_2 *)
+ 			((char *) de + le16_to_cpu(de->rec_len));
+ 	if (le32_to_cpu(de->inode) != inode->i_ino ||
+ 			!le32_to_cpu(de1->inode) || 
+ 			strcmp (".", de->name) ||
+ 			strcmp ("..", de1->name)) {
+ 	    	ext3_warning (inode->i_sb, "empty_dir",
+ 			      "bad directory (dir #%lu) - no `.' or `..'",
+ 			      inode->i_ino);
+ 		brelse (bh);
+ 		return 1;
+ 	}
+ 	offset = le16_to_cpu(de->rec_len) + le16_to_cpu(de1->rec_len);
+ 	de = (struct ext3_dir_entry_2 *)
+ 			((char *) de1 + le16_to_cpu(de1->rec_len));
+ 	while (offset < inode->i_size ) {
+ 		if (!bh ||	/* previous block was a hole, or de ran off bh */
+ 			(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ 			brelse (bh);
+ 			bh = ext3_bread (NULL, inode,
+ 				offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err);
+ 			if (!bh) {
+ #if 0
+ 				ext3_error (sb, "empty_dir",
+ 				"directory #%lu contains a hole at offset %lu",
+ 					inode->i_ino, offset);
+ #endif
+ 				offset += sb->s_blocksize;	/* skip it */
+ 				continue;
+ 			}
+ 			de = (struct ext3_dir_entry_2 *) bh->b_data;
+ 		}
+ 		if (!ext3_check_dir_entry ("empty_dir", inode, de, bh,
+ 					   offset)) {
+ 			brelse (bh);
+ 			return 1;
+ 		}
+ 		if (le32_to_cpu(de->inode)) {	/* an entry that is in use */
+ 			brelse (bh);
+ 			return 0;
+ 		}
+ 		offset += le16_to_cpu(de->rec_len);
+ 		de = (struct ext3_dir_entry_2 *)
+ 				((char *) de + le16_to_cpu(de->rec_len));
+ 	}
+ 	brelse (bh);
+ 	return 1;
+ }
+ 
+ /* ext3_orphan_add() links an unlinked or truncated inode into a list of
+  * such inodes, starting at the superblock, in case we crash before the
+  * file is closed/deleted, or in case the inode truncate spans multiple
+  * transactions and the last transaction is not recovered after a crash.
+  *
+  * At filesystem recovery time, we walk this list deleting unlinked
+  * inodes and truncating linked inodes in ext3_orphan_cleanup().
+  */
+ int ext3_orphan_add(handle_t *handle, struct inode *inode)
+ {
+ 	struct super_block *sb = inode->i_sb;
+ 	struct ext3_iloc iloc;
+ 	int err = 0, rc;
+ 	/* The superblock lock is held across the whole list update. */
+ 	lock_super(sb);
+ 	if (!list_empty(&inode->u.ext3_i.i_orphan))
+ 		goto out_unlock;	/* already on the in-memory list */
+ 
+ 	/* Orphan handling is only valid for files with data blocks
+ 	 * being truncated, or files being unlinked. */
+ 
+ 	/* @@@ FIXME: Observation from aviro:
+ 	 * I think I can trigger J_ASSERT in ext3_orphan_add().  We block 
+ 	 * here (on lock_super()), so race with ext3_link() which might bump
+ 	 * ->i_nlink. For, say it, character device. Not a regular file,
+ 	 * not a directory, not a symlink and ->i_nlink > 0.
+ 	 */
+ 	J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ 		S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
+ 
+ 	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "get_write_access");
+ 	err = ext3_journal_get_write_access(handle, sb->u.ext3_sb.s_sbh);
+ 	if (err)
+ 		goto out_unlock;
+ 	
+ 	err = ext3_reserve_inode_write(handle, inode, &iloc);
+ 	if (err)
+ 		goto out_unlock;
+ 
+ 	/* Insert this inode at the head of the on-disk orphan list... */
+ 	NEXT_ORPHAN(inode) = le32_to_cpu(EXT3_SB(sb)->s_es->s_last_orphan);
+ 	EXT3_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ 	err = ext3_journal_dirty_metadata(handle, sb->u.ext3_sb.s_sbh);
+ 	rc = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ 	if (!err)	/* report the first of the two failures, if any */
+ 		err = rc;
+ 
+ 	/* Only add to the head of the in-memory list if all the
+ 	 * previous operations succeeded.  If the orphan_add is going to
+ 	 * fail (possibly taking the journal offline), we can't risk
+ 	 * leaving the inode on the orphan list: stray orphan-list
+ 	 * entries can cause panics at unmount time.
+ 	 *
+ 	 * This is safe: on error we're going to ignore the orphan list
+ 	 * anyway on the next recovery. */
+ 	if (!err)
+ 		list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
+ 
+ 	jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
+ 	jbd_debug(4, "orphan inode %ld will point to %d\n",
+ 			inode->i_ino, NEXT_ORPHAN(inode));
+ out_unlock:
+ 	unlock_super(sb);
+ 	ext3_std_error(inode->i_sb, err);
+ 	return err;
+ }
+ 
+ /*
+  * ext3_orphan_del() removes an unlinked or truncated inode from the orphan
+  * list: always in memory, and on disk too when @handle is non-NULL.
+  * Returns 0, or the error from the first journalling call that failed. */
+ int ext3_orphan_del(handle_t *handle, struct inode *inode)
+ {
+ 	struct list_head *prev;
+ 	struct ext3_sb_info *sbi;
+ 	ino_t ino_next;
+ 	struct ext3_iloc iloc;
+ 	int err = 0;
+ 
+ 	lock_super(inode->i_sb);
+ 	if (list_empty(&inode->u.ext3_i.i_orphan)) {
+ 		unlock_super(inode->i_sb);
+ 		return 0;
+ 	}
+ 
+ 	ino_next = NEXT_ORPHAN(inode);
+ 	prev = inode->u.ext3_i.i_orphan.prev;
+ 	sbi = EXT3_SB(inode->i_sb);
+ 
+ 	jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+ 
+ 	list_del(&inode->u.ext3_i.i_orphan);
+ 	INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+ 
+ 	/* If we're on an error path, we may not have a valid
+ 	 * transaction handle with which to update the orphan list on
+ 	 * disk, but we still need to remove the inode from the linked
+ 	 * list in memory. */
+ 	if (!handle)
+ 		goto out;
+ 
+ 	err = ext3_reserve_inode_write(handle, inode, &iloc);
+ 	if (err)
+ 		goto out_err;
+ 
+ 	if (prev == &sbi->s_orphan) {
+ 		/* inode was the list head: the superblock points past it */
+ 		jbd_debug(4, "superblock will point to %ld\n", ino_next);
+ 		BUFFER_TRACE(sbi->s_sbh, "get_write_access");
+ 		err = ext3_journal_get_write_access(handle, sbi->s_sbh);
+ 		if (err)
+ 			goto out_brelse;
+ 		sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ 		err = ext3_journal_dirty_metadata(handle, sbi->s_sbh);
+ 	} else {
+ 		struct ext3_iloc iloc2;
+ 		struct inode *i_prev =
+ 			list_entry(prev, struct inode, u.ext3_i.i_orphan);
+ 		/* otherwise the previous orphan inherits our successor */
+ 		jbd_debug(4, "orphan inode %ld will point to %ld\n",
+ 			  i_prev->i_ino, ino_next);
+ 		err = ext3_reserve_inode_write(handle, i_prev, &iloc2);
+ 		if (err)
+ 			goto out_brelse;
+ 		NEXT_ORPHAN(i_prev) = ino_next;
+ 		err = ext3_mark_iloc_dirty(handle, i_prev, &iloc2);
+ 	}
+ 	if (err)
+ 		goto out_brelse;
+ 	NEXT_ORPHAN(inode) = 0;
+ 	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
+ 	/* Not out_brelse: ext3_mark_iloc_dirty() has consumed iloc.bh (no
+ 	 * caller here releases it afterwards); a second brelse is a bug. */
+ 
+ out_err:
+ 	ext3_std_error(inode->i_sb, err);
+ out:
+ 	unlock_super(inode->i_sb);
+ 	return err;
+ 
+ out_brelse:	/* iloc.bh is still held from ext3_reserve_inode_write() */
+ 	brelse(iloc.bh);
+ 	goto out_err;
+ }
+ 
+ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
+ {
+ 	int retval;
+ 	struct inode * inode;
+ 	struct buffer_head * bh;
+ 	struct ext3_dir_entry_2 * de;
+ 	handle_t *handle;
+ 	/* Remove the empty directory named by dentry from dir; 0 or -errno. */
+ 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+ 	if (IS_ERR(handle))
+ 		return PTR_ERR(handle);
+ 
+ 	retval = -ENOENT;
+ 	bh = ext3_find_entry (dentry, &de);
+ 	if (!bh)
+ 		goto end_rmdir;
+ 
+ 	if (IS_SYNC(dir))
+ 		handle->h_sync = 1;
+ 
+ 	inode = dentry->d_inode;
+ 	DQUOT_INIT(inode);
+ 
+ 	retval = -EIO;
+ 	if (le32_to_cpu(de->inode) != inode->i_ino)
+ 		goto end_rmdir;
+ 
+ 	retval = -ENOTEMPTY;
+ 	if (!empty_dir (inode))
+ 		goto end_rmdir;
+ 
+ 	retval = ext3_delete_entry(handle, dir, de, bh);
+ 	if (retval)
+ 		goto end_rmdir;
+ 	if (inode->i_nlink != 2)
+ 		ext3_warning (inode->i_sb, "ext3_rmdir",
+ 			      "empty directory has nlink!=2 (%d)",
+ 			      inode->i_nlink);
+ 	inode->i_version = ++event;
+ 	inode->i_nlink = 0;
+ 	/* There's no need to set i_disksize: the fact that i_nlink is
+ 	 * zero will ensure that the right thing happens during any
+ 	 * recovery. */
+ 	inode->i_size = 0;
+ 	ext3_orphan_add(handle, inode);
+ 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ 	ext3_mark_inode_dirty(handle, inode);	/* only once i_ctime is final */
+ 	dir->i_nlink--;
+ 	dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+ 	ext3_mark_inode_dirty(handle, dir);
+ 
+ end_rmdir:
+ 	ext3_journal_stop(handle, dir);
+ 	brelse (bh);
+ 	return retval;
+ }
+ 
+ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
+ {
+ 	int retval;
+ 	struct inode * inode;
+ 	struct buffer_head * bh;
+ 	struct ext3_dir_entry_2 * de;
+ 	handle_t *handle;
+ 	/* Remove dentry's name from dir and drop one link; 0 or -errno. */
+ 	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+ 	if (IS_ERR(handle))
+ 		return PTR_ERR(handle);
+ 
+ 	if (IS_SYNC(dir))
+ 		handle->h_sync = 1;
+ 
+ 	retval = -ENOENT;
+ 	bh = ext3_find_entry (dentry, &de);
+ 	if (!bh)
+ 		goto end_unlink;
+ 
+ 	inode = dentry->d_inode;
+ 	DQUOT_INIT(inode);
+ 
+ 	retval = -EIO;
+ 	if (le32_to_cpu(de->inode) != inode->i_ino)
+ 		goto end_unlink;
+ 
+ 	if (!inode->i_nlink) {
+ 		ext3_warning (inode->i_sb, "ext3_unlink",
+ 			      "Deleting nonexistent file (%lu), %d",
+ 			      inode->i_ino, inode->i_nlink);
+ 		inode->i_nlink = 1;
+ 	}
+ 	retval = ext3_delete_entry(handle, dir, de, bh);
+ 	if (retval)
+ 		goto end_unlink;
+ 	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ 	dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+ 	ext3_mark_inode_dirty(handle, dir);
+ 	inode->i_nlink--;
+ 	if (!inode->i_nlink)	/* last link gone: track it as an orphan */
+ 		ext3_orphan_add(handle, inode);
+ 	inode->i_ctime = dir->i_ctime;	/* set before the inode is dirtied */
+ 	ext3_mark_inode_dirty(handle, inode);
+ 	retval = 0;
+ 
+ end_unlink:
+ 	ext3_journal_stop(handle, dir);
+ 	brelse (bh);
+ 	return retval;
+ }
+ 
+ static int ext3_symlink (struct inode * dir,
+ 		struct dentry *dentry, const char * symname)
+ {
+ 	handle_t *handle;
+ 	struct inode * inode;
+ 	int l, err;
+ 
+ 	l = strlen(symname)+1;	/* target length including the NUL */
+ 	if (l > dir->i_sb->s_blocksize)
+ 		return -ENAMETOOLONG;
+ 
+ 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
+ 	if (IS_ERR(handle))
+ 		return PTR_ERR(handle);
+ 
+ 	if (IS_SYNC(dir))
+ 		handle->h_sync = 1;
+ 
+ 	inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
+ 	err = PTR_ERR(inode);
+ 	if (IS_ERR(inode))
+ 		goto out_stop;
+ 	/* Targets too long for i_data use block_symlink(); others are inline. */
+ 	if (l > sizeof (inode->u.ext3_i.i_data)) {
+ 		inode->i_op = &page_symlink_inode_operations;
+ 		inode->i_mapping->a_ops = &ext3_aops;
+ 		/*
+ 		 * block_symlink() calls back into ext3_prepare/commit_write.
+ 		 * We have a transaction open.  All is sweetness.  It also sets
+ 		 * i_size in generic_commit_write().
+ 		 */
+ 		err = block_symlink(inode, symname, l);
+ 		if (err)
+ 			goto out_no_entry;
+ 	} else {
+ 		inode->i_op = &ext3_fast_symlink_inode_operations;
+ 		memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
+ 		inode->i_size = l-1;	/* i_size excludes the NUL */
+ 	}
+ 	inode->u.ext3_i.i_disksize = inode->i_size;
+ 	ext3_mark_inode_dirty(handle, inode);
+ 	err = ext3_add_nondir(handle, dentry, inode);
+ out_stop:
+ 	ext3_journal_stop(handle, dir);
+ 	return err;
+ 
+ out_no_entry:	/* writing the target failed: drop the new inode again */
+ 	ext3_dec_count(handle, inode);
+ 	ext3_mark_inode_dirty(handle, inode);
+ 	iput (inode);
+ 	goto out_stop;
+ }
+ 
+ static int ext3_link (struct dentry * old_dentry,
+ 		struct inode * dir, struct dentry *dentry)
+ {
+ 	handle_t *handle;
+ 	struct inode *inode = old_dentry->d_inode;
+ 	int err;
+ 
+ 	if (S_ISDIR(inode->i_mode))
+ 		return -EPERM;	/* directories cannot be hard-linked here */
+ 
+ 	if (inode->i_nlink >= EXT3_LINK_MAX)
+ 		return -EMLINK;
+ 
+ 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
+ 	if (IS_ERR(handle))
+ 		return PTR_ERR(handle);
+ 
+ 	if (IS_SYNC(dir))
+ 		handle->h_sync = 1;
+ 
+ 	inode->i_ctime = CURRENT_TIME;
+ 	ext3_inc_count(handle, inode);
+ 	atomic_inc(&inode->i_count);	/* in-core reference for the new name */
+ 
+ 	ext3_mark_inode_dirty(handle, inode);
+ 	err = ext3_add_nondir(handle, dentry, inode);
+ 	ext3_journal_stop(handle, dir);
+ 	return err;
+ }
+ 
+ #define PARENT_INO(buffer) /* inode of the 2nd entry ("..") in a dir block */ \
+ 	(((struct ext3_dir_entry_2 *) ((char *) (buffer) + \
+ 	le16_to_cpu(((struct ext3_dir_entry_2 *) (buffer))->rec_len)))->inode)
+ 
+ /*
+  * Move old_dentry (in old_dir) to new_dentry (in new_dir), replacing any
+  * existing target.  Anybody can rename anything with this: the permission
+  * checks are left to the higher-level routines.  Returns 0 or -errno. */
+ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
+ 			   struct inode * new_dir,struct dentry *new_dentry)
+ {
+ 	handle_t *handle;
+ 	struct inode * old_inode, * new_inode;
+ 	struct buffer_head * old_bh, * new_bh, * dir_bh;
+ 	struct ext3_dir_entry_2 * old_de, * new_de;
+ 	int retval;
+ 
+ 	old_bh = new_bh = dir_bh = NULL;
+ 
+ 	handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+ 	if (IS_ERR(handle))
+ 		return PTR_ERR(handle);
+ 
+ 	if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+ 		handle->h_sync = 1;
+ 
+ 	old_bh = ext3_find_entry (old_dentry, &old_de);
+ 	/*
+ 	 *  Check for inode number is _not_ due to possible IO errors.
+ 	 *  We might rmdir the source, keep it as pwd of some process
+ 	 *  and merrily kill the link to whatever was created under the
+ 	 *  same name. Goodbye sticky bit ;-<
+ 	 */
+ 	old_inode = old_dentry->d_inode;
+ 	retval = -ENOENT;
+ 	if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino)
+ 		goto end_rename;
+ 
+ 	new_inode = new_dentry->d_inode;
+ 	new_bh = ext3_find_entry (new_dentry, &new_de);
+ 	if (new_bh) {
+ 		if (!new_inode) {
+ 			brelse (new_bh);
+ 			new_bh = NULL;
+ 		} else {
+ 			DQUOT_INIT(new_inode);
+ 		}
+ 	}
+ 	if (S_ISDIR(old_inode->i_mode)) {
+ 		if (new_inode) {
+ 			retval = -ENOTEMPTY;
+ 			if (!empty_dir (new_inode))
+ 				goto end_rename;
+ 		}
+ 		retval = -EIO;
+ 		dir_bh = ext3_bread (handle, old_inode, 0, 0, &retval);
+ 		if (!dir_bh)
+ 			goto end_rename;
+ 		retval = -EIO;	/* ext3_bread() may have overwritten it */
+ 		if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+ 			goto end_rename;
+ 		retval = -EMLINK;
+ 		if (!new_inode && new_dir!=old_dir &&
+ 				new_dir->i_nlink >= EXT3_LINK_MAX)
+ 			goto end_rename;
+ 	}
+ 	if (!new_bh) {
+ 		retval = ext3_add_entry (handle, new_dentry, old_inode);
+ 		if (retval)
+ 			goto end_rename;
+ 	} else {
+ 		BUFFER_TRACE(new_bh, "get_write_access");
+ 		ext3_journal_get_write_access(handle, new_bh);
+ 		new_de->inode = cpu_to_le32(old_inode->i_ino);
+ 		if (EXT3_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
+ 					      EXT3_FEATURE_INCOMPAT_FILETYPE))
+ 			new_de->file_type = old_de->file_type;
+ 		new_dir->i_version = ++event;
+ 		BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata");
+ 		ext3_journal_dirty_metadata(handle, new_bh);
+ 		brelse(new_bh);
+ 		new_bh = NULL;
+ 	}
+ 
+ 	/*
+ 	 * Like most other Unix systems, set the ctime for inodes on a
+ 	 * rename.
+ 	 */
+ 	old_inode->i_ctime = CURRENT_TIME;
+ 	ext3_mark_inode_dirty(handle, old_inode);
+ 
+ 	/*
+ 	 * ok, that's it
+ 	 */
+ 	ext3_delete_entry(handle, old_dir, old_de, old_bh);
+ 
+ 	if (new_inode) {
+ 		new_inode->i_nlink--;
+ 		new_inode->i_ctime = CURRENT_TIME;
+ 	}
+ 	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+ 	old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+ 	if (dir_bh) {
+ 		/* A directory moved: repoint its ".." and fix link counts. */
+ 		BUFFER_TRACE(dir_bh, "get_write_access");
+ 		ext3_journal_get_write_access(handle, dir_bh);
+ 		PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+ 		BUFFER_TRACE(dir_bh, "call ext3_journal_dirty_metadata");
+ 		ext3_journal_dirty_metadata(handle, dir_bh);
+ 		old_dir->i_nlink--;
+ 		if (new_inode) {
+ 			new_inode->i_nlink--;
+ 		} else {
+ 			new_dir->i_nlink++;
+ 			new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+ 			ext3_mark_inode_dirty(handle, new_dir);
+ 		}
+ 	}
+ 	ext3_mark_inode_dirty(handle, old_dir);
+ 	if (new_inode) {
+ 		ext3_mark_inode_dirty(handle, new_inode);
+ 		if (!new_inode->i_nlink)
+ 			ext3_orphan_add(handle, new_inode);
+ 	}
+ 	retval = 0;
+ 
+ end_rename:
+ 	brelse (dir_bh);
+ 	brelse (old_bh);
+ 	brelse (new_bh);
+ 	ext3_journal_stop(handle, old_dir);
+ 	return retval;
+ }
+ 
+ /*
+  * Directory inode methods; ext3_mkdir() installs this table in inode->i_op.
+  */
+ struct inode_operations ext3_dir_inode_operations = {
+ 	create:		ext3_create,		/* BKL held */
+ 	lookup:		ext3_lookup,		/* BKL held */
+ 	link:		ext3_link,		/* BKL held */
+ 	unlink:		ext3_unlink,		/* BKL held */
+ 	symlink:	ext3_symlink,		/* BKL held */
+ 	mkdir:		ext3_mkdir,		/* BKL held */
+ 	rmdir:		ext3_rmdir,		/* BKL held */
+ 	mknod:		ext3_mknod,		/* BKL held */
+ 	rename:		ext3_rename,		/* BKL held */
+ };
diff -rc2P linux/fs/ext3/super.c linux-2.4.13/fs/ext3/super.c
*** linux/fs/ext3/super.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/super.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,1743 ----
+ /*
+  *  linux/fs/ext3/super.c
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/inode.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  Big-endian to little-endian byte-swapping/bitmaps by
+  *        David S. Miller (davem@caip.rutgers.edu), 1995
+  */
+ 
+ #include <linux/config.h>
+ #include <linux/module.h>
+ #include <linux/string.h>
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+ #include <linux/blkdev.h>
+ #include <linux/smp_lock.h>
+ #include <asm/uaccess.h>
+ 
+ #ifdef CONFIG_JBD_DEBUG
+ static int ext3_ro_after; /* Make fs read-only after this many jiffies */
+ #endif
+ 
+ static int ext3_load_journal(struct super_block *, struct ext3_super_block *);
+ static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
+ 			       int);
+ static void ext3_commit_super (struct super_block * sb,
+ 			       struct ext3_super_block * es,
+ 			       int sync);
+ static void ext3_mark_recovery_complete(struct super_block * sb,
+ 					struct ext3_super_block * es);
+ static void ext3_clear_journal_err(struct super_block * sb,
+ 				   struct ext3_super_block * es);
+ 
+ #ifdef CONFIG_JBD_DEBUG
+ /*
+  * Debug code for turning filesystems "read-only" after a specified
+  * amount of time.  This is for crash/recovery testing.
+  */
+ 
+ static void make_rdonly(kdev_t dev, int *no_write)
+ {
+ 	/* A zero device number means there is nothing to flag. */
+ 	if (!dev)
+ 		return;
+ 	printk(KERN_WARNING "Turning device %s read-only\n", bdevname(dev));
+ 	*no_write = 0xdead0000 + dev;
+ }
+ 
+ static void turn_fs_readonly(unsigned long arg)
+ {
+ 	struct super_block *sb = (struct super_block *)arg;
+ 	/* Timer callback (see setup_ro_after): flag the fs and journal devices. */
+ 	make_rdonly(sb->s_dev, &journal_no_write[0]);
+ 	make_rdonly(EXT3_SB(sb)->s_journal->j_dev, &journal_no_write[1]);
+ 	wake_up(&EXT3_SB(sb)->ro_wait_queue);
+ }
+ 
+ static void setup_ro_after(struct super_block *sb)
+ {
+ 	struct ext3_sb_info *sbi = EXT3_SB(sb);
+ 	init_timer(&sbi->turn_ro_timer);	/* always, so it can be deleted */
+ 	if (ext3_ro_after) {
+ 		printk(KERN_DEBUG "fs will go read-only in %d jiffies\n",
+ 		       ext3_ro_after);
+ 		init_waitqueue_head(&sbi->ro_wait_queue);
+ 		journal_no_write[0] = 0;
+ 		journal_no_write[1] = 0;
+ 		sbi->turn_ro_timer.function = turn_fs_readonly;
+ 		sbi->turn_ro_timer.data = (unsigned long)sb;
+ 		sbi->turn_ro_timer.expires = jiffies + ext3_ro_after;
+ 		ext3_ro_after = 0;	/* consumed: the timer is armed only once */
+ 		add_timer(&sbi->turn_ro_timer);
+ 	}
+ }
+ 
+ static void clear_ro_after(struct super_block *sb)
+ {
+ 	del_timer_sync(&EXT3_SB(sb)->turn_ro_timer);	/* stop a pending timer */
+ 	journal_no_write[0] = 0;	/* and undo what it may have set */
+ 	journal_no_write[1] = 0;
+ 	ext3_ro_after = 0;
+ }
+ #else
+ #define setup_ro_after(sb)	do {} while (0)	/* no-op without JBD debug */
+ #define clear_ro_after(sb)	do {} while (0)	/* no-op without JBD debug */
+ #endif
+ 
+ 
+ static char error_buf[1024];	/* shared by the ext3_error()-style formatters */
+ 
+ /* Determine the appropriate response to ext3_error on a given filesystem */
+ 
+ static int ext3_error_behaviour(struct super_block *sb)
+ {
+ 	/* First check for mount-time options */
+ 	if (test_opt (sb, ERRORS_PANIC))
+ 		return EXT3_ERRORS_PANIC;
+ 	if (test_opt (sb, ERRORS_RO))
+ 		return EXT3_ERRORS_RO;
+ 	if (test_opt (sb, ERRORS_CONT))
+ 		return EXT3_ERRORS_CONTINUE;
+ 	
+ 	/* If no overrides were specified on the mount, then fall back
+ 	 * to the default behaviour set in the filesystem's superblock
+ 	 * on disk. */
+ 	switch (le16_to_cpu(sb->u.ext3_sb.s_es->s_errors)) {
+ 	case EXT3_ERRORS_PANIC:
+ 		return EXT3_ERRORS_PANIC;
+ 	case EXT3_ERRORS_RO:
+ 		return EXT3_ERRORS_RO;
+ 	default:
+ 		break;
+ 	}
+ 	return EXT3_ERRORS_CONTINUE;
+ }
+ 
+ /* Deal with the reporting of failure conditions on a filesystem such as
+  * inconsistencies detected or read IO failures.
+  *
+  * On ext2, we can store the error state of the filesystem in the
+  * superblock.  That is not possible on ext3, because we may have other
+  * write ordering constraints on the superblock which prevent us from
+  * writing it out straight away; and given that the journal is about to
+  * be aborted, we can't rely on the current, or future, transactions to
+  * write out the superblock safely.
+  *
+  * We'll just use the journal_abort() error code to record an error in
+  * the journal instead.  On recovery, the journal will complain about
+  * that error until we've noted it down and cleared it.
+  */
+ 
+ static void ext3_handle_error(struct super_block *sb)
+ {
+ 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+ 
+ 	EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS;
+ 	es->s_state |= cpu_to_le16(EXT3_ERROR_FS);	/* s_state is 16 bits */
+ 
+ 	if (sb->s_flags & MS_RDONLY)
+ 		return;
+ 
+ 	if (ext3_error_behaviour(sb) != EXT3_ERRORS_CONTINUE) {
+ 		EXT3_SB(sb)->s_mount_opt |= EXT3_MOUNT_ABORT;
+ 		journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+ 	}
+ 
+ 	if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
+ 		panic ("EXT3-fs (device %s): panic forced after error\n",
+ 		       bdevname(sb->s_dev));
+ 
+ 	if (ext3_error_behaviour(sb) == EXT3_ERRORS_RO) {
+ 		printk (KERN_CRIT "Remounting filesystem read-only\n");
+ 		sb->s_flags |= MS_RDONLY;
+ 	}
+ 
+ 	ext3_commit_super(sb, es, 1);
+ }
+ 
+ void ext3_error (struct super_block * sb, const char * function,
+ 		 const char * fmt, ...)
+ {
+ 	va_list args;
+ 	/* Bounded formatting: an over-long message must not overrun error_buf. */
+ 	va_start (args, fmt);
+ 	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
+ 	va_end (args);
+ 
+ 	printk (KERN_CRIT "EXT3-fs error (device %s): %s: %s\n",
+ 		bdevname(sb->s_dev), function, error_buf);
+ 
+ 	ext3_handle_error(sb);
+ }
+ 
+ const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16])
+ {
+ 	const char *errstr = NULL;	/* may point at a string literal */
+ 	/* Map errno to text; unknown codes are formatted into nbuf if given. */
+ 	switch (errno) {
+ 	case -EIO:
+ 		errstr = "IO failure";
+ 		break;
+ 	case -ENOMEM:
+ 		errstr = "Out of memory";
+ 		break;
+ 	case -EROFS:
+ 		if (!sb || (EXT3_SB(sb)->s_journal->j_flags & JFS_ABORT))
+ 			errstr = "Journal has aborted";
+ 		else
+ 			errstr = "Readonly filesystem";
+ 		break;
+ 	default:
+ 		/* If the caller passed in an extra buffer for unknown
+ 		 * errors, textualise them now.  Else we just return
+ 		 * NULL. */
+ 		if (nbuf) {
+ 			/* Check for truncated error codes... */
+ 			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
+ 				errstr = nbuf;
+ 		}
+ 
+ 		break;
+ 	}
+ 
+ 	return errstr;
+ }
+ 
+ /* __ext3_std_error decodes expected errors from journaling functions
+  * automatically and invokes the appropriate error response.  */
+ 
+ void __ext3_std_error (struct super_block * sb, const char * function,
+ 		       int errno)
+ {
+ 	char nbuf[16];	/* scratch for codes ext3_decode_error() has no text for */
+ 	const char *errstr = ext3_decode_error(sb, errno, nbuf);
+ 
+ 	printk (KERN_CRIT "EXT3-fs error (device %s) in %s: %s\n",
+ 		bdevname(sb->s_dev), function, errstr);
+ 	
+ 	ext3_handle_error(sb);
+ }
+ 
+ /*
+  * ext3_abort is a much stronger failure handler than ext3_error.  The
+  * abort function may be used to deal with unrecoverable failures such
+  * as journal IO errors or ENOMEM at a critical moment in log management.
+  *
+  * We unconditionally force the filesystem into an ABORT|READONLY state,
+  * unless the error response on the fs has been set to panic in which
+  * case we take the easy way out and panic immediately.
+  */
+ 
+ void ext3_abort (struct super_block * sb, const char * function,
+ 		 const char * fmt, ...)
+ {
+ 	va_list args;
+ 
+ 	printk (KERN_CRIT "ext3_abort called.\n");
+ 	/* Bounded formatting: an over-long message must not overrun error_buf. */
+ 	va_start (args, fmt);
+ 	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
+ 	va_end (args);
+ 
+ 	if (ext3_error_behaviour(sb) == EXT3_ERRORS_PANIC)
+ 		panic ("EXT3-fs panic (device %s): %s: %s\n",
+ 		       bdevname(sb->s_dev), function, error_buf);
+ 
+ 	printk (KERN_CRIT "EXT3-fs abort (device %s): %s: %s\n",
+ 		bdevname(sb->s_dev), function, error_buf);
+ 
+ 	if (sb->s_flags & MS_RDONLY)
+ 		return;
+ 
+ 	printk (KERN_CRIT "Remounting filesystem read-only\n");
+ 	sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;
+ 	sb->s_flags |= MS_RDONLY;
+ 	sb->u.ext3_sb.s_mount_opt |= EXT3_MOUNT_ABORT;
+ 	journal_abort(EXT3_SB(sb)->s_journal, -EIO);
+ }
+ 
+ /* Deal with the reporting of failure conditions while running, such as
+  * inconsistencies in operation or invalid system states.
+  *
+  * Use ext3_error() for cases of invalid filesystem states, as that will
+  * record an error on disk and force a filesystem check on the next boot.
+  */
+ NORET_TYPE void ext3_panic (struct super_block * sb, const char * function,
+ 			    const char * fmt, ...)
+ {
+ 	va_list args;
+ 	/* Bounded formatting: an over-long message must not overrun error_buf. */
+ 	va_start (args, fmt);
+ 	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
+ 	va_end (args);
+ 
+ 	/* this is to prevent panic from syncing this filesystem */
+ 	/* AKPM: is this sufficient? */
+ 	sb->s_flags |= MS_RDONLY;
+ 	panic ("EXT3-fs panic (device %s): %s: %s\n",
+ 	       bdevname(sb->s_dev), function, error_buf);
+ }
+ 
+ void ext3_warning (struct super_block * sb, const char * function,
+ 		   const char * fmt, ...)
+ {
+ 	va_list args;
+ 	/* Log only (no error state change); formatting is bounded to error_buf. */
+ 	va_start (args, fmt);
+ 	vsnprintf (error_buf, sizeof(error_buf), fmt, args);
+ 	va_end (args);
+ 	printk (KERN_WARNING "EXT3-fs warning (device %s): %s: %s\n",
+ 		bdevname(sb->s_dev), function, error_buf);
+ }
+ 
+ void ext3_update_dynamic_rev(struct super_block *sb)
+ {
+ 	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+ 	/* Upgrade an EXT3_GOOD_OLD_REV superblock (in memory) to DYNAMIC_REV. */
+ 	if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV)
+ 		return;	/* already newer than GOOD_OLD_REV: nothing to do */
+ 
+ 	ext3_warning(sb, __FUNCTION__,
+ 		     "updating to rev %d because of new feature flag, "
+ 		     "running e2fsck is recommended",
+ 		     EXT3_DYNAMIC_REV);
+ 
+ 	es->s_first_ino = cpu_to_le32(EXT3_GOOD_OLD_FIRST_INO);
+ 	es->s_inode_size = cpu_to_le16(EXT3_GOOD_OLD_INODE_SIZE);
+ 	es->s_rev_level = cpu_to_le32(EXT3_DYNAMIC_REV);
+ 	/* leave es->s_feature_*compat flags alone */
+ 	/* es->s_uuid will be set by e2fsck if empty */
+ 
+ 	/*
+ 	 * The rest of the superblock fields should be zero, and if not it
+ 	 * means they are likely already in use, so leave them alone.  We
+ 	 * can leave it up to e2fsck to clean up any inconsistencies there.
+ 	 */
+ }
+ 
+ /*
+  * Open the external journal device read/write; returns NULL on failure.
+  */
+ static struct block_device *ext3_blkdev_get(kdev_t dev)
+ {
+ 	struct block_device *bdev;
+ 	int err = -ENODEV;	/* reported if bdget() finds no device */
+ 
+ 	bdev = bdget(kdev_t_to_nr(dev));
+ 	if (bdev == NULL)
+ 		goto fail;
+ 	err = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS);
+ 	if (err < 0)
+ 		goto fail;
+ 	return bdev;
+ 
+ fail:
+ 	printk(KERN_ERR "EXT3: failed to open journal device %s: %d\n",
+ 			bdevname(dev), err);
+ 	return NULL;
+ }
+ 
+ /*
+  * Release the journal device; returns the result of blkdev_put().
+  */
+ static int ext3_blkdev_put(struct block_device *bdev)
+ {
+ 	return blkdev_put(bdev, BDEV_FS);
+ }
+ 
+ static int ext3_blkdev_remove(struct ext3_sb_info *sbi)
+ {
+ 	struct block_device *bdev;
+ 	int ret = -ENODEV;
+ 	/* Drop sbi's journal device, if any; -ENODEV when none is attached. */
+ 	bdev = sbi->journal_bdev;
+ 	if (bdev) {
+ 		ret = ext3_blkdev_put(bdev);
+ 		sbi->journal_bdev = NULL;
+ 	}
+ 	return ret;
+ }
+ 
+ #define orphan_list_entry(l) list_entry((l), struct inode, u.ext3_i.i_orphan)
+ 
+ static void dump_orphan_list(struct super_block *sb, struct ext3_sb_info *sbi)
+ {
+ 	struct list_head *l;
+ 	/* Debug aid: print the on-disk list head and each in-memory orphan. */
+ 	printk(KERN_ERR "sb orphan head is %d\n",
+ 	       le32_to_cpu(sbi->s_es->s_last_orphan));
+ 
+ 	printk(KERN_ERR "sb_info orphan list:\n");
+ 	list_for_each(l, &sbi->s_orphan) {
+ 		struct inode *inode = orphan_list_entry(l);
+ 		printk(KERN_ERR "  "
+ 		       "inode 0x%04x:%ld at %p: mode %o, nlink %d, next %d\n",
+ 		       inode->i_dev, inode->i_ino, inode,
+ 		       inode->i_mode, inode->i_nlink,
+ 		       NEXT_ORPHAN(inode));	/* already in CPU byte order */
+ 	}
+ }
+ 
+ void ext3_put_super (struct super_block * sb)
+ {
+ 	struct ext3_sb_info *sbi = EXT3_SB(sb);
+ 	struct ext3_super_block *es = sbi->s_es;
+ 	kdev_t j_dev = sbi->s_journal->j_dev;	/* saved before journal_destroy */
+ 	int i;
+ 
+ 	journal_destroy(sbi->s_journal);
+ 	if (!(sb->s_flags & MS_RDONLY)) {
+ 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+ 		es->s_state = cpu_to_le16(sbi->s_mount_state);	/* CPU -> disk */
+ 		BUFFER_TRACE(sbi->s_sbh, "marking dirty");
+ 		mark_buffer_dirty(sbi->s_sbh);
+ 		ext3_commit_super(sb, es, 1);
+ 	}
+ 
+ 	for (i = 0; i < sbi->s_gdb_count; i++)
+ 		brelse(sbi->s_group_desc[i]);
+ 	kfree(sbi->s_group_desc);
+ 	for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
+ 		brelse(sbi->s_inode_bitmap[i]);
+ 	for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++)
+ 		brelse(sbi->s_block_bitmap[i]);
+ 	brelse(sbi->s_sbh);
+ 
+ 	/* Debugging code just in case the in-memory inode orphan list
+ 	 * isn't empty.  The on-disk one can be non-empty if we've
+ 	 * detected an error and taken the fs readonly, but the
+ 	 * in-memory list had better be clean by this point. */
+ 	if (!list_empty(&sbi->s_orphan))
+ 		dump_orphan_list(sb, sbi);
+ 	J_ASSERT(list_empty(&sbi->s_orphan));
+ 
+ 	invalidate_buffers(sb->s_dev);
+ 	if (j_dev != sb->s_dev) {
+ 		/*
+ 		 * Invalidate the journal device's buffers.  We don't want them
+ 		 * floating about in memory - the physical journal device may
+ 		 * be hotswapped, and it breaks the `ro-after' testing code.
+ 		 */
+ 		fsync_no_super(j_dev);
+ 		invalidate_buffers(j_dev);
+ 		ext3_blkdev_remove(sbi);
+ 	}
+ 	clear_ro_after(sb);
+ 
+ 	return;
+ }
+ 
+ static struct super_operations ext3_sops = {	/* ext3 superblock methods */
+ 	read_inode:	ext3_read_inode,	/* BKL held */
+ 	write_inode:	ext3_write_inode,	/* BKL not held.  Don't need */
+ 	dirty_inode:	ext3_dirty_inode,	/* BKL not held.  We take it */
+ 	put_inode:	ext3_put_inode,		/* BKL not held.  Don't need */
+ 	delete_inode:	ext3_delete_inode,	/* BKL not held.  We take it */
+ 	put_super:	ext3_put_super,		/* BKL held */
+ 	write_super:	ext3_write_super,	/* BKL held */
+ 	write_super_lockfs: ext3_write_super_lockfs, /* BKL not held. Take it */
+ 	unlockfs:	ext3_unlockfs,		/* BKL not held.  We take it */
+ 	statfs:		ext3_statfs,		/* BKL held */
+ 	remount_fs:	ext3_remount,		/* BKL held */
+ };
+ 
+ static int want_value(char *value, char *option)
+ {
+ 	/* 0 for a non-empty argument; otherwise complain and return -1. */
+ 	if (value && *value)
+ 		return 0;
+ 	printk(KERN_NOTICE "EXT3-fs: the %s option needs an argument\n",
+ 	       option);
+ 	return -1;
+ }
+ 
+ static int want_null_value(char *value, char *option)
+ {
+ 	/* 0 only when value points at an empty string; else log and -1. */
+ 	if (!*value)
+ 		return 0;
+ 	printk(KERN_NOTICE "EXT3-fs: Invalid %s argument: %s\n",
+ 	       option, value);
+ 	return -1;
+ }
+ 
+ /* Parse the whole of @value as an unsigned number (base 0) into *@number.
+  * Returns 0 on success, -1 if the value is missing or has trailing text. */
+ static int want_numeric(char *value, char *option, unsigned long *number)
+ {
+ 	if (want_value(value, option) != 0)
+ 		return -1;
+ 	*number = simple_strtoul(value, &value, 0);
+ 	return want_null_value(value, option) ? -1 : 0;
+ }
+ 
+ /*
+  * Parse the comma-separated mount option string (adapted from the msdos fs).
+  * Returns 1 on success (or if @options is NULL) and 0 on any invalid option. */
+ static int parse_options (char * options, unsigned long * sb_block,
+ 			  struct ext3_sb_info *sbi,
+ 			  unsigned long * inum,
+ 			  int is_remount)
+ {
+ 	unsigned long *mount_options = &sbi->s_mount_opt;
+ 	uid_t *resuid = &sbi->s_resuid;
+ 	gid_t *resgid = &sbi->s_resgid;
+ 	char * this_char;
+ 	char * value;
+ 
+ 	if (!options)
+ 		return 1;	/* no option string: nothing to parse */
+ 	for (this_char = strtok (options, ",");
+ 	     this_char != NULL;
+ 	     this_char = strtok (NULL, ",")) {
+ 		if ((value = strchr (this_char, '=')) != NULL)
+ 			*value++ = 0;	/* split "name=value" in place */
+ 		if (!strcmp (this_char, "bsddf"))
+ 			clear_opt (*mount_options, MINIX_DF);
+ 		else if (!strcmp (this_char, "nouid32")) {
+ 			set_opt (*mount_options, NO_UID32);
+ 		}
+ 		else if (!strcmp (this_char, "abort"))
+ 			set_opt (*mount_options, ABORT);
+ 		else if (!strcmp (this_char, "check")) {
+ 			if (!value || !*value || !strcmp (value, "none"))
+ 				clear_opt (*mount_options, CHECK);
+ 			else
+ #ifdef CONFIG_EXT3_CHECK
+ 				set_opt (*mount_options, CHECK);
+ #else
+ 				printk(KERN_ERR 
+ 				       "EXT3 Check option not supported\n");
+ #endif
+ 		}
+ 		else if (!strcmp (this_char, "debug"))
+ 			set_opt (*mount_options, DEBUG);
+ 		else if (!strcmp (this_char, "errors")) {
+ 			if (want_value(value, "errors"))
+ 				return 0;
+ 			if (!strcmp (value, "continue")) {
+ 				clear_opt (*mount_options, ERRORS_RO);
+ 				clear_opt (*mount_options, ERRORS_PANIC);
+ 				set_opt (*mount_options, ERRORS_CONT);
+ 			}
+ 			else if (!strcmp (value, "remount-ro")) {
+ 				clear_opt (*mount_options, ERRORS_CONT);
+ 				clear_opt (*mount_options, ERRORS_PANIC);
+ 				set_opt (*mount_options, ERRORS_RO);
+ 			}
+ 			else if (!strcmp (value, "panic")) {
+ 				clear_opt (*mount_options, ERRORS_CONT);
+ 				clear_opt (*mount_options, ERRORS_RO);
+ 				set_opt (*mount_options, ERRORS_PANIC);
+ 			}
+ 			else {
+ 				printk (KERN_ERR
+ 					"EXT3-fs: Invalid errors option: %s\n",
+ 					value);
+ 				return 0;
+ 			}
+ 		}
+ 		else if (!strcmp (this_char, "grpid") ||
+ 			 !strcmp (this_char, "bsdgroups"))
+ 			set_opt (*mount_options, GRPID);
+ 		else if (!strcmp (this_char, "minixdf"))
+ 			set_opt (*mount_options, MINIX_DF);
+ 		else if (!strcmp (this_char, "nocheck"))
+ 			clear_opt (*mount_options, CHECK);
+ 		else if (!strcmp (this_char, "nogrpid") ||
+ 			 !strcmp (this_char, "sysvgroups"))
+ 			clear_opt (*mount_options, GRPID);
+ 		else if (!strcmp (this_char, "resgid")) {
+ 			unsigned long v;
+ 			if (want_numeric(value, "resgid", &v))
+ 				return 0;
+ 			*resgid = v;
+ 		}
+ 		else if (!strcmp (this_char, "resuid")) {
+ 			unsigned long v;
+ 			if (want_numeric(value, "resuid", &v))
+ 				return 0;
+ 			*resuid = v;
+ 		}
+ 		else if (!strcmp (this_char, "sb")) {
+ 			if (want_numeric(value, "sb", sb_block))
+ 				return 0;
+ 		}
+ #ifdef CONFIG_JBD_DEBUG
+ 		else if (!strcmp (this_char, "ro-after")) {
+ 			unsigned long v;
+ 			if (want_numeric(value, "ro-after", &v))
+ 				return 0;
+ 			ext3_ro_after = v;
+ 		}
+ #endif
+ 		/* Silently ignore the quota options */
+ 		else if (!strcmp (this_char, "grpquota")
+ 		         || !strcmp (this_char, "noquota")
+ 		         || !strcmp (this_char, "quota")
+ 		         || !strcmp (this_char, "usrquota"))
+ 			/* Don't do anything ;-) */ ;
+ 		else if (!strcmp (this_char, "journal")) {
+ 			/* @@@ FIXME */
+ 			/* Eventually we will want to be able to create
+ 			 * a journal file here.  For now, only allow the
+ 			 * user to specify an existing inode to be the
+ 			 * journal file ("journal=update" is a flag instead). */
+ 			if (is_remount) {
+ 				printk(KERN_ERR "EXT3-fs: cannot specify "
+ 				       "journal on remount\n");
+ 				return 0;
+ 			}
+ 
+ 			if (want_value(value, "journal"))
+ 				return 0;
+ 			if (!strcmp (value, "update"))
+ 				set_opt (*mount_options, UPDATE_JOURNAL);
+ 			else if (want_numeric(value, "journal", inum))
+ 				return 0;
+ 		}
+ 		else if (!strcmp (this_char, "noload"))
+ 			set_opt (*mount_options, NOLOAD);
+ 		else if (!strcmp (this_char, "data")) {
+ 			int data_opt = 0;
+ 
+ 			if (want_value(value, "data"))
+ 				return 0;
+ 			if (!strcmp (value, "journal"))
+ 				data_opt = EXT3_MOUNT_JOURNAL_DATA;
+ 			else if (!strcmp (value, "ordered"))
+ 				data_opt = EXT3_MOUNT_ORDERED_DATA;
+ 			else if (!strcmp (value, "writeback"))
+ 				data_opt = EXT3_MOUNT_WRITEBACK_DATA;
+ 			else {
+ 				printk (KERN_ERR 
+ 					"EXT3-fs: Invalid data option: %s\n",
+ 					value);
+ 				return 0;
+ 			}
+ 			if (is_remount) {	/* may only restate the current mode */
+ 				if ((*mount_options & EXT3_MOUNT_DATA_FLAGS) !=
+ 							data_opt) {
+ 					printk(KERN_ERR
+ 					       "EXT3-fs: cannot change data "
+ 					       "mode on remount\n");
+ 					return 0;
+ 				}
+ 			} else {
+ 				*mount_options &= ~EXT3_MOUNT_DATA_FLAGS;
+ 				*mount_options |= data_opt;
+ 			}
+ 		} else {
+ 			printk (KERN_ERR 
+ 				"EXT3-fs: Unrecognized mount option %s\n",
+ 				this_char);
+ 			return 0;
+ 		}
+ 	}
+ 	return 1;
+ }
+ 
+ /* Mount-time sb checks and updates.  Returns 0, or MS_RDONLY if rev too high. */
+ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es,
+ 			    int read_only)
+ {
+ 	struct ext3_sb_info *sbi = EXT3_SB(sb);
+ 	int res = 0;
+ 	if (le32_to_cpu(es->s_rev_level) > EXT3_MAX_SUPP_REV) {
+ 		printk (KERN_ERR "EXT3-fs warning: revision level too high, "
+ 			"forcing read-only mode\n");
+ 		res = MS_RDONLY;
+ 	}
+ 	if (read_only)	/* read-only: skip the warnings and updates below */
+ 		return res;
+ 	if (!(sbi->s_mount_state & EXT3_VALID_FS))
+ 		printk (KERN_WARNING "EXT3-fs warning: mounting unchecked fs, "
+ 			"running e2fsck is recommended\n");
+ 	else if ((sbi->s_mount_state & EXT3_ERROR_FS))
+ 		printk (KERN_WARNING
+ 			"EXT3-fs warning: mounting fs with errors, "
+ 			"running e2fsck is recommended\n");
+ 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
+ 		 le16_to_cpu(es->s_mnt_count) >=
+ 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
+ 		printk (KERN_WARNING
+ 			"EXT3-fs warning: maximal mount count reached, "
+ 			"running e2fsck is recommended\n");
+ 	else if (le32_to_cpu(es->s_checkinterval) &&
+ 		(le32_to_cpu(es->s_lastcheck) +
+ 			le32_to_cpu(es->s_checkinterval) <= CURRENT_TIME))
+ 		printk (KERN_WARNING
+ 			"EXT3-fs warning: checktime reached, "
+ 			"running e2fsck is recommended\n");
+ #if 0
+ 		/* @@@ We _will_ want to clear the valid bit if we find
+ 		 * inconsistencies, to force a fsck at reboot.  But for
+ 		 * a plain journaled filesystem we can keep it set as
+ 		 * valid forever! :) */
+ 	es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS);
+ #endif
+ 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))	/* zero: use default */
+ 		es->s_max_mnt_count =
+ 			(__s16) cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT);
+ 	es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1);
+ 	es->s_mtime = cpu_to_le32(CURRENT_TIME);
+ 	ext3_update_dynamic_rev(sb);
+ 	EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+ 	ext3_commit_super (sb, es, 1);	/* 1: write it out and wait */
+ 	if (test_opt (sb, DEBUG))
+ 		printk (KERN_INFO
+ 			"[EXT3 FS %s, %s, bs=%lu, gc=%lu, "
+ 			"bpg=%lu, ipg=%lu, mo=%04lx]\n",
+ 			EXT3FS_VERSION, EXT3FS_DATE, sb->s_blocksize,
+ 			sbi->s_groups_count,
+ 			EXT3_BLOCKS_PER_GROUP(sb),
+ 			EXT3_INODES_PER_GROUP(sb),
+ 			sbi->s_mount_opt);
+ 	printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+ 				bdevname(sb->s_dev));
+ 	if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+ 		printk("external journal on %s\n",
+ 				bdevname(EXT3_SB(sb)->s_journal->j_dev));
+ 	} else {
+ 		printk("internal journal\n");
+ 	}
+ #ifdef CONFIG_EXT3_CHECK
+ 	if (test_opt (sb, CHECK)) {
+ 		ext3_check_blocks_bitmap (sb);
+ 		ext3_check_inodes_bitmap (sb);
+ 	}
+ #endif
+ 	setup_ro_after(sb);
+ 	return res;
+ }
+ 
+ /* Verify that each group's block bitmap, inode bitmap and inode table lie
+  * inside that group's own block range.  Returns 1 if all pass, 0 otherwise. */
+ static int ext3_check_descriptors (struct super_block * sb)
+ {
+ 	struct ext3_sb_info *sbi = EXT3_SB(sb);
+ 	unsigned long group_start = le32_to_cpu(sbi->s_es->s_first_data_block);
+ 	struct ext3_group_desc * desc = NULL;
+ 	unsigned long blk;
+ 	int next_desc_block = 0;
+ 	int group;
+ 
+ 	ext3_debug ("Checking group descriptors");
+ 
+ 	for (group = 0; group < sbi->s_groups_count; group++) {
+ 		unsigned long group_end = group_start + EXT3_BLOCKS_PER_GROUP(sb);
+ 
+ 		/* Advance one descriptor, fetching a new buffer when needed */
+ 		if (group % EXT3_DESC_PER_BLOCK(sb))
+ 			desc++;
+ 		else
+ 			desc = (struct ext3_group_desc *)
+ 				sbi->s_group_desc[next_desc_block++]->b_data;
+ 
+ 		blk = le32_to_cpu(desc->bg_block_bitmap);
+ 		if (blk < group_start || blk >= group_end) {
+ 			ext3_error (sb, "ext3_check_descriptors",
+ 				    "Block bitmap for group %d"
+ 				    " not in group (block %lu)!",
+ 				    group, blk);
+ 			return 0;
+ 		}
+ 
+ 		blk = le32_to_cpu(desc->bg_inode_bitmap);
+ 		if (blk < group_start || blk >= group_end) {
+ 			ext3_error (sb, "ext3_check_descriptors",
+ 				    "Inode bitmap for group %d"
+ 				    " not in group (block %lu)!",
+ 				    group, blk);
+ 			return 0;
+ 		}
+ 
+ 		blk = le32_to_cpu(desc->bg_inode_table);
+ 		if (blk < group_start || blk + sbi->s_itb_per_group >= group_end) {
+ 			ext3_error (sb, "ext3_check_descriptors",
+ 				    "Inode table for group %d"
+ 				    " not in group (block %lu)!",
+ 				    group, blk);
+ 			return 0;
+ 		}
+ 		group_start = group_end;
+ 	}
+ 	return 1;
+ }
+ 
+ 
+ /* ext3_orphan_cleanup() walks a singly-linked list of inodes (starting at
+  * the superblock) which were deleted from all directories, but held open by
+  * a process at the time of a crash.  We walk the list and try to delete these
+  * inodes at recovery time (only with a read-write filesystem).
+  *
+  * In order to keep the orphan inode chain consistent during traversal (in
+  * case of crash during recovery), we link each inode into the superblock
+  * orphan list_head and handle it the same way as an inode deletion during
+  * normal operation (which journals the operations for us).
+  *
+  * We only do an iget() and an iput() on each inode, which is very safe if we
+  * accidentally point at an in-use or already deleted inode.  The worst that
+  * can happen in this case is that we get a "bit already cleared" message from
+  * ext3_free_inode().  The only reason we would point at a wrong inode is if
+  * e2fsck was run on this filesystem, and it must have already done the orphan
+  * inode cleanup for us, so we can safely abort without any further action.
+  */
+ static void ext3_orphan_cleanup (struct super_block * sb,
+ 				 struct ext3_super_block * es)
+ {
+ 	unsigned int s_flags = sb->s_flags;
+ 	int nr_orphans = 0, nr_truncates = 0;
+ 	if (!es->s_last_orphan) {
+ 		jbd_debug(4, "no orphan inodes to clean up\n");
+ 		return;
+ 	}
+ 
+ 	/* Bail out before touching s_flags, so MS_RDONLY is never left cleared */
+ 	if (sb->u.ext3_sb.s_mount_state & EXT3_ERROR_FS) {
+ 		jbd_debug(1, "Errors on filesystem, "
+ 			  "clearing orphan list.\n");
+ 		es->s_last_orphan = 0;
+ 		jbd_debug(1, "Skipping orphan recovery on fs with errors.\n");
+ 		return;
+ 	}
+ 
+ 	if (s_flags & MS_RDONLY) {
+ 		printk(KERN_INFO "EXT3-fs: %s: orphan cleanup on readonly fs\n",
+ 		       bdevname(sb->s_dev));
+ 		sb->s_flags &= ~MS_RDONLY;
+ 	}
+ 
+ 	while (es->s_last_orphan) {
+ 		struct inode *inode;
+ 
+ 		if (!(inode =
+ 		      ext3_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) {
+ 			es->s_last_orphan = 0;
+ 			break;
+ 		}
+ 
+ 		list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+ 		if (inode->i_nlink) {
+ 			printk(KERN_DEBUG __FUNCTION__
+ 				": truncating inode %ld to %Ld bytes\n",
+ 				inode->i_ino, inode->i_size);
+ 			jbd_debug(2, "truncating inode %ld to %Ld bytes\n",
+ 				  inode->i_ino, inode->i_size);
+ 			ext3_truncate(inode);
+ 			nr_truncates++;
+ 		} else {
+ 			printk(KERN_DEBUG __FUNCTION__
+ 				": deleting unreferenced inode %ld\n",
+ 				inode->i_ino);
+ 			jbd_debug(2, "deleting unreferenced inode %ld\n",
+ 				  inode->i_ino);
+ 			nr_orphans++;
+ 		}
+ 		iput(inode);  /* The delete magic happens here! */
+ 	}
+ 
+ #define PLURAL(x) (x), ((x)==1) ? "" : "s"
+ 
+ 	if (nr_orphans)
+ 		printk(KERN_INFO "EXT3-fs: %s: %d orphan inode%s deleted\n",
+ 		       bdevname(sb->s_dev), PLURAL(nr_orphans));
+ 	if (nr_truncates)
+ 		printk(KERN_INFO "EXT3-fs: %s: %d truncate%s cleaned up\n",
+ 		       bdevname(sb->s_dev), PLURAL(nr_truncates));
+ 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ }
+ 
+ #define log2(n) ffz(~(n))	/* presumably exact only for powers of two - confirm */
+ 
+ /*
+  * Largest file size for a block size of 2^bits bytes: the smaller of what the
+  * direct plus single/double/triple indirect blocks can map, and one
+  * filesystem block short of the 2^32 512-byte-sector limit of i_blocks.
+  */
+ static loff_t ext3_max_size(int bits)
+ {
+ 	const int ptr_bits = bits - 2;	/* 2^ptr_bits entries per indirect block */
+ 	const loff_t sector_cap = (512LL << 32) - (1 << bits);
+ 	loff_t mapped = EXT3_NDIR_BLOCKS;
+ 
+ 	mapped += (1LL << ptr_bits) + (1LL << (2 * ptr_bits));
+ 	mapped += 1LL << (3 * ptr_bits);
+ 	mapped <<= bits;
+ 	return mapped > sector_cap ? sector_cap : mapped;
+ }
+ 
+ /* Mount-time entry point: read and validate the ext3 superblock and group
+  * descriptors, load (or create) the journal, then set up the root dentry and
+  * run orphan cleanup.  Returns @sb on success, NULL on any failure. */
+ struct super_block * ext3_read_super (struct super_block * sb, void * data,
+ 				      int silent)
+ {
+ 	struct buffer_head * bh;
+ 	struct ext3_super_block *es = 0;
+ 	struct ext3_sb_info *sbi = EXT3_SB(sb);
+ 	unsigned long sb_block = 1;
+ 	unsigned long logic_sb_block = 1;
+ 	unsigned long offset = 0;
+ 	unsigned long journal_inum = 0;
+ 	kdev_t dev = sb->s_dev;
+ 	int blocksize, hblock;
+ 	int db_count, i, needs_recovery;
+ 
+ #ifdef CONFIG_JBD_DEBUG
+ 	ext3_ro_after = 0;
+ #endif
+ 	/*
+ 	 * See what the current blocksize for the device is, and
+ 	 * use that as the blocksize.  Otherwise (or if the blocksize
+ 	 * is smaller than the default) use the default.
+ 	 * This is important for devices that have a hardware
+ 	 * sectorsize that is larger than the default.
+ 	 */
+ 	blocksize = EXT3_MIN_BLOCK_SIZE;
+ 	hblock = get_hardsect_size(dev);
+ 	if (blocksize < hblock)
+ 		blocksize = hblock;
+ 
+ 	sbi->s_mount_opt = 0;
+ 	sbi->s_resuid = EXT3_DEF_RESUID;
+ 	sbi->s_resgid = EXT3_DEF_RESGID;
+ 	if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
+ 		sb->s_dev = 0;
+ 		goto out_fail;
+ 	}
+ 
+ 	set_blocksize (dev, blocksize);
+ 
+ 	/*
+ 	 * The ext3 superblock will not be buffer aligned for other than 1kB
+ 	 * block sizes.  We need to calculate the offset from buffer start.
+ 	 */
+ 	if (blocksize != EXT3_MIN_BLOCK_SIZE) {
+ 		logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
+ 		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
+ 	}
+ 
+ 	if (!(bh = bread (dev, logic_sb_block, blocksize))) {
+ 		printk (KERN_ERR "EXT3-fs: unable to read superblock\n");
+ 		goto out_fail;
+ 	}
+ 	/*
+ 	 * Note: s_es must be initialized as soon as possible because
+ 	 *       some ext3 macro-instructions depend on its value
+ 	 */
+ 	es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+ 	sbi->s_es = es;
+ 	sb->s_magic = le16_to_cpu(es->s_magic);
+ 	if (sb->s_magic != EXT3_SUPER_MAGIC) {
+ 		if (!silent)
+ 			printk(KERN_ERR
+ 			       "VFS: Can't find ext3 filesystem on dev %s.\n",
+ 			       bdevname(dev));
+ 		goto failed_mount;
+ 	}
+ 	if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV &&
+ 	    (EXT3_HAS_COMPAT_FEATURE(sb, ~0U) ||
+ 	     EXT3_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
+ 	     EXT3_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+ 		printk(KERN_WARNING
+ 		       "EXT3-fs warning: feature flags set on rev 0 fs, "
+ 		       "running e2fsck is recommended\n");
+ 	/*
+ 	 * Check feature flags regardless of the revision level, since we
+ 	 * previously didn't change the revision level when setting the flags,
+ 	 * so there is a chance incompat flags are set on a rev 0 filesystem.
+ 	 */
+ 	if ((i = EXT3_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))) {
+ 		printk(KERN_ERR "EXT3-fs: %s: couldn't mount because of "
+ 		       "unsupported optional features (%x).\n",
+ 		       bdevname(dev), i);
+ 		goto failed_mount;
+ 	}
+ 	if (!(sb->s_flags & MS_RDONLY) &&
+ 	    (i = EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))){
+ 		printk(KERN_ERR "EXT3-fs: %s: couldn't mount RDWR because of "
+ 		       "unsupported optional features (%x).\n",
+ 		       bdevname(dev), i);
+ 		goto failed_mount;
+ 	}
+ 	sb->s_blocksize_bits = le32_to_cpu(es->s_log_block_size) + 10;
+ 	sb->s_blocksize = 1 << sb->s_blocksize_bits;
+ 
+ 	if (sb->s_blocksize < EXT3_MIN_BLOCK_SIZE ||
+ 	    sb->s_blocksize > EXT3_MAX_BLOCK_SIZE) {
+ 		/* Report the rejected on-disk size, not the device blocksize */
+ 		printk(KERN_ERR
+ 		       "EXT3-fs: Unsupported filesystem blocksize %d on %s.\n",
+ 		       (int) sb->s_blocksize, bdevname(dev));
+ 		goto failed_mount;
+ 	}
+ 	sb->s_maxbytes = ext3_max_size(sb->s_blocksize_bits);
+ 
+ 	if (sb->s_blocksize != blocksize) {
+ 		blocksize = sb->s_blocksize;
+ 
+ 		/*
+ 		 * Make sure the blocksize for the filesystem is larger
+ 		 * than the hardware sectorsize for the machine.
+ 		 */
+ 		if (sb->s_blocksize < hblock) {
+ 			printk(KERN_ERR "EXT3-fs: blocksize %d too small for "
+ 			       "device blocksize %d.\n", blocksize, hblock);
+ 			goto failed_mount;
+ 		}
+ 
+ 		brelse (bh);
+ 		set_blocksize (dev, sb->s_blocksize);
+ 		logic_sb_block = (sb_block * EXT3_MIN_BLOCK_SIZE) / blocksize;
+ 		offset = (sb_block * EXT3_MIN_BLOCK_SIZE) % blocksize;
+ 		bh = bread (dev, logic_sb_block, blocksize);
+ 		if (!bh) {
+ 			printk(KERN_ERR
+ 			       "EXT3-fs: Can't read superblock on 2nd try.\n");
+ 			return NULL;
+ 		}
+ 		es = (struct ext3_super_block *)(((char *)bh->b_data) + offset);
+ 		sbi->s_es = es;
+ 		if (es->s_magic != cpu_to_le16(EXT3_SUPER_MAGIC)) {
+ 			printk (KERN_ERR
+ 				"EXT3-fs: Magic mismatch, very weird !\n");
+ 			goto failed_mount;
+ 		}
+ 	}
+ 
+ 	if (le32_to_cpu(es->s_rev_level) == EXT3_GOOD_OLD_REV) {
+ 		sbi->s_inode_size = EXT3_GOOD_OLD_INODE_SIZE;
+ 		sbi->s_first_ino = EXT3_GOOD_OLD_FIRST_INO;
+ 	} else {
+ 		sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+ 		sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+ 		if (sbi->s_inode_size != EXT3_GOOD_OLD_INODE_SIZE) {
+ 			printk (KERN_ERR
+ 				"EXT3-fs: unsupported inode size: %d\n",
+ 				sbi->s_inode_size);
+ 			goto failed_mount;
+ 		}
+ 	}
+ 	sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
+ 				   le32_to_cpu(es->s_log_frag_size);
+ 	if (blocksize != sbi->s_frag_size) {
+ 		printk(KERN_ERR
+ 		       "EXT3-fs: fragsize %lu != blocksize %u (unsupported)\n",
+ 		       sbi->s_frag_size, blocksize);
+ 		goto failed_mount;
+ 	}
+ 	sbi->s_frags_per_block = 1;
+ 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
+ 	sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
+ 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
+ 	sbi->s_inodes_per_block = blocksize / EXT3_INODE_SIZE(sb);
+ 	sbi->s_itb_per_group = sbi->s_inodes_per_group /sbi->s_inodes_per_block;
+ 	sbi->s_desc_per_block = blocksize / sizeof(struct ext3_group_desc);
+ 	sbi->s_sbh = bh;
+ 	if (sbi->s_resuid == EXT3_DEF_RESUID)
+ 		sbi->s_resuid = le16_to_cpu(es->s_def_resuid);
+ 	if (sbi->s_resgid == EXT3_DEF_RESGID)
+ 		sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
+ 	sbi->s_mount_state = le16_to_cpu(es->s_state);
+ 	sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
+ 	sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
+ 
+ 	if (sbi->s_blocks_per_group > blocksize * 8) {
+ 		printk (KERN_ERR
+ 			"EXT3-fs: #blocks per group too big: %lu\n",
+ 			sbi->s_blocks_per_group);
+ 		goto failed_mount;
+ 	}
+ 	if (sbi->s_frags_per_group > blocksize * 8) {
+ 		printk (KERN_ERR
+ 			"EXT3-fs: #fragments per group too big: %lu\n",
+ 			sbi->s_frags_per_group);
+ 		goto failed_mount;
+ 	}
+ 	if (sbi->s_inodes_per_group > blocksize * 8) {
+ 		printk (KERN_ERR
+ 			"EXT3-fs: #inodes per group too big: %lu\n",
+ 			sbi->s_inodes_per_group);
+ 		goto failed_mount;
+ 	}
+ 
+ 	sbi->s_groups_count = (le32_to_cpu(es->s_blocks_count) -
+ 			       le32_to_cpu(es->s_first_data_block) +
+ 			       EXT3_BLOCKS_PER_GROUP(sb) - 1) /
+ 			      EXT3_BLOCKS_PER_GROUP(sb);
+ 	db_count = (sbi->s_groups_count + EXT3_DESC_PER_BLOCK(sb) - 1) /
+ 		   EXT3_DESC_PER_BLOCK(sb);
+ 	sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
+ 				    GFP_KERNEL);
+ 	if (sbi->s_group_desc == NULL) {
+ 		printk (KERN_ERR "EXT3-fs: not enough memory\n");
+ 		goto failed_mount;
+ 	}
+ 	for (i = 0; i < db_count; i++) {
+ 		sbi->s_group_desc[i] = bread(dev, logic_sb_block + i + 1,
+ 					     blocksize);
+ 		if (!sbi->s_group_desc[i]) {
+ 			printk (KERN_ERR "EXT3-fs: "
+ 				"can't read group descriptor %d\n", i);
+ 			db_count = i;
+ 			goto failed_mount2;
+ 		}
+ 	}
+ 	if (!ext3_check_descriptors (sb)) {
+ 		printk (KERN_ERR "EXT3-fs: group descriptors corrupted !\n");
+ 		goto failed_mount2;
+ 	}
+ 	for (i = 0; i < EXT3_MAX_GROUP_LOADED; i++) {
+ 		sbi->s_inode_bitmap_number[i] = 0;
+ 		sbi->s_inode_bitmap[i] = NULL;
+ 		sbi->s_block_bitmap_number[i] = 0;
+ 		sbi->s_block_bitmap[i] = NULL;
+ 	}
+ 	sbi->s_loaded_inode_bitmaps = 0;
+ 	sbi->s_loaded_block_bitmaps = 0;
+ 	sbi->s_gdb_count = db_count;
+ 	/*
+ 	 * set up enough so that it can read an inode
+ 	 */
+ 	sb->s_op = &ext3_sops;
+ 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+ 
+ 	sb->s_root = 0;
+ 
+ 	needs_recovery = (es->s_last_orphan != 0 ||
+ 			  EXT3_HAS_INCOMPAT_FEATURE(sb,
+ 				    EXT3_FEATURE_INCOMPAT_RECOVER));
+ 
+ 	/*
+ 	 * The first inode we look at is the journal inode.  Don't try
+ 	 * root first: it may be modified in the journal!
+ 	 */
+ 	if (!test_opt(sb, NOLOAD) &&
+ 	    EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) {
+ 		if (ext3_load_journal(sb, es))
+ 			goto failed_mount2;
+ 	} else if (journal_inum) {
+ 		if (ext3_create_journal(sb, es, journal_inum))
+ 			goto failed_mount2;
+ 	} else {
+ 		if (!silent)
+ 			printk (KERN_ERR
+ 				"ext3: No journal on filesystem on %s\n",
+ 				bdevname(dev));
+ 		goto failed_mount2;
+ 	}
+ 
+ 	/* We have now updated the journal if required, so we can
+ 	 * validate the data journaling mode. */
+ 	switch (test_opt(sb, DATA_FLAGS)) {
+ 	case 0:
+ 		/* No mode set, assume a default based on the journal
+ 		 * capabilities: ORDERED_DATA if the journal can
+ 		 * cope, else JOURNAL_DATA */
+ 		if (journal_check_available_features
+ 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
+ 			set_opt(sbi->s_mount_opt, ORDERED_DATA);
+ 		else
+ 			set_opt(sbi->s_mount_opt, JOURNAL_DATA);
+ 		break;
+ 
+ 	case EXT3_MOUNT_ORDERED_DATA:
+ 	case EXT3_MOUNT_WRITEBACK_DATA:
+ 		if (!journal_check_available_features
+ 		    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)) {
+ 			printk(KERN_ERR "EXT3-fs: Journal does not support "
+ 			       "requested data journaling mode\n");
+ 			goto failed_mount3;
+ 		}	/* fall through */
+ 	default:
+ 		break;
+ 	}
+ 
+ 	/*
+ 	 * The journal_load will have done any necessary log recovery,
+ 	 * so we can safely mount the rest of the filesystem now.
+ 	 */
+ 
+ 	sb->s_root = d_alloc_root(iget(sb, EXT3_ROOT_INO));
+ 	if (!sb->s_root || !S_ISDIR(sb->s_root->d_inode->i_mode) ||
+ 	    !sb->s_root->d_inode->i_blocks || !sb->s_root->d_inode->i_size) {
+ 		if (sb->s_root) {
+ 			dput(sb->s_root);
+ 			sb->s_root = NULL;
+ 			printk(KERN_ERR
+ 			       "EXT3-fs: corrupt root inode, run e2fsck\n");
+ 		} else
+ 			printk(KERN_ERR "EXT3-fs: get root inode failed\n");
+ 		goto failed_mount3;
+ 	}
+ 
+ 	ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY);
+ 	/*
+ 	 * akpm: core read_super() calls in here with the superblock locked.
+ 	 * That deadlocks, because orphan cleanup needs to lock the superblock
+ 	 * in numerous places.  Here we just pop the lock - it's relatively
+ 	 * harmless, because we are now ready to accept write_super() requests,
+ 	 * and aviro says that's the only reason for hanging onto the
+ 	 * superblock lock.
+ 	 */
+ 	EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS;
+ 	unlock_super(sb);	/* akpm: sigh */
+ 	ext3_orphan_cleanup(sb, es);
+ 	lock_super(sb);
+ 	EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS;
+ 	if (needs_recovery)
+ 		printk (KERN_INFO "EXT3-fs: recovery complete.\n");
+ 	ext3_mark_recovery_complete(sb, es);
+ 	printk (KERN_INFO "EXT3-fs: mounted filesystem with %s data mode.\n",
+ 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal":
+ 		test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
+ 		"writeback");
+ 
+ 	return sb;
+ 
+ failed_mount3:
+ 	journal_destroy(sbi->s_journal);
+ failed_mount2:
+ 	for (i = 0; i < db_count; i++)
+ 		brelse(sbi->s_group_desc[i]);
+ 	kfree(sbi->s_group_desc);
+ failed_mount:
+ 	ext3_blkdev_remove(sbi);
+ 	brelse(bh);
+ out_fail:
+ 	return NULL;
+ }
+ 
+ static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum)
+ {
+ 	struct inode *ino;
+ 	journal_t *jnl;
+ 
+ 	/* Check for a valid on-disk inode before anything else: it is
+ 	 * dangerous to iget() an unused inode, because the iput() that
+ 	 * follows will try to delete it. */
+ 	ino = iget(sb, journal_inum);
+ 	if (ino == NULL) {
+ 		printk(KERN_ERR "EXT3-fs: no journal found.\n");
+ 		return NULL;
+ 	}
+ 	if (ino->i_nlink == 0) {
+ 		make_bad_inode(ino);
+ 		iput(ino);
+ 		printk(KERN_ERR "EXT3-fs: journal inode is deleted.\n");
+ 		return NULL;
+ 	}
+ 
+ 	jbd_debug(2, "Journal inode found at %p: %Ld bytes\n",
+ 		  ino, ino->i_size);
+ 	if (is_bad_inode(ino) || !S_ISREG(ino->i_mode)) {
+ 		printk(KERN_ERR "EXT3-fs: invalid journal inode.\n");
+ 		iput(ino);
+ 		return NULL;
+ 	}
+ 
+ 	jnl = journal_init_inode(ino);
+ 	if (jnl != NULL)
+ 		return jnl;
+ 	iput(ino);	/* no journal was built: drop our inode reference */
+ 	return NULL;
+ }
+ 
+ /* Open the external journal device @dev, check its ext3 superblock and UUID
+  * against @sb, and build a journal_t on it.  Returns NULL on any failure. */
+ static journal_t *ext3_get_dev_journal(struct super_block *sb,
+ 				       int dev)
+ {
+ 	struct buffer_head * bh;
+ 	journal_t *journal;
+ 	int start, len;
+ 	int hblock, blocksize;
+ 	unsigned long sb_block, offset;
+ 	kdev_t journal_dev = to_kdev_t(dev);
+ 	struct ext3_super_block * es;
+ 	struct block_device *bdev;
+ 
+ 	bdev = ext3_blkdev_get(journal_dev);
+ 	if (bdev == NULL)
+ 		return NULL;
+ 
+ 	blocksize = sb->s_blocksize;
+ 	hblock = get_hardsect_size(journal_dev);
+ 	if (blocksize < hblock) {
+ 		printk(KERN_ERR
+ 			"EXT3-fs: blocksize too small for journal device.\n");
+ 		goto out_bdev;
+ 	}
+ 
+ 	sb_block = EXT3_MIN_BLOCK_SIZE / blocksize;
+ 	offset = EXT3_MIN_BLOCK_SIZE % blocksize;
+ 	/* Use the converted kdev_t throughout, not the raw on-disk number */
+ 	set_blocksize(journal_dev, blocksize);
+ 	if (!(bh = bread(journal_dev, sb_block, blocksize))) {
+ 		printk(KERN_ERR "EXT3-fs: couldn't read superblock of "
+ 		       "external journal\n");
+ 		goto out_bdev;
+ 	}
+ 	es = (struct ext3_super_block *) (((char *)bh->b_data) + offset);
+ 	if ((le16_to_cpu(es->s_magic) != EXT3_SUPER_MAGIC) ||
+ 	    !(le32_to_cpu(es->s_feature_incompat) &
+ 	      EXT3_FEATURE_INCOMPAT_JOURNAL_DEV)) {
+ 		printk(KERN_ERR "EXT3-fs: external journal has "
+ 					"bad superblock\n");
+ 		brelse(bh);
+ 		goto out_bdev;
+ 	}
+ 
+ 	if (memcmp(EXT3_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
+ 		printk(KERN_ERR "EXT3-fs: journal UUID does not match\n");
+ 		brelse(bh);
+ 		goto out_bdev;
+ 	}
+ 
+ 	len = le32_to_cpu(es->s_blocks_count);
+ 	start = sb_block + 1;
+ 	brelse(bh);	/* we're done with the superblock */
+ 
+ 	journal = journal_init_dev(journal_dev, sb->s_dev,
+ 					start, len, blocksize);
+ 	if (!journal) {
+ 		printk(KERN_ERR "EXT3-fs: failed to create device journal\n");
+ 		goto out_bdev;
+ 	}
+ 	ll_rw_block(READ, 1, &journal->j_sb_buffer);
+ 	wait_on_buffer(journal->j_sb_buffer);
+ 	if (!buffer_uptodate(journal->j_sb_buffer)) {
+ 		printk(KERN_ERR "EXT3-fs: I/O error on journal device\n");
+ 		goto out_journal;
+ 	}
+ 	if (ntohl(journal->j_superblock->s_nr_users) != 1) {
+ 		printk(KERN_ERR "EXT3-fs: External journal has more than one "
+ 					"user (unsupported) - %u\n",
+ 			ntohl(journal->j_superblock->s_nr_users));
+ 		goto out_journal;
+ 	}
+ 	EXT3_SB(sb)->journal_bdev = bdev;
+ 	return journal;
+ out_journal:
+ 	journal_destroy(journal);
+ out_bdev:
+ 	ext3_blkdev_put(bdev);
+ 	return NULL;
+ }
+ 
+ /* Find the journal (inode or external device) named in @es, load it and attach
+  * it to @sb.  Returns 0, else -EROFS, -EINVAL or the journal layer's error. */
+ static int ext3_load_journal(struct super_block * sb,
+ 			     struct ext3_super_block * es)
+ {
+ 	journal_t *journal;
+ 	int journal_inum = le32_to_cpu(es->s_journal_inum);
+ 	int journal_dev = le32_to_cpu(es->s_journal_dev);
+ 	int err;
+ 	int really_read_only;
+ 
+ 	really_read_only = is_read_only(sb->s_dev);
+ 
+ 	/*
+ 	 * Are we loading a blank journal or performing recovery after a
+ 	 * crash?  For recovery, we need to check in advance whether we
+ 	 * can get read-write access to the device.
+ 	 */
+ 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER)) {
+ 		if (sb->s_flags & MS_RDONLY) {
+ 			printk(KERN_INFO "EXT3-fs: INFO: recovery "
+ 					"required on readonly filesystem.\n");
+ 			if (really_read_only) {
+ 				printk(KERN_ERR "EXT3-fs: write access "
+ 					"unavailable, cannot proceed.\n");
+ 				return -EROFS;
+ 			}
+ 			printk (KERN_INFO "EXT3-fs: write access will "
+ 					"be enabled during recovery.\n");
+ 		}
+ 	}
+ 
+ 	if (journal_inum && journal_dev) {
+ 		printk(KERN_ERR "EXT3-fs: filesystem has both journal "
+ 		       "and inode journals!\n");
+ 		return -EINVAL;
+ 	}
+ 
+ 	if (journal_inum) {
+ 		if (!(journal = ext3_get_journal(sb, journal_inum)))
+ 			return -EINVAL;
+ 	} else {
+ 		if (!(journal = ext3_get_dev_journal(sb, journal_dev)))
+ 			return -EINVAL;
+ 	}
+ 
+ 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
+ 		err = journal_update_format(journal);
+ 		if (err)  {
+ 			printk(KERN_ERR "EXT3-fs: error updating journal.\n");
+ 			journal_destroy(journal);
+ 			return err;
+ 		}
+ 	}
+ 
+ 	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
+ 		journal_wipe(journal, !really_read_only);
+ 
+ 	err = journal_load(journal);
+ 	if (err) {
+ 		printk(KERN_ERR "EXT3-fs: error loading journal.\n");
+ 		journal_destroy(journal);
+ 		return err;
+ 	}
+ 
+ 	EXT3_SB(sb)->s_journal = journal;
+ 	ext3_clear_journal_err(sb, es);
+ 	return 0;
+ }
+ 
+ /* Create a new journal on inode @journal_inum and record it in the superblock.
+  * Returns 0 on success, or -EROFS, -EINVAL or -EIO on failure. */
+ static int ext3_create_journal(struct super_block * sb,
+ 			       struct ext3_super_block * es,
+ 			       int journal_inum)
+ {
+ 	journal_t *new_journal;
+ 
+ 	if (sb->s_flags & MS_RDONLY) {
+ 		printk(KERN_ERR "EXT3-fs: readonly filesystem when trying to "
+ 				"create journal.\n");
+ 		return -EROFS;
+ 	}
+ 
+ 	new_journal = ext3_get_journal(sb, journal_inum);
+ 	if (new_journal == NULL)
+ 		return -EINVAL;
+ 
+ 	printk(KERN_INFO "EXT3-fs: creating new journal on inode %d\n",
+ 	       journal_inum);
+ 
+ 	if (journal_create(new_journal) != 0) {
+ 		printk(KERN_ERR "EXT3-fs: error creating journal.\n");
+ 		journal_destroy(new_journal);
+ 		return -EIO;
+ 	}
+ 	EXT3_SB(sb)->s_journal = new_journal;
+ 
+ 	ext3_update_dynamic_rev(sb);
+ 	EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+ 	EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
+ 	es->s_journal_inum = cpu_to_le32(journal_inum);
+ 	sb->s_dirt = 1;
+ 
+ 	/* Make sure we flush the recovery flag to disk. */
+ 	ext3_commit_super(sb, es, 1);
+ 	return 0;
+ }
+ 
+ /* Timestamp and dirty the superblock buffer; if @sync, write it and wait. */
+ static void ext3_commit_super (struct super_block * sb,
+ 			       struct ext3_super_block * es, int sync)
+ {
+ 	es->s_wtime = cpu_to_le32(CURRENT_TIME);
+ 	BUFFER_TRACE(sb->u.ext3_sb.s_sbh, "marking dirty");
+ 	mark_buffer_dirty(sb->u.ext3_sb.s_sbh);
+ 	if (!sync)
+ 		return;
+ 	ll_rw_block(WRITE, 1, &sb->u.ext3_sb.s_sbh);
+ 	wait_on_buffer(sb->u.ext3_sb.s_sbh);
+ }
+ 
+ 
+ /*
+  * Flush the journal.  If the filesystem is (re)mounted read-only and still
+  * carries the RECOVER flag, it is now consistent on disk: clear the flag.
+  */
+ static void ext3_mark_recovery_complete(struct super_block * sb,
+ 					struct ext3_super_block * es)
+ {
+ 	journal_flush(EXT3_SB(sb)->s_journal);
+ 	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER))
+ 		return;
+ 	if (!(sb->s_flags & MS_RDONLY))
+ 		return;
+ 	EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+ 	sb->s_dirt = 0;
+ 	ext3_commit_super(sb, es, 1);
+ }
+ 
+ /*
+  * If we are mounting (or read-write remounting) a filesystem whose journal
+  * has recorded an error from a previous lifetime, move that error to the
+  * main filesystem now.
+  */
+ static void ext3_clear_journal_err(struct super_block * sb,
+ 				   struct ext3_super_block * es)
+ {
+ 	journal_t *journal;
+ 	int j_errno;		/* error code reported by journal_errno() */
+ 	const char *errstr;	/* text form produced by ext3_decode_error() */
+ 	
+ 	journal = EXT3_SB(sb)->s_journal;
+ 
+ 	/*
+ 	 * Now check for any error status which may have been recorded in the
+ 	 * journal by a prior ext3_error() or ext3_abort()
+ 	 */
+ 
+ 	j_errno = journal_errno(journal);
+ 	if (j_errno) {
+ 		char nbuf[16];	/* scratch buffer handed to ext3_decode_error() */
+ 		
+ 		errstr = ext3_decode_error(sb, j_errno, nbuf);
+ 		ext3_warning(sb, __FUNCTION__, "Filesystem error recorded "
+ 			     "from previous mount: %s", errstr);
+ 		ext3_warning(sb, __FUNCTION__, "Marking fs in need of "
+ 			     "filesystem check.");
+ 		
+ 		sb->u.ext3_sb.s_mount_state |= EXT3_ERROR_FS;	/* in-memory state */
+ 		es->s_state |= cpu_to_le16(EXT3_ERROR_FS);	/* on-disk state */
+ 		ext3_commit_super (sb, es, 1);	/* write the error flag synchronously */
+ 
+ 		journal_clear_err(journal);	/* only after the superblock commit */
+ 	}
+ }
+ 
+ /*
+  * Push the running and committing transactions through commit and wait
+  * for that commit.  Returns 0 at once on a read-only mount.
+  */
+ int ext3_force_commit(struct super_block *sb)
+ {
+ 	int err = 0;
+ 
+ 	if (!(sb->s_flags & MS_RDONLY)) {
+ 		journal_t *jnl = EXT3_SB(sb)->s_journal;
+ 
+ 		sb->s_dirt = 0;
+ 		/* important: lock down j_running_transaction */
+ 		lock_kernel();
+ 		err = ext3_journal_force_commit(jnl);
+ 		unlock_kernel();
+ 	}
+ 	return err;
+ }
+ 
+ /*
+  * Ext3 always journals updates to the superblock itself, so we don't
+  * have to propagate any other updates to the superblock on disk at this
+  * point.  Just start an async writeback to get the buffers on their way
+  * to the disk.
+  *
+  * This implicitly triggers the writebehind on sync().
+  */
+ 
+ static int do_sync_supers = 0;
+ MODULE_PARM(do_sync_supers, "i");
+ MODULE_PARM_DESC(do_sync_supers, "Write superblocks synchronously");
+ 
+ void ext3_write_super (struct super_block * sb)
+ {
+ 	tid_t target;	/* value returned by log_start_commit(), waited on below */
+ 	/* s_lock must already be held: taking it here means the caller did not. */
+ 	if (down_trylock(&sb->s_lock) == 0)
+ 		BUG();		/* aviro detector */
+ 	sb->s_dirt = 0;
+ 	target = log_start_commit(EXT3_SB(sb)->s_journal, NULL);
+ 
+ 	if (do_sync_supers) {	/* module parameter: wait for the commit as well */
+ 		unlock_super(sb);	/* drop s_lock while waiting on the commit */
+ 		log_wait_commit(EXT3_SB(sb)->s_journal, target);
+ 		lock_super(sb);		/* return with s_lock held again */
+ 	}
+ }
+ 
+ /*
+  * LVM calls this function before a (read-only) snapshot is created.  This
+  * gives us a chance to flush the journal completely and mark the fs clean.
+  */
+ void ext3_write_super_lockfs(struct super_block *sb)
+ {
+ 	sb->s_dirt = 0;
+ 
+ 	lock_kernel();		/* 2.4.5 forgot to do this for us */
+ 	if (!(sb->s_flags & MS_RDONLY)) {	/* nothing to do on a read-only mount */
+ 		journal_t *journal = EXT3_SB(sb)->s_journal;
+ 
+ 		/* Now we set up the journal barrier. */
+ 		journal_lock_updates(journal);	/* undone in ext3_unlockfs() */
+ 		journal_flush(journal);
+ 
+ 		/* Journal blocked and flushed, clear needs_recovery flag. */
+ 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+ 		ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);	/* synchronous write */
+ 	}
+ 	unlock_kernel();
+ }
+ 
+ /*
+  * Called by LVM after the snapshot is done.  We need to reset the RECOVER
+  * flag here, even though the filesystem is not technically dirty yet.
+  */
+ void ext3_unlockfs(struct super_block *sb)
+ {
+ 	if (!(sb->s_flags & MS_RDONLY)) {	/* read-only mounts were never locked */
+ 		lock_kernel();
+ 		lock_super(sb);
+ 		/* Reset the needs_recovery flag before the fs is unlocked. */
+ 		EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+ 		ext3_commit_super(sb, EXT3_SB(sb)->s_es, 1);	/* synchronous write */
+ 		unlock_super(sb);
+ 		journal_unlock_updates(EXT3_SB(sb)->s_journal);	/* lift the barrier */
+ 		unlock_kernel();
+ 	}
+ }
+ 
+ int ext3_remount (struct super_block * sb, int * flags, char * data)
+ {
+ 	struct ext3_super_block * es;
+ 	struct ext3_sb_info *sbi = EXT3_SB(sb);
+ 	unsigned long tmp;	/* receives parse_options() outputs we do not use */
+ 
+ 	clear_ro_after(sb);
+ 
+ 	/*
+ 	 * Allow the "check" option to be passed as a remount option.
+ 	 */
+ 	if (!parse_options(data, &tmp, sbi, &tmp, 1))
+ 		return -EINVAL;
+ 
+ 	if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+ 		ext3_abort(sb, __FUNCTION__, "Abort forced by user");
+ 
+ 	es = sbi->s_es;
+ 
+ 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {	/* ro/rw changes */
+ 		if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
+ 			return -EROFS;	/* no ro/rw transition once aborted */
+ 
+ 		if (*flags & MS_RDONLY) {
+ 			/*
+ 			 * First of all, the unconditional stuff we have to do
+ 			 * to disable replay of the journal when we next remount
+ 			 */
+ 			sb->s_flags |= MS_RDONLY;
+ 
+ 			/*
+ 			 * OK, test if we are remounting a valid rw partition
+ 			 * readonly, and if so set the rdonly flag and then
+ 			 * mark the partition as valid again.
+ 			 */
+ 			if (!(es->s_state & cpu_to_le16(EXT3_VALID_FS)) &&
+ 			    (sbi->s_mount_state & EXT3_VALID_FS))
+ 				es->s_state = cpu_to_le16(sbi->s_mount_state);
+ 
+ 			ext3_mark_recovery_complete(sb, es);
+ 		} else {
+ 			int ret;	/* unsupported RO_COMPAT feature bits, if any */
+ 			if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb,
+ 					~EXT3_FEATURE_RO_COMPAT_SUPP))) {
+ 				printk(KERN_WARNING "EXT3-fs: %s: couldn't "
+ 				       "remount RDWR because of unsupported "
+ 				       "optional features (%x).\n",
+ 				       bdevname(sb->s_dev), ret);
+ 				return -EROFS;
+ 			}
+ 			/*
+ 			 * Mounting a RDONLY partition read-write, so reread
+ 			 * and store the current valid flag.  (It may have
+ 			 * been changed by e2fsck since we originally mounted
+ 			 * the partition.)
+ 			 */
+ 			ext3_clear_journal_err(sb, es);
+ 			sbi->s_mount_state = le16_to_cpu(es->s_state);
+ 			if (!ext3_setup_super (sb, es, 0))
+ 				sb->s_flags &= ~MS_RDONLY;	/* only when setup returned 0 */
+ 		}
+ 	}
+ 	setup_ro_after(sb);	/* note: the early returns above skip this call */
+ 	return 0;
+ }
+ 
+ int ext3_statfs (struct super_block * sb, struct statfs * buf)
+ {
+ 	struct ext3_sb_info *sbi = EXT3_SB(sb);
+ 	struct ext3_super_block *es = sbi->s_es;
+ 	unsigned long overhead = 0;
+ 	int group;
+ 
+ 	/*
+ 	 * With the MINIX_DF mount option no filesystem-structure overhead
+ 	 * is subtracted; otherwise add it up piece by piece.
+ 	 */
+ 	if (!test_opt (sb, MINIX_DF)) {
+ 		/* Everything in front of the first data block. */
+ 		overhead = le32_to_cpu(es->s_first_data_block);
+ 
+ 		/*
+ 		 * Superblock copies and group descriptor blocks.  With
+ 		 * sparse superblocks only some groups carry these, so
+ 		 * ask per group.
+ 		 */
+ 		for (group = 0; group < sbi->s_groups_count; group++) {
+ 			overhead += ext3_bg_has_super(sb, group) +
+ 				ext3_bg_num_gdb(sb, group);
+ 		}
+ 
+ 		/*
+ 		 * Per group: an inode bitmap, a block bitmap and the
+ 		 * inode table blocks.
+ 		 */
+ 		overhead += (sbi->s_groups_count *
+ 			     (2 + sbi->s_itb_per_group));
+ 	}
+ 
+ 	buf->f_type = EXT3_SUPER_MAGIC;
+ 	buf->f_bsize = sb->s_blocksize;
+ 	buf->f_blocks = le32_to_cpu(es->s_blocks_count) - overhead;
+ 	buf->f_bfree = ext3_count_free_blocks (sb);
+ 	/* f_bavail excludes the s_r_blocks_count reserve, floored at zero. */
+ 	if (buf->f_bfree < le32_to_cpu(es->s_r_blocks_count))
+ 		buf->f_bavail = 0;
+ 	else
+ 		buf->f_bavail = buf->f_bfree -
+ 				le32_to_cpu(es->s_r_blocks_count);
+ 	buf->f_files = le32_to_cpu(es->s_inodes_count);
+ 	buf->f_ffree = ext3_count_free_inodes (sb);
+ 	buf->f_namelen = EXT3_NAME_LEN;
+ 	return 0;
+ }
+ 
+ static DECLARE_FSTYPE_DEV(ext3_fs_type, "ext3", ext3_read_super);
+ 
+ static int __init init_ext3_fs(void)	/* registered via module_init() below */
+ {
+         return register_filesystem(&ext3_fs_type);
+ }
+ 
+ static void __exit exit_ext3_fs(void)	/* registered via module_exit() below */
+ {
+ 	unregister_filesystem(&ext3_fs_type);
+ }
+ 
+ EXPORT_NO_SYMBOLS;
+ 
+ MODULE_LICENSE("GPL");
+ module_init(init_ext3_fs)
+ module_exit(exit_ext3_fs)
diff -rc2P linux/fs/ext3/symlink.c linux-2.4.13/fs/ext3/symlink.c
*** linux/fs/ext3/symlink.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/ext3/symlink.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,39 ----
+ /*
+  *  linux/fs/ext3/symlink.c
+  *
+  * Only fast symlinks left here - the rest is done by generic code. AV, 1999
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/fs/minix/symlink.c
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  *
+  *  ext3 symlink handling code
+  */
+ 
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ 
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+ 	struct inode *inode = dentry->d_inode;	/* link text lives in i_data */
+ 	return vfs_readlink(dentry, buffer, buflen, (char *)inode->u.ext3_i.i_data);
+ }
+ 
+ static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
+ {
+ 	struct inode *inode = dentry->d_inode;	/* link text lives in i_data */
+ 	return vfs_follow_link(nd, (char *)inode->u.ext3_i.i_data);
+ }
+ 
+ struct inode_operations ext3_fast_symlink_inode_operations = {
+ 	readlink:	ext3_readlink,		/* called without the BKL; not needed */
+ 	follow_link:	ext3_follow_link,	/* called without the BKL; not needed */
+ };
diff -rc2P linux/fs/inode.c linux-2.4.13/fs/inode.c
*** linux/fs/inode.c	Fri Sep 28 21:03:48 2001
--- linux-2.4.13/fs/inode.c	Fri Nov  9 16:57:59 2001
***************
*** 110,113 ****
--- 110,114 ----
  		sema_init(&inode->i_sem, 1);
  		sema_init(&inode->i_zombie, 1);
+ 		init_rwsem(&inode->i_truncate_sem);
  		spin_lock_init(&inode->i_data.i_shared_lock);
  	}
diff -rc2P linux/fs/jbd/Makefile linux-2.4.13/fs/jbd/Makefile
*** linux/fs/jbd/Makefile	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/Makefile	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,15 ----
+ #
+ # fs/jbd/Makefile
+ # 
+ # Makefile for the linux journaling routines.
+ #
+ 
+ export-objs := journal.o
+ O_TARGET := jbd.o
+ 
+ obj-y   := transaction.o commit.o recovery.o checkpoint.o revoke.o journal.o
+ 
+ obj-m   := $(O_TARGET)
+ 
+ include $(TOPDIR)/Rules.make
+ 
diff -rc2P linux/fs/jbd/checkpoint.c linux-2.4.13/fs/jbd/checkpoint.c
*** linux/fs/jbd/checkpoint.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/checkpoint.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,608 ----
+ /*
+  * linux/fs/jbd/checkpoint.c
+  * 
+  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+  *
+  * Copyright 1999 Red Hat Software --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Checkpoint routines for the generic filesystem journaling code.  
+  * Part of the ext2fs journaling system.  
+  *
+  * Checkpointing is the process of ensuring that a section of the log is
+  * committed fully to disk, so that that portion of the log can be
+  * reused.
+  */
+ 
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ 
+ extern spinlock_t journal_datalist_lock;
+ 
+ /*
+  * Unlink a buffer from its checkpoint transaction: take jh off the circular
+  * b_cpnext/b_cpprev list and clear jh->b_cp_transaction.
+  * Called with journal_datalist_lock held.
+  */
+ 
+ static inline void __buffer_unlink(struct journal_head *jh)
+ {
+ 	transaction_t *transaction;
+ 
+ 	transaction = jh->b_cp_transaction;
+ 	jh->b_cp_transaction = NULL;
+ 
+ 	jh->b_cpnext->b_cpprev = jh->b_cpprev;
+ 	jh->b_cpprev->b_cpnext = jh->b_cpnext;
+ 	if (transaction->t_checkpoint_list == jh)	/* jh was the head: advance */
+ 		transaction->t_checkpoint_list = jh->b_cpnext;
+ 	if (transaction->t_checkpoint_list == jh)	/* still jh: it was alone */
+ 		transaction->t_checkpoint_list = NULL;
+ }
+ 
+ /*
+  * Try to release a checkpointed buffer from its transaction.
+  * Returns 1 if we released it.
+  * Requires journal_datalist_lock
+  */
+ static int __try_to_free_cp_buf(struct journal_head *jh)
+ {
+ 	struct buffer_head *bh = jh2bh(jh);
+ 
+ 	/* Only a buffer that is on no journal list (BJ_None), unlocked and
+ 	 * clean may be taken off its checkpoint list. */
+ 	if (jh->b_jlist != BJ_None || buffer_locked(bh) || buffer_dirty(bh))
+ 		return 0;
+ 	JBUFFER_TRACE(jh, "remove from checkpoint list");
+ 	__journal_remove_checkpoint(jh);
+ 	__journal_remove_journal_head(bh);
+ 	BUFFER_TRACE(bh, "release");
+ 	/* BUF_LOCKED -> BUF_CLEAN (fwiw) */
+ 	refile_buffer(bh);
+ 	__brelse(bh);
+ 	return 1;
+ }
+ 
+ /*
+  * log_wait_for_space: wait until there is space in the journal.
+  *
+  * Called with the journal already locked, but it will be unlocked if we have
+  * to wait for a checkpoint to free up some space in the log.
+  */
+ 
+ void log_wait_for_space(journal_t *journal, int nblocks)
+ {
+ 	while (log_space_left(journal) < nblocks) {
+ 		if (journal->j_flags & JFS_ABORT)
+ 			return;	/* aborted journal: return without the space */
+ 		unlock_journal(journal);	/* drop the journal lock before sleeping */
+ 		down(&journal->j_checkpoint_sem);
+ 		lock_journal(journal);	/* retaken only once the semaphore is ours */
+ 		
+ 		/* Test again, another process may have checkpointed
+ 		 * while we were waiting for the checkpoint lock */
+ 		if (log_space_left(journal) < nblocks) {
+ 			log_do_checkpoint(journal, nblocks);	/* return value ignored */
+ 		}
+ 		up(&journal->j_checkpoint_sem);
+ 	}
+ }
+ 
+ /*
+  * Clean up a transaction's checkpoint list.  
+  *
+  * We wait for any pending IO to complete and make sure any clean
+  * buffers are removed from the transaction. 
+  *
+  * Return 1 if we performed any actions which might have destroyed the
+  * checkpoint.  (journal_remove_checkpoint() deletes the transaction when
+  * the last checkpoint buffer is cleansed)
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  */
+ static int __cleanup_transaction(journal_t *journal, transaction_t *transaction)
+ {
+ 	struct journal_head *jh, *next_jh, *last_jh;
+ 	struct buffer_head *bh;
+ 	int ret = 0;	/* becomes 1 once any buffer has been removed */
+ 
+ 	assert_spin_locked(&journal_datalist_lock);
+ 	jh = transaction->t_checkpoint_list;
+ 	if (!jh)
+ 		return 0;	/* empty checkpoint list: nothing to do */
+ 
+ 	last_jh = jh->b_cpprev;
+ 	next_jh = jh;
+ 	do {
+ 		jh = next_jh;
+ 		bh = jh2bh(jh);
+ 		if (buffer_locked(bh)) {
+ 			atomic_inc(&bh->b_count);	/* pin bh while both locks are dropped */
+ 			spin_unlock(&journal_datalist_lock);
+ 			unlock_journal(journal);
+ 			wait_on_buffer(bh);
+ 			/* the journal_head may have gone by now */
+ 			BUFFER_TRACE(bh, "brelse");
+ 			__brelse(bh);
+ 			goto out_return_1;	/* locks were dropped: caller must rescan */
+ 		}
+ 		
+ 		if (jh->b_transaction != NULL) {
+ 			transaction_t *transaction = jh->b_transaction;	/* shadows the parameter */
+ 			tid_t tid = transaction->t_tid;	/* saved before the locks are dropped */
+ 
+ 			spin_unlock(&journal_datalist_lock);
+ 			log_start_commit(journal, transaction);
+ 			unlock_journal(journal);
+ 			log_wait_commit(journal, tid);
+ 			goto out_return_1;
+ 		}
+ 
+ 		/*
+ 		 * We used to test for (jh->b_list != BUF_CLEAN) here.
+ 		 * But unmap_underlying_metadata() can place buffer onto
+ 		 * BUF_CLEAN. Since refile_buffer() no longer takes buffers
+ 		 * off checkpoint lists, we cope with it here
+ 		 */
+ 		/*
+ 		 * AKPM: I think the buffer_jdirty test is redundant - it
+ 		 * shouldn't have NULL b_transaction?
+ 		 */
+ 		next_jh = jh->b_cpnext;	/* read before jh may be released below */
+ 		if (!buffer_dirty(bh) && !buffer_jdirty(bh)) {
+ 			BUFFER_TRACE(bh, "remove from checkpoint");
+ 			__journal_remove_checkpoint(jh);
+ 			__journal_remove_journal_head(bh);
+ 			refile_buffer(bh);
+ 			__brelse(bh);
+ 			ret = 1;
+ 		}
+ 		
+ 		jh = next_jh;	/* NOTE(review): on a multi-entry list last_jh is never examined -- confirm */
+ 	} while (jh != last_jh);
+ 
+ 	return ret;
+ out_return_1:
+ 	lock_journal(journal);	/* restore the locking state the caller expects */
+ 	spin_lock(&journal_datalist_lock);
+ 	return 1;
+ }
+ 
+ #define NR_BATCH	64
+ 
+ static void __flush_batch(struct buffer_head **bhs, int *batch_count)
+ {
+ 	int i;
+ 
+ 	spin_unlock(&journal_datalist_lock);	/* lock is dropped across the I/O submit */
+ 	ll_rw_block(WRITE, *batch_count, bhs);
+ 	run_task_queue(&tq_disk);
+ 	spin_lock(&journal_datalist_lock);	/* held again on return */
+ 	for (i = 0; i < *batch_count; i++) {
+ 		struct buffer_head *bh = bhs[i];
+ 		clear_bit(BH_JWrite, &bh->b_state);	/* set by __flush_buffer() */
+ 		BUFFER_TRACE(bh, "brelse");
+ 		__brelse(bh);	/* drops the b_count reference from __flush_buffer() */
+ 	}
+ 	*batch_count = 0;	/* the batch array is empty again */
+ }
+ 
+ /*
+  * Try to flush one buffer from the checkpoint list to disk.
+  *
+  * Return 1 if something happened which requires us to abort the current
+  * scan of the checkpoint list.  
+  *
+  * Called with journal_datalist_lock held.
+  */
+ static int __flush_buffer(journal_t *journal, struct journal_head *jh,
+ 			struct buffer_head **bhs, int *batch_count,
+ 			int *drop_count)
+ {
+ 	struct buffer_head *bh = jh2bh(jh);
+ 	int ret = 0;	/* 1 tells the caller to abandon its current list scan */
+ 
+ 	if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) {
+ 		J_ASSERT_JH(jh, jh->b_transaction == NULL);
+ 		
+ 		/*
+ 		 * Important: we are about to write the buffer, and
+ 		 * possibly block, while still holding the journal lock.
+ 		 * We cannot afford to let the transaction logic start
+ 		 * messing around with this buffer before we write it to
+ 		 * disk, as that would break recoverability.  
+ 		 */
+ 		BUFFER_TRACE(bh, "queue");
+ 		atomic_inc(&bh->b_count);	/* reference released in __flush_batch() */
+ 		J_ASSERT_BH(bh, !test_bit(BH_JWrite, &bh->b_state));
+ 		set_bit(BH_JWrite, &bh->b_state);
+ 		bhs[*batch_count] = bh;
+ 		(*batch_count)++;
+ 		if (*batch_count == NR_BATCH) {	/* batch array full: submit it now */
+ 			__flush_batch(bhs, batch_count);
+ 			ret = 1;	/* __flush_batch() dropped journal_datalist_lock */
+ 		}
+ 	} else {
+ 		int last_buffer = 0;
+ 		if (jh->b_cpnext == jh) {
+ 			/* We may be about to drop the transaction.  Tell the
+ 			 * caller that the lists have changed.
+ 			 */
+ 			last_buffer = 1;
+ 		}
+ 		if (__try_to_free_cp_buf(jh)) {
+ 			(*drop_count)++;
+ 			ret = last_buffer;	/* rescan only if that was the sole entry */
+ 		}
+ 	}
+ 	return ret;
+ }
+ 
+ 	
+ /*
+  * Perform an actual checkpoint.  We don't write out only enough to
+  * satisfy the current blocked requests: rather we submit a reasonably
+  * sized chunk of the outstanding data to disk at once for
+  * efficiency.  log_wait_for_space() will retry if we didn't free enough.
+  * 
+  * However, we _do_ take into account the amount requested so that once
+  * the IO has been queued, we can return as soon as enough of it has
+  * completed to disk.  
+  *
+  * The journal should be locked before calling this function.
+  */
+ 
+ /* @@@ `nblocks' is unused.  Should it be used? */
+ int log_do_checkpoint (journal_t *journal, int nblocks)
+ {
+ 	transaction_t *transaction, *last_transaction, *next_transaction;
+ 	int result;
+ 	int target;
+ 	int batch_count = 0;	/* number of buffers queued in bhs[] */
+ 	struct buffer_head *bhs[NR_BATCH];
+ 
+ 	jbd_debug(1, "Start checkpoint\n");
+ 
+ 	/* 
+ 	 * First thing: if there are any transactions in the log which
+ 	 * don't need checkpointing, just eliminate them from the
+ 	 * journal straight away.  
+ 	 */
+ 	result = cleanup_journal_tail(journal);
+ 	jbd_debug(1, "cleanup_journal_tail returned %d\n", result);
+ 	if (result <= 0)	/* error, or the tail moved: no further work here */
+ 		return result;
+ 
+ 	/*
+ 	 * OK, we need to start writing disk blocks.  Try to free up a
+ 	 * quarter of the log in a single checkpoint if we can.
+ 	 */
+ 	/*
+ 	 * AKPM: check this code.  I had a feeling a while back that it
+ 	 * degenerates into a busy loop at unmount time.
+ 	 */
+ 	target = (journal->j_last - journal->j_first) / 4;	/* NOTE(review): never read below */
+ 
+ 	spin_lock(&journal_datalist_lock);
+ repeat:
+ 	transaction = journal->j_checkpoint_transactions;
+ 	if (transaction == NULL)
+ 		goto done;	/* nothing left on the checkpoint list */
+ 	last_transaction = transaction->t_cpprev;
+ 	next_transaction = transaction;
+ 
+ 	do {
+ 		struct journal_head *jh, *last_jh, *next_jh;
+ 		int drop_count = 0;	/* buffers freed by __flush_buffer() this pass */
+ 		int cleanup_ret, retry = 0;
+ 
+ 		transaction = next_transaction;
+ 		next_transaction = transaction->t_cpnext;
+ 		jh = transaction->t_checkpoint_list;	/* dereferenced below without a NULL check */
+ 		last_jh = jh->b_cpprev;
+ 		next_jh = jh;
+ 		do {
+ 			jh = next_jh;
+ 			next_jh = jh->b_cpnext;
+ 			retry = __flush_buffer(journal, jh, bhs, &batch_count,
+ 						&drop_count);
+ 		} while (jh != last_jh && !retry);
+ 		if (batch_count) {
+ 			__flush_batch(bhs, &batch_count);	/* submit the partial batch */
+ 			goto repeat;
+ 		}
+ 		if (retry)
+ 			goto repeat;
+ 		/*
+ 		 * We have walked the whole transaction list without
+ 		 * finding anything to write to disk.  We had better be
+ 		 * able to make some progress or we are in trouble. 
+ 		 */
+ 		cleanup_ret = __cleanup_transaction(journal, transaction);
+ 		J_ASSERT(drop_count != 0 || cleanup_ret != 0);
+ 		goto repeat;	/* __cleanup may have dropped lock */
+ 	} while (transaction != last_transaction);	/* every path above jumps to repeat first */
+ 
+ done:
+ 	spin_unlock(&journal_datalist_lock);
+ 	result = cleanup_journal_tail(journal);
+ 	if (result < 0)
+ 		return result;
+ 	
+ 	return 0;	/* a result of 1 ("nothing to clean up") is reported as 0 */
+ }
+ 
+ /*
+  * Check the list of checkpoint transactions for the journal to see if
+  * we have already got rid of any since the last update of the log tail
+  * in the journal superblock.  If so, we can instantly roll the
+  * superblock forward to remove those transactions from the log.
+  * 
+  * Return <0 on error, 0 on success, 1 if there was nothing to clean up.
+  * 
+  * Called with the journal lock held.
+  *
+  * This is the only part of the journaling code which really needs to be
+  * aware of transaction aborts.  Checkpointing involves writing to the
+  * main filesystem area rather than to the journal, so it can proceed
+  * even in abort state, but we must not update the journal superblock if
+  * we have an abort error outstanding.
+  */
+ 
+ int cleanup_journal_tail(journal_t *journal)
+ {
+ 	transaction_t * transaction;
+ 	tid_t		first_tid;	/* tid of the oldest transaction still needed */
+ 	unsigned long	blocknr, freed;	/* its log start block; blocks released */
+ 
+ 	/* OK, work out the oldest transaction remaining in the log, and
+ 	 * the log block it starts at. 
+ 	 * 
+ 	 * If the log is now empty, we need to work out which is the
+ 	 * next transaction ID we will write, and where it will
+ 	 * start. */
+ 
+ 	/* j_checkpoint_transactions needs locking */
+ 	spin_lock(&journal_datalist_lock);
+ 	transaction = journal->j_checkpoint_transactions;
+ 	if (transaction) {
+ 		first_tid = transaction->t_tid;
+ 		blocknr = transaction->t_log_start;
+ 	} else if ((transaction = journal->j_committing_transaction) != NULL) {
+ 		first_tid = transaction->t_tid;
+ 		blocknr = transaction->t_log_start;
+ 	} else if ((transaction = journal->j_running_transaction) != NULL) {
+ 		first_tid = transaction->t_tid;
+ 		blocknr = journal->j_head;	/* uses the journal head, not t_log_start */
+ 	} else {
+ 		first_tid = journal->j_transaction_sequence;
+ 		blocknr = journal->j_head;
+ 	}
+ 	spin_unlock(&journal_datalist_lock);
+ 	J_ASSERT (blocknr != 0);
+ 
+ 	/* If the oldest pinned transaction is at the tail of the log
+            already then there's not much we can do right now. */
+ 	if (journal->j_tail_sequence == first_tid)
+ 		return 1;	/* nothing to clean up */
+ 
+ 	/* OK, update the superblock to recover the freed space.
+ 	 * Physical blocks come first: have we wrapped beyond the end of
+ 	 * the log?  */
+ 	freed = blocknr - journal->j_tail;
+ 	if (blocknr < journal->j_tail)	/* wrapped: add j_last - j_first */
+ 		freed = freed + journal->j_last - journal->j_first;
+ 
+ 	jbd_debug(1,
+ 		  "Cleaning journal tail from %d to %d (offset %lu), "
+ 		  "freeing %lu\n",
+ 		  journal->j_tail_sequence, first_tid, blocknr, freed);
+ 
+ 	journal->j_free += freed;
+ 	journal->j_tail_sequence = first_tid;
+ 	journal->j_tail = blocknr;
+ 	if (!(journal->j_flags & JFS_ABORT))	/* in-memory tail moves even on abort */
+ 		journal_update_superblock(journal, 1);
+ 	return 0;
+ }
+ 
+ 
+ /* Checkpoint list management */
+ 
+ /*
+  * journal_clean_checkpoint_list
+  *
+  * Find all the written-back checkpoint buffers in the journal and release them.
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  * Returns number of buffers reaped (for debug)
+  */
+ 
+ int __journal_clean_checkpoint_list(journal_t *journal)
+ {
+ 	transaction_t *transaction, *last_transaction, *next_transaction;
+ 	int ret = 0;	/* running total of buffers released */
+ 
+ 	transaction = journal->j_checkpoint_transactions;
+ 	if (transaction == NULL)
+ 		goto out;
+ 
+ 	last_transaction = transaction->t_cpprev;
+ 	next_transaction = transaction;
+ 	do {
+ 		struct journal_head *jh;
+ 
+ 		transaction = next_transaction;
+ 		next_transaction = transaction->t_cpnext;
+ 		jh = transaction->t_checkpoint_list;
+ 		if (jh) {
+ 			struct journal_head *last_jh = jh->b_cpprev;
+ 			struct journal_head *next_jh = jh;
+ 			do {
+ 				/* Read the successor first: freeing jh
+ 				 * below removes its journal_head via
+ 				 * __journal_remove_journal_head(). */
+ 				jh = next_jh;
+ 				next_jh = jh->b_cpnext;
+ 				ret += __try_to_free_cp_buf(jh);
+ 			} while (jh != last_jh);
+ 		}
+ 	} while (transaction != last_transaction);
+ out:
+ 	return ret;
+ }
+ 
+ /* 
+  * journal_remove_checkpoint: called after a buffer has been committed
+  * to disk (either by being write-back flushed to disk, or being
+  * committed to the log).
+  *
+  * We cannot safely clean a transaction out of the log until all of the
+  * buffer updates committed in that transaction have safely been stored
+  * elsewhere on disk.  To achieve this, all of the buffers in a
+  * transaction need to be maintained on the transaction's checkpoint
+  * list until they have been rewritten, at which point this function is
+  * called to remove the buffer from the existing transaction's
+  * checkpoint list.  
+  *
+  * This function is called with the journal locked.
+  * This function is called with journal_datalist_lock held.
+  */
+ 
+ void __journal_remove_checkpoint(struct journal_head *jh)
+ {
+ 	transaction_t *transaction;
+ 	journal_t *journal;
+ 
+ 	JBUFFER_TRACE(jh, "entry");
+ 	
+ 	if ((transaction = jh->b_cp_transaction) == NULL) {
+ 		JBUFFER_TRACE(jh, "not on transaction");
+ 		goto out;	/* not on any checkpoint list: nothing to unlink */
+ 	}
+ 
+ 	journal = transaction->t_journal;
+ 
+ 	__buffer_unlink(jh);	/* also clears jh->b_cp_transaction */
+ 
+ 	if (transaction->t_checkpoint_list != NULL)
+ 		goto out;	/* other buffers still pin this transaction */
+ 	JBUFFER_TRACE(jh, "transaction has no more buffers");
+ 
+ 	/* There is one special case to worry about: if we have just
+            pulled the buffer off a committing transaction's forget list,
+            then even if the checkpoint list is empty, the transaction
+            obviously cannot be dropped! */
+ 
+ 	if (transaction == journal->j_committing_transaction) {
+ 		JBUFFER_TRACE(jh, "belongs to committing transaction");
+ 		goto out;
+ 	}
+ 
+ 	/* OK, that was the last buffer for the transaction: we can now
+ 	   safely remove this transaction from the log */
+ 
+ 	__journal_drop_transaction(journal, transaction);	/* kfree()s transaction */
+ 
+ 	/* Just in case anybody was waiting for more transactions to be
+            checkpointed... */
+ 	wake_up(&journal->j_wait_logspace);
+ out:
+ 	JBUFFER_TRACE(jh, "exit");
+ }
+ 
+ void journal_remove_checkpoint(struct journal_head *jh)
+ {
+ 	spin_lock(&journal_datalist_lock);	/* locked form of the __ variant */
+ 	__journal_remove_checkpoint(jh);
+ 	spin_unlock(&journal_datalist_lock);
+ }
+ 
+ /*
+  * journal_insert_checkpoint: put a committed buffer onto a checkpoint
+  * list so that we know when it is safe to clean the transaction out of
+  * the log.
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  */
+ void __journal_insert_checkpoint(struct journal_head *jh, 
+ 			       transaction_t *transaction)
+ {
+ 	JBUFFER_TRACE(jh, "entry");
+ 	J_ASSERT_JH(jh, buffer_dirty(jh2bh(jh)) || buffer_jdirty(jh2bh(jh)));
+ 	J_ASSERT_JH(jh, jh->b_cp_transaction == NULL);	/* not already checkpointed */
+ 
+ 	assert_spin_locked(&journal_datalist_lock);
+ 	jh->b_cp_transaction = transaction;
+ 
+ 	if (!transaction->t_checkpoint_list) {
+ 		jh->b_cpnext = jh->b_cpprev = jh;	/* first entry: ring of one */
+ 	} else {
+ 		jh->b_cpnext = transaction->t_checkpoint_list;	/* link in before the head */
+ 		jh->b_cpprev = transaction->t_checkpoint_list->b_cpprev;
+ 		jh->b_cpprev->b_cpnext = jh;
+ 		jh->b_cpnext->b_cpprev = jh;
+ 	}
+ 	transaction->t_checkpoint_list = jh;	/* jh becomes the new list head */
+ }
+ 
+ void journal_insert_checkpoint(struct journal_head *jh, 
+ 			       transaction_t *transaction)
+ {
+ 	spin_lock(&journal_datalist_lock);	/* locked form of the __ variant */
+ 	__journal_insert_checkpoint(jh, transaction);
+ 	spin_unlock(&journal_datalist_lock);
+ }
+ 
+ /*
+  * We've finished with this transaction structure: adios...
+  * 
+  * The transaction must have no links except for the checkpoint by this
+  * point.
+  *
+  * Called with the journal locked.
+  * Called with journal_datalist_lock held.
+  */
+ 
+ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction)
+ {
+ 	assert_spin_locked(&journal_datalist_lock);
+ 	if (transaction->t_cpnext) {	/* linked on the journal's checkpoint ring */
+ 		transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
+ 		transaction->t_cpprev->t_cpnext = transaction->t_cpnext;
+ 		if (journal->j_checkpoint_transactions == transaction)
+ 			journal->j_checkpoint_transactions =
+ 				transaction->t_cpnext;	/* it was the head: advance */
+ 		if (journal->j_checkpoint_transactions == transaction)
+ 			journal->j_checkpoint_transactions = NULL;	/* it was alone */
+ 	}
+ 
+ 	J_ASSERT (transaction->t_ilist == NULL);
+ 	J_ASSERT (transaction->t_buffers == NULL);
+ 	J_ASSERT (transaction->t_sync_datalist == NULL);
+ 	J_ASSERT (transaction->t_async_datalist == NULL);
+ 	J_ASSERT (transaction->t_forget == NULL);
+ 	J_ASSERT (transaction->t_iobuf_list == NULL);
+ 	J_ASSERT (transaction->t_shadow_list == NULL);
+ 	J_ASSERT (transaction->t_log_list == NULL);
+ 	J_ASSERT (transaction->t_checkpoint_list == NULL);
+ 	J_ASSERT (transaction->t_updates == 0);
+ 	
+ 	J_ASSERT (transaction->t_journal->j_committing_transaction !=
+ 					transaction);
+ 	
+ 	jbd_debug (1, "Dropping transaction %d, all done\n", 
+ 		   transaction->t_tid);
+ 	kfree (transaction);	/* callers must not touch transaction after this */
+ }
+ 
diff -rc2P linux/fs/jbd/commit.c linux-2.4.13/fs/jbd/commit.c
*** linux/fs/jbd/commit.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/commit.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,701 ----
+ /*
+  * linux/fs/jbd/commit.c
+  *
+  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+  *
+  * Copyright 1998 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal commit routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.
+  */
+ 
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/smp_lock.h>
+ 
+ extern spinlock_t journal_datalist_lock;
+ 
+ /*
+  * Default IO end handler for temporary BJ_IO buffer_heads.
+  */
+ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+ {
+ 	BUFFER_TRACE(bh, "");
+ 	mark_buffer_uptodate(bh, uptodate);	/* record the I/O result on bh */
+ 	unlock_buffer(bh);	/* presumably releases wait_on_buffer() callers -- confirm */
+ }
+ 
+ /*
+  * journal_commit_transaction
+  *
+  * The primary function for committing a transaction to the log.  This
+  * function is called by the journal thread to begin a complete commit.
+  */
+ void journal_commit_transaction(journal_t *journal)
+ {
+ 	transaction_t *commit_transaction;
+ 	struct journal_head *jh, *new_jh, *descriptor;
+ 	struct journal_head *next_jh, *last_jh;
+ 	struct buffer_head *wbuf[64];	/* batch of buffers queued for IO */
+ 	int bufs;			/* number of valid entries in wbuf[] */
+ 	int flags;
+ 	int blocknr;
+ 	char *tagp = NULL;
+ 	journal_header_t *header;
+ 	journal_block_tag_t *tag = NULL;
+ 	int space_left = 0;
+ 	int first_tag = 0;
+ 	int tag_flag;
+ 	int i;
+ 
+ 	/*
+ 	 * Commit the running transaction to the log (phases 1 to 8 below).
+ 	 * First job: lock it down and wait for outstanding updates to drain.
+ 	 */
+ 
+ 	lock_journal(journal); /* Protect journal->j_running_transaction */
+ 
+ #ifdef COMMIT_STATS
+ 	spin_lock(&journal_datalist_lock);
+ 	summarise_journal_usage(journal);
+ 	spin_unlock(&journal_datalist_lock);
+ #endif
+ 
+ 	lock_kernel();
+ 	
+ 	J_ASSERT (journal->j_running_transaction != NULL);
+ 	J_ASSERT (journal->j_committing_transaction == NULL);
+ 
+ 	commit_transaction = journal->j_running_transaction;
+ 	J_ASSERT (commit_transaction->t_state == T_RUNNING);
+ 
+ 	jbd_debug (1, "JBD: starting commit of transaction %d\n",
+ 		   commit_transaction->t_tid);
+ 
+ 	commit_transaction->t_state = T_LOCKED;
+ 	while (commit_transaction->t_updates != 0) {
+ 		unlock_journal(journal);
+ 		sleep_on(&journal->j_wait_updates);
+ 		lock_journal(journal);
+ 	}
+ 
+ 	J_ASSERT (commit_transaction->t_outstanding_credits <=
+ 			journal->j_max_transaction_buffers);
+ 
+ 	/* Do we need to erase the effects of a prior journal_flush? */
+ 	if (journal->j_flags & JFS_FLUSHED) {
+ 		jbd_debug(3, "super block updated\n");
+ 		journal_update_superblock(journal, 1);
+ 	} else {
+ 		jbd_debug(3, "superblock not updated\n");
+ 	}
+ 
+ 	/*
+ 	 * First thing we are allowed to do is to discard any remaining
+ 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
+ 	 * that there are no such buffers: if a large filesystem
+ 	 * operation like a truncate needs to split itself over multiple
+ 	 * transactions, then it may try to do a journal_restart() while
+ 	 * there are still BJ_Reserved buffers outstanding.  These must
+ 	 * be released cleanly from the current transaction.
+ 	 *
+ 	 * In this case, the filesystem must still reserve write access
+ 	 * again before modifying the buffer in the new transaction, but
+ 	 * we do not require it to remember exactly which old buffers it
+ 	 * has reserved.  This is consistent with the existing behaviour
+ 	 * that multiple journal_get_write_access() calls to the same
+ 	 * buffer are perfectly permissible.
+ 	 */
+ 
+ 	while (commit_transaction->t_reserved_list) {
+ 		jh = commit_transaction->t_reserved_list;
+ 		JBUFFER_TRACE(jh, "reserved, unused: refile");
+ 		journal_refile_buffer(jh);
+ 	}
+ 
+ 	/*
+ 	 * Now try to drop any written-back buffers from the journal's
+ 	 * checkpoint lists.  We do this *before* commit because it potentially
+ 	 * frees some memory.
+ 	 */
+ 	spin_lock(&journal_datalist_lock);
+ 	__journal_clean_checkpoint_list(journal);
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	/* First part of the commit: force the revoke list out to disk.
+ 	 * The revoke code generates its own metadata blocks on disk for this.
+ 	 *
+ 	 * It is important that we do this while the transaction is
+ 	 * still locked.  Generating the revoke records should not
+ 	 * generate any IO stalls, so this should be quick; and doing
+ 	 * the work while we have the transaction locked means that we
+ 	 * only ever have to maintain the revoke list for one
+ 	 * transaction at a time.
+ 	 */
+ 
+ 	jbd_debug (3, "JBD: commit phase 1\n");
+ 
+ 	journal_write_revoke_records(journal, commit_transaction);
+ 
+ 	/*
+ 	 * Now that we have built the revoke records, we can start
+ 	 * reusing the revoke list for a new running transaction.  We
+ 	 * can now safely start committing the old transaction: time to
+ 	 * get a new running transaction for incoming filesystem updates
+ 	 */
+ 
+ 	commit_transaction->t_state = T_FLUSH;
+ 
+ 	wake_up(&journal->j_wait_transaction_locked);
+ 
+ 	journal->j_committing_transaction = commit_transaction;
+ 	journal->j_running_transaction = NULL;
+ 
+ 	commit_transaction->t_log_start = journal->j_head;
+ 
+ 	unlock_kernel();
+ 	
+ 	jbd_debug (3, "JBD: commit phase 2\n");
+ 
+ 	/*
+ 	 * Now start flushing things to disk, in the order they appear
+ 	 * on the transaction lists.  Data blocks go first.
+ 	 */
+ 
+ 	/*
+ 	 * Whenever we unlock the journal and sleep, things can get added
+ 	 * onto ->t_datalist, so we have to keep looping back to write_out_data
+ 	 * until we *know* that the list is empty.
+ 	 */
+ write_out_data:
+ 
+ 	/*
+ 	 * Cleanup any flushed data buffers from the data list.  Even in
+ 	 * abort mode, we want to flush this out as soon as possible.
+ 	 *
+ 	 * We take journal_datalist_lock to protect the lists from
+ 	 * journal_try_to_free_buffers().
+ 	 */
+ 	spin_lock(&journal_datalist_lock);
+ 
+ write_out_data_locked:
+ 	bufs = 0;
+ 	next_jh = commit_transaction->t_sync_datalist;
+ 	if (next_jh == NULL)
+ 		goto sync_datalist_empty;
+ 	last_jh = next_jh->b_tprev;
+ 
+ 	do {
+ 		struct buffer_head *bh;
+ 
+ 		jh = next_jh;
+ 		next_jh = jh->b_tnext;
+ 		bh = jh2bh(jh);
+ 		if (!buffer_locked(bh)) {
+ 			if (buffer_dirty(bh)) {
+ 				BUFFER_TRACE(bh, "start journal writeout");
+ 				atomic_inc(&bh->b_count);
+ 				wbuf[bufs++] = bh;
+ 			} else {
+ 				BUFFER_TRACE(bh, "writeout complete: unfile");
+ 				__journal_unfile_buffer(jh);
+ 				jh->b_transaction = NULL;
+ 				__journal_remove_journal_head(bh);
+ 				refile_buffer(bh);
+ 				__brelse(bh);
+ 			}
+ 		}
+ 		if (bufs == ARRAY_SIZE(wbuf)) {
+ 			/*
+ 			 * Major speedup: start here on the next scan
+ 			 */
+ 			J_ASSERT(commit_transaction->t_sync_datalist != 0);
+ 			commit_transaction->t_sync_datalist = jh;
+ 			break;
+ 		}
+ 	} while (jh != last_jh);
+ 
+ 	if (bufs || current->need_resched) {
+ 		jbd_debug(2, "submit %d writes\n", bufs);
+ 		spin_unlock(&journal_datalist_lock);
+ 		unlock_journal(journal);
+ 		if (bufs)
+ 			ll_rw_block(WRITE, bufs, wbuf);
+ 		if (current->need_resched)
+ 			schedule();
+ 		journal_brelse_array(wbuf, bufs);
+ 		lock_journal(journal);
+ 		spin_lock(&journal_datalist_lock);
+ 		if (bufs)
+ 			goto write_out_data_locked;
+ 	}
+ 
+ 	/*
+ 	 * Wait for all previously submitted IO on the data list to complete.
+ 	 */
+ 	jh = commit_transaction->t_sync_datalist;
+ 	if (jh == NULL)
+ 		goto sync_datalist_empty;
+ 
+ 	do {
+ 		struct buffer_head *bh;
+ 		jh = jh->b_tprev;	/* Wait on the last written */
+ 		bh = jh2bh(jh);
+ 		if (buffer_locked(bh)) {
+ 			spin_unlock(&journal_datalist_lock);
+ 			unlock_journal(journal);
+ 			wait_on_buffer(bh);
+ 			/* the journal_head may have been removed now */
+ 			lock_journal(journal);
+ 			goto write_out_data;
+ 		} else if (buffer_dirty(bh)) {
+ 			goto write_out_data_locked;
+ 		}
+ 	} while (jh != commit_transaction->t_sync_datalist);
+ 	goto write_out_data_locked;
+ 
+ sync_datalist_empty:
+ 	/*
+ 	 * Wait for all the async writepage data.  As they become unlocked
+ 	 * in end_buffer_io_async(), the only place where they can be
+ 	 * reaped is in try_to_free_buffers(), and we're locked against
+ 	 * that.
+ 	 */
+ 	while ((jh = commit_transaction->t_async_datalist)) {
+ 		struct buffer_head *bh = jh2bh(jh);
+ 		if (buffer_locked(bh)) {
+ 			spin_unlock(&journal_datalist_lock);
+ 			unlock_journal(journal);
+ 			wait_on_buffer(bh);
+ 			lock_journal(journal);
+ 			spin_lock(&journal_datalist_lock);
+ 			continue;	/* List may have changed */
+ 		}
+ 		if (jh->b_next_transaction) {
+ 			/*
+ 			 * For writepage() buffers in journalled data mode: a
+ 			 * later transaction may want the buffer for "metadata"
+ 			 */
+ 			__journal_refile_buffer(jh);
+ 		} else {
+ 			BUFFER_TRACE(bh, "finished async writeout: unfile");
+ 			__journal_unfile_buffer(jh);
+ 			jh->b_transaction = NULL;
+ 			__journal_remove_journal_head(bh);
+ 			BUFFER_TRACE(bh, "finished async writeout: refile");
+ 			/* It can sometimes be on BUF_LOCKED due to migration
+ 			 * from syncdata to asyncdata */
+ 			if (bh->b_list != BUF_CLEAN)
+ 				refile_buffer(bh);
+ 			__brelse(bh);
+ 		}
+ 	}
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	/*
+ 	 * If we found any dirty or locked buffers, then we should have
+ 	 * looped back up to the write_out_data label.  If there weren't
+ 	 * any then the scans above should have unfiled every buffer and
+ 	 * left both data lists empty, so check that they in fact are.
+ 	 */
+ 	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
+ 	J_ASSERT (commit_transaction->t_async_datalist == NULL);
+ 
+ 	jbd_debug (3, "JBD: commit phase 3\n");
+ 
+ 	/*
+ 	 * Way to go: we have now written out all of the data for a
+ 	 * transaction!  Now comes the tricky part: we need to write out
+ 	 * metadata.  Loop over the transaction's entire buffer list:
+ 	 */
+ 	commit_transaction->t_state = T_COMMIT;
+ 
+ 	descriptor = 0;
+ 	bufs = 0;
+ 	while (commit_transaction->t_buffers) {
+ 
+ 		/* Find the next buffer to be journaled... */
+ 
+ 		jh = commit_transaction->t_buffers;
+ 
+ 		/* If we're in abort mode, we just un-journal the buffer and
+ 		   release it for background writing. */
+ 
+ 		if (is_journal_aborted(journal)) {
+ 			JBUFFER_TRACE(jh, "journal is aborting: refile");
+ 			journal_refile_buffer(jh);
+ 			/* If that was the last one, we need to clean up
+ 			 * any descriptor buffers which may have been
+ 			 * already allocated, even if we are now
+ 			 * aborting. */
+ 			if (!commit_transaction->t_buffers)
+ 				goto start_journal_io;
+ 			continue;
+ 		}
+ 
+ 		/* Make sure we have a descriptor block in which to
+ 		   record the metadata buffer. */
+ 
+ 		if (!descriptor) {
+ 			struct buffer_head *bh;
+ 
+ 			J_ASSERT (bufs == 0);
+ 
+ 			jbd_debug(4, "JBD: get descriptor\n");
+ 
+ 			descriptor = journal_get_descriptor_buffer(journal);
+ 			bh = jh2bh(descriptor);
+ 			jbd_debug(4, "JBD: got buffer %ld (%p)\n",
+ 				bh->b_blocknr, bh->b_data);
+ 			header = (journal_header_t *)&bh->b_data[0];
+ 			header->h_magic     = htonl(JFS_MAGIC_NUMBER);
+ 			header->h_blocktype = htonl(JFS_DESCRIPTOR_BLOCK);
+ 			header->h_sequence  = htonl(commit_transaction->t_tid);
+ 
+ 			tagp = &bh->b_data[sizeof(journal_header_t)];
+ 			space_left = bh->b_size - sizeof(journal_header_t);
+ 			first_tag = 1;
+ 			set_bit(BH_JWrite, &bh->b_state);
+ 			wbuf[bufs++] = bh;
+ 
+ 			/* Record it so that we can wait for IO
+                            completion later */
+ 			BUFFER_TRACE(bh, "ph3: file as descriptor");
+ 			journal_file_buffer(descriptor, commit_transaction,
+ 						BJ_LogCtl);
+ 		}
+ 
+ 		/* Where is the buffer to be written? */
+ 
+ 		blocknr = journal_next_log_block(journal);
+ 
+ 		/* Bump b_count to prevent truncate from stumbling over
+                    the shadowed buffer!  @@@ This can go if we ever get
+                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
+ 		atomic_inc(&jh2bh(jh)->b_count);
+ 
+ 		/* Make a temporary IO buffer with which to write it out
+                    (this will requeue both the metadata buffer and the
+                    temporary IO buffer). new_jh goes on BJ_IO */
+ 
+ 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
+ 		/*
+ 		 * akpm: journal_write_metadata_buffer() sets
+ 		 * new_jh->b_transaction to commit_transaction.
+ 		 * We need to clean this up before we release new_jh
+ 		 * (which is of type BJ_IO)
+ 		 */
+ 		JBUFFER_TRACE(jh, "ph3: write metadata");
+ 		flags = journal_write_metadata_buffer(commit_transaction,
+ 						      jh, &new_jh, blocknr);
+ 		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
+ 		wbuf[bufs++] = jh2bh(new_jh);
+ 
+ 		/* Record the new block's tag in the current descriptor
+                    buffer */
+ 
+ 		tag_flag = 0;
+ 		if (flags & 1)		/* bit 0: data was escaped */
+ 			tag_flag |= JFS_FLAG_ESCAPE;
+ 		if (!first_tag)
+ 			tag_flag |= JFS_FLAG_SAME_UUID;
+ 
+ 		tag = (journal_block_tag_t *) tagp;
+ 		tag->t_blocknr = htonl(jh2bh(jh)->b_blocknr);
+ 		tag->t_flags = htonl(tag_flag);
+ 		tagp += sizeof(journal_block_tag_t);
+ 		space_left -= sizeof(journal_block_tag_t);
+ 
+ 		if (first_tag) {
+ 			/* only the first tag is followed by the 16-byte UUID */
+ 			memcpy (tagp, journal->j_uuid, 16);
+ 			tagp += 16;
+ 			space_left -= 16;
+ 			first_tag = 0;
+ 		}
+ 
+ 		/* If there's no more to do, or if the descriptor is full,
+ 		   let the IO rip! */
+ 
+ 		if (bufs == ARRAY_SIZE(wbuf) ||
+ 		    commit_transaction->t_buffers == NULL ||
+ 		    space_left < sizeof(journal_block_tag_t) + 16) {
+ 
+ 			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
+ 
+ 			/* Write an end-of-descriptor marker before
+                            submitting the IOs.  "tag" still points to
+                            the last tag we set up. */
+ 
+ 			tag->t_flags |= htonl(JFS_FLAG_LAST_TAG);
+ 
+ start_journal_io:
+ 			unlock_journal(journal);
+ 			for (i=0; i<bufs; i++) {
+ 				struct buffer_head *bh = wbuf[i];
+ 				set_bit(BH_Lock, &bh->b_state);
+ 				clear_bit(BH_Dirty, &bh->b_state);
+ 				bh->b_end_io = journal_end_buffer_io_sync;
+ 				submit_bh(WRITE, bh);
+ 			}
+ 			if (current->need_resched)
+ 				schedule();
+ 			lock_journal(journal);
+ 
+ 			/* Force a new descriptor to be generated next
+                            time round the loop. */
+ 			descriptor = NULL;
+ 			bufs = 0;
+ 		}
+ 	}
+ 
+ 	/* Lo and behold: we have just managed to send a transaction to
+            the log.  Before we can commit it, wait for the IO so far to
+            complete.  Control buffers being written are on the
+            transaction's t_log_list queue, and metadata buffers are on
+            the t_iobuf_list queue.
+ 
+ 	   Wait for the buffers in reverse order.  That way we are
+ 	   less likely to be woken up until all IOs have completed, and
+ 	   so we incur less scheduling load.
+ 	*/
+ 
+ 	jbd_debug(3, "JBD: commit phase 4\n");
+ 
+ 	/* akpm: these are BJ_IO, and journal_datalist_lock is not needed */
+  wait_for_iobuf:
+ 	while (commit_transaction->t_iobuf_list != NULL) {
+ 		struct buffer_head *bh;
+ 		jh = commit_transaction->t_iobuf_list->b_tprev;
+ 		bh = jh2bh(jh);
+ 		if (buffer_locked(bh)) {
+ 			unlock_journal(journal);
+ 			wait_on_buffer(bh);
+ 			lock_journal(journal);
+ 			goto wait_for_iobuf;
+ 		}
+ 
+ 		clear_bit(BH_JWrite, &jh2bh(jh)->b_state);
+ 
+ 		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
+ 		journal_unfile_buffer(jh);
+ 
+ 		/*
+ 		 * akpm: don't put back a buffer_head with stale pointers
+ 		 * dangling around.
+ 		 */
+ 		J_ASSERT_JH(jh, jh->b_transaction != NULL);
+ 		jh->b_transaction = NULL;
+ 
+ 		/*
+ 		 * ->t_iobuf_list should contain only dummy buffer_heads
+ 		 * which were created by journal_write_metadata_buffer().
+ 		 */
+ 		bh = jh2bh(jh);
+ 		BUFFER_TRACE(bh, "dumping temporary bh");
+ 		journal_unlock_journal_head(jh);
+ 		__brelse(bh);
+ 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
+ 		put_unused_buffer_head(bh);
+ 
+ 		/* We also have to unlock and free the corresponding
+                    shadowed buffer */
+ 		jh = commit_transaction->t_shadow_list->b_tprev;
+ 		bh = jh2bh(jh);
+ 		clear_bit(BH_JWrite, &bh->b_state);
+ 		J_ASSERT_BH(bh, buffer_jdirty(bh));
+ 
+ 		/* The metadata is now released for reuse, but we need
+                    to remember it against this transaction so that when
+                    we finally commit, we can do any checkpointing
+                    required. */
+ 		JBUFFER_TRACE(jh, "file as BJ_Forget");
+ 		journal_file_buffer(jh, commit_transaction, BJ_Forget);
+ 		/* Wake up any transactions which were waiting for this
+ 		   IO to complete */
+ 		wake_up(&bh->b_wait);
+ 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
+ 		__brelse(bh);
+ 	}
+ 
+ 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
+ 
+ 	jbd_debug(3, "JBD: commit phase 5\n");
+ 
+ 	/* Here we wait for the revoke record and descriptor record buffers */
+  wait_for_ctlbuf:
+ 	while (commit_transaction->t_log_list != NULL) {
+ 		struct buffer_head *bh;
+ 
+ 		jh = commit_transaction->t_log_list->b_tprev;
+ 		bh = jh2bh(jh);
+ 		if (buffer_locked(bh)) {
+ 			unlock_journal(journal);
+ 			wait_on_buffer(bh);
+ 			lock_journal(journal);
+ 			goto wait_for_ctlbuf;
+ 		}
+ 
+ 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
+ 		clear_bit(BH_JWrite, &bh->b_state);
+ 		journal_unfile_buffer(jh);
+ 		jh->b_transaction = NULL;
+ 		journal_unlock_journal_head(jh);
+ 		__brelse(bh);		/* One for getblk */
+ 		/* AKPM: bforget here */
+ 	}
+ 
+ 	jbd_debug(3, "JBD: commit phase 6\n");
+ 
+ 	/* Done it all: now write the commit record.  We should have
+ 	 * cleaned up our previous buffers by now, so if we are in abort
+ 	 * mode we can now just skip the rest of the journal write
+ 	 * entirely. */
+ 
+ 	if (is_journal_aborted(journal))
+ 		goto skip_commit;
+ 
+ 	descriptor = journal_get_descriptor_buffer(journal);
+ 
+ 	/* AKPM: buglet - add `i' to tmp!  Every pass rewrites offset 0. */
+ 	for (i = 0; i < jh2bh(descriptor)->b_size; i += 512) {
+ 		journal_header_t *tmp =
+ 			(journal_header_t*)jh2bh(descriptor)->b_data;
+ 		tmp->h_magic = htonl(JFS_MAGIC_NUMBER);
+ 		tmp->h_blocktype = htonl(JFS_COMMIT_BLOCK);
+ 		tmp->h_sequence = htonl(commit_transaction->t_tid);
+ 	}
+ 
+ 	unlock_journal(journal);
+ 	JBUFFER_TRACE(descriptor, "write commit block");
+ 	{
+ 		struct buffer_head *bh = jh2bh(descriptor);
+ 		ll_rw_block(WRITE, 1, &bh);
+ 		wait_on_buffer(bh);
+ 		__brelse(bh);		/* One for getblk() */
+ 		journal_unlock_journal_head(descriptor);
+ 	}
+ 	lock_journal(journal);
+ 
+ 	/* End of a transaction!  Finally, we can do checkpoint
+            processing: any buffers committed as a result of this
+            transaction can be removed from any checkpoint list it was on
+            before. */
+ 
+ skip_commit:
+ 
+ 	jbd_debug(3, "JBD: commit phase 7\n");
+ 
+ 	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ 	J_ASSERT(commit_transaction->t_async_datalist == NULL);
+ 	J_ASSERT(commit_transaction->t_buffers == NULL);
+ 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
+ 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
+ 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
+ 	J_ASSERT(commit_transaction->t_log_list == NULL);
+ 
+ 	while (commit_transaction->t_forget) {
+ 		transaction_t *cp_transaction;
+ 		struct buffer_head *bh;
+ 
+ 		jh = commit_transaction->t_forget;
+ 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
+ 			jh->b_transaction == journal->j_running_transaction);
+ 
+ 		/*
+ 		 * If there is undo-protected committed data against
+ 		 * this buffer, then we can remove it now.  If it is a
+ 		 * buffer needing such protection, the old frozen_data
+ 		 * field now points to a committed version of the
+ 		 * buffer, so rotate that field to the new committed
+ 		 * data.
+ 		 *
+ 		 * Otherwise, we can just throw away the frozen data now.
+ 		 */
+ 		if (jh->b_committed_data) {
+ 			kfree(jh->b_committed_data);
+ 			jh->b_committed_data = NULL;
+ 			if (jh->b_frozen_data) {
+ 				jh->b_committed_data = jh->b_frozen_data;
+ 				jh->b_frozen_data = NULL;
+ 			}
+ 		} else if (jh->b_frozen_data) {
+ 			kfree(jh->b_frozen_data);
+ 			jh->b_frozen_data = NULL;
+ 		}
+ 
+ 		spin_lock(&journal_datalist_lock);
+ 		cp_transaction = jh->b_cp_transaction;
+ 		if (cp_transaction) {
+ 			JBUFFER_TRACE(jh, "remove from old cp transaction");
+ 			J_ASSERT_JH(jh, commit_transaction != cp_transaction);
+ 			__journal_remove_checkpoint(jh);
+ 		}
+ 
+ 		/* Only re-checkpoint the buffer_head if it is marked
+ 		 * dirty.  If the buffer was added to the BJ_Forget list
+ 		 * by journal_forget, it may no longer be dirty and
+ 		 * there's no point in keeping a checkpoint record for
+ 		 * it. */
+ 		bh = jh2bh(jh);
+ 		if (buffer_jdirty(bh)) {
+ 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
+ 			__journal_insert_checkpoint(jh, commit_transaction);
+ 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
+ 			__journal_refile_buffer(jh);
+ 		} else {
+ 			J_ASSERT_BH(bh, !buffer_dirty(bh));
+ 			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ 			__journal_unfile_buffer(jh);
+ 			jh->b_transaction = 0;
+ 			__journal_remove_journal_head(bh);
+ 			__brelse(bh);
+ 		}
+ 		spin_unlock(&journal_datalist_lock);
+ 	}
+ 
+ 	/* Done with this transaction! */
+ 
+ 	jbd_debug(3, "JBD: commit phase 8\n");
+ 
+ 	J_ASSERT (commit_transaction->t_state == T_COMMIT);
+ 	commit_transaction->t_state = T_FINISHED;
+ 
+ 	J_ASSERT (commit_transaction == journal->j_committing_transaction);
+ 	journal->j_commit_sequence = commit_transaction->t_tid;
+ 	journal->j_committing_transaction = NULL;
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	if (commit_transaction->t_checkpoint_list == NULL) {
+ 		__journal_drop_transaction(journal, commit_transaction);
+ 	} else {
+ 		if (journal->j_checkpoint_transactions == NULL) {
+ 			journal->j_checkpoint_transactions = commit_transaction;
+ 			commit_transaction->t_cpnext = commit_transaction;
+ 			commit_transaction->t_cpprev = commit_transaction;
+ 		} else {
+ 			commit_transaction->t_cpnext =
+ 				journal->j_checkpoint_transactions;
+ 			commit_transaction->t_cpprev =
+ 				commit_transaction->t_cpnext->t_cpprev;
+ 			commit_transaction->t_cpnext->t_cpprev =
+ 				commit_transaction;
+ 			commit_transaction->t_cpprev->t_cpnext =
+ 				commit_transaction;
+ 		}
+ 	}
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
+ 		  journal->j_commit_sequence, journal->j_tail_sequence);
+ 
+ 	unlock_journal(journal);
+ 	wake_up(&journal->j_wait_done_commit);
+ }
diff -rc2P linux/fs/jbd/journal.c linux-2.4.13/fs/jbd/journal.c
*** linux/fs/jbd/journal.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/journal.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,1716 ----
+ /*
+  * linux/fs/jbd/journal.c
+  *
+  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+  *
+  * Copyright 1998 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Generic filesystem journal-writing code; part of the ext2fs
+  * journaling system.
+  *
+  * This file manages journals: areas of disk reserved for logging
+  * transactional updates.  This includes the kernel journaling thread
+  * which is responsible for scheduling updates to the log.
+  *
+  * We do not actually manage the physical storage of the journal in this
+  * file: that is left to a per-journal policy function, which allows us
+  * to store the journal within a filesystem-specified area for ext2
+  * journaling (ext2 can use a reserved inode for storing the log).
+  */
+ 
+ #include <linux/module.h>
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/smp_lock.h>
+ #include <linux/sched.h>
+ #include <linux/init.h>
+ #include <linux/mm.h>
+ #include <linux/slab.h>
+ 
+ EXPORT_SYMBOL(journal_start);
+ EXPORT_SYMBOL(journal_try_start);
+ EXPORT_SYMBOL(journal_restart);
+ EXPORT_SYMBOL(journal_extend);
+ EXPORT_SYMBOL(journal_stop);
+ EXPORT_SYMBOL(journal_lock_updates);
+ EXPORT_SYMBOL(journal_unlock_updates);
+ EXPORT_SYMBOL(journal_get_write_access);
+ EXPORT_SYMBOL(journal_get_create_access);
+ EXPORT_SYMBOL(journal_get_undo_access);
+ EXPORT_SYMBOL(journal_dirty_data);
+ EXPORT_SYMBOL(journal_dirty_metadata);
+ #if 0	/* compiled out: this symbol is not exported */
+ EXPORT_SYMBOL(journal_release_buffer);
+ #endif	/* 0 */
+ EXPORT_SYMBOL(journal_forget);
+ #if 0	/* compiled out: this symbol is not exported */
+ EXPORT_SYMBOL(journal_sync_buffer);
+ #endif	/* 0 */
+ EXPORT_SYMBOL(journal_flush);
+ EXPORT_SYMBOL(journal_revoke);
+ 
+ EXPORT_SYMBOL(journal_init_dev);
+ EXPORT_SYMBOL(journal_init_inode);
+ EXPORT_SYMBOL(journal_update_format);
+ EXPORT_SYMBOL(journal_check_used_features);
+ EXPORT_SYMBOL(journal_check_available_features);
+ EXPORT_SYMBOL(journal_set_features);
+ EXPORT_SYMBOL(journal_create);
+ EXPORT_SYMBOL(journal_load);
+ EXPORT_SYMBOL(journal_destroy);
+ EXPORT_SYMBOL(journal_recover);
+ EXPORT_SYMBOL(journal_update_superblock);
+ EXPORT_SYMBOL(__journal_abort);
+ EXPORT_SYMBOL(journal_abort);
+ EXPORT_SYMBOL(journal_errno);
+ EXPORT_SYMBOL(journal_ack_err);
+ EXPORT_SYMBOL(journal_clear_err);
+ EXPORT_SYMBOL(log_wait_commit);
+ EXPORT_SYMBOL(log_start_commit);
+ EXPORT_SYMBOL(journal_wipe);
+ EXPORT_SYMBOL(journal_blocks_per_page);
+ EXPORT_SYMBOL(journal_flushpage);
+ EXPORT_SYMBOL(journal_try_to_free_buffers);
+ EXPORT_SYMBOL(journal_bmap);
+ EXPORT_SYMBOL(journal_force_commit);
+ /* Forward declaration of a file-local (static) helper. */
+ static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+ 
+ /*
+  * journal_datalist_lock is used to protect data buffers:
+  *
+  *	jh->b_transaction
+  *	jh->b_tprev
+  *	jh->b_tnext
+  *
+  * journal_free_buffer() is called from journal_try_to_free_buffers(), and
+  * is async wrt everything else.
+  *
+  * It is also used for checkpoint data, also to protect against
+  * journal_try_to_free_buffers():
+  *
+  *	jh->b_cp_transaction
+  *	jh->b_cpnext
+  *	jh->b_cpprev
+  *	transaction->t_checkpoint_list
+  *	transaction->t_cpnext
+  *	transaction->t_cpprev
+  *	journal->j_checkpoint_transactions
+  *
+  * It is global at this time rather than per-journal because it's
+  * impossible for __journal_free_buffer to go from a buffer_head
+  * back to a journal_t unracily (well, not true.  Fix later)
+  *
+  *
+  * The `datalist' and `checkpoint list' functions are quite
+  * separate and we could use two spinlocks here.
+  *
+  * lru_list_lock nests inside journal_datalist_lock.
+  */
+ spinlock_t journal_datalist_lock = SPIN_LOCK_UNLOCKED;
+ 
+ /* List of all journals in the system: kjournald() links each journal in
+  * when it starts and unlinks it when it exits.  Protected by the BKL.
+  */
+ static LIST_HEAD(all_journals);
+ 
+ /* Timer callback for commit timeouts: wakes the task whose pointer
+  * kjournald() stored in the timer's data word.
+  */
+ static void commit_timeout(unsigned long __data)
+ {
+ 	struct task_struct *sleeper;
+ 
+ 	sleeper = (struct task_struct *) __data;
+ 	wake_up_process(sleeper);
+ }
+ 
+ /* Static check that struct journal_superblock_s is exactly 1024 bytes.
+  * No code is meant to run: with a wrong size the call below survives
+  * and we get a linker failure (assumes the callee is never defined). */
+ void __journal_internal_check(void)
+ {
+ 	extern void journal_bad_superblock_size(void);
+ 	if (sizeof(struct journal_superblock_s) != 1024)
+ 		journal_bad_superblock_size();
+ }
+ 
+ /*
+  * kjournald: The main thread function used to manage a logging device
+  * journal.
+  *
+  * This kernel thread is responsible for two things:
+  *
+  * 1) COMMIT:  Every so often we need to commit the current state of the
+  *    filesystem to disk.  The journal thread is responsible for writing
+  *    all of the metadata buffers to disk.
+  *
+  * 2) CHECKPOINT: We cannot reuse a used section of the log file until all
+  *    of the data in that part of the log has been rewritten elsewhere on
+  *    the disk.  Flushing these old buffers to reclaim space in the log is
+  *    known as checkpointing, and this thread is responsible for that job.
+  */
+ 
+ journal_t *current_journal;	/* AKPM: debug; set by each kjournald() at startup */
+ 
+ int kjournald(void *arg)
+ {
+ 	journal_t *journal = (journal_t *) arg;
+ 	transaction_t *running;
+ 	struct timer_list commit_timer;
+ 
+ 	current_journal = journal;
+ 
+ 	/* Take the BKL, become a daemon and block every signal. */
+ 	lock_kernel();
+ 	daemonize();
+ 	spin_lock_irq(&current->sigmask_lock);
+ 	sigfillset(&current->blocked);
+ 	recalc_sigpending(current);
+ 	spin_unlock_irq(&current->sigmask_lock);
+ 
+ 	sprintf(current->comm, "kjournald");
+ 
+ 	/* Interval timer: on expiry commit_timeout() wakes this thread
+ 	 * so that it can request a commit of an expired transaction. */
+ 	init_timer(&commit_timer);
+ 	commit_timer.function = commit_timeout;
+ 	commit_timer.data = (unsigned long) current;
+ 	journal->j_commit_timer = &commit_timer;
+ 
+ 	/* Announce that the journal thread is up and running */
+ 	journal->j_task = current;
+ 	wake_up(&journal->j_wait_done_commit);
+ 
+ 	printk(KERN_INFO "kjournald starting.  Commit interval %ld seconds\n",
+ 			journal->j_commit_interval / HZ);
+ 	list_add(&journal->j_all_journals, &all_journals);
+ 
+ 	/* Serve commit requests until the journal is being unmounted. */
+ 	while (!(journal->j_flags & JFS_UNMOUNT)) {
+ 		jbd_debug(1, "commit_sequence=%d, commit_request=%d\n",
+ 			journal->j_commit_sequence, journal->j_commit_request);
+ 
+ 		if (journal->j_commit_sequence == journal->j_commit_request) {
+ 			/* Nothing to commit: release waiters, then sleep */
+ 			wake_up(&journal->j_wait_done_commit);
+ 			interruptible_sleep_on(&journal->j_wait_commit);
+ 
+ 			jbd_debug(1, "kjournald wakes\n");
+ 
+ 			/* Has the running transaction reached its expiry
+ 			 * time?  If so, ask for it to be committed. */
+ 			running = journal->j_running_transaction;
+ 			if (running != NULL &&
+ 			    time_after_eq(jiffies, running->t_expires)) {
+ 				journal->j_commit_request = running->t_tid;
+ 				jbd_debug(1, "woke because of timeout\n");
+ 			}
+ 			continue;
+ 		}
+ 
+ 		jbd_debug(1, "OK, requests differ\n");
+ 		if (journal->j_commit_timer_active) {
+ 			journal->j_commit_timer_active = 0;
+ 			del_timer(journal->j_commit_timer);
+ 		}
+ 		journal_commit_transaction(journal);
+ 	}
+ 
+ 	/* Unmounting: stop the timer, which lives on this thread's stack */
+ 	if (journal->j_commit_timer_active) {
+ 		journal->j_commit_timer_active = 0;
+ 		del_timer_sync(journal->j_commit_timer);
+ 	}
+ 
+ 	list_del(&journal->j_all_journals);
+ 	journal->j_task = NULL;
+ 	wake_up(&journal->j_wait_done_commit);
+ 	jbd_debug(1, "Journal thread exiting.\n");
+ 	return 0;
+ }
+ 
+ static void journal_start_thread(journal_t *journal)
+ {
+ 	/* kjournald() sets j_task, then wakes j_wait_done_commit */
+ 	kernel_thread(kjournald, journal, CLONE_VM | CLONE_FS | CLONE_FILES);
+ 	while (journal->j_task == NULL)
+ 		sleep_on(&journal->j_wait_done_commit);
+ }
+ 
+ static void journal_kill_thread(journal_t *journal)
+ {
+ 	/* kjournald() leaves its loop on JFS_UNMOUNT and clears j_task */
+ 	journal->j_flags = journal->j_flags | JFS_UNMOUNT;
+ 
+ 	while (journal->j_task != NULL) {
+ 		wake_up(&journal->j_wait_commit);
+ 		sleep_on(&journal->j_wait_done_commit);
+ 	}
+ }
+ 
+ #if 0
+ 
+ This is no longer needed - we do it in commit quite efficiently.
+ Note that if this function is resurrected, the loop needs to
+ be reorganised into the next_jh/last_jh algorithm.
+ 
+ /*
+  * journal_clean_data_list: cleanup after data IO.
+  *
+  * Once the IO system has finished writing the buffers on the transaction's
+  * data list, we can remove those buffers from the list.  This function
+  * scans the list for such buffers and removes them cleanly.
+  *
+  * We assume that the journal is already locked.
+  * We are called with journal_datalist_lock held.
+  *
+  * AKPM: This function looks inefficient.  Approximately O(n^2)
+  * for potentially thousands of buffers.  It no longer shows on profiles
+  * because these buffers are mainly dropped in journal_commit_transaction().
+  */
+ 
+ void __journal_clean_data_list(transaction_t *transaction)
+ {
+ 	struct journal_head *jh, *next;
+ 	/* NOTE: compiled out - this function sits inside an #if 0 region */
+ 	assert_spin_locked(&journal_datalist_lock);
+ 
+ restart:	/* rescan from the list head after every removal */
+ 	jh = transaction->t_sync_datalist;
+ 	if (!jh)	/* list is empty: nothing left to clean */
+ 		goto out;
+ 	do {
+ 		next = jh->b_tnext;
+ 		/* neither locked nor dirty: unfile and release the buffer */
+ 		if (!buffer_locked(jh2bh(jh)) && !buffer_dirty(jh2bh(jh))) {
+ 			struct buffer_head *bh = jh2bh(jh);
+ 			BUFFER_TRACE(bh, "data writeout complete: unfile");
+ 			__journal_unfile_buffer(jh);
+ 			jh->b_transaction = NULL;
+ 			__journal_remove_journal_head(bh);
+ 			refile_buffer(bh);
+ 			__brelse(bh);
+ 			goto restart;
+ 		}
+ 		jh = next;
+ 	} while (transaction->t_sync_datalist &&
+ 			jh != transaction->t_sync_datalist);
+ out:
+ 	return;
+ }
+ #endif
+ 
+ /*
+  * journal_write_metadata_buffer: write a metadata buffer to the journal.
+  *
+  * Writes a metadata buffer to a given disk block.  The actual IO is not
+  * performed but a new buffer_head is constructed which labels the data
+  * to be written with the correct destination disk block.
+  *
+  * Any magic-number escaping which needs to be done will cause a
+  * copy-out here.  If the buffer happens to start with the
+  * JFS_MAGIC_NUMBER, then we can't write it to the log directly: the
+  * magic number is only written to the log for descriptor blocks.  In
+  * this case, we copy the data and replace the first word with 0, and we
+  * return a result code which indicates that this buffer needs to be
+  * marked as an escaped buffer in the corresponding log descriptor
+  * block.  The missing word can then be restored when the block is read
+  * during recovery.
+  *
+  * If the source buffer has already been modified by a new transaction
+  * since we took the last commit snapshot, we use the frozen copy of
+  * that data for IO.  If we end up using the existing buffer_head's data
+  * for the write, then we *have* to lock the buffer to prevent anyone
+  * else from using and possibly modifying it while the IO is in
+  * progress.
+  *
+  * The function returns a pointer to the buffer_heads to be used for IO.
+  *
+  * We assume that the journal has already been locked in this function.
+  *
+  * Return value:
+  *  <0: Error
+  * >=0: Finished OK
+  *
+  * On success:
+  * Bit 0 set == escape performed on the data
+  * Bit 1 set == buffer copy-out performed (kfree the data after IO)
+  */
+ 
+ static inline unsigned long virt_to_offset(void *p) /* p masked by ~PAGE_MASK */
+ { unsigned long addr = (unsigned long) p; return addr & ~PAGE_MASK; }
+ 					       
+ int journal_write_metadata_buffer(transaction_t *transaction,
+ 				  struct journal_head  *jh_in,
+ 				  struct journal_head **jh_out,
+ 				  int blocknr)
+ {
+ 	int need_copy_out = 0;	/* set when the data has to be escaped */
+ 	int done_copy_out = 0;	/* IO uses b_frozen_data, not the bh data */
+ 	int do_escape = 0;	/* first word equals the journal magic */
+ 	char *mapped_data;
+ 	struct buffer_head *new_bh;
+ 	struct journal_head * new_jh;
+ 	struct page *new_page;
+ 	unsigned int new_offset;
+ 
+ 	/*
+ 	 * The buffer really shouldn't be locked: only the current committing
+ 	 * transaction is allowed to write it, so nobody else is allowed
+ 	 * to do any IO.
+ 	 *
+ 	 * akpm: except if we're journalling data, and write() output is
+ 	 * also part of a shared mapping, and another thread has
+ 	 * decided to launch a writepage() against this buffer.
+ 	 */
+ 	J_ASSERT_JH(jh_in, buffer_jdirty(jh2bh(jh_in)));
+ 
+ 	/*
+ 	 * If a new transaction has already done a buffer copy-out, then
+ 	 * we use that version of the data for the commit.
+ 	 */
+ 
+ 	if (jh_in->b_frozen_data) {
+ 		done_copy_out = 1;
+ 		new_page = virt_to_page(jh_in->b_frozen_data);
+ 		new_offset = virt_to_offset(jh_in->b_frozen_data);
+ 	} else {
+ 		new_page = jh2bh(jh_in)->b_page;
+ 		new_offset = virt_to_offset(jh2bh(jh_in)->b_data);
+ 	}
+ 
+ 	mapped_data = ((char *) kmap(new_page)) + new_offset;
+ 
+ 	/*
+ 	 * Check for escaping: does the block start with the journal magic?
+ 	 */
+ 	if (* ((unsigned int *) mapped_data) == htonl(JFS_MAGIC_NUMBER)) {
+ 		need_copy_out = 1;
+ 		do_escape = 1;
+ 	}
+ 
+ 	/*
+ 	 * Do we need to do a data copy?
+ 	 */
+ 
+ 	if (need_copy_out && !done_copy_out) {
+ 		char *tmp;
+ 		tmp = jbd_rep_kmalloc(jh2bh(jh_in)->b_size, GFP_NOFS);
+ 		/* NOTE(review): tmp is not NULL-checked; presumably retried - confirm */
+ 		jh_in->b_frozen_data = tmp;
+ 		memcpy (tmp, mapped_data, jh2bh(jh_in)->b_size);
+ 		
+ 		/* If we get to this path, we'll always need the new
+ 		   address kmapped so that we can clear the escaped
+ 		   magic number below. */
+ 		kunmap(new_page);
+ 		new_page = virt_to_page(tmp);
+ 		new_offset = virt_to_offset(tmp);
+ 		mapped_data = ((char *) kmap(new_page)) + new_offset;
+ 		
+ 		done_copy_out = 1;
+ 	}
+ 
+ 	/*
+ 	 * Right, time to make up the new buffer_head.
+ 	 */
+ 	do {
+ 		new_bh = get_unused_buffer_head(0);
+ 		if (!new_bh) {
+ 			printk (KERN_NOTICE __FUNCTION__
+ 				": ENOMEM at get_unused_buffer_head, "
+ 				"trying again.\n");
+ 			current->policy |= SCHED_YIELD;
+ 			schedule();
+ 		}
+ 	} while (!new_bh);	/* loop until a buffer_head is obtained */
+ 	/* keep subsequent assertions sane */
+ 	new_bh->b_prev_free = 0;
+ 	new_bh->b_next_free = 0;
+ 	new_bh->b_state = 0;
+ 	init_buffer(new_bh, NULL, NULL);
+ 	atomic_set(&new_bh->b_count, 1);
+ 	new_jh = journal_add_journal_head(new_bh);
+ 
+ 	set_bh_page(new_bh, new_page, new_offset);	/* alias the chosen data */
+ 
+ 	new_jh->b_transaction = NULL;
+ 	new_bh->b_size = jh2bh(jh_in)->b_size;
+ 	new_bh->b_dev = transaction->t_journal->j_dev;	/* the journal device */
+ 	new_bh->b_blocknr = blocknr;
+ 	new_bh->b_state |= (1 << BH_Mapped) | (1 << BH_Dirty);
+ 
+ 	*jh_out = new_jh;	/* hand the IO buffer back to the caller */
+ 
+ 	/*
+ 	 * Did we need to do an escaping?  Now we've done all the
+ 	 * copying, we can finally do so.
+ 	 */
+ 
+ 	if (do_escape)
+ 		* ((unsigned int *) mapped_data) = 0;
+ 	kunmap(new_page);
+ 	
+ 	/*
+ 	 * The to-be-written buffer needs to get moved to the io queue,
+ 	 * and the original buffer whose contents we are shadowing or
+ 	 * copying is moved to the transaction's shadow queue.
+ 	 */
+ 	JBUFFER_TRACE(jh_in, "file as BJ_Shadow");
+ 	journal_file_buffer(jh_in, transaction, BJ_Shadow);
+ 	JBUFFER_TRACE(new_jh, "file as BJ_IO");
+ 	journal_file_buffer(new_jh, transaction, BJ_IO);
+ 
+ 	return do_escape | (done_copy_out << 1); /* bit 0: escaped, bit 1: copied */
+ }
+ 
+ /*
+  * Allocation code for the journal file.  Manage the space left in the
+  * journal, so that we can begin checkpointing when appropriate.
+  */
+ 
+ /*
+  * log_space_left: Return a pessimistic count of the free journal blocks:
+  * j_free less MIN_LOG_RESERVED_BLOCKS, less one eighth of what remains
+  * (0 if nothing remains).  Called with the journal already locked.
+  */
+ 
+ int log_space_left (journal_t *journal)
+ {
+ 	int left = journal->j_free;
+ 
+ 	/* Be pessimistic here about the number of those free blocks
+ 	 * which might be required for log descriptor control blocks. */
+ 
+ #define MIN_LOG_RESERVED_BLOCKS 32 /* Allow for rounding errors */
+ 
+ 	left -= MIN_LOG_RESERVED_BLOCKS;
+ 
+ 	if (left <= 0)
+ 		return 0;
+ 	left -= (left >> 3);	/* hold back a further eighth */
+ 	return left;
+ }
+ 
+ /* Ask for a commit of `transaction' (NULL means the running transaction)
+  * and return the target tid.  This function must be non-allocating for
+  * PF_MEMALLOC tasks. */
+ tid_t log_start_commit (journal_t *journal, transaction_t *transaction)
+ {
+ 	tid_t target = journal->j_commit_request; /* returned if nothing runs */
+ 
+ 	lock_kernel(); /* Protect journal->j_running_transaction */
+ 	
+ 	/*
+ 	 * A NULL transaction asks us to commit the currently running
+ 	 * transaction, if there is one.
+ 	 */
+ 	if (transaction)
+ 		target = transaction->t_tid;
+ 	else {
+ 		transaction = journal->j_running_transaction;
+ 		if (!transaction)
+ 			goto out;
+ 		target = transaction->t_tid;
+ 	}
+ 		
+ 	/*
+ 	 * Are we already doing a recent enough commit?
+ 	 */
+ 	if (tid_geq(journal->j_commit_request, target))
+ 		goto out;
+ 
+ 	/*
+ 	 * We want a new commit: OK, mark the request and wake up the
+ 	 * commit thread.  We do _not_ do the commit ourselves.
+ 	 */
+ 
+ 	journal->j_commit_request = target;
+ 	jbd_debug(1, "JBD: requesting commit %d/%d\n",
+ 		  journal->j_commit_request,
+ 		  journal->j_commit_sequence);
+ 	wake_up(&journal->j_wait_commit);
+ 
+ out:
+ 	unlock_kernel();
+ 	return target;
+ }
+ 
+ /*
+  * Wait until j_commit_sequence has caught up with tid, i.e. the specified
+  * commit has completed.  The caller may not hold the journal lock.
+  */
+ void log_wait_commit (journal_t *journal, tid_t tid)
+ {
+ 	lock_kernel();
+ #ifdef CONFIG_JBD_DEBUG
+ 	lock_journal(journal);
+ 	if (!tid_geq(journal->j_commit_request, tid)) {	/* debug sanity check */
+ 		printk(KERN_EMERG __FUNCTION__
+ 			": error: j_commit_request=%d, tid=%d\n",
+ 			journal->j_commit_request, tid);
+ 	}
+ 	unlock_journal(journal);
+ #endif
+ 	while (tid_gt(tid, journal->j_commit_sequence)) {
+ 		jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n",
+ 				  tid, journal->j_commit_sequence);
+ 		wake_up(&journal->j_wait_commit);	/* prod the commit thread */
+ 		sleep_on(&journal->j_wait_done_commit);	/* then re-test above */
+ 	}
+ 	unlock_kernel();
+ }
+ 
+ /*
+  * Log buffer allocation routines:
+  */
+ 
+ unsigned long journal_next_log_block(journal_t *journal)
+ {
+ 	unsigned long blocknr;
+ 
+ 	J_ASSERT(journal->j_free > 1);
+ 
+ 	blocknr = journal->j_head;	/* hand out the block at the log head */
+ 	journal->j_head++;
+ 	journal->j_free--;
+ 	if (journal->j_head == journal->j_last)	/* wrap to the first block */
+ 		journal->j_head = journal->j_first;
+ 	return journal_bmap(journal, blocknr);	/* logical -> physical */
+ }
+ 
+ /*
+  * Conversion of logical to physical block numbers for the journal
+  *
+  * On external journals the journal blocks are identity-mapped, so
+  * this is a no-op.  If needed, we can use j_blk_offset - everything is
+  * ready.
+  */
+ unsigned long journal_bmap(journal_t *journal, unsigned long blocknr)
+ {
+ 	unsigned long ret;
+ 
+ 	if (journal->j_inode) {		/* journal is an on-disk inode */
+ 		ret = bmap(journal->j_inode, blocknr);
+ 		J_ASSERT(ret != 0);	/* a zero mapping is not tolerated */
+ 	} else {
+ 		ret = blocknr;	 /* +journal->j_blk_offset */
+ 	}
+ 	return ret;
+ }
+ 
+ /*
+  * We play buffer_head aliasing tricks to write data/metadata blocks to
+  * the journal without copying their contents, but for journal
+  * descriptor blocks we do need to generate bona fide buffers.
+  */
+ 
+ struct journal_head * journal_get_descriptor_buffer(journal_t *journal)
+ {
+ 	struct buffer_head *bh;
+ 	unsigned long blocknr = journal_next_log_block(journal); /* uses a block */
+ 
+ 	bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 	bh->b_state |= (1 << BH_Dirty);	/* dirty bit is set directly */
+ 	BUFFER_TRACE(bh, "return this buffer");
+ 	return journal_add_journal_head(bh);	/* caller gets the journal_head */
+ }
+ 
+ /*
+  * Management for journal control blocks: functions to create and
+  * destroy journal_t structures, and to initialise and read existing
+  * journal blocks from disk.  */
+ 
+ /* First: create and setup a journal_t object in memory.  We initialise
+  * very few fields yet: that has to wait until we have created the
+  * journal structures from scratch, or loaded them from disk. */
+ 
+ static journal_t * journal_init_common (void)
+ {
+ 	journal_t *journal;
+ 	int err;
+ 
+ 	MOD_INC_USE_COUNT;	/* dropped again at fail: below */
+ 
+ 	journal = jbd_kmalloc(sizeof(*journal), GFP_KERNEL);
+ 	if (!journal)
+ 		goto fail;
+ 	memset(journal, 0, sizeof(*journal));
+ 
+ 	init_waitqueue_head(&journal->j_wait_transaction_locked);
+ 	init_waitqueue_head(&journal->j_wait_logspace);
+ 	init_waitqueue_head(&journal->j_wait_done_commit);
+ 	init_waitqueue_head(&journal->j_wait_checkpoint);
+ 	init_waitqueue_head(&journal->j_wait_commit);
+ 	init_waitqueue_head(&journal->j_wait_updates);
+ 	init_MUTEX(&journal->j_barrier);
+ 	init_MUTEX(&journal->j_checkpoint_sem);
+ 	init_MUTEX(&journal->j_sem);
+ 
+ 	journal->j_commit_interval = (HZ * 5);	/* five seconds, in jiffies */
+ 
+ 	/* The journal is marked for error until we succeed with recovery! */
+ 	journal->j_flags = JFS_ABORT;
+ 
+ 	/* Set up a default-sized revoke table for the new mount. */
+ 	err = journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
+ 	if (err) {
+ 		kfree(journal);
+ 		goto fail;
+ 	}
+ 	return journal;
+ fail:
+ 	MOD_DEC_USE_COUNT;
+ 	return NULL;	/* allocation or revoke-table setup failed */
+ }
+ 
+ /* journal_init_dev and journal_init_inode:
+  *
+  * Create a journal structure assigned some fixed set of disk blocks to
+  * the journal.  We don't actually touch those disk blocks yet, but we
+  * need to set up all of the mapping information to tell the journaling
+  * system where the journal blocks are.
+  *
+  * journal_init_dev creates a journal which maps a fixed contiguous
+  * range of blocks on an arbitrary block device.
+  *
+  * journal_init_inode creates a journal which maps an on-disk inode as
+  * the journal.  The inode must exist already, must support bmap() and
+  * must have all data blocks preallocated.
+  */
+ 
+ journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
+ 			int start, int len, int blocksize)
+ {
+ 	journal_t *journal = journal_init_common();
+ 	struct buffer_head *bh;
+ 
+ 	if (!journal)
+ 		return NULL;	/* journal_init_common() failed */
+ 
+ 	journal->j_dev = dev;
+ 	journal->j_fs_dev = fs_dev;
+ 	journal->j_blk_offset = start;
+ 	journal->j_maxlen = len;
+ 	journal->j_blocksize = blocksize;
+ 
+ 	bh = getblk(journal->j_dev, start, journal->j_blocksize);
+ 	J_ASSERT(bh != NULL);
+ 	journal->j_sb_buffer = bh;	/* block `start' holds the superblock */
+ 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+ 
+ 	return journal;
+ }
+ 
+ journal_t * journal_init_inode (struct inode *inode)
+ {
+ 	struct buffer_head *bh;
+ 	journal_t *journal = journal_init_common();
+ 	int blocknr;
+ 
+ 	if (!journal)
+ 		return NULL;	/* journal_init_common() failed */
+ 
+ 	journal->j_dev = inode->i_dev;	/* journal and fs share one device */
+ 	journal->j_fs_dev = inode->i_dev;
+ 	journal->j_inode = inode;
+ 	jbd_debug(1,
+ 		  "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n",
+ 		  journal, bdevname(inode->i_dev), inode->i_ino, inode->i_size,
+ 		  inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize);
+ 
+ 	journal->j_maxlen = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ 	journal->j_blocksize = inode->i_sb->s_blocksize;
+ 
+ 	blocknr = journal_bmap(journal, 0);	/* first block of the inode */
+ 	bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 	J_ASSERT(bh != NULL);
+ 	journal->j_sb_buffer = bh;	/* ... which holds the superblock */
+ 	journal->j_superblock = (journal_superblock_t *)bh->b_data;
+ 
+ 	return journal;
+ }
+ 
+ /*
+  * Given a journal_t structure, initialise the various fields for
+  * startup of a new journaling session.  We use this both when creating
+  * a journal, and after recovering an old journal to reset it for
+  * subsequent use.  Writes the superblock, starts the thread, returns 0.
+  */
+ 
+ static int journal_reset (journal_t *journal)
+ {
+ 	journal_superblock_t *sb = journal->j_superblock;
+ 	unsigned int first, last;
+ 
+ 	first = ntohl(sb->s_first);
+ 	last = ntohl(sb->s_maxlen);
+ 
+ 	journal->j_first = first;
+ 	journal->j_last = last;
+ 
+ 	journal->j_head = first;	/* empty log: head == tail == first */
+ 	journal->j_tail = first;
+ 	journal->j_free = last - first;
+ 
+ 	journal->j_tail_sequence = journal->j_transaction_sequence;
+ 	journal->j_commit_sequence = journal->j_transaction_sequence - 1;
+ 	journal->j_commit_request = journal->j_commit_sequence;
+ 
+ 	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
+ 
+ 	/* Add the dynamic fields and write it to disk. */
+ 	journal_update_superblock(journal, 1);
+ 
+ 	lock_journal(journal);
+ 	journal_start_thread(journal);
+ 	unlock_journal(journal);
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Given a journal_t structure which tells us which disk blocks we can
+  * use, create a new journal superblock and initialise all of the
+  * journal fields from scratch.  -EINVAL if the journal is too short.  */
+ 
+ int journal_create (journal_t *journal)
+ {
+ 	int blocknr;
+ 	struct buffer_head *bh;
+ 	journal_superblock_t *sb;
+ 	int i;
+ 
+ 	if (journal->j_maxlen < JFS_MIN_JOURNAL_BLOCKS) {
+ 		printk (KERN_ERR "Journal length (%d blocks) too short.\n",
+ 			journal->j_maxlen);
+ 		return -EINVAL;
+ 	}
+ 
+ 	if (journal->j_inode == NULL) {
+ 		/*
+ 		 * We don't know what block to start at!
+ 		 */
+ 		printk(KERN_EMERG __FUNCTION__
+ 			": creation of journal on external device!\n");
+ 		BUG();
+ 	}
+ 
+ 	/* Zero out the entire journal on disk.  We cannot afford to
+ 	   have any blocks on disk beginning with JFS_MAGIC_NUMBER. */
+ 	jbd_debug(1, "JBD: Zeroing out journal blocks...\n");
+ 	for (i = 0; i < journal->j_maxlen; i++) {
+ 		blocknr = journal_bmap(journal, i);
+ 		bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 		wait_on_buffer(bh);
+ 		memset (bh->b_data, 0, journal->j_blocksize);
+ 		BUFFER_TRACE(bh, "marking dirty");
+ 		mark_buffer_dirty(bh);
+ 		BUFFER_TRACE(bh, "marking uptodate");
+ 		mark_buffer_uptodate(bh, 1);
+ 		__brelse(bh);
+ 	}
+ 	sync_dev(journal->j_dev);	/* push the zeroed blocks out */
+ 	jbd_debug(1, "JBD: journal cleared.\n");
+ 
+ 	/* OK, fill in the initial static fields in the new superblock */
+ 	sb = journal->j_superblock;
+ 
+ 	sb->s_header.h_magic	 = htonl(JFS_MAGIC_NUMBER);
+ 	sb->s_header.h_blocktype = htonl(JFS_SUPERBLOCK_V2);
+ 
+ 	sb->s_blocksize	= htonl(journal->j_blocksize);
+ 	sb->s_maxlen	= htonl(journal->j_maxlen);
+ 	sb->s_first	= htonl(1);	/* block 0 holds the superblock */
+ 
+ 	journal->j_transaction_sequence = 1;
+ 
+ 	journal->j_flags &= ~JFS_ABORT;	/* was set by journal_init_common() */
+ 	journal->j_format_version = 2;
+ 
+ 	return journal_reset(journal);
+ }
+ 
+ /*
+  * Update a journal's dynamic superblock fields (s_sequence, s_start,
+  * s_errno) and write it to disk, optionally waiting for the IO.
+  */
+ 
+ void journal_update_superblock(journal_t *journal, int wait)
+ {
+ 	journal_superblock_t *sb = journal->j_superblock;
+ 	struct buffer_head *bh = journal->j_sb_buffer;
+ 
+ 	jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n",
+ 		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+ 
+ 	sb->s_sequence = htonl(journal->j_tail_sequence);
+ 	sb->s_start    = htonl(journal->j_tail);
+ 	sb->s_errno    = htonl(journal->j_errno);
+ 
+ 	BUFFER_TRACE(bh, "marking dirty");
+ 	mark_buffer_dirty(bh);
+ 	ll_rw_block(WRITE, 1, &bh);	/* submit the superblock write */
+ 	if (wait)
+ 		wait_on_buffer(bh);
+ 
+ 	/* If we have just flushed the log (by marking s_start==0), then
+ 	 * any future commit will have to be careful to update the
+ 	 * superblock again to re-record the true start of the log. */
+ 
+ 	if (sb->s_start)
+ 		journal->j_flags &= ~JFS_FLUSHED;
+ 	else
+ 		journal->j_flags |= JFS_FLUSHED;
+ }
+ 
+ 
+ /*
+  * Read the superblock for a given journal, performing initial
+  * validation of the format.  Returns 0, -EIO or -EINVAL.
+  */
+ 
+ static int journal_get_superblock(journal_t *journal)
+ {
+ 	struct buffer_head *bh;
+ 	journal_superblock_t *sb;
+ 
+ 	bh = journal->j_sb_buffer;
+ 
+ 	J_ASSERT(bh != NULL);
+ 	if (!buffer_uptodate(bh)) {	/* not read in yet */
+ 		ll_rw_block(READ, 1, &bh);
+ 		wait_on_buffer(bh);
+ 		if (!buffer_uptodate(bh)) {
+ 			printk (KERN_ERR
+ 				"JBD: IO error reading journal superblock\n");
+ 			return -EIO;
+ 		}
+ 	}
+ 
+ 	sb = journal->j_superblock;
+ 
+ 	if (sb->s_header.h_magic != htonl(JFS_MAGIC_NUMBER) ||
+ 	    sb->s_blocksize != htonl(journal->j_blocksize)) {
+ 		printk(KERN_WARNING "JBD: no valid journal superblock found\n");
+ 		return -EINVAL;
+ 	}
+ 
+ 	switch(ntohl(sb->s_header.h_blocktype)) {
+ 	case JFS_SUPERBLOCK_V1:
+ 		journal->j_format_version = 1;
+ 		break;
+ 	case JFS_SUPERBLOCK_V2:
+ 		journal->j_format_version = 2;
+ 		break;
+ 	default:
+ 		printk(KERN_WARNING "JBD: unrecognised superblock format ID\n");
+ 		return -EINVAL;
+ 	}
+ 
+ 	if (ntohl(sb->s_maxlen) < journal->j_maxlen)	/* shrink to sb's length */
+ 		journal->j_maxlen = ntohl(sb->s_maxlen);
+ 	else if (ntohl(sb->s_maxlen) > journal->j_maxlen) {
+ 		printk (KERN_WARNING "JBD: journal file too short\n");
+ 		return -EINVAL;
+ 	}
+ 
+ 	return 0;
+ }
+ 
+ /*
+  * Load the on-disk journal superblock and read the key fields into the
+  * journal_t.  Returns journal_get_superblock()'s error, or 0.
+  */
+ 
+ static int load_superblock(journal_t *journal)
+ {
+ 	int err;
+ 	journal_superblock_t *sb;
+ 
+ 	err = journal_get_superblock(journal);
+ 	if (err)
+ 		return err;
+ 
+ 	sb = journal->j_superblock;
+ 
+ 	journal->j_tail_sequence = ntohl(sb->s_sequence);
+ 	journal->j_tail = ntohl(sb->s_start);
+ 	journal->j_first = ntohl(sb->s_first);
+ 	journal->j_last = ntohl(sb->s_maxlen);
+ 	journal->j_errno = ntohl(sb->s_errno);
+ 
+ 	return 0;
+ }
+ 
+ 
+ /*
+  * Given a journal_t structure which tells us which disk blocks contain
+  * a journal, read the journal from disk to initialise the in-memory
+  * structures.  Returns 0 on success, or a negative errno.
+  */
+ 
+ int journal_load(journal_t *journal)
+ {
+ 	int err;
+ 
+ 	err = load_superblock(journal);
+ 	if (err)
+ 		return err;
+ 
+ 	/* If this is a V2 superblock, then we have to check the
+ 	 * features flags on it. */
+ 
+ 	if (journal->j_format_version >= 2) {
+ 		journal_superblock_t *sb = journal->j_superblock;
+ 
+ 		if ((sb->s_feature_ro_compat &
+ 		     ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) ||
+ 		    (sb->s_feature_incompat &
+ 		     ~cpu_to_be32(JFS_KNOWN_INCOMPAT_FEATURES))) {
+ 			printk (KERN_WARNING
+ 				"JBD: Unrecognised features on journal\n");
+ 			return -EINVAL;
+ 		}
+ 	}
+ 
+ 	/* Let the recovery code check whether it needs to recover any
+ 	 * data from the journal. */
+ 	if (journal_recover(journal))
+ 		goto recovery_error;
+ 
+ 	/* OK, we've finished with the dynamic journal bits:
+ 	 * reinitialise the dynamic contents of the superblock in memory
+ 	 * and reset them on disk. */
+ 	if (journal_reset(journal))
+ 		goto recovery_error;
+ 
+ 	journal->j_flags &= ~JFS_ABORT;	/* was set by journal_init_common() */
+ 	journal->j_flags |= JFS_LOADED;
+ 	return 0;
+ 
+ recovery_error:
+ 	printk (KERN_WARNING "JBD: recovery failed\n");
+ 	return -EIO;	/* the specific recovery/reset error is not passed on */
+ }
+ 
+ /*
+  * Release a journal_t structure once it is no longer in use by the
+  * journaled object.  The journal pointer is freed before returning.
+  */
+ 
+ void journal_destroy (journal_t *journal)
+ {
+ 	/* Wait for the commit thread to wake up and die. */
+ 	journal_kill_thread(journal);
+ 
+ 	/* Force a final log commit */
+ 	if (journal->j_running_transaction)
+ 		journal_commit_transaction(journal);
+ 
+ 	/* Force any old transactions to disk */
+ 	lock_journal(journal);
+ 	while (journal->j_checkpoint_transactions != NULL)
+ 		log_do_checkpoint(journal, 1);
+ 
+ 	J_ASSERT(journal->j_running_transaction == NULL);
+ 	J_ASSERT(journal->j_committing_transaction == NULL);
+ 	J_ASSERT(journal->j_checkpoint_transactions == NULL);
+ 
+ 	/* We can now mark the journal as empty. */
+ 	journal->j_tail = 0;	/* written out as s_start == 0 */
+ 	journal->j_tail_sequence = ++journal->j_transaction_sequence;
+ 	journal_update_superblock(journal, 1);
+ 
+ 	if (journal->j_inode)
+ 		iput(journal->j_inode);
+ 	if (journal->j_revoke)
+ 		journal_destroy_revoke(journal);
+ 
+ 	unlock_journal(journal);
+ 	brelse(journal->j_sb_buffer);	/* buffer taken in journal_init_*() */
+ 	kfree(journal);
+ 	MOD_DEC_USE_COUNT;	/* pairs with journal_init_common() */
+ }
+ 
+ 
+ /* Published API: Check whether the journal uses all of a given set of
+  * features.  Return true (non-zero) if it does. */
+ 
+ int journal_check_used_features (journal_t *journal, unsigned long compat,
+ 				 unsigned long ro, unsigned long incompat)
+ {
+ 	journal_superblock_t *sb;
+ 
+ 	if (!compat && !ro && !incompat)	/* nothing was asked for */
+ 		return 1;
+ 	if (journal->j_format_version == 1)	/* V1: no feature fields */
+ 		return 0;
+ 
+ 	sb = journal->j_superblock;
+ 
+ 	if (((be32_to_cpu(sb->s_feature_compat) & compat) == compat) &&
+ 	    ((be32_to_cpu(sb->s_feature_ro_compat) & ro) == ro) &&
+ 	    ((be32_to_cpu(sb->s_feature_incompat) & incompat) == incompat))
+ 		return 1;	/* every requested bit is set on the sb */
+ 
+ 	return 0;
+ }
+ 
+ /* Published API: Check whether the journaling code supports the use of
+  * all of a given set of features on this journal.  Return true
+  * (non-zero) if it can. */
+ 
+ int journal_check_available_features (journal_t *journal, unsigned long compat,
+ 				      unsigned long ro, unsigned long incompat)
+ {
+ 	journal_superblock_t *sb;
+ 
+ 	if (!compat && !ro && !incompat)	/* nothing was asked for */
+ 		return 1;
+ 
+ 	sb = journal->j_superblock;	/* NOTE(review): sb is not read below */
+ 
+ 	/* We can support any known requested features iff the
+ 	 * superblock is in version 2.  Otherwise we fail to support any
+ 	 * extended sb features. */
+ 
+ 	if (journal->j_format_version != 2)
+ 		return 0;
+ 
+ 	if ((compat   & JFS_KNOWN_COMPAT_FEATURES) == compat &&
+ 	    (ro       & JFS_KNOWN_ROCOMPAT_FEATURES) == ro &&
+ 	    (incompat & JFS_KNOWN_INCOMPAT_FEATURES) == incompat)
+ 		return 1;	/* no bit outside the known masks */
+ 
+ 	return 0;
+ }
+ 
+ /* Published API: Mark a given journal feature as present on the
+  * superblock.  Returns true if the requested features could be set. */
+ 
+ int journal_set_features (journal_t *journal, unsigned long compat,
+ 			  unsigned long ro, unsigned long incompat)
+ {
+ 	journal_superblock_t *sb;
+ 
+ 	if (journal_check_used_features(journal, compat, ro, incompat))
+ 		return 1;	/* already set: nothing to do */
+ 
+ 	if (!journal_check_available_features(journal, compat, ro, incompat))
+ 		return 0;	/* unsupported: superblock left untouched */
+ 
+ 	jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
+ 		  compat, ro, incompat);
+ 
+ 	sb = journal->j_superblock;	/* in-memory copy; no write issued here */
+ 
+ 	sb->s_feature_compat    |= cpu_to_be32(compat);
+ 	sb->s_feature_ro_compat |= cpu_to_be32(ro);
+ 	sb->s_feature_incompat  |= cpu_to_be32(incompat);
+ 
+ 	return 1;
+ }
+ 
+ 
+ /*
+  * Published API:
+  * Given an initialised but unloaded journal struct, poke about in the
+  * on-disk structure to update it to the most recent supported version.
+  */
+ 
+ int journal_update_format (journal_t *journal)
+ {
+ 	journal_superblock_t *sb;
+ 	int err;
+ 
+ 	err = journal_get_superblock(journal);
+ 	if (err)
+ 		return err;
+ 
+ 	sb = journal->j_superblock;
+ 
+ 	switch (ntohl(sb->s_header.h_blocktype)) {
+ 	case JFS_SUPERBLOCK_V2:
+ 		return 0;	/* already the newest format */
+ 	case JFS_SUPERBLOCK_V1:
+ 		return journal_convert_superblock_v1(journal, sb);
+ 	default:
+ 		break;
+ 	}
+ 	return -EINVAL;	/* neither V1 nor V2 */
+ }
+ 
+ static int journal_convert_superblock_v1(journal_t *journal,
+ 					 journal_superblock_t *sb)
+ {
+ 	int offset, blocksize;
+ 	struct buffer_head *bh;
+ 
+ 	printk(KERN_WARNING
+ 		"JBD: Converting superblock from version 1 to 2.\n");
+ 
+ 	/* Pre-initialise new fields to zero */
+ 	offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
+ 	blocksize = ntohl(sb->s_blocksize);
+ 	memset(&sb->s_feature_compat, 0, blocksize-offset); /* to end of block */
+ 
+ 	sb->s_nr_users = cpu_to_be32(1);
+ 	sb->s_header.h_blocktype = cpu_to_be32(JFS_SUPERBLOCK_V2);
+ 	journal->j_format_version = 2;
+ 
+ 	bh = journal->j_sb_buffer;
+ 	BUFFER_TRACE(bh, "marking dirty");
+ 	mark_buffer_dirty(bh);
+ 	ll_rw_block(WRITE, 1, &bh);	/* write the converted superblock */
+ 	wait_on_buffer(bh);
+ 	return 0;	/* NOTE(review): the IO result is not checked */
+ }
+ 
+ 
+ /*
+  * Flush all data for a given journal to disk and empty the journal.
+  * Filesystems can use this when remounting readonly to ensure that
+  * recovery does not need to happen on remount.
+  */
+ 
+ int journal_flush (journal_t *journal)
+ {
+ 	int err = 0;	/* last log_do_checkpoint() result; returned */
+ 	transaction_t *transaction = NULL;
+ 	unsigned long old_tail;
+ 
+ 	lock_kernel();
+ 	
+ 	/* Force everything buffered to the log... */
+ 	if (journal->j_running_transaction) {
+ 		transaction = journal->j_running_transaction;
+ 		log_start_commit(journal, transaction);
+ 	} else if (journal->j_committing_transaction)
+ 		transaction = journal->j_committing_transaction;
+ 
+ 	/* Wait for the log commit to complete... */
+ 	if (transaction)
+ 		log_wait_commit(journal, transaction->t_tid);
+ 
+ 	/* ...and flush everything in the log out to disk. */
+ 	lock_journal(journal);
+ 	while (!err && journal->j_checkpoint_transactions != NULL)
+ 		err = log_do_checkpoint(journal, journal->j_maxlen);
+ 	cleanup_journal_tail(journal);
+ 
+ 	/* Finally, mark the journal as really needing no recovery.
+ 	 * This sets s_start==0 in the underlying superblock, which is
+ 	 * the magic code for a fully-recovered superblock.  Any future
+ 	 * commits of data to the journal will restore the current
+ 	 * s_start value. */
+ 	old_tail = journal->j_tail;
+ 	journal->j_tail = 0;
+ 	journal_update_superblock(journal, 1);
+ 	journal->j_tail = old_tail;	/* only the on-disk copy stays 0 */
+ 
+ 	unlock_journal(journal);
+ 
+ 	J_ASSERT(!journal->j_running_transaction);
+ 	J_ASSERT(!journal->j_committing_transaction);
+ 	J_ASSERT(!journal->j_checkpoint_transactions);
+ 	J_ASSERT(journal->j_head == journal->j_tail);
+ 	J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
+ 
+ 	unlock_kernel();
+ 	
+ 	return err;
+ }
+ 
+ /*
+  * Wipe out all of the contents of a journal, safely.  This will produce
+  * a warning if the journal contains any valid recovery information.
+  * Must be called between journal_init_*() and journal_load().
+  *
+  * If (write) is non-zero, then we wipe out the journal on disk; otherwise
+  * we merely suppress recovery.
+  */
+ 
+ int journal_wipe (journal_t *journal, int write)
+ {
+ 	journal_superblock_t *sb;
+ 	int err = 0;
+ 
+ 	J_ASSERT (!(journal->j_flags & JFS_LOADED));
+ 
+ 	err = load_superblock(journal);
+ 	if (err)
+ 		return err;
+ 
+ 	sb = journal->j_superblock;	/* NOTE(review): sb is not read below */
+ 
+ 	if (!journal->j_tail)	/* s_start == 0: nothing to recover */
+ 		goto no_recovery;
+ 
+ 	printk (KERN_WARNING "JBD: %s recovery information on journal\n",
+ 		write ? "Clearing" : "Ignoring");
+ 
+ 	err = journal_skip_recovery(journal);
+ 	if (write)	/* superblock is rewritten even when err is set */
+ 		journal_update_superblock(journal, 1);
+ 
+  no_recovery:
+ 	return err;
+ }
+ 
+ /*
+  * journal_dev_name: format a character string to describe on what
+  * device this journal is present.  Returns whatever bdevname() returns.
+  */
+ 
+ const char * journal_dev_name(journal_t *journal)
+ {
+ 	kdev_t dev;
+ 
+ 	if (journal->j_inode)	/* prefer the journal inode's device */
+ 		dev = journal->j_inode->i_dev;
+ 	else
+ 		dev = journal->j_dev;
+ 
+ 	return bdevname(dev);
+ }
+ 
+ /*
+  * journal_abort: perform a complete, immediate shutdown of the ENTIRE
+  * journal (not of a single transaction).  This operation cannot be
+  * undone without closing and reopening the journal.
+  *
+  * The journal_abort function is intended to support higher level error
+  * recovery mechanisms such as the ext2/ext3 remount-readonly error
+  * mode.
+  *
+  * Journal abort has very specific semantics.  Any existing dirty,
+  * unjournaled buffers in the main filesystem will still be written to
+  * disk by bdflush, but the journaling mechanism will be suspended
+  * immediately and no further transaction commits will be honoured.
+  *
+  * Any dirty, journaled buffers will be written back to disk without
+  * hitting the journal.  Atomicity cannot be guaranteed on an aborted
+  * filesystem, but we _do_ attempt to leave as much data as possible
+  * behind for fsck to use for cleanup.
+  *
+  * Any attempt to get a new transaction handle on a journal which is in
+  * ABORT state will just result in an -EROFS error return.  A
+  * journal_stop on an existing handle will return -EIO if we have
+  * entered abort state during the update.
+  *
+  * Recursive transactions are not disturbed by journal abort until the
+  * final journal_stop, which will receive the -EIO error.
+  *
+  * Finally, the journal_abort call allows the caller to supply an errno
+  * which will be recorded (if possible) in the journal superblock.  This
+  * allows a client to record failure conditions in the middle of a
+  * transaction without having to complete the transaction to record the
+  * failure to disk.  ext3_error, for example, now uses this
+  * functionality.
+  *
+  * Errors which originate from within the journaling layer will NOT
+  * supply an errno; a null errno implies that absolutely no further
+  * writes are done to the journal (unless there are any already in
+  * progress).
+  */
+ 
+ /* Quick version for internal journal use (doesn't lock the journal) */
+ void __journal_abort (journal_t *journal)
+ {
+ 	transaction_t *transaction;
+ 
+ 	printk (KERN_ERR "Aborting journal on device %s.\n",
+ 		journal_dev_name(journal));
+ 
+ 	journal->j_flags |= JFS_ABORT;
+ 	transaction = journal->j_running_transaction;
+ 	if (transaction)	/* request a commit of the running transaction */
+ 		log_start_commit(journal, transaction);
+ }
+ 
+ /* Full version for external use: takes the journal lock, records errno */
+ void journal_abort (journal_t *journal, int errno)
+ {
+ 	lock_journal(journal);
+ 
+ 	if (journal->j_flags & JFS_ABORT)	/* already aborted */
+ 		goto out;
+ 
+ 	if (!journal->j_errno)	/* an earlier recorded error is kept */
+ 		journal->j_errno = errno;
+ 
+ 	__journal_abort(journal);
+ 
+ 	if (errno)	/* record the error in the on-disk superblock */
+ 		journal_update_superblock(journal, 1);
+ 
+  out:
+ 	unlock_journal(journal);
+ }
+ 
+ int journal_errno (journal_t *journal)
+ {
+ 	int err;
+ 
+ 	lock_journal(journal);
+ 	if (journal->j_flags & JFS_ABORT)	/* aborted: always -EROFS */
+ 		err = -EROFS;
+ 	else
+ 		err = journal->j_errno;	/* otherwise the recorded error */
+ 	unlock_journal(journal);
+ 	return err;
+ }
+ 
+ int journal_clear_err (journal_t *journal)
+ {
+ 	int err = 0;
+ 
+ 	lock_journal(journal);
+ 	if (journal->j_flags & JFS_ABORT)	/* aborted: j_errno is kept */
+ 		err = -EROFS;
+ 	else
+ 		journal->j_errno = 0;	/* forget the recorded error */
+ 	unlock_journal(journal);
+ 	return err;
+ }
+ 
+ void journal_ack_err (journal_t *journal)
+ {
+ 	lock_journal(journal);
+ 	if (journal->j_errno)	/* only flag when an error is recorded */
+ 		journal->j_flags |= JFS_ACK_ERR;
+ 	unlock_journal(journal);
+ }
+ 
+ int journal_blocks_per_page(struct inode *inode)
+ {	/* 2^(PAGE_CACHE_SHIFT - s_blocksize_bits) for the inode's superblock */
+ 	return 1 << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
+ }
+ 
+ /*
+  * shrink_journal_memory().
+  * Called when we're under memory pressure.  Free up all the written-back
+  * checkpointed metadata buffers.
+  */
+ void shrink_journal_memory(void)
+ {
+ 	struct list_head *list;
+ 
+ 	lock_kernel();
+ 	list_for_each(list, &all_journals) {	/* visit every journal */
+ 		journal_t *journal =
+ 			list_entry(list, journal_t, j_all_journals);
+ 		spin_lock(&journal_datalist_lock);
+ 		__journal_clean_checkpoint_list(journal);
+ 		spin_unlock(&journal_datalist_lock);
+ 	}
+ 	unlock_kernel();
+ }
+ 
+ /*
+  * Simple support for retrying memory allocations.  Introduced to help to
+  * debug different VM deadlock avoidance strategies.
+  */
+ /*
+  * `where' names the caller in the messages.  With `retry' zero a failed
+  * kmalloc() returns NULL; otherwise we yield and loop until it succeeds.
+  */
+ void * __jbd_kmalloc (char *where, size_t size, int flags, int retry)
+ {
+ 	void *p;
+ 	static unsigned long last_warning;	/* jiffies of the last printk */
+ 	
+ 	while (1) {
+ 		p = kmalloc(size, flags);
+ 		if (p)
+ 			return p;
+ 		if (!retry)
+ 			return NULL;
+ 		/* Log every retry for debugging.  Also log them to the
+ 		 * syslog, but do rate-limiting on the non-debugging
+ 		 * messages. */
+ 		jbd_debug(1, "ENOMEM in %s, retrying.\n", where);
+ 
+ 		if (time_after(jiffies, last_warning + 5*HZ)) {
+ 			printk(KERN_NOTICE
+ 			       "ENOMEM in %s, retrying.\n", where);
+ 			last_warning = jiffies;
+ 		}
+ 		
+ 		current->policy |= SCHED_YIELD;	/* give up the CPU, then retry */
+ 		schedule();
+ 	}
+ }
+ 
+ /*
+  * Journal_head storage management
+  */
+ static kmem_cache_t *journal_head_cache;	/* backs every journal_head */
+ #ifdef CONFIG_JBD_DEBUG
+ static atomic_t nr_journal_heads = ATOMIC_INIT(0);
+ #endif
+ 
+ static int journal_init_journal_head_cache(void)
+ {
+ 	int retval;
+ 
+ 	J_ASSERT(journal_head_cache == 0);	/* must not exist yet */
+ 	journal_head_cache = kmem_cache_create("journal_head",
+ 				sizeof(struct journal_head),
+ 				0,		/* offset */
+ 				0,		/* flags */
+ 				NULL,		/* ctor */
+ 				NULL);		/* dtor */
+ 	retval = 0;
+ 	if (journal_head_cache == 0) {
+ 		retval = -ENOMEM;
+ 		printk(KERN_EMERG "JBD: no memory for journal_head cache\n");
+ 	}
+ 	return retval;	/* 0, or -ENOMEM if the cache was not created */
+ }
+ 
+ static void journal_destroy_journal_head_cache(void)
+ {
+ 	J_ASSERT(journal_head_cache != NULL);
+ 	kmem_cache_destroy(journal_head_cache);
+ 	journal_head_cache = 0;	/* lets the init-time assertion pass again */
+ }
+ 
+ /*
+  * journal_head splicing and dicing
+  */
+ static struct journal_head *journal_alloc_journal_head(void)
+ {
+ 	struct journal_head *ret;
+ 	static unsigned long last_warning;	/* jiffies of the last printk */
+ 
+ #ifdef CONFIG_JBD_DEBUG
+ 	atomic_inc(&nr_journal_heads);
+ #endif
+ 	ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+ 	if (ret == 0) {
+ 		jbd_debug(1, "out of memory for journal_head\n");
+ 		if (time_after(jiffies, last_warning + 5*HZ)) {
+ 			printk(KERN_NOTICE "ENOMEM in " __FUNCTION__
+ 			       ", retrying.\n");
+ 			last_warning = jiffies;
+ 		}
+ 		while (ret == 0) {	/* yield and retry until it succeeds */
+ 			current->policy |= SCHED_YIELD;
+ 			schedule();
+ 			ret = kmem_cache_alloc(journal_head_cache, GFP_NOFS);
+ 		}
+ 	}
+ 	return ret;	/* non-NULL; contents are not initialised here */
+ }
+ 
+ static void journal_free_journal_head(struct journal_head *jh)
+ {
+ #ifdef CONFIG_JBD_DEBUG
+ 	atomic_dec(&nr_journal_heads);
+ 	memset(jh, 0x5b, sizeof(*jh));	/* debug builds: overwrite with 0x5b */
+ #endif
+ 	kmem_cache_free(journal_head_cache, jh);
+ }
+ 
+ /*
+  * A journal_head is attached to a buffer_head whenever JBD has an
+  * interest in the buffer.
+  *
+  * Whenever a buffer has an attached journal_head, its ->b_state:BH_JBD bit
+  * is set.  This bit is tested in core kernel code where we need to take
+  * JBD-specific actions.  Testing the zeroness of ->b_private is not reliable
+  * there.
+  *
+  * When a buffer has its BH_JBD bit set, its ->b_count is elevated by one.
+  *
+  * When a buffer has its BH_JBD bit set it is immune from being released by
+  * core kernel code, mainly via ->b_count.
+  *
+  * A journal_head may be detached from its buffer_head when the journal_head's
+  * b_transaction, b_cp_transaction and b_next_transaction pointers are NULL.
+  * Various places in JBD call journal_remove_journal_head() to indicate that the
+  * journal_head can be dropped if needed.
+  *
+  * Various places in the kernel want to attach a journal_head to a buffer_head
+  * _before_ attaching the journal_head to a transaction.  To protect the
+  * journal_head in this situation, journal_add_journal_head elevates the
+  * journal_head's b_jcount refcount by one.  The caller must call
+  * journal_unlock_journal_head() to undo this.
+  *
+  * So the typical usage would be:
+  *
+  *	(Attach a journal_head if needed.  Increments b_jcount)
+  *	struct journal_head *jh = journal_add_journal_head(bh);
+  *	...
+  *	jh->b_transaction = xxx;
+  *	journal_unlock_journal_head(jh);
+  *
+  * Now, the journal_head's b_jcount is zero, but it is safe from being released
+  * because it has a non-zero b_transaction.
+  */
+ 
+ /*
+  * Give a buffer_head a journal_head.
+  *
+  * Doesn't need the journal lock.
+  * May sleep.
+  * Cannot be called with journal_datalist_lock held.
+  */
+ struct journal_head *journal_add_journal_head(struct buffer_head *bh)
+ {
+ 	struct journal_head *jh;
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	if (buffer_jbd(bh)) {
+ 		jh = bh2jh(bh);	/* already attached: reuse it */
+ 	} else {
+ 		J_ASSERT_BH(bh,
+ 			(atomic_read(&bh->b_count) > 0) ||
+ 			(bh->b_page && bh->b_page->mapping));
+ 		spin_unlock(&journal_datalist_lock);	/* allocation may sleep */
+ 		jh = journal_alloc_journal_head();
+ 		memset(jh, 0, sizeof(*jh));
+ 		spin_lock(&journal_datalist_lock);
+ 
+ 		if (buffer_jbd(bh)) {	/* recheck: the lock was dropped above */
+ 			/* Someone did it for us! */
+ 			J_ASSERT_BH(bh, bh->b_private != NULL);
+ 			journal_free_journal_head(jh);
+ 			jh = bh->b_private;
+ 		} else {
+ 			/*
+ 			 * We actually don't need jh_splice_lock when
+ 			 * adding a journal_head - only on removal.
+ 			 */
+ 			spin_lock(&jh_splice_lock);
+ 			set_bit(BH_JBD, &bh->b_state);
+ 			bh->b_private = jh;
+ 			jh->b_bh = bh;
+ 			atomic_inc(&bh->b_count);	/* ref held while BH_JBD is set */
+ 			spin_unlock(&jh_splice_lock);
+ 			BUFFER_TRACE(bh, "added journal_head");
+ 		}
+ 	}
+ 	jh->b_jcount++;	/* caller undoes via journal_unlock_journal_head() */
+ 	spin_unlock(&journal_datalist_lock);
+ 	return bh->b_private;
+ }
+ 
+ /*
+  * journal_remove_journal_head(): if the buffer isn't attached to a transaction
+  * and has a zero b_jcount then remove and release its journal_head.   If we did
+  * see that the buffer is not used by any transaction we also "logically"
+  * decrement ->b_count.
+  *
+  * We in fact take an additional increment on ->b_count as a convenience,
+  * because the caller usually wants to do additional things with the bh
+  * after calling here.
+  * The caller of journal_remove_journal_head() *must* run __brelse(bh) at some
+  * time.  Once the caller has run __brelse(), the buffer is eligible for
+  * reaping by try_to_free_buffers().
+  *
+  * Requires journal_datalist_lock.
+  */
+ void __journal_remove_journal_head(struct buffer_head *bh)
+ {
+ 	struct journal_head *jh = bh2jh(bh);
+ 
+ 	assert_spin_locked(&journal_datalist_lock);
+ 	J_ASSERT_JH(jh, jh->b_jcount >= 0);
+ 	atomic_inc(&bh->b_count);	/* convenience ref; caller must __brelse() */
+ 	if (jh->b_jcount == 0) {
+ 		if (jh->b_transaction == NULL &&
+ 				jh->b_next_transaction == NULL &&
+ 				jh->b_cp_transaction == NULL) {
+ 			J_ASSERT_BH(bh, buffer_jbd(bh));
+ 			J_ASSERT_BH(bh, jh2bh(jh) == bh);
+ 			BUFFER_TRACE(bh, "remove journal_head");
+ 			spin_lock(&jh_splice_lock);
+ 			bh->b_private = NULL;
+ 			jh->b_bh = NULL;	/* debug, really */
+ 			clear_bit(BH_JBD, &bh->b_state);
+ 			__brelse(bh);	/* drops the ref taken when jh was attached */
+ 			spin_unlock(&jh_splice_lock);
+ 			journal_free_journal_head(jh);
+ 		} else {
+ 			BUFFER_TRACE(bh, "journal_head was locked");
+ 		}
+ 	}
+ }
+ 
+ void journal_unlock_journal_head(struct journal_head *jh)
+ {
+ 	spin_lock(&journal_datalist_lock);
+ 	J_ASSERT_JH(jh, jh->b_jcount > 0);
+ 	--jh->b_jcount;
+ 	if (!jh->b_jcount && !jh->b_transaction) {
+ 		struct buffer_head *bh;
+ 		bh = jh2bh(jh);
+ 		__journal_remove_journal_head(bh);
+ 		__brelse(bh);	/* balance the extra ref taken by the call above */
+ 	}
+ 	/* jh may have been freed by now; it is not touched past this point */
+ 	spin_unlock(&journal_datalist_lock);
+ }
+ 
+ void journal_remove_journal_head(struct buffer_head *bh)
+ {
+ 	spin_lock(&journal_datalist_lock);	/* required by the __ variant */
+ 	__journal_remove_journal_head(bh);	/* caller must __brelse(bh) later */
+ 	spin_unlock(&journal_datalist_lock);
+ }
+ 
+ /*
+  * Module startup and shutdown
+  */
+ 
+ static int __init journal_init_caches(void)
+ {
+ 	int err = journal_init_revoke_caches();
+ 
+ 	/* Only create the journal_head cache once the revoke caches exist. */
+ 	if (err != 0)
+ 		return err;
+ 	return journal_init_journal_head_cache();
+ }
+ 
+ static void journal_destroy_caches(void)
+ {
+ 	journal_destroy_revoke_caches();	/* revoke record + table caches */
+ 	journal_destroy_journal_head_cache();	/* journal_head slab cache */
+ }
+ 
+ static int __init journal_init(void)
+ {
+ 	int ret;	/* 0 on success, otherwise the cache-setup error */
+ 
+ 	printk(KERN_INFO "Journalled Block Device driver loaded\n");
+ 	ret = journal_init_caches();
+ 	if (ret != 0)	/* setup failed: tear the caches down again */
+ 		journal_destroy_caches();
+ 	return ret;
+ }
+ 
+ static void __exit journal_exit(void)
+ {
+ #ifdef CONFIG_JBD_DEBUG
+ 	int n = atomic_read(&nr_journal_heads);	/* allocations minus frees */
+ 	if (n)
+ 		printk(KERN_EMERG "JBD: leaked %d journal_heads!\n", n);
+ #endif
+ 	journal_destroy_caches();
+ }
+ 
+ MODULE_LICENSE("GPL");
+ module_init(journal_init);
+ module_exit(journal_exit);
+ 
diff -rc2P linux/fs/jbd/recovery.c linux-2.4.13/fs/jbd/recovery.c
*** linux/fs/jbd/recovery.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/recovery.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,586 ----
+ /*
+  * linux/fs/jbd/recovery.c
+  * 
+  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+  *
+  * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal recovery routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.  
+  */
+ 
+ #ifndef __KERNEL__
+ #include "jfs_user.h"
+ #else
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #endif
+ 
+ /*
+  * Maintain information about the progress of the recovery job, so that
+  * the different passes can carry information between them. 
+  */
+ struct recovery_info 
+ {
+ 	tid_t		start_transaction;	/* set in PASS_SCAN: first ID expected */
+ 	tid_t		end_transaction;	/* set in PASS_SCAN: ID where log ended */
+ 	
+ 	int		nr_replays;	/* blocks written back during PASS_REPLAY */
+ 	int		nr_revokes;	/* revoke entries recorded in PASS_REVOKE */
+ 	int		nr_revoke_hits;	/* replays skipped because block was revoked */
+ };
+ 
+ enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
+ static int do_one_pass(journal_t *journal,
+ 				struct recovery_info *info, enum passtype pass);
+ static int scan_revoke_records(journal_t *, struct buffer_head *,
+ 				tid_t, struct recovery_info *);
+ 
+ #ifdef __KERNEL__
+ 
+ /* Release readahead buffers after use */
+ void journal_brelse_array(struct buffer_head *b[], int n)
+ {
+ 	for (n--; n >= 0; n--)	/* release from the last entry to the first */
+ 		brelse(b[n]);
+ }
+ 
+ 
+ /*
+  * When reading from the journal, we are going through the block device
+  * layer directly and so there is no readahead being done for us.  We
+  * need to implement any readahead ourselves if we want it to happen at
+  * all.  Recovery is basically one long sequential read, so make sure we
+  * do the IO in reasonably large chunks.
+  *
+  * This is not so critical that we need to be enormously clever about
+  * the readahead size, though.  128K is a purely arbitrary, good-enough
+  * fixed value.
+  */
+ 
+ #define MAXBUF 8
+ static int do_readahead(journal_t *journal, unsigned int start)
+ {
+ 	int err;
+ 	unsigned int max, nbufs, next, blocknr;
+ 	struct buffer_head *bh;
+ 	/* Buffers collected but not yet submitted; nbufs counts them. */
+ 	struct buffer_head * bufs[MAXBUF];
+ 
+ 	/* Do up to 128K of readahead */
+ 	max = start + (128 * 1024 / journal->j_blocksize);
+ 	if (max > journal->j_maxlen)
+ 		max = journal->j_maxlen;
+ 
+ 	/* Do the readahead itself.  We'll submit MAXBUF buffer_heads at
+ 	 * a time to the block device IO layer. */
+ 
+ 	nbufs = 0;
+ 
+ 	for (next = start; next < max; next++) {
+ 		blocknr = journal_bmap(journal, next);
+ 
+ 		if (!blocknr) {
+ 			printk (KERN_ERR "JBD: bad block at offset %u\n",
+ 				next);
+ 			err = -EIO;
+ 			goto failed;
+ 		}
+ 
+ 		bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 		if (!bh) {
+ 			err = -ENOMEM;
+ 			goto failed;
+ 		}
+ 
+ 		if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
+ 			bufs[nbufs++] = bh;
+ 			if (nbufs == MAXBUF) {	/* batch full: submit and release */
+ 				ll_rw_block(READ, nbufs, bufs);
+ 				journal_brelse_array(bufs, nbufs);
+ 				nbufs = 0;
+ 			}
+ 		} else
+ 			brelse(bh);	/* uptodate or locked: nothing to submit */
+ 	}
+ 
+ 	if (nbufs)	/* submit the final, partial batch */
+ 		ll_rw_block(READ, nbufs, bufs);
+ 	err = 0;
+ 
+ failed:	
+ 	if (nbufs) 	/* release whatever is still held in bufs[] */
+ 		journal_brelse_array(bufs, nbufs);
+ 	return err;
+ }
+ 
+ #endif /* __KERNEL__ */
+ 
+ 
+ /*
+  * Read a block from the journal
+  */
+ 
+ static int jread(struct buffer_head **bhp, journal_t *journal, 
+ 		 unsigned int offset)
+ {
+ 	unsigned int blocknr;
+ 	struct buffer_head *bh;
+ 
+ 	*bhp = NULL;	/* stays NULL on every error return */
+ 
+ 	J_ASSERT (offset < journal->j_maxlen);
+ 
+ 	blocknr = journal_bmap(journal, offset);
+ 
+ 	if (!blocknr) {
+ 		printk (KERN_ERR "JBD: bad block at offset %u\n",
+ 			offset);
+ 		return -EIO;
+ 	}
+ 
+ 	bh = getblk(journal->j_dev, blocknr, journal->j_blocksize);
+ 	if (!bh)
+ 		return -ENOMEM;
+ 
+ 	if (!buffer_uptodate(bh)) {
+ 		/* If this is a brand new buffer, start readahead.
+                    Otherwise, we assume we are already reading it.  */
+ 		if (!buffer_req(bh))
+ 			do_readahead(journal, offset);	/* result ignored */
+ 		wait_on_buffer(bh);
+ 	}
+ 
+ 	if (!buffer_uptodate(bh)) {	/* still not uptodate after waiting */
+ 		printk (KERN_ERR "JBD: Failed to read block at offset %u\n",
+ 			offset);
+ 		brelse(bh);
+ 		return -EIO;
+ 	}
+ 
+ 	*bhp = bh;	/* success: caller owns the reference and must brelse() */
+ 	return 0;
+ }
+ 
+ 
+ /*
+  * Count the number of in-use tags in a journal descriptor block.
+  */
+ 
+ static int count_tags(struct buffer_head *bh, int size)
+ {
+ 	char *cursor = &bh->b_data[sizeof(journal_header_t)];
+ 	int found = 0;
+ 
+ 	/* Walk the tags until one carries the last-tag flag or the next
+ 	 * tag would no longer fit inside the first `size' bytes. */
+ 	for (;;) {
+ 		journal_block_tag_t *tag = (journal_block_tag_t *) cursor;
+ 
+ 		if ((cursor - bh->b_data + sizeof(journal_block_tag_t)) > size)
+ 			break;
+ 		found++;
+ 		cursor += sizeof(journal_block_tag_t);
+ 		/* No SAME_UUID flag: 16 extra bytes follow (presumably a UUID) */
+ 		if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID)))
+ 			cursor += 16;
+ 		if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG))
+ 			break;
+ 	}
+ 	return found;
+ }
+ 
+ 
+ /* Wrap a log block number back into [j_first, j_last); evaluates var twice. */
+ #define wrap(journal, var)						\
+ do {									\
+ 	if ((var) >= (journal)->j_last)					\
+ 		(var) -= ((journal)->j_last - (journal)->j_first);	\
+ } while (0)
+ 
+ /*
+  * journal_recover
+  *
+  * The primary function for recovering the log contents when mounting a
+  * journaled device.  
+  * 
+  * Recovery is done in three passes.  In the first pass, we look for the
+  * end of the log.  In the second, we assemble the list of revoke
+  * blocks.  In the third and final pass, we replay any un-revoked blocks
+  * in the log.  
+  */
+ 
+ int journal_recover(journal_t *journal)
+ {
+ 	int			err;
+ 	journal_superblock_t *	sb;
+ 
+ 	struct recovery_info	info;
+ 
+ 	memset(&info, 0, sizeof(info));
+ 	sb = journal->j_superblock;
+ 
+ 	/* 
+ 	 * The journal superblock's s_start field (the current log head)
+ 	 * is always zero if, and only if, the journal was cleanly
+ 	 * unmounted.  
+ 	 */
+ 
+ 	if (!sb->s_start) {
+ 		jbd_debug(1, "No recovery required, last transaction %d\n",
+ 			  ntohl(sb->s_sequence));
+ 		journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1;
+ 		return 0;
+ 	}
+ 
+ 	/* Each pass runs only if the one before it succeeded. */
+ 	err = do_one_pass(journal, &info, PASS_SCAN);
+ 	if (!err)
+ 		err = do_one_pass(journal, &info, PASS_REVOKE);
+ 	if (!err)
+ 		err = do_one_pass(journal, &info, PASS_REPLAY);
+ 
+ 	jbd_debug(0, "JBD: recovery, exit status %d, "
+ 		  "recovered transactions %u to %u\n",
+ 		  err, info.start_transaction, info.end_transaction);
+ 	jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", 
+ 		  info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
+ 
+ 	/* Restart the log at the next transaction ID, thus invalidating
+ 	 * any existing commit records in the log. */
+ 	journal->j_transaction_sequence = ++info.end_transaction;
+ 	/* The cleanup below runs even when a pass failed; err is returned. */
+ 	journal_clear_revoke(journal);
+ 	fsync_no_super(journal->j_fs_dev);
+ 	return err;
+ }
+ 
+ /*
+  * journal_skip_recovery
+  *
+  * Locate any valid recovery information from the journal and set up the
+  * journal structures in memory to ignore it (presumably because the
+  * caller has evidence that it is out of date).  
+  *
+  * We perform one pass over the journal to allow us to tell the user how
+  * much recovery information is being erased, and to let us initialise
+  * the journal transaction sequence numbers to the next unused ID. 
+  */
+ 
+ int journal_skip_recovery(journal_t *journal)
+ {
+ 	int			err;
+ 	journal_superblock_t *	sb;
+ 
+ 	struct recovery_info	info;
+ 
+ 	memset (&info, 0, sizeof(info));
+ 	sb = journal->j_superblock;
+ 	/* Scan only: nothing is revoked or replayed on this path. */
+ 	err = do_one_pass(journal, &info, PASS_SCAN);
+ 
+ 	if (err) {
+ 		printk(KERN_ERR "JBD: error %d scanning journal\n", err);
+ 		++journal->j_transaction_sequence;
+ 	} else {
+ #ifdef CONFIG_JBD_DEBUG
+ 		int dropped = info.end_transaction - ntohl(sb->s_sequence);
+ #endif
+ 		/* dropped is only referenced by the jbd_debug() call below */
+ 		jbd_debug(0, 
+ 			  "JBD: ignoring %d transaction%s from the journal.\n",
+ 			  dropped, (dropped == 1) ? "" : "s");
+ 		journal->j_transaction_sequence = ++info.end_transaction;
+ 	}
+ 
+ 	journal->j_tail = 0;
+ 
+ 	return err;
+ }
+ 
+ /*
+  * do_one_pass: walk the log once from the superblock's s_start.  PASS_SCAN
+  * finds the end of the log, PASS_REVOKE collects revoke records, PASS_REPLAY
+  * writes un-revoked blocks home.  Returns 0 or a negative errno.
+  */
+ static int do_one_pass(journal_t *journal,
+ 			struct recovery_info *info, enum passtype pass)
+ {
+ 	unsigned int		first_commit_ID, next_commit_ID;
+ 	unsigned long		next_log_block;
+ 	int			err, success = 0;
+ 	journal_superblock_t *	sb;
+ 	journal_header_t * 	tmp;
+ 	struct buffer_head *	bh;
+ 	unsigned int		sequence;
+ 	int			blocktype;
+ 
+ 	/* 
+ 	 * First thing is to establish what we expect to find in the log
+ 	 * (in terms of transaction IDs), and where (in terms of log
+ 	 * block offsets): query the superblock.  
+ 	 */
+ 
+ 	sb = journal->j_superblock;
+ 	next_commit_ID = ntohl(sb->s_sequence);
+ 	next_log_block = ntohl(sb->s_start);
+ 
+ 	first_commit_ID = next_commit_ID;
+ 	if (pass == PASS_SCAN)
+ 		info->start_transaction = first_commit_ID;
+ 
+ 	jbd_debug(1, "Starting recovery pass %d\n", pass);
+ 
+ 	/*
+ 	 * Now we walk through the log, transaction by transaction,
+ 	 * making sure that each transaction has a commit block in the
+ 	 * expected place.  Each complete transaction gets replayed back
+ 	 * into the main filesystem. 
+ 	 */
+ 
+ 	while (1) {
+ 		int			flags;
+ 		char *			tagp;
+ 		journal_block_tag_t *	tag;
+ 		struct buffer_head *	obh;
+ 		struct buffer_head *	nbh;
+ 
+ 		/* If we already know where to stop the log traversal,
+ 		 * check right now that we haven't gone past the end of
+ 		 * the log. */
+ 
+ 		if (pass != PASS_SCAN)
+ 			if (tid_geq(next_commit_ID, info->end_transaction))
+ 				break;
+ 
+ 		jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
+ 			  next_commit_ID, next_log_block, journal->j_last);
+ 
+ 		/* Skip over each chunk of the transaction looking for
+ 		 * either the next descriptor block or the final commit
+ 		 * record. */
+ 
+ 		jbd_debug(3, "JBD: checking block %ld\n", next_log_block);
+ 		err = jread(&bh, journal, next_log_block);
+ 		if (err)
+ 			goto failed;
+ 
+ 		next_log_block++;
+ 		wrap(journal, next_log_block);
+ 
+ 		/* What kind of buffer is it? 
+ 		 * 
+ 		 * If it is a descriptor block, check that it has the
+ 		 * expected sequence number.  Otherwise, we're all done
+ 		 * here. */
+ 
+ 		tmp = (journal_header_t *)bh->b_data;
+ 
+ 		if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) {
+ 			brelse(bh);
+ 			break;
+ 		}
+ 
+ 		blocktype = ntohl(tmp->h_blocktype);
+ 		sequence = ntohl(tmp->h_sequence);
+ 		jbd_debug(3, "Found magic %d, sequence %d\n", 
+ 			  blocktype, sequence);
+ 
+ 		if (sequence != next_commit_ID) {
+ 			brelse(bh);
+ 			break;
+ 		}
+ 
+ 		/* OK, we have a valid descriptor block which matches
+ 		 * all of the sequence number checks.  What are we going
+ 		 * to do with it?  That depends on the pass... */
+ 
+ 		switch(blocktype) {
+ 		case JFS_DESCRIPTOR_BLOCK:
+ 			/* If it is a valid descriptor block, replay it
+ 			 * in pass REPLAY; otherwise, just skip over the
+ 			 * blocks it describes. */
+ 			if (pass != PASS_REPLAY) {
+ 				next_log_block +=
+ 					count_tags(bh, journal->j_blocksize);
+ 				wrap(journal, next_log_block);
+ 				brelse(bh);
+ 				continue;
+ 			}
+ 
+ 			/* A descriptor block: we can now write all of
+ 			 * the data blocks.  Yay, useful work is finally
+ 			 * getting done here! */
+ 
+ 			tagp = &bh->b_data[sizeof(journal_header_t)];
+ 			while ((tagp - bh->b_data +sizeof(journal_block_tag_t))
+ 			       <= journal->j_blocksize) {
+ 				unsigned long io_block;
+ 
+ 				tag = (journal_block_tag_t *) tagp;
+ 				flags = ntohl(tag->t_flags);
+ 
+ 				io_block = next_log_block++;
+ 				wrap(journal, next_log_block);
+ 				err = jread(&obh, journal, io_block);
+ 				if (err) {
+ 					/* Recover what we can, but
+ 					 * report failure at the end. */
+ 					success = err;
+ 					printk (KERN_ERR 
+ 						"JBD: IO error %d recovering "
+ 						"block %ld in log\n",
+ 						err, io_block);
+ 				} else {
+ 					unsigned long blocknr;
+ 
+ 					J_ASSERT(obh != NULL);
+ 					blocknr = ntohl(tag->t_blocknr);
+ 
+ 					/* If the block has been
+ 					 * revoked, then we're all done
+ 					 * here. */
+ 					if (journal_test_revoke
+ 					    (journal, blocknr, 
+ 					     next_commit_ID)) {
+ 						brelse(obh);
+ 						++info->nr_revoke_hits;
+ 						goto skip_write;
+ 					}
+ 
+ 					/* Find a buffer for the new
+ 					 * data being restored */
+ 					nbh = getblk(journal->j_fs_dev, blocknr,
+ 						     journal->j_blocksize);
+ 					if (nbh == NULL) {
+ 						printk(KERN_ERR 
+ 						       "JBD: Out of memory "
+ 						       "during recovery.\n");
+ 						err = -ENOMEM;
+ 						brelse(bh);
+ 						brelse(obh);
+ 						goto failed;
+ 					}
+ 
+ 					memcpy(nbh->b_data, obh->b_data,
+ 							journal->j_blocksize);
+ 					/* Un-escape nbh, the block going home */
+ 					if (flags & JFS_FLAG_ESCAPE)
+ 						*((unsigned int *)nbh->b_data) =
+ 							htonl(JFS_MAGIC_NUMBER);
+ 
+ 					BUFFER_TRACE(nbh, "marking dirty");
+ 					mark_buffer_dirty(nbh);
+ 					BUFFER_TRACE(nbh, "marking uptodate");
+ 					mark_buffer_uptodate(nbh, 1);
+ 					++info->nr_replays;
+ 					/* ll_rw_block(WRITE, 1, &nbh); */
+ 					brelse(obh);
+ 					brelse(nbh);
+ 				}
+ 
+ 			skip_write:
+ 				tagp += sizeof(journal_block_tag_t);
+ 				if (!(flags & JFS_FLAG_SAME_UUID))
+ 					tagp += 16;
+ 
+ 				if (flags & JFS_FLAG_LAST_TAG)
+ 					break;
+ 			}
+ 
+ 			brelse(bh);
+ 			continue;
+ 
+ 		case JFS_COMMIT_BLOCK:
+ 			/* Found an expected commit block: not much to
+ 			 * do other than move on to the next sequence
+ 			 * number. */
+ 			brelse(bh);
+ 			next_commit_ID++;
+ 			continue;
+ 
+ 		case JFS_REVOKE_BLOCK:
+ 			/* If we aren't in the REVOKE pass, then we can
+ 			 * just skip over this block. */
+ 			if (pass != PASS_REVOKE) {
+ 				brelse(bh);
+ 				continue;
+ 			}
+ 
+ 			err = scan_revoke_records(journal, bh,
+ 						  next_commit_ID, info);
+ 			brelse(bh);
+ 			if (err)
+ 				goto failed;
+ 			continue;
+ 
+ 		default:
+ 			jbd_debug(3, "Unrecognised magic %d, end of scan.\n",
+ 				  blocktype);
+ 			brelse(bh);	/* every exit from the loop drops bh */
+ 			goto done;
+ 		}
+ 	}
+ 
+  done:
+ 	/* 
+ 	 * We broke out of the log scan loop: either we came to the
+ 	 * known end of the log or we found an unexpected block in the
+ 	 * log.  If the latter happened, then we know that the "current"
+ 	 * transaction marks the end of the valid log.
+ 	 */
+ 
+ 	if (pass == PASS_SCAN)
+ 		info->end_transaction = next_commit_ID;
+ 	else {
+ 		/* It's really bad news if different passes end up at
+ 		 * different places (but possible due to IO errors). */
+ 		if (info->end_transaction != next_commit_ID) {
+ 			printk (KERN_ERR "JBD: recovery pass %d ended at "
+ 				"transaction %u, expected %u\n",
+ 				pass, next_commit_ID, info->end_transaction);
+ 			if (!success)
+ 				success = -EIO;
+ 		}
+ 	}
+ 
+ 	return success;
+ 
+  failed:
+ 	return err;
+ }
+ 
+ 
+ /* Scan a revoke record, marking all blocks mentioned as revoked. */
+ 
+ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 
+ 			       tid_t sequence, struct recovery_info *info)
+ {
+ 	journal_revoke_header_t *header;
+ 	int offset, max;
+ 
+ 	header = (journal_revoke_header_t *) bh->b_data;
+ 	offset = sizeof(journal_revoke_header_t);
+ 	max = ntohl(header->r_count);	/* end offset of the entries in bh */
+ 	if (max > journal->j_blocksize)	/* corrupt count: don't overrun bh */
+ 		return -EINVAL;
+ 	while (offset < max) {	/* one 4-byte big-endian block number each */
+ 		unsigned long blocknr;
+ 		int err;
+ 		blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset)));
+ 		offset += 4;
+ 		err = journal_set_revoke(journal, blocknr, sequence);
+ 		if (err)
+ 			return err;
+ 		++info->nr_revokes;
+ 	}
+ 	return 0;
+ }
diff -rc2P linux/fs/jbd/revoke.c linux-2.4.13/fs/jbd/revoke.c
*** linux/fs/jbd/revoke.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/revoke.c	Fri Nov  9 16:57:59 2001
***************
*** 0 ****
--- 1,631 ----
+ /*
+  * linux/fs/jbd/revoke.c
+  * 
+  * Written by Stephen C. Tweedie <sct@redhat.com>, 2000
+  *
+  * Copyright 2000 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Journal revoke routines for the generic filesystem journaling code;
+  * part of the ext2fs journaling system.
+  *
+  * Revoke is the mechanism used to prevent old log records for deleted
+  * metadata from being replayed on top of newer data using the same
+  * blocks.  The revoke mechanism is used in two separate places:
+  * 
+  * + Commit: during commit we write the entire list of the current
+  *   transaction's revoked blocks to the journal
+  * 
+  * + Recovery: during recovery we record the transaction ID of all
+  *   revoked blocks.  If there are multiple revoke records in the log
+  *   for a single block, only the last one counts, and if there is a log
+  *   entry for a block beyond the last revoke, then that log entry still
+  *   gets replayed.
+  *
+  * We can get interactions between revokes and new log data within a
+  * single transaction:
+  *
+  * Block is revoked and then journaled:
+  *   The desired end result is the journaling of the new block, so we 
+  *   cancel the revoke before the transaction commits.
+  *
+  * Block is journaled and then revoked:
+  *   The revoke must take precedence over the write of the block, so we
+  *   need either to cancel the journal entry or to write the revoke
+  *   later in the log than the log block.  In this case, we choose the
+  *   latter: journaling a block cancels any revoke record for that block
+  *   in the current transaction, so any revoke for that block in the
+  *   transaction must have happened after the block was journaled and so
+  *   the revoke must take precedence.
+  *
+  * Block is revoked and then written as data: 
+  *   The data write is allowed to succeed, but the revoke is _not_
+  *   cancelled.  We still need to prevent old log records from
+  *   overwriting the new data.  We don't even need to clear the revoke
+  *   bit here.
+  *
+  * Revoke information on buffers is a tri-state value:
+  *
+  * RevokeValid clear:	no cached revoke status, need to look it up
+  * RevokeValid set, Revoked clear:
+  *			buffer has not been revoked, and cancel_revoke
+  *			need do nothing.
+  * RevokeValid set, Revoked set:
+  *			buffer has been revoked.  
+  */
+ 
+ #ifndef __KERNEL__
+ #include "jfs_user.h"
+ #else
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/list.h>
+ #include <linux/smp_lock.h>
+ #include <linux/init.h>
+ #endif
+ 
+ static kmem_cache_t *revoke_record_cache;
+ static kmem_cache_t *revoke_table_cache;
+ 
+ /* Each revoke record represents one single revoked block.  During
+    journal replay, this involves recording the transaction ID of the
+    last transaction to revoke this block. */
+ 
+ struct jbd_revoke_record_s 
+ {
+ 	struct list_head  hash;		/* hash chain link; lookups cast from it */
+ 	tid_t		  sequence;	/* Used for recovery only */
+ 	unsigned long	  blocknr;	/* block number this record revokes */
+ };
+ 
+ 
+ /* The revoke table is just a simple hash table of revoke records. */
+ struct jbd_revoke_table_s
+ {
+ 	/* It is conceivable that we might want a larger hash table
+ 	 * for recovery.  Must be a power of two. */
+ 	int		  hash_size; 	/* number of hash chains */
+ 	int		  hash_shift; 	/* log2(hash_size) */
+ 	struct list_head *hash_table;	/* array of hash_size list heads */
+ };
+ 
+ 
+ #ifdef __KERNEL__
+ static void write_one_revoke_record(journal_t *, transaction_t *,
+ 				    struct journal_head **, int *,
+ 				    struct jbd_revoke_record_s *);
+ static void flush_descriptor(journal_t *, struct journal_head *, int);
+ #endif
+ 
+ /* Utility functions to maintain the revoke table */
+ 
+ /* Map a block number to a revoke hash chain index in [0, hash_size). */
+ static inline int hash(journal_t *journal, unsigned long block)
+ {
+ 	struct jbd_revoke_table_s *table = journal->j_revoke;
+ 	unsigned long h = block * 0x9e370001UL;	/* odd multiplier mixes bits */
+ 
+ 	/* No shifts by (hash_shift - N): those counts go negative, which is
+ 	 * undefined, for small tables.  Fold high bits down, then mask. */
+ 	return (h ^ (h >> 16)) & (table->hash_size - 1);
+ }
+ 
+ int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
+ {
+ 	struct jbd_revoke_record_s *rec;
+ 	struct list_head *bucket;
+ 
+ 	/* Allocate a record; on failure either give up or yield and try
+ 	 * again, depending on journal_oom_retry. */
+ 	for (;;) {
+ 		rec = kmem_cache_alloc(revoke_record_cache, GFP_NOFS);
+ 		if (rec)
+ 			break;
+ 		if (!journal_oom_retry)
+ 			return -ENOMEM;
+ 		jbd_debug(1, "ENOMEM in " __FUNCTION__ ", retrying.\n");
+ 		current->policy |= SCHED_YIELD;
+ 		schedule();
+ 	}
+ 
+ 	rec->sequence = seq;
+ 	rec->blocknr = blocknr;
+ 	bucket = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+ 	list_add(&rec->hash, bucket);
+ 	return 0;
+ }
+ 
+ /* Find a revoke record in the journal's hash table. */
+ 
+ static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal,
+ 						      unsigned long blocknr)
+ {
+ 	struct list_head *bucket, *pos;
+ 
+ 	bucket = &journal->j_revoke->hash_table[hash(journal, blocknr)];
+ 	/* The list_head is the record's first member, so a chain node
+ 	 * pointer can be cast straight to its containing record. */
+ 	for (pos = bucket->next; pos != bucket; pos = pos->next) {
+ 		struct jbd_revoke_record_s *rec;
+ 		rec = (struct jbd_revoke_record_s *) pos;
+ 		if (rec->blocknr == blocknr)
+ 			return rec;
+ 	}
+ 	return NULL;
+ }
+ 
+ int __init journal_init_revoke_caches(void)
+ {
+ 	revoke_record_cache = kmem_cache_create("revoke_record",
+ 					   sizeof(struct jbd_revoke_record_s),
+ 					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+ 	if (revoke_record_cache == 0)
+ 		return -ENOMEM;
+ 
+ 	revoke_table_cache = kmem_cache_create("revoke_table",
+ 					   sizeof(struct jbd_revoke_table_s),
+ 					   0, 0, NULL, NULL);
+ 	if (revoke_table_cache != 0)
+ 		return 0;
+ 	/* Second cache failed: undo the first so nothing is left behind. */
+ 	kmem_cache_destroy(revoke_record_cache);
+ 	revoke_record_cache = NULL;
+ 	return -ENOMEM;
+ }
+ 
+ void journal_destroy_revoke_caches(void)
+ {
+ 	if (revoke_record_cache)	/* either may be NULL if init failed */
+ 		kmem_cache_destroy(revoke_record_cache);
+ 	if (revoke_table_cache)
+ 		kmem_cache_destroy(revoke_table_cache);
+ 	revoke_record_cache = revoke_table_cache = NULL;
+ }
+ 
+ /* Initialise the revoke table for a given journal to a given size. */
+ 
+ int journal_init_revoke(journal_t *journal, int hash_size)
+ {
+ 	int shift, tmp;
+ 
+ 	J_ASSERT (journal->j_revoke == NULL);
+ 
+ 	journal->j_revoke = kmem_cache_alloc(revoke_table_cache, GFP_KERNEL);
+ 	if (!journal->j_revoke)
+ 		return -ENOMEM;
+ 
+ 	/* Check that the hash_size is a power of two */
+ 	J_ASSERT ((hash_size & (hash_size-1)) == 0);
+ 
+ 	journal->j_revoke->hash_size = hash_size;
+ 
+ 	shift = 0;	/* compute log2(hash_size) by counting right shifts */
+ 	tmp = hash_size;
+ 	while((tmp >>= 1UL) != 0UL)
+ 		shift++;
+ 	journal->j_revoke->hash_shift = shift;
+ 
+ 	journal->j_revoke->hash_table =
+ 		kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL);
+ 	if (!journal->j_revoke->hash_table) {
+ 		kmem_cache_free(revoke_table_cache, journal->j_revoke);
+ 		journal->j_revoke = NULL;	/* leave the journal with no table */
+ 		return -ENOMEM;
+ 	}
+ 
+ 	for (tmp = 0; tmp < hash_size; tmp++)	/* every chain starts empty */
+ 		INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]);
+ 
+ 	return 0;
+ }
+ 
+ /* Destroy a journal's revoke table.  The table must already be empty! */
+ 
+ void journal_destroy_revoke(journal_t *journal)
+ {
+ 	struct jbd_revoke_table_s *table;
+ 	struct list_head *hash_list;
+ 	int i;
+ 
+ 	table = journal->j_revoke;
+ 	if (!table)	/* no table attached: nothing to free */
+ 		return;
+ 
+ 	for (i=0; i<table->hash_size; i++) {	/* assert every chain is empty */
+ 		hash_list = &table->hash_table[i];
+ 		J_ASSERT (list_empty(hash_list));
+ 	}
+ 
+ 	kfree(table->hash_table);
+ 	kmem_cache_free(revoke_table_cache, table);
+ 	journal->j_revoke = NULL;
+ }
+ 
+ 
+ #ifdef __KERNEL__
+ 
+ /* 
+  * journal_revoke: revoke a given buffer_head from the journal.  This
+  * prevents the block from being replayed during recovery if we take a
+  * crash after this current transaction commits.  Any subsequent
+  * metadata writes of the buffer in this transaction cancel the
+  * revoke.  
+  *
+  * Note that this call may block --- it is up to the caller to make
+  * sure that there are no further calls to journal_write_metadata
+  * before the revoke is complete.  In ext3, this implies calling the
+  * revoke before clearing the block bitmap when we are deleting
+  * metadata. 
+  *
+  * Revoke performs a journal_forget on any buffer_head passed in as a
+  * parameter, but does _not_ forget the buffer_head if the bh was only
+  * found implicitly. 
+  *
+  * bh_in may not be a journalled buffer - it may have come off
+  * the hash tables without an attached journal_head.
+  *
+  * If bh_in is non-zero, journal_revoke() will decrement its b_count
+  * by one.
+  */
+ 
+ int journal_revoke(handle_t *handle, unsigned long blocknr, 
+ 		   struct buffer_head *bh_in)
+ {
+ 	struct buffer_head *bh = NULL;
+ 	journal_t *journal;
+ 	kdev_t dev;
+ 	int err;
+ 
+ 	if (bh_in)
+ 		BUFFER_TRACE(bh_in, "enter");
+ 
+ 	journal = handle->h_transaction->t_journal;
+ 	if (!journal_set_features(journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE)){
+ 		J_ASSERT (!"Cannot set revoke feature!");
+ 		return -EINVAL;
+ 	}
+ 
+ 	dev = journal->j_fs_dev;
+ 	bh = bh_in;
+ 
+ 	if (!bh) {	/* no buffer supplied: see whether one is already cached */
+ 		bh = get_hash_table(dev, blocknr, journal->j_blocksize);
+ 		if (bh)
+ 			BUFFER_TRACE(bh, "found on hash");
+ 	}
+ #ifdef JBD_EXPENSIVE_CHECKING
+ 	else {
+ 		struct buffer_head *bh2;
+ 
+ 		/* If there is a different buffer_head lying around in
+ 		 * memory anywhere... */
+ 		bh2 = get_hash_table(dev, blocknr, journal->j_blocksize);
+ 		if (bh2) {
+ 			/* ... and it has RevokeValid status... */
+ 			if ((bh2 != bh) &&
+ 			    test_bit(BH_RevokeValid, &bh2->b_state))
+ 				/* ...then it better be revoked too,
+ 				 * since it's illegal to create a revoke
+ 				 * record against a buffer_head which is
+ 				 * not marked revoked --- that would
+ 				 * risk missing a subsequent revoke
+ 				 * cancel. */
+ 				J_ASSERT_BH(bh2, test_bit(BH_Revoked, &
+ 							  bh2->b_state));
+ 			__brelse(bh2);
+ 		}
+ 	}
+ #endif
+ 
+ 	/* We really ought not ever to revoke twice in a row without
+            first having the revoke cancelled: it's illegal to free a
+            block twice without allocating it in between! */
+ 	if (bh) {
+ 		J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state));
+ 		set_bit(BH_Revoked, &bh->b_state);
+ 		set_bit(BH_RevokeValid, &bh->b_state);
+ 		if (bh_in) {
+ 			BUFFER_TRACE(bh_in, "call journal_forget");
+ 			journal_forget(handle, bh_in);
+ 		} else {
+ 			BUFFER_TRACE(bh, "call brelse");
+ 			__brelse(bh);	/* presumably the get_hash_table() ref */
+ 		}
+ 	}
+ 
+ 	lock_journal(journal);	/* held across the hash table insertion */
+ 	jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in);
+ 	err = insert_revoke_hash(journal, blocknr,
+ 				handle->h_transaction->t_tid);
+ 	unlock_journal(journal);
+ 	BUFFER_TRACE(bh_in, "exit");	/* NOTE(review): bh_in may be NULL here */
+ 	return err;
+ }
+ 
+ /*
+  * Cancel an outstanding revoke.  For use only internally by the
+  * journaling code (called from do_get_write_access).
+  *
+  * We trust the BH_Revoked bit on the buffer if the buffer is already
+  * being journaled: if there is no revoke pending on the buffer, then we
+  * don't do anything here.
+  *
+  * This would break if it were possible for a buffer to be revoked and
+  * discarded, and then reallocated within the same transaction.  In such
+  * a case we would have lost the revoked bit, but when we arrived here
+  * the second time we would still have a pending revoke to cancel.  So,
+  * do not trust the Revoked bit on buffers unless RevokeValid is also
+  * set.  Returns 1 if a revoke record was found and freed, else 0.
+  *
+  * The caller must have the journal locked.
+  */
+ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
+ {
+ 	struct jbd_revoke_record_s *record;
+ 	journal_t *journal = handle->h_transaction->t_journal;
+ 	int need_cancel;
+ 	int did_revoke = 0;	/* return value: set when a record is freed */
+ 	struct buffer_head *bh = jh2bh(jh);
+ 	
+ 	jbd_debug(4, "journal_head %p, cancelling revoke\n", jh);
+ 
+ 	/* Is the existing Revoke bit valid?  If so, we trust it, and
+ 	 * only perform the full cancel if the revoke bit is set.  If
+ 	 * not, we can't trust the revoke bit, and we need to do the
+ 	 * full search for a revoke record. */
+ 	if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
+ 		need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
+ 	else {
+ 		need_cancel = 1;
+ 		clear_bit(BH_Revoked, &bh->b_state);
+ 	}
+ 
+ 	if (need_cancel) {
+ 		record = find_revoke_record(journal, bh->b_blocknr);
+ 		if (record) {
+ 			jbd_debug(4, "cancelled existing revoke on "
+ 				  "blocknr %lu\n", bh->b_blocknr);
+ 			list_del(&record->hash);
+ 			kmem_cache_free(revoke_record_cache, record);
+ 			did_revoke = 1;
+ 		}
+ 	}
+ 
+ #ifdef JBD_EXPENSIVE_CHECKING
+ 	/* There better not be one left behind by now! */
+ 	record = find_revoke_record(journal, bh->b_blocknr);
+ 	J_ASSERT_JH(jh, record == NULL);
+ #endif
+ 
+ 	/* Finally, have we just cleared revoke on an unhashed
+ 	 * buffer_head?  If so, we'd better make sure we clear the
+ 	 * revoked status on any hashed alias too, otherwise the revoke
+ 	 * state machine will get very upset later on. */
+ 	if (need_cancel && !bh->b_pprev) {
+ 		struct buffer_head *bh2;
+ 		bh2 = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
+ 		if (bh2) {
+ 			clear_bit(BH_Revoked, &bh2->b_state);
+ 			__brelse(bh2);
+ 		}
+ 	}
+ 	
+ 	return did_revoke;
+ }
+ 
+ 
+ /*
+  * Write revoke records to the journal for all entries in the current
+  * revoke hash, deleting the entries as we go.
+  *
+  * Called with the journal lock held.
+  */
+ 
+ void journal_write_revoke_records(journal_t *journal, 
+ 				  transaction_t *transaction)
+ {
+ 	struct journal_head *descriptor;
+ 	struct jbd_revoke_record_s *record;
+ 	struct jbd_revoke_table_s *revoke;
+ 	struct list_head *hash_list;
+ 	int i, offset, count;
+ 
+ 	descriptor = NULL; 	/* no descriptor block started yet */
+ 	offset = 0;
+ 	count = 0;
+ 	revoke = journal->j_revoke;
+ 	
+ 	for (i = 0; i < revoke->hash_size; i++) {
+ 		hash_list = &revoke->hash_table[i];
+ 
+ 		while (!list_empty(hash_list)) {
+ 			record = (struct jbd_revoke_record_s *) 
+ 				hash_list->next; /* assumes ->hash is first member */
+ 			write_one_revoke_record(journal, transaction,
+ 						&descriptor, &offset, 
+ 						record);
+ 			count++;
+ 			list_del(&record->hash);
+ 			kmem_cache_free(revoke_record_cache, record);
+ 		}
+ 	}
+ 	if (descriptor) 	/* flush the last descriptor block */
+ 		flush_descriptor(journal, descriptor, offset);
+ 	jbd_debug(1, "Wrote %d revoke records\n", count);
+ }
+ 
+ /*
+  * Write out one revoke record, starting a new descriptor block if the old
+  * one is full or none exists yet.  Updates *descriptorp and *offsetp.
+  */
+ 
+ static void write_one_revoke_record(journal_t *journal, 
+ 				    transaction_t *transaction,
+ 				    struct journal_head **descriptorp, 
+ 				    int *offsetp,
+ 				    struct jbd_revoke_record_s *record)
+ {
+ 	struct journal_head *descriptor;
+ 	int offset;
+ 	journal_header_t *header;
+ 
+ 	/* If we are already aborting, this all becomes a noop.  We
+ 	 * still need to go round the loop in
+ 	 * journal_write_revoke_records in order to free all of the
+ 	 * revoke records: only the IO to the journal is omitted. */
+ 	if (is_journal_aborted(journal))
+ 		return;
+ 
+ 	descriptor = *descriptorp;
+ 	offset = *offsetp;
+ 
+ 	/* Make sure we have a descriptor with space left for the record */
+ 	if (descriptor) {
+ 		if (offset == journal->j_blocksize) {	/* block is full */
+ 			flush_descriptor(journal, descriptor, offset);
+ 			descriptor = NULL;
+ 		}
+ 	}
+ 	
+ 	if (!descriptor) {
+ 		descriptor = journal_get_descriptor_buffer(journal);
+ 		header = (journal_header_t *) &jh2bh(descriptor)->b_data[0];
+ 		header->h_magic     = htonl(JFS_MAGIC_NUMBER);
+ 		header->h_blocktype = htonl(JFS_REVOKE_BLOCK);
+ 		header->h_sequence  = htonl(transaction->t_tid);
+ 
+ 		/* Record it so that we can wait for IO completion later */
+ 		JBUFFER_TRACE(descriptor, "file as BJ_LogCtl");
+ 		journal_file_buffer(descriptor, transaction, BJ_LogCtl);
+ 
+ 		offset = sizeof(journal_revoke_header_t); /* records follow header */
+ 		*descriptorp = descriptor;
+ 	}
+ 	
+ 	* ((unsigned int *)(&jh2bh(descriptor)->b_data[offset])) = 
+ 		htonl(record->blocknr);
+ 	offset += 4;	/* each record is one htonl()-encoded block number */
+ 	*offsetp = offset;
+ }
+ 
+ /*
+  * Flush a revoke descriptor out to the journal.  If we are aborting,
+  * no IO is issued and the buffer is just released; otherwise record the
+  * bytes used in r_count and submit the write.  The buffer was already
+  * filed on BJ_LogCtl by write_one_revoke_record() so commit can wait on it.
+  */
+ 
+ static void flush_descriptor(journal_t *journal, 
+ 			     struct journal_head *descriptor, 
+ 			     int offset)
+ {
+ 	journal_revoke_header_t *header;
+ 
+ 	if (is_journal_aborted(journal)) {
+ 		JBUFFER_TRACE(descriptor, "brelse");
+ 		__brelse(jh2bh(descriptor));
+ 		return;
+ 	}
+ 	
+ 	header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
+ 	header->r_count = htonl(offset);	/* byte count, header included */
+ 	set_bit(BH_JWrite, &jh2bh(descriptor)->b_state);
+ 	{
+ 		struct buffer_head *bh = jh2bh(descriptor);
+ 		BUFFER_TRACE(bh, "write");
+ 		ll_rw_block (WRITE, 1, &bh);
+ 	}
+ }
+ 
+ #endif
+ 
+ /* 
+  * Revoke support for recovery.
+  *
+  * Recovery needs to be able to:
+  *
+  *  record all revoke records, including the tid of the latest instance
+  *  of each revoke in the journal
+  *
+  *  check whether a given block in a given transaction should be replayed
+  *  (ie. has not been revoked by a revoke record in that or a subsequent
+  *  transaction)
+  * 
+  *  empty the revoke table after recovery.
+  */
+ 
+ /*
+  * First, setting revoke records.  We create a new revoke record for
+  * every block ever revoked in the log as we scan it for recovery, and
+  * we update the existing records if we find multiple revokes for a
+  * single block.  Returns 0, or whatever insert_revoke_hash() returns.
+  */
+ 
+ int journal_set_revoke(journal_t *journal, 
+ 		       unsigned long blocknr, 
+ 		       tid_t sequence)
+ {
+ 	struct jbd_revoke_record_s *record;
+ 	
+ 	record = find_revoke_record(journal, blocknr);
+ 	if (record) {
+ 		/* If we have multiple occurrences, only record the
+ 		 * latest sequence number in the hashed record */
+ 		if (tid_gt(sequence, record->sequence))
+ 			record->sequence = sequence;
+ 		return 0;
+ 	} 
+ 	return insert_revoke_hash(journal, blocknr, sequence);
+ }
+ 
+ /*
+  * Test revoke records: has the given block, as logged in transaction
+  * `sequence', been revoked?  A revoke record with a given sequence
+  * number revokes the block in that transaction and earlier ones, but
+  * later transactions still need to be replayed.  Returns 1 if revoked.
+  */
+ 
+ int journal_test_revoke(journal_t *journal, 
+ 			unsigned long blocknr,
+ 			tid_t sequence)
+ {
+ 	struct jbd_revoke_record_s *record;
+ 	
+ 	record = find_revoke_record(journal, blocknr);
+ 	if (!record)
+ 		return 0;	/* never revoked */
+ 	if (tid_gt(sequence, record->sequence))
+ 		return 0;	/* logged after the latest revoke: replay it */
+ 	return 1;
+ }
+ 
+ /*
+  * Finally, once recovery is over, we need to clear the revoke table so
+  * that it can be reused by the running filesystem: free every record.
+  */
+ 
+ void journal_clear_revoke(journal_t *journal)
+ {
+ 	int i;
+ 	struct list_head *hash_list;
+ 	struct jbd_revoke_record_s *record;
+ 	struct jbd_revoke_table_s *revoke;
+ 	
+ 	revoke = journal->j_revoke;
+ 	
+ 	for (i = 0; i < revoke->hash_size; i++) {
+ 		hash_list = &revoke->hash_table[i];
+ 		while (!list_empty(hash_list)) {	/* drain this chain */
+ 			record = (struct jbd_revoke_record_s*) hash_list->next;
+ 			list_del(&record->hash);
+ 			kmem_cache_free(revoke_record_cache, record);
+ 		}
+ 	}
+ }
+ 
diff -rc2P linux/fs/jbd/transaction.c linux-2.4.13/fs/jbd/transaction.c
*** linux/fs/jbd/transaction.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd/transaction.c	Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,2078 ----
+ /*
+  * linux/fs/jbd/transaction.c
+  * 
+  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
+  *
+  * Copyright 1998 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Generic filesystem transaction handling code; part of the ext2fs
+  * journaling system.  
+  *
+  * This file manages transactions (compound commits managed by the
+  * journaling code) and handles (individual atomic operations by the
+  * filesystem).
+  */
+ 
+ #include <linux/sched.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/errno.h>
+ #include <linux/slab.h>
+ #include <linux/locks.h>
+ #include <linux/timer.h>
+ #include <linux/smp_lock.h>
+ #include <linux/mm.h>
+ #include <linux/swap.h>	/* Uggh... needed for buffermem_pages */
+ 
+ 
+ extern spinlock_t journal_datalist_lock;
+ 
+ /*
+  * get_transaction: obtain a new transaction_t object.
+  *
+  * Allocate and initialise a new transaction in RUNNING state, arm the
+  * commit timer and attach it to the journal (which must not have a
+  * running transaction: we only make a new one once we have started to
+  * commit the old one).  Returns NULL if allocation fails; is_try is unused.
+  *
+  * Preconditions:
+  *	The journal MUST be locked.  We don't perform atomic mallocs on the
+  *	new transaction	and we can't block without protecting against other
+  *	processes trying to touch the journal while it is in transition.
+  */
+ 
+ static transaction_t * get_transaction (journal_t * journal, int is_try)
+ {
+ 	transaction_t * transaction;
+ 
+ 	transaction = jbd_kmalloc (sizeof (transaction_t), GFP_NOFS);
+ 	if (!transaction)
+ 		return NULL;
+ 	
+ 	memset (transaction, 0, sizeof (transaction_t));
+ 	
+ 	transaction->t_journal = journal;
+ 	transaction->t_state = T_RUNNING;
+ 	transaction->t_tid = journal->j_transaction_sequence++;
+ 	transaction->t_expires = jiffies + journal->j_commit_interval;
+ 
+ 	/* Set up the commit timer for the new transaction. */
+ 	J_ASSERT (!journal->j_commit_timer_active);
+ 	journal->j_commit_timer_active = 1;
+ 	journal->j_commit_timer->expires = transaction->t_expires;
+ 	add_timer(journal->j_commit_timer);
+ 	
+ 	J_ASSERT (journal->j_running_transaction == NULL);
+ 	journal->j_running_transaction = transaction;
+ 
+ 	return transaction;
+ }
+ 
+ /*
+  * Handle management.
+  *
+  * A handle_t is an object which represents a single atomic update to a
+  * filesystem, and which tracks all of the modifications which form part
+  * of that one update.
+  */
+ 
+ /*
+  * start_this_handle: Given a handle, deal with any locking or stalling
+  * needed to make sure that there is enough journal space for the handle
+  * to begin, then attach it to a transaction and charge its credits.
+  * Returns 0, or -EROFS if the journal is aborted or has an unacked error.
+  */
+ 
+ static int start_this_handle(journal_t *journal, handle_t *handle)
+ {
+ 	transaction_t *transaction;
+ 	int needed;
+ 	int nblocks = handle->h_buffer_credits;
+ 	
+ 	jbd_debug(3, "New handle %p going live.\n", handle);
+ 
+ repeat:
+ 
+ 	lock_journal(journal);
+ 
+ 	if (is_journal_aborted(journal) ||
+ 	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
+ 		unlock_journal(journal);
+ 		return -EROFS; 
+ 	}
+ 
+ 	/* Wait on the journal's transaction barrier if necessary */
+ 	if (journal->j_barrier_count) {
+ 		unlock_journal(journal);
+ 		sleep_on(&journal->j_wait_transaction_locked);
+ 		goto repeat;
+ 	}
+ 	
+ repeat_locked:
+ 	if (!journal->j_running_transaction)
+ 		get_transaction(journal, 0);
+ 	/* @@@ Error?  get_transaction() can return NULL; we just assert. */
+ 	J_ASSERT(journal->j_running_transaction);
+ 	
+ 	transaction = journal->j_running_transaction;
+ 
+ 	/* If the current transaction is locked down for commit, wait
+ 	 * for the lock to be released. */
+ 
+ 	if (transaction->t_state == T_LOCKED) {
+ 		unlock_journal(journal);
+ 		jbd_debug(3, "Handle %p stalling...\n", handle);
+ 		sleep_on(&journal->j_wait_transaction_locked);
+ 		goto repeat;
+ 	}
+ 	
+ 	/* If there is not enough space left in the log to write all
+ 	 * potential buffers requested by this operation, we need to
+ 	 * stall pending a log checkpoint to free some more log
+ 	 * space. */
+ 
+ 	needed = transaction->t_outstanding_credits + nblocks;
+ 
+ 	if (needed > journal->j_max_transaction_buffers) {
+ 		/* If the current transaction is already too large, then
+ 		 * start to commit it: we can then go back and attach
+ 		 * this handle to a new transaction. */
+ 		
+ 		jbd_debug(2, "Handle %p starting new commit...\n", handle);
+ 		log_start_commit(journal, transaction);
+ 		unlock_journal(journal);
+ 		sleep_on(&journal->j_wait_transaction_locked);
+ 		lock_journal(journal);
+ 		goto repeat_locked;
+ 	}
+ 
+ 	/* 
+ 	 * The commit code assumes that it can get enough log space
+ 	 * without forcing a checkpoint.  This is *critical* for
+ 	 * correctness: a checkpoint of a buffer which is also
+ 	 * associated with a committing transaction creates a deadlock,
+ 	 * so commit simply cannot force through checkpoints.
+ 	 *
+ 	 * We must therefore ensure the necessary space in the journal
+ 	 * *before* starting to dirty potentially checkpointed buffers
+ 	 * in the new transaction. 
+ 	 *
+ 	 * The worst part is, any transaction currently committing can
+ 	 * reduce the free space arbitrarily.  Be careful to account for
+ 	 * those buffers when checkpointing.
+ 	 */
+ 
+ 	/*
+ 	 * @@@ AKPM: This seems rather over-defensive.  We're giving commit
+ 	 * a _lot_ of headroom: 1/4 of the journal plus the size of
+ 	 * the committing transaction.  Really, we only need to give it
+ 	 * committing_transaction->t_outstanding_credits plus "enough" for
+ 	 * the log control blocks.
+ 	 * Also, this test is inconsistent with the matching one in
+ 	 * journal_extend().
+ 	 */
+ 	needed = journal->j_max_transaction_buffers;
+ 	if (journal->j_committing_transaction) 
+ 		needed += journal->j_committing_transaction->
+ 					t_outstanding_credits;
+ 	
+ 	if (log_space_left(journal) < needed) {
+ 		jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle);
+ 		log_wait_for_space(journal, needed);
+ 		goto repeat_locked;
+ 	}
+ 
+ 	/* OK, account for the buffers that this operation expects to
+ 	 * use and add the handle to the running transaction. */
+ 
+ 	handle->h_transaction = transaction;
+ 	transaction->t_outstanding_credits += nblocks;
+ 	transaction->t_updates++;
+ 	transaction->t_handle_count++;
+ 	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+ 		  handle, nblocks, transaction->t_outstanding_credits,
+ 		  log_space_left(journal));
+ 
+ 	unlock_journal(journal);
+ 	
+ 	return 0;
+ }
+ 
+ /*
+  * Obtain a new handle, or take another reference on the current one.
+  *
+  * We make sure that the transaction can guarantee at least nblocks of
+  * modified buffers in the log.  We block until the log can guarantee
+  * that much space.  
+  *
+  * This function is visible to journal users (like ext2fs), so is not
+  * called with the journal already locked.
+  *
+  * Returns the handle, or an ERR_PTR() value on failure (never NULL).
+  */
+ 
+ handle_t *journal_start(journal_t *journal, int nblocks)
+ {
+ 	handle_t *handle = journal_current_handle();
+ 	int err;
+ 	
+ 	if (!journal)
+ 		return ERR_PTR(-EROFS);
+ 
+ 	if (handle) {	/* nested start: share the existing handle */
+ 		J_ASSERT(handle->h_transaction->t_journal == journal);
+ 		handle->h_ref++;
+ 		return handle;
+ 	}
+ 	
+ 	handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
+ 	if (!handle)
+ 		return ERR_PTR(-ENOMEM);
+ 	memset (handle, 0, sizeof (handle_t));
+ 
+ 	handle->h_buffer_credits = nblocks;
+ 	handle->h_ref = 1;
+ 	current->journal_info = handle;
+ 
+ 	err = start_this_handle(journal, handle);
+ 	if (err < 0) {
+ 		kfree(handle);
+ 		current->journal_info = NULL;
+ 		return ERR_PTR(err);
+ 	}
+ 
+ 	return handle;
+ }
+ 
+ /* Non-blocking start_this_handle(): returns 0 on success, -EROFS if the
+  * journal is aborted or in error, -1 if it would have had to wait.
+  * NOTE(review): t_handle_count is not bumped here, unlike there -- confirm. */
+ static int try_start_this_handle(journal_t *journal, handle_t *handle)
+ {
+ 	transaction_t *transaction;
+ 	int needed;
+ 	int nblocks = handle->h_buffer_credits;
+ 	int ret = 0;
+ 
+ 	jbd_debug(3, "New handle %p maybe going live.\n", handle);
+ 
+ 	lock_journal(journal);
+ 
+ 	if (is_journal_aborted(journal) ||
+ 	    (journal->j_errno != 0 && !(journal->j_flags & JFS_ACK_ERR))) {
+ 		ret = -EROFS;
+ 		goto fail_unlock;
+ 	}
+ 
+ 	if (journal->j_barrier_count)
+ 		goto fail_unlock;
+ 
+ 	if (!journal->j_running_transaction && get_transaction(journal, 1) == 0)
+ 		goto fail_unlock;
+ 	
+ 	transaction = journal->j_running_transaction;
+ 	if (transaction->t_state == T_LOCKED)
+ 		goto fail_unlock;
+ 	
+ 	needed = transaction->t_outstanding_credits + nblocks;
+ 	/* We could run log_start_commit here */
+ 	if (needed > journal->j_max_transaction_buffers)
+ 		goto fail_unlock;
+ 
+ 	needed = journal->j_max_transaction_buffers;
+ 	if (journal->j_committing_transaction) 
+ 		needed += journal->j_committing_transaction->
+ 						t_outstanding_credits;
+ 	
+ 	if (log_space_left(journal) < needed)
+ 		goto fail_unlock;
+ 
+ 	handle->h_transaction = transaction;
+ 	transaction->t_outstanding_credits += nblocks;
+ 	transaction->t_updates++;
+ 	jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n",
+ 		  handle, nblocks, transaction->t_outstanding_credits,
+ 		  log_space_left(journal));
+ 	unlock_journal(journal);
+ 	return 0;
+ 
+ fail_unlock:
+ 	unlock_journal(journal);
+ 	if (ret >= 0)	/* no specific errno was set: report a generic failure */
+ 		ret = -1;
+ 	return ret;
+ }
+ 
+ /*
+  * Try to start a handle, but non-blockingly.  If we weren't able
+  * to, return an ERR_PTR value (-EROFS, -EIO, -ENOMEM, or ERR_PTR(-1)).
+  */
+ handle_t *journal_try_start(journal_t *journal, int nblocks)
+ {
+ 	handle_t *handle = journal_current_handle();
+ 	int err;
+ 	
+ 	if (!journal)
+ 		return ERR_PTR(-EROFS);
+ 
+ 	if (handle) {	/* nested start: share the existing handle */
+ 		jbd_debug(4, "h_ref %d -> %d\n",
+ 				handle->h_ref,
+ 				handle->h_ref + 1);
+ 		J_ASSERT(handle->h_transaction->t_journal == journal);
+ 		if (is_handle_aborted(handle))
+ 			return ERR_PTR(-EIO);
+ 		handle->h_ref++;
+ 		return handle;
+ 	} else {
+ 		jbd_debug(4, "no current transaction\n");
+ 	}
+ 	
+ 	if (is_journal_aborted(journal))
+ 		return ERR_PTR(-EIO);
+ 	
+ 	handle = jbd_kmalloc(sizeof (handle_t), GFP_NOFS);
+ 	if (!handle)
+ 		return ERR_PTR(-ENOMEM);
+ 	memset (handle, 0, sizeof (handle_t));
+ 
+ 	handle->h_buffer_credits = nblocks;
+ 	handle->h_ref = 1;
+ 	current->journal_info = handle;
+ 
+ 	err = try_start_this_handle(journal, handle);
+ 	if (err < 0) {
+ 		kfree(handle);
+ 		current->journal_info = NULL;
+ 		return ERR_PTR(err);
+ 	}
+ 
+ 	return handle;
+ }
+ 
+ /*
+  * journal_extend: extend buffer credits.
+  *
+  * Some transactions, such as large extends and truncates, can be done
+  * atomically all at once or in several stages.  The operation requests
+  * a credit for a number of buffer modifications in advance, but can
+  * extend its credit if it needs more.  
+  *
+  * journal_extend tries to give the running handle more buffer credits.
+  * It does not guarantee that allocation: this is a best-effort only.
+  * The calling process MUST be able to deal cleanly with a failure to
+  * extend here.
+  *
+  * Return 0 on success, non-zero on failure.
+  *
+  * return code < 0 implies an error (-EIO: the handle is aborted)
+  * return code > 0 implies normal transaction-full status.
+  */
+ 
+ int journal_extend (handle_t *handle, int nblocks)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	int result;
+ 	int wanted;
+ 
+ 	lock_journal (journal);
+ 
+ 	result = -EIO;
+ 	if (is_handle_aborted(handle))
+ 		goto error_out;
+ 
+ 	result = 1;	/* every refusal below is the "transaction full" case */
+ 	       
+ 	/* Don't extend a locked-down transaction! */
+ 	if (handle->h_transaction->t_state != T_RUNNING) {
+ 		jbd_debug(3, "denied handle %p %d blocks: "
+ 			  "transaction not running\n", handle, nblocks);
+ 		goto error_out;
+ 	}
+ 	
+ 	wanted = transaction->t_outstanding_credits + nblocks;
+ 	
+ 	if (wanted > journal->j_max_transaction_buffers) {
+ 		jbd_debug(3, "denied handle %p %d blocks: "
+ 			  "transaction too large\n", handle, nblocks);
+ 		goto error_out;
+ 	}
+ 
+ 	if (wanted > log_space_left(journal)) {
+ 		jbd_debug(3, "denied handle %p %d blocks: "
+ 			  "insufficient log space\n", handle, nblocks);
+ 		goto error_out;
+ 	}
+ 	
+ 	handle->h_buffer_credits += nblocks;
+ 	transaction->t_outstanding_credits += nblocks;
+ 	result = 0;
+ 
+ 	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
+ 	
+ error_out:
+ 	unlock_journal (journal);
+ 	return result;
+ }
+ 
+ 
+ /*
+  * journal_restart: restart a handle for a multi-transaction filesystem
+  * operation.
+  *
+  * If the journal_extend() call above fails to grant new buffer credits
+  * to a running handle, a call to journal_restart will commit the
+  * handle's transaction so far and reattach the handle to a new
+  * transaction capable of guaranteeing the requested number of
+  * credits.  Returns start_this_handle()'s result, or 0 if aborted.
+  */
+ 
+ int journal_restart(handle_t *handle, int nblocks)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	int ret;
+ 
+ 	/* If we've had an abort of any type, don't even think about
+ 	 * actually doing the restart! */
+ 	if (is_handle_aborted(handle))
+ 		return 0;
+ 	
+ 	/* First unlink the handle from its current transaction, and
+ 	 * start the commit on that. */
+ 	
+ 	J_ASSERT (transaction->t_updates > 0);
+ 	J_ASSERT (journal_current_handle() == handle);
+ 
+ 	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+ 	transaction->t_updates--;
+ 
+ 	if (!transaction->t_updates)	/* last update gone: wake waiters */
+ 		wake_up(&journal->j_wait_updates);
+ 
+ 	jbd_debug(2, "restarting handle %p\n", handle);
+ 	log_start_commit(journal, transaction);
+ 
+ 	handle->h_buffer_credits = nblocks;
+ 	ret = start_this_handle(journal, handle);
+ 	return ret;
+ }
+ 
+ 
+ /*
+  * Barrier operation: establish a transaction barrier. 
+  *
+  * This locks out any further updates from being started, and blocks
+  * until all existing updates have completed, returning only once the
+  * journal is in a quiescent state with no updates running.
+  *
+  * The journal lock should not be held on entry.
+  */
+ 
+ void journal_lock_updates (journal_t *journal)
+ {
+ 	lock_journal(journal);
+ 	++journal->j_barrier_count;	/* new handle starts now wait or fail */
+ 
+ 	/* Wait until there are no running updates */
+ 	while (1) {
+ 		transaction_t *transaction = journal->j_running_transaction;
+ 		if (!transaction)
+ 			break;
+ 		if (!transaction->t_updates)
+ 			break;
+ 		
+ 		unlock_journal(journal);
+ 		sleep_on(&journal->j_wait_updates);
+ 		lock_journal(journal);
+ 	}
+ 
+ 	unlock_journal(journal);
+ 
+ 	/* We have now established a barrier against other normal
+ 	 * updates, but we also need to barrier against other
+ 	 * journal_lock_updates() calls to make sure that we serialise
+ 	 * special journal-locked operations too. */
+ 	down(&journal->j_barrier);
+ }
+ 
+ /*
+  * Release a transaction barrier obtained with journal_lock_updates().
+  *
+  * Should be called without the journal lock held.
+  */
+ 
+ void journal_unlock_updates (journal_t *journal)
+ {
+ 	lock_journal(journal);
+ 
+ 	J_ASSERT (journal->j_barrier_count != 0);
+ 	
+ 	up(&journal->j_barrier);
+ 	--journal->j_barrier_count;
+ 	wake_up(&journal->j_wait_transaction_locked);	/* stalled starters retry */
+ 	unlock_journal(journal);
+ }
+ 
+ /*
+  * journal_get_write_access: notify intent to modify a buffer for metadata
+  * (not data) update.
+  *
+  * If the buffer is already part of the current transaction, then there
+  * is nothing we need to do.  If it is already part of a prior
+  * transaction which we are still committing to disk, then we need to
+  * make sure that we do not overwrite the old copy: we do copy-out to
+  * preserve the copy going to disk.  We also account the buffer against
+  * the handle's metadata buffer credits (unless the buffer is already
+  * part of the transaction, that is).
+  *
+  * Journal must be locked.  Returns 0, -EROFS (aborted handle) or -ENOMEM.
+  *
+  * In full data journalling mode the buffer may be of type BJ_AsyncData,
+  * because we're write()ing a buffer which is also part of a shared mapping.
+  */
+ 
+ static int
+ do_get_write_access(handle_t *handle, struct journal_head *jh, int force_copy) 
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	int error;
+ 	char *frozen_buffer = NULL;
+ 	int need_copy = 0;
+ 
+ 	jbd_debug(5, "buffer_head %p, force_copy %d\n", jh, force_copy);
+ 
+ 	JBUFFER_TRACE(jh, "entry");
+ repeat:
+ 	/* @@@ Need to check for errors here at some point. */
+ 
+ 	/*
+ 	 * AKPM: neither bdflush nor kupdate run with the BKL.   There's
+ 	 * nothing we can do to prevent them from starting writeout of a
+ 	 * BUF_DIRTY buffer at any time.  And checkpointing buffers are on
+ 	 * BUF_DIRTY.  So.  We no longer assert that the buffer is unlocked.
+ 	 *
+ 	 * However.  It is very wrong for us to allow ext3 to start directly
+ 	 * altering the ->b_data of buffers which may at that very time be
+ 	 * undergoing writeout to the client filesystem.  This can leave
+ 	 * the filesystem in an inconsistent, transient state if we crash.
+ 	 * So what we do is to steal the buffer if it is in checkpoint
+ 	 * mode and dirty.  The journal lock will keep out checkpoint-mode
+ 	 * state transitions within journal_remove_checkpoint() and the buffer
+ 	 * is locked to keep bdflush/kupdate/whoever away from it as well.
+ 	 *
+ 	 * AKPM: we have replaced all the lock_journal_bh_wait() stuff with a
+ 	 * simple lock_journal().  This code here will care for locked buffers.
+ 	 */
+ 	/*
+ 	 * The buffer_locked() || buffer_dirty() tests here are simply an
+ 	 * optimisation tweak.  If anyone else in the system decides to
+ 	 * lock this buffer later on, we'll blow up.  There doesn't seem
+ 	 * to be a good reason why they should do this.
+ 	 */
+ 	if (jh->b_cp_transaction &&
+ 	    (buffer_locked(jh2bh(jh)) || buffer_dirty(jh2bh(jh)))) {
+ 		unlock_journal(journal);
+ 		lock_buffer(jh2bh(jh));
+ 		spin_lock(&journal_datalist_lock);
+ 		if (jh->b_cp_transaction && buffer_dirty(jh2bh(jh))) {
+ 			/* OK, we need to steal it */
+ 			JBUFFER_TRACE(jh, "stealing from checkpoint mode");
+ 			J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ 			J_ASSERT_JH(jh, jh->b_frozen_data == NULL);
+ 
+ 			J_ASSERT(handle->h_buffer_credits > 0);
+ 			handle->h_buffer_credits--;
+ 
+ 			/* This will clear BH_Dirty and set BH_JBDDirty. */
+ 			JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ 			__journal_file_buffer(jh, transaction, BJ_Reserved);
+ 
+ 			/* And pull it off BUF_DIRTY, onto BUF_CLEAN */
+ 			refile_buffer(jh2bh(jh));
+ 
+ 			/*
+ 			 * The buffer is now hidden from bdflush.   It is
+ 			 * metadata against the current transaction.
+ 			 */
+ 			JBUFFER_TRACE(jh, "steal from cp mode is complete");
+ 		}
+ 		spin_unlock(&journal_datalist_lock);
+ 		unlock_buffer(jh2bh(jh));
+ 		lock_journal(journal);
+ 	}
+ 
+ 	J_ASSERT_JH(jh, !buffer_locked(jh2bh(jh)));
+ 
+ 	error = -EROFS;
+ 	if (is_handle_aborted(handle)) 
+ 		goto out_unlocked;
+ 	error = 0;
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 
+ 	/* The buffer is already part of this transaction if
+ 	 * b_transaction or b_next_transaction points to it. */
+ 
+ 	if (jh->b_transaction == transaction ||
+ 	    jh->b_next_transaction == transaction)
+ 		goto done_locked;
+ 
+ 	/* If there is already a copy-out version of this buffer, then
+ 	 * we don't need to make another one. */
+ 
+ 	if (jh->b_frozen_data) {
+ 		JBUFFER_TRACE(jh, "has frozen data");
+ 		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ 		jh->b_next_transaction = transaction;
+ 
+ 		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+ 		handle->h_buffer_credits--;
+ 		goto done_locked;
+ 	}
+ 	
+ 	/* Is there data here we need to preserve? */
+ 
+ 	if (jh->b_transaction && jh->b_transaction != transaction) {
+ 		JBUFFER_TRACE(jh, "owned by older transaction");
+ 		J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ 		J_ASSERT_JH(jh, jh->b_transaction ==
+ 					journal->j_committing_transaction);
+ 
+ 		/* There is one case we have to be very careful about.
+ 		 * If the committing transaction is currently writing
+ 		 * this buffer out to disk and has NOT made a copy-out,
+ 		 * then we cannot modify the buffer contents at all
+ 		 * right now.  The essence of copy-out is that it is the
+ 		 * extra copy, not the primary copy, which gets
+ 		 * journaled.  If the primary copy is already going to
+ 		 * disk then we cannot do copy-out here. */
+ 
+ 		if (jh->b_jlist == BJ_Shadow) {
+ 			JBUFFER_TRACE(jh, "on shadow: sleep");
+ 			spin_unlock(&journal_datalist_lock);
+ 			unlock_journal(journal);
+ 			/* commit wakes up all shadow buffers after IO */
+ 			sleep_on(&jh2bh(jh)->b_wait);
+ 			lock_journal(journal);
+ 			goto repeat;
+ 		}
+ 			
+ 		/* Only do the copy if the currently-owning transaction
+ 		 * still needs it.  If it is on the Forget list, the
+ 		 * committing transaction is past that stage.  The
+ 		 * buffer had better remain locked during the kmalloc,
+ 		 * but that should be true --- we hold the journal lock
+ 		 * still and the buffer is already on the BUF_JOURNAL
+ 		 * list so won't be flushed. 
+ 		 *
+ 		 * Subtle point, though: if this is a get_undo_access,
+ 		 * then we will be relying on the frozen_data to contain
+ 		 * the new value of the committed_data record after the
+ 		 * transaction, so we HAVE to force the frozen_data copy
+ 		 * in that case. */
+ 
+ 		if (jh->b_jlist != BJ_Forget || force_copy) {
+ 			JBUFFER_TRACE(jh, "generate frozen data");
+ 			if (!frozen_buffer) {
+ 				JBUFFER_TRACE(jh, "allocate memory for buffer");
+ 				spin_unlock(&journal_datalist_lock);
+ 				unlock_journal(journal);
+ 				frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size,
+ 							    GFP_NOFS);
+ 				lock_journal(journal);
+ 				if (!frozen_buffer) {
+ 					printk(KERN_EMERG "%s: OOM for "
+ 					       "frozen_buffer\n", __FUNCTION__);
+ 					JBUFFER_TRACE(jh, "oom!");
+ 					error = -ENOMEM;
+ 					spin_lock(&journal_datalist_lock);
+ 					goto done_locked;
+ 				}
+ 				goto repeat;
+ 			}
+ 
+ 			jh->b_frozen_data = frozen_buffer;
+ 			frozen_buffer = NULL;
+ 			need_copy = 1;
+ 		}
+ 		jh->b_next_transaction = transaction;
+ 	}
+ 
+ 	J_ASSERT(handle->h_buffer_credits > 0);
+ 	handle->h_buffer_credits--;
+ 
+ 	/* Finally, if the buffer is not journaled right now, we need to
+ 	 * make sure it doesn't get written to disk before the caller
+ 	 * actually commits the new data. */
+ 
+ 	if (!jh->b_transaction) {
+ 		JBUFFER_TRACE(jh, "no transaction");
+ 		J_ASSERT_JH(jh, !jh->b_next_transaction);
+ 		jh->b_transaction = transaction;
+ 		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ 		__journal_file_buffer(jh, transaction, BJ_Reserved);
+ 	}
+ 	
+ done_locked:
+ 	spin_unlock(&journal_datalist_lock);
+ 	if (need_copy) {
+ 		struct page *page;
+ 		int offset;
+ 		char *source;
+ 
+ 		J_ASSERT_JH(jh, buffer_uptodate(jh2bh(jh)));
+ 		page = jh2bh(jh)->b_page;
+ 		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
+ 		source = kmap(page);
+ 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
+ 		kunmap(page);
+ 	}
+ 	
+ 
+ 	/* If we are about to journal a buffer, then any revoke pending
+ 	 * on it is no longer valid. */
+ 	journal_cancel_revoke(handle, jh);
+ 
+ out_unlocked:
+ 	if (frozen_buffer)
+ 		kfree(frozen_buffer);
+ 
+ 	JBUFFER_TRACE(jh, "exit");
+ 	return error;
+ }
+ 
+ int journal_get_write_access (handle_t *handle, struct buffer_head *bh)
+ {
+ 	journal_t *journal = handle->h_transaction->t_journal;
+ 	struct journal_head *jh;
+ 	int status;
+ 
+ 	/* Get the buffer's journal_head, then do the real work under
+ 	 * the journal lock: we do not want to get caught playing with
+ 	 * fields which the log thread also manipulates. */
+ 	jh = journal_add_journal_head(bh);
+ 	lock_journal(journal);
+ 	status = do_get_write_access(handle, jh, 0);
+ 	journal_unlock_journal_head(jh);
+ 	unlock_journal(journal);
+ 	return status;
+ }
+ 
+ 
+ /*
+  * journal_get_create_access: journal a newly created buffer_head
+  * (ie. getblk() returned a new buffer and we are going to populate it
+  * manually rather than reading off disk).  The buffer_head must stay
+  * locked until it has been completely filled with new data, so we can
+  * assert that the bh is not in use by any other running transaction.
+  *
+  * The buffer should already be locked by the caller by this point.
+  * There is no lock ranking violation: it was a newly created,
+  * unlocked buffer beforehand.
+  * Returns 0 on success, or -EROFS if the handle has been aborted. */
+ 
+ int journal_get_create_access (handle_t *handle, struct buffer_head *bh) 
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	struct journal_head *jh = journal_add_journal_head(bh);
+ 	int err;
+ 	
+ 	jbd_debug(5, "journal_head %p\n", jh);
+ 	lock_journal(journal);
+ 	err = -EROFS;
+ 	if (is_handle_aborted(handle))
+ 		goto out;
+ 	err = 0;
+ 	
+ 	JBUFFER_TRACE(jh, "entry");
+ 	/* The buffer may already belong to this transaction due to
+ 	 * pre-zeroing in the filesystem's new_block code.  It may also
+ 	 * be on the previous, committing transaction's lists, but it
+ 	 * HAS to be in Forget state in that case: the transaction must
+ 	 * have deleted the buffer for it to be reused here. */
+ 	J_ASSERT_JH(jh, (jh->b_transaction == transaction ||
+ 			 jh->b_transaction == NULL ||
+ 			 (jh->b_transaction == journal->j_committing_transaction &&
+ 			  jh->b_jlist == BJ_Forget)));
+ 
+ 	J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ 	J_ASSERT_JH(jh, buffer_locked(jh2bh(jh)));
+ 
+ 	J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+ 	handle->h_buffer_credits--;
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	if (jh->b_transaction == NULL) {
+ 		jh->b_transaction = transaction;
+ 		JBUFFER_TRACE(jh, "file as BJ_Reserved");
+ 		__journal_file_buffer(jh, transaction, BJ_Reserved);
+ 		JBUFFER_TRACE(jh, "refile");
+ 		refile_buffer(jh2bh(jh));
+ 	} else if (jh->b_transaction == journal->j_committing_transaction) {
+ 		JBUFFER_TRACE(jh, "set next transaction");
+ 		jh->b_next_transaction = transaction;
+ 	}
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	/*
+ 	 * akpm: ext3_alloc_branch can pick up new indirect blocks which
+ 	 * contain freed but then revoked metadata.  Cancel the revoke in
+ 	 * case the block is freed yet again and then reallocated as data:
+ 	 * that would cause a second revoke, which hits an assertion.
+ 	 */
+ 	JBUFFER_TRACE(jh, "cancelling revoke");
+ 	journal_cancel_revoke(handle, jh);
+ out:
+ 	/* Aborted handles come here too: always drop the jh reference. */
+ 	journal_unlock_journal_head(jh);
+ 	unlock_journal(journal);
+ 	return err;
+ }
+ 
+ 
+ 
+ /*
+  * journal_get_undo_access: Notify intent to modify metadata with non-
+  * rewindable consequences
+  *
+  * Sometimes there is a need to distinguish between metadata which has
+  * been committed to disk and that which has not.  The ext3fs code uses
+  * this for freeing and allocating space: we have to make sure that we
+  * do not reuse freed space until the deallocation has been committed,
+  * since if we overwrote that space we would make the delete
+  * un-rewindable in case of a crash.
+  * 
+  * To deal with that, journal_get_undo_access requests write access to a
+  * buffer for parts of non-rewindable operations such as delete
+  * operations on the bitmaps.  The journaling code must keep a copy of
+  * the buffer's contents prior to the undo_access call until such time
+  * as we know that the buffer has definitely been committed to disk.
+  * 
+  * We never need to know which transaction the committed data is part
+  * of: buffers touched here are guaranteed to be dirtied later and so
+  * will be committed to a new transaction in due course, at which point
+  * we can discard the old committed data pointer.
+  *
+  * Returns error number or 0 on success.  
+  */
+ 
+ int journal_get_undo_access (handle_t *handle, struct buffer_head *bh)
+ {
+ 	journal_t *journal = handle->h_transaction->t_journal;
+ 	int err;
+ 	struct journal_head *jh = journal_add_journal_head(bh);
+ 
+ 	JBUFFER_TRACE(jh, "entry");
+ 	lock_journal(journal);
+ 
+ 	/* Do this first --- it can drop the journal lock, so we want to
+ 	 * make sure that obtaining the committed_data is done
+ 	 * atomically wrt. completion of any outstanding commits. */
+ 	err = do_get_write_access (handle, jh, 1);
+ 	if (err)
+ 		goto out;
+ 	
+ 	if (!jh->b_committed_data) {
+ 		/* Copy out the current buffer contents into the
+ 		 * preserved, committed copy. */
+ 		JBUFFER_TRACE(jh, "generate b_committed data");
+ 		jh->b_committed_data = jbd_kmalloc(jh2bh(jh)->b_size, 
+ 						   GFP_NOFS);
+ 		if (!jh->b_committed_data) {
+ 			/* Pass the name via %s: it need not be a literal. */
+ 			printk(KERN_EMERG "%s: No memory for committed data!\n",
+ 				__FUNCTION__);
+ 			err = -ENOMEM;
+ 			goto out;
+ 		}
+ 		memcpy (jh->b_committed_data, jh2bh(jh)->b_data,
+ 				jh2bh(jh)->b_size);
+ 	}
+ 
+ out:
+ 	if (!err)
+ 		J_ASSERT_JH(jh, jh->b_committed_data);
+ 	journal_unlock_journal_head(jh);
+ 	unlock_journal(journal);
+ 	return err;
+ }
+ 
+ /* 
+  * journal_dirty_data: mark a buffer as containing dirty data which
+  * needs to be flushed before we can commit the current transaction.  
+  *
+  * The buffer is placed on the transaction's data list and is marked as
+  * belonging to the transaction.
+  *
+  * If `async' is set then the writeback will be initiated by the caller
+  * using submit_bh -> end_buffer_io_async.  We put the buffer onto
+  * t_async_datalist.
+  * 
+  * Returns error number or 0 on success.  
+  *
+  * journal_dirty_data() can be called via page_launder->ext3_writepage
+  * by kswapd.  So it cannot block.  Happily, there's nothing here
+  * which needs lock_journal if `async' is set.
+  *
+  * When the buffer is on the current transaction we freely move it
+  * between BJ_AsyncData and BJ_SyncData according to who tried to
+  * change its state last.
+  */
+ 
+ int journal_dirty_data (handle_t *handle, struct buffer_head *bh, int async)
+ {
+ 	journal_t *journal = handle->h_transaction->t_journal;
+ 	int need_brelse = 0;
+ 	int wanted_jlist = async ? BJ_AsyncData : BJ_SyncData;
+ 	struct journal_head *jh;
+ 
+ 	if (is_handle_aborted(handle))
+ 		return 0;	/* aborted: nothing is filed, still "success" */
+ 	
+ 	jh = journal_add_journal_head(bh);
+ 	JBUFFER_TRACE(jh, "entry");
+ 
+ 	/*
+ 	 * The buffer could *already* be dirty.  Writeout can start
+ 	 * at any time.
+ 	 */
+ 	jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
+ 
+ 	/*
+ 	 * What if the buffer is already part of a running transaction?
+ 	 *
+ 	 * There are two cases:
+ 	 * 1) It is part of the current running transaction.  Refile it,
+ 	 *    just in case we have allocated it as metadata, deallocated
+ 	 *    it, then reallocated it as data.
+ 	 * 2) It is part of the previous, still-committing transaction.
+ 	 *    If all we want to do is to guarantee that the buffer will be
+ 	 *    written to disk before this new transaction commits, then
+ 	 *    being sure that the *previous* transaction has this same
+ 	 *    property is sufficient for us!  Just leave it on its old
+ 	 *    transaction.
+ 	 *
+ 	 * In case (2), the buffer must not already exist as metadata
+ 	 * --- that would violate write ordering (a transaction is free
+ 	 * to write its data at any point, even before the previous
+ 	 * committing transaction has committed).  The caller must
+ 	 * never, ever allow this to happen: there's nothing we can do
+ 	 * about it in this layer.
+ 	 */
+ 	spin_lock(&journal_datalist_lock);
+ 	if (jh->b_transaction) {
+ 		JBUFFER_TRACE(jh, "has transaction");
+ 		if (jh->b_transaction != handle->h_transaction) {
+ 			JBUFFER_TRACE(jh, "belongs to older transaction");
+ 			J_ASSERT_JH(jh, jh->b_transaction ==
+ 					journal->j_committing_transaction);
+ 
+ 			/* @@@ IS THIS TRUE  ? */
+ 			/*
+ 			 * Not any more.  Scenario: someone does a write()
+ 			 * in data=journal mode.  The buffer's transaction has
+ 			 * moved into commit.  Then someone does another
+ 			 * write() to the file.  We do the frozen data copyout
+ 			 * and set b_next_transaction to point to j_running_t.
+ 			 * And while we're in that state, someone does a
+ 			 * writepage() in an attempt to pageout the same area
+ 			 * of the file via a shared mapping.  At present that
+ 			 * calls journal_dirty_data(), and we get right here.
+ 			 * It may be too late to journal the data.  Simply
+ 			 * falling through to the next test will suffice: the
+ 			 * data will be dirty and will be checkpointed.  The
+ 			 * ordering comments in the next comment block still
+ 			 * apply.
+ 			 */
+ 			//J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
+ 
+ 			/*
+ 			 * If we're journalling data, and this buffer was
+ 			 * subject to a write(), it could be metadata, forget
+ 			 * or shadow against the committing transaction.  Now,
+ 			 * someone has dirtied the same darn page via a mapping
+ 			 * and it is being writepage()'d.
+ 			 * We *could* just steal the page from commit, with some
+ 			 * fancy locking there.  Instead, we just skip it -
+ 			 * don't tie the page's buffers to the new transaction
+ 			 * at all.
+ 			 * Implication: if we crash before the writepage() data
+ 			 * is written into the filesystem, recovery will replay
+ 			 * the write() data.
+ 			 */
+ 			if (jh->b_jlist != BJ_None &&
+ 					jh->b_jlist != BJ_SyncData &&
+ 					jh->b_jlist != BJ_AsyncData) {
+ 				JBUFFER_TRACE(jh, "Not stealing");
+ 				goto no_journal;
+ 			}
+ 
+ 			/*
+ 			 * This buffer may be undergoing writeout in commit.  We
+ 			 * can't return from here and let the caller dirty it
+ 			 * again because that can cause the write-out loop in
+ 			 * commit to never terminate.
+ 			 */
+ 			if (!async && buffer_dirty(bh)) {
+ 				atomic_inc(&bh->b_count);
+ 				spin_unlock(&journal_datalist_lock);
+ 				need_brelse = 1;	/* dropped again at exit */
+ 				ll_rw_block(WRITE, 1, &bh);
+ 				wait_on_buffer(bh);
+ 				spin_lock(&journal_datalist_lock);
+ 				/* The buffer may become locked again at any
+ 				   time if it is redirtied */
+ 			}
+ 
+ 			/* journal_clean_data_list() may have got there first */
+ 			if (jh->b_transaction != NULL) {
+ 				JBUFFER_TRACE(jh, "unfile from commit");
+ 				__journal_unfile_buffer(jh);
+ 				jh->b_transaction = NULL;
+ 			}
+ 			/* The buffer will be refiled below */
+ 
+ 		}
+ 		/*
+ 		 * Special case --- the buffer might actually have been
+ 		 * allocated and then immediately deallocated in the previous,
+ 		 * committing transaction, so might still be left on that
+ 		 * transaction's metadata lists.
+ 		 */
+ 		if (jh->b_jlist != wanted_jlist) {
+ 			JBUFFER_TRACE(jh, "not on correct data list: unfile");
+ 			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
+ 			__journal_unfile_buffer(jh);
+ 			jh->b_transaction = NULL;
+ 			JBUFFER_TRACE(jh, "file as data");
+ 			__journal_file_buffer(jh, handle->h_transaction,
+ 						wanted_jlist);
+ 		}
+ 	} else {
+ 		JBUFFER_TRACE(jh, "not on a transaction");
+ 		__journal_file_buffer(jh, handle->h_transaction, wanted_jlist);
+ 	}
+ 	/*
+ 	 * We need to mark the buffer dirty and refile it inside the lock to
+ 	 * protect it from release by journal_try_to_free_buffer()
+ 	 *
+ 	 * We set ->b_flushtime to something small enough to typically keep
+ 	 * kupdate away from the buffer.
+ 	 *
+ 	 * We don't need to do a balance_dirty() - __block_commit_write()
+ 	 * does that.
+ 	 */
+ 	if (!async && !atomic_set_buffer_dirty(jh2bh(jh))) {
+ 		jh2bh(jh)->b_flushtime =
+ 			jiffies + journal->j_commit_interval + 1 * HZ;
+ 		refile_buffer(jh2bh(jh));
+ 	}
+ no_journal:
+ 	spin_unlock(&journal_datalist_lock);
+ 	if (need_brelse) {
+ 		BUFFER_TRACE(bh, "brelse");
+ 		__brelse(bh);	/* pairs with the atomic_inc() above */
+ 	}
+ 	JBUFFER_TRACE(jh, "exit");
+ 	journal_unlock_journal_head(jh);
+ 	return 0;
+ }
+ 
+ /* 
+  * journal_dirty_metadata: mark a buffer as containing dirty metadata
+  * which needs to be journaled as part of the current transaction.
+  *
+  * The buffer is placed on the transaction's metadata list and is marked
+  * as belonging to the transaction.  
+  *
+  * Special care needs to be taken if the buffer already belongs to the
+  * current committing transaction (in which case we should have frozen
+  * data present for that commit).  In that case, we don't relink the
+  * buffer: that only gets done when the old transaction finally
+  * completes its commit.
+  * 
+  * Returns error number or 0 on success.  
+  */
+ 
+ int journal_dirty_metadata (handle_t *handle, struct buffer_head *bh)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	struct journal_head *jh = bh2jh(bh);
+ 
+ 	jbd_debug(5, "journal_head %p\n", jh);
+ 	JBUFFER_TRACE(jh, "entry");
+ 	lock_journal(journal);
+ 
+ 	/* An aborted handle journals nothing: just drop the lock. */
+ 	if (!is_handle_aborted(handle)) {
+ 		spin_lock(&journal_datalist_lock);
+ 		set_bit(BH_JBDDirty, &bh->b_state);
+ 		set_buffer_flushtime(bh);
+ 
+ 		/* The buffer must already be attached to a transaction. */
+ 		J_ASSERT_JH(jh, jh->b_transaction != NULL);
+ 
+ 		if (jh->b_transaction == transaction) {
+ 			/*
+ 			 * Owned by this handle's transaction: file it on
+ 			 * the metadata list.  Frozen data is only expected
+ 			 * when an older transaction owns the buffer.
+ 			 */
+ 			J_ASSERT_JH(jh, jh->b_frozen_data == 0);
+ 			JBUFFER_TRACE(jh, "file as BJ_Metadata");
+ 			__journal_file_buffer(jh, transaction, BJ_Metadata);
+ 		} else {
+ 			/*
+ 			 * Metadata on another transaction's list must be
+ 			 * committing, and will be refiled once the commit
+ 			 * completes: leave it alone for now.
+ 			 */
+ 			JBUFFER_TRACE(jh, "already on other transaction");
+ 			J_ASSERT_JH(jh, jh->b_transaction ==
+ 					journal->j_committing_transaction);
+ 			J_ASSERT_JH(jh, jh->b_next_transaction == transaction);
+ 			/* And this case is illegal: we can't reuse another
+ 			 * transaction's data buffer, ever. */
+ 			/* FIXME: writepage() should be journalled */
+ 			J_ASSERT_JH(jh, jh->b_jlist != BJ_SyncData);
+ 		}
+ 		spin_unlock(&journal_datalist_lock);
+ 		JBUFFER_TRACE(jh, "exit");
+ 	}
+ 
+ 	unlock_journal(journal);
+ 	return 0;
+ }
+ 
+ #if 0
+ /* 
+  * journal_release_buffer: undo a get_write_access without any buffer
+  * updates, if the update decided in the end that it didn't need access.
+  *
+  * journal_get_write_access() can block, so it is quite possible for a
+  * journaling component to decide after the write access is returned
+  * that global state has changed and the update is no longer required.  */
+ 
+ void journal_release_buffer (handle_t *handle, struct buffer_head *bh)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	struct journal_head *jh = bh2jh(bh);
+ 
+ 	lock_journal(journal);
+ 	JBUFFER_TRACE(jh, "entry");
+ 
+ 	/* If the buffer is reserved but not modified by this
+ 	 * transaction (still on BJ_Reserved and not journal-dirty),
+ 	 * then it is safe to release it.  In all other cases, just
+ 	 * leave the buffer as it is. */
+ 	spin_lock(&journal_datalist_lock);
+ 	if (jh->b_jlist == BJ_Reserved && jh->b_transaction == transaction &&
+ 	    !buffer_jdirty(jh2bh(jh))) {
+ 		JBUFFER_TRACE(jh, "unused: refiling it");
+ 		handle->h_buffer_credits++;	/* hand the credit back */
+ 		__journal_refile_buffer(jh);
+ 	}
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	JBUFFER_TRACE(jh, "exit");
+ 	unlock_journal(journal);
+ }
+ #endif
+ 
+ /* 
+  * journal_forget: bforget() for potentially-journaled buffers.  We can
+  * only do the bforget if there are no commits pending against the
+  * buffer.  If the buffer is dirty in the current running transaction we
+  * can safely unlink it. 
+  *
+  * bh may not be a journalled buffer at all - it may be a non-JBD
+  * buffer which came off the hashtable.  Check for this.
+  *
+  * Decrements bh->b_count by one.
+  * 
+  * Allow this call even if the handle has aborted --- it may be part of
+  * the caller's cleanup after an abort.
+  */
+ 
+ void journal_forget (handle_t *handle, struct buffer_head *bh)
+ {
+ 	transaction_t *transaction = handle->h_transaction;
+ 	journal_t *journal = transaction->t_journal;
+ 	struct journal_head *jh;
+ 
+ 	BUFFER_TRACE(bh, "entry");
+ 
+ 	lock_journal(journal);
+ 	spin_lock(&journal_datalist_lock);
+ 
+ 	if (!buffer_jbd(bh))
+ 		goto not_jbd;	/* no journal_head: just release the bh */
+ 	jh = bh2jh(bh);
+ 
+ 	if (jh->b_transaction == handle->h_transaction) {
+ 		J_ASSERT_JH(jh, !jh->b_frozen_data);
+ 
+ 		/* If we are forgetting a buffer which is already part
+ 		 * of this transaction, then we can just drop it from
+ 		 * the transaction immediately. */
+ 		clear_bit(BH_Dirty, &bh->b_state);
+ 		clear_bit(BH_JBDDirty, &bh->b_state);
+ 
+ 		JBUFFER_TRACE(jh, "belongs to current transaction: unfile");
+ 		J_ASSERT_JH(jh, !jh->b_committed_data);
+ 
+ 		__journal_unfile_buffer(jh);
+ 		jh->b_transaction = 0;
+ 
+ 		/* 
+ 		 * We are no longer going to journal this buffer.
+ 		 * However, the commit of this transaction is still
+ 		 * important to the buffer: the delete that we are now
+ 		 * processing might obsolete an old log entry, so by
+ 		 * committing, we can satisfy the buffer's checkpoint.
+ 		 *
+ 		 * So, if we have a checkpoint on the buffer, we should
+ 		 * now refile the buffer on our BJ_Forget list so that
+ 		 * we know to remove the checkpoint after we commit. 
+ 		 */
+ 
+ 		if (jh->b_cp_transaction) {
+ 			__journal_file_buffer(jh, transaction, BJ_Forget);
+ 		} else {
+ 			__journal_remove_journal_head(bh);
+ 			__brelse(bh);
+ 			if (!buffer_jbd(bh)) {
+ 				/* journal_head is gone: discard the buffer */
+ 				spin_unlock(&journal_datalist_lock);
+ 				unlock_journal(journal);
+ 				__bforget(bh);
+ 				return;
+ 			}
+ 		}
+ 	} else if (jh->b_transaction) {
+ 		J_ASSERT_JH(jh, (jh->b_transaction == 
+ 				 journal->j_committing_transaction));
+ 		/* However, if the buffer is still owned by a prior
+ 		 * (committing) transaction, we can't drop it yet... */
+ 		JBUFFER_TRACE(jh, "belongs to older transaction");
+ 		/* ... but we CAN drop it from the new transaction if we
+ 		 * have also modified it since the original commit. */
+ 
+ 		if (jh->b_next_transaction) {
+ 			J_ASSERT(jh->b_next_transaction == transaction);
+ 			jh->b_next_transaction = NULL;
+ 		}
+ 	}
+ 
+ not_jbd:
+ 	spin_unlock(&journal_datalist_lock);
+ 	unlock_journal(journal);
+ 	__brelse(bh);	/* the b_count decrement promised to the caller */
+ 	return;
+ }
+ 
+ #if 0	/* Unused */
+ /*
+  * journal_sync_buffer: flush a potentially-journaled buffer to disk.
+  *
+  * Used for O_SYNC filesystem operations.  If the buffer is journaled,
+  * we need to complete the O_SYNC by waiting for the transaction to
+  * complete.  It is an error to call journal_sync_buffer before
+  * journal_stop!
+  */
+ 
+ void journal_sync_buffer(struct buffer_head *bh)
+ {
+ 	transaction_t *transaction;
+ 	journal_t *journal;
+ 	long sequence;
+ 	struct journal_head *jh;
+ 
+ 	/* If the buffer has no journal_head at all, there is nothing
+ 	 * for us to wait on here: simply return.  */
+ 	BUFFER_TRACE(bh, "entry");
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	if (!buffer_jbd(bh)) {
+ 		spin_unlock(&journal_datalist_lock);
+ 		return;
+ 	}
+ 	jh = bh2jh(bh);
+ 	if (jh->b_transaction == NULL) {
+ 		/* If the buffer has already been journaled, then this
+ 		 * is a noop. */
+ 		if (jh->b_cp_transaction == NULL) {
+ 			spin_unlock(&journal_datalist_lock);
+ 			return;
+ 		}
+ 		atomic_inc(&bh->b_count);	/* hold bh across the IO */
+ 		spin_unlock(&journal_datalist_lock);
+ 		ll_rw_block (WRITE, 1, &bh);
+ 		wait_on_buffer(bh);
+ 		__brelse(bh);
+ 		goto out;
+ 	}
+ 	
+ 	/* Otherwise, just wait until the transaction is synced to disk. */
+ 	transaction = jh->b_transaction;
+ 	journal = transaction->t_journal;
+ 	sequence = transaction->t_tid;
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	jbd_debug(2, "requesting commit for jh %p\n", jh);
+ 	log_start_commit (journal, transaction);
+ 	
+ 	while (tid_gt(sequence, journal->j_commit_sequence)) {
+ 		wake_up(&journal->j_wait_done_commit);
+ 		sleep_on(&journal->j_wait_done_commit);
+ 	}
+ 	JBUFFER_TRACE(jh, "exit");
+ out:
+ 	return;
+ }
+ #endif
+ 
+ /*
+  * journal_stop: all done for a particular handle.
+  *
+  * There is not much action needed here.  We just return any remaining
+  * buffer credits to the transaction and remove the handle.  The only
+  * complication is that we need to start a commit operation if the
+  * filesystem is marked for synchronous update.
+  *
+  * Returns 0 normally (including for a NULL handle, which is ignored),
+  * or -EIO if a journal_abort has been executed since the transaction
+  * began.  A nested handle (h_ref still > 0 after the drop) only has
+  * its reference count decremented.
+  */
+ 
+ int journal_stop(handle_t *handle)
+ {
+ 	transaction_t *transaction;
+ 	journal_t *journal;
+ 	int old_handle_count, err;
+ 
+ 	/* Only dereference the handle once we know it is not NULL. */
+ 	if (!handle)
+ 		return 0;
+ 	transaction = handle->h_transaction;
+ 	journal = transaction->t_journal;
+ 
+ 	J_ASSERT (transaction->t_updates > 0);
+ 	J_ASSERT (journal_current_handle() == handle);
+ 
+ 	err = is_handle_aborted(handle) ? -EIO : 0;
+ 
+ 	if (--handle->h_ref > 0) {
+ 		jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1,
+ 			  handle->h_ref);
+ 		return err;
+ 	}
+ 
+ 	jbd_debug(4, "Handle %p going down\n", handle);
+ 
+ 	/*
+ 	 * Implement synchronous transaction batching.  If the handle
+ 	 * was synchronous, don't force a commit immediately.  Let's
+ 	 * yield and let another thread piggyback onto this transaction.
+ 	 * Keep doing that while new threads continue to arrive.
+ 	 * It doesn't cost much - we're about to run a commit and sleep
+ 	 * on IO anyway.  Speeds up many-threaded, many-dir operations
+ 	 * by 30x or more...
+ 	 */
+ 	if (handle->h_sync) {
+ 		do {
+ 			old_handle_count = transaction->t_handle_count;
+ 			set_current_state(TASK_RUNNING);
+ 			current->policy |= SCHED_YIELD;
+ 			schedule();
+ 		} while (old_handle_count != transaction->t_handle_count);
+ 	}
+ 
+ 	current->journal_info = NULL;
+ 	transaction->t_outstanding_credits -= handle->h_buffer_credits;
+ 	transaction->t_updates--;
+ 	if (!transaction->t_updates) {
+ 		wake_up(&journal->j_wait_updates);
+ 		if (journal->j_barrier_count)
+ 			wake_up(&journal->j_wait_transaction_locked);
+ 	}
+ 
+ 	/* 
+ 	 * If the handle is marked SYNC, we need to set another commit
+ 	 * going!  We also want to force a commit if the current
+ 	 * transaction is occupying too much of the log, or if the
+ 	 * transaction is too old now.
+ 	 */
+ 	if (handle->h_sync ||
+ 			transaction->t_outstanding_credits >
+ 				journal->j_max_transaction_buffers ||
+ 			time_after_eq(jiffies, transaction->t_expires)) {
+ 		/* Do this even for aborted journals: an abort still
+ 		 * completes the commit thread, it just doesn't write
+ 		 * anything to disk. */
+ 		tid_t tid = transaction->t_tid;
+ 		
+ 		jbd_debug(2, "transaction too old, requesting commit for "
+ 					"handle %p\n", handle);
+ 		/* This is non-blocking */
+ 		log_start_commit(journal, transaction);
+ 		
+ 		/*
+ 		 * Special case: JFS_SYNC synchronous updates require us
+ 		 * to wait for the commit to complete.  
+ 		 */
+ 		if (handle->h_sync && !(current->flags & PF_MEMALLOC))
+ 			log_wait_commit(journal, tid);
+ 	}
+ 	kfree(handle);
+ 	return err;
+ }
+ 
+ /*
+  * For synchronous operations: force any uncommitted transactions
+  * to disk by running an empty synchronous handle, which reuses all of
+  * the handle batching code.  Returns 0 or a negative error number.
+  */
+ int journal_force_commit(journal_t *journal)
+ {
+ 	handle_t *handle;
+ 	int ret;
+ 
+ 	lock_kernel();
+ 	handle = journal_start(journal, 1);
+ 	if (IS_ERR(handle)) {
+ 		ret = PTR_ERR(handle);
+ 		goto out;
+ 	}
+ 	handle->h_sync = 1;
+ 	ret = journal_stop(handle);	/* -EIO if the handle was aborted */
+ out:
+ 	unlock_kernel();
+ 	return ret;
+ }
+ 
+ /*
+  *
+  * List management code snippets: various functions for manipulating the
+  * transaction buffer lists.
+  *
+  */
+ 
+ /*
+  * Append a buffer to a transaction list, given the transaction's list head
+  * pointer.
+  * journal_datalist_lock is held.
+  */
+ 
+ static inline void
+ __blist_add_buffer(struct journal_head **list, struct journal_head *jh)
+ {
+ 	struct journal_head *head = *list, *tail;
+ 
+ 	if (head == NULL) {		/* empty list: ring of one */
+ 		*list = jh->b_tnext = jh->b_tprev = jh;
+ 		return;
+ 	}
+ 	tail = head->b_tprev;		/* append at the tail: keeps order */
+ 	jh->b_tnext = head;
+ 	jh->b_tprev = tail;
+ 	tail->b_tnext = head->b_tprev = jh;
+ }
+ 
+ /* 
+  * Remove a buffer from a transaction list, given the transaction's list
+  * head pointer.
+  *
+  * Called with journal_datalist_lock held, and the journal may not
+  * be locked.
+  */
+ 
+ static inline void
+ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
+ {
+ 	struct journal_head *prev = jh->b_tprev, *next = jh->b_tnext;
+ 
+ 	/* If jh heads the list, advance the head (NULL if jh was alone) */
+ 	if (*list == jh)
+ 		*list = (next == jh) ? NULL : next;
+ 	prev->b_tnext = next;
+ 	next->b_tprev = prev;
+ }
+ 
+ /* 
+  * Remove a buffer from the appropriate transaction list.
+  *
+  * Note that this function can *change* the value of
+  * bh->b_transaction->t_sync_datalist, t_async_datalist, t_buffers, t_forget,
+  * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list.  If the caller
+  * is holding onto a copy of one of these pointers, it could go bad.
+  * Generally the caller needs to re-read the pointer from the transaction_t.
+  *
+  * If bh->b_jlist is BJ_SyncData or BJ_AsyncData then we may have been called
+  * via journal_try_to_free_buffer() or journal_clean_data_list().  In that
+  * case, journal_datalist_lock will be held, and the journal may not be locked.
+  */
+ void __journal_unfile_buffer(struct journal_head *jh)
+ {
+ 	struct journal_head **list = 0;
+ 	transaction_t * transaction;
+ 
+ 	assert_spin_locked(&journal_datalist_lock);
+ 	transaction = jh->b_transaction;
+ 
+ #ifdef __SMP__
+ 	J_ASSERT (current->lock_depth >= 0);
+ #endif
+ 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+ 
+ 	if (jh->b_jlist != BJ_None)
+ 		J_ASSERT_JH(jh, transaction != 0);
+ 
+ 	switch (jh->b_jlist) {	/* pick the list head jh is linked on */
+ 	case BJ_None:
+ 		return;		/* not on any list: nothing to unlink */
+ 	case BJ_SyncData:
+ 		list = &transaction->t_sync_datalist;
+ 		break;
+ 	case BJ_AsyncData:
+ 		list = &transaction->t_async_datalist;
+ 		break;
+ 	case BJ_Metadata:
+ 		transaction->t_nr_buffers--;
+ 		J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
+ 		list = &transaction->t_buffers;
+ 		break;
+ 	case BJ_Forget:
+ 		list = &transaction->t_forget;
+ 		break;
+ 	case BJ_IO:
+ 		list = &transaction->t_iobuf_list;
+ 		break;
+ 	case BJ_Shadow:
+ 		list = &transaction->t_shadow_list;
+ 		break;
+ 	case BJ_LogCtl:
+ 		list = &transaction->t_log_list;
+ 		break;
+ 	case BJ_Reserved:
+ 		list = &transaction->t_reserved_list;
+ 		break;
+ 	}
+ 	
+ 	__blist_del_buffer(list, jh);
+ 	jh->b_jlist = BJ_None;
+ 	if (test_and_clear_bit(BH_JBDDirty, &jh2bh(jh)->b_state)) {
+ 		set_bit(BH_Dirty, &jh2bh(jh)->b_state);	/* JBDDirty -> Dirty */
+ 	}
+ }
+ 
+ void journal_unfile_buffer(struct journal_head *jh)
+ {
+ 	spin_lock(&journal_datalist_lock);	/* the __ variant asserts it */
+ 	__journal_unfile_buffer(jh);
+ 	spin_unlock(&journal_datalist_lock);
+ }
+ 
+ /*
+  * Called from journal_try_to_free_buffers().  The journal is not
+  * locked. lru_list_lock is not held.
+  *
+  * Here we see why journal_datalist_lock is global and not per-journal.
+  * We cannot get back to this buffer's journal pointer without locking
+  * out journal_clean_data_list() in some manner.
+  *
+  * One could use journal_datalist_lock to get unracy access to a
+  * per-journal lock.
+  *
+  * Called with journal_datalist_lock held.
+  *
+  * Returns non-zero iff we were able to free the journal_head.
+  */
+ static int __journal_try_to_free_buffer(struct buffer_head *bh,
+ 					int *locked_or_dirty)
+ {
+ 	struct journal_head *jh;
+ 
+ 	assert_spin_locked(&journal_datalist_lock);
+ 
+ 	if (!buffer_jbd(bh))
+ 		return 1;	/* no journal_head attached: already free */
+ 	jh = bh2jh(bh);
+ 
+ 	if (buffer_locked(bh) || buffer_dirty(bh)) {
+ 		*locked_or_dirty = 1;	/* tell the caller why we gave up */
+ 		goto out;
+ 	}
+ 
+ 	if (!buffer_uptodate(bh))
+ 		goto out;
+ 
+ 	if (jh->b_next_transaction != 0)
+ 		goto out;
+ 
+ 	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
+ 		if (jh->b_jlist == BJ_SyncData || jh->b_jlist==BJ_AsyncData) {
+ 			/* A written-back ordered data buffer */
+ 			JBUFFER_TRACE(jh, "release data");
+ 			__journal_unfile_buffer(jh);
+ 			jh->b_transaction = 0;
+ 			__journal_remove_journal_head(bh);
+ 			__brelse(bh);
+ 		}
+ 	}
+ 	else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+ 		/* written-back checkpointed metadata buffer */
+ 		if (jh->b_jlist == BJ_None) {
+ 			JBUFFER_TRACE(jh, "remove from checkpoint list");
+ 			__journal_remove_checkpoint(jh);
+ 			__journal_remove_journal_head(bh);
+ 			__brelse(bh);
+ 		}
+ 	}
+ 	return !buffer_jbd(bh);	/* non-zero iff the journal_head is gone */
+ 
+ out:
+ 	return 0;
+ }
+ 
+ /*
+  * journal_try_to_free_buffers().  For all the buffers on this page,
+  * if they are fully written out ordered data, move them onto BUF_CLEAN
+  * so try_to_free_buffers() can reap them.  Called with lru_list_lock
+  * not held.  Does its own locking.
+  *
+  * This complicates JBD locking somewhat.  We aren't protected by the
+  * BKL here.  We wish to remove the buffer from its committing or
+  * running transaction's ->t_datalist via __journal_unfile_buffer.
+  *
+  * This may *change* the value of transaction_t->t_datalist, so anyone
+  * who looks at t_datalist needs to lock against this function.
+  *
+  * Even worse, someone may be doing a journal_dirty_data on this
+  * buffer.  So we need to lock against that.  journal_dirty_data()
+  * will come out of the lock with the buffer dirty, which makes it
+  * ineligible for release here.
+  *
+  * Who else is affected by this?  hmm...  Really the only contender
+  * is do_get_write_access() - it could be looking at the buffer while
+  * journal_try_to_free_buffer() is changing its state.  But that
+  * cannot happen because we never reallocate freed data as metadata
+  * while the data is part of a transaction.  Yes?
+  *
+  * This function returns non-zero if we wish try_to_free_buffers()
+  * to be called. We do this if the page is releasable by try_to_free_buffers().
+  * We also do it if the page has locked or dirty buffers and the caller wants
+  * us to perform sync or async writeout.
+  */
+ int journal_try_to_free_buffers(journal_t *journal,
+ 				struct page *page, int gfp_mask)
+ {
+ 	struct buffer_head *head, *next;
+ 	int locked_or_dirty = 0;
+ 	int call_ttfb = 1;
+ 
+ 	J_ASSERT(PageLocked(page));
+ 
+ 	/*
+ 	 * Walk the page's ring of buffers.  The successor is sampled
+ 	 * before each buffer is processed.
+ 	 */
+ 	head = page->buffers;
+ 	next = head;
+ 	spin_lock(&journal_datalist_lock);
+ 	do {
+ 		struct buffer_head *cur = next;
+ 
+ 		next = cur->b_this_page;
+ 		if (buffer_jbd(cur) &&
+ 		    !__journal_try_to_free_buffer(cur, &locked_or_dirty))
+ 			call_ttfb = 0;
+ 	} while (next != head);
+ 	spin_unlock(&journal_datalist_lock);
+ 
+ 	/*
+ 	 * If the VM wants us to do writeout, or to block on IO, or both,
+ 	 * and the page has locked or dirty buffers, let
+ 	 * try_to_free_buffers be called even if the page still has
+ 	 * journalled buffers.
+ 	 */
+ 	if ((gfp_mask & (__GFP_IO|__GFP_WAIT)) && locked_or_dirty)
+ 		call_ttfb = 1;
+ 	return call_ttfb;
+ }
+ 
+ /*
+  * This buffer is no longer needed: unfile it under journal_datalist_lock.
+  * If it is on an older transaction's checkpoint list (b_cp_transaction
+  * set), refile it on @transaction's BJ_Forget list to pin the buffer (and
+  * hence its checkpointing transaction) until @transaction commits.
+  * Otherwise remove its journal_head and release it with __brelse().
+  * Returns non-zero if JBD no longer has an interest in the buffer.
+  */
+ static int dispose_buffer(struct journal_head *jh,
+ 		transaction_t *transaction)
+ {
+ 	int may_free = 1;
+ 	struct buffer_head *bh = jh2bh(jh);
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	__journal_unfile_buffer(jh);
+ 	jh->b_transaction = 0;	/* __journal_file_buffer() then sets the new owner */
+ 
+ 	if (jh->b_cp_transaction) {
+ 		JBUFFER_TRACE(jh, "on running+cp transaction");
+ 		__journal_file_buffer(jh, transaction, BJ_Forget);
+ 		clear_bit(BH_JBDDirty, &bh->b_state);
+ 		may_free = 0;	/* still filed on BJ_Forget: JBD keeps the buffer */
+ 	} else {
+ 		JBUFFER_TRACE(jh, "on running transaction");
+ 		__journal_remove_journal_head(bh);
+ 		__brelse(bh);
+ 	}
+ 	spin_unlock(&journal_datalist_lock);
+ 	return may_free;
+ }
+ 
+ /*
+  * journal_flushpage 
+  *
+  * This code is tricky.  It has a number of cases to deal with.
+  *
+  * There are two invariants which this code relies on:
+  *
+  * i_size must be updated on disk before we start calling flushpage on the
+  * data.
+  * 
+  *  This is done in ext3 by defining an ext3_setattr method which
+  *  updates i_size before truncate gets going.  By maintaining this
+  *  invariant, we can be sure that it is safe to throw away any buffers
+  *  attached to the current transaction: once the transaction commits,
+  *  we know that the data will not be needed.
+  * 
+  *  Note however that we can *not* throw away data belonging to the
+  *  previous, committing transaction!  
+  *
+  * Any disk blocks which *are* part of the previous, committing
+  * transaction (and which therefore cannot be discarded immediately) are
+  * not going to be reused in the new running transaction
+  *
+  *  The bitmap committed_data images guarantee this: any block which is
+  *  allocated in one transaction and removed in the next will be marked
+  *  as in-use in the committed_data bitmap, so cannot be reused until
+  *  the next transaction to delete the block commits.  This means that
+  *  leaving committing buffers dirty is quite safe: the disk blocks
+  *  cannot be reallocated to a different file and so buffer aliasing is
+  *  not possible.
+  *
+  *
+  * The above applies mainly to ordered data mode.  In writeback mode we
+  * don't make guarantees about the order in which data hits disk --- in
+  * particular we don't guarantee that new dirty data is flushed before
+  * transaction commit --- so it is always safe just to discard data
+  * immediately in that mode.  --sct 
+  */
+ 
+ /*
+  * The journal_unmap_buffer helper function returns zero if the buffer
+  * concerned remains pinned as an anonymous buffer belonging to an older
+  * transaction, and non-zero if the caller may go on to free it.
+  *
+  * We're outside-transaction here.  Either or both of j_running_transaction
+  * and j_committing_transaction may be NULL.
+  */
+ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
+ {
+ 	transaction_t *transaction;
+ 	struct journal_head *jh;
+ 	int may_free = 1;
+ 
+ 	BUFFER_TRACE(bh, "entry");
+ 
+ 	if (!buffer_mapped(bh))
+ 		return 1;	/* nothing to unmap */
+ 
+ 	/* It is safe to proceed here without the
+ 	 * journal_datalist_lock because the buffers cannot be
+ 	 * stolen by try_to_free_buffers as long as we are holding the
+ 	 * page lock. --sct */
+ 
+ 	if (!buffer_jbd(bh))
+ 		goto zap_buffer;	/* no journal_head attached */
+ 
+ 	jh = bh2jh(bh);
+ 	transaction = jh->b_transaction;
+ 	if (transaction == NULL) {
+ 		/* First case: not on any transaction.  If it
+ 		 * has no checkpoint link, then we can zap it:
+ 		 * it's a writeback-mode buffer so we don't care
+ 		 * if it hits disk safely. */
+ 		if (!jh->b_cp_transaction) {
+ 			JBUFFER_TRACE(jh, "not on any transaction: zap");
+ 			goto zap_buffer;
+ 		}
+ 		
+ 		if (!buffer_dirty(bh)) {
+ 			/* bdflush has written it.  We can drop it now */
+ 			goto zap_buffer;
+ 		}
+ 
+ 		/* OK, it must be in the journal but still not
+ 		 * written fully to disk: it's metadata or
+ 		 * journaled data... */
+ 
+ 		if (journal->j_running_transaction) {
+ 			/* ... and once the current transaction has
+ 			 * committed, the buffer won't be needed any
+ 			 * longer. */
+ 			JBUFFER_TRACE(jh, "checkpointed: add to BJ_Forget");
+ 			return dispose_buffer(jh,
+ 					journal->j_running_transaction);
+ 		} else {
+ 			/* There is no currently-running transaction. So the
+ 			 * orphan record which we wrote for this file must have
+ 			 * passed into commit.  We must attach this buffer to
+ 			 * the committing transaction, if it exists. */
+ 			if (journal->j_committing_transaction) {
+ 				JBUFFER_TRACE(jh, "give to committing trans");
+ 				return dispose_buffer(jh,
+ 					journal->j_committing_transaction);
+ 			} else {
+ 				/* The orphan record's transaction has
+ 				 * committed.  We can cleanse this buffer */
+ 				clear_bit(BH_JBDDirty, &bh->b_state);
+ 				goto zap_buffer;
+ 			}
+ 		}
+ 	} else if (transaction == journal->j_committing_transaction) {
+ 		/* If it is committing, we simply cannot touch it.  We
+ 		 * can remove its next_transaction pointer from the
+ 		 * running transaction if that is set, but nothing
+ 		 * else. */
+ 		JBUFFER_TRACE(jh, "on committing transaction");
+ 		if (jh->b_next_transaction) {
+ 			J_ASSERT(jh->b_next_transaction ==
+ 					journal->j_running_transaction);
+ 			jh->b_next_transaction = NULL;
+ 		}
+ 		return 0;	/* pinned: skips the zap_buffer cleanup below */
+ 	} else {
+ 		/* Good, the buffer belongs to the running transaction.
+ 		 * We are writing our own transaction's data, not any
+ 		 * previous one's, so it is safe to throw it away
+ 		 * (remember that we expect the filesystem to have set
+ 		 * i_size already for this truncate so recovery will not
+ 		 * expose the disk blocks we are discarding here.) */
+ 		J_ASSERT_JH(jh, transaction == journal->j_running_transaction);
+ 		may_free = dispose_buffer(jh, transaction);
+ 	}
+ 
+ zap_buffer:	/* clear the dirty, uptodate, mapped, req and new state */
+ 	if (buffer_dirty(bh))
+ 		mark_buffer_clean(bh);
+ 	J_ASSERT_BH(bh, !buffer_jdirty(bh));
+ 	clear_bit(BH_Uptodate, &bh->b_state);
+ 	clear_bit(BH_Mapped, &bh->b_state);
+ 	clear_bit(BH_Req, &bh->b_state);
+ 	clear_bit(BH_New, &bh->b_state);
+ 	return may_free;
+ }
+ 
+ /* journal_flushpage: unmap every buffer of the locked @page that starts at or
+  * beyond byte @offset.  Returns zero only when @offset is 0 and the page's
+  * buffers could not all be released; returns non-zero otherwise. */
+ int journal_flushpage(journal_t *journal, 
+ 		      struct page *page, 
+ 		      unsigned long offset)
+ {
+ 	struct buffer_head *head, *bh, *next;
+ 	unsigned int curr_off = 0;	/* byte offset of bh within the page */
+ 	int may_free = 1;
+ 		
+ 	if (!PageLocked(page))
+ 		BUG();
+ 	if (!page->buffers)
+ 		return 1;	/* no buffers attached: nothing to do */
+ 
+ 	/* We will potentially be playing with lists other than just the
+ 	 * data lists (especially for journaled data mode), so be
+ 	 * cautious in our locking. */
+ 	lock_journal(journal);
+ 
+ 	head = bh = page->buffers;
+ 	do {
+ 		unsigned int next_off = curr_off + bh->b_size;
+ 		next = bh->b_this_page;
+ 
+ 		/* AKPM: doing lock_buffer here may be overly paranoid */
+ 		if (offset <= curr_off) {
+ 		 	/* This block is wholly outside the truncation point */
+ 			lock_buffer(bh);
+ 			may_free &= journal_unmap_buffer(journal, bh);
+ 			unlock_buffer(bh);
+ 		}
+ 		curr_off = next_off;
+ 		bh = next;
+ 
+ 	} while (bh != head);
+ 	
+ 	unlock_journal(journal);
+ 
+ 	if (!offset) {	/* every buffer was unmapped above: try to free them all */
+ 		if (!may_free || !try_to_free_buffers(page, 0)) {
+ 			atomic_inc(&buffermem_pages);	/* NOTE(review): reason not visible here - confirm */
+ 			return 0;
+ 		}
+ 		J_ASSERT(page->buffers == NULL);
+ 	}
+ 	
+ 	return 1;
+ }
+ 
+ 
+ 
+ /*
+  * File @jh on list @jlist of @transaction; journal_datalist_lock must be held.
+  */
+ 
+ void __journal_file_buffer(struct journal_head *jh,
+ 			transaction_t *transaction, int jlist)
+ {
+ 	struct journal_head **list = 0;	/* head of the chosen list, set by the switch */
+ 
+ 	assert_spin_locked(&journal_datalist_lock);
+ 	
+ #ifdef __SMP__	/* NOTE(review): confirm __SMP__ is defined by this kernel's SMP builds */
+ 	J_ASSERT (current->lock_depth >= 0);
+ #endif
+ 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
+ 	J_ASSERT_JH(jh, jh->b_transaction == transaction ||
+ 				jh->b_transaction == 0);
+ 
+ 	if (jh->b_transaction) {
+ 		if (jh->b_jlist == jlist)
+ 			return;		/* already on the requested list */
+ 		__journal_unfile_buffer(jh);
+ 	} else {
+ 		jh->b_transaction = transaction;
+ 	}
+ 
+ 	switch (jlist) {
+ 	case BJ_None:
+ 		J_ASSERT_JH(jh, !jh->b_committed_data);
+ 		J_ASSERT_JH(jh, !jh->b_frozen_data);
+ 		return;			/* owned by the transaction, on no list */
+ 	case BJ_SyncData:
+ 		list = &transaction->t_sync_datalist;
+ 		break;
+ 	case BJ_AsyncData:
+ 		list = &transaction->t_async_datalist;
+ 		break;
+ 	case BJ_Metadata:
+ 		transaction->t_nr_buffers++;
+ 		list = &transaction->t_buffers;
+ 		break;
+ 	case BJ_Forget:
+ 		list = &transaction->t_forget;
+ 		break;
+ 	case BJ_IO:
+ 		list = &transaction->t_iobuf_list;
+ 		break;
+ 	case BJ_Shadow:
+ 		list = &transaction->t_shadow_list;
+ 		break;
+ 	case BJ_LogCtl:
+ 		list = &transaction->t_log_list;
+ 		break;
+ 	case BJ_Reserved:
+ 		list = &transaction->t_reserved_list;
+ 		break;
+ 	}	/* NOTE(review): no default case; an unlisted jlist leaves list == 0 */
+ 
+ 	__blist_add_buffer(list, jh);
+ 	jh->b_jlist = jlist;
+ 
+ 	if (jlist == BJ_Metadata || jlist == BJ_Reserved || 
+ 	    jlist == BJ_Shadow || jlist == BJ_Forget) {
+ 		if (atomic_set_buffer_clean(jh2bh(jh))) {	/* presumably: was dirty - confirm */
+ 			set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
+ 		}
+ 	}
+ }
+ 
+ void journal_file_buffer(struct journal_head *jh,
+ 				transaction_t *transaction, int jlist)
+ {
+ 	spin_lock(&journal_datalist_lock);	/* __journal_file_buffer() asserts this is held */
+ 	__journal_file_buffer(jh, transaction, jlist);
+ 	spin_unlock(&journal_datalist_lock);
+ }
+ 
+ /*
+  * Remove a buffer from its current buffer list in preparation for
+  * dropping it from its current transaction entirely.  If the buffer has
+  * already started to be used by a subsequent transaction, refile the
+  * buffer on that transaction's metadata list.  Needs journal_datalist_lock.
+  */
+ 
+ void __journal_refile_buffer(struct journal_head *jh)
+ {
+ 	assert_spin_locked(&journal_datalist_lock);
+ #ifdef __SMP__	/* NOTE(review): confirm __SMP__ is defined by this kernel's SMP builds */
+ 	J_ASSERT_JH(jh, current->lock_depth >= 0);
+ #endif
+ 	__journal_unfile_buffer(jh);
+ 
+ 	/* If the buffer is now unused, just drop it.  If it has been
+ 	   modified by a later transaction, add it to the new
+ 	   transaction's metadata list. */
+ 
+ 	jh->b_transaction = jh->b_next_transaction;	/* may be NULL */
+ 	jh->b_next_transaction = NULL;
+ 
+ 	if (jh->b_transaction != NULL) {
+ 		__journal_file_buffer(jh, jh->b_transaction, BJ_Metadata);
+ 		J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING);
+ 	} else {
+ 		/* Onto BUF_DIRTY for writeback */
+ 		refile_buffer(jh2bh(jh));
+ 	}
+ }
+ 
+ /*
+  * For the unlocked version of this call, also make sure that any
+  * hanging journal_head is cleaned up if necessary.
+  *
+  * __journal_refile_buffer is usually called as part of a single locked
+  * operation on a buffer_head, in which the caller is probably going to
+  * be hooking the journal_head onto other lists.  In that case it is up
+  * to the caller to remove the journal_head if necessary.  For the
+  * unlocked journal_refile_buffer call, the caller isn't going to be
+  * doing anything else to the buffer so we need to do the cleanup
+  * ourselves to avoid a jh leak.
+  *
+  * *** The journal_head may be freed by this call! ***
+  */
+ void journal_refile_buffer(struct journal_head *jh)
+ {
+ 	struct buffer_head *bh;
+ 
+ 	spin_lock(&journal_datalist_lock);
+ 	bh = jh2bh(jh);		/* fetch bh up front: jh may be freed below */
+ 
+ 	__journal_refile_buffer(jh);
+ 	__journal_remove_journal_head(bh);
+ 
+ 	spin_unlock(&journal_datalist_lock);
+ 	__brelse(bh);		/* called only after the lock is dropped */
+ }
diff -rc2P linux/fs/jbd-kernel.c linux-2.4.13/fs/jbd-kernel.c
*** linux/fs/jbd-kernel.c	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/fs/jbd-kernel.c	Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,336 ----
+ /*
+  * fs/jbd-kernel.c
+  *
+  * Support code for the Journalling Block Device layer.
+  * This file contains things which have to be in-kernel when
+  * JBD is a module.
+  *
+  * 15 May 2001	Andrew Morton <andrewm@uow.edu.au>
+  *	Created
+  */
+ 
+ #include <linux/config.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/module.h>
+ #include <linux/sched.h>
+ 
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ 
+ /*
+  * jh_splice_lock needs explanation.
+  *
+  * In a number of places we want to do things like:
+  *
+  *	if (buffer_jbd(bh) && bh2jh(bh)->foo)
+  *
+  * This is racy on SMP, because another CPU could remove the journal_head
+  * in the middle of this expression.  We need locking.
+  *
+  * But we can greatly optimise the locking cost by testing BH_JBD
+  * outside the lock.  So, effectively:
+  *
+  *	ret = 0;
+  *	if (buffer_jbd(bh)) {
+  *		spin_lock(&jh_splice_lock);
+  *		if (buffer_jbd(bh)) {	 (* Still there? *)
+  *			ret = bh2jh(bh)->foo;
+  *		}
+  *		spin_unlock(&jh_splice_lock);
+  *	}
+  *	return ret;
+  *
+  * Now, that protects us from races where another CPU can remove the
+  * journal_head.  But it doesn't defend us from the situation where another
+  * CPU can *add* a journal_head.  This is a correctness issue.  But it's not
+  * a problem because a) the calling code was *already* racy and b) it often
+  * can't happen at the call site and c) the places where we add journal_heads
+  * tend to be under external locking.
+  */
+ spinlock_t jh_splice_lock = SPIN_LOCK_UNLOCKED;
+ EXPORT_SYMBOL(jh_splice_lock);
+ 
+ #ifdef CONFIG_JBD_DEBUG
+ /*
+  * Sanity checks on a journalled buffer that is about to be marked clean.
+  * Called from mark_buffer_clean(), so it must be present in the main kernel.
+  */
+ 
+ void jbd_preclean_buffer_check(struct buffer_head *bh)
+ {
+ 	if (buffer_jbd(bh)) {
+ 		struct journal_head *jh = bh2jh(bh);
+ 
+ 		transaction_t *transaction = jh->b_transaction;
+ 		journal_t *journal;
+ 
+ 		if (jh->b_jlist == 0 && transaction == NULL)
+ 			return;		/* journal_head present but unfiled */
+ 
+ 		J_ASSERT_JH(jh, (jh->b_jlist == 0 ||
+ 				 jh->b_jlist == BJ_LogCtl ||
+ 				 jh->b_jlist == BJ_IO ||
+ 				 jh->b_jlist == BJ_Forget ||
+ 				 buffer_jbd_data(bh)));
+ 		J_ASSERT_JH(jh, transaction != NULL);
+ 		/* The kernel may be unmapping old data.  We expect it
+ 		 * to be dirty in that case, unless the buffer has
+ 		 * already been forgotten by a transaction. */
+ 		if (jh->b_jlist != BJ_Forget) {
+ #if 1	/* report and bail out rather than trip the assertion that follows */
+ 			if (!buffer_dirty(bh)) {
+ 				printk(__FUNCTION__": clean of clean buffer\n");
+ 				print_buffer_trace(bh);
+ 				return;
+ 			}
+ #endif
+ 			J_ASSERT_BH(bh, buffer_dirty(bh));
+ 			if (!buffer_jbd_data(bh)) {
+ 				J_ASSERT_JH(jh,
+ 					    test_bit(BH_JWrite, 
+ 						     &jh2bh(jh)->b_state));
+ 			}
+ 		}
+ 		
+ 		journal = transaction->t_journal;
+ 		J_ASSERT_JH(jh,		/* owner must be the running or committing one */
+ 			    transaction == journal->j_running_transaction ||
+ 			    transaction == journal->j_committing_transaction);
+ 	}
+ }
+ EXPORT_SYMBOL(jbd_preclean_buffer_check);
+ #endif		/* CONFIG_JBD_DEBUG */
+ 
+ /*
+  * Entries in /proc/sys/fs
+  */
+ 
+ int journal_oom_retry = 1;
+ EXPORT_SYMBOL(journal_oom_retry);
+ #if defined(CONFIG_JBD_DEBUG)
+ int journal_enable_debug;
+ int journal_no_write[2];
+ EXPORT_SYMBOL(journal_enable_debug);
+ EXPORT_SYMBOL(journal_no_write);
+ #endif
+ 
+ #endif	/* defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) */
+ 
+ /*
+  * Support functions for BUFFER_TRACE()
+  */
+ #ifdef CONFIG_BUFFER_DEBUG
+ 
+ static spinlock_t trace_lock = SPIN_LOCK_UNLOCKED;
+ 
+ void buffer_trace(struct buffer_head *dest,
+ 		struct buffer_head *src, char *info)
+ {
+ 	struct buffer_history_item *bhist_i;
+ 	unsigned long flags;
+ 	/* Append a snapshot of src's state, tagged with info, to dest's history. */
+ 	if (dest == 0 || src == 0)
+ 		return;
+ 
+ 	spin_lock_irqsave(&trace_lock, flags);
+ 
+ 	/*
+ 	 * Sometimes we don't initialise the ring pointers. (locally declared
+ 	 * temp buffer_heads). Feebly attempt to detect and correct that here.
+ 	 */
+ 	if ((dest->b_history.b_history_head - dest->b_history.b_history_tail >
+ 				BUFFER_HISTORY_SIZE)) {
+ 		dest->b_history.b_history_head = 0;
+ 		dest->b_history.b_history_tail = 0;
+ 	}
+ 	bhist_i = dest->b_history.b +	/* slot = head modulo the ring size */
+ 		(dest->b_history.b_history_head & (BUFFER_HISTORY_SIZE - 1));
+ 	bhist_i->info = info;		/* the pointer is kept, the string is not copied */
+ 	bhist_i->b_state = src->b_state;
+ 	bhist_i->b_list = src->b_list;
+ 	bhist_i->b_blocknr = src->b_blocknr;	/* printed even without JBD: always record */
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ 	bhist_i->b_trans_is_running = 0;
+ 	bhist_i->b_trans_is_committing = 0;
+ 	if (buffer_jbd(src)) {
+ 		struct journal_head *jh;
+ 		journal_t *journal;
+ 		transaction_t *transaction;
+ 
+ 		/* Footwork to avoid racing with journal_remove_journal_head */
+ 		jh = src->b_private;
+ 		if (jh == 0)
+ 			goto raced;
+ 		transaction = jh->b_transaction;
+ 		if (src->b_private == 0)
+ 			goto raced;
+ 		bhist_i->b_jcount = jh->b_jcount;
+ 		bhist_i->b_jbd = 1;
+ 		bhist_i->b_jlist = jh->b_jlist;
+ 		bhist_i->b_frozen_data = jh->b_frozen_data;
+ 		bhist_i->b_committed_data = jh->b_committed_data;
+ 		bhist_i->b_transaction = !!jh->b_transaction;
+ 		bhist_i->b_next_transaction = !!jh->b_next_transaction;
+ 		bhist_i->b_cp_transaction = !!jh->b_cp_transaction;
+ 
+ 		if (transaction) {
+ 			journal = transaction->t_journal;
+ 			bhist_i->b_trans_is_running = transaction ==
+ 					journal->j_running_transaction;
+ 			bhist_i->b_trans_is_committing = transaction ==
+ 					journal->j_committing_transaction;
+ 		}
+ 	} else {
+ raced:		/* no journal_head, or it vanished: record zeroed JBD fields */
+ 		bhist_i->b_jcount = 0;
+ 		bhist_i->b_jbd = 0;
+ 		bhist_i->b_jlist = 0;
+ 		bhist_i->b_frozen_data = 0;
+ 		bhist_i->b_committed_data = 0;
+ 		bhist_i->b_transaction = 0;
+ 		bhist_i->b_next_transaction = 0;
+ 		bhist_i->b_cp_transaction = 0;
+ 	}
+ #endif	/* defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) */
+ 
+ 	bhist_i->on_lru = (src->b_prev_free != 0 && src->b_next_free != 0);
+ 	bhist_i->on_hash = (src->b_pprev != 0);
+ 	bhist_i->cpu = smp_processor_id();
+ 	bhist_i->b_count = atomic_read(&src->b_count);
+ 
+ 	dest->b_history.b_history_head++;
+ 	if (dest->b_history.b_history_head - dest->b_history.b_history_tail >
+ 				BUFFER_HISTORY_SIZE)
+ 		dest->b_history.b_history_tail =	/* ring full: drop the oldest */
+ 			dest->b_history.b_history_head - BUFFER_HISTORY_SIZE;
+ 
+ 	spin_unlock_irqrestore(&trace_lock, flags);
+ }
+ 
+ static const char *b_list_to_string(unsigned int b_list)
+ {	/* Printable name of a recorded b_list value; used by print_one_hist(). */
+ 	switch (b_list) {
+ 	case BUF_CLEAN:		return "BUF_CLEAN";
+ 	case BUF_LOCKED:	return "BUF_LOCKED";
+ 	case BUF_DIRTY:		return "BUF_DIRTY";
+ 	default:		return "Bad b_list";	/* none of the three above */
+ 	}
+ }
+ 
+ static const char *b_jlist_to_string(unsigned int b_list)
+ {	/* Printable name of a recorded b_jlist value (the parameter is named b_list). */
+ 	switch (b_list) {
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ 	case BJ_None:		return "BJ_None";
+ 	case BJ_SyncData:	return "BJ_SyncData";
+ 	case BJ_AsyncData:	return "BJ_AsyncData";
+ 	case BJ_Metadata:	return "BJ_Metadata";
+ 	case BJ_Forget:		return "BJ_Forget";
+ 	case BJ_IO:		return "BJ_IO";
+ 	case BJ_Shadow:		return "BJ_Shadow";
+ 	case BJ_LogCtl:		return "BJ_LogCtl";
+ 	case BJ_Reserved:	return "BJ_Reserved";
+ #endif
+ 	default:		return "Bad b_jlist";	/* also every value when JBD is not configured */
+ 	}
+ }
+ 
+ static void print_one_hist(struct buffer_history_item *bhist_i)
+ {	/* printk() one recorded history entry, one group of fields per line. */
+ 	printk(" %s\n", bhist_i->info);
+ 	printk("     b_state:0x%lx b_list:%s b_jlist:%s on_lru:%d\n",
+ 			bhist_i->b_state,
+ 			b_list_to_string(bhist_i->b_list),
+ 			b_jlist_to_string(bhist_i->b_jlist),
+ 			bhist_i->on_lru);
+ 	printk("     cpu:%d on_hash:%d b_count:%d b_blocknr:%lu\n",
+ 			bhist_i->cpu,
+ 			bhist_i->on_hash,
+ 			bhist_i->b_count,
+ 			bhist_i->b_blocknr);
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ 	printk("     b_jbd:%u b_frozen_data:%p b_committed_data:%p\n",
+ 			bhist_i->b_jbd,
+ 			bhist_i->b_frozen_data,
+ 			bhist_i->b_committed_data);
+ 	printk("     b_transaction:%u b_next_transaction:%u "
+ 			"b_cp_transaction:%u b_trans_is_running:%u\n",
+ 			bhist_i->b_transaction,
+ 			bhist_i->b_next_transaction,
+ 			bhist_i->b_cp_transaction,
+ 			bhist_i->b_trans_is_running);
+ 	printk("     b_trans_is_committing:%u b_jcount:%u ",	/* label matches the field name */
+ 			bhist_i->b_trans_is_committing,
+ 			bhist_i->b_jcount);
+ #endif
+ 	printk("\n");	/* ends the JBD line above; a bare newline without JBD */
+ }
+ 
+ void print_buffer_fields(struct buffer_head *bh)
+ {	/* printk() bh's current fields, plus its journal_head's when it has one. */
+ 	printk("b_next:%p, b_blocknr:%lu b_count:%d b_flushtime:%lu\n",
+ 		bh->b_next, bh->b_blocknr, atomic_read(&bh->b_count),
+ 			bh->b_flushtime);
+ 	printk("b_next_free:%p b_prev_free:%p b_this_page:%p b_reqnext:%p\n",
+ 		bh->b_next_free, bh->b_prev_free, bh->b_this_page,
+ 			bh->b_reqnext);
+ 	printk("b_pprev:%p b_data:%p b_page:%p b_inode:%p b_list:%d\n",
+ 		bh->b_pprev, bh->b_data, bh->b_page, bh->b_inode, bh->b_list);
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ 	if (buffer_jbd(bh)) {
+ 		struct journal_head *jh = bh2jh(bh);
+ 
+ 		printk("b_jlist:%u b_frozen_data:%p b_committed_data:%p\n",
+ 			jh->b_jlist, jh->b_frozen_data, jh->b_committed_data);
+ 		printk(" b_transaction:%p b_next_transaction:%p "
+ 				"b_cp_transaction:%p\n",
+ 			jh->b_transaction, jh->b_next_transaction,
+ 			jh->b_cp_transaction);
+ 		printk("b_cpnext:%p b_cpprev:%p\n",
+ 			jh->b_cpnext, jh->b_cpprev);
+ 	}
+ #endif
+ }
+ 
+ void print_buffer_trace(struct buffer_head *bh)
+ {
+ #ifdef CONFIG_X86
+ 	extern void show_stack(unsigned long * esp);
+ #endif
+ 	/* Dump bh's recorded history from the tail up to the head, then its fields. */
+ 	unsigned long idx, count;
+ 	unsigned long flags;
+ 
+ 	printk("buffer trace for buffer at 0x%p (I am CPU %d)\n",
+ 			bh, smp_processor_id());
+ 	BUFFER_TRACE(bh, "");		/* Record state now */
+ 
+ 	spin_lock_irqsave(&trace_lock, flags);
+ 	for (	idx = bh->b_history.b_history_tail, count = 0;
+ 		idx < bh->b_history.b_history_head &&
+ 			count < BUFFER_HISTORY_SIZE;	/* never more than one full ring */
+ 		idx++, count++)
+ 		print_one_hist(bh->b_history.b +
+ 			(idx & (BUFFER_HISTORY_SIZE - 1)));
+ 
+ 	print_buffer_fields(bh);
+ 	spin_unlock_irqrestore(&trace_lock, flags);
+ #ifdef CONFIG_X86
+ 	show_stack(NULL);
+ #endif
+ 	printk("\n");
+ }
+ 
+ static struct buffer_head *failed_buffer_head;	/* For access with debuggers */
+ 
+ void buffer_assertion_failure(struct buffer_head *bh)
+ {
+ 	failed_buffer_head = bh;	/* remember the offending buffer */
+ 	print_buffer_trace(bh);		/* then dump its history and fields */
+ }
+ EXPORT_SYMBOL(buffer_trace);
+ EXPORT_SYMBOL(print_buffer_trace);
+ EXPORT_SYMBOL(buffer_assertion_failure);
+ EXPORT_SYMBOL(print_buffer_fields);
+ #endif	/* CONFIG_BUFFER_DEBUG */
+ 
diff -rc2P linux/fs/open.c linux-2.4.13/fs/open.c
*** linux/fs/open.c	Fri Nov  9 16:15:08 2001
--- linux-2.4.13/fs/open.c	Fri Nov  9 16:57:59 2001
***************
*** 72,75 ****
--- 72,81 ----
  }
  
+ /*
+  * i_sem is taken outside i_truncate_sem because that is the
+  * order in which these locks are taken on the path
+  * generic_file_write->copy_from_user->handle_mm_fault->do_no_page
+  */
+ 
  int do_truncate(struct dentry *dentry, loff_t length)
  {
***************
*** 83,89 ****
--- 89,97 ----
  
  	down(&inode->i_sem);
+ 	down_write(&inode->i_truncate_sem);
  	newattrs.ia_size = length;
  	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
  	error = notify_change(dentry, &newattrs);
+ 	up_write(&inode->i_truncate_sem);
  	up(&inode->i_sem);
  	return error;
diff -rc2P linux/include/linux/buffer-trace.h linux-2.4.13/include/linux/buffer-trace.h
*** linux/include/linux/buffer-trace.h	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/buffer-trace.h	Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,84 ----
+ /*
+  * include/linux/buffer-trace.h
+  *
+  * Debugging support for recording buffer_head state transitions
+  *
+  * May 2001, akpm
+  *	Created
+  */
+ 
+ #ifndef BUFFER_TRACE_H_INCLUDED
+ #define BUFFER_TRACE_H_INCLUDED
+ 
+ #include <linux/config.h>
+ 
+ #ifdef CONFIG_BUFFER_DEBUG
+ 
+ /* The number of records per buffer_head.  Must be a power of two */
+ #define BUFFER_HISTORY_SIZE	32
+ 
+ struct buffer_head;
+ 
+ /* This gets embedded in struct buffer_head */
+ struct buffer_history {
+ 	struct buffer_history_item {
+ 		char *info;		/* caller's string; pointer stored, not copied */
+ 		unsigned long b_state;
+ 		unsigned b_list:3;
+ 		unsigned b_jlist:4;
+ 		unsigned on_lru:1;
+ 		unsigned on_hash:1;
+ 		unsigned cpu:3;		/* only 3 bits: CPU ids above 7 are truncated */
+ 		unsigned b_count:8;	/* only 8 bits of the buffer's b_count */
+ 		unsigned long b_blocknr;	/* For src != dest */
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ 		unsigned b_jcount:4;	/* only 4 bits of the journal_head's b_jcount */
+ 		unsigned b_jbd:1;
+ 		unsigned b_transaction:1;	/* these three record non-NULL-ness only */
+ 		unsigned b_next_transaction:1;
+ 		unsigned b_cp_transaction:1;
+ 		unsigned b_trans_is_running:1;
+ 		unsigned b_trans_is_committing:1;
+ 		void *b_frozen_data;
+ 		void *b_committed_data;
+ #endif
+ 	} b[BUFFER_HISTORY_SIZE];	/* ring, indexed modulo BUFFER_HISTORY_SIZE */
+ 	unsigned long b_history_head;	/* Next place to write */
+ 	unsigned long b_history_tail;	/* Oldest valid entry */
+ };
+ 
+ static inline void buffer_trace_init(struct buffer_history *bhist)
+ {	/* Reset the ring: head == tail means there are no recorded entries. */
+ 	bhist->b_history_head = 0;
+ 	bhist->b_history_tail = 0;
+ }
+ extern void buffer_trace(struct buffer_head *dest,
+ 			struct buffer_head *src, char *info);
+ extern void print_buffer_fields(struct buffer_head *bh);
+ extern void print_buffer_trace(struct buffer_head *bh);
+ 
+ #define BUFFER_STRINGIFY2(X)		#X
+ #define BUFFER_STRINGIFY(X)		BUFFER_STRINGIFY2(X)
+ 
+ #define BUFFER_TRACE2(dest, src, info)				\
+ 	do {							\
+ 		buffer_trace((dest), (src),			\
+ 			__FUNCTION__"() ["__FILE__":"		\
+ 			BUFFER_STRINGIFY(__LINE__)"] " info);	\
+ 	} while (0)
+ 
+ #define BUFFER_TRACE(bh, info) BUFFER_TRACE2(bh, bh, info)
+ #define JBUFFER_TRACE(jh, info)	BUFFER_TRACE(jh2bh(jh), info)
+ 
+ #else		/* CONFIG_BUFFER_DEBUG */
+ 
+ #define buffer_trace_init(bh)	do {} while (0)
+ #define print_buffer_fields(bh)	do {} while (0)
+ #define print_buffer_trace(bh)	do {} while (0)
+ #define BUFFER_TRACE(bh, info)	do {} while (0)
+ #define BUFFER_TRACE2(bh, bh2, info)	do {} while (0)
+ #define JBUFFER_TRACE(jh, info)	do {} while (0)
+ 
+ #endif		/* CONFIG_BUFFER_DEBUG */
+ 
+ #endif		/* BUFFER_TRACE_H_INCLUDED */
diff -rc2P linux/include/linux/capability.h linux-2.4.13/include/linux/capability.h
*** linux/include/linux/capability.h	Fri Nov  9 16:15:08 2001
--- linux-2.4.13/include/linux/capability.h	Fri Nov  9 16:58:00 2001
***************
*** 251,254 ****
--- 251,256 ----
  /* Override quota limits. */
  /* Override reserved space on ext2 filesystem */
+ /* Modify data journaling mode on ext3 filesystem (uses journaling
+    resources) */
  /* NOTE: ext2 honors fsuid when checking for resource overrides, so 
     you can override using fsuid too */
diff -rc2P linux/include/linux/capability.h.orig linux-2.4.13/include/linux/capability.h.orig
*** linux/include/linux/capability.h.orig	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/capability.h.orig	Fri Nov  9 16:15:08 2001
***************
*** 0 ****
--- 1,356 ----
+ /*
+  * This is <linux/capability.h>
+  *
+  * Andrew G. Morgan <morgan@transmeta.com>
+  * Alexander Kjeldaas <astor@guardian.no>
+  * with help from Aleph1, Roland Buresund and Andrew Main.
+  *
+  * See here for the libcap library ("POSIX draft" compliance):
+  *
+  * ftp://linux.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.2/
+  */ 
+ 
+ #ifndef _LINUX_CAPABILITY_H
+ #define _LINUX_CAPABILITY_H
+ 
+ #include <linux/types.h>
+ #include <linux/fs.h>
+ 
+ /* User-level do most of the mapping between kernel and user
+    capabilities based on the version tag given by the kernel. The
+    kernel might be somewhat backwards compatible, but don't bet on
+    it. */
+ 
+ /* XXX - Note, cap_t, is defined by POSIX to be an "opaque" pointer to
+    a set of three capability sets.  The transposition of 3*the
+    following structure to such a composite is better handled in a user
+    library since the draft standard requires the use of malloc/free
+    etc.. */
+  
+ #define _LINUX_CAPABILITY_VERSION  0x19980330
+ 
+ typedef struct __user_cap_header_struct {
+ 	__u32 version;
+ 	int pid;
+ } *cap_user_header_t;
+  
+ typedef struct __user_cap_data_struct {
+         __u32 effective;
+         __u32 permitted;
+         __u32 inheritable;
+ } *cap_user_data_t;
+   
+ #ifdef __KERNEL__
+ 
+ /* #define STRICT_CAP_T_TYPECHECKS */
+ 
+ #ifdef STRICT_CAP_T_TYPECHECKS
+ 
+ typedef struct kernel_cap_struct {
+ 	__u32 cap;
+ } kernel_cap_t;
+ 
+ #else
+ 
+ typedef __u32 kernel_cap_t;
+ 
+ #endif
+   
+ #define _USER_CAP_HEADER_SIZE  (2*sizeof(__u32))
+ #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
+ 
+ #endif
+ 
+ 
+ /**
+  ** POSIX-draft defined capabilities. 
+  **/
+ 
+ /* In a system with the [_POSIX_CHOWN_RESTRICTED] option defined, this
+    overrides the restriction of changing file ownership and group
+    ownership. */
+ 
+ #define CAP_CHOWN            0
+ 
+ /* Override all DAC access, including ACL execute access if
+    [_POSIX_ACL] is defined. Excluding DAC access covered by
+    CAP_LINUX_IMMUTABLE. */
+ 
+ #define CAP_DAC_OVERRIDE     1
+ 
+ /* Overrides all DAC restrictions regarding read and search on files
+    and directories, including ACL restrictions if [_POSIX_ACL] is
+    defined. Excluding DAC access covered by CAP_LINUX_IMMUTABLE. */
+ 
+ #define CAP_DAC_READ_SEARCH  2
+     
+ /* Overrides all restrictions about allowed operations on files, where
+    file owner ID must be equal to the user ID, except where CAP_FSETID
+    is applicable. It doesn't override MAC and DAC restrictions. */
+ 
+ #define CAP_FOWNER           3
+ 
+ /* Overrides the following restrictions that the effective user ID
+    shall match the file owner ID when setting the S_ISUID and S_ISGID
+    bits on that file; that the effective group ID (or one of the
+    supplementary group IDs) shall match the file owner ID when setting
+    the S_ISGID bit on that file; that the S_ISUID and S_ISGID bits are
+    cleared on successful return from chown(2) (not implemented). */
+ 
+ #define CAP_FSETID           4
+ 
+ /* Used to decide between falling back on the old suser() or fsuser(). */
+ 
+ #define CAP_FS_MASK          0x1f
+ 
+ /* Overrides the restriction that the real or effective user ID of a
+    process sending a signal must match the real or effective user ID
+    of the process receiving the signal. */
+ 
+ #define CAP_KILL             5
+ 
+ /* Allows setgid(2) manipulation */
+ /* Allows setgroups(2) */
+ /* Allows forged gids on socket credentials passing. */
+ 
+ #define CAP_SETGID           6
+ 
+ /* Allows set*uid(2) manipulation (including fsuid). */
+ /* Allows forged pids on socket credentials passing. */
+ 
+ #define CAP_SETUID           7
+ 
+ 
+ /**
+  ** Linux-specific capabilities
+  **/
+ 
+ /* Transfer any capability in your permitted set to any pid,
+    remove any capability in your permitted set from any pid */
+ 
+ #define CAP_SETPCAP          8
+ 
+ /* Allow modification of S_IMMUTABLE and S_APPEND file attributes */
+ 
+ #define CAP_LINUX_IMMUTABLE  9
+ 
+ /* Allows binding to TCP/UDP sockets below 1024 */
+ /* Allows binding to ATM VCIs below 32 */
+ 
+ #define CAP_NET_BIND_SERVICE 10
+ 
+ /* Allow broadcasting, listen to multicast */
+ 
+ #define CAP_NET_BROADCAST    11
+ 
+ /* Allow interface configuration */
+ /* Allow administration of IP firewall, masquerading and accounting */
+ /* Allow setting debug option on sockets */
+ /* Allow modification of routing tables */
+ /* Allow setting arbitrary process / process group ownership on
+    sockets */
+ /* Allow binding to any address for transparent proxying */
+ /* Allow setting TOS (type of service) */
+ /* Allow setting promiscuous mode */
+ /* Allow clearing driver statistics */
+ /* Allow multicasting */
+ /* Allow read/write of device-specific registers */
+ /* Allow activation of ATM control sockets */
+ 
+ #define CAP_NET_ADMIN        12
+ 
+ /* Allow use of RAW sockets */
+ /* Allow use of PACKET sockets */
+ 
+ #define CAP_NET_RAW          13
+ 
+ /* Allow locking of shared memory segments */
+ /* Allow mlock and mlockall (which doesn't really have anything to do
+    with IPC) */
+ 
+ #define CAP_IPC_LOCK         14
+ 
+ /* Override IPC ownership checks */
+ 
+ #define CAP_IPC_OWNER        15
+ 
+ /* Insert and remove kernel modules - modify kernel without limit */
+ /* Modify cap_bset */
+ #define CAP_SYS_MODULE       16
+ 
+ /* Allow ioperm/iopl access */
+ /* Allow sending USB messages to any device via /proc/bus/usb */
+ 
+ #define CAP_SYS_RAWIO        17
+ 
+ /* Allow use of chroot() */
+ 
+ #define CAP_SYS_CHROOT       18
+ 
+ /* Allow ptrace() of any process */
+ 
+ #define CAP_SYS_PTRACE       19
+ 
+ /* Allow configuration of process accounting */
+ 
+ #define CAP_SYS_PACCT        20
+ 
+ /* Allow configuration of the secure attention key */
+ /* Allow administration of the random device */
+ /* Allow examination and configuration of disk quotas */
+ /* Allow configuring the kernel's syslog (printk behaviour) */
+ /* Allow setting the domainname */
+ /* Allow setting the hostname */
+ /* Allow calling bdflush() */
+ /* Allow mount() and umount(), setting up new smb connection */
+ /* Allow some autofs root ioctls */
+ /* Allow nfsservctl */
+ /* Allow VM86_REQUEST_IRQ */
+ /* Allow reading/writing pci config on alpha */
+ /* Allow irix_prctl on mips (setstacksize) */
+ /* Allow flushing all cache on m68k (sys_cacheflush) */
+ /* Allow removing semaphores */
+ /* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
+    and shared memory */
+ /* Allow locking/unlocking of shared memory segment */
+ /* Allow turning swap on/off */
+ /* Allow forged pids on socket credentials passing */
+ /* Allow setting readahead and flushing buffers on block devices */
+ /* Allow setting geometry in floppy driver */
+ /* Allow turning DMA on/off in xd driver */
+ /* Allow administration of md devices (mostly the above, but some
+    extra ioctls) */
+ /* Allow tuning the ide driver */
+ /* Allow access to the nvram device */
+ /* Allow administration of apm_bios, serial and bttv (TV) device */
+ /* Allow manufacturer commands in isdn CAPI support driver */
+ /* Allow reading non-standardized portions of pci configuration space */
+ /* Allow DDI debug ioctl on sbpcd driver */
+ /* Allow setting up serial ports */
+ /* Allow sending raw qic-117 commands */
+ /* Allow enabling/disabling tagged queuing on SCSI controllers and sending
+    arbitrary SCSI commands */
+ /* Allow setting encryption key on loopback filesystem */
+ /* Allow the selection of a security context */
+ 
+ #define CAP_SYS_ADMIN        21
+ 
+ /* Allow use of reboot() */
+ 
+ #define CAP_SYS_BOOT         22
+ 
+ /* Allow raising priority and setting priority on other (different
+    UID) processes */
+ /* Allow use of FIFO and round-robin (realtime) scheduling on own
+    processes and setting the scheduling algorithm used by another
+    process. */
+ 
+ #define CAP_SYS_NICE         23
+ 
+ /* Override resource limits. Set resource limits. */
+ /* Override quota limits. */
+ /* Override reserved space on ext2 filesystem */
+ /* NOTE: ext2 honors fsuid when checking for resource overrides, so 
+    you can override using fsuid too */
+ /* Override size restrictions on IPC message queues */
+ /* Allow more than 64hz interrupts from the real-time clock */
+ /* Override max number of consoles on console allocation */
+ /* Override max number of keymaps */
+ 
+ #define CAP_SYS_RESOURCE     24
+ 
+ /* Allow manipulation of system clock */
+ /* Allow irix_stime on mips */
+ /* Allow setting the real-time clock */
+ 
+ #define CAP_SYS_TIME         25
+ 
+ /* Allow configuration of tty devices */
+ /* Allow vhangup() of tty */
+ 
+ #define CAP_SYS_TTY_CONFIG   26
+ 
+ /* Allow the privileged aspects of mknod() */
+ 
+ #define CAP_MKNOD            27
+ 
+ /* Allow taking of leases on files */
+ 
+ #define CAP_LEASE            28
+ 
+ /* Allow opening special device file */
+ 
+ #define CAP_OPENDEV          29
+ 
+ #ifdef __KERNEL__
+ /* 
+  * Bounding set
+  */
+ extern kernel_cap_t cap_bset;
+ 
+ /*
+  * Internal kernel functions only
+  */
+  
+ #ifdef STRICT_CAP_T_TYPECHECKS
+ /* Strict mode: to_cap_t() expands to a brace initializer, cap_t() selects the .cap member. */
+ #define to_cap_t(x) { x }
+ #define cap_t(x) (x).cap
+ 
+ #else
+ /* Default mode: both macros simply parenthesize their argument. */
+ #define to_cap_t(x) (x)
+ #define cap_t(x) (x)
+ 
+ #endif
+ 
+ #define CAP_EMPTY_SET       to_cap_t(0)
+ #define CAP_FULL_SET        to_cap_t(~0)
+ #define CAP_INIT_EFF_SET    to_cap_t(~0 & ~CAP_TO_MASK(CAP_SETPCAP))	/* all bits except CAP_SETPCAP */
+ #define CAP_INIT_INH_SET    to_cap_t(0)
+ /* CAP_TO_MASK(x) is the single-bit mask for capability number x; the helpers below set, clear and test that bit in c. */
+ #define CAP_TO_MASK(x) (1 << (x))
+ #define cap_raise(c, flag)   (cap_t(c) |=  CAP_TO_MASK(flag))
+ #define cap_lower(c, flag)   (cap_t(c) &= ~CAP_TO_MASK(flag))
+ #define cap_raised(c, flag)  (cap_t(c) & CAP_TO_MASK(flag))	/* yields the mask bit itself (non-zero), not 1 */
+ 
+ static inline kernel_cap_t cap_combine(kernel_cap_t a, kernel_cap_t b)
+ {
+      /* Union of both sets, accumulated in our by-value copy of a. */
+      cap_t(a) |= cap_t(b);
+      return a;
+ }
+ 
+ static inline kernel_cap_t cap_intersect(kernel_cap_t a, kernel_cap_t b)
+ {
+      /* Keep only the capabilities raised in both a and b. */
+      cap_t(a) &= cap_t(b);
+      return a;
+ }
+ 
+ static inline kernel_cap_t cap_drop(kernel_cap_t a, kernel_cap_t drop)
+ {
+      /* Clear from our by-value copy of a every capability raised in drop. */
+      cap_t(a) &= ~cap_t(drop);
+      return a;
+ }
+ 
+ static inline kernel_cap_t cap_invert(kernel_cap_t c)
+ {
+      /* Complement every bit of our by-value copy of c. */
+      cap_t(c) = ~cap_t(c);
+      return c;
+ }
+ 
+ #define cap_isclear(c)       (!cap_t(c))	/* true when no bit is set in c */
+ #define cap_issubset(a,set)  (!(cap_t(a) & ~cap_t(set)))	/* true when a has no bit outside set */
+ /* Statement-style helpers: each modifies c in place and yields no value. */
+ #define cap_clear(c)         do { cap_t(c) =  0; } while(0)
+ #define cap_set_full(c)      do { cap_t(c) = ~0; } while(0)
+ #define cap_mask(c,mask)     do { cap_t(c) &= cap_t(mask); } while(0)
+ /* Non-zero when the bit for capability number c is in CAP_FS_MASK (defined outside this part of the header). */
+ #define cap_is_fs_cap(c)     (CAP_TO_MASK(c) & CAP_FS_MASK)
+ 
+ #endif /* __KERNEL__ */
+ 
+ #endif /* !_LINUX_CAPABILITY_H */
diff -rc2P linux/include/linux/ext3_fs.h linux-2.4.13/include/linux/ext3_fs.h
*** linux/include/linux/ext3_fs.h	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_fs.h	Fri Nov  9 17:05:34 2001
***************
*** 0 ****
--- 1,716 ----
+ /*
+  *  linux/include/linux/ext3_fs.h
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/include/linux/minix_fs.h
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+ 
+ #ifndef _LINUX_EXT3_FS_H
+ #define _LINUX_EXT3_FS_H
+ 
+ #include <linux/types.h>
+ 
+ /*
+  * The ext3 (journaling second extended) filesystem constants/structures
+  */
+ 
+ /*
+  * Define EXT3FS_DEBUG to produce debug messages
+  */
+ #undef EXT3FS_DEBUG
+ 
+ /*
+  * Define EXT3_PREALLOCATE to preallocate data blocks for expanding files
+  */
+ #undef  EXT3_PREALLOCATE /* @@@ Fix this! */
+ #define EXT3_DEFAULT_PREALLOC_BLOCKS	8
+ 
+ /*
+  * The ext3 file system release date and version strings
+  */
+ #define EXT3FS_DATE		"21 Oct 2001"
+ #define EXT3FS_VERSION		"2.4-0.9.13"
+ 
+ /*
+  * Debug code: ext3_debug() only produces output when EXT3FS_DEBUG is defined
+  */
+ #ifdef EXT3FS_DEBUG	/* two printk() calls: a file/line/function prefix, then the caller's format f */
+ #define ext3_debug(f, a...)						\
+ 	do {								\
+ 		printk (KERN_DEBUG "EXT3-fs DEBUG (%s, %d): %s:",	\
+ 			__FILE__, __LINE__, __FUNCTION__);		\
+ 		printk (KERN_DEBUG f, ## a);				\
+ 	} while (0)
+ #else	/* debugging off: expands to an empty statement, arguments are not evaluated */
+ #define ext3_debug(f, a...)	do {} while (0)
+ #endif
+ 
+ /*
+  * Special inode numbers
+  */
+ #define	EXT3_BAD_INO		 1	/* Bad blocks inode */
+ #define EXT3_ROOT_INO		 2	/* Root inode */
+ #define EXT3_ACL_IDX_INO	 3	/* ACL index inode */
+ #define EXT3_ACL_DATA_INO	 4	/* ACL data inode */
+ #define EXT3_BOOT_LOADER_INO	 5	/* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO	 6	/* Undelete directory inode */
+ #define EXT3_RESIZE_INO		 7	/* Reserved group descriptors inode */
+ #define EXT3_JOURNAL_INO	 8	/* Journal inode */
+ 
+ /* First non-reserved inode for old ext3 filesystems */
+ #define EXT3_GOOD_OLD_FIRST_INO	11
+ 
+ /*
+  * The second extended file system magic number
+  */
+ #define EXT3_SUPER_MAGIC	0xEF53
+ 
+ /*
+  * Maximal count of links to a file
+  */
+ #define EXT3_LINK_MAX		32000
+ 
+ /*
+  * Macro-instructions used to manage several block sizes
+  */
+ #define EXT3_MIN_BLOCK_SIZE		1024
+ #define	EXT3_MAX_BLOCK_SIZE		4096
+ #define EXT3_MIN_BLOCK_LOG_SIZE		  10
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE(s)		((s)->s_blocksize)
+ #else
+ # define EXT3_BLOCK_SIZE(s)		(EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+ #define EXT3_ACLE_PER_BLOCK(s)		(EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define	EXT3_ADDR_PER_BLOCK(s)		(EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s)	((s)->s_blocksize_bits)
+ #else
+ # define EXT3_BLOCK_SIZE_BITS(s)	((s)->s_log_block_size + 10)
+ #endif
+ #ifdef __KERNEL__
+ #define	EXT3_ADDR_PER_BLOCK_BITS(s)	((s)->u.ext3_sb.s_addr_per_block_bits)
+ #define EXT3_INODE_SIZE(s)		((s)->u.ext3_sb.s_inode_size)
+ #define EXT3_FIRST_INO(s)		((s)->u.ext3_sb.s_first_ino)
+ #else
+ #define EXT3_INODE_SIZE(s)	(((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
+ 				 EXT3_GOOD_OLD_INODE_SIZE : \
+ 				 (s)->s_inode_size)
+ #define EXT3_FIRST_INO(s)	(((s)->s_rev_level == EXT3_GOOD_OLD_REV) ? \
+ 				 EXT3_GOOD_OLD_FIRST_INO : \
+ 				 (s)->s_first_ino)
+ #endif
+ 
+ /*
+  * Macro-instructions used to manage fragments
+  */
+ #define EXT3_MIN_FRAG_SIZE		1024
+ #define	EXT3_MAX_FRAG_SIZE		4096
+ #define EXT3_MIN_FRAG_LOG_SIZE		  10
+ #ifdef __KERNEL__
+ # define EXT3_FRAG_SIZE(s)		((s)->u.ext3_sb.s_frag_size)
+ # define EXT3_FRAGS_PER_BLOCK(s)	((s)->u.ext3_sb.s_frags_per_block)
+ #else
+ # define EXT3_FRAG_SIZE(s)		(EXT3_MIN_FRAG_SIZE << (s)->s_log_frag_size)
+ # define EXT3_FRAGS_PER_BLOCK(s)	(EXT3_BLOCK_SIZE(s) / EXT3_FRAG_SIZE(s))
+ #endif
+ 
+ /*
+  * ACL structures
+  */
+ struct ext3_acl_header	/* Header of Access Control Lists */
+ {
+ 	__u32	aclh_size;
+ 	__u32	aclh_file_count;
+ 	__u32	aclh_acle_count;
+ 	__u32	aclh_first_acle;
+ };
+ 
+ struct ext3_acl_entry	/* Access Control List Entry */
+ {
+ 	__u32	acle_size;
+ 	__u16	acle_perms;	/* Access permissions */
+ 	__u16	acle_type;	/* Type of entry */
+ 	__u16	acle_tag;	/* User or group identity */
+ 	__u16	acle_pad1;
+ 	__u32	acle_next;	/* Pointer on next entry for the */
+ 					/* same inode or on next free entry */
+ };
+ 
+ /*
+  * Structure of a blocks group descriptor (the fields add up to 32 bytes)
+  */
+ struct ext3_group_desc
+ {
+ 	__u32	bg_block_bitmap;		/* Blocks bitmap block */
+ 	__u32	bg_inode_bitmap;		/* Inodes bitmap block */
+ 	__u32	bg_inode_table;		/* Inodes table block */
+ 	__u16	bg_free_blocks_count;	/* Free blocks count */
+ 	__u16	bg_free_inodes_count;	/* Free inodes count */
+ 	__u16	bg_used_dirs_count;	/* Directories count */
+ 	__u16	bg_pad;
+ 	__u32	bg_reserved[3];
+ };
+ 
+ /*
+  * Macro-instructions used to manage group descriptors
+  */
+ #ifdef __KERNEL__
+ # define EXT3_BLOCKS_PER_GROUP(s)	((s)->u.ext3_sb.s_blocks_per_group)
+ # define EXT3_DESC_PER_BLOCK(s)		((s)->u.ext3_sb.s_desc_per_block)
+ # define EXT3_INODES_PER_GROUP(s)	((s)->u.ext3_sb.s_inodes_per_group)
+ # define EXT3_DESC_PER_BLOCK_BITS(s)	((s)->u.ext3_sb.s_desc_per_block_bits)
+ #else
+ # define EXT3_BLOCKS_PER_GROUP(s)	((s)->s_blocks_per_group)
+ # define EXT3_DESC_PER_BLOCK(s)		(EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_group_desc))
+ # define EXT3_INODES_PER_GROUP(s)	((s)->s_inodes_per_group)
+ #endif
+ 
+ /*
+  * Constants relative to the data blocks
+  */
+ #define	EXT3_NDIR_BLOCKS		12
+ #define	EXT3_IND_BLOCK			EXT3_NDIR_BLOCKS
+ #define	EXT3_DIND_BLOCK			(EXT3_IND_BLOCK + 1)
+ #define	EXT3_TIND_BLOCK			(EXT3_DIND_BLOCK + 1)
+ #define	EXT3_N_BLOCKS			(EXT3_TIND_BLOCK + 1)
+ 
+ /*
+  * Inode flags
+  */
+ #define	EXT3_SECRM_FL			0x00000001 /* Secure deletion */
+ #define	EXT3_UNRM_FL			0x00000002 /* Undelete */
+ #define	EXT3_COMPR_FL			0x00000004 /* Compress file */
+ #define EXT3_SYNC_FL			0x00000008 /* Synchronous updates */
+ #define EXT3_IMMUTABLE_FILE_FL		0x00000010 /* Immutable file */
+ #define EXT3_APPEND_FL			0x00000020 /* writes to file may only append */
+ #define EXT3_NODUMP_FL			0x00000040 /* do not dump file */
+ #define EXT3_NOATIME_FL			0x00000080 /* do not update atime */
+ /* Reserved for compression usage... */
+ #define EXT3_DIRTY_FL			0x00000100
+ #define EXT3_COMPRBLK_FL		0x00000200 /* One or more compressed clusters */
+ #define EXT3_NOCOMPR_FL			0x00000400 /* Don't compress */
+ #define EXT3_ECOMPR_FL			0x00000800 /* Compression error */
+ /* End compression flags --- maybe not all used */
+ #define EXT3_INDEX_FL			0x00001000 /* hash-indexed directory */
+ #define EXT3_IMAGIC_FL			0x00002000 /* AFS directory */
+ #define EXT3_JOURNAL_DATA_FL		0x00004000 /* file data should be journaled */
+ #define EXT3_IMMUTABLE_LINK_FL          0x00008000 /* Immutable link */
+ #define EXT3_RESERVED_FL		0x80000000 /* reserved for ext3 lib */
+ 
+ #define EXT3_FL_USER_VISIBLE		0x00009FFF /* User visible flags */
+ #define EXT3_FL_USER_MODIFIABLE		0x000080FF /* User modifiable flags */
+ 
+ /*
+  * Inode dynamic state flags
+  */
+ #define EXT3_STATE_JDATA		0x00000001 /* journaled data exists */
+ #define EXT3_STATE_NEW			0x00000002 /* inode is newly created */
+ 
+ /*
+  * ioctl commands
+  */
+ #define	EXT3_IOC_GETFLAGS		_IOR('f', 1, long)
+ #define	EXT3_IOC_SETFLAGS		_IOW('f', 2, long)
+ #define	EXT3_IOC_GETVERSION		_IOR('f', 3, long)
+ #define	EXT3_IOC_SETVERSION		_IOW('f', 4, long)
+ #define	EXT3_IOC_GETVERSION_OLD		_IOR('v', 1, long)
+ #define	EXT3_IOC_SETVERSION_OLD		_IOW('v', 2, long)
+ #ifdef CONFIG_JBD_DEBUG
+ #define EXT3_IOC_WAIT_FOR_READONLY	_IOR('f', 99, long)
+ #endif
+ 
+ /*
+  * Structure of an inode on the disk (the fields add up to 128 bytes)
+  */
+ struct ext3_inode {
+ 	__u16	i_mode;		/* File mode */
+ 	__u16	i_uid;		/* Low 16 bits of Owner Uid */
+ 	__u32	i_size;		/* Size in bytes */
+ 	__u32	i_atime;	/* Access time */
+ 	__u32	i_ctime;	/* Creation time */
+ 	__u32	i_mtime;	/* Modification time */
+ 	__u32	i_dtime;	/* Deletion Time */
+ 	__u16	i_gid;		/* Low 16 bits of Group Id */
+ 	__u16	i_links_count;	/* Links count */
+ 	__u32	i_blocks;	/* Blocks count */
+ 	__u32	i_flags;	/* File flags */
+ 	union {
+ 		struct {
+ 			__u32  l_i_reserved1;
+ 		} linux1;
+ 		struct {
+ 			__u32  h_i_translator;
+ 		} hurd1;
+ 		struct {
+ 			__u32  m_i_reserved1;
+ 		} masix1;
+ 	} osd1;				/* OS dependent 1 (one 32-bit word per OS variant) */
+ 	__u32	i_block[EXT3_N_BLOCKS];/* Pointers to blocks */
+ 	__u32	i_generation;	/* File version (for NFS) */
+ 	__u32	i_file_acl;	/* File ACL */
+ 	__u32	i_dir_acl;	/* Directory ACL (also reachable as i_size_high) */
+ 	__u32	i_faddr;	/* Fragment address */
+ 	union {
+ 		struct {
+ 			__u8	l_i_frag;	/* Fragment number */
+ 			__u8	l_i_fsize;	/* Fragment size */
+ 			__u16	i_pad1;
+ 			__u16	l_i_uid_high;	/* these 2 fields    */
+ 			__u16	l_i_gid_high;	/* were reserved2[0] */
+ 			__u32	l_i_reserved2;
+ 		} linux2;
+ 		struct {
+ 			__u8	h_i_frag;	/* Fragment number */
+ 			__u8	h_i_fsize;	/* Fragment size */
+ 			__u16	h_i_mode_high;
+ 			__u16	h_i_uid_high;
+ 			__u16	h_i_gid_high;
+ 			__u32	h_i_author;
+ 		} hurd2;
+ 		struct {
+ 			__u8	m_i_frag;	/* Fragment number */
+ 			__u8	m_i_fsize;	/* Fragment size */
+ 			__u16	m_pad1;
+ 			__u32	m_i_reserved2[2];
+ 		} masix2;
+ 	} osd2;				/* OS dependent 2 (12 bytes per OS variant) */
+ };
+ 
+ #define i_size_high	i_dir_acl
+ 
+ #if defined(__KERNEL__) || defined(__linux__)	/* Linux names for the osd1/osd2 members of struct ext3_inode */
+ #define i_reserved1	osd1.linux1.l_i_reserved1
+ #define i_frag		osd2.linux2.l_i_frag
+ #define i_fsize		osd2.linux2.l_i_fsize
+ #define i_uid_low	i_uid
+ #define i_gid_low	i_gid
+ #define i_uid_high	osd2.linux2.l_i_uid_high
+ #define i_gid_high	osd2.linux2.l_i_gid_high
+ #define i_reserved2	osd2.linux2.l_i_reserved2
+ 
+ #elif defined(__GNU__)
+ /* Hurd names.  No trailing ';' on any alias: they are expanded inside expressions. */
+ #define i_translator	osd1.hurd1.h_i_translator
+ #define i_frag		osd2.hurd2.h_i_frag
+ #define i_fsize		osd2.hurd2.h_i_fsize
+ #define i_uid_high	osd2.hurd2.h_i_uid_high
+ #define i_gid_high	osd2.hurd2.h_i_gid_high
+ #define i_author	osd2.hurd2.h_i_author
+ 
+ #elif defined(__masix__)
+ /* Masix names. */
+ #define i_reserved1	osd1.masix1.m_i_reserved1
+ #define i_frag		osd2.masix2.m_i_frag
+ #define i_fsize		osd2.masix2.m_i_fsize
+ #define i_reserved2	osd2.masix2.m_i_reserved2
+ 
+ #endif /* defined(__KERNEL__) || defined(__linux__) */
+ 
+ /*
+  * File system states
+  */
+ #define	EXT3_VALID_FS			0x0001	/* Unmounted cleanly */
+ #define	EXT3_ERROR_FS			0x0002	/* Errors detected */
+ #define	EXT3_ORPHAN_FS			0x0004	/* Orphans being recovered */
+ 
+ /*
+  * Mount flags
+  */
+ #define EXT3_MOUNT_CHECK		0x0001	/* Do mount-time checks */
+ #define EXT3_MOUNT_GRPID		0x0004	/* Create files with directory's group */
+ #define EXT3_MOUNT_DEBUG		0x0008	/* Some debugging messages */
+ #define EXT3_MOUNT_ERRORS_CONT		0x0010	/* Continue on errors */
+ #define EXT3_MOUNT_ERRORS_RO		0x0020	/* Remount fs ro on errors */
+ #define EXT3_MOUNT_ERRORS_PANIC		0x0040	/* Panic on errors */
+ #define EXT3_MOUNT_MINIX_DF		0x0080	/* Mimics the Minix statfs */
+ #define EXT3_MOUNT_NOLOAD		0x0100	/* Don't use existing journal*/
+ #define EXT3_MOUNT_ABORT		0x0200	/* Fatal error detected */
+ #define EXT3_MOUNT_DATA_FLAGS		0x0C00	/* Mode for data writes: */
+   #define EXT3_MOUNT_JOURNAL_DATA	0x0400	/* Write data to journal */
+   #define EXT3_MOUNT_ORDERED_DATA	0x0800	/* Flush data before commit */
+   #define EXT3_MOUNT_WRITEBACK_DATA	0x0C00	/* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL	0x1000	/* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32		0x2000  /* Disable 32-bit UIDs */
+ 
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H	/* ext2_fs.h not seen: define the option helpers; opt is pasted onto EXT3_MOUNT_ */
+ #define clear_opt(o, opt)		((o) &= ~EXT3_MOUNT_##opt)
+ #define set_opt(o, opt)			((o) |= EXT3_MOUNT_##opt)
+ #define test_opt(sb, opt)		((sb)->u.ext3_sb.s_mount_opt & \
+ 					 EXT3_MOUNT_##opt)
+ #else	/* ext2_fs.h came first: presumably its helpers paste EXT2_MOUNT_, so alias the ext3-only flags */
+ #define EXT2_MOUNT_NOLOAD		EXT3_MOUNT_NOLOAD
+ #define EXT2_MOUNT_ABORT		EXT3_MOUNT_ABORT
+ #endif
+ 
+ #define ext3_set_bit			ext2_set_bit
+ #define ext3_clear_bit			ext2_clear_bit
+ #define ext3_test_bit			ext2_test_bit
+ #define ext3_find_first_zero_bit	ext2_find_first_zero_bit
+ #define ext3_find_next_zero_bit		ext2_find_next_zero_bit
+ 
+ /*
+  * Maximal mount counts between two filesystem checks
+  */
+ #define EXT3_DFL_MAX_MNT_COUNT		20	/* Allow 20 mounts */
+ #define EXT3_DFL_CHECKINTERVAL		0	/* Don't use interval check */
+ 
+ /*
+  * Behaviour when detecting errors
+  */
+ #define EXT3_ERRORS_CONTINUE		1	/* Continue execution */
+ #define EXT3_ERRORS_RO			2	/* Remount fs read-only */
+ #define EXT3_ERRORS_PANIC		3	/* Panic */
+ #define EXT3_ERRORS_DEFAULT		EXT3_ERRORS_CONTINUE
+ 
+ /*
+  * Structure of the super block
+  */
+ struct ext3_super_block {
+ /*00*/	__u32	s_inodes_count;		/* Inodes count */
+ 	__u32	s_blocks_count;		/* Blocks count */
+ 	__u32	s_r_blocks_count;	/* Reserved blocks count */
+ 	__u32	s_free_blocks_count;	/* Free blocks count */
+ /*10*/	__u32	s_free_inodes_count;	/* Free inodes count */
+ 	__u32	s_first_data_block;	/* First Data Block */
+ 	__u32	s_log_block_size;	/* Block size */
+ 	__s32	s_log_frag_size;	/* Fragment size */
+ /*20*/	__u32	s_blocks_per_group;	/* # Blocks per group */
+ 	__u32	s_frags_per_group;	/* # Fragments per group */
+ 	__u32	s_inodes_per_group;	/* # Inodes per group */
+ 	__u32	s_mtime;		/* Mount time */
+ /*30*/	__u32	s_wtime;		/* Write time */
+ 	__u16	s_mnt_count;		/* Mount count */
+ 	__s16	s_max_mnt_count;	/* Maximal mount count */
+ 	__u16	s_magic;		/* Magic signature */
+ 	__u16	s_state;		/* File system state */
+ 	__u16	s_errors;		/* Behaviour when detecting errors */
+ 	__u16	s_minor_rev_level;	/* minor revision level */
+ /*40*/	__u32	s_lastcheck;		/* time of last check */
+ 	__u32	s_checkinterval;	/* max. time between checks */
+ 	__u32	s_creator_os;		/* OS */
+ 	__u32	s_rev_level;		/* Revision level */
+ /*50*/	__u16	s_def_resuid;		/* Default uid for reserved blocks */
+ 	__u16	s_def_resgid;		/* Default gid for reserved blocks */
+ 	/*
+ 	 * These fields are for EXT3_DYNAMIC_REV superblocks only.
+ 	 *
+ 	 * Note: the difference between the compatible feature set and
+ 	 * the incompatible feature set is that if there is a bit set
+ 	 * in the incompatible feature set that the kernel doesn't
+ 	 * know about, it should refuse to mount the filesystem.
+ 	 *
+ 	 * e2fsck's requirements are more strict; if it doesn't know
+ 	 * about a feature in either the compatible or incompatible
+ 	 * feature set, it must abort and not try to meddle with
+ 	 * things it doesn't understand...
+ 	 */
+ 	__u32	s_first_ino;		/* First non-reserved inode */
+ 	__u16   s_inode_size;		/* size of inode structure */
+ 	__u16	s_block_group_nr;	/* block group # of this superblock */
+ 	__u32	s_feature_compat;	/* compatible feature set */
+ /*60*/	__u32	s_feature_incompat;	/* incompatible feature set */
+ 	__u32	s_feature_ro_compat;	/* readonly-compatible feature set */
+ /*68*/	__u8	s_uuid[16];		/* 128-bit uuid for volume */
+ /*78*/	char	s_volume_name[16];	/* volume name */
+ /*88*/	char	s_last_mounted[64];	/* directory where last mounted */
+ /*C8*/	__u32	s_algorithm_usage_bitmap; /* For compression */
+ 	/*
+ 	 * Performance hints.  Directory preallocation should only
+ 	 * happen if the EXT3_FEATURE_COMPAT_DIR_PREALLOC flag is on.
+ 	 */
+ 	__u8	s_prealloc_blocks;	/* Nr of blocks to try to preallocate*/
+ 	__u8	s_prealloc_dir_blocks;	/* Nr to preallocate for dirs */
+ 	__u16	s_padding1;
+ 	/*
+ 	 * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set.
+ 	 */
+ /*D0*/	__u8	s_journal_uuid[16];	/* uuid of journal superblock */
+ /*E0*/	__u32	s_journal_inum;		/* inode number of journal file */
+ 	__u32	s_journal_dev;		/* device number of journal file */
+ 	__u32	s_last_orphan;		/* start of list of inodes to delete */
+ 
+ /*EC*/	__u32	s_reserved[197];	/* Padding to the end of the block */
+ };
+ 
+ #ifdef __KERNEL__	/* in-kernel: address of the u.ext3_sb / u.ext3_i member of the object passed in */
+ #define EXT3_SB(sb)	(&((sb)->u.ext3_sb))
+ #define EXT3_I(inode)	(&((inode)->u.ext3_i))
+ #else
+ /* Assume that user mode programs are passing in an ext3fs superblock, not
+  * a kernel struct super_block.  This will allow us to call the feature-test
+  * macros from user land. */
+ #define EXT3_SB(sb)	(sb)
+ #endif
+ /* NEXT_ORPHAN(inode) names the inode's in-memory u.ext3_i.i_dtime field; it can be used as an lvalue. */
+ #define NEXT_ORPHAN(inode) (inode)->u.ext3_i.i_dtime
+ 
+ /*
+  * Codes for operating systems
+  */
+ #define EXT3_OS_LINUX		0
+ #define EXT3_OS_HURD		1
+ #define EXT3_OS_MASIX		2
+ #define EXT3_OS_FREEBSD		3
+ #define EXT3_OS_LITES		4
+ 
+ /*
+  * Revision levels
+  */
+ #define EXT3_GOOD_OLD_REV	0	/* The good old (original) format */
+ #define EXT3_DYNAMIC_REV	1	/* V2 format w/ dynamic inode sizes */
+ 
+ #define EXT3_CURRENT_REV	EXT3_GOOD_OLD_REV
+ #define EXT3_MAX_SUPP_REV	EXT3_DYNAMIC_REV
+ 
+ #define EXT3_GOOD_OLD_INODE_SIZE 128
+ 
+ /*
+  * Feature set definitions
+  */
+ 
+ #define EXT3_HAS_COMPAT_FEATURE(sb,mask)			\
+ 	( EXT3_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) )
+ #define EXT3_HAS_RO_COMPAT_FEATURE(sb,mask)			\
+ 	( EXT3_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) )
+ #define EXT3_HAS_INCOMPAT_FEATURE(sb,mask)			\
+ 	( EXT3_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) )
+ #define EXT3_SET_COMPAT_FEATURE(sb,mask)			\
+ 	EXT3_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask)
+ #define EXT3_SET_RO_COMPAT_FEATURE(sb,mask)			\
+ 	EXT3_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask)
+ #define EXT3_SET_INCOMPAT_FEATURE(sb,mask)			\
+ 	EXT3_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask)
+ #define EXT3_CLEAR_COMPAT_FEATURE(sb,mask)			\
+ 	EXT3_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask)
+ #define EXT3_CLEAR_RO_COMPAT_FEATURE(sb,mask)			\
+ 	EXT3_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask)
+ #define EXT3_CLEAR_INCOMPAT_FEATURE(sb,mask)			\
+ 	EXT3_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask)
+ 
+ #define EXT3_FEATURE_COMPAT_DIR_PREALLOC	0x0001
+ #define EXT3_FEATURE_COMPAT_IMAGIC_INODES	0x0002
+ #define EXT3_FEATURE_COMPAT_HAS_JOURNAL		0x0004
+ #define EXT3_FEATURE_COMPAT_EXT_ATTR		0x0008
+ #define EXT3_FEATURE_COMPAT_RESIZE_INODE	0x0010
+ #define EXT3_FEATURE_COMPAT_DIR_INDEX		0x0020
+ 
+ #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER	0x0001
+ #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE	0x0002
+ #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR	0x0004
+ 
+ #define EXT3_FEATURE_INCOMPAT_COMPRESSION	0x0001
+ #define EXT3_FEATURE_INCOMPAT_FILETYPE		0x0002
+ #define EXT3_FEATURE_INCOMPAT_RECOVER		0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV	0x0008 /* Journal device */
+ 
+ #define EXT3_FEATURE_COMPAT_SUPP	0
+ #define EXT3_FEATURE_INCOMPAT_SUPP	(EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ 					 EXT3_FEATURE_INCOMPAT_RECOVER)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP	(EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ 					 EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \
+ 					 EXT3_FEATURE_RO_COMPAT_BTREE_DIR)
+ 
+ /*
+  * Default values for user and/or group using reserved blocks
+  */
+ #define	EXT3_DEF_RESUID		0
+ #define	EXT3_DEF_RESGID		0
+ 
+ /*
+  * Structure of a directory entry
+  */
+ #define EXT3_NAME_LEN 255
+ 
+ struct ext3_dir_entry {
+ 	__u32	inode;			/* Inode number */
+ 	__u16	rec_len;		/* Directory entry length */
+ 	__u16	name_len;		/* Name length */
+ 	char	name[EXT3_NAME_LEN];	/* File name */
+ };
+ 
+ /*
+  * The new version of the directory entry.  Since EXT3 structures are
+  * stored in intel (little-endian) byte order, and the name_len field could
+  * never be bigger than 255 chars, it's safe to reclaim the upper byte of the
+  * old 16-bit name_len for the file_type field.
+  */
+ struct ext3_dir_entry_2 {
+ 	__u32	inode;			/* Inode number */
+ 	__u16	rec_len;		/* Directory entry length */
+ 	__u8	name_len;		/* Name length */
+ 	__u8	file_type;		/* One of the EXT3_FT_* values */
+ 	char	name[EXT3_NAME_LEN];	/* File name */
+ };
+ 
+ /*
+  * Ext3 directory file types.  Only the low 3 bits are used.  The
+  * other bits are reserved for now.
+  */
+ #define EXT3_FT_UNKNOWN		0
+ #define EXT3_FT_REG_FILE	1
+ #define EXT3_FT_DIR		2
+ #define EXT3_FT_CHRDEV		3
+ #define EXT3_FT_BLKDEV		4
+ #define EXT3_FT_FIFO		5
+ #define EXT3_FT_SOCK		6
+ #define EXT3_FT_SYMLINK		7
+ 
+ #define EXT3_FT_MAX		8
+ 
+ /*
+  * EXT3_DIR_PAD defines the directory entries boundaries.  EXT3_DIR_REC_LEN
+  * adds 8 (the bytes preceding name[]) to name_len and rounds up to it.
+  * NOTE: EXT3_DIR_PAD must be a multiple of 4 and, for the mask, a power of 2
+  */
+ #define EXT3_DIR_PAD			4
+ #define EXT3_DIR_ROUND			(EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len)	(((name_len) + 8 + EXT3_DIR_ROUND) & \
+ 					 ~EXT3_DIR_ROUND)
+ 
+ #ifdef __KERNEL__
+ 
+ /* Filesize hard limits for 64-bit file offsets */
+ extern long long ext3_max_sizes[];
+ 
+ /*
+  * Describe an inode's exact location on disk and in memory
+  */
+ struct ext3_iloc
+ {
+ 	struct buffer_head *bh;
+ 	struct ext3_inode *raw_inode;
+ 	unsigned long block_group;
+ };
+ 
+ /*
+  * Function prototypes
+  */
+ 
+ /*
+  * Ok, these declarations are also in <linux/kernel.h> but none of the
+  * ext3 source programs needs to include it so they are duplicated here.
+  */
+ # define NORET_TYPE    /**/
+ # define ATTRIB_NORET  __attribute__((noreturn))
+ # define NORET_AND     noreturn,
+ 
+ /* acl.c */
+ extern int ext3_permission (struct inode *, int);
+ 
+ /* balloc.c */
+ extern int ext3_bg_has_super(struct super_block *sb, int group);
+ extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group);
+ extern int ext3_new_block (handle_t *, struct inode *, unsigned long,
+ 					    __u32 *, __u32 *, int *);
+ extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long,
+ 			      unsigned long);
+ extern unsigned long ext3_count_free_blocks (struct super_block *);
+ extern void ext3_check_blocks_bitmap (struct super_block *);
+ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
+ 						    unsigned int block_group,
+ 						    struct buffer_head ** bh);
+ 
+ /* bitmap.c */
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+ 
+ /* dir.c */
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+ 				struct ext3_dir_entry_2 *, struct buffer_head *,
+ 				unsigned long);
+ 
+ /* file.c */
+ 
+ /* fsync.c */
+ extern int ext3_sync_file (struct file *, struct dentry *, int);
+ 
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext3_orphan_get (struct super_block *, ino_t);
+ extern unsigned long ext3_count_free_inodes (struct super_block *);
+ extern void ext3_check_inodes_bitmap (struct super_block *);
+ 
+ /* inode.c */
+ 
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+ 
+ extern int  ext3_get_inode_loc (struct inode *, struct ext3_iloc *);
+ extern void ext3_read_inode (struct inode *);
+ extern void ext3_write_inode (struct inode *, int);
+ extern int  ext3_setattr (struct dentry *, struct iattr *);
+ extern void ext3_put_inode (struct inode *);
+ extern void ext3_delete_inode (struct inode *);
+ extern int  ext3_sync_inode (handle_t *, struct inode *);
+ extern void ext3_discard_prealloc (struct inode *);
+ extern void ext3_dirty_inode(struct inode *);
+ extern int ext3_change_inode_journal_flag(struct inode *, int);
+ 
+ /* ioctl.c */
+ extern int ext3_ioctl (struct inode *, struct file *, unsigned int,
+ 		       unsigned long);
+ 
+ /* namei.c */
+ extern struct inode_operations ext3_dir_inode_operations;
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
+ 
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
+ 	__attribute__ ((format (printf, 3, 4)));
+ extern void __ext3_std_error (struct super_block *, const char *, int);
+ extern void ext3_abort (struct super_block *, const char *, const char *, ...)
+ 	__attribute__ ((format (printf, 3, 4)));
+ extern NORET_TYPE void ext3_panic (struct super_block *, const char *,
+ 				   const char *, ...)
+ 	__attribute__ ((NORET_AND format (printf, 3, 4)));
+ extern void ext3_warning (struct super_block *, const char *, const char *, ...)
+ 	__attribute__ ((format (printf, 3, 4)));
+ extern void ext3_update_dynamic_rev (struct super_block *sb);
+ extern void ext3_put_super (struct super_block *);
+ extern void ext3_write_super (struct super_block *);
+ extern void ext3_write_super_lockfs (struct super_block *);
+ extern void ext3_unlockfs (struct super_block *);
+ extern int ext3_remount (struct super_block *, int *, char *);
+ extern struct super_block * ext3_read_super (struct super_block *,void *,int);
+ extern int ext3_statfs (struct super_block *, struct statfs *);
+ 
+ /* truncate.c */
+ extern void ext3_truncate (struct inode *);
+ /* Calls __ext3_std_error() with the current function name only when errno is non-zero; errno is evaluated twice. */
+ #define ext3_std_error(sb, errno)				\
+ do {								\
+ 	if ((errno))						\
+ 		__ext3_std_error((sb), __FUNCTION__, (errno));	\
+ } while (0)
+ extern const char *ext3_decode_error(struct super_block *sb, int errno, char nbuf[16]);
+ 
+ /*
+  * Inodes and files operations
+  */
+ 
+ /* dir.c */
+ extern struct file_operations ext3_dir_operations;
+ 
+ /* file.c */
+ extern struct inode_operations ext3_file_inode_operations;
+ extern struct file_operations ext3_file_operations;
+ 
+ /* symlink.c */
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+ 
+ extern struct address_space_operations ext3_aops;
+ 
+ #endif	/* __KERNEL__ */
+ 
+ #endif	/* _LINUX_EXT3_FS_H */
diff -rc2P linux/include/linux/ext3_fs_i.h linux-2.4.13/include/linux/ext3_fs_i.h
*** linux/include/linux/ext3_fs_i.h	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_fs_i.h	Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,78 ----
+ /*
+  *  linux/include/linux/ext3_fs_i.h
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/include/linux/minix_fs_i.h
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+ 
+ #ifndef _LINUX_EXT3_FS_I
+ #define _LINUX_EXT3_FS_I
+ 
+ #include <linux/rwsem.h>
+ 
+ /*
+  * ext3 file system inode data in memory
+  */
+ struct ext3_inode_info {
+ 	__u32	i_data[15];
+ 	__u32	i_flags;		/* flag bits, e.g. EXT3_JOURNAL_DATA_FL */
+ #ifdef EXT3_FRAGMENTS
+ 	__u32	i_faddr;
+ 	__u8	i_frag_no;
+ 	__u8	i_frag_size;
+ 	__u16	unused;			/* formerly i_osync */
+ #endif	/* EXT3_FRAGMENTS */
+ 	__u32	i_file_acl;
+ 	__u32	i_dir_acl;
+ 	__u32	i_dtime;
+ 	__u32	i_block_group;
+ 	__u32	i_state;		/* Dynamic state flags for ext3 */
+ 	__u32	i_next_alloc_block;
+ 	__u32	i_next_alloc_goal;
+ #ifdef EXT3_PREALLOCATE
+ 	__u32	i_prealloc_block;
+ 	__u32	i_prealloc_count;
+ #endif	/* EXT3_PREALLOCATE */
+ 	__u32	i_dir_start_lookup;
+ 	
+ 	struct list_head i_orphan;	/* unlinked but open inodes */
+ 
+ 	/*
+ 	 * i_disksize keeps track of what the inode size is ON DISK, not
+ 	 * in memory.  During truncate, i_size is set to the new size by
+ 	 * the VFS prior to calling ext3_truncate(), but the filesystem won't
+ 	 * set i_disksize to 0 until the truncate is actually under way.
+ 	 *
+ 	 * The intent is that i_disksize always represents the blocks which
+ 	 * are used by this file.  This allows recovery to restart truncate
+ 	 * on orphans if we crash during truncate.  We actually write i_disksize
+ 	 * into the on-disk inode when writing inodes out, instead of i_size.
+ 	 *
+ 	 * The only time when i_disksize and i_size may be different is when
+ 	 * a truncate is in progress.  The only things which change i_disksize
+ 	 * are ext3_get_block (growth) and ext3_truncate (shrinkth).
+ 	 */
+ 	loff_t	i_disksize;
+ 
+ 	/*
+ 	 * truncate_sem is for serialising ext3_truncate() against
+ 	 * ext3_getblock().  In the 2.4 ext2 design, great chunks of inode's
+ 	 * data tree are chopped off during truncate. We can't do that in
+ 	 * ext3 because whenever we perform intermediate commits during
+ 	 * truncate, the inode and all the metadata blocks *must* be in a
+ 	 * consistent state which allows truncation of the orphans to restart
+ 	 * during recovery.  Hence we must fix the get_block-vs-truncate race
+ 	 * by other means, so we have truncate_sem.
+ 	 */
+ 	struct rw_semaphore truncate_sem;
+ };
+ 
+ #endif	/* _LINUX_EXT3_FS_I */
diff -rc2P linux/include/linux/ext3_fs_sb.h linux-2.4.13/include/linux/ext3_fs_sb.h
*** linux/include/linux/ext3_fs_sb.h	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_fs_sb.h	Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,77 ----
+ /*
+  *  linux/include/linux/ext3_fs_sb.h
+  *
+  * Copyright (C) 1992, 1993, 1994, 1995
+  * Remy Card (card@masi.ibp.fr)
+  * Laboratoire MASI - Institut Blaise Pascal
+  * Universite Pierre et Marie Curie (Paris VI)
+  *
+  *  from
+  *
+  *  linux/include/linux/minix_fs_sb.h
+  *
+  *  Copyright (C) 1991, 1992  Linus Torvalds
+  */
+ 
+ #ifndef _LINUX_EXT3_FS_SB
+ #define _LINUX_EXT3_FS_SB
+ 
+ #ifdef __KERNEL__
+ #include <linux/timer.h>
+ #include <linux/wait.h>
+ #endif
+ 
+ /*
+  * The following is not needed anymore since the descriptors buffer
+  * heads are now dynamically allocated
+  */
+ /* #define EXT3_MAX_GROUP_DESC	8 */
+ 
+ #define EXT3_MAX_GROUP_LOADED	8
+ 
+ /*
+  * third extended-fs super-block data in memory
+  */
+ struct ext3_sb_info {
+ 	unsigned long s_frag_size;	/* Size of a fragment in bytes */
+ 	unsigned long s_frags_per_block;/* Number of fragments per block */
+ 	unsigned long s_inodes_per_block;/* Number of inodes per block */
+ 	unsigned long s_frags_per_group;/* Number of fragments in a group */
+ 	unsigned long s_blocks_per_group;/* Number of blocks in a group */
+ 	unsigned long s_inodes_per_group;/* Number of inodes in a group */
+ 	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
+ 	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
+ 	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
+ 	unsigned long s_groups_count;	/* Number of groups in the fs */
+ 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
+ 	struct ext3_super_block * s_es;	/* Pointer to the super block in the buffer */
+ 	struct buffer_head ** s_group_desc;	/* Descriptor buffer heads, dynamically allocated */
+ 	unsigned short s_loaded_inode_bitmaps;
+ 	unsigned short s_loaded_block_bitmaps;
+ 	unsigned long s_inode_bitmap_number[EXT3_MAX_GROUP_LOADED];
+ 	struct buffer_head * s_inode_bitmap[EXT3_MAX_GROUP_LOADED];
+ 	unsigned long s_block_bitmap_number[EXT3_MAX_GROUP_LOADED];
+ 	struct buffer_head * s_block_bitmap[EXT3_MAX_GROUP_LOADED];
+ 	unsigned long  s_mount_opt;
+ 	uid_t s_resuid;
+ 	gid_t s_resgid;
+ 	unsigned short s_mount_state;
+ 	unsigned short s_pad;
+ 	int s_addr_per_block_bits;
+ 	int s_desc_per_block_bits;
+ 	int s_inode_size;
+ 	int s_first_ino;
+ 
+ 	/* Journaling */
+ 	struct inode * s_journal_inode;
+ 	struct journal_s * s_journal;	/* What EXT3_JOURNAL(inode) evaluates to */
+ 	struct list_head s_orphan;
+ 	unsigned long s_commit_interval;
+ 	struct block_device *journal_bdev;
+ #ifdef CONFIG_JBD_DEBUG
+ 	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
+ 	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
+ #endif	/* CONFIG_JBD_DEBUG */
+ };
+ 
+ #endif	/* _LINUX_EXT3_FS_SB */
diff -rc2P linux/include/linux/ext3_jbd.h linux-2.4.13/include/linux/ext3_jbd.h
*** linux/include/linux/ext3_jbd.h	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/ext3_jbd.h	Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,290 ----
+ /*
+  * linux/include/linux/ext3_jbd.h
+  *
+  * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
+  *
+  * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Ext3-specific journaling extensions.
+  */
+ 
+ #ifndef _LINUX_EXT3_JBD_H
+ #define _LINUX_EXT3_JBD_H
+ 
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ 
+ #define EXT3_JOURNAL(inode)	(EXT3_SB((inode)->i_sb)->s_journal)
+ 
+ /* Define the number of blocks we need to account to a transaction to
+  * modify one block of data.
+  * 
+  * We may have to touch one inode, one bitmap buffer, up to three
+  * indirection blocks, the group and superblock summaries, and the data
+  * block to complete the transaction.  */
+ 
+ #define EXT3_SINGLEDATA_TRANS_BLOCKS	8
+ 
+ /* Define the minimum size for a transaction which modifies data.  This
+  * needs to take into account the fact that we may end up modifying two
+  * quota files too (one for the group, one for the user quota).  The
+  * superblock only gets updated once, of course, so don't bother
+  * counting that again for the quota updates. */
+ 
+ #define EXT3_DATA_TRANS_BLOCKS		(3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
+ 
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
+ 
+ /* Delete operations potentially hit one directory's namespace plus an
+  * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
+  * generous.  We can grow the delete transaction later if necessary. */
+ 
+ #define EXT3_DELETE_TRANS_BLOCKS	(2 * EXT3_DATA_TRANS_BLOCKS + 64)
+ 
+ /* Define an arbitrary limit for the amount of data we will anticipate
+  * writing to any given transaction.  For unbounded transactions such as
+  * write(2) and truncate(2) we can write more than this, but we always
+  * start off at the maximum transaction size and grow the transaction
+  * optimistically as we go. */
+ 
+ #define EXT3_MAX_TRANS_DATA		64
+ 
+ /* We break up a large truncate or write transaction once the handle's
+  * buffer credits get this low: we then need either to extend the
+  * transaction or to start a new one.  Reserve enough space here for
+  * inode, bitmap, superblock, group and indirection updates for at least
+  * one block, plus two quota updates.  Quota allocations are not
+  * needed. */
+ 
+ #define EXT3_RESERVE_TRANS_BLOCKS	12
+ 
+ int
+ ext3_mark_iloc_dirty(handle_t *handle, 
+ 		     struct inode *inode,
+ 		     struct ext3_iloc *iloc);
+ 
+ /* 
+  * On success, we end up with an outstanding reference count against
+  * iloc->bh.  This _must_ be cleaned up later. 
+  */
+ 
+ int ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
+ 			struct ext3_iloc *iloc);
+ 
+ int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode);
+ 
+ /*
+  * Wrapper functions with which ext3 calls into JBD.  The intent here is
+  * to allow these to be turned into appropriate stubs so ext3 can control
+  * ext2 filesystems, so ext2+ext3 systems only need one fs.  This work hasn't
+  * been done yet.
+  */
+ 
+ /* Log a failed JBD call made on behalf of 'caller' and abort 'handle'. */
+ static inline void ext3_journal_abort_handle(const char *caller,
+ 					     const char *err_fn,
+ 					     struct buffer_head *bh,
+ 					     handle_t *handle, int err)
+ {
+ 	char nbuf[16];		/* scratch space for ext3_decode_error() */
+ 	const char *errstr = ext3_decode_error(NULL, err, nbuf);
+ 	/* The trailing '\n' terminates the record so later messages start clean. */
+ 	printk(KERN_ERR "%s: aborting transaction: %s in %s\n",
+ 	       caller, errstr, err_fn);
+ 	/* bh is checked before use: only trace when a buffer was supplied. */
+ 	if (bh)
+ 		BUFFER_TRACE(bh, "abort");
+ 	journal_abort_handle(handle);
+ 	if (!handle->h_err)	/* keep an error already recorded on the handle */
+ 		handle->h_err = err;
+ }
+ 
+ static inline int
+ __ext3_journal_get_undo_access(const char *where,
+ 			       handle_t *handle, struct buffer_head *bh)
+ {
+ 	int err = journal_get_undo_access(handle, bh);
+ 	if (err)
+ 		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ 	return err;
+ }
+ 
+ static inline int
+ __ext3_journal_get_write_access(const char *where,
+ 				handle_t *handle, struct buffer_head *bh)
+ {
+ 	int err = journal_get_write_access(handle, bh);
+ 	if (err)
+ 		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ 	return err;
+ }
+ 
+ static inline int
+ __ext3_journal_dirty_data(const char *where,
+ 			  handle_t *handle, struct buffer_head *bh, int async)
+ {
+ 	int err = journal_dirty_data(handle, bh, async);
+ 	if (err)
+ 		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ 	return err;
+ }
+ 
+ static inline void
+ ext3_journal_forget(handle_t *handle, struct buffer_head *bh)
+ {
+ 	journal_forget(handle, bh);	/* void: nothing to check or report */
+ }
+ 
+ static inline int
+ __ext3_journal_revoke(const char *where, handle_t *handle,
+ 		      unsigned long blocknr, struct buffer_head *bh)
+ {
+ 	int err = journal_revoke(handle, blocknr, bh);
+ 	if (err)
+ 		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ 	return err;
+ }
+ 
+ static inline int
+ __ext3_journal_get_create_access(const char *where,
+ 				 handle_t *handle, struct buffer_head *bh)
+ {
+ 	int err = journal_get_create_access(handle, bh);
+ 	if (err)
+ 		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ 	return err;
+ }
+ 
+ static inline int
+ __ext3_journal_dirty_metadata(const char *where,
+ 			      handle_t *handle, struct buffer_head *bh)
+ {
+ 	int err = journal_dirty_metadata(handle, bh);
+ 	if (err)
+ 		ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err);
+ 	return err;
+ }
+ 
+ /* Each macro passes the invoking function's __FUNCTION__ as 'where'. */
+ #define ext3_journal_get_undo_access(handle, bh) \
+ 	__ext3_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_get_write_access(handle, bh) \
+ 	__ext3_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_dirty_data(handle, bh, async) \
+ 	__ext3_journal_dirty_data(__FUNCTION__, (handle), (bh), (async))
+ #define ext3_journal_revoke(handle, blocknr, bh) \
+ 	__ext3_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ #define ext3_journal_get_create_access(handle, bh) \
+ 	__ext3_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ #define ext3_journal_dirty_metadata(handle, bh) \
+ 	__ext3_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+ 
+ 
+ 
+ /* 
+  * Wrappers for journal_start/end.
+  *
+  * The only special thing we need to do here is to make sure that all
+  * journal_end calls result in the superblock being marked dirty, so
+  * that sync() will call the filesystem's write_super callback if
+  * appropriate. 
+  */
+ static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks)
+ {
+ 	/* MS_RDONLY superblock: refuse with ERR_PTR(-EROFS), else ask JBD. */
+ 	return (inode->i_sb->s_flags & MS_RDONLY) ? ERR_PTR(-EROFS) :
+ 		journal_start(EXT3_JOURNAL(inode), nblocks);
+ }
+ 
+ /* Refuse with ERR_PTR(-EROFS) on MS_RDONLY, else use journal_try_start(). */
+ static inline handle_t *ext3_journal_try_start(struct inode *inode, int nblocks)
+ {
+ 	const int read_only = (inode->i_sb->s_flags & MS_RDONLY) != 0;
+ 	return read_only ? ERR_PTR(-EROFS) :
+ 		journal_try_start(EXT3_JOURNAL(inode), nblocks);
+ }
+ 
+ /* 
+  * The only special thing we need to do here is to make sure that all
+  * journal_stop calls result in the superblock being marked dirty, so
+  * that sync() will call the filesystem's write_super callback if
+  * appropriate. 
+  */
+ static inline int __ext3_journal_stop(const char *where,
+ 				      handle_t *handle, struct inode *inode)
+ {
+ 	/* Sample h_err first: handle is not touched after journal_stop(). */
+ 	const int handle_err = handle->h_err;
+ 	const int stop_err = journal_stop(handle);
+ 	/* An error already on the handle wins over journal_stop()'s result. */
+ 	const int result = handle_err ? handle_err : stop_err;
+ 	inode->i_sb->s_dirt = 1;	/* so sync() reaches write_super */
+ 	if (result)
+ 		__ext3_std_error(inode->i_sb, where, result);
+ 	return result;
+ }
+ #define ext3_journal_stop(handle, inode) \
+ 	__ext3_journal_stop(__FUNCTION__, (handle), (inode))
+ 
+ static inline handle_t *ext3_journal_current_handle(void)
+ {
+ 	return journal_current_handle();	/* straight pass-through to JBD */
+ }
+ 
+ static inline void
+ ext3_log_start_commit(journal_t *journal, transaction_t *transaction)
+ {
+ 	log_start_commit(journal, transaction);	/* both arguments forwarded as-is */
+ }
+ 
+ static inline void ext3_log_wait_commit(journal_t *journal, tid_t tid)
+ {
+ 	log_wait_commit(journal, tid);	/* journal and tid forwarded as-is */
+ }
+ 
+ static inline int ext3_journal_extend(handle_t *handle, int nblocks)
+ {
+ 	return journal_extend(handle, nblocks);	/* JBD's result returned unchanged */
+ }
+ 
+ static inline int ext3_journal_restart(handle_t *handle, int nblocks)
+ {
+ 	return journal_restart(handle, nblocks);	/* JBD's result returned unchanged */
+ }
+ 
+ static inline int ext3_journal_blocks_per_page(struct inode *inode)
+ {
+ 	return journal_blocks_per_page(inode);	/* JBD's result returned unchanged */
+ }
+ 
+ static inline int ext3_journal_force_commit(journal_t *journal)
+ {
+ 	return journal_force_commit(journal);	/* JBD's result returned unchanged */
+ }
+ 
+ /* super.c */
+ int ext3_force_commit(struct super_block *sb);
+ 
+ static inline int ext3_should_journal_data(struct inode *inode)
+ {
+ 	/*
+ 	 * 1 for anything that is not a regular file, for a DATA_FLAGS mount
+ 	 * option of EXT3_MOUNT_JOURNAL_DATA, or for EXT3_JOURNAL_DATA_FL.
+ 	 */
+ 	return !S_ISREG(inode->i_mode) ||
+ 	       test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
+ 	       (inode->u.ext3_i.i_flags & EXT3_JOURNAL_DATA_FL) != 0;
+ }
+ /* 1 only when the DATA_FLAGS mount option is EXT3_MOUNT_ORDERED_DATA. */
+ static inline int ext3_should_order_data(struct inode *inode)
+ {
+ 	return test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA;
+ }
+ 
+ 
+ #endif	/* _LINUX_EXT3_JBD_H */
diff -rc2P linux/include/linux/fs.h linux-2.4.13/include/linux/fs.h
*** linux/include/linux/fs.h	Fri Nov  9 16:15:08 2001
--- linux-2.4.13/include/linux/fs.h	Fri Nov  9 16:58:00 2001
***************
*** 22,25 ****
--- 22,26 ----
  #include <linux/stddef.h>
  #include <linux/string.h>
+ #include <linux/buffer-trace.h>
  
  #include <asm/atomic.h>
***************
*** 219,222 ****
--- 220,224 ----
  	BH_Wait_IO,	/* 1 if we should write out this buffer */
  	BH_launder,	/* 1 if we should throttle on this buffer */
+ 	BH_JBD,		/* 1 if it has an attached journal_head */
  
  	BH_PrivateStart,/* not a state bit, but the first bit available
***************
*** 265,268 ****
--- 267,274 ----
  	struct inode *	     b_inode;
  	struct list_head     b_inode_buffers;	/* doubly linked list of inode dirty buffers */
+ 
+ #ifdef CONFIG_BUFFER_DEBUG
+ 	struct buffer_history b_history;
+ #endif
  };
  
***************
*** 290,293 ****
--- 296,300 ----
  #include <linux/minix_fs_i.h>
  #include <linux/ext2_fs_i.h>
+ #include <linux/ext3_fs_i.h>
  #include <linux/hpfs_fs_i.h>
  #include <linux/ntfs_fs_i.h>
***************
*** 380,387 ****
--- 387,400 ----
  	int (*readpage)(struct file *, struct page *);
  	int (*sync_page)(struct page *);
+ 	/*
+ 	 * ext3 requires that a successful prepare_write() call be followed
+ 	 * by a commit_write() call - they must be balanced
+ 	 */
  	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
  	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
  	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
  	int (*bmap)(struct address_space *, long);
+ 	int (*flushpage) (struct page *, unsigned long);
+ 	int (*releasepage) (struct page *, int);
  #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
  	int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
***************
*** 445,448 ****
--- 458,462 ----
  	unsigned long		i_version;
  	struct semaphore	i_sem;
+ 	struct rw_semaphore	i_truncate_sem;	/* Nests inside i_sem */
  	struct semaphore	i_zombie;
  	struct inode_operations	*i_op;
***************
*** 474,477 ****
--- 488,492 ----
  		struct minix_inode_info		minix_i;
  		struct ext2_inode_info		ext2_i;
+ 		struct ext3_inode_info		ext3_i;
  		struct hpfs_inode_info		hpfs_i;
  		struct ntfs_inode_info		ntfs_i;
***************
*** 662,665 ****
--- 677,681 ----
  #include <linux/minix_fs_sb.h>
  #include <linux/ext2_fs_sb.h>
+ #include <linux/ext3_fs_sb.h>
  #include <linux/hpfs_fs_sb.h>
  #include <linux/ntfs_fs_sb.h>
***************
*** 718,721 ****
--- 734,738 ----
  		struct minix_sb_info	minix_sb;
  		struct ext2_sb_info	ext2_sb;
+ 		struct ext3_sb_info	ext3_sb;
  		struct hpfs_sb_info	hpfs_sb;
  		struct ntfs_sb_info	ntfs_sb;
***************
*** 1091,1094 ****
--- 1108,1112 ----
  extern int try_to_free_buffers(struct page *, unsigned int);
  extern void refile_buffer(struct buffer_head * buf);
+ extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
  extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
  
***************
*** 1132,1135 ****
--- 1150,1157 ----
  static inline void mark_buffer_clean(struct buffer_head * bh)
  {
+ #if defined(CONFIG_JBD_DEBUG)
+ 	extern void jbd_preclean_buffer_check(struct buffer_head *);
+ 	jbd_preclean_buffer_check(bh); /* @@@ Expensive debugging */
+ #endif
  	if (atomic_set_buffer_clean(bh))
  		__mark_buffer_clean(bh);
***************
*** 1173,1176 ****
--- 1195,1199 ----
  }
  
+ extern void set_buffer_flushtime(struct buffer_head *);
  extern void balance_dirty(void);
  extern int check_disk_change(kdev_t);
***************
*** 1352,1355 ****
--- 1375,1380 ----
  extern struct buffer_head * bread(kdev_t, int, int);
  extern void wakeup_bdflush(void);
+ extern void put_unused_buffer_head(struct buffer_head * bh);
+ extern struct buffer_head * get_unused_buffer_head(int async);
  
  extern int brw_page(int, struct page *, kdev_t, int [], int);
***************
*** 1358,1361 ****
--- 1383,1387 ----
  
  /* Generic buffer handling for block filesystems.. */
+ extern int try_to_release_page(struct page * page, int gfp_mask);
  extern int discard_bh_page(struct page *, unsigned long, int);
  #define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
diff -rc2P linux/include/linux/fs.h.orig linux-2.4.13/include/linux/fs.h.orig
*** linux/include/linux/fs.h.orig	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/fs.h.orig	Fri Nov  9 16:15:08 2001
***************
*** 0 ****
--- 1,1569 ----
+ #ifndef _LINUX_FS_H
+ #define _LINUX_FS_H
+ 
+ /*
+  * This file has definitions for some important file table
+  * structures etc.
+  */
+ 
+ #include <linux/config.h>
+ #include <linux/linkage.h>
+ #include <linux/limits.h>
+ #include <linux/wait.h>
+ #include <linux/types.h>
+ #include <linux/vfs.h>
+ #include <linux/net.h>
+ #include <linux/kdev_t.h>
+ #include <linux/ioctl.h>
+ #include <linux/list.h>
+ #include <linux/dcache.h>
+ #include <linux/stat.h>
+ #include <linux/cache.h>
+ #include <linux/stddef.h>
+ #include <linux/string.h>
+ 
+ #include <asm/atomic.h>
+ #include <asm/bitops.h>
+ 
+ struct poll_table_struct;
+ 
+ 
+ /*
+  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
+  * the file limit at runtime and only root can increase the per-process
+  * nr_file rlimit, so it's safe to set up a ridiculously high absolute
+  * upper limit on files-per-process.
+  *
+  * Some programs (notably those using select()) may have to be 
+  * recompiled to take full advantage of the new limits..  
+  */
+ 
+ /* Fixed constants first: */
+ #undef NR_OPEN
+ #define NR_OPEN (1024*1024)	/* Absolute upper limit on fd num */
+ #define INR_OPEN 1024		/* Initial setting for nfile rlimits */
+ 
+ #define BLOCK_SIZE_BITS 10
+ #define BLOCK_SIZE (1<<BLOCK_SIZE_BITS)
+ 
+ /* And dynamically-tunable limits and defaults: */
+ struct files_stat_struct {
+ 	int nr_files;		/* read only */
+ 	int nr_free_files;	/* read only */
+ 	int max_files;		/* tunable */
+ };
+ extern struct files_stat_struct files_stat;
+ 
+ struct inodes_stat_t {
+ 	int nr_inodes;
+ 	int nr_unused;
+ 	int dummy[5];
+ };
+ extern struct inodes_stat_t inodes_stat;
+ 
+ extern int leases_enable, dir_notify_enable, lease_break_time;
+ 
+ #define NR_FILE  8192	/* this can well be larger on a larger system */
+ #define NR_RESERVED_FILES 10 /* reserved for root */
+ #define NR_SUPER 256
+ 
+ #define MAY_EXEC 1
+ #define MAY_WRITE 2
+ #define MAY_READ 4
+ 
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
+ 
+ #define READ 0
+ #define WRITE 1
+ #define READA 2		/* read-ahead  - don't block if no resources */
+ #define SPECIAL 4	/* For non-blockdevice requests in request queue */
+ 
+ #define SEL_IN		1
+ #define SEL_OUT		2
+ #define SEL_EX		4
+ 
+ /* public flags for file_system_type */
+ #define FS_REQUIRES_DEV 1 
+ #define FS_NO_DCACHE	2 /* Only dcache the necessary things. */
+ #define FS_NO_PRELIM	4 /* prevent preloading of dentries, even if
+ 			   * FS_NO_DCACHE is not set.
+ 			   */
+ #define FS_SINGLE	8 /* Filesystem that can have only one superblock */
+ #define FS_NOMOUNT	16 /* Never mount from userland */
+ #define FS_LITTER	32 /* Keeps the tree in dcache */
+ #define FS_ODD_RENAME	32768	/* Temporary stuff; will go away as soon
+ 				  * as nfs_rename() will be cleaned up
+ 				  */
+ /*
+  * These are the fs-independent mount-flags: up to 32 flags are supported
+  */
+ #define MS_RDONLY	 1	/* Mount read-only */
+ #define MS_NOSUID	 2	/* Ignore suid and sgid bits */
+ #define MS_NODEV	 4	/* Disallow access to device special files */
+ #define MS_NOEXEC	 8	/* Disallow program execution */
+ #define MS_SYNCHRONOUS	16	/* Writes are synced at once */
+ #define MS_REMOUNT	32	/* Alter flags of a mounted FS */
+ #define MS_MANDLOCK	64	/* Allow mandatory locks on an FS */
+ #define MS_NOATIME	1024	/* Do not update access times. */
+ #define MS_NODIRATIME	2048	/* Do not update directory access times */
+ #define MS_BIND		4096
+ #define MS_REC		16384
+ #define MS_VERBOSE	32768
+ #define MS_NOUSER	(1<<31)
+ 
+ /*
+  * Superblock flags that can be altered by MS_REMOUNT
+  */
+ #define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_NOATIME|\
+ 			 MS_NODIRATIME)
+ 
+ /*
+  * Old magic mount flag and mask
+  */
+ #define MS_MGC_VAL 0xC0ED0000
+ #define MS_MGC_MSK 0xffff0000
+ 
+ /* Inode flags - they have nothing to do with superblock flags now */
+ 
+ #define S_SYNC			1	/* Writes are synced at once */
+ #define S_NOATIME		2	/* Do not update access times */
+ #define S_QUOTA			4	/* Quota initialized for file */
+ #define S_APPEND		8	/* Append-only file */
+ #define S_IMMUTABLE_FILE	16	/* Immutable file */
+ #define S_DEAD			32	/* removed, but still open directory */
+ #define S_NOQUOTA		64	/* Inode is not counted to quota */
+ #define S_IMMUTABLE_LINK	128	/* Immutable links */
+ 
+ /*
+  * Note that nosuid etc flags are inode-specific: setting some file-system
+  * flags just means all the inodes inherit those flags by default. It might be
+  * possible to override it selectively if you really wanted to with some
+  * ioctl() that is not currently implemented.
+  *
+  * Exception: MS_RDONLY is always applied to the entire file system.
+  *
+  * Unfortunately, it is possible to change a filesystems flags with it mounted
+  * with files in use.  This means that all of the inodes will not have their
+  * i_flags updated.  Hence, i_flags no longer inherit the superblock mount
+  * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
+  */
+ #define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg))
+ 
+ #define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
+ #define IS_SYNC(inode)		(__IS_FLG(inode, MS_SYNCHRONOUS) || ((inode)->i_flags & S_SYNC))
+ #define IS_MANDLOCK(inode)	__IS_FLG(inode, MS_MANDLOCK)
+ 
+ #define IS_QUOTAINIT(inode)	((inode)->i_flags & S_QUOTA)
+ #define IS_NOQUOTA(inode)	((inode)->i_flags & S_NOQUOTA)
+ #define IS_APPEND(inode)	((inode)->i_flags & S_APPEND)
+ #define IS_IMMUTABLE_FILE(inode)	((inode)->i_flags & S_IMMUTABLE_FILE)
+ #define IS_IMMUTABLE_LINK(inode) ((((inode)->i_flags & S_IMMUTABLE_FILE) << 3) ^ ((inode)->i_flags & S_IMMUTABLE_LINK) )
+ #define IS_NOATIME(inode)	(__IS_FLG(inode, MS_NOATIME) || ((inode)->i_flags & S_NOATIME))
+ #define IS_NODIRATIME(inode)	__IS_FLG(inode, MS_NODIRATIME)
+ 
+ #define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
+ 
+ /* the read-only stuff doesn't really belong here, but any other place is
+    probably as bad and I don't want to create yet another include file. */
+ 
+ #define BLKROSET   _IO(0x12,93)	/* set device read-only (0 = read-write) */
+ #define BLKROGET   _IO(0x12,94)	/* get read-only status (0 = read_write) */
+ #define BLKRRPART  _IO(0x12,95)	/* re-read partition table */
+ #define BLKGETSIZE _IO(0x12,96)	/* return device size /512 (long *arg) */
+ #define BLKFLSBUF  _IO(0x12,97)	/* flush buffer cache */
+ #define BLKRASET   _IO(0x12,98)	/* Set read ahead for block device */
+ #define BLKRAGET   _IO(0x12,99)	/* get current read ahead setting */
+ #define BLKFRASET  _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */
+ #define BLKFRAGET  _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */
+ #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
+ #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
+ #define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
+ #if 0
+ #define BLKPG      _IO(0x12,105)/* See blkpg.h */
+ #define BLKELVGET  _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))/* elevator get */
+ #define BLKELVSET  _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))/* elevator set */
+ /* This was here just to show that the number is taken -
+    probably all these _IO(0x12,*) ioctls should be moved to blkpg.h. */
+ #endif
+ /* A jump here: 108-111 have been used for various private purposes. */
+ #define BLKBSZGET  _IOR(0x12,112,sizeof(int))
+ #define BLKBSZSET  _IOW(0x12,113,sizeof(int))
+ #define BLKGETSIZE64 _IOR(0x12,114,sizeof(u64))	/* return device size in bytes (u64 *arg) */
+ 
+ #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
+ #define FIBMAP	   _IO(0x00,1)	/* bmap access */
+ #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
+ 
+ #ifdef __KERNEL__
+ 
+ #include <asm/semaphore.h>
+ #include <asm/byteorder.h>
+ 
+ extern void update_atime (struct inode *);
+ #define UPDATE_ATIME(inode) update_atime (inode)
+ 
+ extern void buffer_init(unsigned long);
+ extern void inode_init(unsigned long);
+ extern void mnt_init(unsigned long);
+ 
+ /* bh state bits */
+ enum bh_state_bits {
+ 	BH_Uptodate,	/* 1 if the buffer contains valid data */
+ 	BH_Dirty,	/* 1 if the buffer is dirty */
+ 	BH_Lock,	/* 1 if the buffer is locked */
+ 	BH_Req,		/* 0 if the buffer has been invalidated */
+ 	BH_Mapped,	/* 1 if the buffer has a disk mapping */
+ 	BH_New,		/* 1 if the buffer is new and not yet written out */
+ 	BH_Async,	/* 1 if the buffer is under end_buffer_io_async I/O */
+ 	BH_Wait_IO,	/* 1 if we should write out this buffer */
+ 	BH_launder,	/* 1 if we should throttle on this buffer */
+ 
+ 	BH_PrivateStart,/* not a state bit, but the first bit available
+ 			 * for private allocation by other entities
+ 			 */
+ };
+ 
+ /*
+  * Try to keep the most commonly used fields in single cache lines (16
+  * bytes) to improve performance.  This ordering should be
+  * particularly beneficial on 32-bit processors.
+  * 
+  * We use the first 16 bytes for the data which is used in searches
+  * over the block hash lists (ie. getblk() and friends).
+  * 
+  * The second 16 bytes we use for lru buffer scans, as used by
+  * sync_buffers() and refill_freelist().  -- sct
+  */
+ struct buffer_head {
+ 	/* First cache line: */
+ 	struct buffer_head *b_next;	/* Hash queue list */
+ 	unsigned long b_blocknr;	/* block number */
+ 	unsigned short b_size;		/* block size */
+ 	unsigned short b_list;		/* List that this buffer appears */
+ 	kdev_t b_dev;			/* device (B_FREE = free) */
+ 
+ 	atomic_t b_count;		/* users using this block */
+ 	kdev_t b_rdev;			/* Real device */
+ 	unsigned long b_state;		/* buffer state bitmap (see above) */
+ 	unsigned long b_flushtime;	/* Time when (dirty) buffer should be written */
+ 
+ 	struct buffer_head *b_next_free;/* lru/free list linkage */
+ 	struct buffer_head *b_prev_free;/* doubly linked list of buffers */
+ 	struct buffer_head *b_this_page;/* circular list of buffers in one page */
+ 	struct buffer_head *b_reqnext;	/* request queue */
+ 
+ 	struct buffer_head **b_pprev;	/* doubly linked list of hash-queue */
+ 	char * b_data;			/* pointer to data block */
+ 	struct page *b_page;		/* the page this bh is mapped to */
+ 	void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
+  	void *b_private;		/* reserved for b_end_io */
+ 
+ 	unsigned long b_rsector;	/* Real buffer location on disk */
+ 	wait_queue_head_t b_wait;
+ 
+ 	struct inode *	     b_inode;
+ 	struct list_head     b_inode_buffers;	/* doubly linked list of inode dirty buffers */
+ };
+ 
+ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate);
+ void init_buffer(struct buffer_head *, bh_end_io_t *, void *);
+ 
+ #define __buffer_state(bh, state)	(((bh)->b_state & (1UL << BH_##state)) != 0)
+ 
+ #define buffer_uptodate(bh)	__buffer_state(bh,Uptodate)
+ #define buffer_dirty(bh)	__buffer_state(bh,Dirty)
+ #define buffer_locked(bh)	__buffer_state(bh,Lock)
+ #define buffer_req(bh)		__buffer_state(bh,Req)
+ #define buffer_mapped(bh)	__buffer_state(bh,Mapped)
+ #define buffer_new(bh)		__buffer_state(bh,New)
+ #define buffer_async(bh)	__buffer_state(bh,Async)
+ 
+ #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
+ 
+ extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset);
+ 
+ #define touch_buffer(bh)	mark_page_accessed(bh->b_page)
+ 
+ 
+ #include <linux/pipe_fs_i.h>
+ #include <linux/minix_fs_i.h>
+ #include <linux/ext2_fs_i.h>
+ #include <linux/hpfs_fs_i.h>
+ #include <linux/ntfs_fs_i.h>
+ #include <linux/msdos_fs_i.h>
+ #include <linux/umsdos_fs_i.h>
+ #include <linux/iso_fs_i.h>
+ #include <linux/nfs_fs_i.h>
+ #include <linux/sysv_fs_i.h>
+ #include <linux/affs_fs_i.h>
+ #include <linux/ufs_fs_i.h>
+ #include <linux/efs_fs_i.h>
+ #include <linux/coda_fs_i.h>
+ #include <linux/romfs_fs_i.h>
+ #include <linux/shmem_fs.h>
+ #include <linux/smb_fs_i.h>
+ #include <linux/hfs_fs_i.h>
+ #include <linux/adfs_fs_i.h>
+ #include <linux/qnx4_fs_i.h>
+ #include <linux/reiserfs_fs_i.h>
+ #include <linux/bfs_fs_i.h>
+ #include <linux/udf_fs_i.h>
+ #include <linux/ncp_fs_i.h>
+ #include <linux/proc_fs_i.h>
+ #include <linux/usbdev_fs_i.h>
+ #include <linux/jffs2_fs_i.h>
+ #include <linux/cramfs_fs_sb.h>
+ 
+ /*
+  * Attribute flags.  These should be or-ed together to figure out what
+  * has been changed!
+  */
+ #define ATTR_MODE	1
+ #define ATTR_UID	2
+ #define ATTR_GID	4
+ #define ATTR_SIZE	8
+ #define ATTR_ATIME	16
+ #define ATTR_MTIME	32
+ #define ATTR_CTIME	64
+ #define ATTR_ATIME_SET	128
+ #define ATTR_MTIME_SET	256
+ #define ATTR_FORCE	512	/* Not a change, but a change it */
+ #define ATTR_ATTR_FLAG	1024
+ 
+ /*
+  * This is the Inode Attributes structure, used for notify_change().  It
+  * uses the above definitions as flags, to know which values have changed.
+  * Also, in this manner, a Filesystem can look at only the values it cares
+  * about.  Basically, these are the attributes that the VFS layer can
+  * request to change from the FS layer.
+  *
+  * Derek Atkins <warlord@MIT.EDU> 94-10-20
+  */
+ struct iattr {
+ 	unsigned int	ia_valid;	/* ATTR_* bits: which values have changed */
+ 	umode_t		ia_mode;
+ 	uid_t		ia_uid;
+ 	gid_t		ia_gid;
+ 	loff_t		ia_size;
+ 	time_t		ia_atime;
+ 	time_t		ia_mtime;
+ 	time_t		ia_ctime;
+ 	unsigned int	ia_attr_flags;	/* presumably ATTR_FLAG_* bits - confirm */
+ };
+ 
+ /*
+  * This is the inode attributes flag definitions
+  */
+ #define ATTR_FLAG_SYNCRONOUS		1 	/* Synchronous write */
+ #define ATTR_FLAG_NOATIME		2 	/* Don't update atime */
+ #define ATTR_FLAG_APPEND		4 	/* Append-only file */
+ #define ATTR_FLAG_IMMUTABLE_FILE	8 	/* Immutable file */
+ #define ATTR_FLAG_NODIRATIME		16 	/* Don't update atime for directory */
+ #define ATTR_FLAG_IMMUTABLE_LINK	32 	/* Immutable link */
+ 
+ /*
+  * Includes for diskquotas and mount structures.
+  */
+ #include <linux/quota.h>
+ #include <linux/mount.h>
+ 
+ /*
+  * oh the beauties of C type declarations.
+  */
+ struct page;
+ struct address_space;
+ struct kiobuf;
+ 
+ struct address_space_operations {
+ 	int (*writepage)(struct page *);
+ 	int (*readpage)(struct file *, struct page *);
+ 	int (*sync_page)(struct page *);
+ 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
+ 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+ 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
+ 	int (*bmap)(struct address_space *, long);
+ #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */
+ 	int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int);
+ };
+ 
+ struct address_space {
+ 	struct list_head	clean_pages;	/* list of clean pages */
+ 	struct list_head	dirty_pages;	/* list of dirty pages */
+ 	struct list_head	locked_pages;	/* list of locked pages */
+ 	unsigned long		nrpages;	/* number of total pages */
+ 	struct address_space_operations *a_ops;	/* methods */
+ 	struct inode		*host;		/* owner: inode, block_device */
+ 	struct vm_area_struct	*i_mmap;	/* list of private mappings */
+ 	struct vm_area_struct	*i_mmap_shared; /* list of shared mappings */
+ 	spinlock_t		i_shared_lock;  /* and spinlock protecting it */
+ 	int			gfp_mask;	/* how to allocate the pages */
+ };
+ 
+ struct char_device {
+ 	struct list_head	hash;
+ 	atomic_t		count;
+ 	dev_t			dev;
+ 	atomic_t		openers;
+ 	struct semaphore	sem;
+ };
+ 
+ struct block_device {
+ 	struct list_head	bd_hash;
+ 	atomic_t		bd_count;
+ 	struct inode *		bd_inode;
+ 	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
+ 	int			bd_openers;
+ 	const struct block_device_operations *bd_op;
+ 	struct semaphore	bd_sem;	/* open/close mutex */
+ 	struct list_head	bd_inodes;
+ };
+ 
+ struct inode {
+ 	struct list_head	i_hash;
+ 	struct list_head	i_list;
+ 	struct list_head	i_dentry;
+ 	
+ 	struct list_head	i_dirty_buffers;
+ 	struct list_head	i_dirty_data_buffers;
+ 
+ 	unsigned long		i_ino;
+ 	atomic_t		i_count;
+ 	kdev_t			i_dev;
+ 	umode_t			i_mode;
+ 	nlink_t			i_nlink;
+ 	uid_t			i_uid;
+ 	gid_t			i_gid;
+ 	kdev_t			i_rdev;
+ 	loff_t			i_size;
+ 	time_t			i_atime;
+ 	time_t			i_mtime;
+ 	time_t			i_ctime;
+ 	unsigned int		i_blkbits;
+ 	unsigned long		i_blksize;
+ 	unsigned long		i_blocks;
+ 	unsigned long		i_version;
+ 	struct semaphore	i_sem;
+ 	struct semaphore	i_zombie;
+ 	struct inode_operations	*i_op;
+ 	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
+ 	struct super_block	*i_sb;
+ 	wait_queue_head_t	i_wait;
+ 	struct file_lock	*i_flock;
+ 	struct address_space	*i_mapping;
+ 	struct address_space	i_data;
+ 	struct dquot		*i_dquot[MAXQUOTAS];
+ 	/* These three should probably be a union */
+ 	struct list_head	i_devices;
+ 	struct pipe_inode_info	*i_pipe;
+ 	struct block_device	*i_bdev;
+ 	struct char_device	*i_cdev;
+ 
+ 	unsigned long		i_dnotify_mask; /* Directory notify events */
+ 	struct dnotify_struct	*i_dnotify; /* for directory notifications */
+ 
+ 	unsigned long		i_state;
+ 
+ 	unsigned int		i_flags;
+ 	unsigned char		i_sock;
+ 
+ 	atomic_t		i_writecount;
+ 	unsigned int		i_attr_flags;
+ 	__u32			i_generation;
+ 	union {
+ 		struct minix_inode_info		minix_i;
+ 		struct ext2_inode_info		ext2_i;
+ 		struct hpfs_inode_info		hpfs_i;
+ 		struct ntfs_inode_info		ntfs_i;
+ 		struct msdos_inode_info		msdos_i;
+ 		struct umsdos_inode_info	umsdos_i;
+ 		struct iso_inode_info		isofs_i;
+ 		struct nfs_inode_info		nfs_i;
+ 		struct sysv_inode_info		sysv_i;
+ 		struct affs_inode_info		affs_i;
+ 		struct ufs_inode_info		ufs_i;
+ 		struct efs_inode_info		efs_i;
+ 		struct romfs_inode_info		romfs_i;
+ 		struct shmem_inode_info		shmem_i;
+ 		struct coda_inode_info		coda_i;
+ 		struct smb_inode_info		smbfs_i;
+ 		struct hfs_inode_info		hfs_i;
+ 		struct adfs_inode_info		adfs_i;
+ 		struct qnx4_inode_info		qnx4_i;
+ 		struct reiserfs_inode_info	reiserfs_i;
+ 		struct bfs_inode_info		bfs_i;
+ 		struct udf_inode_info		udf_i;
+ 		struct ncp_inode_info		ncpfs_i;
+ 		struct proc_inode_info		proc_i;
+ 		struct socket			socket_i;
+ 		struct usbdev_inode_info        usbdev_i;
+ 		struct jffs2_inode_info		jffs2_i;
+ 		void				*generic_ip;
+ 	} u;
+ };
+ 
+ struct fown_struct {
+ 	int pid;		/* pid or -pgrp where SIGIO should be sent */
+ 	uid_t uid, euid;	/* uid/euid of process setting the owner */
+ 	int signum;		/* posix.1b rt signal to be delivered on IO */
+ };
+ 
+ struct file {
+ 	struct list_head	f_list;
+ 	struct dentry		*f_dentry;
+ 	struct vfsmount         *f_vfsmnt;
+ 	struct file_operations	*f_op;
+ 	atomic_t		f_count;
+ 	unsigned int 		f_flags;
+ 	mode_t			f_mode;
+ 	loff_t			f_pos;
+ 	unsigned long 		f_reada, f_ramax, f_raend, f_ralen, f_rawin;
+ 	struct fown_struct	f_owner;
+ 	unsigned int		f_uid, f_gid;
+ 	int			f_error;
+ 
+ 	unsigned long		f_version;
+ 
+ 	/* needed for tty driver, and maybe others */
+ 	void			*private_data;
+ 
+ 	/* preallocated helper kiobuf to speedup O_DIRECT */
+ 	struct kiobuf		*f_iobuf;
+ 	long			f_iobuf_lock;
+ };
+ extern spinlock_t files_lock;	/* taken and released through the two macros below */
+ #define file_list_lock() spin_lock(&files_lock)	/* no ';' in the expansion: the caller supplies it */
+ #define file_list_unlock() spin_unlock(&files_lock)	/* likewise, so "if (x) file_list_unlock(); else ..." parses */
+ 
+ #define get_file(x)	atomic_inc(&(x)->f_count)
+ #define file_count(x)	atomic_read(&(x)->f_count)
+ 
+ extern int init_private_file(struct file *, struct dentry *, int);
+ 
+ #define	MAX_NON_LFS	((1UL<<31) - 1)
+ 
+ #define FL_POSIX	1
+ #define FL_FLOCK	2
+ #define FL_BROKEN	4	/* broken flock() emulation */
+ #define FL_ACCESS	8	/* for processes suspended by mandatory locking */
+ #define FL_LOCKD	16	/* lock held by rpc.lockd */
+ #define FL_LEASE	32	/* lease held on this file */
+ 
+ /*
+  * The POSIX file lock owner is determined by
+  * the "struct files_struct" in the thread group
+  * (or NULL for no owner - BSD locks).
+  *
+  * Lockd stuffs a "host" pointer into this.
+  */
+ typedef struct files_struct *fl_owner_t;
+ 
+ struct file_lock {
+ 	struct file_lock *fl_next;	/* singly linked list for this inode  */
+ 	struct list_head fl_link;	/* doubly linked list of all locks */
+ 	struct list_head fl_block;	/* circular list of blocked processes */
+ 	fl_owner_t fl_owner;
+ 	unsigned int fl_pid;
+ 	wait_queue_head_t fl_wait;
+ 	struct file *fl_file;
+ 	unsigned char fl_flags;
+ 	unsigned char fl_type;
+ 	loff_t fl_start;
+ 	loff_t fl_end;
+ 
+ 	void (*fl_notify)(struct file_lock *);	/* unblock callback */
+ 	void (*fl_insert)(struct file_lock *);	/* lock insertion callback */
+ 	void (*fl_remove)(struct file_lock *);	/* lock removal callback */
+ 
+ 	struct fasync_struct *	fl_fasync; /* for lease break notifications */
+ 
+ 	union {
+ 		struct nfs_lock_info	nfs_fl;
+ 	} fl_u;
+ };
+ 
+ /* The following constant reflects the upper bound of the file/locking space */
+ #ifndef OFFSET_MAX
+ #define INT_LIMIT(x)	(~((x)1 << (sizeof(x)*8 - 1)))
+ #define OFFSET_MAX	INT_LIMIT(loff_t)
+ #define OFFT_OFFSET_MAX	INT_LIMIT(off_t)
+ #endif
+ 
+ extern struct list_head file_lock_list;
+ 
+ #include <linux/fcntl.h>
+ 
+ extern int fcntl_getlk(unsigned int, struct flock *);
+ extern int fcntl_setlk(unsigned int, unsigned int, struct flock *);
+ 
+ extern int fcntl_getlk64(unsigned int, struct flock64 *);
+ extern int fcntl_setlk64(unsigned int, unsigned int, struct flock64 *);
+ 
+ /* fs/locks.c */
+ extern void locks_init_lock(struct file_lock *);
+ extern void locks_copy_lock(struct file_lock *, struct file_lock *);
+ extern void locks_remove_posix(struct file *, fl_owner_t);
+ extern void locks_remove_flock(struct file *);
+ extern struct file_lock *posix_test_lock(struct file *, struct file_lock *);
+ extern int posix_lock_file(struct file *, struct file_lock *, unsigned int);
+ extern void posix_block_lock(struct file_lock *, struct file_lock *);
+ extern void posix_unblock_lock(struct file_lock *);
+ extern int posix_locks_deadlock(struct file_lock *, struct file_lock *);
+ extern int __get_lease(struct inode *inode, unsigned int flags);
+ extern time_t lease_get_mtime(struct inode *);
+ extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
+ extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
+ 
+ struct fasync_struct {
+ 	int	magic;
+ 	int	fa_fd;
+ 	struct	fasync_struct	*fa_next; /* singly linked list */
+ 	struct	file 		*fa_file;
+ };
+ 
+ #define FASYNC_MAGIC 0x4601
+ 
+ /* SMP safe fasync helpers: */
+ extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
+ /* can be called from interrupts */
+ extern void kill_fasync(struct fasync_struct **, int, int);
+ /* only for net: no internal synchronization */
+ extern void __kill_fasync(struct fasync_struct *, int, int);
+ 
+ struct nameidata {
+ 	struct dentry *dentry;
+ 	struct vfsmount *mnt;
+ 	struct qstr last;
+ 	unsigned int flags;
+ 	int last_type;
+ };
+ 
+ #define DQUOT_USR_ENABLED	0x01		/* User diskquotas enabled */
+ #define DQUOT_GRP_ENABLED	0x02		/* Group diskquotas enabled */
+ 
+ struct quota_mount_options
+ {
+ 	unsigned int flags;			/* Flags for diskquotas on this device */
+ 	struct semaphore dqio_sem;		/* lock device while I/O in progress */
+ 	struct semaphore dqoff_sem;		/* serialize quota_off() and quota_on() on device */
+ 	struct file *files[MAXQUOTAS];		/* fp's to quotafiles */
+ 	time_t inode_expire[MAXQUOTAS];		/* expiretime for inode-quota */
+ 	time_t block_expire[MAXQUOTAS];		/* expiretime for block-quota */
+ 	char rsquash[MAXQUOTAS];		/* for quotas treat root as any other user */
+ };
+ 
+ /*
+  *	Umount options
+  */
+ 
+ #define MNT_FORCE	0x00000001	/* Attempt to forcibly umount */
+ #define MNT_DETACH	0x00000002	/* Just detach from the tree */
+ 
+ #include <linux/minix_fs_sb.h>
+ #include <linux/ext2_fs_sb.h>
+ #include <linux/hpfs_fs_sb.h>
+ #include <linux/ntfs_fs_sb.h>
+ #include <linux/msdos_fs_sb.h>
+ #include <linux/iso_fs_sb.h>
+ #include <linux/nfs_fs_sb.h>
+ #include <linux/sysv_fs_sb.h>
+ #include <linux/affs_fs_sb.h>
+ #include <linux/ufs_fs_sb.h>
+ #include <linux/efs_fs_sb.h>
+ #include <linux/romfs_fs_sb.h>
+ #include <linux/smb_fs_sb.h>
+ #include <linux/hfs_fs_sb.h>
+ #include <linux/adfs_fs_sb.h>
+ #include <linux/qnx4_fs_sb.h>
+ #include <linux/reiserfs_fs_sb.h>
+ #include <linux/bfs_fs_sb.h>
+ #include <linux/udf_fs_sb.h>
+ #include <linux/ncp_fs_sb.h>
+ #include <linux/usbdev_fs_sb.h>
+ #include <linux/cramfs_fs_sb.h>
+ #include <linux/jffs2_fs_sb.h>
+ 
+ extern struct list_head super_blocks;
+ extern spinlock_t sb_lock;
+ 
+ #define sb_entry(list)	list_entry((list), struct super_block, s_list)
+ #define S_BIAS (1<<30)
+ struct super_block {
+ 	struct list_head	s_list;		/* Keep this first */
+ 	kdev_t			s_dev;
+ 	unsigned long		s_blocksize;
+ 	unsigned char		s_blocksize_bits;
+ 	unsigned char		s_dirt;
+ 	unsigned long long	s_maxbytes;	/* Max file size */
+ 	struct file_system_type	*s_type;
+ 	struct super_operations	*s_op;
+ 	struct dquot_operations	*dq_op;
+ 	unsigned long		s_flags;
+ 	unsigned long		s_magic;
+ 	struct dentry		*s_root;
+ 	struct rw_semaphore	s_umount;
+ 	struct semaphore	s_lock;
+ 	int			s_count;
+ 	atomic_t		s_active;
+ 
+ 	struct list_head	s_dirty;	/* dirty inodes */
+ 	struct list_head	s_locked_inodes;/* inodes being synced */
+ 	struct list_head	s_files;
+ 
+ 	struct block_device	*s_bdev;
+ 	struct list_head	s_instances;
+ 	struct quota_mount_options s_dquot;	/* Diskquota specific options */
+ 
+ 	union {
+ 		struct minix_sb_info	minix_sb;
+ 		struct ext2_sb_info	ext2_sb;
+ 		struct hpfs_sb_info	hpfs_sb;
+ 		struct ntfs_sb_info	ntfs_sb;
+ 		struct msdos_sb_info	msdos_sb;
+ 		struct isofs_sb_info	isofs_sb;
+ 		struct nfs_sb_info	nfs_sb;
+ 		struct sysv_sb_info	sysv_sb;
+ 		struct affs_sb_info	affs_sb;
+ 		struct ufs_sb_info	ufs_sb;
+ 		struct efs_sb_info	efs_sb;
+ 		struct shmem_sb_info	shmem_sb;
+ 		struct romfs_sb_info	romfs_sb;
+ 		struct smb_sb_info	smbfs_sb;
+ 		struct hfs_sb_info	hfs_sb;
+ 		struct adfs_sb_info	adfs_sb;
+ 		struct qnx4_sb_info	qnx4_sb;
+ 		struct reiserfs_sb_info	reiserfs_sb;
+ 		struct bfs_sb_info	bfs_sb;
+ 		struct udf_sb_info	udf_sb;
+ 		struct ncp_sb_info	ncpfs_sb;
+ 		struct usbdev_sb_info   usbdevfs_sb;
+ 		struct jffs2_sb_info	jffs2_sb;
+ 		struct cramfs_sb_info	cramfs_sb;
+ 		void			*generic_sbp;
+ 	} u;
+ 	/*
+ 	 * The next field is for VFS *only*. No filesystems have any business
+ 	 * even looking at it. You have been warned.
+ 	 */
+ 	struct semaphore s_vfs_rename_sem;	/* Kludge */
+ 
+ 	/* The next field is used by knfsd when converting a (inode number based)
+ 	 * file handle into a dentry. As it builds a path in the dcache tree from
+ 	 * the bottom up, there may for a time be a subpath of dentries which is not
+ 	 * connected to the main tree.  This semaphore ensures that there is only ever
+ 	 * one such free path per filesystem.  Note that unconnected files (or other
+ 	 * non-directories) are allowed, but not unconnected directories.
+ 	 */
+ 	struct semaphore s_nfsd_free_path_sem;
+ };
+ 
+ /*
+  * VFS helper functions..
+  */
+ extern int vfs_create(struct inode *, struct dentry *, int);
+ extern int vfs_mkdir(struct inode *, struct dentry *, int);
+ extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
+ extern int vfs_symlink(struct inode *, struct dentry *, const char *);
+ extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
+ extern int vfs_rmdir(struct inode *, struct dentry *);
+ extern int vfs_unlink(struct inode *, struct dentry *);
+ extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
+ 
+ /*
+  * File types
+  */
+ #define DT_UNKNOWN	0
+ #define DT_FIFO		1
+ #define DT_CHR		2
+ #define DT_DIR		4
+ #define DT_BLK		6
+ #define DT_REG		8
+ #define DT_LNK		10
+ #define DT_SOCK		12
+ #define DT_WHT		14
+ 
+ /*
+  * This is the "filldir" function type, used by readdir() to let
+  * the kernel specify what kind of dirent layout it wants to have.
+  * This allows the kernel to read directories into kernel space or
+  * to have different dirent layouts depending on the binary type.
+  */
+ typedef int (*filldir_t)(void *, const char *, int, loff_t, ino_t, unsigned);
+ 
+ struct block_device_operations {
+ 	int (*open) (struct inode *, struct file *);
+ 	int (*release) (struct inode *, struct file *);
+ 	int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+ 	int (*check_media_change) (kdev_t);
+ 	int (*revalidate) (kdev_t);
+ };
+ 
+ /*
+  * NOTE:
+  * read, write, poll, fsync, readv, writev can be called
+  *   without the big kernel lock held in all filesystems.
+  */
+ struct file_operations {
+ 	struct module *owner;
+ 	loff_t (*llseek) (struct file *, loff_t, int);
+ 	ssize_t (*read) (struct file *, char *, size_t, loff_t *);
+ 	ssize_t (*write) (struct file *, const char *, size_t, loff_t *);
+ 	int (*readdir) (struct file *, void *, filldir_t);
+ 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+ 	int (*ioctl) (struct inode *, struct file *, unsigned int, unsigned long);
+ 	int (*mmap) (struct file *, struct vm_area_struct *);
+ 	int (*open) (struct inode *, struct file *);
+ 	int (*flush) (struct file *);
+ 	int (*release) (struct inode *, struct file *);
+ 	int (*fsync) (struct file *, struct dentry *, int datasync);
+ 	int (*fasync) (int, struct file *, int);
+ 	int (*lock) (struct file *, int, struct file_lock *);
+ 	ssize_t (*readv) (struct file *, const struct iovec *, unsigned long, loff_t *);
+ 	ssize_t (*writev) (struct file *, const struct iovec *, unsigned long, loff_t *);
+ 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
+ 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+ };
+ 
+ struct inode_operations {
+ 	int (*create) (struct inode *,struct dentry *,int);
+ 	struct dentry * (*lookup) (struct inode *,struct dentry *);
+ 	int (*link) (struct dentry *,struct inode *,struct dentry *);
+ 	int (*unlink) (struct inode *,struct dentry *);
+ 	int (*symlink) (struct inode *,struct dentry *,const char *);
+ 	int (*mkdir) (struct inode *,struct dentry *,int);
+ 	int (*rmdir) (struct inode *,struct dentry *);
+ 	int (*mknod) (struct inode *,struct dentry *,int,int);
+ 	int (*rename) (struct inode *, struct dentry *,
+ 			struct inode *, struct dentry *);
+ 	int (*readlink) (struct dentry *, char *,int);
+ 	int (*follow_link) (struct dentry *, struct nameidata *);
+ 	void (*truncate) (struct inode *);
+ 	int (*permission) (struct inode *, int);
+ 	int (*revalidate) (struct dentry *);
+ 	int (*setattr) (struct dentry *, struct iattr *);
+ 	int (*getattr) (struct dentry *, struct iattr *);
+ };
+ 
+ /*
+  * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called
+  * without the big kernel lock held in all filesystems.
+  */
+ struct super_operations {
+ 	void (*read_inode) (struct inode *);
+   
+   	/* reiserfs kludge.  reiserfs needs 64 bits of information to
+     	** find an inode.  We are using the read_inode2 call to get
+    	** that information.  We don't like this, and are waiting on some
+    	** VFS changes for the real solution.
+    	** iget4 calls read_inode2, iff it is defined
+    	*/
+     	void (*read_inode2) (struct inode *, void *) ;
+    	void (*dirty_inode) (struct inode *);
+ 	void (*write_inode) (struct inode *, int);
+ 	void (*put_inode) (struct inode *);
+ 	void (*delete_inode) (struct inode *);
+ 	void (*put_super) (struct super_block *);
+ 	void (*write_super) (struct super_block *);
+ 	void (*write_super_lockfs) (struct super_block *);
+ 	void (*unlockfs) (struct super_block *);
+ 	int (*statfs) (struct super_block *, struct statfs *);
+ 	int (*remount_fs) (struct super_block *, int *, char *);
+ 	void (*clear_inode) (struct inode *);
+ 	void (*umount_begin) (struct super_block *);
+ 
+ 	/* Following are for knfsd to interact with "interesting" filesystems
+ 	 * Currently just reiserfs, but possibly FAT and others later
+ 	 *
+ 	 * fh_to_dentry is given a filehandle fragment with length, and a type flag
+ 	 *   and must return a dentry for the referenced object or, if "parent" is
+ 	 *   set, a dentry for the parent of the object.
+ 	 *   If a dentry cannot be found, a "root" dentry should be created and
+ 	 *   flagged as DCACHE_NFSD_DISCONNECTED. nfsd_iget is an example implementation.
+ 	 *
+ 	 * dentry_to_fh is given a dentry and must generate the filesys specific
+ 	 *   part of the file handle.  Available length is passed in *lenp and used
+ 	 *   length should be returned therein.
+ 	 *   If need_parent is set, then dentry_to_fh should encode sufficient information
+ 	 *   to find the (current) parent.
+ 	 *   dentry_to_fh should return a 1-byte "type" which will be passed back in
+ 	 *   the fhtype argument to fh_to_dentry.  Type of 0 is reserved.
+ 	 *   If filesystem was exportable before the introduction of fh_to_dentry,
+ 	 *   types 1 and 2 should be used in the same way as the generic code.
+ 	 *   Type 255 means error.
+ 	 *
+ 	 * Lengths are in units of 4bytes, not bytes.
+ 	 */
+ 	struct dentry * (*fh_to_dentry)(struct super_block *sb, __u32 *fh, int len, int fhtype, int parent);
+ 	int (*dentry_to_fh)(struct dentry *, __u32 *fh, int *lenp, int need_parent);
+ };
+ 
+ /* Inode state bits; each value is a distinct bit. */
+ #define I_DIRTY_SYNC		1 /* Not dirty enough for O_DATASYNC */
+ #define I_DIRTY_DATASYNC	2 /* Data-related inode changes pending */
+ #define I_DIRTY_PAGES		4 /* Presumably: inode has dirty data pages -- TODO confirm */
+ #define I_LOCK			8
+ #define I_FREEING		16
+ #define I_CLEAR			32
+ 
+ #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+ 
+ extern void __mark_inode_dirty(struct inode *, int);	/* second argument: I_DIRTY_* bits */
+ static inline void mark_inode_dirty(struct inode *inode)
+ {
+ 	__mark_inode_dirty(inode, I_DIRTY);	/* I_DIRTY = SYNC | DATASYNC | PAGES */
+ }
+ 
+ static inline void mark_inode_dirty_sync(struct inode *inode)
+ {
+ 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
+ }
+ 
+ static inline void mark_inode_dirty_pages(struct inode *inode)
+ {
+ 	__mark_inode_dirty(inode, I_DIRTY_PAGES);
+ }
+ 
+ struct dquot_operations {
+ 	void (*initialize) (struct inode *, short);
+ 	void (*drop) (struct inode *);
+ 	int (*alloc_block) (struct inode *, unsigned long, char);
+ 	int (*alloc_inode) (const struct inode *, unsigned long);
+ 	void (*free_block) (struct inode *, unsigned long);
+ 	void (*free_inode) (const struct inode *, unsigned long);
+ 	int (*transfer) (struct inode *, struct iattr *);
+ };
+ 
+ struct file_system_type {
+ 	const char *name;
+ 	int fs_flags;
+ 	struct super_block *(*read_super) (struct super_block *, void *, int);
+ 	struct module *owner;
+ 	struct file_system_type * next;
+ 	struct list_head fs_supers;
+ };
+ 
+ #define DECLARE_FSTYPE(var,type,read,flags) \
+ struct file_system_type var = { \
+ 	name:		type, \
+ 	read_super:	read, \
+ 	fs_flags:	flags, \
+ 	owner:		THIS_MODULE, \
+ }
+ 
+ #define DECLARE_FSTYPE_DEV(var,type,read) \
+ 	DECLARE_FSTYPE(var,type,read,FS_REQUIRES_DEV)
+ 
+ /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
+ #define fops_get(fops) \
+ 	(((fops) && (fops)->owner)	\
+ 		? ( try_inc_mod_count((fops)->owner) ? (fops) : NULL ) \
+ 		: (fops))
+ 
+ #define fops_put(fops) \
+ do {	\
+ 	if ((fops) && (fops)->owner) \
+ 		__MOD_DEC_USE_COUNT((fops)->owner);	\
+ } while(0)
+ 
+ extern int register_filesystem(struct file_system_type *);
+ extern int unregister_filesystem(struct file_system_type *);
+ extern struct vfsmount *kern_mount(struct file_system_type *);
+ extern int may_umount(struct vfsmount *);
+ extern long do_mount(char *, char *, char *, unsigned long, void *);
+ 
+ #define kern_umount mntput
+ 
+ extern int vfs_statfs(struct super_block *, struct statfs *);
+ 
+ /* Return value for VFS lock functions - tells locks.c to lock conventionally
+  * REALLY kosha for root NFS and nfs_lock
+  */ 
+ #define LOCK_USE_CLNT 1
+ 
+ #define FLOCK_VERIFY_READ  1
+ #define FLOCK_VERIFY_WRITE 2
+ 
+ extern int locks_mandatory_locked(struct inode *);
+ extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size_t);
+ 
+ /*
+  * Candidates for mandatory locking have the setgid bit set
+  * but no group execute bit -  an otherwise meaningless combination.
+  */
+ #define MANDATORY_LOCK(inode) \
+ 	(IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+ 
+ static inline int locks_verify_locked(struct inode *inode)
+ {
+ 	if (!MANDATORY_LOCK(inode))
+ 		return 0;	/* not a mandatory-locking candidate: nothing to verify */
+ 	return locks_mandatory_locked(inode);
+ }
+ 
+ static inline int locks_verify_area(int read_write, struct inode *inode,
+ 				    struct file *filp, loff_t offset,
+ 				    size_t count)
+ {
+ 	if (!inode->i_flock || !MANDATORY_LOCK(inode))
+ 		return 0;	/* no locks on the inode, or not a mandatory-lock file */
+ 	return locks_mandatory_area(read_write, inode, filp, offset, count);
+ }
+ 
+ static inline int locks_verify_truncate(struct inode *inode,
+ 				    struct file *filp,
+ 				    loff_t size)
+ {
+ 	loff_t lo, hi;	/* smaller and larger of (new size, current i_size) */
+ 	/* Check the byte range lying between the old and the new size. */
+ 	if (!inode->i_flock || !MANDATORY_LOCK(inode))
+ 		return 0;
+ 	lo = size < inode->i_size ? size : inode->i_size;
+ 	hi = size < inode->i_size ? inode->i_size : size;
+ 	return locks_mandatory_area(FLOCK_VERIFY_WRITE, inode, filp,
+ 				    lo, hi - lo);
+ }
+ 
+ static inline int get_lease(struct inode *inode, unsigned int mode)
+ {
+ 	if (!inode->i_flock || !(inode->i_flock->fl_flags & FL_LEASE))
+ 		return 0;	/* no lock, or the head of i_flock is not a lease */
+ 	return __get_lease(inode, mode);
+ }
+ 
+ /* fs/open.c */
+ 
+ asmlinkage long sys_open(const char *, int, int);
+ asmlinkage long sys_close(unsigned int);	/* yes, it's really unsigned */
+ extern int do_truncate(struct dentry *, loff_t start);
+ 
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char *);
+ 
+ /* fs/dcache.c */
+ extern void vfs_caches_init(unsigned long);
+ 
+ #define __getname()	kmem_cache_alloc(names_cachep, SLAB_KERNEL)
+ #define putname(name)	kmem_cache_free(names_cachep, (void *)(name))
+ 
+ enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW};
+ extern int register_blkdev(unsigned int, const char *, struct block_device_operations *);
+ extern int unregister_blkdev(unsigned int, const char *);
+ extern struct block_device *bdget(dev_t);
+ extern int bd_acquire(struct inode *inode);
+ extern void bd_forget(struct inode *inode);
+ extern void bdput(struct block_device *);
+ extern struct char_device *cdget(dev_t);
+ extern void cdput(struct char_device *);
+ extern int blkdev_open(struct inode *, struct file *);
+ extern int blkdev_close(struct inode *, struct file *);
+ extern struct file_operations def_blk_fops;
+ extern struct address_space_operations def_blk_aops;
+ extern struct file_operations def_fifo_fops;
+ extern int ioctl_by_bdev(struct block_device *, unsigned, unsigned long);
+ extern int blkdev_get(struct block_device *, mode_t, unsigned, int);
+ extern int blkdev_put(struct block_device *, int);
+ 
+ /* fs/devices.c */
+ extern const struct block_device_operations *get_blkfops(unsigned int);
+ extern int register_chrdev(unsigned int, const char *, struct file_operations *);
+ extern int unregister_chrdev(unsigned int, const char *);
+ extern int chrdev_open(struct inode *, struct file *);
+ extern const char * bdevname(kdev_t);
+ extern const char * cdevname(kdev_t);
+ extern const char * kdevname(kdev_t);
+ extern void init_special_inode(struct inode *, umode_t, int);
+ 
+ /* Invalid inode operations -- fs/bad_inode.c */
+ extern void make_bad_inode(struct inode *);
+ extern int is_bad_inode(struct inode *);
+ 
+ extern struct file_operations read_fifo_fops;
+ extern struct file_operations write_fifo_fops;
+ extern struct file_operations rdwr_fifo_fops;
+ extern struct file_operations read_pipe_fops;
+ extern struct file_operations write_pipe_fops;
+ extern struct file_operations rdwr_pipe_fops;
+ 
+ extern int fs_may_remount_ro(struct super_block *);
+ 
+ extern int try_to_free_buffers(struct page *, unsigned int);
+ extern void refile_buffer(struct buffer_head * buf);
+ extern void end_buffer_io_sync(struct buffer_head *bh, int uptodate);
+ 
+ /* reiserfs_writepage needs this */
+ extern void set_buffer_async_io(struct buffer_head *bh) ;
+ 
+ #define BUF_CLEAN	0
+ #define BUF_LOCKED	1	/* Buffers scheduled for write */
+ #define BUF_DIRTY	2	/* Dirty buffers, not yet scheduled for write */
+ #define NR_LIST		3
+ 
+ static inline void get_bh(struct buffer_head * bh)
+ {
+         atomic_inc(&(bh)->b_count);
+ }
+ 
+ static inline void put_bh(struct buffer_head *bh)
+ {
+         smp_mb__before_atomic_dec();
+         atomic_dec(&bh->b_count);
+ }
+ 
+ /*
+  * This is called by bh->b_end_io() handlers when I/O has completed.
+  */
+ static inline void mark_buffer_uptodate(struct buffer_head * bh, int on)
+ {
+ 	if (!on)	/* zero clears BH_Uptodate, any other value sets it */
+ 		clear_bit(BH_Uptodate, &bh->b_state);
+ 	else
+ 		set_bit(BH_Uptodate, &bh->b_state);
+ }
+ 
+ #define atomic_set_buffer_clean(bh) test_and_clear_bit(BH_Dirty, &(bh)->b_state)
+ 
+ static inline void __mark_buffer_clean(struct buffer_head *bh)
+ {
+ 	refile_buffer(bh);
+ }
+ 
+ static inline void mark_buffer_clean(struct buffer_head * bh)
+ {
+ 	if (atomic_set_buffer_clean(bh))	/* test-and-clear of BH_Dirty */
+ 		__mark_buffer_clean(bh);	/* refile only if the bit had been set */
+ }
+ 
+ extern void FASTCALL(__mark_dirty(struct buffer_head *bh));
+ extern void FASTCALL(__mark_buffer_dirty(struct buffer_head *bh));
+ extern void FASTCALL(mark_buffer_dirty(struct buffer_head *bh));
+ extern void FASTCALL(buffer_insert_inode_data_queue(struct buffer_head *, struct inode *));
+ 
+ #define atomic_set_buffer_dirty(bh) test_and_set_bit(BH_Dirty, &(bh)->b_state)
+ 
+ static inline void mark_buffer_async(struct buffer_head * bh, int on)
+ {
+ 	if (!on)	/* zero clears BH_Async, any other value sets it */
+ 		clear_bit(BH_Async, &bh->b_state);
+ 	else
+ 		set_bit(BH_Async, &bh->b_state);
+ }
+ 
+ /*
+  * If an error happens during the make_request, this function
+  * has to be recalled. It marks the buffer as clean and not
+  * uptodate, and it notifies the upper layer about the end
+  * of the I/O.
+  */
+ static inline void buffer_IO_error(struct buffer_head * bh)
+ {
+ 	mark_buffer_clean(bh);
+ 	/*
+ 	 * b_end_io has to clear the BH_Uptodate bitflag in the error case!
+ 	 */
+ 	bh->b_end_io(bh, 0);
+ }
+ 
+ extern void buffer_insert_inode_queue(struct buffer_head *, struct inode *);
+ static inline void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
+ {
+ 	mark_buffer_dirty(bh);
+ 	buffer_insert_inode_queue(bh, inode);
+ }
+ 
+ extern void balance_dirty(void);
+ extern int check_disk_change(kdev_t);
+ extern int invalidate_inodes(struct super_block *);
+ extern int invalidate_device(kdev_t, int);
+ extern void invalidate_inode_pages(struct inode *);
+ extern void invalidate_inode_pages2(struct address_space *);
+ extern void invalidate_inode_buffers(struct inode *);
+ #define invalidate_buffers(dev)	__invalidate_buffers((dev), 0)
+ #define destroy_buffers(dev)	__invalidate_buffers((dev), 1)
+ extern void invalidate_bdev(struct block_device *, int);
+ extern void __invalidate_buffers(kdev_t dev, int);
+ extern void sync_inodes(kdev_t);
+ extern void sync_unlocked_inodes(void);
+ extern void write_inode_now(struct inode *, int);
+ extern int sync_buffers(kdev_t, int);
+ extern void sync_dev(kdev_t);
+ extern int fsync_dev(kdev_t);
+ extern int fsync_super(struct super_block *);
+ extern int fsync_no_super(kdev_t);
+ extern void sync_inodes_sb(struct super_block *);
+ extern int osync_inode_buffers(struct inode *);
+ extern int osync_inode_data_buffers(struct inode *);
+ extern int fsync_inode_buffers(struct inode *);
+ extern int fsync_inode_data_buffers(struct inode *);
+ extern int inode_has_buffers(struct inode *);
+ extern void filemap_fdatasync(struct address_space *);
+ extern void filemap_fdatawait(struct address_space *);
+ extern void sync_supers(kdev_t);
+ extern int bmap(struct inode *, int);
+ extern int notify_change(struct dentry *, struct iattr *);
+ extern int permission(struct inode *, int);
+ extern int vfs_permission(struct inode *, int);
+ extern int get_write_access(struct inode *);
+ extern int deny_write_access(struct file *);
+ static inline void put_write_access(struct inode * inode)
+ {
+ 	atomic_dec(&inode->i_writecount);
+ }
+ static inline void allow_write_access(struct file *file)
+ {
+ 	if (file)
+ 		atomic_inc(&file->f_dentry->d_inode->i_writecount);
+ }
+ extern int do_pipe(int *);
+ 
+ extern int open_namei(const char *, int, int, struct nameidata *);
+ 
+ extern int kernel_read(struct file *, unsigned long, char *, unsigned long);
+ extern struct file * open_exec(const char *);
+  
+ /* fs/dcache.c -- generic fs support functions */
+ extern int is_subdir(struct dentry *, struct dentry *);
+ extern ino_t find_inode_number(struct dentry *, struct qstr *);
+ 
+ /*
+  * Kernel pointers have redundant information, so we can use a
+  * scheme where we can return either an error code or a dentry
+  * pointer with the same return value.
+  *
+  * This should be a per-architecture thing, to allow different
+  * error and pointer decisions.
+  */
+ static inline void *ERR_PTR(long error)
+ {
+ 	return (void *) error;
+ }
+ 
+ static inline long PTR_ERR(const void *ptr)
+ {
+ 	return (long) ptr;
+ }
+ 
+ static inline long IS_ERR(const void *ptr)
+ {
+ 	return (unsigned long)ptr > (unsigned long)-1000L;	/* true when (long)ptr is in -999..-1 */
+ }
+ 
+ /*
+  * The bitmask for a lookup event:
+  *  - follow links at the end
+  *  - require a directory
+  *  - ending slashes ok even for nonexistent files
+  *  - internal "there are more path components" flag
+  */
+ #define LOOKUP_FOLLOW		(1)
+ #define LOOKUP_DIRECTORY	(2)
+ #define LOOKUP_CONTINUE		(4)
+ #define LOOKUP_POSITIVE		(8)
+ #define LOOKUP_PARENT		(16)
+ #define LOOKUP_NOALT		(32)
+ /*
+  * Type of the last component on LOOKUP_PARENT
+  */
+ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
+ 
+ /*
+  * "descriptor" for what we're up to with a read for sendfile().
+  * This allows us to use the same read code yet
+  * have multiple different users of the data that
+  * we read from a file.
+  *
+  * The simplest case just copies the data to user
+  * mode.
+  */
+ typedef struct {
+ 	size_t written;
+ 	size_t count;
+ 	char * buf;
+ 	int error;
+ } read_descriptor_t;
+ 
+ typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long);
+ 
+ /* needed for stackable file system support */
+ extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
+ 
+ extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_walk(const char *, struct nameidata *));
+ extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
+ extern void path_release(struct nameidata *);
+ extern int follow_down(struct vfsmount **, struct dentry **);
+ extern int follow_up(struct vfsmount **, struct dentry **);
+ extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+ #define user_path_walk(name,nd)	 __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
+ #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
+ 
+ extern void iput(struct inode *);
+ extern void force_delete(struct inode *);
+ extern struct inode * igrab(struct inode *);
+ extern ino_t iunique(struct super_block *, ino_t);
+ 
+ typedef int (*find_inode_t)(struct inode *, unsigned long, void *);
+ extern struct inode * iget4(struct super_block *, unsigned long, find_inode_t, void *);
+ static inline struct inode *iget(struct super_block *sb, unsigned long ino)
+ {
+ 	return iget4(sb, ino, NULL, NULL);	/* no find_inode_t callback, no opaque argument */
+ }
+ 
+ extern void clear_inode(struct inode *);
+ extern struct inode * get_empty_inode(void);
+ 
+ static inline struct inode * new_inode(struct super_block *sb)
+ {
+ 	struct inode *fresh = get_empty_inode();	/* may come back NULL */
+ 	if (!fresh)
+ 		return NULL;
+ 	fresh->i_sb = sb;		/* tie the inode to its superblock ... */
+ 	fresh->i_dev = sb->s_dev;	/* ... and copy device and block-size bits */
+ 	fresh->i_blkbits = sb->s_blocksize_bits;
+ 	return fresh;
+ }
+ extern void remove_suid(struct inode *inode);
+ 
+ extern void insert_inode_hash(struct inode *);
+ extern void remove_inode_hash(struct inode *);
+ extern struct file * get_empty_filp(void);
+ extern void file_move(struct file *f, struct list_head *list);
+ extern struct buffer_head * get_hash_table(kdev_t, int, int);
+ extern struct buffer_head * getblk(kdev_t, int, int);
+ extern void ll_rw_block(int, int, struct buffer_head * bh[]);
+ extern void submit_bh(int, struct buffer_head *);
+ extern int is_read_only(kdev_t);
+ extern void __brelse(struct buffer_head *);
+ static inline void brelse(struct buffer_head *buf)
+ {
+ 	if (buf)	/* NULL is accepted and ignored */
+ 		__brelse(buf);
+ }
+ extern void __bforget(struct buffer_head *);
+ static inline void bforget(struct buffer_head *buf)
+ {
+ 	if (buf)	/* NULL is accepted and ignored */
+ 		__bforget(buf);
+ }
+ extern int set_blocksize(kdev_t, int);
+ extern struct buffer_head * bread(kdev_t, int, int);
+ extern void wakeup_bdflush(void);
+ 
+ extern int brw_page(int, struct page *, kdev_t, int [], int);
+ 
+ typedef int (get_block_t)(struct inode*,long,struct buffer_head*,int);
+ 
+ /* Generic buffer handling for block filesystems.. */
+ extern int discard_bh_page(struct page *, unsigned long, int);
+ #define block_flushpage(page, offset) discard_bh_page(page, offset, 1)
+ #define block_invalidate_page(page) discard_bh_page(page, 0, 0)
+ extern int block_symlink(struct inode *, const char *, int);
+ extern int block_write_full_page(struct page*, get_block_t*);
+ extern int block_read_full_page(struct page*, get_block_t*);
+ extern int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
+ extern int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
+ 				unsigned long *);
+ extern int block_commit_write(struct page *page, unsigned from, unsigned to);
+ extern int block_sync_page(struct page *);
+ 
+ int generic_block_bmap(struct address_space *, long, get_block_t *);
+ int generic_commit_write(struct file *, struct page *, unsigned, unsigned);
+ int block_truncate_page(struct address_space *, loff_t, get_block_t *);
+ extern void create_empty_buffers(struct page *, kdev_t, unsigned long);
+ 
+ extern int waitfor_one_page(struct page*);
+ extern int generic_file_mmap(struct file *, struct vm_area_struct *);
+ extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
+ extern ssize_t generic_file_read(struct file *, char *, size_t, loff_t *);
+ extern ssize_t generic_file_write(struct file *, const char *, size_t, loff_t *);
+ extern void do_generic_file_read(struct file *, loff_t *, read_descriptor_t *, read_actor_t);
+ extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
+ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
+ extern ssize_t generic_read_dir(struct file *, char *, size_t, loff_t *);
+ extern int generic_file_open(struct inode * inode, struct file * filp);
+ 
+ extern struct file_operations generic_ro_fops;
+ 
+ extern int vfs_readlink(struct dentry *, char *, int, const char *);
+ extern int vfs_follow_link(struct nameidata *, const char *);
+ extern int page_readlink(struct dentry *, char *, int);
+ extern int page_follow_link(struct dentry *, struct nameidata *);
+ extern struct inode_operations page_symlink_inode_operations;
+ 
+ extern int vfs_readdir(struct file *, filldir_t, void *);
+ extern int dcache_readdir(struct file *, void *, filldir_t);
+ 
+ extern struct file_system_type *get_fs_type(const char *name);
+ extern struct super_block *get_super(kdev_t);
+ extern void drop_super(struct super_block *sb);
+ static inline int is_mounted(kdev_t dev)
+ {
+ 	struct super_block *found = get_super(dev);
+ 	if (!found)
+ 		return 0;	/* get_super() returned no superblock for dev */
+ 	/* We only wanted to know it exists: hand it straight back. */
+ 	drop_super(found);
+ 	return 1;
+ }
+ unsigned long generate_cluster(kdev_t, int b[], int);
+ unsigned long generate_cluster_swab32(kdev_t, int b[], int);
+ extern kdev_t ROOT_DEV;
+ extern char root_device_name[];
+ 
+ 
+ extern void show_buffers(void);
+ extern void mount_root(void);
+ 
+ #ifdef CONFIG_BLK_DEV_INITRD
+ extern kdev_t real_root_dev;
+ extern int change_root(kdev_t, const char *);
+ #endif
+ 
+ extern ssize_t char_read(struct file *, char *, size_t, loff_t *);
+ extern ssize_t block_read(struct file *, char *, size_t, loff_t *);
+ extern int read_ahead[];
+ 
+ extern ssize_t char_write(struct file *, const char *, size_t, loff_t *);
+ extern ssize_t block_write(struct file *, const char *, size_t, loff_t *);
+ 
+ extern int file_fsync(struct file *, struct dentry *, int);
+ extern int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx);
+ extern int generic_osync_inode(struct inode *, int);
+ #define OSYNC_METADATA (1<<0)
+ #define OSYNC_DATA (1<<1)
+ #define OSYNC_INODE (1<<2)
+ 
+ extern int inode_change_ok(struct inode *, struct iattr *);
+ extern int inode_setattr(struct inode *, struct iattr *);
+ 
+ /*
+  * Common dentry functions for inclusion in the VFS
+  * or in other stackable file systems.  Some of these
+  * functions were in linux/fs/ C (VFS) files.
+  *
+  */
+ 
+ /*
+  * Locking the parent is needed to:
+  *  - serialize directory operations
+  *  - make sure the parent doesn't change from
+  *    under us in the middle of an operation.
+  *
+  * NOTE! Right now we'd rather use a "struct inode"
+  * for this, but as I expect things to move toward
+  * using dentries instead for most things it is
+  * probably better to start with the conceptually
+  * better interface of relying on a path of dentries.
+  */
+ static inline struct dentry *lock_parent(struct dentry *dentry)
+ {
+ 	struct dentry *parent = dget(dentry->d_parent);
+ 	/* dget() the parent first, then take its inode's i_sem */
+ 	down(&parent->d_inode->i_sem);
+ 	return parent;
+ }
+ 
+ static inline struct dentry *get_parent(struct dentry *dentry)
+ {
+ 	return dget(dentry->d_parent);	/* dget() only; i_sem is not taken here */
+ }
+ 
+ static inline void unlock_dir(struct dentry *dir)
+ {
+ 	up(&dir->d_inode->i_sem);	/* release the inode's i_sem ... */
+ 	dput(dir);			/* ... then dput() the dentry */
+ }
+ 
+ /*
+  * Whee.. Deadlock country. Happily there are only two VFS
+  * operations that do this..
+  */
+ static inline void double_down(struct semaphore *s1, struct semaphore *s2)
+ {
+ 	struct semaphore *hi = s1, *lo = s2;	/* higher / lower address */
+ 	if ((unsigned long) s1 < (unsigned long) s2) {
+ 		hi = s2;
+ 		lo = s1;
+ 	}
+ 	if (hi != lo)		/* same semaphore twice: take it only once */
+ 		down(hi);
+ 	down(lo);
+ }
+ 
+ /*
+  * Ewwwwwwww... _triple_ lock. We are guaranteed that the 3rd argument is
+  * not equal to 1st and not equal to 2nd - the first case (target is parent of
+  * source) would be already caught, the second is plain impossible (target is
+  * its own parent and that case would be caught even earlier). Very messy.
+  * I _think_ that it works, but no warranties - please, look it through.
+  * Pox on bloody lusers who mandated overwriting rename() for directories...
+  */
+ 
+ static inline void triple_down(struct semaphore *s1,
+ 			       struct semaphore *s2,
+ 			       struct semaphore *s3)
+ {
+ 	if (s1 != s2) {	/* s1 and s2 differ: order all three by address */
+ 		if ((unsigned long) s1 < (unsigned long) s2) {
+ 			if ((unsigned long) s1 < (unsigned long) s3) {
+ 				struct semaphore *tmp = s3;
+ 				s3 = s1; s1 = tmp;
+ 			}
+ 			if ((unsigned long) s1 < (unsigned long) s2) {
+ 				struct semaphore *tmp = s2;
+ 				s2 = s1; s1 = tmp;
+ 			}
+ 		} else {
+ 			if ((unsigned long) s1 < (unsigned long) s3) {
+ 				struct semaphore *tmp = s3;
+ 				s3 = s1; s1 = tmp;
+ 			}
+ 			if ((unsigned long) s2 < (unsigned long) s3) {
+ 				struct semaphore *tmp = s3;
+ 				s3 = s2; s2 = tmp;
+ 			}
+ 		}
+ 		down(s1);	/* here s1 > s2 > s3 by address (s3 differs from both, see above) */
+ 	} else if ((unsigned long) s2 < (unsigned long) s3) {	/* s1 == s2: order s2 and s3 only */
+ 		struct semaphore *tmp = s3;
+ 		s3 = s2; s2 = tmp;
+ 	}
+ 	down(s2);
+ 	down(s3);	/* s2 before s3: still descending address order */
+ }
+ 
+ static inline void double_up(struct semaphore *s1, struct semaphore *s2)
+ {
+ 	up(s1);
+ 	if (s1 != s2)	/* same semaphore passed twice: release it once */
+ 		up(s2);
+ }
+ 
+ static inline void triple_up(struct semaphore *s1,
+ 			     struct semaphore *s2,
+ 			     struct semaphore *s3)
+ {
+ 	up(s1);
+ 	if (s1 != s2)	/* s1 and s2 may coincide; release only once */
+ 		up(s2);
+ 	up(s3);		/* released unconditionally, never compared with s1/s2 */
+ }
+ 
+ static inline void double_lock(struct dentry *d1, struct dentry *d2)
+ {
+ 	double_down(&d1->d_inode->i_sem, &d2->d_inode->i_sem);	/* i_sem of both inodes */
+ }
+ 
+ static inline void double_unlock(struct dentry *d1, struct dentry *d2)
+ {
+ 	double_up(&d1->d_inode->i_sem,&d2->d_inode->i_sem);
+ 	dput(d1);	/* besides releasing i_sem, both dentries are dput() */
+ 	dput(d2);
+ }
+ 
+ #endif /* __KERNEL__ */
+ 
+ #endif /* _LINUX_FS_H */
diff -rc2P linux/include/linux/jbd.h linux-2.4.13/include/linux/jbd.h
*** linux/include/linux/jbd.h	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/jbd.h	Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,878 ----
+ /*
+  * linux/include/linux/jbd.h
+  * 
+  * Written by Stephen C. Tweedie <sct@redhat.com>
+  *
+  * Copyright 1998-2000 Red Hat, Inc --- All Rights Reserved
+  *
+  * This file is part of the Linux kernel and is made available under
+  * the terms of the GNU General Public License, version 2, or at your
+  * option, any later version, incorporated herein by reference.
+  *
+  * Definitions for transaction data structures for the buffer cache
+  * filesystem journaling support.
+  */
+ 
+ #ifndef _LINUX_JBD_H
+ #define _LINUX_JBD_H
+ 
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE) || !defined(__KERNEL__)
+ 
+ /* Allow this file to be included directly into e2fsprogs */
+ #ifndef __KERNEL__
+ #include "jfs_compat.h"
+ #define JFS_DEBUG
+ #define jfs_debug jbd_debug
+ #else
+ 
+ #include <linux/journal-head.h>
+ #include <linux/stddef.h>
+ #include <asm/semaphore.h>
+ #endif
+ 
+ extern int journal_oom_retry;
+ 
+ #ifdef CONFIG_JBD_DEBUG
+ /*
+  * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
+  * consistency checks.  By default we don't do this unless
+  * CONFIG_JBD_DEBUG is on.
+  */
+ #define JBD_EXPENSIVE_CHECKING
+ 
+ extern int journal_enable_debug;
+ extern int journal_no_write[2];
+ /* jbd_debug(n, fmt, args...): if n <= journal_enable_debug, printk location + fmt/args. */
+ #define jbd_debug(n, f, a...)						\
+ 	do {								\
+ 		if ((n) <= journal_enable_debug) {			\
+ 			printk (KERN_DEBUG "(%s, %d): %s: ",		\
+ 				__FILE__, __LINE__, __FUNCTION__);	\
+ 		  	printk (f, ## a);				\
+ 		}							\
+ 	} while (0)
+ #else
+ #define jbd_debug(n, f, a...)	do { } while (0)	/* same arity as the debug variant */
+ #endif
+ 
+ extern void * __jbd_kmalloc (char *where, size_t size, int flags, int retry);
+ #define jbd_kmalloc(size, flags) \
+ 	__jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry)
+ #define jbd_rep_kmalloc(size, flags) \
+ 	__jbd_kmalloc(__FUNCTION__, (size), (flags), 1)
+ 
+ #define JFS_MIN_JOURNAL_BLOCKS 1024
+ 
+ #ifdef __KERNEL__
+ typedef struct handle_s		handle_t;	/* Atomic operation type */
+ typedef struct journal_s	journal_t;	/* Journal control structure */
+ #endif
+ 
+ /*
+  * Internal structures used by the logging mechanism:
+  */
+ 
+ #define JFS_MAGIC_NUMBER 0xc03b3998U /* The first 4 bytes of /dev/random! */
+ 
+ /*
+  * On-disk structures
+  */
+ 
+ /* 
+  * Descriptor block types:
+  */
+ 
+ #define JFS_DESCRIPTOR_BLOCK	1
+ #define JFS_COMMIT_BLOCK	2
+ #define JFS_SUPERBLOCK_V1	3
+ #define JFS_SUPERBLOCK_V2	4
+ #define JFS_REVOKE_BLOCK	5
+ 
+ /*
+  * Standard header for all descriptor blocks:
+  */
+ typedef struct journal_header_s
+ {
+ 	__u32		h_magic;
+ 	__u32		h_blocktype;
+ 	__u32		h_sequence;
+ } journal_header_t;
+ 
+ 
+ /* 
+  * The block tag: used to describe a single buffer in the journal 
+  */
+ typedef struct journal_block_tag_s
+ {
+ 	__u32		t_blocknr;	/* The on-disk block number */
+ 	__u32		t_flags;	/* See below */
+ } journal_block_tag_t;
+ 
+ /* 
+  * The revoke descriptor: used on disk to describe a series of blocks to
+  * be revoked from the log 
+  */
+ typedef struct journal_revoke_header_s
+ {
+ 	journal_header_t r_header;	/* standard descriptor-block header */
+ 	__s32		 r_count;	/* Count of bytes used in the block */
+ } journal_revoke_header_t;
+ 
+ 
+ /* Definitions for the journal tag flags word: */
+ #define JFS_FLAG_ESCAPE		1	/* on-disk block is escaped */
+ #define JFS_FLAG_SAME_UUID	2	/* block has same uuid as previous */
+ #define JFS_FLAG_DELETED	4	/* block deleted by this transaction */
+ #define JFS_FLAG_LAST_TAG	8	/* last tag in this descriptor block */
+ 
+ 
+ /*
+  * The journal superblock.  All fields are in big-endian byte order.
+  */
+ typedef struct journal_superblock_s
+ {
+ /* 0x0000 */
+ 	journal_header_t s_header;
+ 
+ /* 0x000C */
+ 	/* Static information describing the journal */
+ 	__u32	s_blocksize;		/* journal device blocksize */
+ 	__u32	s_maxlen;		/* total blocks in journal file */
+ 	__u32	s_first;		/* first block of log information */
+ 	
+ /* 0x0018 */
+ 	/* Dynamic information describing the current state of the log */
+ 	__u32	s_sequence;		/* first commit ID expected in log */
+ 	__u32	s_start;		/* blocknr of start of log */
+ 
+ /* 0x0020 */
+ 	/* Error value, as set by journal_abort(). */
+ 	__s32	s_errno;
+ 
+ /* 0x0024 */
+ 	/* Remaining fields are only valid in a version-2 superblock */
+ 	__u32	s_feature_compat; 	/* compatible feature set */
+ 	__u32	s_feature_incompat; 	/* incompatible feature set */
+ 	__u32	s_feature_ro_compat; 	/* readonly-compatible feature set */
+ /* 0x0030 */
+ 	__u8	s_uuid[16];		/* 128-bit uuid for journal */
+ 
+ /* 0x0040 */
+ 	__u32	s_nr_users;		/* Nr of filesystems sharing log */
+ 	
+ 	__u32	s_dynsuper;		/* Blocknr of dynamic superblock copy*/
+ 	
+ /* 0x0048 */
+ 	__u32	s_max_transaction;	/* Limit of journal blocks per trans.*/
+ 	__u32	s_max_trans_data;	/* Limit of data blocks per trans. */
+ 
+ /* 0x0050 */
+ 	__u32	s_padding[44];
+ 
+ /* 0x0100 */
+ 	__u8	s_users[16*48];		/* ids of all fs'es sharing the log */
+ /* 0x0400 */
+ } journal_superblock_t;
+ 
+ #define JFS_HAS_COMPAT_FEATURE(j,mask)					\
+ 	((j)->j_format_version >= 2 &&					\
+ 	 ((j)->j_superblock->s_feature_compat & cpu_to_be32((mask))))
+ #define JFS_HAS_RO_COMPAT_FEATURE(j,mask)				\
+ 	((j)->j_format_version >= 2 &&					\
+ 	 ((j)->j_superblock->s_feature_ro_compat & cpu_to_be32((mask))))
+ #define JFS_HAS_INCOMPAT_FEATURE(j,mask)				\
+ 	((j)->j_format_version >= 2 &&					\
+ 	 ((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
+ 
+ #define JFS_FEATURE_INCOMPAT_REVOKE	0x00000001
+ 
+ /* Features known to this kernel version: */
+ #define JFS_KNOWN_COMPAT_FEATURES	0
+ #define JFS_KNOWN_ROCOMPAT_FEATURES	0
+ #define JFS_KNOWN_INCOMPAT_FEATURES	JFS_FEATURE_INCOMPAT_REVOKE
+ 
+ #ifdef __KERNEL__
+ 
+ #include <linux/fs.h>
+ #include <linux/sched.h>
+ 
+ #define JBD_ASSERTIONS		/* assertions are always compiled in */
+ #ifdef JBD_ASSERTIONS		/* J_ASSERT: printk the failed expression, then BUG() */
+ #define J_ASSERT(assert)						\
+ do {									\
+ 	if (!(assert)) {						\
+ 		printk (KERN_EMERG					\
+ 			"Assertion failure in %s() at %s:%d: \"%s\"\n",	\
+ 			__FUNCTION__, __FILE__, __LINE__, # assert);	\
+ 		BUG();							\
+ 	}								\
+ } while (0)
+ #else
+ #define J_ASSERT(assert)	do { } while (0)
+ #endif		/* JBD_ASSERTIONS */
+ 
+ /* The _BH/_JH variants are defined in every configuration so all users build. */
+ #if defined(JBD_ASSERTIONS) && defined(CONFIG_BUFFER_DEBUG)
+ void buffer_assertion_failure(struct buffer_head *bh);
+ #define J_ASSERT_BH(bh, expr)						\
+ 	do {								\
+ 		if (!(expr))						\
+ 			buffer_assertion_failure(bh);			\
+ 		J_ASSERT(expr);						\
+ 	} while (0)
+ #define J_ASSERT_JH(jh, expr)	J_ASSERT_BH(jh2bh(jh), expr)
+ #else
+ #define J_ASSERT_BH(bh, expr)	J_ASSERT(expr)
+ #define J_ASSERT_JH(jh, expr)	J_ASSERT(expr)
+ #endif
+ 
+ enum jbd_state_bits {
+ 	BH_JWrite
+ 	  = BH_PrivateStart,	/* 1 if being written to log (@@@ DEBUGGING) */
+ 	BH_Freed,		/* 1 if buffer has been freed (truncated) */
+ 	BH_Revoked,		/* 1 if buffer has been revoked from the log */
+ 	BH_RevokeValid,		/* 1 if buffer revoked flag is valid */
+ 	BH_JBDDirty,		/* 1 if buffer is dirty but journaled */
+ };
+ 
+ /* Return true if the buffer is one which JBD is managing */
+ static inline int buffer_jbd(struct buffer_head *bh)
+ {
+ 	return __buffer_state(bh, JBD);	/* NOTE(review): __buffer_state() is defined elsewhere; presumably tests a JBD state bit - confirm */
+ }
+ 
+ static inline struct buffer_head *jh2bh(struct journal_head *jh)
+ {
+ 	return jh->b_bh;	/* the buffer_head recorded in the journal_head */
+ }
+ 
+ static inline struct journal_head *bh2jh(struct buffer_head *bh)
+ {
+ 	return bh->b_private;	/* b_private is returned as the journal_head; no check that one is attached */
+ }
+ 
+ struct jbd_revoke_table_s;
+ 
+ /* The handle_t type represents a single atomic update being performed
+  * by some process.  All filesystem modifications made by the process go
+  * through this handle.  Recursive operations (such as quota operations)
+  * are gathered into a single update.
+  *
+  * The buffer credits field is used to account for journaled buffers
+  * being modified by the running process.  To ensure that there is
+  * enough log space for all outstanding operations, we need to limit the
+  * number of outstanding buffers possible at any time.  When the
+  * operation completes, any buffer credits not used are credited back to
+  * the transaction, so that at all times we know how many buffers the
+  * outstanding updates on a transaction might possibly touch. */
+ 
+ struct handle_s 
+ {
+ 	/* Which compound transaction is this update a part of? */
+ 	transaction_t	      * h_transaction;
+ 
+ 	/* Number of remaining buffers we are allowed to dirty: */
+ 	int			h_buffer_credits;
+ 
+ 	/* Reference count on this handle */
+ 	int			h_ref;
+ 
+ 	/* Field for caller's use to track errors through large fs
+ 	   operations */
+ 	int			h_err;
+ 
+ 	/* Flags */
+ 	unsigned int	h_sync:		1;	/* sync-on-close */
+ 	unsigned int	h_jdata:	1;	/* force data journaling */
+ 	unsigned int	h_aborted:	1;	/* fatal error on handle */
+ };
+ 
+ 
+ /* The transaction_t type is the guts of the journaling mechanism.  It
+  * tracks a compound transaction through its various states:
+  *
+  * RUNNING:	accepting new updates
+  * LOCKED:	Updates still running but we don't accept new ones
+  * RUNDOWN:	Updates are tidying up but have finished requesting
+  *		new buffers to modify (state not used for now)
+  * FLUSH:       All updates complete, but we are still writing to disk
+  * COMMIT:      All data on disk, writing commit record
+  * FINISHED:	We still have to keep the transaction for checkpointing.
+  *
+  * The transaction keeps track of all of the buffers modified by a
+  * running transaction, and all of the buffers committed but not yet
+  * flushed to home for finished transactions.
+  */
+ 
+ struct transaction_s 
+ {
+ 	/* Pointer to the journal for this transaction. */
+ 	journal_t *		t_journal;
+ 	
+ 	/* Sequence number for this transaction */
+ 	tid_t			t_tid;
+ 	
+ 	/* Transaction's current state */
+ 	enum {
+ 		T_RUNNING,
+ 		T_LOCKED,
+ 		T_RUNDOWN,
+ 		T_FLUSH,
+ 		T_COMMIT,
+ 		T_FINISHED 
+ 	}			t_state;
+ 
+ 	/* Where in the log does this transaction's commit start? */
+ 	unsigned long		t_log_start;
+ 	
+ 	/* Doubly-linked circular list of all inodes owned by this
+            transaction */	/* AKPM: unused */
+ 	struct inode *		t_ilist;
+ 	
+ 	/* Number of buffers on the t_buffers list */
+ 	int			t_nr_buffers;
+ 	
+ 	/* Doubly-linked circular list of all buffers reserved but not
+            yet modified by this transaction */
+ 	struct journal_head *	t_reserved_list;
+ 	
+ 	/* Doubly-linked circular list of all metadata buffers owned by this
+            transaction */
+ 	struct journal_head *	t_buffers;
+ 	
+ 	/*
+ 	 * Doubly-linked circular list of all data buffers still to be
+ 	 * flushed before this transaction can be committed.
+ 	 * Protected by journal_datalist_lock.
+ 	 */
+ 	struct journal_head *	t_sync_datalist;
+ 	
+ 	/*
+ 	 * Doubly-linked circular list of all writepage data buffers
+ 	 * still to be written before this transaction can be committed.
+ 	 * Protected by journal_datalist_lock.
+ 	 */
+ 	struct journal_head *	t_async_datalist;
+ 	
+ 	/* Doubly-linked circular list of all forget buffers (superseded
+            buffers which we can un-checkpoint once this transaction
+            commits) */
+ 	struct journal_head *	t_forget;
+ 	
+ 	/*
+ 	 * Doubly-linked circular list of all buffers still to be
+ 	 * flushed before this transaction can be checkpointed.
+ 	 */
+ 	/* Protected by journal_datalist_lock */
+ 	struct journal_head *	t_checkpoint_list;
+ 	
+ 	/* Doubly-linked circular list of temporary buffers currently
+            undergoing IO in the log */
+ 	struct journal_head *	t_iobuf_list;
+ 	
+ 	/* Doubly-linked circular list of metadata buffers being
+            shadowed by log IO.  The IO buffers on the iobuf list and the
+            shadow buffers on this list match each other one for one at
+            all times. */
+ 	struct journal_head *	t_shadow_list;
+ 	
+ 	/* Doubly-linked circular list of control buffers being written
+            to the log. */
+ 	struct journal_head *	t_log_list;
+ 	
+ 	/* Number of outstanding updates running on this transaction */
+ 	int			t_updates;
+ 
+ 	/* Number of buffers reserved for use by all handles in this
+ 	 * transaction but not yet modified. */
+ 	int			t_outstanding_credits;
+ 	
+ 	/*
+ 	 * Forward and backward links for the circular list of all
+ 	 * transactions awaiting checkpoint.
+ 	 */
+ 	/* Protected by journal_datalist_lock */
+ 	transaction_t		*t_cpnext, *t_cpprev;
+ 
+ 	/* When will the transaction expire (become due for commit), in
+ 	 * jiffies ? */
+ 	unsigned long		t_expires;
+ 
+ 	/* How many handles used this transaction? */
+ 	int t_handle_count;
+ };
+ 
+ 
+ /* The journal_t maintains all of the journaling state information for a
+  * single filesystem.  It is linked to from the fs superblock structure.
+  * 
+  * We use the journal_t to keep track of all outstanding transaction
+  * activity on the filesystem, and to manage the state of the log
+  * writing process. */
+ 
+ struct journal_s
+ {
+ 	/* General journaling state flags */
+ 	unsigned long		j_flags;
+ 
+ 	/* Is there an outstanding uncleared error on the journal (from
+ 	 * a prior abort)? */
+ 	int			j_errno;
+ 	
+ 	/* The superblock buffer */
+ 	struct buffer_head *	j_sb_buffer;
+ 	journal_superblock_t *	j_superblock;
+ 
+ 	/* Version of the superblock format */
+ 	int			j_format_version;
+ 
+ 	/* Number of processes waiting to create a barrier lock */
+ 	int			j_barrier_count;
+ 	
+ 	/* The barrier lock itself */
+ 	struct semaphore	j_barrier;
+ 	
+ 	/* Transactions: The current running transaction... */
+ 	transaction_t *		j_running_transaction;
+ 	
+ 	/* ... the transaction we are pushing to disk ... */
+ 	transaction_t *		j_committing_transaction;
+ 	
+ 	/* ... and a linked circular list of all transactions waiting
+ 	 * for checkpointing. */
+ 	/* Protected by journal_datalist_lock */
+ 	transaction_t *		j_checkpoint_transactions;
+ 
+ 	/* Wait queue for waiting for a locked transaction to start
+            committing, or for a barrier lock to be released */
+ 	wait_queue_head_t	j_wait_transaction_locked;
+ 	
+ 	/* Wait queue for waiting for checkpointing to complete */
+ 	wait_queue_head_t	j_wait_logspace;
+ 	
+ 	/* Wait queue for waiting for commit to complete */
+ 	wait_queue_head_t	j_wait_done_commit;
+ 	
+ 	/* Wait queue to trigger checkpointing */
+ 	wait_queue_head_t	j_wait_checkpoint;
+ 	
+ 	/* Wait queue to trigger commit */
+ 	wait_queue_head_t	j_wait_commit;
+ 	
+ 	/* Wait queue to wait for updates to complete */
+ 	wait_queue_head_t	j_wait_updates;
+ 
+ 	/* Semaphore for locking against concurrent checkpoints */
+ 	struct semaphore 	j_checkpoint_sem;
+ 
+ 	/* The main journal lock, used by lock_journal() */
+ 	struct semaphore	j_sem;
+ 		
+ 	/* Journal head: identifies the first unused block in the journal. */
+ 	unsigned long		j_head;
+ 	
+ 	/* Journal tail: identifies the oldest still-used block in the
+ 	 * journal. */
+ 	unsigned long		j_tail;
+ 
+ 	/* Journal free: how many free blocks are there in the journal? */
+ 	unsigned long		j_free;
+ 
+ 	/* Journal start and end: the block numbers of the first usable
+ 	 * block and one beyond the last usable block in the journal. */
+ 	unsigned long		j_first, j_last;
+ 
+ 	/* Device, blocksize and starting block offset for the location
+ 	 * where we store the journal. */
+ 	kdev_t			j_dev;
+ 	int			j_blocksize;
+ 	unsigned int		j_blk_offset;
+ 
+ 	/* Device which holds the client fs.  For internal journal this
+ 	 * will be equal to j_dev. */
+ 	kdev_t			j_fs_dev;
+ 
+ 	/* Total maximum capacity of the journal region on disk. */
+ 	unsigned int		j_maxlen;
+ 
+ 	/* Optional inode where we store the journal.  If present, all
+ 	 * journal block numbers are mapped into this inode via
+ 	 * bmap(). */
+ 	struct inode *		j_inode;
+ 
+ 	/* Sequence number of the oldest transaction in the log */
+ 	tid_t			j_tail_sequence;
+ 	/* Sequence number of the next transaction to grant */
+ 	tid_t			j_transaction_sequence;
+ 	/* Sequence number of the most recently committed transaction */
+ 	tid_t			j_commit_sequence;
+ 	/* Sequence number of the most recent transaction wanting commit */
+ 	tid_t			j_commit_request;
+ 
+ 	/* Journal uuid: identifies the object (filesystem, LVM volume
+ 	 * etc) backed by this journal.  This will eventually be
+ 	 * replaced by an array of uuids, allowing us to index multiple
+ 	 * devices within a single journal and to perform atomic updates
+ 	 * across them.  */
+ 
+ 	__u8			j_uuid[16];
+ 
+ 	/* Pointer to the current commit thread for this journal */
+ 	struct task_struct *	j_task;
+ 
+ 	/* Maximum number of metadata buffers to allow in a single
+ 	 * compound commit transaction */
+ 	int			j_max_transaction_buffers;
+ 
+ 	/* What is the maximum transaction lifetime before we begin a
+ 	 * commit? */
+ 	unsigned long		j_commit_interval;
+ 
+ 	/* The timer used to wakeup the commit thread: */
+ 	struct timer_list *	j_commit_timer;
+ 	int			j_commit_timer_active;
+ 
+ 	/* Link all journals together - system-wide */
+ 	struct list_head	j_all_journals;
+ 
+ 	/* The revoke table: maintains the list of revoked blocks in the
+            current transaction. */
+ 	struct jbd_revoke_table_s *j_revoke;
+ };
+ 
+ /* 
+  * Journal flag definitions 
+  */
+ #define JFS_UNMOUNT	0x001	/* Journal thread is being destroyed */
+ #define JFS_ABORT	0x002	/* Journaling has been aborted for errors. */
+ #define JFS_ACK_ERR	0x004	/* The errno in the sb has been acked */
+ #define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
+ #define JFS_LOADED	0x010	/* The journal superblock has been loaded */
+ 
+ /* 
+  * Function declarations for the journaling transaction and buffer
+  * management
+  */
+ 
+ /* Filing buffers */
+ extern void __journal_unfile_buffer(struct journal_head *);
+ extern void journal_unfile_buffer(struct journal_head *);
+ extern void __journal_refile_buffer(struct journal_head *);
+ extern void journal_refile_buffer(struct journal_head *);
+ extern void __journal_file_buffer(struct journal_head *, transaction_t *, int);
+ extern void __journal_free_buffer(struct journal_head *bh);
+ extern void journal_file_buffer(struct journal_head *, transaction_t *, int);
+ extern void __journal_clean_data_list(transaction_t *transaction);
+ 
+ /* Log buffer allocation */
+ extern struct journal_head * journal_get_descriptor_buffer(journal_t *);
+ extern unsigned long journal_next_log_block(journal_t *);
+ 
+ /* Commit management */
+ extern void journal_commit_transaction(journal_t *);
+ 
+ /* Checkpoint list management */
+ int __journal_clean_checkpoint_list(journal_t *journal);
+ extern void journal_remove_checkpoint(struct journal_head *);
+ extern void __journal_remove_checkpoint(struct journal_head *);
+ extern void journal_insert_checkpoint(struct journal_head *, transaction_t *);
+ extern void __journal_insert_checkpoint(struct journal_head *,transaction_t *);
+ 
+ /* Buffer IO */
+ extern int 
+ journal_write_metadata_buffer(transaction_t	  *transaction,
+ 			      struct journal_head  *jh_in,
+ 			      struct journal_head **jh_out,
+ 			      int		   blocknr);
+ 
+ /* Transaction locking */
+ extern void		__wait_on_journal (journal_t *);
+ 
+ /*
+  * Journal locking.
+  *
+  * We need to lock the journal during transaction state changes so that
+  * nobody ever tries to take a handle on the running transaction while
+  * we are in the middle of moving it to the commit phase.  
+  *
+  * Note that the locking is completely interrupt unsafe.  We never touch
+  * journal structures from interrupts.
+  *
+  * In 2.2, the BKL was required for lock_journal.  This is no longer
+  * the case.
+  */
+ 
+ static inline void lock_journal(journal_t *journal)
+ {
+ 	down(&journal->j_sem);	/* j_sem is the main journal lock */
+ }
+ 
+ /* This returns zero if we acquired the semaphore */
+ static inline int try_lock_journal(journal_t * journal)
+ {
+ 	return down_trylock(&journal->j_sem);	/* result of down_trylock() is passed through unchanged */
+ }
+ 
+ static inline void unlock_journal(journal_t * journal)
+ {
+ 	up(&journal->j_sem);	/* counterpart of lock_journal() / try_lock_journal() */
+ }
+ 
+ 
+ static inline handle_t *journal_current_handle(void)
+ {
+ 	return current->journal_info;	/* whatever handle is stored in current->journal_info */
+ }
+ 
+ /* The journaling code user interface:
+  *
+  * Create and destroy handles
+  * Register buffer modifications against the current transaction. 
+  */
+ 
+ extern handle_t *journal_start(journal_t *, int nblocks);
+ extern handle_t *journal_try_start(journal_t *, int nblocks);
+ extern int	 journal_restart (handle_t *, int nblocks);
+ extern int	 journal_extend (handle_t *, int nblocks);
+ extern int	 journal_get_write_access (handle_t *, struct buffer_head *);
+ extern int	 journal_get_create_access (handle_t *, struct buffer_head *);
+ extern int	 journal_get_undo_access (handle_t *, struct buffer_head *);
+ extern int	 journal_dirty_data (handle_t *,
+ 				struct buffer_head *, int async);
+ extern int	 journal_dirty_metadata (handle_t *, struct buffer_head *);
+ extern void	 journal_release_buffer (handle_t *, struct buffer_head *);
+ extern void	 journal_forget (handle_t *, struct buffer_head *);
+ extern void	 journal_sync_buffer (struct buffer_head *);
+ extern int	 journal_flushpage(journal_t *, struct page *, unsigned long);
+ extern int	 journal_try_to_free_buffers(journal_t *, struct page *, int);
+ extern int	 journal_stop(handle_t *);
+ extern int	 journal_flush (journal_t *);
+ 
+ extern void	 journal_lock_updates (journal_t *);
+ extern void	 journal_unlock_updates (journal_t *);
+ 
+ extern journal_t * journal_init_dev(kdev_t dev, kdev_t fs_dev,
+ 				int start, int len, int bsize);
+ extern journal_t * journal_init_inode (struct inode *);
+ extern int	   journal_update_format (journal_t *);
+ extern int	   journal_check_used_features 
+ 		   (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int	   journal_check_available_features 
+ 		   (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int	   journal_set_features 
+ 		   (journal_t *, unsigned long, unsigned long, unsigned long);
+ extern int	   journal_create     (journal_t *);
+ extern int	   journal_load       (journal_t *journal);
+ extern void	   journal_destroy    (journal_t *);
+ extern int	   journal_recover    (journal_t *journal);
+ extern int	   journal_wipe       (journal_t *, int);
+ extern int	   journal_skip_recovery (journal_t *);
+ extern void	   journal_update_superblock (journal_t *, int);
+ extern void	   __journal_abort      (journal_t *);
+ extern void	   journal_abort      (journal_t *, int);
+ extern int	   journal_errno      (journal_t *);
+ extern void	   journal_ack_err    (journal_t *);
+ extern int	   journal_clear_err  (journal_t *);
+ extern unsigned long journal_bmap(journal_t *journal, unsigned long blocknr);
+ extern int	    journal_force_commit(journal_t *journal);
+ 
+ /*
+  * journal_head management
+  */
+ extern struct journal_head
+ 		*journal_add_journal_head(struct buffer_head *bh);
+ extern void	journal_remove_journal_head(struct buffer_head *bh);
+ extern void	__journal_remove_journal_head(struct buffer_head *bh);
+ extern void	journal_unlock_journal_head(struct journal_head *jh);
+ 
+ /* Primary revoke support */
+ #define JOURNAL_REVOKE_DEFAULT_HASH 256
+ extern int	   journal_init_revoke(journal_t *, int);
+ extern void	   journal_destroy_revoke_caches(void);
+ extern int	   journal_init_revoke_caches(void);
+ 
+ extern void	   journal_destroy_revoke(journal_t *);
+ extern int	   journal_revoke (handle_t *,
+ 				unsigned long, struct buffer_head *);
+ extern int	   journal_cancel_revoke(handle_t *, struct journal_head *);
+ extern void	   journal_write_revoke_records(journal_t *, transaction_t *);
+ 
+ /* Recovery revoke support */
+ extern int	   journal_set_revoke(journal_t *, unsigned long, tid_t);
+ extern int	   journal_test_revoke(journal_t *, unsigned long, tid_t);
+ extern void	   journal_clear_revoke(journal_t *);
+ extern void	   journal_brelse_array(struct buffer_head *b[], int n);
+ 
+ /* The log thread user interface:
+  *
+  * Request space in the current transaction, and force transaction commit
+  * transitions on demand.
+  */
+ 
+ extern int	log_space_left (journal_t *); /* Called with journal locked */
+ extern tid_t	log_start_commit (journal_t *, transaction_t *);
+ extern void	log_wait_commit (journal_t *, tid_t);
+ extern int	log_do_checkpoint (journal_t *, int);
+ 
+ extern void	log_wait_for_space(journal_t *, int nblocks);
+ extern void	__journal_drop_transaction(journal_t *, transaction_t *);
+ extern int	cleanup_journal_tail(journal_t *);
+ 
+ /* Reduce journal memory usage by flushing */
+ extern void shrink_journal_memory(void);
+ 
+ /* Debugging code only: */
+ 
+ #define jbd_ENOSYS() \
+ do {	/* never exits: while (1) repeats the report-and-sleep body */	      \
+ 	printk (KERN_ERR "JBD unimplemented function %s\n", __FUNCTION__);    \
+ 	current->state = TASK_UNINTERRUPTIBLE;			      \
+ 	schedule();						      \
+ } while (1)
+ 
+ /*
+  * is_journal_aborted
+  *
+  * Simple test wrapper function to test the JFS_ABORT state flag.  This
+  * bit, when set, indicates that we have had a fatal error somewhere,
+  * either inside the journaling layer or indicated to us by the client
+  * (eg. ext3), and that we should not commit any further
+  * transactions.  Returns the masked flag word, so test it for non-zero.
+  */
+ 
+ static inline int is_journal_aborted(journal_t *journal)
+ {
+ 	return journal->j_flags & JFS_ABORT;
+ }
+ 
+ static inline int is_handle_aborted(handle_t *handle)
+ {
+ 	/* Aborted if the handle itself is flagged, else ask its journal. */
+ 	return handle->h_aborted ? 1 :
+ 		is_journal_aborted(handle->h_transaction->t_journal);
+ }
+ 
+ static inline void journal_abort_handle(handle_t *handle)
+ {
+ 	handle->h_aborted = 1;	/* the flag is_handle_aborted() checks first */
+ }
+ 
+ /* Not all architectures define BUG(): fallback that logs file:line, then stores through a NULL pointer */
+ #ifndef BUG
+  #define BUG() do { \
+         printk("kernel BUG at %s:%d!\n", __FILE__, __LINE__); \
+ 	* ((char *) 0) = 0; \
+  } while (0)
+ #endif /* BUG */
+ 
+ #endif /* __KERNEL__   */
+ 
+ /* Comparison functions for transaction IDs: perform comparisons using
+  * modulo arithmetic so that they work over sequence number wraps. */
+ 
+ static inline int tid_gt(tid_t x, tid_t y)
+ {
+ 	/* Signed view of the wrapped difference: positive iff x is after y. */
+ 	return (int)(x - y) > 0;
+ }
+ 
+ static inline int tid_geq(tid_t x, tid_t y)
+ {
+ 	/* Signed view of the wrapped difference; equal IDs also qualify. */
+ 	return (int)(x - y) >= 0;
+ }
+ 
+ extern int journal_blocks_per_page(struct inode *inode);
+ 
+ /*
+  * Definitions which augment the buffer_head layer
+  */
+ 
+ /* JBD additions */ 
+ 
+ /* journaling buffer types (the values compared against journal_head.b_jlist) */
+ #define BJ_None		0	/* Not journaled */
+ #define BJ_SyncData	1	/* Normal data: flush before commit */
+ #define BJ_AsyncData	2	/* writepage data: wait on it before commit */
+ #define BJ_Metadata	3	/* Normal journaled metadata */
+ #define BJ_Forget	4	/* Buffer superseded by this transaction */
+ #define BJ_IO		5	/* Buffer is for temporary IO use */
+ #define BJ_Shadow	6	/* Buffer contents being shadowed to the log */
+ #define BJ_LogCtl	7	/* Buffer contains log descriptors */
+ #define BJ_Reserved	8	/* Buffer is reserved for access by journal */
+ #define BJ_Types	9	/* Count of the BJ_* values above */
+  
+ extern int jbd_blocks_per_page(struct inode *inode);
+ 
+ #ifdef __KERNEL__
+ 
+ extern spinlock_t jh_splice_lock;
+ /*
+  * Evaluate `expr1' unlocked; only if it is true, take jh_splice_lock and
+  * yield `expr1 && expr2' re-evaluated under the lock (so expr1 runs twice).
+  */
+ #define SPLICE_LOCK(expr1, expr2)				\
+ 	({							\
+ 		int __splice_ret = (expr1);			\
+ 		if (__splice_ret) {				\
+ 			spin_lock(&jh_splice_lock);		\
+ 			__splice_ret = (expr1) && (expr2);	\
+ 			spin_unlock(&jh_splice_lock);		\
+ 		}						\
+ 		__splice_ret;					\
+ 	})
+ 
+ /*
+  * A number of buffer state predicates.  They test for
+  * buffer_jbd() because they are used in core kernel code.
+  *
+  * These will be racy on SMP unless we're *sure* that the
+  * buffer won't be detached from the journalling system
+  * in parallel.
+  */
+ 
+ /* Return true if buffer_jbd(bh) holds and bh's journal_head is on journal list `list' */
+ static inline int buffer_jlist_eq(struct buffer_head *bh, int list)
+ {
+ 	return SPLICE_LOCK(buffer_jbd(bh), bh2jh(bh)->b_jlist == list);
+ }
+ 
+ /* Return true if this buffer is dirty wrt the journal (JBDDirty state bit; jh_splice_lock is not taken) */
+ static inline int buffer_jdirty(struct buffer_head *bh)
+ {
+ 	return buffer_jbd(bh) && __buffer_state(bh, JBDDirty);
+ }
+ 
+ /* Return true if it's a data buffer which journalling is managing (BJ_SyncData or BJ_AsyncData list) */
+ static inline int buffer_jbd_data(struct buffer_head *bh)
+ {
+ 	return SPLICE_LOCK(buffer_jbd(bh),
+ 			bh2jh(bh)->b_jlist == BJ_SyncData ||
+ 			bh2jh(bh)->b_jlist == BJ_AsyncData);
+ }
+ 
+ #ifdef CONFIG_SMP
+ #define assert_spin_locked(lock)	J_ASSERT(spin_is_locked(lock))
+ #else	/* !CONFIG_SMP: the check expands to nothing */
+ #define assert_spin_locked(lock)	do {} while(0)
+ #endif	/* CONFIG_SMP */
+ 
+ #endif	/* __KERNEL__ */
+ 
+ #endif	/* CONFIG_JBD || CONFIG_JBD_MODULE || !__KERNEL__ */
+ 
+ /*
+  * Compatibility no-ops which allow the kernel to compile without CONFIG_JBD
+  * go here.
+  */
+ 
+ #if defined(__KERNEL__) && !(defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE))
+ 
+ #define J_ASSERT(expr)			do {} while (0)	/* expr is not evaluated */
+ #define J_ASSERT_BH(bh, expr)		do {} while (0)	/* expr is not evaluated */
+ #define buffer_jbd(bh)			0
+ #define buffer_jlist_eq(bh, val)	0
+ #define journal_buffer_journal_lru(bh)	0
+ 
+ #endif	/* defined(__KERNEL__) && !(defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)) */
+ #endif	/* _LINUX_JBD_H */
diff -rc2P linux/include/linux/journal-head.h linux-2.4.13/include/linux/journal-head.h
*** linux/include/linux/journal-head.h	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/journal-head.h	Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,70 ----
+ /*
+  * include/linux/journal-head.h
+  *
+  * buffer_head fields for JBD
+  *
+  * 27 May 2001 Andrew Morton <andrewm@uow.edu.au>
+  *	Created - pulled out of fs.h
+  */
+ 
+ #ifndef JOURNAL_HEAD_H_INCLUDED
+ #define JOURNAL_HEAD_H_INCLUDED
+ 
+ typedef unsigned int		tid_t;		/* Unique transaction ID */
+ typedef struct transaction_s	transaction_t;	/* Compound transaction type */
+ struct buffer_head;
+ 
+ struct journal_head {	/* JBD's per-buffer_head bookkeeping fields */
+ #ifndef CONFIG_JBD_UNIFIED_BUFFERS
+ 	/* Points back to our buffer_head. */
+ 	struct buffer_head *b_bh;
+ #endif	/* !CONFIG_JBD_UNIFIED_BUFFERS */
+ 
+ 	/* Reference count - see description in journal.c */
+ 	int b_jcount;
+ 
+ 	/* Journaling list for this buffer: compared against the BJ_* values */
+ 	unsigned b_jlist;
+ 
+ 	/* Copy of the buffer data frozen for writing to the log. */
+ 	char * b_frozen_data;
+ 
+ 	/* Pointer to a saved copy of the buffer containing no
+ 	 * uncommitted deallocation references, so that allocations can
+ 	 * avoid overwriting uncommitted deletes. */
+ 	char * b_committed_data;
+ 
+ 	/* Pointer to the compound transaction which owns this buffer's
+ 	 * metadata: either the running transaction or the committing
+ 	 * transaction (if there is one).  Only applies to buffers on a
+ 	 * transaction's data or metadata journaling list. */
+ 	/* Protected by journal_datalist_lock */
+ 	transaction_t * b_transaction;
+ 	
+ 	/* Pointer to the running compound transaction which is
+ 	 * currently modifying the buffer's metadata, if there was
+ 	 * already a transaction committing it when the new transaction
+ 	 * touched it. */
+ 	transaction_t * b_next_transaction;
+ 	
+ 	/* Doubly-linked list of buffers on a transaction's data,
+ 	 * metadata or forget queue. */
+ 	/* Protected by journal_datalist_lock */
+ 	struct journal_head *b_tnext, *b_tprev;
+ 
+ 	/*
+ 	 * Pointer to the compound transaction against which this buffer
+ 	 * is checkpointed.  Only dirty buffers can be checkpointed.
+ 	 */
+ 	/* Protected by journal_datalist_lock */
+ 	transaction_t * b_cp_transaction;
+ 	
+ 	/*
+ 	 * Doubly-linked list of buffers still remaining to be flushed
+ 	 * before an old transaction can be checkpointed.
+ 	 */
+ 	/* Protected by journal_datalist_lock */
+ 	struct journal_head *b_cpnext, *b_cpprev;
+ };
+ 
+ #endif		/* JOURNAL_HEAD_H_INCLUDED */
diff -rc2P linux/include/linux/sched.h linux-2.4.13/include/linux/sched.h
*** linux/include/linux/sched.h	Fri Nov  9 16:15:08 2001
--- linux-2.4.13/include/linux/sched.h	Fri Nov  9 16:58:32 2001
***************
*** 420,423 ****
--- 420,425 ----
  /* Protection of (de-)allocation: mm, files, fs, tty */
  	spinlock_t alloc_lock;
+ /* journalling filesystem info */
+        void *journal_info;
  /* Field to make virtual server running in chroot more  isolated */
  	int s_context;	/* Process can only deal with other processes */
***************
*** 513,516 ****
--- 515,519 ----
      blocked:		{{0}},						\
      alloc_lock:		SPIN_LOCK_UNLOCKED,				\
+     journal_info:      	NULL,						\
      cap_bset:		CAP_INIT_EFF_SET,				\
  }
diff -rc2P linux/include/linux/sched.h.orig linux-2.4.13/include/linux/sched.h.orig
*** linux/include/linux/sched.h.orig	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/sched.h.orig	Fri Nov  9 16:15:08 2001
***************
*** 0 ****
--- 1,936 ----
+ #ifndef _LINUX_SCHED_H
+ #define _LINUX_SCHED_H
+ 
+ #include <asm/param.h>	/* for HZ */
+ 
+ extern unsigned long event;
+ 
+ #include <linux/config.h>
+ #include <linux/binfmts.h>
+ #include <linux/threads.h>
+ #include <linux/kernel.h>
+ #include <linux/types.h>
+ #include <linux/times.h>
+ #include <linux/timex.h>
+ #include <linux/rbtree.h>
+ 
+ #include <asm/system.h>
+ #include <asm/semaphore.h>
+ #include <asm/page.h>
+ #include <asm/ptrace.h>
+ #include <asm/mmu.h>
+ 
+ #include <linux/smp.h>
+ #include <linux/tty.h>
+ #include <linux/sem.h>
+ #include <linux/signal.h>
+ #include <linux/securebits.h>
+ #include <linux/fs_struct.h>
+ 
+ struct exec_domain;
+ 
+ /*
+  * cloning flags:
+  */
+ #define CSIGNAL		0x000000ff	/* signal mask to be sent at exit */
+ #define CLONE_VM	0x00000100	/* set if VM shared between processes */
+ #define CLONE_FS	0x00000200	/* set if fs info shared between processes */
+ #define CLONE_FILES	0x00000400	/* set if open files shared between processes */
+ #define CLONE_SIGHAND	0x00000800	/* set if signal handlers and blocked signals shared */
+ #define CLONE_PID	0x00001000	/* set if pid shared */
+ #define CLONE_PTRACE	0x00002000	/* set if we want to let tracing continue on the child too */
+ #define CLONE_VFORK	0x00004000	/* set if the parent wants the child to wake it up on mm_release */
+ #define CLONE_PARENT	0x00008000	/* set if we want to have the same parent as the cloner */
+ #define CLONE_THREAD	0x00010000	/* Same thread group? */
+ 
+ #define CLONE_SIGNAL	(CLONE_SIGHAND | CLONE_THREAD)
+ 
+ /*
+  * These are the constant used to fake the fixed-point load-average
+  * counting. Some notes:
+  *  - 11 bit fractions expand to 22 bits by the multiplies: this gives
+  *    a load-average precision of 10 bits integer + 11 bits fractional
+  *  - if you want to count load-averages more often, you need more
+  *    precision, or rounding will get you. With 2-second counting freq,
+  *    the EXP_n values would be 1981, 2034 and 2043 if still using only
+  *    11 bit fractions.
+  */
+ extern unsigned long avenrun[];		/* Load averages */
+ 
+ #define FSHIFT		11		/* nr of bits of precision */
+ #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
+ #define LOAD_FREQ	(5*HZ)		/* 5 sec intervals */
+ #define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
+ #define EXP_5		2014		/* 1/exp(5sec/5min) */
+ #define EXP_15		2037		/* 1/exp(5sec/15min) */
+ 
+ #define CALC_LOAD(load,exp,n) \
+ 	load *= exp; \
+ 	load += n*(FIXED_1-exp); \
+ 	load >>= FSHIFT;
+ 
+ #define CT_TO_SECS(x)	((x) / HZ)
+ #define CT_TO_USECS(x)	(((x) % HZ) * 1000000/HZ)
+ 
+ extern int nr_running, nr_threads;
+ extern int last_pid;
+ 
+ #include <linux/fs.h>
+ #include <linux/time.h>
+ #include <linux/param.h>
+ #include <linux/resource.h>
+ #include <linux/timer.h>
+ 
+ #include <asm/processor.h>
+ 
+ #define TASK_RUNNING		0
+ #define TASK_INTERRUPTIBLE	1
+ #define TASK_UNINTERRUPTIBLE	2
+ #define TASK_ZOMBIE		4
+ #define TASK_STOPPED		8
+ 
+ #define __set_task_state(tsk, state_value)		\
+ 	do { (tsk)->state = (state_value); } while (0)
+ #ifdef CONFIG_SMP
+ #define set_task_state(tsk, state_value)		\
+ 	set_mb((tsk)->state, (state_value))
+ #else
+ #define set_task_state(tsk, state_value)		\
+ 	__set_task_state((tsk), (state_value))
+ #endif
+ 
+ #define __set_current_state(state_value)			\
+ 	do { current->state = (state_value); } while (0)
+ #ifdef CONFIG_SMP
+ #define set_current_state(state_value)		\
+ 	set_mb(current->state, (state_value))
+ #else
+ #define set_current_state(state_value)		\
+ 	__set_current_state(state_value)
+ #endif
+ 
+ /*
+  * Scheduling policies
+  */
+ #define SCHED_OTHER		0
+ #define SCHED_FIFO		1
+ #define SCHED_RR		2
+ 
+ /*
+  * This is an additional bit set when we want to
+  * yield the CPU for one re-schedule..
+  */
+ #define SCHED_YIELD		0x10
+ 
+ struct sched_param {
+ 	int sched_priority;
+ };
+ 
+ struct completion;
+ 
+ #ifdef __KERNEL__
+ 
+ #include <linux/spinlock.h>
+ 
+ /*
+  * This serializes "schedule()" and also protects
+  * the run-queue from deletions/modifications (but
+  * _adding_ to the beginning of the run-queue has
+  * a separate lock).
+  */
+ extern rwlock_t tasklist_lock;
+ extern spinlock_t runqueue_lock;
+ extern spinlock_t mmlist_lock;
+ 
+ extern void sched_init(void);
+ extern void init_idle(void);
+ extern void show_state(void);
+ extern void cpu_init (void);
+ extern void trap_init(void);
+ extern void update_process_times(int user);
+ extern void update_one_process(struct task_struct *p, unsigned long user,
+ 			       unsigned long system, int cpu);
+ 
+ #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
+ extern signed long FASTCALL(schedule_timeout(signed long timeout));
+ asmlinkage void schedule(void);
+ 
+ extern int schedule_task(struct tq_struct *task);
+ extern void flush_scheduled_tasks(void);
+ extern int start_context_thread(void);
+ extern int current_is_keventd(void);
+ 
+ /*
+  * The default fd array needs to be at least BITS_PER_LONG,
+  * as this is the granularity returned by copy_fdset().
+  */
+ #define NR_OPEN_DEFAULT BITS_PER_LONG
+ 
+ /*
+  * Open file table structure
+  */
+ struct files_struct {
+ 	atomic_t count;
+ 	rwlock_t file_lock;	/* Protects all the below members.  Nests inside tsk->alloc_lock */
+ 	int max_fds;
+ 	int max_fdset;
+ 	int next_fd;
+ 	struct file ** fd;	/* current fd array */
+ 	fd_set *close_on_exec;
+ 	fd_set *open_fds;
+ 	fd_set close_on_exec_init;
+ 	fd_set open_fds_init;
+ 	struct file * fd_array[NR_OPEN_DEFAULT];
+ };
+ 
+ #define INIT_FILES \
+ { 							\
+ 	count:		ATOMIC_INIT(1), 		\
+ 	file_lock:	RW_LOCK_UNLOCKED, 		\
+ 	max_fds:	NR_OPEN_DEFAULT, 		\
+ 	max_fdset:	__FD_SETSIZE, 			\
+ 	next_fd:	0, 				\
+ 	fd:		&init_files.fd_array[0], 	\
+ 	close_on_exec:	&init_files.close_on_exec_init, \
+ 	open_fds:	&init_files.open_fds_init, 	\
+ 	close_on_exec_init: { { 0, } }, 		\
+ 	open_fds_init:	{ { 0, } }, 			\
+ 	fd_array:	{ NULL, } 			\
+ }
+ 
+ /* Maximum number of active map areas.. This is a random (large) number */
+ #define MAX_MAP_COUNT	(65536)
+ 
+ struct mm_struct {
+ 	struct vm_area_struct * mmap;		/* list of VMAs */
+ 	rb_root_t mm_rb;
+ 	struct vm_area_struct * mmap_cache;	/* last find_vma result */
+ 	pgd_t * pgd;
+ 	atomic_t mm_users;			/* How many users with user space? */
+ 	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
+ 	int map_count;				/* number of VMAs */
+ 	struct rw_semaphore mmap_sem;
+ 	spinlock_t page_table_lock;		/* Protects task page tables and mm->rss */
+ 
+ 	struct list_head mmlist;		/* List of all active mm's.  These are globally strung
+ 						 * together off init_mm.mmlist, and are protected
+ 						 * by mmlist_lock
+ 						 */
+ 
+ 	unsigned long start_code, end_code, start_data, end_data;
+ 	unsigned long start_brk, brk, start_stack;
+ 	unsigned long arg_start, arg_end, env_start, env_end;
+ 	unsigned long rss, total_vm, locked_vm;
+ 	unsigned long def_flags;
+ 	unsigned long cpu_vm_mask;
+ 	unsigned long swap_address;
+ 
+ 	unsigned dumpable:1;
+ 
+ 	/* Architecture-specific MM context */
+ 	mm_context_t context;
+ };
+ 
+ extern int mmlist_nr;
+ 
+ #define INIT_MM(name) \
+ {			 				\
+ 	mm_rb:		RB_ROOT,			\
+ 	pgd:		swapper_pg_dir, 		\
+ 	mm_users:	ATOMIC_INIT(2), 		\
+ 	mm_count:	ATOMIC_INIT(1), 		\
+ 	mmap_sem:	__RWSEM_INITIALIZER(name.mmap_sem), \
+ 	page_table_lock: SPIN_LOCK_UNLOCKED, 		\
+ 	mmlist:		LIST_HEAD_INIT(name.mmlist),	\
+ }
+ 
+ struct signal_struct {
+ 	atomic_t		count;
+ 	struct k_sigaction	action[_NSIG];
+ 	spinlock_t		siglock;
+ };
+ 
+ 
+ #define INIT_SIGNALS {	\
+ 	count:		ATOMIC_INIT(1), 		\
+ 	action:		{ {{0,}}, }, 			\
+ 	siglock:	SPIN_LOCK_UNLOCKED 		\
+ }
+ 
+ /*
+  * Some day this will be a full-fledged user tracking system..
+  */
+ struct user_struct {
+ 	atomic_t __count;	/* reference count */
+ 	atomic_t processes;	/* How many processes does this user have? */
+ 	atomic_t files;		/* How many open files does this user have? */
+ 
+ 	/* Hash table maintenance information */
+ 	struct user_struct *next, **pprev;
+ 	uid_t uid;
+ };
+ 
+ #define get_current_user() ({ 				\
+ 	struct user_struct *__user = current->user;	\
+ 	atomic_inc(&__user->__count);			\
+ 	__user; })
+ 
+ 
+ /*
+ 	We may have a different domainname and nodename for each security
+ 	context. By default, a security context share the same as its
+ 	parent, potentially the information in system_utsname
+ */
+ #define S_CTX_INFO_LOCK		1	/* Can't request a new s_context */
+ #define S_CTX_INFO_SCHED	2	/* All process in the s_context */
+ 					/* Contribute to the schedular */
+ struct context_info{
+ 	int refcount;
+ 	int s_context;
+ 	char nodename[65];
+ 	char domainname[65];
+ 	int flags;		/* S_CTX_INFO_xxx */
+ 	atomic_t ticks;		/* Number of ticks used by all process */
+ 				/* in the s_context */
+ };
+ 
+ 
+ extern struct user_struct root_user;
+ #define INIT_USER (&root_user)
+ 
+ struct task_struct {
+ 	/*
+ 	 * offsets of these are hardcoded elsewhere - touch with care
+ 	 */
+ 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
+ 	unsigned long flags;	/* per process flags, defined below */
+ 	int sigpending;
+ 	mm_segment_t addr_limit;	/* thread address space:
+ 					 	0-0xBFFFFFFF for user-thead
+ 						0-0xFFFFFFFF for kernel-thread
+ 					 */
+ 	struct exec_domain *exec_domain;
+ 	volatile long need_resched;
+ 	unsigned long ptrace;
+ 
+ 	int lock_depth;		/* Lock depth */
+ 
+ /*
+  * offset 32 begins here on 32-bit platforms. We keep
+  * all fields in a single cacheline that are needed for
+  * the goodness() loop in schedule().
+  */
+ 	long counter;
+ 	long nice;
+ 	unsigned long policy;
+ 	struct mm_struct *mm;
+ 	int has_cpu, processor;
+ 	unsigned long cpus_allowed;
+ 	/*
+ 	 * (only the 'next' pointer fits into the cacheline, but
+ 	 * that's just fine.)
+ 	 */
+ 	struct list_head run_list;
+ 	unsigned long sleep_time;
+ 
+ 	struct task_struct *next_task, *prev_task;
+ 	struct mm_struct *active_mm;
+ 	struct list_head local_pages;
+ 	unsigned int allocation_order, nr_local_pages;
+ 
+ /* task state */
+ 	struct linux_binfmt *binfmt;
+ 	int exit_code, exit_signal;
+ 	int pdeath_signal;  /*  The signal sent when the parent dies  */
+ 	/* ??? */
+ 	unsigned long personality;
+ 	int did_exec:1;
+ 	pid_t pid;
+ 	pid_t pgrp;
+ 	pid_t tty_old_pgrp;
+ 	pid_t session;
+ 	pid_t tgid;
+ 	/* boolean value for session group leader */
+ 	int leader;
+ 	/* 
+ 	 * pointers to (original) parent process, youngest child, younger sibling,
+ 	 * older sibling, respectively.  (p->father can be replaced with 
+ 	 * p->p_pptr->pid)
+ 	 */
+ 	struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr;
+ 	struct list_head thread_group;
+ 
+ 	/* PID hash table linkage. */
+ 	struct task_struct *pidhash_next;
+ 	struct task_struct **pidhash_pprev;
+ 
+ 	wait_queue_head_t wait_chldexit;	/* for wait4() */
+ 	struct completion *vfork_done;		/* for vfork() */
+ 	unsigned long rt_priority;
+ 	unsigned long it_real_value, it_prof_value, it_virt_value;
+ 	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
+ 	struct timer_list real_timer;
+ 	struct tms times;
+ 	unsigned long start_time;
+ 	long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS];
+ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
+ 	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+ 	int swappable:1;
+ /* process credentials */
+ 	uid_t uid,euid,suid,fsuid;
+ 	gid_t gid,egid,sgid,fsgid;
+ 	int ngroups;
+ 	gid_t	groups[NGROUPS];
+ 	kernel_cap_t   cap_effective, cap_inheritable, cap_permitted;
+ 	int keep_capabilities:1;
+ 	struct user_struct *user;
+ /* limits */
+ 	struct rlimit rlim[RLIM_NLIMITS];
+ 	unsigned short used_math;
+ 	char comm[16];
+ /* file system info */
+ 	int link_count, total_link_count;
+ 	struct tty_struct *tty; /* NULL if no tty */
+ 	unsigned int locks; /* How many file locks are being held */
+ /* ipc stuff */
+ 	struct sem_undo *semundo;
+ 	struct sem_queue *semsleeping;
+ /* CPU-specific state of this task */
+ 	struct thread_struct thread;
+ /* filesystem information */
+ 	struct fs_struct *fs;
+ /* open file information */
+ 	struct files_struct *files;
+ /* signal handlers */
+ 	spinlock_t sigmask_lock;	/* Protects signal and blocked */
+ 	struct signal_struct *sig;
+ 
+ 	sigset_t blocked;
+ 	struct sigpending pending;
+ 
+ 	unsigned long sas_ss_sp;
+ 	size_t sas_ss_size;
+ 	int (*notifier)(void *priv);
+ 	void *notifier_data;
+ 	sigset_t *notifier_mask;
+ 	
+ /* Thread group tracking */
+    	u32 parent_exec_id;
+    	u32 self_exec_id;
+ /* Protection of (de-)allocation: mm, files, fs, tty */
+ 	spinlock_t alloc_lock;
+ /* Field to make virtual server running in chroot more  isolated */
+ 	int s_context;	/* Process can only deal with other processes */
+ 			/* with the same s_context */
+ 	__u32 cap_bset;	/* Maximum capability of this process and children */
+ 	unsigned long ipv4root;	/* Process can only bind to this iP */
+ 	struct context_info *s_info;
+ };
+ 
+ /*
+  * Per process flags
+  */
+ #define PF_ALIGNWARN	0x00000001	/* Print alignment warning msgs */
+ 					/* Not implemented yet, only for 486*/
+ #define PF_STARTING	0x00000002	/* being created */
+ #define PF_EXITING	0x00000004	/* getting shut down */
+ #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
+ #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
+ #define PF_DUMPCORE	0x00000200	/* dumped core */
+ #define PF_SIGNALED	0x00000400	/* killed by a signal */
+ #define PF_MEMALLOC	0x00000800	/* Allocating memory */
+ #define PF_FREE_PAGES	0x00002000	/* per process page freeing */
+ 
+ #define PF_USEDFPU	0x00100000	/* task used FPU this quantum (SMP) */
+ 
+ /*
+  * Ptrace flags
+  */
+ 
+ #define PT_PTRACED	0x00000001
+ #define PT_TRACESYS	0x00000002
+ #define PT_DTRACE	0x00000004	/* delayed trace (used on m68k, i386) */
+ #define PT_TRACESYSGOOD	0x00000008
+ #define PT_PTRACE_CAP	0x00000010	/* ptracer can follow suid-exec */
+ 
+ /*
+  * Limit the stack by to some sane default: root can always
+  * increase this limit if needed..  8MB seems reasonable.
+  */
+ #define _STK_LIM	(8*1024*1024)
+ 
+ #define DEF_COUNTER	(10*HZ/100)	/* 100 ms time slice */
+ #define MAX_COUNTER	(20*HZ/100)
+ #define DEF_NICE	(0)
+ 
+ 
+ /*
+  * The default (Linux) execution domain.
+  */
+ extern struct exec_domain	default_exec_domain;
+ 
+ /*
+  *  INIT_TASK is used to set up the first task table, touch at
+  * your own risk!. Base=0, limit=0x1fffff (=2MB)
+  */
+ #define INIT_TASK(tsk)	\
+ {									\
+     state:		0,						\
+     flags:		0,						\
+     sigpending:		0,						\
+     addr_limit:		KERNEL_DS,					\
+     exec_domain:	&default_exec_domain,				\
+     lock_depth:		-1,						\
+     counter:		DEF_COUNTER,					\
+     nice:		DEF_NICE,					\
+     policy:		SCHED_OTHER,					\
+     mm:			NULL,						\
+     active_mm:		&init_mm,					\
+     cpus_allowed:	-1,						\
+     run_list:		LIST_HEAD_INIT(tsk.run_list),			\
+     next_task:		&tsk,						\
+     prev_task:		&tsk,						\
+     p_opptr:		&tsk,						\
+     p_pptr:		&tsk,						\
+     thread_group:	LIST_HEAD_INIT(tsk.thread_group),		\
+     wait_chldexit:	__WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\
+     real_timer:		{						\
+ 	function:		it_real_fn				\
+     },									\
+     cap_effective:	CAP_INIT_EFF_SET,				\
+     cap_inheritable:	CAP_INIT_INH_SET,				\
+     cap_permitted:	CAP_FULL_SET,					\
+     keep_capabilities:	0,						\
+     rlim:		INIT_RLIMITS,					\
+     user:		INIT_USER,					\
+     comm:		"swapper",					\
+     thread:		INIT_THREAD,					\
+     fs:			&init_fs,					\
+     files:		&init_files,					\
+     sigmask_lock:	SPIN_LOCK_UNLOCKED,				\
+     sig:		&init_signals,					\
+     pending:		{ NULL, &tsk.pending.head, {{0}}},		\
+     blocked:		{{0}},						\
+     alloc_lock:		SPIN_LOCK_UNLOCKED,				\
+     cap_bset:		CAP_INIT_EFF_SET,				\
+ }
+ 
+ 
+ #ifndef INIT_TASK_SIZE
+ # define INIT_TASK_SIZE	2048*sizeof(long)
+ #endif
+ 
+ union task_union {
+ 	struct task_struct task;
+ 	unsigned long stack[INIT_TASK_SIZE/sizeof(long)];
+ };
+ 
+ extern union task_union init_task_union;
+ 
+ extern struct   mm_struct init_mm;
+ extern struct task_struct *init_tasks[NR_CPUS];
+ 
+ /* PID hashing. (shouldnt this be dynamic?) */
+ #define PIDHASH_SZ (4096 >> 2)
+ extern struct task_struct *pidhash[PIDHASH_SZ];
+ 
+ #define pid_hashfn(x)	((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1))
+ 
+ static inline void hash_pid(struct task_struct *p)
+ {
+ 	struct task_struct **htable = &pidhash[pid_hashfn(p->pid)];
+ 
+ 	if((p->pidhash_next = *htable) != NULL)
+ 		(*htable)->pidhash_pprev = &p->pidhash_next;
+ 	*htable = p;
+ 	p->pidhash_pprev = htable;
+ }
+ 
+ static inline void unhash_pid(struct task_struct *p)
+ {
+ 	if(p->pidhash_next)
+ 		p->pidhash_next->pidhash_pprev = p->pidhash_pprev;
+ 	*p->pidhash_pprev = p->pidhash_next;
+ }
+ 
+ static inline struct task_struct *find_task_by_pid(int pid)
+ {
+ 	struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)];
+ 
+ 	for(p = *htable; p && p->pid != pid; p = p->pidhash_next)
+ 		;
+ 
+ 	return p;
+ }
+ 
+ /* per-UID process charging. */
+ extern struct user_struct * alloc_uid(uid_t);
+ extern void free_uid(struct user_struct *);
+ 
+ #include <asm/current.h>
+ 
+ extern unsigned long volatile jiffies;
+ extern unsigned long itimer_ticks;
+ extern unsigned long itimer_next;
+ extern struct timeval xtime;
+ extern void do_timer(struct pt_regs *);
+ 
+ extern unsigned int * prof_buffer;
+ extern unsigned long prof_len;
+ extern unsigned long prof_shift;
+ 
+ #define CURRENT_TIME (xtime.tv_sec)
+ 
+ extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr));
+ extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr));
+ extern void FASTCALL(sleep_on(wait_queue_head_t *q));
+ extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q,
+ 				      signed long timeout));
+ extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q));
+ extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
+ 						    signed long timeout));
+ extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+ 
+ #define wake_up(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+ #define wake_up_nr(x, nr)		__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+ #define wake_up_all(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0)
+ #define wake_up_sync(x)			__wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1)
+ #define wake_up_sync_nr(x, nr)		__wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr)
+ #define wake_up_interruptible(x)	__wake_up((x),TASK_INTERRUPTIBLE, 1)
+ #define wake_up_interruptible_nr(x, nr)	__wake_up((x),TASK_INTERRUPTIBLE, nr)
+ #define wake_up_interruptible_all(x)	__wake_up((x),TASK_INTERRUPTIBLE, 0)
+ #define wake_up_interruptible_sync(x)	__wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
+ #define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE,  nr)	/* nr is now a real parameter, as in the sibling *_nr macros */
+ asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru);
+ 
+ extern int in_group_p(gid_t);
+ extern int in_egroup_p(gid_t);
+ 
+ extern void proc_caches_init(void);
+ extern void flush_signals(struct task_struct *);
+ extern void flush_signal_handlers(struct task_struct *);
+ extern int dequeue_signal(sigset_t *, siginfo_t *);
+ extern void block_all_signals(int (*notifier)(void *priv), void *priv,
+ 			      sigset_t *mask);
+ extern void unblock_all_signals(void);
+ extern int send_sig_info(int, struct siginfo *, struct task_struct *);
+ extern int force_sig_info(int, struct siginfo *, struct task_struct *);
+ extern int kill_pg_info(int, struct siginfo *, pid_t);
+ extern int kill_sl_info(int, struct siginfo *, pid_t);
+ extern int kill_proc_info(int, struct siginfo *, pid_t);
+ extern void notify_parent(struct task_struct *, int);
+ extern void do_notify_parent(struct task_struct *, int);
+ extern void force_sig(int, struct task_struct *);
+ extern int send_sig(int, struct task_struct *, int);
+ extern int kill_pg(pid_t, int, int);
+ extern int kill_sl(pid_t, int, int);
+ extern int kill_proc(pid_t, int, int);
+ extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
+ extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long);
+ 
+ static inline int signal_pending(struct task_struct *p)
+ {
+ 	return (p->sigpending != 0);
+ }
+ 
+ /*
+  * Return 1 if at least one bit set in *signal is clear in *blocked,
+  * otherwise 0.  All _NSIG_WORDS words of the two sets are examined.
+  */
+ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
+ {
+ 	unsigned long ready;	/* OR of the unblocked bits from every word */
+ 	long i;
+ 
+ 	switch (_NSIG_WORDS) {
+ 	default:	/* generic loop for any other word count */
+ 		for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;)
+ 			ready |= signal->sig[i] &~ blocked->sig[i];
+ 		break;
+ 
+ 	case 4: ready  = signal->sig[3] &~ blocked->sig[3];
+ 		ready |= signal->sig[2] &~ blocked->sig[2];
+ 		ready |= signal->sig[1] &~ blocked->sig[1];
+ 		ready |= signal->sig[0] &~ blocked->sig[0];
+ 		break;
+ 
+ 	case 2: ready  = signal->sig[1] &~ blocked->sig[1];
+ 		ready |= signal->sig[0] &~ blocked->sig[0];
+ 		break;
+ 
+ 	case 1: ready  = signal->sig[0] &~ blocked->sig[0];
+ 	}
+ 	return ready !=	0;
+ }
+ 
+ /* Reevaluate whether the task has signals pending delivery and store
+    the 0/1 answer in t->sigpending.  This is required every time the
+    blocked sigset_t changes.  All callers should have t->sigmask_lock.  */
+ 
+ static inline void recalc_sigpending(struct task_struct *t)
+ {
+ 	t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked);
+ }
+ 
+ /* True if sp lies inside current's alternate signal stack.  The unsigned
+    subtraction wraps when sp is below the base, so one compare is enough.  */
+ static inline int on_sig_stack(unsigned long sp)
+ {
+ 	return (sp - current->sas_ss_sp < current->sas_ss_size);
+ }
+ 
+ static inline int sas_ss_flags(unsigned long sp)
+ {
+ 	return (current->sas_ss_size == 0 ? SS_DISABLE	/* zero-sized: disabled */
+ 		: on_sig_stack(sp) ? SS_ONSTACK : 0);	/* else: is sp on it? */
+ }
+ 
+ extern int request_irq(unsigned int,
+ 		       void (*handler)(int, void *, struct pt_regs *),
+ 		       unsigned long, const char *, void *);
+ extern void free_irq(unsigned int, void *);
+ 
+ /*
+  * This has now become a routine instead of a macro, it sets a flag if
+  * it returns true (to do BSD-style accounting where the process is flagged
+  * if it uses root privs). The implication of this is that you should do
+  * normal permissions checks first, and check suser() last.
+  *
+  * [Dec 1997 -- Chris Evans]
+  * For correctness, the above considerations need to be extended to
+  * fsuser(). This is done, along with moving fsuser() checks to be
+  * last.
+  *
+  * These will be removed, but in the mean time, when the SECURE_NOROOT 
+  * flag is set, uids don't grant privilege.
+  */
+ static inline int suser(void)
+ {
+ 	if (!issecure(SECURE_NOROOT) && current->euid == 0) { 
+ 		current->flags |= PF_SUPERPRIV;	/* note the use of root privilege */
+ 		return 1;
+ 	}
+ 	return 0;	/* euid is not 0, or SECURE_NOROOT is in force */
+ }
+ 
+ static inline int fsuser(void)
+ {
+ 	if (!issecure(SECURE_NOROOT) && current->fsuid == 0) {
+ 		current->flags |= PF_SUPERPRIV;	/* note the use of root privilege */
+ 		return 1;
+ 	}
+ 	return 0;	/* fsuid is not 0, or SECURE_NOROOT is in force */
+ }
+ 
+ /*
+  * capable() checks for a particular capability.  
+  * New privilege checks should use this interface, rather than suser() or
+  * fsuser(). See include/linux/capability.h for defined capabilities.
+  */
+ 
+ static inline int capable(int cap)
+ {
+ #if 1 /* ok now */
+ 	if (cap_raised(current->cap_effective, cap))
+ #else	/* never compiled while the #if above is 1: uid-based check */
+ 	if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0)
+ #endif
+ 	{
+ 		current->flags |= PF_SUPERPRIV;	/* record that the privilege was used */
+ 		return 1;
+ 	}
+ 	return 0;	/* cap is not raised in current's effective set */
+ }
+ 
+ /*
+  * Routines for handling mm_structs
+  */
+ extern struct mm_struct * mm_alloc(void);
+ 
+ extern struct mm_struct * start_lazy_tlb(void);
+ extern void end_lazy_tlb(struct mm_struct *mm);
+ 
+ /* mmdrop releases one mm_count reference; the last one calls __mmdrop() */
+ extern inline void FASTCALL(__mmdrop(struct mm_struct *));
+ static inline void mmdrop(struct mm_struct * mm)
+ {
+ 	if (atomic_dec_and_test(&mm->mm_count))	/* mm_count just reached zero */
+ 		__mmdrop(mm);	/* drops the mm and the page tables */
+ }
+ 
+ /* mmput gets rid of the mappings and all user-space */
+ extern void mmput(struct mm_struct *);
+ /* Remove the current task's stale references to the old mm_struct */
+ extern void mm_release(void);
+ 
+ /*
+  * Routines for handling the fd arrays
+  */
+ extern struct file ** alloc_fd_array(int);
+ extern int expand_fd_array(struct files_struct *, int nr);
+ extern void free_fd_array(struct file **, int);
+ 
+ extern fd_set *alloc_fdset(int);
+ extern int expand_fdset(struct files_struct *, int nr);
+ extern void free_fdset(fd_set *, int);
+ 
+ extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+ extern void flush_thread(void);
+ extern void exit_thread(void);
+ 
+ extern void exit_mm(struct task_struct *);
+ extern void exit_files(struct task_struct *);
+ extern void exit_sighand(struct task_struct *);
+ 
+ extern void reparent_to_init(void);
+ extern void daemonize(void);
+ 
+ extern int do_execve(char *, char **, char **, struct pt_regs *);
+ extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long);
+ 
+ extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+ extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
+ extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
+ 
+ #define __wait_event(wq, condition) /* sleep until condition is true */	\
+ do {									\
+ 	wait_queue_t __wait;						\
+ 	init_waitqueue_entry(&__wait, current);				\
+ 	/* queue first, then loop: set state, test, schedule() */	\
+ 	add_wait_queue(&wq, &__wait);					\
+ 	for (;;) {							\
+ 		set_current_state(TASK_UNINTERRUPTIBLE);		\
+ 		if (condition)						\
+ 			break;						\
+ 		schedule();						\
+ 	}								\
+ 	current->state = TASK_RUNNING;					\
+ 	remove_wait_queue(&wq, &__wait);				\
+ } while (0)
+ 
+ #define wait_event(wq, condition) 					\
+ do {									\
+ 	if (condition)	 						\
+ 		break;	/* already true: never touch the wait queue */	\
+ 	__wait_event(wq, condition);					\
+ } while (0)
+ 
+ #define __wait_event_interruptible(wq, condition, ret)			\
+ do {									\
+ 	wait_queue_t __wait;						\
+ 	init_waitqueue_entry(&__wait, current);				\
+ 	/* ret is written only when a pending signal ends the wait */	\
+ 	add_wait_queue(&wq, &__wait);					\
+ 	for (;;) {							\
+ 		set_current_state(TASK_INTERRUPTIBLE);			\
+ 		if (condition)						\
+ 			break;						\
+ 		if (!signal_pending(current)) {				\
+ 			schedule();					\
+ 			continue;					\
+ 		}							\
+ 		ret = -ERESTARTSYS;	/* signal pending: stop waiting */	\
+ 		break;							\
+ 	}								\
+ 	current->state = TASK_RUNNING;					\
+ 	remove_wait_queue(&wq, &__wait);				\
+ } while (0)
+ 	
+ #define wait_event_interruptible(wq, condition)				\
+ ({									\
+ 	int __ret = 0;	/* stays 0 unless a signal ends the wait */	\
+ 	if (!(condition))						\
+ 		__wait_event_interruptible(wq, condition, __ret);	\
+ 	__ret;		/* value of the statement expression */		\
+ })
+ 
+ #define REMOVE_LINKS(p) do { /* unlink p from task list and siblings */ \
+ 	(p)->next_task->prev_task = (p)->prev_task; \
+ 	(p)->prev_task->next_task = (p)->next_task; \
+ 	if ((p)->p_osptr) \
+ 		(p)->p_osptr->p_ysptr = (p)->p_ysptr; \
+ 	if ((p)->p_ysptr) \
+ 		(p)->p_ysptr->p_osptr = (p)->p_osptr; \
+ 	else /* no p_ysptr: repoint the parent's p_cptr instead */ \
+ 		(p)->p_pptr->p_cptr = (p)->p_osptr; \
+ 	} while (0)
+ 
+ #define SET_LINKS(p) do { /* link p before init_task and as its parent's p_cptr */ \
+ 	(p)->next_task = &init_task; \
+ 	(p)->prev_task = init_task.prev_task; \
+ 	init_task.prev_task->next_task = (p); \
+ 	init_task.prev_task = (p); \
+ 	(p)->p_ysptr = NULL; \
+ 	if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \
+ 		(p)->p_osptr->p_ysptr = (p); \
+ 	(p)->p_pptr->p_cptr = (p); \
+ 	} while (0)
+ 
+ #define for_each_task(p) /* visits each task after init_task, never init_task */ \
+ 	for (p = &init_task ; (p = p->next_task) != &init_task ; )
+ 
+ #define next_thread(p) /* task_struct containing p's thread_group.next */ \
+ 	list_entry((p)->thread_group.next, struct task_struct, thread_group)
+ 
+ static inline void del_from_runqueue(struct task_struct * p)
+ {
+ 	nr_running--;
+ 	p->sleep_time = jiffies;	/* jiffies value at removal */
+ 	list_del(&p->run_list);
+ 	p->run_list.next = NULL;	/* NULL next is what task_on_runqueue() tests */
+ }
+ 
+ static inline int task_on_runqueue(struct task_struct *p)
+ {
+ 	return (p->run_list.next != NULL);	/* del_from_runqueue() NULLs this */
+ }
+ 
+ static inline void unhash_process(struct task_struct *p)
+ {
+ 	if (task_on_runqueue(p)) BUG();	/* p must already be off the runqueue */
+ 	write_lock_irq(&tasklist_lock);	/* every unlink below runs under it */
+ 	nr_threads--;
+ 	unhash_pid(p);
+ 	REMOVE_LINKS(p);
+ 	list_del(&p->thread_group);
+ 	write_unlock_irq(&tasklist_lock);
+ }
+ 
+ /* Protects ->fs, ->files, ->mm, and synchronises with wait4().  Nests inside tasklist_lock */
+ static inline void task_lock(struct task_struct *p)
+ {
+ 	spin_lock(&p->alloc_lock);	/* the lock itself is p->alloc_lock */
+ }
+ 
+ static inline void task_unlock(struct task_struct *p)
+ {
+ 	spin_unlock(&p->alloc_lock);	/* releases the lock task_lock() takes */
+ }
+ 
+ /* write full pathname into buffer and return start of pathname */
+ static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
+ 				char *buf, int buflen)
+ {
+ 	char *res;
+ 	struct vfsmount *rootmnt;
+ 	struct dentry *root;
+ 	read_lock(&current->fs->lock);	/* read current's root under fs->lock */
+ 	rootmnt = mntget(current->fs->rootmnt);
+ 	root = dget(current->fs->root);
+ 	read_unlock(&current->fs->lock);
+ 	spin_lock(&dcache_lock);	/* held across the __d_path() call */
+ 	res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen);
+ 	spin_unlock(&dcache_lock);
+ 	dput(root);	/* pairs with the dget() above */
+ 	mntput(rootmnt);	/* pairs with the mntget() above */
+ 	return res;
+ }
+ 
+ /* Manage the reference count of the context_info pointer */
+ void sys_release_s_info (struct task_struct *);
+ void sys_assign_s_info (struct task_struct *);
+ void sys_alloc_s_info (void);
+ 
+ #endif /* __KERNEL__ */
+ 
+ #endif
diff -rc2P linux/include/linux/sched.h.rej linux-2.4.13/include/linux/sched.h.rej
*** linux/include/linux/sched.h.rej	Wed Dec 31 19:00:00 1969
--- linux-2.4.13/include/linux/sched.h.rej	Fri Nov  9 16:58:00 2001
***************
*** 0 ****
--- 1,36 ----
+ ***************
+ *** 399,404 ****
+      	u32 self_exec_id;
+   /* Protection of (de-)allocation: mm, files, fs, tty */
+   	spinlock_t alloc_lock;
+   };
+   
+   /*
+ --- 399,407 ----
+      	u32 self_exec_id;
+   /* Protection of (de-)allocation: mm, files, fs, tty */
+   	spinlock_t alloc_lock;
+ + 
+ + /* journalling filesystem info */
+ + 	void *journal_info;
+   };
+   
+   /*
+ ***************
+ *** 485,491 ****
+       sig:		&init_signals,					\
+       pending:		{ NULL, &tsk.pending.head, {{0}}},		\
+       blocked:		{{0}},						\
+ -     alloc_lock:		SPIN_LOCK_UNLOCKED				\
+   }
+   
+   
+ --- 488,495 ----
+       sig:		&init_signals,					\
+       pending:		{ NULL, &tsk.pending.head, {{0}}},		\
+       blocked:		{{0}},						\
+ +     alloc_lock:		SPIN_LOCK_UNLOCKED,				\
+ +     journal_info:	NULL						\
+   }
+   
+   
diff -rc2P linux/kernel/sysctl.c linux-2.4.13/kernel/sysctl.c
*** linux/kernel/sysctl.c	Fri Nov  9 16:15:08 2001
--- linux-2.4.13/kernel/sysctl.c	Fri Nov  9 16:58:00 2001
***************
*** 30,33 ****
--- 30,35 ----
  #include <linux/init.h>
  #include <linux/sysrq.h>
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
  #include <linux/highuid.h>
  
***************
*** 303,306 ****
--- 305,316 ----
  	{FS_LEASE_TIME, "lease-break-time", &lease_break_time, sizeof(int),
  	 0644, NULL, &proc_dointvec},
+ #ifdef CONFIG_JBD_DEBUG
+ 	{FS_LEASE_TIME+1, "jbd-debug", &journal_enable_debug, sizeof (int),
+ 	 0644, NULL, &proc_dointvec},
+ #endif
+ #if defined(CONFIG_JBD) || defined(CONFIG_JBD_MODULE)
+ 	{FS_LEASE_TIME+2, "jbd-oom-retry", &journal_oom_retry, sizeof (int),
+ 	 0644, NULL, &proc_dointvec},
+ #endif
  	{0}
  };
diff -rc2P linux/mm/filemap.c linux-2.4.13/mm/filemap.c
*** linux/mm/filemap.c	Tue Oct 23 20:52:48 2001
--- linux-2.4.13/mm/filemap.c	Fri Nov  9 16:58:00 2001
***************
*** 201,211 ****
  }
  
  static inline void truncate_partial_page(struct page *page, unsigned partial)
  {
  	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
- 				
  	if (page->buffers)
! 		block_flushpage(page, partial);
! 
  }
  
--- 201,218 ----
  }
  
+ static int do_flushpage(struct page *page, unsigned long offset)
+ {
+ 	int (*flushpage) (struct page *, unsigned long);
+ 	flushpage = page->mapping->a_ops->flushpage;	/* per-mapping hook */
+ 	if (flushpage)
+ 		return (*flushpage)(page, offset);
+ 	return block_flushpage(page, offset);	/* fallback when the hook is NULL */
+ }
+ 
  static inline void truncate_partial_page(struct page *page, unsigned partial)
  {
  	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
  	if (page->buffers)
! 		do_flushpage(page, partial);
  }
  
***************
*** 213,217 ****
  {
  	/* Leave it on the LRU if it gets converted into anonymous buffers */
! 	if (!page->buffers || block_flushpage(page, 0))
  		lru_cache_del(page);
  
--- 220,224 ----
  {
  	/* Leave it on the LRU if it gets converted into anonymous buffers */
! 	if (!page->buffers || do_flushpage(page, 0))
  		lru_cache_del(page);
  
***************
*** 1119,1122 ****
--- 1126,1130 ----
  }
  
+ 
  /*
   * Mark a page as having seen activity.
***************
*** 2817,2821 ****
  	err = written ? written : status;
  out:
- 
  	up(&inode->i_sem);
  	return err;
--- 2825,2828 ----
diff -rc2P linux/mm/memory.c linux-2.4.13/mm/memory.c
*** linux/mm/memory.c	Mon Oct 15 15:09:50 2001
--- linux-2.4.13/mm/memory.c	Fri Nov  9 16:58:00 2001
***************
*** 1243,1250 ****
  	struct page * new_page;
  	pte_t entry;
! 
  	if (!vma->vm_ops || !vma->vm_ops->nopage)
  		return do_anonymous_page(mm, vma, page_table, write_access, address);
  	spin_unlock(&mm->page_table_lock);
  
  	/*
--- 1243,1256 ----
  	struct page * new_page;
  	pte_t entry;
! 	int ret;
! 	struct inode *inode = NULL;
! 	
  	if (!vma->vm_ops || !vma->vm_ops->nopage)
  		return do_anonymous_page(mm, vma, page_table, write_access, address);
  	spin_unlock(&mm->page_table_lock);
+ 	if (vma->vm_file && vma->vm_file->f_dentry)
+ 		inode = vma->vm_file->f_dentry->d_inode;
+ 	if (inode)
+ 		down_read(&inode->i_truncate_sem);
  
  	/*
***************
*** 1256,1263 ****
  
  	spin_lock(&mm->page_table_lock);
! 	if (new_page == NULL)	/* no page was available -- SIGBUS */
! 		return 0;
! 	if (new_page == NOPAGE_OOM)
! 		return -1;
  	/*
  	 * This silly early PAGE_DIRTY setting removes a race
--- 1262,1275 ----
  
  	spin_lock(&mm->page_table_lock);
! 	if (new_page == NULL) {	/* no page was available -- SIGBUS */
! 		ret = 0;
! 		goto out;
! 	}
! 	
! 	if (new_page == NOPAGE_OOM) {
! 		ret =  -1;
! 		goto out;
! 	}
! 	
  	/*
  	 * This silly early PAGE_DIRTY setting removes a race
***************
*** 1285,1294 ****
  		/* One of our sibling threads was faster, back out. */
  		page_cache_release(new_page);
! 		return 1;
  	}
  
  	/* no need to invalidate: a not-present page shouldn't be cached */
  	update_mmu_cache(vma, address, entry);
! 	return 2;	/* Major fault */
  }
  
--- 1297,1311 ----
  		/* One of our sibling threads was faster, back out. */
  		page_cache_release(new_page);
! 		ret = 1;
! 		goto out;
  	}
  
  	/* no need to invalidate: a not-present page shouldn't be cached */
  	update_mmu_cache(vma, address, entry);
! 	ret = 2;	/* Major fault */
! out:
! 	if (inode)
! 		up_read(&inode->i_truncate_sem);
! 	return ret;
  }
  
diff -rc2P linux/mm/vmscan.c linux-2.4.13/mm/vmscan.c
*** linux/mm/vmscan.c	Wed Oct 24 00:48:55 2001
--- linux-2.4.13/mm/vmscan.c	Fri Nov  9 16:58:00 2001
***************
*** 8,12 ****
   *  Removed kswapd_ctl limits, and swap out as many pages as needed
   *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
-  *  Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct Exp $
   *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
   *  Multiqueue VM started 5.8.00, Rik van Riel.
--- 8,11 ----
***************
*** 415,419 ****
  			page_cache_get(page);
  
! 			if (try_to_free_buffers(page, gfp_mask)) {
  				if (!page->mapping) {
  					/*
--- 414,418 ----
  			page_cache_get(page);
  
! 			if (try_to_release_page(page, gfp_mask)) {
  				if (!page->mapping) {
  					/*
***************
*** 436,440 ****
  					/*
  					 * The page is still in pagecache so undo the stuff
! 					 * before the try_to_free_buffers since we've not
  					 * finished and we can now try the next step.
  					 */
--- 435,439 ----
  					/*
  					 * The page is still in pagecache so undo the stuff
! 					 * before the try_to_release_page since we've not
  					 * finished and we can now try the next step.
  					 */