#!/usr/bin/perl -w use strict; use Fcntl ':mode'; BEGIN { use vars qw(@POD_HOOKS $be_random $do_suid); no strict 'refs'; =head1 NAME unify-dirs - Make identical files in two directories the same inode =cut push @POD_HOOKS, NAME => sub { my @m; ( @m = m/(\S+) - (.*)/ ) && do { *{PROGNAME} = sub { $m[0] }; *{SHORT_DESC} = sub { $m[1] }; } }; =head1 SYNOPSIS unify-dirs [options] dir1 dir2 =cut push @POD_HOOKS, SYNOPSIS => sub { my $a = $_; *{SYNOPSIS} = sub { $a } }; =head1 DESCRIPTION unify-dirs will traverse the given passed directories, and if it finds files that are identical (in name, contents, permission and mode), it will hard-link them and set the "immutable" and "immutable linkage invert" flags (if selected). =cut push @POD_HOOKS, DESCRIPTION => sub { my $a = $_; *{DESCRIPTION} = sub { $a } }; =head1 COMMAND LINE OPTIONS The following command line options are available: =cut # Extract the command line options for the "usage" screen from the # POD ;-) use vars qw(@options); push @POD_HOOKS, 'COMMAND LINE OPTIONS' => sub { # This hook is deleted below under RELEASE &Pod::Constants::add_hook (#-debug => 1, '*item' => sub { my ($switches, $description) = m/^(.*?)\n\n(.*)/s; my (@switches, $longest); $longest = ""; for my $switch ($switches =~ m/\G ((?:-\w|--\w+)) (?:,\s*)? /gx) { push @switches, $switch; if ( length $switch > length $longest) { $longest = $switch; } } $longest =~ s/^-*//; push @options, $longest, { options => \@switches, description => $description, }; } ); }; =over 4 =item -h, --help Display program usage =item -v, --verbose Verbose program execution =item -d, --debug Even more verbose program execution =item -V, --version Print the program version =item -i, --immutable Sets the "immutable" inode attribute. =item -l, --linkage Sets the "immutable linkage invert" inode attribute. =item -S, --suid Link files that have set user/group ID bits set =item -r, --random Turns on randomising of directory scanning and tree traversal. This option tries to prevent against racing symlink attacks. A better solution is planned. =back =head1 INODE ATTRIBUTES AND IMMUTABILITY Hard linking identical files between directories has a drawback: if one is changed, then the other one changes too. To counter this, you can set the "immutable" inode attribute on combined files (see L). Setting inode attribute requires root privileges, C, and a filesystem that supports it. Currently this includes default ext2 and ext3 in any recent kernel, or reiserfs with the "inode attributes" patch applied (available from C). The problem with setting "immutable" is that then the file can not be unlinked or renamed. In the case where you have a user without CAP_SYS_ATTR, but otherwise with write permission to a file, they cannot then change it. In comes the "immutable linkage invert" flag. This flag will toggle immutability of the file E, but leave the file E, E, etc protected. This means that you can unlink the file, and hence replace it, edit it with most editors, etc. This option requires a kernel patch - it is included in the s_context patch for the vserver project, which is at L). It is also available on its own from L. This works well with ext2 and ext3, but is a little trickier to get working with reiserfs, as inode attributes are not a standard reiserfs feature. See the above link for more information. =head1 RELEASE This is unify-dirs version 0.2. =cut push @POD_HOOKS, RELEASE => sub { Pod::Constants::delete_hook("*item"); my $v; (($v) = m/(\d+\.\d+)/) && (*{VERSION} = sub {$v}); }; }; no strict 'subs'; sub abort { print STDERR &PROGNAME.": aborting: @_\n", short_usage(); exit(1); } sub barf { print STDERR &PROGNAME.": @_\n"; exit(1) } sub moan { print STDERR &PROGNAME.": WARNING: @_\n"; } sub say { print &PROGNAME.": @_\n"; } sub mutter { } sub whisper { } use strict 'subs'; #===================================================================== # MAIN SECTION STARTS HERE #===================================================================== my ($action, @dirs, $immutable, $linkage, $mode); { use Getopt::Long; no strict "refs", 'vars'; local ($^W) = 0; $action = "unify"; $mode = ""; Getopt::Long::config("bundling"); #Getopt::Long::config("pass_through"); Getopt::Long::GetOptions ( 'help|h' => sub { $action = "show_help" }, 'version|V' => sub { $action = "show_version" }, 'verbose|v' => sub { *{"mutter"} = \&say }, 'debug|d' => sub { *{"whisper"} = *{"mutter"} = \&say }, 'immutable|i' => \$immutable, 'linkage|l' => \$linkage, 'random|r' => sub { $be_random = 1 }, 'suid|S' => sub { $do_suid = 1 }, ); $mode .= "i" if $immutable; $mode .= "I" if $linkage; if ( ! -t STDOUT ) { if ( -t STDERR ) { eval "sub say { print STDERR \"\@_\n\" }"; } else { eval "sub say { }"; } } if ($action eq "show_help") { print usage(); exit(0); } elsif ($action eq "show_version") { print version(), "\n"; exit(0); } } while ( my $dir = shift @ARGV ) { -d $dir or abort "Invalid directory $dir"; push @dirs, $dir; } (@dirs > 1) or abort "Need at least two paths to unify"; say "Unifying: @dirs"; unify_dirs($mode, @dirs); say "done"; exit(0); =head1 INTERNAL FUNCTIONS Documented for prosperity =head2 digest_file($filename) returns a HASH ref containing stat info and a checksum of the file named by the given filename: { sha1 => $sha1_checksum, stat => [ stat_info ] } "sha1" will be missing from the hash if the file is not regular (ie, a block special device, etc). =cut use Digest::SHA1; use IO::File; sub digest_file { my $filename = shift; my @stat = lstat $filename; my %returnval = ( stat => \@stat ); if ( -f _ ) { whisper "digesting $filename"; sysopen FILE, $filename, O_RDONLY or die "could not open $filename for reading; $!"; my $block_size = $stat[11] || 2**16; my ($buffer, $bytes_read); my $sha1 = Digest::SHA1->new; $sha1->add($buffer) while ( $bytes_read = sysread FILE, $buffer, $block_size ); die "Error digesting $filename; $!" unless defined $bytes_read; close FILE; $returnval{sha1} = $sha1->hexdigest; whisper "disgest is $returnval{sha1}"; } else { whisper "not digesting $filename - not a regular file"; } return \%returnval; } =head2 chattr ($filename, $attr_string) Calls the C IOCTL on $filename, setting flags as per $attr_string. =over =item Permitted chattr attributes The following table lists the allowed contents of $attr_string, and the corresponding bitmask to the IOCTL. i => 0x00000010 ("immutable") I => 0x00008000 ("immutable linkage invert") =back =cut BEGIN { use vars qw(%attr); push @POD_HOOKS, "Permitted chattr attributes" => sub { %attr = map { if ( m/(\w) => (0x\w+)/) { $1 => pack("L", hex($2)); } else { () } } split /\r?\n/, $_; }; } sub chattr { my( $file, $attr_str ) = @_; whisper ("chattr($file, $attr_str)"); my $EXT2_IOC_SETFLAGS = 0x40046602; my $flags = pack("L", 0); while( my $flag = chop($attr_str) ){ $flags |= $attr{$flag}; } open( F, $file ) or die "Can't open $file : $!"; ioctl( F, $EXT2_IOC_SETFLAGS, $flags ) or die "Can't set attr($flags) on $file: $!"; close F; } =head2 unify_dirs($mode, @dirs) This is the function that does the main work; it takes an arbitrary list of directories and combines them. This function calls itself recursively for sub-directories. It also uses the same trick "find" does to avoid performing unnecessary C's (see L) where possible. It is probably quite succeptible to missing possible unification in race condition situations; however it should never perform an incorrect unification. If $mode is set, it specifies arguments to the C function, above. FIXME - should probably put a secure chdir() in here later, otherwise we may be vulnerable to racing symlink attacks. To counter this, I've added some entropy throughout the function, to make it really hard to predict what order the program is going to do things. Enable it with `C<-r>' =cut use ReadDir qw(&readdir_inode); sub unify_dirs { my $mode = shift; my @workstack = ([@_]); while (my $dirs_a = pop @workstack) { # get rid of the invalid directories my @dirs = map { -d $_ ? $_ : () } @$dirs_a; next if @dirs == 1; (@dirs = sort { $be_random *= -1 } @dirs) if ( $be_random ); # %files is a (filename => inode) hash for this directory my %files = (); # This holds per-directory-to-be-unified information on the files; # $dir{$dir}->{$filename} = { sha1 => $sha1_checksum, # stat => [ stat_info ] } # or just the inode number if it hasn't been stat'd yet my %dir = (); # this contains a list of sub-directories of the current directory my @subdirs = (); for my $dir ( @dirs ) { whisper "Processing $dir..."; $dir{$dir} = { }; my @dirents = readdir_inode $dir; whisper "Readdir OK"; (@dirents = sort { $be_random *= -1 } @dirents) if ( $be_random ); # So we don't stat every file in a directory if we don't need # to. This is the same optimisation that "find" uses. my $num_dirs = ((lstat $dir)[3]) - 2; # for each directory entry, see if any other dir has the same # filename while ( my $ref = shift @dirents ) { my ($filename, $inode) = (@$ref); next if ($filename =~ m/^\.\.?$/); # we only have to stat entries where there are # subdirectories remaining. if ( $num_dirs ) { # there are still sub-directories left in here. # look for them. $dir{$dir}->{$filename} = { stat => [ lstat "$dir/$filename" ] }; if ( -d _ ) { # found one! $num_dirs--; push @subdirs, $filename; next; } } else { # non-directory; remember the inode number $dir{$dir}->{$filename} = $inode; } if ( exists $files{$filename} and $files{$filename} != $inode ) { # some other mirror has this same filename under a # different inode. Ler's see if we can unify them. my $my_files_h = $dir{$dir}; while ( my ($other_dir, $files_h) = each %dir ) { next if $other_dir eq $dir; # does this other directory have this file too? next unless exists $files_h->{$filename}; whisper "COMPARE: $dir/$filename vs $other_dir/$filename"; # stat both files if we haven't already $my_files_h->{$filename} = { stat => [ lstat "$dir/$filename" ] } unless ref $my_files_h->{$filename} eq "HASH"; $files_h->{$filename} = { stat => [ lstat "$other_dir/$filename" ] } unless ref $files_h->{$filename} eq "HASH"; my ($mine, $other) = ($my_files_h->{$filename}, $files_h->{$filename}); # compare file sizes, skip if different whisper "file stat() info different", next if ( $mine->{stat}->[7] != $other->{stat}->[7] or $mine->{stat}->[2] != $other->{stat}->[2] or $mine->{stat}->[4] != $other->{stat}->[4] or $mine->{stat}->[5] != $other->{stat}->[5] or $mine->{stat}->[1] == $other->{stat}->[1] ); whisper "Never touching set[ug]id binaries", next if ( !$do_suid and S_ISREG($mine->{stat}->[2]) and $mine->{stat}->[2] & 06000 and $mine->{stat}->[2] & 00111 ); # gather SHA1 checksums $my_files_h->{$filename} = digest_file "$dir/$filename" unless exists $my_files_h->{$filename}->{sha1}; $files_h->{$filename} = digest_file "$other_dir/$filename" unless exists $files_h->{$filename}->{sha1}; ($mine, $other) = ($my_files_h->{$filename}, $files_h->{$filename}); # skip if different ("sha1" won't be set if # this isn't a regular file) whisper "not suitable for unification", next unless ( $mine->{sha1} and $other->{sha1} ); whisper "sha1: $mine->{sha1} vs $other->{sha1}"; next unless ( $mine->{sha1} eq $other->{sha1} ); my $diff = ($mine->{stat}->[3] - $other->{stat}->[3]); my ($src, $dest) = ("$dir/$filename", "$other_dir/$filename"); if ( $diff < 0 or (!$diff and rand(1) < 0.5)) { ($src, $dest) = ($dest, $src); $my_files_h->{$filename} = $files_h->{$filename}; } else { $files_h->{$filename} = $my_files_h->{$filename}; } mutter "Linking $src on top of $dest"; chattr($src, $mode) if ( $mode and $mode eq "iI" ); chattr($src, "") if ( $mode and $mode eq "i" ); link $src, $dest."unify$$" or do { die "link ($src,${dest}unify$$) failed; $!"; next; }; rename $dest."unify$$", $dest or do { moan "rename (${dest}unify$$, $dest) failed; $!"; unlink $dest."unify$$"; exit(1); next; }; chattr($src, $mode) if ( $mode and $mode ne "iI" ); @{ $my_files_h->{$filename}->{stat} } = lstat $src; } } else { # first directory to have this file, just remember the # inode number. $files{$filename} = $inode; } } } # #while ( my ($k, $v) = eah %dir ) { #for my $file ( keys %$v ) { #delete $v->{$file}; #} #} #%files=%dir=(); # Recursion society withdrawn sponsorship, now receiving funds # from the Iterative Taskforce. for my $subdir ( no_dups(@subdirs) ) { push @workstack, [ map { "$_/$subdir" } @dirs ]; } } # while (my $dirs_a = pop @workstack) } sub no_dups { return keys %{{ map { $_ => 1 } @_ }} } BEGIN { eval "use Pod::Constants -trim => 1, \@POD_HOOKS"; die $@ if $@; } #--------------------------------------------------------------------- # Usage functions #--------------------------------------------------------------------- sub short_usage { return ("Usage: ${\(SYNOPSIS)}\n" ."Try `${\(PROGNAME)} --help' for a summary of options." ."\n"); } use Text::Wrap qw(wrap fill); use Term::ReadKey; =head2 usage Prints the program usage (extracted from the POD). =cut sub usage { # alright, I'm admit this function is silly. my $options_string; my $OPTIONS_INDENT = 2; my $OPTIONS_WIDTH = 20; my $OPTIONS_GAP = 2; my $TOTAL_WIDTH = (GetTerminalSize())[0] - 10 || 70; my $DESCRIPTION_WIDTH = ($TOTAL_WIDTH - $OPTIONS_GAP - $OPTIONS_INDENT - $OPTIONS_WIDTH); # go through each option, and format it for the screen for ( my $i = 0; $i < (@options>>1); $i ++ ) { my $option = $options[$i*2 + 1]; $Text::Wrap::huge = "overflow"; $Text::Wrap::columns = $OPTIONS_WIDTH; my @lhs = map { split /\n/ } wrap("","",join ", ", sort { length $a <=> length $b } @{$option->{options}}); $Text::Wrap::huge = "wrap"; $Text::Wrap::columns = $DESCRIPTION_WIDTH; my @rhs = map { split /\n/ } fill("","",$option->{description}); while ( @lhs or @rhs ) { my $left = shift @lhs; my $right = shift @rhs; chomp($left); $options_string .= join ("", " " x $OPTIONS_INDENT, $left . (" " x ($OPTIONS_WIDTH - length $left)), " " x $OPTIONS_GAP, $right, "\n"); } } $Text::Wrap::huge = "overflow"; $Text::Wrap::columns = $TOTAL_WIDTH; return (fill("","",PROGNAME . " - " . SHORT_DESC) ."\n\n" ."Usage: ".SYNOPSIS."\n\n" .fill(" ","",DESCRIPTION)."\n\n" .fill(" ","","Command line options:") ."\n\n" .$options_string."\n" ."See `perldoc $0' for more information.\n"); } __END__ =head1 BUGS/TODO Some suggestions for use as a security tool: I came up with these extra options for unify-dirs: -s --save save list of files in /etc/vserver/unified.vservername if no is given. -c --compare compare compare current link status with saved list (see -s :-) -t --test don't actually do any linking. just print out list of files that could be unified. together with -s option one could merely create an initial list, without doing anything else.. =head1 AUTHOR Sam Vilain, =cut