#!/usr/bin/perl -w

# $Id: findup-md5,v 1.4 1999/11/13 11:04:29 root Exp root $

# Copyright (c) 1999 Mark Summerfield. All Rights Reserved.
# May be used/distributed under the GPL.

require 5.004 ;

use strict ;
use integer ;

use Cwd ;
use File::Find ;
use Getopt::Long ;
use MD5 ;

use vars qw( $VERSION ) ;
$VERSION = '1.01' ;

# File-scoped state shared with the subs defined further down in this file.
my( %Opt, %MD5 ) ;                    # Options ; digest => list of file records
my( $PrevDir, $Found ) = ( '', 0 ) ;  # Last directory seen ; duplicate sets shown
my $MD5      = MD5->new() ;           # Digest object, reset before each file
my $startdir = cwd() ;                # Directory the script was started in

# On an interrupt go back to the start directory before dying.
# NOTE(review): presumably needed because find() changes the working
# directory while it traverses -- confirm against the File::Find docs.
$SIG{INT} = sub {
    chdir $startdir ;
    die "\nUser aborted.\n" ;
} ;

getoptions() ;


if( $Opt{'verbose'} ) {
    print STDERR "\n-- Reading folders --\n\n" ;
}

# wanted() is handed to find() as the callback for the paths left in @ARGV.
find( \&wanted, @ARGV ) ;

if( $Opt{'verbose'} ) {
    print STDERR "\n-- Duplicate files --\n\n" ;
}

# Sort
#
# Builds @Sorted: the MD5 digests that have at least two files, ordered by the
# first file name stored under each digest. As a side effect the file lists in
# %MD5 for those digests are sorted and reduced to plain file names.
# Names are compared case-insensitively, falling back to a case-sensitive
# comparison so that names differing only in case always come out in the same
# order.

my @Sorted ;
{
    my %Sortby ;
    while( my( $key, $arrayref ) = each %MD5 ) {
        # A digest with a single file is not a duplicate and is never shown,
        # so there is no point in sorting it.
        next if @{$arrayref} < 2 ;

        # Sort the files that match the same MD5, stripping off the no longer
        # needed dev and inode no. Only the text from the LAST tab onwards is
        # removed, so a file name that itself contains a tab is kept whole.
        @{$arrayref} = sort { lc( $a ) cmp lc( $b ) or $a cmp $b }
                       map { substr( $_, 0, rindex( $_, "\t" ) ) } @{$arrayref} ;
        $Sortby{${$arrayref}[ 0 ]} = $key ;
    }
    # Sort the MD5's in order of the first file under each MD5
    @Sorted = map { $Sortby{$_} }
              sort { lc( $a ) cmp lc( $b ) or $a cmp $b } keys %Sortby ;
}

# Show
#
# For every digest with two or more files: an "MD5=<digest>" heading followed
# by the files, one per line and indented with a tab. The closing status
# message goes to STDERR.

foreach my $digest ( @Sorted ) {
    my $names = $MD5{$digest} ;
    next unless @{$names} >= 2 ;    # A lone file is not a duplicate

    $Found++ ;
    print "MD5=$digest\n", map { "\t$_\n" } @{$names} ;
}

if( not $Found ) {
    print STDERR "-- No duplicates found --\n\n" ;
}
elsif( $Opt{'verbose'} ) {
    print STDERR "\n-- Done --\n\n" ;
}


# wanted()
#
# Callback passed to find(). Takes no arguments; works on the entry named by
# $_ and reports it as $File::Find::name.
#
# Symbolic links, zero length files and anything that is neither a directory
# nor a plain file are ignored. For a directory a progress line is printed to
# STDERR in verbose mode. Any other plain file is read and recorded in %MD5
# under its MD5 hex digest as "name\tdev:inode", unless a record with the same
# device and inode already exists under that digest (a hard link to a file we
# already have). A file that cannot be opened produces a warning and is
# skipped.
sub wanted {
    return if -l ; # Ignore symbolic links

    if( -d ) {
        if( $Opt{'verbose'} ) {
            # Plain prefix test instead of a pattern match: a directory name
            # may contain characters that are special inside a regex.
            print STDERR "Reading $File::Find::dir\n"
            if index( $PrevDir, $File::Find::dir ) != 0 ;
            $PrevDir = $File::Find::dir ;
        }
        return ;
    }

    return unless( -f and -s ) ; # Ignore non-files and zero length files

    my( $dev, $ino ) = ( stat( _ ) )[ 0, 1 ] ;
    $MD5->reset() ;

    # Three-argument open (needs Perl 5.6 or later): the name is always taken
    # literally, so a file whose name starts or ends with whitespace, '<', '>'
    # or '|' can neither be misread as an open mode nor run as a command.
    my $fh ;
    unless( open $fh, '<', $_ ) {
        warn "Failed to read $File::Find::name: $!\n" ;
        return ;
    }
    $MD5->addfile( $fh ) ;
    close $fh ;
    my $key = $MD5->hexdigest() ;

    # Ignore hard links: if we've already got this device and inode under
    # this digest then it's the same file under another name.
    my $id = "$dev:$ino" ;
    foreach my $record ( @{ $MD5{$key} || [] } ) {
        # The id is whatever follows the LAST tab; the file name in front of
        # it may itself contain tabs.
        return if substr( $record, rindex( $record, "\t" ) + 1 ) eq $id ;
    }

    push @{$MD5{$key}}, "$File::Find::name\t$id" ;
}


# getoptions()
#
# Sets the option defaults in %Opt and has Getopt::Long parse the command
# line into %Opt. Dies if the options cannot be parsed. Calls help(), which
# exits, if help was asked for or if @ARGV is empty after parsing (no paths
# given).
sub getoptions {

    # Defaults.
    $Opt{'verbose'} = 0 ;

    Getopt::Long::config 'no_ignore_case' ;

    # With a hash as destination Getopt::Long stores an option under the
    # FIRST name in its specification, so 'help' has to come before its 'h'
    # alias for $Opt{'help'} to be set.
    # GetOptions reports its own errors through warn(), not through $!, so
    # the die message only has to point at the help.
    GetOptions( \%Opt,
        'help|h',
        'verbose|v',
        ) or die "findup-md5 -h for help\n" ;

    help() if $Opt{'help'} or not @ARGV ;

#    print STDERR map { "$_=[$Opt{$_}]\n" } keys %Opt ; exit ; # DEBUG

}


# help()
#
# Prints the version, copyright and usage text to the default output handle
# and then exits the program; it never returns to the caller. The text
# interpolates $VERSION and the current value of $Opt{'verbose'}.
sub help {
    print <<__EOT__ ;
findup-md5 v $VERSION. Copyright (c) Mark Summerfield 1999. 
All rights reserved. May be used/distributed under the GPL.

usage: findup-md5 [options] <path(s)>

Finds exact duplicates using MD5 algorithm, irrespective of filename or date.
Can work across filesystems.

-h  --help     Show this help and exit
-v  --verbose  Verbose to STDERR [$Opt{'verbose'}]

__EOT__
    exit ;
}


__END__


=head1 NAME

findup-md5 - finds exact duplicate files using the MD5 algorithm, irrespective of filename or date.

=head1 SYNOPSIS

findup-md5 -v /path1 > /tmp/duplicates.txt

=head1 README

Finds exact duplicates using the MD5 algorithm, irrespective of filename or date.

=head1 PREREQUISITES

C<strict>
C<integer>
C<Cwd>
C<File::Find>
C<Getopt::Long>
C<MD5>

=head1 COREQUISITES

=head1 COPYRIGHT

Copyright (c) Mark Summerfield 1999. All Rights Reserved.
May be used/distributed under the GPL.
Email <summer@chest.ac.uk> with 'findup-md5' in the subject line.

=head1 OSNAMES

Linux

=head1 SCRIPT CATEGORIES

UNIX/System_administration

=cut


