#!/usr/bin/perl
# Fri Jan  2 00:17:07 EST 2004
# TO DO:
# - perlmongers:
#   - learn how to better handle lists-of-lists
#   - perhaps using by reference would help?
#
# CLARIFY OPTIONS
#	output format:
#	- verbose
#	- sort by (file size), (file size * (# files of that size -1)
#	- sorting order: largest first, smallest first, no sorting
#	- human readable
#	- linked files: list ALL, FIRST, NONE
#
#
#	Tue Nov 27 13:53:54 EST 2001
# this is dup7
# ??? better names: broom: helps you clean up / sweep up files

# Thu Jan  9 06:54:03 EST 2003
# !!! make the names a bit better, explain how the parts work together
#
# OVERVIEW:
# Given a list of regular files, this program determines what files are identical
# by content, regardless of file name.
#
# The main function groups files by filesize and compares all those files to each other.
# There are 2 filters to help for a "large" (*1) number of files
# "filterByFirstBlock"
#	in: a list of filenames
#	out: a zzz LoL of filenames that match by first block
#		(several lists may result, not just one)
#	zzz I need a generic standard name for LoL
#
#	The first block of each file is read into an array which is used for all further comparisions,
#	thus minimizing disk access.
#	For files of size <= the blocksize, this is all you need!
#	For files > the blocksize, further testing is needed.
#
# "filterBymd5"
#	in: a list of filenames
#	out: a zzz LoL of filenames that match by md5digest
#		(several lists may result, not just one)
#
#	The md5sum is calculated for each file.
#	Lists are made of all files with matching md5sum, indicating that they're PROBABLY identical.
#	While md5sum is really REALLY good, it's not perfect,
#	so for "belt and suspenders" assurance, it's best to fully compare the files too.
#	zzz need an option to use md5sum only, no further testing
#
#	This phase has the ADVANTAGE that each file is read only once.
#	This phase has the DRAWBACK that each file is read entirely from end to end
#	even if they differ at the beginning.


# DETAILS:

# (*1) what constitutes a "large" number of files?
# it's calculated via zzz, altered by zzz options/args/variables


# TO DO:
# add: getopt handling
# -v: verbose (the jsjv stuff)
# -m: machine readable output (default is human-friendly output)
# -z: don't report zero length files
# -s: don't report SYMLINKS
#     (unless part of a non-symlink match? or only one match?)
# -gnnn: only files greater than size n !!! GOTTA ALLOW k, m, ...
#
# -f: FORCE MD5sum only (no file compare)  WARNING: may be wrong
# -m: use MD5sum even for 2 files (set low threshold/parameters)
# -x: save MD5sum results in a format for md5sum verify/check
# -ziii: set parameters for when to use MD5sum: file size, # files, ...
#	perhaps allow an entire equation via 'eval'
# -biii: set block size
#
# -c: FORCE file compare (no md5sum)
#
#
# ADD: ARG or ENV VAR: command to run for "find"
#
# I NEED HELP:
# on non Unix/Linux systems:
# a) how to detect duplicate shortcuts or other ways to give
#	one file multiple names
# b) can it auto-detect that or must it be args/env variable?
#

# BETTER SYMLINK HANDLING
# - filter out all HARD LINKS and SYMLINKS at the beginning
#	? make a hash-of-strings?
#	ex: a1, a2, a3, a4 all symlinks
#	a) match all files by file size
#	b) for all files of the same size
#		links{"a1"} -> a2, a3, a4
#		just "a1" in the main list
#		(verify it's REGULAR FILE since stat() was done)
#	c) when printing results, select
#		only files matching (no hard/sym links)
#		only hard/sym links (no other matches)

# YET ANOTHER SORTING CRITERION:
# - by file size  <<< can be done in advance
# - by total space: (file size) * (# duplicates)  <<< can NOT be done in advance
#	those will save the most by linking/eliminating duplicates


# use strict;  ### gotta solve things first
use File::Compare;
use Digest::MD5;
use Getopt::Std;

# _____________________________________________________________________
# consider: pass to filter out all entries of <n files (normally 2)
# so no more "skipping entry ...",
# the total count is more useful then!
# NEEDS AN OPTION: prune entries with <n entries of the same size
#	(default is 2)
# NEEDS AN OPTION: prune entries <n bytes in size
#	(default is 0???)
# IN: hash of arrays, by file size
# OUT: hash of arrays, only with >n entries, only >n filesize
#	# of files total
#	# of entries
#

# _____________________________________________________________________

# POSSIBLE OPTIMIZE: sort out HARD LINKS before comparing
#	no need to diff/cmp the hard links!,
#	but SKIP THAT IF IT'S NOT SUPPORTED: nfs / win-98 / ...
# OPTION: hard links: list them or don't list them
# _____________________________________________________________________

# will this work?
# for all files of the same size
#     if (hard link detection desired)
#	yank all with the same hard links
#       if (print hard links) print the list
# continue with dup file detection: first block compare, md5, cmp
# HOW CAN I STILL GENERATE THE TOTAL:
#	5 identical files of size 11111  <<< THIS COUNT
#	  2 are LINKED files:
#
# IDEA: have a pass like compare_by_fc (), returning a list-of-lists

# _____________________________________________________________________

# TO DOCUMENT: 1) how to use gnu find, other finds
# 2) with options like: prune out /$HOME/.netscape
# _____________________________________________________________________

#
# OPTION: -f 0: format 0/1/2/3/...

# format 0: human readable

# format 1:
#   S file-size                                <<< may be suppressed separately
#   L link-file-1 link-file-2 ... link-file-n  <<< may be suppressed
#   D dup-file-1 dup-file-2 ... dup-file-n

# jsj ; TO DO:
# ARGS: block size, verbose, version
# ARG: post-process filenames to escape shell-sensitive characters
#	$_ =~ s/([ \$&'"(])/\\$1/g;  # escape strange characters


#	require 'getopts.pl';
#	&Getopts('s:x');
#	if (defined($opt_s))
#	elsif ($opt_x)
# THAT IS OBSOLETE:
# Suggested alternatives: Getopt::Long  or  Getopt::Std

# SKIP md5 under certain circumstances
# ??? perhaps some args to change those rules?

# ARG: alternat output formats for easier SHELL/PERL reading
#	option: list/suppress symbolic links


# DOCUMENT: the algorithms, performance
# ASK FOR HELP: when to use md5, when to skip
# (can that be determined by number of files and their length, or are
# other things needed to be known?)

# !!! need: documentation style for comments

# Author: Jeffrey Jonas  jeffj@panix.com
# version 1.0  March 2001

# EMPHASIZE:
# this only lists the files - it's up to you how to eliminate the duplicates.
#	jsj mention the rm / link solution
# files are sorted with largest file size first since those are the ones
#	to handle first!

# hard links are identified since they don't take additional disk space


# can skip small files with FIND options -size +nnn

# this does NOT handle:
# 1) soft links.
#	it's deliberately "find ... -type f" since I can't think of anything
#	useful to do with symbolic links.
#	Symbolic links are *NOT* followed since they may point to directories
#	and cause loops.
# 2) directories
# 3) files that are similar
#	ex: "This is a file" to "This is a file / with more".
#	It's hard enough finding files that are identical.
#	I'm unsure how to score a diff for being "similar enough".


# jsj DESCRIBE THE ALGORTIHMS AND HOW THIS EVOLVED:

# there are 2 main methods of finding duplicate files
# compare the files themselves, or compare their checksums.

# comparing the files directly works well IF
# 1) the files really match, since the files MUST be compared to assure
#	they're really identical
# 2) the files don't match within the first block or 2

# The worst case scenario: files mis-match only near the end,
# so every file is read in entirely for every attempt at a match.
# for 3 files, that's 3 compares (a,b) (a,c) (b,c),
# so files are read end to end 6 times.
# for 4 files, that's 6 compares (a,b) (a,c) (a,d) (b,c) (b,d) (c,d)
# so files are read end to end 12 times.

# jsj: general formula for (n) files


# jsj MD5
# best case: no files match: so each file is read end to end only once
# to generate the checksum.
#
# The worst case scenario: many files match, since they need to then be
# re-read in entirety to verify every potential match with cmp()


# credits:
# jsj to the md5 fellow

# to Chip Unicorn for cutting the performance gordion knot
# and suggesting the pass with just reading the first block.

# !!! need: arg parsing stuff

my $usage = "NO ARGS!\
This perl script finds all duplicate files by content, not file name\
usage: $0 <options to find>\
example: $0 . -mount\n\n";
die $usage if ($#ARGV < 0);

# jsj: give example with -size +2k
# jsj: give example with -prune

# jsj: give example like
#	$HOME ~friend -size +100k -mount

#

our($opt_v);
getopts('v');

my ($verbose1, $verbose2, $verbose3);

$verbose1++ if ($opt_v);
my %names_by_size;

# Form the external "find" command using the arg list.
# Gather filenames by size using a hash of lists
# names_by_size { file_size } -> (name1, name2, name3, ...)

# jsj !!! better error handling if FIND fails:
# - get the retc
# - just let STDERR go thru?
#
# jsj: perhaps the "-type f" needs supression at times?

print STDERR "jsjv: find is generating the list of files\n"
    if ($verbose1);

my $cmd = "find " . join (" ", @ARGV) . " -type f -print |";
# print "cmd = ", $cmd, "\n";

open (pipefd, $cmd);

my ($filecount) = 0;  # count just for stats
while (<pipefd>)
{
    chomp;  # eat the trailing newline
    # see the Camel book page 266 for hashes of arrays
    push @{ $names_by_size{-s $_} }, $_;
    # $filecount++;  now handled in next loop
}

close (pipefd);


# delete all entries in names_by_size{} with <2 filenames
# This is done so the progress statistics are more useful.
# The main loop skips all entries with <2 names,
# but this way status can say "processing entry 2 of 200" instead of
# "processing entry 2 of 500" only to find 300 entries are only 1 file.

# # print "BEFORE: ", scalar (keys %names_by_size), " entries $filecount files\n";
$filecount = 0;
# jsj: DELETE THE SORT
foreach $filesz (sort { $b <=> $a } keys %names_by_size)
{
    my $num_files_this_size = scalar (@{$names_by_size{$filesz}});
    if ($num_files_this_size < 2)
    {
	# # print "... 1 file, sz $filesz, name \"@{$names_by_size{$filesz}}\"\n";
	delete $names_by_size{$filesz};
    }
    else
    {
	$filecount += $num_files_this_size;
    }
}
# # print "AFTER : ", scalar (keys %names_by_size), " entries $filecount files\n";

# # END NEW

my $num_unique_sizes = scalar (keys %names_by_size);

print STDERR
    "jsjv: found $num_unique_sizes unique file sizes, $filecount files\n"
    if ($verbose1);

# Examine the list of filenames by file size,
# sorting the keys numerically, largest file sizes first.

my $size_list_index=0;
my $filesz;
foreach $filesz (sort { $b <=> $a } keys %names_by_size)
{
    my ($array_ref);
    # my (@fc_LoL);  # file compare List of Lists
    $size_list_index++;
    my $num_files_this_size = scalar (@{$names_by_size{$filesz}});

    # FUTURE DIRECTIONS: add some graphic progress indicator,
    # showing progress by number of entries and/or number of files

    if (!defined ($filesz))  # this should never happen, but handle it anyway
    {
	print "$num_files_this_size files of UNDEF:\n    ",
	    join ("\n    ", @{$names_by_size{$filesz}}), "\n";
	next;
    }

    if ($filesz == 0) # skip files of zero length
    {
	print "$num_files_this_size files of size 0:\n    ",
	    join ("\n    ", @{$names_by_size{$filesz}}), "\n";
	next;
    }

    # skip if only 1 file of that size
    if ($num_files_this_size < 2)
    {
print STDERR "jsjv: skipping entry $size_list_index of $num_unique_sizes : ",
"1 file of size $filesz\n"
if ($verbose1);

	next;
    }

print STDERR "jsjv: processing entry $size_list_index of $num_unique_sizes : ",
"$num_files_this_size files of size $filesz\n" if ($verbose1);

    if ($filesz > 1024)
    {
	my (@name_list) = compare_by_first_block (1024,
		@{$names_by_size{$filesz}});

	# for each list of files of the same size,
	# divide into a list-of-lists by comparing the first block.


	# jsj: THIS NEEDS REFINEMENT:
	# the MD5 algorithm saves time under SOME circumstances.
	# Can this be determined using only $filesz and $num_files_this_size ?
	#
	# The underlying tradeoff: md5 reads each file ONCE,
	# but all files with matching checksums
	# must be read again to verify that they are truly identical.
	#
	# compare_by_fc() (compare all files using file compare)
	# used alone saves time when either
	# a) many files match entirely
	# b) the files don't match near the beginning
	# *BUT* it takes SIGNIFICANTLY LONGER for the case of
	# many files if they don't match near the beginning.
	#
	# The first pass: comparing by first block only seems a reasonable
	# compromise and performance enhancement
	# by grouping files that match/differ based on the first block.
	# And all files' first blocks are read into a buffer
	# so no need to re-read for every compare!

	# # my @md5_LoL;

	if ($num_files_this_size > 20)
	{
	    for $array_ref (@name_list)
	    {
		compare_by_md5_sum (@$array_ref);  # results in @md5_LoL
	    }
	}
	else
	{
	    @md5_LoL = @name_list;  # skip md5 sum step
	}

	# note: shift() is used to consume the list of lists
	# If md5_LoL is needed after this point, use foreach() here
	# but be sure to delete/free the list
	# before the main loop iterates again
	# so new results are not appended to the previous loop!

	while ($array_ref = shift (@md5_LoL))
	{
	    # print "\t    list: ", scalar (@$array_ref), " files\n";
	    # print "\t\t", join ("\n\t\t", @$array_ref), "\n";
	    compare_by_fc (@$array_ref);  # results in @fc_LoL
	}
    }
    else
    {
	# For files small enough to be read entirely into RAM,
	# handle them directly on one pass!

	@fc_LoL = compare_by_first_block ($filesz, @{$names_by_size{$filesz}});
    }

    # the global list of lists fc_LoL has the results from either case
    while ($array_ref = shift (@fc_LoL))
    {
	print scalar (@$array_ref) .
	    " identical files of size $filesz:\n";
	test_hard_links (@$array_ref);
    }
}


# compare_by_first_block: given a list of file names of same sized files,
#	read just the first block and compare files based on that
#	to generate lists of likely matches.

# args: buffer size, list of files
# returns: array of array of filenames that match by first block
sub compare_by_first_block
{
    my ($readsize) = shift (@_);
    my (@filenames) = @_;

    my ($first_block);         # read a block here
    my (%first_block_by_name); # save block by filename
	# this can get rather large, thus the need to free this ASAP
	# and return only the filenames as results

    my ($filenm);
    my ($first_file);
    my (@return_LoL);

# print "phase 2 input: ", scalar (@filenames), " names, buffer size=$readsize\n"; # jsj: verbose
# print "\t", join ("\n\t", @filenames), "\n";


    #  pass 1: for each file, hash the first block by filename
    #      $first_block_by_name { $filenm } = $first_block;
    #  Files that cannot be opened or read entirely
    #  are NOT on the resulting list.

    foreach $filenm (@filenames)
    {
	# print "\ttrying file $filenm\n";

	# jsj: use BINMODE for non-Unix systems
	unless (open FD, "<$filenm")
	{
	    print STDERR "ERROR: cannot open file \"$filenm\", skipping\n";
	    next;
	}

# turn off UTF-8 Unicode
# jsjj binmode(FD, ":slurp :raw");
binmode(FD, ":raw");

	my ($kount) = read FD, $first_block, $readsize;
	if ($kount != $readsize)
	{
	    print STDERR "ERROR: file \"$filenm\", is short ",
			 "(read $kount of $readsize bytes), skipping\n";
	}
	else
	{
	    $first_block_by_name { $filenm } = $first_block;
	}

	unless (close FD)
	{
	    print STDERR "WARNING: cannot close file \"$filenm\"\n";
	}

    }

    #  pass 2: compare all files to each other
    #  There's probably a name for this algorithm:
    #
    # ex: files a, b, c, d, e
    # if none match, the following files are compared
    #	(a,b) (a,c) (a,d) (a,e)
    #	(b,c) (b,d) (b,e)
    #	(c,d) (c,e)
    #	(d,e)
    # BUT if any match, delete them from the list
    #
    # ex: in the list (a,b,c,d,e), files a, c, e are all identical
    # so in the first pass, (a,c) and (a,e) comparisons succeed
    # leaving the file list as b, d
    # since there's no need to further compare files that matched!
    #
    # Oh yea, and save all the successes as an array of arrays.

    my (@file_list) = sort keys %first_block_by_name;
# print "file list = ", join ",", @file_list, "\n";

    while ($first_file = shift (@file_list))
    {
	my (@dups_found) = $first_file;
	my (@not_dups);

	foreach $filenm (@file_list)
	{
# print "\tcompare $first_file to $filenm\n";
	    if ($first_block_by_name {$first_file} eq
		$first_block_by_name {$filenm})
	    {
		push @dups_found, $filenm;
	    }
	    else
	    {
		push @not_dups, $filenm;
	    }

	}

# print "\tdups_found = ", join ",", @dups_found, "\n";
# print "\tnot_dups = ", join ",", @not_dups, "\n";
	push @return_LoL , [ @dups_found ]
	    if (scalar (@dups_found) > 1);
	    # save results only if >1 matched
	    # since the first file name is always pushed

	@file_list = @not_dups;  # continue searching the non-matched files
    }
return @return_LoL;
}


# compare_by_md5_sum(): given a list of file names of same sized files,
#   compute the md5sum for each file,
#   generating lists of files with the same sum
#   (which need further verification before declaring they're identical).
#   A null list is returned if nothing matches by md5.
#
# args: list of same sized files
# returns: nothing directly
# GLOBALS USED:
#   appends the lists of filenames that match by md5sum
#   to @md5_LoL, thus gathering the results to all calls allowing
#	for $array_ref ( @name_list )
#	{
#	    compare_by_md5_sum (@$array_ref);
#	}
#	while ($array_ref = shift (@md5_LoL))
#	{
#	    ...
#

sub compare_by_md5_sum
{
    my (%file_name_by_md5sum);  # gather file names by hash result
    my ($md5sum);
    my ($filename);
    my ($md5) = Digest::MD5->new;
    # # my (@md5_LoL);

print STDERR "jsjvv: entering md5sum with ", scalar (@_), " filenames\n"
    if ($verbose1);

    # for each file, compute the MD5 digest sum,
    # gathering file names of the same hash via a hash of arrays
    # (the hash index is the checksum, pointing to a list of filenames
    # that gave that same result)

    foreach $filename (@_)
    {
	unless (open MD5_fd, $filename)
	{
	    print STDERR "ERROR: cannot open file \"$filename\", skipping\n";
	    next;
	}

	$md5->reset();
	$md5->addfile(MD5_fd);
	# OLDmd5 $md5sum = $md5->hexdigest();
	$md5sum = $md5->digest();
	close(MD5_fd);

# jsjnew  $digest = $ctx->digest;
# jsjnew  $digest = $ctx->hexdigest;

	push @{ $file_name_by_md5sum {$md5sum} }, $filename;
    }

    #  Append all lists of >1 files to the global list-of-lists
    #  (to allow calling this in a loop and handling all the results
    #   after the loop)

    foreach $md5sum (keys %file_name_by_md5sum)
    {
	push @md5_LoL , [ @{$file_name_by_md5sum{$md5sum}} ]
	    if ( scalar  (@{$file_name_by_md5sum{$md5sum}}) > 1);
	    # do NOT push single file lists, there's nothing to match!

print STDERR "jsjvv: md5sum found " .
	    scalar  (@{$file_name_by_md5sum{$md5sum}}) . " matches\n"
    if ($verbose1) && ( scalar  (@{$file_name_by_md5sum{$md5sum}}) > 1);
    }
}


# compare_by_fc: compare the list of files using file compare
# arg: list of filenames
# GLOBALS USED:
#	@fc_LoL : list of lists of matching files
#	results are appended thus allowing calling this in a loop
# 	and gathering the results for use afterwards.

sub compare_by_fc
{
    my (@file_list) = @_;
    my ($first_file);

    while ($first_file = shift (@file_list))
    {
	my (@dups_found) = $first_file;
	my (@not_dups);
	my ($filenm);

	foreach $filenm (@file_list)
	{
	    if (compare ($first_file, $filenm) == 0)
	    {
		push @dups_found, $filenm;
	    }
	    else
	    {
		push @not_dups, $filenm;
	    }

	}

	push @fc_LoL , [ @dups_found ]
	    if (scalar (@dups_found) > 1);
	    # save results only if >1 matched
	    # since the first file name is always pushed

	@file_list = @not_dups;  # continue searching the non-matched files
    }
}


# test_hard_links():
# args: list of filenames of identical files
# output: to stdout

# Within duplicate files, list those that are hard links.
# That's important to know since links are just multiple names
# for the same file and do NOT take up additional disk space!
# (well, except for the additional directory entry).
#
# This is probably Unix specific.
# The Unix file system allows multiple names for the same file.
# This depends on stat() returning a unique inode and device number per file.
# jsj: test over NFS, Cygwin, ...
#
#
# There are several ways to have multiple names for the same file:
# hard and soft links.  Hard links can be detected since they're limited
# to the same file system
# (they're merely directory entries to the same inode).
# Soft links can be anywhere (other file systems, even on offline NFS machines)
# so there's no way to find them all.
# jsj: are these Win-98 "shortcuts"  What of NT?
#
# ex: input of
#	./gatekeeper.dec.com/bison-1.28.tar.gz
#	./gate1/bison-1.28.tar.gz
#	./gate2/bison-1.28.tar.gz
#	./junk/gatekeeper.dec.com/bison-1.28.tar.gz
# generates the output:
# 4 identical files of size 420341:
#    ./gatekeeper.dec.com/bison-1.28.tar.gz
#    2 are LINKED files:
#	./gate1/bison-1.28.tar.gz
#	./gate2/bison-1.28.tar.gz
#    ./junk/gatekeeper.dec.com/bison-1.28.tar.gz
#
# this means there are actually 3 copies of the file,
# one having 2 names.

sub test_hard_links
{
    my (@file_list) = @_;
    my ($first_file);
    my ($filenm);

    while ($first_file = shift (@file_list))
    {
	my (@dups_found) = $first_file;
	my (@not_dups);
# jsj handle stat failure
	my ($dev1, $ino1) = stat ($first_file);

	foreach $filenm (@file_list)
	{
	    my ($dev2, $ino2) = stat ($filenm);
	    if (($dev1 == $dev2) && ($ino1 == $ino2))
	    {
		push @dups_found, $filenm;
	    }
	    else
	    {
		push @not_dups, $filenm;
	    }

	}

	if (scalar (@dups_found) > 1)
	{
	    print "    ", scalar (@dups_found), " are LINKED files:\n\t",
		join ("\n\t", @dups_found), "\n";
	}
	else
	{
	    print "    $first_file\n";
	}

	@file_list = @not_dups;  # continue searching the non-matched files
    }
}

# jsj: PRUNE sample:
#	dup6 . -name .netscape -prune -o

# EXAMPLE: to find all duplicates of a specific file,
# find all files of that size
#	ls -l this_file
#	dup6 . -size zzzzz
#
# ??? add an option like
#	--of-this-file <filename>
#	lists duplicates of that file only
#	(to be precise, files the same size of that file)


# AND: show "lnn" as a comment, describe how I use them together
#
#
# 2/2009: OH NO!
# I was usually assuming it was safe to rm -rf "A"
# if A was a subset of B
#
#	CONSIDER THIS TEST SCENARIO
# A contains file1, file2, symlink1, special file 1
# B contains file1, file2
#
# SINCE I CURRENTLY IGNORE SYMLINKS and SPECIAL FILES, I erroneously delete ALL of "A"
#
# Solution 1: give WARNINGS of "OTHER" file types (non-directory, non-regular)
# 	a) opt-args selects "no warnings" "warn once" "warn per file"
# 		THIS IS NEEDED for special files even if I solve symlinks
# Solution 2: try to see if the sym links are the same
# 	a: by comparing them as literals
# 	   PROBLEM: relative pathnames will fail
# 	   Even expanding to full pathname may fail if there are intermediate symlinks
# 	   or they point to files that are hard-linked together
# 	b: follow the symlink, compare the major/minor/inode of the resulting file
# 	   PROBLEM: fails for broken symlinks
#
#	another test scenario
# h1, h2 are hard linked together in "C" (yet another directory)
# "A": file1, file2, file3, symlink1 (to h1), symlink (to something else)
# "B": file1, file2, hardlink to file3, symlink2 (to h2)
#
# symlink compare-by-name (relative or full pathname)
# won't recognize that symlink1 and symlink2 are to the same file
# because it's by 2 different names.