#!/usr/bin/perl -w
#
# Find files with identical contents
#
# Copyright (c) 2009 Douglas G. Henke -- Released under GPL v2
# See http://www.gnu.org/licenses/gpl-2.0.txt for full license terms.
#
# Takes a list of files on stdin. By default, echoes to stdout the names of
# all files whose contents are identical to those of a file appearing
# earlier in the list. (Non-files are never echoed.)
#
# For example, "find | same | xargs rm" would remove all but the first
# instance of any given file contents. (If you are doing this in earnest,
# though, use "find . -print0 | same -0 | xargs -0 rm" instead; see below.)
#
# Options:
#   -0 -- Input and output lists are nul-delimited. (Default: newline
#         delimited. This is useful when you have files with names that
#         contain newlines.)
#   -v -- Verbose output. The filename of the first file is listed
#         preceded by a +, followed by the names of all copies, each
#         preceded by a -. (Files which are unique are not listed.)
#   -s -- Summary output. Only the filename of the first copy is shown,
#         preceded by the total number of files with those contents
#         (the first file plus its copies) and a colon. (Files which
#         are unique are not listed.)
#   -e -- Consider empty files too. (Default: Empty files are ignored.)
#

# Command-line state, shared with the rest of the script.
#   $fmt      -- output format: 0 = names of copies only (default),
#                1 = verbose (-v), 2 = summary (-s).
#   $zerobyte -- 0 = ignore files zero bytes in length (default),
#                1 = consider them too (-e).
our $fmt = 0;
our $zerobyte = 0;

# Consume every command-line argument. The loop tests definedness rather
# than truth so that an argument such as "0" or "" is rejected like any
# other unknown option instead of silently ending option processing.
while(defined(my $arg = shift(@ARGV))) {
  # -0 switches the record separator, which is used both for reading the
  # input list and for terminating every line of output.
  if($arg eq "-0") { $/ = "\0"; next; }
  if($arg eq "-v") { $fmt = 1; next; }
  if($arg eq "-s") { $fmt = 2; next; }
  if($arg eq "-e") { $zerobyte = 1; next; }
  die "unrecognized option: $arg\n";
}

# Create an associative array whose keys are file sizes and whose
# values are references to lists of filenames of files with that size.
# Names are kept in input order within each size.
#
# NOTE: nothing here detects the same file being reached through two
# names (a name listed twice, a hard link, or a symlink, which -f
# follows); each name is recorded as a separate file.
our %lst;
while(defined(my $name = <STDIN>)) {
  chomp($name);

  # Only regular files are considered; anything else is dropped.
  next unless(-f $name);

  # "stat _" reuses the stat buffer filled in by the -f test above, so
  # the size always belongs to the file that was just tested and no
  # second system call is made.
  push(@{$lst{(stat(_))[7]}}, $name);
}

# File contents are compared in-process with the core File::Compare
# module rather than by running cmp(1) once per pair. compare() returns
# 0 for identical contents, 1 for different contents and -1 on error
# (for example an unreadable file); anything but 0 is treated as
# "not a copy".
use File::Compare;

# For each file size (in ascending order, so that the order of the
# output is the same from run to run)...
foreach my $size (sort { $a <=> $b } keys(%lst)) {
  # Ignore zero-byte files unless told to process them.
  next if($size == 0 && !$zerobyte);

  my $names = $lst{$size};

  # Optimization: skip sizes which belong to only one file.
  next unless(@$names >= 2);

  # Compare each item left in the array with all later items. The loop
  # tests the length of the array, not the value shifted off it, so
  # that a file literally named "0" does not end the loop early.
  while(@$names) {
    my $head = shift(@$names);
    my $cnt = 0;    # number of copies of $head found so far
    my @differ;     # later items which are not copies of $head

    # For each later item in the array...
    foreach my $name (@$names) {

      # Is this item the same as $head? If not, keep it so that it is
      # compared with the other remaining items on a later pass.
      if(compare($head, $name) != 0) {
        push(@differ, $name);
        next;
      }

      # Item is a duplicate. Report it according to $fmt.
      if($fmt == 1) {
        print "+$head$/" if($cnt++ == 0);
        print "-$name$/";
      } elsif($fmt == 2) { $cnt++; }
      else { print "$name$/"; }
    }

    # Copies of $head have been dealt with and must not be considered
    # again; only the items which differed from it remain.
    @$names = @differ;

    # Summary line: the total number of files with these contents (the
    # first one plus its copies). Files which turned out to be unique
    # are not listed.
    print(($cnt+1).": $head$/") if($fmt == 2 && $cnt > 0);
  }
}

exit(0);
