#!/usr/bin/perl -w
#
# Find files with identical contents
#
# Copyright (c) 2009 Douglas G. Henke -- Released under GPL v2
# See http://www.gnu.org/licenses/gpl-2.0.txt for full license terms.
#
# Takes a list of files on stdin. By default, echoes to stdout the names of
# all files the contents of which are identical to the files appearing
# earlier in the list. (Non-files are never echoed.)
#
# For example, "find | same | xargs rm" would remove all but the first
# instance of any given file contents. (If you are doing this in earnest,
# though, use "find . -print0 | same -0 | xargs -0 rm" instead; see below.)
#
# Options:
#   -0 -- Input and output lists are nul-delimited. (Default: newline
#         delimited. This is useful when you have files with names that
#         contain newlines.)
#   -v -- Verbose output. The filename of the first file is listed
#         preceded by a +, followed by the names of all copies, each
#         preceded by a -. (Files which are unique are not listed.)
#   -s -- Summary output. Only the filename of the first copy is shown,
#         preceded by the number of copies and a colon. (Files which
#         are unique are not listed.)
#   -e -- Consider empty files too. (Default: Empty files are ignored.)
#

use strict;
use warnings;
use File::Compare qw(compare);

# Output formats selectable from the command line.
use constant {
    FMT_PLAIN   => 0,    # default: print only the names of the copies
    FMT_VERBOSE => 1,    # -v: "+first" followed by "-copy" lines
    FMT_SUMMARY => 2,    # -s: "count: first"
};

my $fmt      = FMT_PLAIN;
my $zerobyte = 0;        # 0=ignore files zero bytes in length
my $delim    = "\n";     # record delimiter for both input and output

# Parse options. Test definedness rather than truth so that an argument
# of "0" or "" is rejected as unrecognized instead of silently ending
# option processing.
while (defined(my $arg = shift @ARGV)) {
    if    ($arg eq '-0') { $delim    = "\0"; }
    elsif ($arg eq '-v') { $fmt      = FMT_VERBOSE; }
    elsif ($arg eq '-s') { $fmt      = FMT_SUMMARY; }
    elsif ($arg eq '-e') { $zerobyte = 1; }
    else                 { die "unrecognized option: $arg\n"; }
}

# Input records are split on (and chomp strips) the chosen delimiter.
$/ = $delim;

# Create an associative array whose keys are file sizes and whose
# values are references to lists of filenames of files with that size.
my %names_by_size;
while (defined(my $name = <STDIN>)) {
    chomp $name;
    next unless -f $name;

    # "stat _" reuses the stat buffer filled by the -f test above, so the
    # file is not looked up a second time between the test and the size read.
    push @{ $names_by_size{ (stat _)[7] } }, $name;
}

# For each file size (in ascending order, so output is reproducible)...
for my $size (sort { $a <=> $b } keys %names_by_size) {

    # Ignore zero-byte files unless told to process them.
    next if $size == 0 && !$zerobyte;

    my @pending = @{ $names_by_size{$size} };

    # Compare each item left in the list with all later items. A single
    # remaining file has nothing to be compared against, so stop there;
    # this also skips sizes which belong to only one file.
    while (@pending > 1) {
        my $head = shift @pending;

        # Split the remaining names into copies of $head and everything
        # else. compare() returns 0 only for identical contents; both
        # "different" (1) and "error" (-1) count as not a copy.
        my (@copies, @others);
        for my $name (@pending) {
            if (compare($head, $name) == 0) {
                push @copies, $name;
            }
            else {
                push @others, $name;
            }
        }

        # Copies are never considered again; the rest stay in input order.
        @pending = @others;

        # Files which are unique are not listed, in any format.
        next unless @copies;

        if ($fmt == FMT_VERBOSE) {
            print "+$head$delim", map { "-$_$delim" } @copies;
        }
        elsif ($fmt == FMT_SUMMARY) {
            # The count includes the first file itself.
            my $total = @copies + 1;
            print "$total: $head$delim";
        }
        else {
            print map { "$_$delim" } @copies;
        }
    }
}

exit(0);