#!/usr/bin/perl
# NOTE(review): the comment line below is the tail of the main read loop,
# kept exactly as found.  Its beginning (option handling and the loop
# header) is not visible here, so it is left untouched rather than guessed.
# When it is restored, note that "removing $npath" is printed even when
# unlink() fails, because the print follows the "unlink || warn" statement
# unconditionally.
#) { chomp; if (/^(\S+)\s\s(.*)/) { $nhash = $1; $npath = $2; } else { warn "$_: bad line, skipping"; next; } if ($ohash eq $nhash) { if ($dryrun) { print "would remove $npath\n"; } else { unlink("$npath") || warn "$npath: $!\n"; print "removing $npath\n"; } } else { print "$_\n" if $keep; } $ohash = $nhash; $opath = $npath; } exit(0);

#---------------------------------------------------------------------
# Print a usage message from the comments and exit.
#
#   $emsg   optional error text; when defined it is written to STDERR
#           before the usage text.
#
# The usage text is the NAME, SYNOPSIS and OPTIONS sections of the POD
# at the end of this file, rendered by pod2usage().

sub usage {
    my ($emsg) = @_;

    use Pod::Usage qw(pod2usage);

    warn "$emsg\n" if defined $emsg;
    pod2usage(-verbose => 99, -sections => "NAME|SYNOPSIS|OPTIONS");
}

# Replace this process with "perldoc" run on this script, which shows
# the full manual page.  Only returns (and then dies) if exec fails.

sub manpage {
    my @args = ("perldoc", $0);
    exec {$args[0]} @args;    # safe even with one-arg list
    die("should not get here\n");
}

#---------------------------------------------------------------------
# Print the UUID, current version, or source location.
#
# Each value is pulled out of an RCS-style keyword string ($Keyword: value $).
# The match is evaluated in list context and assigned to a one-element
# list, so the variable simply ends up undef if the keyword text does not
# match.  (The older "my $x = $1 if ..." form is documented in perlsyn as
# having undefined behaviour.)

# Print the script UUID and exit with status 0.
sub myuuid {
    my ($UUID) = q$UUID: 8f8e5c79-e220-3afc-8b80-ccbede641a7a $ =~ /UUID: (.*) /;
    print "$UUID\n";
    exit(0);
}

# Print "<name> <version> <date>" and exit with status 0.  The revision
# is reformatted as major.minor with a two-digit minor part.  $myname is
# a global that is expected to be set by the main program.
sub version {
    my $VERSION = sprintf("%d.%02d", q$Revision: 1.7 $ =~ /(\d+)\.(\d+)/);
    my ($DATE) = q$Date: 2023-05-05 14:28:58-04 $ =~ /Date: (.*) /;
    print "$myname $VERSION $DATE\n";
    exit(0);
}

# Print the source location as a file:// URL (host followed by the RCS
# path) and exit with status 0.
sub where {
    my ($SOURCE) = q$Source: /usr/local/src/d/dups/RCS/killdups,v $ =~ /Source: (.*) /;
    my ($HOST) = q$Host: furbag.my.domain $ =~ /Host: (.*) /;
    print "file://$HOST", "$SOURCE\n";
    exit(0);
}

#---------------------------------------------------------------------
__END__

=head1 NAME

killdups - read sorted MD5/SHA entries, list and/or kill duplicate files

=head1 SYNOPSIS

killdups [-hkmsuvwz] [hashfile]

=head1 OPTIONS

=over 4

=item B<-h>

Print a brief help message and exit.

=item B<-k>

Show complete input lines with files to be kept (non-duplicates).
This still removes the duplicates, but preserves usable MD5/SHA output.

=item B<-m>

Print the manual page and exit.

=item B<-s>

Show just the files to be removed, but don't actually remove them.

=item B<-u>

Print the script UUID and exit.

=item B<-v>

Print the version and exit.

=item B<-w>

Print the source location and exit.

=item B<-z>

Expect NUL-terminated input, so filenames can contain whitespace.

=back

=head1 DESCRIPTION

B<killdups> reads sorted MD5/SHA entries from "hashfile" or stdin, looks
for duplicate files, and lists or removes the duplicates.  It can also
keep the input lines for files that aren't removed.

The hashes aren't limited to MD5 or SHA; anything that writes output in
the form

    HASH two-spaces FILENAME

can provide input to this program.

When B<killdups> finds duplicate entries, the first file is preserved and
the remaining files are either displayed or removed.  If you want to
remove a different duplicate file, try sorting the hash entries in reverse
order.  If that still isn't what you want, consider using a different tool
instead.

=head1 EXAMPLE

If you have filenames containing whitespace under the current directory,
and you want a dry-run to only see what would be removed:

    me% find . -type f -print0 | xargs -0 md5sum -z | sort -z > /tmp/hashes
    me% killdups -sz /tmp/hashes

=head1 AUTHOR

Karl Vogel

=cut