#!/usr/bin/perl
#
# linkdups - hard-link identical files to save disk space.
#
# NOTE(review): the script preamble that originally sat between the shebang
# line and the main loop is missing from this copy of the file.  strftime(),
# %options, $myname, $k, $ohash and $opath are all used below but nothing in
# the visible text imports, parses or initializes them.  Only the residue
# ") {" of the loop header survived, so the "while (<>)" line below is a
# reconstruction -- confirm it against the pristine source and restore the
# preamble (module imports, option parsing, variable setup) before running.

while (<>) {
    chomp;

    # Show progress if -t option present.
    if ($options{'t'}) {
        $k++;
        if ($k % 10000 == 0) {
            my @lt = localtime(time());
            my $ts = strftime("%Y-%m-%d %T", @lt);
            print "$ts $k\n";
        }
    }

    # The format should be "HASH SPACE SPACE FILENAME".
    # index() returns -1 when there is no space, so $sep is 0 in that case.
    my $sep = index($_, ' ') + 1;
    die "no separator: line $.\n" if $sep <= 0;

    # $nhash keeps the trailing space; it is only ever compared with the
    # previous line's hash, which was cut the same way.
    my $nhash = substr($_, 0, $sep);
    my $npath = substr($_, $sep);
    $npath =~ s/^\s*//;

    # Input is sorted, so duplicates are adjacent: same hash as the
    # previous line means this file is a copy of the previous one.
    if ($ohash eq $nhash) {
        print "[$opath] --> [$npath]\n" unless $options{'q'};
        relink($opath, $npath) unless $options{'s'};
    }

    $ohash = $nhash;
    $opath = $npath;
}

exit(0);

#---------------------------------------------------------------------
# Replace $dst with a hard link to $src without ever leaving $dst
# missing.  The link is first created under a temporary name next to
# $dst and then renamed over it, so a failed link() (missing source,
# different filesystem, permissions, link-count limit) leaves $dst
# untouched instead of deleting it.
#
# Returns 1 on success or when nothing needs doing, 0 after a warning.

sub relink {
    my ($src, $dst) = @_;

    my @src_stat = stat($src);
    unless (@src_stat) {
        warn("link ($src, $dst) failed: $!\n");
        return 0;
    }

    # Same device and inode: the two names already share one file.
    # Skipping here also matters for correctness, because renaming one
    # link of a file onto another link of the same file is a no-op that
    # would leave the temporary name behind.
    my @dst_stat = stat($dst);
    if (@dst_stat
        && $src_stat[0] == $dst_stat[0]
        && $src_stat[1] == $dst_stat[1])
    {
        return 1;
    }

    # Temporary name in the same directory as $dst, so that rename()
    # stays within one filesystem.  If the name is taken or too long,
    # link() fails and we only warn.
    my $tmp = "$dst.linkdups.$$";

    unless (link($src, $tmp)) {
        warn("link ($src, $dst) failed: $!\n");
        return 0;
    }

    unless (rename($tmp, $dst)) {
        warn("rename ($tmp, $dst) failed: $!\n");
        unlink($tmp);
        return 0;
    }

    return 1;
}

#---------------------------------------------------------------------
# Print a usage message from the comments and exit.
# An optional error message is written to stderr first.

sub usage {
    my ($emsg) = @_;

    use Pod::Usage qw(pod2usage);
    warn "$emsg\n" if defined $emsg;
    pod2usage(-verbose => 99, -sections => "NAME|SYNOPSIS|OPTIONS");
}

# Replace this process with "perldoc" run on the script itself.

sub manpage {
    my @args = ("perldoc", "$0");
    exec {$args[0]} @args;    # safe even with one-arg list
    die("should not get here\n");
}

#---------------------------------------------------------------------
# Print the UUID, current version, or source location.
# Each value is pulled out of an RCS-style keyword string; every
# routine prints one line and exits with status 0.

sub myuuid {
    my $UUID = sprintf("%s",
        q$UUID: b79d686c-8162-3058-b027-1a399279664c $ =~ /UUID: (.*) /);
    print "$UUID\n";
    exit(0);
}

sub version {
    # The minor revision number is zero-padded to two digits.
    my $VERSION = sprintf("%d.%02d", q$Revision: 1.8 $ =~ /(\d+)\.(\d+)/);
    my $DATE = sprintf("%s",
        q$Date: 2023-05-06 00:24:28-04 $ =~ /Date: (.*) /);
    print "$myname $VERSION $DATE\n";
    exit(0);
}

sub where {
    my $SOURCE = sprintf("%s",
        q$Source: /usr/local/src/d/dups/RCS/linkdups,v $ =~ /Source: (.*) /);
    print "$SOURCE\n";
    exit(0);
}

#---------------------------------------------------------------------
__END__

=head1 NAME

linkdups - hard-link identical files to save diskspace

=head1 SYNOPSIS

linkdups [-dhmqstuvwz] [hashfile]

=head1 OPTIONS

=over 4

=item B<-h>

Print a brief help message and exit.

=item B<-m>

Print the manual page and exit.

=item B<-q>

Don't print filenames being linked.

=item B<-s>

Show what would be done without doing it.

=item B<-t>

Print time and progress every 10,000 lines.  Not affected by -q.

=item B<-u>

Print the script UUID and exit.

=item B<-v>

Print the version and exit.

=item B<-w>

Print the source location and exit.

=item B<-z>

Expect NUL-terminated input, so filenames can contain whitespace.

=back

=head1 DESCRIPTION

B<linkdups> reads sorted hashes (MD5, RMD160, etc.) for regular files
in the same filesystem, looks for duplicate entries, and makes hard-links
to save space.

The hash-file format should be "HASH SPACE SPACE FILENAME".

=head1 EXAMPLE

If you're only interested in duplicate files that are fairly large:

    root# cd /some/place
    root# find . -size +70000 -print0 | xargs -0 md5sum | sort > /tmp/hash
    root# cat /tmp/hash
    f9203cea58151e423367d147bd334242  ./a/b/c/summary.pdf
    f9203cea58151e423367d147bd334242  ./d/e/f/report.pdf
    ...

Then, later on:

    root# cd /some/place
    root# linkdups /tmp/hash
    [./a/b/c/summary.pdf] --> [./d/e/f/report.pdf]
    ...

If filenames contain spaces, etc. you can use NUL-separated input:

    root# find . -print0 | xargs -0 md5sum -z | sort -z > /tmp/hash
    root# linkdups -z /tmp/hash

=head1 AUTHOR

Karl Vogel

=cut