#!/usr/bin/perl

use strict;
use warnings;
use File::Basename qw(basename);
use Getopt::Long;
use Pod::Usage;

# run fdupes and read output into data structure
sub get_dupes { # {{{
    open(my $fdupes, 'fdupes -rH . |') or die "Cannot run fdupes: $!";
    my @groups;
    my @currgroup;
    while (<$fdupes>) {
        chomp;

        # store group
        if (m{^$}o) {
            my @ready = @currgroup;
            push @groups, \@ready;
            undef @currgroup;
            next;
        }

        # add filename to group
        push @currgroup, $_;
    }
    close($fdupes) or die "Cannot close fdupes: $!";

    return @groups;
} # }}}

# checks if files in a list still exist
# (fdupes runs long and maybe someone is working on the hierarchy)
sub check_existence { # {{{
    my (@files) = @_;

    my @ok = grep { $_ if -f $_ } @files;

    return @ok;
} # }}}

# takes file list, returns a selected file name and the remaining list
sub select_preserved_file { # {{{
    my ($opt_dislike, @files) = @_;

    @files = check_existence(@files);

    my %len   = map { $_ => length(basename($_)) } @files;
    my %depth = map { $_ => $_ =~ tr{/}{/}       } @files;
    my %dislike  = map { $_ => scalar m{\Q$opt_dislike\E}smo  } @files;

    @files = sort {
        # avoid files containing the 'dislike' match
        $dislike{$a}  <=> $dislike{$b} or
        # prefer greater depth
        $depth{$b} <=> $depth{$a} or
        # prefer the longer name
        $len{$b}   <=> $len{$a}
    } @files;

    my $preserved = shift @files;

    return ($preserved, \@files);
} # }}}

# try to link files, one failure copy them back
sub try_link { # {{{
    my ($target, $link, $opt_symlink, $opt_fb_text, $flags) = @_;

    unlink $link;

    my $res;
    unless ($opt_symlink) {
        # create hardlink
        $res = link $target, $link;
        return if $res;

        # there was a problem
        unless ($flags->{warned_on_hardlinks}) {
            print "Cannot create hardlink, trying fallback\n";
            $flags->{warned_on_hardlinks} = 1;
        }
    }

    if ($flags->{can_symlink}) {
        # create symlink
        $res = symlink($target, $link);
        return if $res;

        # on error create a copy
        unless ($flags->{warned_on_symlinks}) {
            print "Cannot create symlink, trying fallback\n";
            $flags->{warned_on_symlinks} = 1;
        }
    }

    # last resort
    if ($opt_fb_text) {
        # replace files by a (hopefully smaller) pointer to the preserved file
        my $pointer;
        unless (open($pointer, '>', $link .'-dupe.txt')) {
            print STDERR "Could not create pointer file to $target; ".
                "lost $link\n";
            return;
        }
        print $pointer "$link\nwas deleted in favour of\n$target\n";
        close($pointer);
        # remove if still existing
        unlink $link if -f $link;

    } else {
        # try to copy the preserved file back to the old location
        system('/bin/cp', $target, $link) == 0 or
            print STDERR "Copy error on $target; lost $link\n";
    }
} # }}}

# parse options, help message {{{
=head1 NAME

dedupe - Do something agains duplicate files

=head1 DESCRIPTION

This program will use fdupes to search the current directory for
duplicate files.  It can delete or link them, depending on the
options.  It will always preserve the file with the greatest path
depth.  If there are multiple files matching that criteria, the
longest file name will be used.

=head1 SYNOPSIS

dedupe [options]

=head1 OPTIONS

=over 4

=item B<--delete>

Delete mode.  Will delete duplicate files.

=item B<--link>

Link mode.  Duplicate files will be hard-linked together.

=item B<--symlink>

Modify link mode.  Will create soft links instead.

Automatically activates 'link mode'.

=item B<--fallback-text>

Modify link mode.  When creating links fails, delete the dupe and
put a text message pointing to the preserved file.  Without this,
the file would just be copied back.

=item B<--simulate>

Do not delete, do not link.  Only simulate what would be done.

Automatically activates verbosity.

=item B<--(no)subvert>

Per default, dedupe will no longer touch files or directories named
'.svn'.  You can restore the old behaviour with this option.

=item B<--verbose>

Enable verbosity.  Shows what is going on.

=item B<--dislike> I<string>

Try not to prefer files containing the disliked string in their full
path.

=back

=head1 AUTHOR

Written by Christoph 'Mehdorn' Weber

=head1 REPORTING BUGS

Report bugs to <kontakt@das-mehdorn.de>

=head1 COPYRIGHT

Copyright 2009-2011 Christoph 'Mehdorn' Weber.  License Creative
Commons Attribution-Share Alike 3.0 Germany
<http://creativecommons.org/licenses/by-sa/3.0/de/>.
This is free software: you are free to change and redistribute it.
There is NO WARRANTY.

=cut

my $opt_delete   = 0;
my $opt_link     = 0;
my $opt_simulate = 0;
my $opt_subvert  = 1;
my $opt_symlink  = 0;
my $opt_verbose  = 0;
my $opt_fb_text  = 0;
my $opt_dislike     = '';
GetOptions(
    'delete'        => \$opt_delete,
    'dislike=s'     => \$opt_dislike,
    'fallback-text' => \$opt_fb_text,
    'link'          => \$opt_link,
    'simulate'      => \$opt_simulate,
    'subvert!'      => \$opt_subvert,
    'symlink'       => \$opt_symlink,
    'verbose'       => \$opt_verbose,

    'help'          => sub { pod2usage(-exitval => 0, -verbose => 2) },
) or pod2usage(1);

# if symlink option is used, automatically activate link mode
$opt_link = 1 if $opt_symlink;

# if simulation modifier is used, automatically show what is done
$opt_verbose = 1 if $opt_simulate;

pod2usage(
    -msg     => '--link and --delete cannot be used together',
    -exitval => 1,
) if ($opt_link and $opt_delete);

# default action is to simulate
unless ($opt_delete + $opt_link) {
    $opt_simulate = 1;
    $opt_verbose  = 1;
}
# }}}

# main part {{{
print "Note: this is a simulation\n\n" if ($opt_simulate and $opt_verbose);

# get the dupes via 'fdupes' call
my @dupes = get_dupes();

my $flags = {
    # check if symlinks are available (see perldoc -f symlink)
    can_symlink         => eval { symlink("",""); 1 },

    # try_link will only warn on the first occurrence of each error type
    warned_on_hardlinks => 0,
    warned_on_symlinks  => 0,
};

foreach my $group (@dupes) {
    if ($opt_subvert) {
        # filter subversion control files
        @$group = grep { not ($_ =~ m{/\.svn}) } @$group;
        next unless @$group;
    }

    # split group into preserved file and rest
    my ($theone, $theothers) = select_preserved_file($opt_dislike, @$group);

    # if the files vanished, just ignore the group
    next unless $theone;
    next unless @$theothers;

    if ($opt_delete) {
        # delete mode is simple
        print join("\n* ", 'Deleting:', @$theothers), "\n"
            if $opt_verbose;
        unlink @$theothers unless $opt_simulate;

    } elsif ($opt_link) {
        # link mode
        print join("\n* ", "Linking: $theone as", @$theothers), "\n\n"
            if $opt_verbose;

        unless ($opt_simulate) {
            try_link($theone, $_, $opt_symlink, $opt_fb_text, $flags)
                foreach @$theothers;
        }
        print "\n";

    } else {
        # show mode
        print "Preferred: $theone\n";
        if ($opt_verbose) {
            print join("\n* ", 'Dupes:', @$theothers), "\n";
            print "\n";
        }
    }
} # }}}
