#!/usr/bin/perl

=head1 Name

laster - genetic algorithm backpack problem solver

=head1 Synopsis

laster [--help] [--size <sz>] [--all] [--dumpf <f>] [--nooss] [--eff <p>] [--iter <n>] file0 [file1 ...]

=over 1

=item B<--help>

Type short help instructions regarding usage and default values.

=item B<--size sz>

Specify target size for optimization. K,M,G suffixes are recognized, kilo, mega and gigabytes.

=item B<--all>

Process all files, generate multiple volumes. The results for each medium are separated by single newline.

=item B<--dumpf f>

Dump optimization progress into file f. Fitness coefficient of each iteration is written. The output is suitable for drawing in gnuplot.

=item B<--nooss>

Do not try to generate optimal starting solution. With this option the algorithm starts with very bad starting solution.

=item B<--eff e>

Set efficiency limit 0 <= e <= 1. The algorithm ends whenever fitness coefficient reaches value of e.

=item B<--iter n>

Specify maximal number of iterations.

=back

=head1 Description

Solution of the backpack problems lies in finding suitable combination of objects which fit into a limited space so that the used capacity is highest possible. Laster works with files and directories which shall be written on a medium.

=head2 Example Usage

laster --size=1.44M /usr/bin/*

This command tries to find the best combination of files and directories from /usr/bin which would fit best on a 1.44MB floppy.

=head1 Algorithm Description

First, after removing objects which are too large and exceed target capacity, a starting optimal solution is generated which is then processed with a loop performing optimizations on GA basis. The program works with array of 100 solution candidates. Each single solution candidate contains an array of items with value of 1, if the object is considered in the backpack, or 0 if not.

=head2 Optimal Starting Solution

Optimal starting solution, unless --nooss is specified, is generated as 100 starting solution candidates with knowledge of target media capacity and total size of all processed files and directories. Sample candidate's backpack vector is then generated randomly, generating random float number from the interval [0;1] and to each respective object assigning 1 if the random number is lower than computed weight (backpack capacity/total weight of all objects ratio) and vice-versa. The first candidate has vector (1,0,0,...). This is to ensure at least one acceptable solution and to forego possible algorithm divergence in case that all starting solution were unacceptable.

=head2 The Fitness Function

The fitness coefficient ratio of sum of all items in backpack vector and target capacity. If a solution candidate exceeds the target capacity, its fitness is set to 0.

=head2 Solution Convergence

Before every genetic iteration, fitness coefficients are computed for every solution candidate. If the fitness of the best candidate reached minimal required fitness, the algorithm ends.

The list of candidates is sorted and the worse half of it as well as all candidates with zero fitness are released. The best candidates are the iteratively copied on free space and mutated. Because the best solution stays alway on top and is always acceptable, the algorithm converges and in the next step of iteration returns equal or better solution.

Thanks to good optimal starting solution finding method, even the initial solution is often usable, and even without it (--nooss) the GA iterative kill/mutate algorithm converges very quickly and for its purpose achieves excellent results in very short even with high number of input objects which were already unsolvable for non-GA recursive algorithm implementation.

=head2 Solution Mutation

The mutation function is simple process which sets 5% of randomly selected items (but at least 1) to 1 and 5% of randomly selected items (but at least 1) to 0 (further values of selected items could already be 1 or 0 respectively). This quantity showed to be adequate and provides fast convergence.

=head1 Authors

Ondra Havel <ondra.havel@gmail.com>

=cut

use strict;
use warnings;
use Getopt::Long;

use Storable qw/dclone/;
use constant MAX_IT=>100;

my $VERSION="1.0d-gen";
my $date="2006-10-26";

my $units;

sub get_real_cap {
    $_=shift;

    die "Invalid capacity format '$_'" if(!/^(\d+(?:\.\d*)?)([kmgKMG]?)/);
    my $cap=$1;
    $units=uc $2;
    return int $cap*1024 if $units eq 'K';
    return int $cap*1024*1024 if $units eq 'M';
    return int $cap*1024*1024*1024 if $units eq 'G';
}

sub get_human_size {
    $_=shift;

    $_=sprintf("%.2f",$_/1024) if $units eq 'K';
    $_=sprintf("%.2f",$_/(1024*1024)) if $units eq 'M';
    $_=sprintf("%.2f",$_/(1024*1024*1024)) if $units eq 'G';

    return $_.$units;
}

my $my_cap="4485M";
my ($start_time,$end_time);

my @ftable;
my $nr_files=0;
my $nr_combinations;
my $nr_large=0;     # Objects that are larger than media
my $total_size=0;
my $fl_all=0;

sub usage {
    my $ec=shift;
    print<<EOH;
lorry $VERSION, written by Ondra Havel, $date
--help           give this help
--size <sz>      optimize for given size (default $my_cap)
                 K,M,G suffixes are recognized
--all            process all files, do multiple volumes
--dumpf <ff>     dump fitness evolution into a file
--nooss          do not generate optimal starting solution set
--eff <p>        terminate when efficiency <p> is achieved, 0 <= p <= 1
--iter <n>       specify maximal number of iterations
EOH

    exit defined $ec ? $ec : 0;
}

my($dumpf,$nooss,$best_req);
my $max_iter=MAX_IT;
$start_time=time;

GetOptions("help"=>\&usage,"size=s"=>\$my_cap,"all"=>\$fl_all,"dumpf=s"=>\$dumpf,"nooss"=>\$nooss,"eff=s"=>\$best_req,"iter=i"=>\$max_iter);

my $real_cap=get_real_cap $my_cap;
print STDERR "Optimizing for capacity $my_cap ($real_cap bytes)\n";

for (@ARGV) {
    next if /^\.\.?$/;
    next if ! -e $_;
    s/`/\\`/g;
    my $s=`du -sb "$_"`;
        chomp $s;
        $s=~s/^(\d+).*/$1/;
    if($s>$real_cap) {
        $nr_large++;
        next;
    }
    push(@ftable,{name=>$_, size=>$s});
    $total_size+=$s;
    $nr_files++;
}

print STDERR "Warning $nr_large objects were rejected because of large size.\n" if $nr_large;
if(!$nr_files) {
    print STDERR "Nothing to do\n";
    exit 0;
}
print STDERR "Processing $nr_files objects, total size @{[ get_human_size $total_size
]}.\n";

if($total_size<=$real_cap) {
    print STDERR "Nothing to do media capacity not exceeded.\n";
    print "${$_}{'name'}\n" for (@ftable);
    exit 0;
}

sub gen_mutate {
    my($data)=@_;

#   for(my $i=0;$i<int(@{$data}*0.05)+1;$i++) {
#       $data->[int(rand(@{$data}))]^=1;
#   }

    for(my $i=0;$i<int(@{$data}*0.05)+1;$i++) {
        $data->[int(rand(@{$data}))]=1;
    }

    for(my $i=0;$i<int(@{$data}*0.05)+1;$i++) {
        $data->[int(rand(@{$data}))]=0;
    }
}

sub gen_combine {
    my($ret)=@_;
    my $df;

    open $df,">",$dumpf if$dumpf;

    print STDERR $nooss ? "Generating one starting candidate...\n" : "Generating starting population...\n";
    my @genpool;

    if(!$nooss) {
        my $weigth=$real_cap/$total_size;
        print STDERR "Weigth = $weigth\n";

        $genpool[0]->{'data'}=[1,(0)x(@ftable-1)];

        for(my $c=1;$c<100;$c++) {
            my $field;
            my $i=0;
            for(@ftable) {
                $field->[$i++]=rand(1) < $weigth ? 1 : 0;
            }
            push @genpool,{data=>$field};
        }
    } else {
        for(my $c=0;$c<100;$c++) {
            $genpool[$c]->{'data'}=[(0)x@ftable];
        }
        $genpool[0]->{'data'}->[0]=1;
    }

    my $nr;

    for(my $it=1;$it<=$max_iter;$it++) {
# Compute fitness for all elements.
        $nr=0;
        for my $h (@genpool) {
            my $i=0;
            my $fitness=0;
            $fitness+=$ftable[$i++]->{'size'}*$_ for(@{$h->{'data'}});
            $fitness=0 if $fitness>$real_cap;
            $fitness/=$real_cap;
            $h->{'fitness'}=$fitness;
            $nr++;
        }

        my $nr_good=0;
        for(@genpool) {
            $nr_good++ if$_->{'fitness'};
        }
        my $nr_bad=scalar @genpool-$nr_good;
        @genpool=sort {$b->{'fitness'} cmp $a->{'fitness'}} @genpool;
        my $best=$genpool[0]->{'fitness'};

        print $df "$it $best\n" if$df;

        print STDERR "Fit: $nr_good not-fit: $nr_bad  best: $best\n";

        if($best_req && $best>=$best_req) {
            print STDERR "Fitness sufficient, exitting.\n";
            last;
        }

        if($nr_bad<$nr_good) {
            print STDERR "Killing ".(scalar(@genpool)/2-$nr_bad)." candidates.\n";
            $nr_bad=$nr_good=scalar(@genpool)/2;
        }

        print STDERR "$it/$max_iter Mutating $nr_bad candidates.\n";
        for(my $j=0;$j<$nr_bad;$j++) {
            $genpool[$j+$nr_good]=dclone($genpool[$j%$nr_good]);
            gen_mutate $genpool[$j+$nr_good]->{'data'};
        }
    }

    close $df if$df;

    return 0 if!$genpool[0]->{'fitness'};  # no usable result

    $nr=0;
    my $gsize=0;
    for(@{$genpool[0]->{'data'}}) {
        if($_) {
            push @{$ret},$nr;
            $gsize+=$ftable[$nr]->{'size'};
        }
        $nr++;
    }

    return $gsize;
}

while(scalar @ftable) {
    my $myres=[];
    my $gsize=gen_combine $myres;
    for(@{$myres}) {
        print "$ftable[$_]{'name'}\n";
        $ftable[$_]='';
    }
    my $eff=sprintf("%.2f",100*$gsize/$real_cap);
    print STDERR "$gsize/$real_cap ($eff%, ".($real_cap-$gsize)." bytes left)\n";
    last if !$fl_all;
    print "\n";
    @ftable=grep(!/^$/,@ftable);
}

$end_time=time;

$end_time-=$start_time;
if($end_time) {
    my $days=int($end_time/86400);
    $end_time%=86400;
    my $hours=int($end_time/3600);
    $end_time%=3600;
    my $mins=int($end_time/60);
    my $secs=$end_time % 60;

    print STDERR "Elapsed time: ";
    print STDERR "$days days  " if $days;
    print STDERR "$hours hours  " if $hours or $days;
    print STDERR "$mins minutes  " if $mins or $hours or $days;
    print STDERR "$secs seconds\n";
}