#!/usr/bin/perl
#
# 2011/11/27 gabriel

use strict;

use Getopt::Long();
use Pod::Usage;
use Coro;
use Coro::Semaphore;
use Coro::Signal;
use Coro::Channel;
use Coro::Handle;
use IO::File;
use POSIX qw( WNOHANG WEXITSTATUS );
use Cwd qw( getcwd );

# Option defaults; each of them can be overridden on the command line.
my $oarsh    = 'oarsh -q -T';                # command opening a shell on a node
my $nodefile = $ENV{OAR_NODE_FILE} || '';    # file listing the usable nodes
my $job_np   = 1;                            # resources given to each small job
my ($file, $logfile) = ('', '');             # job list file, progress log file
my ($verbose, $help, $masterio, $switchio);

# Getopt::Long specification: option name/type => variable receiving the value.
my @option_spec = (
   'file=s'     => \$file,
   'logfile=s'  => \$logfile,
   'verbose'    => \$verbose,
   'help'       => \$help,
   'oarsh=s'    => \$oarsh,
   'jobnp=i'    => \$job_np,
   'nodefile=s' => \$nodefile,
   'masterio=s' => \$masterio,
   'switchio'   => \$switchio,
   );

# A parsing error prints the short usage; --help or a missing job file
# prints the whole manual.
Getopt::Long::GetOptions(@option_spec) or pod2usage(-verbose => 0);
pod2usage(-verbose => 2) if $help || !-e $file;

# State of the jobs recorded by a previous run: job number => 'start' | 'end'.
my %state;
my $log_h = IO::File->new();

# Replay an existing log file so that the scheduler knows which jobs were
# already started or finished.
if (-e $logfile) {
   # The mode is given as a separate argument, so the file name is never
   # parsed for mode characters or surrounding whitespace.
   $log_h->open($logfile, '<')
      or die "can't read log file: $!";
   while (my $line = <$log_h>) {
      # The last line seen for a job wins.
      $state{$1} = 'start' if $line =~ m/^start\s+job\s+(\d+)\s/;
      $state{$1} = 'end'   if $line =~ m/^end\s+job\s+(\d+)\s/;
      }
   $log_h->close();
   }

# Reopen the log in append mode for this run; it is flushed on every write
# and wrapped by Coro::Handle's unblock for use from the Coro threads.
if ($logfile) {
   $log_h->open($logfile, '>>')
      or die "can't append log file $logfile: $!";
   $log_h->autoflush;
   $log_h = unblock $log_h;
   }

# Load the job list: one shell command per line. Lines starting with '#'
# and blank lines are skipped. A lexical filehandle is used so that no
# global bareword handle is left behind.
my @job = ();
open(my $job_list_h, '<', $file) or die "can't open job file $file: $!";
while (my $line = <$job_list_h>) {
   chomp $line;
   next if $line =~ m/^#/;
   next if $line =~ m/^\s*$/;
   push @job, $line;
   }
close $job_list_h;

# Load the node file: one resource per line. Lines starting with '#' and
# blank lines are skipped.
my @ressources = ();
open(my $node_file_h, '<', $nodefile)
   or die "can't open $nodefile: $!";
while (my $line = <$node_file_h>) {
   chomp $line;
   next if $line =~ m/^#/;
   next if $line =~ m/^\s*$/;
   push @ressources, $line;
   }
close $node_file_h;

# The number of slots is later computed as resources / jobnp: refuse a zero
# or negative value here instead of failing with a division by zero.
die "jobnp must be a positive integer (got $job_np)" if $job_np < 1;

# A single job must not ask for more resources than the node file provides.
my $ressource_size = scalar(@ressources);
die "not enought ressources jobnp $job_np > ressources $ressource_size"
   if $job_np > $ressource_size;

# Directory of the master job; every small job is started from it.
my $current_dir = getcwd();

# Base names for the per-job output files: the OAR master file names without
# their extension, or --masterio for both when that option is given.
my ($stdout, $stderr) = ($ENV{OAR_STDOUT} || '', $ENV{OAR_STDERR} || '');
$stdout =~ s/\.stdout$//;
$stderr =~ s/\.stderr$//;
($stdout, $stderr) = ($masterio, $masterio) if $masterio;

# Signal sent to the main thread once there is nothing left to wait for.
my $finished = Coro::Signal->new;

# Counter of the jobs that are not finished yet: one unit per listed job.
my $job_todo = Coro::Semaphore->new(scalar @job);

# Pool of free resource slots. A slot is a comma separated list of $job_np
# consecutive entries of the node file; leftover entries are not used.
my $ressources = Coro::Channel->new;
my $slot_count = int($ressource_size / $job_np);
for my $slot_index (0 .. $slot_count - 1) {
   my $first = $slot_index * $job_np;
   my $last  = $first + $job_np - 1;
   $ressources->put(join(',', @ressources[ $first .. $last ]));
   }

# Number of the job being scheduled, and the running jobs indexed by pid.
my $job_num   = 0;
my %scheduled = ();

# SIGUSR2 is counted here as a checkpoint request.
my $oar_checkpoint = Coro::Semaphore->new(0);
$SIG{USR2} = sub { $oar_checkpoint->up() };

# Scheduler thread: walk the job list in order and, for each job that is not
# already done, take a free resource slot, open a shell on its first node
# and send the commands that run the job.
async {
   # Single-quote the working directory for the remote shell, so that a path
   # holding spaces or shell metacharacters still reaches `cd` as one word.
   (my $quoted_dir = $current_dir) =~ s/'/'\\''/g;
   $quoted_dir = "'$quoted_dir'";

   for my $job (@job) {
      $job_num++;

      # Use the state replayed from the log file of a previous run.
      if (exists $state{$job_num}) {
         if ($state{$job_num} eq 'start') {
            print "warning: job $job_num was not finished, relaunching...\n"
               if $verbose;
            }
         elsif ($state{$job_num} eq 'end') {
            # Already done: count it as finished without running it.
            delete $state{$job_num};
            $job_todo->down;
            print "warning: job $job_num already done\n" if $verbose;
            cede;
            next;
            }
         }

      # Wait here until a resource slot is available.
      my $job_ressource = $ressources->get;

      # A checkpoint was requested (SIGUSR2): do not start any other job.
      last if $oar_checkpoint->count() > 0;

      # The shell is opened on the first node of the slot; the commands are
      # written to its standard input.
      my ($node_connect) = split ',', $job_ressource;
      my $fh = IO::File->new();
      my $job_pid = $fh->open("| $oarsh $node_connect >/dev/null 2>&1")
         or die "don't start subjob: $!";

      $fh->autoflush;
      $fh = unblock $fh;

      # Remember the running job so that the reaper thread can find it.
      $scheduled{$job_pid} = {
         fh           => $fh,
         node_connect => $node_connect,
         ressource    => $job_ressource,
         num          => $job_num
         };

      $log_h->printf("start job %5i at %s\n", $job_num, time) if $logfile;
      printf "start job %5i / %5i at %s on node %s\n",
         $job_num, $job_pid, time, $job_ressource
         if $verbose;

      # Output redirections, only with --switchio; empty strings otherwise
      # so that the command line never interpolates an undefined value.
      my ($job_stdout, $job_stderr) = ('', '');
      $job_stdout = ">  $stdout-$job_num.stdout" if $stdout ne '' and $switchio;
      $job_stderr = "2> $stderr-$job_num.stderr" if $stderr ne '' and $switchio;

      my $job_nodefile = "/tmp/oar-parexec-$ENV{LOGNAME}-$job_num";

      if ($job_np > 1) {
         # Parallel job: write its own node file on the node. The '\n' is
         # single-quoted, so a literal backslash-n is sent and it is the
         # remote printf that turns it into a newline.
         $fh->print("printf \""
               . join('\n', split(',', $job_ressource,))
               . "\" > $job_nodefile\n");
         $fh->print("OAR_NODE_FILE=$job_nodefile\n");
         $fh->print("OAR_NP=$job_np\n");
         $fh->print("export OAR_NODE_FILE\n");
         $fh->print("export OAR_NP\n");
         $fh->print("unset OAR_MSG_NODEFILE\n");
         }
      $fh->print("cd $quoted_dir\n");
      $fh->print("$job $job_stdout $job_stderr\n");
      $fh->print("rm -f $job_nodefile\n") if $job_np > 1;
      $fh->print("exit\n");
      cede;
      }
   # NOTE(review): there is deliberately no ';' after this block. As written,
   # the next `async {...}` expression is parsed as an argument of this call;
   # confirm the effect on thread start order before adding one.
   }

# Reaper thread: poll the running jobs without blocking, release the
# resource slot of each job that ended, and wake up the main thread when
# the work is over.
async {
   while (1) {
      for my $job_pid (keys %scheduled) {
         # Non-blocking test: 0 means that the child is still running.
         my $reaped = waitpid($job_pid, WNOHANG);
         if ($reaped) {
            my $ended = $scheduled{$job_pid};
            if ($logfile) {
               $log_h->printf("end   job %5i at %s\n", $ended->{num}, time);
               }
            if ($verbose) {
               printf "end   job %5i / %5i at %s on node %s\n",
                  $ended->{num}, $job_pid, time, $ended->{ressource};
               }
            close $ended->{fh};
            # Give the slot back, then count the job as finished.
            $ressources->put($ended->{ressource});
            $job_todo->down;
            delete $scheduled{$job_pid};
            }
         cede;
         }

      # Checkpoint requested and no job running any more: stop here.
      if ($oar_checkpoint->count > 0 and !%scheduled) {
         $finished->send;
         }

      # Every job of the list is finished.
      if ($job_todo->count == 0) {
         $finished->send;
         }
      cede;
      }
   }

# Yield to the other threads, then sleep until the reaper signals that
# everything is finished (or that a checkpoint has been completed).
cede;
$finished->wait;

# Close the progress log, if one is in use.
if ($logfile) {
   $log_h->close();
   }

__END__

=head1 NAME

oar-parexec - execute a lot of small jobs in parallel

=head1 SYNOPSIS

 oar-parexec --file filecommand [--verbose] [--jobnp integer] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
 oar-parexec --help

=head1 DESCRIPTION

C<oar-parexec> executes a lot of small jobs in parallel inside a cluster.
The number of jobs running at the same time cannot exceed the number of cores listed in the node file.
C<oar-parexec> is easier to use inside an OAR job environment,
which automatically defines these strategic parameters...

Option C<--file> is the only mandatory one.

Small jobs will be launched in the same folder as the master job.
Two environment variables are defined for each small job,
but only in the case of parallel small jobs (option C<--jobnp> > 1).

 OAR_NODE_FILE - file that list node for parallel computing
 OAR_NP        - number of processors allocated

The file defined by OAR_NODE_FILE is created in /tmp on the node before launching
the small job, and is deleted afterwards...
C<oar-parexec> is a simple script:
OAR_NODE_FILE will not be deleted if the master job crashes.

OAR defines other variables that are equivalent to OAR_NODE_FILE:
OAR_NODEFILE, OAR_FILE_NODES, OAR_RESOURCE_FILE...
If you need to, you can use these variables in your script
to access the original OAR resource file.
 

=head1 OPTIONS

=over 12

=item B<-f|--file filecommand>

Name of the file that contains the job list.

=item B<-v|--verbose>

=item B<-j|--jobnp integer>

Number of processors to allocate for each small job.
1 by default.

=item B<-n|--nodefile filenode>

Name of the file that lists all the nodes on which to launch jobs.
By default, it is defined automatically by OAR via the
environment variable C<OAR_NODE_FILE>.

For example, if you want to use 6 cores on your cluster node,
you need to put the node hostname 6 times in this file,
one per line...
It's a very common file format for MPI processes!

=item B<-m|--masterio basefileio>

The C<basefileio> will be used in place of the environment variables
C<OAR_STDOUT> and C<OAR_STDERR> (without extension) to build the base name of the small job standard output
(only used when option C<switchio> is activated).

=item B<-s|--switchio>

Each small job will have its own STDOUT and STDERR output files,
based on those of the master OAR job with C<JOB_NUM> inside
(or based on C<basefileio> if option C<masterio> is given).
Example:

 OAR.151524.stdout -> OAR.151524-JOB_NUM.stdout

where 151524 here is the master C<OAR_JOB_ID>
and C<JOB_NUM> is the small job number.

=item B<-o|--oarsh command>

Command used to launch a shell on a node.
By default

        oarsh -q -T

=item B<-h|--help>

=back


=head1 EXAMPLE

The job file (option C<--file>) can contain:

 - empty lines
 - comment lines beginning with #
 - valid shell commands

Example where F<$HOME/test/subjob1.sh> is an (executable) shell script.

 $HOME/test/subjob1.sh
 $HOME/test/subjob2.sh
 $HOME/test/subjob3.sh
 $HOME/test/subjob4.sh
 ...
 $HOME/test/subjob38.sh
 $HOME/test/subjob39.sh
 $HOME/test/subjob40.sh

These jobs could be launched by

 oarsub -n test -l /core=6,walltime=00:35:00 "oar-parexec -f ./subjob.list.txt"


=head1 SEE ALSO

oar-dispatch, mpilauncher


=head1 AUTHORS

Written by Gabriel Moreau, Grenoble - France


=head1 LICENSE AND COPYRIGHT

GPL version 2 or later and Perl equivalent

Copyright (C) 2011 Gabriel Moreau / LEGI - CNRS UMR 5519 - France