Changeset 43


Ignore:
Timestamp:
Dec 5, 2011, 7:05:00 PM (12 years ago)
Author:
g7moreau
Message:
  • --file -> --filecmd
  • --logfile -> --logtrace
  • Comment code
  • Begin man for option --logtrace
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/oarutils/oar-parexec

    r42 r43  
    1616use Cwd qw( getcwd );
    1717
    18 my $file    = '';
    19 my $logfile = '';
     18my $filecmd  = '';
     19my $logtrace = '';
    2020my $verbose;
    2121my $job_np = 1;
     
    2727
    2828Getopt::Long::GetOptions(
    29    'file=s'     => \$file,
    30    'logfile=s'  => \$logfile,
     29   'filecmd=s'  => \$filecmd,
     30   'logtrace=s' => \$logtrace,
    3131   'verbose'    => \$verbose,
    3232   'help'       => \$help,
     
    3838   ) || pod2usage(-verbose => 0);
    3939pod2usage(-verbose => 2) if $help;
    40 pod2usage(-verbose => 2) if not -e $file;
    41 
     40pod2usage(-verbose => 2) if not -e $filecmd;
     41
     42# re-run, keep trace of job already done
    4243my %state;
    4344my $log_h = IO::File->new();
    44 if (-e $logfile) {
    45    $log_h->open("< $logfile")
    46       or die "can't read log file: $!";
     45if (-e $logtrace) {
     46   $log_h->open("< $logtrace")
     47      or die "error: can't read log file: $!";
    4748   while (<$log_h>) {
    4849      $state{$1} = 'start' if m/^start\s+job\s+(\d+)\s/;
     
    5152   $log_h->close();
    5253   }
    53 if ($logfile) {
    54    $log_h->open(">> $logfile")
    55       or die "can't append log file $logfile: $!";
     54if ($logtrace) {
     55   $log_h->open(">> $logtrace")
     56      or die "error: can't append log file $logtrace: $!";
    5657   $log_h->autoflush;
    5758   $log_h = unblock $log_h;
    5859   }
    5960
     61# job to run
    6062my @job = ();
    61 open(JOB_LIST, '<', "$file") or die "can't open job file $file: $!";
     63open(JOB_LIST, '<', "$filecmd") or die "error: can't open job file $filecmd: $!";
    6264while (<JOB_LIST>) {
    6365   chomp;
     
    6870close JOB_LIST;
    6971
     72# ressources available
    7073my @ressources = ();
    7174open(NODE_FILE, '<', "$nodefile")
     
    8083
    8184my $ressource_size = scalar(@ressources);
    82 die "not enought ressources jobnp $job_np > ressources $ressource_size"
     85die "error: not enought ressources jobnp $job_np > ressources $ressource_size"
    8386   if $job_np > $ressource_size;
    8487
     
    9699$job_todo->up for (@job);
    97100
     101# slice of ressources for parallel job
    98102my $ressources = new Coro::Channel;
    99103for my $slot (1 .. int($ressource_size / $job_np)) {
     
    107111my %scheduled = ();
    108112
     113# OAR checkpoint and default signal SIGUSR2
    109114my $oar_checkpoint = new Coro::Semaphore 0;
    110115$SIG{USR2} = sub {
     
    116121   };
    117122
     123# asynchronous start job block
    118124async {
     125        JOB:
    119126   for my $job (@job) {
    120127      $job_num++;
    121128
     129      # job has been already run ?
    122130      if (exists $state{$job_num}) {
    123131         if ($state{$job_num} eq 'start') {
    124             print "warning: job $job_num was not finished, relaunching...\n"
     132            print "warning: job $job_num was not clearly finished, relaunching...\n"
    125133               if $verbose;
    126134            }
    127135         elsif ($state{$job_num} eq 'end') {
    128             delete $state{$job_num};
     136            delete $state{$job_num}; # free memory
    129137            $job_todo->down;
    130             print "warning: job $job_num already done\n" if $verbose;
     138            print "warning: job $job_num already run\n" if $verbose;
    131139            cede;
    132             next;
     140            next JOB;
    133141            }
    134142         }
    135143
     144      # take job ressource
    136145      my $job_ressource = $ressources->get;
    137146
    138       last if $oar_checkpoint->count() > 0;
     147      # no more launch job when OAR checkpointing
     148      last JOB if $oar_checkpoint->count() > 0;
    139149
    140150      my ($node_connect) = split ',', $job_ressource;
    141151      my $fh = IO::File->new();
    142152      my $job_pid = $fh->open("| $oarsh $node_connect >/dev/null 2>&1")
    143          or die "don't start subjob: $!";
     153         or die "error: can't start subjob: $!";
    144154
    145155      $fh->autoflush;
     
    155165      my $msg = sprintf "start job %5i / %5i at %s on node %s\n",
    156166         $job_num, $job_pid, time, $job_ressource;
    157       $log_h->print($msg) if $logfile;
     167      $log_h->print($msg) if $logtrace;
    158168      print($msg) if $verbose;
    159169
     
    164174      my $job_nodefile = "/tmp/oar-parexec-$ENV{LOGNAME}-$job_num";
    165175
     176     # set job environment, run it and clean
    166177      if ($job_np > 1) {
    167178         $fh->print("printf \""
     
    182193   }
    183194
     195# asynchronous end job block
    184196async {
    185197   while () {
    186198      for my $job_pid (keys %scheduled) {
     199                        # non blocking PID test
    187200         if (waitpid($job_pid, WNOHANG)) {
    188201            my $msg = sprintf "end   job %5i / %5i at %s on node %s\n",
    189202               $scheduled{$job_pid}->{num},
    190203               $job_pid, time, $scheduled{$job_pid}->{ressource};
    191             $log_h->print($msg) if $logfile;
     204            $log_h->print($msg) if $logtrace;
    192205            print($msg) if $verbose;
    193206            close $scheduled{$job_pid}->{fh};
     207            # leave ressources for another job
    194208            $ressources->put($scheduled{$job_pid}->{ressource});
    195209            $job_todo->down;
     
    199213         }
    200214
     215      # checkpointing ! just finishing running job and quit
    201216      $finished->send if $oar_checkpoint->count() > 0 and scalar(keys(%scheduled)) == 0;
    202217
     
    208223cede;
    209224
     225# all job have been done
    210226$finished->wait;
    211227
    212 $log_h->close() if $logfile;
     228# close log trace file
     229$log_h->close() if $logtrace;
    213230
    214231__END__
     
    220237=head1 SYNOPSIS
    221238
    222  oar-parexec --file filecommand [--verbose] [--jobnp integer] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
     239 oar-parexec --filecmd filecommand [--logtrace tracefile] [--verbose] [--jobnp integer] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
    223240 oar-parexec --help
    224241
     
    230247which automatically define these strategic parameters...
    231248
    232 Option C<--file> is the only mandatory one.
     249Option C<--filecmd> is the only mandatory one.
    233250
    234251Small jobs will be launched in the same folder as the master job.
     
    254271=over 12
    255272
    256 =item B<-f|--file       filecommand>
     273=item B<-f|--filecmd    filecommand>
    257274
    258275Name of the file which contains the job list.
     276
     277=item B<-l|--logtrace tracefile>
     278
     279File used to log and trace running jobs.
     280When the same command is run again (after a crash, for example),
     281only jobs that are not marked as done will be run again.
     282Be careful: jobs marked as running (started but not finished) will be run again.
     283
     284This option is very useful in case of a crash,
     285but also for checkpointing and idempotent OAR jobs.
    259286
    260287=item B<-v|--verbose>
     
    308335=head1 EXAMPLE
    309336
    310 Content for the job file (option C<--file>) could have:
     337The job command file (option C<--filecmd>) may contain:
    311338
    312339 - empty line
Note: See TracChangeset for help on using the changeset viewer.