Changeset 94 for trunk/oarutils


Ignore:
Timestamp:
Jan 24, 2013, 7:17:46 PM (11 years ago)
Author:
g7moreau
Message:
  • Add checkpointing and signal API. Not tested yet.
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/oarutils/oar-dispatch

    r93 r94  
    1111use Coro::Semaphore;
    1212use Coro::Timer qw(sleep);
     13use Coro::Handle;
     14use IO::File;
    1315
    1416my $task = 0;
    1517my $overload = 1.1;
    1618my $file = '';
     19my $logtrace;
    1720my $verbose;
    1821my $help;
     22my $sig_transmit;
     23my $sig_checkpoint = 'USR2';
    1924
    2025Getopt::Long::GetOptions(
     
    2227   'overload=f'   => \$overload,
    2328   'file=s'       => \$file,
     29   'logtrace=s'   => \$logtrace,
    2430   'verbose'      => \$verbose,
    2531   'help'         => \$help,
     32   'transmit'     => \$sig_transmit,
     33   'kill=s'       => \$sig_checkpoint,
    2634   ) || pod2usage(-verbose => 0);
    2735pod2usage(-verbose => 2) if $help;
     
    3139   $task++ while <NODE_FILE>;
    3240   close NODE_FILE;
     41   }
     42
     43# re-run, keep trace of job already done
     44my %state;
     45my $log_h = IO::File->new();
     46if (-e "$logtrace") {
     47   $log_h->open("< $logtrace")
     48      or die "error: can't read log file: $!";
     49   while (<$log_h>) {
     50      $state{$1} = 'start' if m/^start\s+job\s+([^\s]+)\s/;
     51      $state{$1} = 'end'   if m/^end\s+job\s+([^\s]+)\s/;
     52      }
     53   $log_h->close();
     54   }
     55if ($logtrace) {
     56   $log_h->open(">> $logtrace")
     57      or die "error: can't append log file $logtrace: $!";
     58   $log_h->autoflush;
     59   $log_h = unblock $log_h;
    3360   }
    3461
     
    6087my %scheduled = ();
    6188
     89# OAR checkpoint and default signal SIGUSR2
     90my $oar_checkpoint = new Coro::Semaphore 0;
     91my $notify         = new Coro::Signal;
     92$SIG{$sig_checkpoint} = sub {
     93   print "warning: receive checkpoint at "
     94      . time
     95      . ", no new job, just finishing running job\n"
     96      if $verbose;
     97   $oar_checkpoint->up();
     98   $notify->send if $sig_transmit;
     99   };
     100
     101# asynchrone notify job
     102async {
     103   while () {
     104      $notify->wait;
     105
     106      for my $job_pid (keys %scheduled) {
     107         system "oardel --checkpoint --signal $sig_checkpoint $job_pid";
     108         cede;
     109         }
     110      }
     111   }
     112
    62113# asynchronous job-start block
    63114async {
     
    67118         cede;
    68119         }
     120
     121      # do not launch any more jobs once an OAR checkpoint has been received
     122      last JOB if $oar_checkpoint->count() > 0;
     123
    69124      $job =~ s/^\s*oarsub//;
    70125      print "oarsub $insert_oar_option $job" if $verbose;
     
    107162#         }
    108163
     164      # checkpointing: just let the running jobs finish, then quit
     165      $finished->send if $oar_checkpoint->count() > 0 and scalar(keys(%scheduled)) == 0;
     166
    109167      $finished->send if $job_todo->count == 0;
    110168      cede;
     
    116174# all jobs are done
    117175$finished->wait;
     176
     177# close log trace file
     178$log_h->close() if $logtrace;
    118179
    119180
Note: See TracChangeset for help on using the changeset viewer.