Changeset 45


Ignore:
Timestamp:
Dec 6, 2011, 12:26:31 PM (9 years ago)
Author:
g7moreau
Message:
  • Folder list job
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/oarutils/oar-parexec

    r44 r45  
    1717
    1818my $filecmd  = '';
     19my $dir      = '';
     20my $cmd      = '';
    1921my $logtrace = '';
    2022my $verbose;
     
    2830Getopt::Long::GetOptions(
    2931   'filecmd=s'  => \$filecmd,
     32   'dir=s'      => \$dir,
     33   'cmd=s'      => \$cmd,
    3034   'logtrace=s' => \$logtrace,
    3135   'verbose'    => \$verbose,
     
    3842   ) || pod2usage(-verbose => 0);
    3943pod2usage(-verbose => 2) if $help;
    40 pod2usage(-verbose => 2) if not -e $filecmd;
     44pod2usage(-verbose => 2) if not (
     45 (-e "$filecmd")
     46 or (-d "$dir" and $cmd ne '')
     47 );
    4148
    4249# re-run, keep trace of job already done
    4350my %state;
    4451my $log_h = IO::File->new();
    45 if (-e $logtrace) {
     52if (-e "$logtrace") {
    4653   $log_h->open("< $logtrace")
    4754      or die "error: can't read log file: $!";
    4855   while (<$log_h>) {
    49       $state{$1} = 'start' if m/^start\s+job\s+(\d+)\s/;
    50       $state{$1} = 'end'   if m/^end\s+job\s+(\d+)\s/;
     56      $state{$1} = 'start' if m/^start\s+job\s+([^\s]+)\s/;
     57      $state{$1} = 'end'   if m/^end\s+job\s+([^\s]+)\s/;
    5158      }
    5259   $log_h->close();
     
    6168# job to run
    6269my @job = ();
    63 open(JOB_LIST, '<', "$filecmd") or die "error: can't open job file $filecmd: $!";
    64 while (<JOB_LIST>) {
    65    chomp;
    66    next if m/^#/;
    67    next if m/^\s*$/;
    68    push @job, $_;
    69    }
    70 close JOB_LIST;
     70if (-e "$filecmd") {
     71   my $job_num = 0;
     72   open(JOB_LIST, '<', "$filecmd") or die "error: can't open job file $filecmd: $!";
     73   while (<JOB_LIST>) {
     74      chomp;
     75      next if m/^#/;
     76      next if m/^\s*$/;
     77      $job_num++;
     78      push @job, { name => $job_num, cmd => "$_" };
     79      }
     80   close JOB_LIST;
     81   }
     82else {
     83   opendir(DIR, $dir) or die "error: can't open folder $dir: $!";
     84   while (my $item = readdir(DIR)) {
     85      next if $item =~ m/^\./;
     86      next if $item =~ m/:/;
     87      next if $item =~ m/\.old$/;
     88      next if $item =~ m/\.sav$/;
     89      next if $item =~ m/\.bak$/;
     90      next if $item =~ m/\.no$/;
     91      next unless (-d "$dir/$item");
     92      push @job, { name => $item, cmd => "( cd $dir/$item/; $cmd )" };
     93      }
     94   closedir DIR;
     95   }
    7196
    7297# ressources available
     
    97122my $finished = new Coro::Signal;
    98123my $job_todo = new Coro::Semaphore 0;
    99 $job_todo->up for (@job);
     124my $job_name_maxlen;
     125for (@job) {
     126   $job_todo->up;
     127   $job_name_maxlen = length($_->{name}) if length($_->{name}) > $job_name_maxlen;
     128   }
    100129
    101130# slice of ressources for parallel job
     
    108137   }
    109138
    110 my $job_num   = 0;
    111139my %scheduled = ();
    112140
     
    125153        JOB:
    126154   for my $job (@job) {
    127       $job_num++;
     155      my $job_name = $job->{name};
     156      my $job_cmd  = $job->{cmd};
    128157
    129158      # job has been already run ?
    130       if (exists $state{$job_num}) {
    131          if ($state{$job_num} eq 'start') {
    132             print "warning: job $job_num was not clearly finished, relaunching...\n"
     159      if (exists $state{$job_name}) {
     160         if ($state{$job_name} eq 'start') {
     161            print "warning: job $job_name was not clearly finished, relaunching...\n"
    133162               if $verbose;
    134163            }
    135          elsif ($state{$job_num} eq 'end') {
    136             delete $state{$job_num}; # free memory
     164         elsif ($state{$job_name} eq 'end') {
     165            delete $state{$job_name}; # free memory
    137166            $job_todo->down;
    138             print "warning: job $job_num already run\n" if $verbose;
     167            print "warning: job $job_name already run\n" if $verbose;
    139168            cede;
    140169            next JOB;
     
    160189         node_connect => $node_connect,
    161190         ressource    => $job_ressource,
    162          num          => $job_num
     191         name         => $job_name
    163192         };
    164193
    165       my $msg = sprintf "start job %5i / %5i at %s on node %s\n",
    166          $job_num, $job_pid, time, $job_ressource;
     194      my $msg = sprintf "start job %${job_name_maxlen}s / %5i at %s on node %s\n",
     195         $job_name, $job_pid, time, $job_ressource;
    167196      $log_h->print($msg) if $logtrace;
    168197      print($msg) if $verbose;
    169198
    170199      my ($job_stdout, $job_stderr);
    171       $job_stdout = ">  $stdout-$job_num.stdout" if $stdout ne '' and $switchio;
    172       $job_stderr = "2> $stderr-$job_num.stderr" if $stderr ne '' and $switchio;
    173 
    174       my $job_nodefile = "/tmp/oar-parexec-$ENV{LOGNAME}-$job_num";
     200      $job_stdout = ">  $stdout-$job_name.stdout" if $stdout ne '' and $switchio;
     201      $job_stderr = "2> $stderr-$job_name.stderr" if $stderr ne '' and $switchio;
     202
     203      my $job_nodefile = "/tmp/oar-parexec-$ENV{LOGNAME}-$job_name";
    175204
    176205     # set job environment, run it and clean
     
    186215         }
    187216      $fh->print("cd $current_dir\n");
    188       $fh->print("$job $job_stdout $job_stderr\n");
     217      $fh->print("$job_cmd $job_stdout $job_stderr\n");
    189218      $fh->print("rm -f $job_nodefile\n") if $job_np > 1;
    190219      $fh->print("exit\n");
     
    199228                        # non blocking PID test
    200229         if (waitpid($job_pid, WNOHANG)) {
    201             my $msg = sprintf "end   job %5i / %5i at %s on node %s\n",
    202                $scheduled{$job_pid}->{num},
     230            my $msg = sprintf "end   job %${job_name_maxlen}s / %5i at %s on node %s\n",
     231               $scheduled{$job_pid}->{name},
    203232               $job_pid, time, $scheduled{$job_pid}->{ressource};
    204233            $log_h->print($msg) if $logtrace;
     
    237266=head1 SYNOPSIS
    238267
    239  oar-parexec --filecmd filecommand [--logtrace tracefile] [--verbose] [--jobnp integer] \
    240             [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
     268 oar-parexec --filecmd filecommand [--logtrace tracefile] [--verbose] \
     269            [--jobnp integer] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
     270 oar-parexec --dir foldertoitemize --cmd commandtolaunch [--logtrace tracefile] [--verbose] \
     271            [--jobnp integer] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
    241272 oar-parexec --help
    242273
     
    249280However, it can be used outside OAR.
    250281
    251 Option C<--filecmd> is the only mandatory one.
     282Either option C<--filecmd>, or both options C<--dir> and C<--cmd>, must be given; all other options are optional.
    252283
    253284Small jobs will be launched in the same folder as the master job.
     
    274305=over 12
    275306
    276 =item B<-f|--filecmd    filecommand>
     307=item B<-f|--filecmd filecommand>
    277308
    278309Name of the file which contains the job list.
     310For the JOB_NAME definition,
     311the first valid job in the list will have the number 1 and so on...
     312
     313=item B<-d|--dir foldertoitemize>
     314
     315Command C<--cmd> will be launched in every sub-folder of this master folder.
     316Files in this folder will be ignored.
     317Sub-folders whose name begins with '.', contains ':',
     318or ends with '.old', '.sav', '.bak' or '.no' will also be ignored.
     319
     320The JOB_NAME is simply the Sub-folder name.
     321
     322=item B<-c|--cmd commandtolaunch>
     323
     324Command (and its arguments) that will be launched in every sub-folder
     325of the folder given by C<--dir>.
    279326
    280327=item B<-l|--logtrace tracefile>
     
    284331only jobs that are not marked as done will be run again.
    285332Be careful, jobs marked as running (started but not finished) will be run again.
     333Tracing between multiple runs is based on the JOB_NAME.
    286334
    287335This option is very useful in case of crash
     
    315363
    316364Each small job will have its own output STDOUT and STDERR
    317 base on master OAR job with C<JOB_NUM> inside
     365based on master OAR job with C<JOB_NAME> inside
    318366(or based on C<basefileio> if option C<masterio> is used).
    319367Example :
    320368
    321  OAR.151524.stdout -> OAR.151524-JOB_NUM.stdout
     369 OAR.151524.stdout -> OAR.151524-JOB_NAME.stdout
    322370
    323371where 151524 here is the master C<OAR_JOB_ID>
    324 and C<JOB_NUM> is the small job nnumber.
     372and C<JOB_NAME> is the small job name.
    325373
    326374=item B<-o|-oarsh command>
Note: See TracChangeset for help on using the changeset viewer.