Changeset 34


Ignore:
Timestamp:
Dec 1, 2011, 11:26:43 PM (9 years ago)
Author:
g7moreau
Message:
  • Parallel sub-job !
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/oarutils/oar-parexec

    r32 r34  
    1818my $file = '';
    1919my $verbose;
     20my $job_np = 1;
    2021my $nodefile = $ENV{OAR_NODE_FILE} || '';
    2122my $masterio;
     
    2930   'help'       => \$help,
    3031   'oarsh=s'    => \$oarsh,
     32   'jobnp=i'    => \$job_np,
    3133   'nodefile=s' => \$nodefile,
    3234   'masterio=s' => \$masterio,
     
    4648close JOB_LIST;
    4749
     50my @ressources = ();
     51open( NODE_FILE, '<', "$nodefile" )
     52   or die "can't open $nodefile: $!";
     53while (<NODE_FILE>) {
     54   chomp;
     55   next if m/^#/;
     56   next if m/^\s*$/;
     57   push @ressources, $_ ;
     58   }
     59close NODE_FILE;
     60
     61my $ressource_size = scalar(@ressources);
     62die "not enought ressources jobnp $job_np > ressources $ressource_size" if not $job_np > $ressource_size;
     63
     64my $current_dir = getcwd();
     65
    4866my $stderr = $ENV{OAR_STDERR} || '';
    4967$stderr =~ s/\.stderr$//;
     
    5371$stdout = $masterio if $masterio;
    5472
    55 my $current_dir = getcwd();
    5673
    5774my $finished = new Coro::Signal;
     
    6077
    6178my $ressources = new Coro::Channel;
    62 open( NODE_FILE, '<', "$nodefile" )
    63    or die "can't open $nodefile: $!";
    64 while (<NODE_FILE>) {
    65    chomp;
    66    next if m/^#/;
    67    next if m/^\s*$/;
    68    $ressources->put($_);
    69    }
    70 close NODE_FILE;
     79for my $slot (1 .. int($ressource_size / $job_np)) {
     80   $ressources->put( [ @ressources[(($slot - 1) * $job_np) .. (($slot * $job_np) - 1)] ] );
     81   }
     82
    7183
    7284my $job_num   = 0;
     
    7587async {
    7688   for my $job (@job) {
    77       my $node = $ressources->get;
     89      my $nodes = $ressources->get;
    7890
    7991      $job_num++;
    8092
     93      my $node_connect = $nodes->[0];
    8194      my $fh      = IO::File->new();
    82       my $job_pid = $fh->open("| $oarsh $node >/dev/null 2>&1")
     95      my $job_pid = $fh->open("| $oarsh $node_connect >/dev/null 2>&1")
    8396         or die "don't start subjob: $!";
    8497
     
    8699      $fh = unblock $fh;
    87100
    88       $scheduled{$job_pid} = { fh => $fh, node => $node, num => $job_num };
     101      $scheduled{$job_pid} = { fh => $fh, node => $node_connect, num => $job_num };
    89102
    90103      printf "start job %5i / %5i on node %s at %s\n",
    91          $job_num, $job_pid, $node, time
     104         $job_num, $job_pid, $node_connect, time
    92105         if $verbose;
    93106
     
    96109      $job_stderr = "2> $stderr-$job_num.stderr" if $stderr ne '' and $switchio;
    97110
     111      my $job_nodefile = "/tmp/oar-parexec-$ENV{USERNAME}-$job_num";
     112
     113      if ($job_np > 1) {
     114         $fh->print("printf \".join('\n',@{$nodes}).\" > $job_nodefile\n");
     115         $fh->print("OAR_NODE_FILE=$job_nodefile\n");
     116         $fh->print("OAR_NP=$job_np\n");
     117         $fh->print("export OAR_NODE_FILE\n");
     118         $fh->print("export OAR_NP\n");
     119         $fh->print("unset OAR_MSG_NODEFILE\n");
     120         }
    98121      $fh->print("cd $current_dir\n");
    99122      $fh->print("$job $job_stdout $job_stderr\n");
     123      $fh->print("rm -f $job_nodefile\n") if $job_np > 1;
    100124      $fh->print("exit\n");
    101125      cede;
     
    136160=head1 SYNOPSIS
    137161
    138  oar-parexec --file filecommand [--verbose] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
     162 oar-parexec --file filecommand [--verbose] [--jobnp integer] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
    139163 oar-parexec --help
    140164
     
    149173
    150174Small jobs will be launched in the same folder as the master job.
     175Two environment variables are defined for each small job,
     176but only for parallel small jobs (option C<--jobnp> > 1).
     177
     178 OAR_NODE_FILE - file that lists the nodes for parallel computing
     179 OAR_NP        - number of processors allocated
     180
     181The file defined by OAR_NODE_FILE is created in /tmp on the node before
     182launching the small job and will be deleted afterwards.
     183Because C<oar-parexec> is a simple script, the file defined by
     184OAR_NODE_FILE will not be deleted if the master job crashes.
    151185
    152186
     
    161195=item B<-v|--verbose>
    162196
    163 =item B<-n|nodefile filenode>
     197=item B<-j|--jobnp integer>
     198
     199Number of processors to allocate for each small job.
     2001 by default.
     201
     202=item B<-n|--nodefile filenode>
    164203
    165204File name that lists all the nodes on which to launch jobs.
Note: See TracChangeset for help on using the changeset viewer.