Changeset 34 for trunk/oarutils/oar-parexec
- Timestamp:
- Dec 1, 2011, 11:26:43 PM (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/oarutils/oar-parexec
r32 r34 18 18 my $file = ''; 19 19 my $verbose; 20 my $job_np = 1; 20 21 my $nodefile = $ENV{OAR_NODE_FILE} || ''; 21 22 my $masterio; … … 29 30 'help' => \$help, 30 31 'oarsh=s' => \$oarsh, 32 'jobnp=i' => \$job_np, 31 33 'nodefile=s' => \$nodefile, 32 34 'masterio=s' => \$masterio, … … 46 48 close JOB_LIST; 47 49 50 my @ressources = (); 51 open( NODE_FILE, '<', "$nodefile" ) 52 or die "can't open $nodefile: $!"; 53 while (<NODE_FILE>) { 54 chomp; 55 next if m/^#/; 56 next if m/^\s*$/; 57 push @ressources, $_ ; 58 } 59 close NODE_FILE; 60 61 my $ressource_size = scalar(@ressources); 62 die "not enought ressources jobnp $job_np > ressources $ressource_size" if not $job_np > $ressource_size; 63 64 my $current_dir = getcwd(); 65 48 66 my $stderr = $ENV{OAR_STDERR} || ''; 49 67 $stderr =~ s/\.stderr$//; … … 53 71 $stdout = $masterio if $masterio; 54 72 55 my $current_dir = getcwd();56 73 57 74 my $finished = new Coro::Signal; … … 60 77 61 78 my $ressources = new Coro::Channel; 62 open( NODE_FILE, '<', "$nodefile" ) 63 or die "can't open $nodefile: $!"; 64 while (<NODE_FILE>) { 65 chomp; 66 next if m/^#/; 67 next if m/^\s*$/; 68 $ressources->put($_); 69 } 70 close NODE_FILE; 79 for my $slot (1 .. int($ressource_size / $job_np)) { 80 $ressources->put( [ @ressources[(($slot - 1) * $job_np) .. 
(($slot * $job_np) - 1)] ] ); 81 } 82 71 83 72 84 my $job_num = 0; … … 75 87 async { 76 88 for my $job (@job) { 77 my $node = $ressources->get;89 my $nodes = $ressources->get; 78 90 79 91 $job_num++; 80 92 93 my $node_connect = $nodes->[0]; 81 94 my $fh = IO::File->new(); 82 my $job_pid = $fh->open("| $oarsh $node >/dev/null 2>&1")95 my $job_pid = $fh->open("| $oarsh $node_connect >/dev/null 2>&1") 83 96 or die "don't start subjob: $!"; 84 97 … … 86 99 $fh = unblock $fh; 87 100 88 $scheduled{$job_pid} = { fh => $fh, node => $node , num => $job_num };101 $scheduled{$job_pid} = { fh => $fh, node => $node_connect, num => $job_num }; 89 102 90 103 printf "start job %5i / %5i on node %s at %s\n", 91 $job_num, $job_pid, $node , time104 $job_num, $job_pid, $node_connect, time 92 105 if $verbose; 93 106 … … 96 109 $job_stderr = "2> $stderr-$job_num.stderr" if $stderr ne '' and $switchio; 97 110 111 my $job_nodefile = "/tmp/oar-parexec-$ENV{USERNAME}-$job_num"; 112 113 if ($job_np > 1) { 114 $fh->print("printf \".join('\n',@{$nodes}).\" > $job_nodefile\n"); 115 $fh->print("OAR_NODE_FILE=$job_nodefile\n"); 116 $fh->print("OAR_NP=$job_np\n"); 117 $fh->print("export OAR_NODE_FILE\n"); 118 $fh->print("export OAR_NP\n"); 119 $fh->print("unset OAR_MSG_NODEFILE\n"); 120 } 98 121 $fh->print("cd $current_dir\n"); 99 122 $fh->print("$job $job_stdout $job_stderr\n"); 123 $fh->print("rm -f $job_nodefile\n") if $job_np > 1; 100 124 $fh->print("exit\n"); 101 125 cede; … … 136 160 =head1 SYNOPSIS 137 161 138 oar-parexec --file filecommand [--verbose] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]162 oar-parexec --file filecommand [--verbose] [--jobnp integer] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh] 139 163 oar-parexec --help 140 164 … … 149 173 150 174 Small job will be launch in the same folder as the master job. 
175 Two environment variables are defined for each small job, 176 only in the case of parallel small jobs (option C<--jobnp> > 1). 177 178 OAR_NODE_FILE - file that lists the nodes for parallel computing 179 OAR_NP - number of processors allocated 180 181 The file defined by OAR_NODE_FILE is created in /tmp on the node before launching 182 the small job and is deleted afterwards. 183 C<oar-parexec> is a simple script: 184 OAR_NODE_FILE will not be deleted if the master job crashes. 151 185 152 186 … … 161 195 =item B<-v|--verbose> 162 196 163 =item B<-n|nodefile filenode> 197 =item B<-j|--jobnp integer> 198 199 Number of processors to allocate for each small job. 200 1 by default. 201 202 =item B<-n|--nodefile filenode> 164 203 165 204 File name that lists all the nodes on which to launch jobs.
Note: See TracChangeset
for help on using the changeset viewer.