Changeset 45
- Timestamp:
- Dec 6, 2011, 12:26:31 PM (13 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/oarutils/oar-parexec
r44 r45 17 17 18 18 my $filecmd = ''; 19 my $dir = ''; 20 my $cmd = ''; 19 21 my $logtrace = ''; 20 22 my $verbose; … … 28 30 Getopt::Long::GetOptions( 29 31 'filecmd=s' => \$filecmd, 32 'dir=s' => \$dir, 33 'cmd=s' => \$cmd, 30 34 'logtrace=s' => \$logtrace, 31 35 'verbose' => \$verbose, … … 38 42 ) || pod2usage(-verbose => 0); 39 43 pod2usage(-verbose => 2) if $help; 40 pod2usage(-verbose => 2) if not -e $filecmd; 44 pod2usage(-verbose => 2) if not ( 45 (-e "$filecmd") 46 or (-d "$dir" and $cmd ne '') 47 ); 41 48 42 49 # re-run, keep trace of job already done 43 50 my %state; 44 51 my $log_h = IO::File->new(); 45 if (-e $logtrace) {52 if (-e "$logtrace") { 46 53 $log_h->open("< $logtrace") 47 54 or die "error: can't read log file: $!"; 48 55 while (<$log_h>) { 49 $state{$1} = 'start' if m/^start\s+job\s+( \d+)\s/;50 $state{$1} = 'end' if m/^end\s+job\s+( \d+)\s/;56 $state{$1} = 'start' if m/^start\s+job\s+([^\s]+)\s/; 57 $state{$1} = 'end' if m/^end\s+job\s+([^\s]+)\s/; 51 58 } 52 59 $log_h->close(); … … 61 68 # job to run 62 69 my @job = (); 63 open(JOB_LIST, '<', "$filecmd") or die "error: can't open job file $filecmd: $!"; 64 while (<JOB_LIST>) { 65 chomp; 66 next if m/^#/; 67 next if m/^\s*$/; 68 push @job, $_; 69 } 70 close JOB_LIST; 70 if (-e "$filecmd") { 71 my $job_num = 0; 72 open(JOB_LIST, '<', "$filecmd") or die "error: can't open job file $filecmd: $!"; 73 while (<JOB_LIST>) { 74 chomp; 75 next if m/^#/; 76 next if m/^\s*$/; 77 $job_num++; 78 push @job, { name => $job_num, cmd => "$_" }; 79 } 80 close JOB_LIST; 81 } 82 else { 83 opendir(DIR, $dir) or die "error: can't open folder $dir: $!"; 84 while (my $item = readdir(DIR)) { 85 next if $item =~ m/^\./; 86 next if $item =~ m/:/; 87 next if $item =~ m/\.old$/; 88 next if $item =~ m/\.sav$/; 89 next if $item =~ m/\.bak$/; 90 next if $item =~ m/\.no$/; 91 next unless (-d "$dir/$item"); 92 push @job, { name => $item, cmd => "( cd $dir/$item/; $cmd )" }; 93 } 94 closedir DIR; 95 } 71 96 72 97 # ressources available … … 97 122 my $finished = new Coro::Signal; 98 123 my $job_todo = new Coro::Semaphore 0; 99 $job_todo->up for (@job); 124 my $job_name_maxlen; 125 for (@job) { 126 $job_todo->up; 127 $job_name_maxlen = length($_->{name}) if length($_->{name}) > $job_name_maxlen; 128 } 100 129 101 130 # slice of ressources for parallel job … … 108 137 } 109 138 110 my $job_num = 0;111 139 my %scheduled = (); 112 140 … … 125 153 JOB: 126 154 for my $job (@job) { 127 $job_num++; 155 my $job_name = $job->{name}; 156 my $job_cmd = $job->{cmd}; 128 157 129 158 # job has been already run ? 130 if (exists $state{$job_n um}) {131 if ($state{$job_n um} eq 'start') {132 print "warning: job $job_n umwas not clearly finished, relaunching...\n"159 if (exists $state{$job_name}) { 160 if ($state{$job_name} eq 'start') { 161 print "warning: job $job_name was not clearly finished, relaunching...\n" 133 162 if $verbose; 134 163 } 135 elsif ($state{$job_n um} eq 'end') {136 delete $state{$job_n um}; # free memory164 elsif ($state{$job_name} eq 'end') { 165 delete $state{$job_name}; # free memory 137 166 $job_todo->down; 138 print "warning: job $job_n umalready run\n" if $verbose;167 print "warning: job $job_name already run\n" if $verbose; 139 168 cede; 140 169 next JOB; … … 160 189 node_connect => $node_connect, 161 190 ressource => $job_ressource, 162 n um => $job_num191 name => $job_name 163 192 }; 164 193 165 my $msg = sprintf "start job % 5i/ %5i at %s on node %s\n",166 $job_n um, $job_pid, time, $job_ressource;194 my $msg = sprintf "start job %${job_name_maxlen}s / %5i at %s on node %s\n", 195 $job_name, $job_pid, time, $job_ressource; 167 196 $log_h->print($msg) if $logtrace; 168 197 print($msg) if $verbose; 169 198 170 199 my ($job_stdout, $job_stderr); 171 $job_stdout = "> $stdout-$job_n um.stdout" if $stdout ne '' and $switchio;172 $job_stderr = "2> $stderr-$job_n um.stderr" if $stderr ne '' and $switchio;173 174 my $job_nodefile = "/tmp/oar-parexec-$ENV{LOGNAME}-$job_n um";200 $job_stdout = "> $stdout-$job_name.stdout" if $stdout ne '' and $switchio; 201 $job_stderr = "2> $stderr-$job_name.stderr" if $stderr ne '' and $switchio; 202 203 my $job_nodefile = "/tmp/oar-parexec-$ENV{LOGNAME}-$job_name"; 175 204 176 205 # set job environment, run it and clean … … 186 215 } 187 216 $fh->print("cd $current_dir\n"); 188 $fh->print("$job $job_stdout $job_stderr\n");217 $fh->print("$job_cmd $job_stdout $job_stderr\n"); 189 218 $fh->print("rm -f $job_nodefile\n") if $job_np > 1; 190 219 $fh->print("exit\n"); … … 199 228 # non blocking PID test 200 229 if (waitpid($job_pid, WNOHANG)) { 201 my $msg = sprintf "end job % 5i/ %5i at %s on node %s\n",202 $scheduled{$job_pid}->{n um},230 my $msg = sprintf "end job %${job_name_maxlen}s / %5i at %s on node %s\n", 231 $scheduled{$job_pid}->{name}, 203 232 $job_pid, time, $scheduled{$job_pid}->{ressource}; 204 233 $log_h->print($msg) if $logtrace; … … 237 266 =head1 SYNOPSIS 238 267 239 oar-parexec --filecmd filecommand [--logtrace tracefile] [--verbose] [--jobnp integer] \ 240 [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh] 268 oar-parexec --filecmd filecommand [--logtrace tracefile] [--verbose] \ 269 [--jobnp integer] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh] 270 oar-parexec --dir foldertoitemize --cmd commandtolaunch [--logtrace tracefile] [--verbose] \ 271 [--jobnp integer] [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh] 241 272 oar-parexec --help 242 273 … … 249 280 However, it can be used outside OAR. 250 281 251 Option C<--filecmd> is the only mandatory one.282 Option C<--filecmd> or C<--dir> and C<--cmd> are the only mandatory parameters. 252 283 253 284 Small job will be launch in the same folder as the master job. … … 274 305 =over 12 275 306 276 =item B<-f|--filecmd 307 =item B<-f|--filecmd filecommand> 277 308 278 309 File name which content job list. 310 For the JOB_NAME definition, 311 the first valid job in the list will have the number 1 and so on... 312 313 =item B<-d|--dir foldertoitemize> 314 315 Command C<--cmd> will be launch in all sub-folder of this master folder. 316 Files in this folder will be ignored. 317 Sub-folder name which begin with '.' 318 or finish with '.old', '.sav', '.bak', '.no' will either be ignored... 319 320 The JOB_NAME is simply the Sub-folder name. 321 322 =item B<-c|--cmd commandtolaunch> 323 324 Command (and argument to it) tha will be launch in all sub-folder 325 parameter folfer C<--dir> 279 326 280 327 =item B<-l|--logtrace tracefile> … … 284 331 only job that are not mark as done will be run again. 285 332 Be careful, job mark as running (start but not finish) will be run again. 333 Tracing is base on the JOB_NAME between multiple run. 286 334 287 335 This option is very usefull in case of crash … … 315 363 316 364 Each small job will have it's own output STDOUT and STDERR 317 base on master OAR job with C<JOB_N UM> inside365 base on master OAR job with C<JOB_NAME> inside 318 366 (or base on C<basefileio> if option C<masterio>). 319 367 Example : 320 368 321 OAR.151524.stdout -> OAR.151524-JOB_N UM.stdout369 OAR.151524.stdout -> OAR.151524-JOB_NAME.stdout 322 370 323 371 where 151524 here is the master C<OAR_JOB_ID> 324 and C<JOB_N UM> is the small job nnumber.372 and C<JOB_NAME> is the small job name. 325 373 326 374 =item B<-o|-oarsh command>
Note: See TracChangeset
for help on using the changeset viewer.