source: trunk/oarutils/oar-parexec @ 32

Last change on this file since 32 was 32, checked in by g7moreau, 12 years ago
  • Add many option and now could be use outside OAR environment!
File size: 5.4 KB
Line 
1#!/usr/bin/perl
2#
3# 2011/11/27 gabriel
4
5use strict;
6
7use Getopt::Long();
8use Pod::Usage;
9use Coro;
10use Coro::Semaphore;
11use Coro::Signal;
12use Coro::Channel;
13use Coro::Handle;
14use IO::File;
15use POSIX qw( WNOHANG WEXITSTATUS );
16use Cwd qw( getcwd );
17
# Command-line state. Defaults come from the environment when available;
# every value can be overridden by the matching option below.
my $file = '';                               # --file: job list (mandatory)
my $verbose;                                 # --verbose: trace job start/end
my $nodefile = $ENV{OAR_NODE_FILE} || '';    # --nodefile: list of nodes
my $masterio;                                # --masterio: base name of per-job output files
my $switchio;                                # --switchio: give each job its own output files
my $help;                                    # --help: show the manual
my $oarsh = 'oarsh -q -T';                   # --oarsh: command opening a shell on a node

# An unknown or malformed option prints the short usage and exits.
Getopt::Long::GetOptions(
   'file=s'     => \$file,
   'verbose'    => \$verbose,
   'help'       => \$help,
   'oarsh=s'    => \$oarsh,
   'nodefile=s' => \$nodefile,
   'masterio=s' => \$masterio,
   'switchio'   => \$switchio,
   ) || pod2usage( -verbose => 0 );

# Asking for help is not an error: show the manual and exit with status 0
# (without -exitval, pod2usage exits with 1 at this verbosity).
pod2usage( -verbose => 2, -exitval => 0 ) if $help;

# The job file is mandatory and must exist; say why we stop before
# showing the manual.
if ( not -e $file ) {
   my $reason = $file eq ''
      ? 'option --file is mandatory'
      : "job file '$file' does not exist";
   pod2usage( -message => "oar-parexec: $reason", -verbose => 2 );
   }
37
# Load the job list: one shell command per line.
# Lines starting with '#' and blank lines are skipped.
my @job = ();
open( my $job_list_fh, '<', $file ) or die "can't open $file: $!";
while ( my $line = <$job_list_fh> ) {
   chomp $line;
   next if $line =~ m/^#/;
   next if $line =~ m/^\s*$/;
   push @job, $line;
   }
close $job_list_fh;
47
# Base names used to build the per-job output file names.
# --masterio, when given, replaces both; otherwise they are derived from
# OAR_STDOUT / OAR_STDERR with the trailing extension removed
# (empty string when the variable is not set).
my ( $stdout, $stderr ) = ( $ENV{OAR_STDOUT} || '', $ENV{OAR_STDERR} || '' );
if ($masterio) {
   $stdout = $masterio;
   $stderr = $masterio;
   }
else {
   $stdout =~ s/\.stdout$//;
   $stderr =~ s/\.stderr$//;
   }
54
# Directory the jobs will be started from (sent as a "cd" to each node).
my $current_dir = getcwd();
defined $current_dir or die "can't determine current directory: $!";

# $finished is signalled once every job has ended.
# $job_todo starts at the number of jobs and is decremented each time a
# job ends, so a count of 0 means nothing is left to do.
my $finished = Coro::Signal->new;
my $job_todo = Coro::Semaphore->new( scalar @job );
60
# Pool of free execution slots: every non-comment, non-blank line of the
# node file is one slot. A job takes a slot from the channel to start and
# the slot is put back when the job ends.
my $ressources = Coro::Channel->new;
open( my $node_fh, '<', $nodefile )
   or die "can't open $nodefile: $!";
my $node_count = 0;
while ( my $node = <$node_fh> ) {
   chomp $node;
   next if $node =~ m/^#/;
   next if $node =~ m/^\s*$/;
   $ressources->put($node);
   $node_count++;
   }
close $node_fh;

# With jobs to run and no slot at all, nothing could ever start:
# stop now instead of waiting forever for a slot.
die "no node found in $nodefile\n" if @job and $node_count == 0;
71
# Scheduler state shared by the two coroutines below.
my $job_num   = 0;     # number of jobs started so far (job numbers are 1-based)
my %scheduled = ();    # running jobs: shell pid => { fh, node, num }

# Launcher: for each job, take a free slot, open a shell on that node
# through a pipe and feed it the commands to run.
# The ';' closing each async statement matters: async takes a block
# followed by an argument list, so without it the next statement would be
# parsed as its arguments.
async {
   # Single-quote the working directory for the remote shell so that
   # spaces and shell metacharacters in the path are taken literally.
   ( my $quoted_dir = $current_dir ) =~ s/'/'\\''/g;

   for my $job (@job) {
      my $node = $ressources->get;    # waits here until a slot is free

      $job_num++;

      my $fh      = IO::File->new();
      my $job_pid = $fh->open("| $oarsh $node >/dev/null 2>&1")
         or die "don't start subjob: $!";

      $fh->autoflush;
      $fh = unblock $fh;

      $scheduled{$job_pid} = { fh => $fh, node => $node, num => $job_num };

      printf "start job %5i / %5i on node %s at %s\n",
         $job_num, $job_pid, $node, time
         if $verbose;

      # Per-job redirections, only with --switchio and a known base name;
      # otherwise they stay empty and add nothing to the command line.
      my ( $job_stdout, $job_stderr ) = ( '', '' );
      if ($switchio) {
         $job_stdout = ">  $stdout-$job_num.stdout" if $stdout ne '';
         $job_stderr = "2> $stderr-$job_num.stderr" if $stderr ne '';
         }

      $fh->print("cd '$quoted_dir'\n");
      $fh->print("$job $job_stdout $job_stderr\n");
      $fh->print("exit\n");
      cede;
      }
   };

# Waiter: poll every running shell; when one has ended, release its slot
# and count the job as done.
# NOTE(review): this loop polls continuously (cede only, no sleep), so it
# keeps a CPU busy while jobs are running.
async {
   while () {
      for my $job_pid ( keys %scheduled ) {

         # waitpid with WNOHANG returns 0 while the child is still running;
         # any other value (the pid, or -1) is treated as "job ended".
         if ( waitpid( $job_pid, WNOHANG ) ) {
            my $ended = $scheduled{$job_pid};
            printf "end   job %5i / %5i on node %s at %s\n",
               $ended->{num},
               $job_pid, $ended->{node}, time
               if $verbose;
            close $ended->{fh};
            $ressources->put( $ended->{node} );
            $job_todo->down;
            delete $scheduled{$job_pid};
            }
         cede;
         }

      $finished->send if $job_todo->count == 0;
      cede;
      }
   };

# Let the coroutines start, then block until the waiter reports that
# every job has ended.
cede;

$finished->wait;
129
130__END__
131
132=head1 NAME
133
oar-parexec - execute lots of small jobs in parallel
135
136=head1 SYNOPSIS
137
138 oar-parexec --file filecommand [--verbose]  [--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
139 oar-parexec --help
140
141=head1 DESCRIPTION
142
C<oar-parexec> executes lots of small jobs in parallel inside a cluster.
The number of jobs running at one time cannot exceed the number of cores listed in the node file.
C<oar-parexec> is easier to use inside an OAR job environment,
which automatically defines these strategic parameters...
147
148Option C<--file> is the only mandatory one.
149
Small jobs will be launched in the same folder as the master job.
151
152
153=head1 OPTIONS
154
155=over 12
156
157=item B<-f|--file       filecommand>
158
Name of the file which contains the job list.
160
161=item B<-v|--verbose>
162
=item B<-n|--nodefile filenode>
164
Name of the file that lists all the nodes on which jobs are launched.
By default, it is defined automatically by OAR via the
environment variable C<OAR_NODE_FILE>.
168
For example, if you want to use 6 cores on your cluster node,
you need to put the hostname of the node 6 times in this file,
one per line...
It's a very common file for MPI processes!
173
174=item B<-m|--masterio basefileio>
175
The C<basefileio> will be used in place of the environment variables
C<OAR_STDOUT> and C<OAR_STDERR> (without extension) to build the base name of the small jobs' standard output
(only when option C<switchio> is activated).
179
180=item B<-s|--switchio>
181
Each small job will have its own STDOUT and STDERR output files,
based on those of the master OAR job with C<JOB_NUM> inside
(or based on C<basefileio> if option C<masterio> is used).
185Example :
186
187 OAR.151524.stdout -> OAR.151524-JOB_NUM.stdout
188
where 151524 here is the master C<OAR_JOB_ID>
and C<JOB_NUM> is the small job number.
191
=item B<-o|--oarsh command>
193
Command used to launch a shell on a node.
195By default
196
197        oarsh -q -T
198
199=item B<-h|--help>
200
201=back
202
203
204=head1 EXAMPLE
205
The job file (option C<--file>) can contain:
207
 - empty lines
 - comment lines beginning with #
 - valid shell commands
211
212Example where F<$HOME/test/subjob1.sh> is a shell script (executable).
213
214 $HOME/test/subjob1.sh
215 $HOME/test/subjob2.sh
216 $HOME/test/subjob3.sh
217 $HOME/test/subjob4.sh
218 ...
219 $HOME/test/subjob38.sh
220 $HOME/test/subjob39.sh
221 $HOME/test/subjob40.sh
222
These jobs can be launched with
224
225 oarsub -n test -l /core=6,walltime=00:35:00 "oar-parexec -f ./subjob.list.txt"
226
227
228=head1 SEE ALSO
229
230oar-dispatch, mpilauncher
231
232
233=head1 AUTHORS
234
235Written by Gabriel Moreau, Grenoble - France
236
237
238=head1 LICENSE AND COPYRIGHT
239
240GPL version 2 or later and Perl equivalent
241
242Copyright (C) 2011 Gabriel Moreau / LEGI - CNRS UMR 5519 - France
243
Note: See TracBrowser for help on using the repository browser.