source: trunk/oar/oar-parexec @ 28

Last change on this file since 28 was 28, checked in by g7moreau, 12 years ago
  • Just change Copyright
File size: 4.2 KB
Line 
1#!/usr/bin/perl
2#
3# 2011/11/27 gabriel
4
5use strict;
6
7use Getopt::Long();
8use Pod::Usage;
9use Coro;
10use Coro::Semaphore;
11use Coro::Signal;
12use Coro::Channel;
13use Coro::Handle;
14use IO::File;
15use POSIX qw( WNOHANG WEXITSTATUS );
16
# Command-line settings and their defaults.
my $file = '';                # path to the job list (--file)
my $verbose;                  # print a line when each job starts and ends
my $switchio;                 # give each job its own stdout/stderr files
my $help;                     # show the manual and exit
my $oarsh = 'oarsh -q -T';    # command placed in front of the node name

# --oarsh is declared with '=s' so that it consumes the command string given
# on the command line.  Declared as a bare flag, Getopt::Long would store 1
# in $oarsh and leave the intended command behind in @ARGV.
Getopt::Long::GetOptions(
   'file=s'   => \$file,
   'verbose'  => \$verbose,
   'help'     => \$help,
   'oarsh=s'  => \$oarsh,
   'switchio' => \$switchio,
   ) || pod2usage( -verbose => 0 );

# Show the full manual on --help, and also when the job list does not exist
# (which includes the case where --file was not given at all).
pod2usage( -verbose => 2 ) if $help;
pod2usage( -verbose => 2 ) if not -e $file;
32
# Load the job list: one shell command per line.  Lines whose first character
# is '#' and lines holding nothing but whitespace are dropped.
my @job = ();
open( my $job_list, '<', $file ) or die "can't open $file: $!";
while ( my $line = <$job_list> ) {
   chomp $line;
   next if $line =~ m/^#/ or $line =~ m/^\s*$/;
   push @job, $line;
   }
close $job_list;
41
# Base names for the per-job output files: the values of OAR_STDERR and
# OAR_STDOUT with their trailing '.stderr' / '.stdout' extension removed.
my ( $stderr, $stdout ) = @ENV{ 'OAR_STDERR', 'OAR_STDOUT' };
$stderr =~ s/\.stderr$//;
$stdout =~ s/\.stdout$//;
46
# Synchronisation objects shared by the coroutines:
#   $finished  signal on which the main program waits before exiting
#   $job_todo  counter that starts at the number of jobs in @job
my $finished = Coro::Signal->new;
my $job_todo = Coro::Semaphore->new( scalar @job );

# Pool of free execution slots: one entry per line of the OAR node file.
# Blank lines are skipped, because an empty node name would be handed to the
# remote shell command as if it were a real node.
my $ressources = Coro::Channel->new;
open( my $node_file, '<', "$ENV{OAR_NODE_FILE}" )
   or die "can't open OAR_NODE_FILE '$ENV{OAR_NODE_FILE}': $!";
while ( my $node = <$node_file> ) {
   chomp $node;
   next if $node =~ m/^\s*$/;
   $ressources->put($node);
   }
close $node_file;
59
# Number of jobs launched so far, and the table of jobs currently running.
# %scheduled is keyed by the pid returned when the job's pipe is opened; each
# entry records the pipe handle (fh), the node in use (node) and the job
# number (num).
my $job_num   = 0;
my %scheduled = ();

# Launcher coroutine: for every job, take a node from the pool, open a pipe
# to a remote shell on that node and write the commands to run into it.
#
# The block is terminated with ';' on purpose.  async takes a block followed
# by an optional argument list, so without the terminator the statements that
# follow it are parsed as arguments of this call instead of being run after it.
async {
   for my $job (@job) {

      # Waits here until a node is available in the pool.
      my $node = $ressources->get;

      $job_num++;

      my $fh      = IO::File->new();
      my $job_pid = $fh->open("| $oarsh $node >/dev/null 2>&1")
         or die "don't start subjob: $!";

      # Flush every write, then wrap the handle so that writing to it does
      # not stall the other coroutine.
      $fh->autoflush;
      $fh = unblock $fh;

      $scheduled{$job_pid} = { fh => $fh, node => $node, num => $job_num };

      printf "start job %5i / %5i on node %s at %s\n",
         $job_num, $job_pid, $node, time
         if $verbose;

      # With --switchio, and when a base name is known, each job gets its
      # own numbered output files; otherwise no redirection is added.
      my $job_stdout = '';
      my $job_stderr = '';
      $job_stdout = ">  $stdout-$job_num.stdout" if $stdout ne '' and $switchio;
      $job_stderr = "2> $stderr-$job_num.stderr" if $stderr ne '' and $switchio;

      $fh->print("cd $ENV{OAR_WORKDIR}\n");
      $fh->print("$job $job_stdout $job_stderr\n");
      $fh->print("exit\n");
      cede;
      }
   };

# Reaper coroutine: polls every running job without blocking.  When a job's
# process has ended, its pipe is closed, its node goes back to the pool and
# the count of remaining jobs is decremented.  The loop never sleeps; it only
# yields to the launcher between checks.
async {
   while () {
      for my $job_pid ( keys %scheduled ) {
         if ( waitpid( $job_pid, WNOHANG ) ) {
            printf "end   job %5i / %5i on node %s at %s\n",
               $scheduled{$job_pid}->{num},
               $job_pid, $scheduled{$job_pid}->{node}, time
               if $verbose;
            close $scheduled{$job_pid}->{fh};
            $ressources->put( $scheduled{$job_pid}->{node} );
            $job_todo->down;
            delete $scheduled{$job_pid};
            }
         cede;
         }

      # Every job has ended once the counter is back to zero.
      $finished->send if $job_todo->count == 0;
      cede;
      }
   };

# Give the coroutines a first chance to run, then wait until the reaper
# signals that all jobs have ended.
cede;

$finished->wait;
117
118__END__
119
120=head1 NAME
121
122oar-parexec - execute a lot of small jobs in parallel
123
124=head1 SYNOPSIS
125
126 oar-parexec --file filepath [--verbose] [--switchio] [--oarsh sssh]
127 oar-parexec --help
128
129=head1 OPTIONS
130
131 --file name of the file which contains the job list
132
133 --verbose
134
135 --switchio each small job will have its own output STDOUT and STDERR
136            based on the master OAR job with JOB_NUM inside. Example:
137
138            OAR.151524.stdout -> OAR.151524-JOB_NUM.stdout
139
140            where 151524 here is the master OAR_JOB_ID
141
142 --oarsh command used to open a shell on a node
143         by default
144
145         oarsh -q -T
146
147 --help
148
149
150=head1 DESCRIPTION
151
152C<oar-parexec> needs to be executed inside an OAR job environment
153because it needs the two environment variables that OAR defines by
154default:
155
156 OAR_NODE_FILE path to a file which contains one node per line
157
158 OAR_WORKDIR   directory in which each job is launched (a chdir is done into it)
159
160The job file (option C<--file>) can contain:
161
162 - empty lines
163 - comment lines beginning with #
164 - valid shell commands
165
166Example where F<$HOME/test/subjob1.sh> is a shell script (executable).
167
168 $HOME/test/subjob1.sh
169 $HOME/test/subjob2.sh
170 $HOME/test/subjob3.sh
171 $HOME/test/subjob4.sh
172
173 $HOME/test/subjob38.sh
174 $HOME/test/subjob39.sh
175 $HOME/test/subjob40.sh
176
177These jobs could be launched by
178
179 oarsub -n test -l /core=6,walltime=00:35:00 "oar-parexec -f ./subjob.list.txt"
180
181
182=head1 SEE ALSO
183
184oar-dispatch, mpilauncher
185
186
187=head1 AUTHORS
188
189Written by Gabriel Moreau, Grenoble - France
190
191
192=head1 LICENSE AND COPYRIGHT
193
194GPL version 2 or later and Perl equivalent
195
196Copyright (C) 2011 Gabriel Moreau / LEGI - CNRS UMR 5519 - France
197
Note: See TracBrowser for help on using the repository browser.