source: trunk/oar/oar-parexec @ 13

Last change on this file since 13 was 13, checked in by g7moreau, 12 years ago
  • Add oar parallel execute
File size: 3.6 KB
Line 
1#!/usr/bin/perl
2#
3# 2011/11/27 gabriel
4
5use strict;
6
7use Getopt::Long();
8use Pod::Usage;
9use Coro;
10use Coro::Semaphore;
11use Coro::Signal;
12use Coro::Channel;
13use Coro::Handle;
14use IO::File;
15use POSIX qw( WNOHANG WEXITSTATUS );
16
# Command-line settings and their defaults.
my $file = '';                 # path of the job list (--file)
my $verbose;                   # true: print a line when a job starts or ends
my $switchio;                  # true: give every job its own output files
my $help;                      # true: print the manual and exit
my $oarsh = 'oarsh -q -T';     # command used to open a shell on a node

# 'oarsh=s' makes --oarsh take a string value, as the SYNOPSIS shows.
# Declared as a bare 'oarsh' it would be a boolean flag: $oarsh would become 1
# and the command given by the user would be left unparsed in @ARGV.
Getopt::Long::GetOptions(
   'file=s'   => \$file,
   'verbose'  => \$verbose,
   'help'     => \$help,
   'oarsh=s'  => \$oarsh,
   'switchio' => \$switchio,
   ) || pod2usage( -verbose => 0 );
pod2usage( -verbose => 2 ) if $help;
31
# Read the job list: one shell command per line.  Lines starting with '#'
# and lines that are empty or contain only whitespace are skipped.
# A lexical filehandle is used so the handle does not live in a package
# global and is scoped to this file's lexical scope.
my @job = ();
open( my $job_list_fh, '<', $file ) or die "can't open $file: $!";
while ( my $line = <$job_list_fh> ) {
   chomp $line;
   next if $line =~ m/^#/;
   next if $line =~ m/^\s*$/;
   push @job, $line;
   }
close $job_list_fh;
40
# Base names for the per-job output files: the values of OAR_STDERR and
# OAR_STDOUT with their trailing '.stderr' / '.stdout' removed.
my ( $stderr, $stdout ) = @ENV{ 'OAR_STDERR', 'OAR_STDOUT' };
$stderr =~ s/\.stderr$//;
$stdout =~ s/\.stdout$//;
45
# Signal sent once no job is left to finish; the main program waits on it.
my $finished = Coro::Signal->new;

# Counter of jobs not finished yet: starts at the number of jobs and is
# decremented each time a job ends.
my $job_todo = Coro::Semaphore->new( scalar @job );

# Queue of free nodes, filled with one entry per non-blank line of the file
# named by OAR_NODE_FILE.  Blank lines are skipped so that no job is ever
# started with an empty node name.
my $ressources = Coro::Channel->new;
my $node_file  = defined $ENV{OAR_NODE_FILE} ? $ENV{OAR_NODE_FILE} : '';
open( my $node_fh, '<', $node_file )
   or die "can't open OAR_NODE_FILE '$node_file': $!";
while ( my $node = <$node_fh> ) {
   chomp $node;
   next if $node =~ m/^\s*$/;
   $ressources->put($node);
   }
close $node_fh;
58
# Number of jobs launched so far; the current value is the job's number.
my $job_num = 0;

# Jobs currently running, keyed by the pid returned when their pipe was opened.
# Each entry holds the pipe handle (fh), the node in use (node) and the job
# number (num).
my %scheduled = ();

# Launcher coroutine: takes the jobs in list order and starts each one on the
# next node obtained from $ressources.
async {
   for my $command (@job) {
      my $node = $ressources->get;

      $job_num++;

      # Pipe into "$oarsh $node"; the commands below are written to this pipe.
      my $fh      = IO::File->new();
      my $job_pid = $fh->open("| $oarsh $node >/dev/null 2>&1")
         or die "don't start subjob: $!";

      $fh->autoflush;
      $fh = unblock $fh;

      $scheduled{$job_pid} = { fh => $fh, node => $node, num => $job_num };

      printf "start job %5i / %5i on node %s at %s\n",
         $job_num, $job_pid, $node, time
         if $verbose;

      # With --switchio, redirect this job's output to files named after the
      # master job's output files plus the job number.  The fragments default
      # to '' so that nothing undefined is interpolated into the command line.
      my $job_stdout = '';
      my $job_stderr = '';
      if ($switchio) {
         $job_stdout = ">  $stdout-$job_num.stdout" if $stdout ne '';
         $job_stderr = "2> $stderr-$job_num.stderr" if $stderr ne '';
         }

      # NOTE(review): the working directory and the job line are sent unquoted.
      $fh->print("cd $ENV{OAR_WORKDIR}\n");
      $fh->print("$command $job_stdout $job_stderr\n");
      $fh->print("exit\n");

      # Let the other coroutines run before launching the next job.
      cede;
      }
   };    # the ';' matters: without it the next statement is parsed as this
         # async call's argument list
91
# Reaper coroutine: polls the running jobs and, for each one whose process has
# ended, returns its node to $ressources and counts the job as done.
# NOTE(review): the loop never sleeps; it only yields with cede between polls.
async {
   while (1) {
      for my $job_pid ( keys %scheduled ) {

         # With WNOHANG, waitpid returns 0 while the process is still running.
         if ( waitpid( $job_pid, WNOHANG ) ) {
            my $ended = $scheduled{$job_pid};

            printf "end   job %5i / %5i on node %s at %s\n",
               $ended->{num}, $job_pid, $ended->{node}, time
               if $verbose;

            close $ended->{fh};
            $ressources->put( $ended->{node} );
            $job_todo->down;
            delete $scheduled{$job_pid};
            }
         cede;
         }

      # No job left to finish: wake up the main program.
      $finished->send if $job_todo->count == 0;
      cede;
      }
   };    # the ';' matters: without it the 'cede' below is parsed as an
         # argument of this async call and runs before the coroutine exists

# Give the coroutines a first chance to run, then block until the reaper
# signals that every job has finished.
cede;

$finished->wait;
116
117__END__
118
119=head1 NAME
120
oar-parexec - execute a lot of small jobs in parallel
122
123=head1 SYNOPSIS
124
 oar-parexec --file filepath [--verbose] [--switchio] [--oarsh ssh]
126 oar-parexec --help
127
128=head1 OPTIONS
129
 --file name of the file which contains the job list
131
132 --verbose
133
 --switchio each small job will have its own STDOUT and STDERR output files,
            based on those of the master OAR job with JOB_NUM inside. Example:
136
137            OAR.151524.stdout -> OAR.151524-JOB_NUM.stdout
138
139            where 151524 here is the master OAR_JOB_ID
140
 --oarsh command used to open a shell on a node
142        by default
143
144        oarsh -q -T
145
146 --help
147
The job list file can contain
149
 - empty lines
 - comment lines beginning with #
 - valid shell commands
153
154Example where F<$HOME/test/subjob1.sh> is a shell script (executable).
155
156 $HOME/test/subjob1.sh
157 $HOME/test/subjob2.sh
158 $HOME/test/subjob3.sh
159 $HOME/test/subjob4.sh
160
161 $HOME/test/subjob38.sh
162 $HOME/test/subjob39.sh
163 $HOME/test/subjob40.sh
164
These jobs could be launched by
166
167 oarsub -n test -l /core=6,walltime=00:35:00 "oar-parexec -f ./subjob.list.txt"
168
169=head1 AUTHORS
170
171Gabriel Moreau (C) 2011
172
Note: See TracBrowser for help on using the repository browser.