Context Navigation

source: trunk/oarutils/oar-parexec @ 44

Last change on this file since 44 was 44, checked in by g7moreau, 14 years ago
doc doc and doc!
File size: 12.2 KB

Line
1	#!/usr/bin/perl
2	#
3	# 2011/11/27 gabriel
4
5	use strict;
6
7	use Getopt::Long();
8	use Pod::Usage;
9	use Coro;
10	use Coro::Semaphore;
11	use Coro::Signal;
12	use Coro::Channel;
13	use Coro::Handle;
14	use IO::File;
15	use POSIX qw( WNOHANG WEXITSTATUS );
16	use Cwd qw( getcwd );
17
18	my $filecmd = ''; # file holding the list of commands to run (--filecmd)
19	my $logtrace = ''; # log/trace file, read back on re-run to skip finished jobs (--logtrace)
20	my $verbose; # when set, progress messages are printed on STDOUT
21	my $job_np = 1; # number of node-file entries given to each job (--jobnp)
22	my $nodefile = $ENV{OAR_NODE_FILE} \|\| ''; # node list, taken from the OAR environment unless --nodefile is given
23	my $masterio; # base name used instead of OAR_STDOUT / OAR_STDERR for per-job output files
24	my $switchio; # when set, each job gets its own stdout / stderr files
25	my $help;
26	my $oarsh = 'oarsh -q -T'; # command used to open a shell on a node (--oarsh)
27
28	Getopt::Long::GetOptions(
29	'filecmd=s' => \$filecmd,
30	'logtrace=s' => \$logtrace,
31	'verbose' => \$verbose,
32	'help' => \$help,
33	'oarsh=s' => \$oarsh,
34	'jobnp=i' => \$job_np,
35	'nodefile=s' => \$nodefile,
36	'masterio=s' => \$masterio,
37	'switchio' => \$switchio,
38	) \|\| pod2usage(-verbose => 0); # option parsing failed: short usage message
39	pod2usage(-verbose => 2) if $help; # --help: full manual
40	pod2usage(-verbose => 2) if not -e $filecmd; # the job file is mandatory and must exist
41
42	# re-run support: reload the state of the jobs recorded in a previous log trace
43	my %state; # job number => 'start' or 'end' (the last record read for a job wins)
44	my $log_h = IO::File->new();
45	if (-e $logtrace) {
46	    $log_h->open($logtrace, '<') # mode passed separately: the file name is never parsed for mode characters
47	        or die "error: can't read log file: $!";
48	    while (<$log_h>) {
49	        $state{$1} = 'start' if m/^start\s+job\s+(\d+)\s/;
50	        $state{$1} = 'end' if m/^end\s+job\s+(\d+)\s/;
51	    }
52	    $log_h->close();
53	}
54	if ($logtrace) {
55	    $log_h->open($logtrace, '>>') # append, so that records of earlier runs are kept
56	        or die "error: can't append log file $logtrace: $!";
57	    $log_h->autoflush;
58	    $log_h = unblock $log_h; # wrap the handle with Coro::Handle for use inside the coroutines
59	}
60
61	# jobs to run: one shell command per line, comment lines (#) and blank lines are skipped
62	my @job = ();
63	open(my $job_list_h, '<', $filecmd) or die "error: can't open job file $filecmd: $!";
64	while (<$job_list_h>) {
65	    chomp;
66	    next if m/^#/;
67	    next if m/^\s*$/;
68	    push @job, $_;
69	}
70	close $job_list_h;
71
72	# resources available: one entry per line of the node file, same skipping rules
73	my @ressources = ();
74	open(my $node_file_h, '<', $nodefile)
75	    or die "can't open $nodefile: $!";
76	while (<$node_file_h>) {
77	    chomp;
78	    next if m/^#/;
79	    next if m/^\s*$/;
80	    push @ressources, $_;
81	}
82	close $node_file_h;
83
84	my $ressource_size = scalar(@ressources); # number of usable entries read from the node file
85	die "error: not enought ressources jobnp $job_np > ressources $ressource_size"
86	if $job_np > $ressource_size; # a single job must fit in the available resources
87
88	my $current_dir = getcwd(); # every job is started from this directory (see the "cd" sent to the node)
89
90	my $stderr = $ENV{OAR_STDERR} \|\| '';
91	$stderr =~ s/\.stderr$//; # keep only the base name, the job number and extension are appended later
92	$stderr = $masterio if $masterio; # --masterio overrides the OAR base name
93	my $stdout = $ENV{OAR_STDOUT} \|\| '';
94	$stdout =~ s/\.stdout$//; # same base-name handling as for stderr
95	$stdout = $masterio if $masterio;
96
97	my $finished = Coro::Signal->new; # sent by the end-job block when the main program may exit
98	my $job_todo = Coro::Semaphore->new(0); # counts the jobs that are not yet finished
99	$job_todo->up for (@job);
100
101	# slices of $job_np consecutive node-file entries, one slice per parallel job slot
102	my $ressources = Coro::Channel->new;
103	for my $slot (1 .. int($ressource_size / $job_np)) {
104	    $ressources->put(
105	        join(',',
106	            @ressources[ (($slot - 1) * $job_np) .. (($slot * $job_np) - 1) ])
107	    );
108	}
109
110	my $job_num = 0; # number of the job being launched (1-based once incremented)
111	my %scheduled = (); # running jobs, keyed by the PID of their remote shell
112
113	# OAR checkpoint and default signal SIGUSR2
114	my $oar_checkpoint = Coro::Semaphore->new(0); # count > 0 once a checkpoint signal was received
115	$SIG{USR2} = sub {
116	    print "warning: receive checkpoint at "
117	        . time
118	        . ", no new job, just finishing running job\n"
119	        if $verbose;
120	    $oar_checkpoint->up();
121	};
122
123	# asynchronous start job block: launches every job of @job, each one on a resource slice taken from the channel
124	async {
125	JOB:
126	for my $job (@job) {
127	$job_num++; # job numbers follow the order of the job file; they are the numbers written in the log trace
128
129	# has this job already been run (state reloaded from the log trace) ?
130	if (exists $state{$job_num}) {
131	if ($state{$job_num} eq 'start') {
132	print "warning: job $job_num was not clearly finished, relaunching...\n"
133	if $verbose;
134	}
135	elsif ($state{$job_num} eq 'end') {
136	delete $state{$job_num}; # free memory
137	$job_todo->down; # a job already done no longer has to be waited for
138	print "warning: job $job_num already run\n" if $verbose;
139	cede;
140	next JOB;
141	}
142	}
143
144	# take a resource slice (comma-separated node list) from the channel
145	my $job_ressource = $ressources->get;
146
147	# stop launching new jobs once an OAR checkpoint signal has been received
148	last JOB if $oar_checkpoint->count() > 0;
149
150	my ($node_connect) = split ',', $job_ressource; # the shell is opened on the first node of the slice
151	my $fh = IO::File->new();
152	my $job_pid = $fh->open("\| $oarsh $node_connect >/dev/null 2>&1") # pipe to the remote shell; the value returned by open is kept as the job PID
153	or die "error: can't start subjob: $!";
154
155	$fh->autoflush;
156	$fh = unblock $fh; # wrap the pipe with Coro::Handle
157
158	$scheduled{$job_pid} = { # bookkeeping read by the end-job block
159	fh => $fh,
160	node_connect => $node_connect,
161	ressource => $job_ressource,
162	num => $job_num
163	};
164
165	my $msg = sprintf "start job %5i / %5i at %s on node %s\n",
166	$job_num, $job_pid, time, $job_ressource;
167	$log_h->print($msg) if $logtrace;
168	print($msg) if $verbose;
169
170	my ($job_stdout, $job_stderr); # shell redirections, left undefined unless --switchio is set and a base name is known
171	$job_stdout = "> $stdout-$job_num.stdout" if $stdout ne '' and $switchio;
172	$job_stderr = "2> $stderr-$job_num.stderr" if $stderr ne '' and $switchio;
173
174	my $job_nodefile = "/tmp/oar-parexec-$ENV{LOGNAME}-$job_num"; # per-job node file, written through the remote shell below
175
176	# set the job environment, run the job and clean up; every command is written to the remote shell
177	if ($job_np > 1) {
178	$fh->print("printf \""
179	. join('\n', split(',', $job_ressource,))
180	. "\" > $job_nodefile\n");
181	$fh->print("OAR_NODE_FILE=$job_nodefile\n");
182	$fh->print("OAR_NP=$job_np\n");
183	$fh->print("export OAR_NODE_FILE\n");
184	$fh->print("export OAR_NP\n");
185	$fh->print("unset OAR_MSG_NODEFILE\n");
186	}
187	$fh->print("cd $current_dir\n");
188	$fh->print("$job $job_stdout $job_stderr\n");
189	$fh->print("rm -f $job_nodefile\n") if $job_np > 1;
190	$fh->print("exit\n"); # ends the remote shell, which lets the end-job block reap its PID
191	cede; # give the other coroutines a chance to run
192	}
193	}
194
195	# asynchronous end job block: polls the running jobs, logs the finished ones and releases their resources
196	async {
197	while () { # endless polling loop, control is given back with cede
198	for my $job_pid (keys %scheduled) {
199	# non blocking PID test: waitpid with WNOHANG returns 0 while the child is still running
200	if (waitpid($job_pid, WNOHANG)) {
201	my $msg = sprintf "end job %5i / %5i at %s on node %s\n",
202	$scheduled{$job_pid}->{num},
203	$job_pid, time, $scheduled{$job_pid}->{ressource};
204	$log_h->print($msg) if $logtrace;
205	print($msg) if $verbose;
206	close $scheduled{$job_pid}->{fh};
207	# give the resource slice back to the channel for another job
208	$ressources->put($scheduled{$job_pid}->{ressource});
209	$job_todo->down; # one less job to wait for
210	delete $scheduled{$job_pid};
211	}
212	cede;
213	}
214
215	# checkpointing: no new job is started, so finish as soon as no job is running any more
216	$finished->send if $oar_checkpoint->count() > 0 and scalar(keys(%scheduled)) == 0;
217
218	$finished->send if $job_todo->count() == 0; # normal end: every job has been accounted for
219	cede;
220	}
221	}
222
223	cede; # let the two async blocks start running
224
225	# wait until the end-job block signals that everything is done
226	$finished->wait;
227
228	# close log trace file
229	$log_h->close() if $logtrace;
230
231	__END__
232
233	=head1 NAME
234
235	oar-parexec - parallel execution of many small jobs
236
237	=head1 SYNOPSIS
238
239	oar-parexec --filecmd filecommand [--logtrace tracefile] [--verbose] [--jobnp integer] \
240	[--nodefile filenode] [--masterio basefileio] [--switchio] [--oarsh sssh]
241	oar-parexec --help
242
243	=head1 DESCRIPTION
244
245	C<oar-parexec> can execute a lot of small jobs in parallel inside a cluster.
246	The number of jobs running in parallel at one time cannot exceed the number of cores defined in the node file.
247	C<oar-parexec> is easier to use inside an OAR job environment,
248	which automatically defines these strategic parameters...
249	However, it can also be used outside OAR.
250
251	Option C<--filecmd> is the only mandatory one.
252
253	Small jobs will be launched in the same folder as the master job.
254	Two environment variables are defined for each small job,
255	but only in the case of parallel small jobs (option C<--jobnp> > 1).
256
257	OAR_NODE_FILE - file that lists the nodes for parallel computing
258	OAR_NP - number of processors allocated
259
260	The file defined by OAR_NODE_FILE is created in /tmp
261	on the node before launching the small job
262	and this file will be deleted after the job completes.
263	C<oar-parexec> is a simple script:
264	OAR_NODE_FILE will not be deleted if the master job crashes.
265
266	OAR defines other variables that are equivalent to OAR_NODE_FILE:
267	OAR_NODEFILE, OAR_FILE_NODES, OAR_RESOURCE_FILE...
268	You can use the original OAR resource file in your script
269	through these variables if you need it.
270
271
272	=head1 OPTIONS
273
274	=over 12
275
276	=item B<-f\|--filecmd filecommand>
277
278	Name of the file which contains the job list.
279
280	=item B<-l\|--logtrace tracefile>
281
282	File which logs and traces the running jobs.
283	When the same master command is run again (after a crash for example),
284	only the jobs that are not marked as done will be run again.
285	Be careful, jobs marked as running (started but not finished) will be run again.
286
287	This option is very useful in case of crash
288	but also for checkpointing and idempotent OAR jobs.
289
290	=item B<-v\|--verbose>
291
292	=item B<-j\|--jobnp integer>
293
294	Number of processors to allocate for each small job.
295	1 by default.
296
297	=item B<-n\|--nodefile filenode>
298
299	Name of the file that lists all the nodes where jobs could be launched.
300	By default, it is defined automatically by OAR via
301	the environment variable C<OAR_NODE_FILE>.
302
303	For example, if you want to use 6 cores on your cluster node,
304	you need to put the node hostname 6 times in this file,
305	one per line...
306	It's a very common file for MPI processes!
307
308	=item B<-m\|--masterio basefileio>
309
310	The C<basefileio> will be used in place of the environment variables
311	C<OAR_STDOUT> and C<OAR_STDERR> (without extension) to build the base name of the small job standard output
312	(only used when option C<switchio> is activated).
313
314	=item B<-s\|--switchio>
315
316	Each small job will have its own output STDOUT and STDERR
317	based on the master OAR job with C<JOB_NUM> inside
318	(or based on C<basefileio> if option C<masterio> is used).
319	Example :
320
321	OAR.151524.stdout -> OAR.151524-JOB_NUM.stdout
322
323	where 151524 here is the master C<OAR_JOB_ID>
324	and C<JOB_NUM> is the small job number.
325
326	=item B<-o\|--oarsh command>
327
328	Command used to launch a shell on a node.
329	By default
330
331	oarsh -q -T
332
333	Change it to C<ssh> if you are not using an OAR cluster...
334
335	=item B<-h\|--help>
336
337	=back
338
339
340	=head1 EXAMPLE
341
342	=head2 Simple list of sequential job
343
344	Content for the job file command (option C<--filecmd>) could have:
345
346	- empty line
347	- comment line begin with #
348	- valid shell command
349
350	Example where F<$HOME/test/subjob1.sh> is a shell script (executable).
351
352	$HOME/test/subjob1.sh
353	$HOME/test/subjob2.sh
354	$HOME/test/subjob3.sh
355	$HOME/test/subjob4.sh
356	...
357	$HOME/test/subjob38.sh
358	$HOME/test/subjob39.sh
359	$HOME/test/subjob40.sh
360
361	These jobs could be launched by:
362
363	oarsub -n test -l /core=6,walltime=04:00:00 "oar-parexec -f ./subjob.list.txt"
364
365	=head2 Parallel job
366
367	You need to give the number of cores each small job needs with option C<--jobnp>.
368	If your job is built on OpenMP or MPI,
369	you can use the OAR_NP and OAR_NODE_FILE variables to configure it.
370	On an OAR cluster, you need to use C<oarsh> or a wrapper like C<oar-envsh>
371	for connections between nodes instead of C<ssh>.
372
373	Example with parallel small job on 2 core:
374
375	oarsub -n test -l /core=6,walltime=04:00:00 "oar-parexec -j 2 -f ./subjob.list.txt"
376
377	=head2 Tracing and master crash
378
379	If the master node crashes after hours of computation, is everything lost?
380	No, with option C<--logtrace>,
381	it's possible to remember earlier results
382	and not re-run these jobs the second and following times.
383
384	oarsub -n test -l /core=6,walltime=04:00:00 "oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
385
386	After a crash or an C<oardel> command,
387	you can then re-run the same command, which will finish executing the jobs in the list
388
389	oarsub -n test -l /core=6,walltime=04:00:00 "oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
390
391	C<logtrace> files are just plain files.
392	We use the extension '.log' because these files are automatically
393	excluded from our backup system!
394
395	=head2 Checkpointing and Idempotent
396
397	C<oar-parexec> is compatible with the OAR checkpointing.
398	If you have 2000 small jobs that need 55h to be done on 6 cores,
399	you can cut this in small parts.
400
401	For this example, we suppose that each small job needs about 10min...
402	So, we send a checkpoint 12min before the end of the process
403	to let C<oar-parexec> finish the jobs already started.
404	After being checkpointed, C<oar-parexec> does not start any new small job.
405
406	oarsub -t idempotent -n test -l /core=6,walltime=04:00:00 --checkpoint 720 \
407	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
408
409	After 3h48min, the OAR job will stop launching new small jobs.
410	When all running small jobs are finished, it exits.
411	But as the OAR job is of type C<idempotent>,
412	OAR will re-submit it as long as all small jobs are not executed...
413
414	This way, we give other users a chance to use the cluster!
415
416	In this last example, we use a moldable OAR job with idempotent
417	to reserve many core for a small time or a few cores for a long time:
418
419	oarsub -t idempotent -n test \
420	-l /core=50,walltime=01:05:00 \
421	-l /core=6,walltime=04:00:00 \
422	--checkpoint 720 \
423	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
424
425
426	=head1 SEE ALSO
427
428	oar-dispatch, mpilauncher,
429	oarsh, oar-envsh, ssh
430
431
432	=head1 AUTHORS
433
434	Written by Gabriel Moreau, Grenoble - France
435
436
437	=head1 LICENSE AND COPYRIGHT
438
439	GPL version 2 or later and Perl equivalent
440
441	Copyright (C) 2011 Gabriel Moreau / LEGI - CNRS UMR 5519 - France
442

Note: See TracBrowser for help on using the repository browser.

Download in other formats: