Context Navigation

source: trunk/oarutils/oar-parexec @ 83

Last change on this file since 83 was 83, checked in by g7moreau, 14 years ago
Add folder parameter Rewrite notify async section
File size: 17.7 KB

Line
1	#!/usr/bin/perl
2	#
3	# 2011/11/27 gabriel
4
5	use strict;
6
7	use Getopt::Long();
8	use Pod::Usage;
9	use Coro;
10	use Coro::Semaphore;
11	use Coro::Signal;
12	use Coro::Channel;
13	use Coro::Handle;
14	use IO::File;
15	use POSIX qw( WNOHANG WEXITSTATUS );
16	use Cwd qw( getcwd );
17
# Command line parameters and their default values.
my ($file, $dir, $cmd);       # job source: a command file, or a folder plus a command
my ($logtrace, $verbose);     # log trace file name, verbose flag
my ($masterio, $switchio);    # base name for per-job output, per-job output flag
my ($help, $sig_transmit);    # manual request, flag to forward the signal to the jobs

my $job_np         = 1;
my $nodefile       = $ENV{OAR_NODE_FILE} || '';
my $oarsh          = 'oarsh -q -T';
my $sig_checkpoint = 'USR2';

# A command line that cannot be parsed only gets the short usage.
Getopt::Long::GetOptions(
    'file=s'     => \$file,
    'dir=s'      => \$dir,
    'cmd=s'      => \$cmd,
    'logtrace=s' => \$logtrace,
    'verbose'    => \$verbose,
    'help'       => \$help,
    'oarsh=s'    => \$oarsh,
    'jobnp=i'    => \$job_np,
    'nodefile=s' => \$nodefile,
    'masterio=s' => \$masterio,
    'switchio'   => \$switchio,
    'transmit'   => \$sig_transmit,
    'kill=s'     => \$sig_checkpoint,
) or pod2usage(-verbose => 0);

# The full manual is shown on request, and also when no job source is usable:
# either an existing job file, or an existing folder together with a command.
pod2usage(-verbose => 2) if $help;
unless ((-e "$file") or (-d "$dir" and $cmd ne '')) {
    pod2usage(-verbose => 2);
}
52
# Re-run support: keep trace of the jobs recorded by a previous run.
# %state maps a job name to 'start' (a start line was seen) or 'end' (an end
# line was seen afterwards); any other line leaves the state untouched.
my %state;
my $log_h = IO::File->new();
if (-e "$logtrace") {
    # File name and mode are given separately, so a name holding blanks or
    # mode-like characters is opened literally instead of being parsed.
    $log_h->open($logtrace, '<')
        or die "error: can't read log file: $!";
    while (my $line = <$log_h>) {
        $state{$1} = 'start' if $line =~ m/^start\s+job\s+([^\s]+)\s/;
        $state{$1} = 'end'   if $line =~ m/^end\s+job\s+([^\s]+)\s/;
    }
    $log_h->close();
}

# The same handle is then reopened for appending the trace of this run and
# wrapped by unblock so that it can be used from the async blocks.
if ($logtrace) {
    $log_h->open($logtrace, '>>')
        or die "error: can't append log file $logtrace: $!";
    $log_h->autoflush;
    $log_h = unblock $log_h;
}
71
# Jobs to run. Each entry of @job is a hash with the keys:
#   name   - job name (used in the log trace and in the per-job file names)
#   cmd    - shell command sent to the node
#   folder - folder entered before running the command
my @job = ();
if (-e "$file") {
    # One job per line of the command file; comment and blank lines are skipped.
    my $job_num = 0;
    open(my $job_list_h, '<', "$file") or die "error: can't open job file $file: $!";
    while (my $job_cmd = <$job_list_h>) {
        chomp $job_cmd;
        next if $job_cmd =~ m/^#/;
        next if $job_cmd =~ m/^\s*$/;
        $job_num++;

        # Optional "name=..." tag in a trailing comment, else the job rank.
        # The name pattern is kept as it was so that names already written
        # in existing log traces still match on a re-run.
        my ($job_name) = $job_cmd =~ m/#.*?\bname=(\S+?)\b/i;
        $job_name ||= $job_num;

        # Optional "folder=..." tag: take the whole blank-delimited value.
        # A non-greedy match closed by \b stopped at the first word boundary
        # and cut any path holding a '/' (folder=./data/run1 gave "./").
        my ($job_folder) = $job_cmd =~ m/#.*?\bfolder=(\S+)/i;
        $job_folder ||= './';

        push @job, { name => $job_name, cmd => "$job_cmd", folder => $job_folder };
    }
    close $job_list_h;
}
else {
    # One job per sub-folder of $dir; hidden, tagged or backup-like names
    # and plain files are ignored.
    opendir(my $dir_h, $dir) or die "error: can't open folder $dir: $!";
    while (my $item = readdir($dir_h)) {
        next if $item =~ m/^\./;
        next if $item =~ m/:/;
        next if $item =~ m/\.old$/;
        next if $item =~ m/\.sav$/;
        next if $item =~ m/\.bak$/;
        next if $item =~ m/\.no$/;
        next unless (-d "$dir/$item");

        # The command does its own "cd" into the sub-folder, so the job
        # folder is the master folder itself. Without this key the launcher
        # would send a "cd" with no argument.
        push @job, { name => $item, cmd => "( cd $dir/$item/; $cmd )", folder => './' };
    }
    closedir $dir_h;
}
104
# Resources available: one entry per useful line of the node file
# (comment and blank lines are skipped).
my @ressources = ();
open(my $node_file_h, '<', "$nodefile")
    or die "can't open $nodefile: $!";
while (my $node = <$node_file_h>) {
    chomp $node;
    next if $node =~ m/^#/;
    next if $node =~ m/^\s*$/;
    push @ressources, $node;
}
close $node_file_h;

my $ressource_size = scalar(@ressources);

# The number of slots is later computed as $ressource_size / $job_np:
# refuse a null or negative value here with a clear message.
die "error: jobnp must be a positive integer, got $job_np"
    if $job_np < 1;
die "error: not enought ressources jobnp $job_np > ressources $ressource_size"
    if $job_np > $ressource_size;
120
# Folder of the master process.
my $current_dir = getcwd();

# Base names of the per-job output files: the OAR master output names
# without their extension, replaced by --masterio when it is given.
my $stdout = $ENV{OAR_STDOUT} || '';
$stdout =~ s/\.stdout$//;
my $stderr = $ENV{OAR_STDERR} || '';
$stderr =~ s/\.stderr$//;
if ($masterio) {
    $stderr = $masterio;
    $stdout = $masterio;
}

# $finished is sent when the run is over; $job_todo counts the jobs left.
# $job_name_maxlen is the longest job name, used to align the trace lines.
my $finished = Coro::Signal->new;
my $job_todo = Coro::Semaphore->new(0);
my $job_name_maxlen;
for my $job (@job) {
    $job_todo->up;
    my $name_len = length($job->{name});
    $job_name_maxlen = $name_len if $name_len > $job_name_maxlen;
}

# Slices of resources for parallel jobs: each slot is a comma separated
# list of $job_np consecutive entries of @ressources.
my $ressources = Coro::Channel->new;
for my $slot (0 .. int($ressource_size / $job_np) - 1) {
    my $first = $slot * $job_np;
    my $last  = $first + $job_np - 1;
    $ressources->put(join(',', @ressources[ $first .. $last ]));
}

# Running jobs, indexed by the pid of their launcher process.
my %scheduled = ();

# OAR checkpoint (default signal SIGUSR2): each received signal raises
# the $oar_checkpoint counter.
my $oar_checkpoint = Coro::Semaphore->new(0);
$SIG{$sig_checkpoint} = sub {
    if ($verbose) {
        print "warning: receive checkpoint at "
            . time()
            . ", no new job, just finishing running job\n";
    }
    $oar_checkpoint->up();
};
158
# Asynchronous notify block.
# With the transmit flag, every checkpoint signal received by the master is
# forwarded to the running small jobs: a shell is opened on the first node of
# the job and the process whose pid is stored in the job pid file is signaled.
# NOTE(review): as in the original, no ';' follows this block, so the next
# statement is parsed as part of the same expression - kept unchanged.
async {
    NOTIFY:
    while () {
        my $received = $oar_checkpoint->count();

        # only notify with transmit flag
        if ($sig_transmit and $received > 0) {

            JOB_PID:
            for my $job_pid (keys %scheduled) {
                # The key list is a snapshot taken before the loop: a job may
                # have been removed while this block ceded. Fetch the entry
                # without dereferencing the hash slot, so that no empty entry
                # is created for a job that is gone.
                my $job_info = $scheduled{$job_pid};
                next JOB_PID if not defined $job_info;

                # Forward each received signal once per job instead of opening
                # a new shell on every pass of the loop.
                next JOB_PID if ($job_info->{notified} || 0) >= $received;
                $job_info->{notified} = $received;

                my $job_name     = $job_info->{name};
                my $job_pidfile  = $job_info->{pidfile};
                my $node_connect = $job_info->{node_connect};

                my $fh = IO::File->new();
                $fh->open("| $oarsh $node_connect >/dev/null 2>&1")
                    or die "error: can't notify subjob: $!";

                $fh->autoflush;
                $fh = unblock $fh;

                # Each command needs its own line: without the newline the
                # following "exit" was glued to the pid given to kill.
                $fh->print("kill -$sig_checkpoint \$(cat $job_pidfile)\n");
                $fh->print("exit\n");

                print "warning: transmit signal $sig_checkpoint"
                    . " to $job_name on $node_connect.\n"
                    if $verbose;

                close $fh;
                cede;
            }
        }

        cede;
    }
}
193
# Asynchronous start job block.
# For every job not already marked as ended in the log trace: wait for a free
# slot of resources, open a shell on the first node of the slot, record the
# job in %scheduled and send it the commands that set the environment, run the
# job and clean up. No new job is started once a checkpoint has been received.
# NOTE(review): as in the original, no ';' follows this block, so the next
# statement is parsed as part of the same expression - kept unchanged.
async {
    JOB:
    for my $job (@job) {
        my $job_name   = $job->{name};
        my $job_cmd    = $job->{cmd};
        my $job_folder = $job->{folder};

        # A job without folder stays in the master folder. An empty value
        # would send "cd" with no argument, which moves the remote shell to
        # the home folder and breaks relative paths.
        $job_folder = './' if (not defined $job_folder) or $job_folder eq '';

        # job has been already run ?
        if (exists $state{$job_name}) {
            if ($state{$job_name} eq 'start') {
                print "warning: job $job_name was not clearly finished, relaunching...\n"
                    if $verbose;
            }
            elsif ($state{$job_name} eq 'end') {
                delete $state{$job_name};    # free memory
                $job_todo->down;
                print "warning: job $job_name already run\n" if $verbose;
                cede;
                next JOB;
            }
        }

        # take job resource (waits until a slot is put back in the channel)
        my $job_ressource = $ressources->get;

        # no more launch job when OAR checkpointing
        last JOB if $oar_checkpoint->count() > 0;

        # the shell is opened on the first node of the slot
        my ($node_connect) = split ',', $job_ressource;
        my $fh = IO::File->new();
        my $job_pid = $fh->open("| $oarsh $node_connect >/dev/null 2>&1")
            or die "error: can't start subjob: $!";

        $fh->autoflush;
        $fh = unblock $fh;

        my $msg = sprintf "start job %${job_name_maxlen}s / %5i at %s on node %s\n",
            $job_name, $job_pid, time, $job_ressource;
        $log_h->print($msg) if $logtrace;
        print($msg) if $verbose;

        # per-job redirections, only with the switchio flag
        my ($job_stdout, $job_stderr) = ('', '');
        $job_stdout = "> $stdout-$job_name.stdout"  if $stdout ne '' and $switchio;
        $job_stderr = "2> $stderr-$job_name.stderr" if $stderr ne '' and $switchio;

        my $job_nodefile = "/tmp/oar-parexec-$ENV{LOGNAME}-$ENV{OAR_JOB_ID}-$job_name";
        my $job_pidfile  = "/tmp/oar-parexec-$ENV{LOGNAME}-$ENV{OAR_JOB_ID}-$job_name.pid";

        $scheduled{$job_pid} = {
            fh           => $fh,
            node_connect => $node_connect,
            ressource    => $job_ressource,
            name         => $job_name,
            pidfile      => $job_pidfile,
        };

        # set job environment, run it and clean
        if ($job_np > 1) {
            # node file of the job: one node of the slot per line
            $fh->print("printf \""
                    . join('\n', split(',', $job_ressource))
                    . "\" > $job_nodefile\n");
            $fh->print("OAR_NODE_FILE=$job_nodefile\n");
            $fh->print("OAR_NP=$job_np\n");
            $fh->print("export OAR_NODE_FILE\n");
            $fh->print("export OAR_NP\n");
            $fh->print("unset OAR_MSG_NODEFILE\n");
        }
        $fh->print("cd $current_dir\n");
        $fh->print("cd $job_folder\n");

        # NOTE(review): when $job_cmd ends with a shell comment (for instance
        # a "# name=..." tag), whatever is appended after it on the same line
        # (redirections, "&") belongs to that comment for the remote shell -
        # confirm whether this is intended.
        if ($sig_transmit) {
            # run the job in background so that the remote shell can catch
            # the signal and pass it to its background jobs
            $fh->print("trap 'kill -$sig_checkpoint \$(jobs -p)' $sig_checkpoint\n");
            $fh->print("echo \$\$ > $job_pidfile\n");
            $fh->print("$job_cmd $job_stdout $job_stderr &\n");
            $fh->print("while [ \$(jobs -p | wc -l) -gt 0 ]\n");
            $fh->print("do\n");
            $fh->print(" wait\n");
            $fh->print("done\n");
            $fh->print("rm -f $job_pidfile\n");
        }
        else {
            $fh->print("$job_cmd $job_stdout $job_stderr\n");
        }
        $fh->print("rm -f $job_nodefile\n") if $job_np > 1;
        $fh->print("exit\n");
        cede;
    }
}
282
# Asynchronous end job block.
# Polls the launcher process of every scheduled job; when one is over, writes
# the trace line, gives its slot of resources back and forgets the job. Sends
# $finished when nothing is left to do, or when a checkpoint was received and
# no job is running any more.
async {
    while () {
        for my $job_pid (keys %scheduled) {
            # non blocking PID test
            next if not waitpid($job_pid, WNOHANG);

            my $ended = $scheduled{$job_pid};
            my $msg = sprintf "end job %${job_name_maxlen}s / %5i at %s on node %s\n",
                $ended->{name}, $job_pid, time, $ended->{ressource};

            # Job not finished, just suspended if a checkpoint signal was received
            if ($sig_transmit and $oar_checkpoint->count() > 0) {
                $msg =~ s/^end\s+job/suspend job/;
            }

            $log_h->print($msg) if $logtrace;
            print($msg) if $verbose;
            close $ended->{fh};

            # leave resources for another job
            $ressources->put($ended->{ressource});
            $job_todo->down;
            delete $scheduled{$job_pid};
        }
        continue {
            # let the other blocks run after each tested job
            cede;
        }

        # checkpointing ! just finishing running job and quit
        if ($oar_checkpoint->count() > 0 and scalar(keys(%scheduled)) == 0) {
            $finished->send;
        }

        if ($job_todo->count() == 0) {
            $finished->send;
        }
        cede;
    }
}
315
cede;

# wait until all jobs have been done (or the checkpoint is over)
$finished->wait;

# close log trace file
if ($logtrace) {
    $log_h->close();
}
323
324	__END__
325
326	=head1 NAME
327
328	oar-parexec - parallel execution of many small job
329
330	=head1 SYNOPSIS
331
332	oar-parexec --file filecommand \
333	[--logtrace tracefile] [--verbose] \
334	[--jobnp integer] [--nodefile filenode] [--oarsh sssh] \
335	[--switchio] [--masterio basefileio]
336
337	oar-parexec --dir foldertoiterate --cmd commandtolaunch \
338	[--logtrace tracefile] [--verbose] \
339	[--jobnp integer] [--nodefile filenode] [--oarsh sssh] \
340	[--switchio] [--masterio basefileio]
341
342	oar-parexec --help
343
344	=head1 DESCRIPTION
345
346	C<oar-parexec> can execute lot of small job in parallel inside a cluster.
347	Number of parallel job at one time cannot exceed the number of core define in the node file
348	C<oar-parexec> is easier to use inside an OAR job environment
349	which define automatically these strategics parameters...
350	However, it can be used outside OAR.
351
352	Option C<--file> or C<--dir> and C<--cmd> are the only mandatory parameters.
353
354	Small job will be launch in the same folder as the master job.
355	Two environment variable are defined for each small job
356	and only in case of parallel small job (option C<--jobnp> > 1).
357
358	OAR_NODE_FILE - file that list node for parallel computing
359	OAR_NP - number of processor affected
360
361	The file define by OAR_NODE_FILE is created in /tmp
362	on the node before launching the small job
363	and this file will be delete after job complete.
364	C<oar-parexec> is a simple script,
365	OAR_NODE_FILE will not be deleted in case of crash of the master job.
366
367	OAR define other variable that are equivalent to OAR_NODE_FILE:
368	OAR_NODEFILE, OAR_FILE_NODES, OAR_RESOURCE_FILE...
369	You can use in your script the OAR original file ressources
370	by using these variable if you need it.
371
372
373	=head1 OPTIONS
374
375	=over 12
376
377	=item B<-f\|--file filecommand>
378
379	File name which content job list.
380	For the JOB_NAME definition,
381	the first valid job in the list will have the number 1 and so on...
382
383	It's possible to fix the name inside a comment on the job line.
384	For example:
385
386	$HOME/test/subjob1.sh # name=subjob1
387
388	The key C<name> is case insensitive,
389	the associated value cannot have a space...
390
391	=item B<-d\|--dir foldertoiterate>
392
393	Command C<--cmd> will be launch in all sub-folder of this master folder.
394	Files in this folder will be ignored.
395	Sub-folder name which begin with F<.>, contain F<:>
396	or finish with F<.old>, F<.sav>, F<.bak>, F<.no> will also be ignored...
397
398	The JOB_NAME is simply the Sub-folder name.
399
400	=item B<-c\|--cmd commandtolaunch>
401
402	Command (and argument to it) that will be launched in every sub-folder
403	of the folder given by parameter C<--dir>.
404
405	=item B<-l\|--logtrace tracefile>
406
407	File which log and trace running job.
408	In case of running the same master command (after crash for example),
409	only job that are not mark as done will be run again.
410	Be careful, job mark as running (start but not finish) will be run again.
411	Tracing is base on the JOB_NAME between multiple run.
412
413	This option is very useful in case of crash
414	but also for checkpointing and idempotent OAR job.
415
416	=item B<-v\|--verbose>
417
418	=item B<-j\|--jobnp integer>
419
420	Number of processor to allocated for each small job.
421	1 by default.
422
423	=item B<-n\|--nodefile filenode>
424
425	File name that list all the node where job could be launch.
426	By default, it's defined automatically by OAR via
427	environment variable C<OAR_NODE_FILE>.
428
429	For example, if you want to use 6 core on your cluster node,
430	you need to put 6 times the hostname node in this file,
431	one per line...
432	It's a very common file in MPI process !
433
434	=item B<-o|--oarsh command>
435
436	Command use to launch a shell on a node.
437	By default
438
439	oarsh -q -T
440
441	Change it to C<ssh> if you are not using an OAR cluster...
442
443	=item B<-s\|--switchio>
444
445	Each small job will have it's own output STDOUT and STDERR
446	base on master OAR job with C<JOB_NAME> inside
447	(or base on C<basefileio> if option C<masterio>).
448	Example :
449
450	OAR.151524.stdout -> OAR.151524-JOB_NAME.stdout
451
452	where 151524 here is the master C<OAR_JOB_ID>
453	and C<JOB_NAME> is the small job name.
454
455	=item B<-m\|--masterio basefileio>
456
457	The C<basefileio> will be use in place of environment variable
458	C<OAR_STDOUT> and C<OAR_STDERR> (without extension) to build the base name of the small job standard output
459	(only used when option C<switchio> is activated).
460
461	=item B<-k\|--kill signal>
462
463	Signal to listen and make a clean stop of the current C<oar-parexec> process.
464	By default, use USR2 signal (see C<kill -l> for a list of possible signal).
465
466	=item B<-t\|--transmit>
467
468	Resend catch signal to sub-job when receiving it.
469	By default, no signal is transmitted to child process.
470
471	It's only valuable if use for long sub-job than can
472	in return make themselves a clean restart.
473
474
475	=item B<-h\|--help>
476
477	=back
478
479
480	=head1 EXAMPLE
481
482	=head2 Simple list of sequential job
483
484	Content for the job file command (option C<--file>) could have:
485
486	- empty line
487	- comment line begin with #
488	- valid shell command
489
490	Example where F<$HOME/test/subjob1.sh> is a shell script (executable).
491
492	$HOME/test/subjob01.sh # name=subjob01
493	$HOME/test/subjob02.sh # name=subjob02
494	$HOME/test/subjob03.sh # name=subjob03
495	$HOME/test/subjob04.sh # name=subjob04
496	...
497	$HOME/test/subjob38.sh # name=subjob38
498	$HOME/test/subjob39.sh # name=subjob39
499	$HOME/test/subjob40.sh # name=subjob40
500
501	These jobs could be launch by:
502
503	oarsub -n test -l /core=6,walltime=04:00:00 \
504	"oar-parexec -f ./subjob.list.txt"
505
506	=head2 Folder job
507
508	In a folder F<subjob.d>, create sub-folder with your data inside : F<test1>, F<test2>...
509	The same command will be executed in every sub-folder.
510	C<oar-parexec> change the current directory to the sub-folder before launching it.
511
512	A very simple job could be:
513
514	oarsub -n test -l /core=6,walltime=04:00:00 \
515	"oar-parexec -d ./subjob.d -c 'sleep 10; env'"
516
517	The command C<env> will be executed in all folder F<test1>, F<test2>... after a 10s pause.
518
519	Sometime, it's simpler to use file list command,
520	sometime, jobs by folder with the same command run is more relevant.
521
522	=head2 Parallel job
523
524	You need to put the number of core each small job need with option C<--jobnp>.
525	If your job is build on OpenMP or MPI,
526	you can use OAR_NP and OAR_NODE_FILE variables to configure them.
527	On OAR cluster, you need to use C<oarsh> or a wrapper like C<oar-envsh>
528	for connexion between node instead of C<ssh>.
529
530	Example with parallel small job on 2 core:
531
532	oarsub -n test -l /core=6,walltime=04:00:00 \
533	"oar-parexec -j 2 -f ./subjob.list.txt"
534
535	=head2 Tracing and master crash
536
537	If the master node crash after hours of calculus, everything is lost ?
538	No, with option C<--logtrace>,
539	it's possible to remember older result
540	and not re-run these job the second and next time.
541
542	oarsub -n test -l /core=6,walltime=04:00:00 \
543	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
544
545	After a crash or an C<oardel> command,
546	you can then re-run the same command that will end to execute the jobs in the list
547
548	oarsub -n test -l /core=6,walltime=04:00:00 \
549	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
550
551	C<logtrace> file are just plain file.
552	We use the extension '.log' because these files are automatically
553	eliminate from our backup system!
554
555	=head2 Checkpointing and Idempotent
556
557	C<oar-parexec> is compatible with the OAR checkpointing.
558	If you have 2000 small jobs that need 55h to be done on 6 cores,
559	you can cut this in small parts.
560
561	For this example, we suppose that each small job need about 10min...
562	So, we send a checkpoint 12min before the end of the process
563	to let C<oar-parexec> finish the jobs started.
564	After being checkpointed, C<oar-parexec> do not start any new small job.
565
566	oarsub -t idempotent -n test \
567	-l /core=6,walltime=04:00:00 \
568	--checkpoint 720 \
569	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
570
571	After 3h48min, the OAR job will begin to stop launching new small job.
572	When all running small job are finished, it's exit.
573	But as the OAR job is type C<idempotent>,
574	OAR will re-submit it as long as all small job are not executed...
575
576	This way, we let other users a chance to use the cluster!
577
578	In this last example, we use moldable OAR job with idempotent
579	to reserve many core for a small time or a few cores for a long time:
580
581	oarsub -t idempotent -n test \
582	-l /core=50,walltime=01:05:00 \
583	-l /core=6,walltime=04:00:00 \
584	--checkpoint 720 \
585	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
586
587	=head2 Signal, recurse and long job
588
589	By default, OAR use signal USR2 for checkpointing.
590	It's possible to change this with option C<--kill>.
591
592	When use with long small job, checkpointing could be too long...
593	More than walltime!
594	The option C<--transmit> could be use to checkpoint small job!
595	These long small job will then stop cleanly and will be restarted next time.
596
597	In the C<logtrace> file, small job will have the status suspend.
598	They will be launch with the same command line at the next OAR run.
599
600	=head1 SEE ALSO
601
602	oar-dispatch, mpilauncher,
603	oarsh, oar-envsh, ssh
604
605
606	=head1 AUTHORS
607
608	Written by Gabriel Moreau, Grenoble - France
609
610
611	=head1 LICENSE AND COPYRIGHT
612
613	GPL version 2 or later and Perl equivalent
614
615	Copyright (C) 2011 Gabriel Moreau / LEGI - CNRS UMR 5519 - France
616

Note: See TracBrowser for help on using the repository browser.

Download in other formats: