Context Navigation

source: trunk/oarutils/oar-parexec @ 49

Last change on this file since 49 was 49, checked in by g7moreau, 14 years ago
General makefile width install and update Cut long line
File size: 14.4 KB

Line
1	#!/usr/bin/perl
2	#
3	# 2011/11/27 gabriel
4
5	use strict;
6
7	use Getopt::Long();
8	use Pod::Usage;
9	use Coro;
10	use Coro::Semaphore;
11	use Coro::Signal;
12	use Coro::Channel;
13	use Coro::Handle;
14	use IO::File;
15	use POSIX qw( WNOHANG WEXITSTATUS );
16	use Cwd qw( getcwd );
17
# Command-line settings and their defaults.
my ($file, $dir, $cmd, $logtrace) = ('', '', '', '');
my ($verbose, $masterio, $switchio, $help);
my $job_np   = 1;
my $oarsh    = 'oarsh -q -T';
my $nodefile = $ENV{OAR_NODE_FILE} || '';

# Parse the command line; an unknown or malformed option prints the short
# usage message.
my $options_ok = Getopt::Long::GetOptions(
    'file=s'     => \$file,
    'dir=s'      => \$dir,
    'cmd=s'      => \$cmd,
    'logtrace=s' => \$logtrace,
    'verbose'    => \$verbose,
    'help'       => \$help,
    'oarsh=s'    => \$oarsh,
    'jobnp=i'    => \$job_np,
    'nodefile=s' => \$nodefile,
    'masterio=s' => \$masterio,
    'switchio'   => \$switchio,
);
pod2usage(-verbose => 0) unless $options_ok;

# Full manual on explicit request.
if ($help) {
    pod2usage(-verbose => 2);
}

# Full manual as well when no job source is usable: either an existing
# job file, or an existing folder together with a command to run in it.
my $has_job_file   = -e "$file";
my $has_job_folder = (-d "$dir" and $cmd ne '');
pod2usage(-verbose => 2) unless $has_job_file or $has_job_folder;
48
# Re-run support: rebuild the state of each job from a previous trace file.
# %state maps a job name to 'start' (launched, end never recorded) or
# 'end' (completed); jobs absent from the trace have no entry.
my %state;
my $log_h = IO::File->new();
if (-e "$logtrace") {
    # The mode is passed as a separate argument so that the file name is
    # never parsed for mode characters or surrounding whitespace.
    $log_h->open($logtrace, '<')
        or die "error: can't read log file: $!";
    while (my $line = <$log_h>) {
        # A later line for the same job overrides an earlier one.
        $state{$1} = 'start' if $line =~ m/^start\s+job\s+([^\s]+)\s/;
        $state{$1} = 'end'   if $line =~ m/^end\s+job\s+([^\s]+)\s/;
    }
    $log_h->close();
}
if ($logtrace) {
    # Append to the trace so that earlier runs stay recorded.
    $log_h->open($logtrace, '>>')
        or die "error: can't append log file $logtrace: $!";
    $log_h->autoflush;
    # Wrap the handle for use from the coroutines below.
    $log_h = unblock $log_h;
}
67
# Build the list of jobs to run. Each entry of @job is a hash reference
# with two keys: name (identifier used in the trace and in per-job file
# names) and cmd (shell command line sent to the node).
my @job = ();
if (-e "$file") {
    # File mode: every line that is neither empty nor a comment is one
    # job, named after its rank (1, 2, ...) among the valid lines.
    my $job_num = 0;
    open(my $job_list_h, '<', "$file")
        or die "error: can't open job file $file: $!";
    while (my $line = <$job_list_h>) {
        chomp $line;
        next if $line =~ m/^#/;
        next if $line =~ m/^\s*$/;
        $job_num++;
        push @job, { name => $job_num, cmd => $line };
    }
    close $job_list_h;
}
else {
    # Folder mode: one job per sub-folder of $dir, named after the
    # sub-folder, running $cmd from inside it.

    # Quote a word so that the shell reads it literally.
    my $shell_quote = sub {
        my ($word) = @_;
        $word =~ s/'/'\\''/g;
        return "'$word'";
    };

    opendir(my $dir_h, $dir) or die "error: can't open folder $dir: $!";
    while (defined(my $item = readdir($dir_h))) {
        # Skip hidden entries, names containing ':' and disabled folders.
        next if $item =~ m/^\./;
        next if $item =~ m/:/;
        next if $item =~ m/\.old$/;
        next if $item =~ m/\.sav$/;
        next if $item =~ m/\.bak$/;
        next if $item =~ m/\.no$/;
        next unless (-d "$dir/$item");

        # Leave the sub-shell when the folder cannot be entered, so that
        # $cmd is never run from the wrong directory.
        my $job_dir = $shell_quote->("$dir/$item/");
        push @job, { name => $item, cmd => "( cd $job_dir || exit 1; $cmd )" };
    }
    closedir $dir_h;
}
96
# Resources available: one entry of @ressources per usable line of the
# node file (empty lines and lines starting with '#' are ignored).
my @ressources = ();
open(my $node_file_h, '<', "$nodefile")
    or die "can't open $nodefile: $!";
while (my $line = <$node_file_h>) {
    chomp $line;
    next if $line =~ m/^#/;
    next if $line =~ m/^\s*$/;
    push @ressources, $line;
}
close $node_file_h;

my $ressource_size = scalar(@ressources);

# $job_np is used further down as a divisor and as a slice width, so it
# has to be a strictly positive number.
die "error: jobnp must be at least 1 (got $job_np)"
    if $job_np < 1;
die "error: not enought ressources jobnp $job_np > ressources $ressource_size"
    if $job_np > $ressource_size;
112
# Directory the master was started from; small jobs are run from it.
my $current_dir = getcwd();

# Base names for the per-job output files: the OAR output file names
# without their extension, or --masterio for both when it is given.
my $stdout = $ENV{OAR_STDOUT} || '';
my $stderr = $ENV{OAR_STDERR} || '';
$stdout =~ s/\.stdout$//;
$stderr =~ s/\.stderr$//;
($stdout, $stderr) = ($masterio, $masterio) if $masterio;
121
# Signal sent to the main program once everything is done.
my $finished = Coro::Signal->new;

# $job_todo counts the jobs still to be handled (one "up" per job), and
# $job_name_maxlen is the width used to align job names in messages.
my $job_todo        = Coro::Semaphore->new(0);
my $job_name_maxlen = 0;
for my $job (@job) {
    $job_todo->up;
    my $name_len = length($job->{name});
    $job_name_maxlen = $name_len if $name_len > $job_name_maxlen;
}

# Slice of ressources for parallel job: the channel holds one
# comma-separated group of $job_np consecutive node-file entries per slot.
# Entries left over after the last complete group are not used.
my $ressources = Coro::Channel->new;
for my $slot (1 .. int($ressource_size / $job_np)) {
    my $first = ($slot - 1) * $job_np;
    my $last  = $first + $job_np - 1;
    $ressources->put(join(',', @ressources[ $first .. $last ]));
}
138
# Running jobs, indexed by the PID returned when their connection was opened.
my %scheduled = ();

# OAR checkpoint, sent by default as SIGUSR2: the handler only raises
# $oar_checkpoint; the coroutines test its count to stop launching jobs.
my $oar_checkpoint = Coro::Semaphore->new(0);
$SIG{USR2} = sub {
    if ($verbose) {
        my $now = time;
        print "warning: receive checkpoint at $now, no new job, just finishing running job\n";
    }
    $oar_checkpoint->up();
};
150
# Asynchronous start job block.
# Walks @job in order: skips jobs the trace marks as ended, waits for a
# free slice of resources, opens a shell on the first node of the slice
# through $oarsh and feeds it the commands of the job on its standard input.
# NOTE(review): no semicolon follows this block, so the statement after it
# is parsed as an argument of async -- kept as is, confirm it is intended.
async {
    # Quote a word so that the remote shell reads it literally.
    my $shell_quote = sub {
        my ($word) = @_;
        $word =~ s/'/'\\''/g;
        return "'$word'";
    };

    JOB:
    for my $job (@job) {
        my $job_name = $job->{name};
        my $job_cmd  = $job->{cmd};

        # job has been already run ?
        if (exists $state{$job_name}) {
            if ($state{$job_name} eq 'start') {
                print "warning: job $job_name was not clearly finished, relaunching...\n"
                    if $verbose;
            }
            elsif ($state{$job_name} eq 'end') {
                delete $state{$job_name};    # free memory
                $job_todo->down;
                print "warning: job $job_name already run\n" if $verbose;
                cede;
                next JOB;
            }
        }

        # take job ressource (waits until a slice is available)
        my $job_ressource = $ressources->get;

        # no more launch job when OAR checkpointing
        last JOB if $oar_checkpoint->count() > 0;

        # the shell is opened on the first node of the slice
        my ($node_connect) = split ',', $job_ressource;
        my $fh = IO::File->new();
        my $job_pid = $fh->open("| $oarsh $node_connect >/dev/null 2>&1")
            or die "error: can't start subjob: $!";

        $fh->autoflush;
        $fh = unblock $fh;

        # remember the running job for the end job block
        $scheduled{$job_pid} = {
            fh           => $fh,
            node_connect => $node_connect,
            ressource    => $job_ressource,
            name         => $job_name
        };

        my $msg = sprintf "start job %${job_name_maxlen}s / %5i at %s on node %s\n",
            $job_name, $job_pid, time, $job_ressource;
        $log_h->print($msg) if $logtrace;
        print($msg) if $verbose;

        # per-job redirections, empty unless --switchio is active
        my ($job_stdout, $job_stderr) = ('', '');
        $job_stdout = "> $stdout-$job_name.stdout"  if $stdout ne '' and $switchio;
        $job_stderr = "2> $stderr-$job_name.stderr" if $stderr ne '' and $switchio;

        my $job_nodefile    = "/tmp/oar-parexec-$ENV{LOGNAME}-$job_name";
        my $quoted_nodefile = $shell_quote->($job_nodefile);

        # set job environment, run it and clean
        if ($job_np > 1) {
            # One node per line, each line newline-terminated, so that
            # line-oriented tools see every node of the slice.
            my $node_words = join(' ',
                map { $shell_quote->($_) } split(',', $job_ressource));
            $fh->print("printf '%s\\n' $node_words > $quoted_nodefile\n");
            $fh->print("OAR_NODE_FILE=$quoted_nodefile\n");
            $fh->print("OAR_NP=$job_np\n");
            $fh->print("export OAR_NODE_FILE\n");
            $fh->print("export OAR_NP\n");
            $fh->print("unset OAR_MSG_NODEFILE\n");
        }
        $fh->print("cd " . $shell_quote->($current_dir) . "\n");
        $fh->print("$job_cmd $job_stdout $job_stderr\n");
        $fh->print("rm -f $quoted_nodefile\n") if $job_np > 1;
        $fh->print("exit\n");
        cede;
    }
}
223
# Asynchronous end job block.
# Polls every scheduled PID without blocking; for each one that is over it
# writes the "end job" trace line, closes the connection, gives the slice
# of resources back and counts the job as done. Sends $finished when no job
# is left, or when a checkpoint was received and nothing is running.
# NOTE(review): the loop never sleeps, it only cedes -- looks like busy
# polling on the master; confirm whether a pause is wanted.
# NOTE(review): no semicolon follows this block, so the statement after it
# is parsed as an argument of async -- kept as is, confirm it is intended.
async {
    while (1) {
        for my $pid (keys %scheduled) {
            # non blocking PID test
            if (waitpid($pid, WNOHANG)) {
                my $done = $scheduled{$pid};
                my $msg  = sprintf "end job %${job_name_maxlen}s / %5i at %s on node %s\n",
                    $done->{name}, $pid, time, $done->{ressource};
                $log_h->print($msg) if $logtrace;
                print($msg) if $verbose;
                close $done->{fh};

                # leave ressources for another job
                $ressources->put($done->{ressource});
                $job_todo->down;
                delete $scheduled{$pid};
            }
            cede;
        }

        # checkpointing ! just finishing running job and quit
        if ($oar_checkpoint->count() > 0 and scalar(keys(%scheduled)) == 0) {
            $finished->send;
        }

        # every job has been handled
        if ($job_todo->count() == 0) {
            $finished->send;
        }
        cede;
    }
}
251
cede;

# Wait here until the end job block signals that all jobs have been done.
$finished->wait;

# Close the log trace file, which is only opened when --logtrace is used.
if ($logtrace) {
    $log_h->close();
}
259
260	__END__
261
262	=head1 NAME
263
264	oar-parexec - parallel execution of many small jobs
265
266	=head1 SYNOPSIS
267
268	oar-parexec --file filecommand \
269	[--logtrace tracefile] [--verbose] \
270	[--jobnp integer] [--nodefile filenode] [--oarsh sssh] \
271	[--switchio] [--masterio basefileio]
272
273	oar-parexec --dir foldertoiterate --cmd commandtolaunch \
274	[--logtrace tracefile] [--verbose] \
275	[--jobnp integer] [--nodefile filenode] [--oarsh sssh] \
276	[--switchio] [--masterio basefileio]
277
278	oar-parexec --help
279
280	=head1 DESCRIPTION
281
282	C<oar-parexec> can execute a lot of small jobs in parallel inside a cluster.
283	The number of jobs running in parallel at any one time cannot exceed the number of cores defined in the node file.
284	C<oar-parexec> is easier to use inside an OAR job environment,
285	which automatically defines these strategic parameters...
286	However, it can be used outside OAR.
287
288	Option C<--file> or C<--dir> and C<--cmd> are the only mandatory parameters.
289
290	Small jobs will be launched in the same folder as the master job.
291	Two environment variables are defined for each small job,
292	and only in the case of parallel small jobs (option C<--jobnp> > 1).
293
294	OAR_NODE_FILE - file that list node for parallel computing
295	OAR_NP - number of processor affected
296
297	The file defined by OAR_NODE_FILE is created in /tmp
298	on the node before launching the small job,
299	and this file will be deleted after the job completes.
300	C<oar-parexec> is a simple script:
301	OAR_NODE_FILE will not be deleted if the master job crashes.
302
303	OAR defines other variables that are equivalent to OAR_NODE_FILE:
304	OAR_NODEFILE, OAR_FILE_NODES, OAR_RESOURCE_FILE...
305	You can use the original OAR resource file in your script
306	through these variables if you need it.
307
308
309	=head1 OPTIONS
310
311	=over 12
312
313	=item B<-f|--file filecommand>
314
315	Name of the file that contains the job list.
316	For the JOB_NAME definition,
317	the first valid job in the list will have the number 1 and so on...
318
319	=item B<-d|--dir foldertoiterate>
320
321	Command C<--cmd> will be launched in every sub-folder of this master folder.
322	Files in this folder will be ignored.
323	Sub-folders whose name begins with F<.>
324	or ends with F<.old>, F<.sav>, F<.bak>, F<.no> will also be ignored...
325
326	The JOB_NAME is simply the Sub-folder name.
327
328	=item B<-c|--cmd commandtolaunch>
329
330	Command (and its arguments) that will be launched in every sub-folder
331	of the folder given by option C<--dir>.
332
333	=item B<-l|--logtrace tracefile>
334
335	File used to log and trace the running jobs.
336	When the same master command is run again (after a crash for example),
337	only the jobs that are not marked as done will be run again.
338	Be careful: jobs marked as running (started but not finished) will be run again.
339	Tracing between multiple runs is based on the JOB_NAME.
340
341	This option is very useful in case of crash,
342	but also for checkpointing and idempotent OAR jobs.
343
344	=item B<-v|--verbose>
345
346	=item B<-j|--jobnp integer>
347
348	Number of processors to allocate to each small job.
349	1 by default.
350
351	=item B<-n|--nodefile filenode>
352
353	Name of the file that lists all the nodes where jobs could be launched.
354	By default, it is defined automatically by OAR via
355	environment variable C<OAR_NODE_FILE>.
356
357	For example, if you want to use 6 core on your cluster node,
358	you need to put 6 times the hostname node in this file,
359	one per line...
360	It's a very common file in MPI process !
361
362	=item B<-o|--oarsh command>
363
364	Command used to launch a shell on a node.
365	By default
366
367	oarsh -q -T
368
369	Change it to C<ssh> if you are not using an OAR cluster...
370
371	=item B<-s|--switchio>
372
373	Each small job will have its own STDOUT and STDERR output files,
374	named after those of the master OAR job with C<JOB_NAME> inserted
375	(or after C<basefileio> if option C<masterio> is given).
376	Example :
377
378	OAR.151524.stdout -> OAR.151524-JOB_NAME.stdout
379
380	where 151524 here is the master C<OAR_JOB_ID>
381	and C<JOB_NAME> is the small job name.
382
383	=item B<-m|--masterio basefileio>
384
385	The C<basefileio> will be used in place of the environment variables
386	C<OAR_STDOUT> and C<OAR_STDERR> (without extension) to build the base name of the small job standard output
387	(only used when option C<switchio> is activated).
388
389	=item B<-h|--help>
390
391	=back
392
393
394	=head1 EXAMPLE
395
396	=head2 Simple list of sequential job
397
398	Content for the job file command (option C<--file>) could have:
399
400	- empty line
401	- comment line begin with #
402	- valid shell command
403
404	Example where F<$HOME/test/subjob1.sh> is a shell script (executable).
405
406	$HOME/test/subjob1.sh
407	$HOME/test/subjob2.sh
408	$HOME/test/subjob3.sh
409	$HOME/test/subjob4.sh
410	...
411	$HOME/test/subjob38.sh
412	$HOME/test/subjob39.sh
413	$HOME/test/subjob40.sh
414
415	These jobs could be launch by:
416
417	oarsub -n test -l /core=6,walltime=04:00:00 \
418	"oar-parexec -f ./subjob.list.txt"
419
420	=head2 Folder job
421
422	In a folder F<subjob.d>, create sub-folders with your data inside: F<test1>, F<test2>...
423	The same command will be executed in every sub-folder.
424	C<oar-parexec> change the current directory to the sub-folder before launching it.
425
426	A very simple job could be:
427
428	oarsub -n test -l /core=6,walltime=04:00:00 \
429	"oar-parexec -d ./subjob.d -c 'sleep 10; env'"
430
431	The command C<env> will be executed in all folders F<test1>, F<test2>... after a 10s pause.
432
433	Sometime, it's simpler to use file list command,
434	sometime, jobs by folder with the same command run is more relevant.
435
436	=head2 Parallel job
437
438	You need to put the number of core each small job need with option C<--jobnp>.
439	If your job is build on OpenMP or MPI,
440	you can use OAR_NP and OAR_NODE_FILE variables to configure them.
441	On OAR cluster, you need to use C<oarsh> or a wrapper like C<oar-envsh>
442	for connexion between node instead of C<ssh>.
443
444	Example with parallel small job on 2 core:
445
446	oarsub -n test -l /core=6,walltime=04:00:00 \
447	"oar-parexec -j 2 -f ./subjob.list.txt"
448
449	=head2 Tracing and master crash
450
451	If the master node crash after hours of calculus, everything is lost ?
452	No, with option C<--logtrace>,
453	it's possible to remember older result
454	and not re-run these job the second and next time.
455
456	oarsub -n test -l /core=6,walltime=04:00:00 \
457	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
458
459	After a crash or an C<oardel> command,
460	you can then re-run the same command that will end to execute the jobs in the list
461
462	oarsub -n test -l /core=6,walltime=04:00:00 \
463	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
464
465	C<logtrace> file are just plain file.
466	We use the extension '.log' because these files are automatically
467	eliminate from our backup system!
468
469	=head2 Checkpointing and Idempotent
470
471	C<oar-parexec> is compatible with the OAR checkpointing.
472	If you have 2000 small jobs that need 55h to be done on 6 cores,
473	you can cut this in small parts.
474
475	For this example, we suppose that each small job need about 10min...
476	So, we send a checkpoint 12min before the end of the process
477	to let C<oar-parexec> finish the jobs started.
478	After being checkpointed, C<oar-parexec> do not start any new small job.
479
480	oarsub -t idempotent -n test \
481	-l /core=6,walltime=04:00:00 \
482	--checkpoint 720 \
483	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
484
485	After 3h48min, the OAR job will begin to stop launching new small job.
486	When all running small job are finished, it's exit.
487	But as the OAR job is type C<idempotent>,
488	OAR will re-submit it as long as all small job are not executed...
489
490	This way, we let other users a chance to use the cluster!
491
492	In this last example, we use a moldable OAR job with idempotent
493	to reserve many core for a small time or a few cores for a long time:
494
495	oarsub -t idempotent -n test \
496	-l /core=50,walltime=01:05:00 \
497	-l /core=6,walltime=04:00:00 \
498	--checkpoint 720 \
499	"oar-parexec -f ./subjob.list.txt -l ./subjob.list.log"
500
501
502	=head1 SEE ALSO
503
504	oar-dispatch, mpilauncher,
505	oarsh, oar-envsh, ssh
506
507
508	=head1 AUTHORS
509
510	Written by Gabriel Moreau, Grenoble - France
511
512
513	=head1 LICENSE AND COPYRIGHT
514
515	GPL version 2 or later and Perl equivalent
516
517	Copyright (C) 2011 Gabriel Moreau / LEGI - CNRS UMR 5519 - France
518

Note: See TracBrowser for help on using the repository browser.

Download in other formats: