source: trunk/src/cluster_command.m @ 1179

Last change on this file since 1179 was 1179, checked in by sommeria, 4 weeks ago

a few bug repairs and cleaning

File size: 2.6 KB
Line 
%'cluster_command': creates the command string for launching jobs in the cluster system 'oar'
% other cluster options 'pbs' and 'psmn' are available in cluster_command_pbs and
% cluster_command_psmn. The choice is made in the xml file for parameters: series.xml
%------------------------------------------------------------------------
% function cmd=cluster_command(ListProcessFile,ActionFullName,DirLog,NbProcess,NbCore,CPUTimeProcess)
%
%OUTPUT
% cmd: system command (char string) to launch jobs
%
%INPUT:
% ListProcessFile: name of the file containing the list of processes to perform
% ActionFullName: name given to the action (function activated by series)
% DirLog: name of the folder used to store the log files from calculations
% NbProcess: number of processes in the list, these processes are grouped by the system into jobs dispatched to NbCore cores
% NbCore: number of computer cores to which the processes are dispatched
% CPUTimeProcess: estimated CPU time for an individual process (in min)

function cmd=cluster_command(ListProcessFile,ActionFullName,DirLog,NbProcess, NbCore,CPUTimeProcess)

filename_log=fullfile(DirLog,'job_list.stdout'); % file for output messages of the master oar process
filename_errors=fullfile(DirLog,'job_list.stderr'); % file for error messages of the master oar process

% With 6 processes or more, request the 'bigiojob' resource: it limits UVmat parallel
% launch on the cluster to avoid saturation of disk access to data.
if NbProcess>=6
    bigiojob_string=['+{type = ' char(39) 'bigiojob' char(39) '}/licence=1'];% char(39) is the quote character
else
    bigiojob_string='';
end

WallTimeMax=23;% absolute limit on computation time (in hours)
% chosen limit on computation time (in hours), possibly smaller than the absolute
% limit to favor job priority in the system.
WallTimeTotal=min(WallTimeMax,4*CPUTimeProcess/60);
% estimated max time of an individual process (in min), used for checkpoint:
% if less than this time remains before walltime, the job is stopped and a new one
% can be launched (by the option 'idempotent')
WallTimeOneProcess=min(4*CPUTimeProcess+10,WallTimeTotal*60/2);
% checkpoint delay passed to oarsub, in seconds. It is rounded to an integer so that
% a non-integer CPUTimeProcess cannot produce a fractional value (e.g. '160.44') on
% the command line.
CheckpointDelay=round(WallTimeOneProcess*60);

% if NbCore==1
%     corestring='cpu=1/core=4'; %increases the allowed memory in case of single core job
% else
    corestring=['{cluster=''calcul8''}/core=' num2str(max(NbCore,4))];% at least 4 cores are requested
% end

% datestr(...,13) formats the wall time (converted from hours to days) as HH:MM:SS
cmd=['oarsub -n UVmat_' ActionFullName ' '...
    '-t idempotent --checkpoint ' num2str(CheckpointDelay) ' '...
    '-l "' corestring bigiojob_string...
    ',walltime=' datestr(WallTimeTotal/24,13) '" '...
    '-E ' filename_errors ' '...
    '-O ' filename_log ' '...
    '"oar-parexec -s -f ' ListProcessFile ' '...
    '-l ' ListProcessFile '.log"'];
Note: See TracBrowser for help on using the repository browser.