source: trunk/src/cluster_command.m @ 1060

Last change on this file since 1060 was 1019, checked in by sommeria, 3 years ago

more general cluster commands introduced

File size: 2.4 KB
Line 
1%'cluster_command': creates the command string for launching jobs in the cluster
2%------------------------------------------------------------------------
3% function cmd=cluster_command(ListProcessFile,ActionFullName,DirLog,NbProcess, NbCore,CPUTimeProcess)
4%
5%OUTPUT
6% cmd=system command (char string) to launch jobs
7%%
8%
9%INPUT:
10% ListProcessFile: name of the file containing the list of processes to perform
11% ActionFullName: name given to the action (function activated by series)
12% DirLog: name of the folder used to store the log files from calculations
13% NbProcess: number of processes in the list; these processes are grouped by the system into jobs dispatched to NbCore cores
14% NbCore: number of computer cores to which the processes are dispatched
15% CPUTimeProcess: estimated CPU time for an individual process (in min)
16
function cmd=cluster_command(ListProcessFile,ActionFullName,DirLog,NbProcess, NbCore,CPUTimeProcess)
% Builds the 'oarsub' system command (char string) used to submit a list of
% processes, run through 'oar-parexec', to the cluster.
%
% OUTPUT:
%  cmd: system command (char string) to launch the jobs
%
% INPUT:
%  ListProcessFile: name of the file containing the list of processes; it is
%                   passed to 'oar-parexec -f', and the same name with the
%                   suffix '.log' is passed to 'oar-parexec -l'
%  ActionFullName: name of the action, appended to 'UVmat_' to form the job name
%  DirLog: folder receiving 'job_list.stdout' and 'job_list.stderr'
%  NbProcess: number of processes in the list; from 8 processes on, the
%             'bigiojob' resource request is added to the '-l' option
%  NbCore: number of cores requested ('/core=')
%  CPUTimeProcess: estimated CPU time for an individual process (in min)
%
% NOTE(review): file names and ActionFullName are inserted in the command
% without quoting, so names containing blanks will split into several shell
% words - confirm that callers never pass such names.

% files collecting the standard output and the error messages of the master oar process
StdoutFile=fullfile(DirLog,'job_list.stdout');
StderrFile=fullfile(DirLog,'job_list.stderr');

% 'bigiojob' limits the UVmat parallel launch on the cluster to avoid
% saturation of disk access to data; only requested for lists of 8 processes or more
if NbProcess>=8
    BigIoRequest='+{type = ''bigiojob''}/licence=1';
else
    BigIoRequest='';
end

WallTimeMax=23;% absolute limit on computation time (in hours)
% requested limit on computation time (in hours), possibly smaller than the
% absolute limit to favor job priority in the system
WallTimeTotal=min(WallTimeMax,4*CPUTimeProcess/60);
% estimated max time of an individual process (in min), used for checkpoint:
% if less than this time remains before walltime, the job is stopped and a
% new one can be launched (by the option 'idempotent')
WallTimeOneProcess=min(4*CPUTimeProcess+10,WallTimeTotal*60/2);
% the checkpoint delay is given in seconds: round it so that a non-integer
% CPUTimeProcess cannot produce a fractional value such as '12014.76'
CheckpointDelay=round(WallTimeOneProcess*60);

ResourceRequest=['"/core=' num2str(NbCore) BigIoRequest ',walltime=' datestr(WallTimeTotal/24,13) '"'];
ParexecCommand=['"oar-parexec -s -f ' ListProcessFile ' -l ' ListProcessFile '.log"'];

cmd=['oarsub -n UVmat_' ActionFullName ' '...
    '-t idempotent --checkpoint ' num2str(CheckpointDelay) ' '...
    '-l ' ResourceRequest ' '...
    '-E ' StderrFile ' '...
    '-O ' StdoutFile ' '...
    ParexecCommand];
Note: See TracBrowser for help on using the repository browser.