%'cluster_command': creates the command string for launching jobs in the cluster
%------------------------------------------------------------------------
% function cmd=cluster_command(ListProcessFile,ActionFullName,DirLog,NbProcess,NbCore,CPUTimeProcess)
%
%OUTPUT
% cmd= system command (char string) to launch jobs: an 'oarsub' call whose payload is 'oar-parexec'
%
%INPUT:
% ListProcessFile: name of the file containing the list of processes to perform
% ActionFullName: name given to the action (function activated by series), appended to 'UVmat_' to form the job name
% DirLog: name of the folder used to store the log files from calculations
% NbProcess: number of processes in the list; these processes are grouped by the system into jobs dispatched to NbCore cores
% NbCore: number of computer cores to which the processes are dispatched
% CPUTimeProcess: estimated CPU time for an individual process (in min)

function cmd=cluster_command(ListProcessFile,ActionFullName,DirLog,NbProcess,NbCore,CPUTimeProcess)

%% log files of the master oar process
filename_log=fullfile(DirLog,'job_list.stdout'); % file for output messages of the master oar process
filename_errors=fullfile(DirLog,'job_list.stderr'); % file for error messages of the master oar process

%% optional 'bigiojob' resource request
% bigiojob limits UVmat parallel launch on the cluster to avoid saturation of disk access to data
if NbProcess>=8
    bigiojob_string='+{type = ''bigiojob''}/licence=1'; % doubled quotes give a literal quote in the command
else
    bigiojob_string='';
end

%% time limits
WallTimeMax=23;% absolute limit on computation time (in hours)
% chosen limit on computation time (in hours), possibly smaller than the absolute limit to favor job priority in the system
WallTimeTotal=min(WallTimeMax,4*CPUTimeProcess/60);
% estimated max time of an individual process (in min), used for checkpoint:
% if less than this time remains before walltime, the job is stopped and a new one can be launched (by the option 'idempotent')
WallTimeOneProcess=min(4*CPUTimeProcess+10,WallTimeTotal*60/2);
% checkpoint delay in seconds, rounded to a whole number so that a non-integer
% CPUTimeProcess never produces a decimal value (e.g. '39.6') in the command line
CheckpointDelay=round(WallTimeOneProcess*60);

%% requested cores
if NbCore==1
    corestring='cpu=1/core=4'; % increases the allowed memory in case of single core job
else
    corestring=['/core=' num2str(NbCore)];
end

%% assemble the command
% datestr(...,13) formats the walltime (given as a fraction of a day) as HH:MM:SS
cmd=['oarsub -n UVmat_' ActionFullName ' '...
    '-t idempotent --checkpoint ' sprintf('%d',CheckpointDelay) ' '...
    '-l "' corestring bigiojob_string...
    ',walltime=' datestr(WallTimeTotal/24,13) '" '...
    '-E ' filename_errors ' '...
    '-O ' filename_log ' '...
    '"oar-parexec -s -f ' ListProcessFile ' '...
    '-l ' ListProcessFile '.log"'];