- Timestamp:
- Nov 9, 2017, 6:17:10 PM (7 years ago)
- Location:
- trunk/src
- Files:
-
- 4 added
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/src/series.m
r1016 r1019 144 144 if exist(xmlfile,'file') 145 145 SeriesData.SeriesParam=xml2struct(xmlfile); 146 if ~(isfield(SeriesData.SeriesParam,'ClusterParam')&& isfield(SeriesData.SeriesParam.ClusterParam,'LaunchCmdFcn')) 147 [success,message]=copyfile(xmlfile,fullfile(path_series,'series_old.xml'));% update the file series.xml inot correctly documented 148 delete(xmlfile); 149 [success,message]=copyfile(fullfile(path_series,'series.xml.default'),xmlfile); 150 end 151 SeriesData.SeriesParam=xml2struct(xmlfile); 146 152 end 147 153 … … 164 170 ActionPathList(:)={path_series_fct}; % set the default path to series fcts to all list members 165 171 RunModeList={'local';'background'}; % default choice of extensions (Matlab fct .m or compiled version .sh) 166 [s,w]=system('oarstat'); % look for cluster system 'oar' 172 [s,w]=system(SeriesData.SeriesParam.ClusterParam.ExistenceTest); % look for cluster system presence 173 % [s,w]=system('oarstat'); % look for cluster system 'oar' 167 174 if isequal(s,0) 168 RunModeList=[RunModeList;{'cluster _oar'}];175 RunModeList=[RunModeList;{'cluster'}]; 169 176 set(handles.MonitorCluster,'Visible','on'); % make visible button for access to Monika 170 177 set(handles.num_CPUTime,'Visible','on'); % make visible button for access to Monika 171 set(handles.CPUTime_txt,'Visible','on'); % make visible button for access to Monika172 end 173 [s,w]=system('qstat -help'); % look for cluster system 'sge'174 if isequal(s,0)175 if regexp(w,'^pbs')176 RunModeList=[RunModeList;{'cluster_pbs'}];177 elseif regexp(w,'^SGE')178 RunModeList=[RunModeList;{'cluster_sge'}];179 else180 RunModeList=[RunModeList;{'cluster_qstat_unknown'}];181 end182 end178 set(handles.CPUTime_txt,'Visible','on'); % make visible button for CPU time estimate for one ref index 179 end 180 % [s,w]=system('qstat -help'); % look for cluster system 'sge' 181 % if isequal(s,0) 182 % if regexp(w,'^pbs') 183 % RunModeList=[RunModeList;{'cluster_pbs'}]; 184 % elseif regexp(w,'^SGE') 185 % 
RunModeList=[RunModeList;{'cluster_sge'}]; 186 % else 187 % RunModeList=[RunModeList;{'cluster_qstat_unknown'}]; 188 % end 189 % end 183 190 set(handles.RunMode,'String',RunModeList)% display the menu of available run modes, local, background or cluster manager 184 191 … … 1567 1574 case {'local','background'} 1568 1575 NbCore=1; % no need to split the calculation 1569 case 'cluster _oar'1576 case 'cluster' 1570 1577 %proposed number of cores to reserve in the cluster 1571 NbCoreAdvised=SeriesData.SeriesParam. OarParam.NbCoreAdvised;1572 NbCoreMax=SeriesData.SeriesParam. OarParam.NbCoreMax;1578 NbCoreAdvised=SeriesData.SeriesParam.ClusterParam.NbCoreAdvised; 1579 NbCoreMax=SeriesData.SeriesParam.ClusterParam.NbCoreMax; 1573 1580 if strcmp(ActionExt,'.m')% case of Matlab function (uncompiled) 1574 1581 warning_string=', preferably use .sh option to save Matlab licences'; … … 1576 1583 warning_string=')'; 1577 1584 end 1578 answer= inputdlg({['Number of cores (max ' num2str(NbCoreMax) warning_string],'extra oar options'},'oarsub parameter',1,{num2str(NbCoreAdvised),''});1585 answer=msgbox_uvmat('INPUT_TXT',['Number of cores (max ' num2str(NbCoreMax) ', ' warning_string],num2str(NbCoreAdvised)); 1579 1586 if isempty(answer) 1580 1587 errormsg='Action launch interrupted by user'; 1581 1588 return 1582 1589 end 1583 NbCore=str2double(answer {1});1584 extra_oar=answer{2};1585 case {'cluster_pbs', 'cluster_sge', 'cluster_qstat_unknown'}1586 if strcmp(ActionExt,'.m')% case of Matlab function (uncompiled)1587 NbCore=1; % one core used only (limitation of Matlab licences)1588 answer=msgbox_uvmat('INPUT_Y-N','Number of cores =1: select the compiled version .sh for multi-core processing. 
Proceed with the .m version?');1589 if ~strcmp(answer,'Yes')1590 errormsg='Action launch interrupted';1591 return1592 end1593 extra_oar='';1594 else1595 answer=inputdlg({'Number of jobs (max 1000)','Queue'},'qsub parameters',1,{'100','piv_debian'});1596 NbCore=str2double(answer{1});1597 qstat_Queue=answer{2};1598 %extra_oar=answer{2}; % TODO : fix this for LMFA cluster. Maybe1599 %extrs_oar and extra_pbs are not the best names1600 end1590 NbCore=str2double(answer); 1591 % extra_oar=answer{2}; 1592 % case {'cluster_pbs', 'cluster_sge', 'cluster_qstat_unknown'} 1593 % if strcmp(ActionExt,'.m')% case of Matlab function (uncompiled) 1594 % NbCore=1; % one core used only (limitation of Matlab licences) 1595 % answer=msgbox_uvmat('INPUT_Y-N','Number of cores =1: select the compiled version .sh for multi-core processing. Proceed with the .m version?'); 1596 % if ~strcmp(answer,'Yes') 1597 % errormsg='Action launch interrupted'; 1598 % return 1599 % end 1600 % extra_oar=''; 1601 % else 1602 % answer=inputdlg({'Number of jobs (max 1000)','Queue'},'qsub parameters',1,{'100','piv_debian'}); 1603 % NbCore=str2double(answer{1}); 1604 % qstat_Queue=answer{2}; 1605 % %extra_oar=answer{2}; % TODO : fix this for LMFA cluster. Maybe 1606 % %extrs_oar and extra_pbs are not the best names 1607 % end 1601 1608 end 1602 1609 if ~isfield(Param.IndexRange,'NbSlice') … … 1722 1729 NbProcess=1; 1723 1730 switch RunMode 1724 case {'cluster_oar','cluster_pbs','cluster_sge','cluster_qstat_unknown'}1725 JobNumberMax=SeriesData.SeriesParam. OarParam.JobNumberMax;1726 JobCPUTimeAdvised=SeriesData.SeriesParam. 
OarParam.JobCPUTimeAdvised;1731 case 'cluster' 1732 JobNumberMax=SeriesData.SeriesParam.ClusterParam.JobNumberMax; 1733 JobCPUTimeAdvised=SeriesData.SeriesParam.ClusterParam.JobCPUTimeAdvised; 1727 1734 if isempty(Param.IndexRange.NbSlice)% if NbSlice is not defined 1728 1735 BlockLength= ceil(JobCPUTimeAdvised/(CPUTime*nbfield_j)); % iterations are grouped in sets with length BlockLength such that the typical CPU time of a job is MinJobNumber. … … 1942 1949 end 1943 1950 1944 case 'cluster _oar' % option 'oar-parexec' used1951 case 'cluster' % option 'oar-parexec' used 1945 1952 %create subdirectory for oar commands 1946 1953 for iprocess=1:NbProcess … … 1974 1981 system(['chmod +x ' batch_file_list{iprocess}]); % set the file to executable 1975 1982 end 1976 D irOAR=fullfile(OutputDir,'0_OAR');1977 if exist(D irOAR,'dir')% delete the content of the dir 0_LOG to allow new input1983 DIR_CLUSTER=fullfile(OutputDir,'0_CLUSTER'); 1984 if exist(DIR_CLUSTER,'dir')% delete the content of the dir 0_LOG to allow new input 1978 1985 curdir=pwd; 1979 cd(D irOAR)1986 cd(DIR_CLUSTER) 1980 1987 delete('*') 1981 1988 cd(curdir) 1982 1989 else 1983 [tild,msg1]=mkdir(D irOAR);1990 [tild,msg1]=mkdir(DIR_CLUSTER); 1984 1991 if ~strcmp(msg1,'') 1985 errormsg=['cannot create ' D irOAR ': ' msg1]; % error message for directory creation1992 errormsg=['cannot create ' DIR_CLUSTER ': ' msg1]; % error message for directory creation 1986 1993 return 1987 1994 end 1988 1995 end 1989 1996 % create file containing the list of jobs 1990 filename_joblist=fullfile(DirOAR,'job_list.txt'); % name of the file containing the list of executables1991 fid=fopen( filename_joblist,'w'); % open it for writting1997 ListProcess=fullfile(DIR_CLUSTER,'job_list.txt'); % name of the file containing the list of executables 1998 fid=fopen(ListProcess,'w'); % open it for writting 1992 1999 for iprocess=1:length(batch_file_list) 1993 2000 fprintf(fid,[batch_file_list{iprocess} '\n']); % write list of exe files 1994 
2001 end 1995 2002 fclose(fid); 1996 system(['chmod +x ' filename_joblist]); % set the file to executable 1997 1998 filename_log=fullfile(DirLog,'job_list.stdout'); % file for output messages of the master oar process 1999 filename_errors=fullfile(DirLog,'job_list.stderr'); % file for error messages of the master oar process 2000 % the command job_list.txt contains the list of NbProcess independent individual jobs 2001 % in which the total calculation has been split. Those are written as executable files .sh in the folder /O_EXE. 2002 % These individual jobs are grouped by the system as oar jobs on the NbCore processors. 2003 % For each processor, the oar job must stop after the walltime which has been set, which is limited to 24 h. 2004 % However, the oar job is automatically restarted (option 'idempotent') provided the individual jobs are 2005 % shorter than the wall time: in the time interval 'checkpoint' (WallTimeOneJob) before the end of the allowed duration, 2006 % the oar job restarts when an individual job ends. 2007 WallTimeMax=SeriesData.SeriesParam.OarParam.WallTimeMax; 2008 JobTime=CPUTime*BlockLength*nbfield_j; % estimated CPU time for one individual job (in minutes) 2009 % wall time (in hours ) for each oar job, allowing 10 individual jobs, but limited to 23 h: 2010 WallTimeTotal=min(WallTimeMax,4*JobTime/60); 2011 %disp(['WallTimeTotal: ' num2str(WallTimeTotal) ' hours']) 2012 % estimated time of an individual job (in min), with a margin of error 2013 WallTimeOneJob=min(4*JobTime+10,WallTimeTotal*60/2); % estimated max time of an individual job for checkpoint 2014 disp(['WallTimeOneJob: ' num2str(WallTimeOneJob) ' minutes']) 2015 if NbProcess>=8 2016 bigiojob_string=['+{type = ' char(39) 'bigiojob' char(39) '}/licence=1'];% char(39) is quote - bigiojob limit UVmat parallel launch on cluster to avoid saturation of disk access to data 2017 else 2018 bigiojob_string=''; 2019 end 2020 oar_command=['oarsub -n UVmat_' ActionFullName ' '... 
2021 '-t idempotent --checkpoint ' num2str(WallTimeOneJob*60) ' '... 2022 '-l "/core=' num2str(NbCore)... 2023 bigiojob_string... % char(39) is quote - bigiojob limit UVmat parallel launch on cluster 2024 ',walltime=' datestr(WallTimeTotal/24,13) '" '... 2025 '-E ' filename_errors ' '... 2026 '-O ' filename_log ' '... 2027 extra_oar ' '... 2028 '"oar-parexec -s -f ' filename_joblist ' '... 2029 '-l ' filename_joblist '.log"']; 2030 2031 fprintf(oar_command); % display system command on the Matlab command window 2003 system(['chmod +x ' ListProcess]); % set the file to executable 2004 2005 CPUTimeProcess=CPUTime*BlockLength*nbfield_j; % estimated CPU time for one individual process (in minutes) 2006 LaunchCmdFcn=SeriesData.SeriesParam.ClusterParam.LaunchCmdFcn; 2007 oar_command=feval(LaunchCmdFcn,ListProcess,ActionFullName,DirLog,NbProcess, NbCore,CPUTimeProcess) 2032 2008 [status,result]=system(oar_command)% execute system command and show the result (ID number of the launched job) on the Matlab command window 2033 filename_oarcommand=fullfile(D irOAR,'0_oar_command'); % keep track of the command in file '0-OAR/0_oar_command'2009 filename_oarcommand=fullfile(DIR_CLUSTER,'0_cluster_command'); % keep track of the command in file '0-OAR/0_cluster_command' 2034 2010 fid=fopen(filename_oarcommand,'w'); 2035 2011 fprintf(fid,oar_command); % store the command … … 2038 2014 msgbox_uvmat('CONFIRMATION',[ActionFullName ' launched as ' num2str(NbProcess) ' processes in cluster: press STATUS to see results']) 2039 2015 2040 case 'cluster_pbs' % for LMFA Kepler machine 2041 %create subdirectory for pbs command and log files 2042 DirPBS=fullfile(OutputDir,'0_PBS'); % todo : common name OAR/PBS 2043 if exist(DirPBS,'dir')% delete the content of the dir 0_LOG to allow new input 2044 curdir=pwd; 2045 cd(DirPBS) 2046 delete('*') 2047 cd(curdir) 2048 else 2049 [tild,msg1]=mkdir(DirPBS); 2050 if ~strcmp(msg1,'') 2051 errormsg=['cannot create ' DirPBS ': ' msg1]; % error message for 
directory creation 2052 return 2053 end 2054 end 2055 max_walltime=3600*20; % 20h max total calculation (cannot exceed 24 h) 2056 walltime_onejob=1800; % seconds, max estimated time for asingle file index value 2057 filename_joblist=fullfile(DirPBS,'job_list.txt'); % create name of the global executable file 2058 fid=fopen(filename_joblist,'w'); 2059 for iprocess=1:length(batch_file_list) 2060 fprintf(fid,[batch_file_list{iprocess} '\n']); % list of exe files 2061 end 2062 fclose(fid); 2063 system(['chmod +x ' filename_joblist]); % set the file to executable 2064 pbs_command=['qsub -n CIVX '... 2065 '-t idempotent --checkpoint ' num2str(walltime_onejob+60) ' '... 2066 '-l /core=' num2str(NbCore) ','... 2067 'walltime=' datestr(min(1.05*walltime_onejob/86400*max(NbProcess*BlockLength*nbfield_j,NbCore)/NbCore,max_walltime/86400),13) ' '... 2068 '-E ' regexprep(filename_joblist,'\.txt\>','.stderr') ' '... 2069 '-O ' regexprep(filename_joblist,'\.txt\>','.log') ' '... 2070 extra_qstat ' '... 2071 '"oar-parexec -s -f ' filename_joblist ' '... 
2072 '-l ' filename_joblist '.log"']; 2073 filename_oarcommand=fullfile(DirPBS,'pbs_command'); 2074 fid=fopen(filename_oarcommand,'w'); 2075 fprintf(fid,pbs_command); 2076 fclose(fid); 2077 fprintf(pbs_command); % display in command line 2078 %system(pbs_command); 2079 msgbox_uvmat('CONFIRMATION',[ActionFullName ' command ready to be launched in cluster']) 2080 2081 case 'cluster_sge' % for PSMN 2016 % case 'cluster_pbs' % for LMFA Kepler machine: trqnsferred to fct 2017 2018 % %create subdirectory for pbs command and log files 2019 % DirPBS=fullfile(OutputDir,'0_PBS'); % todo : common name OAR/PBS 2020 % if exist(DirPBS,'dir')% delete the content of the dir 0_LOG to allow new input 2021 % curdir=pwd; 2022 % cd(DirPBS) 2023 % delete('*') 2024 % cd(curdir) 2025 % else 2026 % [tild,msg1]=mkdir(DirPBS); 2027 % if ~strcmp(msg1,'') 2028 % errormsg=['cannot create ' DirPBS ': ' msg1]; % error message for directory creation 2029 % return 2030 % end 2031 % end 2032 % max_walltime=3600*20; % 20h max total calculation (cannot exceed 24 h) 2033 % walltime_onejob=1800; % seconds, max estimated time for asingle file index value 2034 % ListProcess=fullfile(DirPBS,'job_list.txt'); % create name of the global executable file 2035 % fid=fopen(ListProcess,'w'); 2036 % for iprocess=1:length(batch_file_list) 2037 % fprintf(fid,[batch_file_list{iprocess} '\n']); % list of exe files 2038 % end 2039 % fclose(fid); 2040 % system(['chmod +x ' ListProcess]); % set the file to executable 2041 % pbs_command=['qsub -n CIVX '... 2042 % '-t idempotent --checkpoint ' num2str(walltime_onejob+60) ' '... 2043 % '-l /core=' num2str(NbCore) ','... 2044 % 'walltime=' datestr(min(1.05*walltime_onejob/86400*max(NbProcess*BlockLength*nbfield_j,NbCore)/NbCore,max_walltime/86400),13) ' '... 2045 % '-E ' regexprep(ListProcess,'\.txt\>','.stderr') ' '... 2046 % '-O ' regexprep(ListProcess,'\.txt\>','.log') ' '... 2047 % extra_qstat ' '... 2048 % '"oar-parexec -s -f ' ListProcess ' '... 
2049 % '-l ' ListProcess '.log"']; 2050 % filename_oarcommand=fullfile(DirPBS,'pbs_command'); 2051 % fid=fopen(filename_oarcommand,'w'); 2052 % fprintf(fid,pbs_command); 2053 % fclose(fid); 2054 % fprintf(pbs_command); % display in command line 2055 % %system(pbs_command); 2056 % msgbox_uvmat('CONFIRMATION',[ActionFullName ' command ready to be launched in cluster']) 2057 2058 case 'cluster_sge' % for PSMN % TODO: use the standard 'cluster' config with an external fct 2082 2059 % Au PSMN, on ne crée pas 1 job avec plusieurs cœurs, mais N jobs de 1 cœurs 2083 2060 % où N < 1000. -
trunk/src/series.xml.default
r997 r1019 2 2 <SeriesParam> 3 3 <DiskQuotaCmd>quota -s -g -A</DiskQuotaCmd> 4 <OarParam> 4 <ClusterParam> 5 <ExistenceTest>oarstat</ExistenceTest> 5 6 <NbCoreAdvised>16</NbCoreAdvised><!--proposed default number of parallel cores attributed for the computations --> 6 7 <NbCoreMax>36</NbCoreMax><!--maximum number of cores allowed for the computations --> … … 9 10 <WallTimeMax unit="hour">23</WallTimeMax> <!--maximum allowed time for a job --> 10 11 <JobStatCmd>oarstat |grep N=UVmat</JobStatCmd> <!--command to know the number of active and waiting job launches--> 11 </OarParam> 12 <LaunchCmdFcn>cluster_command</LaunchCmdFcn> <!--name of the function used to create job launch command--> 13 </ClusterParam> 12 14 <SgeParam> 13 15 </SgeParam>
Note: See TracChangeset
for help on using the changeset viewer.