Changeset 1019 for trunk


Ignore:
Timestamp:
Nov 9, 2017, 6:17:10 PM (7 years ago)
Author:
sommeria
Message:

more general cluster commands introduced

Location:
trunk/src
Files:
4 added
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/series.m

    r1016 r1019  
    144144if exist(xmlfile,'file')
    145145    SeriesData.SeriesParam=xml2struct(xmlfile);
     146    if ~(isfield(SeriesData.SeriesParam,'ClusterParam')&& isfield(SeriesData.SeriesParam.ClusterParam,'LaunchCmdFcn'))
     147        [success,message]=copyfile(xmlfile,fullfile(path_series,'series_old.xml'));% update the file series.xml if not correctly documented
     148        delete(xmlfile);
     149        [success,message]=copyfile(fullfile(path_series,'series.xml.default'),xmlfile);
     150    end 
     151    SeriesData.SeriesParam=xml2struct(xmlfile);
    146152end
    147153
     
    164170ActionPathList(:)={path_series_fct}; % set the default path to series fcts to all list members
    165171RunModeList={'local';'background'}; % default choice of extensions (Matlab fct .m or compiled version .sh)
    166 [s,w]=system('oarstat'); % look for cluster system 'oar'
     172[s,w]=system(SeriesData.SeriesParam.ClusterParam.ExistenceTest); % look for cluster system presence
     173% [s,w]=system('oarstat'); % look for cluster system 'oar'
    167174if isequal(s,0)
    168     RunModeList=[RunModeList;{'cluster_oar'}];
     175    RunModeList=[RunModeList;{'cluster'}];
    169176    set(handles.MonitorCluster,'Visible','on'); % make visible button for access to Monika
    170177    set(handles.num_CPUTime,'Visible','on'); % make visible button for access to Monika
    171     set(handles.CPUTime_txt,'Visible','on'); % make visible button for access to Monika
    172 end
    173 [s,w]=system('qstat -help'); % look for cluster system 'sge'
    174 if isequal(s,0)
    175     if regexp(w,'^pbs')
    176         RunModeList=[RunModeList;{'cluster_pbs'}];
    177     elseif regexp(w,'^SGE')
    178         RunModeList=[RunModeList;{'cluster_sge'}];
    179     else
    180         RunModeList=[RunModeList;{'cluster_qstat_unknown'}];
    181     end
    182 end
     178    set(handles.CPUTime_txt,'Visible','on'); % make visible button for CPU time estimate for one ref index
     179end
     180% [s,w]=system('qstat -help'); % look for cluster system 'sge'
     181% if isequal(s,0)
     182%     if regexp(w,'^pbs')
     183%         RunModeList=[RunModeList;{'cluster_pbs'}];
     184%     elseif regexp(w,'^SGE')
     185%         RunModeList=[RunModeList;{'cluster_sge'}];
     186%     else
     187%         RunModeList=[RunModeList;{'cluster_qstat_unknown'}];
     188%     end
     189% end
    183190set(handles.RunMode,'String',RunModeList)% display the menu of available run modes, local, background or cluster manager
    184191
     
    15671574    case {'local','background'}
    15681575        NbCore=1; % no need to split the calculation
    1569     case 'cluster_oar'
     1576    case 'cluster'
    15701577        %proposed number of cores to reserve in the cluster
    1571         NbCoreAdvised=SeriesData.SeriesParam.OarParam.NbCoreAdvised;
    1572         NbCoreMax=SeriesData.SeriesParam.OarParam.NbCoreMax;
     1578        NbCoreAdvised=SeriesData.SeriesParam.ClusterParam.NbCoreAdvised;
     1579        NbCoreMax=SeriesData.SeriesParam.ClusterParam.NbCoreMax;
    15731580        if strcmp(ActionExt,'.m')% case of Matlab function (uncompiled)
    15741581            warning_string=', preferably use .sh option to save Matlab licences';
     
    15761583            warning_string=')';
    15771584        end
    1578         answer=inputdlg({['Number of cores (max ' num2str(NbCoreMax) warning_string],'extra oar options'},'oarsub parameter',1,{num2str(NbCoreAdvised),''});
     1585        answer=msgbox_uvmat('INPUT_TXT',['Number of cores (max ' num2str(NbCoreMax) ', ' warning_string],num2str(NbCoreAdvised));
    15791586        if isempty(answer)
    15801587            errormsg='Action launch interrupted by user';
    15811588            return
    15821589        end
    1583         NbCore=str2double(answer{1});
    1584         extra_oar=answer{2};
    1585     case {'cluster_pbs', 'cluster_sge', 'cluster_qstat_unknown'}
    1586         if strcmp(ActionExt,'.m')% case of Matlab function (uncompiled)
    1587             NbCore=1; % one core used only (limitation of Matlab licences)
    1588             answer=msgbox_uvmat('INPUT_Y-N','Number of cores =1: select the compiled version .sh for multi-core processing. Proceed with the .m version?');
    1589             if ~strcmp(answer,'Yes')
    1590                 errormsg='Action launch interrupted';
    1591                 return
    1592             end
    1593             extra_oar='';
    1594         else
    1595             answer=inputdlg({'Number of jobs (max 1000)','Queue'},'qsub parameters',1,{'100','piv_debian'});
    1596             NbCore=str2double(answer{1});
    1597             qstat_Queue=answer{2};
    1598             %extra_oar=answer{2}; % TODO : fix this for LMFA cluster. Maybe
    1599             %extrs_oar and extra_pbs are not the best names
    1600         end
     1590        NbCore=str2double(answer);
     1591%         extra_oar=answer{2};
     1592%     case {'cluster_pbs', 'cluster_sge', 'cluster_qstat_unknown'}
     1593%         if strcmp(ActionExt,'.m')% case of Matlab function (uncompiled)
     1594%             NbCore=1; % one core used only (limitation of Matlab licences)
     1595%             answer=msgbox_uvmat('INPUT_Y-N','Number of cores =1: select the compiled version .sh for multi-core processing. Proceed with the .m version?');
     1596%             if ~strcmp(answer,'Yes')
     1597%                 errormsg='Action launch interrupted';
     1598%                 return
     1599%             end
     1600%             extra_oar='';
     1601%         else
     1602%             answer=inputdlg({'Number of jobs (max 1000)','Queue'},'qsub parameters',1,{'100','piv_debian'});
     1603%             NbCore=str2double(answer{1});
     1604%             qstat_Queue=answer{2};
     1605%             %extra_oar=answer{2}; % TODO : fix this for LMFA cluster. Maybe
     1606%             %extrs_oar and extra_pbs are not the best names
     1607%         end
    16011608end
    16021609if ~isfield(Param.IndexRange,'NbSlice')
     
    17221729NbProcess=1;
    17231730switch RunMode
    1724     case {'cluster_oar','cluster_pbs','cluster_sge','cluster_qstat_unknown'}
    1725         JobNumberMax=SeriesData.SeriesParam.OarParam.JobNumberMax;
    1726         JobCPUTimeAdvised=SeriesData.SeriesParam.OarParam.JobCPUTimeAdvised;
     1731    case 'cluster'
     1732        JobNumberMax=SeriesData.SeriesParam.ClusterParam.JobNumberMax;
     1733        JobCPUTimeAdvised=SeriesData.SeriesParam.ClusterParam.JobCPUTimeAdvised;
    17271734        if isempty(Param.IndexRange.NbSlice)% if NbSlice is not defined
    17281735            BlockLength= ceil(JobCPUTimeAdvised/(CPUTime*nbfield_j)); % iterations are grouped in sets with length BlockLength  such that the typical CPU time of a job is JobCPUTimeAdvised.
     
    19421949        end
    19431950       
    1944     case 'cluster_oar' % option 'oar-parexec' used
     1951    case 'cluster' % option 'oar-parexec' used
    19451952        %create subdirectory for oar commands
    19461953        for iprocess=1:NbProcess
     
    19741981            system(['chmod +x ' batch_file_list{iprocess}]); % set the file to executable
    19751982        end
    1976         DirOAR=fullfile(OutputDir,'0_OAR');
    1977         if exist(DirOAR,'dir')% delete the content of the dir 0_LOG to allow new input
     1983        DIR_CLUSTER=fullfile(OutputDir,'0_CLUSTER');
     1984        if exist(DIR_CLUSTER,'dir')% delete the content of the dir 0_CLUSTER to allow new input
    19781985            curdir=pwd;
    1979             cd(DirOAR)
     1986            cd(DIR_CLUSTER)
    19801987            delete('*')
    19811988            cd(curdir)
    19821989        else
    1983             [tild,msg1]=mkdir(DirOAR);
     1990            [tild,msg1]=mkdir(DIR_CLUSTER);
    19841991            if ~strcmp(msg1,'')
    1985                 errormsg=['cannot create ' DirOAR ': ' msg1]; % error message for directory creation
     1992                errormsg=['cannot create ' DIR_CLUSTER ': ' msg1]; % error message for directory creation
    19861993                return
    19871994            end
    19881995        end
    19891996        % create file containing the list of jobs
    1990         filename_joblist=fullfile(DirOAR,'job_list.txt'); % name of the file containing the list of executables
    1991         fid=fopen(filename_joblist,'w'); % open it for writting
     1997        ListProcess=fullfile(DIR_CLUSTER,'job_list.txt'); % name of the file containing the list of executables
     1998        fid=fopen(ListProcess,'w'); % open it for writing
    19921999        for iprocess=1:length(batch_file_list)
    19932000            fprintf(fid,[batch_file_list{iprocess} '\n']); % write list of exe files
    19942001        end
    19952002        fclose(fid);
    1996         system(['chmod +x ' filename_joblist]); % set the file to executable
    1997        
    1998         filename_log=fullfile(DirLog,'job_list.stdout'); % file for output messages of the master oar process
    1999         filename_errors=fullfile(DirLog,'job_list.stderr'); % file for error messages of the master oar process
    2000         % the command job_list.txt contains the list of NbProcess independent individual jobs
    2001         % in which the total calculation has been split. Those are written as executable files .sh in the folder /O_EXE.
    2002         %  These individual jobs are grouped by the system as oar jobs on the NbCore processors.
    2003         %  For each processor, the oar job must stop after the walltime which has been set, which is limited to 24 h.
    2004         %  However, the oar job is automatically restarted (option 'idempotent') provided the individual jobs are
    2005         % shorter than the wall time: in the time interval 'checkpoint' (WallTimeOneJob) before the end of the allowed duration,
    2006         %  the oar job restarts when an individual job ends.
    2007         WallTimeMax=SeriesData.SeriesParam.OarParam.WallTimeMax;
    2008         JobTime=CPUTime*BlockLength*nbfield_j; % estimated CPU time for one individual job (in minutes)
    2009         % wall time (in hours ) for each oar job, allowing 10 individual jobs, but limited to 23 h:
    2010         WallTimeTotal=min(WallTimeMax,4*JobTime/60);
    2011         %disp(['WallTimeTotal: ' num2str(WallTimeTotal) ' hours'])
    2012         % estimated time of an individual job (in min), with a margin of error
    2013         WallTimeOneJob=min(4*JobTime+10,WallTimeTotal*60/2); % estimated max time of an individual job for checkpoint
    2014         disp(['WallTimeOneJob: ' num2str(WallTimeOneJob) ' minutes'])
    2015         if NbProcess>=8
    2016             bigiojob_string=['+{type = ' char(39) 'bigiojob' char(39) '}/licence=1'];% char(39) is quote - bigiojob limit UVmat parallel launch on cluster to avoid saturation of disk access to data
    2017         else
    2018             bigiojob_string='';
    2019         end
    2020         oar_command=['oarsub -n UVmat_' ActionFullName ' '...
    2021             '-t idempotent --checkpoint ' num2str(WallTimeOneJob*60) ' '...
    2022             '-l "/core=' num2str(NbCore)...
    2023             bigiojob_string... % char(39) is quote - bigiojob limit UVmat parallel launch on cluster
    2024             ',walltime=' datestr(WallTimeTotal/24,13) '" '...
    2025             '-E ' filename_errors ' '...
    2026             '-O ' filename_log ' '...
    2027             extra_oar ' '...
    2028             '"oar-parexec -s -f ' filename_joblist ' '...
    2029             '-l ' filename_joblist '.log"'];
    2030        
    2031         fprintf(oar_command); % display  system command on the Matlab command window
     2003        system(['chmod +x ' ListProcess]); % set the file to executable
     2004 
     2005        CPUTimeProcess=CPUTime*BlockLength*nbfield_j; % estimated CPU time for one individual process (in minutes)
     2006        LaunchCmdFcn=SeriesData.SeriesParam.ClusterParam.LaunchCmdFcn;
     2007        oar_command=feval(LaunchCmdFcn,ListProcess,ActionFullName,DirLog,NbProcess, NbCore,CPUTimeProcess)
    20322008        [status,result]=system(oar_command)% execute system command and show the result (ID number of the launched job) on the Matlab command window
    2033         filename_oarcommand=fullfile(DirOAR,'0_oar_command'); % keep track of the command in file '0-OAR/0_oar_command'
     2009        filename_oarcommand=fullfile(DIR_CLUSTER,'0_cluster_command'); % keep track of the command in file '0_CLUSTER/0_cluster_command'
    20342010        fid=fopen(filename_oarcommand,'w');
    20352011        fprintf(fid,oar_command); % store the command
     
    20382014        msgbox_uvmat('CONFIRMATION',[ActionFullName ' launched as  ' num2str(NbProcess) ' processes in cluster: press STATUS to see results'])
    20392015       
    2040     case 'cluster_pbs' % for LMFA Kepler machine
    2041         %create subdirectory for pbs command and log files
    2042         DirPBS=fullfile(OutputDir,'0_PBS'); % todo : common name OAR/PBS
    2043         if exist(DirPBS,'dir')% delete the content of the dir 0_LOG to allow new input
    2044             curdir=pwd;
    2045             cd(DirPBS)
    2046             delete('*')
    2047             cd(curdir)
    2048         else
    2049             [tild,msg1]=mkdir(DirPBS);
    2050             if ~strcmp(msg1,'')
    2051                 errormsg=['cannot create ' DirPBS ': ' msg1]; % error message for directory creation
    2052                 return
    2053             end
    2054         end
    2055         max_walltime=3600*20; % 20h max total calculation (cannot exceed 24 h)
    2056         walltime_onejob=1800; % seconds, max estimated time for asingle file index value
    2057         filename_joblist=fullfile(DirPBS,'job_list.txt'); % create name of the global executable file
    2058         fid=fopen(filename_joblist,'w');
    2059         for iprocess=1:length(batch_file_list)
    2060             fprintf(fid,[batch_file_list{iprocess} '\n']); % list of exe files
    2061         end
    2062         fclose(fid);
    2063         system(['chmod +x ' filename_joblist]); % set the file to executable
    2064         pbs_command=['qsub -n CIVX '...
    2065             '-t idempotent --checkpoint ' num2str(walltime_onejob+60) ' '...
    2066             '-l /core=' num2str(NbCore) ','...
    2067             'walltime=' datestr(min(1.05*walltime_onejob/86400*max(NbProcess*BlockLength*nbfield_j,NbCore)/NbCore,max_walltime/86400),13) ' '...
    2068             '-E ' regexprep(filename_joblist,'\.txt\>','.stderr') ' '...
    2069             '-O ' regexprep(filename_joblist,'\.txt\>','.log') ' '...
    2070             extra_qstat ' '...
    2071             '"oar-parexec -s -f ' filename_joblist ' '...
    2072             '-l ' filename_joblist '.log"'];
    2073         filename_oarcommand=fullfile(DirPBS,'pbs_command');
    2074         fid=fopen(filename_oarcommand,'w');
    2075         fprintf(fid,pbs_command);
    2076         fclose(fid);
    2077         fprintf(pbs_command); % display in command line
    2078         %system(pbs_command);
    2079         msgbox_uvmat('CONFIRMATION',[ActionFullName ' command ready to be launched in cluster'])
    2080 
    2081      case 'cluster_sge' % for PSMN
     2016%     case 'cluster_pbs' % for LMFA Kepler machine:  transferred to fct
     2017
     2018%         %create subdirectory for pbs command and log files
     2019%         DirPBS=fullfile(OutputDir,'0_PBS'); % todo : common name OAR/PBS
     2020%         if exist(DirPBS,'dir')% delete the content of the dir 0_LOG to allow new input
     2021%             curdir=pwd;
     2022%             cd(DirPBS)
     2023%             delete('*')
     2024%             cd(curdir)
     2025%         else
     2026%             [tild,msg1]=mkdir(DirPBS);
     2027%             if ~strcmp(msg1,'')
     2028%                 errormsg=['cannot create ' DirPBS ': ' msg1]; % error message for directory creation
     2029%                 return
     2030%             end
     2031%         end
     2032%         max_walltime=3600*20; % 20h max total calculation (cannot exceed 24 h)
     2033%         walltime_onejob=1800; % seconds, max estimated time for asingle file index value
     2034%         ListProcess=fullfile(DirPBS,'job_list.txt'); % create name of the global executable file
     2035%         fid=fopen(ListProcess,'w');
     2036%         for iprocess=1:length(batch_file_list)
     2037%             fprintf(fid,[batch_file_list{iprocess} '\n']); % list of exe files
     2038%         end
     2039%         fclose(fid);
     2040%         system(['chmod +x ' ListProcess]); % set the file to executable
     2041%         pbs_command=['qsub -n CIVX '...
     2042%             '-t idempotent --checkpoint ' num2str(walltime_onejob+60) ' '...
     2043%             '-l /core=' num2str(NbCore) ','...
     2044%             'walltime=' datestr(min(1.05*walltime_onejob/86400*max(NbProcess*BlockLength*nbfield_j,NbCore)/NbCore,max_walltime/86400),13) ' '...
     2045%             '-E ' regexprep(ListProcess,'\.txt\>','.stderr') ' '...
     2046%             '-O ' regexprep(ListProcess,'\.txt\>','.log') ' '...
     2047%             extra_qstat ' '...
     2048%             '"oar-parexec -s -f ' ListProcess ' '...
     2049%             '-l ' ListProcess '.log"'];
     2050%         filename_oarcommand=fullfile(DirPBS,'pbs_command');
     2051%         fid=fopen(filename_oarcommand,'w');
     2052%         fprintf(fid,pbs_command);
     2053%         fclose(fid);
     2054%         fprintf(pbs_command); % display in command line
     2055%         %system(pbs_command);
     2056%         msgbox_uvmat('CONFIRMATION',[ActionFullName ' command ready to be launched in cluster'])
     2057
     2058     case 'cluster_sge' % for PSMN % TODO: use the standard 'cluster' config with an external fct
    20822059        % Au PSMN, on ne crée pas 1 job avec plusieurs cœurs, mais N jobs de 1 cœurs
    20832060        % où N < 1000.
  • trunk/src/series.xml.default

    r997 r1019  
    22<SeriesParam>
    33   <DiskQuotaCmd>quota -s -g -A</DiskQuotaCmd>
    4 <OarParam>
     4<ClusterParam>
     5<ExistenceTest>oarstat</ExistenceTest>
    56  <NbCoreAdvised>16</NbCoreAdvised><!--proposed default number of parallel cores attributed for the computations -->
    67  <NbCoreMax>36</NbCoreMax><!--maximum number of cores allowed for the computations -->
     
    910  <WallTimeMax unit="hour">23</WallTimeMax> <!--maximum allowed time for a job --> 
    1011  <JobStatCmd>oarstat |grep N=UVmat</JobStatCmd> <!--command to know the number of active and waiting job launches--> 
    11 </OarParam>
     12  <LaunchCmdFcn>cluster_command</LaunchCmdFcn> <!--name of the function used to create job launch command-->
     13</ClusterParam>
    1214<SgeParam>
    1315</SgeParam>
Note: See TracChangeset for help on using the changeset viewer.