#!/bin/csh
#
#  last update = 26 March 2012
#
#  This is a C-shell script to execute GAMESS, by typing
#       rungms JOB VERNO NCPUS >& JOB.log &
#  JOB    is the name of the 'JOB.inp' file to be executed,
#  VERNO  is the number of the executable you chose at 'lked' time,
#  NCPUS  is the number of processors to be used.
#
#  Unfortunately, execution is harder to standardize than compiling,
#  so you have to do a bit more than name your machine type here:
#
#    a) choose the target for execution from the following list:
#           sockets, mpi, altix, cray-xt, ibm64-sp, sgi64
#       IBM Blue Gene uses separate execution files: ~/gamess/machines/ibm-bg
#
#       Choose "sockets" if your compile time target was any of these:
#            axp64, hpux32, hpux64, ibm32, ibm64, linux32,
#            mac32, mac64, sgi32, sun32, sun64
#       as all of these systems use TCP/IP sockets.  Do not name your
#       specific compile time target, instead choose "sockets".
#
#       If your target was 'linux64', you may choose "sockets" or "mpi",
#       according to how you chose to compile.  The MPI example below
#       should be carefully matched against info found in 'readme.ddi'!
#
#       Search on the words typed in capital letters just below
#       in order to find the right place to choose each one:
#    b) choose a directory SCR where large temporary files can reside.
#       This should be the fastest possible disk access, very spacious,
#       and almost certainly a local disk.
#       Translation: do not put these files on a slow network file system!
#    c) choose a directory USERSCR on the file server where small ASCII
#       supplementary output files should be directed.
#       Translation: it is OK to put this on a network file system!
#    d) name the location GMSPATH of your GAMESS binary.
#    e) change the VERNO default to the version number you chose when
#       running "lked", and maybe NCPUS' default.
#    f) make sure that the ERICFMT file name and MCPPATH pathname point to
#       your file server's GAMESS tree, so that all runs can find them.
#       Again, a network file system is quite OK for these two.
#    g) customize the execution section for your target below;
#       each target has its own list of further requirements.
#    h) it is unwise to have every user take a copy of this script, as you
#       can *NEVER* update all the copies later on.  Instead, it is better
#       to ask other users to create an alias pointing to a common script,
#       such as this, in their C-shell .login file:
#             alias gms '/share/apps/gamess/rungms'
#    i) it is entirely possible to make 'rungms' run in a batch queue,
#       be it PBS, DQS, et cetera.  This is so installation dependent
#       that we leave it up to you, although we give examples.
#       See ~/gamess/tools, where there are two examples of "front-end"
#       scripts which can use this file as the "back-end" actual job.
#       We use the front-end "gms" on local Infiniband clusters using
#       both Sun Grid Engine (SGE) and Portable Batch System (PBS).
#       See also a very old LoadLeveler "ll-gms" for some IBM systems.
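#
#  To illustrate the usage line above (assuming the executable was
#  linked as version 01), the standard test case tests/standard/exam01.inp
#  could be run on 4 cores by typing
#       rungms exam01 01 4 >& exam01.log &
#  A 4th argument, processors per node, matters only to the MPI target
#  below, e.g. 'rungms exam01 01 4 4' on a single 4-core node.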
#
set TARGET=mpi
set SCR=/local_scratch/$USER
set USERSCR=/scratch/$USER
set GMSPATH=/share/apps/gamess
#
set JOB=$1      # name of the input file xxx.inp, give only the xxx part
set VERNO=$2    # revision number of the executable created by 'lked' step
set NCPUS=$3    # number of compute processes to be run
#
#  provide defaults if the last two arguments are not given to this script
if (null$VERNO == null) set VERNO=01
if (null$NCPUS == null) set NCPUS=1
#
#  ---- the top third of the script is input and other file assignments ----
#
echo "----- GAMESS execution script 'rungms' -----"
set master=`hostname`
echo This job is running on host $master
echo under operating system `uname` at `date`
#
#  The batch scheduler, if any, should provide its own working directory,
#  on every assigned node (if not, modify the scheduler's prolog script).
#  The SCHED variable, and the scheduler-assigned work space, is used
#  below only in the MPI section.  See that part for more info.
#
set SCHED=none
#      this cluster always runs under PBS:
set SCHED=PBS
#if ($?PBS_O_LOGNAME) set SCHED=PBS
#if ($?SGE_O_LOGNAME) set SCHED=SGE
#if ($SCHED == SGE) then
#   set SCR=$TMPDIR
#   echo "SGE has assigned the following compute nodes to this run:"
#   uniq $TMPDIR/machines
#endif
if ($SCHED == PBS) then
   #set SCR=/scratch/$USER/$PBS_JOBID
   #mkdir -p $SCR
   echo "PBS has assigned the following compute nodes to this run:"
   uniq $PBS_NODEFILE
endif
#
echo "Available scratch disk space (Kbyte units) at beginning of the job is"
df -k $SCR
echo "GAMESS temporary binary files will be written to $SCR"
echo "GAMESS supplementary output files will be written to $USERSCR"

#  this was added as an experiment, February 2007
#  its intent is to detect large arrays allocated off the stack
limit stacksize 8192

#  Grab a copy of the input file.
#  In the case of examNN jobs, the file is in the tests/standard subdirectory.
#  In the case of exam-vbNN jobs, the file is in vb2000's tests subdirectory.
if ($JOB:r.inp == $JOB) set JOB=$JOB:r      # strip off possible .inp
echo "Copying input file $JOB.inp to your run's scratch directory..."
if (-e $JOB.inp) then
   set echo
   cp  $JOB.inp  $SCR/$JOB.F05
   unset echo
else
   if (-e tests/standard/$JOB.inp) then
      set echo
      cp  tests/standard/$JOB.inp  $SCR/$JOB.F05
      unset echo
   else
      if (-e tests/$JOB.inp) then
         set echo
         cp  tests/$JOB.inp  $SCR/$JOB.F05
         unset echo
      else
         echo "Input file $JOB.inp does not exist."
         echo "This job expected the input file to be in directory `pwd`"
         echo "Please fix your file name problem, and resubmit."
         exit 4
      endif
   endif
endif

#  define many environment variables setting up file names.
#  anything can be overridden by a user's own choice, read 2nd.
source $GMSPATH/gms-files.csh
if (-e $HOME/.gmsrc) then
   echo "reading your own $HOME/.gmsrc"
   source $HOME/.gmsrc
endif
#
#  If a $GDDI input group is present, the calculation will be using
#  subgroups within DDI (the input NGROUP=0 means this isn't GDDI).
#
#  The master within each group must have a copy of INPUT, which is
#  dealt with below (prior to execution), once we know something about
#  the host names where INPUT is required.  The INPUT does not have
#  the global rank appended to its name, unlike all other files.
#
#  OUTPUT and PUNCH (and perhaps many other files) are opened on all
#  processes (not just the master in each subgroup), but unique names
#  will be generated by appending the global ranks.  Note that OUTPUT
#  is not opened by the master in the first group, but is used by all
#  other groups.  Typically, the OUTPUT from the first group's master
#  is the only one worth saving, unless perhaps the run crashes out.
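#
#  For reference, a GDDI run is requested by an input group such as
#  (the group count of 4 is just a hypothetical illustration)
#        $GDDI NGROUP=4 $END
#  which the grep just below detects; NGROUP=0 means a plain run.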
#
#  The other files that GDDI runs might use are already defined above.
#
set ngddi=`grep -i '^ \$GDDI' $SCR/$JOB.F05 | grep -iv 'NGROUP=0 ' | wc -l`
if ($ngddi > 0) then
   set GDDIjob=true
   echo "This is a GDDI run, keeping various output files on local disks"
   set echo
   setenv OUTPUT $SCR/$JOB.F06
   setenv PUNCH  $SCR/$JOB.F07
   unset echo
else
   set GDDIjob=false
endif

#  data left over from a previous run might be precious, stop if found.
if ((-e $PUNCH) || (-e $MAKEFP) || (-e $TRAJECT) || (-e $RESTART)) then
   echo "Please save, rename, or erase these files from a previous run:"
   echo "     $PUNCH,"
   echo "     $TRAJECT,"
   echo "     $RESTART, and/or"
   echo "     $MAKEFP,"
   echo "and then resubmit this computation."
   exit 4
endif

#  ---- the middle third of the script is to execute GAMESS ----
#
#  We show execution sections that should work for
#       sockets, mpi, altix, cray-xt, ibm64-sp, sgi64
#  and then two others,
#       cray-x1, necsx
#  which are not mentioned at the top of this file, as they are quite stale.
#
#  Most workstations run DDI over TCP/IP sockets, and therefore execute
#  according to the following clause.  The installer must
#      a) set the path to point to the DDIKICK and GAMESS executables.
#      b) build the HOSTLIST variable as a word separated string, i.e. ()'s.
#         There should be one host name for every compute process that is
#         to be run.  DDIKICK will automatically generate a set of data
#         server processes (if required) on the same hosts.
#  An extended explanation of the arguments to ddikick.x can be found
#  in the file gamess/ddi/readme.ddi, if you have any trouble executing.
#
if ($TARGET == sockets) then
#
#       Adjust the path pointing to the GAMESS and DDIKICK binaries.
#       The default path to GAMESS was already set above!
#       At Iowa State, we have many operating systems, and store files
#       in different partitions according to which system is being used.
#       The other nodes have a separate directory for each machine,
#       based on their host names.
#       There is a special compilation for IBM AIX pSeries p4+ (uname AIX).
#
#          -- some special settings for certain operating systems --
   set os=`uname`
#       IBM's AIX needs a special setting if the node is more than a 4-way SMP
   if ($os == AIX) setenv EXTSHM ON
#       Fedora Core 1 can't run DDI processes w/o placing a finite
#       but large limit on the stack size (2**27 bytes seems OK)
   if ($os == Linux) limit stacksize 131072
#       In case this Linux system is using Intel's Math Kernel Library
#       to obtain its BLAS, we insist each process runs single-threaded.
#       One variable is for MKL up to 9, the other from 10 on up.
   if ($os == Linux) setenv MKL_SERIAL YES
   if ($os == Linux) setenv MKL_NUM_THREADS 1

#       It is unlikely that you would need to change DDI_VER from 'new'!
#       Some antique system lacking pthreads, for example, might have
#       to use the old DDI code, so we keep an execution example below.
   set DDI_VER='new'
#
#       Six examples of how to build the HOSTLIST are shown....
#       terminology:  CPU = processor core,
#                    NODE = physical enclosure (box/blade)
#
#   1. Sequential execution is sure to be on this very same host
   if ($NCPUS == 1) then
      set NNODES=1
      set HOSTLIST=(`hostname`)
   endif
#
#   2. This is an example of how to run on a multi-core SMP enclosure,
#      where all CPUs (aka COREs) are inside a -single- NODE.
#      At other locations, you may wish to consider some of the examples
#      that follow below, after commenting out this ISU specific part.
   if ($NCPUS > 1) then
      set NNODES=1
      set HOSTLIST=(`hostname`:cpus=16)
   endif
#  if ($NCPUS > 1) then
#     set NNODES=8
#     set HOSTLIST=(compute3013:cpus=16 compute3014:cpus=16 compute3015:cpus=16 compute3016:cpus=16 compute3017:cpus=16 compute3018:cpus=16 compute3019:cpus=16 compute3020:cpus=16)
#  endif
#
#   3. How to run in a single computer, namely the "localhost", so
#      this computer needn't have a proper Internet name.
#      This example also presumes SysV was deliberately *not* chosen
#      when DDI was compiled, so that host names have to be repeated,
#      instead of using the simpler localhost:cpus=$NCPUS form.
#
#      This example is appropriate for use with the pre-compiled
#      Apple binary from our web site, provided it is uncommented,
#      and the passage #2 just above is deleted or commented out.
#
#      set HOSTLIST=()
#      @ n=1
#      while ($n <= $NCPUS)
#         set HOSTLIST=($HOSTLIST localhost)
#         @ n++
#      end
#      set NNODES=$NCPUS
#
#   4. A phony example, of four dual processors (arbitrary names).
#      Since their names never change, we can just specify them.
#      Note that we can use a short name like 'bb' if and only if
#      system name resolution can map them onto the true host names.
#
#   6. Scalable Computing Lab's clusters running PBS batch queues.
#      Here the task is to manipulate the dynamically assigned host
#      names into the HOSTLIST string for the kickoff program,
#      and to request the host name of the fast network adapters.
#
   if ($?PBS_JOBID) then
#         The IBM cluster has two Gigabit adapters in each 4-way SMP,
#         while the AXP cluster is based on a Myrinet network.
#         Repeated host names in the PBS host file indicate being assigned
#         CPUs in the same SMP enclosure, which we must count up correctly.
#         Fortunately PBS gives duplicate host names in a row, not scrambled.
#         The number of hosts in the PBS node file (nmax) should equal the
#         requested processor count, NCPUS.  We need to count duplicates
#         in order to learn the number of SMP enclosures, NNODES, and how
#         many CPUs inside each SMP were assigned (NSMPCPU).  For example,
#         if we are assigned the host names "a a a b b c c c" we must build
#         the string "a:cpus=3 b:cpus=2 c:cpus=3" so that ddikick.x will
#         know the SMP structure of the assigned node names.  (C-shell
#         gracefully handles variable substitution followed by a colon,
#         as in ${HOST}:cpus.)
      set HOSTLIST=()
      set nmax=`wc -l $PBS_NODEFILE`
      set nmax=$nmax[1]
      if ($nmax != $NCPUS) then
         echo There is processor count confusion
         exit
      endif
#         1st host in the list is sure to be a new SMP enclosure
      set MYNODE=`sed -n -e "1 p" $PBS_NODEFILE`
      set MYNODE=`echo $MYNODE | awk '{split($0,a,"."); print a[1]}'`
#         IPROC counts assigned processors (up to NCPUS),
#         NNODES counts the number of SMP enclosures,
#         NSMPCPU counts processors in the current SMP enclosure.
      @ IPROC = 2
      @ NNODES = 1
      @ NSMPCPU = 1
      set spacer1=":cpus="
      set spacer2=":netext="
#         (note: NETEXT is not set in this script; define it for your site.)
      while ($IPROC <= $nmax)
         set MYPROC=`sed -n -e "$IPROC p" $PBS_NODEFILE`
         set MYPROC=`echo $MYPROC | awk '{split($0,a,"."); print a[1]}'`
         if ($MYPROC != $MYNODE) then
            set HOSTLIST = ($HOSTLIST $MYNODE$spacer1$NSMPCPU$spacer2$NETEXT)
            set MYNODE=$MYPROC
            @ NSMPCPU = 0
            @ NNODES++
         endif
         @ IPROC++
         @ NSMPCPU++
      end
      set HOSTLIST = ($HOSTLIST $MYNODE$spacer1$NSMPCPU$spacer2$NETEXT)
   endif
#
#  we have now finished setting up a correct HOSTLIST.
#  uncomment the next two if you are doing script debugging.
#--echo "The generated host list is"
#--echo $HOSTLIST
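#
#  As a worked example with hypothetical host names: if PBS assigned
#  "a a a b b c c c", the loop above finishes with NNODES=3 and
#       HOSTLIST=(a:cpus=3:netext=... b:cpus=2:netext=... c:cpus=3:netext=...)
#  where the netext suffixes come from your site's NETEXT setting.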
#
#     Choose the remote shell execution program.
#     Parallel runs do the initial launch of GAMESS on remote nodes by
#     the following program.  Note that the authentication keys for ssh
#     must have been set up correctly.
#     If you wish, choose 'rsh' using .rhosts authentication on the next line.
   setenv DDI_RSH ssh
#
   if ($DDI_RSH == ssh) then
      setenv DDI_RCP scp
   else
      setenv DDI_RCP rcp
   endif

#     One way to be sure that the master node of each subgroup
#     has its necessary copy of the input file is to stuff a
#     copy of the input file onto every single node right here.
   if ($GDDIjob == true) then
      @ n=2   # master in master group already did 'cp' above
      while ($n <= $NNODES)
         set host=$HOSTLIST[$n]
         set host=`echo $host | cut -f 1 -d :`   # drop anything behind a colon
         echo $DDI_RCP $SCR/$JOB.F05 ${host}:$SCR/$JOB.F05
         $DDI_RCP $SCR/$JOB.F05 ${host}:$SCR/$JOB.F05
         @ n++
      end
   endif
#
#     Just make sure we have the binaries, before we try to run.
#
   if ((-x $GMSPATH/gamess.$VERNO.x) && (-x $GMSPATH/ddikick.x)) then
   else
      echo The GAMESS executable gamess.$VERNO.x or else
      echo the DDIKICK executable ddikick.x
      echo could not be found in directory $GMSPATH,
      echo or else they did not properly link to executable permission.
      exit 8
   endif
#
#     OK, now we are ready to execute!
#     The kickoff program initiates GAMESS process(es) on all CPUs/nodes.
#
   if ($DDI_VER == new) then
      set echo
      $GMSPATH/ddikick.x $GMSPATH/gamess.$VERNO.x $JOB \
          -ddi $NNODES $NCPUS $HOSTLIST \
          -scr $SCR < /dev/null
      unset echo
   else
      set path=($GMSPATH $path)
      set echo
      ddikick.x $JOB $GMSPATH gamess.$VERNO.x $SCR $NCPUS $HOSTLIST < /dev/null
      unset echo
   endif
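#
#     For orientation only (a hypothetical single-node, 4-core run with
#     VERNO=01, JOB=exam01, and HOSTLIST=(node1:cpus=4)), the 'new'
#     style kickoff above expands to something like
#        /share/apps/gamess/ddikick.x /share/apps/gamess/gamess.01.x exam01 \
#            -ddi 1 4 node1:cpus=4 -scr /local_scratch/$USER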
endif
#                ------ end of the TCP/IP socket execution section ------

#                          - a typical MPI example -
#
#  This section is customized to two possible MPI libraries:
#     Intel MPI (default), or MVAPICH2 (commented out).
#  We do not know the tunings to use openMPI correctly!!!
#  This section is customized to two possible batch schedulers:
#     Sun Grid Engine (SGE), or Portable Batch System (PBS),
#  used on two different Infiniband-based Rocks clusters.
#
#  See ~/gamess/tools/gms, which is a front-end script to submit
#  this file 'rungms' as a back-end script, to either scheduler.
#
#  If you are using some other MPI:
#     See ~/gamess/ddi/readme.ddi for information about launching
#     processes using other MPI libraries (each may be different).
#     Again: we do not know how to run openMPI effectively.
#
#  If you are using some other batch scheduler:
#     Illustrating other batch schedulers' ways of providing the
#     hostname list is considered beyond the scope of this script.
#     Suffice it to say that
#        a) you will be given hostnames at run time,
#        b) a typical way is a disk file, named by an environment
#           variable, containing the names in some format,
#        c) another typical way is a blank-separated list in some
#           environment variable.
#     Either way, whatever the batch scheduler gives you must be
#     sliced-and-diced into the format required by your MPI kickoff.
#
if ($TARGET == mpi) then
#
#        Besides the usual three arguments to 'rungms' (see top),
#        we'll pass in a "processors per node" value; that is,
#        all nodes are presumed to have equal numbers of cores.
#
#        When using this on the Infiniband clusters at Iowa State,
#        just pick target=mpi, the version number of the iMPI,
#        and a kickoff style.
#
   set PPN=$4
#
#        Allow for compute processes and data servers (one pair per core).
#        Note that NCPUS = #cores, and NPROCS = #MPI processes.
#
   @ NPROCS = $NCPUS + $NCPUS
#
#        Argonne's MPICH2 offers two possible kick-off procedures,
#        guided by two disk files (A and B).
#
#        Kickoff procedure #1 uses mpd daemons, which potentially collide
#        if the same user runs multiple jobs that end up on the same nodes.
#        This is called "3steps" here because three commands (mpdboot,
#        mpiexec, mpdallexit) are needed to run.
#        Kickoff procedure #2 is said to be faster, and involves only
#        one command (mpiexec.hydra), so it is called "hydra" here.
#
#        Note that "mpirun" is not shown, since it cannot understand
#        how to place a c.p. and a d.s. pair on a single core.
#
#        Other MPI implementations are often derived from Argonne's,
#        and so often offer these two styles.
#        For example, iMPI and MVAPICH2 can choose "3steps",
#        while iMPI (as of version 4.0) can choose "hydra".
#
   set MPI_KICKOFF_STYLE=hydra
#
#     A. build HOSTFILE.
#        This is always used below during file cleaning, and while
#        creating the PROCFILE at step B, so we always make it.
#        This file is explicitly used by the "3steps" initiation.
#
   setenv HOSTFILE $SCR/$JOB.nodes.mpd
   if (-e $HOSTFILE) rm $HOSTFILE
   touch $HOSTFILE
#
   if ($NCPUS == 1) then
#         Serial run must be on this node itself!
      echo `hostname` >> $HOSTFILE
      set NNODES=1
   else
#         Parallel run gets node names from the scheduler's assigned list:
      if ($SCHED == SGE) then
         uniq $TMPDIR/machines $HOSTFILE
         set NNODES=`wc -l $HOSTFILE`
         set NNODES=$NNODES[1]
      endif
      if ($SCHED == PBS) then
         uniq $PBS_NODEFILE $HOSTFILE
         set NNODES=`wc -l $HOSTFILE`
         set NNODES=$NNODES[1]
      endif
   endif
#        these debug echoes may be deleted once you are done setting up...
   echo '-----debug----'
   echo HOSTFILE $HOSTFILE contains
   cat $HOSTFILE
   echo '--------------'
#
#     B. the next file forces explicit "which process on what node" rules.
#        The contents depend on the kickoff style.  This file is how
#        we tell MPI to double-book the cores with two processes,
#        thus accounting for both compute processes and data servers.
#
   setenv PROCFILE $SCR/$JOB.processes.mpd
   if (-e $PROCFILE) rm $PROCFILE
   touch $PROCFILE
   switch ($MPI_KICKOFF_STYLE)

   case 3steps:
      if ($NCPUS == 1) then
         echo "-n $NPROCS -host `hostname` /share/apps/gamess/gamess.$VERNO.x" >> $PROCFILE
      else
         if ($NNODES == 1) then
#              when all processes are inside a single node, it is simple!
#              all MPI processes, whether compute processes or data servers,
#              are just in this node.  (note: NPROCS = 2*NCPUS!)
            echo "-n $NPROCS -host `hostname` /share/apps/gamess/gamess.$VERNO.x" >> $PROCFILE
         else
#              For more than one node, we want PPN compute processes on
#              each node, and of course, PPN data servers on each.
#              Hence, PPN2 is doubled up.
#              The front-end script 'gms' is responsible for ensuring that
#              NCPUS is a multiple of PPN, and that PPN is less than or
#              equal to the actual number of cores in the node.
            @ PPN2 = $PPN + $PPN
            @ n=1
            while ($n <= $NNODES)
               set host=`sed -n -e "$n p" $HOSTFILE`
               set host=$host[1]
               echo "-n $PPN2 -host $host /share/apps/gamess/gamess.$VERNO.x" >> $PROCFILE
               @ n++
            end
         endif
      endif
      breaksw

   case hydra:
      if ($NNODES == 1) then
#              when all processes are inside a single node, it is simple!
#              all MPI processes, whether compute processes or data servers,
#              are just in this node.  (note: NPROCS = 2*NCPUS!)
         @ PPN2 = $PPN + $PPN
         echo "`hostname`:$NPROCS" > $PROCFILE
      else
#              For more than one node, we want PPN compute processes on
#              each node, and of course, PPN data servers on each.
#              Hence, PPN2 is doubled up.
#              The front-end script 'gms' is responsible for ensuring that
#              NCPUS is a multiple of PPN, and that PPN is less than or
#              equal to the actual number of cores in the node.
         @ PPN2 = $PPN + $PPN
         @ n=1
         while ($n <= $NNODES)
            set host=`sed -n -e "$n p" $HOSTFILE`
            set host=$host[1]
            echo "${host}:$PPN2" >> $PROCFILE
            @ n++
         end
      endif
      breaksw

   endsw
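#
#        As an illustration with hypothetical node names: a 2-node run
#        with PPN=8 and NCPUS=16 (so NPROCS=32 and PPN2=16), assuming
#        VERNO=01, would generate a "3steps" PROCFILE containing
#           -n 16 -host node1 /share/apps/gamess/gamess.01.x
#           -n 16 -host node2 /share/apps/gamess/gamess.01.x
#        or a "hydra" PROCFILE containing
#           node1:16
#           node2:16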
#        these debug echoes may be deleted once you are done setting up...
   echo '-----debug----'
   echo PROCFILE $PROCFILE contains
   cat $PROCFILE
   echo '--------------'
#
#        add Intel MPI to the library path and to the execution path.
#        the version number might be 3.2.1, 4.0.1.007, 4.0.2.003, or ...
#
   #setenv LD_LIBRARY_PATH /opt/intel/impi/4.0.2.003/lib64:$LD_LIBRARY_PATH
   #set path=(/opt/intel/impi/4.0.2.003/bin64 $path)
#
#        add MVAPICH2 to the execution path
   #--set path=(/usr/mpi/gcc/mvapich2-1.2p1/bin $path)
#
#        The tunings below are specific to Intel MPI 3.2 and/or 4.0:
#           a very important option avoids polling for incoming messages,
#           which allows us to compile DDI in pure "mpi" mode,
#           and get sleeping data servers if the run is at the SCF level.
#           Trial and error showed process pinning slows down GAMESS runs.
#           Set the debug option to 2 or to 5 to see messages while kicking off;
#           set the statistics option to 1 or 2 to collect messaging info.
#           iMPI 3.2 allows specification of a DAPL 2.0 library;
#           iMPI 4.0 defaults the fabric to shm,dapl: forcing dapl only is faster.
#
   set echo
   setenv I_MPI_WAIT_MODE enable
   setenv I_MPI_PIN disable
   setenv I_MPI_DEBUG 0
   setenv I_MPI_STATS 0
   setenv I_MPI_FABRICS dapl
   setenv I_MPI_DAT_LIBRARY libdat2.so
#        in case someone wants to try the "tag matching interface",
#        an option which unfortunately ignores the WAIT_MODE in 4.0.2!
   #--setenv I_MPI_FABRICS tmi
   #--setenv I_MPI_TMI_LIBRARY libtmi.so
   #--setenv I_MPI_TMI_PROVIDER psm
   #--setenv TMI_CONFIG /share/apps/intel/impi/4.0.2.003/intel64/etc/tmi.conf
   unset echo
#
#        similar tunings for MVAPICH2 are
   setenv MV2_USE_BLOCKING 1
   setenv MV2_ENABLE_AFFINITY 0
#
#        ... thus ends setting up the process initiation,
#        tunings, pathnames, and library paths for the MPI.
#
#        Set up the Intel MKL (math kernel library).  As of version 10,
#        GAMESS links MKL statically, and also for single-threaded
#        execution.  So, you probably don't need to sweat this...
   #--setenv LD_LIBRARY_PATH /opt/intel/mkl/10.0.3.020/lib/em64t
   #--setenv LD_LIBRARY_PATH /opt/intel/composerxe-2011.1.107/mkl/lib/intel64
   #--setenv MKL_SERIAL YES
   #--setenv MKL_NUM_THREADS 1
#
#        Set up Fragment MO runs (or the other runs exploiting subgroups).
#        One way to be sure that the master node of each subgroup
#        has its necessary copy of the input file is to stuff a
#        copy of the input file onto every single node right here.
   if ($GDDIjob == true) then
      setenv DDI_RCP scp
      set nmax=`wc -l $HOSTFILE`
      set nmax=$nmax[1]
      set lasthost=$master
      @ n=2   # input has already been copied into the master node.
      while ($n <= $nmax)
         set host=`sed -n -e "$n p" $HOSTFILE`
         set host=$host[1]
         if ($host != $lasthost) then
            echo $DDI_RCP $SCR/$JOB.F05 ${host}:$SCR/$JOB.F05
            $DDI_RCP $SCR/$JOB.F05 ${host}:$SCR/$JOB.F05
            set lasthost=$host
         endif
         @ n++
      end
   endif
#
#        The next part pertains to the thread count in case GAMESS has
#        been linked to "libcchem" (see that program's readme file);
#        otherwise, these settings will just be ignored.
#
#        The profiling variable inserts extra timing output into the
#        log files.  Omit it, if you like.
#
#        "libcchem" controls the CPU thread count by NUM_THREADS,
#        which defaults to the number of CPU cores, if not given.
#        Note that the number of CPU threads doing quantum chemistry
#        will always be one less than NUM_THREADS, as one core is
#        required to do GPU/CPU management things, except if the
#        number of cores is just 1 (of course it runs 1 thread then).
#
#        "libcchem" controls the GPU devices used by GPU_DEVICES,
#        which is a list of the devices to be used.  If not given,
#        which is safest, all available GPUs will be detected and used.
#
#        You will need to put a path to your CUDA run time libraries here.
#
#        Finally, note that the front-end 'gms' script must edit
#        the string 'NUMGPU=0' to the desired number, after which
#        the if clause will actually be executed!
#        Or, hardwire this to non-zero if all your nodes own GPUs.
#        Setting GPU_DEVICES to -1 runs CPU threads only.
#
   @ NUMGPU=0
   if ($NUMGPU > 0) then
      @ NUMCPU = $PPN - 1
      echo libcchem kernels will use $NUMCPU cores and $NUMGPU GPUs per node...
      set echo
      setenv CCHEM_PROFILE 1
      setenv NUM_THREADS $PPN
      #--if ($NUMGPU == 0) setenv GPU_DEVICES -1
      #--if ($NUMGPU == 2) setenv GPU_DEVICES 0,1
      #--if ($NUMGPU == 4) setenv GPU_DEVICES 0,1,2,3
      setenv LD_LIBRARY_PATH /share/apps/cuda/lib64:$LD_LIBRARY_PATH
      ###### LD_LIBRARY_PATH /usr/local/cuda/lib64:$LD_LIBRARY_PATH
      unset echo
   else
      setenv GPU_DEVICES -1
   endif
#
#        Now, at last, we can actually kick off the MPI processes...
#
   echo "MPI kickoff will run GAMESS on $NCPUS cores in $NNODES nodes."
   echo "The binary to be executed is /share/apps/gamess/gamess.$VERNO.x"
   echo "MPI will run $NCPUS compute processes and $NCPUS data servers,"
   echo "    placing $PPN of each process type onto each node."
   echo "The scratch disk space on each node is $SCR, with free space"
   df -k $SCR
#
   chdir $SCR
#
   switch ($MPI_KICKOFF_STYLE)

   case 3steps:
#
#          a) bring up a 'ring' of MPI daemons
#
      set echo
      mpdboot --rsh=ssh -n $NNODES -f $HOSTFILE
#
#          b) kick off the compute processes and the data servers
#
      mpiexec -configfile $PROCFILE < /dev/null
#
#          c) shut down the 'ring' of MPI daemons
#
      mpdallexit
      unset echo
      breaksw
#
   case hydra:
      set echo
      setenv I_MPI_HYDRA_ENV all
      setenv I_MPI_PERHOST $PPN2
      mpiexec.hydra -f $PROCFILE -n $NPROCS \
            /share/apps/gamess/gamess.$VERNO.x < /dev/null
      unset echo
      breaksw

   default:
      echo No valid MPI startup procedure was chosen.
      exit
   endsw
#
#        keep HOSTFILE, as it is passed to the file erasing step below
   rm -f $PROCFILE
#
endif
#                  ------ end of the MPI execution section ------
#
#  ---- the bottom third of the script is to clean up all disk files ----
#  It is quite useful to display to users how big the disk files got to be.
#
echo ----- accounting info -----
#
#   in the case of GDDI runs, we save the first PUNCH file only.
#   If something goes wrong, the .F06.00x, .F07.00x, ... from the
#   other groups are potentially interesting to look at.
if ($GDDIjob == true) cp $SCR/$JOB.F07 $USERSCR/$JOB.dat
#
#   Clean up the master's scratch directory.
#
echo Files used on the master node $master were:
ls -lF $SCR/$JOB.*
rm -f  $SCR/$JOB.F*
#
#   Clean/rescue any files created by the VB2000 plug-in
if (-e $SCR/$JOB.V84)    mv $SCR/$JOB.V84 $USERSCR
if (-e $SCR/$JOB.V80)    rm -f $SCR/$JOB.V*
if (-e $SCR/$JOB.TEMP02) rm -f $SCR/$JOB.TEMP*
if (-e $SCR/$JOB.orb)    mv $SCR/$JOB.orb  $USERSCR
if (-e $SCR/$JOB.vec)    mv $SCR/$JOB.vec  $USERSCR
if (-e $SCR/$JOB.mol)    mv $SCR/$JOB.mol  $USERSCR
if (-e $SCR/$JOB.molf)   mv $SCR/$JOB.molf $USERSCR
if (-e $SCR/$JOB.mkl)    mv $SCR/$JOB.mkl  $USERSCR
if (-e $SCR/$JOB.xyz)    mv $SCR/$JOB.xyz  $USERSCR
ls $SCR/${JOB}-*.cube > $SCR/${JOB}.lis
if (! -z $SCR/${JOB}.lis) mv $SCR/${JOB}*.cube $USERSCR
rm -f $SCR/${JOB}.lis
ls $SCR/${JOB}-*.grd > $SCR/${JOB}.lis
if (! -z $SCR/${JOB}.lis) mv $SCR/${JOB}*.grd $USERSCR
rm -f $SCR/${JOB}.lis
ls $SCR/${JOB}-*.csv > $SCR/${JOB}.lis
if (! -z $SCR/${JOB}.lis) mv $SCR/${JOB}*.csv $USERSCR
rm -f $SCR/${JOB}.lis
#
#   Clean up the scratch directories of the remote nodes.
#
#   This may not be necessary, e.g. on a T3E where all files are in the
#   same directory, and just got cleaned out by the previous 'rm'.
#   Many batch queue managers provide cleaning out of scratch directories.
#   It still may be interesting to the user to see the sizes of the files.
#
#   The 'lasthost' business prevents multiple cleanup tries on SMP nodes.
#
#   This particular example is for the combination iMPI, with SGE or PBS.
#   We have inherited a file of unique node names from above.
#   There is an option to rescue the output files from group DDI runs,
#   such as FMO, in case you need to see the other groups' outputs.
if ($TARGET == mpi) then
   set nnodes=`wc -l $HOSTFILE`
   set nnodes=$nnodes[1]
   @ n=1
   set master=`hostname`
#      burn off the .local suffix in our cluster's hostname
   set master=$master:r
   while ($n <= $nnodes)
      set host=`sed -n -e "$n p" $HOSTFILE`
#         in case of openMPI, unwanted stuff may follow the hostname
      set host=$host[1]
      if ($host != $master) then
         echo Files used on node $host were:
#---------FMO rescue------
#--      if ($GDDIjob == true) then
#--         echo "========= OUTPUT from node $host is =============="
#--         ssh $host -l $USER "cat $SCR/$JOB.F06*"
#--      endif
#---------FMO rescue------
         ssh $host -l $USER "ls -l $SCR/$JOB.*"
         ssh $host -l $USER "rm -f $SCR/$JOB.*"
      endif
      @ n++
   end
#      clean off the last file on the master's scratch disk.
   rm -f $HOSTFILE
#
   if ($?I_MPI_STATS) then
      if ($I_MPI_STATS > 0) mv $SCR/stats.txt ~/$JOB.$NCPUS.stats
   endif
endif
#
#  and this is the end
#
date
time
exit