Re: [AMBER] Amber 14.13 pmemd.cuda: Error: invalid configuration argument launching kernel kPMEReduceChargeGridBuffer from Ross Walker on 2016-08-10 (Amber Archive Aug 2016)

From: Ross Walker <ross.rosswalker.co.uk>
Date: Wed, 10 Aug 2016 20:01:52 -0700

Hi William,

I wouldn't recommend minimizing with the GPU code due to limitations of the SPFP precision model. It's better to minimize with CPU and just do MD on the GPU. Nevertheless the problem would be the same.

The fix made it into Amber 16 but it looks like a patch was never made for Amber 14.

commit 0134305a35541685397b10dda3e01fe07e2f757e
Author: scott legrand <slegrand.amber.(none)>
Date: Mon Nov 17 17:31:55 2014 -0800

    Fix for large systems and removal of some unused code

Here's the full diffs

-------------------------- src/pmemd/src/cuda/kPGS.h --------------------------
index 2b42fc9..de3e70f 100644
.. -70,17 +70,17 .. __shared__ PMEFloat sRecipf[9];
#endif

     // Determine grid offsets
     const int tOffsetX = tgx & 0x03;
     const int tOffsetY = (tgx & 0x07) >> 2;
     const int iOffsetX = tOffsetX;
     const int iOffsetY = tOffsetY;

- unsigned int pos = blockIdx.x * GRADSUMLOADSIZE;
+ unsigned int pos = (blockIdx.x + offset) * GRADSUMLOADSIZE;

     // Read batch of atoms and procedurally generate spline weights
     unsigned int maxatom = min(pos + GRADSUMLOADSIZE, cSim.atoms);
     unsigned int pos1 = pos + threadIdx.x;
     if (pos1 < maxatom)
     {
         PMEFloat charge = cSim.pAtomChargeSP[pos1];
         PMEFloat fx = cSim.pFractX[pos1];

------------------ src/pmemd/src/cuda/kPMEInterpolation.cu -------------------
index 03a9709..f79def8 100644
.. -178,19 +178,19 .. extern "C" void kPMEClearChargeGridBuffer(gpuContext gpu)
}

__global__ void
#if (__CUDA_ARCH__ >= 300)
__launch_bounds__(SM_3X_REDUCEFORCES_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(SM_2X_REDUCEFORCES_THREADS_PER_BLOCK, 1)
#endif
-kPMEReduceChargeGridBuffer_kernel()
+kPMEReduceChargeGridBuffer_kernel(unsigned int offset)
{
- unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
+ unsigned int pos = (blockIdx.x + offset) * blockDim.x + threadIdx.x;

     if (pos < cSim.nfft1xnfft2xnfft3)
     {
         unsigned int iz = pos / (cSim.nfft1xnfft2);
         unsigned int iy = (pos - iz * cSim.nfft1xnfft2) / cSim.nfft1;
         unsigned int ix = pos - iz * cSim.nfft1xnfft2 - iy * cSim.nfft1;
         unsigned int spos = ((ix & 0x3) + (iy & 3) * 4 + (iz & 1) * 16) + (ix >> 2) * 32
                                                 + ((iy >> 2) << 3) * cSim.nfft1 + ((iz >> 1) << 1) * cSim.nfft1xnfft2;
.. -200,19 +200,26 .. kPMEReduceChargeGridBuffer_kernel()
         PMEFloat value1 = (PMEFloat)value * ONEOVERLATTICESCALEF;
         cSim.pXYZ_q[pos] = value1;
     }
}

extern "C" void kPMEReduceChargeGridBuffer(gpuContext gpu)
{
- unsigned int blocks = (gpu->sim.nfft1 * gpu->sim.nfft2 * gpu->sim.nfft3 + 127) >> 7;
- kPMEReduceChargeGridBuffer_kernel<<<blocks, 128>>>();
- LAUNCHERROR("kPMEReduceChargeGridBuffer");
+ int blocks = (gpu->sim.nfft1 * gpu->sim.nfft2 * gpu->sim.nfft3 + 127) >> 7;
+ int offset = 0;
+ while (blocks > 0)
+ {
+ int lblocks = min(blocks, 65535);
+ kPMEReduceChargeGridBuffer_kernel<<<lblocks, 128>>>(offset);
+ LAUNCHERROR("kPMEReduceChargeGridBuffer");
+ offset += 65535;
+ blocks -= 65535;
+ }
}

struct FillChargeGridAtomData
{
     int ix;
     int iy;
     int iz;
     PMEFloat tx[4];
.. -374,28 +381,27 .. __shared__ volatile FillChargeGridAtomData sAtom[LOADSIZE];
     __syncthreads();
}

extern "C" void kPMEInterpolationInitKernels(gpuContext gpu)
{

}

-
extern "C" void kPMEFillChargeGridBuffer(gpuContext gpu)
{
     int blocks = (gpu->sim.atoms + LOADSIZE - 1) / LOADSIZE;
     int offset = 0;
     while (blocks > 0)
     {
- int lblocks = min(blocks, 65536);
+ int lblocks = min(blocks, 65535);
         kPMEFillChargeGridBuffer_kernel<<<lblocks, 64>>>(offset);
         LAUNCHERROR("kPMEFillChargeGridBuffer");
- offset += 65536;
- blocks -= 65536;
+ offset += 65535;
+ blocks -= 65535;
     }
}

#define PME_ENERGY
__global__ void
#if (__CUDA_ARCH__ >= 300)
__launch_bounds__(SM_3X_GENERAL_THREADS_PER_BLOCK, 1)
#else
.. -467,31 +473,31 .. __global__ void
#ifdef use_DPFP
__launch_bounds__(GRADSUMTHREADS, 8)
#else
__launch_bounds__(GRADSUMTHREADS, 16)
#endif
#else
__launch_bounds__(GRADSUMTHREADS, 8)
#endif
-kPMEGradSum64_kernel()
+kPMEGradSum64_kernel(unsigned int offset)
#include "kPGS.h"

#define PME_VIRIAL
__global__ void
#if (__CUDA_ARCH__ >= 300)
#ifdef use_DPFP
__launch_bounds__(GRADSUMTHREADS, 8)
#else
__launch_bounds__(GRADSUMTHREADS, 8)
#endif
#else
__launch_bounds__(GRADSUMTHREADS, 8)
#endif
-kPMEGradSum64Virial_kernel()
+kPMEGradSum64Virial_kernel(unsigned int offset)
#include "kPGS.h"
#undef PME_VIRIAL

extern "C" void kPMEGradSum(gpuContext gpu)
{
     texref.normalized = 0;
     texref.filterMode = cudaFilterModePoint;
.. -509,22 +515,22 .. extern "C" void kPMEGradSum(gpuContext gpu)
#else
     cudaBindTexture(NULL, texref, (PMEFloat*)(gpu->sim.pXYZ_q), gpu->sim.nfft1 * gpu->sim.nfft2 * gpu->sim.nfft3 * sizeof(PMEFloat));
#endif

     int blocks = (gpu->sim.atoms + GRADSUMLOADSIZE - 1) / GRADSUMLOADSIZE;
     int offset = 0;
     while (blocks > 0)
     {
- int lblocks = min(blocks, 65536);
+ int lblocks = min(blocks, 65535);
         if ((gpu->sim.ntp > 0) && (gpu->sim.barostat == 1))
- kPMEGradSum64Virial_kernel<<<lblocks, GRADSUMTHREADS>>>();
+ kPMEGradSum64Virial_kernel<<<lblocks, GRADSUMTHREADS>>>(offset);
         else
- kPMEGradSum64_kernel<<<lblocks, GRADSUMTHREADS>>>();
+ kPMEGradSum64_kernel<<<lblocks, GRADSUMTHREADS>>>(offset);

         LAUNCHERROR("kPMEGradSum");
- blocks -= 65536;
- offset += 65536;
+ blocks -= 65535;
+ offset += 65535;
     }
     cudaUnbindTexture(texref);
}

All the best
Ross

> On Aug 10, 2016, at 2:02 AM, William Lees <william.lees.org.uk> wrote:
>
> Hello,
>
> We are getting the above error when trying to start minimisation of a 630,000 atom structure on a Tesla M2090 GPU using Amber 14 patch 13. The minimisation runs without error on a different system at the same Amber patch level with a TITAN-X card. Output from pmemd.cuda is attached.
>
> I found a thread relating to this error on older cards here: http://archive.ambermd.org/201410/0184.html . Although there is mention there of a patch being issued, I can't find it in the Amber 14 patch list. The key change appears to be a change to the definition of lblocks at around line 400 in amber14/src/pmemd/src/cuda/kPMEInterpolation.cu (I'm finding it a little hard to discern from the thread whether other chanegs are required):
>
> from
>
> int lblocks = min(blocks, 65536);
>
> to
>
> int lblocks = min(blocks, 65535);
>
> On the system in error, the definition has not been changed: it is still min(blocks, 65536).
>
> Did a fix for this problem ever make it in to the official patches?
>
> Many thanks
>
> William
>
> William Lees
> Associate Research Fellow
> Institute of Structural and Molecular Biology
> Birkbeck, University of London
> http://shepherd-group.ismb.lon.ac.uk
>
>
> <cuda error.log>_______________________________________________
> AMBER mailing list
> AMBER.ambermd.org
> http://lists.ambermd.org/mailman/listinfo/amber

_______________________________________________
AMBER mailing list
AMBER.ambermd.org
http://lists.ambermd.org/mailman/listinfo/amber
Received on Wed Aug 10 2016 - 20:30:02 PDT