Re: [AMBER] Amber 14.13 pmemd.cuda: Error: invalid configuration argument launching kernel kPMEReduceChargeGridBuffer

From: William Lees <william.lees.org.uk>
Date: Fri, 12 Aug 2016 14:19:05 +0100

Hi Ross,

Thanks very much for the advice and the patches. Just thought I'd
confirm that the patch has fixed the problem.

All the best

William


On 11/08/2016 04:01, Ross Walker wrote:
> Hi William,
>
> I wouldn't recommend minimizing with the GPU code due to limitations of the SPFP precision model. It's better to minimize with CPU and just do MD on the GPU. Nevertheless the problem would be the same.
>
> The fix made it into Amber 16 but it looks like a patch was never made for Amber 14.
>
> commit 0134305a35541685397b10dda3e01fe07e2f757e
> Author: scott legrand <slegrand.amber.(none)>
> Date: Mon Nov 17 17:31:55 2014 -0800
>
> Fix for large systems and removal of some unused code
>
>
> Here's the full diffs
>
> -------------------------- src/pmemd/src/cuda/kPGS.h --------------------------
> index 2b42fc9..de3e70f 100644
> @@ -70,17 +70,17 @@ __shared__ PMEFloat sRecipf[9];
> #endif
>
> // Determine grid offsets
> const int tOffsetX = tgx & 0x03;
> const int tOffsetY = (tgx & 0x07) >> 2;
> const int iOffsetX = tOffsetX;
> const int iOffsetY = tOffsetY;
>
> - unsigned int pos = blockIdx.x * GRADSUMLOADSIZE;
> + unsigned int pos = (blockIdx.x + offset) * GRADSUMLOADSIZE;
>
> // Read batch of atoms and procedurally generate spline weights
> unsigned int maxatom = min(pos + GRADSUMLOADSIZE, cSim.atoms);
> unsigned int pos1 = pos + threadIdx.x;
> if (pos1 < maxatom)
> {
> PMEFloat charge = cSim.pAtomChargeSP[pos1];
> PMEFloat fx = cSim.pFractX[pos1];
>
> ------------------ src/pmemd/src/cuda/kPMEInterpolation.cu -------------------
> index 03a9709..f79def8 100644
> @@ -178,19 +178,19 @@ extern "C" void kPMEClearChargeGridBuffer(gpuContext gpu)
> }
>
> __global__ void
> #if (__CUDA_ARCH__ >= 300)
> __launch_bounds__(SM_3X_REDUCEFORCES_THREADS_PER_BLOCK, 1)
> #else
> __launch_bounds__(SM_2X_REDUCEFORCES_THREADS_PER_BLOCK, 1)
> #endif
> -kPMEReduceChargeGridBuffer_kernel()
> +kPMEReduceChargeGridBuffer_kernel(unsigned int offset)
> {
> - unsigned int pos = blockIdx.x * blockDim.x + threadIdx.x;
> + unsigned int pos = (blockIdx.x + offset) * blockDim.x + threadIdx.x;
>
> if (pos < cSim.nfft1xnfft2xnfft3)
> {
> unsigned int iz = pos / (cSim.nfft1xnfft2);
> unsigned int iy = (pos - iz * cSim.nfft1xnfft2) / cSim.nfft1;
> unsigned int ix = pos - iz * cSim.nfft1xnfft2 - iy * cSim.nfft1;
> unsigned int spos = ((ix & 0x3) + (iy & 3) * 4 + (iz & 1) * 16) + (ix >> 2) * 32
> + ((iy >> 2) << 3) * cSim.nfft1 + ((iz >> 1) << 1) * cSim.nfft1xnfft2;
> @@ -200,19 +200,26 @@ kPMEReduceChargeGridBuffer_kernel()
> PMEFloat value1 = (PMEFloat)value * ONEOVERLATTICESCALEF;
> cSim.pXYZ_q[pos] = value1;
> }
> }
>
>
> extern "C" void kPMEReduceChargeGridBuffer(gpuContext gpu)
> {
> - unsigned int blocks = (gpu->sim.nfft1 * gpu->sim.nfft2 * gpu->sim.nfft3 + 127) >> 7;
> - kPMEReduceChargeGridBuffer_kernel<<<blocks, 128>>>();
> - LAUNCHERROR("kPMEReduceChargeGridBuffer");
> + int blocks = (gpu->sim.nfft1 * gpu->sim.nfft2 * gpu->sim.nfft3 + 127) >> 7;
> + int offset = 0;
> + while (blocks > 0)
> + {
> + int lblocks = min(blocks, 65535);
> + kPMEReduceChargeGridBuffer_kernel<<<lblocks, 128>>>(offset);
> + LAUNCHERROR("kPMEReduceChargeGridBuffer");
> + offset += 65535;
> + blocks -= 65535;
> + }
> }
>
> struct FillChargeGridAtomData
> {
> int ix;
> int iy;
> int iz;
> PMEFloat tx[4];
> @@ -374,28 +381,27 @@ __shared__ volatile FillChargeGridAtomData sAtom[LOADSIZE];
> __syncthreads();
> }
>
> extern "C" void kPMEInterpolationInitKernels(gpuContext gpu)
> {
>
> }
>
> -
> extern "C" void kPMEFillChargeGridBuffer(gpuContext gpu)
> {
> int blocks = (gpu->sim.atoms + LOADSIZE - 1) / LOADSIZE;
> int offset = 0;
> while (blocks > 0)
> {
> - int lblocks = min(blocks, 65536);
> + int lblocks = min(blocks, 65535);
> kPMEFillChargeGridBuffer_kernel<<<lblocks, 64>>>(offset);
> LAUNCHERROR("kPMEFillChargeGridBuffer");
> - offset += 65536;
> - blocks -= 65536;
> + offset += 65535;
> + blocks -= 65535;
> }
> }
>
> #define PME_ENERGY
> __global__ void
> #if (__CUDA_ARCH__ >= 300)
> __launch_bounds__(SM_3X_GENERAL_THREADS_PER_BLOCK, 1)
> #else
> @@ -467,31 +473,31 @@ __global__ void
> #ifdef use_DPFP
> __launch_bounds__(GRADSUMTHREADS, 8)
> #else
> __launch_bounds__(GRADSUMTHREADS, 16)
> #endif
> #else
> __launch_bounds__(GRADSUMTHREADS, 8)
> #endif
> -kPMEGradSum64_kernel()
> +kPMEGradSum64_kernel(unsigned int offset)
> #include "kPGS.h"
>
> #define PME_VIRIAL
> __global__ void
> #if (__CUDA_ARCH__ >= 300)
> #ifdef use_DPFP
> __launch_bounds__(GRADSUMTHREADS, 8)
> #else
> __launch_bounds__(GRADSUMTHREADS, 8)
> #endif
> #else
> __launch_bounds__(GRADSUMTHREADS, 8)
> #endif
> -kPMEGradSum64Virial_kernel()
> +kPMEGradSum64Virial_kernel(unsigned int offset)
> #include "kPGS.h"
> #undef PME_VIRIAL
>
> extern "C" void kPMEGradSum(gpuContext gpu)
> {
> texref.normalized = 0;
> texref.filterMode = cudaFilterModePoint;
> @@ -509,22 +515,22 @@ extern "C" void kPMEGradSum(gpuContext gpu)
> #else
> cudaBindTexture(NULL, texref, (PMEFloat*)(gpu->sim.pXYZ_q), gpu->sim.nfft1 * gpu->sim.nfft2 * gpu->sim.nfft3 * sizeof(PMEFloat));
> #endif
>
> int blocks = (gpu->sim.atoms + GRADSUMLOADSIZE - 1) / GRADSUMLOADSIZE;
> int offset = 0;
> while (blocks > 0)
> {
> - int lblocks = min(blocks, 65536);
> + int lblocks = min(blocks, 65535);
> if ((gpu->sim.ntp > 0) && (gpu->sim.barostat == 1))
> - kPMEGradSum64Virial_kernel<<<lblocks, GRADSUMTHREADS>>>();
> + kPMEGradSum64Virial_kernel<<<lblocks, GRADSUMTHREADS>>>(offset);
> else
> - kPMEGradSum64_kernel<<<lblocks, GRADSUMTHREADS>>>();
> + kPMEGradSum64_kernel<<<lblocks, GRADSUMTHREADS>>>(offset);
>
> LAUNCHERROR("kPMEGradSum");
> - blocks -= 65536;
> - offset += 65536;
> + blocks -= 65535;
> + offset += 65535;
> }
> cudaUnbindTexture(texref);
> }
>
> All the best
> Ross
>
>> On Aug 10, 2016, at 2:02 AM, William Lees <william.lees.org.uk> wrote:
>>
>> Hello,
>>
>> We are getting the above error when trying to start minimisation of a 630,000 atom structure on a Tesla M2090 GPU using Amber 14 patch 13. The minimisation runs without error on a different system at the same Amber patch level with a TITAN-X card. Output from pmemd.cuda is attached.
>>
>> I found a thread relating to this error on older cards here: http://archive.ambermd.org/201410/0184.html . Although there is mention there of a patch being issued, I can't find it in the Amber 14 patch list. The key change appears to be a change to the definition of lblocks at around line 400 in amber14/src/pmemd/src/cuda/kPMEInterpolation.cu (I'm finding it a little hard to discern from the thread whether other changes are required):
>>
>> from
>>
>> int lblocks = min(blocks, 65536);
>>
>> to
>>
>> int lblocks = min(blocks, 65535);
>>
>> On the system in error, the definition has not been changed: it is still min(blocks, 65536).
>>
>> Did a fix for this problem ever make it in to the official patches?
>>
>> Many thanks
>>
>> William
>>
>> William Lees
>> Associate Research Fellow
>> Institute of Structural and Molecular Biology
>> Birkbeck, University of London
>> http://shepherd-group.ismb.lon.ac.uk
>>
>>
>> <cuda error.log>_______________________________________________
>> AMBER mailing list
>> AMBER.ambermd.org
>> http://lists.ambermd.org/mailman/listinfo/amber
>
> _______________________________________________
> AMBER mailing list
> AMBER.ambermd.org
> http://lists.ambermd.org/mailman/listinfo/amber
>


_______________________________________________
AMBER mailing list
AMBER.ambermd.org
http://lists.ambermd.org/mailman/listinfo/amber
Received on Fri Aug 12 2016 - 06:30:02 PDT
Custom Search