*******> update.12 Author: David S. Cerutti Date: January 21, 2019 Programs: pmemd.cuda Description: This prepares pmemd.cuda to work with the new Turing architecture, and CUDA 10. Also removes the need to specify -volta during configuration to get V100 optimizations. They are now standard. -------------------------------------------------------------------------------- src/pmemd/src/cuda/gputypes.h | 26 +-- src/pmemd/src/cuda/kCalculatePMENonbondEnergy.cu | 8 +- src/pmemd/src/cuda/kNLCPNE.h | 14 +- 3 files changed, 107 insertions(+), 167 deletions(-) index 051a95c67c..af92045b21 100644 --- src/pmemd/src/cuda/gputypes.h +++ src/pmemd/src/cuda/gputypes.h @@ -291,10 +291,10 @@ union llconstruct { //--------------------------------------------------------------------------------------------- static const int THREADS_PER_BLOCK = 1024; static const int NLCALCULATE_OFFSETS_THREADS_PER_BLOCK = 720; -static const int NLBUILD_NEIGHBORLIST8_THREADS_PER_BLOCK = 640; -static const int NLBUILD_NEIGHBORLIST16_THREADS_PER_BLOCK = 640; -static const int NLBUILD_NEIGHBORLIST32_THREADS_PER_BLOCK = 640; -static const int NLBUILD_NEIGHBORLIST_BLOCKS_MULTIPLIER = 2; +static const int NLBUILD_NEIGHBORLIST8_THREADS_PER_BLOCK = 128; +static const int NLBUILD_NEIGHBORLIST16_THREADS_PER_BLOCK = 128; +static const int NLBUILD_NEIGHBORLIST32_THREADS_PER_BLOCK = 128; +static const int NLBUILD_NEIGHBORLIST_BLOCKS_MULTIPLIER = 10; static const int LOCALFORCES_BLOCKS = 16; static const int LOCALFORCES_THREADS_PER_BLOCK = 64; static const int AFE_EXCHANGE_THREADS_PER_BLOCK = 256; @@ -328,15 +328,15 @@ static const int IPSNONBONDENERGY_BLOCKS_MULTIPLIER = 1; static const int TRANSPOSE_QMESH_THREADS_PER_BLOCK = 64; static const int FILTER_IMAGE_THREADS_PER_BLOCK = 64; #else -static const int GBBORNRADII_THREADS_PER_BLOCK = 768; -static const int GBBORNRADII_BLOCKS_MULTIPLIER = 2; -static const int GBNONBONDENERGY1_THREADS_PER_BLOCK = 576; -static const int GBNONBONDENERGY1_BLOCKS_MULTIPLIER = 2; -static const int GBNONBONDENERGY2_THREADS_PER_BLOCK = 640; -static const int GBNONBONDENERGY2_BLOCKS_MULTIPLIER = 2; -static const int PMENONBONDENERGY_THREADS_PER_BLOCK = 512; -static const int PMENONBONDFORCES_THREADS_PER_BLOCK = 640; -static const int PMENONBONDENERGY_BLOCKS_MULTIPLIER = 2; +static const int GBBORNRADII_THREADS_PER_BLOCK = 64; +static const int GBNONBONDENERGY1_THREADS_PER_BLOCK = 64; +static const int GBNONBONDENERGY2_THREADS_PER_BLOCK = 64; +static const int PMENONBONDFORCES_THREADS_PER_BLOCK = 128; +static const int GBNONBONDENERGY1_BLOCKS_MULTIPLIER = 24; +static const int GBNONBONDENERGY2_BLOCKS_MULTIPLIER = 24; +static const int GBBORNRADII_BLOCKS_MULTIPLIER = 24; +static const int PMENONBONDENERGY_THREADS_PER_BLOCK = 128; +static const int PMENONBONDENERGY_BLOCKS_MULTIPLIER = 10; static const int IPSNONBONDENERGY_THREADS_PER_BLOCK = 512; static const int IPSNONBONDFORCES_THREADS_PER_BLOCK = 512; static const int IPSNONBONDENERGY_BLOCKS_MULTIPLIER = 2; diff --git src/pmemd/src/cuda/kCalculatePMENonbondEnergy.cu src/pmemd/src/cuda/kCalculatePMENonbondEnergy.cu index 7cf7fc520d..e5ba2acbe4 100644 --- src/pmemd/src/cuda/kCalculatePMENonbondEnergy.cu +++ src/pmemd/src/cuda/kCalculatePMENonbondEnergy.cu @@ -1446,7 +1446,7 @@ kCalcPMEOrthoNBMiniNrg8_kernel() //--------------------------------------------------------------------------------------------- extern "C" void kCalculatePMENonbondForces(gpuContext gpu) { -#if !defined(use_DPFP) && !defined(VOLTAOPT) +#if !defined(use_DPFP) // Bind textures xytexref.normalized = 0; xytexref.filterMode = cudaFilterModePoint; @@ -1806,7 +1806,7 @@ extern "C" void kCalculatePMENonbondForces(gpuContext gpu) } LAUNCHERROR("kCalculatePMENonbondForces"); -#if !defined(use_DPFP) && !defined(VOLTAOPT) +#if !defined(use_DPFP) // Unbind textures cudaUnbindTexture(xytexref); cudaUnbindTexture(ztexref); @@ -1825,7 +1825,7 @@ extern "C" void kCalculatePMENonbondForces(gpuContext gpu) //--------------------------------------------------------------------------------------------- extern "C" void kCalculatePMENonbondEnergy(gpuContext gpu) { -#if !defined(use_DPFP) && !defined(VOLTAOPT) +#if !defined(use_DPFP) // Bind textures xytexref.normalized = 0; xytexref.filterMode = cudaFilterModePoint; @@ -2171,7 +2171,7 @@ extern "C" void kCalculatePMENonbondEnergy(gpuContext gpu) } LAUNCHERROR("kCalculatePMENonbondEnergy"); -#if !defined(use_DPFP) && !defined(VOLTAOPT) +#if !defined(use_DPFP) // Unbind textures cudaUnbindTexture(xytexref); cudaUnbindTexture(ztexref); diff --git src/pmemd/src/cuda/kNLCPNE.h src/pmemd/src/cuda/kNLCPNE.h index c7940fd406..9292f63e6d 100644 --- src/pmemd/src/cuda/kNLCPNE.h +++ src/pmemd/src/cuda/kNLCPNE.h @@ -265,7 +265,7 @@ #if defined(use_SPFP) && !defined(PME_FSWITCH) && !defined(PME_ENERGY) PMEFloat r2inv = (PMEFloat)1.0 / r2; int cidx = 2*(__float_as_int(r2) >> 18) + (exclusion & 0x1); -# ifdef VOLTAOPT +# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 700) PMEFloat4 coef = cSim.pErfcCoeffsTable[cidx]; # else PMEFloat4 coef = tex1Dfetch(texErfcCoeffsTable, cidx); @@ -276,7 +276,7 @@ PMEFloat r2inv = rinv * rinv; #endif #ifndef use_DPFP -# ifdef VOLTAOPT +# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 700) PMEFloat2 term = cSim.pLJTerm[index]; # else PMEFloat2 term = tex1Dfetch(ljtermtexref, index); @@ -492,7 +492,7 @@ if (tx + tgx < psWarp->nlEntry.NL.xatoms) { unsigned int atom = shAtom.ID >> NLATOM_CELL_SHIFT; #ifndef use_DPFP -# ifdef VOLTAOPT +# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 700) PMEFloat2 xy = cSim.pAtomXYSP[atom]; PMEFloat2 qljid = cSim.pAtomChargeSPLJID[atom]; shAtom.z = cSim.pAtomZSP[atom]; @@ -583,7 +583,7 @@ #if defined(use_SPFP) && !defined(PME_FSWITCH) && !defined(PME_ENERGY) PMEFloat r2inv = (PMEFloat)1.0 / r2; int cidx = 2*(__float_as_int(r2) >> 18) + (exclusion & 0x1); -# ifdef VOLTAOPT +# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 700) PMEFloat4 coef = cSim.pErfcCoeffsTable[cidx]; # else PMEFloat4 coef = tex1Dfetch(texErfcCoeffsTable, cidx); @@ -594,7 +594,7 @@ PMEFloat r2inv = rinv * rinv; #endif #ifndef use_DPFP -# ifdef VOLTAOPT +# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 700) PMEFloat2 term = cSim.pLJTerm[index]; # else PMEFloat2 term = tex1Dfetch(ljtermtexref, index); @@ -747,7 +747,7 @@ #if defined(use_SPFP) && !defined(PME_FSWITCH) && !defined(PME_ENERGY) PMEFloat r2inv = (PMEFloat)1.0 / r2; int cidx = 2 * (__float_as_int(r2) >> 18); -# ifdef VOLTAOPT +# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 700) PMEFloat4 coef = cSim.pErfcCoeffsTable[cidx]; # else PMEFloat4 coef = tex1Dfetch(texErfcCoeffsTable, cidx); @@ -758,7 +758,7 @@ PMEFloat r2inv = rinv * rinv; #endif #ifndef use_DPFP -# ifdef VOLTAOPT +# if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ == 700) PMEFloat2 term = cSim.pLJTerm[index]; # else PMEFloat2 term = tex1Dfetch(ljtermtexref, index);