*******> update.15 Author: Delaram Ghoreishi Date: August 26, 2019 Programs: pmemd.cuda.MPI Description: This incorporates the NEB code in pmemd.cuda.MPI -------------------------------------------------------------------------------- src/pmemd/src/cuda/CMakeLists.txt | 4 +- src/pmemd/src/cuda/Makefile | 4 +- src/pmemd/src/cuda/base_gpuContext.cpp | 21 + src/pmemd/src/cuda/base_gpuContext.h | 15 + src/pmemd/src/cuda/base_simulationConst.cpp | 23 + src/pmemd/src/cuda/base_simulationConst.h | 39 ++ src/pmemd/src/cuda/gpu.cpp | 229 ++++++- src/pmemd/src/cuda/gpu.h | 15 + src/pmemd/src/cuda/gti_cuda.cu | 1 + src/pmemd/src/cuda/gti_f95.cpp | 10 +- src/pmemd/src/cuda/gti_f95.h | 2 +- src/pmemd/src/cuda/kCalculateNEBForces.cu | 692 ++++++++++++++++++++++ src/pmemd/src/cuda/kNEB.h | 95 +++ src/pmemd/src/cuda/kNorm.h | 45 ++ src/pmemd/src/cuda/kRMSFit.h | 258 ++++++++ src/pmemd/src/gb_force.F90 | 52 +- src/pmemd/src/mdin_ctrl_dat.F90 | 11 +- src/pmemd/src/neb.F90 | 52 +- src/pmemd/src/nebread.F90 | 34 +- src/pmemd/src/pme_force.F90 | 132 ++--- src/pmemd/src/pmemd.F90 | 13 + src/pmemd/src/runmd.F90 | 2 +- test/cuda/neb-testcases/neb_explicit/groupfile.in | 8 +- 23 files changed, 1592 insertions(+), 165 deletions(-) diff --git src/pmemd/src/cuda/CMakeLists.txt src/pmemd/src/cuda/CMakeLists.txt index e1fbc6c..f7ba57f 100644 --- src/pmemd/src/cuda/CMakeLists.txt +++ src/pmemd/src/cuda/CMakeLists.txt @@ -2,7 +2,7 @@ set(PMEMD_CUDA_CUDA_SOURCES kForcesUpdate.cu kCalculateLocalForces.cu kCalculateGBBornRadii.cu kCalculatePMENonbondEnergy.cu kCalculateGBNonbondEnergy1.cu kNLRadixSort.cu kCalculateGBNonbondEnergy2.cu kShake.cu kNeighborList.cu kPMEInterpolation.cu - kCalculateEFieldEnergy.cu kDataTransfer.cu) + kCalculateEFieldEnergy.cu kDataTransfer.cu kCalculateNEBForces.cu) set(PMEMD_CUDA_GTI_SOURCES gti_energies_kernels.cu gti_general_kernels.cu gti_utils.cu @@ -81,4 +81,4 @@ foreach(PRECISION ${PMEMD_CUDA_PRECISIONS}) target_link_libraries(pmemd_cuda_${PRECISION}_mpi curand cublas cufft mpi_c mpi_cxx mpi_fortran) endif() -endforeach() \ No newline at end of file +endforeach() diff --git src/pmemd/src/cuda/Makefile src/pmemd/src/cuda/Makefile index 1b5ee1f..514d2a5 100644 --- src/pmemd/src/cuda/Makefile +++ src/pmemd/src/cuda/Makefile @@ -12,7 +12,7 @@ CPP_OBJS = gpu.o cuda_info.o gpu.o \ CU_OBJS = kForcesUpdate.o kCalculateLocalForces.o kCalculateGBBornRadii.o \ kCalculatePMENonbondEnergy.o kCalculateGBNonbondEnergy1.o kNLRadixSort.o \ kCalculateGBNonbondEnergy2.o kShake.o kNeighborList.o kPMEInterpolation.o \ - kCalculateEFieldEnergy.o kDataTransfer.o + kCalculateEFieldEnergy.o kDataTransfer.o kCalculateNEBForces.o GTI_OBJS = gti_energies_kernels.o gti_general_kernels.o gti_utils.o \ gti_NBList_kernels.o gti_nonBond_kernels.o gti_PME_kernels.o @@ -69,6 +69,8 @@ kCalculateLocalForces.o: kCNF.h kBWU.h \ kPMEInterpolation.o: kReImageCoord.h kPGS.h kPSSE.h kCalculateEFieldEnergy.o: kEFE.h kShake.o: kShake.h +kDataTransfer.o: kShuttle.h +kCalculateNEBForces.o: kShuttle.h kNEB.h kRMSFit.h kNorm.h $(CU_OBJS): gpu.h gputypes.h simulationConst.h gti_cuda.o: gti_cuda.cuh gti_general_kernels.cuh gti_NBList_kernels.cuh gti_nonBond_kernels.cuh gti_energies_kernels.cuh gti_PME_kernels.cuh gti_energies_kernels.o: gti_kernelAttribute.cuh gti_energies_kernels.cuh gti_utils.cuh gti_const.cuh diff --git src/pmemd/src/cuda/base_gpuContext.cpp src/pmemd/src/cuda/base_gpuContext.cpp index 7818f3f..596f57a 100644 --- src/pmemd/src/cuda/base_gpuContext.cpp +++ src/pmemd/src/cuda/base_gpuContext.cpp @@ -140,6 +140,12 @@ base_gpuContext::base_gpuContext() : // Atom shuttling pbDataShuttle(NULL), pbShuttleTickets(NULL), + //NEB buffers + pbRMSMask(NULL), pbFitMask(NULL), pbAtmIdx(NULL), pbNEBEnergyAll(NULL), + pbTangents(NULL), pbSpringForce(NULL), pbNEBForce(NULL), + pbNextDataShuttle(NULL), pbPrevDataShuttle(NULL), pbKabschCOM(NULL), + pbtotFitMass(NULL), pbDataSPR(NULL), pbRotAtm(NULL), + // AMD buffers pbAMDfwgtd(NULL), pAmdWeightsAndEnergy(NULL), @@ -273,6 +279,21 @@ base_gpuContext::~base_gpuContext() delete pbDataShuttle; delete pbShuttleTickets; + // Delete NEB stuff + delete pbRMSMask; + delete pbFitMask; + delete pbAtmIdx; + delete pbNEBEnergyAll; + delete pbTangents; + delete pbSpringForce; + delete pbNEBForce; + delete pbNextDataShuttle; + delete pbPrevDataShuttle; + delete pbKabschCOM; + delete pbtotFitMass; + delete pbDataSPR; + delete pbRotAtm; + // Delete NTP stuff delete pbSoluteAtomID; delete pbSoluteAtomMass; diff --git src/pmemd/src/cuda/base_gpuContext.h src/pmemd/src/cuda/base_gpuContext.h index 63218d6..2ab4d7b 100644 --- src/pmemd/src/cuda/base_gpuContext.h +++ src/pmemd/src/cuda/base_gpuContext.h @@ -282,6 +282,21 @@ public: GpuBuffer* pbShuttleTickets; // List of atoms that will be on the // shuttle to and from the CPU + // NEB info + GpuBuffer* pbRMSMask; // Mask of atoms used for NEB force calculation. + GpuBuffer* pbFitMask; // Mask of atoms used for NEB structure fitting. + GpuBuffer* pbAtmIdx; + GpuBuffer* pbNEBEnergyAll; // Energy of the replicas in NEB. + GpuBuffer* pbTangents; // Tangents to the path. + GpuBuffer* pbSpringForce; // Artificial spring forces. + GpuBuffer* pbNEBForce; // NEB forces. + GpuBuffer* pbNextDataShuttle; // Next neighboring replica's coordinate. + GpuBuffer* pbPrevDataShuttle; // Previous neighboring replica's coordinate. + GpuBuffer* pbKabschCOM; + GpuBuffer* pbtotFitMass; + GpuBuffer* pbDataSPR; + GpuBuffer* pbRotAtm; + // Remapped bonded interactions and Shake constraint atom IDs GpuBuffer* pbImageNMRCOMDistanceID; // Remapped NMR COM Distance i, j GpuBuffer* pbImageNMRCOMDistanceCOM; // Remapped NMR COM DistanceCOM range diff --git src/pmemd/src/cuda/base_simulationConst.cpp src/pmemd/src/cuda/base_simulationConst.cpp index 1d31e5e..ab83be4 100644 --- src/pmemd/src/cuda/base_simulationConst.cpp +++ src/pmemd/src/cuda/base_simulationConst.cpp @@ -592,6 +592,29 @@ void base_simulationConst::InitSimulationConst() pDataShuttle = NULL; pShuttleTickets = NULL; + //NEB parameters + neb_nbead = 0; + beadid = 0; + nattgtrms = 0; + nattgtfit = 0; + last_neb_atm = 0; + skmin = 50; + skmax = 100; + tmode = 1; + pRMSMask = NULL; + pFitMask = NULL; + pAtmIdx = NULL; + pNEBEnergyAll = NULL; + pTangents = NULL; + pSpringForce = NULL; + pNEBForce = NULL; + pNextDataShuttle = NULL; + pPrevDataShuttle = NULL; + pKabschCOM = NULL; + pDataSPR = NULL; + pRotAtm = NULL; + ptotFitMass = NULL; + // General IPS parameters bIPSActive = false; rips = (PMEFloat)8.0; diff --git src/pmemd/src/cuda/base_simulationConst.h src/pmemd/src/cuda/base_simulationConst.h index 6d41adc..0e3caa6 100644 --- src/pmemd/src/cuda/base_simulationConst.h +++ src/pmemd/src/cuda/base_simulationConst.h @@ -602,6 +602,45 @@ public: int* pShuttleTickets; // List of atoms that will get tickets on the shuttle to and // from the CPU. + //NEB info + int neb_nbead; // Total number of NEB replicas. + int beadid; // NEB replica ID. + int nattgtrms; // Number of atoms used for ENB force calculation. + int nattgtfit; // Number of atoms used for NEB structure fitting. + int last_neb_atm; // Last atom involved in NEB force calculation or + // structure fitting. + int skmin; // Spring constants for NEB. + int skmax; // Spring constants for NEB. + int tmode; // Tangent mode for NEB. + int* pRMSMask; // Mask of atoms used for ENB force calculation. + int* pFitMask; // Mask of atoms used for NEB structure fitting. + int* pAtmIdx; + PMEDouble* pNEBEnergyAll; // Energy of the replicas in NEB. + double* pTangents; // Tangents to the path. + double* pSpringForce; // Artificial spring forces. + double* pNEBForce; // NEB forces. + double* pNextDataShuttle; // Next neighboring replica's coordinate. + double* pPrevDataShuttle; // Previous neighboring replica's coordinate. + double* pKabschCOM; + double* pPrevKabsch; + double* pNextKabsch; + double* pSelfCOM; + double* pNextCOM; + double* pPrevCOM; + double* pKabsch2; + double* pPrevKabsch2; + double* pNextKabsch2; + double* pDataSPR; + double* ptotFitMass; // Pointer to total mass of atoms in RMSFit array. + double* pNorm; + double* pDotProduct1; + double* pDotProduct2; + double* pSpring1; + double* pSpring2; + double* pRotAtm; + double* pPrevRotAtm; + double* pNextRotAtm; + // Direct space decomposition uint2* pNLNonbondCellStartEnd; // Nonbond cell boundaries pointer int srTableLength; // Length of the list describing the reciprocal diff --git src/pmemd/src/cuda/gpu.cpp src/pmemd/src/cuda/gpu.cpp index 38062e9..cc213af 100644 --- src/pmemd/src/cuda/gpu.cpp +++ src/pmemd/src/cuda/gpu.cpp @@ -1098,6 +1098,213 @@ extern "C" void gpu_upload_gamma_ln_(double* gamma_ln) } //--------------------------------------------------------------------------------------------- +// gpu_upload_neb_info_: Upload nattgtrms, nattgtfit, rmsmask, fitmask for neb calculation. +// +// Arguments: +// neb_nbead: number of NEB replicas. +// beadid: NEB replica ID. +// nattgtrms: number of atoms in rmsmask. +// nattgtfit: number of atoms in fitmask. +// last_neb_atm: last atom included in either rmsmask or fitmask. +// skmin: NEB min spring constant. +// skmax: NEB max spring constant. +// tmode: NEB tangent mode. +// rmsmask: atom selection mask used for NEB force application. +// fitmask: atom selection mask used for NEB structure fitting. +// idx_mask: combined ordered rmsmask and fitmask atoms. +//--------------------------------------------------------------------------------------------- +extern "C" void gpu_upload_neb_info_(int* neb_nbead, int* beadid, int* nattgtrms, int* nattgtfit, + int* last_neb_atm, int* skmin, int* skmax, int* tmode, + int rmsmask[], int fitmask[], int idx_mask[]) +{ + PRINTMETHOD("gpu_upload_neb_info"); + gpu->sim.neb_nbead = *neb_nbead; + gpu->sim.beadid = *beadid; + gpu->sim.nattgtrms = *nattgtrms; + gpu->sim.nattgtfit = *nattgtfit; + gpu->sim.last_neb_atm = *last_neb_atm; + gpu->sim.skmin = *skmin; + gpu->sim.skmax = *skmax; + gpu->sim.tmode = *tmode; + + delete gpu->pbRMSMask; + delete gpu->pbFitMask; + delete gpu->pbAtmIdx; + int RMSStride = ((gpu->sim.nattgtrms + gpu->sim.grid - 1) >> + gpu->sim.gridBits) << gpu->sim.gridBits; + int FitStride = ((gpu->sim.nattgtfit + gpu->sim.grid - 1) >> + gpu->sim.gridBits) << gpu->sim.gridBits; + gpu->pbRMSMask = new GpuBuffer(RMSStride); + gpu->pbFitMask = new GpuBuffer(FitStride); + gpu->pbAtmIdx = new GpuBuffer(get_shuttle_size(1, sizeof(int), + gpu->sim.nShuttle)); + + gpu->sim.pRMSMask = gpu->pbRMSMask->_pDevData; + gpu->sim.pFitMask = gpu->pbFitMask->_pDevData; + gpu->sim.pAtmIdx = gpu->pbAtmIdx->_pDevData; + + for (int i = 0; i < gpu->sim.nattgtrms; i++){ + gpu->pbRMSMask->_pSysData[i] = rmsmask[i] - 1; + } + + for (int i = 0; i < gpu->sim.nattgtfit; i++){ + gpu->pbFitMask->_pSysData[i] = fitmask[i] - 1; + } + + for (int i = 0; i < gpu->sim.nShuttle; i++){ + gpu->pbAtmIdx->_pSysData[i] = idx_mask[i]; + } + + gpu->pbRMSMask->Upload(); + gpu->pbFitMask->Upload(); + gpu->pbAtmIdx->Upload(); + gpuCopyConstants(); +} + +//--------------------------------------------------------------------------------------------- +// gpu_setup_neb_: Setup for NEB, called from pmemd.F90. +//--------------------------------------------------------------------------------------------- +extern "C" void gpu_setup_neb_() +{ + PRINTMETHOD("gpu_setup_neb"); + delete gpu->pbNEBEnergyAll; + delete gpu->pbTangents; + delete gpu->pbSpringForce; + delete gpu->pbNEBForce; + delete gpu->pbNextDataShuttle; + delete gpu->pbPrevDataShuttle; + delete gpu->pbKabschCOM; + delete gpu->pbDataSPR; + delete gpu->pbRotAtm; + delete gpu->pbtotFitMass; + + int NEBStride = ((gpu->sim.neb_nbead + gpu->sim.grid - 1) >> + gpu->sim.gridBits) << gpu->sim.gridBits; + gpu->pbNEBEnergyAll = new GpuBuffer(NEBStride); + gpu->pbTangents = new GpuBuffer(get_shuttle_size(3, sizeof(double), + gpu->sim.nShuttle)); + gpu->pbSpringForce = new GpuBuffer(get_shuttle_size(3, sizeof(double), + gpu->sim.nShuttle)); + gpu->pbNEBForce = new GpuBuffer(get_shuttle_size(3, sizeof(double), + gpu->sim.nShuttle)); + if (gpu->bCanMapHostMemory) { + gpu->pbNextDataShuttle = new GpuBuffer(get_shuttle_size(3, sizeof(double), + gpu->sim.nShuttle), false, true); + gpu->pbPrevDataShuttle = new GpuBuffer(get_shuttle_size(3, sizeof(double), + gpu->sim.nShuttle), false, true); + } + else { + gpu->pbNextDataShuttle = new GpuBuffer(get_shuttle_size(3, sizeof(double), + gpu->sim.nShuttle)); + gpu->pbPrevDataShuttle = new GpuBuffer(get_shuttle_size(3, sizeof(double), + gpu->sim.nShuttle)); + } + gpu->pbKabschCOM = new GpuBuffer(27); + if (gpu->bCanMapHostMemory) { + gpu->pbDataSPR = new GpuBuffer(5, false, true); + } + else { + gpu->pbDataSPR = new GpuBuffer(5); + } + if (gpu->bCanMapHostMemory) { + gpu->pbRotAtm = new GpuBuffer(9, false, true); + } + else { + gpu->pbRotAtm = new GpuBuffer(9); + } + gpu->pbtotFitMass = new GpuBuffer(1); + + gpu->sim.pNEBEnergyAll = gpu->pbNEBEnergyAll->_pDevData; + gpu->sim.pTangents = gpu->pbTangents->_pDevData; + gpu->sim.pSpringForce = gpu->pbSpringForce->_pDevData; + gpu->sim.pNEBForce = gpu->pbNEBForce->_pDevData; + gpu->sim.pNextDataShuttle = gpu->pbNextDataShuttle->_pDevData; + gpu->sim.pPrevDataShuttle = gpu->pbPrevDataShuttle->_pDevData; + gpu->sim.pKabschCOM = gpu->pbKabschCOM->_pDevData; + gpu->sim.pPrevKabsch = gpu->pbKabschCOM->_pDevData + 0; + gpu->sim.pNextKabsch = gpu->pbKabschCOM->_pDevData + 9; + gpu->sim.pSelfCOM = gpu->pbKabschCOM->_pDevData + 18; + gpu->sim.pPrevCOM = gpu->pbKabschCOM->_pDevData + 21; + gpu->sim.pNextCOM = gpu->pbKabschCOM->_pDevData + 24; + gpu->sim.pDataSPR = gpu->pbDataSPR->_pDevData; + gpu->sim.pNorm = gpu->pbDataSPR->_pDevData + 0; + gpu->sim.pDotProduct1 = gpu->pbDataSPR->_pDevData + 1; + gpu->sim.pDotProduct2 = gpu->pbDataSPR->_pDevData + 2; + gpu->sim.pSpring1 = gpu->pbDataSPR->_pDevData + 3; + gpu->sim.pSpring2 = gpu->pbDataSPR->_pDevData + 4; + gpu->sim.ptotFitMass = gpu->pbtotFitMass->_pDevData; + gpu->sim.pRotAtm = gpu->pbRotAtm->_pDevData; + + gpuCopyConstants(); +} + +//-------------------------------------------------------------------------------------------- +// gpu_neb_exchange_crd_: NEB routine for exchanging the neighboring replicas coordinates +// through the shuttle transfer. +//-------------------------------------------------------------------------------------------- +extern "C" void gpu_neb_exchange_crd_() +{ + PRINTMETHOD("gpu_neb_exchange_crd"); + int buff_size = get_shuttle_size(3, sizeof(double),gpu->sim.nShuttle); + kNEBSendRecv(gpu, buff_size); +} + +//-------------------------------------------------------------------------------------------- +// gpu_report_neb_ene_: NEB routine for the replicas' energy print-out. +// +// Arguments: +// master_size: +// neb_nrg_all: +//-------------------------------------------------------------------------------------------- +extern "C" void gpu_report_neb_ene_(int* master_size, double neb_nrg_all[]) +{ + PRINTMETHOD("gpu_report_neb_ene"); + NEB_report_energy(gpu, master_size, neb_nrg_all); +} + +//-------------------------------------------------------------------------------------------- +// gpu_set_neb_springs_: NEB routine for setting the spring constants. +//-------------------------------------------------------------------------------------------- +extern "C" void gpu_set_neb_springs_() +{ + PRINTMETHOD("gpu_set_neb_springs"); + kNEBspr(gpu); +} + +//-------------------------------------------------------------------------------------------- +// gpu_calculate_neb_frc_: Routine for calculating the NEB forces every nebfreq steps. +// +// Arguments: +// neb_force: NEB forces. +//-------------------------------------------------------------------------------------------- +extern "C" void gpu_calculate_neb_frc_(double neb_force[][3]) +{ + PRINTMETHOD("gpu_calculate_neb_frc"); + kNEBfrc(gpu, neb_force); +} + +//-------------------------------------------------------------------------------------------- +// gpu_calculate_neb_frc_nstep_: Routine for calculating the NEB forces at the off steps. +// +// Arguments: +// neb_force: NEB forces. +//-------------------------------------------------------------------------------------------- +extern "C" void gpu_calculate_neb_frc_nstep_(double neb_force[][3]) +{ + PRINTMETHOD("gpu_calculate_neb_frc_nstep"); + kNEBfrc_nstep(gpu, neb_force); +} + +//-------------------------------------------------------------------------------------------- +// gpu_neb_rmsfit_: Routine for calculating the NEB structure fitting. +//-------------------------------------------------------------------------------------------- +extern "C" void gpu_neb_rmsfit_() +{ + PRINTMETHOD("gpu_neb_rmsfit"); + kFitCOM(gpu); +} + +//--------------------------------------------------------------------------------------------- // gpu_setup_shuttle_info_: Upload atom_list and nshatom for partial coordinates download. // // Arguments: @@ -4399,6 +4606,7 @@ extern "C" void gpuCopyConstants() #else SetkForcesUpdateSim(gpu); SetkDataTransferSim(gpu); + SetkCalculateNEBForcesSim(gpu); SetkShakeSim(gpu); SetkCalculateLocalForcesSim(gpu); if (gpu->ntb == 0) { @@ -7296,7 +7504,8 @@ extern "C" void gpu_allreduce(GpuBuffer* pbBuff, int size) //--------------------------------------------------------------------------------------------- extern "C" void gpu_pme_ene_(double* ewaldcof, double* vol, pme_pot_ene_rec* pEnergy, double enmr[3], double virial[3], double ekcmt[3], int* nstep, - double* dt, double atm_crd[][3], double atm_frc[][3]) + double* dt, double atm_crd[][3], double atm_frc[][3], + int* ineb, int* nebfreq) { PRINTMETHOD("gpu_pme_ene"); @@ -7431,6 +7640,13 @@ extern "C" void gpu_pme_ene_(double* ewaldcof, double* vol, pme_pot_ene_rec* pEn pEnergy->total += gpu->vdw_recip + gpu->self_energy; #ifdef MPI } +//NEB + if ((*ineb > 0) && (*nstep%*nebfreq == 0)) { + MPI_Allgather(&pEnergy->total, 1, MPI_PMEDOUBLE, gpu->pbNEBEnergyAll->_pSysData, + 1, MPI_PMEDOUBLE, MPI_COMM_WORLD); + gpu->pbNEBEnergyAll->Upload(); + } + if (gpu->gpuID == 0) { #endif pEnergy->vdw_tot = energy[1] + gpu->vdw_recip; @@ -9514,7 +9730,7 @@ void gpu_gather_gb_forces() // pEnergy: record to store GB simulation energies // enmr: NMR restraints energy (three components again) //--------------------------------------------------------------------------------------------- -extern "C" void gpu_gb_ene_(gb_pot_ene_rec* pEnergy, double enmr[3]) +extern "C" void gpu_gb_ene_(gb_pot_ene_rec* pEnergy, double enmr[3], int* ineb) { PRINTMETHOD("gpu_gb_ene"); kClearForces(gpu); @@ -9569,6 +9785,15 @@ extern "C" void gpu_gb_ene_(gb_pot_ene_rec* pEnergy, double enmr[3]) } pEnergy->total += energy[i]; } +//NEB +#ifdef MPI + if ((*ineb > 0)) { + MPI_Allgather(&pEnergy->total, 1, MPI_PMEDOUBLE, gpu->pbNEBEnergyAll->_pSysData, + 1, MPI_PMEDOUBLE, MPI_COMM_WORLD); + gpu->pbNEBEnergyAll->Upload(); + } +#endif + pEnergy->vdw_tot = energy[1]; pEnergy->elec_tot = energy[0]; pEnergy->gb = energy[2]; diff --git src/pmemd/src/cuda/gpu.h src/pmemd/src/cuda/gpu.h index 93c5914..920fca3 100644 --- src/pmemd/src/cuda/gpu.h +++ src/pmemd/src/cuda/gpu.h @@ -28,6 +28,12 @@ extern "C" void gpu_upload_gamma_ln_(double* gamma_ln); extern "C" void gpu_setup_shuttle_info_(int* nshatom, int* infocode, int atom_list[]); extern "C" void gpu_shuttle_retrieve_data_(double atm_data[][3], int* flag); extern "C" void gpu_shuttle_post_data_(double atm_data[][3], int* flag); +extern "C" void gpu_neb_exchange_crd_(); +extern "C" void gpu_report_neb_ene_(int* master_size, double neb_nrg_all[]); +extern "C" void gpu_set_neb_springs_(); +extern "C" void gpu_calculate_neb_frc_(double neb_force[][3]); +extern "C" void gpu_calculate_neb_frc_nstep_(double neb_force[][3]); +extern "C" void gpu_neb_rmsfit_(); extern "C" void gpu_upload_crd_(double atm_crd[][3]); extern "C" void gpu_upload_crd_gb_cph_(double atm_crd[][3]); extern "C" void gpu_download_crd_(double atm_crd[][3]); @@ -145,6 +151,8 @@ extern "C" void SetkForcesUpdateSim(gpuContext gpu); extern "C" void GetkForcesUpdateSim(gpuContext gpu); extern "C" void SetkDataTransferSim(gpuContext gpu); extern "C" void GetkDataTransferSim(gpuContext gpu); +extern "C" void SetkCalculateNEBForcesSim(gpuContext gpu); +extern "C" void GetkCalculateNEBForcesSim(gpuContext gpu); extern "C" void SetkCalculateEFieldEnergySim(gpuContext gpu); extern "C" void GetkCalculateEFieldEnergySim(gpuContext gpu); extern "C" void SetkCalculateLocalForcesSim(gpuContext gpu); @@ -276,6 +284,13 @@ extern "C" void kAFEExchangeCrd(gpuContext gpu); extern "C" void kRetrieveSimData(gpuContext gpu, double atm_data[][3], int modifier); extern "C" void kPostSimData(gpuContext gpu, double atm_data[][3], int modifier); extern "C" void DownloadAllCoord(gpuContext gpu, double atm_crd[][3]); +//NEB +extern "C" void kNEBSendRecv(gpuContext gpu, int buff_size); +extern "C" void NEB_report_energy(gpuContext gpu, int* master_size, double neb_nrg_all[]); +extern "C" void kNEBspr(gpuContext gpu); +extern "C" void kNEBfrc(gpuContext gpu, double neb_force[][3]); +extern "C" void kNEBfrc_nstep(gpuContext gpu, double neb_force[][3]); +extern "C" void kFitCOM(gpuContext gpu); #ifdef MPI extern "C" void kCopyToAccumulator(gpuContext gpu, PMEAccumulator* p1, PMEAccumulator* p2, diff --git src/pmemd/src/cuda/gti_cuda.cu src/pmemd/src/cuda/gti_cuda.cu index dd1da9d..85e53e7 100644 --- src/pmemd/src/cuda/gti_cuda.cu +++ src/pmemd/src/cuda/gti_cuda.cu @@ -20,6 +20,7 @@ void ik_updateAllSimulationConst(gpuContext gpu) { // Original pmemd cu units SetkForcesUpdateSim(gpu); SetkDataTransferSim(gpu); + SetkCalculateNEBForcesSim(gpu); SetkShakeSim(gpu); SetkCalculateLocalForcesSim(gpu); if (gpu->ntb == 0) { diff --git src/pmemd/src/cuda/gti_f95.cpp src/pmemd/src/cuda/gti_f95.cpp index 80855da..1696a5a 100644 --- src/pmemd/src/cuda/gti_f95.cpp +++ src/pmemd/src/cuda/gti_f95.cpp @@ -628,7 +628,7 @@ extern "C" void gti_others_(bool* needEnergy, int* nstep, double* dt) // ekcmt: //--------------------------------------------------------------------------------------------- extern "C" void gti_update_md_ene_(pme_pot_ene_rec* pEnergy, double enmr[3], double virial[3], - double ekcmt[3]) + double ekcmt[3], int* ineb, int* nebfreq, int* nstep) { gpuContext gpu = theGPUContext::GetPointer(); double energy[ENERGY_TERMS]; @@ -673,6 +673,14 @@ extern "C" void gti_update_md_ene_(pme_pot_ene_rec* pEnergy, double enmr[3], dou pEnergy->cmap; if (masterNode) pEnergy->total += (pEnergy->vdw_recip + pEnergy->elec_self); +//NEB +#ifdef MPI + if ((*ineb > 0) && masterNode && (*nstep%*nebfreq ==0)) { + MPI_Allgather(&pEnergy->total, 1, MPI_PMEDOUBLE, gpu->pbNEBEnergyAll->_pSysData, + 1, MPI_PMEDOUBLE, MPI_COMM_WORLD); + gpu->pbNEBEnergyAll->Upload(); + } +#endif // Grab virial if needed if ((gpu->sim.ntp > 0) && (gpu->sim.barostat == 1)) { diff --git src/pmemd/src/cuda/gti_f95.h src/pmemd/src/cuda/gti_f95.h index c7c8415..55f98ea 100644 --- src/pmemd/src/cuda/gti_f95.h +++ src/pmemd/src/cuda/gti_f95.h @@ -26,7 +26,7 @@ extern "C" void gti_build_nl_list_(int* ti_moode, int* lj1264); extern "C" void gti_clear_(bool* need_pot_enes, int* ti_mode, double atm_crd[][3]); extern "C" void gti_update_md_ene_(pme_pot_ene_rec* pEnergy, double enmr[3], double virial[3], - double ekcmt[3]); + double ekcmt[3], int* ineb, int* nebfreq, int* nstep); extern "C" void gti_ele_recip_(bool* needPotEnergy, bool* need_virials, double* vol, double ti_weights[2], int* ti_mode); diff --git src/pmemd/src/cuda/kCalculateNEBForces.cu src/pmemd/src/cuda/kCalculateNEBForces.cu new file mode 100644 index 0000000..39eec78 --- /dev/null +++ src/pmemd/src/cuda/kCalculateNEBForces.cu @@ -0,0 +1,692 @@ +#include "copyright.i" + +//--------------------------------------------------------------------------------------------- +// AMBER NVIDIA CUDA GPU IMPLEMENTATION: PMEMD VERSION +// +// July 2017, by Scott Le Grand, David S. Cerutti, Daniel J. Mermelstein, Charles Lin, and +// Ross C. Walker +//--------------------------------------------------------------------------------------------- +#include +#include "gpu.h" +#include "ptxmacros.h" + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) + +// Use global instance instead of a local copy +#include "simulationConst.h" +CSIM_STO simulationConst cSim; + +bool first = true; +//--------------------------------------------------------------------------------------------- +// SetkCalculateNEBForcesSim: called by gpuCopyConstants to orient the instance of cSim in this +// library +// +// Arguments: +// gpu: overarching type for storing all parameters, coordinates, and the energy function +//--------------------------------------------------------------------------------------------- +void SetkCalculateNEBForcesSim(gpuContext gpu) +{ + cudaError_t status; + status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(simulationConst)); + RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed"); +} + +//--------------------------------------------------------------------------------------------- +// GetkCalculateNEBForcesSim: download information about the CUDA simulation data struct from the +// device. +// +// Arguments: +// gpu: overarching type for storing all parameters, coordinates, and the energy function +// +// This is a debugging function. +//--------------------------------------------------------------------------------------------- +void GetkCalculateNEBForcesSim(gpuContext gpu) +{ + cudaError_t status; + status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(simulationConst)); + RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed"); +} + +//---------------------------------------------------------------------------------------------- +// Shuffle Warp Reduce: https://devblogs.nvidia.com/faster-parallel-reductions-kepler/ +//---------------------------------------------------------------------------------------------- +__inline__ __device__ double warpReduceSum(double val) { + for (int offset = GRID/2; offset > 0; offset /= 2) + val += __shfl_down(val, offset); + return val; +} + +//---------------------------------------------------------------------------------------------- +// atomicAdd function for double data types +//---------------------------------------------------------------------------------------------- +//#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +#if (__CUDA_ARCH__ < 600) || (__CUDACC_VER_MAJOR__ < 8) +__device__ void voidatomicAdd(double* ptr, double val) { + unsigned long long int* ulli_ptr = (unsigned long long int*)ptr; + unsigned long long int old = *ulli_ptr; + unsigned long long int assumed; + + do { + assumed = old; + old = atomicCAS(ulli_ptr, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); + } while (old != assumed); +} +#endif + +//--------------------------------------------------------------------------------------------- +// dgeev from lapack for computing eigenvalues and left/right enigenvectors of a square matrix +//--------------------------------------------------------------------------------------------- +extern "C" { +extern int dgeev_(char*,char*,int*,double*,int*,double*, double*, double*, int*, double*, int*, double*, int*, int*); +} + +//--------------------------------------------------------------------------------------------- +// Kernel for clearing the NEB buffers at each step +//--------------------------------------------------------------------------------------------- + +//--------------------------------------------------------------------------------------------- +// Kernel for shuttling the coordinates for sending and receiving between the neighboring +// replicas in NEB. +//--------------------------------------------------------------------------------------------- + +#define TAKE_IMAGE +#define PULL_COORD +__global__ void kRetrieveCoordNL_kernel() +#include "kShuttle.h" +#undef PULL_COORD + +#define PULL_FORCE +__global__ void kRetrieveForceNL_kernel() +#include "kShuttle.h" +#undef PULL_FORCE +#undef TAKE_IMAGE + +#define PULL_COORD +__global__ void kRetrieveCoord_kernel() +#include "kShuttle.h" +#undef PULL_COORD + +#define PULL_FORCE +__global__ void kRetrieveForce_kernel() +#include "kShuttle.h" +#undef PULL_FORCE + +//---------------------------------------------------------------------------------------------- +// kernels for RMSfit calculations. +//---------------------------------------------------------------------------------------------- + +#define NEIGHBOR_LIST +#define MASS_CALC +__global__ void kTotMassFitNL_kernel() +#include "kRMSFit.h" +#undef MASS_CALC + +#define CALC_COM +__global__ void kCALCCOMNL_kernel() +#include "kRMSFit.h" +#undef CALC_COM + +#define KABSCH +#define NEXT_NEIGHBOR +__global__ void kKabschNextNL_kernel() +#include "kRMSFit.h" +#undef NEXT_NEIGHBOR + +#define PREV_NEIGHBOR +__global__ void kKabschPrevNL_kernel() +#include "kRMSFit.h" +#undef PREV_NEIGHBOR +#undef KABSCH +#undef NEIGHBOR_LIST + +#define MASS_CALC +__global__ void kTotMassFit_kernel() +#include "kRMSFit.h" +#undef MASS_CALC + +#define CALC_COM +__global__ void kCALCCOM_kernel() +#include "kRMSFit.h" +#undef CALC_COM + +#define CENTER_DATA +__global__ void kCenterData_kernel() +#include "kRMSFit.h" +#undef CENTER_DATA + +#define RECENTER_DATA +__global__ void kReCenterData_kernel() +#include "kRMSFit.h" +#undef RECENTER_DATA + +#define ROTATE_NEIGHBOR +#define NEXT_NEIGHBOR +__global__ void kRotateNext_kernel() +#include "kRMSFit.h" +#undef NEXT_NEIGHBOR + +#define PREV_NEIGHBOR +__global__ void kRotatePrev_kernel() +#include "kRMSFit.h" +#undef PREV_NEIGHBOR +#undef ROTATE_NEIGHBOR + +#define KABSCH +#define NEXT_NEIGHBOR +__global__ void kKabschNext_kernel() +#include "kRMSFit.h" +#undef NEXT_NEIGHBOR + +#define PREV_NEIGHBOR +__global__ void kKabschPrev_kernel() +#include "kRMSFit.h" +#undef PREV_NEIGHBOR +#undef KABSCH + +//---------------------------------------------------------------------------------------------- +// Kernel for calculating NEB forces. +//---------------------------------------------------------------------------------------------- + +#define REVISED_TAN +#define FIRST +__global__ void kNEBRevisedTanFIRST_kernel() +#include "kNEB.h" +#undef FIRST + +#define SECOND +__global__ void kNEBRevisedTanSECOND_kernel() +#include "kNEB.h" +#undef SECOND + +#define THIRD +__global__ void kNEBRevisedTanTHIRD_kernel() +#include "kNEB.h" +#undef THIRD + +#define FOURTH +__global__ void kNEBRevisedTanFOURTH_kernel() +#include "kNEB.h" +#undef FOURTH +#undef REVISED_TAN + +#define BASIC_TAN +__global__ void kNEBBasicTan_kernel() +#include "kNEB.h" +#undef BASIC_TAN + +#define NORMALIZE +__global__ void kNormalizeTan_kernel() +#include "kNEB.h" +#undef NORMALIZE + +#define NEIGHBOR_LIST +#define NEB_FRC +__global__ void kNEBFRCNL_kernel() +#include "kNEB.h" +#undef NEB_FRC +#undef NEIGHBOR_LIST + +#define NEB_FRC +__global__ void kNEBFRC_kernel() +#include "kNEB.h" +#undef NEB_FRC + +#define REDUCE_TAN +__global__ void kNormalize_kernel() +#include "kNorm.h" +#undef REDUCE_TAN + +#define REDUCE_DOTPROD +__global__ void kDotProduct_kernel() +#include "kNorm.h" +#undef REDUCE_DOTPROD + +//----------------------------------------------------------------------------------------------- +extern "C" void kNEBSendRecv(gpuContext gpu, int buff_size) +{ +#ifdef MPI + if (gpu->sim.ShuttleType == 0) { + int next_node = gpu->sim.beadid; //replica id, goes from 0 to neb_nbead-1 + if (next_node >= gpu->sim.neb_nbead) { + next_node = 0; + } + int prev_node = gpu->sim.beadid - 2; + if (prev_node < 0) { + prev_node = gpu->sim.neb_nbead -1; + } + + if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) { + kRetrieveCoordNL_kernel<<blocks, 96>>>(); + } + else { + kRetrieveCoord_kernel<<blocks, 96>>>(); + } + cudaDeviceSynchronize(); + // Download from the device + if (!(gpu->bCanMapHostMemory)) { + gpu->pbDataShuttle->Download(); + } + int tag1 = 101010; + int tag2 = 202020; + MPI_Status status; + if ( (gpu->sim.beadid & 1) == 0) { + MPI_Sendrecv(gpu->pbDataShuttle->_pSysData, buff_size, MPI_DOUBLE, next_node, tag1, + gpu->pbNextDataShuttle->_pSysData, buff_size, MPI_DOUBLE, next_node, tag1, + MPI_COMM_WORLD, &status); + } + else { + MPI_Sendrecv(gpu->pbDataShuttle->_pSysData, buff_size, MPI_DOUBLE, prev_node, tag1, + gpu->pbPrevDataShuttle->_pSysData, buff_size, MPI_DOUBLE, prev_node, tag1, + MPI_COMM_WORLD, &status); + } + if ( (gpu->sim.beadid & 1) == 0) { + MPI_Sendrecv(gpu->pbDataShuttle->_pSysData, buff_size, MPI_DOUBLE, prev_node, tag2, + gpu->pbPrevDataShuttle->_pSysData, buff_size, MPI_DOUBLE, prev_node, tag2, + MPI_COMM_WORLD, &status); + } + else { + MPI_Sendrecv(gpu->pbDataShuttle->_pSysData, buff_size, MPI_DOUBLE, next_node, tag2, + gpu->pbNextDataShuttle->_pSysData, buff_size, MPI_DOUBLE, next_node, tag2, + MPI_COMM_WORLD, &status); + } + + //Upload to the device + if (!(gpu->bCanMapHostMemory)) { + gpu->pbNextDataShuttle->Upload(); + gpu->pbPrevDataShuttle->Upload(); + } + } + else if (gpu->sim.ShuttleType == 1) { + // Print an error for now + printf("| Error: this functionality (ShuttleType == 1) is not yet available.\n"); + } +#endif +} + +extern "C" void NEB_report_energy(gpuContext gpu, int* master_size, double neb_nrg_all[]) +{ + //this is needed for energy output + // at this point the pbNEBEnergyAll is filled with the replica's energies + for (int i = 0; i < *master_size; i++) { + neb_nrg_all[i] = gpu->pbNEBEnergyAll->_pSysData[i]; + } +} + +// Set the springs: +// at this point the pbNEBEnergyAll is filled with the replica's energies +extern "C" void kNEBspr(gpuContext gpu) +{ + PMEDouble ezero = MAX(gpu->pbNEBEnergyAll->_pSysData[0], + gpu->pbNEBEnergyAll->_pSysData[gpu->sim.neb_nbead-1]); + PMEDouble emax = gpu->pbNEBEnergyAll->_pSysData[0]; + for (int rep = 1; rep < gpu->sim.neb_nbead; rep++) { + emax = MAX(emax, gpu->pbNEBEnergyAll->_pSysData[rep]); + } + + for (int i = 0; i < 5; i++) { + gpu->pbDataSPR->_pSysData[i] = 0.0; + } + + if (gpu->sim.skmin == gpu->sim.skmax) { + gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax; + } + else if ((gpu->pbNEBEnergyAll->_pSysData[1] > ezero) && (emax != ezero)) { + gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax - gpu->sim.skmin * (emax - + MAX(gpu->pbNEBEnergyAll->_pSysData[0], gpu->pbNEBEnergyAll->_pSysData[1])/(emax - ezero)); + } + else { + gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax - gpu->sim.skmin; + } + int rep = gpu->sim.beadid - 1; //replica id, goes from 0 to neb_nbead-1 + if ((rep == 0) || (rep == (gpu->sim.neb_nbead - 1))){ + goto exit3; + } + gpu->pbDataSPR->_pSysData[3] = gpu->pbDataSPR->_pSysData[4]; + if (gpu->sim.skmin == gpu->sim.skmax) { + gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax; + } + else if ((gpu->pbNEBEnergyAll->_pSysData[rep + 1] > ezero) && (emax != ezero)) { + gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax - gpu->sim.skmin * (emax - + MAX(gpu->pbNEBEnergyAll->_pSysData[rep - 1], gpu->pbNEBEnergyAll->_pSysData[rep])/(emax - ezero)); + } + else { + gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax - gpu->sim.skmin; + } + + if (!(gpu->bCanMapHostMemory)) { + gpu->pbDataSPR->Upload(); + } + + exit3: {} +} + +extern "C" void kFitCOM(gpuContext gpu) +{ + + double det; + double small; + double b[9]; + double norm; + int ismall; + + //vars needed for lapack diagonalization + double Kabsch2[9], eigReal[3], eigImag[3]; + double vl[1]; + double vr[9]; + int info; + int ldvl = 1, ldvr = 3, n = 3; + const int worksize = 48; + int lwork = worksize; + double work[worksize]; + char Nchar='N', Vchar='V'; + + if (first) { + // Calculate total mass of the fit region + if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) { + kTotMassFitNL_kernel<<blocks, gpu->threadsPerBlock/4>>>(); + LAUNCHERROR("kFitCOM"); + } + else { + kTotMassFit_kernel<<blocks, gpu->threadsPerBlock/4>>>(); + LAUNCHERROR("kFitCOM"); + } + first = false; + } + + cudaDeviceSynchronize(); + + for (int i = 0; i < 27; i++) { + gpu->pbKabschCOM->_pSysData[i] = 0.0; + } + gpu->pbKabschCOM->Upload(); + + // Calculate center of mass of the self and neighboring replicas + if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) { + kCALCCOMNL_kernel<<blocks, gpu->threadsPerBlock*3/16>>>(); + LAUNCHERROR("kFitCOM"); + } + else { + kCALCCOM_kernel<<blocks, gpu->threadsPerBlock*3/16>>>(); + LAUNCHERROR("kFitCOM"); + } + + // Center atoms wrt COM of fit regions. + kCenterData_kernel<<blocks, gpu->threadsPerBlock/4>>>(); + LAUNCHERROR("kFitCOM"); + + + // Calculate Kabsch matrix + if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) { + kKabschPrevNL_kernel<<blocks, gpu->threadsPerBlock*3/16>>>(); + LAUNCHERROR("kFitCOM"); + kKabschNextNL_kernel<<blocks, gpu->threadsPerBlock*3/16>>>(); + LAUNCHERROR("kFitCOM"); + } + else { + kKabschPrev_kernel<<blocks, gpu->threadsPerBlock*3/16>>>(); + LAUNCHERROR("kFitCOM"); + kKabschNext_kernel<<blocks, gpu->threadsPerBlock*3/16>>>(); + LAUNCHERROR("kFitCOM"); + } + gpu->pbKabschCOM->Download(); + + det = gpu->pbKabschCOM->_pSysData[0] * gpu->pbKabschCOM->_pSysData[4] * gpu->pbKabschCOM->_pSysData[8] - + gpu->pbKabschCOM->_pSysData[0] * gpu->pbKabschCOM->_pSysData[5] * gpu->pbKabschCOM->_pSysData[7] - + gpu->pbKabschCOM->_pSysData[1] * gpu->pbKabschCOM->_pSysData[3] * gpu->pbKabschCOM->_pSysData[8] + + gpu->pbKabschCOM->_pSysData[1] * gpu->pbKabschCOM->_pSysData[5] * gpu->pbKabschCOM->_pSysData[6] + + gpu->pbKabschCOM->_pSysData[2] * gpu->pbKabschCOM->_pSysData[3] * gpu->pbKabschCOM->_pSysData[7] - + gpu->pbKabschCOM->_pSysData[2] * gpu->pbKabschCOM->_pSysData[4] * gpu->pbKabschCOM->_pSysData[6]; + + if (abs(det) < 1.0e-07) { + printf("small determinant in rmsfit(): %f\n", abs(det)); + goto exit1; + } + + for(int i = 0; i < 9; i++) { + Kabsch2[i] = 0.0; + b[i] = 0.0; + } + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < 3; k++) { + Kabsch2[3*i+j] += gpu->pbKabschCOM->_pSysData[3*k+i] + * gpu->pbKabschCOM->_pSysData[3*k+j]; + } + } + } + + // Calculate eigenvalues using the DGEEV subroutine + dgeev_(&Nchar,&Vchar,&n,Kabsch2,&n,eigReal,eigImag, + vl,&ldvl,vr,&ldvr,work,&lwork,&info); + + // Check for errors + if (info!=0){ + printf("Error in diagonalization routine dgeev %d\n", info); + goto exit1; + } + + // Find the smallest eigenvalue + small = 1.0e20; + for (int i = 0; i < 3; i++) { + if (eigReal[i] < small) { + ismall = i; + small = eigReal[i]; + } + } + + // Calculate the b vectors + for (int j = 0; j < 3; j++) { + norm = 1.0/sqrt(abs(eigReal[j])); + for (int i = 0; i < 3; i++) { + for (int k = 0; k < 3; k++) { + b[3*i+j] += gpu->pbKabschCOM->_pSysData[3*i+k] * vr[3*j+k] * norm; + } + } + } + + CONTINUE1: + + for(int i = 0; i < 9; i++) { + gpu->pbRotAtm->_pSysData[i] = 0.0; + } + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < 3; k++) { + gpu->pbRotAtm->_pSysData[3*i+j] += b[3*i+k] * vr[3*k+j]; + } + } + } + + det = gpu->pbRotAtm->_pSysData[0] * gpu->pbRotAtm->_pSysData[4] * gpu->pbRotAtm->_pSysData[8] - + gpu->pbRotAtm->_pSysData[0] * gpu->pbRotAtm->_pSysData[5] * gpu->pbRotAtm->_pSysData[7] - + gpu->pbRotAtm->_pSysData[1] * gpu->pbRotAtm->_pSysData[3] * gpu->pbRotAtm->_pSysData[8] + + gpu->pbRotAtm->_pSysData[1] * gpu->pbRotAtm->_pSysData[5] * gpu->pbRotAtm->_pSysData[6] + + gpu->pbRotAtm->_pSysData[2] * gpu->pbRotAtm->_pSysData[3] * gpu->pbRotAtm->_pSysData[7] - + gpu->pbRotAtm->_pSysData[2] * gpu->pbRotAtm->_pSysData[4] * gpu->pbRotAtm->_pSysData[6]; + + if (abs(det) < 1.0e-10) { + printf("small determinant in rmsfit(): %f\n", abs(det)); + goto exit1; + } + + if (det < 0) { + for (int i = 0; i < 3; i++) { + b[3*i+ismall] = -b[3*i+ismall]; + } + goto CONTINUE1; + } + + if (!(gpu->bCanMapHostMemory)) { + gpu->pbRotAtm->Upload(); + } + + kRotatePrev_kernel<<blocks, gpu->threadsPerBlock/4>>>(); + LAUNCHERROR("kFitCOM"); + + cudaDeviceSynchronize(); + + exit1: {} + + det = gpu->pbKabschCOM->_pSysData[9] * gpu->pbKabschCOM->_pSysData[13] * gpu->pbKabschCOM->_pSysData[17] - + gpu->pbKabschCOM->_pSysData[9] * gpu->pbKabschCOM->_pSysData[14] * gpu->pbKabschCOM->_pSysData[16] - + gpu->pbKabschCOM->_pSysData[10] * gpu->pbKabschCOM->_pSysData[12] * gpu->pbKabschCOM->_pSysData[17] + + gpu->pbKabschCOM->_pSysData[10] * gpu->pbKabschCOM->_pSysData[14] * gpu->pbKabschCOM->_pSysData[15] + + gpu->pbKabschCOM->_pSysData[11] * gpu->pbKabschCOM->_pSysData[12] * gpu->pbKabschCOM->_pSysData[16] - + gpu->pbKabschCOM->_pSysData[11] * gpu->pbKabschCOM->_pSysData[13] * gpu->pbKabschCOM->_pSysData[15]; + + if (abs(det) < 1.0e-07) { + printf("small determinant in rmsfit(): %f\n", abs(det)); + goto exit2; + } + + for(int i = 0; i < 9; i++) { + Kabsch2[i] = 0.0; + b[i] = 0.0; + } + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < 3; k++) { + Kabsch2[3*i+j] += gpu->pbKabschCOM->_pSysData[3*k+i+9] + * gpu->pbKabschCOM->_pSysData[3*k+j+9]; + } + } + } + + // Calculate eigenvalues using the DGEEV subroutine + dgeev_(&Nchar,&Vchar,&n,Kabsch2,&n,eigReal,eigImag, + vl,&ldvl,vr,&ldvr,work,&lwork,&info); + + // Check for errors + if (info!=0){ + printf("Error in diagonalization routine dgeev"); + goto exit2; + } + + // Find the smallest eigenvalue + small = 1.0e20; + for (int i = 0; i < 3; i++) { + if (eigReal[i] < small) { + ismall = i; + small = eigReal[i]; + } + } + + // Calculate the b vectors + for (int j = 0; j < 3; j++) { + norm = 1.0/sqrt(abs(eigReal[j])); + for (int i = 0; i < 3; i++) { + for (int k = 0; k < 3; k++) { + b[3*i+j] += gpu->pbKabschCOM->_pSysData[3*i+k+9] * vr[3*j+k] * norm; + } + } + } + + CONTINUE2: + + for(int i = 0; i < 9; i++) { + gpu->pbRotAtm->_pSysData[i] = 0.0; + } + + for (int i = 0; i < 3; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < 3; k++) { + gpu->pbRotAtm->_pSysData[3*i+j] += b[3*i+k] * vr[3*k+j]; + } + } + } + + det = gpu->pbRotAtm->_pSysData[0] * gpu->pbRotAtm->_pSysData[4] * gpu->pbRotAtm->_pSysData[8] - + gpu->pbRotAtm->_pSysData[0] * gpu->pbRotAtm->_pSysData[5] * gpu->pbRotAtm->_pSysData[7] - + gpu->pbRotAtm->_pSysData[1] * gpu->pbRotAtm->_pSysData[3] * gpu->pbRotAtm->_pSysData[8] + + gpu->pbRotAtm->_pSysData[1] * gpu->pbRotAtm->_pSysData[5] * gpu->pbRotAtm->_pSysData[6] + + gpu->pbRotAtm->_pSysData[2] * gpu->pbRotAtm->_pSysData[3] * gpu->pbRotAtm->_pSysData[7] - + gpu->pbRotAtm->_pSysData[2] * gpu->pbRotAtm->_pSysData[4] * gpu->pbRotAtm->_pSysData[6]; + + if (abs(det) < 1.0e-10) { + printf("small determinant in rmsfit(): %f\n", abs(det)); + goto exit2; + } + + if (det < 0) { + for (int i = 0; i < 3; i++) { + b[3*i+ismall] = -b[3*i+ismall]; + } + goto CONTINUE2; + } + + if (!(gpu->bCanMapHostMemory)) { + gpu->pbRotAtm->Upload(); + } + + kRotateNext_kernel<<blocks, gpu->threadsPerBlock/4>>>(); + LAUNCHERROR("kFitCOM"); + + kReCenterData_kernel<<blocks, gpu->threadsPerBlock/4>>>(); + LAUNCHERROR("kFitCOM"); + cudaDeviceSynchronize(); + + exit2: {} + +} + +extern "C" void kNEBfrc(gpuContext gpu, double neb_force[][3]) +{ + if (gpu->sim.tmode == 1) { + + int rep = gpu->sim.beadid - 1; + if ((gpu->pbNEBEnergyAll->_pSysData[rep + 1] > gpu->pbNEBEnergyAll->_pSysData[rep]) && + (gpu->pbNEBEnergyAll->_pSysData[rep] > gpu->pbNEBEnergyAll->_pSysData[rep - 1])) { + kNEBRevisedTanFIRST_kernel<<blocks, 96>>>(); + } + else if ((gpu->pbNEBEnergyAll->_pSysData[rep + 1] < gpu->pbNEBEnergyAll->_pSysData[rep]) && + (gpu->pbNEBEnergyAll->_pSysData[rep] < gpu->pbNEBEnergyAll->_pSysData[rep - 1])) { + kNEBRevisedTanSECOND_kernel<<blocks, 96>>>(); + } + else if (gpu->pbNEBEnergyAll->_pSysData[rep + 1] > gpu->pbNEBEnergyAll->_pSysData[rep - 1]) { + kNEBRevisedTanTHIRD_kernel<<blocks, 96>>>(); + } + else { + kNEBRevisedTanFOURTH_kernel<<blocks, 96>>>(); + } + } + else { + kNEBBasicTan_kernel<<blocks, 96>>>(); + } + + kNormalize_kernel<<blocks, 96>>>(); + cudaDeviceSynchronize(); + + kNormalizeTan_kernel<<blocks, 96>>>(); + cudaDeviceSynchronize(); + + if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) { + kRetrieveForceNL_kernel<<blocks, 96>>>(); + } + else { + kRetrieveForce_kernel<<blocks, 96>>>(); + } + cudaDeviceSynchronize(); + + kDotProduct_kernel<<blocks, 96>>>(); + + if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) { + kNEBFRCNL_kernel<<blocks, 96>>>(); + } + else { + kNEBFRC_kernel<<blocks, 96>>>(); + } +} + +extern "C" void kNEBfrc_nstep(gpuContext gpu, double neb_force[][3]) +{ + if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) { + kNEBFRCNL_kernel<<blocks, 96>>>(); + } + else { + kNEBFRC_kernel<<blocks, 96>>>(); + } +} diff --git src/pmemd/src/cuda/kNEB.h src/pmemd/src/cuda/kNEB.h new file mode 100644 index 0000000..6e526ee --- /dev/null +++ src/pmemd/src/cuda/kNEB.h @@ -0,0 +1,95 @@ +#include "copyright.i" +{ + int warpIdx = threadIdx.x >> GRID_BITS; + int tgx = threadIdx.x & GRID_BITS_MASK; + int pos = tgx + (GRID * blockIdx.x); // Position in the 1D array. +#ifdef REVISED_TAN + #if defined(THIRD) || defined(FOURTH) + double emax = 0.0; + double emin = 0.0; + int replica = cSim.beadid - 1; + #endif +#endif + +#ifdef NEB_FRC + __shared__ int atidx[GRID]; +#endif + +#ifdef NORMALIZE + __shared__ double norm; +#endif + + while (pos < cSim.nShuttle) { + int idx = pos + warpIdx * cSim.nShuttle; // Position in the extended 3D array. + +#ifdef NEB_FRC + if (warpIdx == 0) { +#ifdef NEIGHBOR_LIST + int shidx = cSim.pShuttleTickets[pos]; + atidx[tgx] = cSim.pImageAtomLookup[shidx]; +#else + atidx[tgx] = cSim.pShuttleTickets[pos]; +#endif + } + __syncthreads(); +#endif + + if ((cSim.pAtmIdx[pos] == 0) || (cSim.pAtmIdx[pos] == 2)) { +#ifdef REVISED_TAN + #ifdef FIRST + cSim.pTangents[idx] = cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx]; + #elif defined(SECOND) + cSim.pTangents[idx] = cSim.pDataShuttle[idx] - cSim.pPrevDataShuttle[idx]; + #elif defined(THIRD) + emax = max(abs(cSim.pNEBEnergyAll[replica + 1] - cSim.pNEBEnergyAll[replica]), + abs(cSim.pNEBEnergyAll[replica - 1] - cSim.pNEBEnergyAll[replica])); + emin = min(abs(cSim.pNEBEnergyAll[replica + 1] - cSim.pNEBEnergyAll[replica]), + abs(cSim.pNEBEnergyAll[replica - 1] - cSim.pNEBEnergyAll[replica])); + cSim.pTangents[idx] = (cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx]) * emax; + cSim.pTangents[idx] += (cSim.pDataShuttle[idx] - cSim.pPrevDataShuttle[idx]) * emin; + #elif defined(FOURTH) + emax = max(abs(cSim.pNEBEnergyAll[replica + 1] - cSim.pNEBEnergyAll[replica]), + abs(cSim.pNEBEnergyAll[replica - 1] - cSim.pNEBEnergyAll[replica])); + emin = min(abs(cSim.pNEBEnergyAll[replica + 1] - cSim.pNEBEnergyAll[replica]), + abs(cSim.pNEBEnergyAll[replica - 1] - cSim.pNEBEnergyAll[replica])); + cSim.pTangents[idx] = (cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx]) * emin; + cSim.pTangents[idx] += (cSim.pDataShuttle[idx] - cSim.pPrevDataShuttle[idx]) * emax; + #endif +#elif defined(BASIC_TAN) + cSim.pTangents[idx] = cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx]; +#endif + } + else { + cSim.pTangents[idx] = 0.0; + } + +#ifdef NORMALIZE + if (threadIdx.x == 0) { + if (*cSim.pNorm > 1.0e-06) { + norm = 1.0/sqrt(*cSim.pNorm); + } + else { + norm = 0.0; + } + } + __syncthreads(); + + if ((cSim.pAtmIdx[pos] == 0) || (cSim.pAtmIdx[pos] == 2)) { + cSim.pTangents[idx] *= norm; + } + + cSim.pSpringForce[idx] = ((cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx]) * (*cSim.pSpring2) + - (cSim.pDataShuttle[idx] - cSim.pPrevDataShuttle[idx]) * (*cSim.pSpring1)) * cSim.pTangents[idx]; +#endif + +#ifdef NEB_FRC + //cSim.pNEBForce[idx] = ((*cSim.pDotProduct2) * cSim.pTangents[idx]) + // - ((*cSim.pDotProduct1) * cSim.pTangents[idx]);//comment here after test + PMEAccumulator neb_frc = (((*cSim.pDotProduct2) * cSim.pTangents[idx]) + - ((*cSim.pDotProduct1) * cSim.pTangents[idx])) * FORCESCALE; + cSim.pForceAccumulator[atidx[tgx] + warpIdx * cSim.stride] += neb_frc; +#endif + + pos += gridDim.x * GRID; + } +} diff --git src/pmemd/src/cuda/kNorm.h src/pmemd/src/cuda/kNorm.h new file mode 100644 index 0000000..6a750ce --- /dev/null +++ src/pmemd/src/cuda/kNorm.h @@ -0,0 +1,45 @@ +#include "copyright.i" +{ + int index = blockIdx.x * blockDim.x + threadIdx.x; + int tgx = threadIdx.x & GRID_BITS_MASK; +#ifdef REDUCE_TAN + double length = 0.0; +#elif defined(REDUCE_DOTPROD) + double dotprod1 = 0.0; + double dotprod2 = 0.0; +#endif + + for (int idx = index; idx < 3*cSim.nShuttle; idx += blockDim.x * gridDim.x) { +#ifdef REDUCE_TAN + length += (cSim.pTangents[idx] * cSim.pTangents[idx]); +#elif defined(REDUCE_DOTPROD) + dotprod1 += (cSim.pDataShuttle[idx] * cSim.pTangents[idx]); //pDataShuttle at this line should include forces, not the coordinates. + dotprod2 += (cSim.pSpringForce[idx]); +#endif + } + +#ifdef REDUCE_TAN + length = warpReduceSum(length); +#elif defined(REDUCE_DOTPROD) + dotprod1 = warpReduceSum(dotprod1); + dotprod2 = warpReduceSum(dotprod2); +#endif + + if ( tgx == 0 ) { +#if (__CUDA_ARCH__ < 600) || (__CUDACC_VER_MAJOR__ < 8) +#ifdef REDUCE_TAN + voidatomicAdd(cSim.pNorm,length); +#elif defined(REDUCE_DOTPROD) + voidatomicAdd(cSim.pDotProduct1, dotprod1); + voidatomicAdd(cSim.pDotProduct2, dotprod2); +#endif +#else +#ifdef REDUCE_TAN + atomicAdd(cSim.pNorm,length); +#elif defined(REDUCE_DOTPROD) + atomicAdd(cSim.pDotProduct1, dotprod1); + atomicAdd(cSim.pDotProduct2, dotprod2); +#endif +#endif + } +} diff --git src/pmemd/src/cuda/kRMSFit.h src/pmemd/src/cuda/kRMSFit.h new file mode 100644 index 0000000..0519547 --- /dev/null +++ src/pmemd/src/cuda/kRMSFit.h @@ -0,0 +1,258 @@ +#include "copyright.i" +{ +#if !defined(CALC_COM) && !defined(KABSCH) + int pos = threadIdx.x + blockDim.x * blockIdx.x; +#endif + +#ifdef MASS_CALC + int tid = threadIdx.x; + __shared__ PMEDouble totMass; +#endif + +#if defined(CALC_COM) || defined(KABSCH) + int warpIdx = threadIdx.x >> 6; //there are 64*3=7192 total threads per block and 3 warpIdxs: 0, 1, and 2 + int tgx = threadIdx.x & 63; // this goes from 0-63 in each warp + int pos = tgx + blockIdx.x * 64; + int tid = threadIdx.x; + __shared__ PMEDouble mass[64]; + __shared__ double sXData[THREADS_PER_BLOCK*3/16]; + __shared__ double sYData[THREADS_PER_BLOCK*3/16]; + __shared__ double sZData[THREADS_PER_BLOCK*3/16]; +#endif + + while (pos < cSim.nShuttle) { + +#if defined(CALC_COM) || defined(KABSCH) + sXData[tid] = 0.0; + sYData[tid] = 0.0; + sZData[tid] = 0.0; + + if (warpIdx == 0) { + #ifdef NEIGHBOR_LIST + int shidx = cSim.pShuttleTickets[pos]; + int atidx = cSim.pImageAtomLookup[shidx]; + mass[tgx] = cSim.pImageMass[atidx]; + #else + int atidx = cSim.pShuttleTickets[pos]; + mass[tgx] = cSim.pAtomMass[atidx]; + #endif + } + __syncthreads(); +#endif + +#ifdef MASS_CALC + #ifdef NEIGHBOR_LIST + int shidx = cSim.pShuttleTickets[pos]; + int atidx = cSim.pImageAtomLookup[shidx]; + PMEDouble mass = cSim.pImageMass[atidx]; + #else + int atidx = cSim.pShuttleTickets[pos]; + PMEDouble mass = cSim.pAtomMass[atidx]; + #endif +#endif + +#ifdef CALC_COM + if ((cSim.pAtmIdx[pos] == 1) || (cSim.pAtmIdx[pos] == 2)) { + if (warpIdx == 0) { + sXData[tid] = cSim.pDataShuttle[pos] * mass[tgx]; + sYData[tid] = cSim.pDataShuttle[pos + cSim.nShuttle] * mass[tgx]; + sZData[tid] = cSim.pDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx]; + } + else if (warpIdx == 1) { + sXData[tid] = cSim.pPrevDataShuttle[pos] * mass[tgx]; + sYData[tid] = cSim.pPrevDataShuttle[pos + cSim.nShuttle] * mass[tgx]; + sZData[tid] = cSim.pPrevDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx]; + } + else if (warpIdx == 2) { + sXData[tid] = cSim.pNextDataShuttle[pos] * mass[tgx]; + sYData[tid] = cSim.pNextDataShuttle[pos + cSim.nShuttle] * mass[tgx]; + sZData[tid] = cSim.pNextDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx]; + } + } + __syncthreads(); +#elif defined(KABSCH) + #ifdef NEXT_NEIGHBOR + if ((cSim.pAtmIdx[pos] == 1) || (cSim.pAtmIdx[pos] == 2)) { + sXData[tid] = cSim.pNextDataShuttle[pos] * mass[tgx] * + cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle]; + sYData[tid] = cSim.pNextDataShuttle[pos + cSim.nShuttle] * mass[tgx] * + cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle]; + sZData[tid] = cSim.pNextDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx] * + cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle]; + } + __syncthreads(); + #elif defined(PREV_NEIGHBOR) + if ((cSim.pAtmIdx[pos] == 1) || (cSim.pAtmIdx[pos] == 2)) { + sXData[tid] = cSim.pPrevDataShuttle[pos] * mass[tgx] * + cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle]; + sYData[tid] = cSim.pPrevDataShuttle[pos + cSim.nShuttle] * mass[tgx] * + cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle]; + sZData[tid] = cSim.pPrevDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx] * + cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle]; + } + __syncthreads(); + #endif +#endif + +#ifdef MASS_CALC + if (tid == 0) { + totMass = 0.0; + } + __syncthreads(); + if ((cSim.pAtmIdx[pos] == 1) || (cSim.pAtmIdx[pos] == 2)) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 + voidatomicAdd(&totMass, mass); +#else + atomicAdd(&totMass, mass); +#endif + } + __syncthreads(); +#endif + +#if defined(CALC_COM) || defined(KABSCH) + for (unsigned int i = blockDim.x/6; i > 0; i >>= 1) { + if (tgx < i && (pos + i) < cSim.nShuttle) { + sXData[tid] += sXData[tid + i]; + sYData[tid] += sYData[tid + i]; + sZData[tid] += sZData[tid + i]; + } + __syncthreads(); + } +#endif + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 + #ifdef CALC_COM + if (tgx == 0) { + if (warpIdx == 0) { + voidatomicAdd(&cSim.pSelfCOM[0], sXData[0] / (*cSim.ptotFitMass)); + voidatomicAdd(&cSim.pSelfCOM[1], sYData[0] / (*cSim.ptotFitMass)); + voidatomicAdd(&cSim.pSelfCOM[2], sZData[0] / (*cSim.ptotFitMass)); + } + else if (warpIdx == 1) { + voidatomicAdd(&cSim.pPrevCOM[0], sXData[64] / (*cSim.ptotFitMass)); + voidatomicAdd(&cSim.pPrevCOM[1], sYData[64] / (*cSim.ptotFitMass)); + voidatomicAdd(&cSim.pPrevCOM[2], sZData[64] / (*cSim.ptotFitMass)); + } + else if (warpIdx == 2) { + voidatomicAdd(&cSim.pNextCOM[0], sXData[128] / (*cSim.ptotFitMass)); + voidatomicAdd(&cSim.pNextCOM[1], sYData[128] / (*cSim.ptotFitMass)); + voidatomicAdd(&cSim.pNextCOM[2], sZData[128] / (*cSim.ptotFitMass)); + } + } + #elif defined(KABSCH) + if (tgx == 0) { + #ifdef NEXT_NEIGHBOR + voidatomicAdd(&cSim.pNextKabsch[0 + warpIdx * 3], sXData[warpIdx * 64]); + voidatomicAdd(&cSim.pNextKabsch[1 + warpIdx * 3], sYData[warpIdx * 64]); + voidatomicAdd(&cSim.pNextKabsch[2 + warpIdx * 3], sZData[warpIdx * 64]); + #elif defined(PREV_NEIGHBOR) + voidatomicAdd(&cSim.pPrevKabsch[0 + warpIdx * 3], sXData[warpIdx * 64]); + voidatomicAdd(&cSim.pPrevKabsch[1 + warpIdx * 3], sYData[warpIdx * 64]); + voidatomicAdd(&cSim.pPrevKabsch[2 + warpIdx * 3], sZData[warpIdx * 64]); + #endif + } + #elif defined(MASS_CALC) + if (tid == 0) { + voidatomicAdd(cSim.ptotFitMass, totMass); + } + #endif +#else + #ifdef CALC_COM + if (tgx == 0) { + if (warpIdx == 0) { + atomicAdd(&cSim.pSelfCOM[0], sXData[0] / (*cSim.ptotFitMass)); + atomicAdd(&cSim.pSelfCOM[1], sYData[0] / (*cSim.ptotFitMass)); + atomicAdd(&cSim.pSelfCOM[2], sZData[0] / (*cSim.ptotFitMass)); + } + else if (warpIdx == 1) { + atomicAdd(&cSim.pPrevCOM[0], sXData[64] / (*cSim.ptotFitMass)); + atomicAdd(&cSim.pPrevCOM[1], sYData[64] / (*cSim.ptotFitMass)); + atomicAdd(&cSim.pPrevCOM[2], sZData[64] / (*cSim.ptotFitMass)); + } + else if (warpIdx == 2) { + atomicAdd(&cSim.pNextCOM[0], sXData[128] / (*cSim.ptotFitMass)); + atomicAdd(&cSim.pNextCOM[1], sYData[128] / (*cSim.ptotFitMass)); + atomicAdd(&cSim.pNextCOM[2], sZData[128] / (*cSim.ptotFitMass)); + } + } + #elif defined(KABSCH) + if (tgx == 0) { + #ifdef NEXT_NEIGHBOR + atomicAdd(&cSim.pNextKabsch[0 + warpIdx * 3], sXData[warpIdx * 64]); + atomicAdd(&cSim.pNextKabsch[1 + warpIdx * 3], sYData[warpIdx * 64]); + atomicAdd(&cSim.pNextKabsch[2 + warpIdx * 3], sZData[warpIdx * 64]); + #elif defined(PREV_NEIGHBOR) + atomicAdd(&cSim.pPrevKabsch[0 + warpIdx * 3], sXData[warpIdx * 64]); + atomicAdd(&cSim.pPrevKabsch[1 + warpIdx * 3], sYData[warpIdx * 64]); + atomicAdd(&cSim.pPrevKabsch[2 + warpIdx * 3], sZData[warpIdx * 64]); + #endif + } + #elif defined(MASS_CALC) + if (tid == 0) { + atomicAdd(cSim.ptotFitMass, totMass); + } + #endif +#endif + +#ifdef CENTER_DATA + cSim.pDataShuttle[ pos] = cSim.pDataShuttle[ pos] - cSim.pSelfCOM[0]; + cSim.pDataShuttle[ cSim.nShuttle + pos] = cSim.pDataShuttle[ cSim.nShuttle + pos] - cSim.pSelfCOM[1]; + cSim.pDataShuttle[2 * cSim.nShuttle + pos] = cSim.pDataShuttle[2 * cSim.nShuttle + pos] - cSim.pSelfCOM[2]; + cSim.pPrevDataShuttle[ pos] = cSim.pPrevDataShuttle[ pos] - cSim.pPrevCOM[0]; + cSim.pPrevDataShuttle[ cSim.nShuttle + pos] = cSim.pPrevDataShuttle[ cSim.nShuttle + pos] - cSim.pPrevCOM[1]; + cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] = cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] - cSim.pPrevCOM[2]; + cSim.pNextDataShuttle[ pos] = cSim.pNextDataShuttle[ pos] - cSim.pNextCOM[0]; + cSim.pNextDataShuttle[ cSim.nShuttle + pos] = cSim.pNextDataShuttle[ cSim.nShuttle + pos] - cSim.pNextCOM[1]; + cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] = cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] - cSim.pNextCOM[2]; +#endif + +#ifdef ROTATE_NEIGHBOR + #ifdef NEXT_NEIGHBOR + double xtmp = cSim.pNextDataShuttle[ pos] * cSim.pRotAtm[0] + + cSim.pNextDataShuttle[ cSim.nShuttle + pos] * cSim.pRotAtm[1] + + cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[2]; + double ytmp = cSim.pNextDataShuttle[ + pos] * cSim.pRotAtm[3] + + cSim.pNextDataShuttle[ cSim.nShuttle + pos] * cSim.pRotAtm[4] + + cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[5]; + double ztmp = cSim.pNextDataShuttle[ + pos] * cSim.pRotAtm[6] + + cSim.pNextDataShuttle[ cSim.nShuttle + pos] * cSim.pRotAtm[7] + + cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[8]; + cSim.pNextDataShuttle[ pos] = xtmp; + cSim.pNextDataShuttle[ cSim.nShuttle + pos] = ytmp; + cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] = ztmp; + #elif defined(PREV_NEIGHBOR) + double xtmp = cSim.pPrevDataShuttle[ pos] * cSim.pRotAtm[0] + + cSim.pPrevDataShuttle[ cSim.nShuttle + pos] * cSim.pRotAtm[1] + + cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[2]; + double ytmp = cSim.pPrevDataShuttle[ + pos] * cSim.pRotAtm[3] + + cSim.pPrevDataShuttle[ cSim.nShuttle + pos] * cSim.pRotAtm[4] + + cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[5]; + double ztmp = cSim.pPrevDataShuttle[ + pos] * cSim.pRotAtm[6] + + cSim.pPrevDataShuttle[ cSim.nShuttle + pos] * cSim.pRotAtm[7] + + cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[8]; + cSim.pPrevDataShuttle[ pos] = xtmp; + cSim.pPrevDataShuttle[ cSim.nShuttle + pos] = ytmp; + cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] = ztmp; + #endif +#endif + +#ifdef RECENTER_DATA + cSim.pDataShuttle[ pos] = cSim.pDataShuttle[ pos] + cSim.pSelfCOM[0]; + cSim.pDataShuttle[ cSim.nShuttle + pos] = cSim.pDataShuttle[ cSim.nShuttle + pos] + cSim.pSelfCOM[1]; + cSim.pDataShuttle[2 * cSim.nShuttle + pos] = cSim.pDataShuttle[2 * cSim.nShuttle + pos] + cSim.pSelfCOM[2]; + cSim.pNextDataShuttle[ pos] = cSim.pNextDataShuttle[ pos] + cSim.pSelfCOM[0]; + cSim.pNextDataShuttle[ cSim.nShuttle + pos] = cSim.pNextDataShuttle[ cSim.nShuttle + pos] + cSim.pSelfCOM[1]; + cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] = cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] + cSim.pSelfCOM[2]; + cSim.pPrevDataShuttle[ pos] = cSim.pPrevDataShuttle[ pos] + cSim.pSelfCOM[0]; + cSim.pPrevDataShuttle[ cSim.nShuttle + pos] = cSim.pPrevDataShuttle[ cSim.nShuttle + pos] + cSim.pSelfCOM[1]; + cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] = cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] + cSim.pSelfCOM[2]; +#endif + + +#if defined(CALC_COM) || defined(KABSCH) + pos += gridDim.x*64; +#else + pos += gridDim.x*256; +#endif + } +} diff --git src/pmemd/src/gb_force.F90 src/pmemd/src/gb_force.F90 index 16eedb4..34e5a70 100644 --- src/pmemd/src/gb_force.F90 +++ src/pmemd/src/gb_force.F90 @@ -287,18 +287,18 @@ subroutine gb_force(atm_cnt, crd, frc, pot_ene, irespa, need_pot_ene) ! END GBSA implementation to work with the GPU code. if (on_cpstep) then call gpu_refresh_charges(cit_nb14, gbl_one_scee, proposed_qterm) - call gpu_gb_ene(pot_ene_prop, enmr) + call gpu_gb_ene(pot_ene_prop, enmr, ineb) call gpu_refresh_charges(cit_nb14, gbl_one_scee, atm_qterm) - call gpu_gb_ene(pot_ene, enmr) + call gpu_gb_ene(pot_ene, enmr, ineb) pot_ene%dvdl = pot_ene_prop%total - pot_ene%total else if (on_cestep) then call gpu_refresh_charges(cit_nb14, gbl_one_scee, proposed_qterm) - call gpu_gb_ene(pot_ene_prop, enmr) + call gpu_gb_ene(pot_ene_prop, enmr, ineb) call gpu_refresh_charges(cit_nb14, gbl_one_scee, atm_qterm) - call gpu_gb_ene(pot_ene, enmr) + call gpu_gb_ene(pot_ene, enmr, ineb) pot_ene%dvdl = pot_ene_prop%total - pot_ene%total else - call gpu_gb_ene(pot_ene, enmr) + call gpu_gb_ene(pot_ene, enmr, ineb) end if ! BEGIN GBSA implementation to work with the GPU code. ! force elements sent to the GPU to be added to the other force contributions @@ -394,33 +394,13 @@ subroutine gb_force(atm_cnt, crd, frc, pot_ene, irespa, need_pot_ene) #ifdef MPI if (ineb>0) then - call gpu_download_crd(crd) - call gpu_gb_ene(pot_ene,enmr) - call gpu_download_frc(frc) - - if(firsttime) then - call nebread() - firsttime= .false. - end if - - call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, irespa) - - do jatm=1,nattgtrms - iatm=rmsmask(jatm) - if(iatm .eq. 0) cycle - frc(1,iatm)= frc(1,iatm)+neb_force(1,jatm) - frc(2,iatm)= frc(2,iatm)+neb_force(2,jatm) - frc(3,iatm)= frc(3,iatm)+neb_force(3,jatm) - end do + call transfer_fit_neb_crd(irespa) + call full_neb_forces(irespa) if (beadid==1 .or. beadid==neb_nbead) then - do i=1, atm_cnt - frc(:,i)=0.d0 - enddo + call gpu_clear_forces() end if - call gpu_upload_frc(frc) - end if #endif /* MPI */ !END_NEB @@ -874,19 +854,14 @@ subroutine gb_force(atm_cnt, crd, frc, pot_ene, irespa, need_pot_ene) call gb_mpi_gathervec(atm_cnt, frc) if(mytaskid.eq.0) then - if(firsttime) then - call nebread() - firsttime= .false. - end if call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, irespa) end if - - call mpi_barrier(mpi_comm_world,err_code_mpi) + call mpi_bcast(neb_force, 3*nattgtrms, MPI_DOUBLE_PRECISION, 0, pmemd_comm, err_code_mpi) do jatm=1,nattgtrms iatm=rmsmask(jatm) if(iatm .eq. 0) cycle - if(mytaskid .eq. 0) then + if (gbl_atm_owner_map(iatm) .eq. mytaskid) then frc(1,iatm)= frc(1,iatm)+neb_force(1,jatm) frc(2,iatm)= frc(2,iatm)+neb_force(2,jatm) frc(3,iatm)= frc(3,iatm)+neb_force(3,jatm) @@ -895,12 +870,11 @@ subroutine gb_force(atm_cnt, crd, frc, pot_ene, irespa, need_pot_ene) if (beadid==1 .or. beadid==neb_nbead) then do i=1, atm_cnt - frc(:,i)=0.d0 + if (gbl_atm_owner_map(i) .eq. mytaskid) then + frc(:,i)=0.d0 + endif enddo end if - - call mpi_bcast(frc, 3*atm_cnt, MPI_DOUBLE_PRECISION, 0, pmemd_comm, err_code_mpi) - end if #endif /* MPI */ diff --git src/pmemd/src/mdin_ctrl_dat.F90 src/pmemd/src/mdin_ctrl_dat.F90 index b4315a2..e284620f 100644 --- src/pmemd/src/mdin_ctrl_dat.F90 +++ src/pmemd/src/mdin_ctrl_dat.F90 @@ -42,7 +42,7 @@ use file_io_dat_mod nucat, & ! gbneck2nu: check if atom belong to nucleic or not mcrescyc,nmd,nmc,mcverbosity, & infe, & ! added by FENG PAN - ineb, skmin, skmax, tmode, vv, & + ineb, skmin, skmax, tmode, vv, nebfreq, & irandom, mask_from_ref, mbar_states, & geq_nstep, ginit_vel, gsyn_mass, & !GTI gremd_acyc, gti_cpu_output, & ! GTI @@ -76,10 +76,10 @@ use file_io_dat_mod geq_nstep, ginit_vel, gsyn_mass, & !99 gremd_acyc, gti_cpu_output, & !101 mcint,nmd,mcrescyc,nmc,mcverbosity, & !106 - ineb,skmin,skmax,tmode,vv, & !111 - iphmd, midpoint !113 + ineb,skmin,skmax,tmode,vv, nebfreq, & !112 + iphmd, midpoint !114 - integer, parameter :: mdin_ctrl_int_cnt = 113 + integer, parameter :: mdin_ctrl_int_cnt = 114 save :: / mdin_ctrl_int / @@ -303,7 +303,7 @@ use file_io_dat_mod mbar_lambda, mbar_states, mask_from_ref, & geq_nstep, ginit_vel, gsyn_mass, gremd_acyc, & gti_cpu_output,mcwat,nmd,mcint,nmc,mcrescyc,mcverbosity,& - mcresstr,mcwatmaxdif,ineb,skmin,skmax,tmode,vv,vfac,& + mcresstr,mcwatmaxdif,ineb,skmin,skmax,tmode,vv,vfac,nebfreq,& mcboxshift, midpoint @@ -705,6 +705,7 @@ subroutine init_mdin_ctrl_dat(remd_method) tmode = 1 ! Use the revised tangent definition by default vv = 0 ! Flag for quenched velocity verlet minimization vfac = 0.d0 ! Scaling factor for quenched velocity verlet + nebfreq = 1 infe = 0 ! added by FENG PAN ! Read input in namelist format: diff --git src/pmemd/src/neb.F90 src/pmemd/src/neb.F90 index 7db2853..ea3c1a3 100644 --- src/pmemd/src/neb.F90 +++ src/pmemd/src/neb.F90 @@ -104,7 +104,27 @@ Contains !------------------------------------------------------------------------------- #ifdef MPI +#ifdef CUDA + subroutine transfer_fit_neb_crd(irespa) + use mdin_ctrl_dat_mod + use nebread_mod + + implicit none + + integer, intent(in) :: irespa + + if ( irespa .eq. 1 .or. mod(irespa,nebfreq)==0 ) then + call gpu_neb_exchange_crd() + call gpu_neb_rmsfit() + endif + end subroutine transfer_fit_neb_crd +#endif + +#ifdef CUDA + subroutine full_neb_forces(irespa) +#else subroutine full_neb_forces(mass, x, f, epot, fitgp, rmsgp, irespa) +#endif use mdin_ctrl_dat_mod use prmtop_dat_mod @@ -114,11 +134,15 @@ Contains implicit none !passed variables - double precision :: mass(:), x(:,:), f(:,:) +#ifndef CUDA + double precision :: x(:,:) + double precision :: mass(:), f(:,:) double precision :: epot integer :: rmsgp(:), fitgp(:) +#endif integer, intent(in) :: irespa +#ifndef CUDA !local variables logical :: rmsok integer :: i, j, rep, iatm, jatm @@ -126,7 +150,7 @@ Contains double precision :: eplus, eminus, energy, ezero, emax double precision :: dotproduct, dotproduct2 double precision :: rmsdvalue, spring, spring2 - +#endif if (mod(neb_nbead,2)/=0) then write (6,*) "must have even number of beads",neb_nbead @@ -139,8 +163,16 @@ Contains ! note also that next_node and prev_node are circular, ! i.e. first and last still have both neighbors defined. !********************************************************** - - !DG: these ranks imply we use pmemd_master_comm for the communicator +#ifdef CUDA + if(irespa .eq. 1 .or. mod(irespa,nebfreq)==0) then + call gpu_set_neb_springs() + call gpu_calculate_neb_frc(neb_force) + else + call gpu_calculate_neb_frc_nstep(neb_force) + endif +#else + if(irespa .eq. 1 .or. mod(irespa,nebfreq)==0) then + !these ranks imply we use pmemd_master_comm for the communicator next_node = master_rank+1 if(next_node>=master_size) next_node = 0 prev_node = master_rank-1 @@ -193,12 +225,9 @@ Contains pmemd_master_comm,err_code_mpi) ! fit neighbor coords to self -! ok to fit both neighbors since next/prev were defined as circular -! note that we pass fitgp and rmsgp, which were arguments to this routine in the call from force() ! first argument is refc (which gets fit to the second argument, the actual MD coords) ! refc in this case is the neighbor image - call rmsfit( xprev, x, mass, fitgp,rmsgp, rmsdvalue, nattgtrms, nattgtfit, rmsok) call rmsfit( xnext, x, mass, fitgp,rmsgp, rmsdvalue, nattgtrms, nattgtfit, rmsok) @@ -397,7 +426,8 @@ Contains write(worldrank+50,'(a)') "-------------------------------------------------" # endif - + endif !nebfreq +#endif /* CUDA */ return end subroutine full_neb_forces @@ -412,6 +442,10 @@ Contains integer :: filenumber,ix double precision :: sum +#if defined(CUDA) && defined(MPI) + call gpu_report_neb_ene(master_size, neb_nrg_all) +#endif + sum = 0.d0 do ix = 1, neb_nbead sum = sum + neb_nrg_all(ix) @@ -659,7 +693,7 @@ Contains ! here are copies, and we do not actually modify the coordinates being ! used by the neighbor image MD (just the copies that we received from them) - do i=1, last_neb_atom !natom DG + do i=1, last_neb_atom x(1,i) = x(1,i) + xcm1 x(2,i) = x(2,i) + ycm1 x(3,i) = x(3,i) + zcm1 diff --git src/pmemd/src/nebread.F90 src/pmemd/src/nebread.F90 index db20cf2..66d7b36 100644 --- src/pmemd/src/nebread.F90 +++ src/pmemd/src/nebread.F90 @@ -19,8 +19,9 @@ module nebread_mod integer :: beadid !mybeadid in sander neb integer, allocatable :: fitmask(:), rmsmask(:) type(atm_info), allocatable :: sortfit(:), sortrms(:) - integer, allocatable :: combined_mask(:) !DG: combined and sorted mask from fitmask and rmsmask + type(atm_info), allocatable :: combined_mask(:) !DG: combined and sorted mask from fitmask and rmsmask integer, allocatable :: partial_mask(:) !DG: same as combined_mask with no end zeroes, needed for partial coordinates download from gpu. + integer, allocatable :: idx_mask(:) integer :: pcnt !DG: partial counts, partial_mask number of elements. integer :: alloc_stat @@ -100,7 +101,8 @@ subroutine nebread() allocate(combined_mask(nattgtrms+nattgtfit), stat=alloc_stat) if ( alloc_stat .ne. 0 ) & call allocation_error('nebread','Error allocating temporary arrays') - combined_mask(:) = 0 + combined_mask(:)%atm_num = 0 + combined_mask(:)%atm_idx = 0 do n=1,nattgtfit sortfit(n)%atm_num = fitmask(n) @@ -114,7 +116,7 @@ subroutine nebread() pcnt=0 do n=1,size(combined_mask) - if(combined_mask(n) .ne. 0) then + if(combined_mask(n)%atm_num .ne. 0) then pcnt = pcnt + 1 endif enddo @@ -122,8 +124,15 @@ subroutine nebread() allocate(partial_mask(pcnt), stat=alloc_stat) if ( alloc_stat .ne. 0 ) & call allocation_error('nebread','Error allocating temporary arrays') - partial_mask = combined_mask(:pcnt) + partial_mask(:) = 0 + allocate(idx_mask(pcnt), stat=alloc_stat) + if ( alloc_stat .ne. 0 ) & + call allocation_error('nebread','Error allocating temporary arrays') + idx_mask(:) = 0 + + partial_mask(1:pcnt) = combined_mask(1:pcnt)%atm_num + idx_mask(1:pcnt) = combined_mask(1:pcnt)%atm_idx endif end subroutine nebread !------------------------------------------------------------------------- @@ -132,7 +141,7 @@ subroutine combine_sort(combined, sortfit, sortrms, nattgtfit, nattgtrms) implicit none !Passed variables - integer :: combined(:) + type(atm_info) :: combined(:) type(atm_info) :: sortfit(:), sortrms(:) integer :: nattgtfit, nattgtrms @@ -146,14 +155,16 @@ subroutine combine_sort(combined, sortfit, sortrms, nattgtfit, nattgtrms) do i=1,nattgtfit+nattgtrms if(sortfit(fitcnt)%atm_num .lt. sortrms(rmscnt)%atm_num) then - combined(i) = sortfit(fitcnt)%atm_num + combined(i)%atm_num = sortfit(fitcnt)%atm_num sortfit(fitcnt)%atm_idx = i + combined(i)%atm_idx = 1 !atom belongs to sortfit fitcnt = fitcnt + 1 else - combined(i) = sortrms(rmscnt)%atm_num - sortrms(rmscnt)%atm_idx = i + combined(i)%atm_num = sortrms(rmscnt)%atm_num + sortrms(rmscnt)%atm_idx = i !leave combined(i)%atm_idx = 0, atom belongs to sortrms if(sortfit(fitcnt)%atm_num .eq. sortrms(rmscnt)%atm_num) then sortfit(fitcnt)%atm_idx = i + combined(i)%atm_idx = 2 !atom belongs to both sortfit and sortrms fitcnt = fitcnt + 1 endif rmscnt = rmscnt + 1 @@ -165,14 +176,15 @@ subroutine combine_sort(combined, sortfit, sortrms, nattgtfit, nattgtrms) if(rmscnt .gt. nattgtrms) then m = nattgtfit - fitcnt do i=0,m - combined(cntr + i + 1) = sortfit(fitcnt + i)%atm_num + combined(cntr + i + 1)%atm_num = sortfit(fitcnt + i)%atm_num sortfit(fitcnt +i)%atm_idx = cntr + i + 1 + combined(cntr + i + 1)%atm_idx = 1 !atom belongs to sortfit enddo else if(fitcnt .gt. nattgtfit) then m = nattgtrms - rmscnt do i=0,m - combined(cntr + i + 1) = sortrms(rmscnt + i)%atm_num - sortrms(rmscnt + i)%atm_idx = cntr + i + 1 + combined(cntr + i + 1)%atm_num = sortrms(rmscnt + i)%atm_num + sortrms(rmscnt + i)%atm_idx = cntr + i + 1 !leave combined(i)%atm_idx = 0, atom belongs to sortrms enddo endif diff --git src/pmemd/src/pme_force.F90 src/pmemd/src/pme_force.F90 index 09bc8ef..a5083f8 100644 --- src/pmemd/src/pme_force.F90 +++ src/pmemd/src/pme_force.F90 @@ -296,7 +296,6 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, & type(pme_virial_rec) :: proc_vir integer :: offset_loc - logical :: firsttime=.true. logical :: send_data integer :: iatm, jatm, task_id type(pme_virial_rec) :: vir @@ -393,16 +392,22 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, & call gti_build_nl_list(ti_mode, lj1264) call gti_clear(need_pot_enes, ti_mode, crd) - call gti_ele_recip(need_pot_enes, need_virials, uc_volume, ti_weights, ti_mode) call gti_nb(need_pot_enes, need_virials, ti_mode, lj1264) - call gti_bonded(need_pot_enes, need_virials, ti_mode) call gti_others(need_pot_enes, nstep, dt) - call gti_finalize_force_virial(numextra, need_virials, ti_mode, netfrc, frc) + if (ineb>0) then +#ifdef TIME_TEST + call start_test_timer(14, 'neb_exchange_crd', 0) +#endif + call transfer_fit_neb_crd(nstep) +#ifdef TIME_TEST + call stop_test_timer(14) +#endif + endif - if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt) + if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt, ineb, nebfreq, nstep) if (ti_mode .gt. 0) then if (need_pot_enes) then @@ -412,21 +417,24 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, & if (need_virials) call gti_get_virial(ti_weights(1:2), virial, ekcmt, ti_mode) end if - if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt) - - if (ti_mode >0) then - if (need_pot_enes) then - if (ifmbar .ne. 0) call gti_update_mbar_from_gpu - call gti_update_ti_pot_energy_from_gpu(pot_ene, enmr, ti_mode, gti_cpu_output) - endif - if (need_virials) call gti_get_virial(ti_weights(1:2), virial, ekcmt, ti_mode) - endif +!DG if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt) +!DG +!DG if (ti_mode >0) then +!DG if (need_pot_enes) then +!DG if (ifmbar .ne. 0) call gti_update_mbar_from_gpu +!DG call gti_update_ti_pot_energy_from_gpu(pot_ene, enmr, ti_mode, gti_cpu_output) +!DG endif +!DG if (need_virials) call gti_get_virial(ti_weights(1:2), virial, ekcmt, ti_mode) +!DG endif #else /* GTI */ if (need_pot_enes) then + if (ineb>0) then + call transfer_fit_neb_crd(nstep) + endif if (ti_mode .eq. 0) then call gpu_pme_ene(ew_coeff, uc_volume, pot_ene, enmr, virial, & - ekcmt, nstep, dt, crd, frc) + ekcmt, nstep, dt, crd, frc, ineb, nebfreq) else call gpu_ti_pme_ene(ew_coeff, uc_volume, pot_ene, enmr, virial, & ekcmt, nstep, dt, crd, frc, bar_cont) @@ -554,34 +562,17 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, & end if if (ineb .gt. 0) then +#ifdef TIME_TEST + call start_test_timer(15, 'neb_force', 0) +#endif + call full_neb_forces(nstep) +#ifdef TIME_TEST + call stop_test_timer(15) +#endif - if (firsttime) then - call nebread() - call gpu_setup_shuttle_info(pcnt, 0, partial_mask) - firsttime = .false. - end if - - call gpu_shuttle_retrieve_data(crd, 0) - call gpu_shuttle_retrieve_data(frc, 1) - call gpu_pme_ene(ew_coeff, uc_volume, pot_ene, enmr, virial, & - ekcmt, nstep, dt) - - call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, nstep) - - do jatm=1,nattgtrms - iatm=rmsmask(jatm) - if (iatm .eq. 0) cycle - - frc(1,iatm) = frc(1,iatm)+neb_force(1,jatm) - frc(2,iatm) = frc(2,iatm)+neb_force(2,jatm) - frc(3,iatm) = frc(3,iatm)+neb_force(3,jatm) - enddo - - call gpu_shuttle_post_data(frc, 1) - if (beadid .eq. 1 .or. beadid .eq. neb_nbead) then + if (beadid==1 .or. beadid==neb_nbead) then call gpu_clear_forces() - end if - + endif end if #else /*Not cuda version below*/ @@ -1638,14 +1629,6 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, & if (ineb .gt. 0) then call mpi_gathervec(atm_cnt, frc) call mpi_gathervec(atm_cnt, crd) - if (firsttime) then - if (mytaskid .eq. 0) then - call nebread() - end if - call mpi_bcast(nattgtrms, 1, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi) - call mpi_bcast(rmsmask, nattgtrms, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi) - firsttime = .false. - end if if (mytaskid .eq. 0) then call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, nstep) @@ -1765,7 +1748,6 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, & type(pme_virial_rec) :: proc_vir integer :: offset_loc - logical :: firsttime=.true. logical :: send_data integer :: iatm, jatm, task_id type(pme_virial_rec) :: vir @@ -1867,8 +1849,11 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, & call gti_others(need_pot_enes, nstep, dt) call gti_finalize_force_virial(numextra, need_virials, ti_mode, netfrc, frc) + if (ineb>0) then + call transfer_fit_neb_crd(nstep) + endif - if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt) + if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt, ineb, nebfreq, nstep) if (ti_mode .gt. 0) then if (need_pot_enes) then @@ -1893,6 +1878,9 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, & call gpu_pme_force(ew_coeff, uc_volume, virial, ekcmt, nstep, dt, crd, frc) end if + if (ineb>0) then + call transfer_fit_neb_crd(nstep) + endif #endif /* GTI */ @@ -2012,37 +2000,11 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, & end if if (ineb .gt. 0) then - - if (firsttime) then - call nebread() - call gpu_setup_shuttle_info(pcnt, 0, partial_mask) - firsttime = .false. - end if - - call gpu_shuttle_retrieve_data(crd, 0) - call gpu_shuttle_retrieve_data(frc, 1) - call gpu_pme_ene(ew_coeff, uc_volume, pot_ene, enmr, virial, & - ekcmt, nstep, dt) - - - call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, nstep) - - do jatm=1,nattgtrms - iatm=rmsmask(jatm) - if (iatm .eq. 0) cycle - - frc(1,iatm)= frc(1,iatm)+neb_force(1,jatm) - frc(2,iatm)= frc(2,iatm)+neb_force(2,jatm) - frc(3,iatm)= frc(3,iatm)+neb_force(3,jatm) - enddo - - call gpu_shuttle_post_data(frc, 1) - if (beadid .eq. 1 .or. beadid .eq. neb_nbead) then + call full_neb_forces(nstep) + if (beadid==1 .or. beadid==neb_nbead) then call gpu_clear_forces() - end if - + endif end if - !END_NEB #else /*Not cuda version below*/ @@ -3056,14 +3018,6 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, & if (ineb .gt. 0) then call mpi_gathervec(atm_cnt, frc) call mpi_gathervec(atm_cnt, crd) - if (firsttime) then - if (mytaskid .eq. 0) then - call nebread() - end if - call mpi_bcast(nattgtrms, 1, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi) - call mpi_bcast(rmsmask, nattgtrms, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi) - firsttime = .false. - end if if (mytaskid .eq. 0) then call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, nstep) @@ -3250,7 +3204,7 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, & call gti_finalize_force_virial(numextra, need_virials, ti_mode, netfrc, frc) - if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt) + if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt, ineb, nebfreq, nstep) if (ti_mode .gt. 0) then if (need_pot_enes) then diff --git src/pmemd/src/pmemd.F90 src/pmemd/src/pmemd.F90 index 9201e34..e553fc2 100644 --- src/pmemd/src/pmemd.F90 +++ src/pmemd/src/pmemd.F90 @@ -746,6 +746,19 @@ if(.not. usemidpoint) then #ifdef MPI if(ineb>0) then call neb_init() +#ifdef CUDA + call nebread() + call gpu_setup_shuttle_info(pcnt, 0, partial_mask) + call gpu_upload_neb_info(neb_nbead, beadid, nattgtrms, nattgtfit, & + last_neb_atom, skmin, skmax, tmode, rmsmask, fitmask,idx_mask) + call gpu_setup_neb() +#else + if (master) then + call nebread() + end if + call mpi_bcast(nattgtrms, 1, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi) + call mpi_bcast(rmsmask, nattgtrms, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi) +#endif end if #endif !DG diff --git src/pmemd/src/runmd.F90 src/pmemd/src/runmd.F90 index 4be6b87..c613359 100644 --- src/pmemd/src/runmd.F90 +++ src/pmemd/src/runmd.F90 @@ -1319,7 +1319,7 @@ DBG_ARRAYS_TIME_STEP(nstep) #endif #ifdef MPI - if ( ineb .gt. 0 ) then + if ( ineb .gt. 0 .and. (mod(nstep, nebfreq)==0)) then need_pot_enes = .true. end if #endif diff --git test/cuda/neb-testcases/neb_explicit/groupfile.in test/cuda/neb-testcases/neb_explicit/groupfile.in index 41f07c7..265d08a 100644 --- test/cuda/neb-testcases/neb_explicit/groupfile.in +++ test/cuda/neb-testcases/neb_explicit/groupfile.in @@ -1,4 +1,4 @@ --O -p neb.prmtop -c inpcrds/neb01.inpcrd -o neb_explicit_01.out -r neb_explicit_01.rst -x neb_explicit_01.mdcrd -inf neb_explicit_01.inf --O -p neb.prmtop -c inpcrds/neb02.inpcrd -o neb_explicit_02.out -r neb_explicit_02.rst -x neb_explicit_02.mdcrd -inf neb_explicit_02.inf --O -p neb.prmtop -c inpcrds/neb03.inpcrd -o neb_explicit_03.out -r neb_explicit_03.rst -x neb_explicit_03.mdcrd -inf neb_explicit_03.inf --O -p neb.prmtop -c inpcrds/neb04.inpcrd -o neb_explicit_04.out -r neb_explicit_04.rst -x neb_explicit_04.mdcrd -inf neb_explicit_04.inf +-O -p neb.prmtop -c inpcrds/neb01.inpcrd -o neb_explicit_01.out -r neb_explicit_01.rst -x neb_explicit_01.mdcrd -inf neb_explicit_01.inf -AllowSmallBox +-O -p neb.prmtop -c inpcrds/neb02.inpcrd -o neb_explicit_02.out -r neb_explicit_02.rst -x neb_explicit_02.mdcrd -inf neb_explicit_02.inf -AllowSmallBox +-O -p neb.prmtop -c inpcrds/neb03.inpcrd -o neb_explicit_03.out -r neb_explicit_03.rst -x neb_explicit_03.mdcrd -inf neb_explicit_03.inf -AllowSmallBox +-O -p neb.prmtop -c inpcrds/neb04.inpcrd -o neb_explicit_04.out -r neb_explicit_04.rst -x neb_explicit_04.mdcrd -inf neb_explicit_04.inf -AllowSmallBox