********>Provisional Patch to Support NVIDIA GPU Acceleration of implicit solvent GB simulations in PMEMD v10.0 Programs: AMBER 10 (PMEMD) NVIDIA CUDA GPU Implementation v0.1 Description: This patch file is part of experimental support for NVIDIA GPU (cuda) support for implicit solvent GB simulations in PMEMD v10.0. Patch Author: Ross C. Walker (SDSC) NVIDIA CUDA Implementation: Scott Le Grand (NVIDIA) Duncan Poole (NVIDIA) Mark Williamson (SDSC) Ross Walker (SDSC) Please note that this patch is EXPERIMENTAL! All questions or comments should be directed to Ross Walker via the AMBER mailing list (http://lists.ambermd.org/). Feedback is welcome please just realize that this implementation is to be considered ALPHA software for the moment. Validation of the GPU implementation has been undertaken but considerable work is still required to ensure that ensemble averages and measurable properties match the cpu implementations. Initial tests show that energy conservation is equivalent, however, other issues may remain. Bugs are also probably present. Hence caveat emptor. You use this code at your own risk. Should you choose to publish work conducted using this code please clearly describe that you used AMBER 10 PMEMD running on NVIDIA graphics card XXXXX utilizing AMBER NVIDIA CUDA GPU implementation v0.1. Usage: This patch is to be used along with the accompanying tar file http://ambermd.org/gpus/amber10_nvidia_cuda_support_v0.1.tar.gz This patch also assumes that your AMBER 10 installation is fully patched with bugfixes 1 to 24. Bugfix.all as of August 26th 2009. If this is NOT the case then you need to apply these bugfixes before proceeding. Apply this patch and the accompanying tar file as follows: cd $AMBERHOME tar xvzf /path_to_tar_file/amber10_nvidia_cuda_support_v0.1.tar.gz patch -p0 -N -r patch_rejects f90depends clean: - cd src && rm -f *.f90 *.o *.mod pmemd *.d work.pc* + cd src && $(MAKE) clean + --- src/pmemd/configure 2009-08-26 11:55:50.000000000 -0700 +++ src/pmemd/configure 2009-09-02 22:16:59.000000000 -0700 @@ -30,6 +30,10 @@ sgi_altix (SGI Linux, Intel Itanium 2 processors, IA64) sgi_mips (SGI IRIX UNIX, MIPS R8k,10k,12k,14k or 16k processors) + GPU Options: + linux_em64t_cuda (Linux, Intel EM64T-compatible, x86_64, NVIDIA GPU, CUDA, nopar) + (Requires CUDA_HOME to be set. Default /usr/local/cuda/) + where is one of compaqf90 (Compaq Fortran 90, osf1_alpha only) @@ -38,6 +42,7 @@ pathf90 (Pathscale Fortran 90/95, linux64_opteron only) pgf90 (Portland Group Fortran 90/95, cray*, linux*) xlf90 (IBM xlf90/mpxlf90/mpxlf90_r, aix_spx only) + gfortran (GNU Fortran, em64t, em64t_cuda only) where is one of @@ -386,25 +391,76 @@ if [ $platform = linux_p3_athlon -o \ $platform = linux_p4 ]; then - cat << EOD >> config.h +# Try to work out what version of MKL this is. +# Since Intel changed the linking between MKL 9 and 10. +# In version 9 and earlier of MKL the file libmkl_lapack.a is of type: current ar archive +# In version 10 and later it is of type: ASCII text + file -b -i $MKL_HOME/lib/32/libmkl_lapack.a | grep -q "text" + if [ $? != 0 ]; then + #This is MKL v9.x or earlier + echo "MKL Version 9.x or earlier detected." + # Intel® Math Kernel Library 6.1 for Linux Technical User Notes + # IA-32®static linking of LAPACK and kernels + cat << EOD >> config.h MATH_LIBS = $MKL_HOME/lib/32/libmkl_ia32.a -L$MKL_HOME/lib/32 -lguide -lpthread EOD + else + #This is MKL v10.x or later + echo "MKL Version 10.x or later detected." + # IA-32 static linking of LAPACK and kernels + cat << EOD >> config.h +MATH_LIBS = -L$MKL_HOME/lib/32/ -Wl,--start-group $MKL_HOME/lib/32/libmkl_intel.a $MKL_HOME/lib/32/libmkl_sequential.a $MKL_HOME/lib/32/libmkl_core.a -Wl,--end-group -lpthread +EOD + fi elif [ $platform = linux64_opteron -o \ - $platform = linux_em64t ]; then - if [ -d $MKL_HOME/lib_serial ]; then # Cluster MKL 9.x - math_libs="$MKL_HOME/lib_serial/em64t/libmkl_em64t.a" - elif [ -f $MKL_HOME/lib/em64t/libmkl_sequential.a ]; then # MKL 10.x - math_libs="$MKL_HOME/lib/em64t/libmkl_intel_lp64.a $MKL_HOME/lib/em64t/libmkl_core.a $MKL_HOME/lib/em64t/libmkl_sequential.a" - else - math_libs="$MKL_HOME/lib/em64t/libmkl_em64t.a -L$MKL_HOME/lib/em64t -lguide -lpthread" - fi - cat << EOD >> config.h -MATH_LIBS = $math_libs + $platform = linux_em64t -o \ + $platform = linux_em64t_cuda ]; then +# Try to work out what version of MKL this is. +# Since Intel changed the linking between MKL 9 and 10. +# In version 9 and earlier of MKL the file libmkl_lapack.a is of type: current ar archive +# In version 10 and later it is of type: ASCII text + file -b -i $MKL_HOME/lib/em64t/libmkl_lapack.a | grep -q "text" + if [ $? != 0 ]; then + #This is MKL v9.x or earlier + echo "MKL Version 9.x or earlier detected." + cat << EOD >> config.h +MATH_LIBS = $MKL_HOME/lib/em64t/libmkl_em64t.a -L$MKL_HOME/lib/em64t -lguide -lpthread +EOD + else + #This is MKL v10.x or later + echo "MKL Version 10.x or later detected." + # IA-32 static linking of LAPACK and kernels + cat << EOD >> config.h +MATH_LIBS = -L$MKL_HOME/lib/em64t/ -Wl,--start-group $MKL_HOME/lib/em64t/libmkl_intel_lp64.a $MKL_HOME/lib/em64t/libmkl_sequential.a $MKL_HOME/lib/em64t/libmkl_core.a -Wl,--end-group -lpthread EOD + fi elif [ $platform = sgi_altix ]; then +# Try to work out what version of MKL this is. +# Since Intel changed the linking between MKL 9 and 10. +# In version 9 and earlier of MKL the file libmkl_lapack.a is of type: current ar archive +# In version 10 and later it is of type: ASCII text + file -b -i $MKL_HOME/lib/64/libmkl_lapack.a | grep -q "text" + if [ $? != 0 ]; then + #This is MKL v9.x or earlier + echo "MKL Version 9.x or earlier detected." + # MKL Technical Notes: Itanium static linking of LAPACK and kernels cat << EOD >> config.h MATH_LIBS = $MKL_HOME/lib/64/libmkl_ipf.a -L$MKL_HOME/lib/64 -lguide -lpthread -ldl EOD + else + #This is MKL v10.x or later + echo "MKL Version 10.x or later detected." + #We always link to the sequential version of MKL + #since typically one runs an MPI thread for each core. + #However, if openmp is specified, for example to turn on + #SMP diagonalizers for QMMM then we link to the threaded version + #of MKL and inside the code the number of threads for vector functions + #etc will be set to 1. + # IA-64 static linking of LAPACK and kernels + cat << EOD >> config.h +MATH_LIBS = -L$MKL_HOME/lib/64/ -Wl,--start-group $MKL_HOME/lib/64/libmkl_intel_lp64.a $MKL_HOME/lib/64/libmkl_sequential.a $MKL_HOME/lib/64/libmkl_core.a -Wl,--end-group -lpthread +EOD + fi else echo "PMEMD Configuration Internal Error!" echo "Unexpected platform type while configuring Intel MKL!" --- src/pmemd/src/angles.fpp 2009-08-26 12:00:16.000000000 -0700 +++ src/pmemd/src/angles.fpp 2009-09-02 22:19:41.000000000 -0700 @@ -152,6 +152,10 @@ cit_angle(1:my_angle_cnt) = angles_copy(1:my_angle_cnt) end if +#ifdef CUDA + call gpu_angles_setup(my_angle_cnt, cit_angle, gbl_teq, gbl_tk) +#endif + !BEGIN DBG ! write(0,*)'task,cit_ntheth,cit_ntheta,', mytaskid, & ! cit_ntheth, cit_ntheta --- src/pmemd/src/bonds.fpp 2009-08-26 12:00:22.000000000 -0700 +++ src/pmemd/src/bonds.fpp 2009-09-02 22:19:46.000000000 -0700 @@ -93,6 +93,10 @@ cit_a_bond(1:cit_nbona) = bonds_copy(1:cit_nbona) end if +#ifdef CUDA + call gpu_bonds_setup(cit_nbona, cit_a_bond, cit_nbonh, cit_h_bond, gbl_req, gbl_rk); +#endif + return end subroutine bonds_setup --- src/pmemd/src/dihedrals.fpp 2009-08-26 12:00:29.000000000 -0700 +++ src/pmemd/src/dihedrals.fpp 2009-09-02 22:19:51.000000000 -0700 @@ -175,6 +175,10 @@ end if cit_dihed(1:my_dihed_cnt) = diheds_copy(1:my_dihed_cnt) end if + +#ifdef CUDA + call gpu_dihedrals_setup(my_dihed_cnt, cit_dihed, gbl_ipn, gbl_pn, gbl_pk, gbl_gamc, gbl_gams); +#endif !BEGIN DBG ! write(0,*)'task,cit_nphih,cit_nphia', mytaskid, & --- src/pmemd/src/extra_pnts_nb14.fpp 2009-08-26 12:00:34.000000000 -0700 +++ src/pmemd/src/extra_pnts_nb14.fpp 2009-09-02 22:19:55.000000000 -0700 @@ -203,7 +203,7 @@ ! Local variables: integer :: alloc_failed - integer :: nb14_copy(2, gbl_nb14_cnt) + integer :: nb14_copy(3, gbl_nb14_cnt) integer :: atm_i, atm_j, nb14_idx ! This routine can handle reallocation, and thus can be called multiple @@ -217,6 +217,7 @@ atm_i = gbl_nb14(1, nb14_idx) atm_j = gbl_nb14(2, nb14_idx) +! nb14_parm_idx = gbl_nb14(3, nb14_idx) #ifdef MPI if (gbl_atm_owner_map(atm_i) .eq. mytaskid) then @@ -241,20 +242,26 @@ if (cit_nb14_cnt .gt. 0) then if (allocated(cit_nb14)) then - if (size(cit_nb14) .lt. cit_nb14_cnt * 2) then - num_ints = num_ints - size(cit_nb14) * 2 + if (size(cit_nb14) .lt. cit_nb14_cnt * 3) then + num_ints = num_ints - size(cit_nb14) * 3 deallocate(cit_nb14) - allocate(cit_nb14(2, cit_nb14_cnt), stat = alloc_failed) + allocate(cit_nb14(3, cit_nb14_cnt), stat = alloc_failed) if (alloc_failed .ne. 0) call setup_alloc_error - num_ints = num_ints + size(cit_nb14) * 2 + num_ints = num_ints + size(cit_nb14) * 3 end if else - allocate(cit_nb14(2, cit_nb14_cnt), stat = alloc_failed) + allocate(cit_nb14(3, cit_nb14_cnt), stat = alloc_failed) if (alloc_failed .ne. 0) call setup_alloc_error - num_ints = num_ints + size(cit_nb14) * 2 + num_ints = num_ints + size(cit_nb14) * 3 end if cit_nb14(:, 1:cit_nb14_cnt) = nb14_copy(:, 1:cit_nb14_cnt) end if + +#ifdef CUDA + call gpu_nb14_setup(cit_nb14_cnt, cit_nb14, gbl_one_scee, gbl_one_scnb, ntypes, atm_iac, typ_ico, gbl_cn1, gbl_cn2) +#endif + + !BEGIN DBG ! write(0,*)'task, cit_nb14_cnt =', mytaskid, cit_nb14_cnt @@ -289,7 +296,7 @@ integer :: alloc_failed - allocate(gbl_nb14(2, max_nb14), & + allocate(gbl_nb14(3, max_nb14), & stat = alloc_failed) if (alloc_failed .ne. 0) call setup_alloc_error @@ -330,7 +337,7 @@ allocate(ep_frames(max_frames), & ep_lcl_crd(3, 2, max_frames), & - gbl_nb14(2, max_nb14), & + gbl_nb14(3, max_nb14), & stat = alloc_failed) if (alloc_failed .ne. 0) call setup_alloc_error @@ -387,7 +394,7 @@ ! All callers need the nb14 stuff... - call mpi_bcast(gbl_nb14, gbl_nb14_cnt * 2, mpi_integer, 0, & + call mpi_bcast(gbl_nb14, gbl_nb14_cnt * 3, mpi_integer, 0, & mpi_comm_world, err_code_mpi) @@ -452,6 +459,7 @@ gbl_nb14_cnt = gbl_nb14_cnt + 1 gbl_nb14(1, gbl_nb14_cnt) = gbl_dihed(dihed_idx)%atm_i gbl_nb14(2, gbl_nb14_cnt) = gbl_dihed(dihed_idx)%atm_l + gbl_nb14(3, gbl_nb14_cnt) = gbl_dihed(dihed_idx)%parm_idx end if end do ! n = 1, nphih + nphia @@ -494,6 +502,7 @@ integer, allocatable :: i12(:,:) integer, allocatable :: i13(:,:) integer, allocatable :: i14(:,:) + integer, allocatable :: nb_14_list(:,:) integer, allocatable :: s3(:) integer :: n @@ -539,6 +548,7 @@ i12(2, 3 * (nbonh + nbona)), & i13(2, 3 * (ntheth + ntheta)), & i14(2, 3 * (nphih + nphia)), & + nb_14_list(3,nphih + nphia), & s3(maxa), & stat = alloc_failed) @@ -555,6 +565,7 @@ i12(:,:) = 0 i13(:,:) = 0 i14(:,:) = 0 + nb_14_list(:,:) = 0 call get_nghbrs(nghbrs, hnghbrs, enghbrs, numnghbr, epowner, epbtyp) @@ -571,12 +582,12 @@ i11, i12, i13, i14, offset, test) end if - ! Reuse i14() and then copy to permanent: + ! Use nb_14_list and then copy to permanent: - call build_nb14(i14, gbl_nb14_cnt, max14, epowner, numnghbr, enghbrs, & + call build_nb14(nb_14_list, gbl_nb14_cnt, max14, epowner, numnghbr, enghbrs, & natom, chngmask, verbose) - call copy_nb14(i14, gbl_nb14, gbl_nb14_cnt) + call copy_nb14(nb_14_list, gbl_nb14, gbl_nb14_cnt) ! if frameon = 1 use frames and forces to define ep's; ! else use their masses and amber force params @@ -623,7 +634,7 @@ end if deallocate(epbtyp, nghbrs, hnghbrs, enghbrs, numnghbr, epowner, offset, & - test, i11, i12, i13, i14, s3) + test, i11, i12, i13, i14, nb_14_list, s3) return @@ -1214,7 +1225,7 @@ ! Formal arguments: - integer :: nb14(2, *) + integer :: nb14(3, *) integer :: nb14_cnt integer :: maxnb14 integer :: epowner(*) @@ -1238,7 +1249,7 @@ call mexit(6, 1) end if - call sort_pairs(nb14, nb14_cnt, atm_cnt) + call sort_pairs_14_nb(nb14, nb14_cnt, atm_cnt) if (verbose .gt. 0) write(6, '(a, i6)') & '| EXTRA_PTS, build_nb14: num of 14 terms = ', nb14_cnt @@ -1270,8 +1281,8 @@ ! Formal arguments: - integer :: from14(2, *) - integer :: to14(2, *) + integer :: from14(3, *) + integer :: to14(3, *) integer :: nb14_cnt ! Local variables: @@ -1281,6 +1292,7 @@ do n = 1, nb14_cnt to14(1, n) = from14(1, n) to14(2, n) = from14(2, n) + to14(3, n) = from14(3, n) end do return @@ -1740,6 +1752,97 @@ end subroutine sort_pairs !*******************************************************************************! +! Internal Subroutine: sort_pairs_14_nb +! +! Description: +! +!******************************************************************************* + +subroutine sort_pairs_14_nb(nb_14_list, num, atm_cnt) + + implicit none + +! Formal arguments: + + integer :: nb_14_list(3, *) + integer :: num + integer :: atm_cnt + +! Local variables: + + integer :: i, j, k, m, n, ntot, parm_idx + integer :: scr1(atm_cnt), scr2(atm_cnt), scr3(maxa), scr4(maxa) + + !Indexes of nb_14_list = 1,j,parm_index + !First pass, make sure the first index contains the lowest number. Ignore + !index 3 here since this won't change anything. + + do n = 1, num + i = nb_14_list(1, n) + j = nb_14_list(2, n) + if (i .lt. j) then + nb_14_list(1, n) = i + nb_14_list(2, n) = j + else + nb_14_list(1, n) = j + nb_14_list(2, n) = i + end if + end do + + ! Now get rid of duplicates: + + ! First pass: + scr1(1:atm_cnt) = 0 + do n = 1, num + i = nb_14_list(1, n) + scr1(i) = scr1(i) + 1 + end do + + scr2(1) = 0 + do n = 2, atm_cnt + scr2(n) = scr2(n - 1) + scr1(n - 1) + scr1(n - 1) = 0 + end do + scr1(atm_cnt) = 0 + + ! Second pass: + + do n = 1, num + i = nb_14_list(1, n) + j = nb_14_list(2, n) + parm_idx = nb_14_list(3, n) + scr1(i) = scr1(i) + 1 + scr3(scr1(i) + scr2(i)) = j + scr4(scr1(i) + scr2(i)) = parm_idx + end do + + scr2(1:atm_cnt) = 0 + + ! Now trim them: + + ntot = 0 + k = 0 + do n = 1, atm_cnt + do m = 1, scr1(n) + j = scr3(ntot + m) + parm_idx = scr4(ntot + m) + if (scr2(j) .ne. n) then + k = k + 1 + nb_14_list(1, k) = n + nb_14_list(2, k) = j + nb_14_list(3, k) = parm_idx + scr2(j) = n + end if + end do + ntot = ntot + scr1(n) + end do + num = k + + return + +end subroutine sort_pairs_14_nb + +!*******************************************************************************! ! Internal Subroutine: add_one_list_iblo ! ! Description: @@ -1811,7 +1914,7 @@ ! !******************************************************************************* -subroutine do_14pairs(list, num, maxnb14, epowner, numnghbr, enghbrs, & +subroutine do_14pairs(nb_14_list, num, maxnb14, epowner, numnghbr, enghbrs, & ifail, chngmask) use prmtop_dat_mod @@ -1820,7 +1923,7 @@ ! Formal arguments: - integer :: list(2, *) + integer :: nb_14_list(3, *) integer :: num integer :: maxnb14 integer :: epowner(*) @@ -1831,7 +1934,7 @@ ! Local variables: - integer :: atm_i, atm_l + integer :: atm_i, atm_l, parm_idx integer :: i, j, n, ni, nl ifail = 0 @@ -1844,14 +1947,16 @@ atm_i = gbl_dihed(n)%atm_i atm_l = gbl_dihed(n)%atm_l + parm_idx = gbl_dihed(n)%parm_idx if (chngmask .eq. 0) then if (num + 1 .gt. maxnb14) then ifail = 1 return end if num = num + 1 - list(1, num) = atm_i - list(2, num) = atm_l + nb_14_list(1, num) = atm_i + nb_14_list(2, num) = atm_l + nb_14_list(3, num) = parm_idx !used for lookup into scnb and scee arrays. else ! Check neither is extra. Count this bond and also extra attached: if (epowner(atm_i) .eq. 0 .and. epowner(atm_l) .eq. 0) then @@ -1862,23 +1967,27 @@ return end if num = num + 1 - list(1, num) = atm_i - list(2, num) = atm_l + nb_14_list(1, num) = atm_i + nb_14_list(2, num) = atm_l + nb_14_list(3, num) = parm_idx !used for lookup into scnb and scee arrays. do i = 1, ni num = num + 1 - list(1, num) = enghbrs(i, atm_i) - list(2, num) = atm_l + nb_14_list(1, num) = enghbrs(i, atm_i) + nb_14_list(2, num) = atm_l + nb_14_list(3, num) = parm_idx !used for lookup into scnb and scee arrays. end do do j = 1, nl num = num + 1 - list(1, num) = atm_i - list(2, num) = enghbrs(j, atm_l) + nb_14_list(1, num) = atm_i + nb_14_list(2, num) = enghbrs(j, atm_l) + nb_14_list(3, num) = parm_idx !used for lookup into scnb and scee arrays. end do do i = 1, ni do j = 1, nl - num = num + 1 - list(1, num) = enghbrs(i, atm_i) - list(2, num) = enghbrs(j, atm_l) + num = num + 1 + nb_14_list(1, num) = enghbrs(i, atm_i) + nb_14_list(2, num) = enghbrs(j, atm_l) + nb_14_list(3, num) = parm_idx !used for lookup into scnb and scee arrays. end do end do end if @@ -2402,8 +2511,7 @@ subroutine get_nb14_energy(charge, crd, frc, iac, ico, cn1, cn2, & nb14, nb14_cnt, ee14, enb14, e14vir) - use mdin_ctrl_dat_mod, only : scee, scnb - use prmtop_dat_mod, only : ntypes + use prmtop_dat_mod, only : ntypes, gbl_one_scee, gbl_one_scnb use parallel_dat_mod implicit none @@ -2417,7 +2525,7 @@ integer, intent(in) :: ico(*) double precision, intent(in) :: cn1(*) double precision, intent(in) :: cn2(*) - integer, intent(in) :: nb14(2, *) + integer, intent(in) :: nb14(3, *) integer, intent(in) :: nb14_cnt double precision, intent(out) :: ee14 double precision, intent(out) :: enb14 @@ -2425,7 +2533,7 @@ ! Local variables: - integer :: n, i, j, ic + integer :: n, i, j, ic, parm_idx integer :: ntypes_lcl logical :: do_virial double precision :: dx, dy, dz, r2, r2inv, rinv @@ -2434,21 +2542,23 @@ ee14 = 0.d0 enb14 = 0.d0 -#ifdef MPI - if (nb14_cnt .eq. 0) return -#endif - - scee0 = 1.d0 / scee - scnb0 = 1.d0 / scnb - ntypes_lcl = ntypes - do_virial = present(e14vir) if (do_virial) e14vir(:, :) = 0.d0 + if (nb14_cnt .eq. 0) return + +!RCW Temp +! scee0 = gbl_one_scee(1) +! scnb0 = gbl_one_scnb(1) + ntypes_lcl = ntypes + do n = 1, nb14_cnt i = nb14(1, n) j = nb14(2, n) + parm_idx = nb14(3, n) + scee0 = gbl_one_scee(parm_idx) + scnb0 = gbl_one_scnb(parm_idx) dx = crd(1, j) - crd(1, i) dy = crd(2, j) - crd(2, i) dz = crd(3, j) - crd(3, i) @@ -2456,7 +2566,7 @@ rinv = sqrt(1.d0 / r2) r2inv = rinv * rinv r6 = r2inv * r2inv * r2inv - g = charge(i) * charge(j) * rinv + g = charge(i) * charge(j) * rinv * scee0 ic = ico(ntypes_lcl * (iac(i) - 1) + iac(j)) if (ic .ne. 0) then f6 = cn2(ic) * r6 @@ -2465,7 +2575,7 @@ f6 = 0.d0 f12 = 0.d0 end if - df = (scee0 * g + scnb0 * (12.d0 * f12 - 6.d0 * f6)) * r2inv + df = (g + scnb0 * (12.d0 * f12 - 6.d0 * f6)) * r2inv #ifdef MPI ! We use i to determine who sums up the energy... @@ -2473,7 +2583,7 @@ if (gbl_atm_owner_map(i) .eq. mytaskid) then ee14 = ee14 + g - enb14 = enb14 + f12 - f6 + enb14 = enb14 + (f12 - f6) * scnb0 frc(1, i) = frc(1, i) - df * dx frc(2, i) = frc(2, i) - df * dy @@ -2501,7 +2611,7 @@ #else ee14 = ee14 + g - enb14 = enb14 + f12 - f6 + enb14 = enb14 + (f12 - f6) * scnb0 frc(1, i) = frc(1, i) - df * dx frc(2, i) = frc(2, i) - df * dy frc(3, i) = frc(3, i) - df * dz @@ -2527,9 +2637,6 @@ e14vir(3, 2) = e14vir(2, 3) end if - ee14 = ee14 * scee0 - enb14 = enb14 * scnb0 - return end subroutine get_nb14_energy --- src/pmemd/src/gb_alltasks_setup.fpp 2009-08-26 12:00:45.000000000 -0700 +++ src/pmemd/src/gb_alltasks_setup.fpp 2009-09-02 22:33:37.000000000 -0700 @@ -133,6 +133,9 @@ if (ntx .eq. 1 .or. ntx .eq. 2) then call all_atom_setvel(natom, atm_vel, atm_mass_inv, tempi) +#ifdef CUDA + call gpu_upload_vel(atm_vel) +#endif end if if (ibelly .gt. 0) then --- src/pmemd/src/gb_ene.fpp 2009-08-26 12:00:49.000000000 -0700 +++ src/pmemd/src/gb_ene.fpp 2009-09-02 22:20:08.000000000 -0700 @@ -325,6 +325,15 @@ end do end if + + +#ifdef CUDA + call gpu_create_outputbuffers() + call gpu_upload_rborn(atm_gb_radii) + call gpu_upload_fs(atm_gb_fs) + call gpu_final_gb_setup(igb, alpb, saltcon, rgbmax, gb_fs_max, atm_gb_radii, atm_gb_fs) + call gpu_build_threadblock_work_list(atm_numex, gbl_natex) +#endif return @@ -544,7 +553,7 @@ !--------------------------------------------------------------------------- ! Step 1: loop over pairs of atoms to compute the effective Born radii. !--------------------------------------------------------------------------- - + if (irespa .lt. 2 .or. mod(irespa, nrespai) .eq. 0) & call calc_born_radii(atm_cnt, crd, fs, rborn) @@ -610,17 +619,24 @@ yij = yi - crd(3 * j - 1) zij = zi - crd(3 * j) r2 = xij * xij + yij * yij + zij * zij +#ifdef CUDA + !CUDA GB Code does not currently support cutoffs in GB. + ! if (r2 .gt. cut2) cycle + ! if (.not. onstep .and. r2 .gt. cut_inner2) cycle +#else if (r2 .gt. cut2) cycle if (.not. onstep .and. r2 .gt. cut_inner2) cycle +#endif icount = icount + 1 jj(icount) = j r2x(icount) = r2 rjx(icount) = reff(j) - + end do vectmp1(1:icount) = 4.d0 * ri * rjx(1:icount) + call vdinv(icount, vectmp1, vectmp1) vectmp1(1:icount) = -r2x(1:icount) * vectmp1(1:icount) call vdexp(icount, vectmp1, vectmp1) @@ -646,7 +662,8 @@ ! vectmp5 = 1/rij ! Start first outer loop - !dir$ ivdep + ! dir$ ivdep + do k = 1, icount j = jj(k) @@ -695,6 +712,7 @@ rj = rjx(k) temp5 = 0.5d0 * temp1 * temp6 * (ri * rj + 0.25d0 * r2) + sumdeijda(i) = sumdeijda(i) + ri * temp5 sumdeijda(j) = sumdeijda(j) + rj * temp5 @@ -799,7 +817,7 @@ #endif frespa = nrespa - + ! diagonal egb term, plus off-diag derivs wrt alpha .eq. reff^-1: #ifdef MPI @@ -1358,6 +1376,7 @@ reff(i) = reff_i end do ! i = 1, atm_cnt + #ifdef MPI call update_gb_time(calc_gb_rad_timer) @@ -2041,9 +2060,14 @@ yij = yi - crd(3 * j - 1) zij = zi - crd(3 * j) r2 = xij * xij + yij * yij + zij * zij +#ifdef CUDA + !CUDA GB Code does not currently support cutoffs in GB. + ! if (r2 .gt. cut2) cycle + ! if (.not. onstep .and. r2 .gt. cut_inner2) cycle +#else if (r2 .gt. cut2) cycle if (.not. onstep .and. r2 .gt. cut_inner2) cycle - +#endif icount = icount + 1 jj(icount) = j r2x(icount) = r2 --- src/pmemd/src/Makefile 2009-08-26 12:01:07.000000000 -0700 +++ src/pmemd/src/Makefile 2009-09-02 22:44:03.000000000 -0700 @@ -41,13 +41,33 @@ get_cmdline.o master_setup.o pme_alltasks_setup.o pme_setup.o \ ene_frc_splines.o gb_alltasks_setup.o nextprmtop_section.o +ifeq ($(CU_DEFINES),-DCUDA) +all: pmemd.cuda +else all: pmemd +endif install: all +ifeq ($(CU_DEFINES),-DCUDA) + mv pmemd.cuda ../../../exe +else mv pmemd ../../../exe +endif +ifeq ($(CU_DEFINES),-DCUDA) +PMEMD_CU_LIBS=./cuda/cuda.a + +$(PMEMD_CU_LIBS): + cd ./cuda; $(MAKE) + +pmemd.cuda: $(NETCDF_MOD) $(OBJS) $(PMEMD_CU_LIBS) + $(LOAD) $(LOADFLAGS) -o $@ $(OBJS) $(FFT_LIBS) $(NETCDF_LIBS) $(MATH_LIBS) $(MPI_LIBS) \ + $(CU_LIBS) $(CU_LOADLIBS) $(PMEMD_CU_LIBS) $(LOADLIBS) +else pmemd: $(NETCDF_MOD) $(OBJS) $(LOAD) $(LOADFLAGS) -o $@ $(OBJS) $(FFT_LIBS) $(NETCDF_LIBS) $(MATH_LIBS) $(MPI_LIBS) $(LOADLIBS) +endif + # Some implementations of netcdf seem to have more than just netcdf.mod... @@ -55,7 +75,8 @@ cp $(NETCDF_HOME)/include/*.mod . clean: - rm -f *.f90 *.o *.$(MODULE_SUFFIX) pmemd *.d work.pc* + rm -f *.f90 *.o *.$(MODULE_SUFFIX) pmemd pmemd.cuda *.d work.pc* + cd ./cuda ; $(MAKE) clean # The following statement keeps the .f90 files from getting clobbered; this # would happen because they are intermediates. @@ -69,10 +90,19 @@ .SUFFIXES: .fpp .c .o + +ifeq ($(CU_DEFINES),-DCUDA) +.fpp.o: + $(CPP) $(CPPFLAGS) $(CU_DEFINES) $(FFT_INCLUDE) $(MPI_INCLUDE) $(FFT_DEFINES) $(NETCDF_DEFINES) $(MPI_DEFINES) $(DIRFRC_DEFINES) $(MATH_DEFINES) $(F90_DEFINES) $*.fpp $*.f90 + $(F90) $(F90FLAGS) $(F90_OPT_DFLT) $*.f90 +.fpp.o: +else .fpp.o: $(CPP) $(CPPFLAGS) $(FFT_INCLUDE) $(MPI_INCLUDE) $(FFT_DEFINES) $(NETCDF_DEFINES) $(MPI_DEFINES) $(DIRFRC_DEFINES) $(MATH_DEFINES) $(F90_DEFINES) $*.fpp $*.f90 $(F90) $(F90FLAGS) $(F90_OPT_DFLT) $*.f90 +endif + .c.o: $(CC) $(CFLAGS) -c $*.c --- src/pmemd/src/master_setup.fpp 2009-09-02 22:48:44.000000000 -0700 +++ src/pmemd/src/master_setup.fpp 2009-09-02 22:48:57.000000000 -0700 @@ -130,6 +130,14 @@ call init_mdin_ctrl_dat call validate_mdin_ctrl_dat +#ifdef CUDA + call gpu_init() + + !Write info about CUDA GPU(s) + call gpu_write_cuda_info(mdout) + +#endif + ! Read the input coordinates or restart file (inpcrd): call init_inpcrd_dat(num_ints, num_reals, inpcrd_natom, & @@ -728,6 +736,15 @@ write(mdout, '(a)') '| MASSVP' #endif +#ifdef CUDA + write(mdout, '(a)') '| CUDA' + !Check for incompatible defines +#ifdef MPI + write(mdout, '(a)') 'ERROR: CUDA and MPI defines are mutually exclusive.' + call mexit(6, 1) +#endif +#endif + write(mdout, *) return --- src/pmemd/src/mdin_ctrl_dat.fpp 2009-08-26 12:01:19.000000000 -0700 +++ src/pmemd/src/mdin_ctrl_dat.fpp 2009-09-02 22:20:26.000000000 -0700 @@ -203,7 +203,7 @@ ! Local variables: - integer ifind + integer ifind, sec_temp character(4) watdef(4) ! Define default water residue name and the names of water oxygen & hydrogens @@ -531,6 +531,9 @@ using_gb_potential = .false. end if + !Support for random seed using time of day in microsecs + if( ig==-1 ) call get_wall_time(sec_temp,ig) + return end subroutine init_mdin_ctrl_dat @@ -681,11 +684,6 @@ inerr = 1 end if - if (scee .eq. 0.0d0) then - write(mdout, '(a,a)') error_hdr, 'scee must != 0.0!' - inerr = 1 - end if - if (nsnb .ne. 25) then write(mdout, '(a,a)') info_hdr, & 'The nsnb ctrl option does not affect nonbonded list update frequency.' @@ -766,6 +764,11 @@ inerr = 1 end if + if (ibelly == 1 .and. ntt == 3) then + write(mdout, '(a,a)') error_hdr, 'ibelly = 1 with ntt = 3 is not a valid option.' + inerr = 1 + end if + if (ntr .ne. 0 .and. ntr .ne. 1) then write(mdout, '(a,a)') error_hdr, 'ntr must be set to 0 or 1!' inerr = 1 @@ -1213,6 +1216,68 @@ end if +#ifdef CUDA +!Trap specific limitations for NVIDIA GPU CUDA implementations. +!Current limitations are: +! +! ibelly /= 0 !No support for belly. +! ntb /= 0 !Only GB support at present. +! cut < systemsize !GPU sims do not use a cutoff in GB. +! nmropt /= 0 !No support for nmropt right now. +! igb /= 1,2,5 !Only igb models 1,2 and 5 are currently supported. +! nrespa /= 1 !No multiple time stepping supported. +! +! ntt=2 - Temporarily disable this due to issues in the code. + if (ntt == 2) then + write(mdout, '(a)') 'CUDA (GPU): Use of NTT=2 is currently disabled due to unresolved issues' + write(mdout, '(a)') ' in the GPU implementation. Require ntt /= 0.' + inerr = 1 + end if + + if (ntb /= 0) then + write(mdout, '(a)') 'CUDA (GPU): Implementation currently only supports GB simulations.' + write(mdout, '(a)') ' Require ntb == 0.' + inerr = 1 + end if + + if (ibelly /= 0) then + write(mdout, '(a)') 'CUDA (GPU): Implementation does not support Belly options.' + write(mdout, '(a)') ' Require ibelly == 0.' + inerr = 1 + end if + + if (nmropt /= 0) then + write(mdout, '(a)') 'CUDA (GPU): Implementation does not support nmropt options.' + write(mdout, '(a)') ' Require nmropt == 0.' + inerr = 1 + end if + + if (nrespa /= 1) then + write(mdout, '(a)') 'CUDA (GPU): Implementation does not support nrespa.' + write(mdout, '(a)') ' Require nrespa == 1.' + inerr = 1 + end if + + if ( .not. (igb == 0 .or. igb == 1 .or. igb == 2 .or. igb == 5)) then + write(mdout, '(a)') 'CUDA (GPU): Implementation only supports GB options 1,2 and 5.' + write(mdout, '(a)') ' Require igb == 0,1,2 or 5.' + inerr = 1 + end if + + !Check the size of the cutoff with respect to the system size. + if (ntb == 0) then + !Check not fully implemented. Ideally this check should loop over the current + !coordinates and find the largest distance between any two atoms and make sure + !that cut if larger than that value. For the moment though we shall just post + !an error if cut looks to be too small in GB sims. + if ( cut < 999.0d0 ) then + write(mdout, '(a)') 'CUDA (GPU): Implementation does not support the use of a cutoff in GB simulations.' + write(mdout, '(a)') ' Require cut > 999.0d0.' + end if + end if + +#endif + ! Field any errors and bag out. if (inerr .eq. 1) then @@ -1425,6 +1490,13 @@ using_gb_potential = .false. end if +#ifdef NO_NTT3_SYNC + !Here we are not synching the random number generatore across threads for + !NTT=3 but we don't want every thread to have the same random seed. So for + !the moment just add mytask id to ig. + ig = ig+mytaskid +#endif + return end subroutine bcast_mdin_ctrl_dat --- src/pmemd/src/pbc.fpp 2009-08-26 12:01:37.000000000 -0700 +++ src/pmemd/src/pbc.fpp 2009-09-02 22:20:31.000000000 -0700 @@ -196,6 +196,10 @@ 'max pairlist cutoff must be less than unit cell max sphere radius!' call mexit(6, 1) end if + +#ifdef CUDA + call gpu_init_pbc(a, b, c, alpha, beta, gamma, uc_volume, uc_sphere, max_cutoff, pbc_box, reclng, cut_factor, ucell, recip) +#endif return --- src/pmemd/src/pme_alltasks_setup.fpp 2009-08-26 12:01:42.000000000 -0700 +++ src/pmemd/src/pme_alltasks_setup.fpp 2009-09-03 10:34:36.000000000 -0700 @@ -247,6 +247,10 @@ call all_atom_belly(natom, atm_igroup, atm_vel) end if +#ifdef CUDA + call gpu_pme_alltasks_setup(nfft1, nfft2, nfft3) +#endif + return end subroutine pme_alltasks_setup --- src/pmemd/src/pmemd.fpp 2009-08-26 12:01:51.000000000 -0700 +++ src/pmemd/src/pmemd.fpp 2009-09-02 22:20:43.000000000 -0700 @@ -95,8 +95,6 @@ call enable_test_timers #endif -! Uniprocessor and mpi master initialization. The master does all i/o. - if (master) call master_setup(num_ints, num_reals) #ifdef MPI @@ -109,6 +107,20 @@ call parallel_dat_setup(natom, num_ints, num_reals) #endif +#ifdef CUDA +! Initialize system + + call gpu_setup_system(natom, tol) + call gpu_upload_crd(atm_crd) + call gpu_upload_charges(atm_qterm) + call gpu_upload_masses(atm_mass) + call gpu_upload_frc(atm_frc) + call gpu_upload_vel(atm_vel) + call gpu_upload_last_vel(atm_last_vel) + call gpu_constraints_setup(atm_jrc, atm_weight, atm_xc) + +#endif + ! The following call does more uniprocessor setup and mpi master/slave setup. if (using_pme_potential) then @@ -117,6 +129,8 @@ call gb_alltasks_setup(num_ints, num_reals) end if + + ! Call shake_setup, which will tag those bonds which are part of 3-point ! water molecules and also set up data structures for non-fastwater shake. ! Constraints will be done for waters using a fast analytic routine -- dap. @@ -309,6 +323,13 @@ dble(run_end_walltime - run_start_walltime) / 3600.d0, ' hours' #endif +#ifdef CUDA +! Shut down GPU + call gpu_shutdown() +#endif + + + #ifdef TIME_TEST call print_test_timers ! Debugging output of performance timings #endif --- src/pmemd/src/random.fpp 2009-08-26 12:01:54.000000000 -0700 +++ src/pmemd/src/random.fpp 2009-09-02 22:20:48.000000000 -0700 @@ -122,6 +122,11 @@ j97 = 33 set = .true. + +#ifdef CUDA + call gpu_amrset(iseed) +#endif + return --- src/pmemd/src/runmd.fpp 2009-08-26 12:02:01.000000000 -0700 +++ src/pmemd/src/runmd.fpp 2009-09-03 12:33:02.000000000 -0700 @@ -317,8 +317,13 @@ si(si_tot_virial) = virial(1) + virial(2) + virial(3) else if (using_gb_potential) then - +#ifdef CUDA + call gpu_upload_crd(crd) +#endif call gb_force(atm_cnt, crd, frc, gb_pot_ene, irespa) +#ifdef CUDA + call gpu_download_frc(frc) +#endif si(si_pot_ene) = gb_pot_ene%total si(si_vdw_ene) = gb_pot_ene%vdw_tot @@ -404,6 +409,10 @@ eke = 0.d0 +#ifdef CUDA + call gpu_download_frc(frc) +#endif + #ifdef MPI do atm_lst_idx = 1, my_atm_cnt j = my_atm_lst(atm_lst_idx) @@ -416,6 +425,9 @@ if (use_vlimit) vel(i, j) = sign(min(abs(vel(i, j)), vlimit), vel(i, j)) end do end do +#ifdef CUDA + call gpu_upload_vel(vel) +#endif if (ntt .eq. 1) then ekmh = max(si(si_solute_kin_ene), fac(1) * 10.d0) @@ -457,6 +469,9 @@ ekmh = ekmh * 0.5d0 last_vel(:,:) = vel(:,:) +#ifdef CUDA + call gpu_upload_last_vel(last_vel) +#endif if (irest .eq. 0) then @@ -596,6 +611,9 @@ ! Note that this fix only works for Newtonian dynamics. if (.not. doing_langevin) then +#ifdef CUDA + call gpu_download_frc(frc) +#endif #ifdef MPI do atm_lst_idx = 1, my_atm_cnt j = my_atm_lst(atm_lst_idx) @@ -605,12 +623,17 @@ wfac = atm_mass_inv(j) * half_dtx vel(:, j) = vel(:, j) - frc(:, j) * wfac end do +#ifdef CUDA + call gpu_upload_vel(vel) +#endif end if end if ! (reset_velocities) ! Do the velocity update: - +#ifdef CUDA + call gpu_update(dt, temp0, gamma_ln) +#else if (.not. doing_langevin) then ! ---Newtonian dynamics: @@ -636,10 +659,14 @@ dt, temp0, gamma_ln) end if ! (doing_langevin) +#endif ! Consider vlimit: if (use_vlimit) then +#ifdef CUDA + call gpu_download_vel(vel) +#endif ! We here provide code that is most efficient if vlimit is not exceeded. @@ -672,6 +699,9 @@ vel(i, j) = sign(min(abs(vel(i, j)), vlimit), vel(i, j)) end do end do +#ifdef CUDA + call gpu_upload_vel(vel) +#endif ! Only violations on the master node are actually reported ! to avoid both MPI communication and non-master writes. write(mdout, '(a,i6,a,f10.4)') 'vlimit exceeded for step ', nstep, & @@ -682,6 +712,7 @@ ! Update the positions, putting the "old" positions into frc(): +#ifndef CUDA #ifdef MPI do atm_lst_idx = 1, my_atm_cnt i = my_atm_lst(atm_lst_idx) @@ -691,14 +722,19 @@ frc(:, i) = crd(:, i) crd(:, i) = crd(:, i) + vel(:, i) * dtx end do +#endif ! If shake is being used, update new positions to fix bond lengths: if (ntc .ne. 1) then call update_time(runmd_time) +#ifdef CUDA + call gpu_shake() +#else call shake(frc, crd) call shake_fastwater(frc, crd) +#endif call update_time(shake_time) ! Must update extra point coordinates after the frame is moved: @@ -709,7 +745,9 @@ end if ! Re-estimate velocities from differences in positions: - +#ifdef CUDA + call gpu_recalculate_velocities(dtx_inv) +#else #ifdef MPI do atm_lst_idx = 1, my_atm_cnt j = my_atm_lst(atm_lst_idx) @@ -718,7 +756,7 @@ #endif vel(:, j) = (crd(:, j) - frc(:, j)) * dtx_inv end do - +#endif else ! Must update extra point coordinates after the frame is moved: @@ -734,6 +772,10 @@ ! them when they are output; ignore them otherwise. if (ntt .eq. 1 .or. onstep) then +#ifdef CUDA + call gpu_download_vel(vel) + call gpu_download_last_vel(last_vel) +#endif ! Get the kinetic energy, either for printing or for Berendsen. ! The process is completed later under mpi, in order to avoid an @@ -973,6 +1015,12 @@ ! --- following is from T.E. Cheatham, III and B.R. Brooks, ! Theor. Chem. Acc. 99:279, 1998. + +#ifdef CUDA + ! Temporarily do NTT=1 on cpu. We ultimately need a GPU kernel for this + ! to avoid this synchronization. + call gpu_download_vel(vel) +#endif #ifdef MPI if (my_atm_cnt .gt. 0) & @@ -987,6 +1035,12 @@ vel(:, j) = vel(:, j) * scaltp end do +#ifdef CUDA + ! Temporarily do NTT=1 on cpu. We ultimately need a GPU kernel for this + ! to avoid this synchronization. + call gpu_upload_vel(vel) +#endif + end if ! Pastor, Brooks, Szabo conserved quantity for harmonic oscillator: @@ -1017,61 +1071,69 @@ ! "block of ice flying thru space" phenomenon. We make this correction for pme ! explicitly before collecting velocities to the master... - if (mod(nstep + 1, nscm) .eq. 0) then - if (ifbox .ne. 0) then - if (.not. doing_langevin) then + if (nscm .ne. 0) then + if (mod(nstep + 1, nscm) .eq. 0) then + if (ifbox .ne. 0) then + if (.not. doing_langevin) then +#ifdef CUDA + call gpu_download_vel(vel) +#endif - vcm(:) = 0.d0 + vcm(:) = 0.d0 #ifdef MPI - do atm_lst_idx = 1, my_atm_cnt - j = my_atm_lst(atm_lst_idx) + do atm_lst_idx = 1, my_atm_cnt + j = my_atm_lst(atm_lst_idx) #else - do j = 1, atm_cnt + do j = 1, atm_cnt #endif - aamass = mass(j) - vcm(:) = vcm(:) + aamass * vel(:, j) - end do + aamass = mass(j) + vcm(:) = vcm(:) + aamass * vel(:, j) + end do #ifdef MPI - call update_time(runmd_time) - reduce_buf_in(1:3) = vcm(:) + call update_time(runmd_time) + reduce_buf_in(1:3) = vcm(:) #ifdef COMM_TIME_TEST call start_test_timer(8, 'allreduce vcm', 0) #endif - call mpi_allreduce(reduce_buf_in, reduce_buf_out, 3, & - mpi_double_precision, & - mpi_sum, mpi_comm_world, err_code_mpi) + call mpi_allreduce(reduce_buf_in, reduce_buf_out, 3, & + mpi_double_precision, & + mpi_sum, mpi_comm_world, err_code_mpi) #ifdef COMM_TIME_TEST call stop_test_timer(8) #endif - vcm(:) = reduce_buf_out(1:3) - call update_time(fcve_dist_time) + vcm(:) = reduce_buf_out(1:3) + call update_time(fcve_dist_time) #endif /* MPI */ - vcm(:) = vcm(:) * tmassinv + vcm(:) = vcm(:) * tmassinv - if (master) then - velocity2 = vcm(1) * vcm(1) + vcm(2) * vcm(2) + vcm(3) * vcm(3) - write(mdout,'(a,f15.6,f9.2,a)') 'check COM velocity, temp: ', & - sqrt(velocity2), 0.5d0 * tmass * & - velocity2 / fac(1), '(Removed)' - end if + if (master) then + velocity2 = vcm(1) * vcm(1) + vcm(2) * vcm(2) + vcm(3) * vcm(3) + write(mdout,'(a,f15.6,f9.2,a)') 'check COM velocity, temp: ', & + sqrt(velocity2), 0.5d0 * tmass * & + velocity2 / fac(1), '(Removed)' + end if #ifdef MPI - do atm_lst_idx = 1, my_atm_cnt - j = my_atm_lst(atm_lst_idx) + do atm_lst_idx = 1, my_atm_cnt + j = my_atm_lst(atm_lst_idx) #else - do j = 1, atm_cnt + do j = 1, atm_cnt #endif - vel(:, j) = vel(:, j) - vcm(:) - end do + vel(:, j) = vel(:, j) - vcm(:) + end do +#ifdef CUDA + call gpu_upload_vel(vel) +#endif + end if end if - end if - end if ! (mod(nstep + 1, nscm) .eq. 0) + end if ! (mod(nstep + 1, nscm) .eq. 0) + end if ! if nscm .ne. 0 #ifdef MPI ! It may be necessary for the master to have a complete copy of the velocities @@ -1132,70 +1194,84 @@ ! Zero COM velocity if requested; here we are doing the adjustment for a ! nonperiodic system, not pme. - if (mod(nstep + 1, nscm) .eq. 0) then - if (ifbox .eq. 0) then -#ifdef MPI - ! WARNING - currently only GB code has ifbox .eq. 0, and currently - ! all coordinates are always valid for GB. We include the conditional - ! below more-or-less as maintenance insurance... - if (.not. all_crds_valid) then ! Currently always false... - call update_time(runmd_time) - call gb_mpi_allgathervec(atm_cnt, crd) - all_crds_valid = .true. - call update_time(fcve_dist_time) - end if -#endif /* MPI */ - if (.not. doing_langevin) then - -#ifdef MPI - ! WARNING - currently GB code never updates all velocities unless - ! forced by this scenario... - if (.not. all_vels_valid) then ! Currently always true... - - ! The following conditional is currently never .true., but we - ! include it if there is ever extra points support under GB, or - ! if other types of nonperiodic simulations with extra points - ! are eventually supported... - - if (numextra .gt. 0 .and. frameon .ne. 0) & - call zero_extra_pnts_vec(vel, ep_frames, gbl_frame_cnt) - + if (nscm .ne. 0) then + if (mod(nstep + 1, nscm) .eq. 0) then + if (ifbox .eq. 0) then +#ifdef MPI + ! WARNING - currently only GB code has ifbox .eq. 0, and currently + ! all coordinates are always valid for GB. We include the conditional + ! below more-or-less as maintenance insurance... + if (.not. all_crds_valid) then ! Currently always false... call update_time(runmd_time) - call gb_mpi_allgathervec(atm_cnt, vel) - all_vels_valid = .true. + call gb_mpi_allgathervec(atm_cnt, crd) + all_crds_valid = .true. call update_time(fcve_dist_time) end if #endif /* MPI */ - ! Nonperiodic simulation. Remove both translation and rotation. - ! Back the coords up a half step so they correspond to the velocities, - ! temporarily storing them in frc(:,:). - - frc(:,:) = crd(:,:) - vel(:,:) * half_dtx + if (.not. doing_langevin) then +#ifdef CUDA + call gpu_download_crd(crd) + call gpu_download_vel(vel) +#endif - ! Now compute COM motion and remove it; then recompute (sander - ! compatibility...). +#ifdef MPI + ! WARNING - currently GB code never updates all velocities unless + ! forced by this scenario... + if (.not. all_vels_valid) then ! Currently always true... + + ! The following conditional is currently never .true., but we + ! include it if there is ever extra points support under GB, or + ! if other types of nonperiodic simulations with extra points + ! are eventually supported... + + if (numextra .gt. 0 .and. frameon .ne. 0) & + call zero_extra_pnts_vec(vel, ep_frames, gbl_frame_cnt) + + call update_time(runmd_time) + call gb_mpi_allgathervec(atm_cnt, vel) + all_vels_valid = .true. + call update_time(fcve_dist_time) + end if +#endif /* MPI */ + ! Nonperiodic simulation. Remove both translation and rotation. + ! Back the coords up a half step so they correspond to the velocities, + ! temporarily storing them in frc(:,:). - ! NOTE - if mass can change, that has to be taken into account for - ! tmass, tmassinv (say for TI). + frc(:,:) = crd(:,:) - vel(:,:) * half_dtx + + ! Now compute COM motion and remove it; then recompute (sander + ! compatibility...). - call cenmas(atm_cnt, frc, vel, tmass, tmassinv, mass, xcm, vcm, ocm) - call stopcm(atm_cnt, frc, vel, xcm, vcm, ocm) - call cenmas(atm_cnt, frc, vel, tmass, tmassinv, mass, xcm, vcm, ocm) + ! NOTE - if mass can change, that has to be taken into account for + ! tmass, tmassinv (say for TI). + call cenmas(atm_cnt, frc, vel, tmass, tmassinv, mass, xcm, vcm, ocm) + call stopcm(atm_cnt, frc, vel, xcm, vcm, ocm) + call cenmas(atm_cnt, frc, vel, tmass, tmassinv, mass, xcm, vcm, ocm) +#ifdef CUDA + call gpu_upload_vel(vel) +#endif else - ! Get current center of the system: - call get_position(atm_cnt, crd, vcm(1), vcm(2), vcm(3), sys_range) +#ifdef CUDA + call gpu_recenter_molecule() +#else + ! Get current center of the system: + call get_position(atm_cnt, crd, vcm(1), vcm(2), vcm(3), sys_range) - ! Recenter system to the original center: + ! Recenter system to the original center: + ! All threads call so we do not need to send adjust coordinates or + ! restraint coordinates around. - call re_position(atm_cnt, 0, crd, crd, vcm(1), vcm(2), vcm(3), & - sys_x, sys_y, sys_z, sys_range, 0) + call re_position(atm_cnt, ntr, crd, atm_xc, vcm(1), vcm(2), vcm(3), & + sys_x, sys_y, sys_z, sys_range, 0) +#endif + end if end if - end if - end if ! (mod(nstep + 1, nscm) .eq. 0) + end if ! (mod(nstep + 1, nscm) .eq. 0) + end if ! if nscm .ne. 0 ! Zero out any non-moving velocities if a belly is active: @@ -1290,7 +1366,10 @@ end if if (write_restrt) then - +#ifdef CUDA + call gpu_download_crd(crd) + call gpu_download_vel(vel) +#endif #ifdef MPI #else if (numextra .gt. 0 .and. frameon .ne. 0) & @@ -1309,6 +1388,10 @@ if (ntwx .gt. 0) then if (mod(nstep, ntwx) .eq. 0) then +#ifdef CUDA + call gpu_download_crd(crd) + call gpu_download_vel(vel) +#endif if (iwrap .eq. 0) then call corpac(nrx, crd, 1, mdcrd) @@ -1348,6 +1431,9 @@ if (ntwv .gt. 0) then if (mod(nstep, ntwv) .eq. 0) then +#ifdef CUDA + call gpu_download_vel(vel) +#endif #ifdef MPI #else ! The zeroing of extra points may be redundant here. --- src/pmemd/src/runmin.fpp 2009-08-26 12:02:05.000000000 -0700 +++ src/pmemd/src/runmin.fpp 2009-09-03 13:39:15.000000000 -0700 @@ -280,8 +280,13 @@ si(si_restraint_ene) = pme_pot_ene%restraint else if (using_gb_potential) then - +#ifdef CUDA + call gpu_upload_crd(crd) +#endif call gb_force(atm_cnt, crd, frc, gb_pot_ene, ncalls) +#ifdef CUDA + call gpu_download_frc(frc) +#endif si(si_pot_ene) = gb_pot_ene%total si(si_vdw_ene) = gb_pot_ene%vdw_tot --- src/pmemd/src/shake.fpp 2009-08-26 12:02:09.000000000 -0700 +++ src/pmemd/src/shake.fpp 2009-09-02 22:21:02.000000000 -0700 @@ -619,6 +619,10 @@ end if call nonfastwat_shake_setup + +#ifdef CUDA + call gpu_shake_setup(atm_mass, my_nonfastwat_bond_cnt, my_nonfastwat_bond_dat, my_fastwat_res_cnt, iorwat, my_fastwat_res_lst) +#endif return @@ -664,6 +668,11 @@ call mexit(6, 1) #endif +#ifdef CUDA + write(mdout, *) 'CUDA version only works for ntc < 3' + call mexit(6, 1) +#endif + bond_cnt = nbonh + nbona else @@ -1507,7 +1516,7 @@ /5x, 'within 3000 iterations') 321 format(/5x, 'Coordinate resetting cannot be accomplished,', & /5x, 'deviation is too large', & - /5x, 'iter_cnt, my_bond_idx, i and j are :', 7i5) + /5x, 'iter_cnt, my_bond_idx, i and j are :', 4i8) 331 format(1x, ' *** Especially for minimization, try ntc=1 (no shake)') end subroutine shake --- src/pmemd/src/prmtop_dat.fpp 2009-09-03 15:48:19.000000000 -0700 +++ src/pmemd/src/prmtop_dat.fpp 2009-09-03 22:15:57.000000000 -0700 @@ -79,6 +79,8 @@ double precision, allocatable, save :: gbl_pk(:) double precision, allocatable, save :: gbl_pn(:) double precision, allocatable, save :: gbl_phase(:) + double precision, allocatable, save :: gbl_one_scee(:) + double precision, allocatable, save :: gbl_one_scnb(:) ! Derived from atom paramaters: @@ -268,6 +270,9 @@ nttyp = ntypes * (ntypes + 1) / 2 ! Here we set nscm and ndfmin. We need natom to do this... + if (ntr.eq.1) & + nscm = 0 + if (nscm .gt. 0 .and. ntb .eq. 0) then ndfmin = 6 ! both translation and rotation com motion removed @@ -279,11 +284,11 @@ ndfmin = 0 end if if (ibelly .gt. 0) then ! No COM Motion Removal, ever. - nscm = big_int + nscm = 0 ndfmin = 0 end if - if (nscm .le. 0) nscm = big_int + if (nscm .le. 0) nscm = 0 ! For Langevin Dynamics... @@ -351,9 +356,9 @@ subroutine init_amber_prmtop_dat - implicit none + use mdin_ctrl_dat_mod, only : scee, scnb -! Local variables: + implicit none double precision :: dumd ! scratch or trash variable integer :: idum ! scratch or trash variable @@ -463,9 +468,8 @@ fmtin = rfmt type = 'ANGLE_EQUIL_VALUE' call nxtsec(prmtop, mdout, 0, fmtin, type, fmt, errcode) - read(prmtop, fmt) (gbl_teq(i), i = 1, numang) - + fmtin = rfmt type = 'DIHEDRAL_FORCE_CONSTANT' call nxtsec(prmtop, mdout, 0, fmtin, type, fmt, errcode) @@ -484,6 +488,60 @@ read(prmtop, fmt) (gbl_phase(i), i = 1, nptra) + ! Read variable SCEE and SCNB values if they exist. + ! 1) SCEE + fmtin = rfmt + type = 'SCEE_SCALE_FACTOR' + call nxtsec(prmtop, mdout, 1,fmtin, type, fmt, errcode) + if (errcode == 0) then + !We found the SCEE scale factor data so read it in, there should be + !a value for each dihedral type. Note while there is one for each + !dihedral type not all are actually used in the calculation since + !1-4's are only done over unique dihedral terms. If multiple dihedrals + !for a given set of atom types exist, with different pn's for example + !then the last value of SCEE/SCNB should be used. + + write (mdout,'(a)') '' + write (mdout,'(a)') '| Note: 1-4 EEL scale factors are being read from the topology file.' + write (mdout,'(a)') '| SCEE value from cntrl namelist will be ignored.' + + read(prmtop,fmt) (gbl_one_scee(i), i = 1,nptra) + call vdinv(nptra,gbl_one_scee,gbl_one_scee) + else + !We will use whatever scee is set to in the mdin file for all 1-4's + if (scee == 0.0d0) then + write(mdout,'(/2x,a)') 'SCEE not found in topology file so it must be set explicitly' + call mexit(6, 1) + end if + do i=1,nptra + gbl_one_scee(i)=1.0d0/scee + end do + end if !if errcode==0 + + ! 2) SCNB + fmtin = rfmt + type = 'SCNB_SCALE_FACTOR' + call nxtsec(prmtop, mdout, 1,fmtin, type, fmt, errcode) + if (errcode == 0) then + write (mdout,'(a)') '' + write (mdout,'(a)') '| Note: 1-4 VDW scale factors are being read from the topology file.' + write (mdout,'(a)') '| SCNB value from cntrl namelist will be ignored.' + + read(prmtop,fmt) (gbl_one_scnb(i), i = 1,nptra) + call vdinv(nptra,gbl_one_scnb,gbl_one_scnb) + else + !We will use whatever scee is set to in the mdin file for all 1-4's. + if (scnb == 0.0d0) then + write(mdout,'(/2x,a)') 'SCNB not found in topology file so it must be set explicitly' + call mexit(6, 1) + end if + do i=1,nptra + gbl_one_scnb(i)=1.0d0/scnb + end do + end if !if errcode==0 + + ! End read variable SCEE and SCNB values. + fmtin = rfmt type = 'SOLTY' call nxtsec(prmtop, mdout, 0, fmtin, type, fmt, errcode) @@ -708,9 +766,12 @@ fmtin = rfmt type = 'BOX_DIMENSIONS' - call nxtsec(prmtop, mdout, 0, fmtin, type, fmt, errcode) + call nxtsec(prmtop, mdout, 1, fmtin, type, fmt, errcode) + + if (errcode == 0) then + read(prmtop, fmt) dumd, dumd, dumd, dumd + end if - read(prmtop, fmt) dumd, dumd, dumd, dumd else @@ -992,6 +1053,10 @@ mpi_comm_world, err_code_mpi) call mpi_bcast(gbl_ipn, nptra, mpi_integer, 0, & mpi_comm_world, err_code_mpi) + call mpi_bcast(gbl_one_scee, nptra, mpi_double_precision, 0, & + mpi_comm_world, err_code_mpi) + call mpi_bcast(gbl_one_scnb, nptra, mpi_double_precision, 0, & + mpi_comm_world, err_code_mpi) end if ! Generalized Born: @@ -1108,10 +1173,10 @@ stat = alloc_failed) if (alloc_failed .ne. 0) call setup_alloc_error - + num_ints = num_ints + size(gbl_bond) * bond_rec_ints + & size(gbl_angle) * angle_rec_ints + & - size(gbl_dihed) * dihed_rec_ints + size(gbl_dihed) * dihed_rec_ints if (nptra > 0) then allocate(gbl_pk(nptra), & @@ -1120,6 +1185,8 @@ gbl_gamc(nptra), & gbl_gams(nptra), & gbl_fmn(nptra), & + gbl_one_scee(nptra), & + gbl_one_scnb(nptra), & gbl_ipn(nptra), & stat = alloc_failed) @@ -1130,7 +1197,9 @@ size(gbl_phase) + & size(gbl_gamc) + & size(gbl_gams) + & - size(gbl_fmn) + size(gbl_fmn) + & + size(gbl_one_scee) + & + size(gbl_one_scnb) num_ints = num_ints + size(gbl_ipn) @@ -1156,8 +1225,9 @@ ! Allocation of stuff that will later be deallocated: - ! Bugfix note: gbl_natex dimension increased to provide workspace for - ! extra points code... + ! gbl_natex dimension is allocated as 2*next here since it can be expanded inside + ! the extra point code to be more than next. From the prmtop though only next + ! elements are read and subsequently broadcast. allocate(atm_numex(natom), & gbl_natex(next * 2), & --- src/pmemd/src/gb_force.fpp 2009-08-26 12:01:01.000000000 -0700 +++ src/pmemd/src/gb_force.fpp 2009-09-03 22:27:57.000000000 -0700 @@ -81,6 +81,17 @@ double precision :: enmr(3) integer :: i, j + +#ifdef CUDA + if (mod(irespa, ntpr) .eq. 0) then + call gpu_gb_ene(pot_ene) + else + call gpu_gb_forces() + end if + call update_time(nonbond_time) + return +#else + ! Zero energies that are stack or call parameters: pot_ene = null_gb_pot_ene_rec @@ -157,7 +168,7 @@ ! If belly is on then set the belly atom forces to zero: if (ibelly .gt. 0) call bellyf(atm_cnt, atm_igroup, frc) - +#endif return end subroutine gb_force