*******> update.5

Author: Taisung Lee
Date: August 19, 2020

Programs: pmemd, pmemd.cuda

Description: This fixes a number of issues:
   1) fix: REMD with an odd number of replicas may crash
   2) fix: gti_sc_add=3 may crash near the end points
   3) fix: iwrap=1 may cause an out-of-bounds problem in TI
   4) add initialization of some GTI variables to avoid potential bugs

--------------------------------------------------------------------------------
 src/pmemd/src/cuda/gpu.cpp                    | 10 ++--
 src/pmemd/src/cuda/gti_f95.cpp                |  3 +-
 src/pmemd/src/cuda/gti_nonBond_kernels.cu     | 32 ++--------
 src/pmemd/src/cuda/gti_simulationConst.cpp    | 21 ++++++-
 src/pmemd/src/gti.F90                         |  2 +-
 src/pmemd/src/mdin_ctrl_dat.F90               |  4 +-
 src/pmemd/src/pbc.F90                         | 16 +++++
 src/pmemd/src/remd_exchg.F90                  | 33 +++++++----
 test/cuda/gti/Na/logfile                      | 85 ---------------------------
 test/cuda/gti/SC_Correction/ligand/Run.SC_NVE |  4 +-
 test/cuda/gti/SC_Correction/ligand/Run.SC_NVT |  4 +-
 11 files changed, 78 insertions(+), 136 deletions(-)

diff --git src/pmemd/src/cuda/gpu.cpp src/pmemd/src/cuda/gpu.cpp
index 1153a3f6c9..c21f9813f9 100644
--- src/pmemd/src/cuda/gpu.cpp
+++ src/pmemd/src/cuda/gpu.cpp
@@ -3630,8 +3630,8 @@ extern "C" void gpu_refresh_charges_(int cit_nb14[][3], double gbl_one_scee[], d
   int i, j;
   for (i = 0; i < gpu->sim.nb14s; i++) {
     int parm_idx = cit_nb14[i][2] - 1;
-    gpu->pbNb141->_pSysData[i].x = gbl_one_scee[parm_idx] * qterm[abs(cit_nb14[i][0]) - 1] *
-                                   qterm[abs(cit_nb14[i][1]) - 1];
+    gpu->pbNb141->_pSysData[i].x = (parm_idx >= 0) ? gbl_one_scee[parm_idx] * qterm[abs(cit_nb14[i][0]) - 1] *
+                                   qterm[abs(cit_nb14[i][1]) - 1] : 0.0;
   }

   // Updating the Nb141 array on the device keeps things consistent between the device and
@@ -4043,10 +4043,10 @@ extern "C" void gpu_nb14_setup_(int *nb14_cnt, int cit_nb14[][3], double gbl_one
   if (gpu->ntf < 8) {
     for (i = 0; i < nb14s; i++) {
       int parm_idx = cit_nb14[i][2] - 1;
-      gpu->pbNb141->_pSysData[i].x = gbl_one_scee[parm_idx] *
+      gpu->pbNb141->_pSysData[i].x = (parm_idx >= 0) ? gbl_one_scee[parm_idx] *
                                      gpu->pbAtomCharge->_pSysData[abs(cit_nb14[i][0]) - 1] *
-                                     gpu->pbAtomCharge->_pSysData[abs(cit_nb14[i][1]) - 1];
-      gpu->pbNb141->_pSysData[i].y = gbl_one_scnb[parm_idx];
+                                     gpu->pbAtomCharge->_pSysData[abs(cit_nb14[i][1]) - 1] : 0.0;
+      gpu->pbNb141->_pSysData[i].y = (parm_idx >= 0) ? gbl_one_scnb[parm_idx] : 0.0;
       int tt = (*ntypes)*(iac[abs(cit_nb14[i][0]) - 1] - 1) +
                (iac[abs(cit_nb14[i][1]) - 1] - 1);

diff --git src/pmemd/src/cuda/gti_f95.cpp src/pmemd/src/cuda/gti_f95.cpp
index ebf51b584a..ef6d62b506 100644
--- src/pmemd/src/cuda/gti_f95.cpp
+++ src/pmemd/src/cuda/gti_f95.cpp
@@ -211,7 +211,7 @@ extern "C" void gti_ti_nb_setup_(int* pNtypes, int iac[], int ico[], int* pNb14_
   gpuContext gpu = theGPUContext::GetPointer();

   unsigned int ntypes = *pNtypes;
-  unsigned maxNumberTINBEntries = gpu->sim.numberTIAtoms * 2048;
+  unsigned maxNumberTINBEntries = gpu->sim.numberTIAtoms * 3000;
   gpu->pbTINBList = std::unique_ptr< GpuBuffer<int4> >
                     (new GpuBuffer<int4>(maxNumberTINBEntries));
   for (unsigned i = 0; i < maxNumberTINBEntries; i++) {
@@ -249,6 +249,7 @@ extern "C" void gti_ti_nb_setup_(int* pNtypes, int iac[], int ico[], int* pNb14_
       for (unsigned int j = 0; j < 3; j++) {
         temp[counter][j] = cit_nb14[i][j];
       }
+      cit_nb14[i][2] = -1;
       counter++;
     }
   }

diff --git src/pmemd/src/cuda/gti_nonBond_kernels.cu src/pmemd/src/cuda/gti_nonBond_kernels.cu
index 9797afbd66..a7bfe4123f 100644
--- src/pmemd/src/cuda/gti_nonBond_kernels.cu
+++ src/pmemd/src/cuda/gti_nonBond_kernels.cu
@@ -412,11 +412,7 @@ _kPlainHead_ kgCalculateTI14NB_kernel(bool needEnergy, bool needVirial) {
       if (needEnergy) {
         pE = (addToDVDL_ele) ? ((hasSC) ? cSim.pTIEE14SC[TIRegion] : cSim.pTIEE14CC[TIRegion])
                              : cSim.pTISCEE14[TIRegion];
-        pE0 = (hasSC) ? cSim.pTIEE14SC[2] : cSim.pTIEE14CC[2];
         addEnergy(pE, de);
-        if (addToDVDL_ele) {
-          addEnergy(pE0, -de);
-        }

         if (useSC14 && addToDVDL_ele) {
           pEDL = (hasSC) ? cSim.pTIEE14SC_DL[TIRegion] : cSim.pTIEE14CC_DL[TIRegion];
@@ -469,24 +465,8 @@ _kPlainHead_ kgCalculateTI14NB_kernel(bool needEnergy, bool needVirial) {
           addVirial(cSim.pV33[TIRegion], -ttz * dz);
         }
-
-        // Correction for double-counted 1-4 ele
-        ttx = df * dx;
-        tty = df * dy;
-        ttz = df * dz;
-        addForce(cSim.pTINBForceX[2] + iatom0, ttx);
-        addForce(cSim.pTINBForceX[2] + iatom1, -ttx);
-        addForce(cSim.pTINBForceY[2] + iatom0, tty);
-        addForce(cSim.pTINBForceY[2] + iatom1, -tty);
-        addForce(cSim.pTINBForceZ[2] + iatom0, ttz);
-        addForce(cSim.pTINBForceZ[2] + iatom1, -ttz);
-        if (needVirial) {
-          addVirial(cSim.pVirial_11, ttx * dx);
-          addVirial(cSim.pVirial_22, tty * dy);
-          addVirial(cSim.pVirial_33, ttz * dz);
-        }
-
       // VDW part
+      PMEFloat& scalpha = cSim.TISCAlpha;
       PMEDouble g2 = cSim.pTINb141[pos].y;
       PMEDouble& A = cSim.pTINb142[pos].x;
       PMEDouble& B = cSim.pTINb142[pos].y;
@@ -496,7 +476,7 @@
       }
       PMEDouble& BoverA = cSim.pTINb142[pos].z;
       PMEDouble t6 = ZeroF;
-      PMEDouble rp6 = ZeroF;
+      PMEDouble rt6 = ZeroF;
       int vdwSCType = cSim.vdwSC;
       df = OneF;  // df must be 1.0 here

       t6 = r2inv * r2inv * r2inv;
       if (useSC14) {
-        rp6 = r2 * r2 * r2 * BoverA;
-        t6 = (BoverA) / ((cSim.TISCAlpha * cSim.TILambda[TIRegion]) + rp6);
+        rt6 = r2 * r2 * r2 * BoverA;
+        t6 = (BoverA) / ((scalpha * cSim.TILambda[TIRegion]) + rt6);
         df = t6 * r2 * r2 * r2;
       }
       f6 = g2 * B * t6;
@@ -525,8 +505,8 @@ _kPlainHead_ kgCalculateTI14NB_kernel(bool needEnergy, bool needVirial) {
       for (unsigned l = 0; l < cSim.nMBARStates; l++) {
         t6 = (vdwSCType == 1)
-             ? BoverA / (cSim.TISCAlpha * (smooth_step_func(lambda[TIRegion][l], cSim.vdwSmoothLambdaType)) + rp6)
-             : BoverA / (cSim.TISCAlpha * lambda[TIRegion][l] + rp6);
+             ? BoverA / (scalpha * (smooth_step_func(lambda[TIRegion][l], cSim.vdwSmoothLambdaType)) + rt6)
+             : BoverA / (scalpha * lambda[TIRegion][l] + rt6);
         PMEFloat dl_bar = g2 * t6 * (A * t6 - B);
         addEnergy(&(MBAREnergy[l]), dl_bar - de);
       }

diff --git src/pmemd/src/cuda/gti_simulationConst.cpp src/pmemd/src/cuda/gti_simulationConst.cpp
index e41b3d6e9c..0b65926047 100644
--- src/pmemd/src/cuda/gti_simulationConst.cpp
+++ src/pmemd/src/cuda/gti_simulationConst.cpp
@@ -6,13 +6,30 @@
 //---------------------------------------------------------------------------------------------
 void gti_simulationConst::InitSimulationConst() {

-  base_simulationConst::InitSimulationConst();
+  base_simulationConst::InitSimulationConst();
+
+  needVirial = false;

-  needMBAR=false;
   pNTPData = NULL;

   pTIList = NULL;
   pSCList = NULL;
   numberTIAtoms = 0;
+  numberTICommonPairs = 0;
+  doTIHeating = false;
+
+  numberTI14NBEntries = 0;
+  numberTIBond = 0;
+  numberTIAngle = 0;
+  numberTIDihedral = 0;
+
+  numberTINMRDistance = 0;
+  numberTINMRAngle = 0;
+  numberTINMRDihedral = 0;
+
+  nMBARStates = 0;
+  needMBAR = false;
+  numberLJ1264Atoms = 0;
+
 }
 #endif

diff --git src/pmemd/src/gti.F90 src/pmemd/src/gti.F90
index 3c2bb7dd03..3ce9067a52 100644
--- src/pmemd/src/gti.F90
+++ src/pmemd/src/gti.F90
@@ -250,7 +250,7 @@ implicit none
       if (sc_index(i)>0) then
         ti_ene(1:2,sc_index(i)) = gti_potEnergy(i,TIEnergySCShift+1:TIEnergySCShift+2)
         !! if (i>=4 .and. i<=7) then
-        if (i>=6 .and. i<=7) then
+        if (i==6) then  ! torsion terms
           pEnergy(i)%ptr = pEnergy(i)%ptr - gti_potEnergy(i,TIEnergySCShift+1) &
                            - gti_potEnergy(i,TIEnergySCShift+2)
         endif

diff --git src/pmemd/src/mdin_ctrl_dat.F90 src/pmemd/src/mdin_ctrl_dat.F90
index 61d3797bee..1a3f731478 100644
--- src/pmemd/src/mdin_ctrl_dat.F90
+++ src/pmemd/src/mdin_ctrl_dat.F90
@@ -2767,7 +2767,9 @@ subroutine validate_mdin_ctrl_dat(remd_method, rremd_type)
   end if

-  if (mod(numgroups, 2) .ne. 0 .and. gremd_acyc .eq. 0) then
+  if (gremd_acyc .lt. 0 .and. icfe .gt. 0) gremd_acyc = 1
+
+  if (mod(numgroups, 2) .ne. 0 .and. gremd_acyc .le. 0) then
     write(mdout, '(a,a)') error_hdr, 'REMD requires an even number of &
                                      &replicas!'

diff --git src/pmemd/src/pbc.F90 src/pmemd/src/pbc.F90
index d373816ce9..60eae4d23c 100644
--- src/pmemd/src/pbc.F90
+++ src/pmemd/src/pbc.F90
@@ -1065,6 +1065,11 @@ subroutine wrap_molecules(crd)
     if (ti_mode .ne. 0) then
       if (ti_mol_type(mol_id) .eq. ti_mol_part_sc_ex) then
+        mol_id_part = ti_sc_partner(mol_id)
+        if (mol_id_part .gt. 0) then
+          mol_atm_cnt_part = gbl_mol_atms_listdata(mol_id_part)%cnt
+          offset_part = gbl_mol_atms_listdata(mol_id_part)%offset
+          if (mol_id_part .gt. mol_id) then
         do i = offset_part + 1, offset_part + mol_atm_cnt_part
           atm_id = gbl_mol_atms_lists(i)
           crd(1, atm_id) = crd(1, atm_id) + tran(1)
           crd(2, atm_id) = crd(2, atm_id) + tran(2)
@@ -1073,6 +1078,8 @@ subroutine wrap_molecules(crd)
         end do
         ! restore so we don't overflow below
         mol_atm_cnt = gbl_mol_atms_listdata(mol_id)%cnt
+        end if
+        end if
       end if
     end if

@@ -1182,6 +1189,7 @@ subroutine wrap_to(crd)
     if (ti_mode .ne. 0) then
       if (ti_mol_type(mol_id) .eq. ti_mol_part_sc_ex) then
         mol_id_part = ti_sc_partner(mol_id)
+        if (mol_id_part .gt. 0) then
         mol_atm_cnt_part = gbl_mol_atms_listdata(mol_id_part)%cnt
         offset_part = gbl_mol_atms_listdata(mol_id_part)%offset
         if (mol_id_part .gt. mol_id) then
@@ -1195,6 +1203,7 @@ subroutine wrap_to(crd)
         else
           cycle !already moved partner
         end if
+        end if
       end if
     end if

@@ -1260,6 +1269,11 @@ subroutine wrap_to(crd)

     if (ti_mode .ne. 0) then
       if (ti_mol_type(mol_id) .eq. ti_mol_part_sc_ex) then
+        mol_id_part = ti_sc_partner(mol_id)
+        if (mol_id_part .gt. 0) then
+          mol_atm_cnt_part = gbl_mol_atms_listdata(mol_id_part)%cnt
+          offset_part = gbl_mol_atms_listdata(mol_id_part)%offset
+          if (mol_id_part .gt. mol_id) then
         do i = offset_part + 1, offset_part + mol_atm_cnt_part
           atm_id = gbl_mol_atms_lists(i)
           crd(1, atm_id) = crd(1, atm_id) + cx
           crd(2, atm_id) = crd(2, atm_id) + cy
@@ -1268,6 +1282,8 @@ subroutine wrap_to(crd)
         end do
         ! restore so we don't overflow below
         mol_atm_cnt = gbl_mol_atms_listdata(mol_id)%cnt
+        end if
+        end if
       end if
     end if

diff --git src/pmemd/src/remd_exchg.F90 src/pmemd/src/remd_exchg.F90
index 47220cb72f..9839b82513 100644
--- src/pmemd/src/remd_exchg.F90
+++ src/pmemd/src/remd_exchg.F90
@@ -891,6 +891,7 @@ subroutine hamiltonian_exchange(atm_cnt, my_atm_lst, crd, vel, frc, &
   double precision :: frc_bak(3,atm_cnt)

   logical :: i_do_exchg
+  logical :: silent
   logical :: rank_success(num_replicas) ! log of all replica's success
   logical :: success
   integer :: success_array(num_replicas) ! to keep track of successes
@@ -924,6 +925,7 @@
   ! First determine if we're controlling the exchange or not

+  silent = .false.
   if (master) then

     ! Figure out our partners
@@ -965,14 +967,16 @@
       neighbor_rank = partners(1) - 1
     end if

-    if (neighbor_rank .lt. 0) i_do_exchg = .false.
+    silent = (neighbor_rank .lt. 0)
+    if (silent) i_do_exchg = .false.
+
     ! Exchange coordinates with your neighbor. crd_temp must be a backup since
     ! the reciprocal space sum needs the global array to be updated with the new
     ! coordinates (and crd is a reference to the global array)
     crd_temp(1,:) = crd(1,:)
     crd_temp(2,:) = crd(2,:)
     crd_temp(3,:) = crd(3,:)
-    call mpi_sendrecv_replace(crd, atm_cnt * 3, mpi_double_precision, &
+    if (.not. silent) call mpi_sendrecv_replace(crd, atm_cnt * 3, mpi_double_precision, &
                               neighbor_rank, remd_tag, &
                               neighbor_rank, remd_tag, &
                               remd_comm, istat, err_code_mpi)
@@ -985,7 +989,11 @@
       original_box(1) = pbc_box(ord1)
       original_box(2) = pbc_box(ord2)
       original_box(3) = pbc_box(ord3)
+      new_box(ord1) = original_box(1)
+      new_box(ord2) = original_box(2)
+      new_box(ord3) = original_box(3)
       !original_box(1:3) = pbc_box(1:3)
+      if (.not. silent) then
       call mpi_sendrecv(original_box, 3, mpi_double_precision, neighbor_rank, 11, &
                         boxtmp, 3, mpi_double_precision, neighbor_rank, 11, &
                         remd_comm, istat, err_code_mpi)
@@ -994,6 +1002,7 @@
       new_box(ord1) = boxtmp(1)
       new_box(ord2) = boxtmp(2)
       new_box(ord3) = boxtmp(3)
+      endif
     end if
   end if ! master
@@ -1007,9 +1016,9 @@
   ! Now broadcast the temporary coordinates to the rest of pmemd_comm. I don't
   ! know if this is necessary or not (i.e. is it done in the load balancing?)
-  call mpi_bcast(crd, atm_cnt * 3, mpi_double_precision, &
+  if (.not. silent) call mpi_bcast(crd, atm_cnt * 3, mpi_double_precision, &
                  0, pmemd_comm, err_code_mpi)
-  call mpi_bcast(crd_temp, atm_cnt * 3, mpi_double_precision, &
+  if (.not. silent) call mpi_bcast(crd_temp, atm_cnt * 3, mpi_double_precision, &
                  0, pmemd_comm, err_code_mpi)

 #ifdef CUDA
@@ -1055,29 +1064,30 @@

   if (nmropt .ne. 0) call nmrdcp

+  neighbor_ene_temp = my_ene_temp
   ! Now trade that potential energy with your partner
   if (master) then
-    call mpi_sendrecv(my_ene_temp, SIZE_ENE_TEMP, mpi_double_precision, &
+    if (.not. silent) call mpi_sendrecv(my_ene_temp, SIZE_ENE_TEMP, mpi_double_precision, &
                       neighbor_rank, remd_tag, &
                       neighbor_ene_temp, SIZE_ENE_TEMP, mpi_double_precision, &
                       neighbor_rank, remd_tag, &
                       remd_comm, istat, err_code_mpi)

     ! Calculate the PV correction if necessary
-    if (exchange_ucell.and.use_pv) &
+    if (.not. silent .and. exchange_ucell .and. use_pv) &
       pvterm = pv_correction(temp0, neighbor_ene_temp%temperature, &
                              neighbor_rank, remd_comm)
-
     ! Now it's time to calculate the exchange criteria. This detailed balance was
     ! derived under the assumption that *if* we have different T's, then they are
     ! NOT exchanged. Swapping temperatures requires a slightly different equation
     ! and then requires you to update temp0 at the end with an mpi_sendrecv call.
+    success = .false.
+    if (.not. silent) then
     if (i_do_exchg) then
-      if (neighbor_rank .lt. 0) then
-        success = .false.
-      else
+      if (abs(my_ene_temp%energy_2) < 1e8) then ! exclude extreme cases (like GPU overflow)
+
       delta = -ONEKB / temp0 * (my_ene_temp%energy_2 - my_ene_temp%energy_1) - &
               ONEKB / neighbor_ene_temp%temperature * &
               (neighbor_ene_temp%energy_2 - neighbor_ene_temp%energy_1)
@@ -1085,7 +1095,7 @@
       metrop = exp(delta + pvterm)
       call amrand_gen(remd_rand_gen, random_value)
       success = random_value .lt. metrop
-      endif
+      end if

       ! Let my neighbor know about our success
@@ -1163,6 +1173,7 @@
 #endif
   end if ! i_do_exchg
+  end if ! silent

 end if ! master

diff --git test/cuda/gti/Na/logfile test/cuda/gti/Na/logfile
deleted file mode 100755
index 9df79453a9..0000000000
--- test/cuda/gti/Na/logfile
+++ /dev/null
@@ -1,85 +0,0 @@
-[85 deleted lines: a stray pmemd parallel-profiling log containing FFT block
-distribution notes ("FFT blocks assigned to 8 tasks", then "16 tasks") plus
-the "Major Routine Parallel Profiling" and "PME NonBond Parallel Profiling"
-CPU-seconds tables for tasks 0-19, with all entries between 0.0 and 0.1 s]

diff --git test/cuda/gti/SC_Correction/ligand/Run.SC_NVE test/cuda/gti/SC_Correction/ligand/Run.SC_NVE
index d6ffb937b7..00f243e125 100755
--- test/cuda/gti/SC_Correction/ligand/Run.SC_NVE
+++ test/cuda/gti/SC_Correction/ligand/Run.SC_NVE
@@ -85,8 +85,8 @@
 $sander -O -i mdin -c inpcrd -p prmtop -o $output
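Note on the 1-4 sentinel-index scheme used above: gti_f95.cpp now stamps
cit_nb14[i][2] = -1 on every 1-4 pair that the GTI code takes over, and the
guarded lookups added to gpu.cpp then write zero scale factors for such pairs
instead of indexing gbl_one_scee/gbl_one_scnb with a negative subscript. The
self-contained C++ sketch below illustrates that pattern in isolation; it is
not pmemd code, and all names in it (Pair14, Scale14, setup_nb14_scales, the
sample charges and scale factors) are hypothetical stand-ins.

    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    // Hypothetical stand-ins for pmemd's cit_nb14 entries and Nb141 scale pairs.
    struct Pair14 { int atom_i, atom_j, parm_idx; };  // 1-based parm_idx; -1 marks a pair claimed by GTI
    struct Scale14 { double ee_scale, vdw_scale; };

    // Mirrors the guarded lookup added to gpu_nb14_setup_: a pair whose parameter
    // index was stamped to -1 gets zero scale factors instead of an out-of-bounds read.
    static std::vector<Scale14> setup_nb14_scales(const std::vector<Pair14>& pairs,
                                                  const std::vector<double>& one_scee,
                                                  const std::vector<double>& one_scnb,
                                                  const std::vector<double>& charge) {
      std::vector<Scale14> out(pairs.size());
      for (size_t i = 0; i < pairs.size(); ++i) {
        int p = pairs[i].parm_idx - 1;  // convert to 0-based; the -1 sentinel stays negative
        out[i].ee_scale  = (p >= 0) ? one_scee[p] * charge[std::abs(pairs[i].atom_i) - 1]
                                                  * charge[std::abs(pairs[i].atom_j) - 1]
                                    : 0.0;
        out[i].vdw_scale = (p >= 0) ? one_scnb[p] : 0.0;
      }
      return out;
    }

    int main() {
      std::vector<double> one_scee = {1.0 / 1.2};   // typical Amber 1-4 electrostatic scaling
      std::vector<double> one_scnb = {1.0 / 2.0};   // typical Amber 1-4 van der Waals scaling
      std::vector<double> charge   = {0.4, -0.4};
      // The second pair has been claimed by the TI region (parm_idx stamped to -1).
      std::vector<Pair14> pairs = {{1, 2, 1}, {1, 2, -1}};
      for (const Scale14& s : setup_nb14_scales(pairs, one_scee, one_scnb, charge))
        std::printf("ee_scale = %8.5f   vdw_scale = %8.5f\n", s.ee_scale, s.vdw_scale);
      return 0;
    }

Running the sketch prints one normally scaled pair followed by a zeroed one,
which mirrors how the GPU setup now leaves TI-owned 1-4 interactions entirely
to the GTI kernels.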