*******> update.15

 Author: Delaram Ghoreishi

 Date: August 26, 2019

 Programs: pmemd.cuda.MPI

 Description: This incorporates the NEB code in pmemd.cuda.MPI
--------------------------------------------------------------------------------
 src/pmemd/src/cuda/CMakeLists.txt                 |   4 +-
 src/pmemd/src/cuda/Makefile                       |   4 +-
 src/pmemd/src/cuda/base_gpuContext.cpp            |  21 +
 src/pmemd/src/cuda/base_gpuContext.h              |  15 +
 src/pmemd/src/cuda/base_simulationConst.cpp       |  23 +
 src/pmemd/src/cuda/base_simulationConst.h         |  39 ++
 src/pmemd/src/cuda/gpu.cpp                        | 229 ++++++-
 src/pmemd/src/cuda/gpu.h                          |  15 +
 src/pmemd/src/cuda/gti_cuda.cu                    |   1 +
 src/pmemd/src/cuda/gti_f95.cpp                    |  10 +-
 src/pmemd/src/cuda/gti_f95.h                      |   2 +-
 src/pmemd/src/cuda/kCalculateNEBForces.cu         | 692 ++++++++++++++++++++++
 src/pmemd/src/cuda/kNEB.h                         |  95 +++
 src/pmemd/src/cuda/kNorm.h                        |  45 ++
 src/pmemd/src/cuda/kRMSFit.h                      | 258 ++++++++
 src/pmemd/src/gb_force.F90                        |  52 +-
 src/pmemd/src/mdin_ctrl_dat.F90                   |  11 +-
 src/pmemd/src/neb.F90                             |  52 +-
 src/pmemd/src/nebread.F90                         |  34 +-
 src/pmemd/src/pme_force.F90                       | 132 ++---
 src/pmemd/src/pmemd.F90                           |  13 +
 src/pmemd/src/runmd.F90                           |   2 +-
 test/cuda/neb-testcases/neb_explicit/groupfile.in |   8 +-
 23 files changed, 1592 insertions(+), 165 deletions(-)

diff --git src/pmemd/src/cuda/CMakeLists.txt src/pmemd/src/cuda/CMakeLists.txt
index e1fbc6c..f7ba57f 100644
--- src/pmemd/src/cuda/CMakeLists.txt
+++ src/pmemd/src/cuda/CMakeLists.txt
@@ -2,7 +2,7 @@ set(PMEMD_CUDA_CUDA_SOURCES
 	kForcesUpdate.cu kCalculateLocalForces.cu kCalculateGBBornRadii.cu 
 	kCalculatePMENonbondEnergy.cu kCalculateGBNonbondEnergy1.cu kNLRadixSort.cu 
 	kCalculateGBNonbondEnergy2.cu kShake.cu kNeighborList.cu kPMEInterpolation.cu 
-	kCalculateEFieldEnergy.cu kDataTransfer.cu)
+	kCalculateEFieldEnergy.cu kDataTransfer.cu kCalculateNEBForces.cu)
 
 set(PMEMD_CUDA_GTI_SOURCES
 	gti_energies_kernels.cu gti_general_kernels.cu gti_utils.cu  
@@ -81,4 +81,4 @@ foreach(PRECISION ${PMEMD_CUDA_PRECISIONS})
 		target_link_libraries(pmemd_cuda_${PRECISION}_mpi curand cublas cufft mpi_c mpi_cxx mpi_fortran)
 	endif()
 	
-endforeach()
\ No newline at end of file
+endforeach()
diff --git src/pmemd/src/cuda/Makefile src/pmemd/src/cuda/Makefile
index 1b5ee1f..514d2a5 100644
--- src/pmemd/src/cuda/Makefile
+++ src/pmemd/src/cuda/Makefile
@@ -12,7 +12,7 @@ CPP_OBJS = gpu.o cuda_info.o gpu.o \
 CU_OBJS = kForcesUpdate.o kCalculateLocalForces.o kCalculateGBBornRadii.o \
           kCalculatePMENonbondEnergy.o kCalculateGBNonbondEnergy1.o kNLRadixSort.o \
           kCalculateGBNonbondEnergy2.o kShake.o kNeighborList.o kPMEInterpolation.o \
-          kCalculateEFieldEnergy.o kDataTransfer.o  
+          kCalculateEFieldEnergy.o kDataTransfer.o kCalculateNEBForces.o 
 
 GTI_OBJS = gti_energies_kernels.o gti_general_kernels.o gti_utils.o  \
           gti_NBList_kernels.o gti_nonBond_kernels.o gti_PME_kernels.o 
@@ -69,6 +69,8 @@ kCalculateLocalForces.o:	kCNF.h kBWU.h \
 kPMEInterpolation.o:	kReImageCoord.h kPGS.h kPSSE.h
 kCalculateEFieldEnergy.o: kEFE.h
 kShake.o:	kShake.h
+kDataTransfer.o:                kShuttle.h
+kCalculateNEBForces.o:                 kShuttle.h kNEB.h kRMSFit.h kNorm.h
 $(CU_OBJS):	gpu.h gputypes.h simulationConst.h
 gti_cuda.o:     gti_cuda.cuh gti_general_kernels.cuh gti_NBList_kernels.cuh gti_nonBond_kernels.cuh gti_energies_kernels.cuh gti_PME_kernels.cuh
 gti_energies_kernels.o: gti_kernelAttribute.cuh gti_energies_kernels.cuh gti_utils.cuh gti_const.cuh
diff --git src/pmemd/src/cuda/base_gpuContext.cpp src/pmemd/src/cuda/base_gpuContext.cpp
index 7818f3f..596f57a 100644
--- src/pmemd/src/cuda/base_gpuContext.cpp
+++ src/pmemd/src/cuda/base_gpuContext.cpp
@@ -140,6 +140,12 @@ base_gpuContext::base_gpuContext() :
   // Atom shuttling
   pbDataShuttle(NULL), pbShuttleTickets(NULL),
 
+  //NEB buffers
+  pbRMSMask(NULL), pbFitMask(NULL), pbAtmIdx(NULL), pbNEBEnergyAll(NULL),
+  pbTangents(NULL), pbSpringForce(NULL), pbNEBForce(NULL),
+  pbNextDataShuttle(NULL), pbPrevDataShuttle(NULL), pbKabschCOM(NULL),
+  pbtotFitMass(NULL), pbDataSPR(NULL), pbRotAtm(NULL),
+
   // AMD buffers
   pbAMDfwgtd(NULL), pAmdWeightsAndEnergy(NULL),
 
@@ -273,6 +279,21 @@ base_gpuContext::~base_gpuContext()
   delete pbDataShuttle;
   delete pbShuttleTickets;
 
+  // Delete NEB stuff
+  delete pbRMSMask;
+  delete pbFitMask;
+  delete pbAtmIdx;
+  delete pbNEBEnergyAll;
+  delete pbTangents;
+  delete pbSpringForce;
+  delete pbNEBForce;
+  delete pbNextDataShuttle;
+  delete pbPrevDataShuttle;
+  delete pbKabschCOM;
+  delete pbtotFitMass;
+  delete pbDataSPR;
+  delete pbRotAtm;
+
   // Delete NTP stuff
   delete pbSoluteAtomID;
   delete pbSoluteAtomMass;
diff --git src/pmemd/src/cuda/base_gpuContext.h src/pmemd/src/cuda/base_gpuContext.h
index 63218d6..2ab4d7b 100644
--- src/pmemd/src/cuda/base_gpuContext.h
+++ src/pmemd/src/cuda/base_gpuContext.h
@@ -282,6 +282,21 @@ public:
   GpuBuffer<int>* pbShuttleTickets;             // List of atoms that will be on the
                                                 //   shuttle to and from the CPU
 
+  // NEB info
+  GpuBuffer<int>* pbRMSMask;                    // Mask of atoms used for NEB force calculation.
+  GpuBuffer<int>* pbFitMask;                    // Mask of atoms used for NEB structure fitting.
+  GpuBuffer<int>* pbAtmIdx;
+  GpuBuffer<PMEDouble>* pbNEBEnergyAll;         // Energy of the replicas in NEB.
+  GpuBuffer<double>* pbTangents;                // Tangents to the path.
+  GpuBuffer<double>* pbSpringForce;             // Artificial spring forces.
+  GpuBuffer<double>* pbNEBForce;                // NEB forces.
+  GpuBuffer<double>* pbNextDataShuttle;         // Next neighboring replica's coordinate.
+  GpuBuffer<double>* pbPrevDataShuttle;         // Previous neighboring replica's coordinate.
+  GpuBuffer<double>* pbKabschCOM;
+  GpuBuffer<double>* pbtotFitMass;
+  GpuBuffer<double>* pbDataSPR;
+  GpuBuffer<double>* pbRotAtm;
+
   // Remapped bonded interactions and Shake constraint atom IDs
   GpuBuffer<int2>*   pbImageNMRCOMDistanceID;        // Remapped NMR COM Distance i, j
   GpuBuffer<int2>*   pbImageNMRCOMDistanceCOM;       // Remapped NMR COM DistanceCOM range
diff --git src/pmemd/src/cuda/base_simulationConst.cpp src/pmemd/src/cuda/base_simulationConst.cpp
index 1d31e5e..ab83be4 100644
--- src/pmemd/src/cuda/base_simulationConst.cpp
+++ src/pmemd/src/cuda/base_simulationConst.cpp
@@ -592,6 +592,29 @@ void base_simulationConst::InitSimulationConst()
   pDataShuttle                 = NULL;
   pShuttleTickets              = NULL;
 
+  //NEB parameters
+  neb_nbead               = 0;
+  beadid                  = 0;
+  nattgtrms               = 0;
+  nattgtfit               = 0;
+  last_neb_atm            = 0;
+  skmin                   = 50;
+  skmax                   = 100;
+  tmode                   = 1;
+  pRMSMask                = NULL;
+  pFitMask                = NULL;
+  pAtmIdx                 = NULL;
+  pNEBEnergyAll           = NULL;
+  pTangents               = NULL;
+  pSpringForce            = NULL;
+  pNEBForce               = NULL;
+  pNextDataShuttle        = NULL;
+  pPrevDataShuttle        = NULL;
+  pKabschCOM              = NULL;
+  pDataSPR                = NULL;
+  pRotAtm                 = NULL;
+  ptotFitMass             = NULL;
+
   // General IPS parameters
   bIPSActive                   = false;
   rips                         = (PMEFloat)8.0;
diff --git src/pmemd/src/cuda/base_simulationConst.h src/pmemd/src/cuda/base_simulationConst.h
index 6d41adc..0e3caa6 100644
--- src/pmemd/src/cuda/base_simulationConst.h
+++ src/pmemd/src/cuda/base_simulationConst.h
@@ -602,6 +602,45 @@ public:
   int* pShuttleTickets;    // List of atoms that will get tickets on the shuttle to and
                            //   from the CPU.
 
+  //NEB info
+  int neb_nbead;                       // Total number of NEB replicas.
+  int beadid;                          // NEB replica ID.
+  int nattgtrms;                       // Number of atoms used for ENB force calculation.
+  int nattgtfit;                       // Number of atoms used for NEB structure fitting.
+  int last_neb_atm;                    // Last atom involved in NEB force calculation or
+                                       //   structure fitting.
+  int skmin;                           // Spring constants for NEB.
+  int skmax;                           // Spring constants for NEB.
+  int tmode;                           // Tangent mode for NEB.
+  int* pRMSMask;                       // Mask of atoms used for ENB force calculation.
+  int* pFitMask;                       // Mask of atoms used for NEB structure fitting.
+  int* pAtmIdx;
+  PMEDouble* pNEBEnergyAll;            // Energy of the replicas in NEB.
+  double* pTangents;                   // Tangents to the path.
+  double* pSpringForce;                // Artificial spring forces.
+  double* pNEBForce;                   // NEB forces.
+  double* pNextDataShuttle;            // Next neighboring replica's coordinate.
+  double* pPrevDataShuttle;            // Previous neighboring replica's coordinate.
+  double* pKabschCOM;
+  double* pPrevKabsch;
+  double* pNextKabsch;
+  double* pSelfCOM;
+  double* pNextCOM;
+  double* pPrevCOM;
+  double* pKabsch2;
+  double* pPrevKabsch2;
+  double* pNextKabsch2;
+  double* pDataSPR;
+  double* ptotFitMass;                 // Pointer to total mass of atoms in RMSFit array.
+  double* pNorm;
+  double* pDotProduct1;
+  double* pDotProduct2;
+  double* pSpring1;
+  double* pSpring2;
+  double* pRotAtm;
+  double* pPrevRotAtm;
+  double* pNextRotAtm;
+
   // Direct space decomposition
   uint2* pNLNonbondCellStartEnd;           // Nonbond cell boundaries pointer
   int  srTableLength;                      // Length of the list describing the reciprocal
diff --git src/pmemd/src/cuda/gpu.cpp src/pmemd/src/cuda/gpu.cpp
index 38062e9..cc213af 100644
--- src/pmemd/src/cuda/gpu.cpp
+++ src/pmemd/src/cuda/gpu.cpp
@@ -1098,6 +1098,213 @@ extern "C" void gpu_upload_gamma_ln_(double* gamma_ln)
 }
 
 //---------------------------------------------------------------------------------------------
+// gpu_upload_neb_info_: Upload nattgtrms, nattgtfit, rmsmask, fitmask for neb calculation.
+//
+// Arguments:
+//   neb_nbead:    number of NEB replicas.
+//   beadid:       NEB replica ID.
+//   nattgtrms:    number of atoms in rmsmask.
+//   nattgtfit:    number of atoms in fitmask.
+//   last_neb_atm: last atom included in either rmsmask or fitmask.
+//   skmin:        NEB min spring constant.
+//   skmax:        NEB max spring constant.
+//   tmode:        NEB tangent mode.
+//   rmsmask:      atom selection mask used for NEB force application.
+//   fitmask:      atom selection mask used for NEB structure fitting.
+//   idx_mask:     combined ordered rmsmask and fitmask atoms.
+//---------------------------------------------------------------------------------------------
+extern "C" void gpu_upload_neb_info_(int* neb_nbead, int* beadid, int* nattgtrms, int* nattgtfit,
+                                     int* last_neb_atm, int* skmin, int* skmax, int* tmode,
+                                     int rmsmask[], int fitmask[], int idx_mask[])
+{
+    PRINTMETHOD("gpu_upload_neb_info");
+    gpu->sim.neb_nbead                  = *neb_nbead;
+    gpu->sim.beadid                     = *beadid;
+    gpu->sim.nattgtrms                  = *nattgtrms;
+    gpu->sim.nattgtfit                  = *nattgtfit;
+    gpu->sim.last_neb_atm               = *last_neb_atm;
+    gpu->sim.skmin                      = *skmin;
+    gpu->sim.skmax                      = *skmax;
+    gpu->sim.tmode                      = *tmode;
+
+    delete gpu->pbRMSMask;
+    delete gpu->pbFitMask;
+    delete gpu->pbAtmIdx;
+    int RMSStride = ((gpu->sim.nattgtrms + gpu->sim.grid - 1) >>
+                        gpu->sim.gridBits) << gpu->sim.gridBits;
+    int FitStride = ((gpu->sim.nattgtfit + gpu->sim.grid - 1) >>
+                        gpu->sim.gridBits) << gpu->sim.gridBits;
+    gpu->pbRMSMask = new GpuBuffer<int>(RMSStride);
+    gpu->pbFitMask = new GpuBuffer<int>(FitStride);
+    gpu->pbAtmIdx = new GpuBuffer<int>(get_shuttle_size(1, sizeof(int),
+                                                   gpu->sim.nShuttle));
+
+    gpu->sim.pRMSMask = gpu->pbRMSMask->_pDevData;
+    gpu->sim.pFitMask = gpu->pbFitMask->_pDevData;
+    gpu->sim.pAtmIdx = gpu->pbAtmIdx->_pDevData;
+
+    for (int i = 0; i < gpu->sim.nattgtrms; i++){
+      gpu->pbRMSMask->_pSysData[i] = rmsmask[i] - 1;
+    }
+
+    for (int i = 0; i < gpu->sim.nattgtfit; i++){
+      gpu->pbFitMask->_pSysData[i] = fitmask[i] - 1;
+    }
+
+    for (int i = 0; i < gpu->sim.nShuttle; i++){
+      gpu->pbAtmIdx->_pSysData[i] = idx_mask[i];
+    }
+
+    gpu->pbRMSMask->Upload();
+    gpu->pbFitMask->Upload();
+    gpu->pbAtmIdx->Upload();
+    gpuCopyConstants();
+}
+
+//---------------------------------------------------------------------------------------------
+// gpu_setup_neb_: Setup for NEB, called from pmemd.F90.
+//---------------------------------------------------------------------------------------------
+extern "C" void gpu_setup_neb_()
+{
+    PRINTMETHOD("gpu_setup_neb");
+    delete gpu->pbNEBEnergyAll;
+    delete gpu->pbTangents;
+    delete gpu->pbSpringForce;
+    delete gpu->pbNEBForce;
+    delete gpu->pbNextDataShuttle;
+    delete gpu->pbPrevDataShuttle;
+    delete gpu->pbKabschCOM;
+    delete gpu->pbDataSPR;
+    delete gpu->pbRotAtm;
+    delete gpu->pbtotFitMass;
+
+    int NEBStride = ((gpu->sim.neb_nbead + gpu->sim.grid - 1) >>
+                        gpu->sim.gridBits) << gpu->sim.gridBits;
+    gpu->pbNEBEnergyAll = new GpuBuffer<PMEDouble>(NEBStride);
+    gpu->pbTangents = new GpuBuffer<double>(get_shuttle_size(3, sizeof(double),
+                                                   gpu->sim.nShuttle));
+    gpu->pbSpringForce = new GpuBuffer<double>(get_shuttle_size(3, sizeof(double),
+                                                   gpu->sim.nShuttle));
+    gpu->pbNEBForce = new GpuBuffer<double>(get_shuttle_size(3, sizeof(double),
+                                                   gpu->sim.nShuttle));
+    if (gpu->bCanMapHostMemory) {
+      gpu->pbNextDataShuttle = new GpuBuffer<double>(get_shuttle_size(3, sizeof(double),
+                                                   gpu->sim.nShuttle), false, true);
+      gpu->pbPrevDataShuttle = new GpuBuffer<double>(get_shuttle_size(3, sizeof(double),
+                                                   gpu->sim.nShuttle), false, true);
+    }
+    else {
+      gpu->pbNextDataShuttle = new GpuBuffer<double>(get_shuttle_size(3, sizeof(double),
+                                                   gpu->sim.nShuttle));
+      gpu->pbPrevDataShuttle = new GpuBuffer<double>(get_shuttle_size(3, sizeof(double),
+                                                   gpu->sim.nShuttle));
+    }
+    gpu->pbKabschCOM = new GpuBuffer<double>(27);
+    if (gpu->bCanMapHostMemory) {
+      gpu->pbDataSPR = new GpuBuffer<double>(5, false, true);
+    }
+    else {
+      gpu->pbDataSPR = new GpuBuffer<double>(5);
+    }
+    if (gpu->bCanMapHostMemory) {
+      gpu->pbRotAtm = new GpuBuffer<double>(9, false, true);
+    }
+    else {
+      gpu->pbRotAtm = new GpuBuffer<double>(9);
+    }
+    gpu->pbtotFitMass = new GpuBuffer<double>(1);
+
+    gpu->sim.pNEBEnergyAll = gpu->pbNEBEnergyAll->_pDevData;
+    gpu->sim.pTangents = gpu->pbTangents->_pDevData;
+    gpu->sim.pSpringForce = gpu->pbSpringForce->_pDevData;
+    gpu->sim.pNEBForce = gpu->pbNEBForce->_pDevData;
+    gpu->sim.pNextDataShuttle = gpu->pbNextDataShuttle->_pDevData;
+    gpu->sim.pPrevDataShuttle = gpu->pbPrevDataShuttle->_pDevData;
+    gpu->sim.pKabschCOM = gpu->pbKabschCOM->_pDevData;
+    gpu->sim.pPrevKabsch = gpu->pbKabschCOM->_pDevData + 0;
+    gpu->sim.pNextKabsch = gpu->pbKabschCOM->_pDevData + 9;
+    gpu->sim.pSelfCOM = gpu->pbKabschCOM->_pDevData + 18;
+    gpu->sim.pPrevCOM = gpu->pbKabschCOM->_pDevData + 21;
+    gpu->sim.pNextCOM = gpu->pbKabschCOM->_pDevData + 24;
+    gpu->sim.pDataSPR = gpu->pbDataSPR->_pDevData;
+    gpu->sim.pNorm = gpu->pbDataSPR->_pDevData + 0;
+    gpu->sim.pDotProduct1 = gpu->pbDataSPR->_pDevData + 1;
+    gpu->sim.pDotProduct2 = gpu->pbDataSPR->_pDevData + 2;
+    gpu->sim.pSpring1 = gpu->pbDataSPR->_pDevData + 3;
+    gpu->sim.pSpring2 = gpu->pbDataSPR->_pDevData + 4;
+    gpu->sim.ptotFitMass = gpu->pbtotFitMass->_pDevData;
+    gpu->sim.pRotAtm = gpu->pbRotAtm->_pDevData;
+
+    gpuCopyConstants();
+}
+
+//--------------------------------------------------------------------------------------------
+// gpu_neb_exchange_crd_: NEB routine for exchanging the neighboring replicas coordinates
+//                        through the shuttle transfer.
+//--------------------------------------------------------------------------------------------
+extern "C" void gpu_neb_exchange_crd_()
+{
+  PRINTMETHOD("gpu_neb_exchange_crd");
+  int buff_size = get_shuttle_size(3, sizeof(double),gpu->sim.nShuttle);
+  kNEBSendRecv(gpu, buff_size);
+}
+
+//--------------------------------------------------------------------------------------------
+// gpu_report_neb_ene_: NEB routine for the replicas' energy print-out.
+//
+// Arguments:
+//   master_size:
+//   neb_nrg_all:
+//--------------------------------------------------------------------------------------------
+extern "C" void gpu_report_neb_ene_(int* master_size, double neb_nrg_all[])
+{
+  PRINTMETHOD("gpu_report_neb_ene");
+  NEB_report_energy(gpu, master_size, neb_nrg_all);
+}
+
+//--------------------------------------------------------------------------------------------
+// gpu_set_neb_springs_: NEB routine for setting the spring constants.
+//--------------------------------------------------------------------------------------------
+extern "C" void gpu_set_neb_springs_()
+{
+  PRINTMETHOD("gpu_set_neb_springs");
+  kNEBspr(gpu);
+}
+
+//--------------------------------------------------------------------------------------------
+// gpu_calculate_neb_frc_: Routine for calculating the NEB forces every nebfreq steps.
+//
+// Arguments:
+//   neb_force: NEB forces.
+//--------------------------------------------------------------------------------------------
+extern "C" void gpu_calculate_neb_frc_(double neb_force[][3])
+{
+  PRINTMETHOD("gpu_calculate_neb_frc");
+  kNEBfrc(gpu, neb_force);
+}
+
+//--------------------------------------------------------------------------------------------
+// gpu_calculate_neb_frc_nstep_: Routine for calculating the NEB forces at the off steps.
+//
+// Arguments:
+//   neb_force: NEB forces.
+//--------------------------------------------------------------------------------------------
+extern "C" void gpu_calculate_neb_frc_nstep_(double neb_force[][3])
+{
+  PRINTMETHOD("gpu_calculate_neb_frc_nstep");
+  kNEBfrc_nstep(gpu, neb_force);
+}
+
+//--------------------------------------------------------------------------------------------
+// gpu_neb_rmsfit_: Routine for calculating the NEB structure fitting.
+//--------------------------------------------------------------------------------------------
+extern "C" void gpu_neb_rmsfit_()
+{
+  PRINTMETHOD("gpu_neb_rmsfit");
+  kFitCOM(gpu);
+}
+
+//---------------------------------------------------------------------------------------------
 // gpu_setup_shuttle_info_: Upload atom_list and nshatom for partial coordinates download.
 //
 // Arguments:
@@ -4399,6 +4606,7 @@ extern "C" void gpuCopyConstants()
 #else
   SetkForcesUpdateSim(gpu);
   SetkDataTransferSim(gpu);
+  SetkCalculateNEBForcesSim(gpu);
   SetkShakeSim(gpu);
   SetkCalculateLocalForcesSim(gpu);
   if (gpu->ntb == 0) {
@@ -7296,7 +7504,8 @@ extern "C" void gpu_allreduce(GpuBuffer<PMEAccumulator>* pbBuff, int size)
 //---------------------------------------------------------------------------------------------
 extern "C" void gpu_pme_ene_(double* ewaldcof, double* vol, pme_pot_ene_rec* pEnergy,
                              double enmr[3], double virial[3], double ekcmt[3], int* nstep,
-                             double* dt, double atm_crd[][3], double atm_frc[][3])
+                             double* dt, double atm_crd[][3], double atm_frc[][3],
+                             int* ineb, int* nebfreq)
 {
   PRINTMETHOD("gpu_pme_ene");
 
@@ -7431,6 +7640,13 @@ extern "C" void gpu_pme_ene_(double* ewaldcof, double* vol, pme_pot_ene_rec* pEn
   pEnergy->total                 += gpu->vdw_recip + gpu->self_energy;
 #ifdef MPI
   }
+//NEB
+  if ((*ineb > 0) && (*nstep%*nebfreq == 0)) {
+    MPI_Allgather(&pEnergy->total, 1, MPI_PMEDOUBLE, gpu->pbNEBEnergyAll->_pSysData,
+                                   1, MPI_PMEDOUBLE, MPI_COMM_WORLD);
+    gpu->pbNEBEnergyAll->Upload();
+  }
+
   if (gpu->gpuID == 0) {
 #endif
   pEnergy->vdw_tot = energy[1] + gpu->vdw_recip;
@@ -9514,7 +9730,7 @@ void gpu_gather_gb_forces()
 //   pEnergy:   record to store GB simulation energies
 //   enmr:      NMR restraints energy (three components again)
 //---------------------------------------------------------------------------------------------
-extern "C" void gpu_gb_ene_(gb_pot_ene_rec* pEnergy, double enmr[3])
+extern "C" void gpu_gb_ene_(gb_pot_ene_rec* pEnergy, double enmr[3], int* ineb)
 {
   PRINTMETHOD("gpu_gb_ene"); 
   kClearForces(gpu);   
@@ -9569,6 +9785,15 @@ extern "C" void gpu_gb_ene_(gb_pot_ene_rec* pEnergy, double enmr[3])
     }
     pEnergy->total += energy[i];
   }
+//NEB
+#ifdef MPI
+  if ((*ineb > 0)) {
+      MPI_Allgather(&pEnergy->total, 1, MPI_PMEDOUBLE, gpu->pbNEBEnergyAll->_pSysData,
+                    1, MPI_PMEDOUBLE, MPI_COMM_WORLD);
+      gpu->pbNEBEnergyAll->Upload();
+  }
+#endif
+
   pEnergy->vdw_tot = energy[1];
   pEnergy->elec_tot = energy[0];
   pEnergy->gb = energy[2];
diff --git src/pmemd/src/cuda/gpu.h src/pmemd/src/cuda/gpu.h
index 93c5914..920fca3 100644
--- src/pmemd/src/cuda/gpu.h
+++ src/pmemd/src/cuda/gpu.h
@@ -28,6 +28,12 @@ extern "C" void gpu_upload_gamma_ln_(double* gamma_ln);
 extern "C" void gpu_setup_shuttle_info_(int* nshatom, int* infocode, int atom_list[]);
 extern "C" void gpu_shuttle_retrieve_data_(double atm_data[][3], int* flag);
 extern "C" void gpu_shuttle_post_data_(double atm_data[][3], int* flag);
+extern "C" void gpu_neb_exchange_crd_();
+extern "C" void gpu_report_neb_ene_(int* master_size, double neb_nrg_all[]);
+extern "C" void gpu_set_neb_springs_();
+extern "C" void gpu_calculate_neb_frc_(double neb_force[][3]);
+extern "C" void gpu_calculate_neb_frc_nstep_(double neb_force[][3]);
+extern "C" void gpu_neb_rmsfit_();
 extern "C" void gpu_upload_crd_(double atm_crd[][3]);
 extern "C" void gpu_upload_crd_gb_cph_(double atm_crd[][3]);
 extern "C" void gpu_download_crd_(double atm_crd[][3]);
@@ -145,6 +151,8 @@ extern "C" void SetkForcesUpdateSim(gpuContext gpu);
 extern "C" void GetkForcesUpdateSim(gpuContext gpu);
 extern "C" void SetkDataTransferSim(gpuContext gpu);
 extern "C" void GetkDataTransferSim(gpuContext gpu);
+extern "C" void SetkCalculateNEBForcesSim(gpuContext gpu);
+extern "C" void GetkCalculateNEBForcesSim(gpuContext gpu);
 extern "C" void SetkCalculateEFieldEnergySim(gpuContext gpu);
 extern "C" void GetkCalculateEFieldEnergySim(gpuContext gpu);
 extern "C" void SetkCalculateLocalForcesSim(gpuContext gpu);
@@ -276,6 +284,13 @@ extern "C" void kAFEExchangeCrd(gpuContext gpu);
 extern "C" void kRetrieveSimData(gpuContext gpu, double atm_data[][3], int modifier);
 extern "C" void kPostSimData(gpuContext gpu, double atm_data[][3], int modifier);
 extern "C" void DownloadAllCoord(gpuContext gpu, double atm_crd[][3]);
+//NEB
+extern "C" void kNEBSendRecv(gpuContext gpu, int buff_size);
+extern "C" void NEB_report_energy(gpuContext gpu, int* master_size, double neb_nrg_all[]);
+extern "C" void kNEBspr(gpuContext gpu);
+extern "C" void kNEBfrc(gpuContext gpu, double neb_force[][3]);
+extern "C" void kNEBfrc_nstep(gpuContext gpu, double neb_force[][3]);
+extern "C" void kFitCOM(gpuContext gpu);
 
   #ifdef MPI
 extern "C" void kCopyToAccumulator(gpuContext gpu, PMEAccumulator* p1, PMEAccumulator* p2,
diff --git src/pmemd/src/cuda/gti_cuda.cu src/pmemd/src/cuda/gti_cuda.cu
index dd1da9d..85e53e7 100644
--- src/pmemd/src/cuda/gti_cuda.cu
+++ src/pmemd/src/cuda/gti_cuda.cu
@@ -20,6 +20,7 @@ void ik_updateAllSimulationConst(gpuContext gpu) {
   // Original pmemd cu units
   SetkForcesUpdateSim(gpu);
   SetkDataTransferSim(gpu);
+  SetkCalculateNEBForcesSim(gpu);
   SetkShakeSim(gpu);
   SetkCalculateLocalForcesSim(gpu);
   if (gpu->ntb == 0) {
diff --git src/pmemd/src/cuda/gti_f95.cpp src/pmemd/src/cuda/gti_f95.cpp
index 80855da..1696a5a 100644
--- src/pmemd/src/cuda/gti_f95.cpp
+++ src/pmemd/src/cuda/gti_f95.cpp
@@ -628,7 +628,7 @@ extern "C" void gti_others_(bool* needEnergy, int* nstep, double* dt)
 //   ekcmt:    
 //---------------------------------------------------------------------------------------------
 extern "C" void gti_update_md_ene_(pme_pot_ene_rec* pEnergy, double enmr[3], double virial[3],
-                                   double ekcmt[3])
+                                   double ekcmt[3], int* ineb, int* nebfreq, int* nstep)
 {
   gpuContext gpu = theGPUContext::GetPointer();
   double energy[ENERGY_TERMS];
@@ -673,6 +673,14 @@ extern "C" void gti_update_md_ene_(pme_pot_ene_rec* pEnergy, double enmr[3], dou
     pEnergy->cmap;
 
   if (masterNode) pEnergy->total += (pEnergy->vdw_recip + pEnergy->elec_self);
+//NEB
+#ifdef MPI
+  if ((*ineb > 0) && masterNode && (*nstep%*nebfreq ==0)) {
+    MPI_Allgather(&pEnergy->total, 1, MPI_PMEDOUBLE, gpu->pbNEBEnergyAll->_pSysData,
+                                   1, MPI_PMEDOUBLE, MPI_COMM_WORLD);
+    gpu->pbNEBEnergyAll->Upload();
+  }
+#endif
 
   // Grab virial if needed
   if ((gpu->sim.ntp > 0) && (gpu->sim.barostat == 1)) {
diff --git src/pmemd/src/cuda/gti_f95.h src/pmemd/src/cuda/gti_f95.h
index c7c8415..55f98ea 100644
--- src/pmemd/src/cuda/gti_f95.h
+++ src/pmemd/src/cuda/gti_f95.h
@@ -26,7 +26,7 @@ extern "C" void gti_build_nl_list_(int* ti_moode, int* lj1264);
 extern "C" void gti_clear_(bool* need_pot_enes, int* ti_mode, double atm_crd[][3]);
 
 extern "C" void gti_update_md_ene_(pme_pot_ene_rec* pEnergy, double enmr[3], double virial[3],
-                                   double ekcmt[3]);
+                                   double ekcmt[3], int* ineb, int* nebfreq, int* nstep);
 
 extern "C" void gti_ele_recip_(bool* needPotEnergy, bool* need_virials, double* vol,
                                double ti_weights[2], int* ti_mode);
diff --git src/pmemd/src/cuda/kCalculateNEBForces.cu src/pmemd/src/cuda/kCalculateNEBForces.cu
new file mode 100644
index 0000000..39eec78
--- /dev/null
+++ src/pmemd/src/cuda/kCalculateNEBForces.cu
@@ -0,0 +1,692 @@
+#include "copyright.i"
+
+//---------------------------------------------------------------------------------------------
+// AMBER NVIDIA CUDA GPU IMPLEMENTATION: PMEMD VERSION
+//
+// July 2017, by Scott Le Grand, David S. Cerutti, Daniel J. Mermelstein, Charles Lin, and
+//               Ross C. Walker
+//---------------------------------------------------------------------------------------------
+#include <cuda.h>
+#include "gpu.h"
+#include "ptxmacros.h"
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+// Use global instance instead of a local copy
+#include "simulationConst.h"
+CSIM_STO simulationConst cSim;
+
+bool first = true;
+//---------------------------------------------------------------------------------------------
+// SetkCalculateNEBForcesSim: called by gpuCopyConstants to orient the instance of cSim in this
+//                     library
+//
+// Arguments:
+//   gpu: overarching type for storing all parameters, coordinates, and the energy function
+//---------------------------------------------------------------------------------------------
+void SetkCalculateNEBForcesSim(gpuContext gpu)
+{
+  cudaError_t status;
+  status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(simulationConst));
+  RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+//---------------------------------------------------------------------------------------------
+// GetkCalculateNEBForcesSim: download information about the CUDA simulation data struct from the
+//                     device.
+//
+// Arguments:
+//   gpu: overarching type for storing all parameters, coordinates, and the energy function
+//
+// This is a debugging function.
+//---------------------------------------------------------------------------------------------
+void GetkCalculateNEBForcesSim(gpuContext gpu)
+{
+  cudaError_t status;
+  status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(simulationConst));
+  RTERROR(status, "cudaMemcpyToSymbol: SetSim copy to cSim failed");
+}
+
+//----------------------------------------------------------------------------------------------
+// Shuffle Warp Reduce: https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
+//----------------------------------------------------------------------------------------------
+__inline__ __device__ double warpReduceSum(double val) {
+  for (int offset = GRID/2; offset > 0; offset /= 2) 
+    val += __shfl_down(val, offset);
+  return val;
+}
+
+//----------------------------------------------------------------------------------------------
+// atomicAdd function for double data types
+//----------------------------------------------------------------------------------------------
+//#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+#if (__CUDA_ARCH__ < 600) || (__CUDACC_VER_MAJOR__ < 8)
+__device__ void voidatomicAdd(double* ptr, double val) {
+  unsigned long long int* ulli_ptr = (unsigned long long int*)ptr;
+  unsigned long long int old = *ulli_ptr;
+  unsigned long long int assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS(ulli_ptr, assumed,  __double_as_longlong(val + __longlong_as_double(assumed)));
+  } while (old != assumed);
+}
+#endif
+
+//---------------------------------------------------------------------------------------------
+// dgeev from lapack for computing eigenvalues and left/right enigenvectors of a square matrix
+//---------------------------------------------------------------------------------------------
+extern "C" {
+extern int dgeev_(char*,char*,int*,double*,int*,double*, double*, double*, int*, double*, int*, double*, int*, int*);
+}
+
+//---------------------------------------------------------------------------------------------
+// Kernel for clearing the NEB buffers at each step
+//---------------------------------------------------------------------------------------------
+
+//---------------------------------------------------------------------------------------------
+// Kernel for shuttling the coordinates for sending and receiving between the neighboring 
+// replicas in NEB.
+//---------------------------------------------------------------------------------------------
+
+#define TAKE_IMAGE
+#define PULL_COORD
+__global__ void kRetrieveCoordNL_kernel()
+#include "kShuttle.h"
+#undef PULL_COORD
+
+#define PULL_FORCE
+__global__ void kRetrieveForceNL_kernel()
+#include "kShuttle.h"
+#undef PULL_FORCE
+#undef TAKE_IMAGE
+
+#define PULL_COORD
+__global__ void kRetrieveCoord_kernel()
+#include "kShuttle.h"
+#undef PULL_COORD
+
+#define PULL_FORCE
+__global__ void kRetrieveForce_kernel()
+#include "kShuttle.h"
+#undef PULL_FORCE
+
+//----------------------------------------------------------------------------------------------
+// kernels for RMSfit calculations.
+//----------------------------------------------------------------------------------------------
+
+#define NEIGHBOR_LIST
+#define MASS_CALC
+__global__ void kTotMassFitNL_kernel()
+#include "kRMSFit.h"
+#undef MASS_CALC
+
+#define CALC_COM
+__global__ void kCALCCOMNL_kernel()
+#include "kRMSFit.h"
+#undef CALC_COM
+
+#define KABSCH
+#define NEXT_NEIGHBOR
+__global__ void kKabschNextNL_kernel()
+#include "kRMSFit.h"
+#undef NEXT_NEIGHBOR
+
+#define PREV_NEIGHBOR
+__global__ void kKabschPrevNL_kernel()
+#include "kRMSFit.h"
+#undef PREV_NEIGHBOR
+#undef KABSCH
+#undef NEIGHBOR_LIST
+
+#define MASS_CALC
+__global__ void kTotMassFit_kernel()
+#include "kRMSFit.h"
+#undef MASS_CALC
+
+#define CALC_COM
+__global__ void kCALCCOM_kernel()
+#include "kRMSFit.h"
+#undef CALC_COM
+
+#define CENTER_DATA
+__global__ void kCenterData_kernel()
+#include "kRMSFit.h"
+#undef CENTER_DATA
+
+#define RECENTER_DATA
+__global__ void kReCenterData_kernel()
+#include "kRMSFit.h"
+#undef RECENTER_DATA
+
+#define ROTATE_NEIGHBOR
+#define NEXT_NEIGHBOR
+__global__ void kRotateNext_kernel()
+#include "kRMSFit.h"
+#undef NEXT_NEIGHBOR
+
+#define PREV_NEIGHBOR
+__global__ void kRotatePrev_kernel()
+#include "kRMSFit.h"
+#undef PREV_NEIGHBOR
+#undef ROTATE_NEIGHBOR
+
+#define KABSCH
+#define NEXT_NEIGHBOR
+__global__ void kKabschNext_kernel()
+#include "kRMSFit.h"
+#undef NEXT_NEIGHBOR
+
+#define PREV_NEIGHBOR
+__global__ void kKabschPrev_kernel()
+#include "kRMSFit.h"
+#undef PREV_NEIGHBOR
+#undef KABSCH
+
+//----------------------------------------------------------------------------------------------
+// Kernel for calculating NEB forces.
+//----------------------------------------------------------------------------------------------
+
+#define REVISED_TAN
+#define FIRST
+__global__ void kNEBRevisedTanFIRST_kernel()
+#include "kNEB.h"
+#undef FIRST
+
+#define SECOND
+__global__ void kNEBRevisedTanSECOND_kernel()
+#include "kNEB.h"
+#undef SECOND
+
+#define THIRD
+__global__ void kNEBRevisedTanTHIRD_kernel()
+#include "kNEB.h"
+#undef THIRD
+
+#define FOURTH
+__global__ void kNEBRevisedTanFOURTH_kernel()
+#include "kNEB.h"
+#undef FOURTH
+#undef REVISED_TAN
+
+#define BASIC_TAN
+__global__ void kNEBBasicTan_kernel()
+#include "kNEB.h"
+#undef BASIC_TAN
+
+#define NORMALIZE
+__global__ void kNormalizeTan_kernel()
+#include "kNEB.h"
+#undef NORMALIZE
+
+#define NEIGHBOR_LIST
+#define NEB_FRC
+__global__ void kNEBFRCNL_kernel()
+#include "kNEB.h"
+#undef NEB_FRC
+#undef NEIGHBOR_LIST
+
+#define NEB_FRC
+__global__ void kNEBFRC_kernel()
+#include "kNEB.h"
+#undef NEB_FRC
+
+#define REDUCE_TAN
+__global__ void kNormalize_kernel()
+#include "kNorm.h"
+#undef REDUCE_TAN
+
+#define REDUCE_DOTPROD
+__global__ void kDotProduct_kernel()
+#include "kNorm.h"
+#undef REDUCE_DOTPROD
+
+//-----------------------------------------------------------------------------------------------
+extern "C" void kNEBSendRecv(gpuContext gpu, int buff_size)
+{
+#ifdef MPI
+  if (gpu->sim.ShuttleType == 0) {
+    int next_node = gpu->sim.beadid;  //replica id, goes from 0 to neb_nbead-1
+    if (next_node >= gpu->sim.neb_nbead) {
+      next_node = 0;
+    }
+    int prev_node = gpu->sim.beadid - 2;
+    if (prev_node < 0) {
+      prev_node = gpu->sim.neb_nbead -1;
+    }
+
+    if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) {
+      kRetrieveCoordNL_kernel<<<gpu->blocks, 96>>>();
+    }
+    else {
+      kRetrieveCoord_kernel<<<gpu->blocks, 96>>>();
+    }
+    cudaDeviceSynchronize();
+    // Download from the device
+    if (!(gpu->bCanMapHostMemory)) {
+      gpu->pbDataShuttle->Download();
+    }
+    int tag1 = 101010;
+    int tag2 = 202020;
+    MPI_Status status;
+    if ( (gpu->sim.beadid & 1) == 0) {
+      MPI_Sendrecv(gpu->pbDataShuttle->_pSysData, buff_size, MPI_DOUBLE, next_node, tag1,
+                   gpu->pbNextDataShuttle->_pSysData, buff_size, MPI_DOUBLE, next_node, tag1,
+                   MPI_COMM_WORLD, &status);
+    }
+    else {
+      MPI_Sendrecv(gpu->pbDataShuttle->_pSysData, buff_size, MPI_DOUBLE, prev_node, tag1,
+                   gpu->pbPrevDataShuttle->_pSysData, buff_size, MPI_DOUBLE, prev_node, tag1,
+                   MPI_COMM_WORLD, &status);
+    }
+    if ( (gpu->sim.beadid & 1) == 0) {
+      MPI_Sendrecv(gpu->pbDataShuttle->_pSysData, buff_size, MPI_DOUBLE, prev_node, tag2,
+                   gpu->pbPrevDataShuttle->_pSysData, buff_size, MPI_DOUBLE, prev_node, tag2,
+                   MPI_COMM_WORLD, &status);
+    }
+    else {
+      MPI_Sendrecv(gpu->pbDataShuttle->_pSysData, buff_size, MPI_DOUBLE, next_node, tag2,
+                   gpu->pbNextDataShuttle->_pSysData, buff_size, MPI_DOUBLE, next_node, tag2,
+                   MPI_COMM_WORLD, &status);
+    }
+
+    //Upload to the device
+    if (!(gpu->bCanMapHostMemory)) {
+      gpu->pbNextDataShuttle->Upload();
+      gpu->pbPrevDataShuttle->Upload();
+    }
+  }
+  else if (gpu->sim.ShuttleType == 1) {
+    // Print an error for now
+    printf("| Error: this functionality (ShuttleType == 1) is not yet available.\n");
+  }
+#endif
+}
+
+extern "C" void NEB_report_energy(gpuContext gpu, int* master_size, double neb_nrg_all[])
+{
+    //this is needed for energy output
+    // at this point the pbNEBEnergyAll is filled with the replica's energies
+    for (int i = 0; i < *master_size; i++) {
+      neb_nrg_all[i] = gpu->pbNEBEnergyAll->_pSysData[i];
+    }
+}
+
+// Set the springs:
+// at this point the pbNEBEnergyAll is filled with the replica's energies
+extern "C" void kNEBspr(gpuContext gpu)
+{
+    PMEDouble ezero = MAX(gpu->pbNEBEnergyAll->_pSysData[0],
+                       gpu->pbNEBEnergyAll->_pSysData[gpu->sim.neb_nbead-1]);
+    PMEDouble emax = gpu->pbNEBEnergyAll->_pSysData[0];
+    for (int rep = 1; rep < gpu->sim.neb_nbead; rep++) {
+      emax = MAX(emax, gpu->pbNEBEnergyAll->_pSysData[rep]);
+    }
+
+    for (int i = 0; i < 5; i++) {
+      gpu->pbDataSPR->_pSysData[i] = 0.0;
+    }
+
+    if (gpu->sim.skmin == gpu->sim.skmax) {
+      gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax;
+    }
+    else if ((gpu->pbNEBEnergyAll->_pSysData[1] > ezero) && (emax != ezero)) {
+      gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax - gpu->sim.skmin * (emax - 
+         MAX(gpu->pbNEBEnergyAll->_pSysData[0], gpu->pbNEBEnergyAll->_pSysData[1])/(emax - ezero));
+    }
+    else {
+      gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax - gpu->sim.skmin;
+    }
+    int rep = gpu->sim.beadid - 1;  //replica id, goes from 0 to neb_nbead-1
+    if ((rep == 0) || (rep == (gpu->sim.neb_nbead - 1))){
+      goto exit3;
+    }
+    gpu->pbDataSPR->_pSysData[3] = gpu->pbDataSPR->_pSysData[4];
+    if (gpu->sim.skmin == gpu->sim.skmax) {
+      gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax;
+    }
+    else if ((gpu->pbNEBEnergyAll->_pSysData[rep + 1] > ezero) && (emax != ezero)) {
+      gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax - gpu->sim.skmin * (emax -
+         MAX(gpu->pbNEBEnergyAll->_pSysData[rep - 1], gpu->pbNEBEnergyAll->_pSysData[rep])/(emax - ezero));
+    }
+    else {
+      gpu->pbDataSPR->_pSysData[4] = gpu->sim.skmax - gpu->sim.skmin;
+    }
+
+    if (!(gpu->bCanMapHostMemory)) {
+      gpu->pbDataSPR->Upload();
+    }
+
+    exit3: {}
+}
+
+extern "C" void kFitCOM(gpuContext gpu)
+{
+
+  double det;
+  double small;
+  double b[9];
+  double norm;
+  int ismall;
+
+  //vars needed for lapack diagonalization
+  double Kabsch2[9], eigReal[3], eigImag[3];
+  double vl[1]; 
+  double vr[9];
+  int info;
+  int ldvl = 1, ldvr = 3, n = 3;
+  const int worksize = 48;
+  int lwork = worksize;
+  double work[worksize];
+  char Nchar='N', Vchar='V';
+
+  if (first) {
+    // Calculate total mass of the fit region
+    if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) {
+     kTotMassFitNL_kernel<<<gpu->blocks, gpu->threadsPerBlock/4>>>();
+      LAUNCHERROR("kFitCOM");
+    }
+    else {
+      kTotMassFit_kernel<<<gpu->blocks, gpu->threadsPerBlock/4>>>();
+      LAUNCHERROR("kFitCOM");
+    }
+    first = false;
+  }
+
+  cudaDeviceSynchronize();
+
+    for (int i = 0; i < 27; i++) {
+      gpu->pbKabschCOM->_pSysData[i] = 0.0;
+    }
+    gpu->pbKabschCOM->Upload();
+
+    // Calculate center of mass of the self and neighboring replicas
+    if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) {
+      kCALCCOMNL_kernel<<<gpu->blocks, gpu->threadsPerBlock*3/16>>>();
+      LAUNCHERROR("kFitCOM");
+    }
+    else {
+      kCALCCOM_kernel<<<gpu->blocks, gpu->threadsPerBlock*3/16>>>();
+      LAUNCHERROR("kFitCOM");
+    }
+
+    // Center atoms wrt COM of fit regions.
+    kCenterData_kernel<<<gpu->blocks, gpu->threadsPerBlock/4>>>();
+    LAUNCHERROR("kFitCOM");
+
+
+    // Calculate Kabsch matrix
+    if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) {
+      kKabschPrevNL_kernel<<<gpu->blocks, gpu->threadsPerBlock*3/16>>>();
+      LAUNCHERROR("kFitCOM");
+      kKabschNextNL_kernel<<<gpu->blocks, gpu->threadsPerBlock*3/16>>>();
+      LAUNCHERROR("kFitCOM");
+    }
+    else {
+      kKabschPrev_kernel<<<gpu->blocks, gpu->threadsPerBlock*3/16>>>();
+      LAUNCHERROR("kFitCOM");
+      kKabschNext_kernel<<<gpu->blocks, gpu->threadsPerBlock*3/16>>>();
+      LAUNCHERROR("kFitCOM");
+    }
+    gpu->pbKabschCOM->Download();
+
+    det = gpu->pbKabschCOM->_pSysData[0] * gpu->pbKabschCOM->_pSysData[4] * gpu->pbKabschCOM->_pSysData[8] -
+          gpu->pbKabschCOM->_pSysData[0] * gpu->pbKabschCOM->_pSysData[5] * gpu->pbKabschCOM->_pSysData[7] -
+          gpu->pbKabschCOM->_pSysData[1] * gpu->pbKabschCOM->_pSysData[3] * gpu->pbKabschCOM->_pSysData[8] +
+          gpu->pbKabschCOM->_pSysData[1] * gpu->pbKabschCOM->_pSysData[5] * gpu->pbKabschCOM->_pSysData[6] +
+          gpu->pbKabschCOM->_pSysData[2] * gpu->pbKabschCOM->_pSysData[3] * gpu->pbKabschCOM->_pSysData[7] -
+          gpu->pbKabschCOM->_pSysData[2] * gpu->pbKabschCOM->_pSysData[4] * gpu->pbKabschCOM->_pSysData[6];
+
+    if (abs(det) < 1.0e-07) {
+      printf("small determinant in rmsfit(): %f\n", abs(det));
+      goto exit1;
+    }
+
+    for(int i = 0; i < 9; i++) {
+      Kabsch2[i] = 0.0;
+      b[i] = 0.0;
+    }
+
+    for (int i = 0; i < 3; i++) {
+      for (int j = 0; j < 3; j++) {
+        for (int k = 0; k < 3; k++) {
+          Kabsch2[3*i+j] += gpu->pbKabschCOM->_pSysData[3*k+i]
+                          * gpu->pbKabschCOM->_pSysData[3*k+j];  
+        }
+      }
+    }
+
+    // Calculate eigenvalues using the DGEEV subroutine
+    dgeev_(&Nchar,&Vchar,&n,Kabsch2,&n,eigReal,eigImag,
+              vl,&ldvl,vr,&ldvr,work,&lwork,&info);
+
+    // Check for errors
+    if (info!=0){
+      printf("Error in diagonalization routine dgeev %d\n", info);
+      goto exit1;
+    }
+
+    // Find the smallest eigenvalue
+    small = 1.0e20;
+    for (int i = 0; i < 3; i++) {
+      if (eigReal[i] < small) {
+        ismall = i;
+        small = eigReal[i];
+      }
+    }
+
+    // Calculate the b vectors
+    for (int j = 0; j < 3; j++) {
+      norm = 1.0/sqrt(abs(eigReal[j]));
+      for (int i = 0; i < 3; i++) {
+        for (int k = 0; k < 3; k++) {
+          b[3*i+j] += gpu->pbKabschCOM->_pSysData[3*i+k] * vr[3*j+k] * norm;
+        }
+      }
+    }
+
+    CONTINUE1:
+
+    for(int i = 0; i < 9; i++) {
+      gpu->pbRotAtm->_pSysData[i] = 0.0;
+    }
+
+    for (int i = 0; i < 3; i++) {
+      for (int j = 0; j < 3; j++) {
+        for (int k = 0; k < 3; k++) {
+          gpu->pbRotAtm->_pSysData[3*i+j] += b[3*i+k] * vr[3*k+j];
+        }
+      }
+    }
+
+    det = gpu->pbRotAtm->_pSysData[0] * gpu->pbRotAtm->_pSysData[4] * gpu->pbRotAtm->_pSysData[8] -
+          gpu->pbRotAtm->_pSysData[0] * gpu->pbRotAtm->_pSysData[5] * gpu->pbRotAtm->_pSysData[7] -
+          gpu->pbRotAtm->_pSysData[1] * gpu->pbRotAtm->_pSysData[3] * gpu->pbRotAtm->_pSysData[8] +
+          gpu->pbRotAtm->_pSysData[1] * gpu->pbRotAtm->_pSysData[5] * gpu->pbRotAtm->_pSysData[6] +
+          gpu->pbRotAtm->_pSysData[2] * gpu->pbRotAtm->_pSysData[3] * gpu->pbRotAtm->_pSysData[7] -
+          gpu->pbRotAtm->_pSysData[2] * gpu->pbRotAtm->_pSysData[4] * gpu->pbRotAtm->_pSysData[6];
+
+    if (abs(det) < 1.0e-10) {
+      printf("small determinant in rmsfit(): %f\n", abs(det));
+      goto exit1;
+    }
+
+    if (det < 0) {
+      for (int i = 0; i < 3; i++) {
+        b[3*i+ismall] = -b[3*i+ismall];
+      }
+      goto CONTINUE1;
+    }
+
+    if (!(gpu->bCanMapHostMemory)) {
+      gpu->pbRotAtm->Upload();
+    }
+
+    kRotatePrev_kernel<<<gpu->blocks, gpu->threadsPerBlock/4>>>();
+    LAUNCHERROR("kFitCOM");
+
+    cudaDeviceSynchronize();
+
+    exit1: {}
+
+    det = gpu->pbKabschCOM->_pSysData[9] * gpu->pbKabschCOM->_pSysData[13] * gpu->pbKabschCOM->_pSysData[17] -
+          gpu->pbKabschCOM->_pSysData[9] * gpu->pbKabschCOM->_pSysData[14] * gpu->pbKabschCOM->_pSysData[16] -
+          gpu->pbKabschCOM->_pSysData[10] * gpu->pbKabschCOM->_pSysData[12] * gpu->pbKabschCOM->_pSysData[17] +
+          gpu->pbKabschCOM->_pSysData[10] * gpu->pbKabschCOM->_pSysData[14] * gpu->pbKabschCOM->_pSysData[15] +
+          gpu->pbKabschCOM->_pSysData[11] * gpu->pbKabschCOM->_pSysData[12] * gpu->pbKabschCOM->_pSysData[16] -
+          gpu->pbKabschCOM->_pSysData[11] * gpu->pbKabschCOM->_pSysData[13] * gpu->pbKabschCOM->_pSysData[15];
+
+    if (abs(det) < 1.0e-07) {
+      printf("small determinant in rmsfit(): %f\n", abs(det));
+      goto exit2;
+    }
+
+    for(int i = 0; i < 9; i++) {
+      Kabsch2[i] = 0.0;
+      b[i] = 0.0;
+    }
+
+    for (int i = 0; i < 3; i++) {
+      for (int j = 0; j < 3; j++) {
+        for (int k = 0; k < 3; k++) {
+          Kabsch2[3*i+j] += gpu->pbKabschCOM->_pSysData[3*k+i+9]
+                          * gpu->pbKabschCOM->_pSysData[3*k+j+9];  
+        }
+      }
+    }
+
+    // Calculate eigenvalues using the DGEEV subroutine
+    dgeev_(&Nchar,&Vchar,&n,Kabsch2,&n,eigReal,eigImag,
+              vl,&ldvl,vr,&ldvr,work,&lwork,&info);
+
+    // Check for errors
+    if (info!=0){
+      printf("Error in diagonalization routine dgeev");
+      goto exit2;
+    }
+
+    // Find the smallest eigenvalue
+    small = 1.0e20;
+    for (int i = 0; i < 3; i++) {
+      if (eigReal[i] < small) {
+        ismall = i;
+        small = eigReal[i];
+      }
+    }
+
+    // Calculate the b vectors
+    for (int j = 0; j < 3; j++) {
+      norm = 1.0/sqrt(abs(eigReal[j]));
+      for (int i = 0; i < 3; i++) {
+        for (int k = 0; k < 3; k++) {
+          b[3*i+j] += gpu->pbKabschCOM->_pSysData[3*i+k+9] * vr[3*j+k] * norm;
+        }
+      }
+    }
+
+    CONTINUE2:
+
+    for(int i = 0; i < 9; i++) {
+      gpu->pbRotAtm->_pSysData[i] = 0.0;
+    }
+
+    for (int i = 0; i < 3; i++) {
+      for (int j = 0; j < 3; j++) {
+        for (int k = 0; k < 3; k++) {
+          gpu->pbRotAtm->_pSysData[3*i+j] += b[3*i+k] * vr[3*k+j];
+        }
+      }
+    }
+
+    det = gpu->pbRotAtm->_pSysData[0] * gpu->pbRotAtm->_pSysData[4] * gpu->pbRotAtm->_pSysData[8] -
+          gpu->pbRotAtm->_pSysData[0] * gpu->pbRotAtm->_pSysData[5] * gpu->pbRotAtm->_pSysData[7] -
+          gpu->pbRotAtm->_pSysData[1] * gpu->pbRotAtm->_pSysData[3] * gpu->pbRotAtm->_pSysData[8] +
+          gpu->pbRotAtm->_pSysData[1] * gpu->pbRotAtm->_pSysData[5] * gpu->pbRotAtm->_pSysData[6] +
+          gpu->pbRotAtm->_pSysData[2] * gpu->pbRotAtm->_pSysData[3] * gpu->pbRotAtm->_pSysData[7] -
+          gpu->pbRotAtm->_pSysData[2] * gpu->pbRotAtm->_pSysData[4] * gpu->pbRotAtm->_pSysData[6];
+
+    if (abs(det) < 1.0e-10) {
+      printf("small determinant in rmsfit(): %f\n", abs(det));
+      goto exit2;
+    }
+
+    if (det < 0) {
+      for (int i = 0; i < 3; i++) {
+        b[3*i+ismall] = -b[3*i+ismall];
+      }
+      goto CONTINUE2;
+    }
+
+    if (!(gpu->bCanMapHostMemory)) {
+      gpu->pbRotAtm->Upload();
+    }
+
+    kRotateNext_kernel<<<gpu->blocks, gpu->threadsPerBlock/4>>>();
+    LAUNCHERROR("kFitCOM");
+
+    kReCenterData_kernel<<<gpu->blocks, gpu->threadsPerBlock/4>>>();
+    LAUNCHERROR("kFitCOM");
+    cudaDeviceSynchronize();
+
+    exit2: {}
+
+}
+
+extern "C" void kNEBfrc(gpuContext gpu, double neb_force[][3])
+{
+    if (gpu->sim.tmode == 1) {
+
+      int rep = gpu->sim.beadid - 1;
+      if ((gpu->pbNEBEnergyAll->_pSysData[rep + 1] > gpu->pbNEBEnergyAll->_pSysData[rep]) &&
+          (gpu->pbNEBEnergyAll->_pSysData[rep] > gpu->pbNEBEnergyAll->_pSysData[rep - 1])) {
+        kNEBRevisedTanFIRST_kernel<<<gpu->blocks, 96>>>();
+      }
+      else if ((gpu->pbNEBEnergyAll->_pSysData[rep + 1] < gpu->pbNEBEnergyAll->_pSysData[rep]) &&
+               (gpu->pbNEBEnergyAll->_pSysData[rep] < gpu->pbNEBEnergyAll->_pSysData[rep - 1])) {
+        kNEBRevisedTanSECOND_kernel<<<gpu->blocks, 96>>>();
+      }
+      else if (gpu->pbNEBEnergyAll->_pSysData[rep + 1] > gpu->pbNEBEnergyAll->_pSysData[rep - 1]) {
+        kNEBRevisedTanTHIRD_kernel<<<gpu->blocks, 96>>>();
+      }
+      else {
+        kNEBRevisedTanFOURTH_kernel<<<gpu->blocks, 96>>>();
+      } 
+    }
+    else {
+      kNEBBasicTan_kernel<<<gpu->blocks, 96>>>();
+    }
+
+    kNormalize_kernel<<<gpu->blocks, 96>>>();
+    cudaDeviceSynchronize();
+
+    kNormalizeTan_kernel<<<gpu->blocks, 96>>>();
+    cudaDeviceSynchronize();
+
+    if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) {
+      kRetrieveForceNL_kernel<<<gpu->blocks, 96>>>();
+    }
+    else {
+      kRetrieveForce_kernel<<<gpu->blocks, 96>>>();
+    }
+    cudaDeviceSynchronize();
+
+      kDotProduct_kernel<<<gpu->blocks, 96>>>();
+
+    if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) {
+      kNEBFRCNL_kernel<<<gpu->blocks, 96>>>();
+    }
+    else {
+      kNEBFRC_kernel<<<gpu->blocks, 96>>>();
+    }
+}
+
+extern "C" void kNEBfrc_nstep(gpuContext gpu, double neb_force[][3])
+{
+    if (gpu->bNeighborList && (gpu->pbImageIndex != NULL)) {
+      kNEBFRCNL_kernel<<<gpu->blocks, 96>>>();
+    }
+    else {
+      kNEBFRC_kernel<<<gpu->blocks, 96>>>();
+    }
+}
diff --git src/pmemd/src/cuda/kNEB.h src/pmemd/src/cuda/kNEB.h
new file mode 100644
index 0000000..6e526ee
--- /dev/null
+++ src/pmemd/src/cuda/kNEB.h
@@ -0,0 +1,95 @@
+#include "copyright.i"
+{
+  int warpIdx = threadIdx.x >> GRID_BITS;
+  int tgx = threadIdx.x & GRID_BITS_MASK;
+  int pos = tgx + (GRID * blockIdx.x);  // Position in the 1D array.
+#ifdef REVISED_TAN
+  #if defined(THIRD) || defined(FOURTH)
+    double emax = 0.0;
+    double emin = 0.0;
+    int replica = cSim.beadid - 1;
+  #endif
+#endif
+
+#ifdef NEB_FRC
+  __shared__ int atidx[GRID];  
+#endif
+
+#ifdef NORMALIZE
+  __shared__ double norm;
+#endif
+
+  while (pos < cSim.nShuttle) {
+    int idx = pos + warpIdx * cSim.nShuttle;  // Position in the extended 3D array.
+
+#ifdef NEB_FRC
+    if (warpIdx == 0) {
+#ifdef NEIGHBOR_LIST
+      int shidx = cSim.pShuttleTickets[pos];
+      atidx[tgx] = cSim.pImageAtomLookup[shidx];
+#else
+      atidx[tgx] = cSim.pShuttleTickets[pos];
+#endif
+    }
+    __syncthreads();
+#endif
+
+    if ((cSim.pAtmIdx[pos] == 0) || (cSim.pAtmIdx[pos] == 2)) {
+#ifdef REVISED_TAN
+  #ifdef FIRST
+      cSim.pTangents[idx] = cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx];
+  #elif defined(SECOND)
+      cSim.pTangents[idx] = cSim.pDataShuttle[idx] - cSim.pPrevDataShuttle[idx];
+  #elif defined(THIRD)
+      emax = max(abs(cSim.pNEBEnergyAll[replica + 1] - cSim.pNEBEnergyAll[replica]),
+                 abs(cSim.pNEBEnergyAll[replica - 1] - cSim.pNEBEnergyAll[replica]));
+      emin = min(abs(cSim.pNEBEnergyAll[replica + 1] - cSim.pNEBEnergyAll[replica]),
+                 abs(cSim.pNEBEnergyAll[replica - 1] - cSim.pNEBEnergyAll[replica]));
+      cSim.pTangents[idx] = (cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx]) * emax;
+      cSim.pTangents[idx] += (cSim.pDataShuttle[idx] - cSim.pPrevDataShuttle[idx]) * emin;
+  #elif defined(FOURTH)
+      emax = max(abs(cSim.pNEBEnergyAll[replica + 1] - cSim.pNEBEnergyAll[replica]),
+                 abs(cSim.pNEBEnergyAll[replica - 1] - cSim.pNEBEnergyAll[replica]));
+      emin = min(abs(cSim.pNEBEnergyAll[replica + 1] - cSim.pNEBEnergyAll[replica]),
+                 abs(cSim.pNEBEnergyAll[replica - 1] - cSim.pNEBEnergyAll[replica]));
+      cSim.pTangents[idx] = (cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx]) * emin;
+      cSim.pTangents[idx] += (cSim.pDataShuttle[idx] - cSim.pPrevDataShuttle[idx]) * emax;
+  #endif
+#elif defined(BASIC_TAN)
+      cSim.pTangents[idx] = cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx];
+#endif
+    }
+    else {
+      cSim.pTangents[idx] = 0.0;
+    }
+
+#ifdef NORMALIZE
+  if (threadIdx.x == 0) {
+    if (*cSim.pNorm > 1.0e-06) {
+      norm = 1.0/sqrt(*cSim.pNorm);
+    }
+    else {
+      norm = 0.0;
+    }
+  }
+  __syncthreads();
+
+  if ((cSim.pAtmIdx[pos] == 0) || (cSim.pAtmIdx[pos] == 2)) {
+    cSim.pTangents[idx] *= norm;
+  }
+
+  cSim.pSpringForce[idx] = ((cSim.pNextDataShuttle[idx] - cSim.pDataShuttle[idx]) * (*cSim.pSpring2)
+  - (cSim.pDataShuttle[idx] - cSim.pPrevDataShuttle[idx]) * (*cSim.pSpring1)) * cSim.pTangents[idx];
+#endif
+
+#ifdef NEB_FRC
+  //cSim.pNEBForce[idx] = ((*cSim.pDotProduct2) * cSim.pTangents[idx])
+  //                    - ((*cSim.pDotProduct1) * cSim.pTangents[idx]);//comment here after test
+  PMEAccumulator neb_frc = (((*cSim.pDotProduct2) * cSim.pTangents[idx])
+                         - ((*cSim.pDotProduct1) * cSim.pTangents[idx])) * FORCESCALE;
+  cSim.pForceAccumulator[atidx[tgx] + warpIdx * cSim.stride] += neb_frc;
+#endif
+
+    pos += gridDim.x * GRID;
+  }
+}
diff --git src/pmemd/src/cuda/kNorm.h src/pmemd/src/cuda/kNorm.h
new file mode 100644
index 0000000..6a750ce
--- /dev/null
+++ src/pmemd/src/cuda/kNorm.h
@@ -0,0 +1,45 @@
+#include "copyright.i"
+{
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int tgx = threadIdx.x & GRID_BITS_MASK;
+#ifdef REDUCE_TAN
+  double length = 0.0;
+#elif defined(REDUCE_DOTPROD)
+  double dotprod1 = 0.0;
+  double dotprod2 = 0.0;
+#endif
+
+  for (int idx = index; idx < 3*cSim.nShuttle; idx += blockDim.x * gridDim.x) {
+#ifdef REDUCE_TAN
+    length += (cSim.pTangents[idx] * cSim.pTangents[idx]);
+#elif defined(REDUCE_DOTPROD)
+    dotprod1 += (cSim.pDataShuttle[idx] * cSim.pTangents[idx]); //pDataShuttle at this line should include forces, not the coordinates. 
+    dotprod2 += (cSim.pSpringForce[idx]);
+#endif
+  }
+
+#ifdef REDUCE_TAN
+  length = warpReduceSum(length);
+#elif defined(REDUCE_DOTPROD)
+  dotprod1 = warpReduceSum(dotprod1);
+  dotprod2 = warpReduceSum(dotprod2);
+#endif
+
+  if ( tgx == 0 ) {
+#if (__CUDA_ARCH__ < 600) || (__CUDACC_VER_MAJOR__ < 8)
+#ifdef REDUCE_TAN
+    voidatomicAdd(cSim.pNorm,length);
+#elif defined(REDUCE_DOTPROD)
+    voidatomicAdd(cSim.pDotProduct1, dotprod1);
+    voidatomicAdd(cSim.pDotProduct2, dotprod2);
+#endif
+#else
+#ifdef REDUCE_TAN
+    atomicAdd(cSim.pNorm,length);
+#elif defined(REDUCE_DOTPROD)
+    atomicAdd(cSim.pDotProduct1, dotprod1);
+    atomicAdd(cSim.pDotProduct2, dotprod2);
+#endif
+#endif
+  }
+}
diff --git src/pmemd/src/cuda/kRMSFit.h src/pmemd/src/cuda/kRMSFit.h
new file mode 100644
index 0000000..0519547
--- /dev/null
+++ src/pmemd/src/cuda/kRMSFit.h
@@ -0,0 +1,258 @@
+#include "copyright.i"
+{
+#if !defined(CALC_COM) && !defined(KABSCH)
+  int pos = threadIdx.x + blockDim.x * blockIdx.x;
+#endif
+
+#ifdef MASS_CALC
+  int tid = threadIdx.x;
+  __shared__ PMEDouble totMass;
+#endif
+
+#if defined(CALC_COM) || defined(KABSCH)
+  int warpIdx = threadIdx.x >> 6; //there are 64*3=7192 total threads per block and 3 warpIdxs: 0, 1, and 2
+  int tgx = threadIdx.x & 63; // this goes from 0-63 in each warp
+  int pos = tgx + blockIdx.x * 64;
+  int tid = threadIdx.x;
+  __shared__ PMEDouble mass[64];
+  __shared__ double sXData[THREADS_PER_BLOCK*3/16];
+  __shared__ double sYData[THREADS_PER_BLOCK*3/16];
+  __shared__ double sZData[THREADS_PER_BLOCK*3/16];
+#endif
+
+  while (pos < cSim.nShuttle) {
+
+#if defined(CALC_COM) || defined(KABSCH)
+  sXData[tid] = 0.0;
+  sYData[tid] = 0.0;
+  sZData[tid] = 0.0;
+
+  if (warpIdx == 0) {
+  #ifdef NEIGHBOR_LIST
+    int shidx = cSim.pShuttleTickets[pos];
+    int atidx = cSim.pImageAtomLookup[shidx];
+    mass[tgx] = cSim.pImageMass[atidx];
+  #else
+    int atidx = cSim.pShuttleTickets[pos];
+    mass[tgx] = cSim.pAtomMass[atidx];
+  #endif
+  }
+  __syncthreads();
+#endif
+
+#ifdef MASS_CALC
+  #ifdef NEIGHBOR_LIST
+    int shidx = cSim.pShuttleTickets[pos];
+    int atidx = cSim.pImageAtomLookup[shidx];
+    PMEDouble mass = cSim.pImageMass[atidx];
+  #else 
+    int atidx = cSim.pShuttleTickets[pos];
+    PMEDouble mass = cSim.pAtomMass[atidx];
+  #endif
+#endif
+
+#ifdef CALC_COM
+    if ((cSim.pAtmIdx[pos] == 1) || (cSim.pAtmIdx[pos] == 2)) {
+      if (warpIdx == 0) {
+        sXData[tid] = cSim.pDataShuttle[pos] * mass[tgx];
+        sYData[tid] = cSim.pDataShuttle[pos + cSim.nShuttle] * mass[tgx];
+        sZData[tid] = cSim.pDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx];
+      }
+      else if (warpIdx == 1) {
+        sXData[tid] = cSim.pPrevDataShuttle[pos] * mass[tgx];
+        sYData[tid] = cSim.pPrevDataShuttle[pos + cSim.nShuttle] * mass[tgx];
+        sZData[tid] = cSim.pPrevDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx];
+      }
+      else if (warpIdx == 2) {
+        sXData[tid] = cSim.pNextDataShuttle[pos] * mass[tgx];
+        sYData[tid] = cSim.pNextDataShuttle[pos + cSim.nShuttle] * mass[tgx];
+        sZData[tid] = cSim.pNextDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx];
+      }
+    }
+    __syncthreads();
+#elif defined(KABSCH)
+  #ifdef NEXT_NEIGHBOR
+    if ((cSim.pAtmIdx[pos] == 1) || (cSim.pAtmIdx[pos] == 2)) {
+      sXData[tid] = cSim.pNextDataShuttle[pos] * mass[tgx] *
+                    cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle];
+      sYData[tid] = cSim.pNextDataShuttle[pos + cSim.nShuttle] * mass[tgx] *
+                    cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle];
+      sZData[tid] = cSim.pNextDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx] *
+                    cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle];
+    }
+    __syncthreads();
+  #elif defined(PREV_NEIGHBOR)
+    if ((cSim.pAtmIdx[pos] == 1) || (cSim.pAtmIdx[pos] == 2)) {
+      sXData[tid] = cSim.pPrevDataShuttle[pos] * mass[tgx] *
+                    cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle];
+      sYData[tid] = cSim.pPrevDataShuttle[pos + cSim.nShuttle] * mass[tgx] *
+                    cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle];
+      sZData[tid] = cSim.pPrevDataShuttle[pos + 2 * cSim.nShuttle] * mass[tgx] *
+                    cSim.pDataShuttle[pos + warpIdx * cSim.nShuttle];
+    }
+    __syncthreads();
+  #endif
+#endif
+
+#ifdef MASS_CALC
+    if (tid == 0) {
+      totMass = 0.0;
+    }
+    __syncthreads();
+    if ((cSim.pAtmIdx[pos] == 1) || (cSim.pAtmIdx[pos] == 2)) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+      voidatomicAdd(&totMass, mass);
+#else
+      atomicAdd(&totMass, mass);
+#endif
+    }
+    __syncthreads();
+#endif
+
+#if defined(CALC_COM) || defined(KABSCH) 
+    for (unsigned int i = blockDim.x/6; i > 0; i >>= 1) {
+      if (tgx < i && (pos + i) < cSim.nShuttle) {
+        sXData[tid] += sXData[tid + i];
+        sYData[tid] += sYData[tid + i];
+        sZData[tid] += sZData[tid + i];
+      }
+      __syncthreads();
+    }
+#endif
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+  #ifdef CALC_COM
+    if (tgx == 0) {
+      if (warpIdx == 0) {
+        voidatomicAdd(&cSim.pSelfCOM[0], sXData[0] / (*cSim.ptotFitMass));
+        voidatomicAdd(&cSim.pSelfCOM[1], sYData[0] / (*cSim.ptotFitMass));
+        voidatomicAdd(&cSim.pSelfCOM[2], sZData[0] / (*cSim.ptotFitMass));
+      }
+      else if (warpIdx == 1) {
+        voidatomicAdd(&cSim.pPrevCOM[0], sXData[64] / (*cSim.ptotFitMass));
+        voidatomicAdd(&cSim.pPrevCOM[1], sYData[64] / (*cSim.ptotFitMass));
+        voidatomicAdd(&cSim.pPrevCOM[2], sZData[64] / (*cSim.ptotFitMass));
+      }
+      else if (warpIdx == 2) {
+        voidatomicAdd(&cSim.pNextCOM[0], sXData[128] / (*cSim.ptotFitMass));
+        voidatomicAdd(&cSim.pNextCOM[1], sYData[128] / (*cSim.ptotFitMass));
+        voidatomicAdd(&cSim.pNextCOM[2], sZData[128] / (*cSim.ptotFitMass));
+      }
+    }
+  #elif defined(KABSCH)
+    if (tgx == 0) {
+    #ifdef NEXT_NEIGHBOR
+        voidatomicAdd(&cSim.pNextKabsch[0 + warpIdx * 3], sXData[warpIdx * 64]);
+        voidatomicAdd(&cSim.pNextKabsch[1 + warpIdx * 3], sYData[warpIdx * 64]);
+        voidatomicAdd(&cSim.pNextKabsch[2 + warpIdx * 3], sZData[warpIdx * 64]);
+    #elif defined(PREV_NEIGHBOR)
+        voidatomicAdd(&cSim.pPrevKabsch[0 + warpIdx * 3], sXData[warpIdx * 64]);
+        voidatomicAdd(&cSim.pPrevKabsch[1 + warpIdx * 3], sYData[warpIdx * 64]);
+        voidatomicAdd(&cSim.pPrevKabsch[2 + warpIdx * 3], sZData[warpIdx * 64]);
+    #endif
+    }
+  #elif defined(MASS_CALC)
+    if (tid == 0) {
+      voidatomicAdd(cSim.ptotFitMass, totMass);
+    }
+  #endif
+#else
+  #ifdef CALC_COM
+    if (tgx == 0) {
+      if (warpIdx == 0) {
+        atomicAdd(&cSim.pSelfCOM[0], sXData[0] / (*cSim.ptotFitMass));
+        atomicAdd(&cSim.pSelfCOM[1], sYData[0] / (*cSim.ptotFitMass));
+        atomicAdd(&cSim.pSelfCOM[2], sZData[0] / (*cSim.ptotFitMass));
+      }
+      else if (warpIdx == 1) {
+        atomicAdd(&cSim.pPrevCOM[0], sXData[64] / (*cSim.ptotFitMass));
+        atomicAdd(&cSim.pPrevCOM[1], sYData[64] / (*cSim.ptotFitMass));
+        atomicAdd(&cSim.pPrevCOM[2], sZData[64] / (*cSim.ptotFitMass));
+      }
+      else if (warpIdx == 2) {
+        atomicAdd(&cSim.pNextCOM[0], sXData[128] / (*cSim.ptotFitMass));
+        atomicAdd(&cSim.pNextCOM[1], sYData[128] / (*cSim.ptotFitMass));
+        atomicAdd(&cSim.pNextCOM[2], sZData[128] / (*cSim.ptotFitMass));
+      }
+    }
+  #elif defined(KABSCH)
+    if (tgx == 0) {
+    #ifdef NEXT_NEIGHBOR
+        atomicAdd(&cSim.pNextKabsch[0 + warpIdx * 3], sXData[warpIdx * 64]);
+        atomicAdd(&cSim.pNextKabsch[1 + warpIdx * 3], sYData[warpIdx * 64]);
+        atomicAdd(&cSim.pNextKabsch[2 + warpIdx * 3], sZData[warpIdx * 64]);
+    #elif defined(PREV_NEIGHBOR)
+        atomicAdd(&cSim.pPrevKabsch[0 + warpIdx * 3], sXData[warpIdx * 64]);
+        atomicAdd(&cSim.pPrevKabsch[1 + warpIdx * 3], sYData[warpIdx * 64]);
+        atomicAdd(&cSim.pPrevKabsch[2 + warpIdx * 3], sZData[warpIdx * 64]);
+    #endif
+    }
+  #elif defined(MASS_CALC)
+    if (tid == 0) {
+      atomicAdd(cSim.ptotFitMass, totMass);
+    }
+  #endif
+#endif
+
+#ifdef CENTER_DATA
+      cSim.pDataShuttle[                    pos] = cSim.pDataShuttle[                    pos] - cSim.pSelfCOM[0];
+      cSim.pDataShuttle[    cSim.nShuttle + pos] = cSim.pDataShuttle[    cSim.nShuttle + pos] - cSim.pSelfCOM[1];
+      cSim.pDataShuttle[2 * cSim.nShuttle + pos] = cSim.pDataShuttle[2 * cSim.nShuttle + pos] - cSim.pSelfCOM[2];
+      cSim.pPrevDataShuttle[                    pos] = cSim.pPrevDataShuttle[                    pos] - cSim.pPrevCOM[0];
+      cSim.pPrevDataShuttle[    cSim.nShuttle + pos] = cSim.pPrevDataShuttle[    cSim.nShuttle + pos] - cSim.pPrevCOM[1];
+      cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] = cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] - cSim.pPrevCOM[2];
+      cSim.pNextDataShuttle[                    pos] = cSim.pNextDataShuttle[                    pos] - cSim.pNextCOM[0];
+      cSim.pNextDataShuttle[    cSim.nShuttle + pos] = cSim.pNextDataShuttle[    cSim.nShuttle + pos] - cSim.pNextCOM[1];
+      cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] = cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] - cSim.pNextCOM[2];
+#endif
+
+#ifdef ROTATE_NEIGHBOR
+  #ifdef NEXT_NEIGHBOR
+    double xtmp = cSim.pNextDataShuttle[                    pos] * cSim.pRotAtm[0]
+                + cSim.pNextDataShuttle[    cSim.nShuttle + pos] * cSim.pRotAtm[1]
+                + cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[2];
+    double ytmp = cSim.pNextDataShuttle[                  + pos] * cSim.pRotAtm[3]
+                + cSim.pNextDataShuttle[    cSim.nShuttle + pos] * cSim.pRotAtm[4]
+                + cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[5];
+    double ztmp = cSim.pNextDataShuttle[                  + pos] * cSim.pRotAtm[6]
+                + cSim.pNextDataShuttle[    cSim.nShuttle + pos] * cSim.pRotAtm[7]
+                + cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[8];
+    cSim.pNextDataShuttle[                    pos] = xtmp;
+    cSim.pNextDataShuttle[    cSim.nShuttle + pos] = ytmp;
+    cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] = ztmp;
+  #elif defined(PREV_NEIGHBOR)
+    double xtmp = cSim.pPrevDataShuttle[                    pos] * cSim.pRotAtm[0]
+                + cSim.pPrevDataShuttle[    cSim.nShuttle + pos] * cSim.pRotAtm[1]
+                + cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[2];
+    double ytmp = cSim.pPrevDataShuttle[                  + pos] * cSim.pRotAtm[3]
+                + cSim.pPrevDataShuttle[    cSim.nShuttle + pos] * cSim.pRotAtm[4]
+                + cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[5];
+    double ztmp = cSim.pPrevDataShuttle[                  + pos] * cSim.pRotAtm[6]
+                + cSim.pPrevDataShuttle[    cSim.nShuttle + pos] * cSim.pRotAtm[7]
+                + cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] * cSim.pRotAtm[8];
+    cSim.pPrevDataShuttle[                    pos] = xtmp;
+    cSim.pPrevDataShuttle[    cSim.nShuttle + pos] = ytmp;
+    cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] = ztmp;
+  #endif
+#endif
+
+#ifdef RECENTER_DATA
+    cSim.pDataShuttle[                    pos] = cSim.pDataShuttle[                    pos] + cSim.pSelfCOM[0];
+    cSim.pDataShuttle[    cSim.nShuttle + pos] = cSim.pDataShuttle[    cSim.nShuttle + pos] + cSim.pSelfCOM[1];
+    cSim.pDataShuttle[2 * cSim.nShuttle + pos] = cSim.pDataShuttle[2 * cSim.nShuttle + pos] + cSim.pSelfCOM[2];
+    cSim.pNextDataShuttle[                    pos] = cSim.pNextDataShuttle[                    pos] + cSim.pSelfCOM[0];
+    cSim.pNextDataShuttle[    cSim.nShuttle + pos] = cSim.pNextDataShuttle[    cSim.nShuttle + pos] + cSim.pSelfCOM[1];
+    cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] = cSim.pNextDataShuttle[2 * cSim.nShuttle + pos] + cSim.pSelfCOM[2];
+    cSim.pPrevDataShuttle[                    pos] = cSim.pPrevDataShuttle[                    pos] + cSim.pSelfCOM[0];
+    cSim.pPrevDataShuttle[    cSim.nShuttle + pos] = cSim.pPrevDataShuttle[    cSim.nShuttle + pos] + cSim.pSelfCOM[1];
+    cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] = cSim.pPrevDataShuttle[2 * cSim.nShuttle + pos] + cSim.pSelfCOM[2];
+#endif
+
+
+#if defined(CALC_COM) || defined(KABSCH)
+    pos += gridDim.x*64;
+#else
+    pos += gridDim.x*256;
+#endif
+  }
+}
diff --git src/pmemd/src/gb_force.F90 src/pmemd/src/gb_force.F90
index 16eedb4..34e5a70 100644
--- src/pmemd/src/gb_force.F90
+++ src/pmemd/src/gb_force.F90
@@ -287,18 +287,18 @@ subroutine gb_force(atm_cnt, crd, frc, pot_ene, irespa, need_pot_ene)
 ! END GBSA implementation to work with the GPU code.
     if (on_cpstep) then
       call gpu_refresh_charges(cit_nb14, gbl_one_scee, proposed_qterm)
-      call gpu_gb_ene(pot_ene_prop, enmr)
+      call gpu_gb_ene(pot_ene_prop, enmr, ineb)
       call gpu_refresh_charges(cit_nb14, gbl_one_scee, atm_qterm)
-      call gpu_gb_ene(pot_ene, enmr)
+      call gpu_gb_ene(pot_ene, enmr, ineb)
       pot_ene%dvdl = pot_ene_prop%total - pot_ene%total
     else if (on_cestep) then
       call gpu_refresh_charges(cit_nb14, gbl_one_scee, proposed_qterm)
-      call gpu_gb_ene(pot_ene_prop, enmr)
+      call gpu_gb_ene(pot_ene_prop, enmr, ineb)
       call gpu_refresh_charges(cit_nb14, gbl_one_scee, atm_qterm)
-      call gpu_gb_ene(pot_ene, enmr)
+      call gpu_gb_ene(pot_ene, enmr, ineb)
       pot_ene%dvdl = pot_ene_prop%total - pot_ene%total
     else
-      call gpu_gb_ene(pot_ene, enmr)
+      call gpu_gb_ene(pot_ene, enmr, ineb)
     end if
 ! BEGIN GBSA implementation to work with the GPU code.
 ! force elements sent to the GPU to be added to the other force contributions
@@ -394,33 +394,13 @@ subroutine gb_force(atm_cnt, crd, frc, pot_ene, irespa, need_pot_ene)
 
 #ifdef MPI
   if (ineb>0) then
-   call gpu_download_crd(crd)
-   call gpu_gb_ene(pot_ene,enmr)
-   call gpu_download_frc(frc)
-
-   if(firsttime) then
-     call nebread()
-     firsttime= .false.
-   end if
-
-   call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, irespa)
-
-   do jatm=1,nattgtrms
-      iatm=rmsmask(jatm)
-      if(iatm .eq. 0) cycle
-      frc(1,iatm)= frc(1,iatm)+neb_force(1,jatm)
-      frc(2,iatm)= frc(2,iatm)+neb_force(2,jatm)
-      frc(3,iatm)= frc(3,iatm)+neb_force(3,jatm)
-   end do
+   call transfer_fit_neb_crd(irespa)
+   call full_neb_forces(irespa)
 
    if (beadid==1 .or. beadid==neb_nbead) then
-      do i=1, atm_cnt
-         frc(:,i)=0.d0
-      enddo
+      call gpu_clear_forces()
    end if
 
-    call gpu_upload_frc(frc)
-
   end if
 #endif /* MPI */
 !END_NEB
@@ -874,19 +854,14 @@ subroutine gb_force(atm_cnt, crd, frc, pot_ene, irespa, need_pot_ene)
     call gb_mpi_gathervec(atm_cnt, frc)
 
     if(mytaskid.eq.0) then
-      if(firsttime) then
-        call nebread()
-        firsttime= .false.
-      end if
       call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, irespa)
     end if
-
-    call mpi_barrier(mpi_comm_world,err_code_mpi)
+    call mpi_bcast(neb_force, 3*nattgtrms, MPI_DOUBLE_PRECISION, 0, pmemd_comm, err_code_mpi)
 
     do jatm=1,nattgtrms
       iatm=rmsmask(jatm)
       if(iatm .eq. 0) cycle
-      if(mytaskid .eq. 0) then
+      if (gbl_atm_owner_map(iatm) .eq. mytaskid) then
         frc(1,iatm)= frc(1,iatm)+neb_force(1,jatm)
         frc(2,iatm)= frc(2,iatm)+neb_force(2,jatm)
         frc(3,iatm)= frc(3,iatm)+neb_force(3,jatm)
@@ -895,12 +870,11 @@ subroutine gb_force(atm_cnt, crd, frc, pot_ene, irespa, need_pot_ene)
 
     if (beadid==1 .or. beadid==neb_nbead) then
       do i=1, atm_cnt
-        frc(:,i)=0.d0
+        if (gbl_atm_owner_map(i) .eq. mytaskid) then
+          frc(:,i)=0.d0
+        endif
       enddo
     end if
-
-    call mpi_bcast(frc, 3*atm_cnt, MPI_DOUBLE_PRECISION, 0, pmemd_comm, err_code_mpi)
-
   end if
 #endif /* MPI */
 
diff --git src/pmemd/src/mdin_ctrl_dat.F90 src/pmemd/src/mdin_ctrl_dat.F90
index b4315a2..e284620f 100644
--- src/pmemd/src/mdin_ctrl_dat.F90
+++ src/pmemd/src/mdin_ctrl_dat.F90
@@ -42,7 +42,7 @@ use file_io_dat_mod
                                 nucat, & ! gbneck2nu: check if atom belong to nucleic or not
                                 mcrescyc,nmd,nmc,mcverbosity, &
                                 infe, & ! added by FENG PAN
-                                ineb, skmin, skmax, tmode, vv, &
+                                ineb, skmin, skmax, tmode, vv, nebfreq, &
                                 irandom, mask_from_ref, mbar_states, &
                                 geq_nstep, ginit_vel, gsyn_mass, & !GTI
                                 gremd_acyc, gti_cpu_output, & ! GTI
@@ -76,10 +76,10 @@ use file_io_dat_mod
                                 geq_nstep, ginit_vel,  gsyn_mass, & !99
                                 gremd_acyc, gti_cpu_output, & !101
                                 mcint,nmd,mcrescyc,nmc,mcverbosity, & !106
-                                ineb,skmin,skmax,tmode,vv,  & !111
-                                iphmd, midpoint !113
+                                ineb,skmin,skmax,tmode,vv, nebfreq, & !112
+                                iphmd, midpoint !114
 
-  integer, parameter    :: mdin_ctrl_int_cnt = 113
+  integer, parameter    :: mdin_ctrl_int_cnt = 114
 
   save  :: / mdin_ctrl_int /
 
@@ -303,7 +303,7 @@ use file_io_dat_mod
                         mbar_lambda, mbar_states, mask_from_ref, &
                         geq_nstep, ginit_vel, gsyn_mass, gremd_acyc, &
                         gti_cpu_output,mcwat,nmd,mcint,nmc,mcrescyc,mcverbosity,&
-                        mcresstr,mcwatmaxdif,ineb,skmin,skmax,tmode,vv,vfac,&
+                        mcresstr,mcwatmaxdif,ineb,skmin,skmax,tmode,vv,vfac,nebfreq,&
                         mcboxshift, midpoint
       
       
@@ -705,6 +705,7 @@ subroutine init_mdin_ctrl_dat(remd_method)
   tmode = 1                     ! Use the revised tangent definition by default
   vv = 0                        ! Flag for quenched velocity verlet minimization
   vfac = 0.d0                   ! Scaling factor for quenched velocity verlet
+  nebfreq = 1
 
   infe = 0   ! added by FENG PAN
 ! Read input in namelist format:
diff --git src/pmemd/src/neb.F90 src/pmemd/src/neb.F90
index 7db2853..ea3c1a3 100644
--- src/pmemd/src/neb.F90
+++ src/pmemd/src/neb.F90
@@ -104,7 +104,27 @@ Contains
 !-------------------------------------------------------------------------------
 #ifdef MPI
 
+#ifdef CUDA
+  subroutine transfer_fit_neb_crd(irespa)
+    use mdin_ctrl_dat_mod
+    use nebread_mod
+
+    implicit none
+
+    integer, intent(in)                 :: irespa
+
+     if ( irespa .eq. 1 .or. mod(irespa,nebfreq)==0 ) then
+       call gpu_neb_exchange_crd()
+       call gpu_neb_rmsfit()
+     endif
+  end subroutine transfer_fit_neb_crd
+#endif
+
+#ifdef CUDA
+  subroutine full_neb_forces(irespa)
+#else
   subroutine full_neb_forces(mass, x, f, epot, fitgp, rmsgp, irespa)
+#endif
 
     use mdin_ctrl_dat_mod
     use prmtop_dat_mod
@@ -114,11 +134,15 @@ Contains
     implicit none
 
     !passed variables
-    double precision                    :: mass(:), x(:,:), f(:,:)
+#ifndef CUDA
+    double precision                    :: x(:,:)
+    double precision                    :: mass(:), f(:,:)
     double precision                    :: epot
     integer                             :: rmsgp(:), fitgp(:)
+#endif
     integer, intent(in)                 :: irespa
 
+#ifndef CUDA
     !local variables
     logical                             :: rmsok
     integer                             :: i, j, rep, iatm, jatm
@@ -126,7 +150,7 @@ Contains
     double precision                    :: eplus, eminus, energy, ezero, emax
     double precision                    :: dotproduct, dotproduct2
     double precision                    :: rmsdvalue, spring, spring2
-
+#endif
 
     if (mod(neb_nbead,2)/=0) then
        write (6,*) "must have even number of beads",neb_nbead
@@ -139,8 +163,16 @@ Contains
 ! note also that next_node and prev_node are circular,
 !  i.e. first and last still have both neighbors defined.
 !**********************************************************
-
-  !DG: these ranks imply we use pmemd_master_comm for the communicator
+#ifdef CUDA
+ if(irespa .eq. 1 .or. mod(irespa,nebfreq)==0) then
+  call gpu_set_neb_springs()
+  call gpu_calculate_neb_frc(neb_force)
+ else
+  call gpu_calculate_neb_frc_nstep(neb_force)
+ endif
+#else
+ if(irespa .eq. 1 .or. mod(irespa,nebfreq)==0) then
+  !these ranks imply we use pmemd_master_comm for the communicator
   next_node = master_rank+1
   if(next_node>=master_size) next_node = 0
   prev_node = master_rank-1
@@ -193,12 +225,9 @@ Contains
                        pmemd_master_comm,err_code_mpi)
 
 ! fit neighbor coords to self
-! ok to fit both neighbors since next/prev were defined as circular
 
-! note that we pass fitgp and rmsgp, which were arguments to this routine in the call from force()
 ! first argument is refc (which gets fit to the second argument, the actual MD coords)
 ! refc in this case is the neighbor image
-
     call rmsfit( xprev, x, mass, fitgp,rmsgp, rmsdvalue, nattgtrms, nattgtfit, rmsok)
     call rmsfit( xnext, x, mass, fitgp,rmsgp, rmsdvalue, nattgtrms, nattgtfit, rmsok)
 
@@ -397,7 +426,8 @@ Contains
    write(worldrank+50,'(a)') "-------------------------------------------------"
 #  endif
 
-
+ endif !nebfreq
+#endif /* CUDA */
    return
   end subroutine full_neb_forces
 
@@ -412,6 +442,10 @@ Contains
     integer :: filenumber,ix
     double precision :: sum
 
+#if defined(CUDA) && defined(MPI)
+    call gpu_report_neb_ene(master_size, neb_nrg_all)
+#endif
+
     sum = 0.d0
     do ix = 1, neb_nbead
        sum = sum + neb_nrg_all(ix)
@@ -659,7 +693,7 @@ Contains
       ! here are copies, and we do not actually modify the coordinates being
       ! used by the neighbor image MD (just the copies that we received from them)
 
-      do i=1, last_neb_atom !natom DG
+      do i=1, last_neb_atom
         x(1,i) = x(1,i) + xcm1
         x(2,i) = x(2,i) + ycm1
         x(3,i) = x(3,i) + zcm1
diff --git src/pmemd/src/nebread.F90 src/pmemd/src/nebread.F90
index db20cf2..66d7b36 100644
--- src/pmemd/src/nebread.F90
+++ src/pmemd/src/nebread.F90
@@ -19,8 +19,9 @@ module nebread_mod
   integer :: beadid  !mybeadid in sander neb
   integer, allocatable :: fitmask(:), rmsmask(:)
   type(atm_info), allocatable :: sortfit(:), sortrms(:)
-  integer, allocatable :: combined_mask(:)   !DG: combined and sorted mask from fitmask and rmsmask
+  type(atm_info), allocatable :: combined_mask(:)   !DG: combined and sorted mask from fitmask and rmsmask
   integer, allocatable :: partial_mask(:)    !DG: same as combined_mask with no end zeroes, needed for partial coordinates download from gpu.
+  integer, allocatable :: idx_mask(:)
   integer :: pcnt                            !DG: partial counts, partial_mask number of elements.
   integer :: alloc_stat
 
@@ -100,7 +101,8 @@ subroutine nebread()
       allocate(combined_mask(nattgtrms+nattgtfit), stat=alloc_stat)
       if ( alloc_stat .ne. 0 ) &
         call allocation_error('nebread','Error allocating temporary arrays')
-      combined_mask(:) = 0
+      combined_mask(:)%atm_num = 0
+      combined_mask(:)%atm_idx = 0
 
       do n=1,nattgtfit
         sortfit(n)%atm_num = fitmask(n)
@@ -114,7 +116,7 @@ subroutine nebread()
 
        pcnt=0
        do n=1,size(combined_mask)
-         if(combined_mask(n) .ne. 0) then
+         if(combined_mask(n)%atm_num .ne. 0) then
            pcnt = pcnt + 1
          endif
        enddo
@@ -122,8 +124,15 @@ subroutine nebread()
        allocate(partial_mask(pcnt), stat=alloc_stat)
        if ( alloc_stat .ne. 0 ) &
         call allocation_error('nebread','Error allocating temporary arrays')
-       partial_mask = combined_mask(:pcnt)
+       partial_mask(:) = 0
 
+       allocate(idx_mask(pcnt), stat=alloc_stat)
+       if ( alloc_stat .ne. 0 ) &
+        call allocation_error('nebread','Error allocating temporary arrays')
+       idx_mask(:) = 0
+
+       partial_mask(1:pcnt) = combined_mask(1:pcnt)%atm_num
+       idx_mask(1:pcnt) = combined_mask(1:pcnt)%atm_idx
    endif
 end subroutine nebread
 !-------------------------------------------------------------------------
@@ -132,7 +141,7 @@ subroutine combine_sort(combined, sortfit, sortrms, nattgtfit, nattgtrms)
   implicit none
 
   !Passed variables
-  integer :: combined(:)
+  type(atm_info) :: combined(:)
   type(atm_info) :: sortfit(:), sortrms(:)
   integer :: nattgtfit, nattgtrms
 
@@ -146,14 +155,16 @@ subroutine combine_sort(combined, sortfit, sortrms, nattgtfit, nattgtrms)
 
   do i=1,nattgtfit+nattgtrms
     if(sortfit(fitcnt)%atm_num .lt. sortrms(rmscnt)%atm_num) then
-      combined(i) = sortfit(fitcnt)%atm_num
+      combined(i)%atm_num = sortfit(fitcnt)%atm_num
       sortfit(fitcnt)%atm_idx = i
+      combined(i)%atm_idx = 1  !atom belongs to sortfit
       fitcnt = fitcnt + 1
     else
-      combined(i) = sortrms(rmscnt)%atm_num
-      sortrms(rmscnt)%atm_idx = i
+      combined(i)%atm_num = sortrms(rmscnt)%atm_num
+      sortrms(rmscnt)%atm_idx = i !leave combined(i)%atm_idx = 0, atom belongs to sortrms
       if(sortfit(fitcnt)%atm_num .eq. sortrms(rmscnt)%atm_num) then
         sortfit(fitcnt)%atm_idx = i
+        combined(i)%atm_idx = 2 !atom belongs to both sortfit and sortrms
         fitcnt = fitcnt + 1
       endif
       rmscnt = rmscnt + 1
@@ -165,14 +176,15 @@ subroutine combine_sort(combined, sortfit, sortrms, nattgtfit, nattgtrms)
   if(rmscnt .gt. nattgtrms) then
     m = nattgtfit - fitcnt
     do i=0,m
-      combined(cntr + i + 1) = sortfit(fitcnt + i)%atm_num
+      combined(cntr + i + 1)%atm_num = sortfit(fitcnt + i)%atm_num
       sortfit(fitcnt +i)%atm_idx = cntr + i + 1
+      combined(cntr + i + 1)%atm_idx = 1  !atom belongs to sortfit
     enddo
   else if(fitcnt .gt. nattgtfit) then
     m = nattgtrms - rmscnt
     do i=0,m
-      combined(cntr + i + 1) = sortrms(rmscnt + i)%atm_num
-      sortrms(rmscnt + i)%atm_idx = cntr + i + 1
+      combined(cntr + i + 1)%atm_num = sortrms(rmscnt + i)%atm_num
+      sortrms(rmscnt + i)%atm_idx = cntr + i + 1 !leave combined(i)%atm_idx = 0, atom belongs to sortrms
     enddo
   endif
 
diff --git src/pmemd/src/pme_force.F90 src/pmemd/src/pme_force.F90
index 09bc8ef..a5083f8 100644
--- src/pmemd/src/pme_force.F90
+++ src/pmemd/src/pme_force.F90
@@ -296,7 +296,6 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, &
 
   type(pme_virial_rec)          :: proc_vir
   integer                       :: offset_loc
-  logical                       :: firsttime=.true.
   logical                       :: send_data
   integer                       :: iatm, jatm, task_id
   type(pme_virial_rec)          :: vir
@@ -393,16 +392,22 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, &
 
     call gti_build_nl_list(ti_mode, lj1264)
     call gti_clear(need_pot_enes, ti_mode, crd)
-
     call gti_ele_recip(need_pot_enes, need_virials, uc_volume, ti_weights, ti_mode)
     call gti_nb(need_pot_enes, need_virials, ti_mode, lj1264) 
-    
     call gti_bonded(need_pot_enes, need_virials, ti_mode) 
     call gti_others(need_pot_enes, nstep, dt)    
-
     call gti_finalize_force_virial(numextra, need_virials, ti_mode, netfrc, frc)
+    if (ineb>0) then
+#ifdef TIME_TEST
+        call start_test_timer(14, 'neb_exchange_crd', 0)
+#endif
+      call transfer_fit_neb_crd(nstep)
+#ifdef TIME_TEST
+        call stop_test_timer(14)
+#endif
+    endif
 
-    if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt)
+    if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt, ineb, nebfreq, nstep)
 
     if (ti_mode .gt. 0) then
       if (need_pot_enes) then
@@ -412,21 +417,24 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, &
       if (need_virials) call gti_get_virial(ti_weights(1:2), virial, ekcmt, ti_mode)
     end if
 
-    if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt)    
-    
-    if (ti_mode >0) then
-        if (need_pot_enes) then
-            if (ifmbar .ne. 0) call gti_update_mbar_from_gpu
-            call gti_update_ti_pot_energy_from_gpu(pot_ene, enmr, ti_mode, gti_cpu_output) 
-        endif
-        if (need_virials) call gti_get_virial(ti_weights(1:2), virial, ekcmt, ti_mode)
-    endif
+!DG    if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt)    
+!DG    
+!DG    if (ti_mode >0) then
+!DG        if (need_pot_enes) then
+!DG            if (ifmbar .ne. 0) call gti_update_mbar_from_gpu
+!DG            call gti_update_ti_pot_energy_from_gpu(pot_ene, enmr, ti_mode, gti_cpu_output) 
+!DG        endif
+!DG        if (need_virials) call gti_get_virial(ti_weights(1:2), virial, ekcmt, ti_mode)
+!DG    endif
 #else /* GTI */
 
     if (need_pot_enes) then
+      if (ineb>0) then
+        call transfer_fit_neb_crd(nstep)
+      endif
       if (ti_mode .eq. 0) then
         call gpu_pme_ene(ew_coeff, uc_volume, pot_ene, enmr, virial, &
-                         ekcmt, nstep, dt, crd, frc)
+                         ekcmt, nstep, dt, crd, frc, ineb, nebfreq)
       else
         call gpu_ti_pme_ene(ew_coeff, uc_volume, pot_ene, enmr, virial, &
                             ekcmt, nstep, dt, crd, frc, bar_cont)
@@ -554,34 +562,17 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, &
   end if
 
   if (ineb .gt. 0) then
+#ifdef TIME_TEST
+        call start_test_timer(15, 'neb_force', 0)
+#endif
+    call full_neb_forces(nstep)
+#ifdef TIME_TEST
+        call stop_test_timer(15)
+#endif
 
-    if (firsttime) then
-      call nebread()
-      call gpu_setup_shuttle_info(pcnt, 0, partial_mask)
-      firsttime = .false.
-    end if
-
-    call gpu_shuttle_retrieve_data(crd, 0)
-    call gpu_shuttle_retrieve_data(frc, 1)
-    call gpu_pme_ene(ew_coeff, uc_volume, pot_ene, enmr, virial, &
-                     ekcmt, nstep, dt)
-
-    call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, nstep)
-
-    do jatm=1,nattgtrms
-      iatm=rmsmask(jatm)
-      if (iatm .eq. 0) cycle
-
-      frc(1,iatm) = frc(1,iatm)+neb_force(1,jatm)
-      frc(2,iatm) = frc(2,iatm)+neb_force(2,jatm)
-      frc(3,iatm) = frc(3,iatm)+neb_force(3,jatm)
-    enddo
-
-    call gpu_shuttle_post_data(frc, 1)
-    if (beadid .eq. 1 .or. beadid .eq. neb_nbead) then
+    if (beadid==1 .or. beadid==neb_nbead) then
       call gpu_clear_forces()
-    end if
-
+    endif
   end if
 
 #else /*Not cuda version below*/
@@ -1638,14 +1629,6 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, &
     if (ineb .gt. 0) then
       call mpi_gathervec(atm_cnt, frc)
       call mpi_gathervec(atm_cnt, crd)
-      if (firsttime) then
-        if (mytaskid .eq. 0) then
-          call nebread()
-        end if
-        call mpi_bcast(nattgtrms, 1, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi)
-        call mpi_bcast(rmsmask, nattgtrms, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi)
-        firsttime = .false.
-      end if
 
       if (mytaskid .eq. 0) then
         call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, nstep)
@@ -1765,7 +1748,6 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, &
 
   type(pme_virial_rec)          :: proc_vir
   integer                       :: offset_loc
-  logical                       :: firsttime=.true.
   logical                       :: send_data
   integer                       :: iatm, jatm, task_id
   type(pme_virial_rec)          :: vir
@@ -1867,8 +1849,11 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, &
     call gti_others(need_pot_enes, nstep, dt)
 
     call gti_finalize_force_virial(numextra, need_virials, ti_mode, netfrc, frc)
+    if (ineb>0) then
+      call transfer_fit_neb_crd(nstep)
+    endif
 
-    if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt)
+    if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt, ineb, nebfreq, nstep)
 
     if (ti_mode .gt. 0) then
       if (need_pot_enes) then
@@ -1893,6 +1878,9 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, &
       call gpu_pme_force(ew_coeff, uc_volume, virial, ekcmt, nstep, dt, crd, frc)
 
     end if
+    if (ineb>0) then
+      call transfer_fit_neb_crd(nstep)
+    endif
 
 #endif /* GTI  */
 
@@ -2012,37 +2000,11 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, &
   end if
 
   if (ineb .gt. 0) then
-
-    if (firsttime) then
-      call nebread()
-      call gpu_setup_shuttle_info(pcnt, 0, partial_mask)
-      firsttime = .false.
-    end if
-
-    call gpu_shuttle_retrieve_data(crd, 0)
-    call gpu_shuttle_retrieve_data(frc, 1)
-    call gpu_pme_ene(ew_coeff, uc_volume, pot_ene, enmr, virial, &
-                     ekcmt, nstep, dt)
-
-
-    call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, nstep)
-
-    do jatm=1,nattgtrms
-      iatm=rmsmask(jatm)
-      if (iatm .eq. 0) cycle
-
-      frc(1,iatm)= frc(1,iatm)+neb_force(1,jatm)
-      frc(2,iatm)= frc(2,iatm)+neb_force(2,jatm)
-      frc(3,iatm)= frc(3,iatm)+neb_force(3,jatm)
-    enddo
-
-    call gpu_shuttle_post_data(frc, 1)
-    if (beadid .eq. 1 .or. beadid .eq. neb_nbead) then
+    call full_neb_forces(nstep)
+    if (beadid==1 .or. beadid==neb_nbead) then
       call gpu_clear_forces()
-    end if
-
+    endif
   end if
-  !END_NEB
 
 #else /*Not cuda version below*/
 
@@ -3056,14 +3018,6 @@ subroutine pme_force_midpoint(atm_cnt, crd, frc, &
     if (ineb .gt. 0) then
       call mpi_gathervec(atm_cnt, frc)
       call mpi_gathervec(atm_cnt, crd)
-      if (firsttime) then
-        if (mytaskid .eq. 0) then
-          call nebread()
-        end if
-        call mpi_bcast(nattgtrms, 1, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi)
-        call mpi_bcast(rmsmask, nattgtrms, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi)
-        firsttime = .false.
-      end if
 
       if (mytaskid .eq. 0) then
         call full_neb_forces(atm_mass, crd, frc, pot_ene%total, fitmask, rmsmask, nstep)
@@ -3250,7 +3204,7 @@ subroutine pme_force(atm_cnt, crd, frc, img_atm_map, atm_img_map, &
 
       call gti_finalize_force_virial(numextra, need_virials, ti_mode, netfrc, frc)
 
-      if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt)
+      if (need_pot_enes .or. need_virials) call gti_update_md_ene(pot_ene, enmr, virial, ekcmt, ineb, nebfreq, nstep)
 
       if (ti_mode .gt. 0) then
         if (need_pot_enes) then
diff --git src/pmemd/src/pmemd.F90 src/pmemd/src/pmemd.F90
index 9201e34..e553fc2 100644
--- src/pmemd/src/pmemd.F90
+++ src/pmemd/src/pmemd.F90
@@ -746,6 +746,19 @@ if(.not. usemidpoint) then
 #ifdef MPI
     if(ineb>0) then
       call neb_init()
+#ifdef CUDA
+      call nebread()
+      call gpu_setup_shuttle_info(pcnt, 0, partial_mask)
+      call gpu_upload_neb_info(neb_nbead, beadid, nattgtrms, nattgtfit, &
+           last_neb_atom, skmin, skmax, tmode, rmsmask, fitmask,idx_mask)
+      call gpu_setup_neb()
+#else
+        if (master) then
+          call nebread()
+        end if
+        call mpi_bcast(nattgtrms, 1, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi)
+        call mpi_bcast(rmsmask, nattgtrms, MPI_INTEGER, 0, mpi_comm_world, err_code_mpi)
+#endif
     end if
 #endif
 !DG
diff --git src/pmemd/src/runmd.F90 src/pmemd/src/runmd.F90
index 4be6b87..c613359 100644
--- src/pmemd/src/runmd.F90
+++ src/pmemd/src/runmd.F90
@@ -1319,7 +1319,7 @@ DBG_ARRAYS_TIME_STEP(nstep)
 #endif
 
 #ifdef MPI
-      if ( ineb .gt. 0 ) then
+      if ( ineb .gt. 0 .and. (mod(nstep, nebfreq)==0)) then
         need_pot_enes = .true.
       end if
 #endif
diff --git test/cuda/neb-testcases/neb_explicit/groupfile.in test/cuda/neb-testcases/neb_explicit/groupfile.in
index 41f07c7..265d08a 100644
--- test/cuda/neb-testcases/neb_explicit/groupfile.in
+++ test/cuda/neb-testcases/neb_explicit/groupfile.in
@@ -1,4 +1,4 @@
--O -p neb.prmtop -c inpcrds/neb01.inpcrd -o neb_explicit_01.out -r neb_explicit_01.rst -x neb_explicit_01.mdcrd -inf neb_explicit_01.inf 
--O -p neb.prmtop -c inpcrds/neb02.inpcrd -o neb_explicit_02.out -r neb_explicit_02.rst -x neb_explicit_02.mdcrd -inf neb_explicit_02.inf 
--O -p neb.prmtop -c inpcrds/neb03.inpcrd -o neb_explicit_03.out -r neb_explicit_03.rst -x neb_explicit_03.mdcrd -inf neb_explicit_03.inf 
--O -p neb.prmtop -c inpcrds/neb04.inpcrd -o neb_explicit_04.out -r neb_explicit_04.rst -x neb_explicit_04.mdcrd -inf neb_explicit_04.inf 
+-O -p neb.prmtop -c inpcrds/neb01.inpcrd -o neb_explicit_01.out -r neb_explicit_01.rst -x neb_explicit_01.mdcrd -inf neb_explicit_01.inf -AllowSmallBox 
+-O -p neb.prmtop -c inpcrds/neb02.inpcrd -o neb_explicit_02.out -r neb_explicit_02.rst -x neb_explicit_02.mdcrd -inf neb_explicit_02.inf -AllowSmallBox 
+-O -p neb.prmtop -c inpcrds/neb03.inpcrd -o neb_explicit_03.out -r neb_explicit_03.rst -x neb_explicit_03.mdcrd -inf neb_explicit_03.inf -AllowSmallBox 
+-O -p neb.prmtop -c inpcrds/neb04.inpcrd -o neb_explicit_04.out -r neb_explicit_04.rst -x neb_explicit_04.mdcrd -inf neb_explicit_04.inf -AllowSmallBox