********>Bugfix noBTREE:
Author: Roberto Gomperts
Date: 07/29/2005
Programs: sander
Description: Fixed incorrect calls to MPI routines in the noBTREE section
As expected the noBTREE has a much poorer accumulation of forces
and distribution of coordinates than the BTREE alternative. This is
most evident in certain ewald calculations.
To aleviate these slowdowns a "ring" algorithm has been included.
It makes use of SHMEM_PTR and requires substituting -lmpi by -lsma
in LOADLIB in config.h. This code is triggered by the inclusion of
-DSGI_SMA in FPPFLAGS in config.h.
Note 1: The -DnoBTREE implementation assumes that MPI_IN_PLACE is
working properly
Note 2: -DnoBTREE can be used alone for any machine but -DSGI_SMA
should be used together with -DnoBTREE
Fix: apply the following patch to amber8/src/sander/parallel.f
apply the following patch to amber8/src/sander/sander.f
------------------------------------------------------------------------------
*** parallel.f 2004-09-14 13:51:22.000000000 -0500
--- parallel.f 2005-07-29 04:21:40.000000000 -0500
***************
*** 377,382 ****
--- 377,389 ----
# define MPI_DOUBLE_PRECISION MPI_REAL8
#endif
+ #ifdef SGI_SMA
+ INCLUDE "mpp/shmem.fh"
+ _REAL_ fr(*)
+ pointer (fr_p,fr)
+ integer other, k, ibeg, iend
+ #endif
+
#ifndef noBTREE
integer other,ncyclesm1,k,bit,cs,cr,ns,nr
integer ist(mpi_status_size)
***************
*** 384,391 ****
call trace_enter( 'fsum' )
- #ifndef noBTREE
if (numtasks <= 1) return
ncyclesm1 = logtwo(numtasks) - 1
bit=numtasks/2
--- 391,399 ----
call trace_enter( 'fsum' )
if (numtasks <= 1) return
+
+ #ifndef noBTREE
ncyclesm1 = logtwo(numtasks) - 1
bit=numtasks/2
***************
*** 418,432 ****
end do ! k = 0,ncyclesm1
#else
call trace_mpi('mpi_reduce_scatter', &
! rcvcnt3,'MPI_DOUBLE_PRECISION', mpi_sum)
! call mpi_reduce_scatter(f, tmp(iparpt3(mytaskid)+1), &
rcvcnt3, MPI_DOUBLE_PRECISION, mpi_sum, &
commsander, ierr)
! do i=1, 3*natom
! f(i) = tmp(i)
! end do
#endif
call trace_exit( 'fsum' )
return
--- 426,454 ----
end do ! k = 0,ncyclesm1
#else
+ #ifdef SGI_SMA
+ ! Implement a "simple" ring algoritm: 0->1, 0->2, 0->3.. 0->n
+ ! 1->2, 1->3, 1->4.. 1->0
+ ! Need only a barrier at the beginning
+ call mpi_barrier(commsander,ierr)
+ do k=1,numtasks-1
+ other = mod(mytaskid+k,numtasks)
+ fr_p = SHMEM_PTR(f(1),other)
+ ibeg = iparpt3(mytaskid)
+ iend = ibeg + rcvcnt3(mytaskid)
+ ibeg = ibeg + 1
+ f(ibeg:iend) = f(ibeg:iend) + fr(ibeg:iend)
+ enddo
+ #else
+ ! --- Assume an "in-place" allgatherv works: this seems(?) to
+ ! be true everywhere....
call trace_mpi('mpi_reduce_scatter', &
! rcvcnt3(mytaskid),'MPI_DOUBLE_PRECISION', mpi_sum)
! call mpi_reduce_scatter(f, f(iparpt3(mytaskid)+1), &
rcvcnt3, MPI_DOUBLE_PRECISION, mpi_sum, &
commsander, ierr)
! #endif
#endif
call trace_exit( 'fsum' )
return
***************
*** 450,455 ****
--- 472,483 ----
# define MPI_DOUBLE_PRECISION MPI_REAL8
# endif
+ #ifdef SGI_SMA
+ INCLUDE "mpp/shmem.fh"
+ _REAL_ xr(*)
+ pointer (xr_p,xr)
+ integer other, k, ibeg, iend
+ #endif
# ifndef noBTREE
integer other,ncyclesm1,k,bit,cs,cr,ns,nr
integer ist(mpi_status_size),ireq
***************
*** 457,465 ****
#endif
call trace_enter( 'xdist' )
# ifndef noBTREE
- if (numtasks <= 1) return
ncyclesm1 = logtwo(numtasks) - 1
bit=1
do k = 0,ncyclesm1
--- 485,493 ----
#endif
call trace_enter( 'xdist' )
+ if (numtasks <= 1) return
# ifndef noBTREE
ncyclesm1 = logtwo(numtasks) - 1
bit=1
do k = 0,ncyclesm1
***************
*** 495,511 ****
bit = ishft(bit,1)
end do
# else
!
! --- Assume an "in-place" allgatherv works: this seems(?) to
! be true everywhere....
call trace_mpi('mpi_allgatherv', &
! rcvcnt3(mytaskid)+rcvcnt3,'MPI_DOUBLE_PRECISION', iparpt3)
call mpi_allgatherv( &
! x(iparpt3(mytaskid)+1),rcvcnt3(mytaskid), &
MPI_DOUBLE_PRECISION,x,rcvcnt3,iparpt3, &
MPI_DOUBLE_PRECISION,commsander, ierr)
# endif
call trace_exit( 'xdist' )
return
end subroutine xdist
--- 523,553 ----
bit = ishft(bit,1)
end do
# else
! #ifdef SGI_SMA
! ! Implement a "simple" ring algoritm: 0->1, 0->2, 0->3.. 0->n
! ! 1->2, 1->3, 1->4.. 1->0
! ! Need only a barrier at the end
! do k=1,numtasks-1
! other = mod(mytaskid+k,numtasks)
! xr_p = SHMEM_PTR(x(1),other)
! ibeg = iparpt3(mytaskid)
! iend = ibeg + rcvcnt3(mytaskid)
! ibeg = ibeg + 1
! xr(ibeg:iend) = x(ibeg:iend)
! enddo
! call mpi_barrier(commsander,ierr)
! #else
! --- Assume an "in-place" allgatherv works: this seems(?) to
! be true everywhere....
call trace_mpi('mpi_allgatherv', &
! rcvcnt3(mytaskid),'MPI_DOUBLE_PRECISION', iparpt3(mytaskid))
call mpi_allgatherv( &
! MPI_IN_PLACE,rcvcnt3(mytaskid), &
MPI_DOUBLE_PRECISION,x,rcvcnt3,iparpt3, &
MPI_DOUBLE_PRECISION,commsander, ierr)
# endif
+ # endif
call trace_exit( 'xdist' )
return
end subroutine xdist
------------------------------------------------------------------------------
*** sander.f 2004-09-14 13:51:23.000000000 -0500
--- sander.f 2005-07-28 13:41:14.000000000 -0500
***************
*** 588,594 ****
--- 588,600 ----
_REAL_ dummy
#endif
+ #ifndef SGI_SMA
_REAL_, dimension(:), allocatable :: x, r_stack
+ #else
+ _REAL_, dimension(:), allocatable :: r_stack
+ _REAL_ x(*)
+ pointer (x_p,x)
+ #endif
integer, dimension(:), allocatable :: ix, ipairs, i_stack
character(len=4), dimension(:), allocatable :: ih
***************
*** 680,688 ****
--- 686,717 ----
! --- dynamic memory allocation:
+ #ifdef SGI_SMA
+ ! if we are using symmetric heap allocation:
+ ! - end the master code
+ ! - broadcast the sizes (just send all)
+ ! - do a barrier
+ ! - shpalloc x (for all)
+ ! - resume master code
+
+ allocate( ix(lasti), ipairs(lastpr), ih(lasth), stat = ier )
+ REQUIRE( ier == 0 )
+
+ endif ! master
+
+ call mpi_bcast(natom,BC_MEMORY,mpi_integer,0,commsander,ierr)
+ call mpi_barrier(commsander,ierr)
+
+ CALL SHPALLOC(x_p, lastr*2, ier, 1 )
+ REQUIRE( ier == 0 )
+
+ if (master) then
+ #else
allocate( x(lastr), ix(lasti), ipairs(lastpr), ih(lasth), stat = ier )
REQUIRE( ier == 0 )
+ #endif
+
if( igb == 0 ) then
#ifndef PSANDER
lastrst = sizffwrk + siz_q + natom*(4 + 6*order)
***************
*** 1117,1125 ****
! ---allocate memory on the non-master nodes:
if( .not.master ) then
allocate( x(1:lastr), stat = ier )
REQUIRE( ier == 0 )
!
allocate( ix(1:lasti), stat = ier )
REQUIRE( ier == 0 )
--- 1146,1155 ----
! ---allocate memory on the non-master nodes:
if( .not.master ) then
+ #ifndef SGI_SMA
allocate( x(1:lastr), stat = ier )
REQUIRE( ier == 0 )
! #endif
allocate( ix(1:lasti), stat = ier )
REQUIRE( ier == 0 )
***************
*** 1387,1394 ****
--- 1417,1429 ----
REQUIRE( ier == 0 )
deallocate( ix, stat = ier )
REQUIRE( ier == 0 )
+ #ifndef SGI_SMA
deallocate( x, stat = ier )
REQUIRE( ier == 0 )
+ #else
+ CALL SHPDEALLC(x_p, ier, 1 )
+ REQUIRE( ier == 0 )
+ #endif
return