********>Bugfix noBTREE:
Author: Roberto Gomperts
Date: 07/29/2005

Programs: sander

Description: Fixed incorrect calls to MPI routines in the noBTREE section
             As expected the noBTREE has a much poorer accumulation of forces 
             and distribution of coordinates than the BTREE alternative. This is
             most evident in certain ewald calculations.
             To aleviate these slowdowns a "ring" algorithm has been included.
             It makes use of SHMEM_PTR and requires substituting -lmpi by -lsma 
             in LOADLIB in config.h. This code is triggered by the inclusion of
             -DSGI_SMA in FPPFLAGS in config.h.
             Note 1: The -DnoBTREE implementation assumes that MPI_IN_PLACE is 
                     working properly
             Note 2: -DnoBTREE can be used alone for any machine but -DSGI_SMA
                     should be used together with -DnoBTREE

Fix:  apply the following patch to amber8/src/sander/parallel.f
      apply the following patch to amber8/src/sander/sander.f

------------------------------------------------------------------------------

*** parallel.f	2004-09-14 13:51:22.000000000 -0500
--- parallel.f	2005-07-29 04:21:40.000000000 -0500
***************
*** 377,382 ****
--- 377,389 ----
  #  define MPI_DOUBLE_PRECISION MPI_REAL8
  #endif
     
+ #ifdef SGI_SMA
+ INCLUDE "mpp/shmem.fh"
+   _REAL_ fr(*)
+   pointer (fr_p,fr)
+   integer other, k, ibeg, iend
+ #endif
+ 
  #ifndef noBTREE
     integer other,ncyclesm1,k,bit,cs,cr,ns,nr
     integer ist(mpi_status_size)
***************
*** 384,391 ****
     
     call trace_enter( 'fsum' )
     
- #ifndef noBTREE
     if (numtasks <= 1) return
     ncyclesm1 = logtwo(numtasks) - 1
     bit=numtasks/2
     
--- 391,399 ----
     
     call trace_enter( 'fsum' )
     
     if (numtasks <= 1) return
+ 
+ #ifndef noBTREE
     ncyclesm1 = logtwo(numtasks) - 1
     bit=numtasks/2
     
***************
*** 418,432 ****
        
     end do  !  k = 0,ncyclesm1
  #else
     
     call trace_mpi('mpi_reduce_scatter', &
!          rcvcnt3,'MPI_DOUBLE_PRECISION', mpi_sum)
!    call mpi_reduce_scatter(f, tmp(iparpt3(mytaskid)+1), &
           rcvcnt3, MPI_DOUBLE_PRECISION, mpi_sum, &
           commsander, ierr)
!    do i=1, 3*natom
!       f(i) = tmp(i)
!    end do
  #endif
     call trace_exit( 'fsum' )
     return
--- 426,454 ----
        
     end do  !  k = 0,ncyclesm1
  #else
+ #ifdef SGI_SMA
+    !   Implement a "simple" ring algoritm: 0->1, 0->2, 0->3.. 0->n
+    !                                       1->2, 1->3, 1->4.. 1->0
+    !   Need only a barrier at the beginning
+    call mpi_barrier(commsander,ierr)
+    do k=1,numtasks-1
+      other = mod(mytaskid+k,numtasks)
+      fr_p = SHMEM_PTR(f(1),other)
+      ibeg = iparpt3(mytaskid)
+      iend = ibeg + rcvcnt3(mytaskid)
+      ibeg = ibeg + 1
+      f(ibeg:iend) = f(ibeg:iend) + fr(ibeg:iend)
+    enddo
+ #else   
+    !       --- Assume an "in-place" allgatherv works: this seems(?) to
+    !           be true everywhere....
     
     call trace_mpi('mpi_reduce_scatter', &
!          rcvcnt3(mytaskid),'MPI_DOUBLE_PRECISION', mpi_sum)
!    call mpi_reduce_scatter(f, f(iparpt3(mytaskid)+1), &
           rcvcnt3, MPI_DOUBLE_PRECISION, mpi_sum, &
           commsander, ierr)
! #endif
  #endif
     call trace_exit( 'fsum' )
     return
***************
*** 450,455 ****
--- 472,483 ----
  #      define MPI_DOUBLE_PRECISION MPI_REAL8
  #    endif
     
+ #ifdef SGI_SMA
+ INCLUDE "mpp/shmem.fh"
+   _REAL_ xr(*)
+   pointer (xr_p,xr)
+   integer other, k, ibeg, iend
+ #endif
  #  ifndef noBTREE
     integer other,ncyclesm1,k,bit,cs,cr,ns,nr
     integer ist(mpi_status_size),ireq
***************
*** 457,465 ****
  #endif
     
     call trace_enter( 'xdist' )
  #  ifndef noBTREE
     
-    if (numtasks <= 1) return
     ncyclesm1 = logtwo(numtasks) - 1
     bit=1
     do k = 0,ncyclesm1
--- 485,493 ----
  #endif
     
     call trace_enter( 'xdist' )
+    if (numtasks <= 1) return
  #  ifndef noBTREE
     
     ncyclesm1 = logtwo(numtasks) - 1
     bit=1
     do k = 0,ncyclesm1
***************
*** 495,511 ****
        bit = ishft(bit,1)
     end do
  #  else
!    
     !       --- Assume an "in-place" allgatherv works: this seems(?) to
     !           be true everywhere....
     
     call trace_mpi('mpi_allgatherv', &
!          rcvcnt3(mytaskid)+rcvcnt3,'MPI_DOUBLE_PRECISION', iparpt3)
     call mpi_allgatherv( &
!          x(iparpt3(mytaskid)+1),rcvcnt3(mytaskid), &
           MPI_DOUBLE_PRECISION,x,rcvcnt3,iparpt3, &
           MPI_DOUBLE_PRECISION,commsander, ierr)
  #  endif
     call trace_exit( 'xdist' )
     return
  end subroutine xdist 
--- 523,553 ----
        bit = ishft(bit,1)
     end do
  #  else
! #ifdef SGI_SMA
!    !   Implement a "simple" ring algoritm: 0->1, 0->2, 0->3.. 0->n
!    !                                       1->2, 1->3, 1->4.. 1->0
!    !   Need only a barrier at the end
!    do k=1,numtasks-1
!      other = mod(mytaskid+k,numtasks)
!      xr_p = SHMEM_PTR(x(1),other)
!      ibeg = iparpt3(mytaskid)
!      iend = ibeg + rcvcnt3(mytaskid)
!      ibeg = ibeg + 1
!      xr(ibeg:iend) = x(ibeg:iend)
!    enddo
!    call mpi_barrier(commsander,ierr)
! #else   
     !       --- Assume an "in-place" allgatherv works: this seems(?) to
     !           be true everywhere....
     
     call trace_mpi('mpi_allgatherv', &
!          rcvcnt3(mytaskid),'MPI_DOUBLE_PRECISION', iparpt3(mytaskid))
     call mpi_allgatherv( &
!          MPI_IN_PLACE,rcvcnt3(mytaskid), &
           MPI_DOUBLE_PRECISION,x,rcvcnt3,iparpt3, &
           MPI_DOUBLE_PRECISION,commsander, ierr)
  #  endif
+ #  endif
     call trace_exit( 'xdist' )
     return
  end subroutine xdist 

------------------------------------------------------------------------------

*** sander.f	2004-09-14 13:51:23.000000000 -0500
--- sander.f	2005-07-28 13:41:14.000000000 -0500
***************
*** 588,594 ****
--- 588,600 ----
     _REAL_  dummy
  #endif
  
+ #ifndef SGI_SMA
     _REAL_,  dimension(:), allocatable :: x, r_stack
+ #else
+    _REAL_,  dimension(:), allocatable :: r_stack
+    _REAL_  x(*)
+    pointer (x_p,x)
+ #endif
     integer, dimension(:), allocatable :: ix, ipairs, i_stack
     character(len=4), dimension(:), allocatable :: ih
     
***************
*** 680,688 ****
--- 686,717 ----
  
        !     --- dynamic memory allocation:
  
+ #ifdef SGI_SMA
+       !  if we are using symmetric heap allocation:
+       !    - end the master code
+       !    - broadcast the sizes (just send all)
+       !    - do a barrier
+       !    - shpalloc x (for all)
+       !    - resume master code
+ 
+       allocate( ix(lasti), ipairs(lastpr), ih(lasth), stat = ier )
+       REQUIRE( ier == 0 )
+ 
+       endif ! master
+ 
+    call mpi_bcast(natom,BC_MEMORY,mpi_integer,0,commsander,ierr)
+    call mpi_barrier(commsander,ierr)
+ 
+       CALL SHPALLOC(x_p, lastr*2, ier, 1 )
+       REQUIRE( ier == 0 )
+ 
+    if (master) then
+ #else
        allocate( x(lastr), ix(lasti), ipairs(lastpr), ih(lasth), stat = ier )
        REQUIRE( ier == 0 )
  
+ #endif
+ 
        if( igb == 0 ) then
  #ifndef PSANDER
           lastrst = sizffwrk + siz_q + natom*(4 + 6*order)
***************
*** 1117,1125 ****
     !     ---allocate memory on the non-master nodes:
  
     if( .not.master ) then
        allocate( x(1:lastr), stat = ier )
        REQUIRE( ier == 0 )
! 
        allocate( ix(1:lasti), stat = ier )
        REQUIRE( ier == 0 )
  
--- 1146,1155 ----
     !     ---allocate memory on the non-master nodes:
  
     if( .not.master ) then
+ #ifndef SGI_SMA
        allocate( x(1:lastr), stat = ier )
        REQUIRE( ier == 0 )
! #endif
        allocate( ix(1:lasti), stat = ier )
        REQUIRE( ier == 0 )
  
***************
*** 1387,1394 ****
--- 1417,1429 ----
     REQUIRE( ier == 0 )
     deallocate( ix, stat = ier )
     REQUIRE( ier == 0 )
+ #ifndef SGI_SMA
     deallocate( x, stat = ier )
     REQUIRE( ier == 0 )
+ #else
+    CALL SHPDEALLC(x_p, ier, 1 )
+    REQUIRE( ier == 0 )
+ #endif
  
     return