!   The distribution of heat over time is described by the so called heat equation:
!   df/dt = alpha*(d2f/dx2 + d2f/dy2)   for a function f(x,y,t).
!   This formula may be discretized in a regular grid G(:,:) by computing the new
!      value G1(x,y) in a point (x,y) at each time step as:
!      G1(x,y) = G(x,y) + Ax * ( G(x+1,y) + G(x-1,y) - 2.0 * G(x,y))
! 		+ Ay * ( G(x,y+1) + G(x,y-1) - 2.0 * G(x,y))
!   For each point in the grid the next value depends on the values of the four up
!      and down, left and right adjacent points.
!
!   This solution is based on a source code written by 
!      Blaise Barney, D. Turner, George L. Gusciora
! ****************************************************************************
! Explanation of constants and variables
!   Nrows                          =  x dimension of problem grid 
!   Ncols                          =  y dimension of problem grid
!   Tsteps                         =  number of time Tsteps
!   MAXWORKER                      =  maximum number of workers tasks
!   MINWORKER                      =  minimum number of workers tasks
!   IniDataTag, LTAG, RTAG, DONE   =  message tags
!   NoNeighbor                     =  indicates no neighbor
!   Ax, Ay                         =  used in heat equation
!   Grid                           =  array for grids
!   my_rank,Root                   =  my_ranks 
!   numprocs                       =  number of processes
!   dest, source                   =  to - from for message send-receive
!   left,right                     =  neighbor tasks
! ---------------------------------------------------------------------------
MODULE heat2D_data
      IMPLICIT NONE
      integer, PARAMETER :: Nrows=200,Ncols=200
      integer :: Nrow0, Nrow1, Ncol0, Ncol1, sp

      real, dimension(:,:,:), allocatable :: Grid

      real*4  Ax,Ay
      parameter(Ax=0.1)
      parameter(Ay=0.1)
      
CONTAINS
	
! ***************************************************************************
      subroutine compute (y0, y1, g0, g1)
! ***************************************************************************
      integer, intent(in) :: y0, y1
      real(4), dimension(Nrows,Ncol0:Ncol1), intent(in) :: g0
      real(4), dimension(Nrows,Ncol0:Ncol1), intent(out) :: g1

      integer :: i, j

      do j=y0, y1
        do i=2, Nrows-1
            g1(i,j) = g0(i,j) &
     &       + Ax * ( g0(i+1,j) + g0(i-1,j) - 2.0 * g0(i,j)) &
     &       + Ay * ( g0(i,j+1) + g0(i,j-1) - 2.0 * g0(i,j))
        end do
      end do
   end subroutine compute

! ***************************************************************************
      subroutine computel (nr, nc, g1, g2)
! ***************************************************************************
      integer, intent(in) :: nr, nc
      real(4), dimension(nr,nc), intent(in) :: g1
      real(4), dimension(nr,nc), intent(out) :: g2

      integer :: i, j

      do j=2, nc-1
        do i=2, Nrows-1
            g2(i,j) = g1(i,j) &
     &       + Ax * ( g1(i+1,j) + g1(i-1,j) - 2.0 * g1(i,j)) &
     &       + Ay * ( g1(i,j+1) + g1(i,j-1) - 2.0 * g1(i,j))
        end do
      end do
   end subroutine computel


! ****************************************************************************
      subroutine inidat
! ****************************************************************************
      integer :: i,j

!     Initialize data
      Grid = 0.0

      do i=0,Nrows-1
        do j=1,2
          Grid(i+1,j+1,1) = float(i*(Nrows-i-1) + j*(Ncols-j-1))
        end do
      end do
      end subroutine inidat

subroutine RealData2pgm(s1,s2,rdata,name)
!  Simple subroutine to dump real data in a PGM format
   implicit NONE
   INTEGER, INTENT(IN) :: s1, s2
   REAL, DIMENSION(s1,s2), intent(in) :: rdata
   character(*), intent(in) :: name
!
   integer :: i, j, ouni
   integer :: vp, vs
   real :: rmin, rmax
   character(80) :: fname
!
!  Write on unit 700 with  PGM format
   ouni = 700
   fname = TRIM(name)//".pgm"
   open(ouni,file=trim(fname),status="replace",iostat=vs)
   if ( vs /= 0 ) then
      print*,"!!!! Error write access to file ",trim(fname)
   end if
!  Magic code
   write(ouni,"(a)") "P2"
!  Dimensions
   write(ouni,*) s1, s2
!  Maximum value
   write(ouni,"(a)") "255"
!  Values from 0 to 255
   rmin = MINVAL(rdata); rmax = MAXVAL(rdata)
!
   vs = 0
   do i = 1, s1
      do j = 1, s2
         vp = INT ( (rdata(i,j) - rmin) * 255.0 / (rmax - rmin) )
         vs = vs + 1
         write(ouni,"(i4)",advance="no") vp
         if (vs >= 10 ) then
            write(ouni,"(a)") " "
            vs = 0
         end if
      end do
      write(ouni,"(a)") " "
      vs = 0
   end do
   close(ouni)
   return
end subroutine RealData2pgm


SUBROUTINE estremi(length,start,end,intervals,n)
   IMPLICIT NONE
   INTEGER, INTENT(IN) :: length,intervals,n
   INTEGER, INTENT(OUT) :: start,end

   INTEGER :: delta

   delta = ( length + (intervals - 1) ) / intervals
   start = delta * n + 1; end = delta * ( n + 1)
   if ( end > length ) end = length
   
END SUBROUTINE estremi 


END MODULE heat2D_data

program heat2D
      USE heat2D_data
      IMPLICIT NONE
      include "mpif.h"

      integer Tsteps,MAXWORKER,MINWORKER,IniDataTag,LTAG,RTAG,DONE, &
     &        NoNeighbor,Root, lborder, rborder
      parameter(Tsteps=1000)
      parameter(MAXWORKER=Ncols)
      parameter(MINWORKER=2)
      parameter(IniDataTag=100)
      parameter(LTAG=200)
      parameter(RTAG=300)
      parameter(DONE=400)
      parameter(NoNeighbor=-1)
      parameter(Root=0)
      integer my_rank,numprocs,avecol,cols,offset,extra, &
     &        p, dest,source,left,right,msgtype, &
     &        rc,start,end,i,ix,iy,it,ierr, st
      CHARACTER(80) :: fname
      integer :: status(MPI_STATUS_SIZE)
      CHARACTER(MPI_MAX_PROCESSOR_NAME) :: server
      integer :: ls

      real(8) :: time1, time2

       call MPI_INIT( ierr )
       call MPI_COMM_RANK( MPI_COMM_WORLD, my_rank, ierr )
       call MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
       call MPI_Get_processor_name(server,ls,ierr)

       if (my_rank .eq. Root) then
!         Check if numprocs are too many
          if ( numprocs > MAXWORKER ) then
             WRITE(*,*) " NUMPROCS must not exceed Ncols = ",Ncols
             WRITE(*,*) " Program terminates "
             call MPI_ABORT(MPI_COMM_WORLD, rc, ierr)
             stop
          end if
          if ( numprocs < MINWORKER ) then
             WRITE(*,*) " NUMPROCS must be > 1 "
             WRITE(*,*) " Program terminates "
             call MPI_ABORT(MPI_COMM_WORLD, rc, ierr)
             stop
          end if

          WRITE(*,*) " Start computing on server ",TRIM(server)," with ",numprocs," processes"
!         Initialize grid - root allocates entire Grid
          allocate(Grid(Nrows,0:Ncols+1,2), STAT=st)
          if ( st /= 0 ) then
             WRITE(*,*) " Process ",my_rank," Error allocating Grid(",Nrows,",",Ncols,",2)"
             WRITE(*,*) " Program terminates "
             call MPI_ABORT(MPI_COMM_WORLD, rc, ierr)
             stop
          end if

          call inidat
!         Dump initial configuration
          call RealData2pgm(Nrows,Ncols,Grid(1,1,1),"tmap0000")

          time1 = MPI_Wtime()

!         Send data to each process
          do p = 1, numprocs-1
!            Calculate portion of columns 
             CALL estremi(Ncols,Ncol0,Ncol1,numprocs,p)
             Nrow0 = 1; Nrow1 = Nrows
             call MPI_SEND( Grid(Nrow0,Ncol0,1), (Ncol1-Ncol0+1) * Nrows, MPI_REAL, p, &
         &                  IniDataTag, MPI_COMM_WORLD, ierr )
          end do
       end if

!      Calculate local columns 
       CALL estremi(Ncols,Ncol0,Ncol1,numprocs,my_rank)
       Nrow0 = 1; Nrow1 = Nrows
       lborder = 1; rborder = 1    !  Left and right borders
       if ( (Ncol0-lborder) < 1 ) then
          lborder = 0
       end if
       if ( Ncol1+rborder > Ncols ) then
          rborder = 0
       end if

       if ( My_rank /= Root ) then
          allocate(Grid(Nrow0:Nrow1,Ncol0-1:Ncol1+1,2), STAT=st)
          if ( st /= 0 ) then
             WRITE(*,*) " Process ",my_rank," Error allocating Grid(", &
     &             Nrow0,":",Nrow1,",",Ncol0-1,":",Ncol1+1,",2) "
             WRITE(*,*) " Program terminates "
             call MPI_ABORT(MPI_COMM_WORLD, rc, ierr)
             stop
          end if
          Grid = 0.0
!         Receive data
          call MPI_RECV( Grid(Nrow0,Ncol0,1),(Ncol1-Ncol0+1)*Nrows,MPI_REAL, Root, &
         &              IniDataTag, MPI_COMM_WORLD, status, ierr )
          if ( ierr /= 0 ) then
             WRITE(*,*) " Process ",my_rank," Error receiving Grid(", &
     &             Nrow0,":",Nrow1,",",Ncol0,":",Ncol1,",1) from process ",Root
             WRITE(*,*) " Program terminates "
             call MPI_ABORT(MPI_COMM_WORLD, rc, ierr)
             stop
          end if
       end if
!
!        Compute iterations: before calculating communicate borders to neighbors.
!        It is enough to exchange only vertical borders
          if ( my_rank == Root ) then
             left = NoNeighbor
          else
             left = my_rank - 1
          end if
          if ( my_rank >= numprocs-1 ) then
             right = NoNeighbor
          else
             right = my_rank + 1
          end if
          sp=1
          do it=1, Tsteps
            ! It's no problem to synchronize
            CALL MPI_BARRIER(MPI_COMM_WORLD, ierr )   
            if (left .ne. NoNeighbor) then
              call MPI_SEND( Grid(Nrow0,Ncol0,sp), Nrows, MPI_REAL, left, &
         &                   RTAG, MPI_COMM_WORLD, ierr )

              source = left
              msgtype = LTAG
              call MPI_RECV( Grid(1,Ncol0-lborder,sp),Nrows,MPI_REAL, source, &
         &                   msgtype, MPI_COMM_WORLD, status, ierr )
            end if
    
            if (right .ne. NoNeighbor) then
              call MPI_SEND(Grid(1,Ncol1,sp),Nrows,MPI_REAL, &
         &                  right,LTAG,MPI_COMM_WORLD,ierr)

              source = right
              msgtype = RTAG
              call MPI_RECV(Grid(1,Ncol1+rborder,sp),Nrows,MPI_REAL,source, &
         &                   msgtype, MPI_COMM_WORLD, status, ierr )
            end if
         
!          Calculate  grid points
           call computel(Nrows,(Ncol1-Ncol0+3),Grid(Nrow0,Ncol0-1,sp),Grid(Nrow0,Ncol0-1,3-sp))
           sp=3-sp

         end do
             
!!        Root gathers results from all processes
         if ( my_rank == Root ) then
            do p = 1, numprocs-1
!              Calculate portion of columns 
               CALL estremi(Ncols,Ncol0,Ncol1,numprocs,p)
               Nrow0 = 1; Nrow1 = Nrows
               source = p
               msgtype = DONE
               call MPI_RECV( Grid(Nrow0,Ncol0,sp), (Ncol1-Ncol0+1) * Nrows, MPI_REAL, &
         &                 source,msgtype,MPI_COMM_WORLD,status,ierr)
            end do

            time2 = MPI_Wtime()

            WRITE(*,*) " Elapsed time with ",numprocs," processes = ",(time2-time1)
    
!           Print and show results 
            it = Tsteps
            WRITE(fname,"(A,I4.4)") "tmap",it
            call RealData2pgm(Nrows,Ncols,Grid(1,1,sp),fname)

         else     ! my_rank /= Root
              msgtype = DONE
              call MPI_SEND(Grid(Nrow0,Ncol0,sp),(Ncol1-Ncol0+1) * Nrows ,MPI_REAL, &
         &                  Root,msgtype,MPI_COMM_WORLD,ierr)
         end if

         CALL MPI_BARRIER(MPI_COMM_WORLD, ierr )   
         call MPI_FINALIZE(ierr)
         stop
 
end program heat2D