program laplace
   use mpi
#ifdef _OPENMP
   use omp_lib
#endif
   implicit none
   integer, parameter                 :: dp=kind(1.d0)
   integer                            :: n, maxIter, i, j, iter = 0
   real (dp), dimension(:,:), pointer :: T, Tnew, Tmp=>null()
   real (dp)                          :: tol, var = 1.d0, top = 100.d0
   integer                            :: ierr
   real(kind(1.d0)) :: startTime, endTime
! MPI
   integer :: rank, nprocs, required, provided, tag=201, status(MPI_STATUS_SIZE)
   integer :: cartesianComm, cart_rank, cart_dims(2), cart_coord(2)
   logical :: cart_reorder, cart_periods(2)
   integer :: mymsize_x, mymsize_y, res, mystart_x, mystart_y
   real (kind(1.d0)), dimension(:), allocatable :: buffer_s_rl, buffer_s_lr
   real (kind(1.d0)), dimension(:), allocatable :: buffer_s_tb,buffer_s_bt
   real (kind(1.d0)), dimension(:), allocatable :: buffer_r_rl,buffer_r_lr
   real (kind(1.d0)), dimension(:), allocatable :: buffer_r_tb,buffer_r_bt
   integer :: source_rl, dest_rl, source_lr, dest_lr
   integer :: source_tb, dest_tb, source_bt, dest_bt
   real (kind(1.d0)) :: myvar=1.0, mastervar

   required = MPI_THREAD_FUNNELED
   call MPI_Init_thread(required, provided, ierr)
   call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
   call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) 
       if(provided /= required) then
         if(rank == 0) then
            print*,'Attenzione! Supporto thread disponibile diverso da quello richiesto'
            print*,'THREAD support required:  ',required
            print*,'THREAD support provided:  ',provided
         endif
      endif
   

   if ( rank == 0 ) then
      write(*,*) 'Enter mesh size, max iterations and tollerance:'
      read(*,*,iostat=ierr)  n, maxIter, tol
      if(ierr /= 0)  then 
        call MPI_FINALIZE(ierr)
        STOP 'Input error!'
   endif
   endif
   startTime = MPI_Wtime()

    ! broadcast input parameters (n,maxIter,tol) from process 0 to others
    call MPI_Bcast(n, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
    call MPI_Bcast(maxIter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
    call MPI_Bcast(tol, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)

    ! creating a new communicator with a 2D cartesian topology 
    cart_dims = 0 
    call mpi_dims_create(nprocs,2,cart_dims,ierr)
    cart_periods = .false.
    cart_reorder = .false.
    ! call MPI_Dims_Create(nprocs, 2, cart_dims, ierr)
    call MPI_Cart_Create(MPI_COMM_WORLD, 2, cart_dims, &
      cart_periods, cart_reorder, cartesianComm, ierr)
    call MPI_Comm_rank(cartesianComm, cart_rank, ierr)
    call MPI_Cart_coords(cartesianComm, cart_rank, 2, cart_coord, ierr)

    ! calculating problem size/process
    res = mod(n, cart_dims(1))
    mymsize_x = n / cart_dims(1)
    mystart_x = mymsize_x * cart_coord(1)
    if (cart_coord(1) > cart_dims(1) -1-res) then
       mymsize_x = mymsize_x + 1
       mystart_x = mystart_x + cart_coord(1) - (cart_dims(1) -res)
    endif
    res = mod(n, cart_dims(2))
    mymsize_y = n / cart_dims(2)
    mystart_y = mymsize_y * cart_coord(2)
    if (cart_coord(2) > cart_dims(2) -1-res) then
       mymsize_y = mymsize_y + 1
       mystart_y = mystart_y + cart_coord(2) - (cart_dims(2) -res)
    endif

    ! allocating memory for T , Tnew and buffers
    allocate (T(0:mymsize_x+1,0:mymsize_y+1), Tnew(0:mymsize_x+1,0:mymsize_y+1))
    allocate(buffer_s_rl(1:mymsize_y),buffer_s_lr(1:mymsize_y),buffer_s_tb(1:mymsize_x),buffer_s_bt(1:mymsize_x))
    allocate(buffer_r_rl(1:mymsize_y),buffer_r_lr(1:mymsize_y),buffer_r_tb(1:mymsize_x),buffer_r_bt(1:mymsize_x))
    

   T = 0.d0

    ! boundary conditions 
    if ( cart_coord(1) == cart_dims(1)-1)  then
      do j = 0,mymsize_y+1
         T(mymsize_x+1,j) = (mystart_y+j)*top/(n+1)
      enddo
    end if
    if ( cart_coord(2) == cart_dims(2)-1)  then
      do i = 0,mymsize_x+1
         T(i,mymsize_y+1) = (mystart_x+i)*top/(n+1)
      enddo
    end if
    if ( cart_coord(1) == 0)  then
      do j = 0,mymsize_y+1
         T(0,j) = 0.d0
      enddo
    end if
    if ( cart_coord(2) == 0)  then
      do i = 0,mymsize_x+1
         T(i,0) = 0.d0
      enddo
    end if
   
   Tnew = T

    ! calculating source/dest neighbours (right->left)
    call MPI_Cart_shift(cartesianComm, 0, -1, source_rl, dest_rl, ierr)
    ! calculating source/dest neighbours (left->right)
    call MPI_Cart_shift(cartesianComm, 0,  1, source_lr, dest_lr, ierr)
    ! calculating source/dest neighbours (top->bottom)
    call MPI_Cart_shift(cartesianComm, 1, -1, source_tb, dest_tb, ierr)
    ! calculating source/dest neighbours (bottom->top)
    call MPI_Cart_shift(cartesianComm, 1,  1, source_bt, dest_bt, ierr)


   do while (var > tol .and. iter <= maxIter)
      iter = iter + 1
      var = 0.d0       
      myvar = 0.d0
      mastervar = 0.d0
!$omp parallel
!$omp master
        buffer_s_rl(1:mymsize_y) = T(1,1:mymsize_y)
        !--- exchange boundary data with neighbours (right->left)
        call MPI_Sendrecv(buffer_s_rl, mymsize_y, MPI_DOUBLE_PRECISION, dest_rl, tag,         &
                          buffer_r_rl, mymsize_y, MPI_DOUBLE_PRECISION, source_rl, tag, &
                          cartesianComm, status, ierr)
        if(source_rl >= 0) T(mymsize_x+1,1:mymsize_y) = buffer_r_rl(1:mymsize_y)

        buffer_s_lr(1:mymsize_y) = T(mymsize_x,1:mymsize_y)
        !--- exchange boundary data with neighbours (left->right)
        call MPI_Sendrecv(buffer_s_lr, mymsize_y, MPI_DOUBLE_PRECISION, dest_lr, tag+1,  &
                          buffer_r_lr, mymsize_y, MPI_DOUBLE_PRECISION, source_lr, tag+1,        &
                          cartesianComm, status, ierr)
        if(source_lr >= 0) T(0,1:mymsize_y) = buffer_r_lr(1:mymsize_y)

        buffer_s_tb(1:mymsize_x) = T(1:mymsize_x,1)
        !--- exchange boundary data with neighbours (top->bottom)
        call MPI_Sendrecv(buffer_s_tb, mymsize_x, MPI_DOUBLE_PRECISION, dest_tb, tag+2,         &
                          buffer_r_tb, mymsize_x, MPI_DOUBLE_PRECISION, source_tb, tag+2, &
                          cartesianComm, status, ierr)
        if(source_tb >= 0) T(1:mymsize_x,mymsize_y+1) = buffer_r_tb(1:mymsize_x)

        !--- exchange boundary data with neighbours (bottom->top)
        buffer_s_bt(1:mymsize_x) = T(1:mymsize_x,mymsize_y)
        call MPI_Sendrecv(buffer_s_bt, mymsize_x, MPI_DOUBLE_PRECISION, dest_bt, tag+3,  &
                          buffer_r_bt, mymsize_x, MPI_DOUBLE_PRECISION, source_bt, tag+3,        &
                          cartesianComm, status, ierr)
        if(source_bt >= 0) T(1:mymsize_x,0) = buffer_r_bt(1:mymsize_x)

        do j = 1, mymsize_y, mymsize_y-1
          do i = 1, mymsize_x
            Tnew(i,j) = 0.25 * ( T(i-1,j) + T(i+1,j) + T(i,j-1) + T(i,j+1) )
            mastervar = max(mastervar, abs( Tnew(i,j) - T(i,j) ))

          enddo
        enddo

        do j = 1, mymsize_y
          do i = 1, mymsize_x, mymsize_x-1
            Tnew(i,j) = 0.25 * ( T(i-1,j) + T(i+1,j) + T(i,j-1) + T(i,j+1) )
            mastervar = max(mastervar, abs( Tnew(i,j) - T(i,j) ))

          enddo
        enddo
!$omp end master


!$omp do reduction(max:myvar) schedule(dynamic,125)
      do j = 2, mymsize_y - 1
         do i = 2, mymsize_x - 1
            Tnew(i,j) = 0.25d0 * ( T(i-1,j) + T(i+1,j) + T(i,j-1) + T(i,j+1) )
            myvar = max(myvar, abs( Tnew(i,j) - T(i,j) ))
         end do
      end do
!$omp end do
!$omp master
      myvar = max(myvar,mastervar)
!$omp end master
!$omp end parallel
      Tmp =>T; T =>Tnew; Tnew => Tmp; 

      call MPI_Allreduce(myvar, var, 1, MPI_DOUBLE_PRECISION, &
                    MPI_MAX, MPI_COMM_WORLD, ierr)

      if ( rank == 0 ) then
      if( mod(iter,100) == 0 ) write(*,"(a,i8,e12.4)") &
         ' iter, variation:', iter, var
      end if

   end do

endTime = MPI_Wtime()

if ( rank == 0 ) then
   write(*,'(/A,F10.4)') ' Elapsed time (s)     =', endTime - startTime
   write(*,*) 'Mesh size            =', n
   write(*,*) 'Stopped at iteration =', iter
   write(*,*) 'The maximum error    =', var
    
end if

   deallocate (T, Tnew)
   nullify(Tmp)

   call MPI_Finalize(ierr)

end program laplace