#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>

/* =================================================================================
    The distribution of heat over time is described by the so called heat equation:
    df/dt = alpha*(d2f/dx2 + d2f/dy2)   for a function f(x,y,t).
    This formula may be discretized in a regular grid G(:,:) by computing the new
       value G1(x,y) in a point (x,y) at each time step as:
       G1(x,y) = G(x,y) + Ax * ( G(x+1,y) + G(x-1,y) - 2.0 * G(x,y))
  		+ Ay * ( G(x,y+1) + G(x,y-1) - 2.0 * G(x,y))
    For each point in the grid the next value depends on the values of the four up
       and down, left and right adjacent points.
 
    This solution is based on a source code written by 
       Blaise Barney, D. Turner, George L. Gusciora
  ****************************************************************************
  Explanation of constants and variables
    Nrows                          =  x dimension of problem grid 
    Ncols                          =  y dimension of problem grid
    Tsteps                         =  number of time Tsteps
    MAXWORKER                      =  maximum number of workers tasks
    MINWORKER                      =  minimum number of workers tasks
    IniDataTag, LTAG, RTAG, DONE   =  message tags
    NoNeighbor                     =  indicates no neighbor
    Ax, Ay                         =  used in heat equation
    Grid                           =  array for grids
    my_rank,Root                   =  Process ranks 
    numprocs                       =  number of processes
  ---------------------------------------------------------------------------
*/
#define Nrows 200
#define Ncols 200

int Nrow0, Nrow1, Ncol0, Ncol1, l_Ncol0, l_Ncol1, g_Ncol0, g_Ncol1, sp;
int my_Ncols;
float *Grid;

#define Ax 0.1
#define Ay 0.1

// to address C arrays in Fortran-like column order
#define index2D(a,b,LD1)  a + (b*LD1)
#define index3D(a,b,c,LD1,LD2)  a + (b*LD1) + (c*LD1*LD2)
      
void computel(int nr, int nc, float *g1, float *g2)
{
/*
    Compute next step using matrices g1, g2 of dimension (nr,nc)
*/

      int i, j;

      for ( j = 1; j < nc-1; j++ ) {
        for ( i = 1; i < Nrows-1; i++ ) {
            g2[index2D(i,j,nr)] = g1[index2D(i,j,nr)] 
             + Ax * ( g1[index2D((i+1),j,nr)] + g1[index2D((i-1),j,nr)] - 2.0 * g1[index2D(i,j,nr)]) 
             + Ay * ( g1[index2D(i,(j+1),nr)] + g1[index2D(i,(j-1),nr)] - 2.0 * g1[index2D(i,j,nr)]);
        }
      }
}


void inidat()
{
/*
      Initialize Grid data
*/
      int i,j;

      for ( i = 0; i < Nrows*Ncols; i++ ) Grid[i] = 0.0;

      for ( i = 0; i < Nrows; i++ ) {
        for ( j = 1; j <= 2; j++ ) {
          Grid[index3D(i,j,0,Nrows,Ncols)] = (float)i*(float)(Nrows-i-1) + (float)j*(float)(Ncols-j-1);
        }
      }
}


int MINVAL(int s, float* a)
{
   int e;
   float v;

   v = a[0];

   for ( e = 0; e < s; e++ )
   {
       if ( v > a[e] ) v = a[e];
   
   }
   
   return(v);
}

int MAXVAL(int s, float* a)
{
   int e;
   float v;

   v = a[0];

   for ( e = 0; e < s; e++ )
   {
       if ( v < a[e] ) v = a[e];
   
   }
   
   return(v);
}
   
   
void Terminate(int p, int rc)
{
   int ierr;
   
   fprintf(stdout,"Process %d  terminates\n",p);
   if ( rc != 0 ) 
      ierr = MPI_Abort(MPI_COMM_WORLD,rc);
   else
      ierr = MPI_Finalize();
   
   return;
}

void RealData2pgm(int s1, int s2, float *rdata, char* name) 
{
   /*
      Simple subroutine to dump integer data in a PGM format
   */
   int i, j;
   FILE* ouni;
   int  vp, vs;
   float  rmin, rmax;
   char  fname[80];
   /*
      Write on unit 700 with  PGM format
   */
   strcpy(fname,name);
   strcat(fname,".pgm\0");
   ouni = fopen(fname,"w");
   if ( ! ouni ) {
      fprintf(stderr,"!!!! Error write access to file %s\n",fname);
   }
   /*  Magic code */
   fprintf(ouni,"P2\n");
   /*  Dimensions */
   fprintf(ouni,"%d %d\n",s1, s2);
   /*  Maximum value */
   fprintf(ouni,"255\n");
   /*  Values from 0 to 255 */
   rmin = MINVAL(s1*s2,rdata); rmax = MAXVAL(s1*s2,rdata);
   vs = 0;
   for ( i = 0; i < s1; i++ ) {
      for ( j = 0; j < s2; j++ ) {
         vp = (int) ( (rdata[i+(j*s1)] - rmin) * 255.0 / (rmax - rmin) );
         vs++;
         fprintf(ouni,"%4d ", vp);
         if ( vs >= 10 ) {
            fprintf(ouni," \n");
            vs = 0;
         }
      }
      fprintf(ouni," ");
      vs = 0;
   }
   fclose(ouni);
   return;
}


int ProcessBand(int length, int* start, int* end, int intervals, int n)
{
    int delta;

    delta = ( length + (intervals - 1) ) / intervals;
    *start = delta * n; *end = delta * ( n + 1) - 1;
    if ( *end > length-1 ) *end = length-1;
   
   return(0);
}


main( int argc, char *argv[] )   /* Program Heat2D */
{

      int Tsteps=1000, MAXWORKER=Ncols, MINWORKER=1, IniDataTag=100, LTAG=200, RTAG=300, DONE=400, 
             NoNeighbor=-1, Root=0, lborder, rborder;

      int my_rank,numprocs,avecol,cols,offset,extra, 
             p, dest,source,left,right,msgtype, 
             rc,start,end,i,ix,iy,it,ierr, st;

      char fname[80];
      MPI_Status status;
      char server[MPI_MAX_PROCESSOR_NAME];
      int l1, l2, ls;

      double time1, time2;

       ierr = MPI_Init(&argc,&argv);
       ierr = MPI_Comm_rank(MPI_COMM_WORLD,&my_rank);
       ierr = MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
       ierr = MPI_Get_processor_name(server,&ls);
       fprintf(stdout," Activated process %d of %d on server %s\n", my_rank,numprocs,server);

       if (my_rank == Root) {
//          Check if numprocs are too many
          if ( numprocs > Ncols ) {
             fprintf(stderr," NUMPROCS must not exceed Ncols = %d\n",Ncols);
             ierr = -1;
             Terminate(my_rank,ierr);
             return(-1);
          }

//        Initialize grid - root allocates entire Grid
          Grid = (float*)malloc(sizeof((float)1.0)*Nrows*Ncols*2);
          if ( Grid == NULL ) {
             fprintf(stderr," Process %d - Error allocating Grid(%d,%d,2)",my_rank,Nrows,Ncols);
             ierr = -1;
             Terminate(my_rank,ierr);
             return(-1);
          }

          inidat();
//          Dump initial configuration
          RealData2pgm(Nrows,Ncols,Grid,"tmap0000");

          time1 = MPI_Wtime();

//          Send data to each process
          for ( p = 1; p < numprocs; p++ ) {
//             Calculate portion of columns 
             ProcessBand(Ncols,&Ncol0,&Ncol1,numprocs,p);
             Nrow0 = 0; Nrow1 = Nrows-1;
             ierr = MPI_Send(&Grid[index3D(Nrow0,Ncol0,0,Nrows,Ncols)],
                    (Ncol1-Ncol0+1) * Nrows, MPI_FLOAT, p, 
                           IniDataTag, MPI_COMM_WORLD);
             //  WRITE(*,*) " Grid(",Nrow0,",",Ncol0,", 1) sent from process ",my_rank," to process ",p
          }
       }

//       Calculate local columns 
       ProcessBand(Ncols,&g_Ncol0,&g_Ncol1,numprocs,my_rank);
       Nrow0 = 0; Nrow1 = Nrows-1;
       //  WRITE(*,*) " Process ",my_rank," Grid sizes: ",Nrows,", ",(Ncol1-Ncol0+1),"; Time steps=",Tsteps
       lborder = 1; rborder = 1;    //   Left and right borders
       if ( (g_Ncol0-lborder) < 0 ) {
          lborder = 0;
       }
       if ( g_Ncol1+rborder > Ncols-1 ) {
          rborder = 0;
       }
       l_Ncol0 = lborder; l_Ncol1 = l_Ncol0 + g_Ncol1 - g_Ncol0;
       //  WRITE(*,*) " Process ",my_rank," borders are : ",lborder, rborder

       if ( my_rank != Root ) {
          my_Ncols = (l_Ncol1-l_Ncol0+3);
       } else {
          my_Ncols = Ncols;
       }
       if ( my_rank != Root ) {
          Grid = (float*)malloc(sizeof((float)1.0)*(Nrow1-Nrow0+1)*my_Ncols*2);
          if ( Grid == NULL ) {
             fprintf(stderr," Process %d - Error allocating Grid(%d,%d,2)",my_rank,(Nrow1-Nrow0+1),(l_Ncol1-l_Ncol0+3));
             ierr = -1;
             Terminate(my_rank,ierr);
             return(-1);
          }
          for ( i = 0; i < (Nrow1-Nrow0+1)*(l_Ncol1-l_Ncol0+3)*2; i++ ) Grid[i] = 0.0;
//          Receive data
          ierr = MPI_Recv(&Grid[index3D(0,l_Ncol0,0,Nrows,my_Ncols)],(l_Ncol1-l_Ncol0+1)*Nrows,MPI_FLOAT, Root, 
                       IniDataTag, MPI_COMM_WORLD, &status );
          if ( ierr != 0 ) {
             fprintf(stderr," Process %d - Error receiving Grid[0,1,0] from process %d\n",my_rank,Root);
             ierr = -1;
             Terminate(my_rank,ierr);
             return(-1);
          }
       }
// 
//         Compute iterations: before calculating communicate borders to neighbors.
//         It is enough to exchange only vertical borders
          if ( my_rank == Root ) {
             left = NoNeighbor;
          } else {
             left = my_rank - 1;
          }
          if ( my_rank >= numprocs-1 ) {
             right = NoNeighbor;
          } else {
             right = my_rank + 1;
          }
          sp=0;
          for ( it = 1; it <= Tsteps; it++ ) {
            //  It's no problem to synchronize
            ierr = MPI_Barrier(MPI_COMM_WORLD);
            if (left != NoNeighbor) {
              ierr = MPI_Send(&Grid[index3D(Nrow0,l_Ncol0,sp,Nrows,my_Ncols)], Nrows, MPI_FLOAT, left, 
                            RTAG, MPI_COMM_WORLD);

              source = left;
              msgtype = LTAG;
              ierr =  MPI_Recv(&Grid[index3D(0,(l_Ncol0-lborder),sp,Nrows,my_Ncols)],Nrows,MPI_FLOAT, source, 
                            msgtype, MPI_COMM_WORLD, &status);
            }
    
            if (right != NoNeighbor) {
              ierr =  MPI_Send(&Grid[index3D(0,l_Ncol1,sp,Nrows,my_Ncols)],Nrows,MPI_FLOAT, 
                           right,LTAG,MPI_COMM_WORLD);

              source = right;
              msgtype = RTAG;
              ierr =  MPI_Recv(&Grid[index3D(0,(l_Ncol1+rborder),sp,Nrows,my_Ncols)],Nrows,MPI_FLOAT,source, 
                            msgtype, MPI_COMM_WORLD, &status);
            }
         
//           Calculate  grid points
           l1 = index3D(Nrow0,(l_Ncol0-lborder),sp,Nrows,my_Ncols);
           l2 = index3D(Nrow0,(l_Ncol0-lborder),(1-sp),Nrows,my_Ncols);
           computel(Nrows,my_Ncols,
                    &Grid[index3D(Nrow0,(l_Ncol0-lborder),sp,Nrows,my_Ncols)],
                    &Grid[index3D(Nrow0,(l_Ncol0-lborder),(1-sp),Nrows,my_Ncols)]);
           sp=1-sp;

         }
             
//       Root gathers results from all processes
         if ( my_rank == Root ) {
            for ( p = 1; p < numprocs; p++ ) {
//               Calculate portion of columns 
               ProcessBand(Ncols,&Ncol0,&Ncol1,numprocs,p);
               Nrow0 = 0; Nrow1 = Nrows-1;
               source = p;
               msgtype = DONE;
               ierr = MPI_Recv(&Grid[index3D(Nrow0,Ncol0,sp,Nrows,my_Ncols)], (Ncol1-Ncol0+1) * Nrows, MPI_FLOAT, 
                          source,msgtype,MPI_COMM_WORLD,&status);
               //  WRITE(*,*) " Root receives Grid(",Nrow0,",",Ncol0,", 1) from process ",p
            }

            time2 = MPI_Wtime();

            fprintf(stdout," Elapsed time with %d processes = %lf\n",numprocs,(time2-time1));
    
//            Print and show results 
            it = Tsteps;
            sprintf(fname,"tmap%4.4d",it);
            RealData2pgm(Nrows,Ncols,&Grid[index3D(0,0,sp,Nrows,my_Ncols)],fname);

         } else {     //  my_rank != Root
              msgtype = DONE;
              ierr = MPI_Send(&Grid[index3D(Nrow0,l_Ncol0,sp,Nrows,my_Ncols)],(l_Ncol1-l_Ncol0+1) * Nrows ,MPI_FLOAT, 
                           Root,msgtype,MPI_COMM_WORLD);
         }

         ierr = MPI_Barrier(MPI_COMM_WORLD);
             ierr = 0;
             Terminate(my_rank,ierr);
             return(0);
         return(0);
 
}