/* 
 * This program demonstrates the use of MPI_Alltoall when
 * transposing a square matrix.
 * For simplicity, the number of processes is 4 and the dimension
 * of the matrix is fixed to NROW
 */

#include <stdio.h>
#include <mpi.h>
#define NROW 8192
#define NP 64
#define NBLK NROW/NP
#define ONEML 99999999999
void
trans (double *a, int n)
/* transpose square matrix a, dimension nxn
 * Consider this as a black box for the MPI course
 */

{
  int i, j;
  int ij, ji, l;
  double tmp;
  ij = 0;
  l = -1;
  for (i = 0; i < n; i++)
    {
      l += n + 1;
      ji = l;
      ij += i + 1;
      for (j = i+1; j < n; j++)
	{
	  tmp = a[ij];
	  a[ij] = a[ji];
	  a[ji] = tmp;
	  ij++;
	  ji += n;
	}
    }
}

#define index2D(a,b,LD2)  b + (a*LD2)

int main (int argc, char *argv[])
{
  double *a;
  double *b;

  int i, j, nprocs, rank,provided,itemp;
  double r0;

  MPI_Init (&argc, &argv);
  MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);

  a = malloc(NROW*NBLK*sizeof((double)1.0));
  b = malloc(NROW*NBLK*sizeof((double)1.0));
  
  if(rank == 0)
  {
    printf("Transposing a %d x %d matrix, divided among %d processors\n",NROW,NROW,NP);
  }
  if (nprocs != NP)
    {
      if (rank == 0)
	     printf ("Error, number of processes must be %d\n",NP);
      MPI_Finalize ();
      return 1;
    }
  r0 = MPI_Wtime();
  for (i = 0; i < NROW; i++)
    for (j = 0; j < NBLK; j++)
      a[index2D(i,j,NBLK)] = ONEML * i + j + NBLK * rank; /* give every element a unique value */
  r0 = MPI_Wtime()-r0;
  if(rank == 0)
  {
    printf("Building matrix time (sec) %f\n",r0);
  }
  /* do the MPI part of the transpose */

  /* Tricky here is the number of items to send and receive. 
   * Not NROWxNBLK as one may guess, but the amount to send to one process
   * and the amount to receive from any process */

  r0 = MPI_Wtime();
  /* MPI_Alltoall does not a transpose of the data received, we have to
   * do this ourself: 
   * - use function MPI_Alltoall to redistribute data:
   */
		---

  r0 = MPI_Wtime()-r0;
  if(rank == 0)
  {
    printf("MPI_Alltoall time (sec) %f\n",r0);
  }

  /* transpose NP square matrices, order NBLKxNBLK: */
  r0 = MPI_Wtime();
  for (i = 0; i < NP; i++)
      trans (&b[index2D(i*NBLK,0,NBLK)], NBLK);
  r0 = MPI_Wtime()-r0;
  if(rank == 0)
  {
    printf("Transpose block matrices time (sec) %f\n",r0);
  }

   /* now check the result */

  for (i = 0; i < NROW; i++)
    for (j = 0; j < NBLK; j++)
      {
	     if (b[index2D(i,j,NBLK)] != ONEML * (j + NBLK * rank) + i )
	       {
	          printf ("process %d found b[%d][%d] = %f, but %f was expected\n",
		          rank, i, j, b[index2D(i,j,NBLK)], (double) (ONEML * (j + NBLK * rank) + i));
	          MPI_Abort (MPI_COMM_WORLD,1);
	        return 1;
	       }
      }
  if (rank == 0)
    printf ("Transpose seems ok\n");
  MPI_Finalize ();
  return 0;
}