subroutine mymmul_block( m, n, k, a, lda, b, ldb, c, ldc )

   implicit none

   integer  :: m ! M  specifies  the number  of rows  of the  matrix A
   integer  :: n ! N  specifies the number  of columns of the matrix B
   integer  :: k ! K  specifies  the number of columns of the matrix A
   integer  :: lda, ldb, ldc
   real*8 :: a( lda, * ), b( ldb, * ), c( ldc, * )

   integer :: mh = 32
   integer :: nh = 32
   integer :: kh = 32
   integer :: mb, nb, kb, i, j, l, ib, jb, lb
   integer :: ioff, joff, loff
   integer :: iend, jend, lend

   !mh = m
   !nh = n
   !kh = k

   mb = m / mh
   IF( MOD( m, mh ) /= 0 ) mb = mb + 1
   nb = n / nh
   IF( MOD( n, nh ) /= 0 ) nb = nb + 1
   kb = k / kh
   IF( MOD( k, kh ) /= 0 ) kb = kb + 1

   ! write(*,*) 'mb, nb, kb = ', mb, nb, kb
!$omp parallel do default(none) &
!$omp          shared(a,b,c,mb,nb,kb,m,n,k,mh,nh,kh) &
!$omp          private(ib,jb,lb,i,j,l,ioff,joff,loff,iend,jend,lend)
   do ib = 0, mb-1
      ioff = 1 + ib * mh
      iend = MIN( m, ioff+mh-1)
      ! write(*,*) 'ioff, iend = ', ioff, iend
      do jb = 0, nb-1
         joff = 1 + jb * nh
         jend = MIN( n, joff+nh-1 )
         ! write(*,*) '   joff, jend = ', joff, jend
         do lb = 0, kb-1 
            loff = 1 + lb * kh
            lend = MIN( k, loff+kh-1 )
            ! write(*,*) '      loff, lend = ', loff, lend
            ! Cij = Aik * Bkj
            do i = ioff, iend
               do j = joff, jend
                  do l = loff, lend
                     c( i, j ) = c( i, j ) + a( i, l ) * b( l, j )
                  end do
               end do
            end do
         end do
      end do
   end do
!$omp end parallel do

   return

end subroutine


subroutine mymmul( m, n, k, a, lda, b, ldb, c, ldc )

   implicit none

   integer  :: m ! M  specifies  the number  of rows  of the  matrix A
   integer  :: n ! N  specifies the number  of columns of the matrix B
   integer  :: k ! K  specifies  the number of columns of the matrix A
   integer  :: lda, ldb, ldc
   real*8 :: a( lda, * ), b( ldb, * ), c( ldc, * )

   INTEGER :: mytid, ntids
   INTEGER :: ntids_row, ntids_col
   INTEGER :: myrow, mycol
   INTEGER :: mb, nb
   INTEGER :: m_off, n_off
   INTEGER :: ldim_block, lind_block, gind_block
   EXTERNAL :: ldim_block, lind_block, gind_block

#ifdef __OPENMP
   INTEGER :: omp_get_thread_num, omp_get_num_threads
   EXTERNAL :: omp_get_thread_num, omp_get_num_threads
#endif

!$omp parallel default(none) &
!$omp private( mytid, ntids, ntids_row, ntids_col, myrow, mycol, mb, nb, m_off, n_off) &
!$omp shared( m, n, k, lda, ldb, ldc, a, b, c )

#ifdef __OPENMP
   mytid = omp_get_thread_num()  ! take the thread ID
   ntids = omp_get_num_threads() ! take the number of threads
   ! define a grid as square as possible
   CALL gridsetup( ntids, ntids_row, ntids_col )
   ! row mayour order
   myrow = MOD( mytid, ntids_row )
   mycol = mytid / ntids_row
   !write(*,*) 'threds : ', ntids, mytid, ntids_row, ntids_col, myrow, mycol
   ! find my block size
   mb = ldim_block( m, ntids_row, myrow )
   nb = ldim_block( n, ntids_col, mycol )
   ! find the offset
   m_off = gind_block(1, m, ntids_row, myrow)
   n_off = gind_block(1, n, ntids_col, mycol)
   !write(*,*) 'blocks : ', ntids, mytid, mb, nb, m_off, n_off
#else
   ntids = 1
   mytids = 0
   ntids_row = 1
   ntids_col = 0
   myrow = 0
   mycol = 0
   mb = m
   nb = n
   m_off = 1
   n_off = 1
#endif

   CALL dgemm( 'N', 'N', mb, nb, k, 1.0d0, a(m_off,1), lda, b(1,n_off), ldb, 0.0d0, c(m_off,n_off), ldc )

!$omp end parallel

   return

end subroutine


SUBROUTINE gridsetup( nproc, nprow, npcol )

!   
! This subroutine factorizes the number of processors (NPROC)
! into NPROW and NPCOL,  that are the sizes of the 2D processors mesh.
!   
! Written by Carlo Cavazzoni
!
    
      IMPLICIT NONE
    
      integer nproc,nprow,npcol
    
      integer sqrtnp,i
    
      sqrtnp = int( sqrt( dble(nproc) ) + 1 )
      do i=1,sqrtnp
        if(mod(nproc,i).eq.0) nprow = i
      end do

      npcol = nproc/nprow
    
      return
END SUBROUTINE


program test
      IMPLICIT NONE

      INTEGER, PARAMETER :: dim = 1000
      REAL*8, ALLOCATABLE :: x(:,:), y(:,:), z(:,:)
      INTEGER :: i,j,k
      REAL*8 :: t1, t2
      REAL*8 :: cclock
      EXTERNAL :: cclock


      ALLOCATE( x( dim, dim ), y( dim, dim ), z( dim, dim ) )

      y = 1.0d0
      z = 1.0d0 / DBLE( dim )
      x = 0.0d0

      t1 = cclock()

      ! x = matmul( y, z )
      ! call dgemm('N', 'N', dim, dim, dim, 1.0d0, y, dim, z, dim, 0.0d0, x, dim)
      !call mymmul( dim, dim, dim, y, dim, z, dim, x, dim )
      call mymmul_block( dim, dim, dim, y, dim, z, dim, x, dim )

      t2 = cclock()

      write(*,*) ' Matrix sum = ', sum(x)
      write(*,*) ' tempo (secondi) ', t2-t1
      DEALLOCATE( x, y, z )

end program