subroutine mymmul_block( m, n, k, a, lda, b, ldb, c, ldc ) implicit none integer :: m ! M specifies the number of rows of the matrix A integer :: n ! N specifies the number of columns of the matrix B integer :: k ! K specifies the number of columns of the matrix A integer :: lda, ldb, ldc real*8 :: a( lda, * ), b( ldb, * ), c( ldc, * ) integer :: mh = 32 integer :: nh = 32 integer :: kh = 32 integer :: mb, nb, kb, i, j, l, ib, jb, lb integer :: ioff, joff, loff integer :: iend, jend, lend !mh = m !nh = n !kh = k mb = m / mh IF( MOD( m, mh ) /= 0 ) mb = mb + 1 nb = n / nh IF( MOD( n, nh ) /= 0 ) nb = nb + 1 kb = k / kh IF( MOD( k, kh ) /= 0 ) kb = kb + 1 ! write(*,*) 'mb, nb, kb = ', mb, nb, kb !$omp parallel do default(none) & !$omp shared(a,b,c,mb,nb,kb,m,n,k,mh,nh,kh) & !$omp private(ib,jb,lb,i,j,l,ioff,joff,loff,iend,jend,lend) do ib = 0, mb-1 ioff = 1 + ib * mh iend = MIN( m, ioff+mh-1) ! write(*,*) 'ioff, iend = ', ioff, iend do jb = 0, nb-1 joff = 1 + jb * nh jend = MIN( n, joff+nh-1 ) ! write(*,*) ' joff, jend = ', joff, jend do lb = 0, kb-1 loff = 1 + lb * kh lend = MIN( k, loff+kh-1 ) ! write(*,*) ' loff, lend = ', loff, lend ! Cij = Aik * Bkj do i = ioff, iend do j = joff, jend do l = loff, lend c( i, j ) = c( i, j ) + a( i, l ) * b( l, j ) end do end do end do end do end do end do !$omp end parallel do return end subroutine subroutine mymmul( m, n, k, a, lda, b, ldb, c, ldc ) implicit none integer :: m ! M specifies the number of rows of the matrix A integer :: n ! N specifies the number of columns of the matrix B integer :: k ! K specifies the number of columns of the matrix A integer :: lda, ldb, ldc real*8 :: a( lda, * ), b( ldb, * ), c( ldc, * ) INTEGER :: mytid, ntids INTEGER :: ntids_row, ntids_col INTEGER :: myrow, mycol INTEGER :: mb, nb INTEGER :: m_off, n_off INTEGER :: ldim_block, lind_block, gind_block EXTERNAL :: ldim_block, lind_block, gind_block #ifdef __OPENMP INTEGER :: omp_get_thread_num, omp_get_num_threads EXTERNAL :: omp_get_thread_num, omp_get_num_threads #endif !$omp parallel default(none) & !$omp private( mytid, ntids, ntids_row, ntids_col, myrow, mycol, mb, nb, m_off, n_off) & !$omp shared( m, n, k, lda, ldb, ldc, a, b, c ) #ifdef __OPENMP mytid = omp_get_thread_num() ! take the thread ID ntids = omp_get_num_threads() ! take the number of threads ! define a grid as square as possible CALL gridsetup( ntids, ntids_row, ntids_col ) ! row mayour order myrow = MOD( mytid, ntids_row ) mycol = mytid / ntids_row !write(*,*) 'threds : ', ntids, mytid, ntids_row, ntids_col, myrow, mycol ! find my block size mb = ldim_block( m, ntids_row, myrow ) nb = ldim_block( n, ntids_col, mycol ) ! find the offset m_off = gind_block(1, m, ntids_row, myrow) n_off = gind_block(1, n, ntids_col, mycol) !write(*,*) 'blocks : ', ntids, mytid, mb, nb, m_off, n_off #else ntids = 1 mytids = 0 ntids_row = 1 ntids_col = 0 myrow = 0 mycol = 0 mb = m nb = n m_off = 1 n_off = 1 #endif CALL dgemm( 'N', 'N', mb, nb, k, 1.0d0, a(m_off,1), lda, b(1,n_off), ldb, 0.0d0, c(m_off,n_off), ldc ) !$omp end parallel return end subroutine SUBROUTINE gridsetup( nproc, nprow, npcol ) ! ! This subroutine factorizes the number of processors (NPROC) ! into NPROW and NPCOL, that are the sizes of the 2D processors mesh. ! ! Written by Carlo Cavazzoni ! IMPLICIT NONE integer nproc,nprow,npcol integer sqrtnp,i sqrtnp = int( sqrt( dble(nproc) ) + 1 ) do i=1,sqrtnp if(mod(nproc,i).eq.0) nprow = i end do npcol = nproc/nprow return END SUBROUTINE program test IMPLICIT NONE INTEGER, PARAMETER :: dim = 1000 REAL*8, ALLOCATABLE :: x(:,:), y(:,:), z(:,:) INTEGER :: i,j,k REAL*8 :: t1, t2 REAL*8 :: cclock EXTERNAL :: cclock ALLOCATE( x( dim, dim ), y( dim, dim ), z( dim, dim ) ) y = 1.0d0 z = 1.0d0 / DBLE( dim ) x = 0.0d0 t1 = cclock() ! x = matmul( y, z ) ! call dgemm('N', 'N', dim, dim, dim, 1.0d0, y, dim, z, dim, 0.0d0, x, dim) !call mymmul( dim, dim, dim, y, dim, z, dim, x, dim ) call mymmul_block( dim, dim, dim, y, dim, z, dim, x, dim ) t2 = cclock() write(*,*) ' Matrix sum = ', sum(x) write(*,*) ' tempo (secondi) ', t2-t1 DEALLOCATE( x, y, z ) end program