| 1 | !C
|
| 2 | !C***
|
| 3 | !C*** hecmw_matvec_33
|
| 4 | !C***
|
| 5 | !C
|
| 6 | subroutine hecmw_matvec_33 (hecMESH, hecMAT, X, Y, COMMtime)
|
| 7 | use hecmw_util
|
| 8 | implicit none
|
| 9 | type (hecmwST_local_mesh), intent(in) :: hecMESH
|
| 10 | type (hecmwST_matrix), intent(in), target :: hecMAT
|
| 11 | real(kind=kreal), intent(in) :: X(:)
|
| 12 | real(kind=kreal), intent(out) :: Y(:)
|
| 13 | real(kind=kreal), intent(inout), optional :: COMMtime
|
| 14 |
|
| 15 | real(kind=kreal) :: Tcomm
|
| 16 | real(kind=kreal), allocatable :: WK(:)
|
| 17 |
|
| 18 | Tcomm = 0.d0
|
| 19 |
|
| 20 | if (mpcmatvec_flg) then
|
| 21 | allocate(WK(hecMAT%NP * hecMAT%NDOF))
|
| 22 | call hecmw_TtmatTvec_33(hecMESH, hecMAT, X, Y, WK, Tcomm)
|
| 23 | deallocate(WK)
|
| 24 | else
|
| 25 | call hecmw_matvec_33_inner(hecMESH, hecMAT, X, Y, Tcomm)
|
| 26 | endif
|
| 27 |
|
| 28 | if (present(COMMtime)) COMMtime = COMMtime + Tcomm
|
| 29 | end subroutine hecmw_matvec_33
|
| 30 |
|
| 31 | !C
|
| 32 | !C***
|
| 33 | !C*** hecmw_matvec_33_inner ( private subroutine )
|
| 34 | !C***
|
| 35 | !C
|
| 36 | subroutine hecmw_matvec_33_inner (hecMESH, hecMAT, X, Y, COMMtime)
|
| 37 | use hecmw_util
|
| 38 | use m_hecmw_comm_f
|
| 39 | use hecmw_matrix_contact
|
| 40 | use hecmw_matrix_misc
|
| 41 | use hecmw_jad_type
|
| 42 | use hecmw_tuning_fx
|
| 43 | !$ use omp_lib
|
| 44 |
|
| 45 | implicit none
|
| 46 | type (hecmwST_local_mesh), intent(in) :: hecMESH
|
| 47 | type (hecmwST_matrix), intent(in), target :: hecMAT
|
| 48 | real(kind=kreal), intent(in) :: X(:)
|
| 49 | real(kind=kreal), intent(out) :: Y(:)
|
| 50 | real(kind=kreal), intent(inout), optional :: COMMtime
|
| 51 |
|
| 52 | real(kind=kreal) :: START_TIME, END_TIME, Tcomm
|
| 53 | integer(kind=kint) :: i, j, jS, jE, in
|
| 54 | real(kind=kreal) :: YV1, YV2, YV3, X1, X2, X3
|
| 55 |
|
| 56 | integer(kind=kint) :: N, NP
|
| 57 | integer(kind=kint), pointer :: indexL(:), itemL(:), indexU(:), itemU(:)
|
| 58 | real(kind=kreal), pointer :: AL(:), AU(:), D(:)
|
| 59 |
|
| 60 | ! for communication hiding
|
| 61 | integer(kind=kint) :: ireq
|
| 62 |
|
| 63 | ! added for turning >>>
|
| 64 | integer, parameter :: numOfBlockPerThread = 100
|
| 65 | logical, save :: isFirst = .true.
|
| 66 | integer, save :: numOfThread = 1
|
| 67 | integer, save, allocatable :: startPos(:), endPos(:)
|
| 68 | integer(kind=kint), save :: sectorCacheSize0, sectorCacheSize1
|
| 69 | integer(kind=kint) :: threadNum, blockNum, numOfBlock
|
| 70 | integer(kind=kint) :: numOfElement, elementCount, blockIndex
|
| 71 | real(kind=kreal) :: numOfElementPerBlock
|
| 72 | ! <<< added for turning
|
| 73 |
|
| 74 | IF (hecmw_JAD_IS_INITIALIZED().ne.0) THEN
|
| 75 | Tcomm = 0.d0
|
| 76 | START_TIME = hecmw_Wtime()
|
| 77 | call hecmw_JAD_MATVEC(hecMESH, hecMAT, X, Y, Tcomm)
|
| 78 | END_TIME = hecmw_Wtime()
|
| 79 | time_Ax = time_Ax + END_TIME - START_TIME - Tcomm
|
| 80 | if (present(COMMtime)) COMMtime = COMMtime + Tcomm
|
| 81 | ELSE
|
| 82 |
|
| 83 | N = hecMAT%N
|
| 84 | NP = hecMAT%NP
|
| 85 | indexL => hecMAT%indexL
|
| 86 | indexU => hecMAT%indexU
|
| 87 | itemL => hecMAT%itemL
|
| 88 | itemU => hecMAT%itemU
|
| 89 | AL => hecMAT%AL
|
| 90 | AU => hecMAT%AU
|
| 91 | D => hecMAT%D
|
| 92 |
|
| 93 | ! added for turning >>>
|
| 94 | if (.not. isFirst) then
|
| 95 | numOfBlock = numOfThread * numOfBlockPerThread
|
| 96 | if (endPos(numOfBlock-1) .ne. N-1) then
|
| 97 | deallocate(startPos, endPos)
|
| 98 | isFirst = .true.
|
| 99 | endif
|
| 100 | endif
|
| 101 | if (isFirst) then
|
| 102 | !$ numOfThread = omp_get_max_threads()
|
| 103 | numOfBlock = numOfThread * numOfBlockPerThread
|
| 104 | allocate (startPos(0 : numOfBlock - 1), endPos(0 : numOfBlock - 1))
|
| 105 | numOfElement = N + indexL(N) + indexU(N)
|
| 106 | numOfElementPerBlock = dble(numOfElement) / numOfBlock
|
| 107 | blockNum = 0
|
| 108 | elementCount = 0
|
| 109 | startPos(blockNum) = 1
|
| 110 | do i= 1, N
|
| 111 | elementCount = elementCount + 1
|
| 112 | elementCount = elementCount + (indexL(i) - indexL(i-1))
|
| 113 | elementCount = elementCount + (indexU(i) - indexU(i-1))
|
| 114 | if (elementCount > (blockNum + 1) * numOfElementPerBlock) then
|
| 115 | endPos(blockNum) = i
|
| 116 | ! write(9000+hecMESH%my_rank,*) mod(blockNum, numOfThread), &
|
| 117 | ! startPos(blockNum), endPos(blockNum)
|
| 118 | blockNum = blockNum + 1
|
| 119 | startPos(blockNum) = i + 1
|
| 120 | if (blockNum == (numOfBlock - 1)) exit
|
| 121 | endif
|
| 122 | enddo
|
| 123 | endPos(blockNum) = N
|
| 124 | ! write(9000+hecMESH%my_rank,*) mod(blockNum, numOfThread), &
|
| 125 | ! startPos(blockNum), endPos(blockNum)
|
| 126 | ! for irregular data
|
| 127 | do i= blockNum+1, numOfBlock-1
|
| 128 | startPos(i) = N
|
| 129 | endPos(i) = N-1
|
| 130 | ! write(9000+hecMESH%my_rank,*) mod(i, numOfThread), &
|
| 131 | ! startPos(i), endPos(i)
|
| 132 | end do
|
| 133 |
|
| 134 | call hecmw_tuning_fx_calc_sector_cache(NP, 3, &
|
| 135 | sectorCacheSize0, sectorCacheSize1)
|
| 136 |
|
| 137 | isFirst = .false.
|
| 138 | endif
|
| 139 | ! <<< added for turning
|
| 140 |
|
| 141 | START_TIME= HECMW_WTIME()
|
| 142 | ! if (async_matvec_flg) then
|
| 143 | ! call hecmw_update_3_R_async (hecMESH, X, NP, ireq)
|
| 144 | ! else
|
| 145 | call hecmw_update_3_R (hecMESH, X, NP)
|
| 146 | ! endif
|
| 147 | END_TIME= HECMW_WTIME()
|
| 148 | if (present(COMMtime)) COMMtime = COMMtime + END_TIME - START_TIME
|
| 149 |
|
| 150 | START_TIME = hecmw_Wtime()
|
| 151 |
|
| 152 | !call fapp_start("loopInMatvec33", 1, 0)
|
| 153 | !call start_collection("loopInMatvec33")
|
| 154 |
|
| 155 | !OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
|
| 156 | !OCL CACHE_SUBSECTOR_ASSIGN(X)
|
| 157 |
|
| 158 | !$OMP PARALLEL DEFAULT(NONE) &
|
| 159 | !$OMP&PRIVATE(i,X1,X2,X3,YV1,YV2,YV3,jS,jE,j,in,threadNum,blockNum,blockIndex) &
|
| 160 | !$OMP&SHARED(D,AL,AU,indexL,itemL,indexU,itemU,X,Y,startPos,endPos,numOfThread,N,async_matvec_flg)
|
| 161 | threadNum = 0
|
| 162 | !$ threadNum = omp_get_thread_num()
|
| 163 | do blockNum = 0 , numOfBlockPerThread - 1
|
| 164 | blockIndex = blockNum * numOfThread + threadNum
|
| 165 | do i = startPos(blockIndex), endPos(blockIndex)
|
| 166 | X1= X(3*i-2)
|
| 167 | X2= X(3*i-1)
|
| 168 | X3= X(3*i )
|
| 169 | YV1= D(9*i-8)*X1 + D(9*i-7)*X2 + D(9*i-6)*X3
|
| 170 | YV2= D(9*i-5)*X1 + D(9*i-4)*X2 + D(9*i-3)*X3
|
| 171 | YV3= D(9*i-2)*X1 + D(9*i-1)*X2 + D(9*i )*X3
|
| 172 |
|
| 173 | jS= indexL(i-1) + 1
|
| 174 | jE= indexL(i )
|
| 175 | do j= jS, jE
|
| 176 | in = itemL(j)
|
| 177 | X1= X(3*in-2)
|
| 178 | X2= X(3*in-1)
|
| 179 | X3= X(3*in )
|
| 180 | YV1= YV1 + AL(9*j-8)*X1 + AL(9*j-7)*X2 + AL(9*j-6)*X3
|
| 181 | YV2= YV2 + AL(9*j-5)*X1 + AL(9*j-4)*X2 + AL(9*j-3)*X3
|
| 182 | YV3= YV3 + AL(9*j-2)*X1 + AL(9*j-1)*X2 + AL(9*j )*X3
|
| 183 | enddo
|
| 184 | jS= indexU(i-1) + 1
|
| 185 | jE= indexU(i )
|
| 186 | do j= jS, jE
|
| 187 | in = itemU(j)
|
| 188 | ! if (async_matvec_flg .and. in > N) cycle
|
| 189 | X1= X(3*in-2)
|
| 190 | X2= X(3*in-1)
|
| 191 | X3= X(3*in )
|
| 192 | YV1= YV1 + AU(9*j-8)*X1 + AU(9*j-7)*X2 + AU(9*j-6)*X3
|
| 193 | YV2= YV2 + AU(9*j-5)*X1 + AU(9*j-4)*X2 + AU(9*j-3)*X3
|
| 194 | YV3= YV3 + AU(9*j-2)*X1 + AU(9*j-1)*X2 + AU(9*j )*X3
|
| 195 | enddo
|
| 196 | Y(3*i-2)= YV1
|
| 197 | Y(3*i-1)= YV2
|
| 198 | Y(3*i )= YV3
|
| 199 | enddo
|
| 200 | enddo
|
| 201 | !$OMP END PARALLEL
|
| 202 |
|
| 203 | !OCL END_CACHE_SUBSECTOR
|
| 204 | !OCL END_CACHE_SECTOR_SIZE
|
| 205 |
|
| 206 | !call stop_collection("loopInMatvec33")
|
| 207 | !call fapp_stop("loopInMatvec33", 1, 0)
|
| 208 |
|
| 209 | END_TIME = hecmw_Wtime()
|
| 210 | time_Ax = time_Ax + END_TIME - START_TIME
|
| 211 |
|
| 212 | ! if (async_matvec_flg) then
|
| 213 | ! START_TIME= HECMW_WTIME()
|
| 214 | ! call hecmw_update_3_R_wait (hecMESH, ireq)
|
| 215 | ! END_TIME= HECMW_WTIME()
|
| 216 | ! if (present(COMMtime)) COMMtime = COMMtime + END_TIME - START_TIME
|
| 217 |
|
| 218 | ! START_TIME = hecmw_Wtime()
|
| 219 |
|
| 220 | ! do i = 1, N
|
| 221 | ! jS= index_o(i-1) + 1
|
| 222 | ! jE= index_o(i )
|
| 223 | ! if (jS > jE) cycle
|
| 224 | ! YV1= 0.d0
|
| 225 | ! YV2= 0.d0
|
| 226 | ! YV3= 0.d0
|
| 227 | ! do j=jS, jE
|
| 228 | ! in = item_o(j)
|
| 229 | ! X1= X(3*(N+in)-2)
|
| 230 | ! X2= X(3*(N+in)-1)
|
| 231 | ! X3= X(3*(N+in) )
|
| 232 | ! YV1= YV1 + A_o(9*j-8)*X1 + A_o(9*j-7)*X2 + A_o(9*j-6)*X3
|
| 233 | ! YV2= YV2 + A_o(9*j-5)*X1 + A_o(9*j-4)*X2 + A_o(9*j-3)*X3
|
| 234 | ! YV3= YV3 + A_o(9*j-2)*X1 + A_o(9*j-1)*X2 + A_o(9*j )*X3
|
| 235 | ! enddo
|
| 236 | ! Y(3*i-2)= Y(3*i-2)+YV1
|
| 237 | ! Y(3*i-1)= Y(3*i-1)+YV2
|
| 238 | ! Y(3*i )= Y(3*i )+YV3
|
| 239 | ! enddo
|
| 240 |
|
| 241 | ! END_TIME = hecmw_Wtime()
|
| 242 | ! time_Ax = time_Ax + END_TIME - START_TIME
|
| 243 | ! endif
|
| 244 |
|
| 245 | ENDIF
|
| 246 |
|
| 247 | if (hecMAT%cmat%n_val > 0) then
|
| 248 | call hecmw_cmat_multvec_add( hecMAT%cmat, X, Y, NP * hecMAT%NDOF )
|
| 249 | end if
|
| 250 |
|
| 251 | end subroutine hecmw_matvec_33_inner
|