LCOV - code coverage report
Current view: top level - src/elpa2 - elpa2_trans_ev_tridi_to_band_template.F90 (source / functions)
Test: coverage_50ab7a7628bba174fc62cee3ab72b26e81f87fe5.info
Date: 2018-01-10 09:29:53
Coverage:            Hit    Total    Rate
  Lines:             482      808   59.7 %
  Functions:          58       64   90.6 %

          Line data    Source code
       1             : !    This file is part of ELPA.
       2             : !
       3             : !    The ELPA library was originally created by the ELPA consortium,
       4             : !    consisting of the following organizations:
       5             : !
       6             : !    - Max Planck Computing and Data Facility (MPCDF), formerly known as
       7             : !      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
       8             : !    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
       9             : !      Informatik,
      10             : !    - Technische Universität München, Lehrstuhl für Informatik mit
       11             : !      Schwerpunkt Wissenschaftliches Rechnen,
      12             : !    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
       13             : !    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
       14             : !      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
      15             : !      and
      16             : !    - IBM Deutschland GmbH
      17             : !
      18             : !    This particular source code file contains additions, changes and
      19             : !    enhancements authored by Intel Corporation which is not part of
      20             : !    the ELPA consortium.
      21             : !
      22             : !    More information can be found here:
      23             : !    http://elpa.mpcdf.mpg.de/
      24             : !
      25             : !    ELPA is free software: you can redistribute it and/or modify
      26             : !    it under the terms of the version 3 of the license of the
      27             : !    GNU Lesser General Public License as published by the Free
      28             : !    Software Foundation.
      29             : !
      30             : !    ELPA is distributed in the hope that it will be useful,
      31             : !    but WITHOUT ANY WARRANTY; without even the implied warranty of
      32             : !    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      33             : !    GNU Lesser General Public License for more details.
      34             : !
      35             : !    You should have received a copy of the GNU Lesser General Public License
      36             : !    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
      37             : !
      38             : !    ELPA reflects a substantial effort on the part of the original
      39             : !    ELPA consortium, and we ask you to respect the spirit of the
      40             : !    license that we chose: i.e., please contribute any changes you
      41             : !    may have back to the original ELPA library distribution, and keep
      42             : !    any derivatives of ELPA under the same license that we chose for
      43             : !    the original distribution, the GNU Lesser General Public License.
      44             : !
      45             : !
      46             : ! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
      47             : !
      48             : ! Copyright of the original code rests with the authors inside the ELPA
      49             : ! consortium. The copyright of any additional modifications shall rest
      50             : ! with their original authors, but shall adhere to the licensing terms
      51             : ! distributed along with the original code in the file "COPYING".
      52             : 
      53             : #include "../general/sanity.F90"
      54             : 
      55             :   subroutine trans_ev_tridi_to_band_&
      56             :     &MATH_DATATYPE&
      57             :     &_&
      58       46296 :     &PRECISION &
      59       46296 :     (obj, na, nev, nblk, nbw, q, q_dev, ldq, matrixCols,         &
      60       46296 :      hh_trans, mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
      61             :      kernel)
      62             : 
      63             :     !-------------------------------------------------------------------------------
      64             :     !  trans_ev_tridi_to_band_real/complex:
      65             :     !  Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
      66             :     !
      67             :     !  Parameters
      68             :     !
      69             :     !  na          Order of matrix a, number of rows of matrix q
      70             :     !
       71             :     !  nev         Number of eigenvectors to compute (= columns of matrix q)
      72             :     !
      73             :     !  nblk        blocksize of cyclic distribution, must be the same in both directions!
      74             :     !
       75             :     !  nbw         semi bandwidth of the band matrix
      76             :     !
      77             :     !  q           On input: Eigenvectors of tridiagonal matrix
      78             :     !              On output: Transformed eigenvectors
       79             :     !              Distribution is like in ScaLAPACK.
      80             :     !
      81             :     !  q_dev       GPU device pointer to q
      82             :     !
      83             :     !  ldq         Leading dimension of q
      84             :     !  matrixCols  local columns of matrix q
      85             :     !
      86             :     !  mpi_comm_rows
      87             :     !  mpi_comm_cols
       88             :     !              MPI communicators for rows and columns
      89             :     !
      90             :     !-------------------------------------------------------------------------------
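                      :     !  Editorial sketch (assumption, not part of the generated code): with
                      :     !  MATH_DATATYPE=real and PRECISION=double the preprocessor is expected to
                      :     !  instantiate a routine named trans_ev_tridi_to_band_real_double, which
                      :     !  would be called roughly as
                      :     !
                      :     !    call trans_ev_tridi_to_band_real_double(obj, na, nev, nblk, nbw,      &
                      :     !                                            q, q_dev, ldq, matrixCols,    &
                      :     !                                            hh_trans, mpi_comm_rows,      &
                      :     !                                            mpi_comm_cols, wantDebug,     &
                      :     !                                            useGPU, success, kernel)
                      :     !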
      91             :       use elpa_abstract_impl
      92             :       use elpa2_workload
      93             :       use pack_unpack_cpu
      94             :       use pack_unpack_gpu
      95             :       use compute_hh_trafo
      96             :       use cuda_functions
      97             :       use precision
      98             :       use iso_c_binding
      99             :       implicit none
     100             : #include "../general/precision_kinds.F90"
     101             :       class(elpa_abstract_impl_t), intent(inout) :: obj
     102             :       logical, intent(in)                        :: useGPU
     103             : 
     104             :       integer(kind=ik), intent(in)               :: kernel
     105             :       integer(kind=ik), intent(in)               :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
     106             : 
     107             : #ifdef USE_ASSUMED_SIZE
     108             :       MATH_DATATYPE(kind=rck)                    :: q(ldq,*)
     109             : #else
     110             :       MATH_DATATYPE(kind=rck)                    :: q(ldq,matrixCols)
     111             : #endif
     112             : 
     113             :       MATH_DATATYPE(kind=rck), intent(in)        :: hh_trans(:,:)
     114             :       integer(kind=c_intptr_t)                   :: q_dev
     115             : 
     116             :       integer(kind=ik)                           :: np_rows, my_prow, np_cols, my_pcol
     117             :       integer(kind=ik)                           :: i, j, ip, sweep, nbuf, l_nev, a_dim2
     118             :       integer(kind=ik)                           :: current_n, current_local_n, current_n_start, current_n_end
     119             :       integer(kind=ik)                           :: next_n, next_local_n, next_n_start, next_n_end
     120             :       integer(kind=ik)                           :: bottom_msg_length, top_msg_length, next_top_msg_length
     121             :       integer(kind=ik)                           :: stripe_width, last_stripe_width, stripe_count
     122             : #ifdef WITH_OPENMP
     123             :       integer(kind=ik)                           :: thread_width, csw, b_off, b_len
     124             : #endif
     125             :       integer(kind=ik)                           :: num_result_blocks, num_result_buffers, num_bufs_recvd
     126             :       integer(kind=ik)                           :: a_off, current_tv_off, max_blk_size
     127             :       integer(kind=ik)                           :: mpierr, src, src_offset, dst, offset, nfact, num_blk
     128             : 
     129             :       logical                                    :: flag
     130             : #ifdef WITH_OPENMP
     131             :       MATH_DATATYPE(kind=rck), pointer           :: aIntern(:,:,:,:)
     132             : #else
     133             :       MATH_DATATYPE(kind=rck), pointer           :: aIntern(:,:,:)
     134             : #endif
     135             :       MATH_DATATYPE(kind=rck)                    :: a_var
     136             : 
     137             :       type(c_ptr)                                :: aIntern_ptr
     138             : 
     139       46296 :       MATH_DATATYPE(kind=rck)   , allocatable    :: row(:)
     140       46296 :       MATH_DATATYPE(kind=rck)   , allocatable    :: row_group(:,:)
     141             : 
     142             : #ifdef WITH_OPENMP
     143       23148 :       MATH_DATATYPE(kind=rck), allocatable       :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
     144       23148 :       MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
     145             : #else
     146       23148 :       MATH_DATATYPE(kind=rck), allocatable       :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
     147       23148 :       MATH_DATATYPE(kind=rck), allocatable       :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
     148             : #endif
     149             : 
     150             :       integer(kind=c_intptr_t)                   :: aIntern_dev
     151             :       integer(kind=c_intptr_t)                   :: bcast_buffer_dev
     152             :       integer(kind=c_intptr_t)                   :: num
     153             :       integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1
     154             :       integer(kind=c_intptr_t)                   :: row_dev
     155             :       integer(kind=c_intptr_t)                   :: row_group_dev
     156             :       integer(kind=c_intptr_t)                   :: hh_tau_dev
     157             :       integer(kind=c_intptr_t)                   :: hh_dot_dev
     158             :       integer(kind=ik)                           :: row_group_size, unpack_idx
     159             : 
     160             :       integer(kind=ik)                           :: n_times
     161             :       integer(kind=ik)                           :: top, chunk, this_chunk
     162             : 
     163       46296 :       MATH_DATATYPE(kind=rck), allocatable       :: result_buffer(:,:,:)
     164       46296 :       MATH_DATATYPE(kind=rck), allocatable       :: bcast_buffer(:,:)
     165             : 
     166             :       integer(kind=ik)                           :: n_off
     167             : 
     168       92592 :       integer(kind=ik), allocatable              :: result_send_request(:), result_recv_request(:), limits(:)
     169       92592 :       integer(kind=ik), allocatable              :: top_send_request(:), bottom_send_request(:)
     170       92592 :       integer(kind=ik), allocatable              :: top_recv_request(:), bottom_recv_request(:)
     171             : 
     172             :       ! MPI send/recv tags, arbitrary
     173             : 
     174             :       integer(kind=ik), parameter                :: bottom_recv_tag = 111
     175             :       integer(kind=ik), parameter                :: top_recv_tag    = 222
     176             :       integer(kind=ik), parameter                :: result_recv_tag = 333
     177             : #ifdef WITH_OPENMP
     178             :       integer(kind=ik)                           :: max_threads, my_thread
     179             :       integer(kind=ik)                           :: omp_get_max_threads
     180             : #endif
     181             : 
     182             : 
     183             :       ! Just for measuring the kernel performance
     184             :       real(kind=c_double)                        :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
     185             :       ! long integer
     186             :       integer(kind=lik)                          :: kernel_flops, kernel_flops_recv
     187             : 
     188             :       logical, intent(in)                        :: wantDebug
     189             :       logical                                    :: success
     190             :       integer(kind=ik)                           :: istat, print_flops
     191             :       character(200)                             :: errorMessage
     192             :       logical                                    :: successCUDA
     193             : #ifndef WITH_MPI
     194             :       integer(kind=ik)                           :: j1
     195             : #endif
     196             :       integer(kind=c_intptr_t), parameter        :: size_of_datatype = size_of_&
     197             :                                                                      &PRECISION&
     198             :                                                                      &_&
     199             :                                                                      &MATH_DATATYPE
     200             : 
     201             :       call obj%timer%start("trans_ev_tridi_to_band_&
     202             :       &MATH_DATATYPE&
     203             :       &" // &
     204             :       &PRECISION_SUFFIX &
     205       46296 :       )
     206             : 
     207       46296 :       n_times = 0
     208       46296 :       if (useGPU) then
     209           0 :         unpack_idx = 0
     210           0 :         row_group_size = 0
     211             :       endif
     212             : 
     213       46296 :       success = .true.
     214       46296 :       kernel_time = 0.0
     215       46296 :       kernel_flops = 0
     216             : 
     217             : #ifdef WITH_OPENMP
     218       23148 :       max_threads = 1
     219       23148 :       max_threads = omp_get_max_threads()
     220             : #endif
     221       46296 :       if (wantDebug) call obj%timer%start("mpi_communication")
     222       46296 :       call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
     223       46296 :       call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
     224       46296 :       call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
     225       46296 :       call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
     226       46296 :       if (wantDebug) call obj%timer%stop("mpi_communication")
     227             : 
     228       46296 :       if (mod(nbw,nblk)/=0) then
     229           0 :         if (my_prow==0 .and. my_pcol==0) then
     230           0 :           if (wantDebug) then
     231             :             write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
     232             :                                 &MATH_DATATYPE&
     233           0 :                                 &: ERROR: nbw=',nbw,', nblk=',nblk
     234             :             write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
     235             :                                 &MATH_DATATYPE&
     236           0 :                                 &: band backtransform works only for nbw==n*nblk'
     237             :           endif
     238           0 :           success = .false.
     239           0 :           return
     240             :         endif
     241             :       endif
     242             : 
     243       46296 :       nfact = nbw / nblk
     244             : 
     245             : 
     246             :       ! local number of eigenvectors
     247       46296 :       l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
     248             : 
     249       46296 :       if (l_nev==0) then
     250             : #ifdef WITH_OPENMP
     251           0 :         thread_width = 0
     252             : #endif
     253           0 :         stripe_width = 0
     254           0 :         stripe_count = 0
     255           0 :         last_stripe_width = 0
     256             : 
     257             :       else ! l_nev
     258             : 
     259             : #if WITH_OPENMP
     260             :         ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
     261             :         ! every primary cache
     262             :         ! Suggested stripe width is 48 - should this be reduced for the complex case ???
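                      :         ! (Editorial note: 48*64 real*8 values occupy 48*64*8 bytes = 24 KiB,
                      :         !  which fits in a typical 32 KiB L1 data cache.)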
     263             : 
     264       23148 :         if (useGPU) then
     265           0 :           stripe_width = 256 ! Must be a multiple of 4
     266           0 :           stripe_count = (l_nev - 1) / stripe_width + 1
     267             :         else ! useGPU
     268             :           ! openmp only in non-GPU case
     269       23148 :           thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
     270             : #if REALCASE == 1
     271             : #ifdef DOUBLE_PRECISION_REAL
     272       10080 :           stripe_width = 48 ! Must be a multiple of 4
     273             : #else
     274        3852 :           stripe_width = 96 ! Must be a multiple of 8
     275             : #endif
     276             : #endif /* REALCASE */
     277             : 
     278             : #if COMPLEXCASE == 1
     279             : #ifdef DOUBLE_PRECISION_COMPLEX
     280        6192 :           stripe_width = 48 ! Must be a multiple of 2
     281             : #else
     282        3024 :           stripe_width = 48 ! Must be a multiple of 4
     283             : #endif
     284             : #endif /* COMPLEXCASE */
     285             : 
     286       23148 :           stripe_count = (thread_width-1)/stripe_width + 1
     287             : 
     288             :           ! Adapt stripe width so that last one doesn't get too small
     289             : 
     290       23148 :           stripe_width = (thread_width-1)/stripe_count + 1
     291             : 
     292             : #if REALCASE == 1
     293             : #ifdef DOUBLE_PRECISION_REAL
     294             :           if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
     295       10080 :               kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
     296             :               kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
     297             : 
     298        1944 :             stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
     299             :                                                   ! (8 * sizeof(double) == 64)
     300             : 
      301             :           else
     302        8136 :             stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
     303             :                                             ! (4 * sizeof(double) == 32)
     304             :           endif
     305             : #else
     306             :           if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
     307        3852 :               kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
     308             :               kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
     309             : 
     310             : 
     311         756 :             stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
     312             :                                                ! (16 * sizeof(float) == 64)
     313             : 
      314             :           else
     315        3096 :             stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
     316             :                                             ! (8 * sizeof(float) == 32)
     317             :           endif
     318             : #endif
     319             : #endif /* REALCASE */
     320             : 
     321             : #if COMPLEXCASE == 1
     322             : #ifdef DOUBLE_PRECISION_COMPLEX
     323        6192 :           if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
     324             :               kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
     325             : 
     326        1296 :             stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
     327             :                                             ! (4 * sizeof(double complex) == 64)
     328             : 
      329             :           else
     330             : 
     331        4896 :             stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
     332             :                                             ! (2 * sizeof(double complex) == 32)
      333             :           endif
     334             : #else
     335             : 
     336        3024 :           if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
     337             :               kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
     338             : 
     339         612 :             stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
     340             :                                             ! (8 * sizeof(float complex) == 64)
     341             : 
     342             :           else
     343        2412 :             stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
     344             :                                             ! (4 * sizeof(float complex) == 32)
      345             :           endif
     346             : #endif
     347             : #endif /* COMPLEXCASE */
     348             : 
     349             : #if REALCASE == 1
     350       13932 :         last_stripe_width = l_nev - (stripe_count-1)*stripe_width
     351             : #endif
     352             : #if COMPLEXCASE == 1
      353             : ! only needed in the non-OpenMP case; check this
     354             : ! last_stripe_width = l_nev - (stripe_count-1)*stripe_width
     355             : #endif
     356             : 
     357             :         endif ! useGPU
     358             : 
     359             : #else /* WITH_OPENMP */
     360             : 
     361             :         ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
     362             :         ! every primary cache
     363             :         ! Suggested stripe width is 48 - should this be reduced for the complex case ???
     364             : 
     365       23148 :         if (useGPU) then
     366           0 :           stripe_width = 256 ! Must be a multiple of 4
     367           0 :           stripe_count = (l_nev - 1) / stripe_width + 1
     368             : 
     369             :         else ! useGPU
     370             : #if REALCASE == 1
     371             : #ifdef DOUBLE_PRECISION_REAL
     372       10080 :           stripe_width = 48 ! Must be a multiple of 4
     373             : #else
     374        3852 :           stripe_width = 96 ! Must be a multiple of 8
     375             : #endif
     376             : #endif /* REALCASE */
     377             : 
     378             : #if COMPLEXCASE == 1
     379             : #ifdef DOUBLE_PRECISION_COMPLEX
     380        6192 :           stripe_width = 48 ! Must be a multiple of 2
     381             : #else
     382        3024 :           stripe_width = 48 ! Must be a multiple of 4
     383             : #endif
     384             : #endif /* COMPLEXCASE */
     385             : 
     386       23148 :           stripe_count = (l_nev-1)/stripe_width + 1
     387             : 
     388             :           ! Adapt stripe width so that last one doesn't get too small
     389             : 
     390       23148 :           stripe_width = (l_nev-1)/stripe_count + 1
     391             : 
     392             : #if REALCASE == 1
     393             : #ifdef DOUBLE_PRECISION_REAL
     394             :           if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
     395       10080 :               kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
     396             :               kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
     397             : 
     398        1944 :             stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
     399             :                                                   ! (8 * sizeof(double) == 64)
     400             : 
      401             :           else
     402        8136 :             stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
     403             :                                             ! (4 * sizeof(double) == 32)
     404             :           endif
     405             : #else
     406             :           if (kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
     407        3852 :               kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK4 .or. &
     408             :               kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK6) then
     409             : 
     410             : 
     411         756 :             stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 16 because of AVX-512 memory alignment of 64 bytes
     412             :                                                ! (16 * sizeof(float) == 64)
     413             : 
      414             :           else
     415        3096 :             stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
     416             :                                             ! (8 * sizeof(float) == 32)
     417             :           endif
     418             : #endif
     419             : #endif /* REALCASE */
     420             : 
     421             : #if COMPLEXCASE == 1
     422             : #ifdef DOUBLE_PRECISION_COMPLEX
     423             : 
     424        6192 :           if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
     425             :               kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
     426             : 
     427        1296 :             stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 4 because of AVX-512 memory alignment of 64 bytes
     428             :                                             ! (4 * sizeof(double complex) == 64)
     429             : 
      430             :           else
     431             : 
     432        4896 :             stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
     433             :                                             ! (2 * sizeof(double complex) == 32)
      434             :           endif
     435             : #else
     436             : 
     437        3024 :           if (kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 .or. &
     438             :               kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK2) then
     439             : 
     440         612 :             stripe_width = ((stripe_width+15)/16)*16 ! Must be a multiple of 8 because of AVX-512 memory alignment of 64 bytes
     441             :                                             ! (8 * sizeof(float complex) == 64)
     442             : 
     443             :           else
     444        2412 :             stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
     445             :                                             ! (4 * sizeof(float complex) == 32)
      446             :           endif
     447             : #endif
     448             : #endif /* COMPLEXCASE */
     449             :         endif ! useGPU
     450             : 
     451       23148 :         last_stripe_width = l_nev - (stripe_count-1)*stripe_width
     452             : 
     453             : #endif /* WITH_OPENMP */
     454             :       endif ! l_nev
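                      :       ! Editorial sketch with hypothetical numbers, illustrating the stripe
                      :       ! arithmetic above (non-OpenMP CPU path, double-precision real, non-AVX512
                      :       ! kernel): for l_nev = 100 and the initial stripe_width = 48,
                      :       !   stripe_count      = (100-1)/48 + 1 = 3
                      :       !   stripe_width      = (100-1)/3  + 1 = 34 -> rounded up to 36 (multiple of 4)
                      :       !   last_stripe_width = 100 - (3-1)*36    = 28
                      :       ! i.e. two stripes of 36 columns each plus a last stripe of 28 columns.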
     455             : 
     456             :       ! Determine the matrix distribution at the beginning
     457             : 
     458       46296 :       allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
     459       46296 :       if (istat .ne. 0) then
     460             :         print *,"trans_ev_tridi_to_band_&
     461             :                 &MATH_DATATYPE&
     462           0 :                 &: error when allocating limits"//errorMessage
     463           0 :         stop 1
     464             :       endif
     465       46296 :       call determine_workload(obj,na, nbw, np_rows, limits)
     466             : 
     467       46296 :       max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
     468             : 
     469       46296 :       a_dim2 = max_blk_size + nbw
     470             : 
     471       46296 :       if (useGPU) then
     472           0 :         num =  (stripe_width*a_dim2*stripe_count)* size_of_datatype
     473           0 :         successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype)
     474           0 :         if (.not.(successCUDA)) then
     475             :           print *,"trans_ev_tridi_to_band_&
     476             :           &MATH_DATATYPE&
     477           0 :           &: error in cudaMalloc"//errorMessage
     478           0 :           stop 1
     479             :         endif
     480             : 
     481           0 :         successCUDA = cuda_memset(aIntern_dev , 0, num)
     482           0 :         if (.not.(successCUDA)) then
     483             :           print *,"trans_ev_tridi_to_band_&
     484             :           &MATH_DATATYPE&
     485           0 :           &: error in cudaMemset"//errorMessage
     486           0 :           stop 1
     487             :         endif
     488             : 
     489           0 :         num =  (l_nev)* size_of_datatype
     490           0 :         successCUDA = cuda_malloc( row_dev,num)
     491           0 :         if (.not.(successCUDA)) then
     492             :           print *,"trans_ev_tridi_to_band_&
     493             :           &MATH_DATATYPE&
     494           0 :           &: error in cudaMalloc "
     495           0 :           stop 1
     496             :         endif
     497             : 
     498           0 :         successCUDA = cuda_memset(row_dev , 0, num)
     499           0 :         if (.not.(successCUDA)) then
     500             :           print *,"trans_ev_tridi_to_band_&
     501             :           &MATH_DATATYPE&
     502           0 :           &: error in cudaMemset "
     503           0 :           stop 1
     504             :         endif
     505             : 
     506             :         ! "row_group" and "row_group_dev" are needed for GPU optimizations
     507           0 :         allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
     508           0 :         if (istat .ne. 0) then
     509             :           print *,"trans_ev_tridi_to_band_&
     510             :           &MATH_DATATYPE&
     511           0 :           &: error when allocating row_group"//errorMessage
     512           0 :           stop 1
     513             :         endif
     514             : 
     515           0 :         row_group(:, :) = 0.0_rck
     516           0 :         num =  (l_nev*nblk)* size_of_datatype
     517           0 :         successCUDA = cuda_malloc(row_group_dev, num)
     518           0 :         if (.not.(successCUDA)) then
     519             :           print *,"trans_ev_tridi_to_band_&
     520             :           &MATH_DATATYPE&
     521           0 :           &: error in cudaMalloc"//errorMessage
     522           0 :           stop 1
     523             :         endif
     524             : 
     525           0 :         successCUDA = cuda_memset(row_group_dev , 0, num)
     526           0 :         if (.not.(successCUDA)) then
     527             :           print *,"trans_ev_tridi_to_band_&
     528             :           &MATH_DATATYPE&
     529           0 :           &: error in cudaMemset"//errorMessage
     530           0 :           stop 1
     531             :         endif
     532             : 
     533             :       else ! GPUs are not used
     534             : 
     535             : #if 0
     536             : ! realcase or complexcase
     537             : !DEC$ ATTRIBUTES ALIGN: 64:: aIntern
     538             : #endif
     539             : 
     540             : #ifdef WITH_OPENMP
     541       23148 :         if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*max_threads*     &
     542             :                C_SIZEOF(a_var)) /= 0) then
     543             :           print *,"trans_ev_tridi_to_band_&
     544             :           &MATH_DATATYPE&
     545           0 :           &: error when allocating aIntern"//errorMessage
     546           0 :           stop 1
     547             :         endif
     548             : 
     549       23148 :         call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
     550             :         ! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
     551             : 
     552             :         ! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
     553             : 
     554             : #else /* WITH_OPENMP */
     555             : 
     556       23148 :         if (posix_memalign(aIntern_ptr, 64_c_intptr_t, stripe_width*a_dim2*stripe_count*  &
     557             :             C_SIZEOF(a_var)) /= 0) then
      558           0 :           print *,"trans_ev_tridi_to_band: error when allocating aIntern"//errorMessage
     559           0 :           stop 1
     560             :         endif
     561             : 
     562       23148 :         call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
     563             :         !allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
     564             : 
     565       23148 :         aIntern(:,:,:) = 0.0_rck
     566             : #endif /* WITH_OPENMP */
     567             :       endif !useGPU
     568             : 
     569       46296 :       allocate(row(l_nev), stat=istat, errmsg=errorMessage)
     570       46296 :       if (istat .ne. 0) then
     571             :         print *,"trans_ev_tridi_to_band_&
     572             :         &MATH_DATATYPE&
     573           0 :         &: error when allocating row"//errorMessage
     574           0 :         stop 1
     575             :       endif
     576             : 
     577       46296 :       row(:) = 0.0_rck
     578             : 
     579             :       ! Copy q from a block cyclic distribution into a distribution with contiguous rows,
     580             :       ! and transpose the matrix using stripes of given stripe_width for cache blocking.
     581             : 
     582             :       ! The peculiar way it is done below is due to the fact that the last row should be
     583             :       ! ready first since it is the first one to start below
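                      :       ! Editorial sketch with hypothetical numbers: the block-cyclic owner of
                      :       ! global row i used below is src = mod((i-1)/nblk, np_rows); e.g. for
                      :       ! nblk = 2 and np_rows = 2, rows 1-2 belong to process row 0, rows 3-4 to
                      :       ! process row 1, rows 5-6 to process row 0, and so on. The loops below
                      :       ! redistribute from this layout into contiguous row ranges per process row.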
     584             : 
     585             : #ifdef WITH_OPENMP
     586             :       ! Please note about the OMP usage below:
     587             :       ! This is not for speed, but because we want the matrix a in the memory and
     588             :       ! in the cache of the correct thread (if possible)
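                      :       ! (Editorial note: on NUMA systems with a first-touch page placement
                      :       !  policy, the thread that first writes a memory page determines which
                      :       !  node the page is placed on; hence aIntern is zeroed inside the
                      :       !  parallel region below rather than at allocation time.)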
     589             : 
     590       23148 :       call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
     591       46296 :       !$omp parallel do private(my_thread), schedule(static, 1)
     592             :       do my_thread = 1, max_threads
     593       23148 :         aIntern(:,:,:,my_thread) = 0.0_rck ! if possible, do first touch allocation!
     594             :       enddo
     595             :       !$omp end parallel do
     596             : 
     597       23148 :       call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
     598             : #endif /* WITH_OPENMP */
     599             : 
     600      123456 :       do ip = np_rows-1, 0, -1
     601       77160 :         if (my_prow == ip) then
     602             :           ! Receive my rows which have not yet been received
     603       46296 :           src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
     604     5981496 :           do i=limits(ip)+1,limits(ip+1)
     605     5935200 :             src = mod((i-1)/nblk, np_rows)
     606             : 
     607     5935200 :             if (src < my_prow) then
     608             : #ifdef WITH_OPENMP
     609             : 
     610             : #ifdef WITH_MPI
     611      366516 :               if (wantDebug) call obj%timer%start("mpi_communication")
     612             :               call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
     613      366516 :                             src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
     614      366516 :               if (wantDebug) call obj%timer%stop("mpi_communication")
     615             : 
     616             : #else /* WITH_MPI */
     617             : 
     618             : !              row(1:l_nev) = row(1:l_nev)
     619             : 
     620             : #endif /* WITH_MPI */
     621             : 
     622      366516 :               call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
     623             : 
     624      733032 : !$omp parallel do private(my_thread), schedule(static, 1)
     625             :               do my_thread = 1, max_threads
     626             :                 call unpack_row_&
     627             :                      &MATH_DATATYPE&
     628             :                      &_cpu_openmp_&
     629             :                      &PRECISION &
     630             :                                   (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, &
     631      366516 :                                    thread_width, stripe_width, l_nev)
     632             : 
     633             :               enddo
     634             : !$omp end parallel do
     635             : 
     636      366516 :               call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
     637             : 
     638             : #else /* WITH_OPENMP */
     639      366516 :               if (useGPU) then
     640             :                 ! An unpacking of the current row group may occur before queuing the next row
     641             :                 call unpack_and_prepare_row_group_&
     642             :                 &MATH_DATATYPE&
     643             :                 &_gpu_&
     644             :                 &PRECISION &
     645             :                               ( &
     646             :                               row_group, row_group_dev, aIntern_dev, stripe_count, &
     647             :                                           stripe_width, last_stripe_width, a_dim2, l_nev,&
     648             :                                           row_group_size, nblk, unpack_idx, &
     649           0 :                                            i - limits(ip), .false.)
     650             : #ifdef WITH_MPI
     651           0 :                 if (wantDebug) call obj%timer%start("mpi_communication")
     652             :                 call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
     653           0 :                               src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
     654           0 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
     655             : 
     656             : #else /* WITH_MPI */
     657           0 :                 row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
     658             : #endif /* WITH_MPI */
     659             : 
     660             :               else ! useGPU
     661             : #ifdef WITH_MPI
     662      366516 :                 if (wantDebug) call obj%timer%start("mpi_communication")
     663             :                 call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
     664      366516 :                               src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
     665      366516 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
     666             : 
     667             : #else /* WITH_MPI */
     668             : 
     669             : !                row(1:l_nev) = row(1:l_nev)
     670             : 
     671             : #endif /* WITH_MPI */
     672             : 
     673             :                 call unpack_row_&
     674             :                      &MATH_DATATYPE&
     675             :                      &_cpu_&
     676             :                      &PRECISION &
     677      366516 :                                 (obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
     678             :               endif ! useGPU
     679             : #endif /* WITH_OPENMP */
     680             : 
     681     5202168 :             elseif (src == my_prow) then
     682             : 
     683     4545528 :               src_offset = src_offset+1
     684             : 
     685     4545528 :               if (useGPU) then
     686             : #ifndef WITH_OPENMP
     687             : 
     688             :                  ! An unpacking of the current row group may occur before queuing the next row
     689             :                  call unpack_and_prepare_row_group_&
     690             :                       &MATH_DATATYPE&
     691             :                       &_gpu_&
     692             :                       &PRECISION &
     693             :                   ( &
     694             :                                row_group, row_group_dev, aIntern_dev, stripe_count, &
     695             :                                stripe_width, last_stripe_width, a_dim2, l_nev,&
     696             :                                row_group_size, nblk, unpack_idx, &
     697           0 :                                i - limits(ip), .false.)
     698             : 
     699           0 :                 row_group(:, row_group_size) = q(src_offset, 1:l_nev)
     700             : #else /* WITH_OPENMP */
     701             : 
     702             : !#if COMPLEXCASE == 1
     703             : !! why is an cuda call in the openmp region?
     704             : !                call unpack_and_prepare_row_group_complex_gpu_&
     705             : !                     &PRECISION&
     706             : !                     &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
     707             : !                      last_stripe_width, a_dim2, l_nev, row_group_size, nblk,      &
     708             : !                      unpack_idx, i - limits(ip),.false.)
     709             : !                      row_group(:, row_group_size) = q(src_offset, 1:l_nev)
     710             : !#endif
     711             : 
     712             : #endif /* not OpenMP */
     713             :               else
     714     4545528 :                 row(:) = q(src_offset, 1:l_nev)
     715             :               endif
     716             : 
     717             : #ifdef WITH_OPENMP
     718     2272764 :               call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
     719             : 
     720     4545528 : !$omp parallel do private(my_thread), schedule(static, 1)
     721             :               do my_thread = 1, max_threads
     722             :                 call unpack_row_&
     723             :                      &MATH_DATATYPE&
     724             :                      &_cpu_openmp_&
     725             :                      &PRECISION &
     726     2272764 :                                    (obj,aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev)
     727             : 
     728             :               enddo
     729             : !$omp end parallel do
     730             : 
     731     2272764 :               call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
     732             : 
     733             : #else /* WITH_OPENMP */
     734             : 
     735     2272764 :               if (useGPU) then
     736             : 
     737             :               else
     738             :                 call unpack_row_&
     739             :                      &MATH_DATATYPE&
     740             :                      &_cpu_&
     741             :                      &PRECISION &
     742     2272764 :                                 (obj,aIntern, row,i-limits(ip),  stripe_count, stripe_width, last_stripe_width)
     743             :               endif
     744             : 
     745             : #endif /* WITH_OPENMP */
     746             : 
     747             :             endif
     748             :           enddo
     749             : 
      750             :           ! Send all rows which have not yet been sent
     751       46296 :           src_offset = 0
     752       61728 :           do dst = 0, ip-1
     753     1499232 :             do i=limits(dst)+1,limits(dst+1)
     754     1483800 :               if (mod((i-1)/nblk, np_rows) == my_prow) then
     755      656640 :                 src_offset = src_offset+1
     756      656640 :                 row(:) = q(src_offset, 1:l_nev)
     757             : 
     758             : #ifdef WITH_MPI
     759      656640 :                 if (wantDebug) call obj%timer%start("mpi_communication")
     760             :                 call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
     761      656640 :                               dst, 0, mpi_comm_rows, mpierr)
     762      656640 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
     763             : #endif /* WITH_MPI */
     764             :               endif
     765             :             enddo
     766             :           enddo
     767             : 
     768       30864 :         else if (my_prow < ip) then
     769             : 
     770             :           ! Send all rows going to PE ip
     771       15432 :           src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
     772     1499232 :           do i=limits(ip)+1,limits(ip+1)
     773     1483800 :             src = mod((i-1)/nblk, np_rows)
     774     1483800 :             if (src == my_prow) then
     775      733032 :               src_offset = src_offset+1
     776      733032 :               row(:) = q(src_offset, 1:l_nev)
     777             : #ifdef WITH_MPI
     778      733032 :               if (wantDebug) call obj%timer%start("mpi_communication")
     779             :               call MPI_Send(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
     780      733032 :                             ip, 0, mpi_comm_rows, mpierr)
     781      733032 :               if (wantDebug) call obj%timer%stop("mpi_communication")
     782             : #endif /* WITH_MPI */
     783             :             endif
     784             :           enddo
     785             : 
     786             :           ! Receive all rows from PE ip
     787     1499232 :           do i=limits(my_prow)+1,limits(my_prow+1)
     788     1483800 :             src = mod((i-1)/nblk, np_rows)
     789     1483800 :             if (src == ip) then
     790             : #ifdef WITH_OPENMP
     791             : 
     792             : #ifdef WITH_MPI
     793      328320 :               if (wantDebug) call obj%timer%start("mpi_communication")
     794             :               call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
     795      328320 :                             src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
     796      328320 :               if (wantDebug) call obj%timer%stop("mpi_communication")
     797             : #else /* WITH_MPI */
     798             : 
     799             : !              row(1:l_nev) = row(1:l_nev)
     800             : 
     801             : #endif /* WITH_MPI */
     802             : 
     803      328320 :               call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
     804      656640 : !$omp parallel do private(my_thread), schedule(static, 1)
     805             :               do my_thread = 1, max_threads
     806             :                 call unpack_row_&
     807             :                      &MATH_DATATYPE&
     808             :                      &_cpu_openmp_&
     809             :                      &PRECISION &
     810      328320 :                                  (obj,aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev)
     811             :               enddo
     812             : !$omp end parallel do
     813      328320 :               call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
     814             : 
     815             : #else /* WITH_OPENMP */
     816      328320 :               if (useGPU) then
     817             :                 ! An unpacking of the current row group may occur before queuing the next row
     818             :                 call unpack_and_prepare_row_group_&
     819             :                      &MATH_DATATYPE&
     820             :                      &_gpu_&
     821             :                      &PRECISION&
     822             :                      &( &
     823             :                   row_group, row_group_dev, aIntern_dev, stripe_count,  &
     824             :                   stripe_width, last_stripe_width, a_dim2, l_nev,       &
     825             :                   row_group_size, nblk, unpack_idx,                     &
     826           0 :                   i - limits(my_prow), .false.)
     827             : 
     828             : #ifdef WITH_MPI
     829           0 :                if (wantDebug) call obj%timer%start("mpi_communication")
     830             :                call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
     831           0 :                              src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
     832           0 :                if (wantDebug) call obj%timer%stop("mpi_communication")
     833             : #else /* WITH_MPI */
     834             : 
     835           0 :                row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
     836             : #endif /* WITH_MPI */
     837             : 
     838             :               else ! useGPU
     839             : #ifdef WITH_MPI
     840      328320 :                 if (wantDebug) call obj%timer%start("mpi_communication")
     841             :                 call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
     842      328320 :                               src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
     843      328320 :                 if (wantDebug) call obj%timer%stop("mpi_communication")
     844             : #else /* WITH_MPI */
     845             : 
     846             : !                row(1:l_nev) = row(1:l_nev)
     847             : 
     848             : #endif
     849             :                 call unpack_row_&
     850             :                      &MATH_DATATYPE&
     851             :                      &_cpu_&
     852             :                      &PRECISION &
     853      328320 :                                 (obj,aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
     854             :               endif ! useGPU
     855             : 
     856             : #endif /* WITH_OPENMP */
     857             : 
     858             :             endif
     859             :           enddo
     860             :         endif
     861             :       enddo
     862             : 
     863       46296 :       if (useGPU) then
     864             :         ! Force an unpacking of all remaining rows that haven't been unpacked yet
     865             :         call unpack_and_prepare_row_group_&
     866             :              &MATH_DATATYPE&
     867             :              &_gpu_&
     868             :              &PRECISION&
     869             :              &( &
     870             :           row_group, row_group_dev, aIntern_dev, stripe_count, &
     871             :           stripe_width, last_stripe_width, &
     872             :           a_dim2, l_nev, row_group_size, nblk, unpack_idx,     &
     873           0 :           -1, .true.)
     874             : 
     875           0 :         successCUDA = cuda_devicesynchronize()
     876             : 
     877           0 :          if (.not.(successCUDA)) then
     878             :            print *,"trans_ev_tridi_to_band_&
     879             :            &MATH_DATATYPE&
     880           0 :            &: error in cudaDeviceSynchronize"//errorMessage
     881           0 :            stop 1
     882             :          endif
     883             :       endif
     884             : 
     885             :       ! Set up result buffer queue
     886             : 
     887       46296 :       num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows
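                      :       ! Editorial sketch with hypothetical numbers: num_result_blocks is the
                      :       ! number of nblk-row blocks of the result owned by this process row in the
                      :       ! block-cyclic layout, e.g. na = 100, nblk = 2, np_rows = 2 gives
                      :       ! ((100-1)/2 + 2 - my_prow)/2 = 25 blocks for each of the two process rows
                      :       ! (50 blocks of 2 rows in total).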
     888             : 
     889       46296 :       num_result_buffers = 4*nfact
     890       46296 :       allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
     891       46296 :       if (istat .ne. 0) then
     892             :         print *,"trans_ev_tridi_to_band_&
     893             :         &MATH_DATATYPE&
     894           0 :         &: error when allocating result_buffer"//errorMessage
     895           0 :         stop 1
     896             :       endif
     897             : 
     898       46296 :       allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
     899       46296 :       if (istat .ne. 0) then
     900             :         print *,"trans_ev_tridi_to_band_&
     901             :         &MATH_DATATYPE&
     902           0 :         &: error when allocating result_send_request"//errorMessage
     903           0 :         stop 1
     904             :       endif
     905             : 
     906       46296 :       allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
     907       46296 :       if (istat .ne. 0) then
     908             :         print *,"trans_ev_tridi_to_band_&
     909             :         &MATH_DATATYPE&
     910           0 :         &: error when allocating result_recv_request"//errorMessage
     911           0 :         stop 1
     912             :       endif
     913             : 
     914             : #ifdef WITH_MPI
     915       30864 :       result_send_request(:) = MPI_REQUEST_NULL
     916       30864 :       result_recv_request(:) = MPI_REQUEST_NULL
     917             : #endif
     918             : 
     919             :       ! Queue up buffers
     920             : #ifdef WITH_MPI
     921       30864 :       if (wantDebug) call obj%timer%start("mpi_communication")
     922             : 
     923       30864 :       if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
     924       97680 :         do j = 1, min(num_result_buffers, num_result_blocks)
     925             :           call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL,     &
     926       82248 :                          0, result_recv_tag, mpi_comm_rows, result_recv_request(j), mpierr)
     927             :         enddo
     928             :       endif
     929       30864 :       if (wantDebug) call obj%timer%stop("mpi_communication")
     930             : #else /* WITH_MPI */
     931             : 
     932             :       ! careful: the "recv" has to be done at the corresponding wait or send
     933             :       ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf)
     934             : 
     935             : #endif /* WITH_MPI */
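             The section above keeps a small pool of result buffers, each with one MPI_Irecv
             outstanding, so that incoming result blocks overlap with the remaining
             computation. Below is a minimal sketch of posting such a pool in isolation; the
             communicator, the source rank 0, the tag 4711 and the real(8) datatype are
             illustrative assumptions, not ELPA's actual choices.

               subroutine post_result_recvs(comm, msg_len, buffers, requests)
                 use mpi
                 implicit none
                 integer, intent(in)    :: comm, msg_len
                 real(8), intent(inout) :: buffers(:,:)   ! (msg_len, number of buffers)
                 integer, intent(out)   :: requests(:)
                 integer :: j, ierr
                 ! one non-blocking receive per buffer; completed and re-posted elsewhere
                 do j = 1, size(requests)
                   call MPI_Irecv(buffers(1,j), msg_len, MPI_REAL8, 0, 4711, &
                                  comm, requests(j), ierr)
                 end do
               end subroutine post_result_recvs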
     936             : 
     937       46296 :       num_bufs_recvd = 0 ! No buffers received yet
     938             : 
     939             :       ! Initialize top/bottom requests
     940             : 
     941       46296 :       allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
     942       46296 :        if (istat .ne. 0) then
     943             :          print *,"trans_ev_tridi_to_band_&
     944             :          &MATH_DATATYPE&
     945           0 :          &: error when allocating top_send_request"//errorMessage
     946           0 :          stop 1
     947             :        endif
     948             : 
     949       46296 :       allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
     950       46296 :        if (istat .ne. 0) then
     951             :          print *,"trans_ev_tridi_to_band_&
     952             :          &MATH_DATATYPE&
     953           0 :          &: error when allocating top_recv_request"//errorMessage
     954           0 :          stop 1
     955             :        endif
     956             : 
     957       46296 :       allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
     958       46296 :        if (istat .ne. 0) then
     959             :          print *,"trans_ev_tridi_to_band_&
     960             :          &MATH_DATATYPE&
     961           0 :          &: error when allocating bottom_send_request"//errorMessage
     962           0 :          stop 1
     963             :        endif
     964             : 
     965       46296 :       allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
     966       46296 :        if (istat .ne. 0) then
     967             :          print *,"trans_ev_tridi_to_band_&
     968             :          &MATH_DATATYPE&
     969           0 :          &: error when allocating bottom_recv_request"//errorMessage
     970           0 :          stop 1
     971             :        endif
     972             : 
     973             : #ifdef WITH_MPI
     974       30864 :       top_send_request(:) = MPI_REQUEST_NULL
     975       30864 :       top_recv_request(:) = MPI_REQUEST_NULL
     976       30864 :       bottom_send_request(:) = MPI_REQUEST_NULL
     977       30864 :       bottom_recv_request(:) = MPI_REQUEST_NULL
     978             : #endif
     979             : 
     980             : #ifdef WITH_OPENMP
     981       23148 :       allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
     982       23148 :        if (istat .ne. 0) then
     983             :          print *,"trans_ev_tridi_to_band_&
     984             :          &MATH_DATATYPE&
     985           0 :          &: error when allocating top_border_send_buffer"//errorMessage
     986           0 :          stop 1
     987             :        endif
     988             : 
     989       23148 :       allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
     990       23148 :        if (istat .ne. 0) then
     991             :          print *,"trans_ev_tridi_to_band_&
     992             :          &MATH_DATATYPE&
     993           0 :          &: error when allocating top_border_recv_buffer"//errorMessage
     994           0 :          stop 1
     995             :        endif
     996             : 
     997       23148 :       allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
     998       23148 :        if (istat .ne. 0) then
     999             :          print *,"trans_ev_tridi_to_band_&
    1000             :          &MATH_DATATYPE&
    1001           0 :          &: error when allocating bottom_border_send_buffer"//errorMessage
    1002           0 :          stop 1
    1003             :        endif
    1004             : 
    1005       23148 :       allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
    1006       23148 :        if (istat .ne. 0) then
    1007             :          print *,"trans_ev_tridi_to_band_&
    1008             :          &MATH_DATATYPE&
    1009           0 :          &: error when allocating bottom_border_recv_buffer"//errorMessage
    1010           0 :          stop 1
    1011             :        endif
    1012             : 
    1013       23148 :       top_border_send_buffer(:,:) = 0.0_rck
    1014       23148 :       top_border_recv_buffer(:,:) = 0.0_rck
    1015       23148 :       bottom_border_send_buffer(:,:) = 0.0_rck
    1016       23148 :       bottom_border_recv_buffer(:,:) = 0.0_rck
    1017             :       ! Initialize broadcast buffer
    1018             : 
    1019             : #else /* WITH_OPENMP */
    1020             : 
    1021       23148 :        allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
    1022       23148 :        if (istat .ne. 0) then
    1023             :          print *,"trans_ev_tridi_to_band_&
    1024             :          &MATH_DATATYPE&
    1025           0 :          &: error when allocating top_border_send_buffer"//errorMessage
    1026           0 :          stop 1
    1027             :        endif
    1028             : 
    1029       23148 :       allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
    1030       23148 :        if (istat .ne. 0) then
    1031             :          print *,"trans_ev_tridi_to_band_&
    1032             :          &MATH_DATATYPE&
    1033           0 :          &: error when allocating top_border_recv_buffer"//errorMessage
    1034           0 :          stop 1
    1035             :        endif
    1036             : 
    1037       23148 :       allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
    1038       23148 :        if (istat .ne. 0) then
    1039             :          print *,"trans_ev_tridi_to_band_&
    1040             :          &MATH_DATATYPE&
    1041           0 :          &: error when allocating bottom_border_send_buffer"//errorMessage
    1042           0 :          stop 1
    1043             :        endif
    1044             : 
    1045       23148 :       allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
    1046       23148 :        if (istat .ne. 0) then
    1047             :          print *,"trans_ev_tridi_to_band_&
    1048             :          &MATH_DATATYPE&
    1049           0 :          &: error when allocating bottom_border_recv_buffer"//errorMessage
    1050           0 :          stop 1
    1051             :        endif
    1052             : 
    1053       23148 :       top_border_send_buffer(:,:,:) = 0.0_rck
    1054       23148 :       top_border_recv_buffer(:,:,:) = 0.0_rck
    1055       23148 :       bottom_border_send_buffer(:,:,:) = 0.0_rck
    1056       23148 :       bottom_border_recv_buffer(:,:,:) = 0.0_rck
    1057             : #endif /* WITH_OPENMP */
    1058             : 
    1059             :       ! Initialize broadcast buffer
    1060             : 
    1061       46296 :       allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
    1062       46296 :        if (istat .ne. 0) then
    1063             :          print *,"trans_ev_tridi_to_band_&
    1064             :          &MATH_DATATYPE&
    1065           0 :          &: error when allocating bcast_buffer"//errorMessage
    1066           0 :          stop 1
    1067             :        endif
    1068             : 
    1069       46296 :       bcast_buffer = 0.0_rck
    1070       46296 :       if (useGPU) then
    1071           0 :         num =  ( nbw * max_blk_size) * size_of_datatype
    1072           0 :         successCUDA = cuda_malloc(bcast_buffer_dev, num)
    1073           0 :         if (.not.(successCUDA)) then
    1074             :           print *,"trans_ev_tridi_to_band_&
    1075             :           &MATH_DATATYPE&
    1076           0 :           &: error in cudaMalloc"
    1077           0 :           stop 1
    1078             :         endif
    1079             : 
    1080           0 :         successCUDA = cuda_memset( bcast_buffer_dev, 0, num)
    1081           0 :         if (.not.(successCUDA)) then
    1082             :           print *,"trans_ev_tridi_to_band_&
    1083             :           &MATH_DATATYPE&
    1084           0 :           &: error in cudaMemset"
    1085           0 :           stop 1
    1086             :         endif
    1087             : 
    1088           0 :         num =  ((max_blk_size-1))* size_of_datatype
    1089           0 :         successCUDA = cuda_malloc( hh_dot_dev, num)
    1090           0 :         if (.not.(successCUDA)) then
    1091             :           print *,"trans_ev_tridi_to_band_&
    1092             :           &MATH_DATATYPE&
    1093           0 :           &: error in cudaMalloc"
    1094           0 :           stop 1
    1095             :         endif
    1096             : 
    1097           0 :         successCUDA = cuda_memset( hh_dot_dev, 0, num)
    1098           0 :         if (.not.(successCUDA)) then
    1099             :           print *,"trans_ev_tridi_to_band_&
    1100             :           &MATH_DATATYPE&
    1101           0 :           &: error in cudaMemset"
    1102           0 :           stop 1
    1103             :         endif
    1104             : 
    1105           0 :         num =  (max_blk_size)* size_of_datatype
    1106           0 :         successCUDA = cuda_malloc( hh_tau_dev, num)
    1107           0 :         if (.not.(successCUDA)) then
    1108             :           print *,"trans_ev_tridi_to_band_&
    1109             :           &MATH_DATATYPE&
    1110           0 :           &: error in cudaMalloc"
    1111           0 :           stop 1
    1112             :         endif
    1113             : 
    1114           0 :         successCUDA = cuda_memset( hh_tau_dev, 0, num)
    1115           0 :         if (.not.(successCUDA)) then
    1116             :           print *,"trans_ev_tridi_to_band_&
    1117             :           &MATH_DATATYPE&
    1118           0 :           &: error in cudaMemset"
    1119           0 :           stop 1
    1120             :         endif
    1121             :       endif ! useGPU
    1122             : 
    1123       46296 :       current_tv_off = 0 ! Offset of next row to be broadcast
    1124             : 
    1125             :       ! ------------------- start of work loop -------------------
    1126             : 
    1127       46296 :       a_off = 0 ! offset in aIntern (to avoid unnecessary shifts)
    1128             : 
    1129       46296 :       top_msg_length = 0
    1130       46296 :       bottom_msg_length = 0
    1131             : 
    1132      273312 :       do sweep = 0, (na-1)/nbw
    1133             : 
    1134      227016 :         current_n = na - sweep*nbw
    1135      227016 :         call determine_workload(obj,current_n, nbw, np_rows, limits)
    1136      227016 :         current_n_start = limits(my_prow)
    1137      227016 :         current_n_end   = limits(my_prow+1)
    1138      227016 :         current_local_n = current_n_end - current_n_start
    1139             : 
    1140      227016 :         next_n = max(current_n - nbw, 0)
    1141      227016 :         call determine_workload(obj,next_n, nbw, np_rows, limits)
    1142      227016 :         next_n_start = limits(my_prow)
    1143      227016 :         next_n_end   = limits(my_prow+1)
    1144      227016 :         next_local_n = next_n_end - next_n_start
    1145             : 
    1146      227016 :         if (next_n_end < next_n) then
    1147       44808 :           bottom_msg_length = current_n_end - next_n_end
    1148             :         else
    1149      182208 :           bottom_msg_length = 0
    1150             :         endif
    1151             : 
    1152      227016 :         if (next_local_n > 0) then
    1153      165288 :           next_top_msg_length = current_n_start - next_n_start
    1154             :         else
    1155       61728 :           next_top_msg_length = 0
    1156             :         endif
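             For orientation, the sweep bookkeeping above can be reproduced stand-alone; na
             and nbw below are assumed example values. Each sweep retires one block of nbw
             Householder vectors, so the active problem size current_n shrinks by nbw per
             sweep until next_n reaches zero.

               program sweep_sizes
                 implicit none
                 integer, parameter :: na = 1000, nbw = 64
                 integer :: sweep, current_n, next_n
                 do sweep = 0, (na-1)/nbw
                   current_n = na - sweep*nbw          ! size still to be transformed
                   next_n    = max(current_n - nbw, 0) ! size left after this sweep
                   print '(a,i0,a,i0,a,i0)', 'sweep ', sweep, ': current_n = ', current_n, &
                         ', next_n = ', next_n
                 end do
               end program sweep_sizes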
    1157             : 
    1158      227016 :         if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
    1159             : #ifdef WITH_MPI
    1160       15432 :           if (wantDebug) call obj%timer%start("mpi_communication")
    1161             : #endif
    1162       77112 :           do i = 1, stripe_count
    1163             : 
    1164             : #ifdef WITH_OPENMP
    1165             : 
    1166       30840 :             if (useGPU) then
    1167           0 :               print *,"trans_ev_tridi_to_band_real: not yet implemented"
    1168           0 :               stop 1
    1169             :             endif
    1170             : 
    1171       30840 :             csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width"
    1172       30840 :             b_len = csw*nbw*max_threads
    1173             : #ifdef WITH_MPI
    1174             :             call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1175       30840 :                            my_prow+1, bottom_recv_tag, mpi_comm_rows, bottom_recv_request(i), mpierr)
    1176             : 
    1177             : #else /* WITH_MPI */
    1178             : !            careful: the "receive" has to be done at the corresponding wait or send
    1179             : !            bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
    1180             : #endif /* WITH_MPI */
    1181             : 
    1182             : #else /* WITH_OPENMP */
    1183             : 
    1184             : #ifdef WITH_MPI
    1185             :             call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1186       30840 :                            my_prow+1, bottom_recv_tag, mpi_comm_rows, bottom_recv_request(i), mpierr)
    1187             : #else  /* WITH_MPI */
    1188             : !            careful: the receive has to be done at the corresponding wait or send
    1189             : !            bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
    1190             : #endif /* WITH_MPI */
    1191             : 
    1192             : #endif /* WITH_OPENMP */
    1193             : 
    1194             :           enddo
    1195             : #ifdef WITH_MPI
    1196       15432 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    1197             : #endif
    1198             :         endif
    1199             : 
    1200      227016 :         if (current_local_n > 1) then
    1201      211584 :           if (my_pcol == mod(sweep,np_cols)) then
    1202             :             bcast_buffer(:,1:current_local_n) =    &
    1203      211584 :                   hh_trans(:,current_tv_off+1:current_tv_off+current_local_n)
    1204      211584 :             current_tv_off = current_tv_off + current_local_n
    1205             :           endif
    1206             : 
    1207             : #ifdef WITH_MPI
    1208      135912 :            if (wantDebug) call obj%timer%start("mpi_communication")
    1209             :            call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1210      135912 :                           mod(sweep,np_cols), mpi_comm_cols, mpierr)
    1211      135912 :            if (wantDebug) call obj%timer%stop("mpi_communication")
    1212             : 
    1213             : #endif /* WITH_MPI */
    1214             : 
    1215      211584 :           if (useGPU) then
    1216             :             successCUDA =  cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)),  &
    1217             :                                        nbw * current_local_n *    &
    1218             :                                        size_of_datatype, &
    1219           0 :                                        cudaMemcpyHostToDevice)
    1220           0 :             if (.not.(successCUDA)) then
    1221             :               print *,"trans_ev_tridi_to_band_&
    1222             :               &MATH_DATATYPE&
    1223           0 :               &: error in cudaMemcpy"
    1224           0 :               stop 1
    1225             :             endif
    1226             : 
    1227             :             call extract_hh_tau_&
    1228             :                  &MATH_DATATYPE&
    1229             :                  &_gpu_&
    1230             :                  &PRECISION &
    1231             : !#if REALCASE == 1
    1232             :                           (bcast_buffer_dev, hh_tau_dev, nbw, &
    1233             : !#endif
    1234             : !#if COMPLEXCASE == 1
    1235             : !                          (                              nbw, &
    1236             : !#endif
    1237           0 :                            current_local_n, .false.)
    1238             :       call compute_hh_dot_products_&
    1239             :            &MATH_DATATYPE&
    1240             :            &_gpu_&
    1241             :            &PRECISION &
    1242             :                      (bcast_buffer_dev, hh_dot_dev, nbw, &
    1243           0 :                             current_local_n)
    1244             :           endif ! useGPU
    1245             : 
    1246             :         else ! (current_local_n > 1) then
    1247             : 
    1248             :           ! for current_local_n == 1 the one and only HH vector is 0 and is not stored in hh_trans_real/complex
    1249       15432 :           bcast_buffer(:,1) = 0.0_rck
    1250       15432 :           if (useGPU) then
    1251           0 :             successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_datatype)
    1252           0 :             if (.not.(successCUDA)) then
    1253             :               print *,"trans_ev_tridi_to_band_&
    1254             :               &MATH_DATATYPE&
    1255           0 :               &: error in cudaMemset"
    1256           0 :               stop 1
    1257             :             endif
    1258             : 
    1259             :             call extract_hh_tau_&
    1260             :                  &MATH_DATATYPE&
    1261             :                  &_gpu_&
    1262             :                  &PRECISION&
    1263             :                  &( &
    1264             :         bcast_buffer_dev, hh_tau_dev, &
    1265           0 :         nbw, 1, .true.)
    1266             :           endif ! useGPU
    1267             :         endif ! (current_local_n > 1) then
    1268             : 
    1269      227016 :         if (l_nev == 0) cycle
    1270             : 
    1271      227016 :         if (current_local_n > 0) then
    1272             : 
    1273     1375488 :           do i = 1, stripe_count
    1274             : #ifdef WITH_OPENMP
    1275      581952 :             if (useGPU) then
    1276           0 :               print *,"trans_ev_tridi_to_band_real: not yet implemented"
    1277           0 :               stop 1
    1278             :             endif
    1279             : 
    1280             :             ! Get the real stripe width for stripe i;
    1281             :             ! The last OpenMP tasks may have an even smaller stripe width,
    1282             :             ! but we don't care about this, i.e. we send/recv a bit too much in this case.
    1283             :             ! csw: current_stripe_width
    1284             : 
    1285      581952 :             csw = min(stripe_width, thread_width-(i-1)*stripe_width)
    1286             : #endif /* WITH_OPENMP */
    1287             : 
    1288             :             !wait_b
    1289     1163904 :             if (current_n_end < current_n) then
    1290             : 
    1291             : 
    1292             : #ifdef WITH_OPENMP
    1293      173424 :               if (useGPU) then
    1294           0 :                 print *,"trans_ev_tridi_to_band_real: not yet implemented"
    1295           0 :                 stop 1
    1296             :               endif
    1297             : 
    1298             : #ifdef WITH_MPI
    1299      173424 :               if (wantDebug) call obj%timer%start("mpi_communication")
    1300             : 
    1301      173424 :               call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
    1302      173424 :               if (wantDebug) call obj%timer%stop("mpi_communication")
    1303             : #endif
    1304      173424 :               call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
    1305      346848 : !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
    1306             :               do my_thread = 1, max_threads
    1307      173424 :                 n_off = current_local_n+a_off
    1308      173424 :                 b_len = csw*nbw
    1309      173424 :                 b_off = (my_thread-1)*b_len
    1310             :                 aIntern(1:csw,n_off+1:n_off+nbw,i,my_thread) = &
    1311      346848 :                   reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /))
    1312             :               enddo
    1313             : !$omp end parallel do
    1314      173424 :               call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
    1315             : #else /* WITH_OPENMP */
    1316             : 
    1317             : #ifdef WITH_MPI
    1318      173424 :               if (wantDebug) call obj%timer%start("mpi_communication")
    1319      173424 :               call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
    1320      173424 :               if (wantDebug) call obj%timer%stop("mpi_communication")
    1321             : 
    1322             : #endif
    1323      173424 :               n_off = current_local_n+a_off
    1324             : 
    1325      173424 :               if (useGPU) then
    1326           0 :                 dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype
    1327             :                 successCUDA =  cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), &
    1328             :                                            stripe_width*nbw*  size_of_datatype,    &
    1329           0 :                                            cudaMemcpyHostToDevice)
    1330           0 :                 if (.not.(successCUDA)) then
    1331             :                   print *,"trans_ev_tridi_to_band_&
    1332             :                   &MATH_DATATYPE&
    1333           0 :                   &: error in cudaMemcpy"
    1334           0 :                   stop 1
    1335             :                 endif
    1336             : 
    1337             :               else
    1338      173424 :                 aIntern(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i)
    1339             :               endif
    1340             : 
    1341             : #endif /* WITH_OPENMP */
    1342             : 
    1343      346848 :            if (next_n_end < next_n) then
    1344             : 
    1345             : #ifdef WITH_OPENMP
    1346             : 
    1347      142584 :              if (useGPU) then
    1348           0 :                print *,"trans_ev_tridi_to_band_real: not yet implemented"
    1349           0 :                stop 1
    1350             :              endif
    1351             : #ifdef WITH_MPI
    1352      142584 :              if (wantDebug) call obj%timer%start("mpi_communication")
    1353             :              call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1354      142584 :                             my_prow+1, bottom_recv_tag, mpi_comm_rows, bottom_recv_request(i), mpierr)
    1355      142584 :              if (wantDebug) call obj%timer%stop("mpi_communication")
    1356             : 
    1357             : #else /* WITH_MPI */
    1358             : !                careful: the receive has to be done at the corresponding wait or send
    1359             : !                bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
    1360             : 
    1361             : #endif /* WITH_MPI */
    1362             : 
    1363             : #else /* WITH_OPENMP */
    1364             : 
    1365             : #ifdef WITH_MPI
    1366      142584 :              if (wantDebug) call obj%timer%start("mpi_communication")
    1367             :              call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1368      142584 :                             my_prow+1, bottom_recv_tag, mpi_comm_rows, bottom_recv_request(i), mpierr)
    1369      142584 :               if (wantDebug) call obj%timer%stop("mpi_communication")
    1370             : 
    1371             : #else /* WITH_MPI */
    1372             : 
    1373             : !!                careful: the receive has to be done at the corresponding wait or send
    1374             : !!                bottom_border_recv_buffer(1:stripe_width,1:nbw,i) =  top_border_send_buffer(1:stripe_width,1:nbw,i)
    1375             : 
    1376             : #endif /* WITH_MPI */
    1377             : 
    1378             : #endif /* WITH_OPENMP */
    1379             :            endif
    1380             :          endif
    1381             : 
    1382     1163904 :          if (current_local_n <= bottom_msg_length + top_msg_length) then
    1383             : 
    1384             :            !wait_t
    1385           0 :            if (top_msg_length>0) then
    1386             : 
    1387             : #ifdef WITH_OPENMP
    1388           0 :              if (useGPU) then
    1389             :                print *,"trans_ev_tridi_to_band_&
    1390             :                        &MATH_DATATYPE&
    1391           0 :                        &: not yet implemented"
    1392           0 :                stop 1
    1393             :              endif
    1394             : #ifdef WITH_MPI
    1395           0 :              if (wantDebug) call obj%timer%start("mpi_communication")
    1396             : 
    1397           0 :              call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
    1398           0 :              if (wantDebug) call obj%timer%stop("mpi_communication")
    1399             : #endif
    1400             : 
    1401             : #else /* WITH_OPENMP */
    1402             : 
    1403             : #ifdef WITH_MPI
    1404           0 :              if (wantDebug) call obj%timer%start("mpi_communication")
    1405           0 :              call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
    1406           0 :              if (wantDebug) call obj%timer%stop("mpi_communication")
    1407             : #endif
    1408             : 
    1409           0 :              if (useGPU) then
    1410           0 :                dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
    1411             :                !             host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
    1412             :                successCUDA =  cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)),  &
    1413             :                                            stripe_width*top_msg_length* size_of_datatype,      &
    1414           0 :                                            cudaMemcpyHostToDevice)
    1415           0 :                if (.not.(successCUDA)) then
    1416             :                  print *,"trans_ev_tridi_to_band_&
    1417             :                          &MATH_DATATYPE&
    1418           0 :                          &: error in cudaMemcpy"
    1419           0 :                  stop 1
    1420             :                 endif
    1421             :              else ! useGPU
    1422           0 :                aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
    1423             :              endif ! useGPU
    1424             : #endif /* WITH_OPENMP */
    1425             :            endif ! top_msg_length
    1426             : 
    1427             :            !compute
    1428             : #ifdef WITH_OPENMP
    1429             : 
    1430           0 :            call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
    1431             : 
    1432           0 : !$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
    1433             :            do my_thread = 1, max_threads
    1434           0 :              if (top_msg_length>0) then
    1435           0 :                b_len = csw*top_msg_length
    1436           0 :                b_off = (my_thread-1)*b_len
    1437             :                aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
    1438           0 :                           reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
    1439             :              endif
    1440             : 
    1441             :        call compute_hh_trafo_&
    1442             :             &MATH_DATATYPE&
    1443             :             &_openmp_&
    1444             :             &PRECISION &
    1445             :                               (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, &
    1446             :              l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
    1447             : #if REALCASE == 1
    1448             :                                hh_dot_dev, &
    1449             : #endif
    1450             :                                hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, &
    1451           0 :              i, my_thread, thread_width, kernel)
    1452             :            enddo
    1453             : !$omp end parallel do
    1454           0 :            call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
    1455             : 
    1456             : #else /* WITH_OPENMP */
    1457             : 
    1458             :            call compute_hh_trafo_&
    1459             :                 &MATH_DATATYPE&
    1460             :                 &_&
    1461             :                 &PRECISION&
    1462             :      & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,       &
    1463             :               a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev,      &
    1464             : #if REALCASE == 1
    1465             :               hh_dot_dev, &
    1466             : #endif
    1467             :               hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i,          &
    1468           0 :               last_stripe_width, kernel)
    1469             : #endif /* WITH_OPENMP */
    1470             : 
    1471             :            !send_b        1
    1472             : #ifdef WITH_MPI
    1473           0 :            if (wantDebug) call obj%timer%start("mpi_communication")
    1474           0 :            call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
    1475           0 :            if (wantDebug) call obj%timer%stop("mpi_communication")
    1476             : #endif
    1477             : 
    1478           0 :            if (bottom_msg_length>0) then
    1479           0 :              n_off = current_local_n+nbw-bottom_msg_length+a_off
    1480             : #ifdef WITH_OPENMP
    1481           0 :              b_len = csw*bottom_msg_length*max_threads
    1482             :              bottom_border_send_buffer(1:b_len,i) = &
    1483           0 :                  reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
    1484             : #ifdef WITH_MPI
    1485           0 :              if (wantDebug) call obj%timer%start("mpi_communication")
    1486             :              call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1487           0 :                             my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
    1488           0 :              if (wantDebug) call obj%timer%stop("mpi_communication")
    1489             : #else /* WITH_MPI */
    1490           0 :              if (next_top_msg_length > 0) then
    1491             :                top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* &
    1492           0 :                                             next_top_msg_length*max_threads,i)
    1493             :              endif
    1494             : 
    1495             : #endif /* WITH_MPI */
    1496             : 
    1497             : !#if REALCASE == 1
    1498             :            endif ! this endif is absent in the complex case; it closes the bottom_msg_length block
    1499             : !#endif
    1500             : 
    1501             : #else /* WITH_OPENMP */
    1502             : 
    1503           0 :              if (useGPU) then
    1504           0 :                dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
    1505             :                successCUDA =  cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
    1506             :                                           stripe_width * bottom_msg_length * size_of_datatype,      &
    1507           0 :                                           cudaMemcpyDeviceToHost)
    1508           0 :                if (.not.(successCUDA)) then
    1509             :                  print *,"trans_ev_tridi_to_band_&
    1510             :                  &MATH_DATATYPE&
    1511           0 :                  &: error in cudaMemcpy"
    1512           0 :                  stop 1
    1513             :                endif
    1514             :              else
    1515           0 :                bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
    1516             :              endif
    1517             : #ifdef WITH_MPI
    1518           0 :              if (wantDebug) call obj%timer%start("mpi_communication")
    1519             :              call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width,  &
    1520           0 :                    MPI_MATH_DATATYPE_PRECISION_EXPL, my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
    1521           0 :              if (wantDebug) call obj%timer%stop("mpi_communication")
    1522             : 
    1523             : #else /* WITH_MPI */
    1524           0 :                 if (next_top_msg_length > 0) then
    1525             :                   top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) =  &
    1526           0 :                   bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
    1527             :                 endif
    1528             : 
    1529             : #endif /* WITH_MPI */
    1530             :            endif
    1531             : #endif /* WITH_OPENMP */
    1532             : 
    1533             :          else ! current_local_n <= bottom_msg_length + top_msg_length
    1534             : 
    1535             :          !compute
    1536             : #ifdef WITH_OPENMP
    1537      581952 :          if (useGPU) then
    1538           0 :            print *,"trans_ev_tridi_to_band_real: not yet implemented"
    1539           0 :            stop 1
    1540             :          endif
    1541      581952 :          call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
    1542             : 
    1543     1163904 :         !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
    1544             :         do my_thread = 1, max_threads
    1545             : 
    1546             :           call compute_hh_trafo_&
    1547             :                &MATH_DATATYPE&
    1548             :                &_openmp_&
    1549             :                &PRECISION&
    1550             :                & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
    1551             :                  nbw, max_blk_size,  bcast_buffer, bcast_buffer_dev, &
    1552             : #if REALCASE == 1
    1553             :             hh_dot_dev,  &
    1554             : #endif
    1555             :             hh_tau_dev, kernel_flops, kernel_time, n_times, current_local_n - bottom_msg_length, &
    1556      581952 :       bottom_msg_length, i, my_thread, thread_width, kernel)
    1557             :         enddo
    1558             : !$omp end parallel do
    1559      581952 :         call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
    1560             : 
    1561             :         !send_b
    1562             : #ifdef WITH_MPI
    1563      377688 :         if (wantDebug) call obj%timer%start("mpi_communication")
    1564      377688 :         call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
    1565      377688 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    1566             : #endif
    1567      581952 :         if (bottom_msg_length > 0) then
    1568      142584 :           n_off = current_local_n+nbw-bottom_msg_length+a_off
    1569      142584 :           b_len = csw*bottom_msg_length*max_threads
    1570             :           bottom_border_send_buffer(1:b_len,i) = &
    1571      142584 :               reshape(aIntern(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
    1572             : #ifdef WITH_MPI
    1573      142584 :           if (wantDebug) call obj%timer%start("mpi_communication")
    1574             :           call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1575      142584 :                          my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
    1576      142584 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    1577             : 
    1578             : #else /* WITH_MPI */
    1579           0 :           if (next_top_msg_length > 0) then
    1580             :             top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw* &
    1581             :                                                                                                      next_top_msg_length*&
    1582           0 :                                                           max_threads,i)
    1583             :           endif
    1584             : #endif /* WITH_MPI */
    1585             :         endif
    1586             : 
    1587             : #else /* WITH_OPENMP */
    1588             : 
    1589             :         call compute_hh_trafo_&
    1590             :              &MATH_DATATYPE&
    1591             :              &_&
    1592             :              &PRECISION&
    1593             :              & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,       &
    1594             :                       a_off,  nbw, max_blk_size, bcast_buffer, bcast_buffer_dev,      &
    1595             : #if REALCASE == 1
    1596             :             hh_dot_dev, &
    1597             : #endif
    1598             :            hh_tau_dev, kernel_flops, kernel_time, n_times,                 &
    1599             :            current_local_n - bottom_msg_length, bottom_msg_length, i,             &
    1600      581952 :            last_stripe_width, kernel)
    1601             : 
    1602             : 
    1603             : 
    1604             :         !send_b
    1605             : #ifdef WITH_MPI
    1606      377688 :         if (wantDebug) call obj%timer%start("mpi_communication")
    1607             : 
    1608      377688 :         call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
    1609      377688 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    1610             : #endif
    1611      581952 :         if (bottom_msg_length > 0) then
    1612      142584 :           n_off = current_local_n+nbw-bottom_msg_length+a_off
    1613             : 
    1614      142584 :           if (useGPU) then
    1615           0 :             dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
    1616             :             successCUDA =  cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset,  &
    1617             :                                          stripe_width*bottom_msg_length* size_of_datatype,  &
    1618           0 :                                          cudaMemcpyDeviceToHost)
    1619           0 :             if (.not.(successCUDA)) then
    1620             :               print *,"trans_ev_tridi_to_band_&
    1621             :               &MATH_DATATYPE&
    1622           0 :               &: error in cudaMemcpy"
    1623           0 :               stop 1
    1624             :             endif
    1625             :           else
    1626      142584 :             bottom_border_send_buffer(:,1:bottom_msg_length,i) = aIntern(:,n_off+1:n_off+bottom_msg_length,i)
    1627             :           endif
    1628             : 
    1629             : #ifdef WITH_MPI
    1630      142584 :           if (wantDebug) call obj%timer%start("mpi_communication")
    1631             :           call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, &
    1632      142584 :            MPI_MATH_DATATYPE_PRECISION_EXPL, my_prow+1, top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
    1633      142584 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    1634             : #else /* WITH_MPI */
    1635           0 :                 if (next_top_msg_length > 0) then
    1636             :                   top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) =  &
    1637           0 :                   bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
    1638             :                 endif
    1639             : 
    1640             : #endif /* WITH_MPI */
    1641             : 
    1642             : #if REALCASE == 1
    1643             :         endif
    1644             : #endif
    1645             : 
    1646             : #endif /* WITH_OPENMP */
    1647             : 
    1648             : #ifndef WITH_OPENMP
    1649             : #if COMPLEXCASE == 1
    1650             :         endif
    1651             : #endif
    1652             : #endif
    1653             :         !compute
    1654             : #ifdef WITH_OPENMP
    1655             : 
    1656      581952 :         call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
    1657             : 
    1658     1163904 : !$omp parallel do private(my_thread), schedule(static, 1)
    1659             :         do my_thread = 1, max_threads
    1660             :           call compute_hh_trafo_&
    1661             :           &MATH_DATATYPE&
    1662             :           &_openmp_&
    1663             :           &PRECISION&
    1664             :           & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, &
    1665             :              nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
    1666             : #if REALCASE == 1
    1667             :              hh_dot_dev, &
    1668             : #endif
    1669             :              hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length,&
    1670             :              current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, &
    1671      581952 :              kernel)
    1672             :         enddo
    1673             : !$omp end parallel do
    1674      581952 :         call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
    1675             : 
    1676             : #else /* WITH_OPENMP */
    1677             : 
    1678             :         call compute_hh_trafo_&
    1679             :              &MATH_DATATYPE&
    1680             :              &_&
    1681             :              &PRECISION&
    1682             :              & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,           &
    1683             :                       a_off,  nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
    1684             : #if REALCASE == 1
    1685             :             hh_dot_dev,     &
    1686             : #endif
    1687             :            hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length,                    &
    1688             :            current_local_n-top_msg_length-bottom_msg_length, i,                       &
    1689      581952 :            last_stripe_width, kernel)
    1690             : 
    1691             : #endif /* WITH_OPENMP */
    1692             : 
    1693             :         !wait_t
    1694      959640 :         if (top_msg_length>0) then
    1695             : #ifdef WITH_OPENMP
    1696             : 
    1697             : #ifdef WITH_MPI
    1698      142584 :           if (wantDebug) call obj%timer%start("mpi_communication")
    1699      142584 :           call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
    1700      142584 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    1701             : #endif
    1702             : 
    1703             : #else /* WITH_OPENMP */
    1704             : 
    1705             : #ifdef WITH_MPI
    1706      142584 :           if (wantDebug) call obj%timer%start("mpi_communication")
    1707      142584 :           call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
    1708      142584 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    1709             : #endif
    1710      142584 :           if (useGPU) then
    1711           0 :             dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
    1712             :             successCUDA =  cuda_memcpy( aIntern_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)),  &
    1713             :                                        stripe_width * top_msg_length * size_of_datatype,   &
    1714           0 :                cudaMemcpyHostToDevice)
    1715           0 :             if (.not.(successCUDA)) then
    1716             :               print *,"trans_ev_tridi_to_band_&
    1717             :                       &MATH_DATATYPE&
    1718           0 :                       &: error in cudaMemcpy"
    1719           0 :               stop 1
    1720             :             endif
    1721             :           else
    1722      142584 :             aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
    1723             :           endif
    1724             : #endif /* WITH_OPENMP */
    1725             :         endif
    1726             : 
    1727             :         !compute
    1728             : #ifdef WITH_OPENMP
    1729             : 
    1730      581952 :         call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
    1731             : 
    1732     1163904 : !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
    1733             :         do my_thread = 1, max_threads
    1734      581952 :           if (top_msg_length>0) then
    1735      142584 :             b_len = csw*top_msg_length
    1736      142584 :             b_off = (my_thread-1)*b_len
    1737             :             aIntern(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
    1738      142584 :               reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
    1739             :           endif
    1740             :           call compute_hh_trafo_&
    1741             :                &MATH_DATATYPE&
    1742             :                &_openmp_&
    1743             :                &PRECISION&
    1744             :                & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
    1745             :                   nbw, max_blk_size,  bcast_buffer, bcast_buffer_dev, &
    1746             : #if REALCASE == 1
    1747             :              hh_dot_dev, &
    1748             : #endif
    1749             :              hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, my_thread, &
    1750     1163904 :        thread_width, kernel)
    1751             :         enddo
    1752             : !$omp end parallel do
    1753      581952 :         call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
    1754             : 
    1755             : #else /* WITH_OPENMP */
    1756             : 
    1757             :         call compute_hh_trafo_&
    1758             :              &MATH_DATATYPE&
    1759             :              &_&
    1760             :              &PRECISION&
    1761             :              & (obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,           &
    1762             :                       a_off, nbw, max_blk_size,  bcast_buffer, bcast_buffer_dev,          &
    1763             : #if REALCASE == 1
    1764             :                hh_dot_dev,     &
    1765             : #endif
    1766             :            hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i,               &
    1767      581952 :            last_stripe_width, kernel)
    1768             : 
    1769             : #endif /* WITH_OPENMP */
    1770             :       endif
    1771             : 
    1772      959640 :       if (next_top_msg_length > 0) then
    1773             :         !request top_border data
    1774             : #ifdef WITH_OPENMP
    1775             : 
    1776      142584 :         b_len = csw*next_top_msg_length*max_threads
    1777             : #ifdef WITH_MPI
    1778      142584 :         if (wantDebug) call obj%timer%start("mpi_communication")
    1779             :         call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1780      142584 :            my_prow-1, top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
    1781      142584 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    1782             : #else /* WITH_MPI */
    1783             : !             careful: the "receive" has to be done at the corresponding wait or send
    1784             : !              top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = &
    1785             : !                                     bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i)
    1786             : #endif /* WITH_MPI */
    1787             : 
    1788             : #else /* WITH_OPENMP */
    1789             : 
    1790             : #ifdef WITH_MPI
    1791      142584 :         if (wantDebug) call obj%timer%start("mpi_communication")
    1792             :         call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, &
    1793      142584 :               MPI_MATH_DATATYPE_PRECISION_EXPL, my_prow-1, top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
    1794      142584 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    1795             : #else /* WITH_MPI */
    1796             : !             careful: the "receive" has to be done at the corresponding wait or send
    1797             : !              top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) =  &
    1798             : !               bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i)
    1799             : #endif /* WITH_MPI */
    1800             : 
    1801             : #endif /* WITH_OPENMP */
    1802             : 
    1803             :       endif
    1804             : 
    1805             :       !send_t
    1806     1163904 :       if (my_prow > 0) then
    1807             : #ifdef WITH_OPENMP
    1808             : 
    1809             : #ifdef WITH_MPI
    1810      173424 :         if (wantDebug) call obj%timer%start("mpi_communication")
    1811      173424 :         call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
    1812      173424 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    1813             : #endif
    1814      173424 :         b_len = csw*nbw*max_threads
    1815      173424 :         top_border_send_buffer(1:b_len,i) = reshape(aIntern(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /))
    1816             : 
    1817             : #ifdef WITH_MPI
    1818      173424 :         if (wantDebug) call obj%timer%start("mpi_communication")
    1819             :         call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1820      173424 :                        my_prow-1, bottom_recv_tag, mpi_comm_rows, top_send_request(i), mpierr)
    1821      173424 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    1822             : #else /* WITH_MPI */
    1823           0 :               if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
    1824           0 :                 bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
    1825             :               endif
    1826           0 :               if (next_n_end < next_n) then
    1827           0 :                 bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
    1828             :               endif
    1829             : #endif /* WITH_MPI */
    1830             : 
    1831             : #else /* WITH_OPENMP */
    1832             : 
    1833             : #ifdef WITH_MPI
    1834      173424 :         if (wantDebug) call obj%timer%start("mpi_communication")
    1835      173424 :         call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
    1836      173424 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    1837             : #endif
    1838      173424 :         if (useGPU) then
    1839           0 :           dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
    1840             :           successCUDA =  cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
    1841             :                                      stripe_width*nbw * size_of_datatype, &
    1842           0 :                                      cudaMemcpyDeviceToHost)
    1843           0 :           if (.not.(successCUDA)) then
    1844             :             print *,"trans_ev_tridi_to_band_&
    1845             :                     &MATH_DATATYPE&
    1846           0 :                     &: error in cudaMemcpy"
    1847           0 :             stop 1
    1848             :           endif
    1849             : 
    1850             :         else
    1851      173424 :           top_border_send_buffer(:,1:nbw,i) = aIntern(:,a_off+1:a_off+nbw,i)
    1852             :         endif
    1853             : #ifdef WITH_MPI
    1854      173424 :         if (wantDebug) call obj%timer%start("mpi_communication")
    1855             :         call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1856      173424 :                        my_prow-1, bottom_recv_tag, mpi_comm_rows, top_send_request(i), mpierr)
    1857      173424 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    1858             : #else /* WITH_MPI */
    1859           0 :             if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
    1860           0 :                bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i)
    1861             :              endif
    1862           0 :              if (next_n_end < next_n) then
    1863           0 :                bottom_border_recv_buffer(1:stripe_width,1:nbw,i) =  top_border_send_buffer(1:stripe_width,1:nbw,i)
    1864             :              endif
    1865             : #endif /* WITH_MPI */
    1866             : 
    1867             : #endif /* WITH_OPENMP */
    1868             :       endif
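                     :       ! Editor's note: the top-border send above follows the usual non-blocking
                     :       ! pattern -- wait until the previous MPI_Isend on this buffer has completed,
                     :       ! repack the buffer, then post a new MPI_Isend.  A minimal sketch of that
                     :       ! pattern (all names below are illustrative only, not part of this routine):
                     :       !
                     :       !   call MPI_Wait(request, MPI_STATUS_IGNORE, mpierr)     ! buffer reusable now
                     :       !   buffer(:) = new_data(:)                               ! repack
                     :       !   call MPI_Isend(buffer, n, datatype, dest, tag, comm, request, mpierr)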
    1869             : 
    1870             :       ! Take care that there are not too many outstanding top_recv_requests
    1871     1163904 :       if (stripe_count > 1) then
    1872      755376 :         if (i>1) then
    1873             : 
    1874             : #ifdef WITH_MPI
    1875      619464 :           if (wantDebug) call obj%timer%start("mpi_communication")
    1876      619464 :           call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr)
    1877      619464 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    1878             : #endif
    1879             :         else
    1880             : 
    1881             : #ifdef WITH_MPI
    1882      135912 :           if (wantDebug) call obj%timer%start("mpi_communication")
    1883      135912 :           call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr)
    1884      135912 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    1885             : #endif
    1886             : 
    1887             :         endif
    1888             :       endif
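                     :       ! Editor's note: the waits above throttle the receives so that at most one
                     :       ! top_recv_request per stripe is in flight; the request that is waited on
                     :       ! wraps around cyclically over the stripes.  Conceptually (sketch only):
                     :       !
                     :       !   prev = merge(i-1, stripe_count, i > 1)   ! i-1, or the last stripe when i == 1
                     :       !   call MPI_Wait(top_recv_request(prev), MPI_STATUS_IGNORE, mpierr)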
    1889             : 
    1890             :     enddo
    1891             : 
    1892      211584 :     top_msg_length = next_top_msg_length
    1893             : 
    1894             :   else
    1895             :     ! wait for last top_send_request
    1896             : 
    1897             : #ifdef WITH_MPI
    1898       77112 :     do i = 1, stripe_count
    1899       61680 :       if (wantDebug) call obj%timer%start("mpi_communication")
    1900       61680 :       call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
    1901       61680 :       if (wantDebug) call obj%timer%stop("mpi_communication")
    1902             :     enddo
    1903             : #endif
    1904             :   endif
    1905             : 
    1906             :     ! Care about the result
    1907             : 
    1908      227016 :     if (my_prow == 0) then
    1909             : 
    1910             :       ! topmost process sends nbw rows to destination processes
    1911             : 
    1912      541392 :       do j=0, nfact-1
    1913      409104 :         num_blk = sweep*nfact+j ! global number of destination block, 0 based
    1914      409104 :         if (num_blk*nblk >= na) exit
    1915             : 
    1916      390048 :         nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block
    1917             : 
    1918             : #ifdef WITH_MPI
    1919      195024 :         if (wantDebug) call obj%timer%start("mpi_communication")
    1920      195024 :         call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr)
    1921      195024 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    1922             : 
    1923             : #endif
    1924      390048 :         dst = mod(num_blk, np_rows)
    1925             : 
    1926      390048 :         if (dst == 0) then
    1927      292920 :           if (useGPU) then
    1928           0 :             row_group_size = min(na - num_blk*nblk, nblk)
    1929             :             call pack_row_group_&
    1930             :                  &MATH_DATATYPE&
    1931             :                  &_gpu_&
    1932             :                  &PRECISION&
    1933             :                  &(row_group_dev, aIntern_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
    1934           0 :                          row_group(:, :), j * nblk + a_off, row_group_size)
    1935             : 
    1936           0 :             do i = 1, row_group_size
    1937           0 :               q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i)
    1938             :             enddo
    1939             :           else ! useGPU
    1940             : 
    1941     4820712 :             do i = 1, min(na - num_blk*nblk, nblk)
    1942             : #ifdef WITH_OPENMP
    1943             :               call pack_row_&
    1944             :                    &MATH_DATATYPE&
    1945             :                    &_cpu_openmp_&
    1946             :                    &PRECISION&
    1947     2263896 :                    &(obj,aIntern, row, j*nblk+i+a_off, stripe_width, stripe_count, max_threads, thread_width, l_nev)
    1948             : #else /* WITH_OPENMP */
    1949             : 
    1950             :               call pack_row_&
    1951             :                    &MATH_DATATYPE&
    1952             :                    &_cpu_&
    1953             :                    &PRECISION&
    1954     2263896 :                    &(obj,aIntern, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
    1955             : #endif /* WITH_OPENMP */
    1956     4527792 :               q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
    1957             :             enddo
    1958             :           endif ! useGPU
    1959             : 
    1960             :         else ! (dst == 0)
    1961             : 
    1962       97128 :           if (useGPU) then
    1963             :             call pack_row_group_&
    1964             :                  &MATH_DATATYPE&
    1965             :                  &_gpu_&
    1966             :                  &PRECISION&
    1967             :                  &(row_group_dev, aIntern_dev, stripe_count, stripe_width, &
    1968             :                    last_stripe_width, a_dim2, l_nev, &
    1969           0 :                    result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
    1970             : 
    1971             :           else  ! useGPU
    1972     1651176 :             do i = 1, nblk
    1973             : #ifdef WITH_OPENMP
    1974             :               call pack_row_&
    1975             :                    &MATH_DATATYPE&
    1976             :                    &_cpu_openmp_&
    1977             :                    &PRECISION&
    1978             :                    &(obj,aIntern, result_buffer(:,i,nbuf), j*nblk+i+a_off, stripe_width, stripe_count, &
    1979      777024 :                    max_threads, thread_width, l_nev)
    1980             : #else /* WITH_OPENMP */
    1981             :               call pack_row_&
    1982             :                    &MATH_DATATYPE&
    1983             :                    &_cpu_&
    1984             :                    &PRECISION&
    1985      777024 :                    &(obj, aIntern, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
    1986             : #endif /* WITH_OPENMP */
    1987             :             enddo
    1988             :           endif ! useGPU
    1989             : #ifdef WITH_MPI
    1990       97128 :           if (wantDebug) call obj%timer%start("mpi_communication")
    1991             :           call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    1992       97128 :                           dst, result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr)
    1993       97128 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    1994             : 
    1995             : #else /* WITH_MPI */
    1996           0 :           if (j+num_result_buffers < num_result_blocks) &
    1997           0 :                    result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf)
    1998           0 :           if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
    1999           0 :             do j1 = 1, min(num_result_buffers, num_result_blocks)
    2000           0 :               result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf)
    2001             :             enddo
    2002             :           endif
    2003             : 
    2004             : #endif /* WITH_MPI */
    2005             :         endif ! (dst == 0)
    2006             :       enddo  !j=0, nfact-1
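                     :       ! Editor's note: the loop above distributes the finished nbw rows block-
                     :       ! cyclically.  With hypothetical values nfact = 2, np_rows = 4,
                     :       ! num_result_buffers = 4, sweep = 3, j = 1 one gets
                     :       !
                     :       !   num_blk = sweep*nfact + j                     = 7   (global block, 0 based)
                     :       !   dst     = mod(num_blk, np_rows)               = 3   (owning process row)
                     :       !   nbuf    = mod(num_blk, num_result_buffers)+1  = 4   (send buffer to reuse)
                     :       !
                     :       ! Blocks owned by process row 0 are copied directly into q; all others are
                     :       ! packed into result_buffer(:,:,nbuf) and shipped with MPI_Isend.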
    2007             : 
    2008             :     else ! (my_prow == 0)
    2009             : 
    2010             :       ! receive and store final result
    2011             : 
    2012      172800 :       do j = num_bufs_recvd, num_result_blocks-1
    2013             : 
    2014      141936 :         nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block
    2015             : 
    2016             :         ! If there is still work to do, just test for the next result request
    2017             :         ! and leave the loop if it is not ready, otherwise wait for all
    2018             :         ! outstanding requests
    2019             : 
    2020      141936 :         if (next_local_n > 0) then
    2021             : 
    2022             : #ifdef WITH_MPI
    2023       78024 :           if (wantDebug) call obj%timer%start("mpi_communication")
    2024       78024 :           call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr)
    2025       78024 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    2026             : 
    2027             : #else /* WITH_MPI */
    2028           0 :           flag = .true.
    2029             : #endif
    2030             : 
    2031       78024 :           if (.not.flag) exit
    2032             : 
    2033             :         else ! (next_local_n > 0)
    2034             : #ifdef WITH_MPI
    2035       63912 :           if (wantDebug) call obj%timer%start("mpi_communication")
    2036       63912 :           call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr)
    2037       63912 :           if (wantDebug) call obj%timer%stop("mpi_communication")
    2038             : #endif
    2039             :         endif ! (next_local_n > 0)
    2040             : 
    2041             :         ! Fill result buffer into q
    2042       97128 :         num_blk = j*np_rows + my_prow ! global number of current block, 0 based
    2043     1504536 :         do i = 1, min(na - num_blk*nblk, nblk)
    2044     1407408 :           q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf)
    2045             :         enddo
    2046             : 
    2047             :         ! Queue result buffer again if there are outstanding blocks left
    2048             : #ifdef WITH_MPI
    2049       97128 :         if (wantDebug) call obj%timer%start("mpi_communication")
    2050             : 
    2051       97128 :         if (j+num_result_buffers < num_result_blocks) &
    2052             :             call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL, &
    2053       14880 :                            0, result_recv_tag, mpi_comm_rows, result_recv_request(nbuf), mpierr)
    2054             : 
    2055             :         ! careful: the "receive" has to be done at the corresponding wait or send
    2056             : !         if (j+num_result_buffers < num_result_blocks) &
    2057             : !                result_buffer(1:l_nev*nblk,1,nbuf) =  result_buffer(1:l_nev*nblk,1,nbuf)
    2058       97128 :         if (wantDebug) call obj%timer%stop("mpi_communication")
    2059             : 
    2060             : #else /* WITH_MPI */
    2061             : 
    2062             : #endif /* WITH_MPI */
    2063             : 
    2064             :       enddo ! j = num_bufs_recvd, num_result_blocks-1
    2065       75672 :       num_bufs_recvd = j
    2066             : 
    2067             :     endif ! (my_prow == 0)
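                     :     ! Editor's note: on the receiving side buffer j corresponds to global block
                     :     ! num_blk = j*np_rows + my_prow, and its rows are unpacked into
                     :     ! q(j*nblk+1 : j*nblk+min(na-num_blk*nblk,nblk), 1:l_nev).  MPI_Test is used
                     :     ! while next_local_n > 0 so that the result transfer overlaps with the
                     :     ! remaining computation; only when no local work is left does the code block
                     :     ! in MPI_Wait.  A buffer slot is re-posted with MPI_Irecv as long as
                     :     ! j + num_result_buffers < num_result_blocks, e.g. with the hypothetical
                     :     ! values num_result_buffers = 4 and num_result_blocks = 10 only the slots
                     :     ! for j = 0..5 are posted again.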
    2068             : 
    2069             :     ! Shift the remaining rows to the front of aIntern (if necessary)
    2070             : 
    2071      227016 :     offset = nbw - top_msg_length
    2072      227016 :     if (offset<0) then
    2073           0 :       if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
    2074             :                                          &MATH_DATATYPE&
    2075           0 :                                          &: internal error, offset for shifting = ',offset
    2076           0 :       success = .false.
    2077           0 :       return
    2078             :     endif
    2079             : 
    2080      227016 :     a_off = a_off + offset
    2081      227016 :     if (a_off + next_local_n + nbw > a_dim2) then
    2082             : #ifdef WITH_OPENMP
    2083       31968 :       if (useGPU) then
    2084           0 :         print *,"trans_ev_tridi_to_band_real: not yet implemented"
    2085           0 :         stop 1
    2086             :       endif
    2087             : 
    2088       31968 :       call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
    2089             : 
    2090       63936 : !$omp parallel do private(my_thread, i, j), schedule(static, 1)
    2091             :       do my_thread = 1, max_threads
    2092      165936 :         do i = 1, stripe_count
    2093     8208528 :           do j = top_msg_length+1, top_msg_length+next_local_n
    2094   400471872 :             aIntern(:,j,i,my_thread) = aIntern(:,j+a_off,i,my_thread)
    2095             :           enddo
    2096             :         enddo
    2097             :       enddo
    2098             : !$omp end parallel do
    2099       31968 :       call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
    2100             : 
    2101             : #else /* WITH_OPENMP */
    2102      165936 :          do i = 1, stripe_count
    2103      133968 :            if (useGPU) then
    2104           0 :              chunk = min(next_local_n - 1, a_off)
    2105           0 :              do j = top_msg_length + 1, top_msg_length + next_local_n, chunk
    2106           0 :                top = min(j + chunk, top_msg_length + next_local_n)
    2107           0 :                this_chunk = top - j + 1
    2108           0 :                dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
    2109           0 :                dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
    2110             :                ! it is not logical to always set the value of the parameter
    2111             :                ! "cudaMemcpyDeviceToDevice" here; do this ONCE at startup
    2112             :                !               tmp = cuda_d2d(1)
    2113             :                successCUDA =  cuda_memcpy( aIntern_dev + dev_offset , aIntern_dev +dev_offset_1, &
    2114           0 :                                          stripe_width*this_chunk* size_of_datatype, cudaMemcpyDeviceToDevice)
    2115           0 :                if (.not.(successCUDA)) then
    2116             :                  print *,"trans_ev_tridi_to_band_&
    2117             :                          &MATH_DATATYPE&
    2118           0 :                          &: error in cudaMemcpy"
    2119           0 :                  stop 1
    2120             :                endif
    2121             :              enddo
    2122             :            else ! not useGPU
    2123     8208528 :              do j = top_msg_length+1, top_msg_length+next_local_n
    2124     8074560 :                aIntern(:,j,i) = aIntern(:,j+a_off,i)
    2125             :              enddo
    2126             :            endif
    2127             :          enddo ! stripe_count
    2128             : #endif /* WITH_OPENMP */
    2129             : 
    2130       63936 :          a_off = 0
    2131             :        endif
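                     :      ! Editor's note: a_off grows by (nbw - top_msg_length) every sweep; the copy
                     :      ! above is performed only when the active window would run past a_dim2.
                     :      ! In compact form (sketch of the non-GPU, non-OpenMP path, per stripe i):
                     :      !
                     :      !   a_off = a_off + offset
                     :      !   if (a_off + next_local_n + nbw > a_dim2) then
                     :      !     aIntern(:, top_msg_length+1 : top_msg_length+next_local_n, i) = &
                     :      !       aIntern(:, top_msg_length+1+a_off : top_msg_length+next_local_n+a_off, i)
                     :      !     a_off = 0
                     :      !   endif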
    2132             :      enddo
    2133             : 
    2134             :      ! Just for safety:
    2135             : #ifdef WITH_MPI
    2136       30864 :      if (ANY(top_send_request    /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol
    2137       30864 :      if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol
    2138       30864 :      if (ANY(top_recv_request    /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol
    2139       30864 :      if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol
    2140             : #endif
    2141             : 
    2142       30864 :      if (my_prow == 0) then
    2143             : 
    2144             : #ifdef WITH_MPI
    2145       15432 :        if (wantDebug) call obj%timer%start("mpi_communication")
    2146       15432 :        call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr)
    2147       15432 :        if (wantDebug) call obj%timer%stop("mpi_communication")
    2148             : #endif
    2149             :      endif
    2150             : 
    2151             : #ifdef WITH_MPI
    2152       30864 :      if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol
    2153       30864 :      if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol
    2154             : 
    2155             : #ifdef HAVE_DETAILED_TIMINGS
    2156       30864 :        call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_ROWS, mpierr)
    2157       30864 :        kernel_flops = kernel_flops_recv
    2158       30864 :        call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_COLS, mpierr)
    2159       30864 :        kernel_flops = kernel_flops_recv
    2160             : 
    2161       30864 :        call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_ROWS, mpierr)
    2162       30864 :        kernel_time = kernel_time_recv
    2163       30864 :        call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_COLS, mpierr)
    2164       30864 :        kernel_time = kernel_time_recv
    2165             : #endif
    2166             : 
    2167             : #else /* WITH_MPI */
    2168             : 
    2169       15432 :      call obj%get("print_flops",print_flops)
    2170       15432 :      if (my_prow==0 .and. my_pcol==0 .and.print_flops == 1) &
    2171        2208 :          write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)')  kernel_time, kernel_flops/kernel_time*1.d-6
    2172             : 
    2173             : #endif /* WITH_MPI */
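                     :      ! Editor's note: with detailed timings the kernel flop counts are summed
                     :      ! over process rows and columns while the kernel time is reduced with
                     :      ! MPI_MAX, and the reported rate is kernel_flops/kernel_time*1.d-6 MFlops.
                     :      ! With hypothetical values kernel_flops = 2.0d12 and kernel_time = 100.0 s
                     :      ! this evaluates to 2.0d4 MFlops, i.e. 20 GFlops.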
    2174             : 
    2175       46296 :      if (useGPU) then
    2176             :         ! copy q to q_dev needed in trans_ev_band_to_full
    2177           0 :         successCUDA = cuda_malloc(q_dev, ldq*matrixCols* size_of_datatype)
    2178           0 :         if (.not.(successCUDA)) then
    2179             :           print *,"trans_ev_tridi_to_band_&
    2180             :                   &MATH_DATATYPE&
    2181           0 :                   &: error in cudaMalloc"
    2182           0 :           stop 1
    2183             :         endif
    2184             : 
    2185             :           ! copy q to the device; maybe this transfer can be avoided if q_dev is kept on the device throughout trans_ev_tridi_to_band
    2186             :           successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype,   &
    2187           0 :                   cudaMemcpyHostToDevice)
    2188           0 :           if (.not.(successCUDA)) then
    2189             :             print *,"trans_ev_tridi_to_band_&
    2190             :                     &MATH_DATATYPE&
    2191           0 :                     &: error in cudaMemcpy"
    2192           0 :             stop 1
    2193             :           endif
    2194             : !        endif
    2195             :      endif !use GPU
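                     :      ! Editor's note: the device copy above transfers ldq*matrixCols elements of
                     :      ! size size_of_datatype, e.g. for hypothetical ldq = 1000, matrixCols = 500
                     :      ! and 8-byte reals this is 4.0d6 bytes (about 3.8 MiB) moved host-to-device
                     :      ! so that q_dev is available for trans_ev_band_to_full.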
    2196             : 
    2197             :      ! deallocate all working space
    2198             : 
    2199       46296 :      if (.not.(useGPU)) then
    2200       46296 :        nullify(aIntern)
    2201       46296 :        call free(aIntern_ptr)
    2202             :      endif
    2203             : 
    2204       46296 :      deallocate(row, stat=istat, errmsg=errorMessage)
    2205       46296 :      if (istat .ne. 0) then
    2206             :        print *,"trans_ev_tridi_to_band_&
    2207             :                &MATH_DATATYPE&
    2208           0 :                &: error when deallocating row "//errorMessage
    2209           0 :        stop 1
    2210             :      endif
    2211             : 
    2212       46296 :      deallocate(limits, stat=istat, errmsg=errorMessage)
    2213       46296 :      if (istat .ne. 0) then
    2214             :        print *,"trans_ev_tridi_to_band_&
    2215             :                &MATH_DATATYPE&
    2216           0 :                &: error when deallocating limits"//errorMessage
    2217           0 :        stop 1
    2218             :      endif
    2219             : 
    2220       46296 :      deallocate(result_send_request, stat=istat, errmsg=errorMessage)
    2221       46296 :      if (istat .ne. 0) then
    2222             :        print *,"trans_ev_tridi_to_band_&
    2223             :                &MATH_DATATYPE&
    2224           0 :                &: error when deallocating result_send_request "//errorMessage
    2225           0 :        stop 1
    2226             :      endif
    2227             : 
    2228       46296 :      deallocate(result_recv_request, stat=istat, errmsg=errorMessage)
    2229       46296 :      if (istat .ne. 0) then
    2230             :        print *,"trans_ev_tridi_to_band_&
    2231             :                &MATH_DATATYPE&
    2232           0 :                &: error when deallocating result_recv_request "//errorMessage
    2233           0 :        stop 1
    2234             :      endif
    2235             : 
    2236       46296 :      deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage)
    2237       46296 :      if (istat .ne. 0) then
    2238             :        print *,"trans_ev_tridi_to_band_&
    2239             :                &MATH_DATATYPE&
    2240           0 :                &: error when deallocating top_border_send_buffer "//errorMessage
    2241           0 :        stop 1
    2242             :      endif
    2243             : 
    2244       46296 :      deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage)
    2245       46296 :      if (istat .ne. 0) then
    2246             :        print *,"trans_ev_tridi_to_band_&
    2247             :                &MATH_DATATYPE&
    2248           0 :                &: error when deallocating top_border_recv_buffer "//errorMessage
    2249           0 :        stop 1
    2250             :      endif
    2251             : 
    2252       46296 :      deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage)
    2253       46296 :      if (istat .ne. 0) then
    2254             :        print *,"trans_ev_tridi_to_band_&
    2255             :                &MATH_DATATYPE&
    2256           0 :                &: error when deallocating bottom_border_send_buffer "//errorMessage
    2257           0 :        stop 1
    2258             :      endif
    2259             : 
    2260       46296 :      deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage)
    2261       46296 :      if (istat .ne. 0) then
    2262             :        print *,"trans_ev_tridi_to_band_&
    2263             :                &MATH_DATATYPE&
    2264           0 :                &: error when deallocating bottom_border_recv_buffer "//errorMessage
    2265           0 :        stop 1
    2266             :      endif
    2267             : 
    2268       46296 :      deallocate(result_buffer, stat=istat, errmsg=errorMessage)
    2269       46296 :      if (istat .ne. 0) then
    2270             :        print *,"trans_ev_tridi_to_band_&
    2271             :        &MATH_DATATYPE&
    2272           0 :        &: error when deallocating result_buffer "//errorMessage
    2273           0 :        stop 1
    2274             :      endif
    2275             : 
    2276       46296 :      deallocate(bcast_buffer, stat=istat, errmsg=errorMessage)
    2277       46296 :      if (istat .ne. 0) then
    2278             :        print *,"trans_ev_tridi_to_band_&
    2279             :                &MATH_DATATYPE&
    2280           0 :                &: error when deallocating bcast_buffer "//errorMessage
    2281           0 :        stop 1
    2282             :      endif
    2283             : 
    2284       46296 :      deallocate(top_send_request, stat=istat, errmsg=errorMessage)
    2285       46296 :      if (istat .ne. 0) then
    2286             :        print *,"trans_ev_tridi_to_band_&
    2287             :                &MATH_DATATYPE&
    2288           0 :                &: error when deallocating top_send_request "//errorMessage
    2289           0 :        stop 1
    2290             :      endif
    2291             : 
    2292       46296 :      deallocate(top_recv_request, stat=istat, errmsg=errorMessage)
    2293       46296 :      if (istat .ne. 0) then
    2294             :        print *,"trans_ev_tridi_to_band_&
    2295             :                &MATH_DATATYPE&
    2296           0 :                &: error when deallocating top_recv_request "//errorMessage
    2297           0 :        stop 1
    2298             :      endif
    2299             : 
    2300       46296 :      deallocate(bottom_send_request, stat=istat, errmsg=errorMessage)
    2301       46296 :      if (istat .ne. 0) then
    2302             :        print *,"trans_ev_tridi_to_band_&
    2303             :                &MATH_DATATYPE&
    2304           0 :                &: error when deallocating bottom_send_request "//errorMessage
    2305           0 :        stop 1
    2306             :      endif
    2307             : 
    2308       46296 :      deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage)
    2309       46296 :      if (istat .ne. 0) then
    2310             :        print *,"trans_ev_tridi_to_band_&
    2311             :                &MATH_DATATYPE&
    2312           0 :                &: error when deallocating bottom_recv_request "//errorMessage
    2313           0 :        stop 1
    2314             :      endif
    2315             : 
    2316       46296 :      if (useGPU) then
    2317             : #if COMPLEXCASE == 1
    2318             :        ! should this not be done always?
    2319           0 :        successCUDA = cuda_free(aIntern_dev)
    2320           0 :        if (.not.(successCUDA)) then
    2321           0 :          print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
    2322           0 :          stop 1
    2323             :        endif
    2324             : #endif
    2325           0 :        successCUDA = cuda_free(hh_dot_dev)
    2326           0 :        if (.not.(successCUDA)) then
    2327             :          print *,"trans_ev_tridi_to_band_&
    2328             :                  &MATH_DATATYPE&
    2329           0 :                  &: error in cudaFree "//errorMessage
    2330           0 :          stop 1
    2331             :        endif
    2332             : 
    2333           0 :        successCUDA = cuda_free(hh_tau_dev)
    2334           0 :        if (.not.(successCUDA)) then
    2335             :          print *,"trans_ev_tridi_to_band_&
    2336             :                  &MATH_DATATYPE&
    2337           0 :                  &: error in cudaFree "//errorMessage
    2338           0 :          stop 1
    2339             :        endif
    2340             : 
    2341           0 :        successCUDA = cuda_free(row_dev)
    2342           0 :        if (.not.(successCUDA)) then
    2343             :          print *,"trans_ev_tridi_to_band_&
    2344             :                  &MATH_DATATYPE&
    2345           0 :                  &: error in cudaFree "//errorMessage
    2346           0 :          stop 1
    2347             :        endif
    2348             : 
    2349           0 :        deallocate(row_group, stat=istat, errmsg=errorMessage)
    2350           0 :        if (istat .ne. 0) then
    2351             :          print *,"trans_ev_tridi_to_band_&
    2352             :                  &MATH_DATATYPE&
    2353           0 :                  &: error when deallocating row_group "//errorMessage
    2354           0 :          stop 1
    2355             :        endif
    2356             : 
    2357           0 :        successCUDA = cuda_free(row_group_dev)
    2358           0 :        if (.not.(successCUDA)) then
    2359             :          print *,"trans_ev_tridi_to_band_&
    2360             :                  &MATH_DATATYPE&
    2361           0 :                  &: error in cudaFree "//errorMessage
    2362           0 :          stop 1
    2363             :        endif
    2364             : 
    2365           0 :        successCUDA =  cuda_free(bcast_buffer_dev)
    2366           0 :        if (.not.(successCUDA)) then
    2367             :          print *,"trans_ev_tridi_to_band_&
    2368             :                  &MATH_DATATYPE&
    2369           0 :                  &: error in cudaFree "//errorMessage
    2370           0 :          stop 1
    2371             :        endif
    2372             :      endif ! useGPU
    2373             : 
    2374             : 
    2375             :      call obj%timer%stop("trans_ev_tridi_to_band_&
    2376             :                          &MATH_DATATYPE&
    2377             :                          &" // &
    2378             :                          &PRECISION_SUFFIX&
    2379       46296 :                          )
    2380             : 
    2381       46296 :      return
    2382             : !#if COMPLEXCASE == 1
    2383             : !     contains
    2384             : !     ! The host wrapper for extracting "tau" from the HH reflectors (see the
    2385             : !     ! kernel below)
    2386             : !       subroutine extract_hh_tau_complex_gpu_&
    2387             : !       &PRECISION&
    2388             : !       &(nbw, n, is_zero)
    2389             : !         use cuda_c_kernel
    2390             : !         use pack_unpack_gpu
    2391             : !         use precision
    2392             : !         implicit none
    2393             : !         integer(kind=ik), value :: nbw, n
    2394             : !         logical, value          :: is_zero
    2395             : !         integer(kind=ik)        :: val_is_zero
    2396             : !
    2397             : !         if (is_zero) then
    2398             : !           val_is_zero = 1
    2399             : !         else
    2400             : !           val_is_zero = 0
    2401             : !         endif
    2402             : !         call launch_extract_hh_tau_c_kernel_complex_&
    2403             : !   &PRECISION&
    2404             : !   &(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
    2405             : !       end subroutine
    2406             : !#endif /* COMPLEXCASE */
    2407             : 
    2408       46296 :     end subroutine
    2409             : 
    2410             : ! vim: syntax=fortran

Generated by: LCOV version 1.12