program PCMRH
!***************************************************************
! Parallel CMRH program with MPI
!***************************************************************
use mpi
external SSWAP,DCOPY,DSCAL,DGEMV,DSWAP
intrinsic dabs,dsqrt
!********************************************
! VARIABLES
!********************************************
integer,parameter   ::Nprocs_max=16
integer             ::rang,Nprocs,n,nl,code,i,j,k,choix,solchoice,iprint
real(kind=8) ::init(5)
integer,dimension(MPI_STATUS_SIZE) ::statut
real(kind=8),allocatable,dimension(:,:)  ::AL
real(kind=8)                           ::btemp,bmax,btemp2
real(kind=8),allocatable,dimension(:)   ::b,sol,rs,temp,x,workl,work,tempn,c,s,xl,cl,temp2,xx,yy,valeurs
integer,allocatable,dimension(:) ::p
real*8                          ::dnorm2,D_inv,beta,dnorm,dnrm2,tol,alpha,H_k1,dgam,r
integer                         ::indice,idamax,k1,n_k,knl,knlp,inl,inlp
real*8 dtime, delta, delta0, dt1, dt2, dt3
real*8 etime, eelta, eelta0, et1, et2, et3
real*8 t1, t2, t3, tm1, tm2, tm3
real*8 dtime1, delta1, delta01
real*8 etime1, eelta1, eelta01,l,p1,p2
integer nb_ligne
real(kind=8)  ::temps,tempsd,tempsf,temps2
integer*4,dimension(3)  ::timearray
real                     ::rand
real*4 da(2),ea(2),da1(2),ea1(2)
!********************************************
! MPI CONFIG
!********************************************
call MPI_INIT(code)
call MPI_COMM_RANK(MPI_COMM_WORLD,rang,code)
call MPI_COMM_SIZE(MPI_COMM_WORLD,Nprocs,code)
call MPI_BARRIER(MPI_COMM_WORLD,code)


call itime(timearray)     ! Get the current time
i = rand ( timearray(1)+timearray(2)+timearray(3) )

!************************************
! Matrix Initialisation
!************************************
if (rang==0) then
    open(unit=10,file='inputfile.dat',status='old')
    read(10,*)init
    close(10)
    iprint=int(init(1))
    n=int(init(2))
    choix=int(init(3))
    solchoice=int(init(4))
    tol=init(5)
end if
tempsd=MPI_WTIME()
call MPI_BCAST(n,1,MPI_INTEGER,0,MPI_COMM_WORLD,code)
call MPI_BCAST(choix,1,MPI_INTEGER,0,MPI_COMM_WORLD,code)
call MPI_BCAST(tol,1,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,code)

if (mod(n,Nprocs)==0)then
    nl=n/Nprocs
else
    nl=(n/Nprocs)+1
end if

allocate(b(n),rs(n),x(n),AL(nl,n),workl(nl),sol(n),temp(n),temp2(n))
if (choix<3)then
    call matrice(AL,nl,n,2.d0,1.d0,choix,rang)
else
    allocate(valeurs(n))
    open(unit=30,file='matrixfile.dat',status='old')
    do i=1,n
        call position(i,nl,inl,inlp)
        read(30,*)valeurs
        if (rang.eq.inl)then
            AL(inlp,:)=valeurs(:)
        endif
    enddo
    close(30)
endif
!********************************************
! End Matrix initialisation
!********************************************


!********************************************
!   Solution initialisation
!********************************************

call MPI_BCAST(solchoice,1,MPI_INTEGER,0,MPI_COMM_WORLD,code)
call solution(AL,workl,sol,nl,n,solchoice)


call MPI_ALLGATHER(workl,nl,MPI_DOUBLE_PRECISION,b,nl,MPI_DOUBLE_PRECISION,MPI_COMM_WORLD,code)

!********************************************
! End solution initialisation
!********************************************


!********************************************
!   Permutation Vector initialisation
!   and computation of initial residual
!********************************************
if (rang==0) then
    allocate(p(n))
    do i=1,n
        p(i)=i
    end do
end if
dnorm2=dnrm2(n,b,1)
if (rang==0)then
    if (iprint==2)then
        print*,''
        print*,'la norme 2 de r0=',dnorm2
        print*,''
    end if
end if

!********************************************
!   Computation of max b(i) 
!********************************************
indice = idamax(n,b,1)


beta   = b(indice)
D_inv  = 1.d0/beta

!********************************************
! First permutation
!********************************************
if (indice .ne.1) then
call dswap(1,b(indice),1,b(1),1)
call dswap(nl,AL(1,indice),1,AL(1,1),1)
call position(indice,nl,inl,inlp)
if (inl==0) then
if (rang==inl)then
call dswap(n,AL(inlp,1),nl,AL(1,1),nl)
end if
else
if (rang==inl) then
call dcopy(n,AL(inlp,1),nl,temp,1)
end if
call MPI_BCAST(temp,n,MPI_DOUBLE_PRECISION,inl,MPI_COMM_WORLD,code)
if (rang==0)then
call dcopy(n,AL(1,1),nl,temp2,1)
end if
call MPI_BCAST(temp2,n,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,code)
if (rang==inl)then
call dcopy(n,temp2,1,AL(inlp,1),nl)
end if
if (rang==0)then
call dcopy(n,temp,1,AL(1,1),nl)
end if
end if
if (rang==0)then
call sswap(1,p(indice),1,p(1),1)
end if
end if

!********************************************
! End permutation
!********************************************


! call dscal(n,0.d0,x,1)
call dscal(n,D_inv,b,1)
rs(1)=beta
dnorm=dabs(beta)
allocate(tempn(nl),work(n),c(n),s(n),xl(nl),cl(nl))
call dscal(nl,0.d0,xl,1)

call MPI_BARRIER(MPI_COMM_WORLD,code)
k=0
if (rang==0) then
    if (iprint==2)then
        write(*,*)'iter., pseudo residu '
    end if
    open(unit=30,file='presidual.dat',status='unknown')
    write(30,*)'Iter., Pseudo Residual Norm'
end if
!tol=1.d-7
delta0 = dtime(da)
eelta0 = etime(ea)
tempsf=MPI_WTIME()

!********************************************
! Loop
!********************************************
do while((dnorm.gt.tol).and.(k.lt.n))
    k=k+1;
    n_k=n-k;
    k1=k+1;
    if (rang==0) then
        if (iprint==2)then
            write(*,10)k-1,dnorm
        end if
        write(30,10)k-1,dnorm
    end if
10 format(i5,2x,e12.4)

!********************************************
!   workl = A*b
!********************************************
    call dgemv('N',nl,n_k+1,1.d0,AL(1,k),nl,b(k),1,0.d0,workl,1)

!    ! call dscal(nl,0.d0,workl,1)
!    ! do j=k1,n
!    !    do i=1,nl
!    !       workl(i)=workl(i)+AL(i,j)*b(j)
!    !    end do
!    ! end do

!********************************************
!   A(:,k)=b
!********************************************
    call position(k1,nl,knl,knlp)
    if (rang>knl)then
        call dcopy(nl,b(rang*nl+1),1,AL(1,k),1)
    else
        if (rang==knl) then
            call dcopy(nl-knlp+1,b(k1),1,AL(knlp,k),1)
        end if
    end if

    !********************************************
    !   j Loop
    !********************************************
    do j=1,k
        !********************************************
        !   A(j,k)=workl(j)
        !   workl(j)=0
        !********************************************
        call position(j,nl,knl,knlp)
        if (rang==knl) then
            AL(knlp,k)=workl(knlp)
            workl(knlp)=0.d0
            alpha=-AL(knlp,k)
        end if
        call MPI_BCAST(alpha,1,MPI_DOUBLE_PRECISION,knl,MPI_COMM_WORLD,code)
        !********************************************
        !   workl=workl - alpha A(:,j)
        !********************************************
        call position(j+1,nl,knl,knlp)
        if (rang >knl)then
            call daxpy(nl,alpha,AL(1,j),1,workl,1)
        end if
        if (rang==knl) then
            call daxpy(nl-knlp+1,alpha,AL(knlp,j),1,workl(knlp),1)
        end if
    end do
    !********************************************
    ! End j Loop
    !********************************************
    if (k.lt.n)then
        call MPI_ALLGATHER(workl,nl,MPI_DOUBLE_PRECISION,b,nl,MPI_DOUBLE_PRECISION,MPI_COMM_WORLD,code)
        !********************************************
        !   Search indice i_0
        !********************************************
        indice=k+idamax(n_k,b(k1),1)
        H_k1=b(indice)
        !********************************************
        !   Permutation
        !********************************************
        if (indice.ne.k1) then
            if (rang==0) then
                call sswap(1,p(indice),1,p(k1),1)
            end if
            call dswap(1,b(indice),1,b(k1),1)
            call dswap(nl,AL(1,indice),1,AL(1,k1),1)
            call MPI_BARRIER(MPI_COMM_WORLD,code)
            call position(k1,nl,knl,knlp)
            call position(indice,nl,inl,inlp)
            if (inl==knl) then
                if (rang==inl)then
                    call dswap(n,AL(inlp,1),nl,AL(knlp,1),nl)
                end if
            else
                if (rang==inl) then
                    call dcopy(n,AL(inlp,1),nl,temp,1)
                end if
                call MPI_BCAST(temp,n,MPI_DOUBLE_PRECISION,inl,MPI_COMM_WORLD,code)
                if (rang==knl)then
                    call dcopy(n,AL(knlp,1),nl,temp2,1)
                end if
                call MPI_BCAST(temp2,n,MPI_DOUBLE_PRECISION,knl,MPI_COMM_WORLD,code)
                if (rang==inl)then
                    call dcopy(n,temp2,1,AL(inlp,1),nl)
                end if
                if (rang==knl)then
                    call dcopy(n,temp,1,AL(knlp,1),nl)
                end if
            end if
        end if
        !********************************************
        !   End Permutation
        !********************************************
        if (H_k1.ne.0.d0)then
            !********************************************
            !   b=b/(b)_(i_0)
            !********************************************
            D_inv=1.d0/H_k1
            ! call dcopy(n,work,1,b,1)
            call dscal(n,D_inv,b,1)
        end if
    else
        H_k1=0.d0
        call dscal(n,0.d0,b,1)
    end if
    !********************************************
    !   Application of Givens rotations
    !********************************************
    if (k.gt.1)then
        do i=2,k
            call position(i-1,nl,knl,knlp)
            call position(i,nl,inl,inlp)
            if (inl==knl)then
                if (rang==inl)then
                    btemp=AL(knlp,k)
                    AL(knlp,k)=c(i-1)*btemp+s(i-1)*AL(inlp,k)
                    AL(inlp,k)=-s(i-1)*btemp+c(i-1)*AL(inlp,k)
                end if
            else
                if (rang==inl)then
                    btemp=AL(inlp,k)
                end if
                if (rang==knl)then
                    btemp2=AL(knlp,k)
                end if
                call MPI_BCAST(btemp,1,MPI_DOUBLE_PRECISION,inl,MPI_COMM_WORLD,code)
                call MPI_BCAST(btemp2,1,MPI_DOUBLE_PRECISION,knl,MPI_COMM_WORLD,code)
                net=net+2
                if (rang==inl)then
                    AL(inlp,k)=-s(i-1)*btemp2+c(i-1)*btemp
                end if
                if (rang==knl)then
                    AL(knlp,k)=c(i-1)*btemp2+s(i-1)*btemp
                end if
            end if
        end do
    end if
    if (k.le.n)then
        call position(k,nl,knl,knlp)
        if (rang==knl)then
            btemp=AL(knlp,k)
        end if
        call MPI_BCAST(btemp,1,MPI_DOUBLE_PRECISION,knl,MPI_COMM_WORLD,code)
        dgam=dsqrt(btemp**2+H_k1**2)
        c(k)=dabs(btemp)/dgam
        s(k)=(H_k1*dabs(btemp))/(btemp*dgam)
        rs(k1)=-s(k)*rs(k)
        rs(k)=c(k)*rs(k)
        dnorm=dabs(rs(k1))
        if (rang==knl)then
            AL(knlp,k)=c(k)*btemp+s(k)*H_k1
        end if
    end if
    !********************************************
    ! End Givens rotations
    !********************************************
end do

!********************************************
!   Solve H_k d_k = alpha e_1
!********************************************
call position(k,nl,knl,knlp)
do j=k,1,-1
    call position(j,nl,inl,inlp)
    if (rang==inl) then
        rs(j)=rs(j)/AL(inlp,j)
        btemp=rs(j)
    end if
    call MPI_BCAST(btemp,1,MPI_DOUBLE_PRECISION,inl,MPI_COMM_WORLD,code)
    rs(j)=btemp
    work(j)=btemp
    call position(j-1,nl,inl,inlp)
    if (rang==inl)then
        call daxpy(inlp,-btemp,AL(1,j),1,rs(rang*nl+1),1)
    end if
    if (rang.lt.inl)then
        call daxpy(nl,-btemp,AL(1,j),1,rs(rang*nl+1),1)
    end if
end do

do j=k,1,-1
    btemp=rs(j)
    call position(k,nl,knl,knlp)
    call position(j+1,nl,inl,inlp)
    if (rang==inl)then
        if (rang==knl)then
            call daxpy((knlp-inlp+1),btemp,AL(inlp,j),1,rs(rang*nl+inlp),1)
        end if
        if (rang.lt.knl)then
            call daxpy(nl-inlp+1,btemp,AL(inlp,j),1,rs(rang*nl+inlp),1)
        end if
    end if
    if (rang.gt.inl)then
        if (rang==knl)then
            call daxpy(knlp,btemp,AL(1,j),1,rs(rang*nl+1),1)
        end if
        if (rang.lt.knl)then
            call daxpy(nl,btemp,AL(1,j),1,rs(rang*nl+1),1)
        end if
    end if
end do
!********************************************
!   End Loop
!********************************************

!********************************************
!   Update x_k=x_0 + L_k d_k
!********************************************
call position(k1,nl,knl,knlp)
if (rang.gt.knl)then
    call dgemv('N',nl,k,1.d0,AL(1,1),nl,work,1,0.d0,c(rang*nl+1),1)
end if
if (rang==knl)then
    call dgemv('N',nl-knlp+1,k,1.d0,AL(knlp,1),nl,work,1,0.d0,c(rang*nl+knlp),1)
end if
call position(k,nl,knl,knlp)
if (rang.lt.knl)then
    call daxpy(nl,1.d0,rs(rang*nl+1),1,xl,1)
end if
if (rang==knl)then
    call daxpy(knlp,1.d0,rs(rang*nl+1),1,xl,1)
    call daxpy(nl-knlp,1.d0,c(rang*nl+knlp+1),1,xl(knlp+1),1)
end if
if (rang.gt.knl)then
    call daxpy(nl,1.d0,c(rang*nl+1),1,xl,1)
end if

!********************************************
!   Reorder the components of x_k
!********************************************
call MPI_GATHER(xl,nl,MPI_DOUBLE_PRECISION,x,nl,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,code)
call MPI_BARRIER(MPI_COMM_WORLD,code)
if (rang==0)then
    do j=1,n
        b(p(j))=x(j)
    end do
end if


tempsf=(MPI_WTIME()-tempsf)
tempsd=(MPI_WTIME()-tempsd)
!write(*,*)'rang et temps',rang,tempsf,tempsd
call MPI_REDUCE(tempsf,temps,1,MPI_DOUBLE_PRECISION,MPI_MAX,0,MPI_COMM_WORLD,code)
call MPI_REDUCE(tempsd,temps2,1,MPI_DOUBLE_PRECISION,MPI_MAX,0,MPI_COMM_WORLD,code)
if (rang==0)then
    if (iprint >= 2) then
        print*,'Results : iter, norm residual, time'
        write(*,40)k,dnorm,temps,temps2
    end if
    write(30,10),k,dnorm
    close(30)
    open(unit=10,file='temps.txt',status='unknown',access='append')
    write(unit=10,FMT=*)'Proc. Number :',Nprocs,'Iter. Number:',k,'Time:',temps,temps2
    close(10)
end if

if (rang==0)then
    if (iprint==2) then
        write(*,*)
        write(*,*)' la perm. et les solutions CMRH et EXACTE sont : '
        write(*,*)' ______________________________________________ '
        if (n.lt.10)then
            do i=1,n
                write(*,20)p(i), b(i), sol(i)
            end do
        else
            do i=1,10
                write(*,20)p(i), b(i), sol(i)
            end do
        end if
    end if
end if

30 format(A37,4(f14.7,1x))
20 format(1x,i5,2(1x,f14.8))
40 format(1x,i5,4(1x,e12.4))


!  deallocate(AL,b,sol,workl,temp,tempn,work,c,s,rs,xl,x,cl)
if (rang==0) then
    deallocate(p)
end if
call MPI_FINALIZE(code)


end program PCMRH
!********************************************
!    End Parallel CMRH
!********************************************


!********************************************
!   Matrix Choice
!********************************************
subroutine matrice(A,nl,n,alpha,beta,choix,rang)
integer            ::n,i,j,choix,rang,nl,k
real(kind=8),dimension(nl,n)::A
real*8               ::alpha,beta
if (choix==1) then
    do i = 1, nl
        k=rang*nl+i
        do j = 1, k
            A(i,j) = (2.d0*dble(j)-1.d0)/dble(n-k+j)
        end do
        do j = k+1, n
            A(i,j) = (2.d0*dble(k)-1.d0)/dble(n-k+j)
        end do
    end do
end if
if (choix==2)then
    do i=1,nl
        k=rang*nl+i
        do j=1,k
            A(i,j)=dble(k)
        end do
        do j=k+1,n
            A(i,j)=dble(j)
        end do
    end do
end if
end subroutine matrice


!********************************************
! Solution Choice
!********************************************
subroutine solution(A,b,sol,nl,n,solchoice)
integer    ::n,i,nl,solchoice
real(kind=8),dimension(n,n) ::A
real(kind=8),dimension(n)   ::sol
real(kind=8),dimension(nl)  ::b
real   ::rand
do i=1,n
if (solchoice==1) then
sol(i)=1.d0
end if
if (solchoice==2) then
sol(i)=dble(n-i+1)
end if
if (solchoice==3)then
sol(i)=rand(0)
end if
end do
call dgemv('N',nl,n,1.d0,A,nl,sol,1,0.d0,b,1)
end subroutine solution


!********************************************
!   Where are we ?
!********************************************
subroutine position(k,nl,knl,knlp)
integer :: k,nl,knl,knlp
knl=(k-1)/nl
knlp=k-knl*nl
end subroutine position