diff --git a/examples/Bi2Se3/wt.in b/examples/Bi2Se3/wt.in
index 8784407c..ed4ade9e 100644
--- a/examples/Bi2Se3/wt.in
+++ b/examples/Bi2Se3/wt.in
@@ -29,8 +29,7 @@ Se pz px py
 !> bulk band structure calculation flag
 &CONTROL
 BulkBand_calc         = T
-!BulkBand_points_calc  = T
-Symmetry_Import_calc = T
+BulkBand_points_calc  = T
 /
 
 &SYSTEM
diff --git a/examples/MoTe2/bulk/wt.in b/examples/MoTe2/bulk/wt.in
index f716e56d..43a9852c 100644
--- a/examples/MoTe2/bulk/wt.in
+++ b/examples/MoTe2/bulk/wt.in
@@ -23,10 +23,10 @@ SOC = 1                 ! soc
 
 &PARAMETERS
 E_arc = -5          ! specify the Fermi energy
-OmegaNum = 32       ! omega number       
+OmegaNum = 50       ! omega number       
 OmegaMin = -0.6     ! energy interval
 OmegaMax =  0.2     ! energy interval
-Nk1 = 10           ! number k points 
+Nk1 = 40           ! number k points 
 NP = 1              ! number of principle layers
 /
 
diff --git a/examples/WTe2/wannier90.win b/examples/WTe2/wannier90.win
index ccb057a9..d2d92fda 100644
--- a/examples/WTe2/wannier90.win
+++ b/examples/WTe2/wannier90.win
@@ -1,16 +1,257 @@
-num_bands = 192 
+num_bands = 192  ! set to NBANDS by VASP
 num_wann = 96
 
-dis_num_iter = 2000
-num_iter = 100
+dis_num_iter = 1000
+num_iter =   0
+!postproc_setup = .true.
+iprint=2
+!wvfn_formatted=.true.
+!bands_plot = .true.
 
+!min of outer window
 dis_win_min = -1.0
 dis_win_max = 16     
+!inner - 
 dis_froz_min = 6.0000
 dis_froz_max = 8.0000
+wvfn_formatted=.true.
+bands_plot = .true.
 hr_plot =.true.
 
 begin projections
 W : s; dxy; dyz; dxz; dx2-y2; dz2
 Te : px; py; pz
 end projections
+
+
+begin kpoint_path
+G 0.00000  0.00000 0.0000 Y 0.00000  0.50000 0.0000
+Y 0.00000  0.50000 0.0000 S 0.50000  0.50000 0.0000
+S 0.50000  0.50000 0.0000 X 0.50000  0.00000 0.0000
+X 0.50000  0.00000 0.0000 G 0.00000  0.00000 0.0000
+G 0.00000  0.00000 0.0000 Z 0.00000  0.00000 0.5000
+Z 0.00000  0.00000 0.5000 T 0.00000  0.50000 0.5000
+T 0.0 0.5 0.5 R 0.5 0.5 0.5
+R 0.5 0.5 0.5 U 0.5 0.0 0.5
+U 0.5 0.0 0.5 Z 0.0 0.0 0.5 
+end kpoint_path
+
+spinors = .true.
+
+begin unit_cell_cart
+     3.4770000     0.0000000     0.0000000
+     0.0000000     6.2490000     0.0000000
+     0.0000000     0.0000000    14.0180000
+end unit_cell_cart
+
+begin atoms_cart
+W        0.0000000     3.7532744    10.7171100
+W        0.0000000     0.2487102     3.9214640
+W        1.7385000     2.4957256     3.7081100
+W        1.7385000     6.0002898    10.9304640
+Te       0.0000000     5.3592049    12.8934045
+Te       1.7385000     0.8897951     5.8844045
+Te       0.0000000     4.0387912     5.2657902
+Te       1.7385000     2.2102088    12.2747902
+Te       0.0000000     1.8650141     1.7432069
+Te       1.7385000     4.3839859     8.7522069
+Te       0.0000000     1.2949178     9.3695597
+Te       1.7385000     4.9540822     2.3605597
+end atoms_cart
+
+mp_grid =     8     6     4
+
+begin kpoints
+      0.000000000000      0.000000000000      0.000000000000
+      0.125000000000      0.000000000000      0.000000000000
+      0.250000000000      0.000000000000      0.000000000000
+      0.375000000000      0.000000000000      0.000000000000
+      0.500000000000      0.000000000000      0.000000000000
+     -0.375000000000      0.000000000000      0.000000000000
+     -0.250000000000      0.000000000000      0.000000000000
+     -0.125000000000      0.000000000000      0.000000000000
+      0.000000000000      0.166666666667      0.000000000000
+      0.125000000000      0.166666666667      0.000000000000
+      0.250000000000      0.166666666667      0.000000000000
+      0.375000000000      0.166666666667      0.000000000000
+      0.500000000000      0.166666666667      0.000000000000
+     -0.375000000000      0.166666666667      0.000000000000
+     -0.250000000000      0.166666666667      0.000000000000
+     -0.125000000000      0.166666666667      0.000000000000
+      0.000000000000      0.333333333333      0.000000000000
+      0.125000000000      0.333333333333      0.000000000000
+      0.250000000000      0.333333333333      0.000000000000
+      0.375000000000      0.333333333333      0.000000000000
+      0.500000000000      0.333333333333      0.000000000000
+     -0.375000000000      0.333333333333      0.000000000000
+     -0.250000000000      0.333333333333      0.000000000000
+     -0.125000000000      0.333333333333      0.000000000000
+      0.000000000000      0.500000000000      0.000000000000
+      0.125000000000      0.500000000000      0.000000000000
+      0.250000000000      0.500000000000      0.000000000000
+      0.375000000000      0.500000000000      0.000000000000
+      0.500000000000      0.500000000000      0.000000000000
+     -0.375000000000      0.500000000000      0.000000000000
+     -0.250000000000      0.500000000000      0.000000000000
+     -0.125000000000      0.500000000000      0.000000000000
+      0.000000000000     -0.333333333333      0.000000000000
+      0.125000000000     -0.333333333333      0.000000000000
+      0.250000000000     -0.333333333333      0.000000000000
+      0.375000000000     -0.333333333333      0.000000000000
+      0.500000000000     -0.333333333333      0.000000000000
+     -0.375000000000     -0.333333333333      0.000000000000
+     -0.250000000000     -0.333333333333      0.000000000000
+     -0.125000000000     -0.333333333333      0.000000000000
+      0.000000000000     -0.166666666667      0.000000000000
+      0.125000000000     -0.166666666667      0.000000000000
+      0.250000000000     -0.166666666667      0.000000000000
+      0.375000000000     -0.166666666667      0.000000000000
+      0.500000000000     -0.166666666667      0.000000000000
+     -0.375000000000     -0.166666666667      0.000000000000
+     -0.250000000000     -0.166666666667      0.000000000000
+     -0.125000000000     -0.166666666667      0.000000000000
+      0.000000000000      0.000000000000      0.250000000000
+      0.125000000000      0.000000000000      0.250000000000
+      0.250000000000      0.000000000000      0.250000000000
+      0.375000000000      0.000000000000      0.250000000000
+      0.500000000000      0.000000000000      0.250000000000
+     -0.375000000000      0.000000000000      0.250000000000
+     -0.250000000000      0.000000000000      0.250000000000
+     -0.125000000000      0.000000000000      0.250000000000
+      0.000000000000      0.166666666667      0.250000000000
+      0.125000000000      0.166666666667      0.250000000000
+      0.250000000000      0.166666666667      0.250000000000
+      0.375000000000      0.166666666667      0.250000000000
+      0.500000000000      0.166666666667      0.250000000000
+     -0.375000000000      0.166666666667      0.250000000000
+     -0.250000000000      0.166666666667      0.250000000000
+     -0.125000000000      0.166666666667      0.250000000000
+      0.000000000000      0.333333333333      0.250000000000
+      0.125000000000      0.333333333333      0.250000000000
+      0.250000000000      0.333333333333      0.250000000000
+      0.375000000000      0.333333333333      0.250000000000
+      0.500000000000      0.333333333333      0.250000000000
+     -0.375000000000      0.333333333333      0.250000000000
+     -0.250000000000      0.333333333333      0.250000000000
+     -0.125000000000      0.333333333333      0.250000000000
+      0.000000000000      0.500000000000      0.250000000000
+      0.125000000000      0.500000000000      0.250000000000
+      0.250000000000      0.500000000000      0.250000000000
+      0.375000000000      0.500000000000      0.250000000000
+      0.500000000000      0.500000000000      0.250000000000
+     -0.375000000000      0.500000000000      0.250000000000
+     -0.250000000000      0.500000000000      0.250000000000
+     -0.125000000000      0.500000000000      0.250000000000
+      0.000000000000     -0.333333333333      0.250000000000
+      0.125000000000     -0.333333333333      0.250000000000
+      0.250000000000     -0.333333333333      0.250000000000
+      0.375000000000     -0.333333333333      0.250000000000
+      0.500000000000     -0.333333333333      0.250000000000
+     -0.375000000000     -0.333333333333      0.250000000000
+     -0.250000000000     -0.333333333333      0.250000000000
+     -0.125000000000     -0.333333333333      0.250000000000
+      0.000000000000     -0.166666666667      0.250000000000
+      0.125000000000     -0.166666666667      0.250000000000
+      0.250000000000     -0.166666666667      0.250000000000
+      0.375000000000     -0.166666666667      0.250000000000
+      0.500000000000     -0.166666666667      0.250000000000
+     -0.375000000000     -0.166666666667      0.250000000000
+     -0.250000000000     -0.166666666667      0.250000000000
+     -0.125000000000     -0.166666666667      0.250000000000
+      0.000000000000      0.000000000000      0.500000000000
+      0.125000000000      0.000000000000      0.500000000000
+      0.250000000000      0.000000000000      0.500000000000
+      0.375000000000      0.000000000000      0.500000000000
+      0.500000000000      0.000000000000      0.500000000000
+     -0.375000000000      0.000000000000      0.500000000000
+     -0.250000000000      0.000000000000      0.500000000000
+     -0.125000000000      0.000000000000      0.500000000000
+      0.000000000000      0.166666666667      0.500000000000
+      0.125000000000      0.166666666667      0.500000000000
+      0.250000000000      0.166666666667      0.500000000000
+      0.375000000000      0.166666666667      0.500000000000
+      0.500000000000      0.166666666667      0.500000000000
+     -0.375000000000      0.166666666667      0.500000000000
+     -0.250000000000      0.166666666667      0.500000000000
+     -0.125000000000      0.166666666667      0.500000000000
+      0.000000000000      0.333333333333      0.500000000000
+      0.125000000000      0.333333333333      0.500000000000
+      0.250000000000      0.333333333333      0.500000000000
+      0.375000000000      0.333333333333      0.500000000000
+      0.500000000000      0.333333333333      0.500000000000
+     -0.375000000000      0.333333333333      0.500000000000
+     -0.250000000000      0.333333333333      0.500000000000
+     -0.125000000000      0.333333333333      0.500000000000
+      0.000000000000      0.500000000000      0.500000000000
+      0.125000000000      0.500000000000      0.500000000000
+      0.250000000000      0.500000000000      0.500000000000
+      0.375000000000      0.500000000000      0.500000000000
+      0.500000000000      0.500000000000      0.500000000000
+     -0.375000000000      0.500000000000      0.500000000000
+     -0.250000000000      0.500000000000      0.500000000000
+     -0.125000000000      0.500000000000      0.500000000000
+      0.000000000000     -0.333333333333      0.500000000000
+      0.125000000000     -0.333333333333      0.500000000000
+      0.250000000000     -0.333333333333      0.500000000000
+      0.375000000000     -0.333333333333      0.500000000000
+      0.500000000000     -0.333333333333      0.500000000000
+     -0.375000000000     -0.333333333333      0.500000000000
+     -0.250000000000     -0.333333333333      0.500000000000
+     -0.125000000000     -0.333333333333      0.500000000000
+      0.000000000000     -0.166666666667      0.500000000000
+      0.125000000000     -0.166666666667      0.500000000000
+      0.250000000000     -0.166666666667      0.500000000000
+      0.375000000000     -0.166666666667      0.500000000000
+      0.500000000000     -0.166666666667      0.500000000000
+     -0.375000000000     -0.166666666667      0.500000000000
+     -0.250000000000     -0.166666666667      0.500000000000
+     -0.125000000000     -0.166666666667      0.500000000000
+      0.000000000000      0.000000000000     -0.250000000000
+      0.125000000000      0.000000000000     -0.250000000000
+      0.250000000000      0.000000000000     -0.250000000000
+      0.375000000000      0.000000000000     -0.250000000000
+      0.500000000000      0.000000000000     -0.250000000000
+     -0.375000000000      0.000000000000     -0.250000000000
+     -0.250000000000      0.000000000000     -0.250000000000
+     -0.125000000000      0.000000000000     -0.250000000000
+      0.000000000000      0.166666666667     -0.250000000000
+      0.125000000000      0.166666666667     -0.250000000000
+      0.250000000000      0.166666666667     -0.250000000000
+      0.375000000000      0.166666666667     -0.250000000000
+      0.500000000000      0.166666666667     -0.250000000000
+     -0.375000000000      0.166666666667     -0.250000000000
+     -0.250000000000      0.166666666667     -0.250000000000
+     -0.125000000000      0.166666666667     -0.250000000000
+      0.000000000000      0.333333333333     -0.250000000000
+      0.125000000000      0.333333333333     -0.250000000000
+      0.250000000000      0.333333333333     -0.250000000000
+      0.375000000000      0.333333333333     -0.250000000000
+      0.500000000000      0.333333333333     -0.250000000000
+     -0.375000000000      0.333333333333     -0.250000000000
+     -0.250000000000      0.333333333333     -0.250000000000
+     -0.125000000000      0.333333333333     -0.250000000000
+      0.000000000000      0.500000000000     -0.250000000000
+      0.125000000000      0.500000000000     -0.250000000000
+      0.250000000000      0.500000000000     -0.250000000000
+      0.375000000000      0.500000000000     -0.250000000000
+      0.500000000000      0.500000000000     -0.250000000000
+     -0.375000000000      0.500000000000     -0.250000000000
+     -0.250000000000      0.500000000000     -0.250000000000
+     -0.125000000000      0.500000000000     -0.250000000000
+      0.000000000000     -0.333333333333     -0.250000000000
+      0.125000000000     -0.333333333333     -0.250000000000
+      0.250000000000     -0.333333333333     -0.250000000000
+      0.375000000000     -0.333333333333     -0.250000000000
+      0.500000000000     -0.333333333333     -0.250000000000
+     -0.375000000000     -0.333333333333     -0.250000000000
+     -0.250000000000     -0.333333333333     -0.250000000000
+     -0.125000000000     -0.333333333333     -0.250000000000
+      0.000000000000     -0.166666666667     -0.250000000000
+      0.125000000000     -0.166666666667     -0.250000000000
+      0.250000000000     -0.166666666667     -0.250000000000
+      0.375000000000     -0.166666666667     -0.250000000000
+      0.500000000000     -0.166666666667     -0.250000000000
+     -0.375000000000     -0.166666666667     -0.250000000000
+     -0.250000000000     -0.166666666667     -0.250000000000
+     -0.125000000000     -0.166666666667     -0.250000000000
+end kpoints
diff --git a/examples/WTe2/wt.in b/examples/WTe2/wt.in
index 3e2c6d03..f8aa087b 100644
--- a/examples/WTe2/wt.in
+++ b/examples/WTe2/wt.in
@@ -68,8 +68,8 @@ E_arc = 0.0         ! energy for calculate Fermi Arc
 OmegaNum = 400      ! omega number       
 OmegaMin = -0.6     ! energy interval
 OmegaMax =  0.5     ! energy interval
-Nk1 =  41            ! number k points 
-Nk2 =  41            ! number k points 
+Nk1 =  6             ! number k points 
+Nk2 =  6             ! number k points 
 Nk3 =  6             ! number k points 
 NP = 1              ! number of principle layers
 Gap_threshold = 0.001 ! threshold for GapCube output
diff --git a/src/Boltz_transport_anomalous.f90 b/src/Boltz_transport_anomalous.f90
index 114476fc..37f8355a 100644
--- a/src/Boltz_transport_anomalous.f90
+++ b/src/Boltz_transport_anomalous.f90
@@ -111,8 +111,8 @@ subroutine sigma_AHC
         write(outfileindex, '(a)')'set xlabel "Energy (eV)"'
         write(outfileindex, '(a)')'set ylabel "AHC (S/cm)"'
         write(outfileindex, '(5a)')"plot '",  trim(adjustl(ahcfilename)),  "' u 1:2 w l title '\sigma_{xy}' lc rgb 'red' lw 4, \"
-        write(outfileindex, '(5a)')"'",  trim(adjustl(ahcfilename)), "' u 1:3 w l title '\sigma_{zx}' lc rgb 'blue' lw 4, \"
-        write(outfileindex, '(5a)')"'", trim(adjustl(ahcfilename)), "' u 1:4 w l title '\sigma_{xz}' lc rgb 'orange' lw 4 "
+        write(outfileindex, '(5a)')"'",  trim(adjustl(ahcfilename)), "' u 1:3 w l title '\sigma_{yz}' lc rgb 'blue' lw 4, \"
+        write(outfileindex, '(5a)')"'", trim(adjustl(ahcfilename)), "' u 1:4 w l title '\sigma_{zx}' lc rgb 'orange' lw 4 "
         close(outfileindex)
      endif
 
diff --git a/src/Makefile.gfortran b/src/Makefile.gfortran
index 5a278f70..0f80a7ad 100644
--- a/src/Makefile.gfortran
+++ b/src/Makefile.gfortran
@@ -1,4 +1,5 @@
-obj =  module.o sparse.o wt_aux.o math_lib.o symmetry.o readHmnR.o inverse.o proteus.o \
+OBJ =  module.o sparse.o wt_aux.o math_lib.o mgmres.o symmetry.o \
+		  readHmnR.o inverse.o proteus.o \
        eigen.o ham_qlayer2qlayer.o psi.o unfolding.o rand.o \
   		 ham_slab.o ham_bulk.o ek_slab.o ek_bulk_polar.o ek_bulk.o \
        readinput.o fermisurface.o surfgreen.o surfstat.o \
@@ -12,28 +13,48 @@ obj =  module.o sparse.o wt_aux.o math_lib.o symmetry.o readHmnR.o inverse.o pro
 	main.o
 
 # compiler
-f90  = gfortran -cpp -DINTELMKL
+#F90  = mpif90 -fpp -DMPI -fpe3 -O3 # -DINTELMKL -DSUPERLU #  -check all -traceback -g
+#F90  = gfortran -fpp #-DMPI -fpe3 -O3 # -DINTELMKL -DSUPERLU #  -check all -traceback -g
+F90  = gfortran -cpp 
+#F90  = ifort -fpp  -DINTELMKL -fpe3 -check all -traceback -g
+#CC = mpicc -fpp -DMPI -fpe3 -O3 # -DINTELMKL -DSUPERLU
+CC = gcc -E #-DMPI -fpe3 -O3 # -DINTELMKL -DSUPERLU
 
-flag = -O3 -ffree-line-length-512  #-g -Wall -Wextra -Warray-temporaries -Wconversion -fimplicit-none -fbacktrace -ffree-line-length-0 -fcheck=all -ffpe-trap=zero,overflow,underflow -finit-real=nan
+INCLUDE = #-I${MKLROOT}/include -I/Users/user/quan/work/workplace/github-repositories/superlu/SRC
+WFLAG = #-nogen-interface
+#OFLAG = -O3 -static-intel
+OFLAG = -O3 -ffree-line-length-512  #-g -Wall -Wextra -Warray-temporaries -Wconversion -fimplicit-none -fbacktrace -ffree-line-length-0 -fcheck=all -ffpe-trap=zero,overflow,underflow -finit-real=nan
+FFLAG = $(OFLAG) $(WFLAG)
+LFLAG = $(OFLAG)
 
-ARPACK=/Users/quan/work/workplace/ARPACK/libarpack_gfortran.a
+# ARPACK LIBRARY
+ARPACK= #-L/usr/local/lib -larpack #/Users/user/quan/work/workplace/ARPACK/libarpack_gfortran.a
+
+# need to specify if -DSUPERLU privided in the F90 
+SUPERLULIB=#/usr/local/lib/libsuperlu.a
 
 # blas and lapack libraries
-#libs = -L/usr/lib/ -llapack -lblas
- 
-# Intel MKL also supports with gfortran comipler
+# static linking
+#LIBS = -Wl,--start-group ${MKLROOT}/lib/intel64/libmkl_intel_lp64.a \
+        ${MKLROOT}/lib/intel64/libmkl_sequential.a \
+        ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread -lm -ldl \
+		  ${ARPACK}
+
 # dynamic linking
-libs = ${ARPACK} -L/${MKLROOT}/lib -lmkl_core -lmkl_sequential -lmkl_intel_lp64 -lpthread
+#LIBS = ${ARPACK} -L/${MKLROOT}/lib/intel64 -lmkl_core -lmkl_sequential -lmkl_intel_lp64 -lpthread ${SUPERLULIB}
+LIBS = ${ARPACK} -llapack -lblas
  
- 
-main :  $(obj)
-	$(f90) $(obj) -o wt.x $(libs) 
+main : $(OBJ)
+	$(F90) $(LFLAG) $(OBJ) -o wt.x $(LIBS)
 	cp -f wt.x ../bin
 
-.SUFFIXES: .o .f90
+.SUFFIXES: .o .f90 .c
 
 .f90.o :
-	$(f90) -c $(flag) $(includes) $*.f90
+	$(F90) $(FFLAG) $(INCLUDE) -c $*.f90
+.c.o:
+	$(CC) $(CFLAGS) -I$(INCLUDE) -c $< $(VERBOSE)
 
 clean :
 	rm -f *.o *.mod *~ wt.x
+
diff --git a/src/Makefile.gfortran_travis b/src/Makefile.gfortran_travis
index 448c72de..43d87b41 100644
--- a/src/Makefile.gfortran_travis
+++ b/src/Makefile.gfortran_travis
@@ -1,4 +1,5 @@
-OBJ =  module.o sparse.o wt_aux.o math_lib.o symmetry.o readHmnR.o inverse.o proteus.o \
+OBJ =  module.o sparse.o wt_aux.o math_lib.o mgmres.o symmetry.o \
+		  readHmnR.o inverse.o proteus.o \
        eigen.o ham_qlayer2qlayer.o psi.o unfolding.o rand.o \
   		 ham_slab.o ham_bulk.o ek_slab.o ek_bulk_polar.o ek_bulk.o \
        readinput.o fermisurface.o surfgreen.o surfstat.o \
diff --git a/src/Makefile.intel-mpi b/src/Makefile.intel-mpi
index 53c32dcf..de8ba175 100644
--- a/src/Makefile.intel-mpi
+++ b/src/Makefile.intel-mpi
@@ -1,4 +1,5 @@
-OBJ =  module.o sparse.o wt_aux.o math_lib.o symmetry.o readHmnR.o inverse.o proteus.o \
+OBJ =  module.o sparse.o wt_aux.o math_lib.o mgmres.o symmetry.o \
+		 c_fortran_zgssv.o readHmnR.o inverse.o proteus.o \
        eigen.o ham_qlayer2qlayer.o psi.o unfolding.o rand.o \
   		 ham_slab.o ham_bulk.o ek_slab.o ek_bulk_polar.o ek_bulk.o \
        readinput.o fermisurface.o surfgreen.o surfstat.o \
@@ -12,16 +13,21 @@ OBJ =  module.o sparse.o wt_aux.o math_lib.o symmetry.o readHmnR.o inverse.o pro
 	main.o
 
 # compiler
-F90  = mpiifort -fpp -DMPI -DINTELMKL -fpe3
+F90  = mpif90 -fpp -DMPI -fpe3 -O3 # -DINTELMKL -DSUPERLU #  -check all -traceback -g
+#F90  = ifort -fpp  -DINTELMKL -fpe3 -check all -traceback -g
+CC = mpicc -fpp -DMPI -fpe3 -O3 # -DINTELMKL -DSUPERLU
 
-INCLUDE = -I${MKLROOT}/include
+INCLUDE = #-I${MKLROOT}/include -I/Users/user/quan/work/workplace/github-repositories/superlu/SRC
 WFLAG = -nogen-interface
 OFLAG = -O3 -static-intel
 FFLAG = $(OFLAG) $(WFLAG)
 LFLAG = $(OFLAG)
 
 # ARPACK LIBRARY
-ARPACK=/Users/quan/work/workplace/ARPACK/libarpack_MAC.a
+ARPACK=/Users/user/quan/work/workplace/ARPACK/libarpack_MAC.a
+
+# need to specify if -DSUPERLU privided in the F90 
+SUPERLULIB=#/usr/local/lib/libsuperlu.a
 
 # blas and lapack libraries
 # static linking
@@ -31,16 +37,18 @@ ARPACK=/Users/quan/work/workplace/ARPACK/libarpack_MAC.a
 		  ${ARPACK}
 
 # dynamic linking
-LIBS = ${ARPACK} -L/${MKLROOT}/lib/intel64 -lmkl_core -lmkl_sequential -lmkl_intel_lp64 -lpthread
+LIBS = ${ARPACK} -L/${MKLROOT}/lib/intel64 -lmkl_core -lmkl_sequential -lmkl_intel_lp64 -lpthread ${SUPERLULIB}
  
 main : $(OBJ)
 	$(F90) $(LFLAG) $(OBJ) -o wt.x $(LIBS)
 	cp -f wt.x ../bin
 
-.SUFFIXES: .o .f90
+.SUFFIXES: .o .f90 .c
 
 .f90.o :
 	$(F90) $(FFLAG) $(INCLUDE) -c $*.f90
+.c.o:
+	$(CC) $(CFLAGS) -I$(INCLUDE) -c $< $(VERBOSE)
 
 clean :
 	rm -f *.o *.mod *~ wt.x
diff --git a/src/c_fortran_zgssv.c b/src/c_fortran_zgssv.c
new file mode 100644
index 00000000..7be69561
--- /dev/null
+++ b/src/c_fortran_zgssv.c
@@ -0,0 +1,181 @@
+#ifdef SUPERLU
+
+/*
+ * -- SuperLU routine (version 6.0) --
+ * Univ. of California Berkeley, Xerox Palo Alto Research Center,
+ * and Lawrence Berkeley National Lab.
+ * October 15, 2003
+ *
+ * March 26, 2023  Add 64-bit indexing and METIS ordering
+ */
+
+#include "slu_zdefs.h"
+
+#define HANDLE_SIZE  8
+
+/* kind of integer to hold a pointer.  Use 64-bit. */
+typedef long long int fptr;
+
+typedef struct {
+    SuperMatrix *L;
+    SuperMatrix *U;
+    int *perm_c;
+    int *perm_r;
+} factors_t;
+
+/*!
+ * This routine can be called from Fortran.
+ *
+ * iopt (input) int
+ *      Specifies the operation:
+ *      = 1, performs LU decomposition for the first time
+ *      = 2, performs triangular solve
+ *      = 3, free all the storage in the end
+ *
+ * f_factors (input/output) fptr* 
+ *      If iopt == 1, it is an output and contains the pointer pointing to
+ *                    the structure of the factored matrices.
+ *      Otherwise, it it an input.
+ */
+void
+c_fortran_zgssv_(int *iopt, int *n, int_t *nnz, int *nrhs,
+                 doublecomplex *values, int_t *rowind, int_t *colptr,
+                 doublecomplex *b, int *ldb,
+                 fptr *f_factors, /* a handle containing the address
+                                     pointing to the factored matrices */
+                 int_t *info)
+{
+    SuperMatrix A, AC, B;
+    SuperMatrix *L, *U;
+    int *perm_r; /* row permutations from partial pivoting */
+    int *perm_c; /* column permutation vector */
+    int *etree;  /* column elimination tree */
+    SCformat *Lstore;
+    NCformat *Ustore;
+    int      i, panel_size, permc_spec, relax;
+    trans_t  trans;
+    mem_usage_t   mem_usage;
+    superlu_options_t options;
+    SuperLUStat_t stat;
+    factors_t *LUfactors;
+    GlobalLU_t Glu;   /* Not needed on return. */
+    int_t    *rowind0;  /* counter 1-based indexing from Frotran arrays. */
+    int_t    *colptr0;  
+
+    trans = NOTRANS;
+
+    if ( *iopt == 1 ) { /* LU decomposition */
+
+        /* Set the default input options. */
+        set_default_options(&options);
+
+	/* Initialize the statistics variables. */
+	StatInit(&stat);
+
+
+	/* Adjust to 0-based indexing */
+	if ( !(rowind0 = intMalloc(*nnz)) ) ABORT("Malloc fails for rowind0[].");
+	if ( !(colptr0 = intMalloc(*n+1)) ) ABORT("Malloc fails for colptr0[].");
+	for (i = 0; i < *nnz; ++i) rowind0[i] = rowind[i] - 1;
+	for (i = 0; i <= *n; ++i) colptr0[i] = colptr[i] - 1;
+
+	zCreate_CompCol_Matrix(&A, *n, *n, *nnz, values, rowind0, colptr0,
+			       SLU_NC, SLU_Z, SLU_GE);
+	L = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
+	U = (SuperMatrix *) SUPERLU_MALLOC( sizeof(SuperMatrix) );
+	if ( !(perm_r = int32Malloc(*n)) ) ABORT("Malloc fails for perm_r[].");
+	if ( !(perm_c = int32Malloc(*n)) ) ABORT("Malloc fails for perm_c[].");
+	if ( !(etree = int32Malloc(*n)) ) ABORT("Malloc fails for etree[].");
+
+	/*
+	 * Get column permutation vector perm_c[], according to permc_spec:
+	 *   permc_spec = 0: natural ordering 
+	 *   permc_spec = 1: minimum degree on structure of A'*A
+	 *   permc_spec = 2: minimum degree on structure of A'+A
+	 *   permc_spec = 3: approximate minimum degree for unsymmetric matrices
+	 *   permc_spec = 6: METIS ordering on structure of A'*A
+	 */    	
+	permc_spec = options.ColPerm;
+	get_perm_c(permc_spec, &A, perm_c);
+	
+	sp_preorder(&options, &A, perm_c, etree, &AC);
+
+	panel_size = sp_ienv(1);
+	relax = sp_ienv(2);
+
+	zgstrf(&options, &AC, relax, panel_size, etree,
+                NULL, 0, perm_c, perm_r, L, U, &Glu, &stat, info);
+
+	if ( *info == 0 ) {
+	    Lstore = (SCformat *) L->Store;
+	    Ustore = (NCformat *) U->Store;
+	    //printf("No of nonzeros in factor L = %lld\n", (long long) Lstore->nnz);
+	    //printf("No of nonzeros in factor U = %lld\n", (long long) Ustore->nnz);
+	    //printf("No of nonzeros in L+U = %lld\n", (long long) Lstore->nnz + Ustore->nnz);
+	    zQuerySpace(L, U, &mem_usage);
+	    //printf("L\\U MB %.3f\ttotal MB needed %.3f\n",
+		 //  mem_usage.for_lu/1e6, mem_usage.total_needed/1e6);
+	} else {
+	    printf("zgstrf() error returns INFO= %lld\n", (long long) *info);
+	    if ( *info <= *n ) { /* factorization completes */
+		zQuerySpace(L, U, &mem_usage);
+		printf("L\\U MB %.3f\ttotal MB needed %.3f\n",
+		       mem_usage.for_lu/1e6, mem_usage.total_needed/1e6);
+	    }
+	}
+	
+	/* Save the LU factors in the factors handle */
+	LUfactors = (factors_t*) SUPERLU_MALLOC(sizeof(factors_t));
+	LUfactors->L = L;
+	LUfactors->U = U;
+	LUfactors->perm_c = perm_c;
+	LUfactors->perm_r = perm_r;
+	*f_factors = (fptr) LUfactors;
+
+	/* Free un-wanted storage */
+	SUPERLU_FREE(etree);
+	Destroy_SuperMatrix_Store(&A);
+	Destroy_CompCol_Permuted(&AC);
+	SUPERLU_FREE(rowind0);
+	SUPERLU_FREE(colptr0);
+	StatFree(&stat);
+
+    } else if ( *iopt == 2 ) { /* Triangular solve */
+	int iinfo;
+    
+	/* Initialize the statistics variables. */
+	StatInit(&stat);
+
+	/* Extract the LU factors in the factors handle */
+	LUfactors = (factors_t*) *f_factors;
+	L = LUfactors->L;
+	U = LUfactors->U;
+	perm_c = LUfactors->perm_c;
+	perm_r = LUfactors->perm_r;
+
+	zCreate_Dense_Matrix(&B, *n, *nrhs, b, *ldb, SLU_DN, SLU_Z, SLU_GE);
+
+        /* Solve the system A*X=B, overwriting B with X. */
+        zgstrs (trans, L, U, perm_c, perm_r, &B, &stat, &iinfo);
+	*info = iinfo;
+
+	Destroy_SuperMatrix_Store(&B);
+	StatFree(&stat);
+
+    } else if ( *iopt == 3 ) { /* Free storage */
+	/* Free the LU factors in the factors handle */
+	LUfactors = (factors_t*) *f_factors;
+	SUPERLU_FREE (LUfactors->perm_r);
+	SUPERLU_FREE (LUfactors->perm_c);
+	Destroy_SuperNode_Matrix(LUfactors->L);
+	Destroy_CompCol_Matrix(LUfactors->U);
+        SUPERLU_FREE (LUfactors->L);
+        SUPERLU_FREE (LUfactors->U);
+	SUPERLU_FREE (LUfactors);
+    } else {
+	fprintf(stderr,"Invalid iopt=%d passed to c_fortran_zgssv()\n",*iopt);
+	exit(-1);
+    }
+}
+
+#endif
diff --git a/src/dos.f90 b/src/dos.f90
index eedd9943..f973cfd2 100644
--- a/src/dos.f90
+++ b/src/dos.f90
@@ -1,5 +1,4 @@
 
-#if defined (INTELMKL)
 subroutine dos_sparse
 !> calculate density of state for 3D bulk system
 !
@@ -189,10 +188,6 @@ subroutine dos_sparse
    return
 end subroutine dos_sparse
 
-#endif
-
-
-#if defined (INTELMKL)
 subroutine charge_density_sparse
 !> calculate charge density
 !
@@ -356,11 +351,6 @@ subroutine charge_density_sparse
    return
 end subroutine charge_density_sparse
 
-#endif
-
-
-
-
 
 
 subroutine dos_sub
diff --git a/src/ek_bulk.f90 b/src/ek_bulk.f90
index a54b5040..e4aadd7f 100644
--- a/src/ek_bulk.f90
+++ b/src/ek_bulk.f90
@@ -1108,7 +1108,6 @@ end subroutine ek_bulk_cube
 
 
 
-#if defined (INTELMKL)
 subroutine sparse_ekbulk
    use sparse
    use para
@@ -1184,7 +1183,11 @@ subroutine sparse_ekbulk
    if (neval>Num_wann-2) neval= Num_wann- 2
 
    !> ncv
-   nvecs=int(2*neval)
+   if (NumLCZVecs.ne.0) then
+      nvecs= NumLCZVecs
+   else
+      nvecs=int(2*neval)
+   endif
 
    if (nvecs<20) nvecs= 20
    if (nvecs>Num_wann) nvecs= Num_wann
@@ -1556,7 +1559,8 @@ subroutine sparse_ekbulk_valley
             do ie2= 1, ND
                psi2= zeigv(:, IE+ie2-1)
                vpsi=0d0
-               call mkl_zcoogemv('N', Num_wann, acoo_valley, icoo_valley, jcoo_valley, nnzmax_valley, psi2, vpsi)
+               !call mkl_zcoogemv('N', Num_wann, acoo_valley, icoo_valley, jcoo_valley, nnzmax_valley, psi2, vpsi)
+               call coomv_z(Num_wann, nnzmax_valley, acoo_valley, icoo_valley, jcoo_valley, psi2, vpsi)
                valley_k_nd(ie1, ie2)= zdotc(Num_wann, psi1, 1, vpsi, 1)
             enddo
            !if (abs(W(IE+ie1-1)/eV2Hartree)<2d0) &
@@ -1918,7 +1922,6 @@ subroutine sparse_ekbulk_plane
    return
 end subroutine sparse_ekbulk_plane
 
-#endif
 
 #if defined (ELPA)
 subroutine ekbulk_elpa
diff --git a/src/ek_slab.f90 b/src/ek_slab.f90
index de983a3c..7dec956b 100644
--- a/src/ek_slab.f90
+++ b/src/ek_slab.f90
@@ -377,9 +377,7 @@ subroutine ek_slab_sparseHR
       !> diagonalization by call zheev in lapack
       W= 0d0
 
-#if defined (INTELMKL)
       call arpack_sparse_coo_eigs(Ndimq,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,W,sigma, zeigv, ritzvec)
-#endif
 
       call now(time3)
       eigv(1:neval, ik)= W(1:neval)
diff --git a/src/fermisurface.f90 b/src/fermisurface.f90
index e93768a5..9250f332 100644
--- a/src/fermisurface.f90
+++ b/src/fermisurface.f90
@@ -439,7 +439,7 @@ subroutine fermisurface_kplane
 
      integer :: ia, ik, i, j, i1, i2, ig, io
      integer :: knv3, nkx, nky, ierr, nwann
-     real(dp) :: kz, k(3), s0(3)
+     real(dp) :: kz, k(3), s0(3), s0t(3), s0_len
      
      ! Hamiltonian of bulk system
      complex(Dp), allocatable :: Hamk_bulk(:, :)
@@ -463,6 +463,7 @@ subroutine fermisurface_kplane
      complex(dp), allocatable :: ones(:, :), ctemp(:, :)
 
      logical, external :: if_in_the_WS
+     real(dp), external :: norm
 
 
      nkx= Nk
@@ -665,9 +666,14 @@ subroutine fermisurface_kplane
            write(outfileindex, "('#column', i5, 3000i16)")(i, i=1, 7+4*NumberofSelectedOrbitals_groups)
            do ik=1, knv3
               do ig=1, NumberofSelectedOrbitals_groups
-                 s0(1)= sx_selected_mpi(ik, ig)  /dos_selected_mpi(ik, ig)
-                 s0(2)= sy_selected_mpi(ik, ig)  /dos_selected_mpi(ik, ig)
-                 s0(3)= sz_selected_mpi(ik, ig)  /dos_selected_mpi(ik, ig)
+                 s0(1)= sx_selected_mpi(ik, ig)   /dos_selected_mpi(ik, ig)
+                 s0(2)= sy_selected_mpi(ik, ig)   /dos_selected_mpi(ik, ig)
+                 s0(3)= sz_selected_mpi(ik, ig)   /dos_selected_mpi(ik, ig)
+
+                !s0_len= norm(s0t)
+                !if (s0_len>eps6) then
+                !   s0= s0t/s0_len
+                !endif
                  call rotate_k3_to_kplane(s0, s1(:, ig))
               enddo
               write(outfileindex, '(3000E16.8)')kxy_shape(:, ik)*Angstrom2atomic, kxy_plane(:, ik)*Angstrom2atomic, &
diff --git a/src/ham_bulk.f90 b/src/ham_bulk.f90
index 9705e98a..03fd510f 100644
--- a/src/ham_bulk.f90
+++ b/src/ham_bulk.f90
@@ -325,17 +325,17 @@ subroutine ham_bulk_latticegauge(k,Hamk_bulk)
    !Hamk_bulk= (Hamk_bulk+ mat2)/2d0
 
    ! check hermitcity
-   do i1=1, Num_wann
-      do i2=1, Num_wann
-         if(abs(Hamk_bulk(i1,i2)-conjg(Hamk_bulk(i2,i1))).ge.1e-6)then
-            write(stdout,*)'there is something wrong with Hamk_bulk'
-            write(stdout,*)'i1, i2', i1, i2
-            write(stdout,*)'value at (i1, i2)', Hamk_bulk(i1, i2)
-            write(stdout,*)'value at (i2, i1)', Hamk_bulk(i2, i1)
-            !stop
-         endif
-      enddo
-   enddo
+  !do i1=1, Num_wann
+  !   do i2=1, Num_wann
+  !      if(abs(Hamk_bulk(i1,i2)-conjg(Hamk_bulk(i2,i1))).ge.1e-6)then
+  !         write(stdout,*)'there is something wrong with Hamk_bulk'
+  !         write(stdout,*)'i1, i2', i1, i2
+  !         write(stdout,*)'value at (i1, i2)', Hamk_bulk(i1, i2)
+  !         write(stdout,*)'value at (i2, i1)', Hamk_bulk(i2, i1)
+  !         !stop
+  !      endif
+  !   enddo
+  !enddo
 
    return
 end subroutine ham_bulk_latticegauge
@@ -834,6 +834,66 @@ subroutine overlap_bulk_coo_sparse(k, acoo, icoo, jcoo)
    return
 end subroutine overlap_bulk_coo_sparse
 
+subroutine ham_bulk_tpaw(mdim, k, Hamk_moire)
+   ! This subroutine caculates Hamiltonian for
+   ! a twist system
+   !
+   !  Lattice gauge Hl
+   !  Atomic gauge Ha= U* Hl U 
+   !  where U = e^ik.wc(i) on diagonal
+   ! contact wuquansheng@gmail.com
+   ! 2024 Jan 19
+
+   use para
+   implicit none
+
+   !> leading dimension of Ham_moire
+   integer, intent(in) :: mdim
+   real(dp), intent(in) :: k(3)
+   complex(dp), intent(inout) :: hamk_moire(mdim, mdim)
+
+   ! wave vector in 3d
+   real(Dp) :: kdotr, pos0(3), dis
+
+   complex(dp) :: factor
+
+   real(dp) :: pos(3), pos1(3), pos2(3), pos_cart(3), pos_direct(3)
+
+   integer :: Num_gvectors
+
+   !> dimension 3*Num_gvectors 
+   real(dp), allocatable :: gvectors(:, :)
+
+   integer :: i1, i2
+
+   !> for a test, we assume a twist homo-bilayer system
+   ! mdim= Num_gvectors* Folded_cell%NumberOfspinorbitals*2
+
+   hamk_moire=0d0
+
+   !> first, set G vectors
+
+   !> construct T matrix
+   !> the dimension of the T matrix is num_wann
+   
+
+   ! check hermitcity
+   do i1=1, mdim
+      do i2=1, mdim
+         if(abs(hamk_moire(i1,i2)-conjg(hamk_moire(i2,i1))).ge.1e-6)then
+            write(stdout,*)'there is something wrong with hamk_moire'
+            write(stdout,*)'i1, i2', i1, i2
+            write(stdout,*)'value at (i1, i2)', hamk_moire(i1, i2)
+            write(stdout,*)'value at (i2, i1)', hamk_moire(i2, i1)
+            !stop
+         endif
+      enddo
+   enddo
+
+   return
+end subroutine ham_bulk_tpaw
+
+
 subroutine valley_k_coo_sparsehr(nnz, k,acoo,icoo,jcoo)
    !> This subroutine use sparse hr format
    !> Here we use atomic gauge which means the atomic position is taken into account
diff --git a/src/inverse.f90 b/src/inverse.f90
index a846cbf5..efaa40e0 100644
--- a/src/inverse.f90
+++ b/src/inverse.f90
@@ -111,3 +111,321 @@ subroutine inv_r(ndim,Amat)
      
      return
   end subroutine inv_r
+
+
+  subroutine sparse_solver(ndims, nnz, acsr, icsr, jcsr, b, x)
+     !> this subroutine  solves A*x= b, where A is a sparse matrix stored in compress row format
+     !> A is a Hermitian complex square matrix
+
+     use para, only : dp
+     implicit none
+    
+     !> the dimension of a square matrix
+     integer, intent(in) :: ndims
+
+     !> number of non-zeros elements
+     integer, intent(in) :: nnz
+
+     !> sparse matrix 
+     ! acsr(K) = value of entry,
+     ! icsr(K) = row of entry,
+     ! jcsr(K) = column of entry.
+     complex(dp), intent(in) :: acsr(nnz)
+     integer, intent(in) :: icsr(ndims+1)
+     integer, intent(in) :: jcsr(nnz)
+
+     complex(dp), intent(out) :: x(ndims)
+     complex(dp), intent(in)  :: b(ndims)
+     real(dp), allocatable :: acsr_real(:)
+     real(dp), allocatable :: acsr_imag(:)
+     real(dp), allocatable :: x_real(:)
+     real(dp), allocatable :: x_imag(:)
+     real(dp), allocatable :: b_real(:)
+     real(dp), allocatable :: b_imag(:)
+
+     complex(dp), allocatable ::  acsc(:)
+     integer, allocatable ::  icsc(:)
+     integer, allocatable ::  jcsc(:)
+
+     ! ITR_MAX, the maximum number of (outer) iterations to take.
+     integer :: itr_max
+ 
+     !MR, the maximum number of (inner) iterations to take.  0 < MR <= N.
+     integer :: mr
+
+     !TOL_ABS, an absolute tolerance applied to the current residual.
+     !TOL_REL, a relative tolerance comparing the current residual to the initial residual.
+     real(dp) :: tol_abs, tol_rel
+
+     integer :: iopt, info, nrhs, i, ipos, ljob
+     integer*8 :: factors
+
+
+     !>--------------------------------------------------
+     !>> call mgmres_st
+     !> mgmres_st() applies restarted GMRES to a sparse triplet matrix.
+
+     !allocate(acsr_real(nnz), acsr_imag(nnz))
+     !allocate(x_real(ndims), x_imag(ndims))
+     !allocate(b_real(ndims), b_imag(ndims))
+
+     !acsr_real= real(acsr)
+     !acsr_imag= aimag(acsr)
+     !b_real= real(b)
+     !b_imag= aimag(b)
+
+
+     !itr_max=1000
+     !tol_abs=1E-7
+     !tol_rel=1E-7
+     !mr= min(100, ndims)
+    !call mgmres_csr ( ndims, nnz, icsr, jcsr, acsr_real, x_real, b_real, itr_max, mr, tol_abs, tol_rel )
+    !call mgmres_csr ( ndims, nnz, icsr, jcsr, acsr_imag, x_imag, b_imag, itr_max, mr, tol_abs, tol_rel )
+    
+    !do i=1, ndims
+    !   x(i) = cmplx(x_real(i), x_imag(i))
+    !enddo
+     !>--------------------------------------------------
+
+     !>--------------------------------------------------
+     !>> call superLU
+
+#ifdef SUPERLU
+     allocate( acsc(nnz), icsc(ndims+1), jcsc(nnz))
+     !> convert csr to csc
+     ljob = 1 ! fill c
+     ipos = 1
+     call csrcsc ( ndims, ljob, ipos, acsr, jcsr, icsr, acsc, jcsc, icsc )
+     !> 
+     x= b
+! First, factorize the matrix. The factors are stored in *factors* handle.
+      iopt = 1
+      nrhs = 1
+      call c_fortran_zgssv( iopt, ndims, nnz, nrhs, acsc, jcsc, icsc, &
+           x, ndims, factors, info )
+      
+      if (info .eq. 0) then
+      !  write (*,*) 'Factorization succeeded'
+      else
+      !  write(*,*) 'INFO from factorization = ', info
+      endif
+      
+! Second, solve the system using the existing factors.
+      iopt = 2
+      call c_fortran_zgssv( iopt, ndims, nnz, nrhs, acsc, jcsc, icsc, &
+                           x, ndims, factors, info )
+
+      if (info .eq. 0) then
+      !  write (*,*) 'Solve succeeded'
+      else
+      !  write(*,*) 'INFO from triangular solve = ', info
+      endif
+
+! Last, free the storage allocated inside SuperLU
+      iopt = 3
+      call c_fortran_zgssv( iopt, ndims, nnz, nrhs, acsc, jcsc, icsc, &
+                           x, ndims, factors, info )
+     !>--------------------------------------------------
+#elif defined (INTELMKL)
+!     !>--------------------------------------------------
+     !>> call mkldss zgesv
+      call zmat_mkldss_zgesv(ndims, nnz, acsr, jcsr, icsr, b, x)
+     !>--------------------------------------------------
+#else
+     STOP "ERROR: Please add -DINTELMKL or -DSUPERLU in the Makefile to run the sparse_solver"
+#endif
+
+
+     return
+  end subroutine sparse_solver
+
+#if defined (INTELMKL)
+      subroutine zmat_mkldss_zgesv(ndims, nnzmax, acsr, jcsr, icsr, xvec, yvec)
+         use mkl_dss
+         use para, only : dp, stdout, time_cost_t1, time_cost_t2, time_cost_t3
+
+         implicit none
+
+         ! external arguments
+         ! dimension of matrix A
+         integer, intent(in) :: ndims
+
+         ! non-zero elements of matrix A
+         integer, intent(in) :: nnzmax
+
+         ! compressed sparse row storage of matrix A
+         complex(dp), intent(in) :: acsr(nnzmax)
+         integer, intent(in) :: jcsr(nnzmax)
+         integer, intent(in) :: icsr(ndims+1)
+
+         ! xvec vector with dimension "ndims"
+         complex(dp), intent(in) :: xvec(ndims)
+
+         ! yvec vector with dimension "ndims"
+         complex(dp), intent(out) :: yvec(ndims)
+
+         ! local variables
+         type(mkl_dss_handle) :: handle
+         integer :: nrhs
+         integer :: ierr
+         integer :: perm(1)
+         real(dp) :: time_tic, time_toc
+
+         !>>> step 0: setup the problem to be solved
+         nrhs = 1
+         perm(1) = 0
+
+         !>>> step 1: initialize the solver.
+         ierr = dss_create( handle, mkl_dss_defaults )
+         if (ierr .ne. mkl_dss_success) then
+            write(*,*) "mkl_dss_create returned error code", ierr; stop
+         endif 
+
+         !>>> step 2: define the non-zero structure of the matrix.
+         ierr = dss_define_structure( handle, mkl_dss_symmetric_structure_complex, icsr, ndims, ndims, jcsr, nnzmax )
+         !ierr = dss_define_structure( handle, MKL_DSS_NON_SYMMETRIC_COMPLEX, icsr, ndims, ndims, jcsr, nnzmax )
+         if (ierr .ne. mkl_dss_success) then
+            write(*,*) "mkl_dss_define_structure returned error code", ierr; stop
+         endif ! back if (ierr .ne. mkl_dss_success} block
+
+         call now(time_tic)
+         !>>> step 3: reorder the matrix.
+         ierr = dss_reorder( handle, mkl_dss_defaults, perm )
+         if (ierr .ne. mkl_dss_success) then
+            write(*,*) "mkl_dss_reorder returned error code", ierr; stop
+         endif ! back if (ierr .ne. mkl_dss_success} block
+         call now(time_toc)
+         time_cost_t1= time_cost_t1+ time_toc- time_tic
+
+         call now(time_tic)
+         !>>> step 4: factor the matrix.
+         ierr = dss_factor_complex( handle, mkl_dss_defaults, acsr )
+         if (ierr .ne. mkl_dss_success) then
+            write(*,*) "mkl_dss_factor_complex returned error code", ierr; stop
+         endif ! back if (ierr .ne. mkl_dss_success} block
+         call now(time_toc)
+         time_cost_t2= time_cost_t2+ time_toc- time_tic
+
+         call now(time_tic)
+         ! allocate the solution vector and solve the problem.
+         ierr = dss_solve_complex( handle, mkl_dss_defaults, xvec, nrhs, yvec )
+         call now(time_toc)
+         time_cost_t3= time_cost_t3+ time_toc- time_tic
+
+         if (ierr .ne. mkl_dss_success) then
+            write(*,*) "mkl_dss_solve_complex returned error code", ierr; stop
+         endif ! back if (ierr .ne. mkl_dss_success} block
+
+         ! deallocate solver storage and various local arrays.
+         ierr = dss_delete( handle, mkl_dss_defaults )
+
+         if (ierr .ne. mkl_dss_success) then
+            write(*,*) "mkl_dss_solver returned error code", ierr; stop
+         endif ! back if (ierr .ne. mkl_dss_success} block
+
+         return
+      end subroutine zmat_mkldss_zgesv
+#endif
+
+subroutine csrcsc ( n, job, ipos, a, ja, ia, ao, jao, iao )
+ 
+!*****************************************************************************80
+!
+!! CSRCSC converts Compressed Sparse Row to Compressed Sparse Column.
+!
+!  Discussion:
+!
+!    This is essentially a transposition operation.  
+!
+!    It is NOT an in-place algorithm.
+!
+!    This routine transposes a matrix stored in a, ja, ia format.
+!
+!  Modified:
+!
+!    07 January 2004
+!
+!  Author:
+!
+!    Youcef Saad
+!
+!  Parameters:
+!
+!    Input, integer ( kind = 4 ) N, the order of the matrix.
+!
+!    Input, integer ( kind = 4 ) JOB, indicates whether or not to fill the values of the
+!    matrix AO or only the pattern (IA, and JA).  Enter 1 for yes.
+!
+! ipos  = starting position in ao, jao of the transposed matrix.
+!         the iao array takes this into account (thus iao(1) is set to ipos.)
+!         Note: this may be useful if one needs to append the data structure
+!         of the transpose to that of A. In this case use
+!                call csrcsc (n,1,n+2,a,ja,ia,a,ja,ia(n+2))
+!        for any other normal usage, enter ipos=1.
+!
+!    Input, real A(*), integer ( kind = 4 ) JA(*), IA(N+1), the matrix in CSR
+!    Compressed Sparse Row format.
+!
+!    Output, real AO(*), JAO(*), IAO(N+1), the matrix in CSC
+!    Compressed Sparse Column format.
+!
+  implicit none
+
+  integer ( kind = 4 ) n
+
+  complex ( kind = 8 ) a(*)
+  complex ( kind = 8 ) ao(*)
+  integer ( kind = 4 ) i
+  integer ( kind = 4 ) ia(n+1)
+  integer ( kind = 4 ) iao(n+1)
+  integer ( kind = 4 ) ipos
+  integer ( kind = 4 ) j
+  integer ( kind = 4 ) ja(*)
+  integer ( kind = 4 ) jao(*)
+  integer ( kind = 4 ) job
+  integer ( kind = 4 ) k
+  integer ( kind = 4 ) next
+!
+!  Compute lengths of rows of A'.
+!
+  iao(1:n+1) = 0
+
+  do i = 1, n
+    do k = ia(i), ia(i+1)-1
+      j = ja(k) + 1
+      iao(j) = iao(j) + 1
+    end do
+  end do
+!
+!  Compute pointers from lengths.
+!
+  iao(1) = ipos
+  do i = 1, n
+    iao(i+1) = iao(i) + iao(i+1)
+  end do
+!
+!  Do the actual copying.
+!
+  do i = 1, n
+    do k = ia(i), ia(i+1)-1
+      j = ja(k)
+      next = iao(j)
+      if ( job == 1 ) then
+        ao(next) = a(k)
+      end if
+      jao(next) = i
+      iao(j) = next + 1
+    end do
+  end do
+!
+!  Reshift IAO and leave.
+!
+  do i = n, 1, -1
+    iao(i+1) = iao(i)
+  end do
+  iao(1) = ipos
+
+  return
+end
+
+
diff --git a/src/lanczos_sparse.f90 b/src/lanczos_sparse.f90
index 89526e73..3b4173d1 100644
--- a/src/lanczos_sparse.f90
+++ b/src/lanczos_sparse.f90
@@ -83,9 +83,10 @@ subroutine lanczos_seqsparse_cpu_z(NumLczVectors, NumLczVectors_out, Mdim, nnz,
    vec_0= 1d0/betan(1)* vec_0
 
    !* perform |vec(1,:)>=Ham |vec(0,:)>
-#if defined (INTELMKL)
-   call mkl_zcsrgemv('N', Mdim, acsr, icsr, jcsr, vec_0, vec_1)
-#endif
+!#if defined (INTELMKL)
+!   call mkl_zcsrgemv('N', Mdim, acsr, icsr, jcsr, vec_0, vec_1)
+!#endif
+   call csrmv_z(Mdim, nnz, acsr, icsr, jcsr, vec_0, vec_1)
 
    !* a_0= <phi_0|H|phi_0>
    alpha(1) = zdotc(Mdim, vec_0, 1, vec_1, 1)
@@ -114,9 +115,10 @@ subroutine lanczos_seqsparse_cpu_z(NumLczVectors, NumLczVectors_out, Mdim, nnz,
       call now(time_start)
 
 
-#if defined (INTELMKL)
-      call mkl_zcsrgemv('N', Mdim, acsr, icsr, jcsr, vec_1, vec_2)
-#endif
+!#if defined (INTELMKL)
+!      call mkl_zcsrgemv('N', Mdim, acsr, icsr, jcsr, vec_1, vec_2)
+!#endif
+      call csrmv_z(Mdim, nnz, acsr, icsr, jcsr, vec_1, vec_2)
 
       !* calculate alpha, betan
       t_z1= zdotc(Mdim, vec_1, 1, vec_2, 1)
diff --git a/src/landau_level_sparse.f90 b/src/landau_level_sparse.f90
index a7263fcc..16ab28c6 100644
--- a/src/landau_level_sparse.f90
+++ b/src/landau_level_sparse.f90
@@ -750,9 +750,7 @@ subroutine sparse_landau_level_B
 
       !> diagonalization by call zheev in lapack
       W= 0d0
-#if defined (INTELMKL)
       call arpack_sparse_coo_eigs(Ndimq,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,W,sigma, zeigv, LandauLevel_wavefunction_calc)
-#endif
       call now(time3)
       eigv(:, ib)= W
       if (cpuid==0)write(stdout, '(a, f20.2, a)')'  >> Time cost for constructing H: ', time2-time1, ' s'
@@ -1070,9 +1068,7 @@ subroutine sparse_landau_level_k
 
       !> diagonalization by call zheev in lapack
       W= 0d0
-#if defined (INTELMKL)
       call arpack_sparse_coo_eigs(Ndimq,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,W,sigma, zeigv, LandauLevel_wavefunction_calc)
-#endif
 
       call now(time3)
       eigv(1:neval, ik)= W(1:neval)
@@ -1401,9 +1397,7 @@ subroutine sparse_landau_dos
       !> diagonalization by call zheev in lapack
       W= 0d0
       !        call eigensystem_c( 'N', 'U', Ndimq ,ham_landau, W)
-#if defined (INTELMKL)
       call arpack_sparse_coo_eigs(Ndimq,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,W,sigma, zeigv, LandauLevel_wavefunction_calc)
-#endif
       do ie= 1, NE
          do iv= 1, neval
             x= omega(ie)- W(iv)
diff --git a/src/main.f90 b/src/main.f90
index 39793df4..2a567c1d 100644
--- a/src/main.f90
+++ b/src/main.f90
@@ -117,7 +117,6 @@ program main
         !> normal hmnr file
         if(.not. Is_Sparse_Hr) then
            !> for the dense hr file, we allocate HmnR
-           HmnR= 0d0
            call readNormalHmnR()
            if (valley_projection_calc) call  read_valley_operator
         !> sparse hmnr input
@@ -167,13 +166,11 @@ program main
       if(cpuid.eq.0)write(stdout, *)'>> Start of calculating bulk band'
       call now(time_start)
       if (Is_Sparse_Hr) then
-#if defined (INTELMKL)
          if (valley_projection_calc) then
             call sparse_ekbulk_valley
          else
             call sparse_ekbulk
          endif
-#endif
       else
          if (valley_projection_calc) then
             call ek_bulk_line_valley
@@ -207,9 +204,7 @@ program main
       call now(time_start)
 
       if (Is_Sparse_Hr) then
-#if defined (INTELMKL)
          call sparse_ekbulk_plane
-#endif
       else
          call ek_bulk_plane
       endif
@@ -225,9 +220,7 @@ program main
       if(cpuid.eq.0)write(stdout, *)' '
       if(cpuid.eq.0)write(stdout, *)'>> Start of calculating Landau level spectrum'
       call now(time_start)
-#if defined (INTELMKL)
       call LandauLevel_B_dos_Lanczos
-#endif
       call now(time_end)
       call print_time_cost(time_start, time_end, 'LandauLevel_B_dos_calc')
       if(cpuid.eq.0)write(stdout, *)'<< End of calculating Landau level spectrum'
@@ -238,9 +231,7 @@ program main
       if(cpuid.eq.0)write(stdout, *)' '
       if(cpuid.eq.0)write(stdout, *)'>> Start of calculating Landau level spectrum'
       call now(time_start)
-#if defined (INTELMKL)
       call LandauLevel_k_dos_Lanczos
-#endif
       call now(time_end)
       call print_time_cost(time_start, time_end, 'LandauLevel_k_dos_calc')
       if(cpuid.eq.0)write(stdout, *)'<< End of calculating Landau level spectrum'
@@ -252,9 +243,7 @@ program main
       call now(time_start)
       if (Is_HrFile) then
          if(Is_Sparse_Hr) then
-#if defined (INTELMKL)
             call sparse_landau_level_B
-#endif
          else
             call landau_level_B
          end if
@@ -286,9 +275,7 @@ program main
       call now(time_start)
       if (Is_HrFile) then
          if(Is_Sparse_Hr.or.Num_wann*Magq>2000) then
-#if defined (INTELMKL)
             call sparse_landau_level_k
-#endif
          else
             call landau_level_k
          endif
@@ -384,9 +371,7 @@ program main
            if(.not. Is_Sparse_Hr) then
               call dos_sub
            else
-#if defined (INTELMKL)
               call dos_sparse
-#endif     
            end if
            call now(time_end)
            call print_time_cost(time_start, time_end, 'Dos_calc')
@@ -469,9 +454,7 @@ program main
         if(cpuid.eq.0)write(stdout, *)'>> Start of calculating the slab band structure'
         call now(time_start)
         if (Is_Sparse_Hr) then
-#if defined (INTELMKL)
           call ek_slab_sparseHR
-#endif
         else
            call ek_slab
         endif
diff --git a/src/mgmres.f90 b/src/mgmres.f90
new file mode 100644
index 00000000..a5ba68e6
--- /dev/null
+++ b/src/mgmres.f90
@@ -0,0 +1,1700 @@
+subroutine atx_cr ( n, nz_num, ia, ja, a, x, w )
+
+!*****************************************************************************80
+!
+!! atx_cr() computes A'*x for a matrix stored in sparse compressed row form.
+!
+!  Discussion:
+!
+!    The Sparse Compressed Row storage format is used.
+!
+!    The matrix A is assumed to be sparse.  To save on storage, only
+!    the nonzero entries of A are stored.  The vector JA stores the
+!    column index of the nonzero value.  The nonzero values are sorted
+!    by row, and the compressed row vector IA then has the property that
+!    the entries in A and JA that correspond to row I occur in indices
+!    IA[I] through IA[I+1]-1.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    17 July 2007
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Reference:
+!
+!    Richard Barrett, Michael Berry, Tony Chan, James Demmel,
+!    June Donato, Jack Dongarra, Victor Eijkhout, Roidan Pozo,
+!    Charles Romine, Henk van der Vorst,
+!    Templates for the Solution of Linear Systems:
+!    Building Blocks for Iterative Methods,
+!    SIAM, 1994.
+!    ISBN: 0898714710,
+!    LC: QA297.8.T45.
+!
+!    Tim Kelley,
+!    Iterative Methods for Linear and Nonlinear Equations,
+!    SIAM, 2004,
+!    ISBN: 0898713528,
+!    LC: QA297.8.K45.
+!
+!    Yousef Saad,
+!    Iterative Methods for Sparse Linear Systems,
+!    Second Edition,
+!    SIAM, 2003,
+!    ISBN: 0898715342,
+!    LC: QA188.S17.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the system.
+!
+!    Input, integer NZ_NUM, the number of nonzeros.
+!
+!    Input, integer IA(N+1), JA(NZ_NUM), the row and column
+!    indices of the matrix values.  The row vector has been compressed.
+!
+!    Input, real ( kind = rk ) A(NZ_NUM), the matrix values.
+!
+!    Input, real ( kind = rk ) X(N), the vector to be multiplied by A'.
+!
+!    Output, real ( kind = rk ) W(N), the value of A'*X.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer n
+  integer nz_num
+
+  real ( kind = rk ) a(nz_num)
+  integer i
+  integer ia(n+1)
+  integer ja(nz_num)
+  integer k1
+  integer k2
+  real ( kind = rk ) w(n)
+  real ( kind = rk ) x(n)
+
+  w(1:n) = 0.0D+00
+
+  do i = 1, n
+    k1 = ia(i)
+    k2 = ia(i+1) - 1
+    w(ja(k1:k2)) = w(ja(k1:k2)) + a(k1:k2) * x(i)
+  end do
+
+  return
+end
+subroutine atx_st ( n, nz_num, ia, ja, a, x, w )
+
+!*****************************************************************************80
+!
+!! atx_st() computes A'*x for a matrix stored in sparset triplet form.
+!
+!  Discussion:
+!
+!    The matrix A is assumed to be sparse.  To save on storage, only
+!    the nonzero entries of A are stored.  For instance, the K-th nonzero
+!    entry in the matrix is stored by:
+!
+!      A(K) = value of entry,
+!      IA(K) = row of entry,
+!      JA(K) = column of entry.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    08 August 2006
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Reference:
+!
+!    Richard Barrett, Michael Berry, Tony Chan, James Demmel,
+!    June Donato, Jack Dongarra, Victor Eijkhout, Roidan Pozo,
+!    Charles Romine, Henk van der Vorst,
+!    Templates for the Solution of Linear Systems:
+!    Building Blocks for Iterative Methods,
+!    SIAM, 1994.
+!    ISBN: 0898714710,
+!    LC: QA297.8.T45.
+!
+!    Tim Kelley,
+!    Iterative Methods for Linear and Nonlinear Equations,
+!    SIAM, 2004,
+!    ISBN: 0898713528,
+!    LC: QA297.8.K45.
+!
+!    Yousef Saad,
+!    Iterative Methods for Sparse Linear Systems,
+!    Second Edition,
+!    SIAM, 2003,
+!    ISBN: 0898715342,
+!    LC: QA188.S17.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the system.
+!
+!    Input, integer NZ_NUM, the number of nonzeros.
+!
+!    Input, integer IA(NZ_NUM), JA(NZ_NUM), the row and column
+!    indices of the matrix values.
+!
+!    Input, real ( kind = rk ) A(NZ_NUM), the matrix values.
+!
+!    Input, real ( kind = rk ) X(N), the vector to be multiplied by A'.
+!
+!    Output, real ( kind = rk ) W(N), the value of A'*X.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer n
+  integer nz_num
+
+  real ( kind = rk ) a(nz_num)
+  integer i
+  integer ia(nz_num)
+  integer j
+  integer ja(nz_num)
+  integer k
+  real ( kind = rk ) w(n)
+  real ( kind = rk ) x(n)
+
+  w(1:n) = 0.0D+00
+
+  do k = 1, nz_num
+    i = ia(k)
+    j = ja(k)
+    w(j) = w(j) + a(k) * x(i)
+  end do
+
+  return
+end
+subroutine ax_cr ( n, nz_num, ia, ja, a, x, w )
+
+!*****************************************************************************80
+!
+!! ax_cr() computes A*x for a matrix stored in sparse compressed row form.
+!
+!  Discussion:
+!
+!    The Sparse Compressed Row storage format is used.
+!
+!    The matrix A is assumed to be sparse.  To save on storage, only
+!    the nonzero entries of A are stored.  The vector JA stores the
+!    column index of the nonzero value.  The nonzero values are sorted
+!    by row, and the compressed row vector IA then has the property that
+!    the entries in A and JA that correspond to row I occur in indices
+!    IA[I] through IA[I+1]-1.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    17 July 2007
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Reference:
+!
+!    Richard Barrett, Michael Berry, Tony Chan, James Demmel,
+!    June Donato, Jack Dongarra, Victor Eijkhout, Roidan Pozo,
+!    Charles Romine, Henk van der Vorst,
+!    Templates for the Solution of Linear Systems:
+!    Building Blocks for Iterative Methods,
+!    SIAM, 1994.
+!    ISBN: 0898714710,
+!    LC: QA297.8.T45.
+!
+!    Tim Kelley,
+!    Iterative Methods for Linear and Nonlinear Equations,
+!    SIAM, 2004,
+!    ISBN: 0898713528,
+!    LC: QA297.8.K45.
+!
+!    Yousef Saad,
+!    Iterative Methods for Sparse Linear Systems,
+!    Second Edition,
+!    SIAM, 2003,
+!    ISBN: 0898715342,
+!    LC: QA188.S17.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the system.
+!
+!    Input, integer NZ_NUM, the number of nonzeros.
+!
+!    Input, integer IA(N+1), JA(NZ_NUM), the row and column
+!    indices of the matrix values.  The row vector has been compressed.
+!
+!    Input, real ( kind = rk ) A(NZ_NUM), the matrix values.
+!
+!    Input, real ( kind = rk ) X(N), the vector to be multiplied by A.
+!
+!    Output, real ( kind = rk ) W(N), the value of A*X.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer n
+  integer nz_num
+
+  real ( kind = rk ) a(nz_num)
+  integer i
+  integer ia(n+1)
+  integer ja(nz_num)
+  integer k1
+  integer k2
+  real ( kind = rk ) w(n)
+  real ( kind = rk ) x(n)
+
+  w(1:n) = 0.0D+00
+
+  do i = 1, n
+    k1 = ia(i)
+    k2 = ia(i+1) - 1
+    w(i) = w(i) + dot_product ( a(k1:k2), x(ja(k1:k2)) )
+  end do
+
+  return
+end
+subroutine ax_st ( n, nz_num, ia, ja, a, x, w )
+
+!*****************************************************************************80
+!
+!! ax_st() computes A*x for a matrix stored in sparset triplet form.
+!
+!  Discussion:
+!
+!    The matrix A is assumed to be sparse.  To save on storage, only
+!    the nonzero entries of A are stored.  For instance, the K-th nonzero
+!    entry in the matrix is stored by:
+!
+!      A(K) = value of entry,
+!      IA(K) = row of entry,
+!      JA(K) = column of entry.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    08 August 2006
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Reference:
+!
+!    Richard Barrett, Michael Berry, Tony Chan, James Demmel,
+!    June Donato, Jack Dongarra, Victor Eijkhout, Roidan Pozo,
+!    Charles Romine, Henk van der Vorst,
+!    Templates for the Solution of Linear Systems:
+!    Building Blocks for Iterative Methods,
+!    SIAM, 1994.
+!    ISBN: 0898714710,
+!    LC: QA297.8.T45.
+!
+!    Tim Kelley,
+!    Iterative Methods for Linear and Nonlinear Equations,
+!    SIAM, 2004,
+!    ISBN: 0898713528,
+!    LC: QA297.8.K45.
+!
+!    Yousef Saad,
+!    Iterative Methods for Sparse Linear Systems,
+!    Second Edition,
+!    SIAM, 2003,
+!    ISBN: 0898715342,
+!    LC: QA188.S17.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the system.
+!
+!    Input, integer NZ_NUM, the number of nonzeros.
+!
+!    Input, integer IA(NZ_NUM), JA(NZ_NUM), the row and column
+!    indices of the matrix values.
+!
+!    Input, real ( kind = rk ) A(NZ_NUM), the matrix values.
+!
+!    Input, real ( kind = rk ) X(N), the vector to be multiplied by A.
+!
+!    Output, real ( kind = rk ) W(N), the value of A*X.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer n
+  integer nz_num
+
+  real ( kind = rk ) a(nz_num)
+  integer i
+  integer ia(nz_num)
+  integer j
+  integer ja(nz_num)
+  integer k
+  real ( kind = rk ) w(n)
+  real ( kind = rk ) x(n)
+
+  w(1:n) = 0.0D+00
+
+  do k = 1, nz_num
+    i = ia(k)
+    j = ja(k)
+    w(i) = w(i) + a(k) * x(j)
+  end do
+
+  return
+end
+subroutine diagonal_pointer_cr ( n, nz_num, ia, ja, ua )
+
+!*****************************************************************************80
+!
+!! diagonal_pointer_cr() finds diagonal entries in a sparse compressed row matrix.
+!
+!  Discussion:
+!
+!    The matrix A is assumed to be stored in compressed row format.  Only
+!    the nonzero entries of A are stored.  The vector JA stores the
+!    column index of the nonzero value.  The nonzero values are sorted
+!    by row, and the compressed row vector IA then has the property that
+!    the entries in A and JA that correspond to row I occur in indices
+!    IA[I] through IA[I+1]-1.
+!
+!    The array UA can be used to locate the diagonal elements of the matrix.
+!
+!    It is assumed that every row of the matrix includes a diagonal element,
+!    and that the elements of each row have been ascending sorted.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    18 July 2007
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the system.
+!
+!    Input, integer NZ_NUM, the number of nonzeros.
+!
+!    Input, integer IA(N+1), JA(NZ_NUM), the row and column
+!    indices of the matrix values.  The row vector has been compressed.
+!    On output, the order of the entries of JA may have changed because of
+!    the sorting.
+!
+!    Output, integer UA(N), the index of the diagonal element
+!    of each row.
+!
+  implicit none
+
+  integer n
+  integer nz_num
+
+  integer i
+  integer ia(n+1)
+  integer k
+  integer ja(nz_num)
+  integer ua(n)
+
+  ua(1:n) = -1
+
+  do i = 1, n
+    do k = ia(i), ia(i+1) - 1
+      if ( ja(k) == i ) then
+        ua(i) = k
+      end if
+    end do
+  end do
+
+  return
+end
+subroutine ilu_cr ( n, nz_num, ia, ja, a, ua, l )
+
+!*****************************************************************************80
+!
+!! ilu_cr() computes the incomplete LU factorization of a matrix.
+!
+!  Discussion:
+!
+!    The matrix A is assumed to be stored in compressed row format.  Only
+!    the nonzero entries of A are stored.  The vector JA stores the
+!    column index of the nonzero value.  The nonzero values are sorted
+!    by row, and the compressed row vector IA then has the property that
+!    the entries in A and JA that correspond to row I occur in indices
+!    IA(I) through IA(I+1)-1.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    27 July 2007
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the system.
+!
+!    Input, integer NZ_NUM, the number of nonzeros.
+!
+!    Input, integer IA(N+1), JA(NZ_NUM), the row and column
+!    indices of the matrix values.  The row vector has been compressed.
+!
+!    Input, real ( kind = rk ) A(NZ_NUM), the matrix values.
+!
+!    Input, integer UA(N), the index of the diagonal element
+!    of each row.
+!
+!    Output, real ( kind = rk ) L(NZ_NUM), the ILU factorization of A.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer n
+  integer nz_num
+
+  real ( kind = rk ) a(nz_num)
+  integer i
+  integer ia(n+1)
+  integer iw(n)
+  integer j
+  integer ja(nz_num)
+  integer jj
+  integer jrow
+  integer jw
+  integer k
+  real ( kind = rk ) l(nz_num)
+  real ( kind = rk ) tl
+  integer ua(n)
+!
+!  Copy A.
+!
+  l(1:nz_num) = a(1:nz_num)
+
+  do i = 1, n
+!
+!  IW points to the nonzero entries in row I.
+!
+    iw(1:n) = -1
+
+    do k = ia(i), ia(i+1) - 1
+      iw(ja(k)) = k
+    end do
+
+    do j = ia(i), ia(i+1) - 1
+      jrow = ja(j)
+      if ( i <= jrow ) then
+        exit
+      end if
+      tl = l(j) * l(ua(jrow))
+      l(j) = tl
+      do jj = ua(jrow) + 1, ia(jrow+1) - 1
+        jw = iw(ja(jj))
+        if ( jw /= -1 ) then
+          l(jw) = l(jw) - tl * l(jj)
+        end if
+      end do
+    end do
+
+    ua(i) = j
+
+    if ( jrow /= i ) then
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'ILU_CR - Fatal error!'
+      write ( *, '(a)' ) '  JROW ~= I'
+      write ( *, '(a,i8)' ) '  JROW = ', jrow
+      write ( *, '(a,i8)' ) '  I    = ', i
+      stop
+    end if
+
+    if ( l(j) == 0.0D+00 ) then
+      write ( *, '(a)' ) ' '
+      write ( *, '(a)' ) 'ILU_CR - Fatal error!'
+      write ( *, '(a,i8)' ) '  Zero pivot on step I = ', i
+      write ( *, '(a,i8,a)' ) '  L(', j, ') = 0.0'
+      stop
+    end if
+
+    l(j) = 1.0D+00 / l(j)
+
+  end do
+
+  l(ua(1:n)) = 1.0D+00 / l(ua(1:n))
+
+  return
+end
+subroutine lus_cr ( n, nz_num, ia, ja, l, ua, r, z )
+
+!*****************************************************************************80
+!
+!! lus_cr() applies the incomplete LU preconditioner.
+!
+!  Discussion:
+!
+!    The linear system M * Z = R is solved for Z.  M is the incomplete
+!    LU preconditioner matrix, and R is a vector supplied by the user.
+!    So essentially, we're solving L * U * Z = R.
+!
+!    The matrix A is assumed to be stored in compressed row format.  Only
+!    the nonzero entries of A are stored.  The vector JA stores the
+!    column index of the nonzero value.  The nonzero values are sorted
+!    by row, and the compressed row vector IA then has the property that
+!    the entries in A and JA that correspond to row I occur in indices
+!    IA(I) through IA(I+1)-1.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    18 July 2007
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the system.
+!
+!    Input, integer NZ_NUM, the number of nonzeros.
+!
+!    Input, integer IA(N+1), JA(NZ_NUM), the row and column
+!    indices of the matrix values.  The row vector has been compressed.
+!
+!    Input, real ( kind = rk ) L(NZ_NUM), the matrix values.
+!
+!    Input, integer UA(N), the index of the diagonal element
+!    of each row.
+!
+!    Input, real ( kind = rk ) R(N), the right hand side.
+!
+!    Output, real ( kind = rk ) Z(N), the solution of the system M * Z = R.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer n
+  integer nz_num
+
+  integer i
+  integer ia(n+1)
+  integer j
+  integer ja(nz_num)
+  real ( kind = rk ) l(nz_num)
+  real ( kind = rk ) r(n)
+  integer ua(n)
+  real ( kind = rk ) w(n)
+  real ( kind = rk ) z(n)
+!
+!  Copy R in.
+!
+  w(1:n) = r(1:n)
+!
+!  Solve L * w = w where L is unit lower triangular.
+!
+  do i = 2, n
+    do j = ia(i), ua(i) - 1
+      w(i) = w(i) - l(j) * w(ja(j))
+    end do
+  end do
+!
+!  Solve U * w = w, where U is upper triangular.
+!
+  do i = n, 1, -1
+    do j = ua(i) + 1, ia(i+1) - 1
+      w(i) = w(i) - l(j) * w(ja(j))
+    end do
+    w(i) = w(i) / l(ua(i))
+  end do
+!
+!  Copy Z out.
+!
+  z(1:n) = w(1:n)
+
+  return
+end
+subroutine mgmres_csr ( n, nz_num, ia, ja, a, x, rhs, itr_max, mr, tol_abs, &
+  tol_rel )
+
+!*****************************************************************************80
+!
+!! mgmres_csr() applies restarted GMRES to a sparse triplet matrix.
+!
+!  Discussion:
+!
+!    The linear system A*X=B is solved iteratively.
+!
+!    The matrix A is assumed to be stored in sparse triplet form.  Only
+!    the nonzero entries of A are stored.  For instance, the K-th nonzero
+!    entry in the matrix is stored by:
+!
+!      A(K) = value of entry,
+!      IA(K) = row of entry,
+!      JA(K) = column of entry.
+!
+!    Thanks to Jesus Pueblas Sanchez-Guerra for supplying two
+!    corrections to the code on 31 May 2007.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    13 July 2007
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Reference:
+!
+!    Richard Barrett, Michael Berry, Tony Chan, James Demmel,
+!    June Donato, Jack Dongarra, Victor Eijkhout, Roidan Pozo,
+!    Charles Romine, Henk van der Vorst,
+!    Templates for the Solution of Linear Systems:
+!    Building Blocks for Iterative Methods,
+!    SIAM, 1994.
+!    ISBN: 0898714710,
+!    LC: QA297.8.T45.
+!
+!    Tim Kelley,
+!    Iterative Methods for Linear and Nonlinear Equations,
+!    SIAM, 2004,
+!    ISBN: 0898713528,
+!    LC: QA297.8.K45.
+!
+!    Yousef Saad,
+!    Iterative Methods for Sparse Linear Systems,
+!    Second Edition,
+!    SIAM, 2003,
+!    ISBN: 0898715342,
+!    LC: QA188.S17.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the linear system.
+!
+!    Input, integer NZ_NUM, the number of nonzero matrix values.
+!
+!    Input, integer IA(NZ_NUM), JA(NZ_NUM), the row and column
+!    indices of the matrix values.
+!
+!    Input, real ( kind = rk ) A(NZ_NUM), the matrix values.
+!
+!    Input/output, real ( kind = rk ) X(N); on input, an approximation to
+!    the solution.  On output, an improved approximation.
+!
+!    Input, real ( kind = rk ) RHS(N), the right hand side of the linear system.
+!
+!    Input, integer ITR_MAX, the maximum number of (outer)
+!    iterations to take.
+!
+!    Input, integer MR, the maximum number of (inner) iterations
+!    to take.  0 < MR <= N.
+!
+!    Input, real ( kind = rk ) TOL_ABS, an absolute tolerance applied to the
+!    current residual.
+!
+!    Input, real ( kind = rk ) TOL_REL, a relative tolerance comparing the
+!    current residual to the initial residual.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer mr
+  integer n
+  integer nz_num
+
+  real ( kind = rk ) a(nz_num)
+  real ( kind = rk ) av
+  real ( kind = rk ) c(1:mr)
+  real ( kind = rk ), parameter :: delta = 1.0D-03
+  real ( kind = rk ) g(1:mr+1)
+  real ( kind = rk ) h(1:mr+1,1:mr)
+  real ( kind = rk ) htmp
+  integer i
+  integer ia(n+1)
+  integer itr
+  integer itr_max
+  integer itr_used
+  integer j
+  integer ja(nz_num)
+  integer k
+  integer k_copy
+  real ( kind = rk ) mu
+  real ( kind = rk ) r(1:n)
+  real ( kind = rk ) rho
+  real ( kind = rk ) rho_tol
+  real ( kind = rk ) rhs(1:n)
+  real ( kind = rk ) s(1:mr)
+  real ( kind = rk ) tol_abs
+  real ( kind = rk ) tol_rel
+  real ( kind = rk ) v(1:n,1:mr+1)
+  logical, parameter :: verbose = .false.
+  real ( kind = rk ) x(1:n)
+  real ( kind = rk ) y(1:mr+1)
+
+  itr_used = 0
+
+  if ( n < mr ) then
+    write ( *, '(a)' ) ' '
+    write ( *, '(a)' ) 'MGMRES_csr - Fatal error!'
+    write ( *, '(a)' ) '  N < MR.'
+    write ( *, '(a,i8)' ) '  N = ', n
+    write ( *, '(a,i8)' ) '  MR = ', mr
+    stop
+  end if
+
+  do itr = 1, itr_max
+
+    call ax_cr ( n, nz_num, ia, ja, a, x, r )
+
+    r(1:n) = rhs(1:n) - r(1:n)
+
+    rho = sqrt ( dot_product ( r(1:n), r(1:n) ) )
+
+    if ( verbose ) then
+      write ( *, '(a,i8,a,g14.6)' ) '  ITR = ', itr, '  Residual = ', rho
+    end if
+
+    if ( itr == 1 ) then
+      rho_tol = rho * tol_rel
+    end if
+
+    v(1:n,1) = r(1:n) / rho
+
+    g(1) = rho
+    g(2:mr+1) = 0.0D+00
+
+    h(1:mr+1,1:mr) = 0.0D+00
+
+    do k = 1, mr
+
+      k_copy = k
+
+      call ax_cr ( n, nz_num, ia, ja, a, v(1:n,k), v(1:n,k+1) )
+
+      av = sqrt ( dot_product ( v(1:n,k+1), v(1:n,k+1) ) )
+
+      do j = 1, k
+        h(j,k) = dot_product ( v(1:n,k+1), v(1:n,j) )
+        v(1:n,k+1) = v(1:n,k+1) - h(j,k) * v(1:n,j)
+      end do
+
+      h(k+1,k) = sqrt ( dot_product ( v(1:n,k+1), v(1:n,k+1) ) )
+
+      if ( av + delta * h(k+1,k) == av ) then
+
+        do j = 1, k
+          htmp = dot_product ( v(1:n,k+1), v(1:n,j) )
+          h(j,k) = h(j,k) + htmp
+          v(1:n,k+1) = v(1:n,k+1) - htmp * v(1:n,j)
+        end do
+
+        h(k+1,k) = sqrt ( dot_product ( v(1:n,k+1), v(1:n,k+1) ) )
+
+      end if
+
+      if ( h(k+1,k) /= 0.0D+00 ) then
+        v(1:n,k+1) = v(1:n,k+1) / h(k+1,k)
+      end if
+
+      if ( 1 < k ) then
+
+        y(1:k+1) = h(1:k+1,k)
+
+        do j = 1, k - 1
+          call mult_givens ( c(j), s(j), j, y(1:k+1) )
+        end do
+
+        h(1:k+1,k) = y(1:k+1)
+
+      end if
+
+      mu = sqrt ( h(k,k)**2 + h(k+1,k)**2 )
+      c(k) = h(k,k) / mu
+      s(k) = -h(k+1,k) / mu
+      h(k,k) = c(k) * h(k,k) - s(k) * h(k+1,k)
+      h(k+1,k) = 0.0D+00
+      call mult_givens ( c(k), s(k), k, g(1:k+1) )
+      rho = abs ( g(k+1) )
+
+      itr_used = itr_used + 1
+
+      if ( verbose ) then
+        write ( *, '(a,i8,a,g14.6)' ) '  K =   ', k, '  Residual = ', rho
+      end if
+
+      if ( rho <= rho_tol .and. rho <= tol_abs ) then
+        exit
+      end if
+
+    end do
+
+    k = k_copy - 1
+
+    y(k+1) = g(k+1) / h(k+1,k+1)
+
+    do i = k, 1, -1
+      y(i) = ( g(i) - dot_product ( h(i,i+1:k+1), y(i+1:k+1) ) ) / h(i,i)
+    end do
+
+    do i = 1, n
+      x(i) = x(i) + dot_product ( v(i,1:k+1), y(1:k+1) )
+    end do
+
+    if ( rho <= rho_tol .and. rho <= tol_abs ) then
+      exit
+    end if
+
+  end do
+
+  if ( verbose ) then
+    write ( *, '(a)'       ) ' '
+    write ( *, '(a)'       ) 'MGMRES_ST:'
+    write ( *, '(a,i8)'    ) '  Iterations = ', itr_used
+    write ( *, '(a,g14.6)' ) '  Final residual = ', rho
+  end if
+
+  return
+end
+
+subroutine mgmres_st ( n, nz_num, ia, ja, a, x, rhs, itr_max, mr, tol_abs, &
+  tol_rel )
+
+!*****************************************************************************80
+!
+!! mgmres_st() applies restarted GMRES to a sparse triplet matrix.
+!
+!  Discussion:
+!
+!    The linear system A*X=B is solved iteratively.
+!
+!    The matrix A is assumed to be stored in sparse triplet form.  Only
+!    the nonzero entries of A are stored.  For instance, the K-th nonzero
+!    entry in the matrix is stored by:
+!
+!      A(K) = value of entry,
+!      IA(K) = row of entry,
+!      JA(K) = column of entry.
+!
+!    Thanks to Jesus Pueblas Sanchez-Guerra for supplying two
+!    corrections to the code on 31 May 2007.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    13 July 2007
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Reference:
+!
+!    Richard Barrett, Michael Berry, Tony Chan, James Demmel,
+!    June Donato, Jack Dongarra, Victor Eijkhout, Roidan Pozo,
+!    Charles Romine, Henk van der Vorst,
+!    Templates for the Solution of Linear Systems:
+!    Building Blocks for Iterative Methods,
+!    SIAM, 1994.
+!    ISBN: 0898714710,
+!    LC: QA297.8.T45.
+!
+!    Tim Kelley,
+!    Iterative Methods for Linear and Nonlinear Equations,
+!    SIAM, 2004,
+!    ISBN: 0898713528,
+!    LC: QA297.8.K45.
+!
+!    Yousef Saad,
+!    Iterative Methods for Sparse Linear Systems,
+!    Second Edition,
+!    SIAM, 2003,
+!    ISBN: 0898715342,
+!    LC: QA188.S17.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the linear system.
+!
+!    Input, integer NZ_NUM, the number of nonzero matrix values.
+!
+!    Input, integer IA(NZ_NUM), JA(NZ_NUM), the row and column
+!    indices of the matrix values.
+!
+!    Input, real ( kind = rk ) A(NZ_NUM), the matrix values.
+!
+!    Input/output, real ( kind = rk ) X(N); on input, an approximation to
+!    the solution.  On output, an improved approximation.
+!
+!    Input, real ( kind = rk ) RHS(N), the right hand side of the linear system.
+!
+!    Input, integer ITR_MAX, the maximum number of (outer)
+!    iterations to take.
+!
+!    Input, integer MR, the maximum number of (inner) iterations
+!    to take.  0 < MR <= N.
+!
+!    Input, real ( kind = rk ) TOL_ABS, an absolute tolerance applied to the
+!    current residual.
+!
+!    Input, real ( kind = rk ) TOL_REL, a relative tolerance comparing the
+!    current residual to the initial residual.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer mr
+  integer n
+  integer nz_num
+
+  real ( kind = rk ) a(nz_num)
+  real ( kind = rk ) av
+  real ( kind = rk ) c(1:mr)
+  real ( kind = rk ), parameter :: delta = 1.0D-03
+  real ( kind = rk ) g(1:mr+1)
+  real ( kind = rk ) h(1:mr+1,1:mr)
+  real ( kind = rk ) htmp
+  integer i
+  integer ia(nz_num)
+  integer itr
+  integer itr_max
+  integer itr_used
+  integer j
+  integer ja(nz_num)
+  integer k
+  integer k_copy
+  real ( kind = rk ) mu
+  real ( kind = rk ) r(1:n)
+  real ( kind = rk ) rho
+  real ( kind = rk ) rho_tol
+  real ( kind = rk ) rhs(1:n)
+  real ( kind = rk ) s(1:mr)
+  real ( kind = rk ) tol_abs
+  real ( kind = rk ) tol_rel
+  real ( kind = rk ) v(1:n,1:mr+1)
+  logical, parameter :: verbose = .true.
+  real ( kind = rk ) x(1:n)
+  real ( kind = rk ) y(1:mr+1)
+
+  itr_used = 0
+
+  if ( n < mr ) then
+    write ( *, '(a)' ) ' '
+    write ( *, '(a)' ) 'MGMRES_ST - Fatal error!'
+    write ( *, '(a)' ) '  N < MR.'
+    write ( *, '(a,i8)' ) '  N = ', n
+    write ( *, '(a,i8)' ) '  MR = ', mr
+    stop
+  end if
+
+  do itr = 1, itr_max
+
+    call ax_st ( n, nz_num, ia, ja, a, x, r )
+
+    r(1:n) = rhs(1:n) - r(1:n)
+
+    rho = sqrt ( dot_product ( r(1:n), r(1:n) ) )
+
+    if ( verbose ) then
+      write ( *, '(a,i8,a,g14.6)' ) '  ITR = ', itr, '  Residual = ', rho
+    end if
+
+    if ( itr == 1 ) then
+      rho_tol = rho * tol_rel
+    end if
+
+    v(1:n,1) = r(1:n) / rho
+
+    g(1) = rho
+    g(2:mr+1) = 0.0D+00
+
+    h(1:mr+1,1:mr) = 0.0D+00
+
+    do k = 1, mr
+
+      k_copy = k
+
+      call ax_st ( n, nz_num, ia, ja, a, v(1:n,k), v(1:n,k+1) )
+
+      av = sqrt ( dot_product ( v(1:n,k+1), v(1:n,k+1) ) )
+
+      do j = 1, k
+        h(j,k) = dot_product ( v(1:n,k+1), v(1:n,j) )
+        v(1:n,k+1) = v(1:n,k+1) - h(j,k) * v(1:n,j)
+      end do
+
+      h(k+1,k) = sqrt ( dot_product ( v(1:n,k+1), v(1:n,k+1) ) )
+
+      if ( av + delta * h(k+1,k) == av ) then
+
+        do j = 1, k
+          htmp = dot_product ( v(1:n,k+1), v(1:n,j) )
+          h(j,k) = h(j,k) + htmp
+          v(1:n,k+1) = v(1:n,k+1) - htmp * v(1:n,j)
+        end do
+
+        h(k+1,k) = sqrt ( dot_product ( v(1:n,k+1), v(1:n,k+1) ) )
+
+      end if
+
+      if ( h(k+1,k) /= 0.0D+00 ) then
+        v(1:n,k+1) = v(1:n,k+1) / h(k+1,k)
+      end if
+
+      if ( 1 < k ) then
+
+        y(1:k+1) = h(1:k+1,k)
+
+        do j = 1, k - 1
+          call mult_givens ( c(j), s(j), j, y(1:k+1) )
+        end do
+
+        h(1:k+1,k) = y(1:k+1)
+
+      end if
+
+      mu = sqrt ( h(k,k)**2 + h(k+1,k)**2 )
+      c(k) = h(k,k) / mu
+      s(k) = -h(k+1,k) / mu
+      h(k,k) = c(k) * h(k,k) - s(k) * h(k+1,k)
+      h(k+1,k) = 0.0D+00
+      call mult_givens ( c(k), s(k), k, g(1:k+1) )
+      rho = abs ( g(k+1) )
+
+      itr_used = itr_used + 1
+
+      if ( verbose ) then
+        write ( *, '(a,i8,a,g14.6)' ) '  K =   ', k, '  Residual = ', rho
+      end if
+
+      if ( rho <= rho_tol .and. rho <= tol_abs ) then
+        exit
+      end if
+
+    end do
+
+    k = k_copy - 1
+
+    y(k+1) = g(k+1) / h(k+1,k+1)
+
+    do i = k, 1, -1
+      y(i) = ( g(i) - dot_product ( h(i,i+1:k+1), y(i+1:k+1) ) ) / h(i,i)
+    end do
+
+    do i = 1, n
+      x(i) = x(i) + dot_product ( v(i,1:k+1), y(1:k+1) )
+    end do
+
+    if ( rho <= rho_tol .and. rho <= tol_abs ) then
+      exit
+    end if
+
+  end do
+
+  if ( verbose ) then
+    write ( *, '(a)'       ) ' '
+    write ( *, '(a)'       ) 'MGMRES_ST:'
+    write ( *, '(a,i8)'    ) '  Iterations = ', itr_used
+    write ( *, '(a,g14.6)' ) '  Final residual = ', rho
+  end if
+
+  return
+end
+subroutine mult_givens ( c, s, k, g )
+
+!*****************************************************************************80
+!
+!! mult_givens() applies a Givens rotation to two successive entries of a vector.
+!
+!  Discussion:
+!
+!    In order to make it easier to compare this code with the Original C,
+!    the vector indexing is 0-based.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    08 August 2006
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Reference:
+!
+!    Richard Barrett, Michael Berry, Tony Chan, James Demmel,
+!    June Donato, Jack Dongarra, Victor Eijkhout, Roidan Pozo,
+!    Charles Romine, Henk van der Vorst,
+!    Templates for the Solution of Linear Systems:
+!    Building Blocks for Iterative Methods,
+!    SIAM, 1994.
+!    ISBN: 0898714710,
+!    LC: QA297.8.T45.
+!
+!    Tim Kelley,
+!    Iterative Methods for Linear and Nonlinear Equations,
+!    SIAM, 2004,
+!    ISBN: 0898713528,
+!    LC: QA297.8.K45.
+!
+!    Yousef Saad,
+!    Iterative Methods for Sparse Linear Systems,
+!    Second Edition,
+!    SIAM, 2003,
+!    ISBN: 0898715342,
+!    LC: QA188.S17.
+!
+!  Input:
+!
+!    real ( kind = rk ) C, S, the cosine and sine of a Givens  rotation.
+!
+!    integer K, indicates the location of the first vector entry.
+!
+!    real ( kind = rk ) G(1:K+1), the vector to be modified.
+!
+!  Output:
+!
+!    real ( kind = rk ) G(1:K+1): the Givens rotation has been 
+!    applied to entries G(K) and G(K+1).
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer k
+
+  real ( kind = rk ) c
+  real ( kind = rk ) g(1:k+1)
+  real ( kind = rk ) g1
+  real ( kind = rk ) g2
+  real ( kind = rk ) s
+
+  g1 = c * g(k) - s * g(k+1)
+  g2 = s * g(k) + c * g(k+1)
+
+  g(k)   = g1
+  g(k+1) = g2
+
+  return
+end
+subroutine pmgmres_ilu_cr ( n, nz_num, ia, ja, a, x, rhs, itr_max, mr, &
+  tol_abs, tol_rel )
+
+!*****************************************************************************80
+!
+!! pmgmres_ilu_cr() applies the preconditioned restarted GMRES algorithm.
+!
+!  Discussion:
+!
+!    The matrix A is assumed to be stored in compressed row format.  Only
+!    the nonzero entries of A are stored.  The vector JA stores the
+!    column index of the nonzero value.  The nonzero values are sorted
+!    by row, and the compressed row vector IA then has the property that
+!    the entries in A and JA that correspond to row I occur in indices
+!    IA(I) through IA(I+1)-1.
+!
+!    This routine uses the incomplete LU decomposition for the
+!    preconditioning.  This preconditioner requires that the sparse
+!    matrix data structure supplies a storage position for each diagonal
+!    element of the matrix A, and that each diagonal element of the
+!    matrix A is not zero.
+!
+!    Thanks to Jesus Pueblas Sanchez-Guerra for supplying two
+!    corrections to the code on 31 May 2007.
+!
+!    Thanks to Zhitao Li for pointing out that the L array should be dimensioned
+!    NZ_NUM, rather than IA[N]+1, 26 October 2022.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license. 
+!
+!  Modified:
+!
+!    26 October 2022
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Reference:
+!
+!    Richard Barrett, Michael Berry, Tony Chan, James Demmel,
+!    June Donato, Jack Dongarra, Victor Eijkhout, Roidan Pozo,
+!    Charles Romine, Henk van der Vorst,
+!    Templates for the Solution of Linear Systems:
+!    Building Blocks for Iterative Methods,
+!    SIAM, 1994.
+!    ISBN: 0898714710,
+!    LC: QA297.8.T45.
+!
+!    Tim Kelley,
+!    Iterative Methods for Linear and Nonlinear Equations,
+!    SIAM, 2004,
+!    ISBN: 0898713528,
+!    LC: QA297.8.K45.
+!
+!    Yousef Saad,
+!    Iterative Methods for Sparse Linear Systems,
+!    Second Edition,
+!    SIAM, 2003,
+!    ISBN: 0898715342,
+!    LC: QA188.S17.
+!
+!  Input:
+!
+!    integer N, the order of the linear system.
+!
+!    integer NZ_NUM, the number of nonzero matrix values.
+!
+!    integer IA(N+1), JA(NZ_NUM), the row and column indices
+!    of the matrix values.  The row vector has been compressed.
+!
+!    real ( kind = rk ) A(NZ_NUM), the matrix values.
+!
+!    real ( kind = rk ) X(N); an approximation to the solution.
+!
+!    real ( kind = rk ) RHS(N), the right hand side of the linear system.
+!
+!    integer ITR_MAX, the maximum number of (outer) iterations to take.
+!
+!    integer MR, the maximum number of (inner) iterations 
+!    to take.  MR must be less than N.
+!
+!    real ( kind = rk ) TOL_ABS, an absolute tolerance applied to the
+!    current residual.
+!
+!    real ( kind = rk ) TOL_REL, a relative tolerance comparing the
+!    current residual to the initial residual.
+!
+!  Output:
+!
+!    real ( kind = rk ) X(N); an improved approximate solution.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer mr
+  integer n
+  integer nz_num
+
+  real ( kind = rk ) a(nz_num)
+  real ( kind = rk ) av
+  real ( kind = rk ) c(mr+1)
+  real ( kind = rk ), parameter :: delta = 1.0D-03
+  real ( kind = rk ) g(mr+1)
+  real ( kind = rk ) h(mr+1,mr)
+  real ( kind = rk ) htmp
+  integer i
+  integer ia(n+1)
+  integer itr
+  integer itr_max
+  integer itr_used
+  integer j
+  integer ja(nz_num)
+  integer k
+  integer k_copy
+!
+!  Dimension of L changed from ia(n+1)+1 to nz_num, 26 October 2022.
+!
+  real ( kind = rk ) l(nz_num)
+! real ( kind = rk ) l(ia(n+1)+1)
+  real ( kind = rk ) mu
+  real ( kind = rk ) r(n)
+  real ( kind = rk ) rho
+  real ( kind = rk ) rho_tol
+  real ( kind = rk ) rhs(n)
+  real ( kind = rk ) s(mr+1)
+  real ( kind = rk ) tol_abs
+  real ( kind = rk ) tol_rel
+  integer ua(n)
+  real ( kind = rk ) v(n,mr+1);
+  logical, parameter :: verbose = .true.
+  real ( kind = rk ) x(n)
+  real ( kind = rk ) y(mr+1)
+
+  itr_used = 0
+!
+!  Sort the entries in the matrix.
+!
+  call rearrange_cr ( n, nz_num, ia, ja, a )
+!
+!  Index the diagonal entries.
+!
+  call diagonal_pointer_cr ( n, nz_num, ia, ja, ua )
+!
+!  Compute the incomplete LU factorization of the matrix.
+!
+  call ilu_cr ( n, nz_num, ia, ja, a, ua, l )
+
+  if ( verbose ) then
+    write ( *, '(a)' ) ' '
+    write ( *, '(a)' ) 'pmgmres_ilu_cr():'
+    write ( *, '(a,i4)' ) '  Number of unknowns = ', n
+  end if
+
+  do itr = 1, itr_max
+
+    call ax_cr ( n, nz_num, ia, ja, a, x, r )
+
+    r(1:n) = rhs(1:n) - r(1:n)
+!
+!  Apply the ILU preconditioner.
+!
+    call lus_cr ( n, nz_num, ia, ja, l, ua, r, r )
+
+    rho = sqrt ( dot_product ( r, r ) )
+
+    if ( verbose ) then
+      write ( *, '(a,i4,a,g14.6)' ) '  ITR = ', itr, '  Residual = ', rho
+    end if
+
+    if ( itr == 1 ) then
+      rho_tol = rho * tol_rel
+    end if
+
+    v(1:n,1) = r(1:n) / rho
+
+    g(1) = rho
+    g(2:mr+1) = 0.0D+00
+
+    h(1:mr+1,1:mr) = 0.0D+00
+
+    do k = 1, mr
+
+      k_copy = k
+
+      call ax_cr ( n, nz_num, ia, ja, a, v(1:n,k), v(1:n,k+1) ) 
+!
+!  Apply the ILU preconditioner.
+!
+      call lus_cr ( n, nz_num, ia, ja, l, ua, v(1:n,k+1), v(1:n,k+1) )
+
+      av = sqrt ( dot_product ( v(1:n,k+1), v(1:n,k+1) ) )
+
+      do j = 1, k
+        h(j,k) = dot_product ( v(1:n,k+1), v(1:n,j) )
+        v(1:n,k+1) = v(1:n,k+1) - v(1:n,j) * h(j,k)
+      end do
+
+      h(k+1,k) = sqrt ( dot_product ( v(1:n,k+1), v(1:n,k+1) ) )
+
+      if ( ( av + delta * h(k+1,k)) == av ) then
+        do j = 1, k
+          htmp = dot_product ( v(1:n,k+1), v(1:n,j) )
+          h(j,k) = h(j,k) + htmp
+          v(1:n,k+1) = v(1:n,k+1) - htmp * v(1:n,j)
+        end do
+        h(k+1,k) = sqrt ( dot_product ( v(1:n,k+1), v(1:n,k+1) ) )
+      end if
+
+      if ( h(k+1,k) /= 0.0D+00 ) then
+        v(1:n,k+1) = v(1:n,k+1) / h(k+1,k)
+      end if
+
+      if ( 1 < k ) then
+        y(1:k+1) = h(1:k+1,k)
+        do j = 1, k - 1
+          call mult_givens ( c(j), s(j), j, y )
+        end do
+        h(1:k+1,k) = y(1:k+1)
+      end if
+
+      mu = sqrt ( h(k,k)**2 + h(k+1,k)**2 )
+
+      c(k) = h(k,k) / mu
+      s(k) = -h(k+1,k) / mu
+      h(k,k) = c(k) * h(k,k) - s(k) * h(k+1,k)
+      h(k+1,k) = 0.0D+00
+      call mult_givens ( c(k), s(k), k, g )
+
+      rho = abs ( g(k+1) )
+
+      itr_used = itr_used + 1
+
+      if ( verbose ) then
+        write ( *, '(a,i4,a,g14.6)' ) '  K = ', k, '  Residual = ', rho
+      end if
+
+      if ( rho <= rho_tol .and. rho <= tol_abs ) then
+        exit
+      end if
+
+    end do
+
+    k = k_copy - 1
+
+    y(k+1) = g(k+1) / h(k+1,k+1)
+
+    do i = k, 1, -1
+      y(i) = ( g(i) - dot_product ( h(i,i+1:k+1), y(i+1:k+1) ) ) / h(i,i)
+    end do
+
+    do i = 1, n
+      x(i) = x(i) + dot_product ( v(i,1:k+1), y(1:k+1) )
+    end do
+
+    if ( rho <= rho_tol .and. rho <= tol_abs ) then
+      exit
+    end if
+
+  end do
+
+  if ( verbose ) then
+    write ( *, '(a)' ) ' '
+    write ( *, '(a)' ) 'pmgmres_ilu_cr():'
+    write ( *, '(a,i6)' ) '  Iterations = ', itr_used
+    write ( *, '(a,g14.6)' ) '  Final residual = ', rho
+  end if
+
+  return
+end
+subroutine rearrange_cr ( n, nz_num, ia, ja, a )
+
+!*****************************************************************************80
+!
+!! rearrange_cr() sorts a sparse compressed row matrix.
+!
+!  Discussion:
+!
+!    This routine guarantees that the entries in the CR matrix
+!    are properly sorted.
+!
+!    After the sorting, the entries of the matrix are rearranged in such
+!    a way that the entries of each column are listed in ascending order
+!    of their column values.
+!
+!    The matrix A is assumed to be stored in compressed row format.  Only
+!    the nonzero entries of A are stored.  The vector JA stores the
+!    column index of the nonzero value.  The nonzero values are sorted
+!    by row, and the compressed row vector IA then has the property that
+!    the entries in A and JA that correspond to row I occur in indices
+!    IA(I) through IA(I+1)-1.
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    17 July 2007
+!
+!  Author:
+!
+!    Original C version by Lili Ju.
+!    FORTRAN90 version by John Burkardt.
+!
+!  Reference:
+!
+!    Richard Barrett, Michael Berry, Tony Chan, James Demmel,
+!    June Donato, Jack Dongarra, Victor Eijkhout, Roidan Pozo,
+!    Charles Romine, Henk van der Vorst,
+!    Templates for the Solution of Linear Systems:
+!    Building Blocks for Iterative Methods,
+!    SIAM, 1994.
+!    ISBN: 0898714710,
+!    LC: QA297.8.T45.
+!
+!    Tim Kelley,
+!    Iterative Methods for Linear and Nonlinear Equations,
+!    SIAM, 2004,
+!    ISBN: 0898713528,
+!    LC: QA297.8.K45.
+!
+!    Yousef Saad,
+!    Iterative Methods for Sparse Linear Systems,
+!    Second Edition,
+!    SIAM, 2003,
+!    ISBN: 0898715342,
+!    LC: QA188.S17.
+!
+!  Parameters:
+!
+!    Input, integer N, the order of the system.
+!
+!    Input, integer NZ_NUM, the number of nonzeros.
+!
+!    Input, integer IA(N+1), the compressed row indices.
+!
+!    Input/output, integer JA(NZ_NUM), the column indices.
+!    On output, these may have been rearranged by the sorting.
+!
+!    Input/output, real ( kind = rk ) A(NZ_NUM), the matrix values.  On output,
+!    the matrix values may have been moved somewhat because of the sorting.
+!
+  implicit none
+
+  integer, parameter :: rk = kind ( 1.0D+00 )
+
+  integer n
+  integer nz_num
+
+  real ( kind = rk ) a(nz_num)
+  integer i
+  integer ia(n+1)
+  integer i4temp
+  integer ja(nz_num)
+  integer k
+  integer l
+  real ( kind = rk ) r8temp
+
+  do i = 1, n
+
+    do k = ia(i), ia(i+1) - 2
+      do l = k + 1, ia(i+1) - 1
+
+        if ( ja(l) < ja(k) ) then
+          i4temp = ja(l)
+          ja(l)  = ja(k)
+          ja(k)  = i4temp
+
+          r8temp = a(l)
+          a(l)   = a(k)
+          a(k)   = r8temp
+        end if
+
+      end do
+    end do
+
+  end do
+
+  return
+end
+subroutine timestamp ( )
+
+!*****************************************************************************80
+!
+!! timestamp() prints the current YMDHMS date as a time stamp.
+!
+!  Example:
+!
+!    31 May 2001   9:45:54.872 AM
+!
+!  Licensing:
+!
+!    This code is distributed under the MIT license.
+!
+!  Modified:
+!
+!    18 May 2013
+!
+!  Author:
+!
+!    John Burkardt
+!
+  implicit none
+
+  character ( len = 8 ) ampm
+  integer d
+  integer h
+  integer m
+  integer mm
+  character ( len = 9 ), parameter, dimension(12) :: month = (/ &
+    'January  ', 'February ', 'March    ', 'April    ', &
+    'May      ', 'June     ', 'July     ', 'August   ', &
+    'September', 'October  ', 'November ', 'December ' /)
+  integer n
+  integer s
+  integer values(8)
+  integer y
+
+  call date_and_time ( values = values )
+
+  y = values(1)
+  m = values(2)
+  d = values(3)
+  h = values(5)
+  n = values(6)
+  s = values(7)
+  mm = values(8)
+
+  if ( h < 12 ) then
+    ampm = 'AM'
+  else if ( h == 12 ) then
+    if ( n == 0 .and. s == 0 ) then
+      ampm = 'Noon'
+    else
+      ampm = 'PM'
+    end if
+  else
+    h = h - 12
+    if ( h < 12 ) then
+      ampm = 'PM'
+    else if ( h == 12 ) then
+      if ( n == 0 .and. s == 0 ) then
+        ampm = 'Midnight'
+      else
+        ampm = 'AM'
+      end if
+    end if
+  end if
+
+  write ( *, '(i2.2,1x,a,1x,i4,2x,i2,a1,i2.2,a1,i2.2,a1,i3.3,1x,a)' ) &
+    d, trim ( month(m) ), y, h, ':', n, ':', s, '.', mm, trim ( ampm )
+
+  return
+end
diff --git a/src/module.f90 b/src/module.f90
index 1a2b8efc..68bc9843 100644
--- a/src/module.f90
+++ b/src/module.f90
@@ -450,7 +450,11 @@ module para
      logical :: valley_projection_calc ! for valley projection, only for sparse hamiltonina, you need to provide the valley operator 
      
      logical :: w3d_nested_calc = .false.
-     
+    
+     !> an integer to print the messages
+     !> iprint_level=3 : print all the debug messages
+     integer :: iprint_level = 1       
+
      namelist / Control / BulkBand_calc, BulkFS_calc,  BulkFS_Plane_calc, &
                           BulkFS_plane_stack_calc,  BulkGap_plane_calc, &
                           QPI_unfold_plane_calc, &
@@ -482,7 +486,7 @@ module para
                           LandauLevel_B_dos_calc,LanczosBand_calc,LanczosDos_calc, &
                           LandauLevel_B_calc, LandauLevel_kplane_calc,landau_chern_calc, &
                           FermiLevel_calc,ANE_calc, export_newhr,export_maghr,w3d_nested_calc, &
-                          valley_projection_calc
+                          valley_projection_calc, iprint_level
 
      integer :: Nslab  ! Number of slabs for 2d Slab system
      integer :: Nslab1 ! Number of slabs for 1D wire system
@@ -1062,6 +1066,9 @@ module para
      !> dimension number_group_operators, Num_atoms
      integer, allocatable :: imap_sym(:, :)
 
+     real(dp) :: time_cost_t1=0d0
+     real(dp) :: time_cost_t2=0d0
+     real(dp) :: time_cost_t3=0d0
 
  end module para
 
diff --git a/src/readHmnR.f90 b/src/readHmnR.f90
index 1fbaa9f1..d2d32174 100644
--- a/src/readHmnR.f90
+++ b/src/readHmnR.f90
@@ -181,6 +181,11 @@ subroutine readNormalHmnR()
       endif
    endif
 
+   !> get the ir index that the R(ir)=(0, 0, 0)
+   ir0=0
+   do ir=1, nrpts
+      if (irvec(1, ir)==0.and.irvec(2, ir)==0.and.irvec(3, ir)==0) ir0=ir
+   enddo
 
    !> Adding zeeman field
    !> Bx=Bdirection(1)
@@ -190,22 +195,19 @@ subroutine readNormalHmnR()
    !> sx, sy, sz are Pauli matrices.
    if (Add_Zeeman_Field) then
       if (abs(Zeeman_energy_in_eV)<eps6)then
-         Zeeman_energy_in_eV= Bmagnitude*Effective_gfactor*Bohr_magneton
+         Zeeman_energy_in_eV= Bmagnitude*Effective_gfactor*Bohr_magneton/eV2Hartree
          if (cpuid==0)then
-            write(stdout, '(1x, a, 3f16.6)')"Zeeman_energy_in_eV: ",  Zeeman_energy_in_eV/eV2Hartree
+            write(stdout, '(1x, a, 3f16.6)')"Zeeman_energy_in_eV: ",  Zeeman_energy_in_eV
          endif
       endif
+      call add_zeeman_normal_hr()
 
       !> After considering the Zeeman field, we already extended the spin space to spin-full.
       SOC = 1
-   endif ! Add_Zeeman_Field
 
+   endif ! Add_Zeeman_Field
 
    call get_stacking_direction_and_pos(add_electric_field, pos)
-   ir0=0
-   do ir=1, nrpts
-      if (irvec(1, ir)==0.and.irvec(2, ir)==0.and.irvec(3, ir)==0) ir0=ir
-   enddo
  
    if (add_electric_field>0) then
      io=0
@@ -797,7 +799,7 @@ subroutine readsparse_overlap
    1003 continue
    close(13)
 
-   if (cpuid.eq.0) write(stdout, '(a, i)')' >> Number of non-zeros splen_overlap_input', splen_overlap_input
+   if (cpuid.eq.0) write(stdout, '(a, i10)')' >> Number of non-zeros splen_overlap_input', splen_overlap_input
    if (cpuid.eq.0) write(stdout, '(a)')' >> Sparse overlap matrix reading finished '
 
    return
@@ -1129,71 +1131,70 @@ end subroutine reorder_wannierbasis
 
 
 
-!subroutine add_zeeman_normal_hr(Bx_in_Tesla, By_in_Tesla, Bz_in_Tesla)
-!  !> add Zeeman energy on the sparse hmnr based on the magnetic field strength
-!  !> magnetic_field_in_tesla is in Tesla
-!  !> Here the coordinate system is the same as user-specified in the LATTICE card.
-!  use para
-!  implicit none
-
-!  real(dp), intent(in) :: Bx_in_Tesla, By_in_Tesla, Bz_in_Tesla
-
-!  integer :: nwann_nsoc, i, j, ir0, ir
-!  real(dp) :: Zeeman_energy_in_eV_factor
-
-!  nwann_nsoc= Num_wann/2
-
-!  if (.not.Add_Zeeman_Field) return
-
-!  ir0=0
-!  do ir=1, Nrpts
-!     if (irvec(1,ir)==0.and.irvec(2,ir)==0.and.irvec(3,ir)==0) then
-!        ir0= ir
-!     endif
-!  enddo
-!  if (ir0==0) stop 'something wrong with irvec in subroutine add_zeeman_sparse_hr'
-
-!  Zeeman_energy_in_eV_factor= Effective_gfactor*Bohr_magneton
-
-!  if (SOC==0) then
-!     do ir=1, Nrpts
-!        do n=1, nwann_nsoc
-!           do m=1, nwann_nsoc
-!              if (irvec(1, ir)==0.and.irvec(2, ir)==0.and.irvec(3, ir)==0.and.n==m) then
-!                 HmnR(n+nwann_nsoc, m+nwann_nsoc, ir)= HmnR(n, m, ir)- Zeeman_energy_in_eV_factor/2d0*Bz_in_Tesla
-!                 HmnR(n, m, ir)            = HmnR(n, m, ir)+ Zeeman_energy_in_eV_factor/2d0*Bz_in_Tesla
-!                 HmnR(n      , m+nwann_nsoc, ir)=                 Zeeman_energy_in_eV_factor/2d0*Bx_in_Tesla &
-!                                                           - zi*Zeeman_energy_in_eV_factor/2d0*By_in_Tesla
-!                 HmnR(n+nwann_nsoc, m      , ir)=                 Zeeman_energy_in_eV_factor/2d0*Bx_in_Tesla &
-!                                                           + zi*Zeeman_energy_in_eV_factor/2d0*By_in_Tesla
-!              else
-!                 HmnR(n+nwann_nsoc, m+nwann_nsoc, ir)= HmnR(n, m, ir)
-!              endif ! R=0
-!           enddo ! m
-!        enddo ! n
-!     enddo ! ir
-!  else
-!     do ir=1, Nrpts
-!        if (irvec(1, ir)/=0.or.irvec(2, ir)/=0.or.irvec(3, ir)/=0) cycle
-!        do n=1, nwann_nsoc
-!           do m=1, nwann_nsoc
-!              if (n==m) then
-!                 HmnR(n+nwann_nsoc, m+nwann_nsoc, ir)= HmnR(n+nwann_nsoc, &
-!                 m+nwann_nsoc, ir)-Zeeman_energy_in_eV_factor/2d0*Bz_in_Tesla
-!                 HmnR(n      , m+nwann_nsoc, ir)= HmnR(n, m+nwann_nsoc, ir)+ &
-!                   Zeeman_energy_in_eV_factor/2d0*Bx_in_Tesla &
-!                                                           -zi*Zeeman_energy_in_eV_factor/2d0*By_in_Tesla
-!                 HmnR(n+nwann_nsoc, m      , ir)= HmnR(n+nwann_nsoc, m, ir)+  &
-!                  Zeeman_energy_in_eV_factor/2d0*Bx_in_Tesla &
-!                                                           +zi*Zeeman_energy_in_eV_factor/2d0*By_in_Tesla
-!                 HmnR(n, m, ir)= HmnR(n, m, ir)+Zeeman_energy_in_eV_factor/2d0*Bz_in_Tesla
-!              endif ! R=0
-!           enddo ! m
-!        enddo ! n
-!     enddo ! ir
-!  endif ! SOC
-
-
-!  return
-
-!end subroutine add_zeeman_normal_hr
+ subroutine add_zeeman_normal_hr()
+   !> add Zeeman energy on the sparse hmnr based on Zeeman_energy_in_eV and the magnetic field direction
+   !> Bx=Bdirection(1)
+   !> By=Bdirection(2)
+   !> Bz=Bdirection(3)
+   !> magnetic_field_in_tesla is in Tesla
+   !> Here the coordinate system is the same as user-specified in the LATTICE card.
+   !> H_zeeman
+   use para
+   implicit none
+
+   real(dp) :: Bdirection_x, Bdirection_y, Bdirection_z
+
+   integer :: nwann_nsoc, n, m, ir0, ir
+   real(dp) :: Zeeman_energy_in_au
+
+   if (.not.Add_Zeeman_Field) return
+
+   nwann_nsoc= Num_wann/2
+
+   ir0=0
+   do ir=1, Nrpts
+      if (irvec(1,ir)==0.and.irvec(2,ir)==0.and.irvec(3,ir)==0) then
+         ir0= ir
+      endif
+   enddo
+   if (ir0==0) stop 'something wrong with irvec in subroutine add_zeeman_sparse_hr'
+
+   if (abs(Zeeman_energy_in_eV)<eps9)then
+      !> in atomic unit
+      Zeeman_energy_in_au= Effective_gfactor*Bohr_magneton*Bmagnitude
+   else
+      Zeeman_energy_in_au= Zeeman_energy_in_eV*eV2Hartree
+   endif
+
+   if (SOC==0) then
+      !> We need to extend the spin up to the spin down part
+      do n=1, nwann_nsoc
+         HmnR(n, n, ir0)            = HmnR(n, n, ir0)+ Zeeman_energy_in_au/2d0*Bdirection(3)
+         HmnR(n+nwann_nsoc, n+nwann_nsoc, ir0)= HmnR(n, n, ir0)- Zeeman_energy_in_au/2d0*Bdirection(3)
+         HmnR(n      , n+nwann_nsoc, ir0)= Zeeman_energy_in_au/2d0*Bdirection(1) &
+                                         - zi*Zeeman_energy_in_au/2d0*Bdirection(2)
+         HmnR(n+nwann_nsoc, n      , ir0)= Zeeman_energy_in_au/2d0*Bdirection(1) &
+                                         + zi*Zeeman_energy_in_au/2d0*Bdirection(2)
+      enddo
+ 
+      do ir=1, Nrpts
+         do m=1, nwann_nsoc
+            do n=1, nwann_nsoc
+               HmnR(n+nwann_nsoc, m+nwann_nsoc, ir)= HmnR(n, m, ir)
+            enddo ! m
+         enddo ! n
+      enddo ! ir
+   else
+      do n=1, nwann_nsoc
+         HmnR(n, n, ir0)            = HmnR(n, n, ir0)+ Zeeman_energy_in_au/2d0*Bdirection(3)
+         HmnR(n+nwann_nsoc, n+nwann_nsoc, ir0)= HmnR(n, n, ir0)- Zeeman_energy_in_au/2d0*Bdirection(3)
+         HmnR(n      , n+nwann_nsoc, ir0)= Zeeman_energy_in_au/2d0*Bdirection(1) &
+                                         - zi*Zeeman_energy_in_au/2d0*Bdirection(2)
+         HmnR(n+nwann_nsoc, n      , ir0)= Zeeman_energy_in_au/2d0*Bdirection(1) &
+                                         + zi*Zeeman_energy_in_au/2d0*Bdirection(2)
+      enddo
+   endif ! SOC
+
+   return
+
+ end subroutine add_zeeman_normal_hr
diff --git a/src/readinput.f90 b/src/readinput.f90
index 2bb1f423..0f925549 100644
--- a/src/readinput.f90
+++ b/src/readinput.f90
@@ -562,7 +562,7 @@ subroutine readinput
    Magp_max=0   
    wcc_calc_tol= 0.08
    wcc_neighbour_tol= 0.3
-   NumLCZVecs=400
+   NumLCZVecs=100
    NumRandomConfs=1
    NumSelectedEigenVals=0 
    Beta= 100
@@ -3523,7 +3523,7 @@ subroutine readinput
 
    ! build the map between supercell (Origin_cell) and primitive cell (Folded_cell)
    if (BulkBand_unfold_line_calc.or.BulkBand_unfold_plane_calc.or.QPI_unfold_plane_calc.or.Landaulevel_unfold_line_calc)then
-      call build_map_supercell_primitivecell(Origin_cell, Folded_cell)
+      call build_map_supercell_primitivecell
    endif
 
    !> close wt.in
diff --git a/src/sigma_OHE.f90 b/src/sigma_OHE.f90
index 2e7886b9..93b8c0cb 100644
--- a/src/sigma_OHE.f90
+++ b/src/sigma_OHE.f90
@@ -78,6 +78,8 @@ subroutine sigma_ohe_calc_symm(mu_array, KBT_array, BTau_array, Nband_Fermi_Leve
       
       real(dp) :: time_start, time_end
       real(dp) :: time_start0, time_end0
+      real(dp) :: time_start1, time_end1
+      real(dp) :: time_rkf45, time_velocity, time_integral
       integer :: NSlice_Btau_inuse
 
       !> Btau slices for Runge-Kutta integration
@@ -117,7 +119,8 @@ subroutine sigma_ohe_calc_symm(mu_array, KBT_array, BTau_array, Nband_Fermi_Leve
       !> number of steps used in the Runge-Kutta integration
       integer :: NSlice_Btau
       integer :: NSlice_Btau_local
-      real(dp), allocatable :: kout(:, :)
+      real(dp), allocatable :: kout(:, :), kout_all(:, :)
+      real(dp), allocatable :: weights(:)
 
       !> define some arrays for different bands. Since there are different number
       !> of k points left for different bands.
@@ -148,7 +151,7 @@ subroutine sigma_ohe_calc_symm(mu_array, KBT_array, BTau_array, Nband_Fermi_Leve
 
       !> file index
       !integer, allocatable  :: myfileindex(:)
-      character(80) :: sigmafilename, bandname, tname, muname
+      character(80) :: sigmafilename, bandname, tname, muname, ibname, ikname, filename
 
 
       !> inverse of group operator
@@ -234,6 +237,7 @@ subroutine sigma_ohe_calc_symm(mu_array, KBT_array, BTau_array, Nband_Fermi_Leve
       Nk_start= KCube3D_total%Nk_start
       Nk_end= KCube3D_total%Nk_end
 
+      allocate( weights(Nslice_BTau_Max))
       allocate( Ek(Nband_Fermi_Level))
       allocate( Enk(Nk_start:Nk_end, Nband_Fermi_Level))
       allocate( velocity(3, Nk_start:Nk_end, Nband_Fermi_Level))
@@ -246,6 +250,7 @@ subroutine sigma_ohe_calc_symm(mu_array, KBT_array, BTau_array, Nband_Fermi_Leve
       velocity_k= 0d0
       velocity_bar= 0d0
       velocity_bar_k= 0d0
+      weights = 0d0
 
       time_start= 0d0
       time_end= 0d0
@@ -525,7 +530,9 @@ subroutine sigma_ohe_calc_symm(mu_array, KBT_array, BTau_array, Nband_Fermi_Leve
 
       !> a temp array used in RKFS
       allocate(kout(3, NSlice_Btau))
+      allocate(kout_all(3, NSlice_Btau))
       kout= 0d0
+      kout_all= 0d0
 
       !> allocate array for sigmafileindex and rhofileindex
       allocate(sigmafileindex(OmegaNum))
@@ -871,6 +878,7 @@ subroutine cal_sigma_iband_k
    
             !> Runge-Kutta only applied with BTauMax>0
             !> if the magnetic field is zero. 
+            call now(time_start1)
             if (BTauMax>eps3) then
                NSlice_Btau_inuse= NSlice_Btau
                call RKF45_pack(magnetic_field, bands_fermi_level(iband),  &
@@ -882,6 +890,8 @@ subroutine cal_sigma_iband_k
                enddo
                NSlice_Btau_inuse = NSlice_Btau
             endif
+            call now(time_end1)
+            time_rkf45= time_end1-time_start1
    
             if (NSlice_Btau_inuse==1) then
                write(stdout, '(a, i6, a, i4, a, i6, a, 3f12.6)')&
@@ -896,10 +906,12 @@ subroutine cal_sigma_iband_k
                cycle
             endif
   
+            call now(time_start1)
             if (NSlice_Btau_inuse==1)then  ! vxB=0
                call velocity_calc_iband(bands_fermi_level(iband), k_start, v_t)
                do ibtau=1, NSlice_Btau
                   kout(:, ibtau)= k_start(:)
+                  kout_all(:, ibtau)= k_start(:)
                   klist_iband(iband)%velocity_k(:, ibtau)= v_t
                enddo
                NSlice_Btau_inuse = NSlice_Btau
@@ -908,20 +920,39 @@ subroutine cal_sigma_iband_k
                   k= kout(:, it) 
                   call velocity_calc_iband(bands_fermi_level(iband), k, v_t)
                   klist_iband(iband)%velocity_k(:, it)= v_t
+                  kout_all(:, it) = k
                enddo ! integrate over time step
    
                !> periodic kpath in the BZ can be reused
                do i=2, NSlice_Btau_inuse/icycle
                   do j=1, icycle
                      klist_iband(iband)%velocity_k(:, j+(i-1)*icycle)= klist_iband(iband)%velocity_k(:, j)
+                     kout_all(:, j+(i-1)*icycle) = kout(:, j)
                   enddo
                enddo
                do i=(NSlice_Btau_inuse/icycle)*icycle+1, NSlice_Btau
                   klist_iband(iband)%velocity_k(:, i)= klist_iband(iband)%velocity_k(:, i-(NSlice_Btau_inuse/icycle)*icycle)
+                  kout_all(:, i) = kout(:, i-(NSlice_Btau_inuse/icycle)*icycle)
                enddo
             endif
+            call now(time_end1)
+            time_velocity=  time_end1- time_start1
+
+            if (cpuid==0.and.iprint_level==3) then
+               write(ibname, '(i6)')iband
+               write(ikname, '(i6)')ik
+               write(filename, '(5a)')'klist_ib_', trim(Adjustl(ibname)), '_ik', trim(Adjustl(ikname)), '.txt'
+               open(unit=324232, file=filename)
+
+               write(324232, *)'#icycle= ', icycle, ', NSlice_Btau= ', NSlice_Btau
+               do i=1, NSlice_Btau
+                  write(324232, "(i9, 6f16.6)") i, kout_all(:, i), klist_iband(iband)%velocity_k(:, i)
+               enddo
+               close(324232)
 
- 
+            endif
+
+            call now(time_start1)
             !> calculate the conductivity/tau
             do ikt=1, NumT
                KBT= KBT_array(ikt)
@@ -950,6 +981,11 @@ subroutine cal_sigma_iband_k
                         !> in the relaxation time approximation
                         v_k= klist_iband(iband)%velocity_k(:, 1+ishift)
                         if (BTau>eps3.and.NSlice_Btau_local>2) then
+                           ! set weight 
+                           do it=1, NSlice_Btau_local
+                              weights(it) = exp(-DeltaBtau*(it-1d0))
+                           enddo
+                           !> trapezoidal integral
                            velocity_bar_k= 0d0
                            do it=1, NSlice_Btau_local
                              velocity_bar_k= velocity_bar_k+ &
@@ -959,6 +995,23 @@ subroutine cal_sigma_iband_k
                            - 0.5d0*DeltaBtau*(exp(-(NSlice_Btau_local-1d0)*DeltaBtau)&
                            * klist_iband(iband)%velocity_k(:, NSlice_Btau_local+ishift)  &
                            + klist_iband(iband)%velocity_k(:, 1+ishift))
+
+                          !> Simpson's integral
+                          !> add the first and the last term
+                         ! velocity_bar_k=  weights(1)*klist_iband(iband)%velocity_k(:, 1)
+                         ! velocity_bar_k= velocity_bar_k+ weights(NSlice_Btau_local)*klist_iband(iband)%velocity_k(:, NSlice_Btau_local)
+                         !
+                         ! !> add the middle part
+                         ! do it = 2, NSlice_Btau_local-1, 2
+                         !    velocity_bar_k = velocity_bar_k + 4d0 * weights(it) * klist_iband(iband)%velocity_k(:, it)
+                         ! end do
+                         !
+                         ! do it = 3, NSlice_Btau_local-2, 2
+                         !    velocity_bar_k = velocity_bar_k + 2d0 * weights(it) * klist_iband(iband)%velocity_k(:, it)
+                         ! end do
+                         !
+                         ! velocity_bar_k = velocity_bar_k * (DeltaBtau / 3d0)
+
                         else
                            velocity_bar_k= v_k
                         endif
@@ -1019,9 +1072,15 @@ subroutine cal_sigma_iband_k
                   enddo ! ibtau  Btau
                enddo ! ie  mu
             enddo ! ikt KBT
+            call now(time_end1)
+            time_integral= time_end1- time_start1
+            if (cpuid.eq.0) write(stdout, '(a, i9, a, i10)')'>> icycle = ', icycle, ' , NSlice_Btau', NSlice_Btau
+            if (cpuid.eq.0) write(stdout, '(a, f16.2, a)')'>> time cost for RKF45_pack is    ', time_rkf45, ' s'
+            if (cpuid.eq.0) write(stdout, '(a, f16.2, a)')'>> time cost for velocity_calc is ', time_velocity, ' s'
+            if (cpuid.eq.0) write(stdout, '(a, f16.2, a)')'>> time cost for integral is      ', time_integral, ' s'
             call now(time_end)
             sigma_iband_k(iband)%time_cost_mpi(ik)= time_end- time_start
-            if (cpuid.eq.0) write(stdout, '(a, f16.2, a)')'>> time cost for this loop is ', time_end- time_start, ' s'
+            if (cpuid.eq.0) write(stdout, '(a, f16.2, a)')'>> time cost for this loop is     ', time_end- time_start, ' s'
          enddo ! ik  kpoints
    
       end subroutine cal_sigma_iband_k
@@ -1420,6 +1479,16 @@ subroutine velocity_calc_iband(iband, k, velocity_k)
       ! calculation bulk hamiltonian
       call ham_bulk_latticegauge(k, Hamk_bulk)
 
+!     call eigensystem_c( 'V', 'U', Num_wann, Hamk_bulk, W)
+
+!     !> Only the energy levels close to the Fermi level contribute to the conductivity
+!     if (abs(W(iband))/eV2Hartree>EF_broadening) then
+!        velocity_k= 0d0
+!        return
+!     endif
+
+!     UU= Hamk_bulk(:, iband)
+
       call zheevx_pack('V', 'U', Num_wann, iband, iband, Hamk_bulk, W, UU)
 
       !> Only the energy levels close to the Fermi level contribute to the conductivity
@@ -1558,7 +1627,7 @@ subroutine RKF45_pack(magnetic_field, iband, NSlice_Btau, k_start, Btau_start, B
          
 
          call periodic_diff(kout(:,2), kout(:,1), kdiff)
-         if (it>2)dis_smallest= norm(kdiff)
+         if (it>2)dis_smallest= norm(kdiff)/1 
 
          !> check if kout(:, it)==kout(:, 1)
          !> if it's a close orbit, we don't have to calculate all of them.
diff --git a/src/sparse.f90 b/src/sparse.f90
index e4ee7017..e0288880 100644
--- a/src/sparse.f90
+++ b/src/sparse.f90
@@ -196,10 +196,10 @@ module sparse
    public :: WTParCSRMatrixCreate
    public :: csr_sort_indices
    public :: csr_sum_duplicates
-#if defined (INTELMKL)
    public :: arpack_sparse_coo_eigs
    public :: arpack_sparse_coo_eigs_nonorth
-#endif
+   public :: csrmv_z
+   public :: coomv_z
 
 contains
 
@@ -2163,7 +2163,102 @@ subroutine csr_sum_duplicates(n, nnz, ia, ja, A_val)
          return
       end subroutine csr_sum_duplicates
 
-# if defined (INTELMKL)
+      !> csrmv_z multiplies a CSR matrix A times a vector x; y=A*x
+      !> inputs:
+      !> ndim, integer, the row dimension of the matrix
+      !> nnz, integer, number of non-zero entries, the dimension of A_csr,
+      !ja_csr
+      !> complex A_csr(nnz), integer ia_csr(ndim+1), ja_csr(nnz), the matrix in
+      !CSR Compresed Sparse Row format
+      !> complex x(ndim)
+      !> outputs:
+      !> complex y(ndim)
+      subroutine csrmv_z(ndim, nnz, A_csr, ia_csr, ja_csr, x, y)
+
+         use para, only : dp
+         implicit none
+
+         integer, intent(in) :: ndim
+         integer, intent(in) :: nnz
+         complex(dp), intent(in) :: A_csr(nnz)
+         integer, intent(in) :: ia_csr(ndim+1)
+         integer, intent(in) :: ja_csr(nnz)
+         complex(dp), intent(in) :: x(ndim)
+         complex(dp), intent(out) :: y(ndim)
+
+         integer :: i, k
+         complex(dp) :: t_z
+
+!        real(dp) :: time1, time2
+
+!        call now(time1)
+#if defined (INTELMKL)
+         call mkl_zcsrgemv('N', ndim, a_csr, ia_csr, ja_csr, x, y)
+#else
+      !> A naive implementation
+         do i=1, ndim
+            t_z= 0d0
+            do k=ia_csr(i), ia_csr(i+1)-1
+               t_z= t_z+ A_csr(k) * x(ja_csr(k))
+               
+            enddo
+            y(i) = t_z
+         enddo
+#endif
+
+!        call now(time2)
+!        time_total_debug= time_total_debug+ time2- time1
+         return
+
+      end subroutine csrmv_z
+
+      !> coomv_z multiplies a COO matrix A times a vector x; y=A*x
+      !> inputs:
+      !> ndim, integer, the row dimension of the matrix
+      !> nnz, integer, number of non-zero entries, the dimension of A_COO,
+      !ja_COO
+      !> complex A_COO(nnz), integer ia_COO(ndim+1), ja_COO(nnz), the matrix in
+      !COO Compresed Sparse Row format
+      !> complex x(ndim)
+      !> outputs:
+      !> complex y(ndim)
+      subroutine coomv_z(ndim, nnz, A_coo, ia_coo, ja_coo, x, y)
+
+         use para, only : dp
+         implicit none
+
+         integer, intent(in) :: ndim
+         integer, intent(in) :: nnz
+         complex(dp), intent(in) :: A_coo(nnz)
+         integer, intent(in) :: ia_coo(ndim+1)
+         integer, intent(in) :: ja_coo(nnz)
+         complex(dp), intent(in) :: x(ndim)
+         complex(dp), intent(out) :: y(ndim)
+
+         integer :: i, k
+         complex(dp) :: t_z
+
+!        real(dp) :: time1, time2
+
+!        call now(time1)
+#if defined (INTELMKL)
+         call mkl_zcoogemv('N', ndim, a_coo, ia_coo, ja_coo, nnz, x, y)
+#else
+      !> A naive implementation
+         do i=1, nnz
+            y(ia_coo(i)) = y(ia_coo(i))+ a_coo(i)* x(ja_coo(i))
+         enddo
+#endif
+
+!        call now(time2)
+!        time_total_debug= time_total_debug+ time2- time1
+         return
+
+      end subroutine coomv_z
+
+
+
+
       subroutine arpack_sparse_coo_eigs(ndims,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,deval,sigma,zeigv, ritzvec)
          use para, only : dp
          implicit none
@@ -2205,12 +2300,16 @@ subroutine arpack_sparse_coo_eigs(ndims,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,de
          complex(dp),intent(out) :: zeigv(ndims, nvecs)
         
          zeigv= 0d0
-         !> get eigenvalues of a sparse matrix by calling arpack subroutine
-         !> here acoo, icoo, jcoo are stored in COO format
-         !call zmat_arpack_zndrv1(ndims, nnzmax, nnz,  acoo, jcoo, icoo, sigma, neval, nvecs, deval, zeigv)
 
+#if defined (INTELMKL)
+         !> get eigenvalues of a sparse matrix by calling arpack subroutine
          !> acoo, jcoo, icoo would be converted in to A-sigma*I, then converted into CSR format
          call zmat_arpack_zndrv2(ndims, nnzmax, nnz, acoo, jcoo, icoo, sigma, neval, nvecs, deval, zeigv, ritzvec)
+#else
+         !> here acoo, icoo, jcoo are stored in COO format
+         !> use matrix vector multiplication
+         call zmat_arpack_zndrv1(ndims, nnzmax, nnz,  acoo, jcoo, icoo, sigma, neval, nvecs, deval, zeigv, ritzvec)
+#endif
 
 
          return
@@ -2339,6 +2438,9 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
 ! axuiliary integer variables
          integer :: itmp, iter
 
+         real(dp) :: time_cost_mv, time_cost_zgesv, time_cost_convert,  time_cost_znaupd
+         real(dp) :: time_start, time_end
+
 ! auxiliary real(dp) variables
          real(dp) :: dtmp
 
@@ -2425,6 +2527,10 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
          bmat  = 'G'
          which = 'LM'
 
+
+         time_cost_mv= 0d0; time_cost_zgesv= 0d0; time_cost_convert= 0d0
+         time_cost_znaupd= 0d0
+
          !
          !     %----------------------------------------------------%
          !     | Construct C = A - SIGMA*S                          |
@@ -2440,6 +2546,7 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
          if (nnz>nnzmax) stop "ERROR: in zndrv4, nnz is larger than nnzmax"
 
 
+         call now(time_start)
          !> prepare hamiltonian
          !> transform coo format to csr format
          call ConvertCooToCsr(ndims, nnz, acsr, icsr, jcsr, iwk)
@@ -2454,6 +2561,8 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
          !> eleminate the same entries in the sparse matrix
          call csr_sum_duplicates(ndims, snnz, sicsr, sjcsr, sacsr)
          call csr_sort_indices(ndims, snnz, sicsr, sjcsr, sacsr)
+         call now(time_end)
+         time_cost_convert= time_end- time_start
 
 !
 !     %---------------------------------------------------%
@@ -2505,8 +2614,15 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
 !        | has been exceeded.                          |
 !        %---------------------------------------------%
 !
+         call now(time_start)
+#if defined (ARPACK)
          call znaupd ( ido, bmat, n, which, nev, tol, resid, ncv, &
             zeigv, ldv, iparam, ipntr, workd, workl, lworkl, rwork, info )
+#else
+         STOP "ERROR : Please install WannierTools with ARPACK"
+#endif
+         call now(time_end)
+         time_cost_znaupd=  time_cost_znaupd+ time_end- time_start
 
          iter = iter + 1
          if (mod(iter,100) .eq. 0 .and. cpuid==0) then
@@ -2531,10 +2647,18 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
 !
  
             !> first perform S*x
-            call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call now(time_start)
+            !call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call csrmv_z(ndims, snnz, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call now(time_end)
+            time_cost_mv=  time_cost_mv+ time_end- time_start
             call zcopy (ndims, workd(ipntr(2)),  1, workd(ipntr(1)), 1)
             !> then perform inv[A-SIGMA*S]*S*x
-            call zmat_mkldss_zgesv(ndims, nnz, acsr, jcsr, icsr, workd(ipntr(1)), workd(ipntr(2)))
+            call now(time_start)
+            !call zmat_mkldss_zgesv(ndims, nnz, acsr, jcsr, icsr, workd(ipntr(1)), workd(ipntr(2)))
+            call sparse_solver(ndims, nnz, acsr, icsr, jcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call now(time_end)
+            time_cost_zgesv= time_cost_zgesv+ time_cost_mv+ time_end- time_start
 
 !           %-----------------------------------------%
 !           | L O O P   B A C K to call ZNAUPD again. |
@@ -2554,7 +2678,11 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
 !           |    y  =workd(ipntr(2))
 !           %-----------------------------------------%
 !
-            call zmat_mkldss_zgesv(ndims, nnz, acsr, jcsr, icsr, workd(ipntr(3)), workd(ipntr(2)))
+            call now(time_start)
+            !call zmat_mkldss_zgesv(ndims, nnz, acsr, jcsr, icsr, workd(ipntr(3)), workd(ipntr(2)))
+            call sparse_solver(ndims, nnz, acsr, icsr, jcsr, workd(ipntr(3)), workd(ipntr(2)))
+            call now(time_end)
+            time_cost_zgesv= time_cost_zgesv+ time_cost_mv+ time_end- time_start
 
             go to 10
 !
@@ -2570,7 +2698,11 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
 !           |    y = workd(ipntr(2))
 !           %-------------------------------------%
 !
-            call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call now(time_start)
+            !call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call csrmv_z(ndims, snnz, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call now(time_end)
+            time_cost_mv=  time_cost_mv+ time_end- time_start
 !
 !           %-----------------------------------------%
 !           | L O O P   B A C K to call ZNAUPD  again. |
@@ -2613,10 +2745,14 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
             rvec = .false.
             if (LandauLevel_wavefunction_calc.or.SlabBand_calc) rvec = .true.
 
+#if defined (ARPACK)
             call zneupd (rvec, 'A', select, d, zeigv, ldv, sigma, &
                workev, bmat, n, which, nev, tol, resid, ncv,&
                zeigv, ldv, iparam, ipntr, workd, workl, lworkl, &
                rwork, ierr)
+#else
+         STOP "ERROR : Please install WannierTools with ARPACK"
+#endif
 !
 !        %----------------------------------------------%
 !        | Eigenvalues are returned in the one          |
@@ -2660,8 +2796,10 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
 !               %---------------------------%
 !
 
-                  call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, zeigV(1,j), mx)
-                  call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, zeigV(1,j), ax)
+                  !call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, zeigV(1,j), mx)
+                  call csrmv_z(ndims, snnz, sacsr, sicsr, sjcsr, zeigV(1,j), mx)
+                  !call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, zeigV(1,j), ax)
+                  call csrmv_z(ndims, nnz, acsr, icsr, jcsr, zeigV(1,j), ax)
                   call zaxpy(n, -d(j), mx, 1, ax, 1)
                   rd(j,1) = dble(d(j))
                   rd(j,2) = dimag(d(j))
@@ -2709,6 +2847,13 @@ subroutine zmat_arpack_zndrv4(ndims, nnzmax, nnz, acsr, jcsr, icsr, &
             if (cpuid==0) write(stdout, *) ' The number of OP*x is ', iparam(9)
             if (cpuid==0) write(stdout, *) ' The convergence criterion is ', tol
             if (cpuid==0) write(stdout, *) ' '
+            if (cpuid==0) write(stdout, '(a,f8.4, " s")') ' time_cost_zgesv: ', time_cost_zgesv
+            if (cpuid==0) write(stdout, '(a,f8.4, " s")') ' time_cost_mv: ', time_cost_mv
+            if (cpuid==0) write(stdout, '(a,f8.4, " s")') ' time_cost_t1: ', time_cost_t1
+            if (cpuid==0) write(stdout, '(a,f8.4, " s")') ' time_cost_t2: ', time_cost_t2
+            if (cpuid==0) write(stdout, '(a,f8.4, " s")') ' time_cost_t3: ', time_cost_t3
+            if (cpuid==0) write(stdout, '(a,f8.4, " s")') ' time_cost_convert', time_cost_convert
+            if (cpuid==0) write(stdout, '(a,f8.4, " s")') ' time_cost_znaupd:', time_cost_znaupd
 
          end if
 !
@@ -2954,8 +3099,12 @@ subroutine zmat_arpack_zndrv3(ndims, nnzmax, nnz, acsr, jcsr, icsr, snnzmax, snn
 !        | has been exceeded.                          |
 !        %---------------------------------------------%
 !
+#if defined (ARPACK)
          call znaupd ( ido, bmat, n, which, nev, tol, resid, ncv, &
             zeigv, ldv, iparam, ipntr, workd, workl, lworkl, rwork, info )
+#else
+         STOP "ERROR : Please install WannierTools with ARPACK"
+#endif
 
          iter = iter + 1
          if (mod(iter,100) .eq. 0 .and. cpuid==0) then
@@ -2976,10 +3125,12 @@ subroutine zmat_arpack_zndrv3(ndims, nnzmax, nnz, acsr, jcsr, icsr, snnzmax, snn
 !
  
             !> first perform A*x
-            call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, workd(ipntr(1)), workd(ipntr(2)))
+            !call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call csrmv_z(ndims, nnz, acsr, icsr, jcsr, workd(ipntr(1)), workd(ipntr(2)))
             call zcopy (ndims, workd(ipntr(2)),  1, workd(ipntr(1)), 1)
             !> first perform inv[S]*A*x
-            call zmat_mkldss_zgesv(ndims, snnz, sacsr, sjcsr, sicsr, workd(ipntr(1)), workd(ipntr(2)))
+            !call zmat_mkldss_zgesv(ndims, snnz, sacsr, sjcsr, sicsr, workd(ipntr(1)), workd(ipntr(2)))
+            call sparse_solver(ndims, snnz, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
 !
 !           %-----------------------------------------%
 !           | L O O P   B A C K to call ZNAUPD again. |
@@ -2996,7 +3147,8 @@ subroutine zmat_arpack_zndrv3(ndims, nnzmax, nnz, acsr, jcsr, icsr, snnzmax, snn
 !           | workd(ipntr(2)).                    |
 !           %-------------------------------------%
 !
-            call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
+            !call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call csrmv_z(ndims, snnz, sacsr, sicsr, sjcsr, workd(ipntr(1)), workd(ipntr(2)))
 !
 !           %-----------------------------------------%
 !           | L O O P   B A C K to call ZNAUPD  again. |
@@ -3039,10 +3191,14 @@ subroutine zmat_arpack_zndrv3(ndims, nnzmax, nnz, acsr, jcsr, icsr, snnzmax, snn
             rvec = .false.
             if (LandauLevel_wavefunction_calc.or.SlabBand_calc) rvec = .true.
 
+#if defined (ARPACK)
             call zneupd (rvec, 'A', select, d, zeigv, ldv, sigma, &
                workev, bmat, n, which, nev, tol, resid, ncv,&
                zeigv, ldv, iparam, ipntr, workd, workl, lworkl, &
                rwork, ierr)
+#else
+         STOP "ERROR : Please install WannierTools with ARPACK"
+#endif
 !
 !        %----------------------------------------------%
 !        | Eigenvalues are returned in the one          |
@@ -3086,8 +3242,10 @@ subroutine zmat_arpack_zndrv3(ndims, nnzmax, nnz, acsr, jcsr, icsr, snnzmax, snn
 !               %---------------------------%
 !
 
-                  call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, zeigV(1,j), mx)
-                  call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, zeigV(1,j), ax)
+                  !call mkl_zcsrgemv('N', ndims, sacsr, sicsr, sjcsr, zeigV(1,j), mx)
+                  call csrmv_z(ndims, snnz, sacsr, sicsr, sjcsr, zeigV(1,j), mx)
+                  !call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, zeigV(1,j), ax)
+                  call csrmv_z(ndims, nnz, acsr, icsr, jcsr, zeigV(1,j), ax)
                   call zaxpy(n, -d(j), mx, 1, ax, 1)
                   rd(j,1) = dble(d(j))
                   rd(j,2) = dimag(d(j))
@@ -3168,7 +3326,7 @@ subroutine zmat_arpack_zndrv3(ndims, nnzmax, nnz, acsr, jcsr, icsr, snnzmax, snn
          return
       end subroutine zmat_arpack_zndrv3
 
-      subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval, nvecs, deval, zeigv)
+      subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval, nvecs, deval, zeigv, ritzvec)
          use para, only : dp, stdout, cpuid
          implicit none
 
@@ -3202,6 +3360,9 @@ subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval
 ! eigenvector for selected "which"
          complex(dp), intent(out) :: zeigv(ndims, nvecs)
 
+!> calculate eigenvector or not
+         logical, intent(in) :: ritzvec
+
 ! loop index over neval
          integer :: ival, jval
 
@@ -3237,12 +3398,11 @@ subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval
 !     %---------------%
 !
          character  :: bmat*1, which*2
-         integer    :: ido, n, nev, ncv, lworkl, info, j, &
+         integer    :: ido, n, nev, ncv, lworkl, info, i, j, &
             ierr, nconv, maxitr, ishfts, mode, ldv
-         integer    :: istat
+         integer    :: istat 
          integer    :: lwrkl, lwrkv, lwrkd
          real(dp)    :: tol
-         logical    :: rvec
 !
 !     %-----------------------------%
 !     | BLAS & LAPACK routines used |
@@ -3297,6 +3457,15 @@ subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval
          bmat  = 'I'
          which = 'SM'
 
+         !> added in the diagonal part, shift the spectrum by sigma
+         do i=1, ndims
+            j=i+nnz
+            icsr(j)= i
+            jcsr(j)= i
+            acsr(j)= -sigma
+         enddo
+         nnz= nnz+ ndims
+
          !> prepare hamiltonian
          !> transform coo format to csr format
          call ConvertCooToCsr(ndims, nnz, acsr, icsr, jcsr, iwk)
@@ -3355,8 +3524,12 @@ subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval
 !        | has been exceeded.                          |
 !        %---------------------------------------------%
 !
+#if defined (ARPACK)
          call znaupd ( ido, bmat, n, which, nev, tol, resid, ncv, &
             zeigv, ldv, iparam, ipntr, workd, workl, lworkl, rwork, info )
+#else
+         STOP "ERROR : Please install WannierTools with ARPACK"
+#endif
 
          iter = iter + 1
          if (mod(iter,100) .eq. 0 .and. cpuid==0) then
@@ -3375,7 +3548,8 @@ subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval
 !           | product to workd(ipntr(2)).               |
 !           %-------------------------------------------%
 !
-            call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, workd(ipntr(1)), workd(ipntr(2)))
+            !call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, workd(ipntr(1)), workd(ipntr(2)))
+            call csrmv_z(ndims, nnz, acsr, icsr, jcsr, workd(ipntr(1)), workd(ipntr(2)))
 !
 !           %-----------------------------------------%
 !           | L O O P   B A C K to call ZNAUPD again. |
@@ -3415,13 +3589,14 @@ subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval
 !        | desired.  (indicated by rvec = .true.)    |
 !        %-------------------------------------------%
 !
-            rvec = .false.
-            if (LandauLevel_wavefunction_calc.or.SlabBand_calc) rvec = .true.
-
-            call zneupd (rvec, 'A', select, d, zeigv, ldv, sigma, &
+#if defined (ARPACK)
+            call zneupd (ritzvec, 'A', select, d, zeigv, ldv, sigma, &
                workev, bmat, n, which, nev, tol, resid, ncv,&
                zeigv, ldv, iparam, ipntr, workd, workl, lworkl, &
                rwork, ierr)
+#else
+         STOP "ERROR : Please install WannierTools with ARPACK"
+#endif
 !
 !        %----------------------------------------------%
 !        | Eigenvalues are returned in the one          |
@@ -3464,10 +3639,8 @@ subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval
 !               | tolerance)                |
 !               %---------------------------%
 !
-                  !$ call av(nx, zeigv(1,j), ax)
-
-                  !$ call zmat_sparse_csrmv0(ndims, nnzmax, acsr, jcsr, icsr, zeigv(1,j), ax)
-                  call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, zeigV(1,j), ax)
+                  !call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, zeigV(1,j), ax)
+                  call csrmv_z(ndims, nnz, acsr, icsr, jcsr, zeigV(1,j), ax)
                   call zaxpy(n, -d(j), zeigv(1,j), 1, ax, 1)
                   rd(j,1) = dble(d(j))
                   rd(j,2) = dimag(d(j))
@@ -3521,9 +3694,9 @@ subroutine zmat_arpack_zndrv1(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma, neval
 !     | Done with program zndrv1. |
 !     %---------------------------%
 !
-
+         !shift back the spectrum by sigma
          do ival=1,neval
-            deval(ival) = real(d(ival))
+            deval(ival) = real(d(ival)+sigma)
            !zeigv(1:ndims, ival) = V(1:ndims, ival)
          enddo ! over ival={1,neval} loop
 
@@ -3774,8 +3947,12 @@ subroutine zmat_arpack_zndrv2(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma,neval,
          !    write(*,*) 'ncv,nev',ncv,nev,n
 
          
+#if defined (ARPACK)
          call znaupd ( ido, bmat, n, which, nev, tol, resid, ncv, &
             zeigv, ldv, iparam, ipntr, workd, workl, lworkl, rwork,info )
+#else
+         STOP "ERROR : Please install WannierTools with ARPACK"
+#endif
 
          iter = iter + 1
          if (mod(iter,10) .eq. 0 .and. cpuid==0 ) then
@@ -3795,8 +3972,8 @@ subroutine zmat_arpack_zndrv2(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma,neval,
             !           call zcopy( n, workd(ipntr(1)),1, workd(ipntr(2)), 1)
             !
             !           call zgttrs('N', n, 1, dl, dd, du, du2, ipiv, workd(ipntr(2)), n, ierr)
-!        call sparse_zgesv_csr(ndims,nnz,acsr,jcsr,icsr,workd(ipntr(1)), workd(ipntr(2)))
-            call zmat_mkldss_zgesv(ndims, nnz, acsr, jcsr, icsr, workd(ipntr(1)), workd(ipntr(2)))
+            !call zmat_mkldss_zgesv(ndims, nnz, acsr, jcsr, icsr, workd(ipntr(1)), workd(ipntr(2)))
+            call sparse_solver(ndims, nnz, acsr, icsr, jcsr, workd(ipntr(1)), workd(ipntr(2)))
             !
             !           %-----------------------------------------%
             !           | L O O P   B A C K to call ZNAUPD again. |
@@ -3836,10 +4013,14 @@ subroutine zmat_arpack_zndrv2(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma,neval,
             !        %-------------------------------------------%
             !
 
+#if defined (ARPACK)
             call zneupd (ritzvec, 'A', select, d, zeigv, ldv, sigma, &
                workev, bmat, n, which, nev, tol, resid, ncv,&
                zeigv, ldv, iparam, ipntr, workd, workl, lworkl, &
                rwork, ierr)
+#else
+         STOP "ERROR : Please install WannierTools with ARPACK"
+#endif
             !
             !        %----------------------------------------------%
             !        | Eigenvalues are returned in the one          |
@@ -3882,7 +4063,8 @@ subroutine zmat_arpack_zndrv2(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma,neval,
                   !              %---------------------------%
                   !
                   !$ call av(n, zeigv(1,j), ax)
-                  call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, zeigv(1,j), ax)
+                  !call mkl_zcsrgemv('N', ndims, acsr, icsr, jcsr, zeigv(1,j), ax)
+                  call csrmv_z(ndims, nnz, acsr, icsr, jcsr, zeigV(1,j), ax)
                   call zaxpy(n, -d(j), zeigv(1,j), 1, ax, 1)
                   rd(j,1) = dble(d(j))
                   rd(j,2) = dimag(d(j))
@@ -3977,84 +4159,6 @@ subroutine zmat_arpack_zndrv2(ndims, nnzmax, nnz, acsr, jcsr, icsr, sigma,neval,
          return
       end subroutine zmat_arpack_zndrv2
 
-      subroutine zmat_mkldss_zgesv(ndims, nnzmax, acsr, jcsr, icsr, xvec, yvec)
-         use mkl_dss
-         use para, only : dp, stdout
-
-         implicit none
-
-         ! external arguments
-         ! dimension of matrix A
-         integer, intent(in) :: ndims
-
-         ! non-zero elements of matrix A
-         integer, intent(in) :: nnzmax
-
-         ! compressed sparse row storage of matrix A
-         complex(dp), intent(in) :: acsr(nnzmax)
-         integer, intent(in) :: jcsr(nnzmax)
-         integer, intent(in) :: icsr(ndims+1)
-
-         ! xvec vector with dimension "ndims"
-         complex(dp), intent(in) :: xvec(ndims)
-
-         ! yvec vector with dimension "ndims"
-         complex(dp), intent(out) :: yvec(ndims)
-
-         ! local variables
-         type(mkl_dss_handle) :: handle
-         integer :: nrhs
-         integer :: ierr
-         integer :: perm(1)
-
-         !>>> step 0: setup the problem to be solved
-         nrhs = 1
-         perm(1) = 0
-
-         !>>> step 1: initialize the solver.
-         ierr = dss_create( handle, mkl_dss_defaults )
-         if (ierr .ne. mkl_dss_success) then
-            write(*,*) "mkl_dss_create returned error code", ierr; stop
-         endif 
-
-         !>>> step 2: define the non-zero structure of the matrix.
-         !ierr = dss_define_structure( handle, mkl_dss_symmetric_structure_complex, icsr, ndims, ndims, jcsr, nnzmax )
-         ierr = dss_define_structure( handle, MKL_DSS_NON_SYMMETRIC_COMPLEX, icsr, ndims, ndims, jcsr, nnzmax )
-         if (ierr .ne. mkl_dss_success) then
-            write(*,*) "mkl_dss_define_structure returned error code", ierr; stop
-         endif ! back if (ierr .ne. mkl_dss_success} block
-
-         !>>> step 3: reorder the matrix.
-         ierr = dss_reorder( handle, mkl_dss_defaults, perm )
-         if (ierr .ne. mkl_dss_success) then
-            write(*,*) "mkl_dss_reorder returned error code", ierr; stop
-         endif ! back if (ierr .ne. mkl_dss_success} block
-
-         !>>> step 4: factor the matrix.
-         ierr = dss_factor_complex( handle, mkl_dss_defaults, acsr )
-         if (ierr .ne. mkl_dss_success) then
-            write(*,*) "mkl_dss_factor_complex returned error code", ierr; stop
-         endif ! back if (ierr .ne. mkl_dss_success} block
-
-         ! allocate the solution vector and solve the problem.
-         ierr = dss_solve_complex( handle, mkl_dss_defaults, xvec, nrhs, yvec )
-
-         if (ierr .ne. mkl_dss_success) then
-            write(*,*) "mkl_dss_solve_complex returned error code", ierr; stop
-         endif ! back if (ierr .ne. mkl_dss_success} block
-
-         ! deallocate solver storage and various local arrays.
-         ierr = dss_delete( handle, mkl_dss_defaults )
-
-         if (ierr .ne. mkl_dss_success) then
-            write(*,*) "mkl_dss_solver returned error code", ierr; stop
-         endif ! back if (ierr .ne. mkl_dss_success} block
-
-         return
-      end subroutine zmat_mkldss_zgesv
-#endif
-
-
    end module sparse
 
 
diff --git a/src/superlu_config.fh b/src/superlu_config.fh
new file mode 100644
index 00000000..bacb6183
--- /dev/null
+++ b/src/superlu_config.fh
@@ -0,0 +1,22 @@
+
+#ifndef SUPERLU_CONFIG_H
+#define SUPERLU_CONFIG_H
+
+
+
+
+
+
+
+
+
+
+
+#if (XSDK_INDEX_SIZE == 64)
+#include <stdint.h>
+#define _LONGINT 1
+#else
+#endif
+
+#endif 
+
diff --git a/src/unfolding.f90 b/src/unfolding.f90
index 99aeca8f..f9caf013 100644
--- a/src/unfolding.f90
+++ b/src/unfolding.f90
@@ -176,9 +176,7 @@ subroutine unfolding_kpath
             W= 0d0
             !> after arpack_sparse_coo_eigs, nnz will be updated.
             ritzvec= .true.
-#if defined (INTELMKL)
             call arpack_sparse_coo_eigs(Ndimq,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,W,sigma, zeigv, ritzvec)
-#endif
          else
             call ham_3Dlandau(Ndimq, Magq, k_SBZ_direct, hamk_bulk)
             zeigv=hamk_bulk
@@ -197,9 +195,7 @@ subroutine unfolding_kpath
             W= 0d0
             !> after arpack_sparse_coo_eigs, nnz will be updated.
             ritzvec= .true.
-#if defined (INTELMKL)
             call arpack_sparse_coo_eigs(Num_wann,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,W,sigma, zeigv, ritzvec)
-#endif
             call now(time3)
          else
             ! dense hr
@@ -490,9 +486,7 @@ subroutine unfolding_kplane
          W= 0d0
          !> after arpack_sparse_coo_eigs, nnz will be updated.
          ritzvec= .true.
-#if defined (INTELMKL)
          call arpack_sparse_coo_eigs(Num_wann,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,W,sigma, zeigv, ritzvec)
-#endif
          call now(time3)
       else
  
@@ -505,9 +499,7 @@ subroutine unfolding_kplane
             W= 0d0
             !> after arpack_sparse_coo_eigs, nnz will be updated.
             ritzvec= .true.
-#if defined (INTELMKL)
             call arpack_sparse_coo_eigs(Num_wann,nnzmax,nnz,acoo,jcoo,icoo,neval,nvecs,W,sigma, zeigv, ritzvec)
-#endif
             call now(time3)
          else
             ! dense hr
diff --git a/src/wanniercenter.f90 b/src/wanniercenter.f90
index 70529e25..2a1664e6 100644
--- a/src/wanniercenter.f90
+++ b/src/wanniercenter.f90
@@ -1657,7 +1657,7 @@ subroutine  wannier_center3D_plane_mirror_minus
    real(dp) :: maxgap0
 
    !> Z2 calculation for time reversal invariant system
-   integer :: Z2
+   real(dp) :: Z2
    integer :: Delta
 
    real(dp) :: g
@@ -2042,7 +2042,7 @@ subroutine  wannier_center3D_plane_mirror_plus
    real(dp) :: maxgap0
 
    !> Z2 calculation for time reversal invariant system
-   integer :: Z2
+   real(dp) :: Z2
    integer :: Delta
 
    real(dp) :: g
@@ -2344,7 +2344,7 @@ subroutine  wannier_center3D_plane0
    real(dp), allocatable :: largestgap(:)
 
    !> Z2 calculation for time reversal invariant system
-   integer :: Z2
+   real(dp) :: Z2
 
    allocate(wcc(NumberofSelectedOccupiedBands, Nk2))
    allocate(largestgap(Nk2))
@@ -2471,7 +2471,7 @@ subroutine  wannier_center3D_plane_func(kstart, kvec1, kvec2, largest_gap, wcc,
       real(dp) :: maxgap0
 
       !> Z2 calculation for time reversal invariant system
-      integer :: Z2
+      real(dp) :: Z2
       integer :: Delta
 
       real(dp) :: g
@@ -2719,7 +2719,7 @@ subroutine  wannier_center3D_k
    real(dp), allocatable :: largestgap(:)
 
    !> Z2 calculation for time reversal invariant system
-   integer :: Z2
+   real(dp) :: Z2
 
    real(dp),allocatable :: kpoints(:,:,:)
    integer :: nkp1,nkp2
@@ -2879,7 +2879,7 @@ subroutine  wannier_center3D_kpath(kpoints, nkp1, nkp2, largest_gap, wcc, Z2)
    real(dp) :: maxgap0
 
    !> Z2 calculation for time reversal invariant system
-   integer :: Z2
+   real(dp) :: Z2
    integer :: Delta
 
    real(dp) :: g
@@ -3863,8 +3863,8 @@ subroutine  Z2_3D
    real(dp) :: kvec1(3)
    real(dp) :: kvec2(3)
 
-   integer :: Z2
-   integer :: Z2_all(6)
+   real(dp) :: Z2
+   real(dp) :: Z2_all(6)
 
    !> wannier centers for each ky, bands
    real(dp), allocatable :: wcc(:, :)
diff --git a/src/wt_aux.f90 b/src/wt_aux.f90
index c1fae869..e4a981e4 100644
--- a/src/wt_aux.f90
+++ b/src/wt_aux.f90
@@ -13,8 +13,8 @@ subroutine now(time_now)
      real(dp), intent(inout)  :: time_now
 
      call Date_and_time(values=time_new)
-     time_now= time_new(3)*24*3600+time_new(5)*3600+&
-               time_new(6)*60+time_new(7)+time_new(8)/1000d0
+     time_now= time_new(3)*24d0*3600d0+time_new(5)*3600d0+&
+               time_new(6)*60d0+time_new(7)+time_new(8)/1000d0
   
      return
   end subroutine now
diff --git a/utility/twisted_graphene_system_kp_model/system.in b/utility/twisted_graphene_system_kp_model/system.in
index 26043a06..b30bc032 100644
--- a/utility/twisted_graphene_system_kp_model/system.in
+++ b/utility/twisted_graphene_system_kp_model/system.in
@@ -1,5 +1,5 @@
    &PARAMETERS
-   number_layers = 4
+   number_layers = 2
 
    !> magic angle
    twisted_angle_degree= 1.0661
@@ -25,6 +25,7 @@
    u_AA=0.0797  ! eV  parameters from Phys. Rev. X 8, 031087
    u_AB=0.0975  ! eV 
    vf=5.253084  ! eV/Angstrom 2.1354*2.46
+   gamma_4 = 0.03
 
   !u_AA=0.000   ! eV parameters from PhysRevLett.122.106405
   !u_AB=0.110   ! eV