diff --git a/Project.toml b/Project.toml
index 0bcd17d..a1e34ce 100644
--- a/Project.toml
+++ b/Project.toml
@@ -2,7 +2,7 @@ name = "LLLplus"
 uuid = "142c1900-a1c3-58ae-a66d-b187f9ca6423"
 keywords = ["lattice reduction", "lattice basis reduction", "SVP", "shortest vector problem", "CVP", "closest vector problem", "LLL", "Lenstra-Lenstra-Lovász", "Seysen", "Brun", "VBLAST", "subset-sum problem", "Lagarias-Odlyzko", "Bailey–Borwein–Plouffe formula"]
 license = "MIT"
-version = "1.3.4"
+version = "1.3.5"
 
 [deps]
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
diff --git a/README.md b/README.md
index ec2f065..6560ca5 100644
--- a/README.md
+++ b/README.md
@@ -3,10 +3,9 @@
 [![Build Status](https://github.com/christianpeel/LLLplus.jl/workflows/CI/badge.svg)](https://github.com/christianpeel/LLLplus.jl/actions)
 [![](https://img.shields.io/badge/docs-devel-blue.svg)](https://christianpeel.github.io/LLLplus.jl/dev)
 
-LLLplus provides lattice tools such as
-Lenstra-Lenstra-Lovász (LLL) lattice reduction which are of practical and
-theoretical use in cryptography, digital communication, integer
-programming, and more.
+LLLplus provides lattice tools such as Lenstra-Lenstra-Lovász (LLL)
+lattice reduction which are of practical and theoretical use in
+cryptography, digital communication, integer programming, and more.
 This package is experimental and not a robust tool; use at your own
 risk :-)
 
@@ -26,7 +25,7 @@ functions are also included; see the  `subsetsum`, `minimalpolynomial`,
 `integerfeasibility`, `rationalapprox`, and  `spigotBBP` functions.
 
 <details>
-   <summary><b>Examples</b> (click for details)</summary>
+   <summary><b>Basic Examples</b> (click for details)</summary>
 <p>
 
 Each function contains documentation and examples available via Julia's
@@ -45,10 +44,10 @@ using LLLplus
 # do lattice reduction on a matrix with randn entries
 N = 40;
 H = randn(N,N);
-B,T = brun(H);
-B,T = lll(H);
-B,T = seysen(H);
-B,T = hkz(H);
+Bbrun,_ = brun(H);
+Blll,_ = lll(H);
+Bseysen,_ = seysen(H);
+Bhkz,_ = hkz(H);
 
 # check out the CVP solver
 Q,Rtmp=qr(H); R = UpperTriangular(Rtmp);
@@ -64,54 +63,18 @@ sum(abs.(u-uhat))
    <summary><b>Execution Time results</b> (click for details)</summary>
 <p>
 
-In the first test we compare several LLL functions: the `lll` function from LLLplus, the
-`l2avx` function in the `src\l2.jl` file in LLLplus, the
-`lll_with_transform` function from
-[Nemo.jl](https://github.com/Nemocas/Nemo.jl) (which uses FLINT), and
-the `lll_reduction` function from
-[fplll](https://github.com/fplll/fplll).  Nemo is written by number
-theorists, while fplll is written
-by lattice cryptanalysis academics; they are good benchmarks against which to compare.
-We first show how the execution time varies as the basis (matrix) size
-varies over [4 8 16 32 64]. For each matrix size, 20 random bases are
-generated using fplll's `gen_qary` function with depth of 25 bits,
-with the average execution time shown; the `eltype` is `Int64` except
-for NEMO, which can only use GMP (its own `BigInt`); in all cases the
-`δ=.99`. The vertical axis shows execution time on a logarithmic
-scale; the x-axis is also logarithmic.
-The `lll` function is slower, while `l2avx` is similar to
-fplll. Though not shown, using bases from `gen_qary` with bit depth of
-45 gives fplll a larger advantage. Though the LLLplus functions are
-not the fastest, they are in the same ballpark as the C and
-C++ tools; if this package gets more users, we'll spend more time on
-speed :-)  This figure was generated using code in `test/timeLLLs.jl`.
-
-![Time vs basis size](docs/src/assets/timeVdim_25bitsInt64.png)
-
-One additional question that could arise when looking at the plot above is what
-the quality of the basis is. In the next plot we show execution time
-vs the norm of the first vector in the reduced basis, this first
-vector is typically the smallest; its norm is an rough indication of
-the quality of the reduced basis. We show results averaged over 20
-random bases from `gen_qary` with depth `25` bits, this time with the
-dimension fixed at `32`. The curve is created by varying the `δ`
-parameter from `.29` to `.99` in steps of `.2`; the larger times and
-smaller norms correspond to the largest `δ` values. Though the `l2avx`
-function is competitive with fplll in this case, in most cases
-the fplll code is faster.
-
-![Time vs reduction quality](docs/src/assets/timeVsmallest_25bitsInt64.png)
-
-Finally, we show execution time for several built-in
-datatypes (Int32, Int64, Int128, Float32, Float64, BitInt, and
-BigFloat) as well as type from external packages (Float128 from
+To give a flavor of the behavior of the functions in LLLplus,
+we show execution time for several built-in datatypes (Int32,
+Int64, Int128, Float32, Float64, BigInt, and BigFloat) as well as types
+from external packages (Float128 from
 [Quadmath.jl](https://github.com/JuliaMath/Quadmath.jl) and Double64
 from [DoubleFloat.jl](https://github.com/JuliaMath/DoubleFloats.jl))
-which are used to 
-generate 60 16x16 matrices, over which execution time for the
-lattice reduction techniques is averaged.  The vertical axis is a
-logarithmic representation of execution time as in the previous
-figure. This figure was generated using code in `test/perftest.jl`.
+which are used to generate 100 16x16 matrices with elements uniformly
+distributed over `-100` to `100`. The figure shows average execution
+time when using these matrices as input lattice bases for several
+functions from LLLplus. See `test/perftest.jl` for the code to
+regenerate the figure and for another line of code that generates a
+figure of execution time versus basis dimension.
 
 ![Time vs data type](docs/src/assets/perfVsDataType.png)
 
@@ -135,4 +98,5 @@ number-theoretic problems the
 [Nemo.jl](https://github.com/Nemocas/Nemo.jl) package is appropriate;
 it uses the [FLINT](http://flintlib.org/) C library to do LLL
 reduction on Nemo-specific data types.  Finally, no number theorists
+or computer scientists
 have worked on LLLplus; please treat the package as experimental.
diff --git a/docs/src/assets/perfVsDataType.png b/docs/src/assets/perfVsDataType.png
index 2cf5bf5..0636e6e 100644
Binary files a/docs/src/assets/perfVsDataType.png and b/docs/src/assets/perfVsDataType.png differ
diff --git a/docs/src/assets/timeVdim_25bitsInt64.png b/docs/src/assets/timeVdim_25bitsInt64.png
deleted file mode 100644
index dc600aa..0000000
Binary files a/docs/src/assets/timeVdim_25bitsInt64.png and /dev/null differ
diff --git a/docs/src/assets/timeVsmallest_25bitsInt64.png b/docs/src/assets/timeVsmallest_25bitsInt64.png
deleted file mode 100644
index 13df01c..0000000
Binary files a/docs/src/assets/timeVsmallest_25bitsInt64.png and /dev/null differ
diff --git a/docs/src/functions.md b/docs/src/functions.md
index 02463a9..f9df9d7 100644
--- a/docs/src/functions.md
+++ b/docs/src/functions.md
@@ -22,6 +22,7 @@ end
     rationalapprox
     spigotBBP
     minimalpolynomial
+    partitionintwo
     hkz
     ishkzreduced
     issizereduced
diff --git a/docs/src/index.md b/docs/src/index.md
index 51614de..477faf3 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -5,14 +5,13 @@ CurrentModule = LLLplus
 ```
 
 LLLplus provides lattice tools such as
-[Lenstra-Lenstra-Lovász](https://en.wikipedia.org/wiki/Lenstra%E2%80%93Lenstra%E2%80%93Lov%C3%A1sz_lattice_basis_reduction_algorithm)
-(LLL) lattice reduction which are of practical and
+Lenstra-Lenstra-Lovász (LLL) lattice reduction which are of practical and
 theoretical use in cryptography, digital communication, integer
 programming, and more.
 This package is experimental and not a robust tool; use at your own
 risk :-)
 
-LLLplus has functions for LLL,
+LLLplus has functions for [LLL](https://en.wikipedia.org/wiki/Lenstra%E2%80%93Lenstra%E2%80%93Lov%C3%A1sz_lattice_basis_reduction_algorithm),
 [Seysen](http://link.springer.com/article/10.1007%2FBF01202355), and
 [Hermite-Korkine-Zolotarev](http://www.cas.mcmaster.ca/~qiao/publications/ZQW11.pdf)
 lattice reduction
@@ -46,10 +45,10 @@ using LLLplus
 # do lattice reduction on a matrix with randn entries
 N = 40;
 H = randn(N,N);
-B,T = brun(H);
-B,T = lll(H);
-B,T = seysen(H);
-B,T = hkz(H);
+Bbrun,_ = brun(H);
+Blll,_ = lll(H);
+Bseysen,_ = seysen(H);
+Bhkz,_ = hkz(H);
 
 # check out the CVP solver
 Q,Rtmp=qr(H); R = UpperTriangular(Rtmp);
@@ -61,52 +60,18 @@ sum(abs.(u-uhat))
 
 ### Execution Time results
 
-In the first test we compare several LLL functions: the `lll` function from LLLplus, the
-`l2avx` function in the `src\l2.jl` file in LLLplus, the
-`lll_with_transform` function from Nemo (which uses FLINT), and the
-`lll_reduction` function from fplll. Nemo is written by number
-theorists, while fplll is written
-by lattice cryptanalysis academics; they are good benchmarks against which to compare.  We
-first show how the execution time varies as the basis (matrix) size
-varies over [4 8 16 32 64]. For each matrix size, 20 random bases
-are generated using fplll's `gen_qary` function with depth of 25
-bits, with the average execution time shown; the `eltype` is `Int64`
-except for NEMO, which uses GMP (its own `BigInt`); in all cases the
-`δ=.99`. The vertical axis shows
-execution time on a logarithmic scale; the x-axis is also
-logarithmic. The generally linear nature of the LLL curves supports
-the polynomial-time nature of the algorithm. The `LLLplus.lll`
-function is slower, while `l2avx` is similar to fplll. Though not
-shown, using bases from `gen_qary` with bit depth of 45 gives fplll
-a larger advantage. This figure was generated using code in
-`test/timeLLLs.jl`.
-
-![Time vs basis size](assets/timeVdim_25bitsInt64.png)
-
-One additional question that could arise when looking at the plot above is what
-the quality of the basis is. In the next plot we show execution time
-vs the norm of the first vector in the reduced basis, this first
-vector is typically the smallest; its norm is an rough indication of
-the quality of the reduced basis. We show results averaged over 20
-random bases from `gen_qary` with depth `25` bits, this time with the
-dimension fixed at `32`. The curve is created by varying the `δ`
-parameter from `.29` to `.99` in steps of `.2`; the larger times and
-smaller norms correspond to the largest `δ` values. Though the `l2avx`
-function is competitive with fplll in this case, in most cases
-the fplll code is faster.
-
-![Time vs reduction quality](assets/timeVsmallest_25bitsInt64.png)
-
-Finally, we show execution time for several built-in
-datatypes (Int32, Int64, Int128, Float32, Float64, BitInt, and
-BigFloat) as well as type from external packages (Float128 from
+To give a flavor of the behavior of the functions in LLLplus,
+we show execution time for several built-in datatypes (Int32,
+Int64, Int128, Float32, Float64, BigInt, and BigFloat) as well as types
+from external packages (Float128 from
 [Quadmath.jl](https://github.com/JuliaMath/Quadmath.jl) and Double64
 from [DoubleFloat.jl](https://github.com/JuliaMath/DoubleFloats.jl))
-which are used to 
-generate 60 16x16 matrices, over which execution time for the
-lattice reduction techniques is averaged.  The vertical axis is a
-logarithmic representation of execution time as in the previous
-figure. This figure was generated using code in `test/perftest.jl`.
+which are used to generate 100 16x16 matrices with elements uniformly
+distributed over `-100` to `100`. The figure shows average execution
+time when using these matrices as input lattice bases for several
+functions from LLLplus. See `test/perftest.jl` for the code to
+regenerate the figure and for another line of code that generates a
+figure of execution time versus basis dimension.
 
 ![Time vs data type](assets/perfVsDataType.png)
 
diff --git a/src/l2.jl b/src/l2.jl
index e230303..a3276e1 100644
--- a/src/l2.jl
+++ b/src/l2.jl
@@ -180,28 +180,28 @@ function lazysizereduction!(ηb,κ,B,G,r,μ,s,X,n,d,Tg)
 end
 #=
 """
-    B = l2avx(H::AbstractArray{Td,2},TG::Type{Tg},δ=.75,η=.51) where
+    B = l2turbo(H::AbstractArray{Td,2},TG::Type{Tg},δ=.75,η=.51) where
                          {Td<:Number,Tg<:Number}
 
-A version of the `l2` function with a few calls to the `@avx` macro
+A version of the `l2` function with a few calls to the `@turbo` macro
 from the `LoopVectorization.jl` package.  See the `l2` help text
 
 # Examples
 ```julia
 julia> using LLLplus
 julia> using LoopVectorization
-julia> H= [1 2; 3 4];B = l2avx(H)
-┌ Warning: l2avx is in a raw (alpha) state and may change. See the help text.
+julia> H= [1 2; 3 4];B = l2turbo(H)
+┌ Warning: l2turbo is in a raw (alpha) state and may change. See the help text.
 └ @ LLLplus ~/shared/LLLplus/src/l2.jl:42
 2×2 Matrix{Int64}:
  1  -1
  1   1
 ```
 """
-function l2avx(H::AbstractArray{Td,2},TG::Type{Tg}=Td,δ=.75,η=.51) where
+function l2turbo(H::AbstractArray{Td,2},TG::Type{Tg}=Td,δ=.75,η=.51) where
     {Td<:Number,Tg<:Number}
 
-    @warn "l2avx is in a raw (alpha) state and may change. See the help text." maxlog=1
+    @warn "l2turbo is in a raw (alpha) state and may change. See the help text." maxlog=1
     
     if !(0.25 < δ < 1.0)
         error("δ must be between 1/4 and 1.");
@@ -218,7 +218,7 @@ function l2avx(H::AbstractArray{Td,2},TG::Type{Tg}=Td,δ=.75,η=.51) where
     ϵ = .001  # ϵ = eps(Td) may be too small
     C = ϵ
     if Tf <: Complex
-        @error "`l2avx` does not handle complex data; try `lll`."
+        @error "`l2turbo` does not handle complex data; try `lll`."
         return
     end
     Tfe = real(Tf)
@@ -268,7 +268,7 @@ function l2avx(H::AbstractArray{Td,2},TG::Type{Tg}=Td,δ=.75,η=.51) where
             B[:,κ] .= bκp
             
             if κ<d
-                @avx for i=κ:κp
+                @turbo for i=κ:κp
                     for d1=1:d
                         G[d1,i] = B[1,d1]*B[1,i]
                         for d2=2:d
@@ -320,12 +320,12 @@ function lazysizereductionAVX!(ηb,κ,B,G,r,μ,s,X,n,d,Tg)
                 μ[κ,j]-=X[i]*μ[i,j]
             end
         end
-        @avx for nx=1:n
+        @turbo for nx=1:n
             for i = 1:κ-1
                 B[nx,κ] -=X[i]*B[nx,i]
             end
         end
-        @avx for d1=1:d
+        @turbo for d1=1:d
             G[d1,κ] = B[1,d1]*B[1,κ]
             for d2=2:d
                 G[d1,κ] += B[d2,d1]*B[d2,κ]
diff --git a/test/lrtest.jl b/test/lrtest.jl
index d1e7688..2bab184 100644
--- a/test/lrtest.jl
+++ b/test/lrtest.jl
@@ -30,7 +30,7 @@ those used in lattice cryptography).
 function lrtest(Ns::Int,N::Array{Int,1},L::Array{Int,1},
                 dataType::Array{DataType,1},distType)
 
-lrAlgs = [hkz,lll,seysen, sizereduction,brun,]
+lrAlgs = [seysen,lll, sizereduction,brun,]
 
 @printf("      Ns      N      L   dataType")
 for ax = 1:min(length(lrAlgs),6)
@@ -48,6 +48,7 @@ if length(N)>1
     yscale=:log10;
     xval = N;
     xlab = "Matrix Size";
+    xtickStrs = [string(ix) for ix ∈ N]
     tstr = @sprintf("Ns=%d,Type=%s,dist=%s",
                     Ns,string(dataType[1]),distType);
 elseif length(L)>1
diff --git a/test/perftest.jl b/test/perftest.jl
index b10835c..6adffbf 100644
--- a/test/perftest.jl
+++ b/test/perftest.jl
@@ -7,7 +7,6 @@ using DoubleFloats
 using Quadmath
 gr();
 
-#include("../src/l2.jl")
 include("lrtest.jl")
 
 # getIntType is used to indicate what type we want the unimodular integer
@@ -18,8 +17,10 @@ import LLLplus.getIntType
 getIntType(Td::Type{Tr}) where {Tr<:Float128} = Int128
 getIntType(Td::Type{Tr}) where {Tr<:Double64} = Int128
 
-lrtest(40,2 .^[3],[100],[Int32,Int64,Int128,Float32,Float64,Double64,Float128,BigInt,BigFloat],"rand")
+lrtest(100,2 .^[4],[100],[Int32,Int64,Int128,Float32,Float64,Double64,Float128,BigInt,BigFloat],"rand")
 savefig("perfVsDataType.png")
+#display(current())
 
 # lrtest(40,2 .^[1:8;],[1],[Float64],"randn")
 # savefig("perfVsNfloat64.png")
+