remove multithreaded gather (#64)

* copy! -> copyto! * remove threads * remove multithreaded gather * Update Project.toml * Update SolverCPU.jl * Update AbstractFixedEffectSolver.jl
FixedEffects · Mar 8, 2024 · 474dea9 · 474dea9 · matthieugomez · Mar 10, 2024
1 parent 99b4f8e
commit 474dea9
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 34 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "FixedEffects"
 uuid = "c8885935-8500-56a7-9867-7708b20db0eb"
-version = "2.3.0"
+version = "2.3.1"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

diff --git a/src/AbstractFixedEffectSolver.jl b/src/AbstractFixedEffectSolver.jl
@@ -35,7 +35,7 @@ p2 = repeat(1:5, outer = 2)
 solve_residuals!(rand(10), [FixedEffect(p1), FixedEffect(p2)])
 ```
 """
-function solve_residuals!(y::Union{AbstractVector{<: Number}, AbstractMatrix{<: Number}}, fes::AbstractVector{<: FixedEffect}, w::AbstractWeights = uweights(eltype(y), size(y, 1)); 
+function solve_residuals!(y::Union{AbstractVector{<: Real}, AbstractMatrix{<: Real}}, fes::AbstractVector{<: FixedEffect}, w::AbstractWeights = uweights(eltype(y), size(y, 1)); 
 	method::Symbol = :cpu, double_precision::Bool = method == :cpu, 
 	tol::Real = double_precision ? 1e-8 : 1e-6, maxiter::Integer = 10000,
 	nthreads = method == :cpu ? Threads.nthreads() : 256)
@@ -47,7 +47,7 @@ end
 
 
 
-function solve_residuals!(r::AbstractVector, feM::AbstractFixedEffectSolver{T}; tol::Real = sqrt(eps(T)), maxiter::Integer = 100_000) where {T}
+function solve_residuals!(r::AbstractVector{<:Real}, feM::AbstractFixedEffectSolver{T}; tol::Real = sqrt(eps(T)), maxiter::Integer = 100_000) where {T}
 	# One cannot copy a view of a Vector (r) on the GPU, so first collect the vector
 	if works_with_view(feM)
 		copyto!(feM.r, r)
@@ -71,7 +71,7 @@ function solve_residuals!(r::AbstractVector, feM::AbstractFixedEffectSolver{T};
 		feM.r ./=  sqrt.(feM.weights)
 	end
 	if works_with_view(feM)
-		copy!(r, feM.r)
+		copyto!(r, feM.r)
 	else
 		copyto!(feM.tmp, feM.r)
 		copyto!(r, feM.tmp)
@@ -104,6 +104,7 @@ function solve_residuals!(xs::AbstractVector{<: AbstractVector}, feM::AbstractFi
     return xs, iterations, convergeds
 end
 
+# to deprecate
 function solve_residuals!(X::AbstractMatrix, feM::AbstractFixedEffectSolver; kwargs...)
 	xs, iterations, convergeds = solve_residuals!(eachcol(X), feM; kwargs...)
 	return X, iterations, convergeds

diff --git a/src/SolverCPU.jl b/src/SolverCPU.jl
@@ -8,45 +8,37 @@ mutable struct FixedEffectLinearMapCPU{T} <: AbstractFixedEffectLinearMap{T}
 	fes::Vector{<:FixedEffect}
 	scales::Vector{<:AbstractVector}
 	caches::Vector{<:AbstractVector}
-	tmp::Vector{Union{Nothing, <:AbstractVector}}
 	nthreads::Int
 end
 
 function FixedEffectLinearMapCPU{T}(fes::Vector{<:FixedEffect}, ::Type{Val{:cpu}}, nthreads) where {T}
 	scales = [zeros(T, fe.n) for fe in fes]
 	caches = [zeros(T, length(fes[1].interaction)) for fe in fes]
-	fecoefs = [[zeros(T, fe.n) for _ in 1:nthreads] for fe in fes]
-	return FixedEffectLinearMapCPU{T}(fes, scales, caches, fecoefs, nthreads)
+	return FixedEffectLinearMapCPU{T}(fes, scales, caches, nthreads)
 end
 
 function LinearAlgebra.mul!(fecoefs::FixedEffectCoefficients, 
 	Cfem::Adjoint{T, FixedEffectLinearMapCPU{T}},
 	y::AbstractVector, α::Number, β::Number) where {T}
 	fem = adjoint(Cfem)
 	rmul!(fecoefs, β)
-	for (fecoef, fe, cache, tmp) in zip(fecoefs.x, fem.fes, fem.caches, fem.tmp)
-		gather!(fecoef, fe.refs, α, y, cache, tmp, fem.nthreads)
+	for (fecoef, fe, cache) in zip(fecoefs.x, fem.fes, fem.caches)
+		gather!(fecoef, fe.refs, α, y, cache, fem.nthreads)
 	end
 	return fecoefs
 end
 
+# multithreaded gather seems to be slower
 function gather!(fecoef::AbstractVector, refs::AbstractVector, α::Number, 
-	y::AbstractVector, cache::AbstractVector, tmp::AbstractVector, nthreads::Integer)
-	n_each = div(length(y), nthreads)
-	Threads.@threads for t in 1:nthreads
-		fill!(tmp[t], 0.0)
-		gather!(tmp[t], refs, α, y, cache, ((t - 1) * n_each + 1):(t * n_each))
-	end
-	for x in tmp
-		fecoef .+= x
-	end
-	gather!(fecoef, refs, α, y, cache, (nthreads * n_each + 1):length(y))
-end
-
-function gather!(fecoef::AbstractVector, refs::AbstractVector, α::Number, 
-	y::AbstractVector, cache::AbstractVector, irange::AbstractRange)
-	@inbounds @simd for i in irange
-		fecoef[refs[i]] += α * y[i] * cache[i]
+	y::AbstractVector, cache::AbstractVector, nthreads::Integer)
+	if α == 1
+		@fastmath @inbounds @simd for i in eachindex(y)
+			fecoef[refs[i]] += y[i] * cache[i]
+		end
+	else
+		@fastmath @inbounds @simd for i in eachindex(y)
+			fecoef[refs[i]] += α * y[i] * cache[i]
+		end
 	end
 end
 
@@ -61,11 +53,21 @@ end
 
 function scatter!(y::AbstractVector, α::Number, fecoef::AbstractVector, 
 	refs::AbstractVector, cache::AbstractVector, irange::AbstractRange)
-	@inbounds @simd for i in irange
-		y[i] += α * fecoef[refs[i]] * cache[i]
+	# α is actually only ever 1 or -1, so use a specialized path for those values
+	if α == 1
+		@fastmath @inbounds @simd for i in irange
+			y[i] += fecoef[refs[i]] * cache[i]
+		end
+	elseif α == -1
+		@fastmath @inbounds @simd for i in irange
+			y[i] -= fecoef[refs[i]] * cache[i]
+		end
+	else
+		@fastmath @inbounds @simd for i in irange
+			y[i] += α * fecoef[refs[i]] * cache[i]
+		end
 	end
 end
-
 ##############################################################################
 ##
 ## Implement AbstractFixedEffectSolver interface
@@ -109,7 +111,7 @@ end
 
 function scale!(scale::AbstractVector, refs::AbstractVector, interaction::AbstractVector, weights::AbstractVector)
         fill!(scale, 0)
-	@inbounds @simd for i in eachindex(refs)
+	@fastmath @inbounds @simd for i in eachindex(refs)
 		scale[refs[i]] += abs2(interaction[i]) * weights[i]
 	end
 	# Case of interaction variable equal to zero in the category (issue #97)
@@ -119,11 +121,7 @@ function scale!(scale::AbstractVector, refs::AbstractVector, interaction::Abstra
 end
 
 function cache!(cache::AbstractVector, refs::AbstractVector, interaction::AbstractVector, weights::AbstractVector, scale::AbstractVector)
-	@inbounds @simd for i in eachindex(cache)
+	@fastmath @inbounds @simd for i in eachindex(cache)
 		cache[i] = interaction[i] * sqrt(weights[i]) * scale[refs[i]]
 	end
 end
-
-
-
-