diff --git a/NEWS.md b/NEWS.md
index 79f8f92..9b9229d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -3,9 +3,10 @@
 ### general
 
 - strongly speed-up contingency table computation for heterogeneous=true and max_k=0/1
-- use the more compiler-friendly `stack` (introduced in Julia v1.9) instead of `hcat` for large numbers of columns (if available)
+- use the more compiler-friendly `stack` (introduced in Julia v1.9) instead of `hcat` for large numbers of columns (if available), introduce fast method for sparse columns
 - improve univariate pvalue filtering
 - remove performance bottleneck in three-way `adjust_df`
+- catch incorrect usage of 'meta_data_path' as keyword argument
 
 # v0.19.2 (latest)
diff --git a/src/io.jl b/src/io.jl
index f2d139d..cd00842 100644
--- a/src/io.jl
+++ b/src/io.jl
@@ -52,7 +52,7 @@ function load_data(data_path::AbstractString, meta_path::StrOrNoth=nothing; tran
         end
         ld_results = load_jld(data_path, otu_data_key, otu_header_key, meta_data_key, meta_header_key, transposed=transposed)
     else
-        error("$(file_ext) not a valid output format. Choose one of $(valid_data_formats)")
+        error("$(file_ext) not a valid input format. Choose one of $(valid_data_formats)")
     end
     ld_results
 end
diff --git a/src/learning.jl b/src/learning.jl
index a1d0b51..82d3d7e 100644
--- a/src/learning.jl
+++ b/src/learning.jl
@@ -298,7 +298,7 @@ end
 function make_table(data_path::AbstractString, meta_data_path::StrOrNoth=nothing; otu_data_key::StrOrNoth="otu_data",
                     otu_header_key::StrOrNoth="otu_header", meta_data_key::StrOrNoth="meta_data",
                     meta_header_key::StrOrNoth="meta_header",
-                    verbose::Bool=true, transposed::Bool=false)
+                    transposed::Bool=false)
 
     data, header, meta_data, meta_header = load_data(data_path, meta_data_path, otu_data_key=otu_data_key,
                                                      otu_header_key=otu_header_key, meta_data_key=meta_data_key,
@@ -316,6 +316,24 @@ function make_table(data_path::AbstractString, meta_data_path::StrOrNoth=nothing
     data, header, meta_mask
 end
 
+#learn_local_neighborhood(target_var::AbstractString, header, args...; kwargs...) =
+#    learn_local_neighborhood(findfirst(==(target_var), header), args...; kwargs...)
+#
+#function learn_local_neighborhood(target_var::Int, data_path::AbstractString, meta_data_path::StrOrNoth=nothing;
+#                        otu_data_key::StrOrNoth="otu_data",
+#                        otu_header_key::StrOrNoth="otu_header", meta_data_key::StrOrNoth="meta_data",
+#                        meta_header_key::StrOrNoth="meta_header", verbose::Bool=true,
+#                        transposed::Bool=false, kwargs...)
+#
+#    verbose && println("\n### Loading data ###\n")
+#    data, header, meta_mask = make_table(data_path, meta_data_path, otu_data_key=otu_data_key,
+#                                otu_header_key=otu_header_key, meta_data_key=meta_data_key,
+#                                meta_header_key=meta_header_key, transposed=transposed)
+#
+#
+#
+#end
+
 """
     learn_network(data_path::AbstractString, meta_data_path::AbstractString) -> FWResult{<:Integer}
 
@@ -339,6 +357,11 @@ function learn_network(data_path::AbstractString, meta_data_path::StrOrNoth=noth
                       meta_header_key::StrOrNoth="meta_header", verbose::Bool=true,
                       transposed::Bool=false, kwargs...)
 
+    # Catch incorrect usage of data_path/meta_data_path as keyword argument
+    for key in (:data_path, :meta_data_path)
+        @assert !(key in keys(kwargs)) "'$key' is a positional argument, please use 'learn_network(, ; )'."
+    end
+
     verbose && println("\n### Loading data ###\n")
     data, header, meta_mask = make_table(data_path, meta_data_path, otu_data_key=otu_data_key,
                                          otu_header_key=otu_header_key, meta_data_key=meta_data_key,
@@ -353,10 +376,12 @@ end
 Works like learn_network(data_path::AbstractString, meta_data_path::AbstractString), but takes paths to multiple data sets (independent sequencing experiments (e.g. 16S + ITS) for the same biological samples) which are normalized independently.
 """
 function learn_network(all_data_paths::AbstractVector{<:AbstractString}, meta_data_path::StrOrNoth=nothing;
-                       otu_data_key::StrOrNoth="otu_data",
-                       otu_header_key::StrOrNoth="otu_header", meta_data_key::StrOrNoth="meta_data",
-                       meta_header_key::StrOrNoth="meta_header", verbose::Bool=true,
-                       transposed::Bool=false, kwargs...)
+                       otu_data_key::StrOrNoth="otu_data", otu_header_key::StrOrNoth="otu_header", transposed::Bool=false, kwargs...)
+
+    # Catch incorrect usage of data_path/meta_data_path as keyword argument
+    for key in (:all_data_paths, :meta_data_path)
+        @assert !(key in keys(kwargs)) "'$key' is a positional argument, please use 'learn_network(, ; )'."
+    end
 
     data_path = all_data_paths[1]
     if length(all_data_paths) > 1
@@ -447,6 +472,9 @@ function learn_network(data::AbstractMatrix; sensitive::Bool=true,
                        cache_pcor::Bool=false, time_limit::AbstractFloat=-1.0, update_interval::AbstractFloat=30.0, parallel_mode="auto",
                        extra_data::Union{AbstractVector,Nothing}=nothing, share_data::Bool=true, experimental_kwargs...)
+    @assert !(:meta_data_path in keys(experimental_kwargs)) "You provided a OTU matrix together with a meta data path, this is currently not supported.
+    Use either 'learn_network(, ; )' or 'learn_network(; )'."
+
 
     start_time = time()
 
     cont_mode = sensitive ? "fz" : "mi"
diff --git a/src/preprocessing.jl b/src/preprocessing.jl
index c3169a2..9eb8d12 100644
--- a/src/preprocessing.jl
+++ b/src/preprocessing.jl
@@ -1,10 +1,38 @@
+function _fast_stack_sparse(vecs::Vector{SparseVector{T1, T2}}) where {T1 <: Real, T2 <: Integer}
+    """Fast method for stacking sparse columns"""
+    n_rows = length(vecs[1])
+    @assert all(length(x) == n_rows for x in vecs)
+
+    rids, cids, nzvals = Int[], Int[], T1[]
+
+    for (col_i, v) in enumerate(vecs)
+        n_val = nnz(v)
+
+        if n_val > 0
+            append!(rids, rowvals(v))
+            append!(cids, repeat([col_i], n_val))
+            append!(nzvals, nonzeros(v))
+        end
+    end
+
+    n_cols = length(vecs)
+    return sparse(rids, cids, nzvals, n_rows, n_cols)
+end
+
 function stack_or_hcat(vecs::AbstractVector{<:AbstractArray})
     # use more efficient stack (introduced in Julia v1.9) if available
-    if isdefined(Base, :stack)
-        return stack(vecs)
+    stacked_matrix = if isdefined(Base, :stack)
+        # use even faster custom implementation for sparse vectors
+        if isa(vecs, AbstractVector{<:SparseVector})
+            _fast_stack_sparse(vecs)
+        else
+            stack(vecs)
+        end
     else
-        return hcat(vecs...)
+        hcat(vecs...)
     end
+
+    return stacked_matrix
 end
 
 
@@ -187,7 +215,7 @@ function discretize(X::AbstractMatrix{ElType}; n_bins::Integer=3, nz::Bool=true,
                     rank_method::String="tied", disc_method::String="median", nz_mask::BitMatrix=BitMatrix(undef, (0,0))) where ElType <: AbstractFloat
     if nz
         if issparse(X)
-            disc_vecs = SparseVector{Int}[]
+            disc_vecs = SparseVector{Int,Int}[]
             for j in 1:size(X, 2)
                 push!(disc_vecs, discretize_nz(X[:, j], n_bins, rank_method=rank_method, disc_method=disc_method))
             end
diff --git a/src/types.jl b/src/types.jl
index b2c9f59..ed5feeb 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -149,7 +149,7 @@ end
 ##################
 ## RESULT TYPES ##
 ##################
-const RejDict{T} = Dict{T,Tuple{Tuple{Int64,Vararg{Int64,N} where N},TestResult,Tuple{Int,Float64}}}
+const RejDict{T} = Dict{T,Tuple{Tuple{Int64,Vararg{Int64,N}} where N,TestResult,Tuple{Int,Float64}}}
 
 struct HitonState{T}
     phase::Char