From de87c7ac2e1f9f5b295a6bff777b11266851ced8 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Sat, 23 Nov 2024 23:25:08 -0500 Subject: [PATCH 01/25] this is the idea --- src/tensors/levels/sharded_levels.jl | 265 +++++++++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 src/tensors/levels/sharded_levels.jl diff --git a/src/tensors/levels/sharded_levels.jl b/src/tensors/levels/sharded_levels.jl new file mode 100644 index 000000000..78324c455 --- /dev/null +++ b/src/tensors/levels/sharded_levels.jl @@ -0,0 +1,265 @@ +""" + ShardedLevel{Lvl, [Val]}() + +A subfiber of a Sharded level is a separate tensor of type `Lvl`, in it's +own memory space. + +Each sublevel is stored in a vector of type `Val` with `eltype(Val) = Lvl`. + +```jldoctest +julia> tensor_tree(Tensor(Dense(Sharded(Element(0.0))), [1, 2, 3])) +3-Tensor +└─ Dense [1:3] + ├─ [1]: Shard -> + │ └─ 1.0 + ├─ [2]: Shard -> + │ └─ 2.0 + └─ [3]: Shard -> + └─ 3.0 +``` +""" +struct ShardedLevel{Lvl, Tp, Ptr, Val, Device} <: AbstractLevel + lvl::Lvl + ptr::Ptr + val::Val + device::Device +end +const Sharded = ShardedLevel + +#similar_level(lvl, level_fill_value(typeof(lvl)), level_eltype(typeof(lvl)), level_size(lvl)...) +ShardedLevel(lvl::Lvl) where {Lvl} = ShardedLevel(lvl, postype(lvl)[], Lvl[]) +Base.summary(::Sharded{Lvl, Val}) where {Lvl, Val} = "Sharded($(Lvl))" + +similar_level(lvl::Sharded{Lvl, Val}, fill_value, eltype::Type, dims...) where {Lvl, Val} = + ShardedLevel(similar_level(lvl.lvl, fill_value, eltype, dims...)) + +postype(::Type{<:Sharded{Lvl, Val}}) where {Lvl, Val} = postype(Lvl) + +function moveto(lvl::ShardedLevel, device) + lvl_2 = moveto(lvl.lvl, device) + val_2 = moveto(lvl.val, device) + return ShardedLevel(lvl_2, val_2) +end + +pattern!(lvl::ShardedLevel) = ShardedLevel(pattern!(lvl.lvl), map(pattern!, lvl.val)) +set_fill_value!(lvl::ShardedLevel, init) = ShardedLevel(set_fill_value!(lvl.lvl, init), map(lvl_2->set_fill_value!(lvl_2, init), lvl.val)) +Base.resize!(lvl::ShardedLevel, dims...) = ShardedLevel(resize!(lvl.lvl, dims...), map(lvl_2->resize!(lvl_2, dims...), lvl.val)) + +function Base.show(io::IO, lvl::ShardedLevel{Lvl, Val}) where {Lvl, Val} + print(io, "Sharded(") + if get(io, :compact, false) + print(io, "…") + else + show(io, lvl.lvl) + print(io, ", ") + show(io, lvl.val) + end + print(io, ")") +end + +labelled_show(io::IO, ::SubFiber{<:ShardedLevel}) = + print(io, "Pointer -> ") + +function labelled_children(fbr::SubFiber{<:ShardedLevel}) + lvl = fbr.lvl + pos = fbr.pos + pos > length(lvl.val) && return [] + [LabelledTree(SubFiber(lvl.val[pos], 1))] +end + +@inline level_ndims(::Type{<:ShardedLevel{Lvl, Val}}) where {Lvl, Val} = level_ndims(Lvl) +@inline level_size(lvl::ShardedLevel{Lvl, Val}) where {Lvl, Val} = level_size(lvl.lvl) +@inline level_axes(lvl::ShardedLevel{Lvl, Val}) where {Lvl, Val} = level_axes(lvl.lvl) +@inline level_eltype(::Type{ShardedLevel{Lvl, Val}}) where {Lvl, Val} = level_eltype(Lvl) +@inline level_fill_value(::Type{<:ShardedLevel{Lvl, Val}}) where {Lvl, Val} = level_fill_value(Lvl) + +function (fbr::SubFiber{<:ShardedLevel})(idxs...) + q = fbr.pos + return SubFiber(fbr.lvl.val[q], 1)(idxs...) +end + +countstored_level(lvl::ShardedLevel, pos) = pos + +mutable struct VirtualShardedLevel <: AbstractVirtualLevel + lvl # stand in for the sublevel for virutal resize, etc. + ex + val + Tv + Lvl + Val +end + +postype(lvl:: VirtualShardedLevel) = postype(lvl.lvl) + +is_level_injective(ctx, lvl::VirtualShardedLevel) = [is_level_injective(ctx, lvl.lvl)..., true] +function is_level_atomic(ctx, lvl::VirtualShardedLevel) + (below, atomic) = is_level_atomic(ctx, lvl.lvl) + return ([below; [atomic]], atomic) +end +function is_level_concurrent(ctx, lvl::VirtualShardedLevel) + (data, _) = is_level_concurrent(ctx, lvl.lvl) + return (data, true) +end + +function lower(ctx::AbstractCompiler, lvl::VirtualShardedLevel, ::DefaultStyle) + quote + $ShardedLevel{$(lvl.Lvl), $(lvl.Val)}($(ctx(lvl.lvl)), $(lvl.val)) + end +end + +function virtualize(ctx, ex, ::Type{ShardedLevel{Lvl, Val}}, tag=:lvl) where {Lvl, Val} + sym = freshen(ctx, tag) + val = freshen(ctx, tag, :_val) + + push_preamble!(ctx, quote + $sym = $ex + $val = $ex.val + end) + lvl_2 = virtualize(ctx, :($ex.lvl), Lvl, sym) + VirtualShardedLevel(lvl_2, sym, val, typeof(level_fill_value(Lvl)), Lvl, Val) +end + +Base.summary(lvl::VirtualShardedLevel) = "Sharded($(lvl.Lvl))" + +virtual_level_resize!(ctx, lvl::VirtualShardedLevel, dims...) = (lvl.lvl = virtual_level_resize!(ctx, lvl.lvl, dims...); lvl) +virtual_level_size(ctx, lvl::VirtualShardedLevel) = virtual_level_size(ctx, lvl.lvl) +virtual_level_eltype(lvl::VirtualShardedLevel) = virtual_level_eltype(lvl.lvl) +virtual_level_fill_value(lvl::VirtualShardedLevel) = virtual_level_fill_value(lvl.lvl) + +function virtual_moveto_level(ctx, lvl::VirtualShardedLevel, arch) + + # Need to move each pointer... + val_2 = freshen(ctx, lvl.val) + push_preamble!(ctx, quote + $val_2 = $(lvl.val) + $(lvl.val) = $moveto($(lvl.val), $(ctx(arch))) + end) + push_epilogue!(ctx, quote + $(lvl.val) = $val_2 + end) + virtual_moveto_level(ctx, lvl.lvl, arch) +end + + +function declare_level!(ctx, lvl::VirtualShardedLevel, pos, init) + #declare_level!(lvl.lvl, ctx_2, literal(1), init) + return lvl +end + +function assemble_level!(ctx, lvl::VirtualShardedLevel, pos_start, pos_stop) + pos_start = cache!(ctx, :pos_start, simplify(ctx, pos_start)) + pos_stop = cache!(ctx, :pos_stop, simplify(ctx, pos_stop)) + pos = freshen(ctx, :pos) + sym = freshen(ctx, :pointer_to_lvl) + push_preamble!(ctx, quote + Finch.resize_if_smaller!($(lvl.val), $(ctx(pos_stop))) + for $pos in $(ctx(pos_start)):$(ctx(pos_stop)) + $sym = Finch.similar_level( + $(lvl.ex).lvl, + Finch.level_fill_value(typeof($(lvl.ex).lvl)), + Finch.level_eltype(typeof($(lvl.ex).lvl)), + $(map(ctx, map(getstop, virtual_level_size(ctx, lvl)))...) + ) + $(contain(ctx) do ctx_2 + lvl_2 = virtualize(ctx_2.code, sym, lvl.Lvl, sym) + lvl_2 = declare_level!(ctx_2, lvl_2, literal(0), literal(virtual_level_fill_value(lvl_2))) + lvl_2 = virtual_level_resize!(ctx_2, lvl_2, virtual_level_size(ctx_2, lvl.lvl)...) + push_preamble!(ctx_2, assemble_level!(ctx_2, lvl_2, literal(1), literal(1))) + contain(ctx_2) do ctx_3 + lvl_2 = freeze_level!(ctx_3, lvl_2, literal(1)) + :($(lvl.val)[$(ctx_3(pos))] = $(ctx_3(lvl_2))) + end + end) + end + end) + lvl +end + +supports_reassembly(::VirtualShardedLevel) = true +function reassemble_level!(ctx, lvl::VirtualShardedLevel, pos_start, pos_stop) + pos_start = cache!(ctx, :pos_start, simplify(ctx, pos_start)) + pos_stop = cache!(ctx, :pos_stop, simplify(ctx, pos_stop)) + pos = freshen(ctx, :pos) + push_preamble!(ctx, quote + for $idx in $(ctx(pos_start)):$(ctx(pos_stop)) + $(contain(ctx) do ctx_2 + lvl_2 = virtualize(ctx_2.code, :($(lvl.val)[$idx]), lvl.Lvl, sym) + push_preamble!(ctx_2, assemble_level!(ctx_2, lvl_2, literal(1), literal(1))) + lvl_2 = declare_level!(ctx_2, lvl_2, literal(1), init) + contain(ctx_2) do ctx_3 + lvl_2 = freeze_level!(ctx_3, lvl_2, literal(1)) + :($(lvl.val)[$(ctx_3(pos))] = $(ctx_3(lvl_2))) + end + end) + end + end) + lvl +end + +function freeze_level!(ctx, lvl::VirtualShardedLevel, pos) + return lvl +end + +function thaw_level!(ctx::AbstractCompiler, lvl::VirtualShardedLevel, pos) + return lvl +end + +function instantiate(ctx, fbr::VirtualSubFiber{VirtualShardedLevel}, mode) + if mode.kind === reader + (lvl, pos) = (fbr.lvl, fbr.pos) + tag = lvl.ex + isnulltest = freshen(ctx, tag, :_nulltest) + Vf = level_fill_value(lvl.Lvl) + sym = freshen(ctx, :pointer_to_lvl) + val = freshen(ctx, lvl.ex, :_val) + return Thunk( + body = (ctx) -> begin + lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) + instantiate(ctx, VirtualSubFiber(lvl_2, literal(1)), mode) + end, + ) + else + (lvl, pos) = (fbr.lvl, fbr.pos) + tag = lvl.ex + sym = freshen(ctx, :pointer_to_lvl) + + return Thunk( + body = (ctx) -> begin + lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) + lvl_2 = thaw_level!(ctx, lvl_2, literal(1)) + push_preamble!(ctx, assemble_level!(ctx, lvl_2, literal(1), literal(1))) + res = instantiate(ctx, VirtualSubFiber(lvl_2, literal(1)), mode) + push_epilogue!(ctx, + contain(ctx) do ctx_2 + lvl_2 = freeze_level!(ctx_2, lvl_2, literal(1)) + :($(lvl.val)[$(ctx_2(pos))] = $(ctx_2(lvl_2))) + end + ) + res + end + ) + end +end + +function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardedLevel}, mode) + @assert mode.kind === updater + (lvl, pos) = (fbr.lvl, fbr.pos) + tag = lvl.ex + sym = freshen(ctx, :pointer_to_lvl) + + return Thunk( + body = (ctx) -> begin + lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) + lvl_2 = thaw_level!(ctx, lvl_2, literal(1)) + push_preamble!(ctx, assemble_level!(ctx, lvl_2, literal(1), literal(1))) + res = instantiate(ctx, VirtualHollowSubFiber(lvl_2, literal(1), fbr.dirty), mode) + push_epilogue!(ctx, + contain(ctx) do ctx_2 + lvl_2 = freeze_level!(ctx_2, lvl_2, literal(1)) + :($(lvl.val)[$(ctx_2(pos))] = $(ctx_2(lvl_2))) + end + ) + res + end + ) + end From e0da0fe9805c73f52bf7ec4f35a6a50c05938164 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Mon, 25 Nov 2024 22:01:42 -0500 Subject: [PATCH 02/25] fixing --- src/tensors/levels/sharded_levels.jl | 30 +++++++++++++++------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/tensors/levels/sharded_levels.jl b/src/tensors/levels/sharded_levels.jl index 78324c455..2b9a860b2 100644 --- a/src/tensors/levels/sharded_levels.jl +++ b/src/tensors/levels/sharded_levels.jl @@ -18,22 +18,24 @@ julia> tensor_tree(Tensor(Dense(Sharded(Element(0.0))), [1, 2, 3])) └─ 3.0 ``` """ -struct ShardedLevel{Lvl, Tp, Ptr, Val, Device} <: AbstractLevel +struct ShardedLevel{Device, Lvl, Ptr, Val} <: AbstractLevel + device::Device lvl::Lvl ptr::Ptr val::Val - device::Device end const Sharded = ShardedLevel -#similar_level(lvl, level_fill_value(typeof(lvl)), level_eltype(typeof(lvl)), level_size(lvl)...) -ShardedLevel(lvl::Lvl) where {Lvl} = ShardedLevel(lvl, postype(lvl)[], Lvl[]) -Base.summary(::Sharded{Lvl, Val}) where {Lvl, Val} = "Sharded($(Lvl))" +ShardedLevel(device::Device, lvl::Lvl) where {Device, Lvl} = ShardedLevel{Device}(device, lvl, postype(lvl)[], typeof(lvl)[]) +ShardedLevel(device::Device, lvl::Lvl, ptr::Ptr, val::Val) where {Device, Lvl, Ptr, Val} = + ShardedLevel{Device, Lvl, Ptr, Val}(device, lvl, ptr, val) + +Base.summary(::Sharded{Device, Lvl, Ptr, Val}) where {Device, Lvl, Ptr, Val} = "Sharded($(Lvl))" -similar_level(lvl::Sharded{Lvl, Val}, fill_value, eltype::Type, dims...) where {Lvl, Val} = +similar_level(lvl::Sharded{Device, Lvl, Ptr, Val}, fill_value, eltype::Type, dims...) where {Device, Lvl, Ptr, Val} = ShardedLevel(similar_level(lvl.lvl, fill_value, eltype, dims...)) -postype(::Type{<:Sharded{Lvl, Val}}) where {Lvl, Val} = postype(Lvl) +postype(::Type{<:Sharded{Device, Lvl, Ptr, Val}}) where {Device, Lvl, Ptr, Val} = postype(Lvl) function moveto(lvl::ShardedLevel, device) lvl_2 = moveto(lvl.lvl, device) @@ -45,7 +47,7 @@ pattern!(lvl::ShardedLevel) = ShardedLevel(pattern!(lvl.lvl), map(pattern!, lvl. set_fill_value!(lvl::ShardedLevel, init) = ShardedLevel(set_fill_value!(lvl.lvl, init), map(lvl_2->set_fill_value!(lvl_2, init), lvl.val)) Base.resize!(lvl::ShardedLevel, dims...) = ShardedLevel(resize!(lvl.lvl, dims...), map(lvl_2->resize!(lvl_2, dims...), lvl.val)) -function Base.show(io::IO, lvl::ShardedLevel{Lvl, Val}) where {Lvl, Val} +function Base.show(io::IO, lvl::ShardedLevel{Device, Lvl, Ptr, Val}) where {Device, Lvl, Ptr, Val} print(io, "Sharded(") if get(io, :compact, false) print(io, "…") @@ -67,11 +69,11 @@ function labelled_children(fbr::SubFiber{<:ShardedLevel}) [LabelledTree(SubFiber(lvl.val[pos], 1))] end -@inline level_ndims(::Type{<:ShardedLevel{Lvl, Val}}) where {Lvl, Val} = level_ndims(Lvl) -@inline level_size(lvl::ShardedLevel{Lvl, Val}) where {Lvl, Val} = level_size(lvl.lvl) -@inline level_axes(lvl::ShardedLevel{Lvl, Val}) where {Lvl, Val} = level_axes(lvl.lvl) -@inline level_eltype(::Type{ShardedLevel{Lvl, Val}}) where {Lvl, Val} = level_eltype(Lvl) -@inline level_fill_value(::Type{<:ShardedLevel{Lvl, Val}}) where {Lvl, Val} = level_fill_value(Lvl) +@inline level_ndims(::Type{<:ShardedLevel{Device, Lvl, Ptr, Val}}) where {Device, Lvl, Ptr, Val} = level_ndims(Lvl) +@inline level_size(lvl::ShardedLevel{Device, Lvl, Ptr, Val}) where {Device, Lvl, Ptr, Val} = level_size(lvl.lvl) +@inline level_axes(lvl::ShardedLevel{Device, Lvl, Ptr, Val}) where {Device, Lvl, Ptr, Val} = level_axes(lvl.lvl) +@inline level_eltype(::Type{ShardedLevel{Device, Lvl, Ptr, Val}}) where {Device, Lvl, Ptr, Val} = level_eltype(Lvl) +@inline level_fill_value(::Type{<:ShardedLevel{Device, Lvl, Ptr, Val}}) where {Device, Lvl, Ptr, Val} = level_fill_value(Lvl) function (fbr::SubFiber{<:ShardedLevel})(idxs...) q = fbr.pos @@ -107,7 +109,7 @@ function lower(ctx::AbstractCompiler, lvl::VirtualShardedLevel, ::DefaultStyle) end end -function virtualize(ctx, ex, ::Type{ShardedLevel{Lvl, Val}}, tag=:lvl) where {Lvl, Val} +function virtualize(ctx, ex, ::Type{ShardedLevel{Device, Lvl, Ptr, Val}}, tag=:lvl) where {Device, Lvl, Ptr, Val} sym = freshen(ctx, tag) val = freshen(ctx, tag, :_val) From 208cc189b1358c7755f814591f2e0dd7a5cee10f Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Fri, 13 Dec 2024 10:01:28 -0500 Subject: [PATCH 03/25] trying out local memory concept --- src/architecture.jl | 13 +++++++++++++ src/tensors/levels/sharded_levels.jl | 6 +++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/architecture.jl b/src/architecture.jl index cdc8fc438..eadbc64fc 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -171,6 +171,19 @@ function moveto(vec::CPULocalVector, task::CPUThread) return temp end +""" + local_memory(device) + +Returns the local memory type for a given device. +""" +function local_memory(device::CPU) + return CPULocalMemory(device) +end + +function local_memory(device::Serial) + return device +end + struct Converter{f, T} end (::Converter{f, T})(x) where {f, T} = T(f(x)) diff --git a/src/tensors/levels/sharded_levels.jl b/src/tensors/levels/sharded_levels.jl index 2b9a860b2..bb70a64f0 100644 --- a/src/tensors/levels/sharded_levels.jl +++ b/src/tensors/levels/sharded_levels.jl @@ -1,8 +1,8 @@ """ ShardedLevel{Lvl, [Val]}() -A subfiber of a Sharded level is a separate tensor of type `Lvl`, in it's -own memory space. +Each subfiber of a Sharded level is stored in a thread-local tensor of type +`Lvl`, in a thread-local memory space. Each sublevel is stored in a vector of type `Val` with `eltype(Val) = Lvl`. @@ -26,7 +26,7 @@ struct ShardedLevel{Device, Lvl, Ptr, Val} <: AbstractLevel end const Sharded = ShardedLevel -ShardedLevel(device::Device, lvl::Lvl) where {Device, Lvl} = ShardedLevel{Device}(device, lvl, postype(lvl)[], typeof(lvl)[]) +ShardedLevel(device::Device, lvl::Lvl) where {Device, Lvl} = ShardedLevel{Device}(device, moveto!(lvl, local_memory(device)), moveto!(postype(lvl)[], device), moveto!(postype(lvl)[], device), typeof(lvl)[]) ShardedLevel(device::Device, lvl::Lvl, ptr::Ptr, val::Val) where {Device, Lvl, Ptr, Val} = ShardedLevel{Device, Lvl, Ptr, Val}(device, lvl, ptr, val) From 7d9217c2ddb7c9d51d4f4bd65d243784f78c1cca Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Fri, 13 Dec 2024 11:46:18 -0500 Subject: [PATCH 04/25] add tasks --- src/tensors/levels/sharded_levels.jl | 75 +++++++++++++++++----------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/src/tensors/levels/sharded_levels.jl b/src/tensors/levels/sharded_levels.jl index bb70a64f0..48f9538bf 100644 --- a/src/tensors/levels/sharded_levels.jl +++ b/src/tensors/levels/sharded_levels.jl @@ -18,62 +18,73 @@ julia> tensor_tree(Tensor(Dense(Sharded(Element(0.0))), [1, 2, 3])) └─ 3.0 ``` """ -struct ShardedLevel{Device, Lvl, Ptr, Val} <: AbstractLevel +struct ShardedLevel{Device, Lvl, Ptr, Task, Val} <: AbstractLevel device::Device lvl::Lvl ptr::Ptr + task::Task val::Val end const Sharded = ShardedLevel -ShardedLevel(device::Device, lvl::Lvl) where {Device, Lvl} = ShardedLevel{Device}(device, moveto!(lvl, local_memory(device)), moveto!(postype(lvl)[], device), moveto!(postype(lvl)[], device), typeof(lvl)[]) -ShardedLevel(device::Device, lvl::Lvl, ptr::Ptr, val::Val) where {Device, Lvl, Ptr, Val} = - ShardedLevel{Device, Lvl, Ptr, Val}(device, lvl, ptr, val) +ShardedLevel(device::Device, lvl::Lvl) where {Device, Lvl} = + ShardedLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], typeof(lvl)[]) -Base.summary(::Sharded{Device, Lvl, Ptr, Val}) where {Device, Lvl, Ptr, Val} = "Sharded($(Lvl))" +ShardedLevel(device::Device, lvl::Lvl, ptr::Ptr, task::Task, val::Val) where {Device, Lvl, Ptr, Task, Val} = + ShardedLevel{Device, Lvl, Ptr, Task, Val}(device, lvl, ptr, task, val) -similar_level(lvl::Sharded{Device, Lvl, Ptr, Val}, fill_value, eltype::Type, dims...) where {Device, Lvl, Ptr, Val} = - ShardedLevel(similar_level(lvl.lvl, fill_value, eltype, dims...)) +Base.summary(::Sharded{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = "Sharded($(Lvl))" -postype(::Type{<:Sharded{Device, Lvl, Ptr, Val}}) where {Device, Lvl, Ptr, Val} = postype(Lvl) +similar_level(lvl::Sharded{Device, Lvl, Ptr, Task, Val}, fill_value, eltype::Type, dims...) where {Device, Lvl, Ptr, Task, Val} = + ShardedLevel(lvl, similar_level(lvl.lvl, fill_value, eltype, dims...)) + +postype(::Type{<:Sharded{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = postype(Lvl) function moveto(lvl::ShardedLevel, device) lvl_2 = moveto(lvl.lvl, device) + ptr_2 = moveto(lvl.ptr, device) + task_2 = moveto(lvl.task, device) val_2 = moveto(lvl.val, device) - return ShardedLevel(lvl_2, val_2) + return ShardedLevel(lvl_2, ptr_2, task_2, val_2) end -pattern!(lvl::ShardedLevel) = ShardedLevel(pattern!(lvl.lvl), map(pattern!, lvl.val)) -set_fill_value!(lvl::ShardedLevel, init) = ShardedLevel(set_fill_value!(lvl.lvl, init), map(lvl_2->set_fill_value!(lvl_2, init), lvl.val)) -Base.resize!(lvl::ShardedLevel, dims...) = ShardedLevel(resize!(lvl.lvl, dims...), map(lvl_2->resize!(lvl_2, dims...), lvl.val)) +pattern!(lvl::ShardedLevel) = ShardedLevel(pattern!(lvl.lvl), lvl.ptr, lvl.task, map(pattern!, lvl.val)) +set_fill_value!(lvl::ShardedLevel, init) = ShardedLevel(set_fill_value!(lvl.lvl, init), lvl.ptr, lvl.task, map(lvl_2 -> set_fill_value!(lvl_2, init), lvl.val)) +Base.resize!(lvl::ShardedLevel, dims...) = ShardedLevel(resize!(lvl.lvl, dims...), lvl.ptr, lvl.task, map(lvl_2 -> resize!(lvl_2, dims...), lvl.val)) -function Base.show(io::IO, lvl::ShardedLevel{Device, Lvl, Ptr, Val}) where {Device, Lvl, Ptr, Val} +function Base.show(io::IO, lvl::ShardedLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} print(io, "Sharded(") if get(io, :compact, false) print(io, "…") else show(io, lvl.lvl) print(io, ", ") + show(io, lvl.ptr) + print(io, ", ") + show(io, lvl.task) + print(io, ", ") show(io, lvl.val) end print(io, ")") end -labelled_show(io::IO, ::SubFiber{<:ShardedLevel}) = - print(io, "Pointer -> ") +function labelled_show(io::IO, fbr::SubFiber{<:ShardedLevel}) + (lvl, pos) = (fbr.lvl, fbr.pos) + print(io, "shard($(lvl.task[pos])) -> ") +end function labelled_children(fbr::SubFiber{<:ShardedLevel}) lvl = fbr.lvl pos = fbr.pos pos > length(lvl.val) && return [] - [LabelledTree(SubFiber(lvl.val[pos], 1))] + [LabelledTree(SubFiber(lvl.val[lvl.task[pos]], lvl.ptr[pos]))] end -@inline level_ndims(::Type{<:ShardedLevel{Device, Lvl, Ptr, Val}}) where {Device, Lvl, Ptr, Val} = level_ndims(Lvl) -@inline level_size(lvl::ShardedLevel{Device, Lvl, Ptr, Val}) where {Device, Lvl, Ptr, Val} = level_size(lvl.lvl) -@inline level_axes(lvl::ShardedLevel{Device, Lvl, Ptr, Val}) where {Device, Lvl, Ptr, Val} = level_axes(lvl.lvl) -@inline level_eltype(::Type{ShardedLevel{Device, Lvl, Ptr, Val}}) where {Device, Lvl, Ptr, Val} = level_eltype(Lvl) -@inline level_fill_value(::Type{<:ShardedLevel{Device, Lvl, Ptr, Val}}) where {Device, Lvl, Ptr, Val} = level_fill_value(Lvl) +@inline level_ndims(::Type{<:ShardedLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_ndims(Lvl) +@inline level_size(lvl::ShardedLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = level_size(lvl.lvl) +@inline level_axes(lvl::ShardedLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = level_axes(lvl.lvl) +@inline level_eltype(::Type{ShardedLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_eltype(Lvl) +@inline level_fill_value(::Type{<:ShardedLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_fill_value(Lvl) function (fbr::SubFiber{<:ShardedLevel})(idxs...) q = fbr.pos @@ -83,15 +94,17 @@ end countstored_level(lvl::ShardedLevel, pos) = pos mutable struct VirtualShardedLevel <: AbstractVirtualLevel - lvl # stand in for the sublevel for virutal resize, etc. + lvl # stand-in for the sublevel for virtual resize, etc. ex val Tv Lvl + Ptr + Task Val end -postype(lvl:: VirtualShardedLevel) = postype(lvl.lvl) +postype(lvl::VirtualShardedLevel) = postype(lvl.lvl) is_level_injective(ctx, lvl::VirtualShardedLevel) = [is_level_injective(ctx, lvl.lvl)..., true] function is_level_atomic(ctx, lvl::VirtualShardedLevel) @@ -105,20 +118,24 @@ end function lower(ctx::AbstractCompiler, lvl::VirtualShardedLevel, ::DefaultStyle) quote - $ShardedLevel{$(lvl.Lvl), $(lvl.Val)}($(ctx(lvl.lvl)), $(lvl.val)) + $ShardedLevel{$(lvl.Lvl), $(lvl.Ptr), $(lvl.Task), $(lvl.Val)}($(ctx(lvl.lvl)), $(lvl.val)) end end -function virtualize(ctx, ex, ::Type{ShardedLevel{Device, Lvl, Ptr, Val}}, tag=:lvl) where {Device, Lvl, Ptr, Val} +function virtualize(ctx, ex, ::Type{ShardedLevel{Device, Lvl, Ptr, Task, Val}}, tag=:lvl) where {Device, Lvl, Ptr, Task, Val} sym = freshen(ctx, tag) + ptr = freshen(ctx, tag, :_ptr) + task = freshen(ctx, tag, :_task) val = freshen(ctx, tag, :_val) push_preamble!(ctx, quote $sym = $ex + $ptr = $ex.ptr + $task = $ex.task $val = $ex.val end) lvl_2 = virtualize(ctx, :($ex.lvl), Lvl, sym) - VirtualShardedLevel(lvl_2, sym, val, typeof(level_fill_value(Lvl)), Lvl, Val) + VirtualShardedLevel(lvl_2, sym, val, typeof(level_fill_value(Lvl)), Lvl, Ptr, Task, Val) end Base.summary(lvl::VirtualShardedLevel) = "Sharded($(lvl.Lvl))" @@ -129,8 +146,6 @@ virtual_level_eltype(lvl::VirtualShardedLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualShardedLevel) = virtual_level_fill_value(lvl.lvl) function virtual_moveto_level(ctx, lvl::VirtualShardedLevel, arch) - - # Need to move each pointer... val_2 = freshen(ctx, lvl.val) push_preamble!(ctx, quote $val_2 = $(lvl.val) @@ -142,9 +157,7 @@ function virtual_moveto_level(ctx, lvl::VirtualShardedLevel, arch) virtual_moveto_level(ctx, lvl.lvl, arch) end - function declare_level!(ctx, lvl::VirtualShardedLevel, pos, init) - #declare_level!(lvl.lvl, ctx_2, literal(1), init) return lvl end @@ -178,6 +191,7 @@ function assemble_level!(ctx, lvl::VirtualShardedLevel, pos_start, pos_stop) end supports_reassembly(::VirtualShardedLevel) = true + function reassemble_level!(ctx, lvl::VirtualShardedLevel, pos_start, pos_stop) pos_start = cache!(ctx, :pos_start, simplify(ctx, pos_start)) pos_stop = cache!(ctx, :pos_stop, simplify(ctx, pos_stop)) @@ -265,3 +279,4 @@ function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardedLevel}, mode) end ) end +end \ No newline at end of file From 364d5a3afbbb6b3c5d44566e85ce29b6391c1af3 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 8 Jan 2025 12:10:17 -0500 Subject: [PATCH 05/25] stuff --- src/tensors/levels/sharded_levels.jl | 118 ++++++++++++++------------- 1 file changed, 63 insertions(+), 55 deletions(-) diff --git a/src/tensors/levels/sharded_levels.jl b/src/tensors/levels/sharded_levels.jl index 48f9538bf..a549a6be3 100644 --- a/src/tensors/levels/sharded_levels.jl +++ b/src/tensors/levels/sharded_levels.jl @@ -28,7 +28,7 @@ end const Sharded = ShardedLevel ShardedLevel(device::Device, lvl::Lvl) where {Device, Lvl} = - ShardedLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], typeof(lvl)[]) + ShardedLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], replicateto(lvl, device)) ShardedLevel(device::Device, lvl::Lvl, ptr::Ptr, task::Task, val::Val) where {Device, Lvl, Ptr, Task, Val} = ShardedLevel{Device, Lvl, Ptr, Task, Val}(device, lvl, ptr, task, val) @@ -44,7 +44,6 @@ function moveto(lvl::ShardedLevel, device) lvl_2 = moveto(lvl.lvl, device) ptr_2 = moveto(lvl.ptr, device) task_2 = moveto(lvl.task, device) - val_2 = moveto(lvl.val, device) return ShardedLevel(lvl_2, ptr_2, task_2, val_2) end @@ -158,60 +157,42 @@ function virtual_moveto_level(ctx, lvl::VirtualShardedLevel, arch) end function declare_level!(ctx, lvl::VirtualShardedLevel, pos, init) - return lvl + virtual_parallel_region!(ctx, lvl.device) do ctx, task + lvl_2 = virtualize(ctx, :($(lvl.ex).val[$(task)]), lvl.Lvl) #TODO should this virtualize the eltype of Val? + declare_level!(ctx, lvl.lvl, literal($task), init) + end end +""" +assemble: + mapping is pos -> task, ptr. task says which task has it, ptr says which position in that task has it. + +read: + read from pos to task, ptr. simple. + +write: + allocate something for this task on that position, assemble on the task itself on demand. Complain if the task is wrong. + +The outer level needs to be concurrent, like denselevel. +""" function assemble_level!(ctx, lvl::VirtualShardedLevel, pos_start, pos_stop) pos_start = cache!(ctx, :pos_start, simplify(ctx, pos_start)) pos_stop = cache!(ctx, :pos_stop, simplify(ctx, pos_stop)) pos = freshen(ctx, :pos) sym = freshen(ctx, :pointer_to_lvl) push_preamble!(ctx, quote - Finch.resize_if_smaller!($(lvl.val), $(ctx(pos_stop))) - for $pos in $(ctx(pos_start)):$(ctx(pos_stop)) - $sym = Finch.similar_level( - $(lvl.ex).lvl, - Finch.level_fill_value(typeof($(lvl.ex).lvl)), - Finch.level_eltype(typeof($(lvl.ex).lvl)), - $(map(ctx, map(getstop, virtual_level_size(ctx, lvl)))...) - ) - $(contain(ctx) do ctx_2 - lvl_2 = virtualize(ctx_2.code, sym, lvl.Lvl, sym) - lvl_2 = declare_level!(ctx_2, lvl_2, literal(0), literal(virtual_level_fill_value(lvl_2))) - lvl_2 = virtual_level_resize!(ctx_2, lvl_2, virtual_level_size(ctx_2, lvl.lvl)...) - push_preamble!(ctx_2, assemble_level!(ctx_2, lvl_2, literal(1), literal(1))) - contain(ctx_2) do ctx_3 - lvl_2 = freeze_level!(ctx_3, lvl_2, literal(1)) - :($(lvl.val)[$(ctx_3(pos))] = $(ctx_3(lvl_2))) - end - end) - end + Finch.resize_if_smaller!($(lvl.task), $(ctx(pos_stop))) + Finch.resize_if_smaller!($(lvl.ptr), $(ctx(pos_stop))) + Finch.fill_range!($(lvl.task), $(ctx(pos_start)), $(ctx(pos_stop)), 0) end) lvl end -supports_reassembly(::VirtualShardedLevel) = true - -function reassemble_level!(ctx, lvl::VirtualShardedLevel, pos_start, pos_stop) - pos_start = cache!(ctx, :pos_start, simplify(ctx, pos_start)) - pos_stop = cache!(ctx, :pos_stop, simplify(ctx, pos_stop)) - pos = freshen(ctx, :pos) - push_preamble!(ctx, quote - for $idx in $(ctx(pos_start)):$(ctx(pos_stop)) - $(contain(ctx) do ctx_2 - lvl_2 = virtualize(ctx_2.code, :($(lvl.val)[$idx]), lvl.Lvl, sym) - push_preamble!(ctx_2, assemble_level!(ctx_2, lvl_2, literal(1), literal(1))) - lvl_2 = declare_level!(ctx_2, lvl_2, literal(1), init) - contain(ctx_2) do ctx_3 - lvl_2 = freeze_level!(ctx_3, lvl_2, literal(1)) - :($(lvl.val)[$(ctx_3(pos))] = $(ctx_3(lvl_2))) - end - end) - end - end) - lvl -end +supports_reassembly(::VirtualShardedLevel) = false +""" +these two are no-ops, we insteaed do these on instantiate +""" function freeze_level!(ctx, lvl::VirtualShardedLevel, pos) return lvl end @@ -257,26 +238,53 @@ function instantiate(ctx, fbr::VirtualSubFiber{VirtualShardedLevel}, mode) end end +#we need some sort of localization step at the start of a parallel region whereby we can thaw the shart level + +""" +assemble: + mapping is pos -> task, ptr. task says which task has it, ptr says which position in that task has it. + +read: + read from pos to task, ptr. simple. + +write: + allocate something for this task on that position, assemble on the task itself on demand. Complain if the task is wrong. + +The outer level needs to be concurrent, like denselevel. +""" function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardedLevel}, mode) @assert mode.kind === updater (lvl, pos) = (fbr.lvl, fbr.pos) tag = lvl.ex sym = freshen(ctx, :pointer_to_lvl) + task = freshen(ctx, tag, :_task) + return Thunk( - body = (ctx) -> begin - lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) - lvl_2 = thaw_level!(ctx, lvl_2, literal(1)) - push_preamble!(ctx, assemble_level!(ctx, lvl_2, literal(1), literal(1))) - res = instantiate(ctx, VirtualHollowSubFiber(lvl_2, literal(1), fbr.dirty), mode) - push_epilogue!(ctx, - contain(ctx) do ctx_2 - lvl_2 = freeze_level!(ctx_2, lvl_2, literal(1)) - :($(lvl.val)[$(ctx_2(pos))] = $(ctx_2(lvl_2))) + preamble = quote + $task = $(lvl.task)[$(ctx(pos))] + if task == 0 + $(lvl.task)[$(ctx(pos))] = $(gettasknum(ctx)) + qos = local_qos_fill + if $(lvl.local_qos_fill) > $(lvl.local_qos_stop) + $local_qos_stop = max($local_qos_stop << 1, 1) + $(contain(ctx_2->assemble_level!(ctx_2, lvl.lvl, value(qos_fill, Tp), value(qos_stop, Tp)), ctx)) + end + else + qos = $(lvl.ptr)[$(ctx(pos))] + qos_stop = $(lvl.local_qos_stop) + #only in safe mode, we check if task == $(gettasknum(ctx)) and if not error("Task mismatch in ShardedLevel") end - ) - res + dirty = true + end) + end + body = (ctx) -> VirtualHollowSubFiber(lvl.lvl, local_) + epilogue = quote + #this task will always own this position forever, even if we don't write to it. Still, we try to be conservative of memory usage of the underlying level. + if dirty && $(lvl.ptr)[$(ctx(pos))] == 0 + local_qos_fill += 1 + $(lvl.ptr)[$(ctx(pos))] = $(lvl.local_qos_fill) += 1 end - ) + end end end \ No newline at end of file From fa71b9bce473b252fbf9a7862dff551e88b40236 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Thu, 30 Jan 2025 10:50:53 -0500 Subject: [PATCH 06/25] fixes --- src/Finch.jl | 2 + .../{sharded_levels.jl => shard_levels.jl} | 137 +++++++++--------- 2 files changed, 70 insertions(+), 69 deletions(-) rename src/tensors/levels/{sharded_levels.jl => shard_levels.jl} (52%) diff --git a/src/Finch.jl b/src/Finch.jl index 1b6fee820..606d8c5bb 100644 --- a/src/Finch.jl +++ b/src/Finch.jl @@ -40,6 +40,7 @@ export Dense, DenseLevel export Element, ElementLevel export AtomicElement, AtomicElementLevel export Separate, SeparateLevel +export Shard, ShardLevel export Mutex, MutexLevel export Pattern, PatternLevel export Scalar, SparseScalar, ShortCircuitScalar, SparseShortCircuitScalar @@ -138,6 +139,7 @@ include("tensors/levels/dense_rle_levels.jl") include("tensors/levels/element_levels.jl") include("tensors/levels/atomic_element_levels.jl") include("tensors/levels/separate_levels.jl") +include("tensors/levels/shard_levels.jl") include("tensors/levels/mutex_levels.jl") include("tensors/levels/pattern_levels.jl") include("tensors/masks.jl") diff --git a/src/tensors/levels/sharded_levels.jl b/src/tensors/levels/shard_levels.jl similarity index 52% rename from src/tensors/levels/sharded_levels.jl rename to src/tensors/levels/shard_levels.jl index a549a6be3..021f1388d 100644 --- a/src/tensors/levels/sharded_levels.jl +++ b/src/tensors/levels/shard_levels.jl @@ -1,13 +1,13 @@ """ - ShardedLevel{Lvl, [Val]}() + ShardLevel{Lvl, [Val]}() -Each subfiber of a Sharded level is stored in a thread-local tensor of type +Each subfiber of a Shard level is stored in a thread-local tensor of type `Lvl`, in a thread-local memory space. Each sublevel is stored in a vector of type `Val` with `eltype(Val) = Lvl`. ```jldoctest -julia> tensor_tree(Tensor(Dense(Sharded(Element(0.0))), [1, 2, 3])) +julia> tensor_tree(Tensor(Dense(Shard(Element(0.0))), [1, 2, 3])) 3-Tensor └─ Dense [1:3] ├─ [1]: Shard -> @@ -18,41 +18,41 @@ julia> tensor_tree(Tensor(Dense(Sharded(Element(0.0))), [1, 2, 3])) └─ 3.0 ``` """ -struct ShardedLevel{Device, Lvl, Ptr, Task, Val} <: AbstractLevel +struct ShardLevel{Device, Lvl, Ptr, Task, Val} <: AbstractLevel device::Device lvl::Lvl ptr::Ptr task::Task val::Val end -const Sharded = ShardedLevel +const Shard = ShardLevel -ShardedLevel(device::Device, lvl::Lvl) where {Device, Lvl} = - ShardedLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], replicateto(lvl, device)) +ShardLevel(device::Device, lvl::Lvl) where {Device, Lvl} = + ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], moveto(lvl, device)) #TODO scatterto? -ShardedLevel(device::Device, lvl::Lvl, ptr::Ptr, task::Task, val::Val) where {Device, Lvl, Ptr, Task, Val} = - ShardedLevel{Device, Lvl, Ptr, Task, Val}(device, lvl, ptr, task, val) +#ShardLevel(device::Device, lvl::Lvl, ptr::Ptr, task::Task, val::Val) where {Device, Lvl, Ptr, Task, Val} = +# ShardLevel{Device, Lvl, Ptr, Task, Val}(device, lvl, ptr, task, val) -Base.summary(::Sharded{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = "Sharded($(Lvl))" +Base.summary(::Shard{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = "Shard($(Lvl))" -similar_level(lvl::Sharded{Device, Lvl, Ptr, Task, Val}, fill_value, eltype::Type, dims...) where {Device, Lvl, Ptr, Task, Val} = - ShardedLevel(lvl, similar_level(lvl.lvl, fill_value, eltype, dims...)) +similar_level(lvl::Shard{Device, Lvl, Ptr, Task, Val}, fill_value, eltype::Type, dims...) where {Device, Lvl, Ptr, Task, Val} = + ShardLevel(lvl, similar_level(lvl.lvl, fill_value, eltype, dims...)) -postype(::Type{<:Sharded{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = postype(Lvl) +postype(::Type{<:Shard{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = postype(Lvl) -function moveto(lvl::ShardedLevel, device) +function moveto(lvl::ShardLevel, device) lvl_2 = moveto(lvl.lvl, device) ptr_2 = moveto(lvl.ptr, device) task_2 = moveto(lvl.task, device) - return ShardedLevel(lvl_2, ptr_2, task_2, val_2) + return ShardLevel(lvl_2, ptr_2, task_2, val_2) end -pattern!(lvl::ShardedLevel) = ShardedLevel(pattern!(lvl.lvl), lvl.ptr, lvl.task, map(pattern!, lvl.val)) -set_fill_value!(lvl::ShardedLevel, init) = ShardedLevel(set_fill_value!(lvl.lvl, init), lvl.ptr, lvl.task, map(lvl_2 -> set_fill_value!(lvl_2, init), lvl.val)) -Base.resize!(lvl::ShardedLevel, dims...) = ShardedLevel(resize!(lvl.lvl, dims...), lvl.ptr, lvl.task, map(lvl_2 -> resize!(lvl_2, dims...), lvl.val)) +pattern!(lvl::ShardLevel) = ShardLevel(pattern!(lvl.lvl), lvl.ptr, lvl.task, map(pattern!, lvl.val)) +set_fill_value!(lvl::ShardLevel, init) = ShardLevel(set_fill_value!(lvl.lvl, init), lvl.ptr, lvl.task, map(lvl_2 -> set_fill_value!(lvl_2, init), lvl.val)) +Base.resize!(lvl::ShardLevel, dims...) = ShardLevel(resize!(lvl.lvl, dims...), lvl.ptr, lvl.task, map(lvl_2 -> resize!(lvl_2, dims...), lvl.val)) -function Base.show(io::IO, lvl::ShardedLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} - print(io, "Sharded(") +function Base.show(io::IO, lvl::ShardLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} + print(io, "Shard(") if get(io, :compact, false) print(io, "…") else @@ -67,32 +67,32 @@ function Base.show(io::IO, lvl::ShardedLevel{Device, Lvl, Ptr, Task, Val}) where print(io, ")") end -function labelled_show(io::IO, fbr::SubFiber{<:ShardedLevel}) +function labelled_show(io::IO, fbr::SubFiber{<:ShardLevel}) (lvl, pos) = (fbr.lvl, fbr.pos) print(io, "shard($(lvl.task[pos])) -> ") end -function labelled_children(fbr::SubFiber{<:ShardedLevel}) +function labelled_children(fbr::SubFiber{<:ShardLevel}) lvl = fbr.lvl pos = fbr.pos pos > length(lvl.val) && return [] [LabelledTree(SubFiber(lvl.val[lvl.task[pos]], lvl.ptr[pos]))] end -@inline level_ndims(::Type{<:ShardedLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_ndims(Lvl) -@inline level_size(lvl::ShardedLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = level_size(lvl.lvl) -@inline level_axes(lvl::ShardedLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = level_axes(lvl.lvl) -@inline level_eltype(::Type{ShardedLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_eltype(Lvl) -@inline level_fill_value(::Type{<:ShardedLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_fill_value(Lvl) +@inline level_ndims(::Type{<:ShardLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_ndims(Lvl) +@inline level_size(lvl::ShardLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = level_size(lvl.lvl) +@inline level_axes(lvl::ShardLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = level_axes(lvl.lvl) +@inline level_eltype(::Type{ShardLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_eltype(Lvl) +@inline level_fill_value(::Type{<:ShardLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_fill_value(Lvl) -function (fbr::SubFiber{<:ShardedLevel})(idxs...) +function (fbr::SubFiber{<:ShardLevel})(idxs...) q = fbr.pos return SubFiber(fbr.lvl.val[q], 1)(idxs...) end -countstored_level(lvl::ShardedLevel, pos) = pos +countstored_level(lvl::ShardLevel, pos) = pos -mutable struct VirtualShardedLevel <: AbstractVirtualLevel +mutable struct VirtualShardLevel <: AbstractVirtualLevel lvl # stand-in for the sublevel for virtual resize, etc. ex val @@ -103,25 +103,25 @@ mutable struct VirtualShardedLevel <: AbstractVirtualLevel Val end -postype(lvl::VirtualShardedLevel) = postype(lvl.lvl) +postype(lvl::VirtualShardLevel) = postype(lvl.lvl) -is_level_injective(ctx, lvl::VirtualShardedLevel) = [is_level_injective(ctx, lvl.lvl)..., true] -function is_level_atomic(ctx, lvl::VirtualShardedLevel) +is_level_injective(ctx, lvl::VirtualShardLevel) = [is_level_injective(ctx, lvl.lvl)..., true] +function is_level_atomic(ctx, lvl::VirtualShardLevel) (below, atomic) = is_level_atomic(ctx, lvl.lvl) return ([below; [atomic]], atomic) end -function is_level_concurrent(ctx, lvl::VirtualShardedLevel) +function is_level_concurrent(ctx, lvl::VirtualShardLevel) (data, _) = is_level_concurrent(ctx, lvl.lvl) return (data, true) end -function lower(ctx::AbstractCompiler, lvl::VirtualShardedLevel, ::DefaultStyle) +function lower(ctx::AbstractCompiler, lvl::VirtualShardLevel, ::DefaultStyle) quote - $ShardedLevel{$(lvl.Lvl), $(lvl.Ptr), $(lvl.Task), $(lvl.Val)}($(ctx(lvl.lvl)), $(lvl.val)) + $ShardLevel{$(lvl.Lvl), $(lvl.Ptr), $(lvl.Task), $(lvl.Val)}($(ctx(lvl.lvl)), $(lvl.val)) end end -function virtualize(ctx, ex, ::Type{ShardedLevel{Device, Lvl, Ptr, Task, Val}}, tag=:lvl) where {Device, Lvl, Ptr, Task, Val} +function virtualize(ctx, ex, ::Type{ShardLevel{Device, Lvl, Ptr, Task, Val}}, tag=:lvl) where {Device, Lvl, Ptr, Task, Val} sym = freshen(ctx, tag) ptr = freshen(ctx, tag, :_ptr) task = freshen(ctx, tag, :_task) @@ -134,17 +134,17 @@ function virtualize(ctx, ex, ::Type{ShardedLevel{Device, Lvl, Ptr, Task, Val}}, $val = $ex.val end) lvl_2 = virtualize(ctx, :($ex.lvl), Lvl, sym) - VirtualShardedLevel(lvl_2, sym, val, typeof(level_fill_value(Lvl)), Lvl, Ptr, Task, Val) + VirtualShardLevel(lvl_2, sym, val, typeof(level_fill_value(Lvl)), Lvl, Ptr, Task, Val) end -Base.summary(lvl::VirtualShardedLevel) = "Sharded($(lvl.Lvl))" +Base.summary(lvl::VirtualShardLevel) = "Shard($(lvl.Lvl))" -virtual_level_resize!(ctx, lvl::VirtualShardedLevel, dims...) = (lvl.lvl = virtual_level_resize!(ctx, lvl.lvl, dims...); lvl) -virtual_level_size(ctx, lvl::VirtualShardedLevel) = virtual_level_size(ctx, lvl.lvl) -virtual_level_eltype(lvl::VirtualShardedLevel) = virtual_level_eltype(lvl.lvl) -virtual_level_fill_value(lvl::VirtualShardedLevel) = virtual_level_fill_value(lvl.lvl) +virtual_level_resize!(ctx, lvl::VirtualShardLevel, dims...) = (lvl.lvl = virtual_level_resize!(ctx, lvl.lvl, dims...); lvl) +virtual_level_size(ctx, lvl::VirtualShardLevel) = virtual_level_size(ctx, lvl.lvl) +virtual_level_eltype(lvl::VirtualShardLevel) = virtual_level_eltype(lvl.lvl) +virtual_level_fill_value(lvl::VirtualShardLevel) = virtual_level_fill_value(lvl.lvl) -function virtual_moveto_level(ctx, lvl::VirtualShardedLevel, arch) +function virtual_moveto_level(ctx, lvl::VirtualShardLevel, arch) val_2 = freshen(ctx, lvl.val) push_preamble!(ctx, quote $val_2 = $(lvl.val) @@ -156,10 +156,10 @@ function virtual_moveto_level(ctx, lvl::VirtualShardedLevel, arch) virtual_moveto_level(ctx, lvl.lvl, arch) end -function declare_level!(ctx, lvl::VirtualShardedLevel, pos, init) +function declare_level!(ctx, lvl::VirtualShardLevel, pos, init) virtual_parallel_region!(ctx, lvl.device) do ctx, task lvl_2 = virtualize(ctx, :($(lvl.ex).val[$(task)]), lvl.Lvl) #TODO should this virtualize the eltype of Val? - declare_level!(ctx, lvl.lvl, literal($task), init) + declare_level!(ctx, lvl_2, literal(1), init) end end @@ -175,7 +175,7 @@ write: The outer level needs to be concurrent, like denselevel. """ -function assemble_level!(ctx, lvl::VirtualShardedLevel, pos_start, pos_stop) +function assemble_level!(ctx, lvl::VirtualShardLevel, pos_start, pos_stop) pos_start = cache!(ctx, :pos_start, simplify(ctx, pos_start)) pos_stop = cache!(ctx, :pos_stop, simplify(ctx, pos_stop)) pos = freshen(ctx, :pos) @@ -188,20 +188,20 @@ function assemble_level!(ctx, lvl::VirtualShardedLevel, pos_start, pos_stop) lvl end -supports_reassembly(::VirtualShardedLevel) = false +supports_reassembly(::VirtualShardLevel) = false """ these two are no-ops, we insteaed do these on instantiate """ -function freeze_level!(ctx, lvl::VirtualShardedLevel, pos) +function freeze_level!(ctx, lvl::VirtualShardLevel, pos) return lvl end -function thaw_level!(ctx::AbstractCompiler, lvl::VirtualShardedLevel, pos) +function thaw_level!(ctx::AbstractCompiler, lvl::VirtualShardLevel, pos) return lvl end -function instantiate(ctx, fbr::VirtualSubFiber{VirtualShardedLevel}, mode) +function instantiate(ctx, fbr::VirtualSubFiber{VirtualShardLevel}, mode) if mode.kind === reader (lvl, pos) = (fbr.lvl, fbr.pos) tag = lvl.ex @@ -252,7 +252,7 @@ write: The outer level needs to be concurrent, like denselevel. """ -function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardedLevel}, mode) +function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardLevel}, mode) @assert mode.kind === updater (lvl, pos) = (fbr.lvl, fbr.pos) tag = lvl.ex @@ -262,23 +262,22 @@ function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardedLevel}, mode) return Thunk( preamble = quote - $task = $(lvl.task)[$(ctx(pos))] - if task == 0 - $(lvl.task)[$(ctx(pos))] = $(gettasknum(ctx)) - qos = local_qos_fill - if $(lvl.local_qos_fill) > $(lvl.local_qos_stop) - $local_qos_stop = max($local_qos_stop << 1, 1) - $(contain(ctx_2->assemble_level!(ctx_2, lvl.lvl, value(qos_fill, Tp), value(qos_stop, Tp)), ctx)) - end - else - qos = $(lvl.ptr)[$(ctx(pos))] - qos_stop = $(lvl.local_qos_stop) - #only in safe mode, we check if task == $(gettasknum(ctx)) and if not error("Task mismatch in ShardedLevel") + $task = $(lvl.task)[$(ctx(pos))] + if task == 0 + $(lvl.task)[$(ctx(pos))] = $(gettasknum(ctx)) + qos = local_qos_fill + if $(lvl.local_qos_fill) > $(lvl.local_qos_stop) + $local_qos_stop = max($local_qos_stop << 1, 1) + $(contain(ctx_2->assemble_level!(ctx_2, lvl.lvl, value(qos_fill, Tp), value(qos_stop, Tp)), ctx)) end - dirty = true - end) - end - body = (ctx) -> VirtualHollowSubFiber(lvl.lvl, local_) + else + qos = $(lvl.ptr)[$(ctx(pos))] + qos_stop = $(lvl.local_qos_stop) + #only in safe mode, we check if task == $(gettasknum(ctx)) and if not error("Task mismatch in ShardLevel") + end + dirty = true + end, + body = (ctx) -> VirtualHollowSubFiber(lvl.lvl, value(qos), dirty), epilogue = quote #this task will always own this position forever, even if we don't write to it. Still, we try to be conservative of memory usage of the underlying level. if dirty && $(lvl.ptr)[$(ctx(pos))] == 0 @@ -286,5 +285,5 @@ function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardedLevel}, mode) $(lvl.ptr)[$(ctx(pos))] = $(lvl.local_qos_fill) += 1 end end - end + ) end \ No newline at end of file From 193064addda23c843488c746f15d1ba693ae8dc9 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Thu, 30 Jan 2025 15:00:53 -0500 Subject: [PATCH 07/25] some fixes --- src/architecture.jl | 38 +++++++++++++++++++++++++----- src/lower.jl | 27 ++++++--------------- src/tensors/levels/shard_levels.jl | 13 ++++++---- 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/src/architecture.jl b/src/architecture.jl index eadbc64fc..9d0b89a42 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -41,6 +41,7 @@ struct CPU <: AbstractDevice n::Int end CPU() = CPU(Threads.nthreads()) +get_num_tasks(dev::CPU) = dev.n @kwdef struct VirtualCPU <: AbstractVirtualDevice ex n @@ -54,10 +55,10 @@ function virtualize(ctx, ex, ::Type{CPU}) end lower(ctx::AbstractCompiler, device::VirtualCPU, ::DefaultStyle) = something(device.ex, :(CPU($(ctx(device.n))))) +virtual_get_num_tasks(::VirtualCPU) = 1 FinchNotation.finch_leaf(device::VirtualCPU) = virtual(device) - """ Serial() @@ -66,14 +67,15 @@ A device that represents a serial CPU execution. struct Serial <: AbstractTask end const serial = Serial() get_device(::Serial) = CPU(1) -get_task(::Serial) = nothing +get_parent_task(::Serial) = nothing +get_task_number(::Serial) = 1 struct VirtualSerial <: AbstractVirtualTask end virtualize(ctx, ex, ::Type{Serial}) = VirtualSerial() lower(ctx::AbstractCompiler, task::VirtualSerial, ::DefaultStyle) = :(Serial()) FinchNotation.finch_leaf(device::VirtualSerial) = virtual(device) virtual_get_device(::VirtualSerial) = VirtualCPU(nothing, 1) -virtual_get_task(::VirtualSerial) = nothing - +virtual_get_parent_task(::VirtualSerial) = nothing +virtual_get_task_number(::VirtualSerial) = literal(1) struct CPUThread{Parent} <: AbstractTask tid::Int @@ -81,7 +83,8 @@ struct CPUThread{Parent} <: AbstractTask parent::Parent end get_device(task::CPUThread) = task.device -get_task(task::CPUThread) = task.parent +get_parent_task(task::CPUThread) = task.parent +get_task_number(task::CPUThread) = task.tid @inline function make_lock(::Type{Threads.Atomic{T}}) where {T} return Threads.Atomic{T}(zero(T)) @@ -138,7 +141,8 @@ end lower(ctx::AbstractCompiler, task::VirtualCPUThread, ::DefaultStyle) = :(CPUThread($(ctx(task.tid)), $(ctx(task.dev)), $(ctx(task.parent)))) FinchNotation.finch_leaf(device::VirtualCPUThread) = virtual(device) virtual_get_device(task::VirtualCPUThread) = task.dev -virtual_get_task(task::VirtualCPUThread) = task.parent +virtual_get_parent_task(task::VirtualCPUThread) = task.parent +virtual_get_task_number(task::VirtualCPUThread) = task.tid struct CPULocalMemory device::CPU @@ -226,4 +230,26 @@ for T = [Bool, Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, Int128, UnsafeAtomics.cas!(pointer(vec, idx), $T(Vf), x, UnsafeAtomics.seq_cst, UnsafeAtomics.seq_cst) end +end + +function virtual_parallel_region(f, ctx, ::Serial) + contain(f, ctx) +end + +function virtual_parallel_region(f, ctx, device::VirtualCPU) + code = contain(ctx) do ctx_2 + subtask = VirtualCPUThread(value(i, Int), device, ctx_2.code.task) + contain(f, ctx_2, task=subtask) + end + + return quote + Threads.@threads for $i = 1:$(ctx(device.n)) + Finch.@barrier begin + @inbounds @fastmath begin + $code + end + nothing + end + end + end end \ No newline at end of file diff --git a/src/lower.jl b/src/lower.jl index 278ef8bfa..19a1f808c 100644 --- a/src/lower.jl +++ b/src/lower.jl @@ -321,27 +321,14 @@ function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualC virtual_moveto(ctx, resolve(ctx, tns), device) end - code = contain(ctx) do ctx_2 - subtask = VirtualCPUThread(value(i, Int), device, ctx_2.code.task) - contain(ctx_2, task=subtask) do ctx_3 - for tns in intersect(used_in_scope, decl_in_scope) - virtual_moveto(ctx_3, resolve(ctx_3, tns), subtask) - end - contain(ctx_3) do ctx_4 - open_scope(ctx_4) do ctx_5 - ctx_5(instantiate!(ctx_5, root_2)) - end - end + virtual_parallel_region(ctx, device) do ctx_2 + subtask = ctx_2.task + for tns in intersect(used_in_scope, decl_in_scope) + virtual_moveto(ctx_3, resolve(ctx_3, tns), subtask) end - end - - return quote - Threads.@threads for $i = 1:$(ctx(device.n)) - Finch.@barrier begin - @inbounds @fastmath begin - $code - end - nothing + contain(ctx_3) do ctx_4 + open_scope(ctx_4) do ctx_5 + ctx_5(instantiate!(ctx_5, root_2)) end end end diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl index 021f1388d..1d3ca600b 100644 --- a/src/tensors/levels/shard_levels.jl +++ b/src/tensors/levels/shard_levels.jl @@ -30,13 +30,13 @@ const Shard = ShardLevel ShardLevel(device::Device, lvl::Lvl) where {Device, Lvl} = ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], moveto(lvl, device)) #TODO scatterto? -#ShardLevel(device::Device, lvl::Lvl, ptr::Ptr, task::Task, val::Val) where {Device, Lvl, Ptr, Task, Val} = -# ShardLevel{Device, Lvl, Ptr, Task, Val}(device, lvl, ptr, task, val) +ShardLevel{Device}(device, lvl::Lvl, ptr::Ptr, task::Task, val::Val) where {Device, Lvl, Ptr, Task, Val} = + ShardLevel{Device, Lvl, Ptr, Task, Val}(device, lvl, ptr, task, val) Base.summary(::Shard{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = "Shard($(Lvl))" similar_level(lvl::Shard{Device, Lvl, Ptr, Task, Val}, fill_value, eltype::Type, dims...) where {Device, Lvl, Ptr, Task, Val} = - ShardLevel(lvl, similar_level(lvl.lvl, fill_value, eltype, dims...)) + ShardLevel(lvl.device, similar_level(lvl.lvl, fill_value, eltype, dims...)) postype(::Type{<:Shard{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = postype(Lvl) @@ -93,10 +93,12 @@ end countstored_level(lvl::ShardLevel, pos) = pos mutable struct VirtualShardLevel <: AbstractVirtualLevel + device lvl # stand-in for the sublevel for virtual resize, etc. ex val Tv + Device Lvl Ptr Task @@ -133,8 +135,9 @@ function virtualize(ctx, ex, ::Type{ShardLevel{Device, Lvl, Ptr, Task, Val}}, ta $task = $ex.task $val = $ex.val end) + device_2 = virtualize(ctx, :($ex.device), Device, sym) lvl_2 = virtualize(ctx, :($ex.lvl), Lvl, sym) - VirtualShardLevel(lvl_2, sym, val, typeof(level_fill_value(Lvl)), Lvl, Ptr, Task, Val) + VirtualShardLevel(device_2, lvl_2, sym, val, typeof(level_fill_value(Lvl)), Device, Lvl, Ptr, Task, Val) end Base.summary(lvl::VirtualShardLevel) = "Shard($(lvl.Lvl))" @@ -157,7 +160,7 @@ function virtual_moveto_level(ctx, lvl::VirtualShardLevel, arch) end function declare_level!(ctx, lvl::VirtualShardLevel, pos, init) - virtual_parallel_region!(ctx, lvl.device) do ctx, task + virtual_parallel_region(ctx, lvl.device) do ctx, task lvl_2 = virtualize(ctx, :($(lvl.ex).val[$(task)]), lvl.Lvl) #TODO should this virtualize the eltype of Val? declare_level!(ctx, lvl_2, literal(1), init) end From 46f56526720b6ad7efcdf7147e886a22d1313f79 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Thu, 30 Jan 2025 15:35:43 -0500 Subject: [PATCH 08/25] sorta broken --- src/architecture.jl | 24 +++++++++++---------- src/lower.jl | 20 ++++++++--------- src/tensors/levels/atomic_element_levels.jl | 4 ++-- src/tensors/levels/mutex_levels.jl | 8 +++---- src/tensors/levels/shard_levels.jl | 16 +++++++++----- 5 files changed, 39 insertions(+), 33 deletions(-) diff --git a/src/architecture.jl b/src/architecture.jl index 9d0b89a42..8484dbc8c 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -55,7 +55,7 @@ function virtualize(ctx, ex, ::Type{CPU}) end lower(ctx::AbstractCompiler, device::VirtualCPU, ::DefaultStyle) = something(device.ex, :(CPU($(ctx(device.n))))) -virtual_get_num_tasks(::VirtualCPU) = 1 +get_num_tasks(::VirtualCPU) = 1 FinchNotation.finch_leaf(device::VirtualCPU) = virtual(device) @@ -68,14 +68,14 @@ struct Serial <: AbstractTask end const serial = Serial() get_device(::Serial) = CPU(1) get_parent_task(::Serial) = nothing -get_task_number(::Serial) = 1 +get_task_num(::Serial) = 1 struct VirtualSerial <: AbstractVirtualTask end virtualize(ctx, ex, ::Type{Serial}) = VirtualSerial() lower(ctx::AbstractCompiler, task::VirtualSerial, ::DefaultStyle) = :(Serial()) FinchNotation.finch_leaf(device::VirtualSerial) = virtual(device) -virtual_get_device(::VirtualSerial) = VirtualCPU(nothing, 1) -virtual_get_parent_task(::VirtualSerial) = nothing -virtual_get_task_number(::VirtualSerial) = literal(1) +get_device(::VirtualSerial) = VirtualCPU(nothing, 1) +get_parent_task(::VirtualSerial) = nothing +get_task_num(::VirtualSerial) = literal(1) struct CPUThread{Parent} <: AbstractTask tid::Int @@ -84,7 +84,7 @@ struct CPUThread{Parent} <: AbstractTask end get_device(task::CPUThread) = task.device get_parent_task(task::CPUThread) = task.parent -get_task_number(task::CPUThread) = task.tid +get_task_num(task::CPUThread) = task.tid @inline function make_lock(::Type{Threads.Atomic{T}}) where {T} return Threads.Atomic{T}(zero(T)) @@ -140,9 +140,9 @@ function virtualize(ctx, ex, ::Type{CPUThread{Parent}}) where {Parent} end lower(ctx::AbstractCompiler, task::VirtualCPUThread, ::DefaultStyle) = :(CPUThread($(ctx(task.tid)), $(ctx(task.dev)), $(ctx(task.parent)))) FinchNotation.finch_leaf(device::VirtualCPUThread) = virtual(device) -virtual_get_device(task::VirtualCPUThread) = task.dev -virtual_get_parent_task(task::VirtualCPUThread) = task.parent -virtual_get_task_number(task::VirtualCPUThread) = task.tid +get_device(task::VirtualCPUThread) = task.dev +get_parent_task(task::VirtualCPUThread) = task.parent +get_task_num(task::VirtualCPUThread) = task.tid struct CPULocalMemory device::CPU @@ -237,13 +237,15 @@ function virtual_parallel_region(f, ctx, ::Serial) end function virtual_parallel_region(f, ctx, device::VirtualCPU) + tid = freshen(ctx, :tid) + code = contain(ctx) do ctx_2 - subtask = VirtualCPUThread(value(i, Int), device, ctx_2.code.task) + subtask = VirtualCPUThread(value(tid, Int), device, ctx_2.code.task) contain(f, ctx_2, task=subtask) end return quote - Threads.@threads for $i = 1:$(ctx(device.n)) + Threads.@threads for $tid = 1:$(ctx(device.n)) Finch.@barrier begin @inbounds @fastmath begin $code diff --git a/src/lower.jl b/src/lower.jl index 19a1f808c..ed17c6104 100644 --- a/src/lower.jl +++ b/src/lower.jl @@ -294,9 +294,6 @@ lower_loop(ctx, root, ext::ParallelDimension) = function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualCPU) root = ensure_concurrent(root, ctx) - tid = index(freshen(ctx, :tid)) - i = freshen(ctx, :i) - decl_in_scope = unique(filter(!isnothing, map(node-> begin if @capture(node, declare(~tns, ~init, ~op)) tns @@ -309,25 +306,26 @@ function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualC end end, PostOrderDFS(root.body)))) - root_2 = loop(tid, Extent(value(i, Int), value(i, Int)), - loop(root.idx, ext.ext, - sieve(access(VirtualSplitMask(device.n), reader(), root.idx, tid), - root.body - ) - ) - ) - for tns in setdiff(used_in_scope, decl_in_scope) virtual_moveto(ctx, resolve(ctx, tns), device) end virtual_parallel_region(ctx, device) do ctx_2 subtask = ctx_2.task + tid = get_task_number(subtask) for tns in intersect(used_in_scope, decl_in_scope) virtual_moveto(ctx_3, resolve(ctx_3, tns), subtask) end contain(ctx_3) do ctx_4 open_scope(ctx_4) do ctx_5 + i = index(freshen(ctx, :i)) + root_2 = loop(i, Extent(tid, tid), + loop(root.idx, ext.ext, + sieve(access(VirtualSplitMask(device.n), reader(), root.idx, i), + root.body + ) + ) + ) ctx_5(instantiate!(ctx_5, root_2)) end end diff --git a/src/tensors/levels/atomic_element_levels.jl b/src/tensors/levels/atomic_element_levels.jl index d1cc423d7..329469f4d 100644 --- a/src/tensors/levels/atomic_element_levels.jl +++ b/src/tensors/levels/atomic_element_levels.jl @@ -178,7 +178,7 @@ function lower_assign(ctx, fbr::VirtualSubFiber{VirtualAtomicElementLevel}, mode (lvl, pos) = (fbr.lvl, fbr.pos) op = ctx(op) rhs = ctx(rhs) - device = ctx(virtual_get_device(get_task(ctx))) + device = ctx(get_device(get_task(ctx))) :(Finch.atomic_modify!($device, $(lvl.val), $(ctx(pos)), $op, $rhs)) end @@ -189,6 +189,6 @@ function lower_assign(ctx, fbr::VirtualHollowSubFiber{VirtualAtomicElementLevel} end) op = ctx(op) rhs = ctx(rhs) - device = ctx(virtual_get_device(get_task(ctx))) + device = ctx(get_device(get_task(ctx))) :(Finch.atomic_modify!($device, $(lvl.val), $(ctx(pos)), $op, $rhs)) end \ No newline at end of file diff --git a/src/tensors/levels/mutex_levels.jl b/src/tensors/levels/mutex_levels.jl index d91f79074..b37e2c434 100644 --- a/src/tensors/levels/mutex_levels.jl +++ b/src/tensors/levels/mutex_levels.jl @@ -218,7 +218,7 @@ function unfurl(ctx, fbr::VirtualSubFiber{VirtualMutexLevel}, ext, mode, proto) sym = freshen(ctx, lvl.ex, :after_atomic_lvl) atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) - dev = lower(ctx, virtual_get_device(ctx.code.task), DefaultStyle()) + dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) push_preamble!(ctx, quote $atomicData = Finch.get_lock($dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal))) $lockVal = Finch.aquire_lock!($dev, $atomicData) @@ -237,7 +237,7 @@ function unfurl(ctx, fbr::VirtualHollowSubFiber{VirtualMutexLevel}, ext, mode, p sym = freshen(ctx, lvl.ex, :after_atomic_lvl) atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) - dev = lower(ctx, virtual_get_device(ctx.code.task), DefaultStyle()) + dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) push_preamble!(ctx, quote $atomicData = Finch.get_lock($dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal))) $lockVal = Finch.aquire_lock!($dev, $atomicData) @@ -254,7 +254,7 @@ function lower_assign(ctx, fbr::VirtualSubFiber{VirtualMutexLevel}, mode, op, rh sym = freshen(ctx, lvl.ex, :after_atomic_lvl) atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) - dev = lower(ctx, virtual_get_device(ctx.code.task), DefaultStyle()) + dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) push_preamble!(ctx, quote $atomicData = Finch.get_lock($dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal))) $lockVal = Finch.aquire_lock!($dev, $atomicData) @@ -271,7 +271,7 @@ function lower_assign(ctx, fbr::VirtualHollowSubFiber{VirtualMutexLevel}, mode, sym = freshen(ctx, lvl.ex, :after_atomic_lvl) atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) - dev = lower(ctx, virtual_get_device(ctx.code.task), DefaultStyle()) + dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) push_preamble!(ctx, quote $atomicData = Finch.get_lock($dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal))) $lockVal = Finch.aquire_lock!($dev, $atomicData) diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl index 1d3ca600b..a548e4750 100644 --- a/src/tensors/levels/shard_levels.jl +++ b/src/tensors/levels/shard_levels.jl @@ -96,6 +96,8 @@ mutable struct VirtualShardLevel <: AbstractVirtualLevel device lvl # stand-in for the sublevel for virtual resize, etc. ex + ptr + task val Tv Device @@ -137,7 +139,7 @@ function virtualize(ctx, ex, ::Type{ShardLevel{Device, Lvl, Ptr, Task, Val}}, ta end) device_2 = virtualize(ctx, :($ex.device), Device, sym) lvl_2 = virtualize(ctx, :($ex.lvl), Lvl, sym) - VirtualShardLevel(device_2, lvl_2, sym, val, typeof(level_fill_value(Lvl)), Device, Lvl, Ptr, Task, Val) + VirtualShardLevel(device_2, lvl_2, sym, ptr, task, val, typeof(level_fill_value(Lvl)), Device, Lvl, Ptr, Task, Val) end Base.summary(lvl::VirtualShardLevel) = "Shard($(lvl.Lvl))" @@ -160,10 +162,14 @@ function virtual_moveto_level(ctx, lvl::VirtualShardLevel, arch) end function declare_level!(ctx, lvl::VirtualShardLevel, pos, init) - virtual_parallel_region(ctx, lvl.device) do ctx, task - lvl_2 = virtualize(ctx, :($(lvl.ex).val[$(task)]), lvl.Lvl) #TODO should this virtualize the eltype of Val? - declare_level!(ctx, lvl_2, literal(1), init) - end + push_preamble!(ctx, + virtual_parallel_region(ctx, lvl.device) do ctx_2 + lvl_2 = virtualize(ctx_2, :($(lvl.ex).val[$(ctx_2(get_task_num(get_task(ctx_2))))]), lvl.Lvl) #TODO should this virtualize the eltype of Val? + declare_level!(ctx_2, lvl_2, literal(1), init) + println(ctx_2.code.preamble) + end + ) + lvl end """ From f5a5c87c0bf1131a9537ee698fbbff0b857c2c58 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Thu, 30 Jan 2025 15:36:00 -0500 Subject: [PATCH 09/25] rm println --- src/tensors/levels/shard_levels.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl index a548e4750..f8699fef5 100644 --- a/src/tensors/levels/shard_levels.jl +++ b/src/tensors/levels/shard_levels.jl @@ -166,7 +166,6 @@ function declare_level!(ctx, lvl::VirtualShardLevel, pos, init) virtual_parallel_region(ctx, lvl.device) do ctx_2 lvl_2 = virtualize(ctx_2, :($(lvl.ex).val[$(ctx_2(get_task_num(get_task(ctx_2))))]), lvl.Lvl) #TODO should this virtualize the eltype of Val? declare_level!(ctx_2, lvl_2, literal(1), init) - println(ctx_2.code.preamble) end ) lvl From 5f984f9cbce1f69bd9dee882be998661c3adc8c8 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Tue, 4 Feb 2025 14:27:25 -0500 Subject: [PATCH 10/25] fix --- src/lower.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lower.jl b/src/lower.jl index ed17c6104..761e2bcf7 100644 --- a/src/lower.jl +++ b/src/lower.jl @@ -311,13 +311,13 @@ function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualC end virtual_parallel_region(ctx, device) do ctx_2 - subtask = ctx_2.task - tid = get_task_number(subtask) + subtask = get_task(ctx_2) + tid = get_task_num(subtask) for tns in intersect(used_in_scope, decl_in_scope) - virtual_moveto(ctx_3, resolve(ctx_3, tns), subtask) + virtual_moveto(ctx_2, resolve(ctx_2, tns), subtask) end - contain(ctx_3) do ctx_4 - open_scope(ctx_4) do ctx_5 + contain(ctx_2) do ctx_3 + open_scope(ctx_3) do ctx_4 i = index(freshen(ctx, :i)) root_2 = loop(i, Extent(tid, tid), loop(root.idx, ext.ext, @@ -326,7 +326,7 @@ function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualC ) ) ) - ctx_5(instantiate!(ctx_5, root_2)) + ctx_4(instantiate!(ctx_4, root_2)) end end end From e8da20cf3e60459809465999c4b391e9fc57ae7c Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 18:33:17 -0500 Subject: [PATCH 11/25] fix --- src/architecture.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/architecture.jl b/src/architecture.jl index 56237c677..b198220e4 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -57,7 +57,8 @@ function virtualize(ctx, ex, ::Type{CPU}) end function lower(ctx::AbstractCompiler, device::VirtualCPU, ::DefaultStyle) something(device.ex, :(CPU($(ctx(device.n))))) -get_num_tasks(::VirtualCPU) = 1 +end +get_num_tasks(::VirtualCPU) = literal(1) FinchNotation.finch_leaf(device::VirtualCPU) = virtual(device) From c0546282cdbbe5ba5c3303e7b7ee776186fded6c Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 18:35:31 -0500 Subject: [PATCH 12/25] directions --- CONTRIBUTING.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 307fad215..0dee19a38 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -118,6 +118,9 @@ though both are included as part of the test suite. ## Code Style -We use [Blue Style](https://github.com/JuliaDiff/BlueStyle) formatting, with a few tweaks -defined in `.JuliaFormatter.toml`. Running the tests in overwrite mode will -automatically reformat your code, but you can also add [`JuliaFormatter`](https://domluna.github.io/JuliaFormatter.jl/stable/#Editor-Plugins) to your editor to reformat as you go. +We use [Blue Style](https://github.com/JuliaDiff/BlueStyle) formatting, with a +few tweaks defined in `.JuliaFormatter.toml`. Running the tests in overwrite +mode will automatically reformat your code, but you can also add +[`JuliaFormatter`](https://domluna.github.io/JuliaFormatter.jl/stable/#Editor-Plugins) +to your editor to reformat as you go, or call +`julia -e "using JuliaFormatter; format("path/to/Finch.jl")`. \ No newline at end of file From a8c2eb1f503c8d486d306ee56e8bf18c99948a7e Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 18:35:49 -0500 Subject: [PATCH 13/25] directions --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0dee19a38..9b42fc95c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -123,4 +123,4 @@ few tweaks defined in `.JuliaFormatter.toml`. Running the tests in overwrite mode will automatically reformat your code, but you can also add [`JuliaFormatter`](https://domluna.github.io/JuliaFormatter.jl/stable/#Editor-Plugins) to your editor to reformat as you go, or call -`julia -e "using JuliaFormatter; format("path/to/Finch.jl")`. \ No newline at end of file +`julia -e "using JuliaFormatter; format("path/to/Finch.jl")` manually. \ No newline at end of file From 228926652d8e09a9679664d20ea69ca38aaaa2e9 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 19:30:36 -0500 Subject: [PATCH 14/25] add a parallel manifesto --- docs/src/docs/internals/parallel.md | 62 +++++++++++++++++++++++++++++ src/architecture.jl | 37 +++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 docs/src/docs/internals/parallel.md diff --git a/docs/src/docs/internals/parallel.md b/docs/src/docs/internals/parallel.md new file mode 100644 index 000000000..b1da4b28c --- /dev/null +++ b/docs/src/docs/internals/parallel.md @@ -0,0 +1,62 @@ +# Parallel Processing in Finch + +## Modelling the Architecture + +Finch uses a simple, hierarchical representation of devices and tasks to model +different kind of parallel processing. An [`AbstractDevice`](@ref) is a physical or +virtual device on which we can execute tasks, which may each be represented by +an [`AbstractTask`](@ref). + +```@docs +AbstractTask +AbstractDevice +``` + +The current task in a compilation context can be queried with +[`get_task`](@ref). Each device has a set of numbered child +tasks, and each task has a parent task. + +```@docs +get_num_tasks +get_task_num +get_device +get_parent_task +``` + +## Data Movement + +Before entering a parallel loop, a tensor may reside on a single task, or +represent a single view of data distributed across multiple tasks, or represent +multiple separate tensors local to multiple tasks. A tensor's data must be +resident in the current task to process operations on that tensor, such as loops +over the indices, accesses to the tensor, or `declare`, `freeze`, or `thaw`. +Upon entering a parallel loop, we must transfer the tensor to the tasks +where it is needed. Upon exiting the parallel loop, we may need to combine +the data from multiple tasks into a single tensor. + +There are two cases, depending on whether the tensor is declared outside the +parallel loop or is a temporary tensor declared within the parallel loop. + +If the tensor is a temporary tensor declared within the parallel loop, we call +`bcast` to broadcast the tensor to all tasks. + +If the tensor is declared outside the parallel loop, we call `scatter` to +send it to the tasks where it is needed. Note that if the tensor is in `read` mode, +`scatter` may simply `bcast` the entire tensor to all tasks. If the device has global +memory, `scatter` may also be a no-op. When the parallel loop is exited, we call +`gather` to reconcile the data from multiple tasks back into a single tensor. + +Each of these operations begins with a `_send` variant on one task, and +finishes with a `_recv` variant on the recieving task. + +```@docs +bcast +bcast_send +bcast_recv +scatter +scatter_send +scatter_recv +gather +gather_send +gather_recv +``` \ No newline at end of file diff --git a/src/architecture.jl b/src/architecture.jl index b198220e4..2bb9d948c 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -1,8 +1,45 @@ +""" + AbstractDevice + +A datatype representing a device on which tasks can be executed. +""" abstract type AbstractDevice end abstract type AbstractVirtualDevice end + +""" + AbstractTask + +An individual processing unit on a device, responsible for running code. +""" abstract type AbstractTask end abstract type AbstractVirtualTask end +""" + get_num_tasks(dev::AbstractDevice) + +Return the number of tasks on the device dev. +""" +function get_num_tasks end +""" + get_task_num(task::AbstractTask) + +Return the task number of `task`. +""" +function get_task_num end +""" + get_device(task::AbstractTask) + +Return the device that `task` is running on. +""" +function get_device end + +""" + get_parent_task(task::AbstractTask) + +Return the task which spawned `task`. +""" +function get_parent_task end + """ aquire_lock!(dev::AbstractDevice, val) From ebc14cb94cbf6852085215881c108e7394e80365 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 19:38:44 -0500 Subject: [PATCH 15/25] Update docs/src/docs/internals/parallel.md Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/docs/internals/parallel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/docs/internals/parallel.md b/docs/src/docs/internals/parallel.md index b1da4b28c..47a782006 100644 --- a/docs/src/docs/internals/parallel.md +++ b/docs/src/docs/internals/parallel.md @@ -5,7 +5,7 @@ Finch uses a simple, hierarchical representation of devices and tasks to model different kind of parallel processing. An [`AbstractDevice`](@ref) is a physical or virtual device on which we can execute tasks, which may each be represented by -an [`AbstractTask`](@ref). +an [`AbstractTask`](@ref). ```@docs AbstractTask From afae7e5c6e5509ac34be7bedabd355d7b0cc1435 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 19:38:51 -0500 Subject: [PATCH 16/25] Update docs/src/docs/internals/parallel.md Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/docs/internals/parallel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/docs/internals/parallel.md b/docs/src/docs/internals/parallel.md index 47a782006..60d38514b 100644 --- a/docs/src/docs/internals/parallel.md +++ b/docs/src/docs/internals/parallel.md @@ -40,7 +40,7 @@ parallel loop or is a temporary tensor declared within the parallel loop. If the tensor is a temporary tensor declared within the parallel loop, we call `bcast` to broadcast the tensor to all tasks. -If the tensor is declared outside the parallel loop, we call `scatter` to +If the tensor is declared outside the parallel loop, we call `scatter` to send it to the tasks where it is needed. Note that if the tensor is in `read` mode, `scatter` may simply `bcast` the entire tensor to all tasks. If the device has global memory, `scatter` may also be a no-op. When the parallel loop is exited, we call From 39d9356cebbd38bac62551c104a542b394bc15b2 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 19:38:57 -0500 Subject: [PATCH 17/25] Update docs/src/docs/internals/parallel.md Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- docs/src/docs/internals/parallel.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/docs/internals/parallel.md b/docs/src/docs/internals/parallel.md index 60d38514b..e92045525 100644 --- a/docs/src/docs/internals/parallel.md +++ b/docs/src/docs/internals/parallel.md @@ -46,7 +46,7 @@ send it to the tasks where it is needed. Note that if the tensor is in `read` mo memory, `scatter` may also be a no-op. When the parallel loop is exited, we call `gather` to reconcile the data from multiple tasks back into a single tensor. -Each of these operations begins with a `_send` variant on one task, and +Each of these operations begins with a `_send` variant on one task, and finishes with a `_recv` variant on the recieving task. ```@docs From 1b986614aa13679a22cdf3503a09fdab0dbdef91 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 19:40:08 -0500 Subject: [PATCH 18/25] fix --- CONTRIBUTING.md | 2 +- docs/src/docs/internals/parallel.md | 10 +- docs/src/docs/internals/tensor_interface.md | 2 - src/architecture.jl | 7 +- src/lower.jl | 21 ++- src/tensors/levels/mutex_levels.jl | 52 ++++-- src/tensors/levels/shard_levels.jl | 176 ++++++++++++++------ 7 files changed, 183 insertions(+), 87 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9b42fc95c..76ae9d10b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -123,4 +123,4 @@ few tweaks defined in `.JuliaFormatter.toml`. Running the tests in overwrite mode will automatically reformat your code, but you can also add [`JuliaFormatter`](https://domluna.github.io/JuliaFormatter.jl/stable/#Editor-Plugins) to your editor to reformat as you go, or call -`julia -e "using JuliaFormatter; format("path/to/Finch.jl")` manually. \ No newline at end of file +`julia -e "using JuliaFormatter; format("path/to/Finch.jl")` manually. diff --git a/docs/src/docs/internals/parallel.md b/docs/src/docs/internals/parallel.md index b1da4b28c..fbf899c3d 100644 --- a/docs/src/docs/internals/parallel.md +++ b/docs/src/docs/internals/parallel.md @@ -5,7 +5,7 @@ Finch uses a simple, hierarchical representation of devices and tasks to model different kind of parallel processing. An [`AbstractDevice`](@ref) is a physical or virtual device on which we can execute tasks, which may each be represented by -an [`AbstractTask`](@ref). +an [`AbstractTask`](@ref). ```@docs AbstractTask @@ -26,7 +26,7 @@ get_parent_task ## Data Movement Before entering a parallel loop, a tensor may reside on a single task, or -represent a single view of data distributed across multiple tasks, or represent +represent a single view of data distributed across multiple tasks, or represent multiple separate tensors local to multiple tasks. A tensor's data must be resident in the current task to process operations on that tensor, such as loops over the indices, accesses to the tensor, or `declare`, `freeze`, or `thaw`. @@ -40,13 +40,13 @@ parallel loop or is a temporary tensor declared within the parallel loop. If the tensor is a temporary tensor declared within the parallel loop, we call `bcast` to broadcast the tensor to all tasks. -If the tensor is declared outside the parallel loop, we call `scatter` to +If the tensor is declared outside the parallel loop, we call `scatter` to send it to the tasks where it is needed. Note that if the tensor is in `read` mode, `scatter` may simply `bcast` the entire tensor to all tasks. If the device has global memory, `scatter` may also be a no-op. When the parallel loop is exited, we call `gather` to reconcile the data from multiple tasks back into a single tensor. -Each of these operations begins with a `_send` variant on one task, and +Each of these operations begins with a `_send` variant on one task, and finishes with a `_recv` variant on the recieving task. ```@docs @@ -59,4 +59,4 @@ scatter_recv gather gather_send gather_recv -``` \ No newline at end of file +``` diff --git a/docs/src/docs/internals/tensor_interface.md b/docs/src/docs/internals/tensor_interface.md index f54e8111b..f1530a310 100644 --- a/docs/src/docs/internals/tensor_interface.md +++ b/docs/src/docs/internals/tensor_interface.md @@ -18,8 +18,6 @@ virtual_eltype virtual_fill_value virtual_size virtual_resize! -moveto -virtual_moveto labelled_show labelled_children is_injective diff --git a/src/architecture.jl b/src/architecture.jl index 2bb9d948c..d6ab266e8 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -230,7 +230,7 @@ function local_memory(device::Serial) return device end -struct Converter{f, T} end +struct Converter{f,T} end (::Converter{f,T})(x) where {f,T} = T(f(x)) @@ -294,7 +294,6 @@ for T in [ pointer(vec, idx), $T(Vf), x, UnsafeAtomics.seq_cst, UnsafeAtomics.seq_cst ) end - end function virtual_parallel_region(f, ctx, ::Serial) @@ -306,11 +305,11 @@ function virtual_parallel_region(f, ctx, device::VirtualCPU) code = contain(ctx) do ctx_2 subtask = VirtualCPUThread(value(tid, Int), device, ctx_2.code.task) - contain(f, ctx_2, task=subtask) + contain(f, ctx_2; task=subtask) end return quote - Threads.@threads for $tid = 1:$(ctx(device.n)) + Threads.@threads for $tid in 1:($(ctx(device.n))) Finch.@barrier begin @inbounds @fastmath begin $code diff --git a/src/lower.jl b/src/lower.jl index 4e00c28c0..bc40b9bc5 100644 --- a/src/lower.jl +++ b/src/lower.jl @@ -319,11 +319,16 @@ end function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualCPU) root = ensure_concurrent(root, ctx) - decl_in_scope = unique(filter(!isnothing, map(node-> begin - if @capture(node, declare(~tns, ~init, ~op)) - tns - end - end, PostOrderDFS(root.body)))) + decl_in_scope = unique( + filter( + !isnothing, + map(node -> begin + if @capture(node, declare(~tns, ~init, ~op)) + tns + end + end, PostOrderDFS(root.body)), + ), + ) used_in_scope = unique( filter( @@ -352,9 +357,9 @@ function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualC root_2 = loop(i, Extent(tid, tid), loop(root.idx, ext.ext, sieve(access(VirtualSplitMask(device.n), reader(), root.idx, i), - root.body - ) - ) + root.body, + ), + ), ) ctx_4(instantiate!(ctx_4, root_2)) end diff --git a/src/tensors/levels/mutex_levels.jl b/src/tensors/levels/mutex_levels.jl index 98c058cf0..4f26a5f44 100644 --- a/src/tensors/levels/mutex_levels.jl +++ b/src/tensors/levels/mutex_levels.jl @@ -245,10 +245,15 @@ function unfurl(ctx, fbr::VirtualSubFiber{VirtualMutexLevel}, ext, mode, proto) atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) - push_preamble!(ctx, quote - $atomicData = Finch.get_lock($dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal))) - $lockVal = Finch.aquire_lock!($dev, $atomicData) - end) + push_preamble!( + ctx, + quote + $atomicData = Finch.get_lock( + $dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal)) + ) + $lockVal = Finch.aquire_lock!($dev, $atomicData) + end, + ) res = unfurl(ctx, VirtualSubFiber(lvl.lvl, pos), ext, mode, proto) push_epilogue!( ctx, @@ -267,10 +272,15 @@ function unfurl(ctx, fbr::VirtualHollowSubFiber{VirtualMutexLevel}, ext, mode, p atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) - push_preamble!(ctx, quote - $atomicData = Finch.get_lock($dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal))) - $lockVal = Finch.aquire_lock!($dev, $atomicData) - end) + push_preamble!( + ctx, + quote + $atomicData = Finch.get_lock( + $dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal)) + ) + $lockVal = Finch.aquire_lock!($dev, $atomicData) + end, + ) res = unfurl(ctx, VirtualHollowSubFiber(lvl.lvl, pos, fbr.dirty), ext, mode, proto) push_epilogue!( ctx, @@ -287,10 +297,15 @@ function lower_assign(ctx, fbr::VirtualSubFiber{VirtualMutexLevel}, mode, op, rh atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) - push_preamble!(ctx, quote - $atomicData = Finch.get_lock($dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal))) - $lockVal = Finch.aquire_lock!($dev, $atomicData) - end) + push_preamble!( + ctx, + quote + $atomicData = Finch.get_lock( + $dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal)) + ) + $lockVal = Finch.aquire_lock!($dev, $atomicData) + end, + ) res = lower_assign(ctx, VirtualSubFiber(lvl.lvl, pos), mode, op, rhs) push_epilogue!( ctx, @@ -307,10 +322,15 @@ function lower_assign(ctx, fbr::VirtualHollowSubFiber{VirtualMutexLevel}, mode, atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) - push_preamble!(ctx, quote - $atomicData = Finch.get_lock($dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal))) - $lockVal = Finch.aquire_lock!($dev, $atomicData) - end) + push_preamble!( + ctx, + quote + $atomicData = Finch.get_lock( + $dev, $(lvl.locks), $(ctx(pos)), eltype($(lvl.AVal)) + ) + $lockVal = Finch.aquire_lock!($dev, $atomicData) + end, + ) res = lower_assign(ctx, VirtualHollowSubFiber(lvl.lvl, pos, fbr.dirty), mode, op, rhs) push_epilogue!( ctx, diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl index f8699fef5..077f60bd9 100644 --- a/src/tensors/levels/shard_levels.jl +++ b/src/tensors/levels/shard_levels.jl @@ -18,7 +18,7 @@ julia> tensor_tree(Tensor(Dense(Shard(Element(0.0))), [1, 2, 3])) └─ 3.0 ``` """ -struct ShardLevel{Device, Lvl, Ptr, Task, Val} <: AbstractLevel +struct ShardLevel{Device,Lvl,Ptr,Task,Val} <: AbstractLevel device::Device lvl::Lvl ptr::Ptr @@ -27,18 +27,29 @@ struct ShardLevel{Device, Lvl, Ptr, Task, Val} <: AbstractLevel end const Shard = ShardLevel -ShardLevel(device::Device, lvl::Lvl) where {Device, Lvl} = - ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], moveto(lvl, device)) #TODO scatterto? +function ShardLevel(device::Device, lvl::Lvl) where {Device,Lvl} + ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], moveto(lvl, device)) +end #TODO scatterto? -ShardLevel{Device}(device, lvl::Lvl, ptr::Ptr, task::Task, val::Val) where {Device, Lvl, Ptr, Task, Val} = - ShardLevel{Device, Lvl, Ptr, Task, Val}(device, lvl, ptr, task, val) +function ShardLevel{Device}( + device, lvl::Lvl, ptr::Ptr, task::Task, val::Val +) where {Device,Lvl,Ptr,Task,Val} + ShardLevel{Device,Lvl,Ptr,Task,Val}(device, lvl, ptr, task, val) +end -Base.summary(::Shard{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = "Shard($(Lvl))" +function Base.summary(::Shard{Device,Lvl,Ptr,Task,Val}) where {Device,Lvl,Ptr,Task,Val} + "Shard($(Lvl))" +end -similar_level(lvl::Shard{Device, Lvl, Ptr, Task, Val}, fill_value, eltype::Type, dims...) where {Device, Lvl, Ptr, Task, Val} = +function similar_level( + lvl::Shard{Device,Lvl,Ptr,Task,Val}, fill_value, eltype::Type, dims... +) where {Device,Lvl,Ptr,Task,Val} ShardLevel(lvl.device, similar_level(lvl.lvl, fill_value, eltype, dims...)) +end -postype(::Type{<:Shard{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = postype(Lvl) +function postype(::Type{<:Shard{Device,Lvl,Ptr,Task,Val}}) where {Device,Lvl,Ptr,Task,Val} + postype(Lvl) +end function moveto(lvl::ShardLevel, device) lvl_2 = moveto(lvl.lvl, device) @@ -47,11 +58,29 @@ function moveto(lvl::ShardLevel, device) return ShardLevel(lvl_2, ptr_2, task_2, val_2) end -pattern!(lvl::ShardLevel) = ShardLevel(pattern!(lvl.lvl), lvl.ptr, lvl.task, map(pattern!, lvl.val)) -set_fill_value!(lvl::ShardLevel, init) = ShardLevel(set_fill_value!(lvl.lvl, init), lvl.ptr, lvl.task, map(lvl_2 -> set_fill_value!(lvl_2, init), lvl.val)) -Base.resize!(lvl::ShardLevel, dims...) = ShardLevel(resize!(lvl.lvl, dims...), lvl.ptr, lvl.task, map(lvl_2 -> resize!(lvl_2, dims...), lvl.val)) +function pattern!(lvl::ShardLevel) + ShardLevel(pattern!(lvl.lvl), lvl.ptr, lvl.task, map(pattern!, lvl.val)) +end +function set_fill_value!(lvl::ShardLevel, init) + ShardLevel( + set_fill_value!(lvl.lvl, init), + lvl.ptr, + lvl.task, + map(lvl_2 -> set_fill_value!(lvl_2, init), lvl.val), + ) +end +function Base.resize!(lvl::ShardLevel, dims...) + ShardLevel( + resize!(lvl.lvl, dims...), + lvl.ptr, + lvl.task, + map(lvl_2 -> resize!(lvl_2, dims...), lvl.val), + ) +end -function Base.show(io::IO, lvl::ShardLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} +function Base.show( + io::IO, lvl::ShardLevel{Device,Lvl,Ptr,Task,Val} +) where {Device,Lvl,Ptr,Task,Val} print(io, "Shard(") if get(io, :compact, false) print(io, "…") @@ -79,11 +108,21 @@ function labelled_children(fbr::SubFiber{<:ShardLevel}) [LabelledTree(SubFiber(lvl.val[lvl.task[pos]], lvl.ptr[pos]))] end -@inline level_ndims(::Type{<:ShardLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_ndims(Lvl) -@inline level_size(lvl::ShardLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = level_size(lvl.lvl) -@inline level_axes(lvl::ShardLevel{Device, Lvl, Ptr, Task, Val}) where {Device, Lvl, Ptr, Task, Val} = level_axes(lvl.lvl) -@inline level_eltype(::Type{ShardLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_eltype(Lvl) -@inline level_fill_value(::Type{<:ShardLevel{Device, Lvl, Ptr, Task, Val}}) where {Device, Lvl, Ptr, Task, Val} = level_fill_value(Lvl) +@inline level_ndims( + ::Type{<:ShardLevel{Device,Lvl,Ptr,Task,Val}} +) where {Device,Lvl,Ptr,Task,Val} = level_ndims(Lvl) +@inline level_size( + lvl::ShardLevel{Device,Lvl,Ptr,Task,Val} +) where {Device,Lvl,Ptr,Task,Val} = level_size(lvl.lvl) +@inline level_axes( + lvl::ShardLevel{Device,Lvl,Ptr,Task,Val} +) where {Device,Lvl,Ptr,Task,Val} = level_axes(lvl.lvl) +@inline level_eltype( + ::Type{ShardLevel{Device,Lvl,Ptr,Task,Val}} +) where {Device,Lvl,Ptr,Task,Val} = level_eltype(Lvl) +@inline level_fill_value( + ::Type{<:ShardLevel{Device,Lvl,Ptr,Task,Val}} +) where {Device,Lvl,Ptr,Task,Val} = level_fill_value(Lvl) function (fbr::SubFiber{<:ShardLevel})(idxs...) q = fbr.pos @@ -109,7 +148,9 @@ end postype(lvl::VirtualShardLevel) = postype(lvl.lvl) -is_level_injective(ctx, lvl::VirtualShardLevel) = [is_level_injective(ctx, lvl.lvl)..., true] +function is_level_injective(ctx, lvl::VirtualShardLevel) + [is_level_injective(ctx, lvl.lvl)..., true] +end function is_level_atomic(ctx, lvl::VirtualShardLevel) (below, atomic) = is_level_atomic(ctx, lvl.lvl) return ([below; [atomic]], atomic) @@ -121,52 +162,82 @@ end function lower(ctx::AbstractCompiler, lvl::VirtualShardLevel, ::DefaultStyle) quote - $ShardLevel{$(lvl.Lvl), $(lvl.Ptr), $(lvl.Task), $(lvl.Val)}($(ctx(lvl.lvl)), $(lvl.val)) + $ShardLevel{$(lvl.Lvl),$(lvl.Ptr),$(lvl.Task),$(lvl.Val)}( + $(ctx(lvl.lvl)), $(lvl.val) + ) end end -function virtualize(ctx, ex, ::Type{ShardLevel{Device, Lvl, Ptr, Task, Val}}, tag=:lvl) where {Device, Lvl, Ptr, Task, Val} +function virtualize( + ctx, ex, ::Type{ShardLevel{Device,Lvl,Ptr,Task,Val}}, tag=:lvl +) where {Device,Lvl,Ptr,Task,Val} sym = freshen(ctx, tag) ptr = freshen(ctx, tag, :_ptr) task = freshen(ctx, tag, :_task) val = freshen(ctx, tag, :_val) - push_preamble!(ctx, quote - $sym = $ex - $ptr = $ex.ptr - $task = $ex.task - $val = $ex.val - end) + push_preamble!( + ctx, + quote + $sym = $ex + $ptr = $ex.ptr + $task = $ex.task + $val = $ex.val + end, + ) device_2 = virtualize(ctx, :($ex.device), Device, sym) lvl_2 = virtualize(ctx, :($ex.lvl), Lvl, sym) - VirtualShardLevel(device_2, lvl_2, sym, ptr, task, val, typeof(level_fill_value(Lvl)), Device, Lvl, Ptr, Task, Val) + VirtualShardLevel( + device_2, + lvl_2, + sym, + ptr, + task, + val, + typeof(level_fill_value(Lvl)), + Device, + Lvl, + Ptr, + Task, + Val, + ) end Base.summary(lvl::VirtualShardLevel) = "Shard($(lvl.Lvl))" -virtual_level_resize!(ctx, lvl::VirtualShardLevel, dims...) = (lvl.lvl = virtual_level_resize!(ctx, lvl.lvl, dims...); lvl) +function virtual_level_resize!(ctx, lvl::VirtualShardLevel, dims...) + (lvl.lvl = virtual_level_resize!(ctx, lvl.lvl, dims...); lvl) +end virtual_level_size(ctx, lvl::VirtualShardLevel) = virtual_level_size(ctx, lvl.lvl) virtual_level_eltype(lvl::VirtualShardLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualShardLevel) = virtual_level_fill_value(lvl.lvl) function virtual_moveto_level(ctx, lvl::VirtualShardLevel, arch) val_2 = freshen(ctx, lvl.val) - push_preamble!(ctx, quote + push_preamble!( + ctx, + quote $val_2 = $(lvl.val) $(lvl.val) = $moveto($(lvl.val), $(ctx(arch))) - end) - push_epilogue!(ctx, quote + end, + ) + push_epilogue!( + ctx, + quote $(lvl.val) = $val_2 - end) + end, + ) virtual_moveto_level(ctx, lvl.lvl, arch) end function declare_level!(ctx, lvl::VirtualShardLevel, pos, init) - push_preamble!(ctx, + push_preamble!(ctx, virtual_parallel_region(ctx, lvl.device) do ctx_2 - lvl_2 = virtualize(ctx_2, :($(lvl.ex).val[$(ctx_2(get_task_num(get_task(ctx_2))))]), lvl.Lvl) #TODO should this virtualize the eltype of Val? + lvl_2 = virtualize( + ctx_2, :($(lvl.ex).val[$(ctx_2(get_task_num(get_task(ctx_2))))]), lvl.Lvl + ) #TODO should this virtualize the eltype of Val? declare_level!(ctx_2, lvl_2, literal(1), init) - end + end, ) lvl end @@ -188,11 +259,14 @@ function assemble_level!(ctx, lvl::VirtualShardLevel, pos_start, pos_stop) pos_stop = cache!(ctx, :pos_stop, simplify(ctx, pos_stop)) pos = freshen(ctx, :pos) sym = freshen(ctx, :pointer_to_lvl) - push_preamble!(ctx, quote - Finch.resize_if_smaller!($(lvl.task), $(ctx(pos_stop))) - Finch.resize_if_smaller!($(lvl.ptr), $(ctx(pos_stop))) - Finch.fill_range!($(lvl.task), $(ctx(pos_start)), $(ctx(pos_stop)), 0) - end) + push_preamble!( + ctx, + quote + Finch.resize_if_smaller!($(lvl.task), $(ctx(pos_stop))) + Finch.resize_if_smaller!($(lvl.ptr), $(ctx(pos_stop))) + Finch.fill_range!($(lvl.task), $(ctx(pos_start)), $(ctx(pos_stop)), 0) + end, + ) lvl end @@ -217,8 +291,8 @@ function instantiate(ctx, fbr::VirtualSubFiber{VirtualShardLevel}, mode) Vf = level_fill_value(lvl.Lvl) sym = freshen(ctx, :pointer_to_lvl) val = freshen(ctx, lvl.ex, :_val) - return Thunk( - body = (ctx) -> begin + return Thunk(; + body=(ctx) -> begin lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) instantiate(ctx, VirtualSubFiber(lvl_2, literal(1)), mode) end, @@ -228,8 +302,8 @@ function instantiate(ctx, fbr::VirtualSubFiber{VirtualShardLevel}, mode) tag = lvl.ex sym = freshen(ctx, :pointer_to_lvl) - return Thunk( - body = (ctx) -> begin + return Thunk(; + body=(ctx) -> begin lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) lvl_2 = thaw_level!(ctx, lvl_2, literal(1)) push_preamble!(ctx, assemble_level!(ctx, lvl_2, literal(1), literal(1))) @@ -238,10 +312,10 @@ function instantiate(ctx, fbr::VirtualSubFiber{VirtualShardLevel}, mode) contain(ctx) do ctx_2 lvl_2 = freeze_level!(ctx_2, lvl_2, literal(1)) :($(lvl.val)[$(ctx_2(pos))] = $(ctx_2(lvl_2))) - end + end, ) res - end + end, ) end end @@ -268,7 +342,7 @@ function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardLevel}, mode) task = freshen(ctx, tag, :_task) - return Thunk( + return Thunk(; preamble = quote $task = $(lvl.task)[$(ctx(pos))] if task == 0 @@ -276,7 +350,7 @@ function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardLevel}, mode) qos = local_qos_fill if $(lvl.local_qos_fill) > $(lvl.local_qos_stop) $local_qos_stop = max($local_qos_stop << 1, 1) - $(contain(ctx_2->assemble_level!(ctx_2, lvl.lvl, value(qos_fill, Tp), value(qos_stop, Tp)), ctx)) + $(contain(ctx_2 -> assemble_level!(ctx_2, lvl.lvl, value(qos_fill, Tp), value(qos_stop, Tp)), ctx)) end else qos = $(lvl.ptr)[$(ctx(pos))] @@ -285,13 +359,13 @@ function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardLevel}, mode) end dirty = true end, - body = (ctx) -> VirtualHollowSubFiber(lvl.lvl, value(qos), dirty), + body = (ctx) -> VirtualHollowSubFiber(lvl.lvl, value(qos), dirty), epilogue = quote #this task will always own this position forever, even if we don't write to it. Still, we try to be conservative of memory usage of the underlying level. if dirty && $(lvl.ptr)[$(ctx(pos))] == 0 local_qos_fill += 1 $(lvl.ptr)[$(ctx(pos))] = $(lvl.local_qos_fill) += 1 end - end + end, ) -end \ No newline at end of file +end From a509212c471c38f1f2d25580af669c64e08fdf96 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 19:41:35 -0500 Subject: [PATCH 19/25] no stylebot (awkward --- .github/workflows/StyleBot.yml | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 .github/workflows/StyleBot.yml diff --git a/.github/workflows/StyleBot.yml b/.github/workflows/StyleBot.yml deleted file mode 100644 index fc13c6fae..000000000 --- a/.github/workflows/StyleBot.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: Style Review -on: - pull_request: -jobs: - code-style: - runs-on: ubuntu-latest - steps: - - uses: julia-actions/julia-format@v3 - with: - version: '1' # Set `version` to '1.0.54' if you need to use JuliaFormatter.jl v1.0.54 (default: '1') \ No newline at end of file From 311c7813d7e61fcdd16a35e103dd5014593986c3 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 20:10:54 -0500 Subject: [PATCH 20/25] quick fix --- docs/src/docs/internals/parallel.md | 4 ++-- src/abstract_tensor.jl | 16 ---------------- src/architecture.jl | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/docs/src/docs/internals/parallel.md b/docs/src/docs/internals/parallel.md index fbf899c3d..dd47aae69 100644 --- a/docs/src/docs/internals/parallel.md +++ b/docs/src/docs/internals/parallel.md @@ -23,7 +23,7 @@ get_device get_parent_task ``` -## Data Movement +## Data Transfer Before entering a parallel loop, a tensor may reside on a single task, or represent a single view of data distributed across multiple tasks, or represent @@ -59,4 +59,4 @@ scatter_recv gather gather_send gather_recv -``` +``` \ No newline at end of file diff --git a/src/abstract_tensor.jl b/src/abstract_tensor.jl index 0d5c601ee..2eb31e0da 100644 --- a/src/abstract_tensor.jl +++ b/src/abstract_tensor.jl @@ -93,22 +93,6 @@ function similar in spirit to `Base.resize!`. """ function virtual_resize! end -""" - moveto(arr, device) - -If the array is not on the given device, it creates a new version of this array on that device -and copies the data in to it, according to the `device` trait. -""" -function moveto end - -""" - virtual_moveto(device, arr) - -If the virtual array is not on the given device, copy the array to that device. This -function may modify underlying data arrays, but cannot change the virtual itself. This -function is used to move data to the device before a kernel is launched. -""" -function virtual_moveto end struct LabelledTree key diff --git a/src/architecture.jl b/src/architecture.jl index d6ab266e8..b0001d0c1 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -319,3 +319,20 @@ function virtual_parallel_region(f, ctx, device::VirtualCPU) end end end + +""" + moveto(arr, device) + +If the array is not on the given device, it creates a new version of this array on that device +and copies the data in to it, according to the `device` trait. +""" +function moveto end + +""" + virtual_moveto(device, arr) + +If the virtual array is not on the given device, copy the array to that device. This +function may modify underlying data arrays, but cannot change the virtual itself. This +function is used to move data to the device before a kernel is launched. +""" +function virtual_moveto end \ No newline at end of file From 8617a204bf8f00b16bf0a50b0147e0035884ddda Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 20:19:13 -0500 Subject: [PATCH 21/25] rename --- docs/src/docs/internals/parallel.md | 3 ++ src/Finch.jl | 2 +- src/architecture.jl | 16 +++---- src/interface/abstract_arrays.jl | 4 +- src/lower.jl | 4 +- src/tensors/fibers.jl | 14 +++--- src/tensors/levels/atomic_element_levels.jl | 8 ++-- src/tensors/levels/dense_levels.jl | 8 ++-- src/tensors/levels/dense_rle_levels.jl | 20 ++++---- src/tensors/levels/element_levels.jl | 8 ++-- src/tensors/levels/mutex_levels.jl | 12 ++--- src/tensors/levels/pattern_levels.jl | 4 +- src/tensors/levels/separate_levels.jl | 12 ++--- src/tensors/levels/shard_levels.jl | 16 +++---- src/tensors/levels/sparse_band_levels.jl | 16 +++---- src/tensors/levels/sparse_bytemap_levels.jl | 20 ++++---- src/tensors/levels/sparse_coo_levels.jl | 16 +++---- src/tensors/levels/sparse_dict_levels.jl | 20 ++++---- src/tensors/levels/sparse_interval_levels.jl | 8 ++-- src/tensors/levels/sparse_list_levels.jl | 16 +++---- src/tensors/levels/sparse_point_levels.jl | 12 ++--- src/tensors/levels/sparse_rle_levels.jl | 24 +++++----- src/tensors/levels/sparse_vbl_levels.jl | 20 ++++---- src/tensors/scalars.jl | 8 ++-- src/util/vectors.jl | 12 ++--- .../reference32/parallel/atomics_sym_spmv.txt | 14 +++--- test/reference32/parallel/parallel_blur.jl | 6 +-- .../parallel/parallel_blur_sparse.jl | 10 ++-- .../parallel/parallel_spmms_no_atomics_1.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_2.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_3.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_4.txt | 28 +++++------ .../parallel/parallel_spmms_no_atomics_5.txt | 46 +++++++++---------- test/reference32/parallel/parallel_spmv.txt | 10 ++-- .../parallel/parallel_spmv_atomic.txt | 10 ++-- .../parallel/parallel_spmv_atomics.txt | 12 ++--- .../parallel/stress_dense_atomics.txt | 6 +-- .../reference64/parallel/atomics_sym_spmv.txt | 14 +++--- test/reference64/parallel/parallel_blur.jl | 6 +-- .../parallel/parallel_blur_sparse.jl | 10 ++-- .../parallel/parallel_spmms_no_atomics_1.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_2.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_3.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_4.txt | 28 +++++------ .../parallel/parallel_spmms_no_atomics_5.txt | 46 +++++++++---------- test/reference64/parallel/parallel_spmv.txt | 10 ++-- .../parallel/parallel_spmv_atomic.txt | 10 ++-- .../parallel/parallel_spmv_atomics.txt | 12 ++--- .../parallel/stress_dense_atomics.txt | 6 +-- test/suites/parallel_tests.jl | 4 +- 50 files changed, 351 insertions(+), 348 deletions(-) diff --git a/docs/src/docs/internals/parallel.md b/docs/src/docs/internals/parallel.md index dd47aae69..e94a93e13 100644 --- a/docs/src/docs/internals/parallel.md +++ b/docs/src/docs/internals/parallel.md @@ -49,6 +49,9 @@ memory, `scatter` may also be a no-op. When the parallel loop is exited, we call Each of these operations begins with a `_send` variant on one task, and finishes with a `_recv` variant on the recieving task. +All transfers are accomplished with the functions `transfer` and `virtual_transfer`, with +different `style` objects signaling the type of transfer. + ```@docs bcast bcast_send diff --git a/src/Finch.jl b/src/Finch.jl index e90a3d1c9..ccd785c4e 100644 --- a/src/Finch.jl +++ b/src/Finch.jl @@ -260,7 +260,7 @@ export fsparse, fsparse!, fsprand, fspzeros, ffindnz, fread, fwrite, countstored export bspread, bspwrite export ftnsread, ftnswrite, fttread, fttwrite -export moveto, postype +export transfer, postype include("FinchLogic/FinchLogic.jl") using .FinchLogic diff --git a/src/architecture.jl b/src/architecture.jl index b0001d0c1..739d9106a 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -188,7 +188,7 @@ get_task_num(task::VirtualCPUThread) = task.tid struct CPULocalMemory device::CPU end -function moveto(vec::V, mem::CPULocalMemory) where {V<:Vector} +function transfer(vec::V, mem::CPULocalMemory) where {V<:Vector} CPULocalVector{V}(mem.device, [copy(vec) for _ in 1:(mem.device.n)]) end @@ -204,15 +204,15 @@ end Base.eltype(::Type{CPULocalVector{V}}) where {V} = eltype(V) Base.ndims(::Type{CPULocalVector{V}}) where {V} = ndims(V) -function moveto(vec::Vector, device::CPU) +function transfer(vec::Vector, device::CPU) return vec end -function moveto(vec::Vector, task::CPUThread) +function transfer(vec::Vector, task::CPUThread) return copy(vec) end -function moveto(vec::CPULocalVector, task::CPUThread) +function transfer(vec::CPULocalVector, task::CPUThread) temp = vec.data[task.tid] return temp end @@ -321,18 +321,18 @@ function virtual_parallel_region(f, ctx, device::VirtualCPU) end """ - moveto(arr, device) + transfer(arr, device) If the array is not on the given device, it creates a new version of this array on that device and copies the data in to it, according to the `device` trait. """ -function moveto end +function transfer end """ - virtual_moveto(device, arr) + virtual_transfer(device, arr) If the virtual array is not on the given device, copy the array to that device. This function may modify underlying data arrays, but cannot change the virtual itself. This function is used to move data to the device before a kernel is launched. """ -function virtual_moveto end \ No newline at end of file +function virtual_transfer end \ No newline at end of file diff --git a/src/interface/abstract_arrays.jl b/src/interface/abstract_arrays.jl index da91292c7..c77bef03b 100644 --- a/src/interface/abstract_arrays.jl +++ b/src/interface/abstract_arrays.jl @@ -130,13 +130,13 @@ FinchNotation.finch_leaf(x::VirtualAbstractArray) = virtual(x) virtual_fill_value(ctx, ::VirtualAbstractArray) = 0 virtual_eltype(ctx, tns::VirtualAbstractArray) = tns.eltype -function virtual_moveto(ctx, vec::VirtualAbstractArray, device) +function virtual_transfer(ctx, vec::VirtualAbstractArray, device) ex = freshen(ctx, vec.ex) push_preamble!( ctx, quote $ex = $(vec.ex) - $(vec.ex) = $moveto($(vec.ex), $(ctx(device))) + $(vec.ex) = $transfer($(vec.ex), $(ctx(device))) end, ) push_epilogue!( diff --git a/src/lower.jl b/src/lower.jl index bc40b9bc5..7d6d8e564 100644 --- a/src/lower.jl +++ b/src/lower.jl @@ -342,14 +342,14 @@ function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualC ) for tns in setdiff(used_in_scope, decl_in_scope) - virtual_moveto(ctx, resolve(ctx, tns), device) + virtual_transfer(ctx, resolve(ctx, tns), device) end virtual_parallel_region(ctx, device) do ctx_2 subtask = get_task(ctx_2) tid = get_task_num(subtask) for tns in intersect(used_in_scope, decl_in_scope) - virtual_moveto(ctx_2, resolve(ctx_2, tns), subtask) + virtual_transfer(ctx_2, resolve(ctx_2, tns), subtask) end contain(ctx_2) do ctx_3 open_scope(ctx_3) do ctx_4 diff --git a/src/tensors/fibers.jl b/src/tensors/fibers.jl index 5f331d487..400ef936c 100644 --- a/src/tensors/fibers.jl +++ b/src/tensors/fibers.jl @@ -139,12 +139,12 @@ function unfurl(ctx::AbstractCompiler, arr::VirtualFiber, ext, mode, proto) unfurl(ctx, VirtualSubFiber(arr.lvl, literal(1)), ext, mode, proto) end -function virtual_moveto(ctx::AbstractCompiler, fbr::VirtualFiber, arch) - virtual_moveto_level(ctx, fbr.lvl, arch) +function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualFiber, arch) + virtual_transfer_level(ctx, fbr.lvl, arch) end -function virtual_moveto(ctx::AbstractCompiler, fbr::VirtualSubFiber, arch) - virtual_moveto_level(ctx, fbr.lvl, arch) +function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualSubFiber, arch) + virtual_transfer_level(ctx, fbr.lvl, arch) end struct HollowSubFiber{Lvl,Pos,Dirty} <: AbstractFiber{Lvl} @@ -171,9 +171,9 @@ function lower(ctx::AbstractCompiler, fbr::VirtualHollowSubFiber, ::DefaultStyle end FinchNotation.finch_leaf(x::VirtualHollowSubFiber) = virtual(x) -function virtual_moveto(ctx::AbstractCompiler, fbr::VirtualHollowSubFiber, arch) +function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualHollowSubFiber, arch) return VirtualHollowSubFiber( - virtual_moveto_level(ctx, fbr.lvl, arch), fbr.pos, fbr.dirty + virtual_transfer_level(ctx, fbr.lvl, arch), fbr.pos, fbr.dirty ) end @@ -333,7 +333,7 @@ function Base.similar(fbr::AbstractFiber, fill_value, eltype::Type, dims::Tuple) Tensor(similar_level(fbr.lvl, fill_value, eltype, dims...)) end -moveto(tns::Tensor, device) = Tensor(moveto(tns.lvl, device)) +transfer(tns::Tensor, device) = Tensor(transfer(tns.lvl, device)) struct Structure t diff --git a/src/tensors/levels/atomic_element_levels.jl b/src/tensors/levels/atomic_element_levels.jl index a284c09e5..313c904c3 100644 --- a/src/tensors/levels/atomic_element_levels.jl +++ b/src/tensors/levels/atomic_element_levels.jl @@ -40,8 +40,8 @@ end postype(::Type{<:AtomicElementLevel{Vf,Tv,Tp}}) where {Vf,Tv,Tp} = Tp -function moveto(lvl::AtomicElementLevel{Vf,Tv,Tp}, device) where {Vf,Tv,Tp} - return AtomicElementLevel{Vf,Tv,Tp}(moveto(lvl.val, device)) +function transfer(lvl::AtomicElementLevel{Vf,Tv,Tp}, device) where {Vf,Tv,Tp} + return AtomicElementLevel{Vf,Tv,Tp}(transfer(lvl.val, device)) end pattern!(lvl::AtomicElementLevel{Vf,Tv,Tp}) where {Vf,Tv,Tp} = @@ -165,13 +165,13 @@ function reassemble_level!(ctx, lvl::VirtualAtomicElementLevel, pos_start, pos_s lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualAtomicElementLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualAtomicElementLevel, arch) val_2 = freshen(ctx, :val) push_preamble!( ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $moveto($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch))) end, ) push_epilogue!( diff --git a/src/tensors/levels/dense_levels.jl b/src/tensors/levels/dense_levels.jl index c06f0090f..372e49faf 100644 --- a/src/tensors/levels/dense_levels.jl +++ b/src/tensors/levels/dense_levels.jl @@ -44,8 +44,8 @@ function postype(::Type{DenseLevel{Ti,Lvl}}) where {Ti,Lvl} return postype(Lvl) end -function moveto(lvl::DenseLevel{Ti}, device) where {Ti} - return DenseLevel{Ti}(moveto(lvl.lvl, device), lvl.shape) +function transfer(lvl::DenseLevel{Ti}, device) where {Ti} + return DenseLevel{Ti}(transfer(lvl.lvl, device), lvl.shape) end function pattern!(lvl::DenseLevel{Ti,Lvl}) where {Ti,Lvl} @@ -201,8 +201,8 @@ function freeze_level!(ctx::AbstractCompiler, lvl::VirtualDenseLevel, pos) return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualDenseLevel, arch) - virtual_moveto_level(ctx, lvl.lvl, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualDenseLevel, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end struct DenseTraversal diff --git a/src/tensors/levels/dense_rle_levels.jl b/src/tensors/levels/dense_rle_levels.jl index a4d05091a..047e53ba0 100644 --- a/src/tensors/levels/dense_rle_levels.jl +++ b/src/tensors/levels/dense_rle_levels.jl @@ -64,11 +64,11 @@ function postype( return postype(Lvl) end -function moveto(lvl::RunListLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - ptr = moveto(lvl.ptr, device) - right = moveto(lvl.right, device) - buf = moveto(lvl.buf, device) +function transfer(lvl::RunListLevel{Ti}, device) where {Ti} + lvl_2 = transfer(lvl.lvl, device) + ptr = transfer(lvl.ptr, device) + right = transfer(lvl.right, device) + buf = transfer(lvl.buf, device) return RunListLevel{Ti}( lvl_2, lvl.shape, lvl.ptr, lvl.right, lvl.buf; merge=getmerge(lvl) ) @@ -295,7 +295,7 @@ function virtual_level_resize!(ctx, lvl::VirtualRunListLevel, dims...) lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, arch) ptr_2 = freshen(ctx, lvl.ptr) right_2 = freshen(ctx, lvl.right) push_preamble!( @@ -303,8 +303,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, a quote $ptr_2 = $(lvl.ptr) $right_2 = $(lvl.right) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.right) = $moveto($(lvl.right), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) + $(lvl.right) = $transfer($(lvl.right), $(ctx(arch))) end, ) push_epilogue!( @@ -314,8 +314,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, a $(lvl.right) = $right_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) - virtual_moveto_level(ctx, lvl.buf, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.buf, arch) end virtual_level_eltype(lvl::VirtualRunListLevel) = virtual_level_eltype(lvl.lvl) diff --git a/src/tensors/levels/element_levels.jl b/src/tensors/levels/element_levels.jl index 8effd622d..b8807af01 100644 --- a/src/tensors/levels/element_levels.jl +++ b/src/tensors/levels/element_levels.jl @@ -43,8 +43,8 @@ end postype(::Type{<:ElementLevel{Vf,Tv,Tp}}) where {Vf,Tv,Tp} = Tp -function moveto(lvl::ElementLevel{Vf,Tv,Tp}, device) where {Vf,Tv,Tp} - return ElementLevel{Vf,Tv,Tp}(moveto(lvl.val, device)) +function transfer(lvl::ElementLevel{Vf,Tv,Tp}, device) where {Vf,Tv,Tp} + return ElementLevel{Vf,Tv,Tp}(transfer(lvl.val, device)) end pattern!(lvl::ElementLevel{Vf,Tv,Tp}) where {Vf,Tv,Tp} = @@ -171,13 +171,13 @@ function reassemble_level!(ctx, lvl::VirtualElementLevel, pos_start, pos_stop) lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualElementLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualElementLevel, arch) val_2 = freshen(ctx, :val) push_preamble!( ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $moveto($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch))) end, ) push_epilogue!( diff --git a/src/tensors/levels/mutex_levels.jl b/src/tensors/levels/mutex_levels.jl index 4f26a5f44..fb368c781 100644 --- a/src/tensors/levels/mutex_levels.jl +++ b/src/tensors/levels/mutex_levels.jl @@ -35,9 +35,9 @@ end postype(::Type{<:MutexLevel{AVal,Lvl}}) where {Lvl,AVal} = postype(Lvl) -function moveto(lvl::MutexLevel, device) - lvl_2 = moveto(lvl.lvl, device) - locks_2 = moveto(lvl.locks, device) +function transfer(lvl::MutexLevel, device) + lvl_2 = transfer(lvl.lvl, device) + locks_2 = transfer(lvl.locks, device) return MutexLevel(lvl_2, locks_2) end @@ -207,7 +207,7 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualMutexLevel, pos) return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arch) #Add for seperation level too. atomics = freshen(ctx, :locksArray) @@ -215,7 +215,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arc ctx, quote $atomics = $(lvl.locks) - $(lvl.locks) = $moveto($(lvl.locks), $(ctx(arch))) + $(lvl.locks) = $transfer($(lvl.locks), $(ctx(arch))) end, ) push_epilogue!( @@ -224,7 +224,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arc $(lvl.locks) = $atomics end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end function instantiate(ctx, fbr::VirtualSubFiber{VirtualMutexLevel}, mode) diff --git a/src/tensors/levels/pattern_levels.jl b/src/tensors/levels/pattern_levels.jl index 31e477be9..95d274ff7 100644 --- a/src/tensors/levels/pattern_levels.jl +++ b/src/tensors/levels/pattern_levels.jl @@ -47,7 +47,7 @@ isstructequal(a::T, b::T) where {T<:Pattern} = true postype(::Type{<:PatternLevel{Tp}}) where {Tp} = Tp -function moveto(lvl::PatternLevel{Tp}, device) where {Tp} +function transfer(lvl::PatternLevel{Tp}, device) where {Tp} return PatternLevel{Tp}() end @@ -93,7 +93,7 @@ struct VirtualPatternLevel <: AbstractVirtualLevel Tp end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualPatternLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualPatternLevel, arch) end is_level_injective(ctx, ::VirtualPatternLevel) = [] diff --git a/src/tensors/levels/separate_levels.jl b/src/tensors/levels/separate_levels.jl index 62999cc02..e9de1f367 100644 --- a/src/tensors/levels/separate_levels.jl +++ b/src/tensors/levels/separate_levels.jl @@ -36,9 +36,9 @@ end postype(::Type{<:Separate{Lvl,Val}}) where {Lvl,Val} = postype(Lvl) -function moveto(lvl::SeparateLevel, device) - lvl_2 = moveto(lvl.lvl, device) - val_2 = moveto(lvl.val, device) +function transfer(lvl::SeparateLevel, device) + lvl_2 = transfer(lvl.lvl, device) + val_2 = transfer(lvl.val, device) return SeparateLevel(lvl_2, val_2) end @@ -146,7 +146,7 @@ virtual_level_size(ctx, lvl::VirtualSeparateLevel) = virtual_level_size(ctx, lvl virtual_level_eltype(lvl::VirtualSeparateLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualSeparateLevel) = virtual_level_fill_value(lvl.lvl) -function virtual_moveto_level(ctx, lvl::VirtualSeparateLevel, arch) +function virtual_transfer_level(ctx, lvl::VirtualSeparateLevel, arch) # Need to move each pointer... val_2 = freshen(ctx, lvl.val) @@ -154,7 +154,7 @@ function virtual_moveto_level(ctx, lvl::VirtualSeparateLevel, arch) ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $moveto($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch))) end, ) push_epilogue!( @@ -163,7 +163,7 @@ function virtual_moveto_level(ctx, lvl::VirtualSeparateLevel, arch) $(lvl.val) = $val_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end function declare_level!(ctx, lvl::VirtualSeparateLevel, pos, init) diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl index 077f60bd9..b9a8e9082 100644 --- a/src/tensors/levels/shard_levels.jl +++ b/src/tensors/levels/shard_levels.jl @@ -28,7 +28,7 @@ end const Shard = ShardLevel function ShardLevel(device::Device, lvl::Lvl) where {Device,Lvl} - ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], moveto(lvl, device)) + ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], transfer(lvl, device)) end #TODO scatterto? function ShardLevel{Device}( @@ -51,10 +51,10 @@ function postype(::Type{<:Shard{Device,Lvl,Ptr,Task,Val}}) where {Device,Lvl,Ptr postype(Lvl) end -function moveto(lvl::ShardLevel, device) - lvl_2 = moveto(lvl.lvl, device) - ptr_2 = moveto(lvl.ptr, device) - task_2 = moveto(lvl.task, device) +function transfer(lvl::ShardLevel, device) + lvl_2 = transfer(lvl.lvl, device) + ptr_2 = transfer(lvl.ptr, device) + task_2 = transfer(lvl.task, device) return ShardLevel(lvl_2, ptr_2, task_2, val_2) end @@ -212,13 +212,13 @@ virtual_level_size(ctx, lvl::VirtualShardLevel) = virtual_level_size(ctx, lvl.lv virtual_level_eltype(lvl::VirtualShardLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualShardLevel) = virtual_level_fill_value(lvl.lvl) -function virtual_moveto_level(ctx, lvl::VirtualShardLevel, arch) +function virtual_transfer_level(ctx, lvl::VirtualShardLevel, arch) val_2 = freshen(ctx, lvl.val) push_preamble!( ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $moveto($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch))) end, ) push_epilogue!( @@ -227,7 +227,7 @@ function virtual_moveto_level(ctx, lvl::VirtualShardLevel, arch) $(lvl.val) = $val_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end function declare_level!(ctx, lvl::VirtualShardLevel, pos, init) diff --git a/src/tensors/levels/sparse_band_levels.jl b/src/tensors/levels/sparse_band_levels.jl index fd5f29687..194c86d26 100644 --- a/src/tensors/levels/sparse_band_levels.jl +++ b/src/tensors/levels/sparse_band_levels.jl @@ -36,10 +36,10 @@ function postype(::Type{SparseBandLevel{Ti,Idx,Ofs,Lvl}}) where {Ti,Idx,Ofs,Lvl} return postype(Lvl) end -function moveto(lvl::SparseBandLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - idx_2 = moveto(lvl.idx, device) - ofs_2 = moveto(lvl.ofs, device) +function transfer(lvl::SparseBandLevel{Ti}, device) where {Ti} + lvl_2 = transfer(lvl.lvl, device) + idx_2 = transfer(lvl.idx, device) + ofs_2 = transfer(lvl.ofs, device) return SparseBandLevel{Ti}(lvl_2, lvl.shape, idx_2, ofs_2) end @@ -241,7 +241,7 @@ end virtual_level_eltype(lvl::VirtualSparseBandLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualSparseBandLevel) = virtual_level_fill_value(lvl.lvl) -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel, arch) tbl_2 = freshen(ctx, lvl.tbl) ofs_2 = freshen(ctx, lvl.ofs) push_preamble!( @@ -249,8 +249,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel quote $tbl_2 = $(lvl.tbl) $ofs_2 = $(lvl.ofs) - $(lvl.tbl) = $moveto($(lvl.tbl), $(ctx(arch))) - $(lvl.ofs) = $moveto($(lvl.ofs), $(ctx(arch))) + $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch))) + $(lvl.ofs) = $transfer($(lvl.ofs), $(ctx(arch))) end, ) push_epilogue!( @@ -260,7 +260,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel $(lvl.ofs) = $ofs_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end function declare_level!(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel, pos, init) diff --git a/src/tensors/levels/sparse_bytemap_levels.jl b/src/tensors/levels/sparse_bytemap_levels.jl index 4c0c03b7a..6610f26b7 100644 --- a/src/tensors/levels/sparse_bytemap_levels.jl +++ b/src/tensors/levels/sparse_bytemap_levels.jl @@ -61,11 +61,11 @@ function postype(::Type{SparseByteMapLevel{Ti,Ptr,Tbl,Srt,Lvl}}) where {Ti,Ptr,T return postype(Lvl) end -function moveto(lvl::SparseByteMapLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - ptr_2 = moveto(lvl.ptr, device) - tbl_2 = moveto(lvl.tbl, device) - srt_2 = moveto(lvl.srt, device) +function transfer(lvl::SparseByteMapLevel{Ti}, device) where {Ti} + lvl_2 = transfer(lvl.lvl, device) + ptr_2 = transfer(lvl.ptr, device) + tbl_2 = transfer(lvl.tbl, device) + srt_2 = transfer(lvl.srt, device) return SparseByteMapLevel{Ti}(lvl_2, lvl.shape, ptr_2, tbl_2, srt_2) end @@ -239,7 +239,7 @@ function lower(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLevel, ::DefaultS end end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLevel, arch) ptr_2 = freshen(ctx, lvl.ptr) tbl_2 = freshen(ctx, lvl.tbl) srt_2 = freshen(ctx, lvl.srt) @@ -249,9 +249,9 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLe $ptr_2 = $(lvl.ptr) $tbl_2 = $(lvl.tbl) $srt_2 = $(lvl.srt) - $(lvl.ptr) = moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.tbl) = moveto($(lvl.tbl), $(ctx(arch))) - $(lvl.srt) = moveto($(lvl.srt), $(ctx(arch))) + $(lvl.ptr) = transfer($(lvl.ptr), $(ctx(arch))) + $(lvl.tbl) = transfer($(lvl.tbl), $(ctx(arch))) + $(lvl.srt) = transfer($(lvl.srt), $(ctx(arch))) end, ) push_epilogue!( @@ -262,7 +262,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLe $(lvl.srt) = $srt_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end Base.summary(lvl::VirtualSparseByteMapLevel) = "SparseByteMap($(summary(lvl.lvl)))" diff --git a/src/tensors/levels/sparse_coo_levels.jl b/src/tensors/levels/sparse_coo_levels.jl index 2b4aa4a4c..e04eadee3 100644 --- a/src/tensors/levels/sparse_coo_levels.jl +++ b/src/tensors/levels/sparse_coo_levels.jl @@ -85,10 +85,10 @@ function postype(::Type{SparseCOOLevel{N,TI,Ptr,Tbl,Lvl}}) where {N,TI,Ptr,Tbl,L return postype(Lvl) end -function moveto(lvl::SparseCOOLevel{N,TI}, device) where {N,TI} - lvl_2 = moveto(lvl.lvl, device) - ptr_2 = moveto(lvl.ptr, device) - tbl_2 = ntuple(n -> moveto(lvl.tbl[n], device), N) +function transfer(lvl::SparseCOOLevel{N,TI}, device) where {N,TI} + lvl_2 = transfer(lvl.lvl, device) + ptr_2 = transfer(lvl.ptr, device) + tbl_2 = ntuple(n -> transfer(lvl.tbl[n], device), N) return SparseCOOLevel{N,TI}(lvl_2, lvl.shape, ptr_2, tbl_2) end @@ -344,13 +344,13 @@ function freeze_level!(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, pos_st return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, arch) ptr_2 = freshen(ctx, lvl.ptr) push_preamble!( ctx, quote $ptr_2 = $(lvl.ptr) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) end, ) push_epilogue!( @@ -365,7 +365,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, ctx, quote $idx_2 = $idx - $idx = $moveto($idx, $(ctx(arch))) + $idx = $transfer($idx, $(ctx(arch))) end, ) push_epilogue!( @@ -376,7 +376,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, ) idx_2 end - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end struct SparseCOOWalkTraversal diff --git a/src/tensors/levels/sparse_dict_levels.jl b/src/tensors/levels/sparse_dict_levels.jl index 9fc11ab27..84c46d017 100644 --- a/src/tensors/levels/sparse_dict_levels.jl +++ b/src/tensors/levels/sparse_dict_levels.jl @@ -90,15 +90,15 @@ function Base.resize!(lvl::SparseDictLevel{Ti}, dims...) where {Ti} ) end -function moveto( +function transfer( lvl::SparseDictLevel{Ti,Ptr,Idx,Val,Tbl,Pool,Lvl}, Tm ) where {Ti,Ptr,Idx,Val,Tbl,Pool,Lvl} - lvl_2 = moveto(lvl.lvl, Tm) - ptr_2 = moveto(lvl.ptr, Tm) - idx_2 = moveto(lvl.idx, Tm) - val_2 = moveto(lvl.val, Tm) - tbl_2 = moveto(lvl.tbl, Tm) - pool_2 = moveto(lvl.pool, Tm) + lvl_2 = transfer(lvl.lvl, Tm) + ptr_2 = transfer(lvl.ptr, Tm) + idx_2 = transfer(lvl.idx, Tm) + val_2 = transfer(lvl.val, Tm) + tbl_2 = transfer(lvl.tbl, Tm) + pool_2 = transfer(lvl.pool, Tm) return SparseDictLevel{Ti}(lvl_2, lvl.shape, ptr_2, idx_2, val_2, tbl_2, pool_2) end @@ -385,7 +385,7 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel, pos_sto return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel, arch) ptr_2 = freshen(ctx, lvl.ptr) idx_2 = freshen(ctx, lvl.idx) tbl_2 = freshen(ctx, lvl.tbl_2) @@ -393,7 +393,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel ctx, quote $tbl_2 = $(lvl.tbl) - $(lvl.tbl) = $moveto($(lvl.tbl), $(ctx(arch))) + $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch))) end, ) push_epilogue!( @@ -402,7 +402,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel $(lvl.tbl) = $tbl_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end function unfurl( diff --git a/src/tensors/levels/sparse_interval_levels.jl b/src/tensors/levels/sparse_interval_levels.jl index 9fcdf6967..9b57e6a84 100644 --- a/src/tensors/levels/sparse_interval_levels.jl +++ b/src/tensors/levels/sparse_interval_levels.jl @@ -63,10 +63,10 @@ function postype(::Type{SparseIntervalLevel{Ti,Left,Right,Lvl}}) where {Ti,Left, return postype(Lvl) end -function moveto(lvl::SparseIntervalLevel{Ti,Left,Right,Lvl}, Tm) where {Ti,Left,Right,Lvl} - lvl_2 = moveto(lvl.lvl, Tm) - left_2 = moveto(lvl.left, Tm) - right_2 = moveto(lvl.right, Tm) +function transfer(lvl::SparseIntervalLevel{Ti,Left,Right,Lvl}, Tm) where {Ti,Left,Right,Lvl} + lvl_2 = transfer(lvl.lvl, Tm) + left_2 = transfer(lvl.left, Tm) + right_2 = transfer(lvl.right, Tm) return SparseIntervalLevel{Ti}(lvl_2, lvl.shape, left_2, right_2) end diff --git a/src/tensors/levels/sparse_list_levels.jl b/src/tensors/levels/sparse_list_levels.jl index fa83d9dee..f79b15dac 100644 --- a/src/tensors/levels/sparse_list_levels.jl +++ b/src/tensors/levels/sparse_list_levels.jl @@ -61,10 +61,10 @@ function postype(::Type{SparseListLevel{Ti,Ptr,Idx,Lvl}}) where {Ti,Ptr,Idx,Lvl} return postype(Lvl) end -function moveto(lvl::SparseListLevel{Ti,Ptr,Idx,Lvl}, Tm) where {Ti,Ptr,Idx,Lvl} - lvl_2 = moveto(lvl.lvl, Tm) - ptr_2 = moveto(lvl.ptr, Tm) - idx_2 = moveto(lvl.idx, Tm) +function transfer(lvl::SparseListLevel{Ti,Ptr,Idx,Lvl}, Tm) where {Ti,Ptr,Idx,Lvl} + lvl_2 = transfer(lvl.lvl, Tm) + ptr_2 = transfer(lvl.ptr, Tm) + idx_2 = transfer(lvl.idx, Tm) return SparseListLevel{Ti}(lvl_2, lvl.shape, ptr_2, idx_2) end @@ -313,7 +313,7 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualSparseListLevel, pos_sto return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel, arch) ptr_2 = freshen(ctx, lvl.ptr) idx_2 = freshen(ctx, lvl.idx) push_preamble!( @@ -321,8 +321,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel quote $ptr_2 = $(lvl.ptr) $idx_2 = $(lvl.idx) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.idx) = $moveto($(lvl.idx), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) + $(lvl.idx) = $transfer($(lvl.idx), $(ctx(arch))) end, ) push_epilogue!( @@ -332,7 +332,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel $(lvl.idx) = $idx_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end function unfurl( diff --git a/src/tensors/levels/sparse_point_levels.jl b/src/tensors/levels/sparse_point_levels.jl index 762bbe20e..31c002569 100644 --- a/src/tensors/levels/sparse_point_levels.jl +++ b/src/tensors/levels/sparse_point_levels.jl @@ -55,9 +55,9 @@ function postype(::Type{SparsePointLevel{Ti,Idx,Lvl}}) where {Ti,Idx,Lvl} return postype(Lvl) end -function moveto(lvl::SparsePointLevel{Ti,Idx,Lvl}, Tm) where {Ti,Idx,Lvl} - lvl_2 = moveto(lvl.lvl, Tm) - idx_2 = moveto(lvl.idx, Tm) +function transfer(lvl::SparsePointLevel{Ti,Idx,Lvl}, Tm) where {Ti,Idx,Lvl} + lvl_2 = transfer(lvl.lvl, Tm) + idx_2 = transfer(lvl.idx, Tm) return SparsePointLevel{Ti}(lvl_2, lvl.shape, idx_2) end @@ -254,14 +254,14 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualSparsePointLevel, pos_st return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparsePointLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparsePointLevel, arch) ptr_2 = freshen(ctx, lvl.ptr) idx_2 = freshen(ctx, lvl.idx) push_preamble!( ctx, quote $idx_2 = $(lvl.idx) - $(lvl.idx) = $moveto($(lvl.idx), $(ctx(arch))) + $(lvl.idx) = $transfer($(lvl.idx), $(ctx(arch))) end, ) push_epilogue!( @@ -270,7 +270,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparsePointLeve $(lvl.idx) = $idx_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end function unfurl( diff --git a/src/tensors/levels/sparse_rle_levels.jl b/src/tensors/levels/sparse_rle_levels.jl index 49de8844a..7e74938a1 100644 --- a/src/tensors/levels/sparse_rle_levels.jl +++ b/src/tensors/levels/sparse_rle_levels.jl @@ -72,12 +72,12 @@ function postype( return postype(Lvl) end -function moveto(lvl::SparseRunListLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - ptr = moveto(lvl.ptr, device) - left = moveto(lvl.left, device) - right = moveto(lvl.right, device) - buf = moveto(lvl.buf, device) +function transfer(lvl::SparseRunListLevel{Ti}, device) where {Ti} + lvl_2 = transfer(lvl.lvl, device) + ptr = transfer(lvl.ptr, device) + left = transfer(lvl.left, device) + right = transfer(lvl.right, device) + buf = transfer(lvl.buf, device) return SparseRunListLevel{Ti}( lvl_2, lvl.shape, lvl.ptr, lvl.left, lvl.right, lvl.buf; merge=getmerge(lvl) ) @@ -303,7 +303,7 @@ function virtual_level_resize!(ctx, lvl::VirtualSparseRunListLevel, dims...) lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLevel, arch) ptr_2 = freshen(ctx, lvl.ptr) left_2 = freshen(ctx, lvl.left) right_2 = freshen(ctx, lvl.right) @@ -313,9 +313,9 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLe $ptr_2 = $(lvl.ptr) $left_2 = $(lvl.left) $right_2 = $(lvl.right) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.left) = $moveto($(lvl.left), $(ctx(arch))) - $(lvl.right) = $moveto($(lvl.right), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) + $(lvl.left) = $transfer($(lvl.left), $(ctx(arch))) + $(lvl.right) = $transfer($(lvl.right), $(ctx(arch))) end, ) push_epilogue!( @@ -326,8 +326,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLe $(lvl.right) = $right_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) - virtual_moveto_level(ctx, lvl.buf, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.buf, arch) end virtual_level_eltype(lvl::VirtualSparseRunListLevel) = virtual_level_eltype(lvl.lvl) diff --git a/src/tensors/levels/sparse_vbl_levels.jl b/src/tensors/levels/sparse_vbl_levels.jl index ae8f6ff2d..a26e26a11 100644 --- a/src/tensors/levels/sparse_vbl_levels.jl +++ b/src/tensors/levels/sparse_vbl_levels.jl @@ -54,11 +54,11 @@ function postype( return postype(Lvl) end -function moveto(lvl::SparseBlockListLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - ptr_2 = moveto(lvl.ptr, device) - idx_2 = moveto(lvl.idx, device) - ofs_2 = moveto(lvl.ofs, device) +function transfer(lvl::SparseBlockListLevel{Ti}, device) where {Ti} + lvl_2 = transfer(lvl.lvl, device) + ptr_2 = transfer(lvl.ptr, device) + idx_2 = transfer(lvl.idx, device) + ofs_2 = transfer(lvl.ofs, device) return SparseBlockListLevel{Ti}(lvl_2, lvl.shape, ptr_2, idx_2, ofs_2) end @@ -289,7 +289,7 @@ function virtual_level_fill_value(lvl::VirtualSparseBlockListLevel) virtual_level_fill_value(lvl.lvl) end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockListLevel, arch) ptr_2 = freshen(ctx, lvl.ptr) tbl_2 = freshen(ctx, lvl.tbl) ofs_2 = freshen(ctx, lvl.ofs) @@ -299,9 +299,9 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockList $ptr_2 = $(lvl.ptr) $tbl_2 = $(lvl.tbl) $ofs_2 = $(lvl.ofs) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.tbl) = $moveto($(lvl.tbl), $(ctx(arch))) - $(lvl.ofs) = $moveto($(lvl.ofs), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) + $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch))) + $(lvl.ofs) = $transfer($(lvl.ofs), $(ctx(arch))) end, ) push_epilogue!( @@ -312,7 +312,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockList $(lvl.ofs) = $ofs_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch) end function declare_level!(ctx::AbstractCompiler, lvl::VirtualSparseBlockListLevel, pos, init) diff --git a/src/tensors/scalars.jl b/src/tensors/scalars.jl index 11ac313e8..cba9db56b 100644 --- a/src/tensors/scalars.jl +++ b/src/tensors/scalars.jl @@ -41,7 +41,7 @@ function virtualize(ctx, ex, ::Type{Scalar{Vf,Tv}}, tag) where {Vf,Tv} VirtualScalar(sym, Tv, Vf, tag, val) end -virtual_moveto(ctx, lvl::VirtualScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualScalar, arch) = lvl virtual_size(ctx, ::VirtualScalar) = () @@ -147,7 +147,7 @@ virtual_size(ctx, ::VirtualSparseScalar) = () virtual_fill_value(ctx, tns::VirtualSparseScalar) = tns.Vf virtual_eltype(tns::VirtualSparseScalar, ctx) = tns.Tv -virtual_moveto(ctx, lvl::VirtualSparseScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualSparseScalar, arch) = lvl function declare!(ctx, tns::VirtualSparseScalar, init) push_preamble!( @@ -289,7 +289,7 @@ function lower_assign(ctx, tns::VirtualShortCircuitScalar, mode, op, rhs) :($(tns.val) = $lhs_2) end -virtual_moveto(ctx, lvl::VirtualShortCircuitScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualShortCircuitScalar, arch) = lvl function short_circuit_cases(ctx, tns::VirtualShortCircuitScalar, op) [ @@ -359,7 +359,7 @@ virtual_size(ctx, ::VirtualSparseShortCircuitScalar) = () virtual_fill_value(ctx, tns::VirtualSparseShortCircuitScalar) = tns.Vf virtual_eltype(tns::VirtualSparseShortCircuitScalar, ctx) = tns.Tv -virtual_moveto(ctx, lvl::VirtualSparseShortCircuitScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualSparseShortCircuitScalar, arch) = lvl function declare!(ctx, tns::VirtualSparseShortCircuitScalar, init) push_preamble!( diff --git a/src/util/vectors.jl b/src/util/vectors.jl index bcf901a2c..20eec0920 100644 --- a/src/util/vectors.jl +++ b/src/util/vectors.jl @@ -31,8 +31,8 @@ Base.size(vec::PlusOneVector{T}) where {T} = size(vec.data) Base.axes(vec::PlusOneVector{T}) where {T} = axes(vec.data) Base.resize!(vec::PlusOneVector{T}, dim) where {T} = resize!(vec.data, dim) -function moveto(vec::PlusOneVector{T}, device) where {T} - data = moveto(vec.data, device) +function transfer(vec::PlusOneVector{T}, device) where {T} + data = transfer(vec.data, device) return PlusOneVector{T}(data) end @@ -77,8 +77,8 @@ Base.size(vec::MinusEpsVector{T}) where {T} = size(vec.data) Base.axes(vec::MinusEpsVector{T}) where {T} = axes(vec.data) Base.resize!(vec::MinusEpsVector{T}, dim) where {T} = resize!(vec.data, dim) -function moveto(vec::MinusEpsVector{T}, device) where {T} - data = moveto(vec.data, device) +function transfer(vec::MinusEpsVector{T}, device) where {T} + data = transfer(vec.data, device) return MinusEpsVector{T}(data) end @@ -123,7 +123,7 @@ Base.size(vec::PlusEpsVector{T}) where {T} = size(vec.data) Base.axes(vec::PlusEpsVector{T}) where {T} = axes(vec.data) Base.resize!(vec::PlusEpsVector{T}, dim) where {T} = resize!(vec.data, dim) -function moveto(vec::PlusEpsVector{T}, device) where {T} - data = moveto(vec.data, device) +function transfer(vec::PlusEpsVector{T}, device) where {T} + data = transfer(vec.data, device) return PlusEpsVector{T}(data) end diff --git a/test/reference32/parallel/atomics_sym_spmv.txt b/test/reference32/parallel/atomics_sym_spmv.txt index 07ed8cfd8..6b2673ba8 100644 --- a/test/reference32/parallel/atomics_sym_spmv.txt +++ b/test/reference32/parallel/atomics_sym_spmv.txt @@ -23,15 +23,15 @@ begin end Finch.resize_if_smaller!(y_lvl_2_val, x_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0.0, 1, x_lvl.shape) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) val_3 = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) - diag_lvl_val = (Finch).moveto(diag_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) + diag_lvl_val = (Finch).transfer(diag_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_blur.jl b/test/reference32/parallel/parallel_blur.jl index 896ebf877..6ccfada4f 100644 --- a/test/reference32/parallel/parallel_blur.jl +++ b/test/reference32/parallel/parallel_blur.jl @@ -20,14 +20,14 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_2_val = (Finch).moveto(input_lvl_2_val, cpu) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).moveto(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).moveto(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial())) res_6 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference32/parallel/parallel_blur_sparse.jl b/test/reference32/parallel/parallel_blur_sparse.jl index 0410f18e7..af13ee215 100644 --- a/test/reference32/parallel/parallel_blur_sparse.jl +++ b/test/reference32/parallel/parallel_blur_sparse.jl @@ -22,16 +22,16 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_ptr = (Finch).moveto(input_lvl_ptr, cpu) - input_lvl_idx = (Finch).moveto(input_lvl_idx, cpu) - input_lvl_2_val = (Finch).moveto(input_lvl_2_val, cpu) + input_lvl_ptr = (Finch).transfer(input_lvl_ptr, cpu) + input_lvl_idx = (Finch).transfer(input_lvl_idx, cpu) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).moveto(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).moveto(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial())) res_71 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_1.txt b/test/reference32/parallel/parallel_spmms_no_atomics_1.txt index be19b7230..15b897b83 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_1.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_1.txt @@ -20,15 +20,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_2.txt b/test/reference32/parallel/parallel_spmms_no_atomics_2.txt index 7bf416ea1..ddd615e2b 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_2.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_2.txt @@ -21,23 +21,23 @@ begin Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) for i_4 = 1:A_lvl.shape[1] val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) B_lvl_ptr_2 = B_lvl_ptr - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_2 = B_lvl_tbl2 - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) val_2 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_3 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_3.txt b/test/reference32/parallel/parallel_spmms_no_atomics_3.txt index 56a2553c8..e406623d2 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_3.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_3.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_4.txt b/test/reference32/parallel/parallel_spmms_no_atomics_4.txt index e317e6193..f5f13f0df 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_4.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_4.txt @@ -54,20 +54,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_2 = B_lvl_tbl2 val_2 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_3 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_9 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -186,20 +186,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_18 val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_3 = B_lvl_tbl2 val_5 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_6 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_19 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_5.txt b/test/reference32/parallel/parallel_spmms_no_atomics_5.txt index c143fcbd1..9ee761e3e 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_5.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_5.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -68,21 +68,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_5 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) B_lvl_ptr_3 = B_lvl_ptr B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_3 = B_lvl_tbl2 val_6 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_10 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -202,21 +202,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_19 val_7 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) A_lvl_ptr_4 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_4 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_4 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_8 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) B_lvl_ptr_4 = B_lvl_ptr B_lvl_tbl1_4 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_4 = B_lvl_tbl2 val_9 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_20 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmv.txt b/test/reference32/parallel/parallel_spmv.txt index 8d8ae1101..72aadc0b9 100644 --- a/test/reference32/parallel/parallel_spmv.txt +++ b/test/reference32/parallel/parallel_spmv.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl.shape) val = y_lvl_val - y_lvl_val = (Finch).moveto(y_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmv_atomic.txt b/test/reference32/parallel/parallel_spmv_atomic.txt index 9bf5012a5..8e0c41443 100644 --- a/test/reference32/parallel/parallel_spmv_atomic.txt +++ b/test/reference32/parallel/parallel_spmv_atomic.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl_2.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl_2.shape) val = y_lvl_val - y_lvl_val = (Finch).moveto(y_lvl_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmv_atomics.txt b/test/reference32/parallel/parallel_spmv_atomics.txt index f5c707ef8..8afb8fecd 100644 --- a/test/reference32/parallel/parallel_spmv_atomics.txt +++ b/test/reference32/parallel/parallel_spmv_atomics.txt @@ -27,13 +27,13 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, A_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, A_lvl.shape) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) val = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/stress_dense_atomics.txt b/test/reference32/parallel/stress_dense_atomics.txt index 7f03510f5..93641ccd2 100644 --- a/test/reference32/parallel/stress_dense_atomics.txt +++ b/test/reference32/parallel/stress_dense_atomics.txt @@ -30,11 +30,11 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, y_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, y_lvl.shape) resize!(x_lvl_val, x_lvl.shape) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) val_2 = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) Threads.@threads for i = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/atomics_sym_spmv.txt b/test/reference64/parallel/atomics_sym_spmv.txt index 18325a90e..7fa7f8f02 100644 --- a/test/reference64/parallel/atomics_sym_spmv.txt +++ b/test/reference64/parallel/atomics_sym_spmv.txt @@ -23,15 +23,15 @@ begin end Finch.resize_if_smaller!(y_lvl_2_val, x_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0.0, 1, x_lvl.shape) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) val_3 = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) - diag_lvl_val = (Finch).moveto(diag_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) + diag_lvl_val = (Finch).transfer(diag_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_blur.jl b/test/reference64/parallel/parallel_blur.jl index 7284481c6..fec0d205b 100644 --- a/test/reference64/parallel/parallel_blur.jl +++ b/test/reference64/parallel/parallel_blur.jl @@ -20,14 +20,14 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_2_val = (Finch).moveto(input_lvl_2_val, cpu) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).moveto(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).moveto(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial())) res_6 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference64/parallel/parallel_blur_sparse.jl b/test/reference64/parallel/parallel_blur_sparse.jl index befeec956..8ac6ac88b 100644 --- a/test/reference64/parallel/parallel_blur_sparse.jl +++ b/test/reference64/parallel/parallel_blur_sparse.jl @@ -22,16 +22,16 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_ptr = (Finch).moveto(input_lvl_ptr, cpu) - input_lvl_idx = (Finch).moveto(input_lvl_idx, cpu) - input_lvl_2_val = (Finch).moveto(input_lvl_2_val, cpu) + input_lvl_ptr = (Finch).transfer(input_lvl_ptr, cpu) + input_lvl_idx = (Finch).transfer(input_lvl_idx, cpu) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).moveto(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).moveto(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial())) res_71 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_1.txt b/test/reference64/parallel/parallel_spmms_no_atomics_1.txt index a4986cd00..9708e569c 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_1.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_1.txt @@ -20,15 +20,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_2.txt b/test/reference64/parallel/parallel_spmms_no_atomics_2.txt index b4a0d757b..24e3c3487 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_2.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_2.txt @@ -21,23 +21,23 @@ begin Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) for i_4 = 1:A_lvl.shape[1] val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) B_lvl_ptr_2 = B_lvl_ptr - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_2 = B_lvl_tbl2 - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) val_2 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_3 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_3.txt b/test/reference64/parallel/parallel_spmms_no_atomics_3.txt index ed44c6465..13525bf1d 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_3.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_3.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_4.txt b/test/reference64/parallel/parallel_spmms_no_atomics_4.txt index 0d4601fa7..b725d5fc9 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_4.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_4.txt @@ -54,20 +54,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_2 = B_lvl_tbl2 val_2 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_3 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_9 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -186,20 +186,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_18 val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_3 = B_lvl_tbl2 val_5 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_6 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_19 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_5.txt b/test/reference64/parallel/parallel_spmms_no_atomics_5.txt index 49ca7ab16..f3a821b23 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_5.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_5.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -68,21 +68,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_5 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) B_lvl_ptr_3 = B_lvl_ptr B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_3 = B_lvl_tbl2 val_6 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_10 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -202,21 +202,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_19 val_7 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) A_lvl_ptr_4 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) A_lvl_tbl1_4 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) A_lvl_tbl2_4 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) val_8 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) B_lvl_ptr_4 = B_lvl_ptr B_lvl_tbl1_4 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) B_lvl_tbl2_4 = B_lvl_tbl2 val_9 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_20 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmv.txt b/test/reference64/parallel/parallel_spmv.txt index 407116f23..4d2ab5c94 100644 --- a/test/reference64/parallel/parallel_spmv.txt +++ b/test/reference64/parallel/parallel_spmv.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl.shape) val = y_lvl_val - y_lvl_val = (Finch).moveto(y_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmv_atomic.txt b/test/reference64/parallel/parallel_spmv_atomic.txt index 5d071fa23..9b5fe2128 100644 --- a/test/reference64/parallel/parallel_spmv_atomic.txt +++ b/test/reference64/parallel/parallel_spmv_atomic.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl_2.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl_2.shape) val = y_lvl_val - y_lvl_val = (Finch).moveto(y_lvl_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmv_atomics.txt b/test/reference64/parallel/parallel_spmv_atomics.txt index 8bb3bd88d..e433a2e28 100644 --- a/test/reference64/parallel/parallel_spmv_atomics.txt +++ b/test/reference64/parallel/parallel_spmv_atomics.txt @@ -27,13 +27,13 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, A_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, A_lvl.shape) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) val = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/stress_dense_atomics.txt b/test/reference64/parallel/stress_dense_atomics.txt index 67b2308ed..855608a60 100644 --- a/test/reference64/parallel/stress_dense_atomics.txt +++ b/test/reference64/parallel/stress_dense_atomics.txt @@ -30,11 +30,11 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, y_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, y_lvl.shape) resize!(x_lvl_val, x_lvl.shape) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) val_2 = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) Threads.@threads for i = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/suites/parallel_tests.jl b/test/suites/parallel_tests.jl index 2b7b3e457..2f6e05b4d 100644 --- a/test/suites/parallel_tests.jl +++ b/test/suites/parallel_tests.jl @@ -554,7 +554,7 @@ input = Tensor(Dense(Dense(Element(0.0)))) output = Tensor(Dense(Dense(Element(0.0)))) cpu = CPU(Threads.nthreads()) - tmp = moveto(Tensor(Dense(Element(0))), CPULocalMemory(cpu)) + tmp = transfer(Tensor(Dense(Element(0))), CPULocalMemory(cpu)) check_output( "parallel/parallel_blur.jl", @@ -579,7 +579,7 @@ input = Tensor(Dense(SparseList(Element(0.0)))) output = Tensor(Dense(Dense(Element(0.0)))) cpu = CPU(Threads.nthreads()) - tmp = moveto(Tensor(Dense(Element(0))), CPULocalMemory(cpu)) + tmp = transfer(Tensor(Dense(Element(0))), CPULocalMemory(cpu)) check_output( "parallel/parallel_blur_sparse.jl", From 553e57eff0654470dedd355bba486af7fdf55a89 Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Wed, 5 Feb 2025 20:27:35 -0500 Subject: [PATCH 22/25] right direction --- src/architecture.jl | 12 ++--- src/interface/abstract_arrays.jl | 4 +- src/lower.jl | 4 +- src/tensors/fibers.jl | 14 +++--- src/tensors/levels/atomic_element_levels.jl | 8 ++-- src/tensors/levels/dense_levels.jl | 8 ++-- src/tensors/levels/dense_rle_levels.jl | 20 ++++---- src/tensors/levels/element_levels.jl | 8 ++-- src/tensors/levels/mutex_levels.jl | 12 ++--- src/tensors/levels/pattern_levels.jl | 4 +- src/tensors/levels/separate_levels.jl | 12 ++--- src/tensors/levels/shard_levels.jl | 16 +++---- src/tensors/levels/sparse_band_levels.jl | 16 +++---- src/tensors/levels/sparse_bytemap_levels.jl | 20 ++++---- src/tensors/levels/sparse_coo_levels.jl | 16 +++---- src/tensors/levels/sparse_dict_levels.jl | 18 ++++---- src/tensors/levels/sparse_interval_levels.jl | 8 ++-- src/tensors/levels/sparse_list_levels.jl | 16 +++---- src/tensors/levels/sparse_point_levels.jl | 12 ++--- src/tensors/levels/sparse_rle_levels.jl | 24 +++++----- src/tensors/levels/sparse_vbl_levels.jl | 20 ++++---- src/tensors/scalars.jl | 8 ++-- src/util/vectors.jl | 12 ++--- .../reference32/parallel/atomics_sym_spmv.txt | 14 +++--- test/reference32/parallel/parallel_blur.jl | 6 +-- .../parallel/parallel_blur_sparse.jl | 10 ++-- .../parallel/parallel_spmms_no_atomics_1.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_2.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_3.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_4.txt | 28 +++++------ .../parallel/parallel_spmms_no_atomics_5.txt | 46 +++++++++---------- test/reference32/parallel/parallel_spmv.txt | 10 ++-- .../parallel/parallel_spmv_atomic.txt | 10 ++-- .../parallel/parallel_spmv_atomics.txt | 12 ++--- .../parallel/stress_dense_atomics.txt | 6 +-- .../reference64/parallel/atomics_sym_spmv.txt | 14 +++--- test/reference64/parallel/parallel_blur.jl | 6 +-- .../parallel/parallel_blur_sparse.jl | 10 ++-- .../parallel/parallel_spmms_no_atomics_1.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_2.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_3.txt | 18 ++++---- .../parallel/parallel_spmms_no_atomics_4.txt | 28 +++++------ .../parallel/parallel_spmms_no_atomics_5.txt | 46 +++++++++---------- test/reference64/parallel/parallel_spmv.txt | 10 ++-- .../parallel/parallel_spmv_atomic.txt | 10 ++-- .../parallel/parallel_spmv_atomics.txt | 12 ++--- .../parallel/stress_dense_atomics.txt | 6 +-- test/suites/parallel_tests.jl | 4 +- 48 files changed, 344 insertions(+), 344 deletions(-) diff --git a/src/architecture.jl b/src/architecture.jl index 739d9106a..c472695d0 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -188,7 +188,7 @@ get_task_num(task::VirtualCPUThread) = task.tid struct CPULocalMemory device::CPU end -function transfer(vec::V, mem::CPULocalMemory) where {V<:Vector} +function transfer(vec::V, mem::CPULocalMemory, style) where {V<:Vector} CPULocalVector{V}(mem.device, [copy(vec) for _ in 1:(mem.device.n)]) end @@ -204,15 +204,15 @@ end Base.eltype(::Type{CPULocalVector{V}}) where {V} = eltype(V) Base.ndims(::Type{CPULocalVector{V}}) where {V} = ndims(V) -function transfer(vec::Vector, device::CPU) +function transfer(vec::Vector, device::CPU, style) return vec end -function transfer(vec::Vector, task::CPUThread) +function transfer(vec::Vector, task::CPUThread, style) return copy(vec) end -function transfer(vec::CPULocalVector, task::CPUThread) +function transfer(vec::CPULocalVector, task::CPUThread, style) temp = vec.data[task.tid] return temp end @@ -321,7 +321,7 @@ function virtual_parallel_region(f, ctx, device::VirtualCPU) end """ - transfer(arr, device) + transfer(arr, device, style) If the array is not on the given device, it creates a new version of this array on that device and copies the data in to it, according to the `device` trait. @@ -329,7 +329,7 @@ and copies the data in to it, according to the `device` trait. function transfer end """ - virtual_transfer(device, arr) + virtual_transfer(device, arr, style) If the virtual array is not on the given device, copy the array to that device. This function may modify underlying data arrays, but cannot change the virtual itself. This diff --git a/src/interface/abstract_arrays.jl b/src/interface/abstract_arrays.jl index c77bef03b..222a54d54 100644 --- a/src/interface/abstract_arrays.jl +++ b/src/interface/abstract_arrays.jl @@ -130,13 +130,13 @@ FinchNotation.finch_leaf(x::VirtualAbstractArray) = virtual(x) virtual_fill_value(ctx, ::VirtualAbstractArray) = 0 virtual_eltype(ctx, tns::VirtualAbstractArray) = tns.eltype -function virtual_transfer(ctx, vec::VirtualAbstractArray, device) +function virtual_transfer(ctx, vec::VirtualAbstractArray, device, style) ex = freshen(ctx, vec.ex) push_preamble!( ctx, quote $ex = $(vec.ex) - $(vec.ex) = $transfer($(vec.ex), $(ctx(device))) + $(vec.ex) = $transfer($(vec.ex), $(ctx(device)), style) end, ) push_epilogue!( diff --git a/src/lower.jl b/src/lower.jl index 7d6d8e564..cbf89c171 100644 --- a/src/lower.jl +++ b/src/lower.jl @@ -342,14 +342,14 @@ function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualC ) for tns in setdiff(used_in_scope, decl_in_scope) - virtual_transfer(ctx, resolve(ctx, tns), device) + virtual_transfer(ctx, resolve(ctx, tns), device, style) end virtual_parallel_region(ctx, device) do ctx_2 subtask = get_task(ctx_2) tid = get_task_num(subtask) for tns in intersect(used_in_scope, decl_in_scope) - virtual_transfer(ctx_2, resolve(ctx_2, tns), subtask) + virtual_transfer(ctx_2, resolve(ctx_2, tns), subtask, style) end contain(ctx_2) do ctx_3 open_scope(ctx_3) do ctx_4 diff --git a/src/tensors/fibers.jl b/src/tensors/fibers.jl index 400ef936c..a37482eee 100644 --- a/src/tensors/fibers.jl +++ b/src/tensors/fibers.jl @@ -139,12 +139,12 @@ function unfurl(ctx::AbstractCompiler, arr::VirtualFiber, ext, mode, proto) unfurl(ctx, VirtualSubFiber(arr.lvl, literal(1)), ext, mode, proto) end -function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualFiber, arch) - virtual_transfer_level(ctx, fbr.lvl, arch) +function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualFiber, arch, style) + virtual_transfer_level(ctx, fbr.lvl, arch, style) end -function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualSubFiber, arch) - virtual_transfer_level(ctx, fbr.lvl, arch) +function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualSubFiber, arch, style) + virtual_transfer_level(ctx, fbr.lvl, arch, style) end struct HollowSubFiber{Lvl,Pos,Dirty} <: AbstractFiber{Lvl} @@ -171,9 +171,9 @@ function lower(ctx::AbstractCompiler, fbr::VirtualHollowSubFiber, ::DefaultStyle end FinchNotation.finch_leaf(x::VirtualHollowSubFiber) = virtual(x) -function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualHollowSubFiber, arch) +function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualHollowSubFiber, arch, style) return VirtualHollowSubFiber( - virtual_transfer_level(ctx, fbr.lvl, arch), fbr.pos, fbr.dirty + virtual_transfer_level(ctx, fbr.lvl, arch, style), fbr.pos, fbr.dirty ) end @@ -333,7 +333,7 @@ function Base.similar(fbr::AbstractFiber, fill_value, eltype::Type, dims::Tuple) Tensor(similar_level(fbr.lvl, fill_value, eltype, dims...)) end -transfer(tns::Tensor, device) = Tensor(transfer(tns.lvl, device)) +transfer(tns::Tensor, device) = Tensor(transfer(tns.lvl, device), style) struct Structure t diff --git a/src/tensors/levels/atomic_element_levels.jl b/src/tensors/levels/atomic_element_levels.jl index 313c904c3..d9d371a04 100644 --- a/src/tensors/levels/atomic_element_levels.jl +++ b/src/tensors/levels/atomic_element_levels.jl @@ -40,8 +40,8 @@ end postype(::Type{<:AtomicElementLevel{Vf,Tv,Tp}}) where {Vf,Tv,Tp} = Tp -function transfer(lvl::AtomicElementLevel{Vf,Tv,Tp}, device) where {Vf,Tv,Tp} - return AtomicElementLevel{Vf,Tv,Tp}(transfer(lvl.val, device)) +function transfer(lvl::AtomicElementLevel{Vf,Tv,Tp}, device, style) where {Vf,Tv,Tp} + return AtomicElementLevel{Vf,Tv,Tp}(transfer(lvl.val, device), style) end pattern!(lvl::AtomicElementLevel{Vf,Tv,Tp}) where {Vf,Tv,Tp} = @@ -165,13 +165,13 @@ function reassemble_level!(ctx, lvl::VirtualAtomicElementLevel, pos_start, pos_s lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualAtomicElementLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualAtomicElementLevel, arch, style) val_2 = freshen(ctx, :val) push_preamble!( ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $transfer($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch)), style) end, ) push_epilogue!( diff --git a/src/tensors/levels/dense_levels.jl b/src/tensors/levels/dense_levels.jl index 372e49faf..32c69f760 100644 --- a/src/tensors/levels/dense_levels.jl +++ b/src/tensors/levels/dense_levels.jl @@ -44,8 +44,8 @@ function postype(::Type{DenseLevel{Ti,Lvl}}) where {Ti,Lvl} return postype(Lvl) end -function transfer(lvl::DenseLevel{Ti}, device) where {Ti} - return DenseLevel{Ti}(transfer(lvl.lvl, device), lvl.shape) +function transfer(lvl::DenseLevel{Ti}, device, style) where {Ti} + return DenseLevel{Ti}(transfer(lvl.lvl, device), lvl.shape, style) end function pattern!(lvl::DenseLevel{Ti,Lvl}) where {Ti,Lvl} @@ -201,8 +201,8 @@ function freeze_level!(ctx::AbstractCompiler, lvl::VirtualDenseLevel, pos) return lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualDenseLevel, arch) - virtual_transfer_level(ctx, lvl.lvl, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualDenseLevel, arch, style) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end struct DenseTraversal diff --git a/src/tensors/levels/dense_rle_levels.jl b/src/tensors/levels/dense_rle_levels.jl index 047e53ba0..0ca813128 100644 --- a/src/tensors/levels/dense_rle_levels.jl +++ b/src/tensors/levels/dense_rle_levels.jl @@ -64,11 +64,11 @@ function postype( return postype(Lvl) end -function transfer(lvl::RunListLevel{Ti}, device) where {Ti} - lvl_2 = transfer(lvl.lvl, device) - ptr = transfer(lvl.ptr, device) - right = transfer(lvl.right, device) - buf = transfer(lvl.buf, device) +function transfer(lvl::RunListLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + ptr = transfer(lvl.ptr, device, style) + right = transfer(lvl.right, device, style) + buf = transfer(lvl.buf, device, style) return RunListLevel{Ti}( lvl_2, lvl.shape, lvl.ptr, lvl.right, lvl.buf; merge=getmerge(lvl) ) @@ -295,7 +295,7 @@ function virtual_level_resize!(ctx, lvl::VirtualRunListLevel, dims...) lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) right_2 = freshen(ctx, lvl.right) push_preamble!( @@ -303,8 +303,8 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, quote $ptr_2 = $(lvl.ptr) $right_2 = $(lvl.right) - $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) - $(lvl.right) = $transfer($(lvl.right), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.right) = $transfer($(lvl.right), $(ctx(arch)), style) end, ) push_epilogue!( @@ -314,8 +314,8 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, $(lvl.right) = $right_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) - virtual_transfer_level(ctx, lvl.buf, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) + virtual_transfer_level(ctx, lvl.buf, arch, style) end virtual_level_eltype(lvl::VirtualRunListLevel) = virtual_level_eltype(lvl.lvl) diff --git a/src/tensors/levels/element_levels.jl b/src/tensors/levels/element_levels.jl index b8807af01..b8903ee27 100644 --- a/src/tensors/levels/element_levels.jl +++ b/src/tensors/levels/element_levels.jl @@ -43,8 +43,8 @@ end postype(::Type{<:ElementLevel{Vf,Tv,Tp}}) where {Vf,Tv,Tp} = Tp -function transfer(lvl::ElementLevel{Vf,Tv,Tp}, device) where {Vf,Tv,Tp} - return ElementLevel{Vf,Tv,Tp}(transfer(lvl.val, device)) +function transfer(lvl::ElementLevel{Vf,Tv,Tp}, device, style) where {Vf,Tv,Tp} + return ElementLevel{Vf,Tv,Tp}(transfer(lvl.val, device), style) end pattern!(lvl::ElementLevel{Vf,Tv,Tp}) where {Vf,Tv,Tp} = @@ -171,13 +171,13 @@ function reassemble_level!(ctx, lvl::VirtualElementLevel, pos_start, pos_stop) lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualElementLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualElementLevel, arch, style) val_2 = freshen(ctx, :val) push_preamble!( ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $transfer($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch)), style) end, ) push_epilogue!( diff --git a/src/tensors/levels/mutex_levels.jl b/src/tensors/levels/mutex_levels.jl index fb368c781..b3c066d47 100644 --- a/src/tensors/levels/mutex_levels.jl +++ b/src/tensors/levels/mutex_levels.jl @@ -35,9 +35,9 @@ end postype(::Type{<:MutexLevel{AVal,Lvl}}) where {Lvl,AVal} = postype(Lvl) -function transfer(lvl::MutexLevel, device) - lvl_2 = transfer(lvl.lvl, device) - locks_2 = transfer(lvl.locks, device) +function transfer(lvl::MutexLevel, device, style) + lvl_2 = transfer(lvl.lvl, device, style) + locks_2 = transfer(lvl.locks, device, style) return MutexLevel(lvl_2, locks_2) end @@ -207,7 +207,7 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualMutexLevel, pos) return lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arch, style) #Add for seperation level too. atomics = freshen(ctx, :locksArray) @@ -215,7 +215,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, a ctx, quote $atomics = $(lvl.locks) - $(lvl.locks) = $transfer($(lvl.locks), $(ctx(arch))) + $(lvl.locks) = $transfer($(lvl.locks), $(ctx(arch)), style) end, ) push_epilogue!( @@ -224,7 +224,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, a $(lvl.locks) = $atomics end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function instantiate(ctx, fbr::VirtualSubFiber{VirtualMutexLevel}, mode) diff --git a/src/tensors/levels/pattern_levels.jl b/src/tensors/levels/pattern_levels.jl index 95d274ff7..951b14f19 100644 --- a/src/tensors/levels/pattern_levels.jl +++ b/src/tensors/levels/pattern_levels.jl @@ -47,7 +47,7 @@ isstructequal(a::T, b::T) where {T<:Pattern} = true postype(::Type{<:PatternLevel{Tp}}) where {Tp} = Tp -function transfer(lvl::PatternLevel{Tp}, device) where {Tp} +function transfer(lvl::PatternLevel{Tp}, device, style) where {Tp} return PatternLevel{Tp}() end @@ -93,7 +93,7 @@ struct VirtualPatternLevel <: AbstractVirtualLevel Tp end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualPatternLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualPatternLevel, arch, style) end is_level_injective(ctx, ::VirtualPatternLevel) = [] diff --git a/src/tensors/levels/separate_levels.jl b/src/tensors/levels/separate_levels.jl index e9de1f367..b1c5220f9 100644 --- a/src/tensors/levels/separate_levels.jl +++ b/src/tensors/levels/separate_levels.jl @@ -36,9 +36,9 @@ end postype(::Type{<:Separate{Lvl,Val}}) where {Lvl,Val} = postype(Lvl) -function transfer(lvl::SeparateLevel, device) - lvl_2 = transfer(lvl.lvl, device) - val_2 = transfer(lvl.val, device) +function transfer(lvl::SeparateLevel, device, style) + lvl_2 = transfer(lvl.lvl, device, style) + val_2 = transfer(lvl.val, device, style) return SeparateLevel(lvl_2, val_2) end @@ -146,7 +146,7 @@ virtual_level_size(ctx, lvl::VirtualSeparateLevel) = virtual_level_size(ctx, lvl virtual_level_eltype(lvl::VirtualSeparateLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualSeparateLevel) = virtual_level_fill_value(lvl.lvl) -function virtual_transfer_level(ctx, lvl::VirtualSeparateLevel, arch) +function virtual_transfer_level(ctx, lvl::VirtualSeparateLevel, arch, style) # Need to move each pointer... val_2 = freshen(ctx, lvl.val) @@ -154,7 +154,7 @@ function virtual_transfer_level(ctx, lvl::VirtualSeparateLevel, arch) ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $transfer($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch)), style) end, ) push_epilogue!( @@ -163,7 +163,7 @@ function virtual_transfer_level(ctx, lvl::VirtualSeparateLevel, arch) $(lvl.val) = $val_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function declare_level!(ctx, lvl::VirtualSeparateLevel, pos, init) diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl index b9a8e9082..f598ab44a 100644 --- a/src/tensors/levels/shard_levels.jl +++ b/src/tensors/levels/shard_levels.jl @@ -28,7 +28,7 @@ end const Shard = ShardLevel function ShardLevel(device::Device, lvl::Lvl) where {Device,Lvl} - ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], transfer(lvl, device)) + ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], transfer(lvl, device), style) end #TODO scatterto? function ShardLevel{Device}( @@ -51,10 +51,10 @@ function postype(::Type{<:Shard{Device,Lvl,Ptr,Task,Val}}) where {Device,Lvl,Ptr postype(Lvl) end -function transfer(lvl::ShardLevel, device) - lvl_2 = transfer(lvl.lvl, device) - ptr_2 = transfer(lvl.ptr, device) - task_2 = transfer(lvl.task, device) +function transfer(lvl::ShardLevel, device, style) + lvl_2 = transfer(lvl.lvl, device, style) + ptr_2 = transfer(lvl.ptr, device, style) + task_2 = transfer(lvl.task, device, style) return ShardLevel(lvl_2, ptr_2, task_2, val_2) end @@ -212,13 +212,13 @@ virtual_level_size(ctx, lvl::VirtualShardLevel) = virtual_level_size(ctx, lvl.lv virtual_level_eltype(lvl::VirtualShardLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualShardLevel) = virtual_level_fill_value(lvl.lvl) -function virtual_transfer_level(ctx, lvl::VirtualShardLevel, arch) +function virtual_transfer_level(ctx, lvl::VirtualShardLevel, arch, style) val_2 = freshen(ctx, lvl.val) push_preamble!( ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $transfer($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch)), style) end, ) push_epilogue!( @@ -227,7 +227,7 @@ function virtual_transfer_level(ctx, lvl::VirtualShardLevel, arch) $(lvl.val) = $val_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function declare_level!(ctx, lvl::VirtualShardLevel, pos, init) diff --git a/src/tensors/levels/sparse_band_levels.jl b/src/tensors/levels/sparse_band_levels.jl index 194c86d26..db42b6195 100644 --- a/src/tensors/levels/sparse_band_levels.jl +++ b/src/tensors/levels/sparse_band_levels.jl @@ -36,10 +36,10 @@ function postype(::Type{SparseBandLevel{Ti,Idx,Ofs,Lvl}}) where {Ti,Idx,Ofs,Lvl} return postype(Lvl) end -function transfer(lvl::SparseBandLevel{Ti}, device) where {Ti} - lvl_2 = transfer(lvl.lvl, device) - idx_2 = transfer(lvl.idx, device) - ofs_2 = transfer(lvl.ofs, device) +function transfer(lvl::SparseBandLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + idx_2 = transfer(lvl.idx, device, style) + ofs_2 = transfer(lvl.ofs, device, style) return SparseBandLevel{Ti}(lvl_2, lvl.shape, idx_2, ofs_2) end @@ -241,7 +241,7 @@ end virtual_level_eltype(lvl::VirtualSparseBandLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualSparseBandLevel) = virtual_level_fill_value(lvl.lvl) -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel, arch, style) tbl_2 = freshen(ctx, lvl.tbl) ofs_2 = freshen(ctx, lvl.ofs) push_preamble!( @@ -249,8 +249,8 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLev quote $tbl_2 = $(lvl.tbl) $ofs_2 = $(lvl.ofs) - $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch))) - $(lvl.ofs) = $transfer($(lvl.ofs), $(ctx(arch))) + $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch)), style) + $(lvl.ofs) = $transfer($(lvl.ofs), $(ctx(arch)), style) end, ) push_epilogue!( @@ -260,7 +260,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLev $(lvl.ofs) = $ofs_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function declare_level!(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel, pos, init) diff --git a/src/tensors/levels/sparse_bytemap_levels.jl b/src/tensors/levels/sparse_bytemap_levels.jl index 6610f26b7..787d7fc3d 100644 --- a/src/tensors/levels/sparse_bytemap_levels.jl +++ b/src/tensors/levels/sparse_bytemap_levels.jl @@ -61,11 +61,11 @@ function postype(::Type{SparseByteMapLevel{Ti,Ptr,Tbl,Srt,Lvl}}) where {Ti,Ptr,T return postype(Lvl) end -function transfer(lvl::SparseByteMapLevel{Ti}, device) where {Ti} - lvl_2 = transfer(lvl.lvl, device) - ptr_2 = transfer(lvl.ptr, device) - tbl_2 = transfer(lvl.tbl, device) - srt_2 = transfer(lvl.srt, device) +function transfer(lvl::SparseByteMapLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + ptr_2 = transfer(lvl.ptr, device, style) + tbl_2 = transfer(lvl.tbl, device, style) + srt_2 = transfer(lvl.srt, device, style) return SparseByteMapLevel{Ti}(lvl_2, lvl.shape, ptr_2, tbl_2, srt_2) end @@ -239,7 +239,7 @@ function lower(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLevel, ::DefaultS end end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) tbl_2 = freshen(ctx, lvl.tbl) srt_2 = freshen(ctx, lvl.srt) @@ -249,9 +249,9 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMap $ptr_2 = $(lvl.ptr) $tbl_2 = $(lvl.tbl) $srt_2 = $(lvl.srt) - $(lvl.ptr) = transfer($(lvl.ptr), $(ctx(arch))) - $(lvl.tbl) = transfer($(lvl.tbl), $(ctx(arch))) - $(lvl.srt) = transfer($(lvl.srt), $(ctx(arch))) + $(lvl.ptr) = transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.tbl) = transfer($(lvl.tbl), $(ctx(arch)), style) + $(lvl.srt) = transfer($(lvl.srt), $(ctx(arch)), style) end, ) push_epilogue!( @@ -262,7 +262,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMap $(lvl.srt) = $srt_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end Base.summary(lvl::VirtualSparseByteMapLevel) = "SparseByteMap($(summary(lvl.lvl)))" diff --git a/src/tensors/levels/sparse_coo_levels.jl b/src/tensors/levels/sparse_coo_levels.jl index e04eadee3..de5235668 100644 --- a/src/tensors/levels/sparse_coo_levels.jl +++ b/src/tensors/levels/sparse_coo_levels.jl @@ -85,10 +85,10 @@ function postype(::Type{SparseCOOLevel{N,TI,Ptr,Tbl,Lvl}}) where {N,TI,Ptr,Tbl,L return postype(Lvl) end -function transfer(lvl::SparseCOOLevel{N,TI}, device) where {N,TI} - lvl_2 = transfer(lvl.lvl, device) - ptr_2 = transfer(lvl.ptr, device) - tbl_2 = ntuple(n -> transfer(lvl.tbl[n], device), N) +function transfer(lvl::SparseCOOLevel{N,TI}, device, style) where {N,TI} + lvl_2 = transfer(lvl.lvl, device, style) + ptr_2 = transfer(lvl.ptr, device, style) + tbl_2 = ntuple(n -> transfer(lvl.tbl[n], device), N, style) return SparseCOOLevel{N,TI}(lvl_2, lvl.shape, ptr_2, tbl_2) end @@ -344,13 +344,13 @@ function freeze_level!(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, pos_st return lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) push_preamble!( ctx, quote $ptr_2 = $(lvl.ptr) - $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) end, ) push_epilogue!( @@ -365,7 +365,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLeve ctx, quote $idx_2 = $idx - $idx = $transfer($idx, $(ctx(arch))) + $idx = $transfer($idx, $(ctx(arch)), style) end, ) push_epilogue!( @@ -376,7 +376,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLeve ) idx_2 end - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end struct SparseCOOWalkTraversal diff --git a/src/tensors/levels/sparse_dict_levels.jl b/src/tensors/levels/sparse_dict_levels.jl index 84c46d017..b2bd73893 100644 --- a/src/tensors/levels/sparse_dict_levels.jl +++ b/src/tensors/levels/sparse_dict_levels.jl @@ -93,12 +93,12 @@ end function transfer( lvl::SparseDictLevel{Ti,Ptr,Idx,Val,Tbl,Pool,Lvl}, Tm ) where {Ti,Ptr,Idx,Val,Tbl,Pool,Lvl} - lvl_2 = transfer(lvl.lvl, Tm) - ptr_2 = transfer(lvl.ptr, Tm) - idx_2 = transfer(lvl.idx, Tm) - val_2 = transfer(lvl.val, Tm) - tbl_2 = transfer(lvl.tbl, Tm) - pool_2 = transfer(lvl.pool, Tm) + lvl_2 = transfer(lvl.lvl, Tm, style) + ptr_2 = transfer(lvl.ptr, Tm, style) + idx_2 = transfer(lvl.idx, Tm, style) + val_2 = transfer(lvl.val, Tm, style) + tbl_2 = transfer(lvl.tbl, Tm, style) + pool_2 = transfer(lvl.pool, Tm, style) return SparseDictLevel{Ti}(lvl_2, lvl.shape, ptr_2, idx_2, val_2, tbl_2, pool_2) end @@ -385,7 +385,7 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel, pos_sto return lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) idx_2 = freshen(ctx, lvl.idx) tbl_2 = freshen(ctx, lvl.tbl_2) @@ -393,7 +393,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLev ctx, quote $tbl_2 = $(lvl.tbl) - $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch))) + $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch)), style) end, ) push_epilogue!( @@ -402,7 +402,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLev $(lvl.tbl) = $tbl_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function unfurl( diff --git a/src/tensors/levels/sparse_interval_levels.jl b/src/tensors/levels/sparse_interval_levels.jl index 9b57e6a84..1c6fce27c 100644 --- a/src/tensors/levels/sparse_interval_levels.jl +++ b/src/tensors/levels/sparse_interval_levels.jl @@ -63,10 +63,10 @@ function postype(::Type{SparseIntervalLevel{Ti,Left,Right,Lvl}}) where {Ti,Left, return postype(Lvl) end -function transfer(lvl::SparseIntervalLevel{Ti,Left,Right,Lvl}, Tm) where {Ti,Left,Right,Lvl} - lvl_2 = transfer(lvl.lvl, Tm) - left_2 = transfer(lvl.left, Tm) - right_2 = transfer(lvl.right, Tm) +function transfer(lvl::SparseIntervalLevel{Ti,Left,Right,Lvl}, Tm, style) where {Ti,Left,Right,Lvl} + lvl_2 = transfer(lvl.lvl, Tm, style) + left_2 = transfer(lvl.left, Tm, style) + right_2 = transfer(lvl.right, Tm, style) return SparseIntervalLevel{Ti}(lvl_2, lvl.shape, left_2, right_2) end diff --git a/src/tensors/levels/sparse_list_levels.jl b/src/tensors/levels/sparse_list_levels.jl index f79b15dac..c1cf11094 100644 --- a/src/tensors/levels/sparse_list_levels.jl +++ b/src/tensors/levels/sparse_list_levels.jl @@ -61,10 +61,10 @@ function postype(::Type{SparseListLevel{Ti,Ptr,Idx,Lvl}}) where {Ti,Ptr,Idx,Lvl} return postype(Lvl) end -function transfer(lvl::SparseListLevel{Ti,Ptr,Idx,Lvl}, Tm) where {Ti,Ptr,Idx,Lvl} - lvl_2 = transfer(lvl.lvl, Tm) - ptr_2 = transfer(lvl.ptr, Tm) - idx_2 = transfer(lvl.idx, Tm) +function transfer(lvl::SparseListLevel{Ti,Ptr,Idx,Lvl}, Tm, style) where {Ti,Ptr,Idx,Lvl} + lvl_2 = transfer(lvl.lvl, Tm, style) + ptr_2 = transfer(lvl.ptr, Tm, style) + idx_2 = transfer(lvl.idx, Tm, style) return SparseListLevel{Ti}(lvl_2, lvl.shape, ptr_2, idx_2) end @@ -313,7 +313,7 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualSparseListLevel, pos_sto return lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) idx_2 = freshen(ctx, lvl.idx) push_preamble!( @@ -321,8 +321,8 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseListLev quote $ptr_2 = $(lvl.ptr) $idx_2 = $(lvl.idx) - $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) - $(lvl.idx) = $transfer($(lvl.idx), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.idx) = $transfer($(lvl.idx), $(ctx(arch)), style) end, ) push_epilogue!( @@ -332,7 +332,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseListLev $(lvl.idx) = $idx_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function unfurl( diff --git a/src/tensors/levels/sparse_point_levels.jl b/src/tensors/levels/sparse_point_levels.jl index 31c002569..18d3e359b 100644 --- a/src/tensors/levels/sparse_point_levels.jl +++ b/src/tensors/levels/sparse_point_levels.jl @@ -55,9 +55,9 @@ function postype(::Type{SparsePointLevel{Ti,Idx,Lvl}}) where {Ti,Idx,Lvl} return postype(Lvl) end -function transfer(lvl::SparsePointLevel{Ti,Idx,Lvl}, Tm) where {Ti,Idx,Lvl} - lvl_2 = transfer(lvl.lvl, Tm) - idx_2 = transfer(lvl.idx, Tm) +function transfer(lvl::SparsePointLevel{Ti,Idx,Lvl}, Tm, style) where {Ti,Idx,Lvl} + lvl_2 = transfer(lvl.lvl, Tm, style) + idx_2 = transfer(lvl.idx, Tm, style) return SparsePointLevel{Ti}(lvl_2, lvl.shape, idx_2) end @@ -254,14 +254,14 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualSparsePointLevel, pos_st return lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparsePointLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparsePointLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) idx_2 = freshen(ctx, lvl.idx) push_preamble!( ctx, quote $idx_2 = $(lvl.idx) - $(lvl.idx) = $transfer($(lvl.idx), $(ctx(arch))) + $(lvl.idx) = $transfer($(lvl.idx), $(ctx(arch)), style) end, ) push_epilogue!( @@ -270,7 +270,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparsePointLe $(lvl.idx) = $idx_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function unfurl( diff --git a/src/tensors/levels/sparse_rle_levels.jl b/src/tensors/levels/sparse_rle_levels.jl index 7e74938a1..b250aa3f4 100644 --- a/src/tensors/levels/sparse_rle_levels.jl +++ b/src/tensors/levels/sparse_rle_levels.jl @@ -72,12 +72,12 @@ function postype( return postype(Lvl) end -function transfer(lvl::SparseRunListLevel{Ti}, device) where {Ti} - lvl_2 = transfer(lvl.lvl, device) - ptr = transfer(lvl.ptr, device) - left = transfer(lvl.left, device) - right = transfer(lvl.right, device) - buf = transfer(lvl.buf, device) +function transfer(lvl::SparseRunListLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + ptr = transfer(lvl.ptr, device, style) + left = transfer(lvl.left, device, style) + right = transfer(lvl.right, device, style) + buf = transfer(lvl.buf, device, style) return SparseRunListLevel{Ti}( lvl_2, lvl.shape, lvl.ptr, lvl.left, lvl.right, lvl.buf; merge=getmerge(lvl) ) @@ -303,7 +303,7 @@ function virtual_level_resize!(ctx, lvl::VirtualSparseRunListLevel, dims...) lvl end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) left_2 = freshen(ctx, lvl.left) right_2 = freshen(ctx, lvl.right) @@ -313,9 +313,9 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseRunList $ptr_2 = $(lvl.ptr) $left_2 = $(lvl.left) $right_2 = $(lvl.right) - $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) - $(lvl.left) = $transfer($(lvl.left), $(ctx(arch))) - $(lvl.right) = $transfer($(lvl.right), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.left) = $transfer($(lvl.left), $(ctx(arch)), style) + $(lvl.right) = $transfer($(lvl.right), $(ctx(arch)), style) end, ) push_epilogue!( @@ -326,8 +326,8 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseRunList $(lvl.right) = $right_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) - virtual_transfer_level(ctx, lvl.buf, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) + virtual_transfer_level(ctx, lvl.buf, arch, style) end virtual_level_eltype(lvl::VirtualSparseRunListLevel) = virtual_level_eltype(lvl.lvl) diff --git a/src/tensors/levels/sparse_vbl_levels.jl b/src/tensors/levels/sparse_vbl_levels.jl index a26e26a11..e9080aec5 100644 --- a/src/tensors/levels/sparse_vbl_levels.jl +++ b/src/tensors/levels/sparse_vbl_levels.jl @@ -54,11 +54,11 @@ function postype( return postype(Lvl) end -function transfer(lvl::SparseBlockListLevel{Ti}, device) where {Ti} - lvl_2 = transfer(lvl.lvl, device) - ptr_2 = transfer(lvl.ptr, device) - idx_2 = transfer(lvl.idx, device) - ofs_2 = transfer(lvl.ofs, device) +function transfer(lvl::SparseBlockListLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + ptr_2 = transfer(lvl.ptr, device, style) + idx_2 = transfer(lvl.idx, device, style) + ofs_2 = transfer(lvl.ofs, device, style) return SparseBlockListLevel{Ti}(lvl_2, lvl.shape, ptr_2, idx_2, ofs_2) end @@ -289,7 +289,7 @@ function virtual_level_fill_value(lvl::VirtualSparseBlockListLevel) virtual_level_fill_value(lvl.lvl) end -function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockListLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) tbl_2 = freshen(ctx, lvl.tbl) ofs_2 = freshen(ctx, lvl.ofs) @@ -299,9 +299,9 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockLi $ptr_2 = $(lvl.ptr) $tbl_2 = $(lvl.tbl) $ofs_2 = $(lvl.ofs) - $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch))) - $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch))) - $(lvl.ofs) = $transfer($(lvl.ofs), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch)), style) + $(lvl.ofs) = $transfer($(lvl.ofs), $(ctx(arch)), style) end, ) push_epilogue!( @@ -312,7 +312,7 @@ function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockLi $(lvl.ofs) = $ofs_2 end, ) - virtual_transfer_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function declare_level!(ctx::AbstractCompiler, lvl::VirtualSparseBlockListLevel, pos, init) diff --git a/src/tensors/scalars.jl b/src/tensors/scalars.jl index cba9db56b..5990032f4 100644 --- a/src/tensors/scalars.jl +++ b/src/tensors/scalars.jl @@ -41,7 +41,7 @@ function virtualize(ctx, ex, ::Type{Scalar{Vf,Tv}}, tag) where {Vf,Tv} VirtualScalar(sym, Tv, Vf, tag, val) end -virtual_transfer(ctx, lvl::VirtualScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualScalar, arch, style) = lvl virtual_size(ctx, ::VirtualScalar) = () @@ -147,7 +147,7 @@ virtual_size(ctx, ::VirtualSparseScalar) = () virtual_fill_value(ctx, tns::VirtualSparseScalar) = tns.Vf virtual_eltype(tns::VirtualSparseScalar, ctx) = tns.Tv -virtual_transfer(ctx, lvl::VirtualSparseScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualSparseScalar, arch, style) = lvl function declare!(ctx, tns::VirtualSparseScalar, init) push_preamble!( @@ -289,7 +289,7 @@ function lower_assign(ctx, tns::VirtualShortCircuitScalar, mode, op, rhs) :($(tns.val) = $lhs_2) end -virtual_transfer(ctx, lvl::VirtualShortCircuitScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualShortCircuitScalar, arch, style) = lvl function short_circuit_cases(ctx, tns::VirtualShortCircuitScalar, op) [ @@ -359,7 +359,7 @@ virtual_size(ctx, ::VirtualSparseShortCircuitScalar) = () virtual_fill_value(ctx, tns::VirtualSparseShortCircuitScalar) = tns.Vf virtual_eltype(tns::VirtualSparseShortCircuitScalar, ctx) = tns.Tv -virtual_transfer(ctx, lvl::VirtualSparseShortCircuitScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualSparseShortCircuitScalar, arch, style) = lvl function declare!(ctx, tns::VirtualSparseShortCircuitScalar, init) push_preamble!( diff --git a/src/util/vectors.jl b/src/util/vectors.jl index 20eec0920..975bc6117 100644 --- a/src/util/vectors.jl +++ b/src/util/vectors.jl @@ -31,8 +31,8 @@ Base.size(vec::PlusOneVector{T}) where {T} = size(vec.data) Base.axes(vec::PlusOneVector{T}) where {T} = axes(vec.data) Base.resize!(vec::PlusOneVector{T}, dim) where {T} = resize!(vec.data, dim) -function transfer(vec::PlusOneVector{T}, device) where {T} - data = transfer(vec.data, device) +function transfer(vec::PlusOneVector{T}, device, style) where {T} + data = transfer(vec.data, device, style) return PlusOneVector{T}(data) end @@ -77,8 +77,8 @@ Base.size(vec::MinusEpsVector{T}) where {T} = size(vec.data) Base.axes(vec::MinusEpsVector{T}) where {T} = axes(vec.data) Base.resize!(vec::MinusEpsVector{T}, dim) where {T} = resize!(vec.data, dim) -function transfer(vec::MinusEpsVector{T}, device) where {T} - data = transfer(vec.data, device) +function transfer(vec::MinusEpsVector{T}, device, style) where {T} + data = transfer(vec.data, device, style) return MinusEpsVector{T}(data) end @@ -123,7 +123,7 @@ Base.size(vec::PlusEpsVector{T}) where {T} = size(vec.data) Base.axes(vec::PlusEpsVector{T}) where {T} = axes(vec.data) Base.resize!(vec::PlusEpsVector{T}, dim) where {T} = resize!(vec.data, dim) -function transfer(vec::PlusEpsVector{T}, device) where {T} - data = transfer(vec.data, device) +function transfer(vec::PlusEpsVector{T}, device, style) where {T} + data = transfer(vec.data, device, style) return PlusEpsVector{T}(data) end diff --git a/test/reference32/parallel/atomics_sym_spmv.txt b/test/reference32/parallel/atomics_sym_spmv.txt index 6b2673ba8..404376299 100644 --- a/test/reference32/parallel/atomics_sym_spmv.txt +++ b/test/reference32/parallel/atomics_sym_spmv.txt @@ -23,15 +23,15 @@ begin end Finch.resize_if_smaller!(y_lvl_2_val, x_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0.0, 1, x_lvl.shape) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) locksArray = y_lvl_locks - y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val_3 = y_lvl_2_val - y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) - diag_lvl_val = (Finch).transfer(diag_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) + diag_lvl_val = (Finch).transfer(diag_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_blur.jl b/test/reference32/parallel/parallel_blur.jl index 6ccfada4f..6c7d8fb41 100644 --- a/test/reference32/parallel/parallel_blur.jl +++ b/test/reference32/parallel/parallel_blur.jl @@ -20,14 +20,14 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu, style) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu, style) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial()), style) res_6 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference32/parallel/parallel_blur_sparse.jl b/test/reference32/parallel/parallel_blur_sparse.jl index af13ee215..4e13fffba 100644 --- a/test/reference32/parallel/parallel_blur_sparse.jl +++ b/test/reference32/parallel/parallel_blur_sparse.jl @@ -22,16 +22,16 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_ptr = (Finch).transfer(input_lvl_ptr, cpu) - input_lvl_idx = (Finch).transfer(input_lvl_idx, cpu) - input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu) + input_lvl_ptr = (Finch).transfer(input_lvl_ptr, cpu, style) + input_lvl_idx = (Finch).transfer(input_lvl_idx, cpu, style) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu, style) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu, style) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial()), style) res_71 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_1.txt b/test/reference32/parallel/parallel_spmms_no_atomics_1.txt index 15b897b83..f746834f0 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_1.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_1.txt @@ -20,15 +20,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_2.txt b/test/reference32/parallel/parallel_spmms_no_atomics_2.txt index ddd615e2b..1adeb98b6 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_2.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_2.txt @@ -21,23 +21,23 @@ begin Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) for i_4 = 1:A_lvl.shape[1] val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_ptr_2 = B_lvl_ptr - B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_2 = B_lvl_tbl2 - B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) val_2 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_3 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_3.txt b/test/reference32/parallel/parallel_spmms_no_atomics_3.txt index e406623d2..5bd9abec3 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_3.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_3.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_4.txt b/test/reference32/parallel/parallel_spmms_no_atomics_4.txt index f5f13f0df..2d6bd4cce 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_4.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_4.txt @@ -54,20 +54,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_2 = B_lvl_tbl2 val_2 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_3 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_9 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -186,20 +186,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_18 val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_3 = B_lvl_tbl2 val_5 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_6 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_19 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_5.txt b/test/reference32/parallel/parallel_spmms_no_atomics_5.txt index 9ee761e3e..b9b21407f 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_5.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_5.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -68,21 +68,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_5 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) B_lvl_ptr_3 = B_lvl_ptr B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_3 = B_lvl_tbl2 val_6 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_10 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -202,21 +202,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_19 val_7 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) A_lvl_ptr_4 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_4 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_4 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_8 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) B_lvl_ptr_4 = B_lvl_ptr B_lvl_tbl1_4 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_4 = B_lvl_tbl2 val_9 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_20 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmv.txt b/test/reference32/parallel/parallel_spmv.txt index 72aadc0b9..2c0b26d0d 100644 --- a/test/reference32/parallel/parallel_spmv.txt +++ b/test/reference32/parallel/parallel_spmv.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl.shape) val = y_lvl_val - y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmv_atomic.txt b/test/reference32/parallel/parallel_spmv_atomic.txt index 8e0c41443..88413dcc0 100644 --- a/test/reference32/parallel/parallel_spmv_atomic.txt +++ b/test/reference32/parallel/parallel_spmv_atomic.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl_2.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl_2.shape) val = y_lvl_val - y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmv_atomics.txt b/test/reference32/parallel/parallel_spmv_atomics.txt index 8afb8fecd..dc4cbf25e 100644 --- a/test/reference32/parallel/parallel_spmv_atomics.txt +++ b/test/reference32/parallel/parallel_spmv_atomics.txt @@ -27,13 +27,13 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, A_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, A_lvl.shape) locksArray = y_lvl_locks - y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val = y_lvl_2_val - y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/stress_dense_atomics.txt b/test/reference32/parallel/stress_dense_atomics.txt index 93641ccd2..994f11179 100644 --- a/test/reference32/parallel/stress_dense_atomics.txt +++ b/test/reference32/parallel/stress_dense_atomics.txt @@ -30,11 +30,11 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, y_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, y_lvl.shape) resize!(x_lvl_val, x_lvl.shape) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) locksArray = y_lvl_locks - y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val_2 = y_lvl_2_val - y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) Threads.@threads for i = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/atomics_sym_spmv.txt b/test/reference64/parallel/atomics_sym_spmv.txt index 7fa7f8f02..22059b911 100644 --- a/test/reference64/parallel/atomics_sym_spmv.txt +++ b/test/reference64/parallel/atomics_sym_spmv.txt @@ -23,15 +23,15 @@ begin end Finch.resize_if_smaller!(y_lvl_2_val, x_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0.0, 1, x_lvl.shape) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) locksArray = y_lvl_locks - y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val_3 = y_lvl_2_val - y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) - diag_lvl_val = (Finch).transfer(diag_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) + diag_lvl_val = (Finch).transfer(diag_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_blur.jl b/test/reference64/parallel/parallel_blur.jl index fec0d205b..81015e06b 100644 --- a/test/reference64/parallel/parallel_blur.jl +++ b/test/reference64/parallel/parallel_blur.jl @@ -20,14 +20,14 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu, style) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu, style) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial()), style) res_6 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference64/parallel/parallel_blur_sparse.jl b/test/reference64/parallel/parallel_blur_sparse.jl index 8ac6ac88b..c8b921542 100644 --- a/test/reference64/parallel/parallel_blur_sparse.jl +++ b/test/reference64/parallel/parallel_blur_sparse.jl @@ -22,16 +22,16 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_ptr = (Finch).transfer(input_lvl_ptr, cpu) - input_lvl_idx = (Finch).transfer(input_lvl_idx, cpu) - input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu) + input_lvl_ptr = (Finch).transfer(input_lvl_ptr, cpu, style) + input_lvl_idx = (Finch).transfer(input_lvl_idx, cpu, style) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu, style) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu, style) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial()), style) res_71 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_1.txt b/test/reference64/parallel/parallel_spmms_no_atomics_1.txt index 9708e569c..5d388fa3d 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_1.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_1.txt @@ -20,15 +20,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_2.txt b/test/reference64/parallel/parallel_spmms_no_atomics_2.txt index 24e3c3487..e5eca22dc 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_2.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_2.txt @@ -21,23 +21,23 @@ begin Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) for i_4 = 1:A_lvl.shape[1] val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_ptr_2 = B_lvl_ptr - B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_2 = B_lvl_tbl2 - B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) val_2 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_3 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_3.txt b/test/reference64/parallel/parallel_spmms_no_atomics_3.txt index 13525bf1d..12ae422f7 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_3.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_3.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_4.txt b/test/reference64/parallel/parallel_spmms_no_atomics_4.txt index b725d5fc9..1676382bc 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_4.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_4.txt @@ -54,20 +54,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_2 = B_lvl_tbl2 val_2 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_3 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_9 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -186,20 +186,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_18 val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_3 = B_lvl_tbl2 val_5 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_6 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_19 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_5.txt b/test/reference64/parallel/parallel_spmms_no_atomics_5.txt index f3a821b23..e7b18e235 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_5.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_5.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -68,21 +68,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_5 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) B_lvl_ptr_3 = B_lvl_ptr B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_3 = B_lvl_tbl2 val_6 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_10 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -202,21 +202,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_19 val_7 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) A_lvl_ptr_4 = A_lvl_ptr - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_4 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_4 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_8 = A_lvl_val - A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) B_lvl_ptr_4 = B_lvl_ptr B_lvl_tbl1_4 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_4 = B_lvl_tbl2 val_9 = B_lvl_val - B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_20 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmv.txt b/test/reference64/parallel/parallel_spmv.txt index 4d2ab5c94..54f2b6ab4 100644 --- a/test/reference64/parallel/parallel_spmv.txt +++ b/test/reference64/parallel/parallel_spmv.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl.shape) val = y_lvl_val - y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmv_atomic.txt b/test/reference64/parallel/parallel_spmv_atomic.txt index 9b5fe2128..4b945c401 100644 --- a/test/reference64/parallel/parallel_spmv_atomic.txt +++ b/test/reference64/parallel/parallel_spmv_atomic.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl_2.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl_2.shape) val = y_lvl_val - y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmv_atomics.txt b/test/reference64/parallel/parallel_spmv_atomics.txt index e433a2e28..5d04f2e3a 100644 --- a/test/reference64/parallel/parallel_spmv_atomics.txt +++ b/test/reference64/parallel/parallel_spmv_atomics.txt @@ -27,13 +27,13 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, A_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, A_lvl.shape) locksArray = y_lvl_locks - y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val = y_lvl_2_val - y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/stress_dense_atomics.txt b/test/reference64/parallel/stress_dense_atomics.txt index 855608a60..71792f516 100644 --- a/test/reference64/parallel/stress_dense_atomics.txt +++ b/test/reference64/parallel/stress_dense_atomics.txt @@ -30,11 +30,11 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, y_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, y_lvl.shape) resize!(x_lvl_val, x_lvl.shape) - x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) locksArray = y_lvl_locks - y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val_2 = y_lvl_2_val - y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) Threads.@threads for i = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/suites/parallel_tests.jl b/test/suites/parallel_tests.jl index 2f6e05b4d..5e0388407 100644 --- a/test/suites/parallel_tests.jl +++ b/test/suites/parallel_tests.jl @@ -554,7 +554,7 @@ input = Tensor(Dense(Dense(Element(0.0)))) output = Tensor(Dense(Dense(Element(0.0)))) cpu = CPU(Threads.nthreads()) - tmp = transfer(Tensor(Dense(Element(0))), CPULocalMemory(cpu)) + tmp = transfer(Tensor(Dense(Element(0))), CPULocalMemory(cpu), style) check_output( "parallel/parallel_blur.jl", @@ -579,7 +579,7 @@ input = Tensor(Dense(SparseList(Element(0.0)))) output = Tensor(Dense(Dense(Element(0.0)))) cpu = CPU(Threads.nthreads()) - tmp = transfer(Tensor(Dense(Element(0))), CPULocalMemory(cpu)) + tmp = transfer(Tensor(Dense(Element(0))), CPULocalMemory(cpu), style) check_output( "parallel/parallel_blur_sparse.jl", From f7014730a91665c193503458beec4a251500787b Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Thu, 6 Feb 2025 11:00:16 -0500 Subject: [PATCH 23/25] rm shards --- src/Finch.jl | 2 - src/tensors/levels/shard_levels.jl | 371 ----------------------------- 2 files changed, 373 deletions(-) delete mode 100644 src/tensors/levels/shard_levels.jl diff --git a/src/Finch.jl b/src/Finch.jl index ccd785c4e..7cb664d20 100644 --- a/src/Finch.jl +++ b/src/Finch.jl @@ -41,7 +41,6 @@ export Dense, DenseLevel export Element, ElementLevel export AtomicElement, AtomicElementLevel export Separate, SeparateLevel -export Shard, ShardLevel export Mutex, MutexLevel export Pattern, PatternLevel export Scalar, SparseScalar, ShortCircuitScalar, SparseShortCircuitScalar @@ -142,7 +141,6 @@ include("tensors/levels/dense_rle_levels.jl") include("tensors/levels/element_levels.jl") include("tensors/levels/atomic_element_levels.jl") include("tensors/levels/separate_levels.jl") -include("tensors/levels/shard_levels.jl") include("tensors/levels/mutex_levels.jl") include("tensors/levels/pattern_levels.jl") include("tensors/masks.jl") diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl deleted file mode 100644 index f598ab44a..000000000 --- a/src/tensors/levels/shard_levels.jl +++ /dev/null @@ -1,371 +0,0 @@ -""" - ShardLevel{Lvl, [Val]}() - -Each subfiber of a Shard level is stored in a thread-local tensor of type -`Lvl`, in a thread-local memory space. - -Each sublevel is stored in a vector of type `Val` with `eltype(Val) = Lvl`. - -```jldoctest -julia> tensor_tree(Tensor(Dense(Shard(Element(0.0))), [1, 2, 3])) -3-Tensor -└─ Dense [1:3] - ├─ [1]: Shard -> - │ └─ 1.0 - ├─ [2]: Shard -> - │ └─ 2.0 - └─ [3]: Shard -> - └─ 3.0 -``` -""" -struct ShardLevel{Device,Lvl,Ptr,Task,Val} <: AbstractLevel - device::Device - lvl::Lvl - ptr::Ptr - task::Task - val::Val -end -const Shard = ShardLevel - -function ShardLevel(device::Device, lvl::Lvl) where {Device,Lvl} - ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], transfer(lvl, device), style) -end #TODO scatterto? - -function ShardLevel{Device}( - device, lvl::Lvl, ptr::Ptr, task::Task, val::Val -) where {Device,Lvl,Ptr,Task,Val} - ShardLevel{Device,Lvl,Ptr,Task,Val}(device, lvl, ptr, task, val) -end - -function Base.summary(::Shard{Device,Lvl,Ptr,Task,Val}) where {Device,Lvl,Ptr,Task,Val} - "Shard($(Lvl))" -end - -function similar_level( - lvl::Shard{Device,Lvl,Ptr,Task,Val}, fill_value, eltype::Type, dims... -) where {Device,Lvl,Ptr,Task,Val} - ShardLevel(lvl.device, similar_level(lvl.lvl, fill_value, eltype, dims...)) -end - -function postype(::Type{<:Shard{Device,Lvl,Ptr,Task,Val}}) where {Device,Lvl,Ptr,Task,Val} - postype(Lvl) -end - -function transfer(lvl::ShardLevel, device, style) - lvl_2 = transfer(lvl.lvl, device, style) - ptr_2 = transfer(lvl.ptr, device, style) - task_2 = transfer(lvl.task, device, style) - return ShardLevel(lvl_2, ptr_2, task_2, val_2) -end - -function pattern!(lvl::ShardLevel) - ShardLevel(pattern!(lvl.lvl), lvl.ptr, lvl.task, map(pattern!, lvl.val)) -end -function set_fill_value!(lvl::ShardLevel, init) - ShardLevel( - set_fill_value!(lvl.lvl, init), - lvl.ptr, - lvl.task, - map(lvl_2 -> set_fill_value!(lvl_2, init), lvl.val), - ) -end -function Base.resize!(lvl::ShardLevel, dims...) - ShardLevel( - resize!(lvl.lvl, dims...), - lvl.ptr, - lvl.task, - map(lvl_2 -> resize!(lvl_2, dims...), lvl.val), - ) -end - -function Base.show( - io::IO, lvl::ShardLevel{Device,Lvl,Ptr,Task,Val} -) where {Device,Lvl,Ptr,Task,Val} - print(io, "Shard(") - if get(io, :compact, false) - print(io, "…") - else - show(io, lvl.lvl) - print(io, ", ") - show(io, lvl.ptr) - print(io, ", ") - show(io, lvl.task) - print(io, ", ") - show(io, lvl.val) - end - print(io, ")") -end - -function labelled_show(io::IO, fbr::SubFiber{<:ShardLevel}) - (lvl, pos) = (fbr.lvl, fbr.pos) - print(io, "shard($(lvl.task[pos])) -> ") -end - -function labelled_children(fbr::SubFiber{<:ShardLevel}) - lvl = fbr.lvl - pos = fbr.pos - pos > length(lvl.val) && return [] - [LabelledTree(SubFiber(lvl.val[lvl.task[pos]], lvl.ptr[pos]))] -end - -@inline level_ndims( - ::Type{<:ShardLevel{Device,Lvl,Ptr,Task,Val}} -) where {Device,Lvl,Ptr,Task,Val} = level_ndims(Lvl) -@inline level_size( - lvl::ShardLevel{Device,Lvl,Ptr,Task,Val} -) where {Device,Lvl,Ptr,Task,Val} = level_size(lvl.lvl) -@inline level_axes( - lvl::ShardLevel{Device,Lvl,Ptr,Task,Val} -) where {Device,Lvl,Ptr,Task,Val} = level_axes(lvl.lvl) -@inline level_eltype( - ::Type{ShardLevel{Device,Lvl,Ptr,Task,Val}} -) where {Device,Lvl,Ptr,Task,Val} = level_eltype(Lvl) -@inline level_fill_value( - ::Type{<:ShardLevel{Device,Lvl,Ptr,Task,Val}} -) where {Device,Lvl,Ptr,Task,Val} = level_fill_value(Lvl) - -function (fbr::SubFiber{<:ShardLevel})(idxs...) - q = fbr.pos - return SubFiber(fbr.lvl.val[q], 1)(idxs...) -end - -countstored_level(lvl::ShardLevel, pos) = pos - -mutable struct VirtualShardLevel <: AbstractVirtualLevel - device - lvl # stand-in for the sublevel for virtual resize, etc. - ex - ptr - task - val - Tv - Device - Lvl - Ptr - Task - Val -end - -postype(lvl::VirtualShardLevel) = postype(lvl.lvl) - -function is_level_injective(ctx, lvl::VirtualShardLevel) - [is_level_injective(ctx, lvl.lvl)..., true] -end -function is_level_atomic(ctx, lvl::VirtualShardLevel) - (below, atomic) = is_level_atomic(ctx, lvl.lvl) - return ([below; [atomic]], atomic) -end -function is_level_concurrent(ctx, lvl::VirtualShardLevel) - (data, _) = is_level_concurrent(ctx, lvl.lvl) - return (data, true) -end - -function lower(ctx::AbstractCompiler, lvl::VirtualShardLevel, ::DefaultStyle) - quote - $ShardLevel{$(lvl.Lvl),$(lvl.Ptr),$(lvl.Task),$(lvl.Val)}( - $(ctx(lvl.lvl)), $(lvl.val) - ) - end -end - -function virtualize( - ctx, ex, ::Type{ShardLevel{Device,Lvl,Ptr,Task,Val}}, tag=:lvl -) where {Device,Lvl,Ptr,Task,Val} - sym = freshen(ctx, tag) - ptr = freshen(ctx, tag, :_ptr) - task = freshen(ctx, tag, :_task) - val = freshen(ctx, tag, :_val) - - push_preamble!( - ctx, - quote - $sym = $ex - $ptr = $ex.ptr - $task = $ex.task - $val = $ex.val - end, - ) - device_2 = virtualize(ctx, :($ex.device), Device, sym) - lvl_2 = virtualize(ctx, :($ex.lvl), Lvl, sym) - VirtualShardLevel( - device_2, - lvl_2, - sym, - ptr, - task, - val, - typeof(level_fill_value(Lvl)), - Device, - Lvl, - Ptr, - Task, - Val, - ) -end - -Base.summary(lvl::VirtualShardLevel) = "Shard($(lvl.Lvl))" - -function virtual_level_resize!(ctx, lvl::VirtualShardLevel, dims...) - (lvl.lvl = virtual_level_resize!(ctx, lvl.lvl, dims...); lvl) -end -virtual_level_size(ctx, lvl::VirtualShardLevel) = virtual_level_size(ctx, lvl.lvl) -virtual_level_eltype(lvl::VirtualShardLevel) = virtual_level_eltype(lvl.lvl) -virtual_level_fill_value(lvl::VirtualShardLevel) = virtual_level_fill_value(lvl.lvl) - -function virtual_transfer_level(ctx, lvl::VirtualShardLevel, arch, style) - val_2 = freshen(ctx, lvl.val) - push_preamble!( - ctx, - quote - $val_2 = $(lvl.val) - $(lvl.val) = $transfer($(lvl.val), $(ctx(arch)), style) - end, - ) - push_epilogue!( - ctx, - quote - $(lvl.val) = $val_2 - end, - ) - virtual_transfer_level(ctx, lvl.lvl, arch, style) -end - -function declare_level!(ctx, lvl::VirtualShardLevel, pos, init) - push_preamble!(ctx, - virtual_parallel_region(ctx, lvl.device) do ctx_2 - lvl_2 = virtualize( - ctx_2, :($(lvl.ex).val[$(ctx_2(get_task_num(get_task(ctx_2))))]), lvl.Lvl - ) #TODO should this virtualize the eltype of Val? - declare_level!(ctx_2, lvl_2, literal(1), init) - end, - ) - lvl -end - -""" -assemble: - mapping is pos -> task, ptr. task says which task has it, ptr says which position in that task has it. - -read: - read from pos to task, ptr. simple. - -write: - allocate something for this task on that position, assemble on the task itself on demand. Complain if the task is wrong. - -The outer level needs to be concurrent, like denselevel. -""" -function assemble_level!(ctx, lvl::VirtualShardLevel, pos_start, pos_stop) - pos_start = cache!(ctx, :pos_start, simplify(ctx, pos_start)) - pos_stop = cache!(ctx, :pos_stop, simplify(ctx, pos_stop)) - pos = freshen(ctx, :pos) - sym = freshen(ctx, :pointer_to_lvl) - push_preamble!( - ctx, - quote - Finch.resize_if_smaller!($(lvl.task), $(ctx(pos_stop))) - Finch.resize_if_smaller!($(lvl.ptr), $(ctx(pos_stop))) - Finch.fill_range!($(lvl.task), $(ctx(pos_start)), $(ctx(pos_stop)), 0) - end, - ) - lvl -end - -supports_reassembly(::VirtualShardLevel) = false - -""" -these two are no-ops, we insteaed do these on instantiate -""" -function freeze_level!(ctx, lvl::VirtualShardLevel, pos) - return lvl -end - -function thaw_level!(ctx::AbstractCompiler, lvl::VirtualShardLevel, pos) - return lvl -end - -function instantiate(ctx, fbr::VirtualSubFiber{VirtualShardLevel}, mode) - if mode.kind === reader - (lvl, pos) = (fbr.lvl, fbr.pos) - tag = lvl.ex - isnulltest = freshen(ctx, tag, :_nulltest) - Vf = level_fill_value(lvl.Lvl) - sym = freshen(ctx, :pointer_to_lvl) - val = freshen(ctx, lvl.ex, :_val) - return Thunk(; - body=(ctx) -> begin - lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) - instantiate(ctx, VirtualSubFiber(lvl_2, literal(1)), mode) - end, - ) - else - (lvl, pos) = (fbr.lvl, fbr.pos) - tag = lvl.ex - sym = freshen(ctx, :pointer_to_lvl) - - return Thunk(; - body=(ctx) -> begin - lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) - lvl_2 = thaw_level!(ctx, lvl_2, literal(1)) - push_preamble!(ctx, assemble_level!(ctx, lvl_2, literal(1), literal(1))) - res = instantiate(ctx, VirtualSubFiber(lvl_2, literal(1)), mode) - push_epilogue!(ctx, - contain(ctx) do ctx_2 - lvl_2 = freeze_level!(ctx_2, lvl_2, literal(1)) - :($(lvl.val)[$(ctx_2(pos))] = $(ctx_2(lvl_2))) - end, - ) - res - end, - ) - end -end - -#we need some sort of localization step at the start of a parallel region whereby we can thaw the shart level - -""" -assemble: - mapping is pos -> task, ptr. task says which task has it, ptr says which position in that task has it. - -read: - read from pos to task, ptr. simple. - -write: - allocate something for this task on that position, assemble on the task itself on demand. Complain if the task is wrong. - -The outer level needs to be concurrent, like denselevel. -""" -function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardLevel}, mode) - @assert mode.kind === updater - (lvl, pos) = (fbr.lvl, fbr.pos) - tag = lvl.ex - sym = freshen(ctx, :pointer_to_lvl) - - task = freshen(ctx, tag, :_task) - - return Thunk(; - preamble = quote - $task = $(lvl.task)[$(ctx(pos))] - if task == 0 - $(lvl.task)[$(ctx(pos))] = $(gettasknum(ctx)) - qos = local_qos_fill - if $(lvl.local_qos_fill) > $(lvl.local_qos_stop) - $local_qos_stop = max($local_qos_stop << 1, 1) - $(contain(ctx_2 -> assemble_level!(ctx_2, lvl.lvl, value(qos_fill, Tp), value(qos_stop, Tp)), ctx)) - end - else - qos = $(lvl.ptr)[$(ctx(pos))] - qos_stop = $(lvl.local_qos_stop) - #only in safe mode, we check if task == $(gettasknum(ctx)) and if not error("Task mismatch in ShardLevel") - end - dirty = true - end, - body = (ctx) -> VirtualHollowSubFiber(lvl.lvl, value(qos), dirty), - epilogue = quote - #this task will always own this position forever, even if we don't write to it. Still, we try to be conservative of memory usage of the underlying level. - if dirty && $(lvl.ptr)[$(ctx(pos))] == 0 - local_qos_fill += 1 - $(lvl.ptr)[$(ctx(pos))] = $(lvl.local_qos_fill) += 1 - end - end, - ) -end From a8557547fe218b1f8b159dafc41426af81298c2c Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Thu, 6 Feb 2025 11:01:18 -0500 Subject: [PATCH 24/25] keep commit --- src/Finch.jl | 4 ++-- src/tensors/levels/shard_levels.jl | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Finch.jl b/src/Finch.jl index ccd785c4e..f416947da 100644 --- a/src/Finch.jl +++ b/src/Finch.jl @@ -41,7 +41,7 @@ export Dense, DenseLevel export Element, ElementLevel export AtomicElement, AtomicElementLevel export Separate, SeparateLevel -export Shard, ShardLevel +export Shard, ShardLevel #keep export Mutex, MutexLevel export Pattern, PatternLevel export Scalar, SparseScalar, ShortCircuitScalar, SparseShortCircuitScalar @@ -142,7 +142,7 @@ include("tensors/levels/dense_rle_levels.jl") include("tensors/levels/element_levels.jl") include("tensors/levels/atomic_element_levels.jl") include("tensors/levels/separate_levels.jl") -include("tensors/levels/shard_levels.jl") +include("tensors/levels/shard_levels.jl") #keep include("tensors/levels/mutex_levels.jl") include("tensors/levels/pattern_levels.jl") include("tensors/masks.jl") diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl index f598ab44a..e4e404d48 100644 --- a/src/tensors/levels/shard_levels.jl +++ b/src/tensors/levels/shard_levels.jl @@ -26,7 +26,7 @@ struct ShardLevel{Device,Lvl,Ptr,Task,Val} <: AbstractLevel val::Val end const Shard = ShardLevel - + #keep function ShardLevel(device::Device, lvl::Lvl) where {Device,Lvl} ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], transfer(lvl, device), style) end #TODO scatterto? From 6aacac4b217fd2a810732f1636abc70b2a820b2f Mon Sep 17 00:00:00 2001 From: Willow Ahrens Date: Thu, 6 Feb 2025 11:02:36 -0500 Subject: [PATCH 25/25] fix --- src/tensors/levels/shard_levels.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl index e4e404d48..79cf9c6c0 100644 --- a/src/tensors/levels/shard_levels.jl +++ b/src/tensors/levels/shard_levels.jl @@ -26,7 +26,6 @@ struct ShardLevel{Device,Lvl,Ptr,Task,Val} <: AbstractLevel val::Val end const Shard = ShardLevel - #keep function ShardLevel(device::Device, lvl::Lvl) where {Device,Lvl} ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], transfer(lvl, device), style) end #TODO scatterto?