diff --git a/.github/workflows/StyleBot.yml b/.github/workflows/StyleBot.yml deleted file mode 100644 index fc13c6fae..000000000 --- a/.github/workflows/StyleBot.yml +++ /dev/null @@ -1,10 +0,0 @@ -name: Style Review -on: - pull_request: -jobs: - code-style: - runs-on: ubuntu-latest - steps: - - uses: julia-actions/julia-format@v3 - with: - version: '1' # Set `version` to '1.0.54' if you need to use JuliaFormatter.jl v1.0.54 (default: '1') \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 307fad215..76ae9d10b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -118,6 +118,9 @@ though both are included as part of the test suite. ## Code Style -We use [Blue Style](https://github.com/JuliaDiff/BlueStyle) formatting, with a few tweaks -defined in `.JuliaFormatter.toml`. Running the tests in overwrite mode will -automatically reformat your code, but you can also add [`JuliaFormatter`](https://domluna.github.io/JuliaFormatter.jl/stable/#Editor-Plugins) to your editor to reformat as you go. +We use [Blue Style](https://github.com/JuliaDiff/BlueStyle) formatting, with a +few tweaks defined in `.JuliaFormatter.toml`. Running the tests in overwrite +mode will automatically reformat your code, but you can also add +[`JuliaFormatter`](https://domluna.github.io/JuliaFormatter.jl/stable/#Editor-Plugins) +to your editor to reformat as you go, or call +`julia -e 'using JuliaFormatter; format("path/to/Finch.jl")'` manually. diff --git a/docs/src/docs/internals/parallel.md b/docs/src/docs/internals/parallel.md new file mode 100644 index 000000000..e94a93e13 --- /dev/null +++ b/docs/src/docs/internals/parallel.md @@ -0,0 +1,65 @@ +# Parallel Processing in Finch + +## Modelling the Architecture + +Finch uses a simple, hierarchical representation of devices and tasks to model +different kinds of parallel processing. 
An [`AbstractDevice`](@ref) is a physical or +virtual device on which we can execute tasks, which may each be represented by +an [`AbstractTask`](@ref). + +```@docs +AbstractTask +AbstractDevice +``` + +The current task in a compilation context can be queried with +[`get_task`](@ref). Each device has a set of numbered child +tasks, and each task has a parent task. + +```@docs +get_num_tasks +get_task_num +get_device +get_parent_task +``` + +## Data Transfer + +Before entering a parallel loop, a tensor may reside on a single task, or +represent a single view of data distributed across multiple tasks, or represent +multiple separate tensors local to multiple tasks. A tensor's data must be +resident in the current task to process operations on that tensor, such as loops +over the indices, accesses to the tensor, or `declare`, `freeze`, or `thaw`. +Upon entering a parallel loop, we must transfer the tensor to the tasks +where it is needed. Upon exiting the parallel loop, we may need to combine +the data from multiple tasks into a single tensor. + +There are two cases, depending on whether the tensor is declared outside the +parallel loop or is a temporary tensor declared within the parallel loop. + +If the tensor is a temporary tensor declared within the parallel loop, we call +`bcast` to broadcast the tensor to all tasks. + +If the tensor is declared outside the parallel loop, we call `scatter` to +send it to the tasks where it is needed. Note that if the tensor is in `read` mode, +`scatter` may simply `bcast` the entire tensor to all tasks. If the device has global +memory, `scatter` may also be a no-op. When the parallel loop is exited, we call +`gather` to reconcile the data from multiple tasks back into a single tensor. + +Each of these operations begins with a `_send` variant on one task, and +finishes with a `_recv` variant on the receiving task. 
+ +All transfers are accomplished with the functions `transfer` and `virtual_transfer`, with +different `style` objects signaling the type of transfer. + +```@docs +bcast +bcast_send +bcast_recv +scatter +scatter_send +scatter_recv +gather +gather_send +gather_recv +``` \ No newline at end of file diff --git a/docs/src/docs/internals/tensor_interface.md b/docs/src/docs/internals/tensor_interface.md index f54e8111b..f1530a310 100644 --- a/docs/src/docs/internals/tensor_interface.md +++ b/docs/src/docs/internals/tensor_interface.md @@ -18,8 +18,6 @@ virtual_eltype virtual_fill_value virtual_size virtual_resize! -moveto -virtual_moveto labelled_show labelled_children is_injective diff --git a/src/Finch.jl b/src/Finch.jl index d2efbb143..ccd785c4e 100644 --- a/src/Finch.jl +++ b/src/Finch.jl @@ -41,6 +41,7 @@ export Dense, DenseLevel export Element, ElementLevel export AtomicElement, AtomicElementLevel export Separate, SeparateLevel +export Shard, ShardLevel export Mutex, MutexLevel export Pattern, PatternLevel export Scalar, SparseScalar, ShortCircuitScalar, SparseShortCircuitScalar @@ -141,6 +142,7 @@ include("tensors/levels/dense_rle_levels.jl") include("tensors/levels/element_levels.jl") include("tensors/levels/atomic_element_levels.jl") include("tensors/levels/separate_levels.jl") +include("tensors/levels/shard_levels.jl") include("tensors/levels/mutex_levels.jl") include("tensors/levels/pattern_levels.jl") include("tensors/masks.jl") @@ -258,7 +260,7 @@ export fsparse, fsparse!, fsprand, fspzeros, ffindnz, fread, fwrite, countstored export bspread, bspwrite export ftnsread, ftnswrite, fttread, fttwrite -export moveto, postype +export transfer, postype include("FinchLogic/FinchLogic.jl") using .FinchLogic diff --git a/src/abstract_tensor.jl b/src/abstract_tensor.jl index 0d5c601ee..2eb31e0da 100644 --- a/src/abstract_tensor.jl +++ b/src/abstract_tensor.jl @@ -93,22 +93,6 @@ function similar in spirit to `Base.resize!`. """ function virtual_resize! 
end -""" - moveto(arr, device) - -If the array is not on the given device, it creates a new version of this array on that device -and copies the data in to it, according to the `device` trait. -""" -function moveto end - -""" - virtual_moveto(device, arr) - -If the virtual array is not on the given device, copy the array to that device. This -function may modify underlying data arrays, but cannot change the virtual itself. This -function is used to move data to the device before a kernel is launched. -""" -function virtual_moveto end struct LabelledTree key diff --git a/src/architecture.jl b/src/architecture.jl index 7b9400e18..c472695d0 100644 --- a/src/architecture.jl +++ b/src/architecture.jl @@ -1,8 +1,45 @@ +""" + AbstractDevice + +A datatype representing a device on which tasks can be executed. +""" abstract type AbstractDevice end abstract type AbstractVirtualDevice end + +""" + AbstractTask + +An individual processing unit on a device, responsible for running code. +""" abstract type AbstractTask end abstract type AbstractVirtualTask end +""" + get_num_tasks(dev::AbstractDevice) + +Return the number of tasks on the device dev. +""" +function get_num_tasks end +""" + get_task_num(task::AbstractTask) + +Return the task number of `task`. +""" +function get_task_num end +""" + get_device(task::AbstractTask) + +Return the device that `task` is running on. +""" +function get_device end + +""" + get_parent_task(task::AbstractTask) + +Return the task which spawned `task`. 
+""" +function get_parent_task end + """ aquire_lock!(dev::AbstractDevice, val) @@ -40,6 +77,7 @@ struct CPU <: AbstractDevice n::Int end CPU() = CPU(Threads.nthreads()) +get_num_tasks(dev::CPU) = dev.n @kwdef struct VirtualCPU <: AbstractVirtualDevice ex n @@ -57,6 +95,7 @@ end function lower(ctx::AbstractCompiler, device::VirtualCPU, ::DefaultStyle) something(device.ex, :(CPU($(ctx(device.n))))) end +get_num_tasks(::VirtualCPU) = literal(1) FinchNotation.finch_leaf(device::VirtualCPU) = virtual(device) @@ -68,13 +107,15 @@ A device that represents a serial CPU execution. struct Serial <: AbstractTask end const serial = Serial() get_device(::Serial) = CPU(1) -get_task(::Serial) = nothing +get_parent_task(::Serial) = nothing +get_task_num(::Serial) = 1 struct VirtualSerial <: AbstractVirtualTask end virtualize(ctx, ex, ::Type{Serial}) = VirtualSerial() lower(ctx::AbstractCompiler, task::VirtualSerial, ::DefaultStyle) = :(Serial()) FinchNotation.finch_leaf(device::VirtualSerial) = virtual(device) -virtual_get_device(::VirtualSerial) = VirtualCPU(nothing, 1) -virtual_get_task(::VirtualSerial) = nothing +get_device(::VirtualSerial) = VirtualCPU(nothing, 1) +get_parent_task(::VirtualSerial) = nothing +get_task_num(::VirtualSerial) = literal(1) struct CPUThread{Parent} <: AbstractTask tid::Int @@ -82,7 +123,8 @@ struct CPUThread{Parent} <: AbstractTask parent::Parent end get_device(task::CPUThread) = task.device -get_task(task::CPUThread) = task.parent +get_parent_task(task::CPUThread) = task.parent +get_task_num(task::CPUThread) = task.tid @inline function make_lock(::Type{Threads.Atomic{T}}) where {T} return Threads.Atomic{T}(zero(T)) @@ -139,13 +181,14 @@ function lower(ctx::AbstractCompiler, task::VirtualCPUThread, ::DefaultStyle) :(CPUThread($(ctx(task.tid)), $(ctx(task.dev)), $(ctx(task.parent)))) end FinchNotation.finch_leaf(device::VirtualCPUThread) = virtual(device) -virtual_get_device(task::VirtualCPUThread) = task.dev -virtual_get_task(task::VirtualCPUThread) 
= task.parent +get_device(task::VirtualCPUThread) = task.dev +get_parent_task(task::VirtualCPUThread) = task.parent +get_task_num(task::VirtualCPUThread) = task.tid struct CPULocalMemory device::CPU end -function moveto(vec::V, mem::CPULocalMemory) where {V<:Vector} +function transfer(vec::V, mem::CPULocalMemory, style) where {V<:Vector} CPULocalVector{V}(mem.device, [copy(vec) for _ in 1:(mem.device.n)]) end @@ -161,19 +204,32 @@ end Base.eltype(::Type{CPULocalVector{V}}) where {V} = eltype(V) Base.ndims(::Type{CPULocalVector{V}}) where {V} = ndims(V) -function moveto(vec::Vector, device::CPU) +function transfer(vec::Vector, device::CPU, style) return vec end -function moveto(vec::Vector, task::CPUThread) +function transfer(vec::Vector, task::CPUThread, style) return copy(vec) end -function moveto(vec::CPULocalVector, task::CPUThread) +function transfer(vec::CPULocalVector, task::CPUThread, style) temp = vec.data[task.tid] return temp end +""" + local_memory(device) + +Returns the local memory type for a given device. +""" +function local_memory(device::CPU) + return CPULocalMemory(device) +end + +function local_memory(device::Serial) + return device +end + struct Converter{f,T} end (::Converter{f,T})(x) where {f,T} = T(f(x)) @@ -239,3 +295,44 @@ for T in [ ) end end + +function virtual_parallel_region(f, ctx, ::Serial) + contain(f, ctx) +end + +function virtual_parallel_region(f, ctx, device::VirtualCPU) + tid = freshen(ctx, :tid) + + code = contain(ctx) do ctx_2 + subtask = VirtualCPUThread(value(tid, Int), device, ctx_2.code.task) + contain(f, ctx_2; task=subtask) + end + + return quote + Threads.@threads for $tid in 1:($(ctx(device.n))) + Finch.@barrier begin + @inbounds @fastmath begin + $code + end + nothing + end + end + end +end + +""" + transfer(arr, device, style) + +If the array is not on the given device, it creates a new version of this array on that device +and copies the data in to it, according to the `device` trait. 
+""" +function transfer end + +""" + virtual_transfer(device, arr, style) + +If the virtual array is not on the given device, copy the array to that device. This +function may modify underlying data arrays, but cannot change the virtual itself. This +function is used to move data to the device before a kernel is launched. +""" +function virtual_transfer end \ No newline at end of file diff --git a/src/interface/abstract_arrays.jl b/src/interface/abstract_arrays.jl index da91292c7..222a54d54 100644 --- a/src/interface/abstract_arrays.jl +++ b/src/interface/abstract_arrays.jl @@ -130,13 +130,13 @@ FinchNotation.finch_leaf(x::VirtualAbstractArray) = virtual(x) virtual_fill_value(ctx, ::VirtualAbstractArray) = 0 virtual_eltype(ctx, tns::VirtualAbstractArray) = tns.eltype -function virtual_moveto(ctx, vec::VirtualAbstractArray, device) +function virtual_transfer(ctx, vec::VirtualAbstractArray, device, style) ex = freshen(ctx, vec.ex) push_preamble!( ctx, quote $ex = $(vec.ex) - $(vec.ex) = $moveto($(vec.ex), $(ctx(device))) + $(vec.ex) = $transfer($(vec.ex), $(ctx(device)), style) end, ) push_epilogue!( diff --git a/src/lower.jl b/src/lower.jl index d8f5cfc06..cbf89c171 100644 --- a/src/lower.jl +++ b/src/lower.jl @@ -319,9 +319,6 @@ end function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualCPU) root = ensure_concurrent(root, ctx) - tid = index(freshen(ctx, :tid)) - i = freshen(ctx, :i) - decl_in_scope = unique( filter( !isnothing, @@ -344,39 +341,27 @@ function lower_parallel_loop(ctx, root, ext::ParallelDimension, device::VirtualC ), ) - root_2 = loop(tid, Extent(value(i, Int), value(i, Int)), - loop(root.idx, ext.ext, - sieve(access(VirtualSplitMask(device.n), reader(), root.idx, tid), - root.body, - ), - ), - ) - for tns in setdiff(used_in_scope, decl_in_scope) - virtual_moveto(ctx, resolve(ctx, tns), device) + virtual_transfer(ctx, resolve(ctx, tns), device, style) end - code = contain(ctx) do ctx_2 - subtask = VirtualCPUThread(value(i, 
Int), device, ctx_2.code.task) - contain(ctx_2; task=subtask) do ctx_3 - for tns in intersect(used_in_scope, decl_in_scope) - virtual_moveto(ctx_3, resolve(ctx_3, tns), subtask) - end - contain(ctx_3) do ctx_4 - open_scope(ctx_4) do ctx_5 - ctx_5(instantiate!(ctx_5, root_2)) - end - end + virtual_parallel_region(ctx, device) do ctx_2 + subtask = get_task(ctx_2) + tid = get_task_num(subtask) + for tns in intersect(used_in_scope, decl_in_scope) + virtual_transfer(ctx_2, resolve(ctx_2, tns), subtask, style) end - end - - return quote - Threads.@threads for $i in 1:($(ctx(device.n))) - Finch.@barrier begin - @inbounds @fastmath begin - $code - end - nothing + contain(ctx_2) do ctx_3 + open_scope(ctx_3) do ctx_4 + i = index(freshen(ctx, :i)) + root_2 = loop(i, Extent(tid, tid), + loop(root.idx, ext.ext, + sieve(access(VirtualSplitMask(device.n), reader(), root.idx, i), + root.body, + ), + ), + ) + ctx_4(instantiate!(ctx_4, root_2)) end end end diff --git a/src/tensors/fibers.jl b/src/tensors/fibers.jl index 5f331d487..a37482eee 100644 --- a/src/tensors/fibers.jl +++ b/src/tensors/fibers.jl @@ -139,12 +139,12 @@ function unfurl(ctx::AbstractCompiler, arr::VirtualFiber, ext, mode, proto) unfurl(ctx, VirtualSubFiber(arr.lvl, literal(1)), ext, mode, proto) end -function virtual_moveto(ctx::AbstractCompiler, fbr::VirtualFiber, arch) - virtual_moveto_level(ctx, fbr.lvl, arch) +function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualFiber, arch, style) + virtual_transfer_level(ctx, fbr.lvl, arch, style) end -function virtual_moveto(ctx::AbstractCompiler, fbr::VirtualSubFiber, arch) - virtual_moveto_level(ctx, fbr.lvl, arch) +function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualSubFiber, arch, style) + virtual_transfer_level(ctx, fbr.lvl, arch, style) end struct HollowSubFiber{Lvl,Pos,Dirty} <: AbstractFiber{Lvl} @@ -171,9 +171,9 @@ function lower(ctx::AbstractCompiler, fbr::VirtualHollowSubFiber, ::DefaultStyle end 
FinchNotation.finch_leaf(x::VirtualHollowSubFiber) = virtual(x) -function virtual_moveto(ctx::AbstractCompiler, fbr::VirtualHollowSubFiber, arch) +function virtual_transfer(ctx::AbstractCompiler, fbr::VirtualHollowSubFiber, arch, style) return VirtualHollowSubFiber( - virtual_moveto_level(ctx, fbr.lvl, arch), fbr.pos, fbr.dirty + virtual_transfer_level(ctx, fbr.lvl, arch, style), fbr.pos, fbr.dirty ) end @@ -333,7 +333,7 @@ function Base.similar(fbr::AbstractFiber, fill_value, eltype::Type, dims::Tuple) Tensor(similar_level(fbr.lvl, fill_value, eltype, dims...)) end -moveto(tns::Tensor, device) = Tensor(moveto(tns.lvl, device)) +transfer(tns::Tensor, device, style) = Tensor(transfer(tns.lvl, device, style)) struct Structure t diff --git a/src/tensors/levels/atomic_element_levels.jl b/src/tensors/levels/atomic_element_levels.jl index 0a0746d62..d9d371a04 100644 --- a/src/tensors/levels/atomic_element_levels.jl +++ b/src/tensors/levels/atomic_element_levels.jl @@ -40,8 +40,8 @@ end postype(::Type{<:AtomicElementLevel{Vf,Tv,Tp}}) where {Vf,Tv,Tp} = Tp -function moveto(lvl::AtomicElementLevel{Vf,Tv,Tp}, device) where {Vf,Tv,Tp} - return AtomicElementLevel{Vf,Tv,Tp}(moveto(lvl.val, device)) +function transfer(lvl::AtomicElementLevel{Vf,Tv,Tp}, device, style) where {Vf,Tv,Tp} + return AtomicElementLevel{Vf,Tv,Tp}(transfer(lvl.val, device, style)) end pattern!(lvl::AtomicElementLevel{Vf,Tv,Tp}) where {Vf,Tv,Tp} = @@ -165,13 +165,13 @@ function reassemble_level!(ctx, lvl::VirtualAtomicElementLevel, pos_start, pos_s lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualAtomicElementLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualAtomicElementLevel, arch, style) val_2 = freshen(ctx, :val) push_preamble!( ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $moveto($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch)), style) end, ) push_epilogue!( @@ -205,7 +205,7 @@ function lower_assign(ctx, 
fbr::VirtualSubFiber{VirtualAtomicElementLevel}, mode (lvl, pos) = (fbr.lvl, fbr.pos) op = ctx(op) rhs = ctx(rhs) - device = ctx(virtual_get_device(get_task(ctx))) + device = ctx(get_device(get_task(ctx))) :(Finch.atomic_modify!($device, $(lvl.val), $(ctx(pos)), $op, $rhs)) end @@ -221,6 +221,6 @@ function lower_assign( ) op = ctx(op) rhs = ctx(rhs) - device = ctx(virtual_get_device(get_task(ctx))) + device = ctx(get_device(get_task(ctx))) :(Finch.atomic_modify!($device, $(lvl.val), $(ctx(pos)), $op, $rhs)) end diff --git a/src/tensors/levels/dense_levels.jl b/src/tensors/levels/dense_levels.jl index c06f0090f..32c69f760 100644 --- a/src/tensors/levels/dense_levels.jl +++ b/src/tensors/levels/dense_levels.jl @@ -44,8 +44,8 @@ function postype(::Type{DenseLevel{Ti,Lvl}}) where {Ti,Lvl} return postype(Lvl) end -function moveto(lvl::DenseLevel{Ti}, device) where {Ti} - return DenseLevel{Ti}(moveto(lvl.lvl, device), lvl.shape) +function transfer(lvl::DenseLevel{Ti}, device, style) where {Ti} + return DenseLevel{Ti}(transfer(lvl.lvl, device, style), lvl.shape) end function pattern!(lvl::DenseLevel{Ti,Lvl}) where {Ti,Lvl} @@ -201,8 +201,8 @@ function freeze_level!(ctx::AbstractCompiler, lvl::VirtualDenseLevel, pos) return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualDenseLevel, arch) - virtual_moveto_level(ctx, lvl.lvl, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualDenseLevel, arch, style) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end struct DenseTraversal diff --git a/src/tensors/levels/dense_rle_levels.jl b/src/tensors/levels/dense_rle_levels.jl index a4d05091a..0ca813128 100644 --- a/src/tensors/levels/dense_rle_levels.jl +++ b/src/tensors/levels/dense_rle_levels.jl @@ -64,11 +64,11 @@ function postype( return postype(Lvl) end -function moveto(lvl::RunListLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - ptr = moveto(lvl.ptr, device) - right = moveto(lvl.right, device) - buf = 
moveto(lvl.buf, device) +function transfer(lvl::RunListLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + ptr = transfer(lvl.ptr, device, style) + right = transfer(lvl.right, device, style) + buf = transfer(lvl.buf, device, style) return RunListLevel{Ti}( lvl_2, lvl.shape, lvl.ptr, lvl.right, lvl.buf; merge=getmerge(lvl) ) @@ -295,7 +295,7 @@ function virtual_level_resize!(ctx, lvl::VirtualRunListLevel, dims...) lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) right_2 = freshen(ctx, lvl.right) push_preamble!( @@ -303,8 +303,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, a quote $ptr_2 = $(lvl.ptr) $right_2 = $(lvl.right) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.right) = $moveto($(lvl.right), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.right) = $transfer($(lvl.right), $(ctx(arch)), style) end, ) push_epilogue!( @@ -314,8 +314,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualRunListLevel, a $(lvl.right) = $right_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) - virtual_moveto_level(ctx, lvl.buf, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) + virtual_transfer_level(ctx, lvl.buf, arch, style) end virtual_level_eltype(lvl::VirtualRunListLevel) = virtual_level_eltype(lvl.lvl) diff --git a/src/tensors/levels/element_levels.jl b/src/tensors/levels/element_levels.jl index 8effd622d..b8903ee27 100644 --- a/src/tensors/levels/element_levels.jl +++ b/src/tensors/levels/element_levels.jl @@ -43,8 +43,8 @@ end postype(::Type{<:ElementLevel{Vf,Tv,Tp}}) where {Vf,Tv,Tp} = Tp -function moveto(lvl::ElementLevel{Vf,Tv,Tp}, device) where {Vf,Tv,Tp} - return ElementLevel{Vf,Tv,Tp}(moveto(lvl.val, device)) +function transfer(lvl::ElementLevel{Vf,Tv,Tp}, device, 
style) where {Vf,Tv,Tp} + return ElementLevel{Vf,Tv,Tp}(transfer(lvl.val, device, style)) end pattern!(lvl::ElementLevel{Vf,Tv,Tp}) where {Vf,Tv,Tp} = @@ -171,13 +171,13 @@ function reassemble_level!(ctx, lvl::VirtualElementLevel, pos_start, pos_stop) lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualElementLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualElementLevel, arch, style) val_2 = freshen(ctx, :val) push_preamble!( ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $moveto($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch)), style) end, ) push_epilogue!( diff --git a/src/tensors/levels/mutex_levels.jl b/src/tensors/levels/mutex_levels.jl index 0f7b7478f..b3c066d47 100644 --- a/src/tensors/levels/mutex_levels.jl +++ b/src/tensors/levels/mutex_levels.jl @@ -35,9 +35,9 @@ end postype(::Type{<:MutexLevel{AVal,Lvl}}) where {Lvl,AVal} = postype(Lvl) -function moveto(lvl::MutexLevel, device) - lvl_2 = moveto(lvl.lvl, device) - locks_2 = moveto(lvl.locks, device) +function transfer(lvl::MutexLevel, device, style) + lvl_2 = transfer(lvl.lvl, device, style) + locks_2 = transfer(lvl.locks, device, style) return MutexLevel(lvl_2, locks_2) end @@ -207,7 +207,7 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualMutexLevel, pos) return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arch, style) #Add for seperation level too. 
atomics = freshen(ctx, :locksArray) @@ -215,7 +215,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arc ctx, quote $atomics = $(lvl.locks) - $(lvl.locks) = $moveto($(lvl.locks), $(ctx(arch))) + $(lvl.locks) = $transfer($(lvl.locks), $(ctx(arch)), style) end, ) push_epilogue!( @@ -224,7 +224,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualMutexLevel, arc $(lvl.locks) = $atomics end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function instantiate(ctx, fbr::VirtualSubFiber{VirtualMutexLevel}, mode) @@ -244,7 +244,7 @@ function unfurl(ctx, fbr::VirtualSubFiber{VirtualMutexLevel}, ext, mode, proto) sym = freshen(ctx, lvl.ex, :after_atomic_lvl) atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) - dev = lower(ctx, virtual_get_device(ctx.code.task), DefaultStyle()) + dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) push_preamble!( ctx, quote @@ -271,7 +271,7 @@ function unfurl(ctx, fbr::VirtualHollowSubFiber{VirtualMutexLevel}, ext, mode, p sym = freshen(ctx, lvl.ex, :after_atomic_lvl) atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) - dev = lower(ctx, virtual_get_device(ctx.code.task), DefaultStyle()) + dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) push_preamble!( ctx, quote @@ -296,7 +296,7 @@ function lower_assign(ctx, fbr::VirtualSubFiber{VirtualMutexLevel}, mode, op, rh sym = freshen(ctx, lvl.ex, :after_atomic_lvl) atomicData = freshen(ctx, lvl.ex, :atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) - dev = lower(ctx, virtual_get_device(ctx.code.task), DefaultStyle()) + dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) push_preamble!( ctx, quote @@ -321,7 +321,7 @@ function lower_assign(ctx, fbr::VirtualHollowSubFiber{VirtualMutexLevel}, mode, sym = freshen(ctx, lvl.ex, :after_atomic_lvl) atomicData = freshen(ctx, lvl.ex, 
:atomicArraysAcc) lockVal = freshen(ctx, lvl.ex, :lockVal) - dev = lower(ctx, virtual_get_device(ctx.code.task), DefaultStyle()) + dev = lower(ctx, get_device(ctx.code.task), DefaultStyle()) push_preamble!( ctx, quote diff --git a/src/tensors/levels/pattern_levels.jl b/src/tensors/levels/pattern_levels.jl index 31e477be9..951b14f19 100644 --- a/src/tensors/levels/pattern_levels.jl +++ b/src/tensors/levels/pattern_levels.jl @@ -47,7 +47,7 @@ isstructequal(a::T, b::T) where {T<:Pattern} = true postype(::Type{<:PatternLevel{Tp}}) where {Tp} = Tp -function moveto(lvl::PatternLevel{Tp}, device) where {Tp} +function transfer(lvl::PatternLevel{Tp}, device, style) where {Tp} return PatternLevel{Tp}() end @@ -93,7 +93,7 @@ struct VirtualPatternLevel <: AbstractVirtualLevel Tp end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualPatternLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualPatternLevel, arch, style) end is_level_injective(ctx, ::VirtualPatternLevel) = [] diff --git a/src/tensors/levels/separate_levels.jl b/src/tensors/levels/separate_levels.jl index 62999cc02..b1c5220f9 100644 --- a/src/tensors/levels/separate_levels.jl +++ b/src/tensors/levels/separate_levels.jl @@ -36,9 +36,9 @@ end postype(::Type{<:Separate{Lvl,Val}}) where {Lvl,Val} = postype(Lvl) -function moveto(lvl::SeparateLevel, device) - lvl_2 = moveto(lvl.lvl, device) - val_2 = moveto(lvl.val, device) +function transfer(lvl::SeparateLevel, device, style) + lvl_2 = transfer(lvl.lvl, device, style) + val_2 = transfer(lvl.val, device, style) return SeparateLevel(lvl_2, val_2) end @@ -146,7 +146,7 @@ virtual_level_size(ctx, lvl::VirtualSeparateLevel) = virtual_level_size(ctx, lvl virtual_level_eltype(lvl::VirtualSeparateLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualSeparateLevel) = virtual_level_fill_value(lvl.lvl) -function virtual_moveto_level(ctx, lvl::VirtualSeparateLevel, arch) +function virtual_transfer_level(ctx, 
lvl::VirtualSeparateLevel, arch, style) # Need to move each pointer... val_2 = freshen(ctx, lvl.val) @@ -154,7 +154,7 @@ function virtual_moveto_level(ctx, lvl::VirtualSeparateLevel, arch) ctx, quote $val_2 = $(lvl.val) - $(lvl.val) = $moveto($(lvl.val), $(ctx(arch))) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch)), style) end, ) push_epilogue!( @@ -163,7 +163,7 @@ function virtual_moveto_level(ctx, lvl::VirtualSeparateLevel, arch) $(lvl.val) = $val_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function declare_level!(ctx, lvl::VirtualSeparateLevel, pos, init) diff --git a/src/tensors/levels/shard_levels.jl b/src/tensors/levels/shard_levels.jl new file mode 100644 index 000000000..79cf9c6c0 --- /dev/null +++ b/src/tensors/levels/shard_levels.jl @@ -0,0 +1,370 @@ +""" + ShardLevel{Lvl, [Val]}() + +Each subfiber of a Shard level is stored in a thread-local tensor of type +`Lvl`, in a thread-local memory space. + +Each sublevel is stored in a vector of type `Val` with `eltype(Val) = Lvl`. + +```jldoctest +julia> tensor_tree(Tensor(Dense(Shard(Element(0.0))), [1, 2, 3])) +3-Tensor +└─ Dense [1:3] + ├─ [1]: Shard -> + │ └─ 1.0 + ├─ [2]: Shard -> + │ └─ 2.0 + └─ [3]: Shard -> + └─ 3.0 +``` +""" +struct ShardLevel{Device,Lvl,Ptr,Task,Val} <: AbstractLevel + device::Device + lvl::Lvl + ptr::Ptr + task::Task + val::Val +end +const Shard = ShardLevel +function ShardLevel(device::Device, lvl::Lvl) where {Device,Lvl} + ShardLevel{Device}(device, lvl, postype(lvl)[], postype(lvl)[], transfer(lvl, device), style) +end #TODO scatterto? 
+ +function ShardLevel{Device}( + device, lvl::Lvl, ptr::Ptr, task::Task, val::Val +) where {Device,Lvl,Ptr,Task,Val} + ShardLevel{Device,Lvl,Ptr,Task,Val}(device, lvl, ptr, task, val) +end + +function Base.summary(::Shard{Device,Lvl,Ptr,Task,Val}) where {Device,Lvl,Ptr,Task,Val} + "Shard($(Lvl))" +end + +function similar_level( + lvl::Shard{Device,Lvl,Ptr,Task,Val}, fill_value, eltype::Type, dims... +) where {Device,Lvl,Ptr,Task,Val} + ShardLevel(lvl.device, similar_level(lvl.lvl, fill_value, eltype, dims...)) +end + +function postype(::Type{<:Shard{Device,Lvl,Ptr,Task,Val}}) where {Device,Lvl,Ptr,Task,Val} + postype(Lvl) +end + +function transfer(lvl::ShardLevel, device, style) + lvl_2 = transfer(lvl.lvl, device, style) + ptr_2 = transfer(lvl.ptr, device, style) + task_2 = transfer(lvl.task, device, style) + return ShardLevel(lvl_2, ptr_2, task_2, val_2) +end + +function pattern!(lvl::ShardLevel) + ShardLevel(pattern!(lvl.lvl), lvl.ptr, lvl.task, map(pattern!, lvl.val)) +end +function set_fill_value!(lvl::ShardLevel, init) + ShardLevel( + set_fill_value!(lvl.lvl, init), + lvl.ptr, + lvl.task, + map(lvl_2 -> set_fill_value!(lvl_2, init), lvl.val), + ) +end +function Base.resize!(lvl::ShardLevel, dims...) 
+ ShardLevel( + resize!(lvl.lvl, dims...), + lvl.ptr, + lvl.task, + map(lvl_2 -> resize!(lvl_2, dims...), lvl.val), + ) +end + +function Base.show( + io::IO, lvl::ShardLevel{Device,Lvl,Ptr,Task,Val} +) where {Device,Lvl,Ptr,Task,Val} + print(io, "Shard(") + if get(io, :compact, false) + print(io, "…") + else + show(io, lvl.lvl) + print(io, ", ") + show(io, lvl.ptr) + print(io, ", ") + show(io, lvl.task) + print(io, ", ") + show(io, lvl.val) + end + print(io, ")") +end + +function labelled_show(io::IO, fbr::SubFiber{<:ShardLevel}) + (lvl, pos) = (fbr.lvl, fbr.pos) + print(io, "shard($(lvl.task[pos])) -> ") +end + +function labelled_children(fbr::SubFiber{<:ShardLevel}) + lvl = fbr.lvl + pos = fbr.pos + pos > length(lvl.val) && return [] + [LabelledTree(SubFiber(lvl.val[lvl.task[pos]], lvl.ptr[pos]))] +end + +@inline level_ndims( + ::Type{<:ShardLevel{Device,Lvl,Ptr,Task,Val}} +) where {Device,Lvl,Ptr,Task,Val} = level_ndims(Lvl) +@inline level_size( + lvl::ShardLevel{Device,Lvl,Ptr,Task,Val} +) where {Device,Lvl,Ptr,Task,Val} = level_size(lvl.lvl) +@inline level_axes( + lvl::ShardLevel{Device,Lvl,Ptr,Task,Val} +) where {Device,Lvl,Ptr,Task,Val} = level_axes(lvl.lvl) +@inline level_eltype( + ::Type{ShardLevel{Device,Lvl,Ptr,Task,Val}} +) where {Device,Lvl,Ptr,Task,Val} = level_eltype(Lvl) +@inline level_fill_value( + ::Type{<:ShardLevel{Device,Lvl,Ptr,Task,Val}} +) where {Device,Lvl,Ptr,Task,Val} = level_fill_value(Lvl) + +function (fbr::SubFiber{<:ShardLevel})(idxs...) + q = fbr.pos + return SubFiber(fbr.lvl.val[q], 1)(idxs...) +end + +countstored_level(lvl::ShardLevel, pos) = pos + +mutable struct VirtualShardLevel <: AbstractVirtualLevel + device + lvl # stand-in for the sublevel for virtual resize, etc. 
+ ex + ptr + task + val + Tv + Device + Lvl + Ptr + Task + Val +end + +postype(lvl::VirtualShardLevel) = postype(lvl.lvl) + +function is_level_injective(ctx, lvl::VirtualShardLevel) + [is_level_injective(ctx, lvl.lvl)..., true] +end +function is_level_atomic(ctx, lvl::VirtualShardLevel) + (below, atomic) = is_level_atomic(ctx, lvl.lvl) + return ([below; [atomic]], atomic) +end +function is_level_concurrent(ctx, lvl::VirtualShardLevel) + (data, _) = is_level_concurrent(ctx, lvl.lvl) + return (data, true) +end + +function lower(ctx::AbstractCompiler, lvl::VirtualShardLevel, ::DefaultStyle) + quote + $ShardLevel{$(lvl.Lvl),$(lvl.Ptr),$(lvl.Task),$(lvl.Val)}( + $(ctx(lvl.lvl)), $(lvl.val) + ) + end +end + +function virtualize( + ctx, ex, ::Type{ShardLevel{Device,Lvl,Ptr,Task,Val}}, tag=:lvl +) where {Device,Lvl,Ptr,Task,Val} + sym = freshen(ctx, tag) + ptr = freshen(ctx, tag, :_ptr) + task = freshen(ctx, tag, :_task) + val = freshen(ctx, tag, :_val) + + push_preamble!( + ctx, + quote + $sym = $ex + $ptr = $ex.ptr + $task = $ex.task + $val = $ex.val + end, + ) + device_2 = virtualize(ctx, :($ex.device), Device, sym) + lvl_2 = virtualize(ctx, :($ex.lvl), Lvl, sym) + VirtualShardLevel( + device_2, + lvl_2, + sym, + ptr, + task, + val, + typeof(level_fill_value(Lvl)), + Device, + Lvl, + Ptr, + Task, + Val, + ) +end + +Base.summary(lvl::VirtualShardLevel) = "Shard($(lvl.Lvl))" + +function virtual_level_resize!(ctx, lvl::VirtualShardLevel, dims...) 
+ (lvl.lvl = virtual_level_resize!(ctx, lvl.lvl, dims...); lvl) +end +virtual_level_size(ctx, lvl::VirtualShardLevel) = virtual_level_size(ctx, lvl.lvl) +virtual_level_eltype(lvl::VirtualShardLevel) = virtual_level_eltype(lvl.lvl) +virtual_level_fill_value(lvl::VirtualShardLevel) = virtual_level_fill_value(lvl.lvl) + +function virtual_transfer_level(ctx, lvl::VirtualShardLevel, arch, style) + val_2 = freshen(ctx, lvl.val) + push_preamble!( + ctx, + quote + $val_2 = $(lvl.val) + $(lvl.val) = $transfer($(lvl.val), $(ctx(arch)), style) + end, + ) + push_epilogue!( + ctx, + quote + $(lvl.val) = $val_2 + end, + ) + virtual_transfer_level(ctx, lvl.lvl, arch, style) +end + +function declare_level!(ctx, lvl::VirtualShardLevel, pos, init) + push_preamble!(ctx, + virtual_parallel_region(ctx, lvl.device) do ctx_2 + lvl_2 = virtualize( + ctx_2, :($(lvl.ex).val[$(ctx_2(get_task_num(get_task(ctx_2))))]), lvl.Lvl + ) #TODO should this virtualize the eltype of Val? + declare_level!(ctx_2, lvl_2, literal(1), init) + end, + ) + lvl +end + +""" +assemble: + mapping is pos -> task, ptr. task says which task has it, ptr says which position in that task has it. + +read: + read from pos to task, ptr. simple. + +write: + allocate something for this task on that position, assemble on the task itself on demand. Complain if the task is wrong. + +The outer level needs to be concurrent, like denselevel. 
+"""
+function assemble_level!(ctx, lvl::VirtualShardLevel, pos_start, pos_stop) + pos_start = cache!(ctx, :pos_start, simplify(ctx, pos_start)) + pos_stop = cache!(ctx, :pos_stop, simplify(ctx, pos_stop)) + pos = freshen(ctx, :pos) + sym = freshen(ctx, :pointer_to_lvl) + push_preamble!( + ctx, + quote + Finch.resize_if_smaller!($(lvl.task), $(ctx(pos_stop))) + Finch.resize_if_smaller!($(lvl.ptr), $(ctx(pos_stop))) + Finch.fill_range!($(lvl.task), $(ctx(pos_start)), $(ctx(pos_stop)), 0) + Finch.fill_range!($(lvl.ptr), $(ctx(pos_start)), $(ctx(pos_stop)), 0) + end, + ) + lvl +end + +supports_reassembly(::VirtualShardLevel) = false + +""" +these two are no-ops, we instead do these on instantiate +""" +function freeze_level!(ctx, lvl::VirtualShardLevel, pos) + return lvl +end + +function thaw_level!(ctx::AbstractCompiler, lvl::VirtualShardLevel, pos) + return lvl +end + +function instantiate(ctx, fbr::VirtualSubFiber{VirtualShardLevel}, mode) + if mode.kind === reader + (lvl, pos) = (fbr.lvl, fbr.pos) + tag = lvl.ex + isnulltest = freshen(ctx, tag, :_nulltest) + Vf = level_fill_value(lvl.Lvl) + sym = freshen(ctx, :pointer_to_lvl) + val = freshen(ctx, lvl.ex, :_val) + return Thunk(; + body=(ctx) -> begin + lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) + instantiate(ctx, VirtualSubFiber(lvl_2, literal(1)), mode) + end, + ) + else + (lvl, pos) = (fbr.lvl, fbr.pos) + tag = lvl.ex + sym = freshen(ctx, :pointer_to_lvl) + + return Thunk(; + body=(ctx) -> begin + lvl_2 = virtualize(ctx.code, :($(lvl.val)[$(ctx(pos))]), lvl.Lvl, sym) + lvl_2 = thaw_level!(ctx, lvl_2, literal(1)) + push_preamble!(ctx, assemble_level!(ctx, lvl_2, literal(1), literal(1))) + res = instantiate(ctx, VirtualSubFiber(lvl_2, literal(1)), mode) + push_epilogue!(ctx, + contain(ctx) do ctx_2 + lvl_2 = freeze_level!(ctx_2, lvl_2, literal(1)) + :($(lvl.val)[$(ctx_2(pos))] = $(ctx_2(lvl_2))) + end, + ) + res + end, + ) + end +end + +#we need some sort of localization step at the start of a parallel region whereby we can thaw the shard level + +"""
+assemble: + mapping is pos -> task, ptr. task says which task has it, ptr says which position in that task has it. + +read: + read from pos to task, ptr. simple. + +write: + allocate something for this task on that position, assemble on the task itself on demand. Complain if the task is wrong. + +The outer level needs to be concurrent, like denselevel. +""" +function instantiate(ctx, fbr::VirtualHollowSubFiber{VirtualShardLevel}, mode) + @assert mode.kind === updater + (lvl, pos) = (fbr.lvl, fbr.pos) + tag = lvl.ex + sym = freshen(ctx, :pointer_to_lvl) + + task = freshen(ctx, tag, :_task) + + return Thunk(; + preamble = quote + $task = $(lvl.task)[$(ctx(pos))] + if task == 0 + $(lvl.task)[$(ctx(pos))] = $(gettasknum(ctx)) + qos = local_qos_fill + if $(lvl.local_qos_fill) > $(lvl.local_qos_stop) + $local_qos_stop = max($local_qos_stop << 1, 1) + $(contain(ctx_2 -> assemble_level!(ctx_2, lvl.lvl, value(qos_fill, Tp), value(qos_stop, Tp)), ctx)) + end + else + qos = $(lvl.ptr)[$(ctx(pos))] + qos_stop = $(lvl.local_qos_stop) + #only in safe mode, we check if task == $(gettasknum(ctx)) and if not error("Task mismatch in ShardLevel") + end + dirty = true + end, + body = (ctx) -> VirtualHollowSubFiber(lvl.lvl, value(qos), dirty), + epilogue = quote + #this task will always own this position forever, even if we don't write to it. Still, we try to be conservative of memory usage of the underlying level. 
+ if dirty && $(lvl.ptr)[$(ctx(pos))] == 0 + local_qos_fill += 1 + $(lvl.ptr)[$(ctx(pos))] = $(lvl.local_qos_fill) += 1 + end + end, + ) +end diff --git a/src/tensors/levels/sparse_band_levels.jl b/src/tensors/levels/sparse_band_levels.jl index fd5f29687..db42b6195 100644 --- a/src/tensors/levels/sparse_band_levels.jl +++ b/src/tensors/levels/sparse_band_levels.jl @@ -36,10 +36,10 @@ function postype(::Type{SparseBandLevel{Ti,Idx,Ofs,Lvl}}) where {Ti,Idx,Ofs,Lvl} return postype(Lvl) end -function moveto(lvl::SparseBandLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - idx_2 = moveto(lvl.idx, device) - ofs_2 = moveto(lvl.ofs, device) +function transfer(lvl::SparseBandLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + idx_2 = transfer(lvl.idx, device, style) + ofs_2 = transfer(lvl.ofs, device, style) return SparseBandLevel{Ti}(lvl_2, lvl.shape, idx_2, ofs_2) end @@ -241,7 +241,7 @@ end virtual_level_eltype(lvl::VirtualSparseBandLevel) = virtual_level_eltype(lvl.lvl) virtual_level_fill_value(lvl::VirtualSparseBandLevel) = virtual_level_fill_value(lvl.lvl) -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel, arch, style) tbl_2 = freshen(ctx, lvl.tbl) ofs_2 = freshen(ctx, lvl.ofs) push_preamble!( @@ -249,8 +249,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel quote $tbl_2 = $(lvl.tbl) $ofs_2 = $(lvl.ofs) - $(lvl.tbl) = $moveto($(lvl.tbl), $(ctx(arch))) - $(lvl.ofs) = $moveto($(lvl.ofs), $(ctx(arch))) + $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch)), style) + $(lvl.ofs) = $transfer($(lvl.ofs), $(ctx(arch)), style) end, ) push_epilogue!( @@ -260,7 +260,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel $(lvl.ofs) = $ofs_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function 
declare_level!(ctx::AbstractCompiler, lvl::VirtualSparseBandLevel, pos, init) diff --git a/src/tensors/levels/sparse_bytemap_levels.jl b/src/tensors/levels/sparse_bytemap_levels.jl index 4c0c03b7a..787d7fc3d 100644 --- a/src/tensors/levels/sparse_bytemap_levels.jl +++ b/src/tensors/levels/sparse_bytemap_levels.jl @@ -61,11 +61,11 @@ function postype(::Type{SparseByteMapLevel{Ti,Ptr,Tbl,Srt,Lvl}}) where {Ti,Ptr,T return postype(Lvl) end -function moveto(lvl::SparseByteMapLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - ptr_2 = moveto(lvl.ptr, device) - tbl_2 = moveto(lvl.tbl, device) - srt_2 = moveto(lvl.srt, device) +function transfer(lvl::SparseByteMapLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + ptr_2 = transfer(lvl.ptr, device, style) + tbl_2 = transfer(lvl.tbl, device, style) + srt_2 = transfer(lvl.srt, device, style) return SparseByteMapLevel{Ti}(lvl_2, lvl.shape, ptr_2, tbl_2, srt_2) end @@ -239,7 +239,7 @@ function lower(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLevel, ::DefaultS end end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) tbl_2 = freshen(ctx, lvl.tbl) srt_2 = freshen(ctx, lvl.srt) @@ -249,9 +249,9 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLe $ptr_2 = $(lvl.ptr) $tbl_2 = $(lvl.tbl) $srt_2 = $(lvl.srt) - $(lvl.ptr) = moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.tbl) = moveto($(lvl.tbl), $(ctx(arch))) - $(lvl.srt) = moveto($(lvl.srt), $(ctx(arch))) + $(lvl.ptr) = transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.tbl) = transfer($(lvl.tbl), $(ctx(arch)), style) + $(lvl.srt) = transfer($(lvl.srt), $(ctx(arch)), style) end, ) push_epilogue!( @@ -262,7 +262,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseByteMapLe $(lvl.srt) = $srt_2 end, ) - 
virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end Base.summary(lvl::VirtualSparseByteMapLevel) = "SparseByteMap($(summary(lvl.lvl)))" diff --git a/src/tensors/levels/sparse_coo_levels.jl b/src/tensors/levels/sparse_coo_levels.jl index 2b4aa4a4c..de5235668 100644 --- a/src/tensors/levels/sparse_coo_levels.jl +++ b/src/tensors/levels/sparse_coo_levels.jl @@ -85,10 +85,10 @@ function postype(::Type{SparseCOOLevel{N,TI,Ptr,Tbl,Lvl}}) where {N,TI,Ptr,Tbl,L return postype(Lvl) end -function moveto(lvl::SparseCOOLevel{N,TI}, device) where {N,TI} - lvl_2 = moveto(lvl.lvl, device) - ptr_2 = moveto(lvl.ptr, device) - tbl_2 = ntuple(n -> moveto(lvl.tbl[n], device), N) +function transfer(lvl::SparseCOOLevel{N,TI}, device, style) where {N,TI} + lvl_2 = transfer(lvl.lvl, device, style) + ptr_2 = transfer(lvl.ptr, device, style) + tbl_2 = ntuple(n -> transfer(lvl.tbl[n], device, style), N) return SparseCOOLevel{N,TI}(lvl_2, lvl.shape, ptr_2, tbl_2) end @@ -344,13 +344,13 @@ function freeze_level!(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, pos_st return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) push_preamble!( ctx, quote $ptr_2 = $(lvl.ptr) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) end, ) push_epilogue!( @@ -365,7 +365,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, ctx, quote $idx_2 = $idx - $idx = $moveto($idx, $(ctx(arch))) + $idx = $transfer($idx, $(ctx(arch)), style) end, ) push_epilogue!( @@ -376,7 +376,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseCOOLevel, ) idx_2 end - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end struct SparseCOOWalkTraversal diff --git 
a/src/tensors/levels/sparse_dict_levels.jl b/src/tensors/levels/sparse_dict_levels.jl index 9fc11ab27..b2bd73893 100644 --- a/src/tensors/levels/sparse_dict_levels.jl +++ b/src/tensors/levels/sparse_dict_levels.jl @@ -90,15 +90,15 @@ function Base.resize!(lvl::SparseDictLevel{Ti}, dims...) where {Ti} ) end -function moveto( -    lvl::SparseDictLevel{Ti,Ptr,Idx,Val,Tbl,Pool,Lvl}, Tm +function transfer( +    lvl::SparseDictLevel{Ti,Ptr,Idx,Val,Tbl,Pool,Lvl}, Tm, style ) where {Ti,Ptr,Idx,Val,Tbl,Pool,Lvl} - lvl_2 = moveto(lvl.lvl, Tm) - ptr_2 = moveto(lvl.ptr, Tm) - idx_2 = moveto(lvl.idx, Tm) - val_2 = moveto(lvl.val, Tm) - tbl_2 = moveto(lvl.tbl, Tm) - pool_2 = moveto(lvl.pool, Tm) + lvl_2 = transfer(lvl.lvl, Tm, style) + ptr_2 = transfer(lvl.ptr, Tm, style) + idx_2 = transfer(lvl.idx, Tm, style) + val_2 = transfer(lvl.val, Tm, style) + tbl_2 = transfer(lvl.tbl, Tm, style) + pool_2 = transfer(lvl.pool, Tm, style) return SparseDictLevel{Ti}(lvl_2, lvl.shape, ptr_2, idx_2, val_2, tbl_2, pool_2) end @@ -385,7 +385,7 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel, pos_sto return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) idx_2 = freshen(ctx, lvl.idx) tbl_2 = freshen(ctx, lvl.tbl_2) @@ -393,7 +393,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel ctx, quote $tbl_2 = $(lvl.tbl) - $(lvl.tbl) = $moveto($(lvl.tbl), $(ctx(arch))) + $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch)), style) end, ) push_epilogue!( @@ -402,7 +402,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseDictLevel $(lvl.tbl) = $tbl_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function unfurl( diff --git a/src/tensors/levels/sparse_interval_levels.jl b/src/tensors/levels/sparse_interval_levels.jl index 9fcdf6967..1c6fce27c 100644 --- 
a/src/tensors/levels/sparse_interval_levels.jl +++ b/src/tensors/levels/sparse_interval_levels.jl @@ -63,10 +63,10 @@ function postype(::Type{SparseIntervalLevel{Ti,Left,Right,Lvl}}) where {Ti,Left, return postype(Lvl) end -function moveto(lvl::SparseIntervalLevel{Ti,Left,Right,Lvl}, Tm) where {Ti,Left,Right,Lvl} - lvl_2 = moveto(lvl.lvl, Tm) - left_2 = moveto(lvl.left, Tm) - right_2 = moveto(lvl.right, Tm) +function transfer(lvl::SparseIntervalLevel{Ti,Left,Right,Lvl}, Tm, style) where {Ti,Left,Right,Lvl} + lvl_2 = transfer(lvl.lvl, Tm, style) + left_2 = transfer(lvl.left, Tm, style) + right_2 = transfer(lvl.right, Tm, style) return SparseIntervalLevel{Ti}(lvl_2, lvl.shape, left_2, right_2) end diff --git a/src/tensors/levels/sparse_list_levels.jl b/src/tensors/levels/sparse_list_levels.jl index fa83d9dee..c1cf11094 100644 --- a/src/tensors/levels/sparse_list_levels.jl +++ b/src/tensors/levels/sparse_list_levels.jl @@ -61,10 +61,10 @@ function postype(::Type{SparseListLevel{Ti,Ptr,Idx,Lvl}}) where {Ti,Ptr,Idx,Lvl} return postype(Lvl) end -function moveto(lvl::SparseListLevel{Ti,Ptr,Idx,Lvl}, Tm) where {Ti,Ptr,Idx,Lvl} - lvl_2 = moveto(lvl.lvl, Tm) - ptr_2 = moveto(lvl.ptr, Tm) - idx_2 = moveto(lvl.idx, Tm) +function transfer(lvl::SparseListLevel{Ti,Ptr,Idx,Lvl}, Tm, style) where {Ti,Ptr,Idx,Lvl} + lvl_2 = transfer(lvl.lvl, Tm, style) + ptr_2 = transfer(lvl.ptr, Tm, style) + idx_2 = transfer(lvl.idx, Tm, style) return SparseListLevel{Ti}(lvl_2, lvl.shape, ptr_2, idx_2) end @@ -313,7 +313,7 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualSparseListLevel, pos_sto return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) idx_2 = freshen(ctx, lvl.idx) push_preamble!( @@ -321,8 +321,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel quote $ptr_2 = 
$(lvl.ptr) $idx_2 = $(lvl.idx) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.idx) = $moveto($(lvl.idx), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.idx) = $transfer($(lvl.idx), $(ctx(arch)), style) end, ) push_epilogue!( @@ -332,7 +332,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseListLevel $(lvl.idx) = $idx_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function unfurl( diff --git a/src/tensors/levels/sparse_point_levels.jl b/src/tensors/levels/sparse_point_levels.jl index 762bbe20e..18d3e359b 100644 --- a/src/tensors/levels/sparse_point_levels.jl +++ b/src/tensors/levels/sparse_point_levels.jl @@ -55,9 +55,9 @@ function postype(::Type{SparsePointLevel{Ti,Idx,Lvl}}) where {Ti,Idx,Lvl} return postype(Lvl) end -function moveto(lvl::SparsePointLevel{Ti,Idx,Lvl}, Tm) where {Ti,Idx,Lvl} - lvl_2 = moveto(lvl.lvl, Tm) - idx_2 = moveto(lvl.idx, Tm) +function transfer(lvl::SparsePointLevel{Ti,Idx,Lvl}, Tm, style) where {Ti,Idx,Lvl} + lvl_2 = transfer(lvl.lvl, Tm, style) + idx_2 = transfer(lvl.idx, Tm, style) return SparsePointLevel{Ti}(lvl_2, lvl.shape, idx_2) end @@ -254,14 +254,14 @@ function thaw_level!(ctx::AbstractCompiler, lvl::VirtualSparsePointLevel, pos_st return lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparsePointLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparsePointLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) idx_2 = freshen(ctx, lvl.idx) push_preamble!( ctx, quote $idx_2 = $(lvl.idx) - $(lvl.idx) = $moveto($(lvl.idx), $(ctx(arch))) + $(lvl.idx) = $transfer($(lvl.idx), $(ctx(arch)), style) end, ) push_epilogue!( @@ -270,7 +270,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparsePointLeve $(lvl.idx) = $idx_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function unfurl( 
diff --git a/src/tensors/levels/sparse_rle_levels.jl b/src/tensors/levels/sparse_rle_levels.jl index 49de8844a..b250aa3f4 100644 --- a/src/tensors/levels/sparse_rle_levels.jl +++ b/src/tensors/levels/sparse_rle_levels.jl @@ -72,12 +72,12 @@ function postype( return postype(Lvl) end -function moveto(lvl::SparseRunListLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - ptr = moveto(lvl.ptr, device) - left = moveto(lvl.left, device) - right = moveto(lvl.right, device) - buf = moveto(lvl.buf, device) +function transfer(lvl::SparseRunListLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + ptr = transfer(lvl.ptr, device, style) + left = transfer(lvl.left, device, style) + right = transfer(lvl.right, device, style) + buf = transfer(lvl.buf, device, style) return SparseRunListLevel{Ti}( -        lvl_2, lvl.shape, lvl.ptr, lvl.left, lvl.right, lvl.buf; merge=getmerge(lvl) +        lvl_2, lvl.shape, ptr, left, right, buf; merge=getmerge(lvl) ) @@ -303,7 +303,7 @@ function virtual_level_resize!(ctx, lvl::VirtualSparseRunListLevel, dims...) 
lvl end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) left_2 = freshen(ctx, lvl.left) right_2 = freshen(ctx, lvl.right) @@ -313,9 +313,9 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLe $ptr_2 = $(lvl.ptr) $left_2 = $(lvl.left) $right_2 = $(lvl.right) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.left) = $moveto($(lvl.left), $(ctx(arch))) - $(lvl.right) = $moveto($(lvl.right), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.left) = $transfer($(lvl.left), $(ctx(arch)), style) + $(lvl.right) = $transfer($(lvl.right), $(ctx(arch)), style) end, ) push_epilogue!( @@ -326,8 +326,8 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseRunListLe $(lvl.right) = $right_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) - virtual_moveto_level(ctx, lvl.buf, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) + virtual_transfer_level(ctx, lvl.buf, arch, style) end virtual_level_eltype(lvl::VirtualSparseRunListLevel) = virtual_level_eltype(lvl.lvl) diff --git a/src/tensors/levels/sparse_vbl_levels.jl b/src/tensors/levels/sparse_vbl_levels.jl index ae8f6ff2d..e9080aec5 100644 --- a/src/tensors/levels/sparse_vbl_levels.jl +++ b/src/tensors/levels/sparse_vbl_levels.jl @@ -54,11 +54,11 @@ function postype( return postype(Lvl) end -function moveto(lvl::SparseBlockListLevel{Ti}, device) where {Ti} - lvl_2 = moveto(lvl.lvl, device) - ptr_2 = moveto(lvl.ptr, device) - idx_2 = moveto(lvl.idx, device) - ofs_2 = moveto(lvl.ofs, device) +function transfer(lvl::SparseBlockListLevel{Ti}, device, style) where {Ti} + lvl_2 = transfer(lvl.lvl, device, style) + ptr_2 = transfer(lvl.ptr, device, style) + idx_2 = transfer(lvl.idx, device, style) + ofs_2 = transfer(lvl.ofs, device, style) return 
SparseBlockListLevel{Ti}(lvl_2, lvl.shape, ptr_2, idx_2, ofs_2) end @@ -289,7 +289,7 @@ function virtual_level_fill_value(lvl::VirtualSparseBlockListLevel) virtual_level_fill_value(lvl.lvl) end -function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockListLevel, arch) +function virtual_transfer_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockListLevel, arch, style) ptr_2 = freshen(ctx, lvl.ptr) tbl_2 = freshen(ctx, lvl.tbl) ofs_2 = freshen(ctx, lvl.ofs) @@ -299,9 +299,9 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockList $ptr_2 = $(lvl.ptr) $tbl_2 = $(lvl.tbl) $ofs_2 = $(lvl.ofs) - $(lvl.ptr) = $moveto($(lvl.ptr), $(ctx(arch))) - $(lvl.tbl) = $moveto($(lvl.tbl), $(ctx(arch))) - $(lvl.ofs) = $moveto($(lvl.ofs), $(ctx(arch))) + $(lvl.ptr) = $transfer($(lvl.ptr), $(ctx(arch)), style) + $(lvl.tbl) = $transfer($(lvl.tbl), $(ctx(arch)), style) + $(lvl.ofs) = $transfer($(lvl.ofs), $(ctx(arch)), style) end, ) push_epilogue!( @@ -312,7 +312,7 @@ function virtual_moveto_level(ctx::AbstractCompiler, lvl::VirtualSparseBlockList $(lvl.ofs) = $ofs_2 end, ) - virtual_moveto_level(ctx, lvl.lvl, arch) + virtual_transfer_level(ctx, lvl.lvl, arch, style) end function declare_level!(ctx::AbstractCompiler, lvl::VirtualSparseBlockListLevel, pos, init) diff --git a/src/tensors/scalars.jl b/src/tensors/scalars.jl index 11ac313e8..5990032f4 100644 --- a/src/tensors/scalars.jl +++ b/src/tensors/scalars.jl @@ -41,7 +41,7 @@ function virtualize(ctx, ex, ::Type{Scalar{Vf,Tv}}, tag) where {Vf,Tv} VirtualScalar(sym, Tv, Vf, tag, val) end -virtual_moveto(ctx, lvl::VirtualScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualScalar, arch, style) = lvl virtual_size(ctx, ::VirtualScalar) = () @@ -147,7 +147,7 @@ virtual_size(ctx, ::VirtualSparseScalar) = () virtual_fill_value(ctx, tns::VirtualSparseScalar) = tns.Vf virtual_eltype(tns::VirtualSparseScalar, ctx) = tns.Tv -virtual_moveto(ctx, lvl::VirtualSparseScalar, arch) = lvl 
+virtual_transfer(ctx, lvl::VirtualSparseScalar, arch, style) = lvl function declare!(ctx, tns::VirtualSparseScalar, init) push_preamble!( @@ -289,7 +289,7 @@ function lower_assign(ctx, tns::VirtualShortCircuitScalar, mode, op, rhs) :($(tns.val) = $lhs_2) end -virtual_moveto(ctx, lvl::VirtualShortCircuitScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualShortCircuitScalar, arch, style) = lvl function short_circuit_cases(ctx, tns::VirtualShortCircuitScalar, op) [ @@ -359,7 +359,7 @@ virtual_size(ctx, ::VirtualSparseShortCircuitScalar) = () virtual_fill_value(ctx, tns::VirtualSparseShortCircuitScalar) = tns.Vf virtual_eltype(tns::VirtualSparseShortCircuitScalar, ctx) = tns.Tv -virtual_moveto(ctx, lvl::VirtualSparseShortCircuitScalar, arch) = lvl +virtual_transfer(ctx, lvl::VirtualSparseShortCircuitScalar, arch, style) = lvl function declare!(ctx, tns::VirtualSparseShortCircuitScalar, init) push_preamble!( diff --git a/src/util/vectors.jl b/src/util/vectors.jl index bcf901a2c..975bc6117 100644 --- a/src/util/vectors.jl +++ b/src/util/vectors.jl @@ -31,8 +31,8 @@ Base.size(vec::PlusOneVector{T}) where {T} = size(vec.data) Base.axes(vec::PlusOneVector{T}) where {T} = axes(vec.data) Base.resize!(vec::PlusOneVector{T}, dim) where {T} = resize!(vec.data, dim) -function moveto(vec::PlusOneVector{T}, device) where {T} - data = moveto(vec.data, device) +function transfer(vec::PlusOneVector{T}, device, style) where {T} + data = transfer(vec.data, device, style) return PlusOneVector{T}(data) end @@ -77,8 +77,8 @@ Base.size(vec::MinusEpsVector{T}) where {T} = size(vec.data) Base.axes(vec::MinusEpsVector{T}) where {T} = axes(vec.data) Base.resize!(vec::MinusEpsVector{T}, dim) where {T} = resize!(vec.data, dim) -function moveto(vec::MinusEpsVector{T}, device) where {T} - data = moveto(vec.data, device) +function transfer(vec::MinusEpsVector{T}, device, style) where {T} + data = transfer(vec.data, device, style) return MinusEpsVector{T}(data) end @@ -123,7 +123,7 @@ 
Base.size(vec::PlusEpsVector{T}) where {T} = size(vec.data) Base.axes(vec::PlusEpsVector{T}) where {T} = axes(vec.data) Base.resize!(vec::PlusEpsVector{T}, dim) where {T} = resize!(vec.data, dim) -function moveto(vec::PlusEpsVector{T}, device) where {T} - data = moveto(vec.data, device) +function transfer(vec::PlusEpsVector{T}, device, style) where {T} + data = transfer(vec.data, device, style) return PlusEpsVector{T}(data) end diff --git a/test/reference32/parallel/atomics_sym_spmv.txt b/test/reference32/parallel/atomics_sym_spmv.txt index 07ed8cfd8..404376299 100644 --- a/test/reference32/parallel/atomics_sym_spmv.txt +++ b/test/reference32/parallel/atomics_sym_spmv.txt @@ -23,15 +23,15 @@ begin end Finch.resize_if_smaller!(y_lvl_2_val, x_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0.0, 1, x_lvl.shape) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val_3 = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) - diag_lvl_val = (Finch).moveto(diag_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) + diag_lvl_val = (Finch).transfer(diag_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git 
a/test/reference32/parallel/parallel_blur.jl b/test/reference32/parallel/parallel_blur.jl index 896ebf877..6c7d8fb41 100644 --- a/test/reference32/parallel/parallel_blur.jl +++ b/test/reference32/parallel/parallel_blur.jl @@ -20,14 +20,14 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_2_val = (Finch).moveto(input_lvl_2_val, cpu) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu, style) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).moveto(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu, style) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).moveto(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial()), style) res_6 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference32/parallel/parallel_blur_sparse.jl b/test/reference32/parallel/parallel_blur_sparse.jl index 0410f18e7..4e13fffba 100644 --- a/test/reference32/parallel/parallel_blur_sparse.jl +++ b/test/reference32/parallel/parallel_blur_sparse.jl @@ -22,16 +22,16 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_ptr = (Finch).moveto(input_lvl_ptr, cpu) - input_lvl_idx = (Finch).moveto(input_lvl_idx, cpu) - input_lvl_2_val = (Finch).moveto(input_lvl_2_val, cpu) + input_lvl_ptr = (Finch).transfer(input_lvl_ptr, cpu, style) + input_lvl_idx = (Finch).transfer(input_lvl_idx, cpu, style) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu, style) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).moveto(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu, 
style) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).moveto(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial()), style) res_71 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_1.txt b/test/reference32/parallel/parallel_spmms_no_atomics_1.txt index be19b7230..f746834f0 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_1.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_1.txt @@ -20,15 +20,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = 
(Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_2.txt b/test/reference32/parallel/parallel_spmms_no_atomics_2.txt index 7bf416ea1..1adeb98b6 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_2.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_2.txt @@ -21,23 +21,23 @@ begin Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) for i_4 = 1:A_lvl.shape[1] val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_ptr_2 = B_lvl_ptr - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_2 = B_lvl_tbl2 - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) val_2 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_3 = A_lvl_val - A_lvl_val = 
(Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_3.txt b/test/reference32/parallel/parallel_spmms_no_atomics_3.txt index 56a2553c8..5bd9abec3 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_3.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_3.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() 
Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_4.txt b/test/reference32/parallel/parallel_spmms_no_atomics_4.txt index e317e6193..2d6bd4cce 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_4.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_4.txt @@ -54,20 +54,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_2 = B_lvl_tbl2 val_2 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_3 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_9 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -186,20 +186,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_18 val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = 
(Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_3 = B_lvl_tbl2 val_5 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_6 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_19 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmms_no_atomics_5.txt b/test/reference32/parallel/parallel_spmms_no_atomics_5.txt index c143fcbd1..b9b21407f 100644 --- a/test/reference32/parallel/parallel_spmms_no_atomics_5.txt +++ b/test/reference32/parallel/parallel_spmms_no_atomics_5.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x00000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = 
(Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -68,21 +68,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_5 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) B_lvl_ptr_3 = B_lvl_ptr B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) 
B_lvl_tbl2_3 = B_lvl_tbl2 val_6 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_10 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -202,21 +202,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_19 val_7 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) A_lvl_ptr_4 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_4 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_4 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_8 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) B_lvl_ptr_4 = B_lvl_ptr B_lvl_tbl1_4 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_4 = B_lvl_tbl2 val_9 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_20 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmv.txt b/test/reference32/parallel/parallel_spmv.txt index 8d8ae1101..2c0b26d0d 100644 --- a/test/reference32/parallel/parallel_spmv.txt +++ b/test/reference32/parallel/parallel_spmv.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl.shape) 
Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl.shape) val = y_lvl_val - y_lvl_val = (Finch).moveto(y_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmv_atomic.txt b/test/reference32/parallel/parallel_spmv_atomic.txt index 9bf5012a5..88413dcc0 100644 --- a/test/reference32/parallel/parallel_spmv_atomic.txt +++ b/test/reference32/parallel/parallel_spmv_atomic.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl_2.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl_2.shape) val = y_lvl_val - y_lvl_val = (Finch).moveto(y_lvl_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) Threads.@threads for 
i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/parallel_spmv_atomics.txt b/test/reference32/parallel/parallel_spmv_atomics.txt index f5c707ef8..dc4cbf25e 100644 --- a/test/reference32/parallel/parallel_spmv_atomics.txt +++ b/test/reference32/parallel/parallel_spmv_atomics.txt @@ -27,13 +27,13 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, A_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, A_lvl.shape) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference32/parallel/stress_dense_atomics.txt b/test/reference32/parallel/stress_dense_atomics.txt index 7f03510f5..994f11179 100644 --- a/test/reference32/parallel/stress_dense_atomics.txt +++ b/test/reference32/parallel/stress_dense_atomics.txt @@ -30,11 +30,11 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, y_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, y_lvl.shape) resize!(x_lvl_val, x_lvl.shape) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, 
CPU(Threads.nthreads()), style) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val_2 = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) Threads.@threads for i = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/atomics_sym_spmv.txt b/test/reference64/parallel/atomics_sym_spmv.txt index 18325a90e..22059b911 100644 --- a/test/reference64/parallel/atomics_sym_spmv.txt +++ b/test/reference64/parallel/atomics_sym_spmv.txt @@ -23,15 +23,15 @@ begin end Finch.resize_if_smaller!(y_lvl_2_val, x_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0.0, 1, x_lvl.shape) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val_3 = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) - diag_lvl_val = (Finch).moveto(diag_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) + diag_lvl_val = (Finch).transfer(diag_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff 
--git a/test/reference64/parallel/parallel_blur.jl b/test/reference64/parallel/parallel_blur.jl index 7284481c6..81015e06b 100644 --- a/test/reference64/parallel/parallel_blur.jl +++ b/test/reference64/parallel/parallel_blur.jl @@ -20,14 +20,14 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_2_val = (Finch).moveto(input_lvl_2_val, cpu) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu, style) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).moveto(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu, style) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).moveto(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial()), style) res_6 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference64/parallel/parallel_blur_sparse.jl b/test/reference64/parallel/parallel_blur_sparse.jl index befeec956..c8b921542 100644 --- a/test/reference64/parallel/parallel_blur_sparse.jl +++ b/test/reference64/parallel/parallel_blur_sparse.jl @@ -22,16 +22,16 @@ begin pos_stop = input_lvl_2.shape * input_lvl.shape Finch.resize_if_smaller!(output_lvl_2_val, pos_stop) Finch.fill_range!(output_lvl_2_val, 0.0, 1, pos_stop) - input_lvl_ptr = (Finch).moveto(input_lvl_ptr, cpu) - input_lvl_idx = (Finch).moveto(input_lvl_idx, cpu) - input_lvl_2_val = (Finch).moveto(input_lvl_2_val, cpu) + input_lvl_ptr = (Finch).transfer(input_lvl_ptr, cpu, style) + input_lvl_idx = (Finch).transfer(input_lvl_idx, cpu, style) + input_lvl_2_val = (Finch).transfer(input_lvl_2_val, cpu, style) val_2 = output_lvl_2_val - output_lvl_2_val = (Finch).moveto(output_lvl_2_val, cpu) + output_lvl_2_val = (Finch).transfer(output_lvl_2_val, cpu, 
style) Threads.@threads for i = 1:cpu.n Finch.@barrier begin @inbounds @fastmath(begin val_3 = tmp_lvl_val - tmp_lvl_val = (Finch).moveto(tmp_lvl_val, CPUThread(i, cpu, Serial())) + tmp_lvl_val = (Finch).transfer(tmp_lvl_val, CPUThread(i, cpu, Serial()), style) res_71 = begin phase_start_2 = max(1, 1 + fld(y_stop * (-1 + i), cpu.n)) phase_stop_2 = min(y_stop, fld(y_stop * i, cpu.n)) diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_1.txt b/test/reference64/parallel/parallel_spmms_no_atomics_1.txt index a4986cd00..5d388fa3d 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_1.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_1.txt @@ -20,15 +20,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = 
(Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_2.txt b/test/reference64/parallel/parallel_spmms_no_atomics_2.txt index b4a0d757b..e5eca22dc 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_2.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_2.txt @@ -21,23 +21,23 @@ begin Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) for i_4 = 1:A_lvl.shape[1] val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_ptr_2 = B_lvl_ptr - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_2 = B_lvl_tbl2 - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) val_2 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_3 = A_lvl_val - A_lvl_val 
= (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_5 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_3.txt b/test/reference64/parallel/parallel_spmms_no_atomics_3.txt index ed44c6465..12ae422f7 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_3.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_3.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 
1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_4.txt b/test/reference64/parallel/parallel_spmms_no_atomics_4.txt index 0d4601fa7..1676382bc 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_4.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_4.txt @@ -54,20 +54,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_tbl1_2 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_2 = B_lvl_tbl2 val_2 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_2 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_2 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_2 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_3 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_9 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -186,20 +186,20 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_18 val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) B_lvl_tbl1_3 = B_lvl_tbl1 - 
B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_3 = B_lvl_tbl2 val_5 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_6 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_19 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmms_no_atomics_5.txt b/test/reference64/parallel/parallel_spmms_no_atomics_5.txt index 49ca7ab16..e7b18e235 100644 --- a/test/reference64/parallel/parallel_spmms_no_atomics_5.txt +++ b/test/reference64/parallel/parallel_spmms_no_atomics_5.txt @@ -19,15 +19,15 @@ begin Finch.resize_if_smaller!(Ct_lvl_2_val, pos_stop) Finch.fill_range!(Ct_lvl_2_val, 0x0000000000000000, 1, pos_stop) val = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) - B_lvl_ptr = (Finch).moveto(B_lvl_ptr, CPU(Threads.nthreads())) - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) - B_lvl_tbl2 = (Finch).moveto(B_lvl_tbl2, CPU(Threads.nthreads())) - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) - 
A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) + B_lvl_ptr = (Finch).transfer(B_lvl_ptr, CPU(Threads.nthreads()), style) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) + B_lvl_tbl2 = (Finch).transfer(B_lvl_tbl2, CPU(Threads.nthreads()), style) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -68,21 +68,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + B_lvl_i val_4 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) A_lvl_ptr_3 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_3 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_3 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_5 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) B_lvl_ptr_3 = B_lvl_ptr B_lvl_tbl1_3 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), 
style) B_lvl_tbl2_3 = B_lvl_tbl2 val_6 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_10 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin @@ -202,21 +202,21 @@ begin end Ct_lvl_q = (1 - 1) * B_lvl.shape[2] + phase_stop_19 val_7 = Ct_lvl_2_val - Ct_lvl_2_val = (Finch).moveto(Ct_lvl_2_val, CPU(Threads.nthreads())) + Ct_lvl_2_val = (Finch).transfer(Ct_lvl_2_val, CPU(Threads.nthreads()), style) A_lvl_ptr_4 = A_lvl_ptr - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) A_lvl_tbl1_4 = A_lvl_tbl1 - A_lvl_tbl1 = (Finch).moveto(A_lvl_tbl1, CPU(Threads.nthreads())) + A_lvl_tbl1 = (Finch).transfer(A_lvl_tbl1, CPU(Threads.nthreads()), style) A_lvl_tbl2_4 = A_lvl_tbl2 - A_lvl_tbl2 = (Finch).moveto(A_lvl_tbl2, CPU(Threads.nthreads())) + A_lvl_tbl2 = (Finch).transfer(A_lvl_tbl2, CPU(Threads.nthreads()), style) val_8 = A_lvl_val - A_lvl_val = (Finch).moveto(A_lvl_val, CPU(Threads.nthreads())) + A_lvl_val = (Finch).transfer(A_lvl_val, CPU(Threads.nthreads()), style) B_lvl_ptr_4 = B_lvl_ptr B_lvl_tbl1_4 = B_lvl_tbl1 - B_lvl_tbl1 = (Finch).moveto(B_lvl_tbl1, CPU(Threads.nthreads())) + B_lvl_tbl1 = (Finch).transfer(B_lvl_tbl1, CPU(Threads.nthreads()), style) B_lvl_tbl2_4 = B_lvl_tbl2 val_9 = B_lvl_val - B_lvl_val = (Finch).moveto(B_lvl_val, CPU(Threads.nthreads())) + B_lvl_val = (Finch).transfer(B_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_20 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmv.txt b/test/reference64/parallel/parallel_spmv.txt index 407116f23..54f2b6ab4 100644 --- a/test/reference64/parallel/parallel_spmv.txt +++ b/test/reference64/parallel/parallel_spmv.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl.shape) 
Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl.shape) val = y_lvl_val - y_lvl_val = (Finch).moveto(y_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmv_atomic.txt b/test/reference64/parallel/parallel_spmv_atomic.txt index 5d071fa23..4b945c401 100644 --- a/test/reference64/parallel/parallel_spmv_atomic.txt +++ b/test/reference64/parallel/parallel_spmv_atomic.txt @@ -13,11 +13,11 @@ begin Finch.resize_if_smaller!(y_lvl_val, A_lvl_2.shape) Finch.fill_range!(y_lvl_val, 0.0, 1, A_lvl_2.shape) val = y_lvl_val - y_lvl_val = (Finch).moveto(y_lvl_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_val = (Finch).transfer(y_lvl_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) Threads.@threads for 
i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/parallel_spmv_atomics.txt b/test/reference64/parallel/parallel_spmv_atomics.txt index 8bb3bd88d..5d04f2e3a 100644 --- a/test/reference64/parallel/parallel_spmv_atomics.txt +++ b/test/reference64/parallel/parallel_spmv_atomics.txt @@ -27,13 +27,13 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, A_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, A_lvl.shape) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) - A_lvl_ptr = (Finch).moveto(A_lvl_ptr, CPU(Threads.nthreads())) - A_lvl_idx = (Finch).moveto(A_lvl_idx, CPU(Threads.nthreads())) - A_lvl_2_val = (Finch).moveto(A_lvl_2_val, CPU(Threads.nthreads())) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) + A_lvl_ptr = (Finch).transfer(A_lvl_ptr, CPU(Threads.nthreads()), style) + A_lvl_idx = (Finch).transfer(A_lvl_idx, CPU(Threads.nthreads()), style) + A_lvl_2_val = (Finch).transfer(A_lvl_2_val, CPU(Threads.nthreads()), style) + x_lvl_val = (Finch).transfer(x_lvl_val, CPU(Threads.nthreads()), style) Threads.@threads for i_4 = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/reference64/parallel/stress_dense_atomics.txt b/test/reference64/parallel/stress_dense_atomics.txt index 67b2308ed..71792f516 100644 --- a/test/reference64/parallel/stress_dense_atomics.txt +++ b/test/reference64/parallel/stress_dense_atomics.txt @@ -30,11 +30,11 @@ quote Finch.resize_if_smaller!(y_lvl_2_val, y_lvl.shape) Finch.fill_range!(y_lvl_2_val, 0, 1, y_lvl.shape) resize!(x_lvl_val, x_lvl.shape) - x_lvl_val = (Finch).moveto(x_lvl_val, CPU(Threads.nthreads())) + x_lvl_val = (Finch).transfer(x_lvl_val, 
CPU(Threads.nthreads()), style) locksArray = y_lvl_locks - y_lvl_locks = (Finch).moveto(y_lvl_locks, CPU(Threads.nthreads())) + y_lvl_locks = (Finch).transfer(y_lvl_locks, CPU(Threads.nthreads()), style) val_2 = y_lvl_2_val - y_lvl_2_val = (Finch).moveto(y_lvl_2_val, CPU(Threads.nthreads())) + y_lvl_2_val = (Finch).transfer(y_lvl_2_val, CPU(Threads.nthreads()), style) Threads.@threads for i = 1:Threads.nthreads() Finch.@barrier begin @inbounds @fastmath(begin diff --git a/test/suites/parallel_tests.jl b/test/suites/parallel_tests.jl index 2b7b3e457..5e0388407 100644 --- a/test/suites/parallel_tests.jl +++ b/test/suites/parallel_tests.jl @@ -554,7 +554,7 @@ input = Tensor(Dense(Dense(Element(0.0)))) output = Tensor(Dense(Dense(Element(0.0)))) cpu = CPU(Threads.nthreads()) - tmp = moveto(Tensor(Dense(Element(0))), CPULocalMemory(cpu)) + tmp = transfer(Tensor(Dense(Element(0))), CPULocalMemory(cpu), style) check_output( "parallel/parallel_blur.jl", @@ -579,7 +579,7 @@ input = Tensor(Dense(SparseList(Element(0.0)))) output = Tensor(Dense(Dense(Element(0.0)))) cpu = CPU(Threads.nthreads()) - tmp = moveto(Tensor(Dense(Element(0))), CPULocalMemory(cpu)) + tmp = transfer(Tensor(Dense(Element(0))), CPULocalMemory(cpu), style) check_output( "parallel/parallel_blur_sparse.jl",