From 5d60a36a51564151a012ff31c338b3310a51d464 Mon Sep 17 00:00:00 2001
From: Christian Guinard
Date: Sat, 12 Aug 2023 04:23:23 -0300
Subject: [PATCH] Improve docs (#235)

---
 README.md                                     |  8 ++---
 docs/make.jl                                  | 15 ++++++---
 docs/src/api/array.md                         | 10 ++++--
 docs/src/api/kernel.md                        |  1 +
 docs/src/api/mps.md                           | 28 +++++++++++++++++
 docs/src/{ => faq}/contributing.md            |  9 ++++--
 docs/src/{ => faq}/faq.md                     |  4 ++-
 docs/src/index.md                             |  8 ++---
 docs/src/profiling.md                         |  4 +--
 docs/src/usage/array.md                       | 22 ++++++-------
 .../{metal-programming.md => usage/kernel.md} | 31 ++++++-------------
 docs/src/usage/overview.md                    |  5 ---
 lib/mps/matrix.jl                             | 15 ++++++---
 lib/mps/vector.jl                             | 23 ++++++++------
 src/array.jl                                  |  8 ++++-
 15 files changed, 119 insertions(+), 72 deletions(-)
 create mode 100644 docs/src/api/mps.md
 rename docs/src/{ => faq}/contributing.md (93%)
 rename docs/src/{ => faq}/faq.md (81%)
 rename docs/src/{metal-programming.md => usage/kernel.md} (84%)

diff --git a/README.md b/README.md
index 7e94c54cf..0d76320f6 100644
--- a/README.md
+++ b/README.md
@@ -64,15 +64,15 @@ importing the package:
 julia> using Metal
 
 julia> Metal.versioninfo()
-macOS 13.3.1, Darwin 22.4.0
+macOS 13.5.0, Darwin 22.6.0
 
 Toolchain:
-- Julia: 1.9.0
+- Julia: 1.9.3
 - LLVM: 14.0.6
 
 Julia packages:
-- Metal.jl: 0.3.0
-- Metal_LLVM_Tools_jll: 0.3.0+2
+- Metal.jl: 0.5.0
+- Metal_LLVM_Tools_jll: 0.5.1+0
 
 1 device:
 - Apple M2 Max (64.000 KiB allocated)
diff --git a/docs/make.jl b/docs/make.jl
index 5ed0703f9..ba3cecd27 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -25,18 +25,23 @@ function main()
         modules = [Metal],
         pages = Any[
             "Home" => "index.md",
+            "Usage" => Any[
+                "usage/overview.md",
+                "usage/array.md",
+                "usage/kernel.md",
+            ],
+            "Profiling" => "profiling.md",
             "API reference" => Any[
                 "api/essentials.md",
                 "api/compiler.md",
                 "api/kernel.md",
                 "api/array.md",
+                "api/mps.md",
             ],
-            "Usage" => Any[
-                "usage/overview.md",
-                "usage/array.md",
+            "FAQ" => Any[
+                "faq/faq.md",
+                "faq/contributing.md",
             ],
-            "Profiling" => "profiling.md",
-            "FAQ" => "faq.md",
         ]
     )
 end
diff --git a/docs/src/api/array.md b/docs/src/api/array.md
index d6fb3124f..0fb07e2da 100644
--- a/docs/src/api/array.md
+++ b/docs/src/api/array.md
@@ -1,4 +1,10 @@
 # Array programming
 
-The Metal array type, `MtlArray`, generally implements the Base array interface and all of its
-expected methods.
\ No newline at end of file
+The Metal array type, `MtlArray`, generally implements the Base array interface
+and all of its expected methods.
+
+However, there is the special function `mtl` for transferring an array to the GPU. For compatibility reasons, it will automatically convert arrays of `Float64` to `Float32`.
+
+```@docs
+mtl
+```
\ No newline at end of file
diff --git a/docs/src/api/kernel.md b/docs/src/api/kernel.md
index b69527e18..baaab1d6d 100644
--- a/docs/src/api/kernel.md
+++ b/docs/src/api/kernel.md
@@ -53,6 +53,7 @@ MtlThreadGroupArray
 ## Synchronization
 
 ```@docs
+MemoryFlags
 threadgroup_barrier
 simdgroup_barrier
 ```
\ No newline at end of file
diff --git a/docs/src/api/mps.md b/docs/src/api/mps.md
new file mode 100644
index 000000000..b743e36ee
--- /dev/null
+++ b/docs/src/api/mps.md
@@ -0,0 +1,28 @@
+# Metal Performance Shaders
+
+This section lists the package's public functionality that corresponds to the Metal
+Performance Shaders functions.
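In practice most of this functionality is reached through standard Julia array and `LinearAlgebra` calls on `MtlArray`s rather than by invoking the `MPS` wrappers directly. A minimal sketch of what that looks like, assuming an MPS-supported element type such as `Float32` (whether a particular call is MPS-accelerated depends on the argument types):

```julia
using Metal, LinearAlgebra

A = mtl(rand(Float32, 8, 8))
B = mtl(rand(Float32, 8, 8))
x = mtl(rand(Float32, 8))

C = A * B   # matrix-matrix product, expected to dispatch to the MPS-backed mul! methods
y = A * x   # matrix-vector product, likewise
```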
+For more information about these functions, or to see
+which functions have yet to be implemented in this package, please consult
+the [Metal Performance Shaders Documentation](https://developer.apple.com/documentation/metalperformanceshaders?language=objc).
+
+## Matrices and Vectors
+
+```@docs
+MPS.MPSMatrix
+MPS.MPSVector
+```
+
+### Matrix Arithmetic Operators
+
+```@docs
+MPS.matmul!
+MPS.matvecmul!
+MPS.topk
+MPS.topk!
+```
+
+### Linear Algebra
+
+Many of the currently implemented MPS functions are for linear algebra operations.
+Therefore, you use them by calling the corresponding LinearAlgebra function with an
+`MtlArray`. They are nonetheless listed below:
diff --git a/docs/src/contributing.md b/docs/src/faq/contributing.md
similarity index 93%
rename from docs/src/contributing.md
rename to docs/src/faq/contributing.md
index 404b610b4..29c45fe85 100644
--- a/docs/src/contributing.md
+++ b/docs/src/faq/contributing.md
@@ -43,7 +43,7 @@ skip the first two steps.
 Some Metal functions map directly to Apple intermediate representation intrinsics. In this
 case, wrapping them into Metal.jl is relatively easy. All that needs to be done is to create
 a mapping from a Julia function via a simple ccall. See the
-[threadgroup barrier implementation](../../src/device/intrinsics/synchronization.jl#L43) for
+[threadgroup barrier implementation](https://github.com/JuliaGPU/Metal.jl/blob/main/src/device/intrinsics/synchronization.jl#L43-L44) for
 reference.
 
 However, the Metal documentation doesn't tell you what the format of the intrinsic names
@@ -84,7 +84,10 @@ optimized away. Double-check that the kernel's IR makes sense for what you wrote
 Metal exposes a special interface to its library of optimized kernels. Rather than accepting
 the normal set of input GPU data structures, it requires special `MPS` datatypes that assume
 row-major memory layout. As this is not the Julia default, adapt accordingly. Adding MPS
-functionality should be mostly straightforward, so this can be an easy entrypoint to helping.
+functionality should be mostly straightforward, so this can be an easy entry point to helping.
+To get started, you can have a look at the [Metal Performance Shaders
+Documentation](https://developer.apple.com/documentation/metalperformanceshaders?language=objc)
+from Apple.
 
 ## Exposing your Interface
 
@@ -97,7 +100,7 @@ The only thing beyond this is exporting into the global namespace. That would be
 functions/structures/macros with clear and common use-cases (`MtlArray` or `@metal`).
 
 Additionally, you can override non-Metal.jl functions like `LinearAlgebra.mul!` seen
-[here](../../lib/mps/linalg.jl#L63). This is essentially (ab)using multiple dispatch to
+[here](https://github.com/JuliaGPU/Metal.jl/blob/main/lib/mps/linalg.jl#L34). This is essentially (ab)using multiple dispatch to
 specialize for certain cases (usually for more performant execution).
 
 If your function is only available from within GPU kernels (like thread indexing intrinsics).
diff --git a/docs/src/faq.md b/docs/src/faq/faq.md
similarity index 81%
rename from docs/src/faq.md
rename to docs/src/faq/faq.md
index 2abed926b..53c130c75 100644
--- a/docs/src/faq.md
+++ b/docs/src/faq/faq.md
@@ -1,5 +1,7 @@
+# Frequently Asked Questions
+
 ## Can you wrap this Metal API?
 
 Most likely. Any help on designing or implementing high-level wrappers for MSL's low-level
 functionality is greatly appreciated, so please consider [contributing](contributing.md) your uses of these APIs on the
-respective repositories.
\ No newline at end of file
+respective repositories.
diff --git a/docs/src/index.md b/docs/src/index.md
index dbf393936..9d6f22b9a 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,6 +1,6 @@
 # MacOS GPU programming in Julia
 
-The Metal.jl package is the main entrypoint for GPU programming on MacOS in Julia. The package
+The Metal.jl package is the main entry point for GPU programming on MacOS in Julia. The package
 makes it possible to do so at various abstraction levels, from easy-to-use arrays down to
 hand-written kernels using low-level Metal APIs.
@@ -37,13 +37,13 @@ Pkg.test("Metal")
 The following resources may also be of interest (although are mainly focused on the CUDA GPU
 backend):
 
-- Effectively using GPUs with Julia: [video](https://www.youtube.com/watch?v=7Yq1UyncDNc),
+- Effectively using GPUs with Julia:
   [slides](https://docs.google.com/presentation/d/1l-BuAtyKgoVYakJSijaSqaTL3friESDyTOnU2OLqGoA/)
 - How Julia is compiled to GPUs: [video](https://www.youtube.com/watch?v=Fz-ogmASMAE)
 
 ## Contributing
 
-If you want to help improve this package, look at [the contributing page](contributing.md) for more details.
+If you want to help improve this package, look at [the contributing page](faq/contributing.md) for more details.
 
 ## Acknowledgements
@@ -60,5 +60,5 @@ Some of the software in this ecosystem was developed as part of academic research
 would like to help support it, please star the repository as such metrics may help us secure
 funding in the future. If you use our software as part of your research, teaching, or other
 activities, we would be grateful if you could cite our work. The
-[CITATION.bib](https://github.com/JuliaGPU/Metal.jl/blob/master/CITATION.bib) file in the
+[CITATION.cff](https://github.com/JuliaGPU/Metal.jl/blob/main/CITATION.cff) file in the
 root of this repository lists the relevant papers.
diff --git a/docs/src/profiling.md b/docs/src/profiling.md
index 4e972e8db..c2c14dce7 100644
--- a/docs/src/profiling.md
+++ b/docs/src/profiling.md
@@ -18,8 +18,8 @@ Note that the allocations as reported by BenchmarkTools are CPU allocations.
 ## Application profiling
 
-For profiling large applications, simple timings are insufficient. Instead, we want a
-overview of how and when the GPU was active, to avoid times where the device was idle and/or
+For profiling large applications, simple timings are insufficient. Instead, we want an
+overview of how and when the GPU was active to avoid times where the device was idle and/or
 find which kernels needs optimization. As we cannot use the Julia profiler for this task,
 we will use Metal's GPU profiler directly.
diff --git a/docs/src/usage/array.md b/docs/src/usage/array.md
index c1199255a..1c94126d6 100644
--- a/docs/src/usage/array.md
+++ b/docs/src/usage/array.md
@@ -26,18 +26,18 @@ The `MtlArray` type aims to implement the `AbstractArray` interface, and
 provide implementations of methods that are commonly used when working with arrays.
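A quick aside that ties the profiling hunk above to the array operations shown below: integration timings of `MtlArray` operations can be taken with BenchmarkTools. This is a hedged sketch, assuming `Metal.@sync` is available to wait for the GPU work, as with the other JuliaGPU back ends:

```julia
using Metal, BenchmarkTools

a = mtl(rand(Float32, 1024, 1024))
b = mtl(rand(Float32, 1024, 1024))

# Synchronize inside the benchmark so the measurement covers the GPU execution,
# not just the asynchronous launch; reported allocations are CPU allocations.
@btime Metal.@sync $a .+ $b;
```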
 That means you can construct `MtlArray`s in the same way as regular `Array` objects:
 
-```julia
+```jldoctest
 julia> MtlArray{Int}(undef, 2)
-2-element MtlVector{Int64}:
+2-element MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
 0
 0
 
 julia> MtlArray{Int}(undef, (1,2))
-1×2 MtlMatrix{Int64}:
+1×2 MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
 0 0
 
 julia> similar(ans)
-1×2 MtlMatrix{Int64}:
+1×2 MtlMatrix{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
 0 0
 ```
@@ -46,7 +46,7 @@ Copying memory to or from the GPU can be expressed using constructors as well, o
 ```jldoctest
 julia> a = MtlArray([1,2])
-2-element MtlVector{Int64}:
+2-element MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}:
 1
 2
@@ -73,11 +73,11 @@ perform simple element-wise operations you can use `map` or `broadcast`:
 julia> a = MtlArray{Float32}(undef, (1,2));
 
 julia> a .= 5
-1×2 MtlMatrix{Float32}:
+1×2 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
 5.0 5.0
 
 julia> map(sin, a)
-1×2 MtlMatrix{Float32}:
+1×2 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
 -0.958924 -0.958924
 ```
@@ -86,7 +86,7 @@ To reduce the dimensionality of arrays, Metal.jl implements the various flavours
 ```jldoctest
 julia> a = Metal.ones(2,3)
-2×3 MtlMatrix{Float32}:
+2×3 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
 1.0 1.0 1.0
 1.0 1.0 1.0
 
 julia> reduce(+, a)
 6.0f0
 
 julia> mapreduce(sin, *, a; dims=2)
-2×1 MtlMatrix{Float32}:
+2×1 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
 0.59582335
 0.59582335
 
 julia> b = Metal.zeros(1)
-1-element MtlVector{Float32}:
+1-element MtlVector{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
 0.0
 
 julia> Base.mapreducedim!(identity, +, b, a)
-1×1 MtlMatrix{Float32}:
+1×1 MtlMatrix{Float32, Metal.MTL.MTLResourceStorageModePrivate}:
 6.0
 ```
diff --git a/docs/src/metal-programming.md b/docs/src/usage/kernel.md
similarity index 84%
rename from docs/src/metal-programming.md
rename to docs/src/usage/kernel.md
index 2c7aa80fd..ad0dc92ea 100644
--- a/docs/src/metal-programming.md
+++ b/docs/src/usage/kernel.md
@@ -1,15 +1,4 @@
-# Introduction to Metal Programming in Julia
-
-*A quick and dirty introduction to GPU programming on MacOS*
-
-## High-Level Array Operations
-
-This document isn't meant to cover the high-level array operations that Metal.jl enables.
-Performing `Array` operations on `MtlArrays` should *just work* if the types are compatible.
-For example, `gpu_c .= gpu_a .+ gpu_b`
-Thus, this document is more about an introduction to explicit GPU *kernel* programming.
-
-## Kernel Programming
+# Kernel programming
 
 Metal.jl is based off of Apple's
 [Metal Shading Language (MSL)](https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf)
 and Metal framework. The interface allows you to utilize the graphics and computing power of
@@ -53,18 +42,18 @@ also query what the grid and threadgroup sizes are as well.
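As a complete, minimal illustration of the indexing intrinsics listed just below and of the vadd example referenced further on: this is a sketch only; the helper name `fill_indices!` is illustrative, and the `threads=` launch keyword is assumed from that vadd example.

```julia
using Metal

# every thread stores its own (1-based) position in the grid
function fill_indices!(a)
    i = thread_position_in_grid_1d()
    if i <= length(a)
        @inbounds a[i] = Float32(i)
    end
    return
end

a = MtlArray{Float32}(undef, 64)
@metal threads=length(a) fill_indices!(a)
```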
 For Metal.jl, these values are accessed via the following functions:
 
-- thread_index_in_threadgroup()
-- grid_size_Xd()
-- thread_position_in_grid_Xd()
-- thread_position_in_threadgroup_Xd()
-- threadgroup_position_in_grid_Xd()
-- threadgroups_per_grid_Xd()
-- threads_per_grid_Xd()
-- threads_per_threadgroup_Xd()
+- `thread_index_in_threadgroup()`
+- `grid_size_Xd()`
+- `thread_position_in_grid_Xd()`
+- `thread_position_in_threadgroup_Xd()`
+- `threadgroup_position_in_grid_Xd()`
+- `threadgroups_per_grid_Xd()`
+- `threads_per_grid_Xd()`
+- `threads_per_threadgroup_Xd()`
 
 *Where 'X' is 1, 2, or 3 according to the number of dimensions requested.*
 
-Using these in a kernel (taken directly from the [vadd example](../../examples/vadd.jl)):
+Using these in a kernel (taken directly from the [vadd example](https://github.com/JuliaGPU/Metal.jl/blob/main/examples/vadd.jl)):
 
 ```julia
 function vadd(a, b, c)
diff --git a/docs/src/usage/overview.md b/docs/src/usage/overview.md
index 0db9d4699..ca641602c 100644
--- a/docs/src/usage/overview.md
+++ b/docs/src/usage/overview.md
@@ -39,8 +39,3 @@ a = Metal.zeros(1024)
 b = Metal.ones(1024)
 a.^2 .+ sin.(b)
 ```
-
-When possible, these operations integrate with existing vendor libraries.For example,
-multiplying matrices or generating random numbers will automatically dispatch to these
-high-quality libraries, if types are supported, and fall back to generic implementations
-otherwise.
diff --git a/lib/mps/matrix.jl b/lib/mps/matrix.jl
index 8605ae198..c4edc860d 100644
--- a/lib/mps/matrix.jl
+++ b/lib/mps/matrix.jl
@@ -118,8 +118,11 @@ end
 """
     matMulMPS(a::MtlMatrix, b::MtlMatrix, c::MtlMatrix, alpha=1, beta=1,
               transpose_left=false, transpose_right=false)
+An `MPSMatrixMultiplication` kernel that computes:
+`c = alpha * op(a) * op(b) + beta * c`
 
-Perform `c = alpha * op(a) * beta * op(b) + beta * C`.
+This function should not typically be used. Rather, use the normal `LinearAlgebra` interface
+with any `MtlArray` and it should be accelerated using Metal Performance Shaders.
 """
 function matmul!(c::MtlMatrix, a::MtlMatrix, b::MtlMatrix,
                  alpha::Number=true, beta::Number=true,
@@ -146,7 +149,7 @@ function matmul!(c::MtlMatrix, a::MtlMatrix, b::MtlMatrix,
     encode!(cmdbuf, mat_mul_kernel, mps_b, mps_a, mps_c)
     commit!(cmdbuf)
 
-    c
+    return c
 end
 
 export MPSMatrixFindTopK
@@ -187,6 +190,8 @@ Return the indices in `I` and the values in `V`.
 
 `k` cannot be greater than 16.
 
+Uses `MPSMatrixFindTopK`.
+
 See also: [`topk`](@ref).
 """
 function topk!(A::MtlMatrix{T}, I::MtlMatrix{UInt32}, V::MtlMatrix{T}, k) where {T<:MtlFloat}
@@ -197,7 +202,7 @@ function topk!(A::MtlMatrix{T}, I::MtlMatrix{UInt32}, V::MtlMatrix{T}, k) where
     @assert size(V,1) >= k "Matrix 'V' must be large enough for k rows"
     @assert size(V,2) >= size(A,2) "Matrix 'V' must have at least as many columns as A"
 
-    _topk!(A,I,V,k)
+    return _topk!(A,I,V,k)
 end
 
 @inline function _topk!(A::MtlMatrix{T}, I::MtlMatrix{UInt32}, V::MtlMatrix{T}, k) where {T<:MtlFloat}
     # Create MPS-compatible matrix from the MtlArrays
@@ -215,7 +220,7 @@ end
     encode!(cmdbuf, topk_kernel, mps_a, mps_i, mps_v)
     commit!(cmdbuf)
 
-    I, V
+    return I, V
 end
 
 """
@@ -226,6 +231,8 @@ Return the indices in `I` and the values in `V`.
 
 `k` cannot be greater than 16.
 
+Uses `MPSMatrixFindTopK`.
+
 See also: [`topk!`](@ref).
""" function topk(A::MtlMatrix{T,S}, k) where {T<:MtlFloat,S} diff --git a/lib/mps/vector.jl b/lib/mps/vector.jl index 1e38e0728..22092c6cf 100644 --- a/lib/mps/vector.jl +++ b/lib/mps/vector.jl @@ -87,28 +87,33 @@ end """ -A MPSMatrixVectorMultiplication kernel thay computes: -y = alpha * op(A) * x + beta * y + matVecMulMPS(c::MtlVector, a::MtlMatrix, b::MtlVector, alpha=1, beta=1, + transpose=false) +A `MPSMatrixVectorMultiplication` kernel thay computes: +`c = alpha * op(a) * b + beta * c` + +This function should not typically be used. Rather, use the normal `LinearAlgebra` interface +with any `MtlArray` and it should be accelerated using Metal Performance Shaders. """ -function matvecmul!(y::MtlVector, a::MtlMatrix, x::MtlVector, alpha::Number=true, beta::Number=false, +function matvecmul!(c::MtlVector, a::MtlMatrix, b::MtlVector, alpha::Number=true, beta::Number=false, transpose=false) # NOTE: MPS uses row major, while Julia is col-major cols_a = size(a, transpose ? 1 : 2) - rows_y = length(y) + rows_c = length(c) # Create MPS-compatible matrix/vector from the MtlArrays mps_a = MPSMatrix(a) - mps_x = MPSVector(x) - mps_y = MPSVector(y) + mps_b = MPSVector(b) + mps_c = MPSVector(c) matvec_mul_kernel = MPSMatrixVectorMultiplication(current_device(), !transpose, - rows_y, cols_a, + rows_c, cols_a, alpha, beta) # Encode and commit matmul kernel cmdbuf = MTLCommandBuffer(global_queue(current_device())) - encode!(cmdbuf, matvec_mul_kernel, mps_a, mps_x, mps_y) + encode!(cmdbuf, matvec_mul_kernel, mps_a, mps_b, mps_c) commit!(cmdbuf) - return y + return c end \ No newline at end of file diff --git a/src/array.jl b/src/array.jl index d0392806e..626419d29 100644 --- a/src/array.jl +++ b/src/array.jl @@ -344,7 +344,7 @@ Uses Adapt.jl to act inside some wrapper structs. # Examples -``` +```jldoctests julia> mtl(ones(3)') 1×3 adjoint(::MtlVector{Float32, Metal.MTL.MTLResourceStorageModePrivate}) with eltype Float32: 1.0 1.0 1.0 @@ -357,6 +357,12 @@ julia> mtl(1:3) 1:3 julia> MtlArray(1:3) +3-element MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}: + 1 + 2 + 3 + +julia> mtl[1,2,3] 3-element MtlVector{Int64, Metal.MTL.MTLResourceStorageModePrivate}: 1 2