Merge pull request #61 from LCSB-BioCore/develop
Regular merge of develop
laurentheirendt authored Sep 8, 2019
2 parents 6ed6a0a + 46b50de commit 855863f
Showing 17 changed files with 528 additions and 158 deletions.
5 changes: 3 additions & 2 deletions .artenolis.yml
@@ -22,12 +22,12 @@ script:
after_success:

# submit coverage report
- if [ "$ARCH" == "Linux" ]; then
- if [ "$ARCH" == "Linux" && "$JULIA_VER" = "v1.2.0" ]; then
$ARTENOLIS_SOFT_PATH/julia/$JULIA_VER/bin/julia --color=yes -e 'using Coverage; Codecov.submit(process_folder())';
fi

# deploy documentation
- if [[ "$ARCH" == "Linux" && "$JENKINS_PULL_REQUEST" != "True" ]]; then
- if [[ "$ARCH" == "Linux" && "$JENKINS_PULL_REQUEST" != "True" && "$JULIA_VER" = "v1.2.0" ]]; then
export TRAVIS_BRANCH=$GIT_BRANCH;
export TRAVIS_PULL_REQUEST=false;
var=$GIT_URL; export TRAVIS_REPO_SLUG=${var:8:${#var}};
@@ -41,3 +41,4 @@ after_success:
after_script:
# clean up the build directory
- cd .. && rm -rf GigaSOM
- rm -rf ~/.julia
2 changes: 2 additions & 0 deletions .gitignore
@@ -13,8 +13,10 @@ docs/site/
*.csv
!refBatchDfCodes.csv
!refBatchWinners.csv
!refBatchEmbedded.csv
!refParallelDfCodes.csv
!refParallelWinners.csv
!refParallelEmbedded.csv
*.py
__pycache__
data/
1 change: 1 addition & 0 deletions Project.toml
@@ -12,6 +12,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
FCSFiles = "d76558cf-badf-52d4-a17e-381ab0b0d937"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
17 changes: 12 additions & 5 deletions src/GigaSOM.jl
@@ -15,10 +15,12 @@ module GigaSOM
using FileIO
using DistributedArrays
using XLSX
using NearestNeighbors

include("structs.jl")
include("core.jl")
include("satellites.jl")
include("embedding.jl")

# include IO files
include("io/input.jl")
@@ -30,7 +32,12 @@ module GigaSOM
export #core
initGigaSOM,
trainGigaSOM,
mapToGigaSOM
mapToGigaSOM,
linearRadius,
expRadius

export #embedding
embedGigaSOM

export # structs
daFrame
@@ -39,10 +46,10 @@
readFlowset

export #satellites
cleanNames!,
createDaFrame,
getMarkers,
checkDir
cleanNames!,
createDaFrame,
getMarkers,
checkDir

export # plotting
plotCounts,
230 changes: 144 additions & 86 deletions src/core.jl
@@ -34,9 +34,9 @@ function initGigaSOM( train, xdim, ydim = xdim;
normParams = convert(DataFrame, normParams)
names!(normParams, Symbol.(colNames))

# create X,y-indices for neurons:
# create X,Y-indices for neurons:
#
x = y = collect(1:numCodes)
x = y = collect(1:numCodes)
indices = DataFrame(X = x, Y = y)

# make SOM object:
@@ -61,74 +61,72 @@ end
- `kernelFun::Function`: optional distance kernel; one of (`bubbleKernel`, `gaussianKernel`);
    default is `gaussianKernel`
- `r`: optional training radius.
If r is not specified, it defaults to √(xdim^2 + ydim^2) / 2
If r is not specified, it defaults to (xdim+ydim)/3
- `rFinal`: target radius at the last epoch, defaults to 0.1
Training data must be convertible to Array{Float64,2} with `convert()`.
Training samples are row-wise; one sample per row. An alternative kernel function
can be provided to modify the distance-dependent training. The function must have
the signature fun(x, r), where x is an arbitrary distance, r is a parameter
controlling the function, and the return value is between 0.0 and 1.0.
"""
function trainGigaSOM(som::Som, train::DataFrame; kernelFun::Function = gaussianKernel,
r = 0.0, epochs = 10)
function trainGigaSOM(som::Som, train::DataFrame;
kernelFun::Function = gaussianKernel,
knnTreeFun = BruteTree,
rStart = 0.0, rFinal=0.1, radiusFun=linearRadius,
epochs = 10)

train = convertTrainingData(train)

# set default radius:
if r == 0.0
r = (som.xdim^2 + som.ydim^2) / 2

if rStart == 0.0
rStart = (som.xdim^2 + som.ydim^2) / 2
@info "The radius has been determined automatically."
end

dm = distMatrix(som.grid, som.toroidal)

codes = som.codes
globalSumNumerator = zeros(Float64, size(codes))
globalSumDenominator = zeros(Float64, size(codes)[1])

# linear decay
if r < 1.5
Δr = 0.0
else
Δr = (r-1.0) / epochs
end

nWorkers = nprocs()
dTrain = distribute(train)

for j in 1:epochs

println("Epoch: $j")

if nWorkers > 1
# distribution across workers
R = Array{Future}(undef,nWorkers, 1)
@sync for p in workers()
@async R[p] = @spawnat p begin
doEpoch(localpart(dTrain), codes, dm, kernelFun, r, false)
end
end

@sync for p in workers()
tmp = fetch(R[p])
globalSumNumerator += tmp[1]
globalSumDenominator += tmp[2]
end
else
# only batch mode
sumNumerator, sumDenominator = doEpoch(localpart(dTrain), codes, dm,
kernelFun, r, false)

globalSumNumerator += sumNumerator
globalSumDenominator += sumDenominator
end
globalSumNumerator = zeros(Float64, size(codes))
globalSumDenominator = zeros(Float64, size(codes)[1])

r -= Δr
if r < 0.0
r = 0.0
end
tree = knnTreeFun(Array{Float64,2}(transpose(codes)))

println("Radius: $r")
codes = globalSumNumerator ./ globalSumDenominator
if nWorkers > 1
# distribution across workers
R = Array{Future}(undef,nWorkers, 1)
@sync for (p, pid) in enumerate(workers())
@async R[p] = @spawnat pid begin
doEpoch(localpart(dTrain), codes, tree)
end
end

@sync for (p, pid) in enumerate(workers())
tmp = fetch(R[p])
globalSumNumerator += tmp[1]
globalSumDenominator += tmp[2]
end
else
# only batch mode
sumNumerator, sumDenominator = doEpoch(localpart(dTrain), codes, tree)
globalSumNumerator += sumNumerator
globalSumDenominator += sumDenominator
end

r = radiusFun(rStart, rFinal, j, epochs)
println("Radius: $r")
if r <= 0
error("Sanity check: radius must be positive")
end

wEpoch = kernelFun(dm, r)
codes = (wEpoch*globalSumNumerator) ./ (wEpoch*globalSumDenominator)
end

som.codes[:,:] = codes[:,:]
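For orientation, here is a minimal usage sketch of the reworked training entry point (hypothetical data frame `df` and illustrative parameter values; not part of this commit):

    using GigaSOM, NearestNeighbors    # BruteTree comes from NearestNeighbors

    som = initGigaSOM(df, 10, 10)      # 10×10 grid of codebook vectors
    som = trainGigaSOM(som, df,
                       knnTreeFun = BruteTree,    # exact nearest-neighbour BMU lookup (the default)
                       rStart     = 6.0,          # initial neighbourhood radius
                       rFinal     = 0.1,          # radius reached at the last epoch
                       radiusFun  = linearRadius, # linear decay between rStart and rFinal
                       epochs     = 10)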
@@ -138,46 +138,30 @@ end


"""
doEpoch(x::Array{Float64}, codes::Array{Float64}, dm::Array{Float64},
kernelFun::Function, r::Number, toroidal::Bool)
doEpoch(x::Array{Float64}, codes::Array{Float64}, tree)
Process one epoch of the training data: find the best matching unit (BMU) of every
sample via the knn `tree` and accumulate the per-code sums and counts that are later
combined into the new codebook vectors.
# Arguments:
- `x`: training Data
- `codes`: Codebook
- `dm`: distance matrix of all neurons of the SOM
- `kernelFun`: distance kernel function of type fun(x, r)
- `r`: training radius
- `toroidal`: if true, the SOM is toroidal.
- `tree`: knn-compatible tree built upon the codes
"""
function doEpoch(x::Array{Float64, 2}, codes::Array{Float64, 2}, dm::Array{Float64, 2},
kernelFun::Function, r::Number, toroidal::Bool)

nRows::Int64 = size(x, 1)
nCodes::Int64 = size(codes, 1)
function doEpoch(x::Array{Float64, 2}, codes::Array{Float64, 2}, tree)

# initialise numerator and denominator with 0's
sumNumerator = zeros(Float64, size(codes))
sumDenominator = zeros(Float64, size(codes)[1])

# for each sample in dataset / trainingsset
for s in 1:nRows

sample = vec(x[s, : ])
bmuIdx = findBmu(codes, sample)
for s in 1:size(x, 1)

# for each node in codebook get distances to bmu and multiply it
dist = kernelFun(dm[bmuIdx, :], r)
(bmuIdx, bmuDist) = knn(tree, x[s, :], 1)

# doing col wise update of the numerator
for i in 1:size(sumNumerator, 2)
@inbounds @views begin
sumNumerator[:,i] .+= dist .* sample[i]
end
target = bmuIdx[1]

end
sumDenominator += dist
sumNumerator[target, :] .+= x[s, :]
sumDenominator[target] += 1
end

return sumNumerator, sumDenominator
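These per-code sums are what trainGigaSOM later smooths with the neighbourhood kernel; roughly, mirroring the update in the hunk above (with `dm`, `r` and `kernelFun` as in trainGigaSOM):

    w = kernelFun(dm, r)                                 # neighbourhood weights between codes
    codes = (w * sumNumerator) ./ (w * sumDenominator)   # each code becomes a weighted mean of the samples mapped near it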
@@ -197,7 +179,8 @@ every row in data.
Data must have the same number of dimensions as the training dataset
and will be normalised with the same parameters.
"""
function mapToGigaSOM(som::Som, data::DataFrame)
function mapToGigaSOM(som::Som, data::DataFrame;
knnTreeFun = BruteTree)

data::Array{Float64,2} = convertTrainingData(data)
if size(data,2) != size(som.codes,2)
@@ -206,28 +189,103 @@
end

nWorkers = nprocs()
dData = distribute(data)
vis = Int64[]
tree = knnTreeFun(Array{Float64,2}(transpose(som.codes)))

if nWorkers > 1
# distribution across workers
R = Array{Future}(undef,nWorkers, 1)
@sync for p in workers()
@async R[p] = @spawnat p begin
visual(som.codes, localpart(dData))
end
end
dData = distribute(data)

@sync begin myworkers = workers()
sort!(myworkers)
println(myworkers)
for p in myworkers
append!(vis, fetch(R[p]))
end
end
R = Array{Future}(undef,nWorkers, 1)
@sync for (p, pid) in enumerate(workers())
@async R[p] = @spawnat pid begin
# knn() returns a tuple of 2 arrays of arrays (one with indexes
# that we take out, the second with distances that we discard
# here). vcat() nicely squashes the arrays-in-arrays into a
# single vector.
vcat(knn(tree, transpose(localpart(dData)), 1)[1]...)
end
end

@sync begin
for (p, pid) in enumerate(sort!(workers()))
append!(vis, fetch(R[p]))
end
end
else
vis = visual(som.codes, data)
vis = vcat(knn(tree, transpose(data), 1)[1]...)
end

return DataFrame(index = vis)
end
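A hypothetical mapping call using the new knn-based lookup (assuming a trained `som` and a data frame `df` as in the sketch above):

    mapping = mapToGigaSOM(som, df)   # DataFrame with one BMU index per row of df
    first(mapping, 5)                 # inspect the first few assignments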


"""
scaleEpochTime(iteration::Int64, epochs::Int64)
Convert iteration ID and epoch number to relative time in training.
"""
function scaleEpochTime(iteration::Int64, epochs::Int64)
# prevent division by zero on 1-epoch training
if epochs>1
epochs -= 1
end

return Float64(iteration-1) / Float64(epochs)
end

"""
    linearRadius(initRadius::Float64, finalRadius::Float64, iteration::Int64, epochs::Int64)
Return a neighbourhood radius. Use as the `radiusFun` parameter for `trainGigaSOM`.
# Arguments
- `initRadius`: Initial radius
- `finalRadius`: Final radius
- `iteration`: Training iteration
- `epochs`: Total number of epochs
"""
function linearRadius(initRadius::Float64, finalRadius::Float64,
iteration::Int64, epochs::Int64)

scaledTime = scaleEpochTime(iteration,epochs)
return initRadius*(1-scaledTime) + finalRadius*scaledTime
end
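A quick worked check of the interpolation, with values that follow directly from the two functions above:

    linearRadius(6.0, 0.1, 1, 10)    # 6.0 — scaledTime is 0 at the first epoch
    linearRadius(6.0, 0.1, 10, 10)   # 0.1 — scaledTime is 1 at the last epoch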

"""
expRadius(steepness::Float64)
Return a function to be used as the `radiusFun` of `trainGigaSOM`, which causes
exponential decay with the selected steepness.
Use: `trainGigaSOM(..., radiusFun = expRadius(0.5))`
# Arguments
- `steepness`: Steepness of exponential descent. Good values range
from -100.0 (almost linear) to 100.0 (really quick decay).
"""
function expRadius(steepness::Float64 = 1.0)
return (initRadius::Float64, finalRadius::Float64,
iteration::Int64, epochs::Int64) -> begin

scaledTime = scaleEpochTime(iteration,epochs)

if steepness < -100.0
# prevent floating point underflows
error("Sanity check: steepness too low, use linearRadius instead.")
end

# steepness is simulated by moving both points closer to zero
adjust = finalRadius * (1 - 1.1^(-steepness))

if initRadius <= 0 || (initRadius-adjust) <= 0 || finalRadius <= 0
error("Radii must be positive. (Possible alternative cause: steepness is too high.)")
end

initRadius -= adjust
finalRadius -= adjust

return adjust + initRadius * ((finalRadius/initRadius)^scaledTime)
end
end
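As a usage note, the returned closure plugs straight into the training call (steepness value chosen for illustration only):

    som = trainGigaSOM(som, df, radiusFun = expRadius(10.0), epochs = 20)   # radius decays exponentially from rStart towards rFinal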