Merge pull request #61 from LCSB-BioCore/develop
Regular merge of develop
laurentheirendt authored Sep 8, 2019
2 parents 6ed6a0a + 46b50de commit 855863f
Showing 17 changed files with 528 additions and 158 deletions.
5 changes: 3 additions & 2 deletions .artenolis.yml
@@ -22,12 +22,12 @@ script:
after_success:

# submit coverage report
- if [ "$ARCH" == "Linux" ]; then
- if [ "$ARCH" == "Linux" && "$JULIA_VER" = "v1.2.0" ]; then
$ARTENOLIS_SOFT_PATH/julia/$JULIA_VER/bin/julia --color=yes -e 'using Coverage; Codecov.submit(process_folder())';
fi

# deploy documentation
- if [[ "$ARCH" == "Linux" && "$JENKINS_PULL_REQUEST" != "True" ]]; then
- if [[ "$ARCH" == "Linux" && "$JENKINS_PULL_REQUEST" != "True" && "$JULIA_VER" = "v1.2.0" ]]; then
export TRAVIS_BRANCH=$GIT_BRANCH;
export TRAVIS_PULL_REQUEST=false;
var=$GIT_URL; export TRAVIS_REPO_SLUG=${var:8:${#var}};
@@ -41,3 +41,4 @@ after_success:
after_script:
# clean up the build directory
- cd .. && rm -rf GigaSOM
- rm -rf ~/.julia
2 changes: 2 additions & 0 deletions .gitignore
@@ -13,8 +13,10 @@ docs/site/
*.csv
!refBatchDfCodes.csv
!refBatchWinners.csv
!refBatchEmbedded.csv
!refParallelDfCodes.csv
!refParallelWinners.csv
!refParallelEmbedded.csv
*.py
__pycache__
data/
1 change: 1 addition & 0 deletions Project.toml
@@ -12,6 +12,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
FCSFiles = "d76558cf-badf-52d4-a17e-381ab0b0d937"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
17 changes: 12 additions & 5 deletions src/GigaSOM.jl
@@ -15,10 +15,12 @@ module GigaSOM
using FileIO
using DistributedArrays
using XLSX
using NearestNeighbors

include("structs.jl")
include("core.jl")
include("satellites.jl")
include("embedding.jl")

# include IO files
include("io/input.jl")
@@ -30,7 +32,12 @@ module GigaSOM
export #core
initGigaSOM,
trainGigaSOM,
mapToGigaSOM
mapToGigaSOM,
linearRadius,
expRadius

export #embedding
embedGigaSOM

export # structs
daFrame
@@ -39,10 +46,10 @@
readFlowset

export #satellites
cleanNames!,
createDaFrame,
getMarkers,
checkDir
cleanNames!,
createDaFrame,
getMarkers,
checkDir

export # plotting
plotCounts,
230 changes: 144 additions & 86 deletions src/core.jl
@@ -34,9 +34,9 @@ function initGigaSOM( train, xdim, ydim = xdim;
normParams = convert(DataFrame, normParams)
names!(normParams, Symbol.(colNames))

# create X,y-indices for neurons:
# create X,Y-indices for neurons:
#
x = y = collect(1:numCodes)
x = y = collect(1:numCodes)
indices = DataFrame(X = x, Y = y)

# make SOM object:
@@ -61,74 +61,72 @@ end
- `kernelFun::Function`: optional distance kernel; one of (`bubbleKernel`, `gaussianKernel`);
    default is `gaussianKernel`
- `r`: optional training radius.
If r is not specified, it defaults to √(xdim^2 + ydim^2) / 2
If r is not specified, it defaults to (xdim+ydim)/3
- `rFinal`: target radius at the last epoch, defaults to 0.1
Training data must be convertible to Array{Float64,2} with `convert()`.
Training samples are row-wise; one sample per row. An alternative kernel function
can be provided to modify the distance-dependent training. The function must have
the signature fun(x, r), where x is an arbitrary distance, r is a parameter
controlling the function, and the return value is between 0.0 and 1.0.
"""
function trainGigaSOM(som::Som, train::DataFrame; kernelFun::Function = gaussianKernel,
r = 0.0, epochs = 10)
function trainGigaSOM(som::Som, train::DataFrame;
kernelFun::Function = gaussianKernel,
knnTreeFun = BruteTree,
rStart = 0.0, rFinal=0.1, radiusFun=linearRadius,
epochs = 10)

train = convertTrainingData(train)

# set default radius:
if r == 0.0
r = (som.xdim^2 + som.ydim^2) / 2

if rStart == 0.0
rStart = (som.xdim^2 + som.ydim^2) / 2
@info "The radius has been determined automatically."
end

dm = distMatrix(som.grid, som.toroidal)

codes = som.codes
globalSumNumerator = zeros(Float64, size(codes))
globalSumDenominator = zeros(Float64, size(codes)[1])

# linear decay
if r < 1.5
Δr = 0.0
else
Δr = (r-1.0) / epochs
end

nWorkers = nprocs()
dTrain = distribute(train)

for j in 1:epochs

println("Epoch: $j")

if nWorkers > 1
# distribution across workers
R = Array{Future}(undef,nWorkers, 1)
@sync for p in workers()
@async R[p] = @spawnat p begin
doEpoch(localpart(dTrain), codes, dm, kernelFun, r, false)
end
end

@sync for p in workers()
tmp = fetch(R[p])
globalSumNumerator += tmp[1]
globalSumDenominator += tmp[2]
end
else
# only batch mode
sumNumerator, sumDenominator = doEpoch(localpart(dTrain), codes, dm,
kernelFun, r, false)

globalSumNumerator += sumNumerator
globalSumDenominator += sumDenominator
end
globalSumNumerator = zeros(Float64, size(codes))
globalSumDenominator = zeros(Float64, size(codes)[1])

r -= Δr
if r < 0.0
r = 0.0
end
tree = knnTreeFun(Array{Float64,2}(transpose(codes)))

println("Radius: $r")
codes = globalSumNumerator ./ globalSumDenominator
if nWorkers > 1
# distribution across workers
R = Array{Future}(undef,nWorkers, 1)
@sync for (p, pid) in enumerate(workers())
@async R[p] = @spawnat pid begin
doEpoch(localpart(dTrain), codes, tree)
end
end

@sync for (p, pid) in enumerate(workers())
tmp = fetch(R[p])
globalSumNumerator += tmp[1]
globalSumDenominator += tmp[2]
end
else
# only batch mode
sumNumerator, sumDenominator = doEpoch(localpart(dTrain), codes, tree)
globalSumNumerator += sumNumerator
globalSumDenominator += sumDenominator
end

r = radiusFun(rStart, rFinal, j, epochs)
println("Radius: $r")
if r <= 0
error("Sanity check: radius must be positive")
end

wEpoch = kernelFun(dm, r)
codes = (wEpoch*globalSumNumerator) ./ (wEpoch*globalSumDenominator)
end

som.codes[:,:] = codes[:,:]
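For orientation, here is a minimal usage sketch of the reworked training entry point (hypothetical data frame `df` and illustrative parameter values; not part of this commit):

    using GigaSOM, NearestNeighbors    # BruteTree comes from NearestNeighbors

    som = initGigaSOM(df, 10, 10)      # 10×10 grid of codebook vectors
    som = trainGigaSOM(som, df,
                       knnTreeFun = BruteTree,    # exact nearest-neighbour BMU lookup (the default)
                       rStart     = 6.0,          # initial neighbourhood radius
                       rFinal     = 0.1,          # radius reached at the last epoch
                       radiusFun  = linearRadius, # linear decay between rStart and rFinal
                       epochs     = 10)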
@@ -138,46 +138,30 @@ end


"""
doEpoch(x::Array{Float64}, codes::Array{Float64}, dm::Array{Float64},
kernelFun::Function, r::Number, toroidal::Bool)
doEpoch(x::Array{Float64}, codes::Array{Float64}, tree)
Process one epoch of the training data: find the best matching unit (BMU) of every
sample via the knn `tree` and accumulate the per-code sums and counts that are later
combined into the new codebook vectors.
# Arguments:
- `x`: training Data
- `codes`: Codebook
- `dm`: distance matrix of all neurons of the SOM
- `kernelFun`: distance kernel function of type fun(x, r)
- `r`: training radius
- `toroidal`: if true, the SOM is toroidal.
- `tree`: knn-compatible tree built upon the codes
"""
function doEpoch(x::Array{Float64, 2}, codes::Array{Float64, 2}, dm::Array{Float64, 2},
kernelFun::Function, r::Number, toroidal::Bool)

nRows::Int64 = size(x, 1)
nCodes::Int64 = size(codes, 1)
function doEpoch(x::Array{Float64, 2}, codes::Array{Float64, 2}, tree)

# initialise numerator and denominator with 0's
sumNumerator = zeros(Float64, size(codes))
sumDenominator = zeros(Float64, size(codes)[1])

# for each sample in dataset / trainingsset
for s in 1:nRows

sample = vec(x[s, : ])
bmuIdx = findBmu(codes, sample)
for s in 1:size(x, 1)

# for each node in codebook get distances to bmu and multiply it
dist = kernelFun(dm[bmuIdx, :], r)
(bmuIdx, bmuDist) = knn(tree, x[s, :], 1)

# doing col wise update of the numerator
for i in 1:size(sumNumerator, 2)
@inbounds @views begin
sumNumerator[:,i] .+= dist .* sample[i]
end
target = bmuIdx[1]

end
sumDenominator += dist
sumNumerator[target, :] .+= x[s, :]
sumDenominator[target] += 1
end

return sumNumerator, sumDenominator
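These per-code sums are what trainGigaSOM later smooths with the neighbourhood kernel; roughly, mirroring the update in the hunk above (with `dm`, `r` and `kernelFun` as in trainGigaSOM):

    w = kernelFun(dm, r)                                 # neighbourhood weights between codes
    codes = (w * sumNumerator) ./ (w * sumDenominator)   # each code becomes a weighted mean of the samples mapped near it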
@@ -197,7 +179,8 @@ every row in data.
Data must have the same number of dimensions as the training dataset
and will be normalised with the same parameters.
"""
function mapToGigaSOM(som::Som, data::DataFrame)
function mapToGigaSOM(som::Som, data::DataFrame;
knnTreeFun = BruteTree)

data::Array{Float64,2} = convertTrainingData(data)
if size(data,2) != size(som.codes,2)
@@ -206,28 +189,103 @@
end

nWorkers = nprocs()
dData = distribute(data)
vis = Int64[]
tree = knnTreeFun(Array{Float64,2}(transpose(som.codes)))

if nWorkers > 1
# distribution across workers
R = Array{Future}(undef,nWorkers, 1)
@sync for p in workers()
@async R[p] = @spawnat p begin
visual(som.codes, localpart(dData))
end
end
dData = distribute(data)

@sync begin myworkers = workers()
sort!(myworkers)
println(myworkers)
for p in myworkers
append!(vis, fetch(R[p]))
end
end
R = Array{Future}(undef,nWorkers, 1)
@sync for (p, pid) in enumerate(workers())
@async R[p] = @spawnat pid begin
# knn() returns a tuple of 2 arrays of arrays (one with indexes
# that we take out, the second with distances that we discard
# here). vcat() nicely squashes the arrays-in-arrays into a
# single vector.
vcat(knn(tree, transpose(localpart(dData)), 1)[1]...)
end
end

@sync begin
for (p, pid) in enumerate(sort!(workers()))
append!(vis, fetch(R[p]))
end
end
else
vis = visual(som.codes, data)
vis = vcat(knn(tree, transpose(data), 1)[1]...)
end

return DataFrame(index = vis)
end
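A hypothetical mapping call using the new knn-based lookup (assuming a trained `som` and a data frame `df` as in the sketch above):

    mapping = mapToGigaSOM(som, df)   # DataFrame with one BMU index per row of df
    first(mapping, 5)                 # inspect the first few assignments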


"""
scaleEpochTime(iteration::Int64, epochs::Int64)
Convert iteration ID and epoch number to relative time in training.
"""
function scaleEpochTime(iteration::Int64, epochs::Int64)
# prevent division by zero on 1-epoch training
if epochs>1
epochs -= 1
end

return Float64(iteration-1) / Float64(epochs)
end

"""
    linearRadius(initRadius::Float64, finalRadius::Float64, iteration::Int64, epochs::Int64)
Return a neighbourhood radius. Use as the `radiusFun` parameter for `trainGigaSOM`.
# Arguments
- `initRadius`: Initial radius
- `finalRadius`: Final radius
- `iteration`: Training iteration
- `epochs`: Total number of epochs
"""
function linearRadius(initRadius::Float64, finalRadius::Float64,
iteration::Int64, epochs::Int64)

scaledTime = scaleEpochTime(iteration,epochs)
return initRadius*(1-scaledTime) + finalRadius*scaledTime
end
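A quick worked check of the interpolation, with values that follow directly from the two functions above:

    linearRadius(6.0, 0.1, 1, 10)    # 6.0 — scaledTime is 0 at the first epoch
    linearRadius(6.0, 0.1, 10, 10)   # 0.1 — scaledTime is 1 at the last epoch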

"""
expRadius(steepness::Float64)
Return a function to be used as the `radiusFun` of `trainGigaSOM`, which causes
exponential decay with the selected steepness.
Use: `trainGigaSOM(..., radiusFun = expRadius(0.5))`
# Arguments
- `steepness`: Steepness of exponential descent. Good values range
from -100.0 (almost linear) to 100.0 (really quick decay).
"""
function expRadius(steepness::Float64 = 1.0)
return (initRadius::Float64, finalRadius::Float64,
iteration::Int64, epochs::Int64) -> begin

scaledTime = scaleEpochTime(iteration,epochs)

if steepness < -100.0
# prevent floating point underflows
error("Sanity check: steepness too low, use linearRadius instead.")
end

# steepness is simulated by moving both points closer to zero
adjust = finalRadius * (1 - 1.1^(-steepness))

if initRadius <= 0 || (initRadius-adjust) <= 0 || finalRadius <= 0
error("Radii must be positive. (Possible alternative cause: steepness is too high.)")
end

initRadius -= adjust
finalRadius -= adjust

return adjust + initRadius * ((finalRadius/initRadius)^scaledTime)
end
end
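As a usage note, the returned closure plugs straight into the training call (steepness value chosen for illustration only):

    som = trainGigaSOM(som, df, radiusFun = expRadius(10.0), epochs = 20)   # radius decays exponentially from rStart towards rFinal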