diff --git a/.artenolis.yml b/.artenolis.yml
index 5a2516d9..6a0000be 100644
--- a/.artenolis.yml
+++ b/.artenolis.yml
@@ -22,12 +22,12 @@ script:

 after_success:
   # submit coverage report
-  - if [ "$ARCH" == "Linux" ]; then
+  - if [[ "$ARCH" == "Linux" && "$JULIA_VER" == "v1.2.0" ]]; then
        $ARTENOLIS_SOFT_PATH/julia/$JULIA_VER/bin/julia --color=yes -e 'using Coverage; Codecov.submit(process_folder())';
    fi

  # deploy documentation
-  - if [[ "$ARCH" == "Linux" && "$JENKINS_PULL_REQUEST" != "True" ]]; then
+  - if [[ "$ARCH" == "Linux" && "$JENKINS_PULL_REQUEST" != "True" && "$JULIA_VER" == "v1.2.0" ]]; then
       export TRAVIS_BRANCH=$GIT_BRANCH;
       export TRAVIS_PULL_REQUEST=false;
       var=$GIT_URL;
       export TRAVIS_REPO_SLUG=${var:8:${#var}};
@@ -41,3 +41,4 @@ after_success:
 after_script:
   # clean up the build directory
   - cd .. && rm -rf GigaSOM
+  - rm -rf ~/.julia
diff --git a/.gitignore b/.gitignore
index 0e543b13..5ed4ed87 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,8 +13,10 @@ docs/site/
 *.csv
 !refBatchDfCodes.csv
 !refBatchWinners.csv
+!refBatchEmbedded.csv
 !refParallelDfCodes.csv
 !refParallelWinners.csv
+!refParallelEmbedded.csv
 *.py
 __pycache__
 data/
diff --git a/Project.toml b/Project.toml
index 7b56b9eb..3ef7b846 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,6 +12,7 @@ Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 FCSFiles = "d76558cf-badf-52d4-a17e-381ab0b0d937"
 FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/GigaSOM.jl b/src/GigaSOM.jl
index 4f68fabc..e567896a 100644
--- a/src/GigaSOM.jl
+++ b/src/GigaSOM.jl
@@ -15,10 +15,12 @@ module GigaSOM
 using FileIO
 using DistributedArrays
 using XLSX
+using NearestNeighbors

 include("structs.jl")
 include("core.jl")
 include("satellites.jl")
+include("embedding.jl")

 # include IO files
 include("io/input.jl")
@@ -30,7 +32,12 @@ module GigaSOM
 export #core
     initGigaSOM,
     trainGigaSOM,
-    mapToGigaSOM
+    mapToGigaSOM,
+    linearRadius,
+    expRadius
+
+export #embedding
+    embedGigaSOM

 export # structs
     daFrame
@@ -39,10 +46,10 @@ module GigaSOM
     readFlowset

 export #satellites
-    cleanNames!,
-    createDaFrame,
-    getMarkers,
-    checkDir
+    cleanNames!,
+    createDaFrame,
+    getMarkers,
+    checkDir

 export # plotting
     plotCounts,
diff --git a/src/core.jl b/src/core.jl
index 8c644bca..13e432a7 100644
--- a/src/core.jl
+++ b/src/core.jl
@@ -34,9 +34,9 @@ function initGigaSOM( train, xdim, ydim = xdim;
     normParams = convert(DataFrame, normParams)
     names!(normParams, Symbol.(colNames))

-    # create X,y-indices for neurons:
+    # create X,Y-indices for neurons:
     #
-    x = y = collect(1:numCodes)
+    x = y = collect(1:numCodes)
     indices = DataFrame(X = x, Y = y)

     # make SOM object:
@@ -61,74 +61,72 @@ end
 - `kernel::function`: optional distance kernel; one of (`bubbleKernel, gaussianKernel`)
     default is `gaussianKernel`
-- `r`: optional training radius.
-   If r is not specified, it defaults to √(xdim^2 + ydim^2) / 2
+- `rStart`: optional initial training radius.
+   If `rStart` is not specified, it defaults to √(xdim^2 + ydim^2) / 2
+- `rFinal`: target radius at the last epoch, defaults to 0.1
+- `knnTreeFun`: constructor of the kNN-tree used for finding the best
+   matching units (e.g. `BruteTree` from the NearestNeighbors package)
+- `radiusFun`: radius decay function, such as `linearRadius` or `expRadius`

-Training data must be convertable to Array{Float34,2} with `convert()`.
+Training data must be convertible to Array{Float64,2} with `convert()`.
 Training samples are row-wise; one sample per row. An alternative kernel function
 can be provided to modify the distance-dependent training.
 The function must fit the signature `fun(x, r)`, where `x` is an arbitrary
 distance and `r` is a parameter controlling the kernel width; returned values
 must lie between 0.0 and 1.0.
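+
+For example, a simple triangular kernel (a hypothetical illustration, not part
+of the package) would fit this contract:
+
+```
+# 1 at zero distance, decaying linearly to 0 at distance r
+triangularKernel(x, r) = max.(0.0, 1.0 .- x ./ r)
+
+som = trainGigaSOM(som, train, kernelFun = triangularKernel)
+```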
"""
-function trainGigaSOM(som::Som, train::DataFrame; kernelFun::Function = gaussianKernel,
-                    r = 0.0, epochs = 10)
+function trainGigaSOM(som::Som, train::DataFrame;
+                      kernelFun::Function = gaussianKernel,
+                      knnTreeFun = BruteTree,
+                      rStart = 0.0, rFinal=0.1, radiusFun=linearRadius,
+                      epochs = 10)

     train = convertTrainingData(train)

     # set default radius:
-    if r == 0.0
-        r = √(som.xdim^2 + som.ydim^2) / 2
+
+    if rStart == 0.0
+        rStart = √(som.xdim^2 + som.ydim^2) / 2
         @info "The radius has been determined automatically."
     end

     dm = distMatrix(som.grid, som.toroidal)

     codes = som.codes
-    globalSumNumerator = zeros(Float64, size(codes))
-    globalSumDenominator = zeros(Float64, size(codes)[1])
-
-    # linear decay
-    if r < 1.5
-        Δr = 0.0
-    else
-        Δr = (r-1.0) / epochs
-    end

     nWorkers = nprocs()
     dTrain = distribute(train)

     for j in 1:epochs
-        println("Epoch: $j")
-
-        if nWorkers > 1
-            # distribution across workers
-            R = Array{Future}(undef,nWorkers, 1)
-            @sync for p in workers()
-                @async R[p] = @spawnat p begin
-                    doEpoch(localpart(dTrain), codes, dm, kernelFun, r, false)
-                end
-            end
-
-            @sync for p in workers()
-                tmp = fetch(R[p])
-                globalSumNumerator += tmp[1]
-                globalSumDenominator += tmp[2]
-            end
-        else
-            # only batch mode
-            sumNumerator, sumDenominator = doEpoch(localpart(dTrain), codes, dm,
-                                                kernelFun, r, false)
-
-            globalSumNumerator += sumNumerator
-            globalSumDenominator += sumDenominator
-        end
+        globalSumNumerator = zeros(Float64, size(codes))
+        globalSumDenominator = zeros(Float64, size(codes)[1])

-        r -= Δr
-        if r < 0.0
-            r = 0.0
-        end
+        tree = knnTreeFun(Array{Float64,2}(transpose(codes)))

-        println("Radius: $r")
-        codes = globalSumNumerator ./ globalSumDenominator
+        if nWorkers > 1
+            # distribution across workers
+            R = Array{Future}(undef,nWorkers, 1)
+            @sync for (p, pid) in enumerate(workers())
+                @async R[p] = @spawnat pid begin
+                    doEpoch(localpart(dTrain), codes, tree)
+                end
+            end
+
+            @sync for (p, pid) in enumerate(workers())
+                tmp = fetch(R[p])
+                globalSumNumerator += tmp[1]
+                globalSumDenominator += tmp[2]
+            end
+        else
+            # only batch mode
+            sumNumerator, sumDenominator = doEpoch(localpart(dTrain), codes, tree)
+            globalSumNumerator += sumNumerator
+            globalSumDenominator += sumDenominator
+        end
+
+        r = radiusFun(rStart, rFinal, j, epochs)
+        println("Radius: $r")
+        if r <= 0
+            error("Sanity check: radius must be positive")
+        end
+
+        wEpoch = kernelFun(dm, r)
+        codes = (wEpoch*globalSumNumerator) ./ (wEpoch*globalSumDenominator)
     end

     som.codes[:,:] = codes[:,:]
@@ -138,46 +136,30 @@ end

 """
-    doEpoch(x::Array{Float64}, codes::Array{Float64}, dm::Array{Float64},
-            kernelFun::Function, r::Number, toroidal::Bool)
+    doEpoch(x::Array{Float64}, codes::Array{Float64}, tree)

-vectors and the adjustment in radius after each epoch.
+Compute a single training epoch: accumulate the sums of the sample vectors
+assigned to each codebook vector, together with their counts.

 # Arguments:
 - `x`: training data
 - `codes`: Codebook
-- `dm`: distance matrix of all neurons of the SOM
-- `kernelFun`: distance kernel function of type fun(x, r)
-- `r`: training radius
-- `toroidal`: if true, the SOM is toroidal.
+- `tree`: knn-compatible tree built upon the codes
 """
-function doEpoch(x::Array{Float64, 2}, codes::Array{Float64, 2}, dm::Array{Float64, 2},
-                kernelFun::Function, r::Number, toroidal::Bool)
-
-    nRows::Int64 = size(x, 1)
-    nCodes::Int64 = size(codes, 1)
+function doEpoch(x::Array{Float64, 2}, codes::Array{Float64, 2}, tree)

     # initialise numerator and denominator with 0's
     sumNumerator = zeros(Float64, size(codes))
     sumDenominator = zeros(Float64, size(codes)[1])

-    # for each sample in dataset / trainingsset
-    for s in 1:nRows
-
-        sample = vec(x[s, : ])
-        bmuIdx = findBmu(codes, sample)
+    # for each sample in the training set
+    for s in 1:size(x, 1)

-        # for each node in codebook get distances to bmu and multiply it
-        dist = kernelFun(dm[bmuIdx, :], r)
+        (bmuIdx, bmuDist) = knn(tree, x[s, :], 1)

-        # doing col wise update of the numerator
-        for i in 1:size(sumNumerator, 2)
-            @inbounds @views begin
-                sumNumerator[:,i] .+= dist .* sample[i]
-            end
+        target = bmuIdx[1]

-        end
-        sumDenominator += dist
+        sumNumerator[target, :] .+= x[s, :]
+        sumDenominator[target] += 1
     end

     return sumNumerator, sumDenominator
@@ -197,7 +179,8 @@ every row in data.

 Data must have the same number of dimensions as the training dataset
 and will be normalised with the same parameters.
 """
-function mapToGigaSOM(som::Som, data::DataFrame)
+function mapToGigaSOM(som::Som, data::DataFrame;
+                      knnTreeFun = BruteTree)

     data::Array{Float64,2} = convertTrainingData(data)
     if size(data,2) != size(som.codes,2)
@@ -206,28 +189,103 @@ function mapToGigaSOM(som::Som, data::DataFrame)
     end

     nWorkers = nprocs()
-    dData = distribute(data)
     vis = Int64[]
+    tree = knnTreeFun(Array{Float64,2}(transpose(som.codes)))

     if nWorkers > 1
         # distribution across workers
-        R = Array{Future}(undef,nWorkers, 1)
-        @sync for p in workers()
-            @async R[p] = @spawnat p begin
-                visual(som.codes, localpart(dData))
-            end
-        end
+        dData = distribute(data)

-        @sync begin myworkers = workers()
-            sort!(myworkers)
-            println(myworkers)
-            for p in myworkers
-                append!(vis, fetch(R[p]))
-            end
-        end
+        R = Array{Future}(undef,nWorkers, 1)
+        @sync for (p, pid) in enumerate(workers())
+            @async R[p] = @spawnat pid begin
+                # knn() returns a tuple of 2 arrays of arrays (one with indexes
+                # that we take out, the second with distances that we discard
+                # here). vcat() nicely squashes the arrays-in-arrays into a
+                # single vector.
+                vcat(knn(tree, transpose(localpart(dData)), 1)[1]...)
+            end
+        end
+
+        @sync begin
+            for (p, pid) in enumerate(sort!(workers()))
+                append!(vis, fetch(R[p]))
+            end
+        end
     else
-        vis = visual(som.codes, data)
+        vis = vcat(knn(tree, transpose(data), 1)[1]...)
     end

     return DataFrame(index = vis)
 end
+
+
+"""
+    scaleEpochTime(iteration::Int64, epochs::Int64)
+
+Convert iteration ID and epoch number to relative time in training.
+"""
+function scaleEpochTime(iteration::Int64, epochs::Int64)
+    # prevent division by zero on 1-epoch training
+    if epochs>1
+        epochs -= 1
+    end
+
+    return Float64(iteration-1) / Float64(epochs)
+end
+
+"""
+    linearRadius(initRadius::Float64, finalRadius::Float64,
+                 iteration::Int64, epochs::Int64)
+
+Return a linearly decaying neighbourhood radius. Use as the `radiusFun`
+parameter for `trainGigaSOM`.
+
+# Arguments
+- `initRadius`: initial radius
+- `finalRadius`: final radius
+- `iteration`: training iteration
+- `epochs`: total number of epochs
+"""
+function linearRadius(initRadius::Float64, finalRadius::Float64,
+                      iteration::Int64, epochs::Int64)
+
+    scaledTime = scaleEpochTime(iteration,epochs)
+    return initRadius*(1-scaledTime) + finalRadius*scaledTime
+end
+
+"""
+    expRadius(steepness::Float64)
+
+Return a function to be used as a `radiusFun` of `trainGigaSOM`, which causes
+exponential decay with the selected steepness.
+
+Use: `trainGigaSOM(..., radiusFun = expRadius(0.5))`
+
+# Arguments
+- `steepness`: steepness of the exponential descent. Good values range
+  from -100.0 (almost linear) to 100.0 (really quick decay).
+
+"""
+function expRadius(steepness::Float64 = 1.0)
+    return (initRadius::Float64, finalRadius::Float64,
+            iteration::Int64, epochs::Int64) -> begin

+        scaledTime = scaleEpochTime(iteration,epochs)
+
+        if steepness < -100.0
+            # prevent floating point underflows
+            error("Sanity check: steepness too low, use linearRadius instead.")
+        end
+
+        # steepness is simulated by moving both points closer to zero
+        adjust = finalRadius * (1 - 1.1^(-steepness))
+
+        if initRadius <= 0 || (initRadius-adjust) <= 0 || finalRadius <= 0
+            error("Radii must be positive. (Possible alternative cause: steepness is too high.)")
+        end
+
+        initRadius -= adjust
+        finalRadius -= adjust
+
+        return adjust + initRadius * ((finalRadius/initRadius)^scaledTime)
+    end
+end
diff --git a/src/embedding.jl b/src/embedding.jl
new file mode 100644
index 00000000..f3d63574
--- /dev/null
+++ b/src/embedding.jl
@@ -0,0 +1,215 @@
+"""
+    embedGigaSOM(som::Som, data::DataFrame;
+                 knnTreeFun = BruteTree,
+                 k::Int64=0, adjust::Float64=1.0, smooth::Float64=0.0)
+
+Return a data frame with X,Y coordinates of the EmbedSOM projection of the data.
+
+# Arguments
+- `som`: a trained SOM
+- `data`: Array or DataFrame with the data
+- `knnTreeFun`: constructor of the kNN-tree used for the nearest-neighbor
+  lookups (e.g. `BruteTree` from the NearestNeighbors package)
+- `k`: number of nearest neighbors to consider (high values get quadratically
+  slower)
+- `adjust`: position adjustment parameter (higher values avoid non-local
+  approximations)
+- `smooth`: approximation smoothness (the higher the value, the larger the
+  neighborhood of approximate local linearity of the projection)
+
+Example:
+
+Produce a 2-column matrix with 2D cell coordinates:
+
+```
+e = embedGigaSOM(som, data)
+```
+
+Plotting of the result is best done using 2D histograms; e.g. with Gadfly:
+
+```
+using Gadfly
+draw(PNG("output.png",20cm,20cm),
+     plot(x=e[:,1], y=e[:,2],
+          Geom.histogram2d(xbincount=200, ybincount=200)))
+```
+
+Data must have the same number of dimensions as the training dataset
+and will be normalised with the same parameters.
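+
+The `k`, `adjust` and `smooth` keywords can be tuned; for instance (the values
+here are only illustrative, mirroring the package tests):
+
+```
+e = embedGigaSOM(som, data, k=10, adjust=0.5, smooth=0.0)
+```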
+""" +function embedGigaSOM(som::GigaSOM.Som, data::DataFrame; + knnTreeFun = BruteTree, + k::Int64=0, adjust::Float64=1.0, smooth::Float64=0.0) + + data::Array{Float64,2} = convertTrainingData(data) + if size(data,2) != size(som.codes,2) + println(" data: $(size(data,2)), codes: $(size(som.codes,2))") + error(SOM_ERRORS[:ERR_COL_NUM]) + end + + # convert `smooth` to `boost` + boost = ((1+sqrt(Float64(5)))/2)^(smooth-2) + + # default `k` + if k == 0 + k = Integer((som.xdim+som.ydim)/2) + end + + # check if `k` isn't too high + if k > som.xdim*som.ydim + k = Integer(som.xdim * som.ydim) + end + + # prepare the kNN-tree for lookups + tree = knnTreeFun(Array{Float64,2}(transpose(som.codes))) + + # run the distributed computation + nWorkers = nprocs() + if nWorkers > 1 + dData = distribute(data) + + dRes = [ (@spawnat pid embedGigaSOM_internal(som, localpart(dData), tree, + k, adjust, boost)) + for (p,pid) in enumerate(workers()) ] + + return vcat([fetch(r) for r in dRes]...) + else + return embedGigaSOM_internal(som, data, tree, k, adjust, boost) + end +end + +""" + embedGigaSOM_internal(som::GigaSOM.Som, data::Array{Float64,2}, tree, + k::Int64, adjust::Float64, boost::Float64) + +Internal function to compute parts of the embedding on a prepared kNN-tree +structure (`tree`) and `smooth` converted to `boost`. +""" +function embedGigaSOM_internal(som::GigaSOM.Som, data::Array{Float64,2}, + tree, k::Int64, adjust::Float64, boost::Float64) + + ndata = size(data,1) + dim = size(data,2) + + # output buffer + e = zeros(Float64, ndata, 2) + + # buffer for indexes of the k nearest SOM points + sp = zeros(Int, k) + + # process all data points in this batch + for di in 1:size(data,1) + + # find the nearest neighbors of the point and sort them by distance + (knidx,kndist) = knn(tree, data[di,:], k) + sortperm!(sp, kndist) + knidx=knidx[sp] # nearest point indexes + kndist=kndist[sp] # their corresponding distances + + # compute the scores accordingly to EmbedSOM scoring + sum=0.0 + ssum=0.0 + min=kndist[1] + for i in 1:k + sum += kndist[i] / i + ssum += 1.0/i + if kndist[i] < min + min = kndist[i] + end + end + + # Compute the final scores. The tiny constant avoids hitting zero. + # Higher `smooth` (and therefore `boost`) parameter causes the `sum` to + # be exaggerated, which (after the inverse) results in slower decay of + # the SOM point score with increasing distance from the point that is + # being embedded. + sum = -ssum / (1e-9 + sum * boost) + for i in 1:k + kndist[i] = exp(sum*(kndist[i]-min)) + end + + # `mtx` is used as a matrix of a linear equation (like A|b) with 2 + # unknowns. Derivations of square-error function are added to the + # matrix in a way that solving the matrix (i.e. finding the zero) + # effectively minimizes the error. + mtx = zeros(Float64, 2, 3) + + # The embedding works with pairs of points on the SOM, say I and J. + # Thus there are 2 cycles for all pairs of `i` and `j`. 
+        for i in 1:k
+            idx = knidx[i] # index of I in SOM
+            ix = Float64(som.grid[idx,1]) # position of I in 2D space
+            iy = Float64(som.grid[idx,2])
+            is = kndist[i] # precomputed score of I
+
+            # a bit of single-point gravity helps with avoiding singularities
+            gs = 1e-9 * is
+            mtx[1,1] += gs
+            mtx[2,2] += gs
+            mtx[1,3] += gs*ix
+            mtx[2,3] += gs*iy
+
+            for j in (i+1):k
+                jdx = knidx[j] # same values for J as for I
+                jx = Float64(som.grid[jdx,1])
+                jy = Float64(som.grid[jdx,2])
+                js = kndist[j]
+
+                # compute values for approximation
+                scalar::Float64 = 0 # this will be dot(Point-I, J-I)
+                sqdist::Float64 = 0 # ... norm(J-I)
+
+                for kk in 1:dim
+                    tmp = som.codes[jdx,kk]-som.codes[idx,kk]
+                    sqdist += tmp*tmp
+                    scalar += tmp*(data[di,kk]-som.codes[idx,kk])
+                end
+
+                if scalar != 0
+                    if sqdist == 0
+                        # sounds like I==J ...
+                        continue
+                    else
+                        # If everything went right, `scalar` now becomes the
+                        # position of the point being embedded on the line
+                        # defined by I,J, relatively to both points (I has
+                        # position 0 and J has position 1).
+                        scalar /= sqdist
+                    end
+                end
+
+                # Process this information into matrix coefficients that give
+                # the derivative of the error that the resulting point will
+                # have from the `scalar` position between the 2D images of
+                # I and J.
+                #
+                # Note: I originally did the math by hand, but, seriously, do
+                # not waste time with that and use e.g. Sage for getting the
+                # derivatives right if anything should get modified here.
+                hx = jx-ix
+                hy = jy-iy
+                hpxy = hx*hx+hy*hy
+                ihpxy = 1/hpxy
+                # A higher `adjust` parameter lowers the approximation
+                # influence of SOM points that are too far away in 2D.
+                s = is*js/(hpxy^adjust)
+                diag = s * hx * hy * ihpxy
+                rhsc = s * (scalar + (hx*ix+hy*iy) * ihpxy)
+
+                mtx[1,1] += s * hx * hx * ihpxy
+                mtx[2,2] += s * hy * hy * ihpxy
+
+                mtx[1,2] += diag
+                mtx[2,1] += diag
+
+                mtx[1,3] += hx*rhsc
+                mtx[2,3] += hy*rhsc
+            end
+        end
+
+        # Now the matrix contains the derivative of the error sum function;
+        # solving the matrix using Cramer's rule means finding the zero of the
+        # derivative, which gives the minimum-error position, which is in turn
+        # the desired embedded point position that gets saved to output `e`.
+        det = mtx[1,1]*mtx[2,2] - mtx[1,2]*mtx[2,1]
+        e[di,1] = (mtx[1,3]*mtx[2,2] - mtx[1,2]*mtx[2,3]) / det
+        e[di,2] = (mtx[1,1]*mtx[2,3] - mtx[2,1]*mtx[1,3]) / det
+    end
+
+    return e
+end
diff --git a/src/io/input.jl b/src/io/input.jl
index 59c2ecc4..3f220c52 100644
--- a/src/io/input.jl
+++ b/src/io/input.jl
@@ -1,4 +1,3 @@
-
 """
     readFlowset(filenames)

@@ -8,13 +7,28 @@ Create a dictionary with filenames as keys and daFrame as values
 - `filenames`: Array of type string
 """
 function readFlowset(filenames)
+
     flowFrame = Dict()

     # read all FCS files into flowFrame
     for name in filenames # file list
         flowrun = FileIO.load(name) # FCS file
+
+        # get metadata
+        # FCSFiles returns a dict with column names as keys.
+        # As the dict is not ordered, use the name column from the metadata
+        # to sort the DataFrame after the cast.
+        meta = getMetaData(flowrun)
+        markers = meta[:,1]
+        markersIsotope = meta[:,5]
         flowDF = DataFrame(flowrun.data)
+        # sort the DF according to the marker list
+        flowDF = flowDF[:, Symbol.(markersIsotope)]
+        cleanNames!(markers)
+
+        names!(flowDF, Symbol.(markers), makeunique=true)
         flowFrame[name] = flowDF
     end
+
     return flowFrame
 end
diff --git a/src/io/process.jl b/src/io/process.jl
index 72850548..2e2503f9 100644
--- a/src/io/process.jl
+++ b/src/io/process.jl
@@ -85,7 +85,7 @@ function createDaFrame(fcsRaw, md, panel)

     for i in eachindex(md.file_name)
         df = fcsRaw[md.file_name[i]]
-        df[:sample_id] = string(md.sample_id[i])
+        insertcols!(df, 1, sample_id = string(md.sample_id[i]))
     end

     dfall = []
@@ -94,6 +94,9 @@ function createDaFrame(fcsRaw, md, panel)
     end
     dfall = vcat(dfall...)
     cc = map(Symbol, vcat(lineageMarkers, functionalMarkers))
+    # markers can be lineage and functional at the same time,
+    # therefore make cc unique
+    unique!(cc)
     push!(cc, :sample_id)
     # reduce the dataset to lineage (and later) functional (state) markers
     dfall = dfall[:, cc]
@@ -112,8 +115,8 @@ Returns the `lineageMarkers` and `functionalMarkers` on a given panel
 function getMarkers(panel)

     # extract lineage markers
-    lineageMarkers = panel.fcs_colname[panel.Lineage .== 1, : ]
-    functionalMarkers = panel.fcs_colname[panel.Functional .== 1, :]
+    lineageMarkers = panel.Antigen[panel.Lineage .== 1, : ]
+    functionalMarkers = panel.Antigen[panel.Functional .== 1, : ]

     # lineageMarkers are 2d array,
     # flatten this array by using vec:
@@ -125,3 +128,62 @@ function getMarkers(panel)

     return lineageMarkers, functionalMarkers
 end
+
+"""
+    getMetaData(f)
+
+Collect the metadata information in a more user-friendly format.
+
+# Arguments:
+- `f`: input structure with `.params` and `.data` fields
+"""
+function getMetaData(f)
+
+    # declarations and initializations
+    meta = f.params
+    metaKeys = keys(meta)
+    channel_properties = []
+    defaultValue = "None"
+
+    # determine the number of channels
+    pars = parse(Int, strip(join(meta["\$PAR"])))
+
+    # determine the range of channel numbers
+    channel_numbers = 1:pars
+
+    # determine the channel properties
+    for (key,) in meta
+        if key[1:3] == "\$P1"
+            if !occursin(key[4], "0123456789")
+                push!(channel_properties, key[4:end])
+            end
+        end
+    end
+
+    # define the column names
+    column_names = ["\$Pn$p" for p in channel_properties]
+
+    # create a data frame
+    df = DataFrame([Vector{Any}(undef, 0) for i = 1:length(column_names)])
+
+    # fill the data frame
+    for ch in channel_numbers
+        # build each row of the data table first
+        tmpV = []
+        for p in channel_properties
+            if "\$P$ch$p" in metaKeys
+                tmp = meta["\$P$ch$p"]
+            else
+                tmp = defaultValue
+            end
+            push!(tmpV, tmp)
+        end
+
+        # push the row to the dataframe
+        push!(df, tmpV)
+    end
+
+    # set the names of the df
+    names!(df, Symbol.(column_names))
+
+    return df
+end
\ No newline at end of file
diff --git a/src/satellites.jl b/src/satellites.jl
index 666f67ce..c85ad3e1 100644
--- a/src/satellites.jl
+++ b/src/satellites.jl
@@ -29,7 +29,7 @@ end

 """
-    gaussianKernel(x::Array{Float64, 1}, r::Float64)::Array{Float64, 1}
+    gaussianKernel(x, r::Float64)

 Return Gaussian(x) for μ=0.0 and σ = r/3.
 (a value of σ = r/3 makes the training results comparable between different kernels
 for the same values of r).
 # Arguments
+- `x`: a distance or an array of distances
+- `r`: training radius
 """
-function gaussianKernel(x::Array{Float64, 1}, r::Float64)::Array{Float64, 1}
+function gaussianKernel(x, r::Float64)

     return Distributions.pdf.(Distributions.Normal(0.0,r/3), x)
 end
diff --git a/test/batch.jl b/test/batch.jl
index 22e574f8..facaf189 100644
--- a/test/batch.jl
+++ b/test/batch.jl
@@ -23,6 +23,8 @@ som2 = trainGigaSOM(som2, dfSom, epochs = 1)

 winners = mapToGigaSOM(som2, dfSom)

+embed = embedGigaSOM(som2, dfSom, k=10, smooth=0.0, adjust=0.5)
+
 #test batch
 @testset "Batch" begin
     codes = som2.codes
@@ -30,25 +32,30 @@ winners = mapToGigaSOM(som2, dfSom)

     dfCodes = DataFrame(codes)
     names!(dfCodes, Symbol.(som2.colNames))
+    dfEmbed = DataFrame(embed)
     CSV.write(genDataPath*"/batchDfCodes.csv", dfCodes)
     CSV.write(genDataPath*"/batchWinners.csv", winners)
+    CSV.write(genDataPath*"/batchEmbedded.csv", dfEmbed)

     #load the ref data
     refBatchDfCodes = CSV.File(refDataPath*"/refBatchDfCodes.csv") |> DataFrame
     refBatchWinners = CSV.File(refDataPath*"/refBatchWinners.csv") |> DataFrame
+    refBatchEmbedded = CSV.File(refDataPath*"/refBatchEmbedded.csv") |> DataFrame

     #load the generated data
     batchDfCodes = CSV.File(genDataPath*"/batchDfCodes.csv") |> DataFrame
     batchDfCodesTest = first(batchDfCodes, 10)
     batchWinners = CSV.File(genDataPath*"/batchWinners.csv") |> DataFrame
     batchWinnersTest = first(batchWinners, 10)
+    batchEmbedded = CSV.File(genDataPath*"/batchEmbedded.csv") |> DataFrame
+    batchEmbeddedTest = first(batchEmbedded, 10)

     # test the generated data against the reference data
     @test refBatchWinners == batchWinnersTest
     @test refBatchDfCodes == batchDfCodesTest
+    @test Array{Float64,2}(refBatchEmbedded) ≈ Array{Float64,2}(batchEmbeddedTest) atol=1e-4

     for (i, j) in zip(batchDfCodesTest[:,1], refBatchDfCodes[:,1])
         @test isapprox(i, j; atol = 0.001)
     end
-    @test refBatchWinners == batchWinnersTest

 end
diff --git a/test/io.jl b/test/io.jl
index c3eaee1e..643afb45 100644
--- a/test/io.jl
+++ b/test/io.jl
@@ -1,13 +1,6 @@
 # Load and transform
 # build the general workflow to have the data ready

-#=
-using FCSFiles for loading
-as this function is only the basic parsing of the binary
-FCS, we need to see what functionality is missing and
-extend this in the original package
-=#
-
 checkDir()

 #create genData and data folder and change dir to dataPath
@@ -45,15 +38,8 @@ for f in dataFiles
     end
 end

-md = DataFrame(XLSX.readtable("PBMC8_metadata.xlsx", "Sheet1")...)
-panel = DataFrame(XLSX.readtable("PBMC8_panel.xlsx", "Sheet1")...)
-panel[:Isotope] = map(string, panel[:Isotope])
-panel[:Metal] = map(string, panel[:Metal])
-panel[:Antigen] = map(string, panel[:Antigen])
-panel.Metal[1]=""
-
-insertcols!(panel,4,:fcs_colname => map((x,y,z)->x.*"(".*y.*z.*")".*"Dd",panel[:Antigen],panel[:Metal],panel[:Isotope]))
-print(panel.fcs_colname)
+md = DataFrame(XLSX.readtable("PBMC8_metadata.xlsx", "Sheet1", infer_eltypes=true)...)
+panel = DataFrame(XLSX.readtable("PBMC8_panel.xlsx", "Sheet1", infer_eltypes=true)...)
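+# infer_eltypes=true makes XLSX.readtable return properly typed columns
+# instead of `Any` columns, removing the need for the manual string
+# conversions of the panel columns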
 lineageMarkers, functionalMarkers = getMarkers(panel)

@@ -66,22 +52,9 @@ daf = createDaFrame(fcsRaw, md, panel)
 # change the directory back to the current directory
 cd(cwd)

-CSV.write(genDataPath*"/daf.csv", daf.fcstable)
+# check that the markers from the panel file match those loaded from the FCS file

-@testset "Cleaning names" begin
-    for i in eachindex(lineageMarkers)
-        @test !in("-",i)
-    end
-    for i in eachindex(functionalMarkers)
-        @test !in("-",i)
-    end
-    for (k,v) in fcsRaw
-        colnames = names(v)
-        for i in eachindex(colnames)
-            @test !in("-",i)
-        end
-    end
-end
+CSV.write(genDataPath*"/daf.csv", daf.fcstable)

 @testset "Checksums" begin
     cd(dataPath)
diff --git a/test/parallel.jl b/test/parallel.jl
index e10a1bb4..e4d3e51f 100644
--- a/test/parallel.jl
+++ b/test/parallel.jl
@@ -34,10 +34,12 @@ som2 = initGigaSOM(dfSom, 10, 10)
     @test som2.numCodes == 100
 end

-som2 = trainGigaSOM(som2, dfSom, epochs = 2, r = 6.0)
+som2 = trainGigaSOM(som2, dfSom, epochs = 2, rStart = 6.0)

 winners = mapToGigaSOM(som2, dfSom)

+embed = embedGigaSOM(som2, dfSom, k=10, smooth=0.0, adjust=0.5)
+
 #test parallel
 @testset "Parallel" begin
     codes = som2.codes
@@ -45,22 +47,28 @@ winners = mapToGigaSOM(som2, dfSom)

     dfCodes = DataFrame(codes)
     names!(dfCodes, Symbol.(som2.colNames))
+    dfEmbed = DataFrame(embed)
     CSV.write(genDataPath*"/parallelDfCodes.csv", dfCodes)
     CSV.write(genDataPath*"/parallelWinners.csv", winners)
+    CSV.write(genDataPath*"/parallelEmbedded.csv", dfEmbed)

     # load the ref data
     refParallelDfCodes = CSV.File(refDataPath*"/refParallelDfCodes.csv") |> DataFrame
     refParallelWinners = CSV.File(refDataPath*"/refParallelWinners.csv") |> DataFrame
+    refParallelEmbedded = CSV.File(refDataPath*"/refParallelEmbedded.csv") |> DataFrame

     # load the generated data
     parallelDfCodes = CSV.File(genDataPath*"/parallelDfCodes.csv") |> DataFrame
     parallelDfCodesTest = first(parallelDfCodes, 10)
     parallelWinners = CSV.File(genDataPath*"/parallelWinners.csv") |> DataFrame
     parallelWinnersTest = first(parallelWinners, 10)
+    parallelEmbedded = CSV.File(genDataPath*"/parallelEmbedded.csv") |> DataFrame
+    parallelEmbeddedTest = first(parallelEmbedded, 10)

     # test the generated data against the reference data
     @test refParallelWinners == parallelWinnersTest
     @test refParallelDfCodes == parallelDfCodesTest
+    @test Array{Float64,2}(refParallelEmbedded) ≈ Array{Float64,2}(parallelEmbeddedTest) atol=1e-4

     #test parallel
     for (i, j) in zip(parallelDfCodes[:,1], refParallelDfCodes[:,1])
diff --git a/test/refData/refBatchDfCodes.csv b/test/refData/refBatchDfCodes.csv
index d72f5b0a..c67d4c02 100644
--- a/test/refData/refBatchDfCodes.csv
+++ b/test/refData/refBatchDfCodes.csv
@@ -1,11 +1,11 @@
-CD3(110:114)Dd,CD45(In115)Dd,CD4(Nd145)Dd,CD20(Sm147)Dd,CD33(Nd148)Dd,CD123(Eu151)Dd,CD14(Gd160)Dd,IgM(Yb171)Dd,HLA_DR(Yb174)Dd,CD7(Yb176)Dd
-2.175616531805762,5.060728572811399,1.5229638637283334,0.41148062163875176,0.12332053648280498,0.05529462666982641,0.39361863238273054,0.11273347988874298,0.6977747762513635,2.3388150644487227
-2.1552203089403434,5.105004902032582,1.5126715433000124,0.3888151909899564,0.15834607103998016,0.10090111790411453,0.39229002690782594,0.11722165791058765,0.7635821962087388,2.3090237808125864
-2.1084444326938336,5.143767314759177,1.5390179381090656,0.3816741252056492,0.2095164485004019,0.1751108615768694,0.40986917166552866,0.12353496507627343,0.8595189128508285,2.286900847170541
-2.0478645009318663,5.173068037527361,1.608617343836585,0.4002847331743505,0.26909338507823993,0.2694286699511748,0.43991013275425644,0.1314554707787015,0.9628081274109022,2.2927319021790185 -2.0009630051808327,5.1912907670521555,1.717351344726903,0.4484517252326335,0.3215681095045519,0.3598931086166029,0.4665801800545376,0.13991955532542874,1.0353515892276948,2.3412546503214613 -1.9963143172070326,5.200218970431588,1.8489134893221697,0.5179925093063705,0.35283356228264184,0.41832660633806856,0.4731031447917208,0.14753882075245547,1.0462935842948398,2.425088256503504 -2.0444376630090693,5.203531338915684,1.9811812263590416,0.5914701206773774,0.3603702311690916,0.4301473221495437,0.45305599023373083,0.15340423546667956,0.9931451773123359,2.513248737885701 -2.132864112792643,5.204436313218857,2.0951006258837324,0.6511073471646257,0.35331058707785146,0.4003508279999249,0.412807227641389,0.1572109461770441,0.8988787527072348,2.5693004348215505 -2.237290805951987,5.2047134978740575,2.179995191316209,0.6860197233259319,0.3440251915584102,0.3450422739734919,0.36479614814355105,0.15880104822419322,0.7926609562712944,2.571775181173908 -2.3355904137223003,5.205067631563134,2.234425716614674,0.694313423237409,0.3406010925093756,0.2803955179657554,0.3199159570252268,0.15793502043044785,0.6948087789462455,2.521732406019195 +CD3,CD45,CD4,CD20,CD33,CD123,CD14,IgM,HLA_DR,CD7 +2.1756165318060905,5.060728572812074,1.5229638637285616,0.4114806216388121,0.12332053648282311,0.05529462666983472,0.39361863238279243,0.11273347988876002,0.6977747762514656,2.338815064449122 +2.155220308942053,5.105004902036477,1.5126715433011924,0.3888151909902582,0.15834607104010073,0.10090111790419122,0.3922900269081193,0.11722165791067655,0.7635821962093082,2.3090237808143623 +2.108444432693398,5.143767314758162,1.5390179381087805,0.3816741252055754,0.20951644850036094,0.17511086157683697,0.4098691716654458,0.12353496507625035,0.8595189128506652,2.2869008471701364 +2.047864500931419,5.17306803752616,1.6086173438362115,0.4002847331742631,0.26909338507818165,0.2694286699511155,0.4399101327541554,0.13145547077867287,0.9628081274107008,2.292731902178483 +2.0009630051805583,5.191290767051384,1.7173513447266693,0.44845172523257293,0.3215681095045031,0.3598931086165544,0.4665801800544706,0.13991955532540803,1.0353515892275635,2.34125465032111 +1.9963143172068711,5.200218970431108,1.84891348932201,0.5179925093063311,0.3528335622826112,0.41832660633803964,0.4731031447916818,0.1475388207524405,1.0462935842947532,2.425088256503339 +2.0444376630083894,5.203531338914059,1.9811812263584025,0.5914701206771869,0.36037023116898115,0.4301473221493981,0.45305599023359017,0.153404235466628,0.9931451773120265,2.513248737884906 +2.1328641127916406,5.204436313216373,2.095100625882708,0.6511073471643177,0.3533105870776819,0.4003508279997395,0.4128072276411971,0.15721094617697265,0.8988787527068168,2.569300434820387 +2.2372908059515346,5.204713497872974,2.179995191315799,0.6860197233258101,0.3440251915583472,0.3450422739734255,0.36479614814347805,0.15880104822416272,0.7926609562711434,2.5717751811734044 +2.3355904137231356,5.205067631564888,2.234425716615429,0.6943134232376476,0.34060109250948684,0.28039551796585294,0.31991595702533876,0.15793502043050475,0.6948087789465008,2.5217324060200146 diff --git a/test/refData/refBatchEmbedded.csv b/test/refData/refBatchEmbedded.csv new file mode 100644 index 00000000..b2f7778c --- /dev/null +++ b/test/refData/refBatchEmbedded.csv @@ -0,0 +1,11 @@ +x1,x2 +1.4122797110018108,9.554717370430515 
+5.673549502562449,9.046293168967383 +8.38125958050031,7.74180008088651 +0.23961959628442586,3.144019106918954 +8.732424427413413,0.7467367735623084 +8.394356416680376,2.0422371637406127 +3.3719595621325835,10.072636427464202 +8.357830143332713,0.3500116695793543 +8.359900565037359,-0.03878747873780514 +7.837396495584574,0.061731350853772876 diff --git a/test/refData/refParallelDfCodes.csv b/test/refData/refParallelDfCodes.csv index 613a88cb..737b987b 100644 --- a/test/refData/refParallelDfCodes.csv +++ b/test/refData/refParallelDfCodes.csv @@ -1,11 +1,11 @@ -CD3(110:114)Dd,CD45(In115)Dd,CD4(Nd145)Dd,CD20(Sm147)Dd,CD33(Nd148)Dd,CD123(Eu151)Dd,CD14(Gd160)Dd,IgM(Yb171)Dd,HLA_DR(Yb174)Dd,CD7(Yb176)Dd -2.0629621170979466,5.3277563699675365,2.0054117094661663,0.3066346545365378,0.7824808453466526,0.883851920761679,0.8902463709644877,0.09892096920302673,1.9588908909412357,1.4386535429441432 -2.0988629828918928,5.312328969687483,2.1392257004275232,0.37600994465064347,0.7427942546094817,0.9291799090359916,0.9063072569371531,0.12230722597175575,1.8791733952491325,1.712598787627129 -2.3378477920104093,5.28692051275768,2.436166494580142,0.46032669020277744,0.5760498793628477,0.7974685225999406,0.8044838584081934,0.1428701120200078,1.5342167837104532,2.221231793708745 -2.714614626795058,5.268055160709281,2.8640062331734084,0.5342561333031842,0.387974289148436,0.5383695800253903,0.6352078233870215,0.1522501885219872,1.0658693561036108,2.801191859078284 -2.967439138121573,5.256382255470344,3.111703252138898,0.5654866001098249,0.2783594846860194,0.29784443517508546,0.4875240508758867,0.14857195492279018,0.6995786968897297,3.1809640915824158 -3.035526780795569,5.244508138610567,3.0508007511758106,0.5524779929875899,0.22029930424218427,0.13077427335545166,0.3838148833891155,0.13862222698503776,0.4760700580168021,3.3490554537770247 -2.961610008718297,5.233739144845976,2.644667797825496,0.49666705117889676,0.18061577647687457,0.026637160011769596,0.31626050179864135,0.12581782143819006,0.36313033021425645,3.384369632937927 -2.7932821637328673,5.238243014392942,1.7905255679804082,0.38561746359441257,0.13705479649727137,-0.027229978971610704,0.27719900365754685,0.11150797971440587,0.35730583950760325,3.3785630125780393 -2.6656809733356046,5.261774048050473,0.883260838879093,0.2667013749637191,0.08969762957262056,-0.03672430841805612,0.2617864027292688,0.10607427790113655,0.42962477520735004,3.455693317498068 -2.615201034993371,5.274573786091328,0.4591396934989689,0.2081765075319292,0.0629857468702776,-0.032841674400200235,0.2556235250363734,0.10803971966274906,0.4860646466338831,3.5411308119994005 +CD3,CD45,CD4,CD20,CD33,CD123,CD14,IgM,HLA_DR,CD7 +2.020640867980483,5.514996489969505,1.995301723659039,0.06899596158636928,1.2129551321274725,1.1069446117485457,1.1346233736136662,0.017197416780657445,2.975833689214212,0.31267711405251 +1.5830206942528937,5.4122393348267375,2.2798800878922787,0.46998755126497455,1.5601108770600722,2.178754751890501,1.7445066811665473,0.1829815188382115,3.3791170383793707,1.1016023386405283 +2.646626921944079,5.36424524475027,2.8685115441184177,0.351714190973696,0.6722059514338777,1.5732683519728468,1.307091333600342,0.17184070133357132,2.276438760006891,2.4291104771610765 +3.087475799364081,5.272351901065231,3.4952459133431444,0.27587699315783765,0.3070691906843155,1.0803519272833533,1.0119720671319512,0.15709512788240346,1.6608176334771196,2.7930136773554226 
+3.3462441727866614,5.298290933193479,3.869831636268609,0.5215927042859821,0.3029917996785627,0.5394454126183926,0.6772867699173628,0.16945524353831795,0.9551020648193844,3.274027076971062 +3.4404760310836124,5.266105987480576,3.764911979894159,0.5530998732558818,0.15632529950036322,-0.040985864969088785,0.2899359029060973,0.12325992321158036,0.23323149959588643,3.6166783978301016 +3.3312565996404313,5.211286242599864,2.763709190815365,0.2978042897318974,0.04443802207467512,-0.199622611326383,0.12317447549925953,0.09535707470201686,0.05946989342744581,3.6053971525531034 +3.286255172976288,5.310956466494462,1.8091712012292103,0.3210466344176388,0.07853789975196096,-0.1839453846196577,0.1720734463089416,0.05454526988525985,0.07255558669838269,3.4553720156665477 +3.3815655651083887,5.384944293666001,1.137980111511053,0.3701020936716946,0.09309025558281249,-0.12548891363272066,0.2052008453837239,0.04079216310931853,0.09477640099200865,3.334953572799042 +3.3757558742882576,5.463206608810518,0.0736088205090655,0.18928657599330787,0.04516596061330867,-0.03387842067151006,0.2632907059800977,0.12116638011191705,0.45223371079929814,3.722049662056219 diff --git a/test/refData/refParallelEmbedded.csv b/test/refData/refParallelEmbedded.csv new file mode 100644 index 00000000..49f3cec0 --- /dev/null +++ b/test/refData/refParallelEmbedded.csv @@ -0,0 +1,11 @@ +x1,x2 +2.286779932669506,8.172094657927772 +3.126570370524788,6.3253341809112715 +3.9788943199739335,6.28020293572781 +8.694472274715695,0.4548039537534669 +6.082817977381265,2.2586782127376765 +6.759115714316216,2.4322241043201207 +4.199049482506716,8.796680105523677 +4.852853783118988,1.9069192395167651 +3.566209335588983,2.1472912990016204 +3.54926978254048,-0.32069998181743953 diff --git a/test/refData/refParallelWinners.csv b/test/refData/refParallelWinners.csv index eb376178..be30985e 100644 --- a/test/refData/refParallelWinners.csv +++ b/test/refData/refParallelWinners.csv @@ -1,11 +1,11 @@ index 93 90 -54 +76 10 -6 8 +10 95 6 -16 -5 +25 +4