Skip to content

Commit

Permalink
Merge pull request #60 from oHunewald/get-metadata-rebased
Browse files Browse the repository at this point in the history
Large FCS file i/o
  • Loading branch information
laurentheirendt authored Sep 6, 2019
2 parents 9afe8b9 + 465e844 commit cef6011
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 36 deletions.
16 changes: 15 additions & 1 deletion src/io/input.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

"""
readFlowset(filenames)
Expand All @@ -8,13 +7,28 @@ Create a dictionary with filenames as keys and daFrame as values
- `filenames`: Array of type string
"""
function readFlowset(filenames)

    # maps every entry of `filenames` to the DataFrame built from that file
    flowFrame = Dict()

    # read all FCS files into flowFrame
    for name in filenames
        fcs = FileIO.load(name) # FCS file

        # FCSFiles returns the data as a dict with the column names as keys.
        # As a dict is not ordered, the name columns of the metadata table
        # are used to put the DataFrame columns in order after the cast.
        meta = getMetaData(fcs)
        # column 1: names given to the final DataFrame columns
        # column 5: names used to select the columns of the loaded data
        # NOTE(review): both are positional lookups, so they rely on the
        # column order produced by getMetaData -- confirm they stay in sync
        markers = meta[:, 1]
        dataColumns = meta[:, 5]

        # cast the loaded data and order its columns like the metadata rows
        flowDF = DataFrame(fcs.data)[:, Symbol.(dataColumns)]

        # clean the marker names in place, then use them as column names
        cleanNames!(markers)
        names!(flowDF, Symbol.(markers), makeunique=true)

        flowFrame[name] = flowDF
    end

    return flowFrame
end
66 changes: 64 additions & 2 deletions src/io/process.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ function createDaFrame(fcsRaw, md, panel)
end
dfall = vcat(dfall...)
cc = map(Symbol, vcat(lineageMarkers, functionalMarkers))
# markers can be lineage and functional at the same time
# therefore make cc unique
unique!(cc)
push!(cc, :sample_id)
# reduce the dataset to lineage (and later) functional (state) markers
dfall = dfall[:, cc]
Expand All @@ -112,8 +115,8 @@ Returns the `lineageMarkers` and `functionalMarkers` on a given panel
function getMarkers(panel)

# extract lineage markers
lineageMarkers = panel.fcs_colname[panel.Lineage .== 1, : ]
functionalMarkers = panel.fcs_colname[panel.Functional .== 1, :]
lineageMarkers = panel.Antigen[panel.Lineage .== 1, : ]
functionalMarkers = panel.Antigen[panel.Functional .== 1, : ]

# lineageMarkers are 2d array,
# flatten this array by using vec:
Expand All @@ -125,3 +128,62 @@ function getMarkers(panel)
return lineageMarkers, functionalMarkers

end

"""
getMetaData(f)
Collect the meta data information in a more user friendly format.
# Arguments:
- `f`: input structure with `.params` and `.data` fields
"""
function getMetaData(f)
    # Builds a table with one row per channel and one column per channel
    # property. Only `f.params` is read; it is looked up by string keys of
    # the form "\$P<channel><property>". The columns are named
    # "\$Pn<property>" and a property missing for a channel is filled with
    # the string "None".

    # declarations and initializations
    meta = f.params
    defaultValue = "None"

    # determine the number of channels
    pars = parse(Int, strip(join(meta["\$PAR"])))

    # determine the range of channel numbers
    channel_numbers = 1:pars

    # determine the channel properties from the keys of the first channel.
    # `startswith` and the length check keep short keys (fewer than four
    # characters) from raising a BoundsError; a digit in fourth position
    # means the key belongs to channel 10, 11, ... and is skipped.
    # The iteration order of `meta` is kept on purpose, as it defines the
    # column order of the returned table.
    channel_properties = String[]
    for key in keys(meta)
        if startswith(key, "\$P1") && length(key) > 3 && !isdigit(key[4])
            push!(channel_properties, key[4:end])
        end
    end

    # define the column names
    column_names = ["\$Pn$p" for p in channel_properties]

    # create a data frame with one (still empty) column per property
    df = DataFrame([Vector{Any}(undef, 0) for _ in column_names])

    # fill the data frame, one row per channel
    for ch in channel_numbers
        # build the row; properties missing for this channel get the default
        row = Any[get(meta, "\$P$ch$p", defaultValue) for p in channel_properties]

        # push the row to the dataframe
        push!(df, row)
    end

    # set the names of the df
    names!(df, Symbol.(column_names))

    return df
end
35 changes: 4 additions & 31 deletions test/io.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,6 @@
# Load and transform
# build the general workflow to have the data ready

#=
using FCSFiles for loading
as this function is only the basic parsing of the binary
FCS, we need to see what functionality is missing and
extend this in the original package
=#

checkDir()

#create genData and data folder and change dir to dataPath
Expand Down Expand Up @@ -45,15 +38,8 @@ for f in dataFiles
end
end

md = DataFrame(XLSX.readtable("PBMC8_metadata.xlsx", "Sheet1")...)
panel = DataFrame(XLSX.readtable("PBMC8_panel.xlsx", "Sheet1")...)
panel.Isotope = map(string, panel.Isotope)
panel.Metal = map(string, panel.Metal)
panel.Antigen = map(string, panel.Antigen)
panel.Metal[1]=""

insertcols!(panel,4,:fcs_colname => map((x,y,z)->x.*"(".*y.*z.*")".*"Dd",panel.Antigen,panel.Metal,panel.Isotope))
print(panel.fcs_colname)
md = DataFrame(XLSX.readtable("PBMC8_metadata.xlsx", "Sheet1", infer_eltypes=true)...)
panel = DataFrame(XLSX.readtable("PBMC8_panel.xlsx", "Sheet1", infer_eltypes=true)...)

lineageMarkers, functionalMarkers = getMarkers(panel)

Expand All @@ -66,22 +52,9 @@ daf = createDaFrame(fcsRaw, md, panel)
# change the directory back to the current directory
cd(cwd)

CSV.write(genDataPath*"/daf.csv", daf.fcstable)
#check if the markers from panel file are the same as loaded from the fcs file

# The marker names taken from the panel and the column names of every loaded
# file must not contain a "-". The loops run over the names themselves (not
# over their indices), so each name is actually inspected.
@testset "Cleaning names" begin
    for marker in lineageMarkers
        @test !occursin("-", string(marker))
    end
    for marker in functionalMarkers
        @test !occursin("-", string(marker))
    end
    for df in values(fcsRaw)
        # string() makes the check work for Symbol as well as String names
        for colname in names(df)
            @test !occursin("-", string(colname))
        end
    end
end
CSV.write(genDataPath*"/daf.csv", daf.fcstable)

@testset "Checksums" begin
cd(dataPath)
Expand Down
2 changes: 1 addition & 1 deletion test/refData/refBatchDfCodes.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CD3(110:114)Dd,CD45(In115)Dd,CD4(Nd145)Dd,CD20(Sm147)Dd,CD33(Nd148)Dd,CD123(Eu151)Dd,CD14(Gd160)Dd,IgM(Yb171)Dd,HLA_DR(Yb174)Dd,CD7(Yb176)Dd
CD3,CD45,CD4,CD20,CD33,CD123,CD14,IgM,HLA_DR,CD7
2.1756165318060905,5.060728572812074,1.5229638637285616,0.4114806216388121,0.12332053648282311,0.05529462666983472,0.39361863238279243,0.11273347988876002,0.6977747762514656,2.338815064449122
2.155220308942053,5.105004902036477,1.5126715433011924,0.3888151909902582,0.15834607104010073,0.10090111790419122,0.3922900269081193,0.11722165791067655,0.7635821962093082,2.3090237808143623
2.108444432693398,5.143767314758162,1.5390179381087805,0.3816741252055754,0.20951644850036094,0.17511086157683697,0.4098691716654458,0.12353496507625035,0.8595189128506652,2.2869008471701364
Expand Down
2 changes: 1 addition & 1 deletion test/refData/refParallelDfCodes.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CD3(110:114)Dd,CD45(In115)Dd,CD4(Nd145)Dd,CD20(Sm147)Dd,CD33(Nd148)Dd,CD123(Eu151)Dd,CD14(Gd160)Dd,IgM(Yb171)Dd,HLA_DR(Yb174)Dd,CD7(Yb176)Dd
CD3,CD45,CD4,CD20,CD33,CD123,CD14,IgM,HLA_DR,CD7
2.020640867980483,5.514996489969505,1.995301723659039,0.06899596158636928,1.2129551321274725,1.1069446117485457,1.1346233736136662,0.017197416780657445,2.975833689214212,0.31267711405251
1.5830206942528937,5.4122393348267375,2.2798800878922787,0.46998755126497455,1.5601108770600722,2.178754751890501,1.7445066811665473,0.1829815188382115,3.3791170383793707,1.1016023386405283
2.646626921944079,5.36424524475027,2.8685115441184177,0.351714190973696,0.6722059514338777,1.5732683519728468,1.307091333600342,0.17184070133357132,2.276438760006891,2.4291104771610765
Expand Down

0 comments on commit cef6011

Please sign in to comment.