diff --git a/NEWS.md b/NEWS.md index e1b8dd49b..8c066a180 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ Each release typically has a number of minor bug fixes beyond what is listed her # Version 0.7.1 * `Geom.contour`: add support for `DataFrame` (#1150) + * `Geom.density`: add ability to use custom kernels and adds support for scaling, stacking, vertical orientation ([#1157](https://github.com/GiovineItalia/Gadfly.jl/pull/1157)) + * `Geom.violin`: add ability to adjust scaling and bandwidth and support for horizontal and split violins ([#1157](https://github.com/GiovineItalia/Gadfly.jl/pull/1157)) # Version 0.7.0 @@ -23,7 +25,7 @@ Each release typically has a number of minor bug fixes beyond what is listed her # Version 0.6.4 * Regression testing tools (#1020) - + # Version 0.6.3 * Wide format data (#1013) @@ -214,5 +216,3 @@ Each release typically has a number of minor bug fixes beyond what is listed her keys are wrapped automatically. * Default Theme changes. - - diff --git a/docs/src/gallery/geometries.md b/docs/src/gallery/geometries.md index 98295d8c4..010d42c53 100644 --- a/docs/src/gallery/geometries.md +++ b/docs/src/gallery/geometries.md @@ -92,8 +92,9 @@ gridstack([p1 p2; p3 p4]) ```@example using Gadfly, RDatasets, Distributions set_default_plot_size(21cm, 8cm) -p1 = plot(dataset("ggplot2", "diamonds"), x="Price", Geom.density) -p2 = plot(dataset("ggplot2", "diamonds"), x="Price", color="Cut", Geom.density) +data = dataset("ggplot2", "diamonds") +p1 = plot(data, x="Price", Geom.density) +p2 = plot(data, x="Price", color="Cut", Geom.density) hstack(p1,p2) ``` @@ -102,13 +103,27 @@ using Gadfly, RDatasets, Distributions set_default_plot_size(14cm, 8cm) dist = MixtureModel(Normal, [(0.5, 0.2), (1, 0.1)]) xs = rand(dist, 10^5) -plot(layer(x=xs, Geom.density, Theme(default_color="orange")), +plot(layer(x=xs, Geom.density, Theme(default_color="orange")), layer(x=xs, Geom.density(bandwidth=0.0003), Theme(default_color="green")), layer(x=xs, Geom.density(bandwidth=0.25), Theme(default_color="purple")), Guide.manual_color_key("bandwidth", ["auto", "bw=0.0003", "bw=0.25"], ["orange", "green", "purple"])) ``` +```@example +using Gadfly, RDatasets +set_default_plot_size(21cm, 8cm) +data = dataset("ggplot2", "diamonds") +p1 = plot(data, x=:Carat, color=:Cut, Geom.density(position=:stack), Guide.title("Loses marginal densities")) +p2 = plot(data, x=:Carat, color=:Cut, Geom.density(position=:stack, scale=:count), Guide.title("Preserve marginal densities")) +hstack(p1, p2) +``` + +```@example +using Gadfly, RDatasets +plot(dataset("ggplot2", "diamonds"), x=:Carat, color=:Cut, Geom.density(position=:fill), Guide.title("Conditional density estimate"), Coord.cartesian(ymax=1.0, xmax=5)) +``` + ## [`Geom.density2d`](@ref) @@ -464,8 +479,8 @@ using Gadfly, RDatasets set_default_plot_size(21cm, 8cm) coord = Coord.cartesian(xmin=-2, xmax=2, ymin=-2, ymax=2) -p1 = plot(coord, z=(x,y)->x*exp(-(x^2+y^2)), - xmin=[-2], xmax=[2], ymin=[-2], ymax=[2], +p1 = plot(coord, z=(x,y)->x*exp(-(x^2+y^2)), + xmin=[-2], xmax=[2], ymin=[-2], ymax=[2], # or: x=-2:0.25:2.0, y=-2:0.25:2.0, Geom.vectorfield(scale=0.4, samples=17), Geom.contour(levels=6), Scale.x_continuous(minvalue=-2.0, maxvalue=2.0), @@ -473,7 +488,7 @@ p1 = plot(coord, z=(x,y)->x*exp(-(x^2+y^2)), Guide.xlabel("x"), Guide.ylabel("y"), Guide.colorkey(title="z")) volcano = Matrix{Float64}(dataset("datasets", "volcano")) -volc = volcano[1:4:end, 1:4:end] +volc = volcano[1:4:end, 1:4:end] coord = Coord.cartesian(xmin=1, xmax=22, ymin=1, ymax=16) p2 = plot(coord, z=volc, x=1.0:22, y=1.0:16, Geom.vectorfield(scale=0.05), Geom.contour(levels=7), @@ -495,3 +510,17 @@ Dsing = dataset("lattice","singer") Dsing[:Voice] = [x[1:5] for x in Dsing[:VoicePart]] plot(Dsing, x=:VoicePart, y=:Height, color=:Voice, Geom.violin) ``` + +```@example +using Gadfly, RDatasets +set_default_plot_size(14cm, 8cm) +tips = dataset("reshape2", "tips") +plot(tips, x=:Day, y=:TotalBill, color=:Sex, Geom.violin(scale=:count), Scale.x_discrete(order=[3,4,2,1])) +``` + +```@example +using Gadfly, RDatasets +set_default_plot_size(12cm, 16cm) +melanoma = dataset("mlmRev", "Mmmec") +plot(melanoma, y=:Nation, x=:Deaths, color=:Nation, Geom.violin(orientation=:horizontal, scale=:count)) +``` diff --git a/src/aesthetics.jl b/src/aesthetics.jl index 6aa67d9bc..e952a4753 100755 --- a/src/aesthetics.jl +++ b/src/aesthetics.jl @@ -413,3 +413,79 @@ function inherit!(a::Aesthetics, b::Aesthetics; end nothing end + +""" + groupby(aes, by, togroupvar) + +Given aesthetics to group with, `by`, and an aesthetic to group `togroupvar` +this function constructs a dictionary that maps each given combination of the +`by` aesthetics to the positions which they apply to. Thus the output is a +dictionary of tuples of each unique combination of `by` mapped to a boolean +array of length `n` where `n` is the length of the aesthetics (they have to all +have the same length). If the provided aesthetics are missing, a placeholder +`nothing` is return instead of the unique value. + +## Examples + +```jldoctest +using Gadfly +aes = Gadfly.Aesthetics() +aes.x = repeat([1, 2], inner=3) +aes.y = collect(1:6) + +Gadfly.groupby(aes, [:x, :color], :y) + +# output + +DataStructures.OrderedDict{Tuple{Int64,Void},Array{Bool,1}} with 2 entries: + (1, nothing) => Bool[true, true, true, false, false, false] + (2, nothing) => Bool[false, false, false, true, true, true] +``` + +```jldoctest +using Gadfly +aes = Gadfly.Aesthetics() +aes.x = repeat([:a, :b], inner=2) +aes.y = collect(1:4) +aes.color = repeat([colorant"red", colorant"blue"], inner=2) + +Gadfly.groupby(aes, [:x, :color], :y) + +# output + +DataStructures.OrderedDict{Tuple{Symbol,ColorTypes.RGB{FixedPointNumbers.Normed{UInt8,8}}},Array{Bool,1}} with 2 entries: + (:a, RGB{N0f8}(1.0,0.0,0.0)) => Bool[true, true, false, false] + (:b, RGB{N0f8}(0.0,0.0,1.0)) => Bool[false, false, true, true] +``` + +""" +function groupby(aes::Gadfly.Aesthetics, by::Vector{Symbol}, togroupvar::Symbol) + types = fill(Nothing, length(by)) + isconcrete = fill(false, length(by)) + for i in 1:length(by) + isconcrete[i] = getfield(aes, by[i]) != nothing + (!isconcrete[i]) && continue + types[i] = eltype(getfield(aes, by[i])) + @assert length(getfield(aes, togroupvar)) == length(getfield(aes, by[i])) "$togroupvar and $(by[i]) aesthetics must have same length" + end + + grouped = DataStructures.OrderedDict{Tuple{types...}, Vector{Bool}}() + + # gather options for each `by` aesthetic + opt = [if isconcrete[i] unique(getfield(aes, by[i])) else [nothing] end for i in 1:length(by)] + + # The approach is to identify positions were multiple by aesthetics overlap + # and thus grouping the data positions. We first assume that all positions + # belong to a combination of aesthetics and then whittle it down + for combo in IterTools.product(opt...) + belongs = fill(true, length(getfield(aes, togroupvar))) + for i in 1:length(combo) + (combo[i] == nothing) && continue + belongs .&= getfield(aes, by[i]) .== combo[i] + end + # for multiple by variables we need to check whether there is any overlap + # between this specific combo before adding it to the dict + (any(belongs)) && (grouped[combo] = belongs) + end + grouped +end diff --git a/src/geom/density.jl b/src/geom/density.jl new file mode 100644 index 000000000..c6136ab22 --- /dev/null +++ b/src/geom/density.jl @@ -0,0 +1,191 @@ +struct DensityGeometry <: Gadfly.GeometryElement + stat::Gadfly.StatisticElement + order::Int + tag::Symbol +end + +function DensityGeometry(; n=256, + bandwidth=-Inf, + adjust=1.0, + kernel=Normal, + trim=false, + scale=:area, + position=:dodge, + orientation=:horizontal, + order=1, + tag=empty_tag) + stat = Gadfly.Stat.DensityStatistic(n, bandwidth, adjust, kernel, trim, + scale, position, orientation, false) + DensityGeometry(stat, order, tag) +end + +DensityGeometry(stat; order=1, tag=empty_tag) = DensityGeometry(stat, order, tag) + +""" + Geom.density(; bandwidth, adjust, kernel, trim, scale, position, orientation, order) + +Draws a kernel density estimate. This is a cousin of [`Geom.histogram`](@ref) +that is especially useful when the datapoints originate from a underlying smooth +distribution. Unlike histograms, density estimates do not suffer from edge +effects from incorrect bin choices. Some caveats do apply: + +1) Plot components do not necessarily correspond to the raw datapoints, but + instead to the kernel density estimation of the underlying distribution +2) Density estimation improves as a function of the number of data points and + can be misleadingly smooth when the number of datapoints is small. +3) Results can be sensitive to the choise of `kernel` and `bandwidth` + +For horizontal histograms (default), `Geom.density` draws the kernel density +estimate of `x` optionally grouped by `color`. If the `orientation=:vertical` +flag is passed to the function, then densities will be computed along `y`. The +estimates are normalized by default to have areas equal to 1, but this can +changed by passing `scale=:count` to scale by the raw number of datapoints or +`scale=:peak` to scale by the max height of the estimate. Additionally, multiple +densities can be stacked using the `position=:stack` flag or the conditional +density estimate can be drawn using `position=:fill`. See +[`Stat.DensityStatistic`](@ref Gadfly.Stat.DensityStatistic) for details on +optional parameters that can control the `bandwidth`, `kernel`, etc used. + +External links + +* [Kernel Density Estimation on Wikipedia](https://en.wikipedia.org/wiki/Kernel_density_estimation) +""" +const density = DensityGeometry + +element_aesthetics(::DensityGeometry) = Symbol[] +default_statistic(geom::DensityGeometry) = Gadfly.Stat.DensityStatistic(geom.stat) + +function render(geom::DensityGeometry, theme::Gadfly.Theme, aes::Gadfly.Aesthetics) + Gadfly.assert_aesthetics_defined("Geom.density", aes, :x, :y) + Gadfly.assert_aesthetics_equal_length("Geom.density", aes, :x, :y) + + grouped_data = Gadfly.groupby(aes, [:color], :y) + densities = Array{NTuple{2, Float64}}[] + colors = [] + + for (keys, belongs) in grouped_data + xs = aes.x[belongs] + ys = aes.y[belongs] + + push!(densities, [(x, y) for (x, y) in zip(xs, ys)]) + push!(colors, keys[1] != nothing ? keys[1] : theme.default_color) + end + + ctx = context(order=geom.order) + # TODO: This should be user controllable + if geom.stat.position == :dodge + compose!(ctx, Compose.polygon(densities, geom.tag), stroke(colors), fill(nothing)) + else + compose!(ctx, Compose.polygon(densities, geom.tag), fill(colors)) + end + + compose!(ctx, svgclass("geometry")) +end + +struct ViolinGeometry <: Gadfly.GeometryElement + stat::Gadfly.StatisticElement + order::Int + tag::Symbol +end + +function ViolinGeometry(; n=256, + bandwidth=-Inf, + adjust=1.0, + kernel=Normal, + trim=true, + scale=:area, + orientation=:vertical, + order=1, + tag=empty_tag) + stat = Gadfly.Stat.DensityStatistic(n, bandwidth, adjust, kernel, trim, + scale, :dodge, orientation, true) + ViolinGeometry(stat, order, tag) +end + +""" + Geom.violin[(; bandwidth, adjust, kernel, trim, order)] + +Draws a violin plot which is a combination of [`Geom.density`](@ref) and +[`Geom.boxplot`](@ref). This plot type is useful for comparing differences in +the distribution of quantitative data between categories, especially when the +data is non-normally distributed. See [`Geom.density`](@ref) for some caveats. + +In the case of standard vertical violins, `Geom.violin` draws the density +estimate of `y` optionally grouped categorically by `x` and colored +with `color`. See [`Stat.DensityStatistic`](@ref Gadfly.Stat.DensityStatistic) +for details on optional parameters that can control the `bandwidth`, `kernel`, +etc used. + +```@example +using RDatasets, Gadfly + +df = dataset("ggplot2", "diamonds") + +p = plot(df, x=:Cut, y=:Carat, color=:Cut, Geom.violin()) +draw(SVG("diamonds_violin1.svg", 10cm, 8cm), p) # hide +nothing # hide +``` +![](diamonds_violin1.svg) +""" +const violin = ViolinGeometry + +element_aesthetics(::ViolinGeometry) = [] + +default_statistic(geom::ViolinGeometry) = Gadfly.Stat.DensityStatistic(geom.stat) + +function render(geom::ViolinGeometry, theme::Gadfly.Theme, aes::Gadfly.Aesthetics) + + Gadfly.assert_aesthetics_defined("Geom.violin", aes, :y, :width) + Gadfly.assert_aesthetics_equal_length("Geom.violin", aes, :y, :width) + + output_dims, groupon = Gadfly.Stat._find_output_dims(geom.stat) + grouped_data = Gadfly.groupby(aes, groupon, output_dims[2]) + violins = Array{NTuple{2, Float64}}[] + + (aes.color == nothing) && (aes.color = fill(theme.default_color, length(aes.x))) + colors = eltype(aes.color)[] + color_opts = unique(aes.color) + split = false + # TODO: Add support for dodging violins (i.e. having more than two colors + # per major category). Also splitting should not happen automatically, but + # as a optional keyword to Geom.violin + if length(keys(grouped_data)) > 2*length(unique(getfield(aes, output_dims[1]))) + error("Violin plots do not currently support having more than 2 colors per $(output_dims[1]) category") + elseif length(color_opts) == 2 + split = true + end + + for (keys, belongs) in grouped_data + x, color = keys + ys = getfield(aes, output_dims[2])[belongs] + ws = aes.width[belongs] + + if split + pos = findfirst(color_opts, color) + if pos == 1 + push!(violins, [(x - w/2, y) for (y, w) in zip(ys, ws)]) + else + push!(violins, reverse!([(x + w/2, y) for (y, w) in zip(ys, ws)])) + end + push!(colors, color) + else + push!(violins, vcat([(x - w/2, y) for (y, w) in zip(ys, ws)], + reverse!([(x + w/2, y) for (y, w) in zip(ys, ws)]))) + push!(colors, color != nothing ? color : theme.default_color) + end + end + + if geom.stat.orientation == :horizontal + for violin in violins + for i in 1:length(violin) + violin[i] = reverse(violin[i]) + end + end + end + + ctx = context(order=geom.order) + compose!(ctx, Compose.polygon(violins, geom.tag), fill(colors)) + + compose!(ctx, svgclass("geometry")) + +end diff --git a/src/geom/line.jl b/src/geom/line.jl index fe10e24fc..b2ac03aae 100644 --- a/src/geom/line.jl +++ b/src/geom/line.jl @@ -51,16 +51,6 @@ geometry is equivalent to [`Geom.line`](@ref) with `preserve_order=true`. """ path() = LineGeometry(preserve_order=true) -""" - Geom.density[(; bandwidth=-Inf)] - -Draw a line showing the density estimate of the `x` aesthetic. -This geometry is equivalent to [`Geom.line`](@ref) with -[`Stat.density`](@ref); see the latter for more information. -""" -density(; bandwidth::Real=-Inf) = - LineGeometry(Gadfly.Stat.density(bandwidth=bandwidth)) - """ Geom.density2d[(; bandwidth=(-Inf,-Inf), levels=15)] diff --git a/src/geom/violin.jl b/src/geom/violin.jl deleted file mode 100644 index 21539287b..000000000 --- a/src/geom/violin.jl +++ /dev/null @@ -1,47 +0,0 @@ -struct ViolinGeometry <: Gadfly.GeometryElement - order::Int - tag::Symbol -end -ViolinGeometry(; order=1, tag=empty_tag) = ViolinGeometry(order, tag) - -""" - Geom.violin[(; order=1)] - -Draw `y` versus `width`, optionally grouping categorically by `x` and coloring -with `color`. Alternatively, if `width` is not supplied, the data in `y` will -be transformed to a density estimate using [`Stat.violin`](@ref) -""" -const violin = ViolinGeometry - -element_aesthetics(::ViolinGeometry) = [:x, :y, :color] - -default_statistic(::ViolinGeometry) = Gadfly.Stat.violin() - -function render(geom::ViolinGeometry, theme::Gadfly.Theme, aes::Gadfly.Aesthetics) - # TODO: What should we do with the color aesthetic? - - Gadfly.assert_aesthetics_defined("Geom.violin", aes, :y, :width) - Gadfly.assert_aesthetics_equal_length("Geom.violin", aes, :y, :width) - - default_aes = Gadfly.Aesthetics() - default_aes.color = fill(theme.default_color, length(aes.y)) - aes = Gadfly.inherit(aes, default_aes) - - # Group y, width and color by x - ux = unique(aes.x) - grouped_color = Dict(x => first(aes.color[aes.x.==x]) for x in ux) - grouped_y = Dict(x => aes.y[aes.x.==x] for x in ux) - grouped_width = Dict(x => aes.width[aes.x.==x] for x in ux) - - kgy = keys(grouped_y) - violins = [vcat([(x - w/2, y) for (y, w) in zip(grouped_y[x], grouped_width[x])], - reverse!([(x + w/2, y) for (y, w) in zip(grouped_y[x], grouped_width[x])])) - for x in kgy] - colors = [grouped_color[x] for x in kgy] - - ctx = context(order=geom.order) - compose!(ctx, Compose.polygon(violins, geom.tag), fill(colors)) - - compose!(ctx, svgclass("geometry")) - -end diff --git a/src/geometry.jl b/src/geometry.jl index a814e6c23..fe953534e 100644 --- a/src/geometry.jl +++ b/src/geometry.jl @@ -62,7 +62,7 @@ include("geom/point.jl") include("geom/rectbin.jl") include("geom/subplot.jl") include("geom/ribbon.jl") -include("geom/violin.jl") +include("geom/density.jl") include("geom/polygon.jl") include("geom/beeswarm.jl") include("geom/segment.jl") diff --git a/src/statistics.jl b/src/statistics.jl index 2be810105..7366a9062 100644 --- a/src/statistics.jl +++ b/src/statistics.jl @@ -500,67 +500,220 @@ function apply_statistic(stat::Density2DStatistic, apply_statistic(ContourStatistic(levels=stat.levels), scales, coord, aes) end - +""" + A general statistic for density plots (e.g. KDE plots and violin plots). +See [`Geom.density`](@ref Gadfly.Geom.density) or [`Geom.violin`](@ref +Gadfly.Geom.violin) for more details. +""" struct DensityStatistic <: Gadfly.StatisticElement - # Number of points sampled + """ + Number of points sampled for estimate. Powers of two yields better + performance. + """ n::Int - # Bandwidth used for the kernel density estimation + + """ + Smoothing bandwidth used for the kernel density estimation. This + corresponds to the standard deviation of the `kernel`. + """ bw::Real + + """ + Multiplicative adjustment of the computed optimal bandwidth. This is a + relative adjustment, see `bw` to enforce a specific numerical bandwidth. + """ + adjust::Float64 + + """ + Kernel used for density estimation, see `KernelDensity.jl` for more details. + Default is the Normal Distribution. + """ + kernel + + """ + This parameter only applies in the context of multiple densities. If set to + `false` (the default), the densities are computed over the full range of + data. If `true`, then each density's range will be computed only over the + range of data belonging to that group. This option is incompatible with + stacked densities since the ranges might not line up any more. + """ + trim::Bool + + """ + Method for scaling across multiple estimates. If `:area` (default), all + density estimates will have the same area under the curve (prior to trimming + ). If `:count`, the areas are scaled proportionally to the total number of + observations for each density estimate. If `:peak`, then all densities will + have the same maximum peak height. + """ + scale::Symbol + + """ + Control handling of multiple overlapping densities. The `:dodge` option + (default) just overlays each density such that they are in front of each + other. The `:stack` option places the densities a top of each other. The + `:fill` option is similar to `:stack`, but the stacks are all normalized to + a constant height of 1.0. This last option is useful for generating + conditional density estimates. + """ + position::Symbol + + """ + Whether the plot is `:horizontal` or `:vertical` + """ + orientation::Symbol + + """ + Internal flag that is `true` if this density statistic is a violin plot + """ + isviolin::Bool end -DensityStatistic(; n=256, bandwidth=-Inf) = DensityStatistic(n, bandwidth) -input_aesthetics(stat::DensityStatistic) = [:x, :color] -output_aesthetics(stat::DensityStatistic) = [:x, :y, :color] -default_scales(::DensityStatistic) = [Gadfly.Scale.y_continuous()] +function input_aesthetics(stat::DensityStatistic) + if stat.isviolin + return [:x, :y, :color] + elseif stat.orientation == :horizontal + return [:x, :color] + else + return [:y, :color] + end +end -""" - Stat.density[(; n=256, bandwidth=-Inf)] +output_aesthetics(stat::DensityStatistic) = [:x, :y, :color] -Estimate the density of `x` at `n` points, and put the result in `x` and `y`. -Smoothing is controlled by `bandwidth`. Used by [`Geom.density`](@ref Gadfly.Geom.density). -""" -const density = DensityStatistic +function _find_output_dims(stat::DensityStatistic) + output_dims = Union{Symbol, Nothing}[:x, :y] + (stat.orientation == :vertical) && reverse!(output_dims) + + groupon = [:color] + if stat.isviolin + reverse!(output_dims) + # For violin plots we need an additional dimension for the density data + # so we add an additional dimension on the end + push!(output_dims, :width) + insert!(groupon, 1, output_dims[1]) + else + # For simple density plots there are no categories so we'll insert a + # placeholder value into the first dimension + insert!(output_dims, 1, nothing) + end + output_dims, groupon +end function apply_statistic(stat::DensityStatistic, scales::Dict{Symbol, Gadfly.ScaleElement}, coord::Gadfly.CoordinateElement, aes::Gadfly.Aesthetics) - Gadfly.assert_aesthetics_defined("DensityStatistic", aes, :x) - if aes.color === nothing - isa(aes.x[1], Real) || error("Kernel density estimation only works on Real types.") + # For all density/violin plots we're computing a new dimension, the density + # dimension. We're also overwriting the other dimensions and part of the + # trickiness is tracking which dimension refers to what. + # Three output dimensions: (1) grouping (2) evaluation points (i.e where + # we're evaluating the KDE) (3) density values + output_dims, groupon = _find_output_dims(stat) + + if stat.isviolin + xcat, ycat = Scale.iscategorical(scales, :x), Scale.iscategorical(scales, :y) + if xcat && ycat + error("Either the x or y aesthetics must be Real for kernel density estimation") + elseif xcat && stat.orientation == :horizontal + error("Horizontal violins require a continuous x axis for kernel density estimation") + elseif ycat && stat.orientation == :vertical + error("Vertical violins require a continuous y axis for kernel density estimation") + elseif !xcat && !ycat # neither x or y is categorical so we'll assume x is meant to be categorical, see #968 + new_scale = Scale.x_discrete(order=sortperm(unique(aes.x))) + Scale.apply_scale(new_scale, [aes], Gadfly.Data(x=aes.x)) + scales[:x] = new_scale + warn( + """ + Both x and y aesthetics are continuous, violin plots require a + categorical variable. Transforming x to be categorical. + """) + end + if getfield(aes, output_dims[1]) == nothing + setfield!(aes, output_dims[1], fill(1.0, length(getfield(aes, output_dims[2])))) + end + elseif getfield(aes, output_dims[2]) == nothing + error("The $(output_dims[2]) aesthetic is required for $(stat.orientation) density plots") + end + + grouped_data = Gadfly.groupby(aes, groupon, output_dims[2]) + + + n_pts = stat.position == :fill ? stat.n : stat.n + 2 + n_groups = length(grouped_data) + + groups = Array{Float64}(n_groups) + eval_points = fill(0.0, n_groups, n_pts) + densities = fill(0.0, n_groups, n_pts) + colors = Array{eltype(aes.color)}(n_groups) + + # if the densities are stacked then we'll need to clamp them so that they + # share the same evaluation points (e.g. x values) + boundary = extrema(getfield(aes, output_dims[2])) + + for (idx, (keys, belongs)) in enumerate(grouped_data) + input = getfield(aes, output_dims[2])[belongs] + window = stat.bw <= 0.0 ? KernelDensity.default_bandwidth(input)*stat.adjust : stat.bw + (stat.trim) && (boundary = extrema(input)) + kde_est = KernelDensity.kde(input, kernel=stat.kernel, + boundary=boundary, + npoints=stat.n, + bandwidth=window) + evalpts = kde_est.x + density = kde_est.density + # only store category information if this is a violin plot and we need it + if stat.isviolin + groups[idx] = keys[1] + colors[idx] = keys[2] + elseif length(keys) == 1 + colors[idx] = keys[1] + else + error("Density plots do not support grouping by more than two dimensions.") + end + # scale density output depending on `scale` flag + scaled_density = stat.position == :fill ? density : vcat(0.0, density, 0.0) + if stat.scale == :count + scaled_density .*= sum(input) + elseif stat.scale == :peak + scaled_density ./= maximum(density) + end - x_f64 = collect(Float64, aes.x) + eval_points[idx, :] = stat.position == :fill ? evalpts : vcat(boundary[1], evalpts, boundary[2]) + densities[idx, :] = scaled_density + end - window = stat.bw <= 0.0 ? KernelDensity.default_bandwidth(x_f64) : stat.bw - f = KernelDensity.kde(x_f64, bandwidth=window, npoints=stat.n) - aes.x = collect(Float64, f.x) - aes.y = f.density - else - groups = Dict() - for (x, c) in zip(aes.x, cycle(aes.color)) - if !haskey(groups, c) - groups[c] = Float64[x] - else - push!(groups[c], x) - end + if stat.position == :dodge + # if this is a violin plot, make sure to set the grouping + (stat.isviolin) && setfield!(aes, output_dims[1], repeat(groups, outer=n_pts)) + setfield!(aes, output_dims[2], vec(eval_points)) + setfield!(aes, output_dims[3], vec(densities)) + (aes.color != nothing) && (aes.color = repeat(colors, outer=n_pts)) + elseif stat.position == :stack || stat.position == :fill + if stat.position == :fill + densities ./= sum(densities, 1) end - - colors = Array{RGB{Float32}}(0) - aes.x = Array{Float64}(0) - aes.y = Array{Float64}(0) - for (c, xs) in groups - window = stat.bw <= 0.0 ? KernelDensity.default_bandwidth(xs) : stat.bw - f = KernelDensity.kde(xs, bandwidth=window, npoints=stat.n) - append!(aes.x, f.x) - append!(aes.y, f.density) - for _ in 1:length(f.x) - push!(colors, c) + stacked_densities = hcat(copy(densities), fill(0.0, size(densities)...)) + for i in 1:n_groups + for j in 1:i-1 + stacked_densities[i, 1:n_pts] .+= densities[j, :] + stacked_densities[i, n_pts+1:2*n_pts] .+= densities[j, end:-1:1] end end - aes.color = discretize_make_ia(colors) + setfield!(aes, output_dims[2], vec(hcat(eval_points, eval_points[:, end:-1:1]))) + setfield!(aes, output_dims[3], vec(stacked_densities)) + (aes.color != nothing) && (aes.color = repeat(colors, outer=n_pts*2)) + end + + if stat.isviolin + pad = 0.1 + maxwidth = maximum(aes.width) + broadcast!(*, aes.width, aes.width, 1 - pad) + broadcast!(/, aes.width, aes.width, maxwidth) + else + scales[:y] = Scale.y_continuous() + aes.y_label = Gadfly.Scale.identity_formatter end - aes.y_label = Gadfly.Scale.identity_formatter end @@ -758,7 +911,7 @@ data with `coverage_weight`; and of having a nice numbering with granularity_weight=1/4, simplicity_weight=1/6, coverage_weight=1/3, - niceness_weight=1/4) = + niceness_weight=1/4) = TickStatistic("x", granularity_weight, simplicity_weight, coverage_weight, niceness_weight, ticks) @@ -1621,65 +1774,6 @@ function apply_statistic(stat::QQStatistic, end end - -struct ViolinStatistic <: Gadfly.StatisticElement - n::Int # Number of points sampled -end -ViolinStatistic() = ViolinStatistic(300) - -input_aesthetics(::ViolinStatistic) = [:x, :y, :color] -output_aesthetics(::ViolinStatistic) = [:x, :y, :width, :color] -default_scales(stat::ViolinStatistic) = [Gadfly.Scale.x_discrete(), Gadfly.Scale.y_continuous()] - -### very similar to Stat.density; Geom.violin could be refactored to us it instead -""" - Stat.violin[(n=300)] - -Transform $(aes2str(input_aesthetics(violin()))). -""" -const violin = ViolinStatistic - -function apply_statistic(stat::ViolinStatistic, - scales::Dict{Symbol, Gadfly.ScaleElement}, - coord::Gadfly.CoordinateElement, - aes::Gadfly.Aesthetics) - - isa(aes.y[1], Real) || error("Kernel density estimation only works on Real types.") - - grouped_y = Dict(1=>aes.y) - grouped_color = Dict{Int, Gadfly.ColorOrNothing}(1=>nothing) - ux = unique(aes.x) - uxflag = length(ux) < length(aes.x) - colorflag = aes.color != nothing - - uxflag && (grouped_y = Dict(x=>aes.y[aes.x.==x] for x in ux)) - - grouped_color = (colorflag ? Dict(x=>first(aes.color[aes.x.==x]) for x in ux) : - uxflag && Dict(x=>nothing for x in ux) ) - - aes.x = Array{Float64}(0) - aes.y = Array{Float64}(0) - aes.width = Array{Float64}(0) - colors = eltype(aes.color)[] - - for (x, ys) in grouped_y - window = stat.n > 1 ? KernelDensity.default_bandwidth(ys) : 0.1 - f = KernelDensity.kde(ys, bandwidth=window, npoints=stat.n) - append!(aes.x, fill(x, length(f.x))) - append!(aes.y, f.x) - append!(aes.width, f.density) - append!(colors, fill(grouped_color[x], length(f.x))) - end - - colorflag && (aes.color = colors) - - pad = 0.1 - maxwidth = maximum(aes.width) - broadcast!(*, aes.width, aes.width, 1 - pad) - broadcast!(/, aes.width, aes.width, maxwidth) -end - - struct JitterStatistic <: Gadfly.StatisticElement vars::Vector{Symbol} range::Float64