library(designit)
library(ggplot2)
library(dplyr)
library(tidyr)
Example 1: One 4x4 plate with 16 samples from 4 groups (a latin square arrangement should give the best score).
We first use a combination of two OSAT scores (one for rows, one for columns). This usually produces a latin square when the squared L2 norm (L2s) is used to aggregate the two scores.
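For intuition, the squared L2 norm aggregation simply sums the squared individual scores. Here is a minimal sketch of that idea (the l2s helper below is purely illustrative; the optimization calls in this vignette use the package's L2s_norm):
# Illustrative squared-L2 aggregation of a two-dimensional score vector
l2s <- function(scores, ...) sum(scores^2)
l2s(c(48, 0)) # 2304, matching the first "Aggregated" value reported in the trace below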
# Setting up the batch container
example1 <- BatchContainer$new(
dimensions = c(
plate = 1,
row = 4, col = 4
)
)
# Add samples to container
# Need unique Sample ID. Can we drop this constraint?
example1 <- assign_in_order(example1,
samples = tibble::tibble(
Group = rep(c("Grp 1", "Grp 2", "Grp 3", "Grp 4"), each = 4),
ID = 1:16
)
)
# The following does not work (it gives a constant score of 144!):
# scoring_f <- osat_score_generator(batch_vars = c("row","col"), feature_vars = c("Group"))
# A first analysis of the problem indicates that osat_score generates a full row*col vector of 'ideal scores'
# which are in fact all the same value, so every arrangement yields an identical overall result: each position
# holds either 1 sample or 0 samples, and the sum of the 1's is simply the sample count.
# --> don't use osat_score when the samples are sparse compared to the possible positions

# Set scoring function
scoring_f <- list(
Row.Score = osat_score_generator(batch_vars = c("row"), feature_vars = c("Group")),
Column.Score = osat_score_generator(batch_vars = c("col"), feature_vars = c("Group"))
)
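To see the degenerate behaviour described in the comments above, one could score the container with the combined row/col generator. This is a hypothetical check, not part of the original run:
# With batch_vars = c("row", "col"), every plate position forms its own 'batch'
# holding 0 or 1 samples, so the OSAT score is the same constant (144, see above)
# for any arrangement and cannot guide the optimization.
combined_f <- list(
  RowCol.Score = osat_score_generator(batch_vars = c("row", "col"), feature_vars = c("Group"))
)
example1$score(combined_f)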
set.seed(41)
bc <- optimize_design(
example1,
scoring = scoring_f,
max_iter = 300, # set to shorten vignette run-time for this known random seed; normally a suitable value is not known in advance
n_shuffle = 2,
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 1000, alpha = 0.5)),
aggregate_scores_func = L2s_norm
)
#> Checking variances of 2-dim. score vector.
#> ... (11.385, 16.687) - OK
#> Initial score: c(48, 0)
#> Aggregated: 2304
#> Achieved score: c(36, 4) at iteration 1
#> Aggregated: 1312
#> Achieved score: c(24, 4) at iteration 2
#> Aggregated: 592
#> Achieved score: c(24, 12) at iteration 3
#> Aggregated: 720
#> Achieved score: c(24, 8) at iteration 4
#> Aggregated: 640
#> Achieved score: c(16, 10) at iteration 5
#> Aggregated: 356
#> Achieved score: c(12, 10) at iteration 6
#> Aggregated: 244
#> Achieved score: c(10, 10) at iteration 7
#> Aggregated: 200
#> Achieved score: c(10, 6) at iteration 9
#> Aggregated: 136
#> Achieved score: c(10, 4) at iteration 10
#> Aggregated: 116
#> Achieved score: c(8, 10) at iteration 11
#> Aggregated: 164
#> Achieved score: c(8, 8) at iteration 12
#> Aggregated: 128
#> Achieved score: c(8, 8) at iteration 14
#> Aggregated: 128
#> Achieved score: c(4, 10) at iteration 19
#> Aggregated: 116
#> Achieved score: c(4, 6) at iteration 20
#> Aggregated: 52
#> Achieved score: c(4, 4) at iteration 22
#> Aggregated: 32
#> Achieved score: c(4, 4) at iteration 27
#> Aggregated: 32
#> Achieved score: c(4, 4) at iteration 36
#> Aggregated: 32
#> Achieved score: c(4, 4) at iteration 38
#> Aggregated: 32
#> Achieved score: c(0, 4) at iteration 39
#> Aggregated: 16
#> Achieved score: c(0, 4) at iteration 50
#> Aggregated: 16
#> Achieved score: c(0, 4) at iteration 54
#> Aggregated: 16
#> Achieved score: c(0, 4) at iteration 82
#> Aggregated: 16
#> Achieved score: c(0, 4) at iteration 87
#> Aggregated: 16
#> Achieved score: c(0, 4) at iteration 119
#> Aggregated: 16
#> Achieved score: c(0, 4) at iteration 125
#> Aggregated: 16
#> Achieved score: c(0, 4) at iteration 200
#> Aggregated: 16
#> Achieved score: c(4, 0) at iteration 215
#> Aggregated: 16
#> Achieved score: c(0, 0) at iteration 284
#> Aggregated: 0
bc$trace$elapsed
#> Time difference of 4.846822 secs
plot_plate(bc,
plate = plate, row = row, column = col, .color = Group,
title = "Ex1: Using OSAT scores for plate design\n(not the recommended way!)"
)
Now we use a dedicated scoring function for the group distances on the plate.
This should reliably lead to a latin square design with its nice symmetry, and there is only a one-dimensional score to monitor.
scoring_f <- mk_plate_scoring_functions(bc, row = "row", column = "col", group = "Group")
set.seed(42)
bc <- optimize_design(
example1,
scoring = scoring_f,
max_iter = 1000, # set to shorten vignette run-time for this known random seed; normally a suitable value is not known in advance
n_shuffle = 2,
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 1000, alpha = 0.5)),
quiet = TRUE
)
bc$trace$elapsed
#> Time difference of 4.184167 secs
plot_plate(bc,
plate = plate, row = row, column = col, .color = Group,
title = "Ex1: Using a dedicated plate scoring function:\nThis should show a latin square!"
)
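The latin square property can also be checked directly from the final assignment. This is an illustrative check, assuming the container's get_samples() accessor:
# Each group should occur exactly once per row and once per column
bc$get_samples() |>
  dplyr::count(row, Group) |>
  dplyr::summarise(each_group_once_per_row = all(n == 1))
bc$get_samples() |>
  dplyr::count(col, Group) |>
  dplyr::summarise(each_group_once_per_col = all(n == 1))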
Example 2: Two 4x4 plates with 32 samples from 4 groups (a latin square on each plate should give the best score).
We set up, in one go, two plate scoring functions (each acting locally on one specific plate) plus one OSAT score to guarantee a uniform distribution of the groups across the plates.
The initial sample allocation (by assign_in_order) is a poor starting point, since each plate holds only 2 of the 4 groups.
This is not a problem as long as the initial permutations are likely to remedy the situation; that is why we enforce 10 pairwise sample swaps during the first iterations.
# Setting up the batch container
example2 <- BatchContainer$new(
dimensions = c(
plate = 2,
row = 4, col = 4
)
)
# Add samples to container
example2 <- assign_in_order(example2, samples = tibble::tibble(
Group = c(rep(c("Grp 1", "Grp 2", "Grp 3", "Grp 4"), each = 8)),
ID = 1:32
))
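The poor starting point mentioned above can be verified directly from the container. This is an illustrative check, assuming the get_samples() accessor:
# Group counts per plate for the in-order assignment:
# each plate receives only 2 of the 4 groups
example2$get_samples() |>
  dplyr::count(plate, Group)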
scoring_f <- c(
  mk_plate_scoring_functions(example2, plate = "plate", row = "row", column = "col", group = "Group"),
  osat_plate = osat_score_generator(batch_vars = c("plate"), feature_vars = c("Group"))
)
plot_plate(example2,
plate = plate, row = row, column = col, .color = Group,
title = "Ex2: Initial sample arrangement"
)
example2$score(scoring_f)
#> Plate 1 Plate 2 osat_plate
#> 23.89265 23.89265 128.00000
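Before starting the run it can help to look at the shuffling protocol passed as n_shuffle below. A small illustrative sketch; the remark on the iteration count is an assumption based on the fact that no max_iter is given in the call:
# 10 swaps for the first 10 iterations to break up the bad initial allocation,
# then progressively fewer swaps for fine-tuning; the protocol length (1600)
# presumably also determines how many iterations are attempted.
protocol <- c(rep(10, 10), rep(3, 90), rep(2, 100), rep(1, 1400))
length(protocol)
table(protocol)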
set.seed(41)
bc <- optimize_design(
example2,
scoring = scoring_f,
n_shuffle = c(rep(10, 10), rep(3, 90), rep(2, 100), rep(1, 1400)),
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 10000, alpha = 0.5)),
aggregate_scores_func = worst_score,
quiet = TRUE
)
bc$trace$elapsed
#> Time difference of 13.84524 secs
bc$score(scoring_f)
#> Plate 1 Plate 2 osat_plate
#> 6.127258 6.094080 0.000000
plot_plate(bc,
plate = plate, row = row, column = col, .color = Group,
title = "Ex2: Design created by swapping samples 'globally' across the plates"
)
While this ‘global’ optimization is possible, it probably does not converge to an (almost) ideal solution in acceptable time once more samples are involved, because a lot of unproductive sample swapping happens across the plates.
One way to address this is to split the optimization into two cycles: first assign samples to plates (balancing the groups), then improve the positions of the samples within each plate. This motivates a dedicated sample permutation function that takes the plate structure into account and only shuffles samples within one plate.
scoring_f <- osat_score_generator(batch_vars = c("plate"), feature_vars = c("Group"))
set.seed(42)
bc <- optimize_design(
example2,
scoring = scoring_f,
quiet = TRUE,
max_iter = 200, # set to shorten vignette run-time; normally a suitable value is not known in advance
n_shuffle = 2,
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 10000, alpha = 0.5)),
)
bc$trace$elapsed
#> Time difference of 1.634799 secs
plot_plate(bc,
plate = plate, row = row, column = col, .color = Group,
title = "Ex2: 'Plate wise' design\nStep 1: after allocating samples to plates"
)
scoring_f <- mk_plate_scoring_functions(bc, plate = "plate", row = "row", column = "col", group = "Group")
bc$score(scoring_f)
#> Plate 1 Plate 2
#> 12.77527 13.63704
set.seed(42)
bc <- optimize_design(
bc,
scoring = scoring_f,
max_iter = 400,
shuffle_proposal_func = mk_subgroup_shuffling_function(subgroup_vars = c("plate")),
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 1000, alpha = 0.5)),
aggregate_scores_func = L2s_norm,
quiet = TRUE
)
bc$trace$elapsed
#> Time differences in secs
#> [1] 1.634799 2.326842
bc$score(scoring_f)
#> Plate 1 Plate 2
#> 6.854748 6.309297
plot_plate(bc,
plate = plate, row = row, column = col, .color = Group,
title = "Ex2: 'Plate wise' design\nStep 2: after arranging samples within plates"
)
In this case, the shuffling function exchanges one pair of sample assignments per iteration (the default). However, a constant number of swaps or a full swapping protocol (formally a vector of integers) can be supplied as well.
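For illustration, both variants could be requested via the n_swaps argument of mk_subgroup_shuffling_function (example 3 below uses the protocol form); a hedged sketch:
# A constant number of within-plate swaps per iteration ...
shuffle_const <- mk_subgroup_shuffling_function(subgroup_vars = c("plate"), n_swaps = 2)
# ... or a full swapping protocol, i.e. a vector of swap counts over the iterations
shuffle_protocol <- mk_subgroup_shuffling_function(
  subgroup_vars = c("plate"),
  n_swaps = c(rep(3, 100), rep(1, 300))
)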
Now for the most efficient solution: we again start by assigning samples to plates (balancing the groups) and then exploit the independence of the two within-plate optimizations by improving the plates one after the other.
This is possible by passing the restrain_on_subgroup_levels argument to the function that generates the permutations. It restricts the permutations to happen first within plate 1 only, then within plate 2, so that the two scores can be optimized in consecutive runs.
scoring_f <- osat_score_generator(batch_vars = c("plate"), feature_vars = c("Group"))
set.seed(42)
bc <- optimize_design(
example2,
scoring = scoring_f,
quiet = TRUE,
max_iter = 150, # set to shorten vignette run-time; normally a suitable value is not known in advance
n_shuffle = 2,
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 10000, alpha = 0.5)),
)
bc$trace$elapsed
#> Time difference of 1.372123 secs
plot_plate(bc,
plate = plate, row = row, column = col, .color = Group,
title = "Ex2: 'Serial plate' design\nStep 1: after allocating samples to plates"
)
scoring_f <- mk_plate_scoring_functions(bc, plate = "plate", row = "row", column = "col", group = "Group")
bc$score(scoring_f)
#> Plate 1 Plate 2
#> 10.57482 26.16613
set.seed(42)
bc <- optimize_design(
bc,
scoring = scoring_f,
max_iter = 150,
quiet = TRUE,
shuffle_proposal_func = mk_subgroup_shuffling_function(subgroup_vars = c("plate"), restrain_on_subgroup_levels = c(1)),
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 1000, alpha = 0.5)),
aggregate_scores_func = L2s_norm
)
bc$trace$elapsed
#> Time differences in secs
#> [1] 1.372123 1.127218
bc$score(scoring_f)
#> Plate 1 Plate 2
#> 6.416193 26.166134
set.seed(42)
bc <- optimize_design(
bc,
scoring = scoring_f,
max_iter = 550,
quiet = TRUE,
shuffle_proposal_func = mk_subgroup_shuffling_function(subgroup_vars = c("plate"), restrain_on_subgroup_levels = c(2)),
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 1000, alpha = 0.5)),
aggregate_scores_func = L2s_norm
)
bc$trace$elapsed
#> Time differences in secs
#> [1] 1.372123 1.127218 3.161646
bc$score(scoring_f)
#> Plate 1 Plate 2
#> 6.416193 6.581966
plot_plate(bc,
plate = plate, row = row, column = col, .color = Group,
title = "Ex2: 'Serial plate' design\nStep 2: after optimizing each plate in turn"
)
Example 3: Dealing with plates of different size.
We simulate one ordinary 96-well plate and two smaller ones of size 6x8 and 4x6, respectively. There are 3 experimental groups with sample sizes of 69, 30 and 69, respectively. This example demonstrates that an empirically determined normalization of the scores yields 3 comparable numerical values, independent of the different plate and group sizes.
Again, a first optimization aims to achieve the same group allocation on each plate, while a second run takes care of the sample distribution within each plate.
We use assign_random in this example to start from a more balanced initial assignment than in example 2.
Scores are aggregated with the L2s method (squared L2 norm). Because the scores have a comparable numerical range, the worst_score method could also be used for aggregation. However, the L2s method always takes all individual scores into account, which provides more stability in case the 3 plate scores are not exactly normalized.
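For intuition, here is how the two aggregation choices react to a hypothetical 3-plate score vector. This assumes that worst_score corresponds to taking the maximum (i.e. worst) score, while L2s sums the squares:
s <- c(9.7, 9.6, 10.4) # hypothetical plate scores
max(s)                 # 'worst score' style aggregation: only the worst plate matters
sum(s^2)               # squared L2 norm: every plate contributes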
# Setting up the batch container
example3 <- BatchContainer$new(
dimensions = c(
plate = 3,
row = 8, col = 12
),
exclude = dplyr::bind_rows(
tidyr::crossing(plate = 2, row = 1:8, col = 1:12) |> dplyr::filter(row > 6 | col > 8),
tidyr::crossing(plate = 3, row = 1:8, col = 1:12) |> dplyr::filter(row > 4 | col > 6)
)
)
# Assign samples randomly to start from a better initial state
example3 <- assign_random(example3,
samples = tibble::tibble(
Group = rep.int(c("Grp 1", "Grp 2", "Grp3"),
times = c(69, 30, 69)
),
ID = 1:168
)
)
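A quick sanity check on the geometry (a small illustrative computation): the exclusions leave an 8x12, a 6x8 and a 4x6 plate, which together provide exactly as many wells as there are samples.
c(8 * 12, 6 * 8, 4 * 6)                   # 96, 48 and 24 usable wells per plate
sum(8 * 12, 6 * 8, 4 * 6) == 69 + 30 + 69 # 168 wells for 168 samples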
scoring_f <- osat_score_generator(batch_vars = c("plate"), feature_vars = c("Group"))
set.seed(42)
bc <- optimize_design(
example3,
scoring = scoring_f,
quiet = TRUE,
max_iter = 150,
n_shuffle = 2,
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 1000, alpha = 0.5)),
)
bc$trace$elapsed
#> Time difference of 1.534469 secs
plot_plate(bc,
plate = plate, row = row, column = col, .color = Group,
title = "Ex3: Dealing with plates of different size\nStep 1: after distributing groups across plates"
)
scoring_f <- mk_plate_scoring_functions(bc,
plate = "plate", row = "row",
column = "col", group = "Group"
)
bc$score(scoring_f)
#> Plate 1 Plate 2 Plate 3
#> 9.706637 9.585770 10.419567
set.seed(42)
bc <- optimize_design(
bc,
scoring = scoring_f,
max_iter = 300,
shuffle_proposal_func = mk_subgroup_shuffling_function(
subgroup_vars = c("plate"),
n_swaps = c(rep(5, 500), rep(3, 1500), rep(2, 3000), rep(1, 5000))
),
# acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 10000, alpha = 1)),
aggregate_scores_func = L2s_norm,
quiet = TRUE
)
bc$trace$elapsed
#> Time differences in secs
#> [1] 1.534469 3.973510
bc$score(scoring_f)
#> Plate 1 Plate 2 Plate 3
#> 8.974408 8.253074 7.980756
plot_plate(bc,
plate = plate, row = row, column = col, .color = Group,
title = "Ex3: Dealing with plates of different size\nStep 2: after swapping samples within plates"
)
Example 4: Two factors, Treatment and (animal) Sex, to distribute across one plate.
To indicate that balancing the treatment is considered more important than balancing the sex of the animals, we supply a custom aggregation function that gives more weight to the treatment variable. (A better aggregation mechanism has yet to be implemented.)
There can be fewer samples than available positions on the plate(s). In this case, we simulate 20 animal-derived samples distributed on a plate with 24 locations.
# Setting up the batch container
example4 <- BatchContainer$new(
dimensions = c(
plate = 1, row = 6, col = 4
)
)
# Add the samples to the container in order; with 20 samples for 24 positions, some wells remain empty
example4 <- assign_in_order(example4, samples = tibble::tibble(
Group = rep.int(c("Treatment 1", "Treatment 2"), times = c(10, 10)),
Sex = c(rep(c("M", "F", "F", "M"), times = 4), "M", NA, NA, "F"), ID = 1:20
))
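For orientation, a cross-tabulation of the two factors to be balanced (restating the sample table here for a self-contained check; note the two samples with missing Sex and that only 20 of the 24 positions will be occupied):
tibble::tibble(
  Group = rep.int(c("Treatment 1", "Treatment 2"), times = c(10, 10)),
  Sex = c(rep(c("M", "F", "F", "M"), times = 4), "M", NA, NA, "F")
) |>
  dplyr::count(Group, Sex)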
cowplot::plot_grid(
plot_plate(example4, plate = plate, row = row, column = col, .color = Group, title = "Initial layout by Group"),
plot_plate(example4, plate = plate, row = row, column = col, .color = Sex, title = "Initial layout by Sex"),
ncol = 2
)
scoring_f <- c(
Group = mk_plate_scoring_functions(example4, row = "row", column = "col", group = "Group"),
Sex = mk_plate_scoring_functions(example4, row = "row", column = "col", group = "Sex")
)
example4$score(scoring_f)
#> Group.Plate Sex.Plate
#> 83.63858 239.20748
set.seed(42)
bc <- optimize_design(
example4,
scoring = scoring_f,
max_iter = 750,
n_shuffle = 1,
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 10000, alpha = 1)),
aggregate_scores_func = function(scores, ...) {
2 * scores["Group.Plate"] + scores["Sex.Plate"]
},
quiet = TRUE
)
bc$trace$elapsed
#> Time difference of 3.95466 secs
bc$score(scoring_f)
#> Group.Plate Sex.Plate
#> 8.019656 7.608810
cowplot::plot_grid(
plot_plate(bc, plate = plate, row = row, column = col, .color = Group, title = "Final layout by Group"),
plot_plate(bc, plate = plate, row = row, column = col, .color = Sex, title = "Final layout by Sex"),
ncol = 2
)
We repeat the same example with auto-scaling, weighted scoring and simulated annealing, to have a reference.
set.seed(42)
bc <- optimize_design(
bc,
scoring = scoring_f,
max_iter = 500,
n_shuffle = 1,
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 10000, alpha = 1)),
aggregate_scores_func = function(scores, ...) {
purrr::set_names(2 * scores["Group.Plate"] + scores["Sex.Plate"], nm = "Weighted.Score")
},
autoscale_scores = TRUE,
quiet = TRUE
)
#> ... Performing boxcox lambda estimation.
bc$score(scoring_f)
#> Group.Plate Sex.Plate
#> 7.467566 7.753204
cowplot::plot_grid(
plot_plate(bc, plate = plate, row = row, column = col, .color = Group, title = "Final layout by Group"),
plot_plate(bc, plate = plate, row = row, column = col, .color = Sex, title = "Final layout by Sex"),
ncol = 2
)
We now run the same example with auto-scaling and position-dependent scoring, without aggregating the score vector. This is more effective even when using the default acceptance function. In addition, we strictly prioritize the leftmost score to reflect its relevance for the design.
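Conceptually, the leftmost-improvement rule is a lexicographic comparison of score vectors. Below is a minimal illustrative helper, not the package's accept_leftmost_improvement implementation:
# Accept if the first score that differs (beyond a small tolerance) has improved
lex_improves <- function(new, old, eps = 1e-12) {
  d <- new - old
  i <- which(abs(d) > eps)[1]
  !is.na(i) && d[i] < 0
}
lex_improves(c(7.5, 9.0), c(8.0, 7.0)) # TRUE: the first score improved, the rest is ignored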
scoring_f <- c(
Group = mk_plate_scoring_functions(example4, row = "row", column = "col", group = "Group"),
Sex = mk_plate_scoring_functions(example4, row = "row", column = "col", group = "Sex")
)
example4$score(scoring_f)
#> Group.Plate Sex.Plate
#> 83.63858 239.20748
set.seed(42)
bc <- optimize_design(
example4,
scoring = scoring_f,
max_iter = 5000,
n_shuffle = 1,
acceptance_func = accept_leftmost_improvement,
autoscale_scores = TRUE,
quiet = TRUE
)
#> ... Performing boxcox lambda estimation.
bc$score(scoring_f)
#> Group.Plate Sex.Plate
#> 7.619846 7.473524
cowplot::plot_grid(
plot_plate(bc, plate = plate, row = row, column = col, .color = Group, title = "Final layout by Group"),
plot_plate(bc, plate = plate, row = row, column = col, .color = Sex, title = "Final layout by Sex"),
ncol = 2
)
A tolerance value can be used to accept slightly worse solutions in the leftmost relevant score if they are overcompensated by improvements in the other scores:
scoring_f <- c(
Group = mk_plate_scoring_functions(example4, row = "row", column = "col", group = "Group"),
Sex = mk_plate_scoring_functions(example4, row = "row", column = "col", group = "Sex")
)
set.seed(42)
bc <- optimize_design(
example4,
scoring = scoring_f,
max_iter = 5000,
n_shuffle = 1,
acceptance_func = ~ accept_leftmost_improvement(..., tolerance = 0.1),
autoscale_scores = TRUE,
quiet = TRUE
)
#> ... Performing boxcox lambda estimation.
bc$score(scoring_f)
#> Group.Plate Sex.Plate
#> 7.474561 7.412716
cowplot::plot_grid(
plot_plate(bc, plate = plate, row = row, column = col, .color = Group, title = "Final layout by Group"),
plot_plate(bc, plate = plate, row = row, column = col, .color = Sex, title = "Final layout by Sex"),
ncol = 2
)
We test an alternative left-to-right weighting of scores, based on exponential down-weighting of the respective score differences at position \(p\) with factor \(\kappa^p\), \(0 < \kappa < 1\). We choose \(\kappa = 0.5\), i.e. the second score's improvement counts half as much as that of the first one.
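A sketch of the weighting idea only (the actual acceptance logic is created by mk_exponentially_weighted_acceptance_func); the weight indexing here is illustrative, what matters is that consecutive scores differ by the factor \(\kappa\):
kappa <- 0.5
delta <- c(-0.4, +0.3)                  # hypothetical: score 1 improves, score 2 worsens
weights <- kappa^(seq_along(delta) - 1) # 1.0, 0.5: the second change counts half
sum(weights * delta)                    # negative, i.e. an overall improvement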
scoring_f <- c(
Group = mk_plate_scoring_functions(example4, row = "row", column = "col", group = "Group"),
Sex = mk_plate_scoring_functions(example4, row = "row", column = "col", group = "Sex")
)
bc$score(scoring_f)
#> Group.Plate Sex.Plate
#> 7.474561 7.412716
set.seed(42)
bc <- optimize_design(
example4,
scoring = scoring_f,
max_iter = 1000,
n_shuffle = 1,
acceptance_func = mk_exponentially_weighted_acceptance_func(kappa = 0.5, simulated_annealing = TRUE),
autoscale_scores = TRUE,
quiet = TRUE
)
#> ... Performing boxcox lambda estimation.
bc$score(scoring_f)
#> Group.Plate Sex.Plate
#> 7.711676 7.499601
cowplot::plot_grid(
plot_plate(bc, plate = plate, row = row, column = col, .color = Group, title = "Final layout by Group"),
plot_plate(bc, plate = plate, row = row, column = col, .color = Sex, title = "Final layout by Sex"),
ncol = 2
)
Example 5: Penalizing linear arrangements (‘lines’) of samples from the same group.
In some cases it may be essential to avoid placing samples of the same group in the same row or column of a plate, i.e. these variables are regarded as factor levels of their own, in addition to the spatial relation of the samples.
Plate scoring functions can apply a specific penalty to such ‘linearly arranged’ samples on top of the chosen distance metric.
# Setting up the batch container
example5 <- BatchContainer$new(
dimensions = c(
plate = 1, row = 8, col = 12
)
)
# Assign samples randomly to start from a lower score (avoiding `Inf` values under the 'hard' line penalization)
example5 <- assign_random(example5, samples = tibble::tibble(
Group = rep.int(paste("Group", 1:5), times = c(8, 8, 8, 8, 64)),
ID = 1:96
))
penalize_lines <- "hard"
scoring_f <- c(
Group = mk_plate_scoring_functions(example5, row = "row", column = "col", group = "Group", p = 2, penalize_lines = penalize_lines)
)
example5$score(scoring_f)
#> Group.Plate
#> 10.78471
set.seed(42)
bc <- optimize_design(
example5,
scoring = scoring_f,
max_iter = 5000,
n_shuffle = 1,
acceptance_func = mk_simanneal_acceptance_func(mk_simanneal_temp_func(T0 = 500, alpha = 0.1)),
quiet = TRUE
)
bc$trace$elapsed
#> Time difference of 19.83435 secs
bc$score(scoring_f)
#> Group.Plate
#> 8.861759
plot_plate(bc, plate = plate, row = row, column = col, .color = Group, title = stringr::str_c("Line penalization: ", penalize_lines))