vignettes/BC_LTHC_additional_analysis.Rmd
BC_LTHC_additional_analysis.Rmd
The data is formatted as a list with 100 imputations of the missing symptom onset times. For each imputation, the list contains data on the number of cases, time series of cases (by symptom onset), the facility capacity and the outbreak reported date for each facility outbreak. It also contains the time series of cases as a matrix, but we will not use that here.
# View the final imputation of missing data:
str(BC_LTHC_outbreaks_100Imputs[[100]])
#> List of 6
#> $ Location : int [1:18] 1 2 3 4 5 6 7 8 9 10 ...
#> $ num_cases : int [1:18] 67 9 25 18 88 7 5 19 3 44 ...
#> $ time_series :List of 18
#> ..$ : num [1:40] 1 0 0 1 0 0 1 1 0 1 ...
#> ..$ : num [1:42] 1 0 0 0 2 0 0 1 0 0 ...
#> ..$ : num [1:47] 1 0 0 0 0 0 0 0 0 1 ...
#> ..$ : num [1:64] 1 0 0 0 0 0 0 0 0 0 ...
#> ..$ : num [1:49] 2 0 2 0 0 1 5 4 4 2 ...
#> ..$ : num [1:17] 2 0 0 0 3 1 0 0 0 0 ...
#> ..$ : num [1:15] 1 0 0 2 0 0 1 0 0 0 ...
#> ..$ : num [1:31] 1 0 1 1 1 0 0 2 0 0 ...
#> ..$ : num [1:5] 2 0 0 0 1
#> ..$ : num [1:50] 2 0 0 0 0 2 0 0 2 0 ...
#> ..$ : num [1:40] 1 2 2 1 1 3 2 2 0 1 ...
#> ..$ : num [1:14] 2 0 1 2 2 1 0 0 0 1 ...
#> ..$ : num [1:38] 1 0 0 4 1 1 1 4 1 3 ...
#> ..$ : num [1:25] 1 0 0 0 0 0 0 0 0 1 ...
#> ..$ : num [1:33] 1 0 0 1 0 0 0 0 2 2 ...
#> ..$ : num [1:17] 1 0 0 0 0 0 0 0 0 1 ...
#> ..$ : num [1:50] 2 0 0 0 0 0 0 0 0 2 ...
#> ..$ : num [1:19] 3 0 0 0 0 2 0 0 0 1 ...
#> $ capacity : num [1:18] 139 91 189 120 142 300 87 40 66 236 ...
#> $ reported_date: Date[1:18], format: "2020-04-28" "2020-04-25" ...
#> $ case_matrix : num [1:64, 1:18] 1 0 0 1 0 0 1 1 0 1 ...
plot_data <- tibble(location=BC_LTHC_outbreaks_100Imputs[[100]]$Location, capacity=BC_LTHC_outbreaks_100Imputs[[100]]$capacity, outbreak_size=BC_LTHC_outbreaks_100Imputs[[100]]$num_cases, reported_date = BC_LTHC_outbreaks_100Imputs[[100]]$reported_date)
lab_dates <- pretty(plot_data$reported_date)
#> Warning in as.POSIXlt.POSIXct(x): unknown timezone 'GMT'
p <- plot_data %>%
ggplot(aes(x=capacity,y=outbreak_size,color=as.numeric(reported_date))) +
geom_point(size=1.8) +
geom_abline(slope=1,intercept=0,linetype="dashed",alpha=0.5) +
coord_cartesian(expand=FALSE,xlim = c(0,305), ylim=c(0,95)) +
scale_color_continuous(breaks = as.numeric(lab_dates),
labels = lab_dates,
type = "viridis") +
theme_classic() +
labs(y="Total outbreak size",x="Maximum capacity",
color="Reported date")
show(p)
# Relabel locations according to manuscript labelling
fac_names <- c("Q", "I", "J", "L", "O", "A", "D", "N", "B", "K", "R", "H", "M", "C", "G", "F", "P", "E")
df <- data.frame(BC_LTHC_outbreaks_100Imputs[[100]]$case_matrix)
colnames(df) <- fac_names
df <- cbind(Day=1:nrow(df),df)
r0_uc_slope <- (3-1)/5
r0_lc_slope <- (1.1-1)/5
p2 <- df %>%
pivot_longer(-1) %>% filter(value > 0) %>% group_by(name) %>% mutate(csum = cumsum(value), name=factor(name, levels=fac_names)) %>%
ggplot(aes(x = Day, y = csum, group = name, colour = name)) +
geom_line(show.legend = FALSE) + theme_classic() + xlab("Days since first symptom onset") + ylab("Cumulative cases") + scale_y_log10() +
geom_abline(slope=r0_lc_slope,intercept=0,linetype="dashed", col="black") +
geom_abline(slope=r0_uc_slope,intercept=0,linetype="dashed", col="black") +
geom_dl(aes(label = name), method = list(dl.combine("last.points"), cex = 0.8)) +
annotate(
"text",
x = 5,
y = exp(5 * r0_uc_slope + 1),
angle = atan(r0_uc_slope * 10) * 180/pi,
label = "R0 = 3"
) +
annotate(
"text",
x = 10,
y = exp(10 * r0_lc_slope - 0.01),
angle = atan(r0_lc_slope * 10) * 180/pi,
label = "R0 = 1.1"
) +
coord_cartesian(ylim=c(1,100), xlim=c(0, 70), expand=FALSE)
show(p2)
For each of the 100 imputations, we estimate R0 independently in each facility using exponential growth (EG) and maximum likelihood (ML) methods, with the ‘R0’ library.
nIts <- length(BC_LTHC_outbreaks_100Imputs)
res <- vector(mode = "list", length = nIts)
for (its in 1:nIts){
mGT<-generation.time("gamma", c(5.2, 1.73))
estr0<-rep(list(1),length(BC_LTHC_outbreaks_100Imputs[[1]]$Location) )
for (i in 1:length(BC_LTHC_outbreaks_100Imputs[[1]]$Location) ){
end <- as.numeric(length(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]]) + 1 -
which.max(rev(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]])))
if (end==1){end=as.numeric(length(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]]))}
t <- try(estimate.R(epid = BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]],
GT = mGT, time.step = 1,
pop.size = BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
begin = 1, end = end,
methods=c("EG", "ML")))
if("try-error" %in% class(t)){estr0[[i]] <- estimate.R(epid =
BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]],
GT = mGT, time.step = 1, pop.size =
BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
begin = 1, end = end,methods=c("EG"))}else{
estr0[[i]] <- estimate.R(epid =
BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]],
GT = mGT, time.step = 1, begin = 1, end = end, pop.size =
BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
methods=c("EG", "ML"))}
}
names(estr0) <- paste("Facility", BC_LTHC_outbreaks_100Imputs[[its]]$Location, sep="_")
res[[its]]<-estr0
}
We collect summaries of the R0 estimates from a single imputation (the final one), and also calculate confidence intervals across all 100 imputations as a sensitivity analysis.
# First, save the complete nIts=100 sets of estimates to file (the sensitivity analysis)
# We want the mean of the 100 means, and the CI of those 100 means, per outbreak
er_df <- data.frame(Location=fac_names,
EG_mean=c(rep(NA, length(names(estr0)))),
EG_mean_CI_lower=c(rep(NA, length(names(estr0)))),
EG_mean_CI_upper=c(rep(NA, length(names(estr0)))),
ML_mean=c(rep(NA, length(names(estr0)))),
ML_mean_CI_lower=c(rep(NA, length(names(estr0)))),
ML_mean_CI_upper=c(rep(NA, length(names(estr0)))) )
# To calculate the confidence intervals on the means
normConfInt <- function(x, alpha = 0.05){
mean(x) + qt(1 - alpha / 2, length(x) - 1) * sd(x) * c(-1, 1)}
for (i in 1:length(estr0)){
a<-NULL; b<-NULL;
for (j in 1:nIts){
a <- c(a, res[[j]][[i]]$estimates$EG[1])
b <- c(b, res[[j]][[i]]$estimates$ML[1])
}
er_df[i, 2] <- mean(unlist(a))
er_df[i, 3] <- normConfInt(unlist(a))[1]
er_df[i, 4] <- normConfInt(unlist(a))[2]
er_df[i, 5] <- mean(unlist(b))
er_df[i, 6] <- normConfInt(unlist(b))[1]
er_df[i, 7] <- normConfInt(unlist(b))[2]
}
er_df <- arrange(er_df, Location)
write.table(er_df, file = paste0(outdir,"/R0estimation_results_multiple.txt"))
# show table of results
er_df %>%
kableExtra::kable() %>%
kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))
Location | EG_mean | EG_mean_CI_lower | EG_mean_CI_upper | ML_mean | ML_mean_CI_lower | ML_mean_CI_upper |
---|---|---|---|---|---|---|
A | 10.6269093 | -8.8532114 | 30.1070300 | 5.460201 | -1.0344121 | 11.954814 |
B | 0.1085062 | 0.1085062 | 0.1085062 | 1.324182 | 1.3241818 | 1.324182 |
C | 1.1573214 | 1.1573214 | 1.1573214 | 1.088969 | 1.0889689 | 1.088969 |
D | 8.5813549 | 8.5813549 | 8.5813549 | NA | NaN | NaN |
E | 6.6482600 | -6.4343068 | 19.7308269 | 3.191398 | -0.4845208 | 6.867317 |
F | 2.6483619 | 2.6483619 | 2.6483619 | 2.481302 | 2.4813016 | 2.481302 |
G | 3.2251127 | 3.2251127 | 3.2251127 | 2.815736 | 2.8157357 | 2.815736 |
H | 0.9976363 | -0.8485777 | 2.8438503 | 1.869587 | -2.2292049 | 5.968380 |
I | 0.9221586 | 0.9221586 | 0.9221586 | 1.210450 | 1.2104496 | 1.210450 |
J | 1.2938316 | 1.2938316 | 1.2938316 | 1.195217 | 1.1952168 | 1.195217 |
K | 2.3304750 | -0.0707417 | 4.7316916 | 2.022159 | 0.5724734 | 3.471845 |
L | 1.4039891 | 1.4039891 | 1.4039891 | NA | NaN | NaN |
M | 18.7880835 | -59.6174627 | 97.1936297 | 2.424265 | -2.5489625 | 7.397493 |
N | 1.1345167 | 1.1345167 | 1.1345167 | 1.295400 | 1.2953995 | 1.295400 |
O | 1.7321747 | -3.5101976 | 6.9745471 | 1.590057 | -0.6351134 | 3.815227 |
P | 4.9837638 | 3.2858072 | 6.6817205 | 5.343308 | 3.4197483 | 7.266867 |
Q | 5.2011965 | 3.0231006 | 7.3792923 | 4.917799 | 3.6901495 | 6.145448 |
R | 1.7401226 | 1.7401226 | 1.7401226 | 1.962847 | 1.9628467 | 1.962847 |
# Secondly, the final imputation as the primary result
er_df <- data.frame(Location=fac_names,
EG=c(rep(NA, length(names(estr0)))),
EG_CI_lower=c(rep(NA, length(names(estr0)))),
EG_CI_upper=c(rep(NA, length(names(estr0)))),
ML=c(rep(NA, length(names(estr0)))),
ML_CI_lower=c(rep(NA, length(names(estr0)))),
ML_CI_upper=c(rep(NA, length(names(estr0)))) )
for (i in 1:length(estr0)){
er_df[i, 2] <- estr0[[i]]$estimates$EG[1]
er_df[i, 3] <- estr0[[i]]$estimates$EG$conf.int[1]
er_df[i, 4] <- estr0[[i]]$estimates$EG$conf.int[2]
er_df[i, 5] <- ifelse(is.null(estr0[[i]]$estimates$ML[1]),NA,estr0[[i]]$estimates$ML[1])
er_df[i, 6] <- ifelse(is.null(estr0[[i]]$estimates$ML[1]),NA,estr0[[i]]$estimates$ML$conf.int[1])
er_df[i, 7] <- ifelse(is.null(estr0[[i]]$estimates$ML[1]),NA,estr0[[i]]$estimates$ML$conf.int[2])
}
er_df <- arrange(er_df, Location)
write.table(er_df, file=paste0(outdir,"/R0estimation_results_single.txt") )
# show table of results
er_df %>%
kableExtra::kable() %>%
kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))
Location | EG | EG_CI_lower | EG_CI_upper | ML | ML_CI_lower | ML_CI_upper |
---|---|---|---|---|---|---|
A | 3.0019605 | 0.0634776 | 64.353581 | 3.972551 | 0.4554797 | 14.088511 |
B | 0.1085062 | 0.0000032 | 10.860645 | 1.324182 | 0.0105389 | 8.939934 |
C | 1.1573214 | 0.6032579 | 2.213223 | 1.088969 | 0.2293632 | 3.045568 |
D | 8.5813549 | 0.0134894 | 958.227126 | NA | NA | NA |
E | 0.5181015 | 0.1915469 | 1.125294 | 0.720507 | 0.1517629 | 2.015055 |
F | 2.6483619 | 0.8002503 | 10.352171 | 2.481302 | 0.4125962 | 7.661795 |
G | 3.2251127 | 0.6506007 | 16.932821 | 2.815736 | 0.5930789 | 7.875017 |
H | 0.7188200 | 0.2881224 | 1.598253 | 1.202215 | 0.4311160 | 2.583873 |
I | 0.9221586 | 0.5117499 | 1.582353 | 1.210450 | 0.3010219 | 3.138771 |
J | 1.2938316 | 0.9845823 | 1.724130 | 1.195217 | 0.5743125 | 2.154633 |
K | 1.3991400 | 1.0340789 | 1.910533 | 1.378132 | 0.7674802 | 2.249427 |
L | 1.4039891 | 1.0733626 | 1.898347 | NA | NA | NA |
M | 1.0422230 | 0.6580577 | 1.622863 | 1.321332 | 0.6861250 | 2.263093 |
N | 1.1345167 | 0.8169595 | 1.576101 | 1.295400 | 0.6077655 | 2.371460 |
O | 1.2304994 | 1.0090784 | 1.502667 | 1.338631 | 0.9118601 | 1.881481 |
P | 4.7932071 | 2.8437951 | 8.641037 | 5.103303 | 2.9367991 | 8.138745 |
Q | 4.7467273 | 3.0491734 | 7.749353 | 5.183023 | 3.3077990 | 7.660501 |
R | 1.7401226 | 1.3305197 | 2.296654 | 1.962847 | 1.3124932 | 2.799088 |
Some simple plots of the results.
hist(er_df$EG, breaks=20, main="EG R0 estimates", xlab="R0", col="steelblue4")
hist(er_df$ML, breaks=20, main="ML R0 estimates", xlab="R0", col="steelblue4")
We calculate the attack rate in each outbreak as the number of cases divided by the facility capacity. We incorporate uncertainty in the attack rate by varying the denominator from 85% to 115% of the known capacity. R0 is also estimated from the attack rate, according to the relation -log(1-A_r)/A_r = R0.
AR_table <- data.frame("Facility" = fac_names,
"No. cases" = BC_LTHC_outbreaks_100Imputs[[100]]$num_cases,
"Capacity" = BC_LTHC_outbreaks_100Imputs[[100]]$capacity,
"A_r" =
100*BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity,
"A_r (85% cap.)" =
100*BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/(BC_LTHC_outbreaks_100Imputs[[100]]$capacity*0.85),
"A_r (115% cap.)" =
100*BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/(BC_LTHC_outbreaks_100Imputs[[100]]$capacity*1.15),
"R0 (A_r)" =
-log(1-(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity))/(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity),
"R0 (A_r, 85% cap.)" =
-log(1-(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*0.85))/(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*0.85),
"R0 (A_r, 115% cap.)" =
-log(1-(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*1.15))/(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*1.15) )
AR_table <- arrange(AR_table, Facility)
# show table of results
AR_table %>%
kableExtra::kable() %>%
kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))
Facility | No..cases | Capacity | A_r | A_r..85..cap.. | A_r..115..cap.. | R0..A_r. | R0..A_r..85..cap.. | R0..A_r..115..cap.. |
---|---|---|---|---|---|---|---|---|
A | 7 | 300 | 2.333333 | 2.745098 | 2.028986 | 1.011851 | 1.010050 | 1.013662 |
B | 3 | 66 | 4.545454 | 5.347594 | 3.952569 | 1.023440 | 1.019831 | 1.027084 |
C | 6 | 236 | 2.542373 | 2.991027 | 2.210759 | 1.012932 | 1.010963 | 1.014910 |
D | 5 | 87 | 5.747126 | 6.761325 | 4.997501 | 1.029886 | 1.025251 | 1.034578 |
E | 8 | 89 | 8.988764 | 10.575017 | 7.816317 | 1.047833 | 1.040267 | 1.055548 |
F | 6 | 92 | 6.521739 | 7.672634 | 5.671077 | 1.034100 | 1.028786 | 1.039487 |
G | 14 | 210 | 6.666667 | 7.843137 | 5.797101 | 1.034893 | 1.029451 | 1.040413 |
H | 12 | 75 | 16.000000 | 18.823529 | 13.913044 | 1.089709 | 1.074871 | 1.105114 |
I | 9 | 91 | 9.890110 | 11.635423 | 8.600096 | 1.052974 | 1.044548 | 1.061585 |
J | 25 | 189 | 13.227513 | 15.561780 | 11.502185 | 1.072617 | 1.060821 | 1.084774 |
K | 44 | 236 | 18.644068 | 21.934197 | 16.212233 | 1.106714 | 1.088749 | 1.125506 |
L | 18 | 120 | 15.000000 | 17.647059 | 13.043478 | 1.083460 | 1.069746 | 1.097659 |
M | 37 | 151 | 24.503311 | 28.827425 | 21.307227 | 1.147116 | 1.121314 | 1.174608 |
N | 19 | 40 | 47.500000 | 55.882353 | 41.304348 | 1.356541 | 1.280731 | 1.446607 |
O | 88 | 142 | 61.971831 | 72.908036 | 53.888549 | 1.560133 | 1.420292 | 1.749947 |
P | 89 | 154 | 57.792208 | 67.990833 | 50.254094 | 1.492529 | 1.375652 | 1.643764 |
Q | 67 | 139 | 48.201439 | 56.707575 | 41.914295 | 1.364706 | 1.286623 | 1.457915 |
R | 79 | 92 | 85.869565 | 101.023018 | 74.669187 | 2.278851 | 1.793323 | 4.437495 |
Next, compare EG and ML estimates with results from the Bayesian hierarchical model, against the attack rate.
# The Bayesian hierarchical model results are manually set here, so remember to update these if you are adapting this script for another analysis.
BHM <- c(0.56, 0.79, 0.86, 0.93, 1.2, 1.47, 1.52, 1.57, 2.1, 2.9, 3.2, 3.27, 3.32, 4.55, 5.73, 6.09, 8.17, 9.17)
# The following aren't used until the next chunk:
BHM_multi <- c(2.6, 2.97, 2.62, 3.28, 3.73, 3.68, 3.62, 5.39, 2.87, 2.71, 3.82, 2.37, 4.73, 6.65, 6.7, 6.91, 9.95, 9.13) # multi-level zeta point estimates
BHM_upCI <- c(1.17, 1.79, 1.54, 1.89, 2.21, 2.5, 2.33, 2.69, 3.23, 3.98, 4.35, 4.51, 4.58, 6.98, 7.76, 8.04, 11.38, 11.97) # BHM upper credible interval
BHM_multi_upCI <- c(4.96, 5.99, 4.87, 6.81, 7.46, 6.9, 6.54, 9.52, 5.14, 4.03, 5.55, 3.43, 7.23, 10.32, 8.9, 9.01, 13.21, 12.35) # multi-level zeta upper credible interval
BHM_lowCI <- c(0.16, 0.22, 0.35, 0.3, 0.46, 0.7, 0.91, 0.7, 1.29, 2.13, 2.41, 2.42, 2.43, 3.05, 4.37, 4.7, 5.29, 7.16) # BHM lower credible interval
BHM_multi_lowCI <- c(1.26, 1.38, 1.35, 1.59, 1.7, 1.8, 1.85, 2.45, 1.61, 1.83, 2.67, 1.6, 3.08, 4.1, 5.06, 5.31, 7.44, 7.06) # multi-level zeta lower credible interval
scatterdata <- data.frame(c(AR_table$Facility,AR_table$Facility,AR_table$Facility), c(AR_table$A_r,AR_table$A_r,AR_table$A_r), c(er_df$EG,er_df$ML,BHM), c(rep("Exponential growth",length(AR_table$A_r)), rep("Maximum likelihood",length(AR_table$A_r)), rep("Bayesian hierarchical",length(AR_table$A_r))) )
names(scatterdata) <- c("Facility", "ar", "r0", "Method")
p <- ggplot(scatterdata, aes(x=ar, y=r0, color=Method)) +
geom_point(size=3, alpha=0.8) +
geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
#theme_sleek() +
xlab("Attack Rate (%)") + ylab(expression(R['0,k'])) + theme(legend.position="right", legend.title = element_text(size=10, face="bold"), legend.text=element_text(size=10)) +
#scale_colour_ggthemr_d() +
annotate(geom="label", x=75, y=9.7, label=round(cor(AR_table$A_r, BHM),2), colour=palette()[2]) +
annotate(geom="label", x=75, y=1.7, label=round(cor(AR_table$A_r, er_df$EG),2), colour=palette()[3])+
annotate(geom="label", x=75, y=3.5, label=round(cor(AR_table$A_r[!is.na(er_df$ML)], er_df$ML[!is.na(er_df$ML)]),2), colour=palette()[4])
# The geom_smooth/geom_point warning messages can be safely ignored - these are caused by the missing ML values
p
#> `geom_smooth()` using formula 'y ~ x'
#> Warning: Removed 2 rows containing non-finite values (stat_smooth).
#> Warning: Removed 2 rows containing missing values (geom_point).
This figure compares point estimates and confidence intervals/credible intervals from the BHM (single- and multi- level zeta), EG and ML methods.
## First combine the results into a data frame
df <- data.frame(c(rep(er_df$Location, 4)),
(c(er_df$EG, er_df$ML , BHM, BHM_multi)),
(c(er_df$EG_CI_upper, er_df$ML_CI_upper, BHM_upCI, BHM_multi_upCI)),
(c(er_df$EG_CI_lower, er_df$ML_CI_lower, BHM_lowCI, BHM_multi_lowCI)),
c(rep("Exponential growth",
length(er_df$Location)),
rep("Maximum likelihood",
length(er_df$Location)),
rep("Bayesian hierarchical",
length(er_df$Location)),
rep("Bayesian hierarchical, multi-level",
length(er_df$Location))))
names(df) <- c("Location", "R0", "lowerCI", "upperCI", "Method")
p<-ggplot(df, aes(y=Location, x=R0, group=Method, color=Method)) +
xlab(expression(R['0,k'])) + ylab("Location") +
geom_errorbarh(mapping=aes(xmin=lowerCI, xmax=upperCI), size = 0.5, alpha=0.99, position=position_dodgev(height=0.8), height=0) +
geom_point(position=position_dodgev(height=0.8)) + #theme_sleek() +
theme(legend.position="bottom") +
guides(colour = guide_legend(nrow = 2)) + scale_x_continuous(trans="log10", limits=c(0.000001,1000), labels=c("0.001","0.01","0.1", "1.0","10","100", "1000"), breaks=c(0.001,0.01, 0.1, 1, 10, 100, 1000))
p
#> Warning: Removed 2 rows containing missing values (geom_errorbarh).
#> Warning: Removed 2 rows containing missing values (geom_point).
In this section we investigate any relationships between the facility R0 estimates and additional data obtained on the LTHC facilities (age of facility, room type etc.). The additional data is primarily contained in italicBC_OSABC_facilitydata.rdaitalic, and some factors we have loaded already (e.g. outbreak reported date).
First, we take a look at a few of these individually:
# Take a look at the additional factor data:
head(BC_OSABC_facilitydata)
#> Location LTHC facility type Year facility opened Resident room type
#> 1 1 Non-profit 1974 Mostly single
#> 2 2 Private 1960 Single
#> 3 3 Public 1998 Mostly single
#> 4 4 <NA> NA <NA>
#> 5 5 Non-profit 1947 Mostly multi
#> 6 6 Non-profit 1989 Mostly single
#> Accreditation status Direct care hours /resident/day
#> 1 Accredited 3.36
#> 2 Accredited 3.36
#> 3 Accredited 3.50
#> 4 <NA> NA
#> 5 Accredited 3.60
#> 6 Accredited 3.03
#> Average resident age (years) Average resident stay (days)
#> 1 88 831
#> 2 91 1063
#> 3 84 1296
#> 4 NA NA
#> 5 85 761
#> 6 87 699
#> Residents dependent for daily activities (%)
#> 1 29.8
#> 2 27.7
#> 3 49.3
#> 4 NA
#> 5 36.6
#> 6 28.8
#> Number of lodged complaints 2018/19 Number of disease outbreaks 2018/19
#> 1 0 0
#> 2 1 0
#> 3 0 1
#> 4 NA NA
#> 5 0 2
#> 6 5 0
#> Identity of initial COVID-19 case
#> 1 Staff/worker
#> 2 Staff/worker
#> 3 Staff/worker
#> 4 Staff/worker
#> 5 Resident/patient
#> 6 Resident/patient
# Re-order to alphabetical labelling
BC_OSABC_facilitydata$Facility = fac_names
BC_OSABC_facilitydata <- arrange(BC_OSABC_facilitydata, Facility)
# Was the initial (by symptom onset) case staff?
# (2 outbreaks with multiple earliest symptom onsets, both resident & staff: 9 and 17.
# These are coded as unknown.)
staff_cat<-as.factor(BC_OSABC_facilitydata$`Identity of initial COVID-19 case`)
# Remove the 'unknown' categorisations to make this factor dichotomous, and calculate the correlation with R0
staff_cat_di <- staff_cat[staff_cat!="Unknown"]
staff_cat_di <- unclass(staff_cat_di)
# Initial case
print(paste0("BHM and identity of initial case correlation = ", round(cor(staff_cat_di, BHM[staff_cat!="Unknown"]),3)))
#> [1] "BHM and identity of initial case correlation = 0.152"
print(paste0("BHM multi-level and identity of initial case correlation = ", round(cor(staff_cat_di, BHM_multi[staff_cat!="Unknown"]),3)))
#> [1] "BHM multi-level and identity of initial case correlation = 0.007"
print(paste0("BHM and identity of initial case correlation = ", round(cor(staff_cat_di, er_df$EG[staff_cat!="Unknown"]),3)))
#> [1] "BHM and identity of initial case correlation = -0.246"
print(paste0("BHM and identity of initial case correlation = ", round(cor(unclass(staff_cat[staff_cat!="Unknown" & !is.na(er_df$ML)]), er_df$ML[staff_cat!="Unknown" & !is.na(er_df$ML)]),3))) #only non-NA ML ests
#> [1] "BHM and identity of initial case correlation = 0.056"
# Facility capacity
fac_cap <- BC_LTHC_outbreaks_100Imputs[[100]]$capacity[order(fac_names)]
print(paste0("BHM and facility capacity correlation = ", round(cor(BHM, fac_cap),3)))
#> [1] "BHM and facility capacity correlation = -0.167"
print(paste0("BHM multi-level and facility capacity correlation = ", round(cor(BHM_multi, fac_cap),3)))
#> [1] "BHM multi-level and facility capacity correlation = -0.282"
print(paste0("EG and facility capacity correlation = ", round(cor(er_df$EG, fac_cap),3)))
#> [1] "EG and facility capacity correlation = 0.073"
print(paste0("ML and facility capacity correlation = ", round(cor(er_df$ML[!is.na(er_df$ML)], fac_cap[!is.na(er_df$ML)]),3)))
#> [1] "ML and facility capacity correlation = 0.295"
# outbreak reported date
rep_date <- BC_LTHC_outbreaks_100Imputs[[100]]$reported_date[order(fac_names)]
rep_date <- as.numeric(as.POSIXct(rep_date, format="%Y-%m-%d %H:%M:%S", tz="GMT"))
print(paste0("BHM and facility capacity correlation = ", round(cor(BHM, rep_date),3)))
#> [1] "BHM and facility capacity correlation = -0.254"
print(paste0("BHM multi-level and facility capacity correlation = ", round(cor(BHM_multi, rep_date),3)))
#> [1] "BHM multi-level and facility capacity correlation = -0.09"
print(paste0("EG and facility capacity correlation = ", round(cor(er_df$EG, rep_date),3)))
#> [1] "EG and facility capacity correlation = -0.069"
print(paste0("ML and facility capacity correlation = ", round(cor(er_df$ML[!is.na(er_df$ML)], rep_date[!is.na(er_df$ML)]),3)))
#> [1] "ML and facility capacity correlation = -0.044"
Finally, we create figures and tables for all factors available.
# Make a tibble with the loaded factor data, but also outbreak reported date and facility capacity.
cor_data <- as_tibble(BC_OSABC_facilitydata)
cor_data <- cbind(cor_data, fac_cap, rep_date)
names(cor_data)[(length(cor_data)-1):length(cor_data)] <- c("Facility capacity", "COVID-19 outbreak reported date")
# Create correlation table, for numeric covariates
rho_table <- cor_data %>%
dplyr::select("Number of disease outbreaks 2018/19","Number of lodged complaints 2018/19",
"Residents dependent for daily activities (%)","Average resident stay (days)",
"Average resident age (years)","Direct care hours /resident/day",
"Facility capacity", "COVID-19 outbreak reported date", "Year facility opened")
rho_table <-
purrr::map2_df(rho_table,colnames(rho_table),function(x,var_name){
cor.test(x,BHM) %>%
broom::tidy() %>%
mutate(Property = var_name)
}) %>%
dplyr::select(Property,everything())
# BHM correlations
rho_table %>%
kableExtra::kable() %>%
kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))
Property | estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
---|---|---|---|---|---|---|---|---|
Number of disease outbreaks 2018/19 | -0.2270282 | -0.8405090 | 0.4158149 | 13 | -0.6622703 | 0.3227725 | Pearson’s product-moment correlation | two.sided |
Number of lodged complaints 2018/19 | -0.5396202 | -2.3109734 | 0.0378793 | 13 | -0.8240837 | -0.0378087 | Pearson’s product-moment correlation | two.sided |
Residents dependent for daily activities (%) | 0.1531161 | 0.5586554 | 0.5858915 | 13 | -0.3897143 | 0.6169853 | Pearson’s product-moment correlation | two.sided |
Average resident stay (days) | 0.3763166 | 1.4644810 | 0.1668234 | 13 | -0.1684115 | 0.7449694 | Pearson’s product-moment correlation | two.sided |
Average resident age (years) | 0.0137734 | 0.0496656 | 0.9611438 | 13 | -0.5020315 | 0.5223508 | Pearson’s product-moment correlation | two.sided |
Direct care hours /resident/day | 0.3959211 | 1.5545441 | 0.1440545 | 13 | -0.1459408 | 0.7550483 | Pearson’s product-moment correlation | two.sided |
Facility capacity | -0.1668244 | -0.6767814 | 0.5082118 | 16 | -0.5879053 | 0.3253887 | Pearson’s product-moment correlation | two.sided |
COVID-19 outbreak reported date | -0.2543562 | -1.0520253 | 0.3084179 | 16 | -0.6446708 | 0.2411508 | Pearson’s product-moment correlation | two.sided |
Year facility opened | -0.1650633 | -0.5797485 | 0.5728084 | 12 | -0.6396250 | 0.4006005 | Pearson’s product-moment correlation | two.sided |
# And correlations for dichotomous categorical factors (point-biserial)
print(paste0("BHM and identity of initial case (is staff) correlation = ", cor(unclass(as.factor(cor_data$"Identity of initial COVID-19 case"[cor_data$"Identity of initial COVID-19 case"!="Unknown"])), BHM[cor_data$"Identity of initial COVID-19 case"!="Unknown"])))
#> [1] "BHM and identity of initial case (is staff) correlation = 0.152020983211341"
# Make this one negative since 'not accredited' is associated with a higher number:
print(paste0("BHM and (positive) accreditation status correlation = ", -cor(unclass(as.factor(cor_data$"Accreditation status"[!is.na(cor_data$"Accreditation status")])), BHM[!is.na(cor_data$"Accreditation status")])))
#> [1] "BHM and (positive) accreditation status correlation = 0.272308889310071"
## Plot correlations with R0
# separate numeric and character factors into 2 plots
cols_to_plot_num <- c(3, 6, 7, 8, 9, 10, 11, 14, 15)
cols_to_plot_cat <- c(2, 4, 5, 12)
cor_data <- cbind(cor_data, BHM)
cordatalong_num <- pivot_longer(cor_data, cols = all_of(cols_to_plot_num), names_to = "Factor", values_to = "Value")
cordatalong_cat <- pivot_longer(cor_data, cols = all_of(cols_to_plot_cat), names_to = "Factor", values_to = "Value")
# (modified a few aesthetics of figure manually for paper - removed second R_0,k label, rearranged some titles that were cut off)
p1 <- ggplot(data=cordatalong_num, aes(x=Value, y = BHM)) +
geom_point(size=2, colour = "#E84646") + facet_wrap(~Factor, scales = "free", ncol=4) + xlab("") + ylab(expression(R['0,k'])) +
theme(text = element_text(size=12), axis.title.y = element_text(size = 15))
p2 <- ggplot(data=cordatalong_cat, aes(x=Value, y = BHM)) +
geom_point(size=2, colour = "#E84646") + facet_wrap(~Factor, scales = "free", ncol=4) + xlab("") + ylab(expression(R['0,k'])) +
theme(text = element_text(size=12), axis.title.y = element_text(size = 15)) +
theme(axis.text.x = element_text(angle = 45, hjust=1))
grid.arrange(grobs = list(p2, p1), nrow = 2, heights = c(1.1,3))
#> Warning: Removed 22 rows containing missing values (geom_point).
# If using paper colour scheme, reset the colour theme:
#ggthemr_reset()