BC Long Term Health Care outbreak data

The data is formatted as a list with 100 imputations of the missing symptom onset times. For each imputation, the list contains data on the number of cases, time series of cases (by symptom onset), the facility capacity and the outbreak reported date for each facility outbreak. It also contains the time series of cases as a matrix, but we will not use that here.

# View the final imputation of missing data:
str(BC_LTHC_outbreaks_100Imputs[[100]])
#> List of 6
#>  $ Location     : int [1:18] 1 2 3 4 5 6 7 8 9 10 ...
#>  $ num_cases    : int [1:18] 67 9 25 18 88 7 5 19 3 44 ...
#>  $ time_series  :List of 18
#>   ..$ : num [1:40] 1 0 0 1 0 0 1 1 0 1 ...
#>   ..$ : num [1:42] 1 0 0 0 2 0 0 1 0 0 ...
#>   ..$ : num [1:47] 1 0 0 0 0 0 0 0 0 1 ...
#>   ..$ : num [1:64] 1 0 0 0 0 0 0 0 0 0 ...
#>   ..$ : num [1:49] 2 0 2 0 0 1 5 4 4 2 ...
#>   ..$ : num [1:17] 2 0 0 0 3 1 0 0 0 0 ...
#>   ..$ : num [1:15] 1 0 0 2 0 0 1 0 0 0 ...
#>   ..$ : num [1:31] 1 0 1 1 1 0 0 2 0 0 ...
#>   ..$ : num [1:5] 2 0 0 0 1
#>   ..$ : num [1:50] 2 0 0 0 0 2 0 0 2 0 ...
#>   ..$ : num [1:40] 1 2 2 1 1 3 2 2 0 1 ...
#>   ..$ : num [1:14] 2 0 1 2 2 1 0 0 0 1 ...
#>   ..$ : num [1:38] 1 0 0 4 1 1 1 4 1 3 ...
#>   ..$ : num [1:25] 1 0 0 0 0 0 0 0 0 1 ...
#>   ..$ : num [1:33] 1 0 0 1 0 0 0 0 2 2 ...
#>   ..$ : num [1:17] 1 0 0 0 0 0 0 0 0 1 ...
#>   ..$ : num [1:50] 2 0 0 0 0 0 0 0 0 2 ...
#>   ..$ : num [1:19] 3 0 0 0 0 2 0 0 0 1 ...
#>  $ capacity     : num [1:18] 139 91 189 120 142 300 87 40 66 236 ...
#>  $ reported_date: Date[1:18], format: "2020-04-28" "2020-04-25" ...
#>  $ case_matrix  : num [1:64, 1:18] 1 0 0 1 0 0 1 1 0 1 ...

Capacity by outbreak size plot

plot_data <- tibble(location=BC_LTHC_outbreaks_100Imputs[[100]]$Location,  capacity=BC_LTHC_outbreaks_100Imputs[[100]]$capacity, outbreak_size=BC_LTHC_outbreaks_100Imputs[[100]]$num_cases, reported_date = BC_LTHC_outbreaks_100Imputs[[100]]$reported_date)
  
lab_dates <- pretty(plot_data$reported_date)
#> Warning in as.POSIXlt.POSIXct(x): unknown timezone 'GMT'

p <- plot_data %>%
  ggplot(aes(x=capacity,y=outbreak_size,color=as.numeric(reported_date))) +
  geom_point(size=1.8) +
  geom_abline(slope=1,intercept=0,linetype="dashed",alpha=0.5) +
  coord_cartesian(expand=FALSE,xlim = c(0,305), ylim=c(0,95)) +
  scale_color_continuous(breaks = as.numeric(lab_dates), 
                       labels = lab_dates,
                       type = "viridis") +
  theme_classic() +
  labs(y="Total outbreak size",x="Maximum capacity",
       color="Reported date")
show(p)

Cumulative cases plot

# Relabel locations according to manuscript labelling
fac_names <- c("Q", "I", "J", "L", "O", "A", "D", "N", "B", "K", "R", "H", "M", "C", "G", "F", "P", "E")

df <- data.frame(BC_LTHC_outbreaks_100Imputs[[100]]$case_matrix)
colnames(df) <- fac_names
df <- cbind(Day=1:nrow(df),df)

r0_uc_slope <- (3-1)/5
r0_lc_slope <- (1.1-1)/5

p2 <- df %>% 
  pivot_longer(-1) %>% filter(value > 0) %>% group_by(name) %>% mutate(csum = cumsum(value), name=factor(name, levels=fac_names)) %>%
  ggplot(aes(x = Day, y = csum, group = name, colour = name)) + 
  geom_line(show.legend = FALSE) + theme_classic() + xlab("Days since first symptom onset") +  ylab("Cumulative cases") + scale_y_log10() +
  geom_abline(slope=r0_lc_slope,intercept=0,linetype="dashed", col="black") + 
  geom_abline(slope=r0_uc_slope,intercept=0,linetype="dashed", col="black") +
  geom_dl(aes(label = name), method = list(dl.combine("last.points"), cex = 0.8)) + 
  annotate(
    "text",
    x = 5,
    y = exp(5 * r0_uc_slope + 1),
    angle = atan(r0_uc_slope * 10) * 180/pi,
    label = "R0 = 3"
  ) +
  annotate(
    "text",
    x = 10,
    y = exp(10 * r0_lc_slope - 0.01),
    angle = atan(r0_lc_slope * 10) * 180/pi,
    label = "R0 = 1.1"
  ) +
  coord_cartesian(ylim=c(1,100), xlim=c(0, 70), expand=FALSE)

show(p2)

Estimate R0 in each outbreak, using EG and ML methods

For each of the 100 imputations, we estimate R0 independently in each facility using exponential growth (EG) and maximum likelihood (ML) methods, with the ‘R0’ library.

nIts <- length(BC_LTHC_outbreaks_100Imputs)
res <- vector(mode = "list", length = nIts)

for (its in 1:nIts){
mGT<-generation.time("gamma", c(5.2, 1.73))
estr0<-rep(list(1),length(BC_LTHC_outbreaks_100Imputs[[1]]$Location) )
for (i in 1:length(BC_LTHC_outbreaks_100Imputs[[1]]$Location) ){
      end <- as.numeric(length(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]]) + 1 - 
        which.max(rev(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]])))
      if (end==1){end=as.numeric(length(BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]]))}
      t <- try(estimate.R(epid = BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]],
                          GT = mGT, time.step = 1, 
                          pop.size = BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
                          begin = 1, end = end,
                          methods=c("EG", "ML")))
      if("try-error" %in% class(t)){estr0[[i]] <- estimate.R(epid = 
                              BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]], 
                              GT = mGT, time.step = 1, pop.size =
                              BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
                              begin = 1, end = end,methods=c("EG"))}else{
                              estr0[[i]] <- estimate.R(epid = 
                              BC_LTHC_outbreaks_100Imputs[[its]]$time_series[[i]], 
                              GT = mGT, time.step = 1, begin = 1, end = end, pop.size =
                              BC_LTHC_outbreaks_100Imputs[[its]]$capacity[i],
                              methods=c("EG", "ML"))}
}
names(estr0) <- paste("Facility", BC_LTHC_outbreaks_100Imputs[[its]]$Location, sep="_")
res[[its]]<-estr0
}

Output files and summary

We collect summaries of the R0 estimates from a single imputation (the final one), and also calculate confidence intervals across all 100 imputations as a sensitivity analysis.

# First, save the complete nIts=100 sets of estimates to file (the sensitivity analysis) 
# We want the mean of the 100 means, and the CI of those 100 means, per outbreak
er_df <- data.frame(Location=fac_names,
                    EG_mean=c(rep(NA, length(names(estr0)))),
                    EG_mean_CI_lower=c(rep(NA, length(names(estr0)))),
                    EG_mean_CI_upper=c(rep(NA, length(names(estr0)))),
                    ML_mean=c(rep(NA, length(names(estr0)))),
                    ML_mean_CI_lower=c(rep(NA, length(names(estr0)))),
                    ML_mean_CI_upper=c(rep(NA, length(names(estr0)))) )


# To calculate the confidence intervals on the means
normConfInt <- function(x, alpha = 0.05){
  mean(x) + qt(1 - alpha / 2, length(x) - 1) * sd(x) * c(-1, 1)}

for (i in 1:length(estr0)){
  a<-NULL; b<-NULL;
  for (j in 1:nIts){
  a <- c(a, res[[j]][[i]]$estimates$EG[1])
  b <- c(b, res[[j]][[i]]$estimates$ML[1])
  }
  er_df[i, 2] <- mean(unlist(a))
  er_df[i, 3] <- normConfInt(unlist(a))[1]
  er_df[i, 4] <- normConfInt(unlist(a))[2]
  er_df[i, 5] <- mean(unlist(b))
  er_df[i, 6] <- normConfInt(unlist(b))[1]
  er_df[i, 7] <- normConfInt(unlist(b))[2]
}
er_df <- arrange(er_df, Location)
write.table(er_df, file = paste0(outdir,"/R0estimation_results_multiple.txt"))

# show table of results
er_df %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))
Location EG_mean EG_mean_CI_lower EG_mean_CI_upper ML_mean ML_mean_CI_lower ML_mean_CI_upper
A 10.6269093 -8.8532114 30.1070300 5.460201 -1.0344121 11.954814
B 0.1085062 0.1085062 0.1085062 1.324182 1.3241818 1.324182
C 1.1573214 1.1573214 1.1573214 1.088969 1.0889689 1.088969
D 8.5813549 8.5813549 8.5813549 NA NaN NaN
E 6.6482600 -6.4343068 19.7308269 3.191398 -0.4845208 6.867317
F 2.6483619 2.6483619 2.6483619 2.481302 2.4813016 2.481302
G 3.2251127 3.2251127 3.2251127 2.815736 2.8157357 2.815736
H 0.9976363 -0.8485777 2.8438503 1.869587 -2.2292049 5.968380
I 0.9221586 0.9221586 0.9221586 1.210450 1.2104496 1.210450
J 1.2938316 1.2938316 1.2938316 1.195217 1.1952168 1.195217
K 2.3304750 -0.0707417 4.7316916 2.022159 0.5724734 3.471845
L 1.4039891 1.4039891 1.4039891 NA NaN NaN
M 18.7880835 -59.6174627 97.1936297 2.424265 -2.5489625 7.397493
N 1.1345167 1.1345167 1.1345167 1.295400 1.2953995 1.295400
O 1.7321747 -3.5101976 6.9745471 1.590057 -0.6351134 3.815227
P 4.9837638 3.2858072 6.6817205 5.343308 3.4197483 7.266867
Q 5.2011965 3.0231006 7.3792923 4.917799 3.6901495 6.145448
R 1.7401226 1.7401226 1.7401226 1.962847 1.9628467 1.962847

# Secondly, the final imputation as the primary result
er_df <- data.frame(Location=fac_names,
                 EG=c(rep(NA, length(names(estr0)))),
                 EG_CI_lower=c(rep(NA, length(names(estr0)))),
                 EG_CI_upper=c(rep(NA, length(names(estr0)))),
                 ML=c(rep(NA, length(names(estr0)))),
                 ML_CI_lower=c(rep(NA, length(names(estr0)))),
                 ML_CI_upper=c(rep(NA, length(names(estr0)))) )

for (i in 1:length(estr0)){
  er_df[i, 2] <- estr0[[i]]$estimates$EG[1]
  er_df[i, 3] <- estr0[[i]]$estimates$EG$conf.int[1]
  er_df[i, 4] <- estr0[[i]]$estimates$EG$conf.int[2]
  er_df[i, 5] <- ifelse(is.null(estr0[[i]]$estimates$ML[1]),NA,estr0[[i]]$estimates$ML[1])
  er_df[i, 6] <- ifelse(is.null(estr0[[i]]$estimates$ML[1]),NA,estr0[[i]]$estimates$ML$conf.int[1])
  er_df[i, 7] <- ifelse(is.null(estr0[[i]]$estimates$ML[1]),NA,estr0[[i]]$estimates$ML$conf.int[2])
}
er_df <- arrange(er_df, Location)
write.table(er_df, file=paste0(outdir,"/R0estimation_results_single.txt") )

# show table of results
er_df %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))
Location EG EG_CI_lower EG_CI_upper ML ML_CI_lower ML_CI_upper
A 3.0019605 0.0634776 64.353581 3.972551 0.4554797 14.088511
B 0.1085062 0.0000032 10.860645 1.324182 0.0105389 8.939934
C 1.1573214 0.6032579 2.213223 1.088969 0.2293632 3.045568
D 8.5813549 0.0134894 958.227126 NA NA NA
E 0.5181015 0.1915469 1.125294 0.720507 0.1517629 2.015055
F 2.6483619 0.8002503 10.352171 2.481302 0.4125962 7.661795
G 3.2251127 0.6506007 16.932821 2.815736 0.5930789 7.875017
H 0.7188200 0.2881224 1.598253 1.202215 0.4311160 2.583873
I 0.9221586 0.5117499 1.582353 1.210450 0.3010219 3.138771
J 1.2938316 0.9845823 1.724130 1.195217 0.5743125 2.154633
K 1.3991400 1.0340789 1.910533 1.378132 0.7674802 2.249427
L 1.4039891 1.0733626 1.898347 NA NA NA
M 1.0422230 0.6580577 1.622863 1.321332 0.6861250 2.263093
N 1.1345167 0.8169595 1.576101 1.295400 0.6077655 2.371460
O 1.2304994 1.0090784 1.502667 1.338631 0.9118601 1.881481
P 4.7932071 2.8437951 8.641037 5.103303 2.9367991 8.138745
Q 4.7467273 3.0491734 7.749353 5.183023 3.3077990 7.660501
R 1.7401226 1.3305197 2.296654 1.962847 1.3124932 2.799088

Quick results visualisation

Some simple plots of the results.

hist(er_df$EG, breaks=20, main="EG R0 estimates", xlab="R0", col="steelblue4")

hist(er_df$ML, breaks=20, main="ML R0 estimates", xlab="R0", col="steelblue4")

Attack rates

We calculate the attack rate in each outbreak as the number of cases divided by the facility capacity. We incorporate uncertainty in the attack rate by varying the denominator from 85% to 115% of the known capacity. R0 is also estimated from the attack rate, according to the relation -log(1-A_r)/A_r = R0.

AR_table <- data.frame("Facility" = fac_names,  
                  "No. cases" = BC_LTHC_outbreaks_100Imputs[[100]]$num_cases,
                  "Capacity" = BC_LTHC_outbreaks_100Imputs[[100]]$capacity,
                  "A_r" = 
                    100*BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity,
                  "A_r (85% cap.)" =
                    100*BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/(BC_LTHC_outbreaks_100Imputs[[100]]$capacity*0.85),
                  "A_r (115% cap.)" =
                    100*BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/(BC_LTHC_outbreaks_100Imputs[[100]]$capacity*1.15),
                  "R0 (A_r)" =
                    -log(1-(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity))/(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity),
                  "R0 (A_r, 85% cap.)" =
                    -log(1-(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*0.85))/(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*0.85),
                  "R0 (A_r, 115% cap.)" =
                    -log(1-(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*1.15))/(BC_LTHC_outbreaks_100Imputs[[100]]$num_cases/BC_LTHC_outbreaks_100Imputs[[100]]$capacity*1.15) )
AR_table <- arrange(AR_table, Facility)

# show table of results
AR_table %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))
Facility No..cases Capacity A_r A_r..85..cap.. A_r..115..cap.. R0..A_r. R0..A_r..85..cap.. R0..A_r..115..cap..
A 7 300 2.333333 2.745098 2.028986 1.011851 1.010050 1.013662
B 3 66 4.545454 5.347594 3.952569 1.023440 1.019831 1.027084
C 6 236 2.542373 2.991027 2.210759 1.012932 1.010963 1.014910
D 5 87 5.747126 6.761325 4.997501 1.029886 1.025251 1.034578
E 8 89 8.988764 10.575017 7.816317 1.047833 1.040267 1.055548
F 6 92 6.521739 7.672634 5.671077 1.034100 1.028786 1.039487
G 14 210 6.666667 7.843137 5.797101 1.034893 1.029451 1.040413
H 12 75 16.000000 18.823529 13.913044 1.089709 1.074871 1.105114
I 9 91 9.890110 11.635423 8.600096 1.052974 1.044548 1.061585
J 25 189 13.227513 15.561780 11.502185 1.072617 1.060821 1.084774
K 44 236 18.644068 21.934197 16.212233 1.106714 1.088749 1.125506
L 18 120 15.000000 17.647059 13.043478 1.083460 1.069746 1.097659
M 37 151 24.503311 28.827425 21.307227 1.147116 1.121314 1.174608
N 19 40 47.500000 55.882353 41.304348 1.356541 1.280731 1.446607
O 88 142 61.971831 72.908036 53.888549 1.560133 1.420292 1.749947
P 89 154 57.792208 67.990833 50.254094 1.492529 1.375652 1.643764
Q 67 139 48.201439 56.707575 41.914295 1.364706 1.286623 1.457915
R 79 92 85.869565 101.023018 74.669187 2.278851 1.793323 4.437495

Compare attack rate to R0 estimates from different methods

Next, compare EG and ML estimates with results from the Bayesian hierarchical model, against the attack rate.

# The Bayesian hierarchical model results are manually set here, so remember to update these if you are adapting this script for another analysis. 
BHM <- c(0.56, 0.79, 0.86, 0.93, 1.2, 1.47, 1.52, 1.57, 2.1, 2.9, 3.2, 3.27, 3.32, 4.55, 5.73, 6.09, 8.17, 9.17)
# The following aren't used until the next chunk:
BHM_multi <- c(2.6, 2.97, 2.62, 3.28, 3.73, 3.68, 3.62, 5.39, 2.87, 2.71, 3.82, 2.37, 4.73, 6.65, 6.7, 6.91, 9.95, 9.13) # multi-level zeta point estimates
BHM_upCI <- c(1.17, 1.79, 1.54, 1.89, 2.21, 2.5, 2.33, 2.69, 3.23, 3.98, 4.35, 4.51, 4.58, 6.98, 7.76, 8.04, 11.38, 11.97) # BHM upper credible interval
BHM_multi_upCI <- c(4.96, 5.99, 4.87, 6.81, 7.46, 6.9, 6.54, 9.52, 5.14, 4.03, 5.55, 3.43, 7.23, 10.32, 8.9, 9.01, 13.21, 12.35) # multi-level zeta upper credible interval
BHM_lowCI <- c(0.16, 0.22, 0.35, 0.3, 0.46, 0.7, 0.91, 0.7, 1.29, 2.13, 2.41, 2.42, 2.43, 3.05, 4.37, 4.7, 5.29, 7.16) # BHM lower credible interval
BHM_multi_lowCI <- c(1.26, 1.38, 1.35, 1.59, 1.7, 1.8, 1.85, 2.45, 1.61, 1.83, 2.67, 1.6, 3.08, 4.1, 5.06, 5.31, 7.44, 7.06) # multi-level zeta lower credible interval

scatterdata <- data.frame(c(AR_table$Facility,AR_table$Facility,AR_table$Facility), c(AR_table$A_r,AR_table$A_r,AR_table$A_r), c(er_df$EG,er_df$ML,BHM), c(rep("Exponential growth",length(AR_table$A_r)), rep("Maximum likelihood",length(AR_table$A_r)), rep("Bayesian hierarchical",length(AR_table$A_r))) )
names(scatterdata) <- c("Facility", "ar", "r0", "Method")

p <- ggplot(scatterdata, aes(x=ar, y=r0, color=Method)) +
  geom_point(size=3, alpha=0.8) +
  geom_smooth(method=lm, se=FALSE, fullrange=TRUE)+
  #theme_sleek() + 
  xlab("Attack Rate (%)") + ylab(expression(R['0,k'])) +  theme(legend.position="right", legend.title = element_text(size=10,                                           face="bold"), legend.text=element_text(size=10)) +
  #scale_colour_ggthemr_d() +
  annotate(geom="label", x=75, y=9.7, label=round(cor(AR_table$A_r, BHM),2), colour=palette()[2]) +
  annotate(geom="label", x=75, y=1.7, label=round(cor(AR_table$A_r, er_df$EG),2), colour=palette()[3])+
  annotate(geom="label", x=75, y=3.5, label=round(cor(AR_table$A_r[!is.na(er_df$ML)], er_df$ML[!is.na(er_df$ML)]),2), colour=palette()[4]) 
# The geom_smooth/geom_point warning messages can be safely ignored - these are caused by the missing ML values
p
#> `geom_smooth()` using formula 'y ~ x'
#> Warning: Removed 2 rows containing non-finite values (stat_smooth).
#> Warning: Removed 2 rows containing missing values (geom_point).

Compare R0 estimates from three main methods (BHM, EG, ML)

This figure compares point estimates and confidence intervals/credible intervals from the BHM (single- and multi- level zeta), EG and ML methods.


## First combine the results into a data frame
df <- data.frame(c(rep(er_df$Location, 4)),
                 (c(er_df$EG, er_df$ML , BHM, BHM_multi)),
                 (c(er_df$EG_CI_upper, er_df$ML_CI_upper, BHM_upCI, BHM_multi_upCI)),
                 (c(er_df$EG_CI_lower, er_df$ML_CI_lower, BHM_lowCI, BHM_multi_lowCI)),
                 c(rep("Exponential growth",
                       length(er_df$Location)),
                   rep("Maximum likelihood",
                       length(er_df$Location)),
                   rep("Bayesian hierarchical",
                       length(er_df$Location)),
                   rep("Bayesian hierarchical, multi-level",
                       length(er_df$Location))))
names(df) <- c("Location", "R0", "lowerCI", "upperCI", "Method")

p<-ggplot(df, aes(y=Location, x=R0, group=Method, color=Method)) +
  xlab(expression(R['0,k'])) + ylab("Location") +
  geom_errorbarh(mapping=aes(xmin=lowerCI, xmax=upperCI), size = 0.5, alpha=0.99, position=position_dodgev(height=0.8), height=0) +
  geom_point(position=position_dodgev(height=0.8)) + #theme_sleek() +
  theme(legend.position="bottom") +
  guides(colour = guide_legend(nrow = 2)) + scale_x_continuous(trans="log10", limits=c(0.000001,1000), labels=c("0.001","0.01","0.1", "1.0","10","100", "1000"), breaks=c(0.001,0.01, 0.1, 1, 10, 100, 1000))
p
#> Warning: Removed 2 rows containing missing values (geom_errorbarh).
#> Warning: Removed 2 rows containing missing values (geom_point).

Correlations between R0 estimates and additional LTHC facility covariate data

In this section we investigate any relationships between the facility R0 estimates and additional data obtained on the LTHC facilities (age of facility, room type etc.). The additional data is primarily contained in italicBC_OSABC_facilitydata.rdaitalic, and some factors we have loaded already (e.g. outbreak reported date).

First, we take a look at a few of these individually:


# Take a look at the additional factor data:
head(BC_OSABC_facilitydata)
#>   Location LTHC facility type Year facility opened Resident room type
#> 1        1         Non-profit                 1974      Mostly single
#> 2        2            Private                 1960             Single
#> 3        3             Public                 1998      Mostly single
#> 4        4               <NA>                   NA               <NA>
#> 5        5         Non-profit                 1947       Mostly multi
#> 6        6         Non-profit                 1989      Mostly single
#>   Accreditation status Direct care hours /resident/day
#> 1           Accredited                            3.36
#> 2           Accredited                            3.36
#> 3           Accredited                            3.50
#> 4                 <NA>                              NA
#> 5           Accredited                            3.60
#> 6           Accredited                            3.03
#>   Average resident age (years) Average resident stay (days)
#> 1                           88                          831
#> 2                           91                         1063
#> 3                           84                         1296
#> 4                           NA                           NA
#> 5                           85                          761
#> 6                           87                          699
#>   Residents dependent for daily activities (%)
#> 1                                         29.8
#> 2                                         27.7
#> 3                                         49.3
#> 4                                           NA
#> 5                                         36.6
#> 6                                         28.8
#>   Number of lodged complaints 2018/19 Number of disease outbreaks 2018/19
#> 1                                   0                                   0
#> 2                                   1                                   0
#> 3                                   0                                   1
#> 4                                  NA                                  NA
#> 5                                   0                                   2
#> 6                                   5                                   0
#>   Identity of initial COVID-19 case
#> 1                      Staff/worker
#> 2                      Staff/worker
#> 3                      Staff/worker
#> 4                      Staff/worker
#> 5                  Resident/patient
#> 6                  Resident/patient
# Re-order to alphabetical labelling
BC_OSABC_facilitydata$Facility = fac_names
BC_OSABC_facilitydata <- arrange(BC_OSABC_facilitydata, Facility)

# Was the initial (by symptom onset) case staff?
# (2 outbreaks with multiple earliest symptom onsets, both resident & staff: 9 and 17.
# These are coded as unknown.)
staff_cat<-as.factor(BC_OSABC_facilitydata$`Identity of initial COVID-19 case`)

# Remove the 'unknown' categorisations to make this factor dichotomous, and calculate the correlation with R0
staff_cat_di <- staff_cat[staff_cat!="Unknown"]
staff_cat_di <- unclass(staff_cat_di)

# Initial case
print(paste0("BHM and identity of initial case correlation = ", round(cor(staff_cat_di, BHM[staff_cat!="Unknown"]),3)))
#> [1] "BHM and identity of initial case correlation = 0.152"
print(paste0("BHM multi-level and identity of initial case correlation = ", round(cor(staff_cat_di, BHM_multi[staff_cat!="Unknown"]),3)))
#> [1] "BHM multi-level and identity of initial case correlation = 0.007"
print(paste0("BHM and identity of initial case correlation = ", round(cor(staff_cat_di, er_df$EG[staff_cat!="Unknown"]),3)))
#> [1] "BHM and identity of initial case correlation = -0.246"
print(paste0("BHM and identity of initial case correlation = ", round(cor(unclass(staff_cat[staff_cat!="Unknown" & !is.na(er_df$ML)]), er_df$ML[staff_cat!="Unknown" & !is.na(er_df$ML)]),3))) #only non-NA ML ests
#> [1] "BHM and identity of initial case correlation = 0.056"


# Facility capacity
fac_cap <- BC_LTHC_outbreaks_100Imputs[[100]]$capacity[order(fac_names)]
print(paste0("BHM and facility capacity correlation = ", round(cor(BHM, fac_cap),3)))
#> [1] "BHM and facility capacity correlation = -0.167"
print(paste0("BHM multi-level and facility capacity correlation = ", round(cor(BHM_multi, fac_cap),3)))
#> [1] "BHM multi-level and facility capacity correlation = -0.282"
print(paste0("EG and facility capacity correlation = ", round(cor(er_df$EG, fac_cap),3)))
#> [1] "EG and facility capacity correlation = 0.073"
print(paste0("ML and facility capacity correlation = ", round(cor(er_df$ML[!is.na(er_df$ML)], fac_cap[!is.na(er_df$ML)]),3)))
#> [1] "ML and facility capacity correlation = 0.295"


# outbreak reported date
rep_date <- BC_LTHC_outbreaks_100Imputs[[100]]$reported_date[order(fac_names)]
rep_date <- as.numeric(as.POSIXct(rep_date, format="%Y-%m-%d %H:%M:%S", tz="GMT"))
print(paste0("BHM and facility capacity correlation = ", round(cor(BHM, rep_date),3)))
#> [1] "BHM and facility capacity correlation = -0.254"
print(paste0("BHM multi-level and facility capacity correlation = ", round(cor(BHM_multi, rep_date),3)))
#> [1] "BHM multi-level and facility capacity correlation = -0.09"
print(paste0("EG and facility capacity correlation = ", round(cor(er_df$EG, rep_date),3)))
#> [1] "EG and facility capacity correlation = -0.069"
print(paste0("ML and facility capacity correlation = ", round(cor(er_df$ML[!is.na(er_df$ML)], rep_date[!is.na(er_df$ML)]),3)))
#> [1] "ML and facility capacity correlation = -0.044"

Finally, we create figures and tables for all factors available.

 # Make a tibble with the loaded factor data, but also outbreak reported date and facility capacity.
cor_data <- as_tibble(BC_OSABC_facilitydata)
cor_data <- cbind(cor_data, fac_cap, rep_date)
names(cor_data)[(length(cor_data)-1):length(cor_data)] <- c("Facility capacity", "COVID-19 outbreak reported date")

# Create correlation table, for numeric covariates
rho_table <- cor_data %>%
  dplyr::select("Number of disease outbreaks 2018/19","Number of lodged complaints 2018/19",
         "Residents dependent for daily activities (%)","Average resident stay (days)",
         "Average resident age (years)","Direct care hours /resident/day",
         "Facility capacity", "COVID-19 outbreak reported date", "Year facility opened")
rho_table <-
  purrr::map2_df(rho_table,colnames(rho_table),function(x,var_name){
    cor.test(x,BHM) %>%
      broom::tidy() %>%
      mutate(Property = var_name)
  }) %>%
  dplyr::select(Property,everything())
# BHM correlations
rho_table %>%
  kableExtra::kable() %>%
  kableExtra::kable_styling(bootstrap_options = c("striped","responsive"))
Property estimate statistic p.value parameter conf.low conf.high method alternative
Number of disease outbreaks 2018/19 -0.2270282 -0.8405090 0.4158149 13 -0.6622703 0.3227725 Pearson’s product-moment correlation two.sided
Number of lodged complaints 2018/19 -0.5396202 -2.3109734 0.0378793 13 -0.8240837 -0.0378087 Pearson’s product-moment correlation two.sided
Residents dependent for daily activities (%) 0.1531161 0.5586554 0.5858915 13 -0.3897143 0.6169853 Pearson’s product-moment correlation two.sided
Average resident stay (days) 0.3763166 1.4644810 0.1668234 13 -0.1684115 0.7449694 Pearson’s product-moment correlation two.sided
Average resident age (years) 0.0137734 0.0496656 0.9611438 13 -0.5020315 0.5223508 Pearson’s product-moment correlation two.sided
Direct care hours /resident/day 0.3959211 1.5545441 0.1440545 13 -0.1459408 0.7550483 Pearson’s product-moment correlation two.sided
Facility capacity -0.1668244 -0.6767814 0.5082118 16 -0.5879053 0.3253887 Pearson’s product-moment correlation two.sided
COVID-19 outbreak reported date -0.2543562 -1.0520253 0.3084179 16 -0.6446708 0.2411508 Pearson’s product-moment correlation two.sided
Year facility opened -0.1650633 -0.5797485 0.5728084 12 -0.6396250 0.4006005 Pearson’s product-moment correlation two.sided


# And correlations for dichotomous categorical factors (point-biserial)
print(paste0("BHM and identity of initial case (is staff) correlation = ", cor(unclass(as.factor(cor_data$"Identity of initial COVID-19 case"[cor_data$"Identity of initial COVID-19 case"!="Unknown"])), BHM[cor_data$"Identity of initial COVID-19 case"!="Unknown"])))
#> [1] "BHM and identity of initial case (is staff) correlation = 0.152020983211341"
# Make this one negative since 'not accredited' is associated with a higher number: 
print(paste0("BHM and (positive) accreditation status correlation = ", -cor(unclass(as.factor(cor_data$"Accreditation status"[!is.na(cor_data$"Accreditation status")])), BHM[!is.na(cor_data$"Accreditation status")])))
#> [1] "BHM and (positive) accreditation status correlation = 0.272308889310071"



## Plot correlations with R0

# separate numeric and character factors into 2 plots
cols_to_plot_num <- c(3, 6, 7, 8, 9, 10, 11, 14, 15)
cols_to_plot_cat <- c(2, 4, 5, 12)
cor_data <- cbind(cor_data, BHM)

cordatalong_num <- pivot_longer(cor_data, cols = all_of(cols_to_plot_num), names_to = "Factor", values_to = "Value")
cordatalong_cat <- pivot_longer(cor_data, cols = all_of(cols_to_plot_cat), names_to = "Factor", values_to = "Value")

# (modified a few aesthetics of figure manually for paper - removed second R_0,k label, rearranged some titles that were cut off)
p1 <- ggplot(data=cordatalong_num, aes(x=Value, y = BHM)) +
  geom_point(size=2, colour = "#E84646") + facet_wrap(~Factor,  scales = "free", ncol=4) + xlab("") + ylab(expression(R['0,k'])) +
  theme(text = element_text(size=12), axis.title.y = element_text(size = 15))

p2 <- ggplot(data=cordatalong_cat, aes(x=Value, y = BHM)) +
  geom_point(size=2, colour = "#E84646") + facet_wrap(~Factor,  scales = "free", ncol=4) + xlab("") + ylab(expression(R['0,k'])) +
  theme(text = element_text(size=12), axis.title.y = element_text(size = 15)) +
  theme(axis.text.x = element_text(angle = 45, hjust=1))
grid.arrange(grobs = list(p2, p1), nrow = 2, heights = c(1.1,3))
#> Warning: Removed 22 rows containing missing values (geom_point).


# If using paper colour scheme, reset the colour theme:
#ggthemr_reset()