library(data.table) # for data manipulation and file reading library(stringi) # for string processing library(lubridate) # for date processing #---------------------- # read and process data #---------------------- # read data d <- fread("cov_seasonality_immunity_public_data_23_march_2020.csv") # format variables d[, weekstart := dmy(weekstart)] seasons <- c("Nov-Mar 2006/7", "Nov-Mar 2007/8", "Nov-Mar 2008/9", "May-Sep 2009", "Oct-Feb 2009/10", "Nov-Mar 2010/11") d[, flu5mperiod := factor(flu5mperiod, seasons)] d[, mth := month(weekstart, label = T)] d[, mth := factor(mth, month.abb, month.abb)] # number of tests sapply(d[flu5mperiod %in% seasons[1:4], pcr, with = F], table) #1,104 tests in seasons 1-4 # create any flu variable pcr <- c('OV_FluA', 'OV_FluB', 'OV_hRhV', 'OV_CoV', 'OV_AdV', 'OV_RSV', 'OV_PIV', 'OV_hMPV', 'OV_H1N1p', 'OV_H1N1s', 'OV_H3N2') d[, (pcr) := lapply(.SD, function(x) x == 'Yes [1]'), .SDcols = pcr] pcr_flu <- c('OV_FluA', 'OV_FluB', 'OV_H1N1p', 'OV_H1N1s', 'OV_H3N2') d[, anyFlu := rowSums(d[, pcr_flu, with = F] == T) > 0] pcr2 <- c(setdiff(pcr, pcr_flu), 'anyFlu') # reshape data strains <- c('NL63', '229E', 'OC43') d <- melt(d, measure.vars = c(pcr2, strains), variable.name = 'pcr', variable.factor = F, value.name = 'result') #------------------ # incidence by week #------------------ inc <- d[weekstart >= as.Date('2007-01-01') & pcr %in% pcr2, .(person_weeks = .N, pos = sum(result)), c('weekstart', 'flu5mperiod', 'pcr')] pois_ci <- function(x, t) c(rate = x / t, poisson.test(x, t)$conf.int[1:2]) inc_ci <- t(mapply(pois_ci, x = inc$pos, t = inc$person_weeks)) * 100000 colnames(inc_ci) <- c('inc', 'lower', 'upper') inc <- cbind(inc, inc_ci) inc <- inc[order(weekstart)] # weeks with < 100 person-weeks are not included in the article #----------------------------------------------------------- # evidence of immunity, based on repeat confirmed infections #----------------------------------------------------------- # make dataset cov <- d[pcr %chin% strains & result == T][order(weekstart)] dist <- cov[, .N, pcr] # distribution of strains dist[, prob := N / sum(N)] cov[, n := rank(weekstart, ties.method = 'last'), p_id] cov_strain <- dcast(cov, p_id ~ n, value.var = 'pcr') setnames(cov_strain, as.character(1:2), c('first', 'second')) cov_week <- dcast(cov, p_id ~ n, value.var = 'weekstart') setnames(cov_week, as.character(1:2), c('date_first', 'date_second')) cov <- cov_week[cov_strain, on = 'p_id'] cov <- cov[!is.na(second) & date_first != date_second] # only infections that occur in different weeks cov <- unique(d[, c('p_id', 'p_lh')])[cov, on = 'p_id'] # check households # simulations set.seed(27) B <- 100000 scens <- sample(dist$pcr, B * nrow(cov), replace = T, prob = dist$prob) scens <- matrix(scens, ncol = B) sum(colSums(scens == cov$first) == 0) / B # p-value # check by calculating probability (after observing zero reinfections) setnames(dist, 'pcr', 'first') cov <- dist[cov, on = 'first'] prod(1 - cov$prob) # probability of getting different infection second time #---------------------------- # table 1 (descriptive table) #---------------------------- # make descriptive variables d[, total := 'total'] d[, ps := paste0(p_id, '_', flu5mperiod)] dv <- c('total', 'age_group', 'sex', 'area', 'imd5', 'flu5mperiod', 'mth') # function for summarising categorical variables df <- function(var = 'age_group', DAT = d) { a <- DAT[pcr == 'OV_CoV', .(individuals = uniqueN(p_id), seasons = uniqueN(ps), person_weeks = .N, isolates = sum(result)), var] a <- a[order(get(var))] a[, individuals_pc := individuals / sum(individuals)] a[, individuals_pc := paste0(format(round(individuals_pc * 100, 1), nsmall = 1, digits = 1), '%')] rt <- mapply(pois_ci, x = a$isolates, t = a$person_weeks) rt <- rt * 100000 rt <- t(rt) colnames(rt) <- c('rate', 'lower', 'upper') rt <- round(rt, 0) a <- cbind(a, rt) fc <- c('individuals', 'seasons', 'person_weeks', 'rate', 'lower', 'upper') a[, (fc) := lapply(.SD, format, big.mark = ','), .SDcols = fc] a[, rate := paste0(rate, ' (', lower, '-', upper, ')')] a[, individuals := paste0(individuals, ' (', individuals_pc, ')')] names(a)[1] <- 'level' a[, (fc) := lapply(.SD, stri_replace_all_fixed, pattern = ' ', replacement = ''), .SDcols = fc] a[, (fc) := lapply(.SD, stri_replace_all_fixed, pattern = '(', replacement = ' ('), .SDcols = fc] a[, variable := c(var, rep('', nrow(a) - 1))] a[, c('variable', 'level', 'individuals', 'seasons', 'person_weeks', 'isolates', 'rate')] } desc.tab <- rbindlist(lapply(dv, df, DAT = d[flu5mperiod %in% seasons[1:4]]))