library(data.table) # for data manipulation and file reading
library(stringi) # for string processing
library(lubridate) # for date processing

#----------------------
# read and process data
#----------------------

# read data
d <- fread("cov_seasonality_immunity_public_data_23_march_2020.csv")

# format variables
d[, weekstart := dmy(weekstart)]
seasons <- c("Nov-Mar 2006/7", "Nov-Mar 2007/8", "Nov-Mar 2008/9", "May-Sep 2009", "Oct-Feb 2009/10", "Nov-Mar 2010/11")
d[, flu5mperiod := factor(flu5mperiod, seasons)]
d[, mth := month(weekstart, label = T)]
d[, mth := factor(mth, month.abb, month.abb)]

# number of tests
sapply(d[flu5mperiod %in% seasons[1:4], pcr, with = F], table) #1,104 tests in seasons 1-4

# create any flu variable
pcr <- c('OV_FluA', 'OV_FluB', 'OV_hRhV', 'OV_CoV', 'OV_AdV', 'OV_RSV', 'OV_PIV', 'OV_hMPV', 'OV_H1N1p', 'OV_H1N1s', 'OV_H3N2')
d[, (pcr) := lapply(.SD, function(x) x == 'Yes [1]'), .SDcols = pcr]
pcr_flu <- c('OV_FluA', 'OV_FluB', 'OV_H1N1p', 'OV_H1N1s', 'OV_H3N2')
d[, anyFlu := rowSums(d[, pcr_flu, with = F] == T) > 0]
pcr2 <- c(setdiff(pcr, pcr_flu), 'anyFlu')

# reshape data
strains <- c('NL63', '229E', 'OC43')
d <- melt(d, measure.vars = c(pcr2, strains), variable.name = 'pcr', variable.factor = F, value.name = 'result')

#------------------
# incidence by week
#------------------

inc <- d[weekstart >= as.Date('2007-01-01') & pcr %in% pcr2, .(person_weeks = .N, pos = sum(result)), c('weekstart', 'flu5mperiod', 'pcr')]
pois_ci <- function(x, t) c(rate = x / t, poisson.test(x, t)$conf.int[1:2])
inc_ci <- t(mapply(pois_ci, x = inc$pos, t = inc$person_weeks)) * 100000
colnames(inc_ci) <- c('inc', 'lower', 'upper')
inc <- cbind(inc, inc_ci)
inc <- inc[order(weekstart)]
# weeks with < 100 person-weeks are not included in the article

#-----------------------------------------------------------
# evidence of immunity, based on repeat confirmed infections
#-----------------------------------------------------------

# make dataset
cov <- d[pcr %chin% strains & result == T][order(weekstart)]
dist <- cov[, .N, pcr] # distribution of strains
dist[, prob := N / sum(N)]
cov[, n := rank(weekstart, ties.method = 'last'), p_id]
cov_strain <- dcast(cov, p_id  ~ n, value.var = 'pcr')
setnames(cov_strain, as.character(1:2), c('first', 'second'))
cov_week <- dcast(cov, p_id  ~ n, value.var = 'weekstart')
setnames(cov_week, as.character(1:2), c('date_first', 'date_second'))
cov <- cov_week[cov_strain, on = 'p_id']
cov <- cov[!is.na(second) & date_first != date_second] # only infections that occur in different weeks
cov <- unique(d[, c('p_id', 'p_lh')])[cov, on = 'p_id'] # check households

# simulations
set.seed(27)
B <- 100000
scens <- sample(dist$pcr, B * nrow(cov), replace = T, prob = dist$prob)
scens <- matrix(scens, ncol = B)
sum(colSums(scens == cov$first) == 0) / B # p-value

# check by calculating probability (after observing zero reinfections)
setnames(dist, 'pcr', 'first')
cov <- dist[cov, on = 'first']
prod(1 - cov$prob) # probability of getting different infection second time

#----------------------------
# table 1 (descriptive table)
#----------------------------

# make descriptive variables
d[, total := 'total']
d[, ps := paste0(p_id, '_', flu5mperiod)]
dv <- c('total', 'age_group', 'sex', 'area', 'imd5', 'flu5mperiod', 'mth')

# function for summarising categorical variables
df <- function(var = 'age_group', DAT = d) {
  a <- DAT[pcr == 'OV_CoV', .(individuals = uniqueN(p_id), seasons = uniqueN(ps), person_weeks = .N, isolates = sum(result)), var]
  a <- a[order(get(var))]
  a[, individuals_pc := individuals / sum(individuals)]
  a[, individuals_pc := paste0(format(round(individuals_pc * 100, 1), nsmall = 1, digits = 1), '%')]
  rt <- mapply(pois_ci, x = a$isolates, t = a$person_weeks)
  rt <- rt * 100000
  rt <- t(rt)
  colnames(rt) <- c('rate', 'lower', 'upper')
  rt <- round(rt, 0)
  a <- cbind(a, rt)
  fc <- c('individuals', 'seasons', 'person_weeks', 'rate', 'lower', 'upper')
  a[, (fc) := lapply(.SD, format, big.mark = ','), .SDcols = fc]
  a[, rate := paste0(rate, ' (', lower, '-', upper, ')')]
  a[, individuals := paste0(individuals, ' (', individuals_pc, ')')]
  names(a)[1] <- 'level'
  a[, (fc) := lapply(.SD, stri_replace_all_fixed, pattern = ' ', replacement = ''), .SDcols = fc]
  a[, (fc) := lapply(.SD, stri_replace_all_fixed, pattern = '(', replacement = ' ('), .SDcols = fc]
  a[, variable := c(var, rep('', nrow(a) - 1))]
  a[, c('variable', 'level', 'individuals', 'seasons', 'person_weeks', 'isolates', 'rate')]
}
desc.tab <- rbindlist(lapply(dv, df, DAT = d[flu5mperiod %in% seasons[1:4]]))