# This script calculates the MASI and YLLI indicators for England, 2002-2016, using publicly available data.
# It is written for R version 3.5.1, using the 'data.table' package. You may need to use the command "install.packages('data.table')".
# It shows the general method behind the results in the article:

### Lewer D, Jayatunga W, Aldridge RW, Edge C, Marmot M, Story A, Hayward A. Premature mortality
### attributable to socioeconomic inequality in England between 2003 and 2018: an observational 
### study. 2019. Lancet Public Health. DOI:10.1016/ S2468-2667(19)30219-1

# The mortality and population data is available at: 
# https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/adhocs/007710numberofdeathsandpopulationsindeprivationdecileareasbysexandsingleyearofageenglandandwalesregisteredyears2001to2016
# For ease, we have reformatted the data and uploaded it to a public repository, where it can be read directly using the code provided.
# Results of this code vary slightly from results published in the article due to small differences between pubicly available data and microdata used in the article, and differences in the time period studied.

library(data.table)

#-----------------------------------
# Read population and mortality data
#-----------------------------------

pop <- fread('https://osf.io/37tdf/download')
mort <- fread('https://osf.io/y3e7c/download')

#-------------------------------------------------------
# Combine datasets and group into age groups and periods
#-------------------------------------------------------

# combine

pop <- melt(pop, measure.vars = as.character(0:74), variable.name = 'age', value.name = 'pop', variable.factor = F)
mort <- melt(mort, measure.vars =  as.character(0:74), variable.name = 'age', value.name = 'deaths', variable.factor = F)
d <- mort[pop, on = c('year', 'imd10', 'sex', 'age')]

# age groups

d$age <- as.numeric(d$age)
d$age_group <- findInterval(d$age, c(0, 1, seq(5, 70, 5)))
ages <- c('0', '1-4', paste0(seq(5, 70, 5), '-', seq(9, 74, 5)))
d$age_group <- factor(d$age_group, 1:16, ages)

# period

d <- d[year != 2001]
d$period <- findInterval(d$year, c(2002, 2005, 2008, 2011, 2014))

#---------------
# Calculate MATI
#---------------

# aggregated dataset

d_mati <- d[, .(deaths = sum(deaths), pop = sum(pop)), c('imd10', 'sex', 'age_group', 'period')]

# reference rate

ref <- d_mati[imd10 == 10]
ref$ref_rate <- ref$deaths / ref$pop * 100000
ref <- ref[,c('sex', 'age_group', 'period', 'ref_rate'), with = F]

# calculated expected and attributable

d_mati <- ref[d_mati, on = c('sex', 'age_group', 'period')]
d_mati$expected <- d_mati$pop * d_mati$ref_rate / 100000
d_mati$attributable <- d_mati$deaths - d_mati$expected

# MATI

sum(d_mati$deaths) # 2.4m deaths
sum(d_mati$attributable) # 859k attributable
sum(d_mati$attributable) / sum(d_mati$deaths) # MATI = 36.3%

# MATI by IMD

mati_by_imd <- aggregate(cbind(expected, attributable) ~ imd10, d_mati, sum)

#-----------------------------------
# Create life tables and report YLLI
#-----------------------------------

# function to calculate total YLLs using a life table

total_yll <- function(mx_, cohort = 100000) { # mx_ is vector of mortality rates by age
  mx <- c(mx_, 1)
  n <- length(mx)
  qx <- 2 * mx / (2 + mx)
  lx <- c(1, cumprod(1 - qx))[seq_len(n)] * cohort
  dx <- -c(diff(lx), lx[n] * qx[n])
  dx <- dx[1:(n-1)]
  yll <- dx * (((n-1):1) - 0.5) # assume 74.5 years lost for death age 0; 73.5 years age 1, etc.
  return(sum(yll))
}

# aggregated dataset

d_yli <- aggregate(cbind(deaths, pop) ~ imd10 + sex + age, d, sum)

# mortality rates by age, sex and IMD10

d_yli$mx <- d_yli$deaths / d_yli$pop
d_yli <- split(d_yli$mx, f = list(d_yli$imd10, d_yli$sex))

# YLLs by age, sex and IMD10

d_yli <- sapply(d_yli, total_yll)

# add male and female together, and calculate attributable YLLs, and express per person

d_yli <- d_yli[1:10] + d_yli[11:20]
names(d_yli) <- 1:10
d_yli <- rbind(total = d_yli, expected = d_yli[10])
d_yli <- rbind(d_yli, attributable = d_yli[1,] - d_yli[2,]) / 200000

sum(d_yli[3,]) / sum(d_yli[1,]) # 37% of YLLs attributable to inequality
sum(d_yli[3,]) / 10 # 1.28 YLLs attributable to inequality per person

#---------
# Barplots
#---------

par(mfrow = c(1, 2))

barplot(t(as.matrix(mati_by_imd[10:1,-1])), col = c("#FB9A99", "#E31A1C"), space = 0, ylab = 'Premature deaths, England, 2002-2016', xlab = 'IMD', main = 'MATI')
barplot(d_yli[2:3,10:1], col = c("#FB9A99", "#E31A1C"), space = 0, ylab = 'Mean YLLs per person', xlab = 'IMD', main = 'YLLI')

#------------------------------------------
# Monte-Carlo confidence intervals for MATI
#------------------------------------------

# General function

mc_paf <- function(dat, mod_var, ref, adj_vars, time = 'pop', event = 'deaths', N = 10000, point.estimate = mean, level = 0.95) {
  
  # deaths and mortality rates
  rd <- sapply(dat[,event], function(x) rpois(N, x))
  rd_rate <- t(t(rd) / dat[,time])
  
  # which are the relevant reference groups?
  refn <- dat[,c(mod_var, adj_vars)]
  refn$nc1 <- seq_len(nrow(refn))
  refn_ref <- refn[refn[,mod_var] == ref,]
  refn_ref[,mod_var] <- NULL
  names(refn_ref) <- c(adj_vars, 'nc2')
  refn <- merge(refn, refn_ref, by = adj_vars)
  refn <- refn$nc2[order(refn$nc1)]
  
  # expected
  expected <- t(t(rd_rate[,refn]) * dat[,time])
  
  # attributable
  attributable <- rd - expected
  attributable <- attributable[,dat[,mod_var] != ref] # remove those in reference category
  attributable <- rowSums(attributable)
  
  # PAF and results
  pafs <- attributable / rowSums(rd)
  point <- point.estimate(pafs)
  c(point, quantile(pafs, c((1-level)/2, 1-(1-level)/2)))
  
}

# Calculate 95% confidence intervals

d_mati$imd2 <- ifelse(d_mati$imd10 == 10, 'high', 'low') # just for efficiency. Can also use imd10 with '10' as the reference group
d_mati2 <- aggregate(cbind(deaths, pop) ~ sex + age_group + period + imd2, d_mati, sum)

mc_paf(d_mati2, mod_var = 'imd2', ref = 'high', adj_vars = c('sex', 'age_group', 'period'))
# MATI = 36.3 (95% CI 36.0-36.6)