#################################### ## Example R-script ## ## Mechanism-Based Approach ## ## Written by ## ## Maarten J. Bijlsma ## #################################### ### R-script demonstrating application of the Mechanism-Based Approach ### to APC analysis in a number of setting (both multiplicative and Monte Carlo approaches) # INDEX # 1. Linear regression, data for linear regression, starts at line 19 # 1.a. Linear regression, multiplicative approach, starts at line 53 # 1.b. Linear regression, Monte Carlo integration, starts at line 80 # 2. Logistic model, Monte Carlo integration, starts at line 123 ### 1: Linear regression # we generate data according to an APC model with a continuous outcome n <- 10000 age <- runif(n,1,100) period <- runif(n,0,100) cohort <- period - age # set age and cohort effects on y age.effect <- 2 cohort.effect <- 0.5 # set period effect on mediators m1 and m2 period.effect.m1 <- 1 period.effect.m2 <- 0.25 # set m1 and m2 effects on y m1.effect <- -1.2 m2.effect <- 0.3 # generate m1 and m2 m1 <- 0 + period*period.effect.m1 + rnorm(n,0,1) m2 <- 5 + period*period.effect.m2 + rnorm(n,0,2) # generate y y <- 20 + age*age.effect + m1*m1.effect + m2*m2.effect + cohort*cohort.effect + rnorm(n,0,10) # the effect of period on y should therefore be period.effect <- period.effect.m1*m1.effect + period.effect.m2*m2.effect # put everything together in a dataset lin.dat <- data.frame(cbind(age, period, cohort, m1, m2, y)) ## 1.a Linear regression, estimate APC effects using multipicative approach # first we estimate the effect of period on the mediators period.effect.m1.hat <- lm(m1 ~ period, data=lin.dat)$coef[2] # this value should be close to period.m1.effect period.effect.m2.hat <- lm(m2 ~ period, data=lin.dat)$coef[2] # this value should be close to period.m2.effect # then we estimate the effect of age, m1, m2, cohort on y fit.amc <- lm(y ~ age + m1 + m2 + cohort, data=lin.dat) age.effect.hat <- fit.amc$coef[2] # this value should be close to age.effect m1.effect.hat <- fit.amc$coef[3] # this value should be close to m1.effect m2.effect.hat <- fit.amc$coef[4] # this value should be close to m2.effect cohort.effect.hat <- fit.amc$coef[5] # this value should be close to cohort.effect # (note: and they usually will be due to large n; only due to randomness will they occassionally not be) # we now already possess estimates of age and cohort # we use the multiplication approach to get an estimate of the period effect period.effect.hat <- m1.effect.hat*period.effect.m1.hat+m2.effect.hat*period.effect.m2.hat # this value should be close to the period effect # we could now bootstrap from the original data to get standard errors for our estimates of A, P and C # this is not reproduced here # bias could be determined by re-running this code 1000 times and saving estimates of A, P and C # the average over these estimates for age, period and cohort respectively should be # equal to the age, period and cohort values that we set (hence, unbiased) ## 1.a Linear regression, estimate APC effects using Monte Carlo intergration # when using linear or probit regression, we could use multiplication approach # but we use here the Monte Carlo approach to demonstrate that the outcome will # in this setting, be the same if a Monte Carlo approach is used instead # the first two steps are the same as in the multiplication approach: # first we estimate the effect of period on the mediators period.effect.m1.fit <- lm(m1 ~ period, data=lin.dat) period.effect.m2.fit <- lm(m2 ~ period, data=lin.dat) # then we estimate the effect of age, m1, m2, cohort on y fit.amc <- lm(y ~ age + m1 + m2 + cohort, data=lin.dat) # generate values for a, p and c # first we determine the empirical range of a, p and c arange <- range(lin.dat$age) prange <- range(lin.dat$period) crange <- range(lin.dat$cohort) a.sim <- runif(n,arange[1],arange[2]) # we use the same size as n, but this is not a necessity p.sim <- runif(n,prange[1],prange[2]) # also, we use uniform distributions here to generate a, p and c but this is also not required c.sim <- runif(n,crange[1],crange[2]) # as can be seen, we generate a, p and c independently (a = p - c does NOT hold for these generated values) # generate values for mediators m1.sim <- period.effect.m1.fit$coef[1] + period.effect.m1.fit$coef[2]*p.sim + rnorm(n,0,summary(period.effect.m1.fit)$sigma) m2.sim <- period.effect.m2.fit$coef[1] + period.effect.m2.fit$coef[2]*p.sim + rnorm(n,0,summary(period.effect.m2.fit)$sigma) # then we generate y # using our estimated coefficients and the generated a, m and c values y.sim <- fit.amc$coef[1] + fit.amc$coef[2]*a.sim + fit.amc$coef[3]*m1.sim + fit.amc$coef[4]*m2.sim + fit.amc$coef[5]*c.sim + rnorm(n,0,summary(fit.amc)$sigma) # and then we can estimate the effects of a, p and c directly lm(y.sim ~ a.sim + p.sim + c.sim) # these estimates should be very close to age.effect, period.effect and cohort.effect # but will vary somewhat due to random variation # reproducing them many times and averaging will show 0 bias # again, we could now bootstrap from the original data to get standard errors for our estimates of A, P and C # this is not reproduced here ## 2. Logit model, estimating APC effects through Monte Carlo integration # define expit and logit functions expit <- function(a) {exp(a)/(exp(a)+1)} logit <- function(p) {log(p/(1-p))} # generate probit data n <- 10000 age <- runif(n,0,1) period <- runif(n,0,1) cohort <- period - age # set age and cohort effects on y age.effect <- 2 cohort.effect <- 0.5 # set period effect on mediators m1 and m2 period.effect.m1 <- 1 period.effect.m2 <- 0.25 # set m1 and m2 effects on y m1.effect <- -1.2 m2.effect <- 0.3 # generate m1 and m2 m1 <- rbinom(n,1,expit(0 + period*period.effect.m1)) m2 <- rbinom(n,1,expit(0.5 + period*period.effect.m2)) # generate y y <- rbinom(n,1,expit((-1 + age*age.effect + m1*m1.effect + m2*m2.effect + cohort*cohort.effect))) # put everything together in a dataset pro.dat <- data.frame(cbind(age, period, cohort, m1, m2, y)) # now we will estimate using Monte Carlo # first we estimate the effect of period on the mediators period.effect.m1.fit <- glm(m1 ~ period, family=binomial, data=pro.dat) period.effect.m2.fit <- glm(m2 ~ period, family=binomial, data=pro.dat) # then we estimate the effect of age, m1, m2, cohort on y fit.amc <- glm(y ~ age + m1 + m2 + cohort, family=binomial, data=pro.dat) # generate values for a, p and c # first we determine the empirical range of a, p and c arange <- range(pro.dat$age) prange <- range(pro.dat$period) crange <- range(pro.dat$cohort) a.sim <- runif(n,arange[1],arange[2]) # we use the same size as n, but this is not a necessity p.sim <- runif(n,prange[1],prange[2]) # also, we use uniform distributions here to generate a, p and c but this is also not required c.sim <- runif(n,crange[1],crange[2]) # as can be seen, we generate a, p and c independently (a = p - c does NOT hold for these generated values) # generate values for mediators m1.sim <- rbinom(n,1,expit(period.effect.m1.fit$coef[1] + period.effect.m1.fit$coef[2]*p.sim)) m2.sim <- rbinom(n,1,expit(period.effect.m2.fit$coef[1] + period.effect.m2.fit$coef[2]*p.sim)) # then we generate y # using our estimated coefficients and the generated a, m and c values y.sim <- rbinom(n,1,expit(fit.amc$coef[1] + fit.amc$coef[2]*a.sim + fit.amc$coef[3]*m1.sim + fit.amc$coef[4]*m2.sim + fit.amc$coef[5]*c.sim)) # and then we can estimate the effects of a, p and c directly summary(glm(y.sim ~ a.sim + p.sim + c.sim, family=binomial)) # this should actually be done multiple times (e.g. a 1000 times) # and then the estimates from the above model should be averaged # over the iterations, to reduce Monte Carlo error (see main manuscript) # these estimates will be unbiased, but not the same as the 'set' values due to # the logistic (non-linear) transformations