Chapter 9 Advanced Analysis Modules

This chapter covers the advanced statistical analysis modules in UKBAnalytica, including subgroup analysis, propensity score methods, mediation analysis, multiple imputation pooling, and machineine learning is covered in Section 9.9 — including subgroup analysis, propensity score methods, mediation analysis, multiple imputation pooling, and machine learning.

9.1 Overview

Module File Purpose
Subgroup Analysis subgroup.R Stratified analysis with interaction p-values
Propensity Score propensity.R Propensity score matching (PSM) and inverse probability weighting (IPTW)
Mediation Analysis mediation.R Causal mediation (based on regmedint)
MI Pooling mi_pool.R Multiple imputation result combining
Visualization visualization.R Forest plots, K-M curves, diagnostic plots

9.2 Subgroup Analysis

Subgroup (stratified) analysis examines whether treatment effects vary across patient subgroups and calculates interaction p-values.

9.2.1 Basic Usage

library(UKBAnalytica)

# Run subgroup analysis by age group
# Cox model: `endpoint` supplies the time/status column names; `outcome`
# is also passed here but is used by the logistic/linear model types
# (see the parameter table in 9.2.3).
result <- run_subgroup_analysis(
  data = dt,
  exposure = "treatment",
  outcome = "event",
  subgroup_var = "age_group",
  covariates = c("sex", "bmi"),
  model_type = "cox",
  endpoint = c("time", "status")
)

# One row per subgroup level; p_interaction is identical across rows
# because it is a single test of effect heterogeneity across subgroups.
print(result)
#>     subgroup  n n_event    HR lower95 upper95  pvalue p_interaction
#> 1  <50 years 500     45  1.25    0.85    1.84   0.256         0.032
#> 2 50-65 years 800     92  1.58    1.12    2.23   0.009         0.032
#> 3  >65 years 400     78  0.89    0.62    1.28   0.532         0.032

9.2.2 Multiple Subgroup Variables

# Analyze multiple subgroup variables at once
# Same arguments as run_subgroup_analysis(), but subgroup_vars takes a
# character vector and the result stacks all subgroup analyses together.
results <- run_multi_subgroup(
  data = dt,
  exposure = "treatment",
  outcome = "event",
  subgroup_vars = c("age_group", "sex", "smoking_status"),
  covariates = c("bmi", "diabetes"),
  model_type = "cox",
  endpoint = c("time", "status")
)

# Create forest plot of the stacked subgroup results
plot_forest(results, title = "Subgroup Analysis: Treatment Effect")

9.2.3 Parameters

Parameter Description
exposure Treatment/exposure variable name
outcome Outcome variable (for logistic/linear)
subgroup_var Variable defining subgroups
model_type "cox", "logistic", or "linear"
endpoint For Cox: c("time_var", "status_var")
ref_level Reference level for subgroup variable

9.3 Propensity Score Methods

Propensity score methods help control for confounding in observational studies.

9.3.1 Estimate Propensity Scores

# Estimate propensity scores using logistic regression
ps_result <- estimate_propensity_score(
  data = dt,
  treatment = "treatment",
  covariates = c("age", "sex", "bmi", "smoking", "diabetes"),
  method = "logistic"  # or "gbm" for gradient boosting
)

# Add PS to data
# NOTE(review): assumes ps_result$ps is row-aligned with dt (same order,
# same length) — confirm this holds when dt contains missing covariates.
dt$ps <- ps_result$ps

# Visualize PS distribution (overlap between treatment groups)
plot_ps_distribution(ps_result, type = "histogram")
plot_ps_distribution(ps_result, type = "density")

9.3.2 Propensity Score Matching (PSM)

# Perform 1:1 nearest neighbor matching
matched <- match_propensity(
  data = dt,
  treatment = "treatment",
  ps = dt$ps,
  ratio = 1,
  caliper = 0.2,  # 0.2 * SD of PS
  method = "nearest"
)

# Check covariate balance (standardized mean differences) on the
# matched sample returned in matched$matched_data
balance <- assess_balance(
  data = matched$matched_data,
  treatment = "treatment",
  covariates = c("age", "sex", "bmi", "smoking", "diabetes")
)
print(balance)

# Visualize balance improvement
plot_balance(balance, title = "Covariate Balance: Before vs After Matching")

9.3.3 Inverse Probability of Treatment Weighting (IPTW)

# Calculate IPTW weights
# Weight formulas for each type are listed in the table in 9.3.4.
dt$weight <- calculate_weights(
  ps = dt$ps,
  treatment = dt$treatment,
  type = "ate"  # or "att", "atc"
)

# Run weighted analysis (weights column is referenced by name)
result <- run_weighted_analysis(
  data = dt,
  treatment = "treatment",
  outcome = "event",
  weights = "weight",
  model_type = "cox",
  endpoint = c("time", "status")
)
print(result)

9.3.4 Weight Types

Type Formula Use Case
"ate" T/PS + (1-T)/(1-PS) Average Treatment Effect
"att" T + (1-T)*PS/(1-PS) Effect on Treated
"atc" T*(1-PS)/PS + (1-T) Effect on Controls

9.4 Mediation Analysis

Causal mediation analysis decomposes the total effect into direct and indirect effects, using the regmedint package as the backend.

9.4.1 Basic Mediation Analysis

# Run mediation analysis
med_result <- run_mediation(
  data = dt,
  exposure = "treatment",
  mediator = "inflammation",
  outcome = "cvd_event",
  covariates = c("age", "sex", "bmi"),
  outcome_type = "cox",        # "linear", "logistic", or "cox"
  mediator_type = "linear",    # "linear" or "logistic"
  event_var = "status",        # For Cox models
  time_var = "time",           # For Cox models
  a0 = 0, a1 = 1,              # Exposure contrast
  m_cde = 0,                   # Mediator value for CDE
  boot = TRUE,                 # Bootstrap CIs
  nboot = 1000
)

# View results
# Effect definitions are listed in 9.4.2; note PM = TNIE / TE
# (here 0.156 / 0.354 = 44.1%).
summary(med_result)
#> Mediation Analysis Results
#> ==========================
#> Exposure: treatment (0 vs 1)
#> Mediator: inflammation
#> Outcome: cvd_event (Cox model)
#>
#>         Effect  Estimate   Std.Err   95% CI          P-value
#>         CDE     0.234      0.089     [0.060, 0.408]  0.008
#>         PNDE    0.198      0.082     [0.037, 0.359]  0.016
#>         TNIE    0.156      0.045     [0.068, 0.244]  <0.001
#>         TE      0.354      0.095     [0.168, 0.540]  <0.001
#>         PM      44.1%      -         -               -

9.4.2 Effect Definitions

Effect Description
CDE Controlled Direct Effect (setting M=m)
PNDE Pure Natural Direct Effect
TNIE Total Natural Indirect Effect
TE Total Effect (PNDE + TNIE)
PM Proportion Mediated (TNIE/TE)

9.4.3 Multiple Mediators

# Test multiple mediators
# NOTE(review): it is not shown here whether mediators are analyzed one
# at a time or adjusted for jointly — confirm in run_multi_mediator() docs.
multi_results <- run_multi_mediator(
  data = dt,
  exposure = "treatment",
  mediators = c("inflammation", "oxidative_stress", "insulin_resistance"),
  outcome = "cvd_event",
  covariates = c("age", "sex"),
  outcome_type = "cox",
  event_var = "status",
  time_var = "time"
)

# Forest plot of indirect effects (TNIE, defined in 9.4.2)
plot_mediation_forest(multi_results, effect_type = "tnie")

9.4.4 Visualization

# Three views of the same fitted mediation object:

# Effect bar chart
plot_mediation(med_result, type = "effects")

# Effect decomposition
plot_mediation(med_result, type = "decomposition")

# Path diagram
plot_mediation(med_result, type = "path")

9.5 Multiple Imputation Pooling

When using multiple imputation for missing data, results from each imputed dataset must be combined using Rubin’s Rules.

9.5.1 Basic Usage

# Assume mi_datasets is a list of imputed datasets from mice
# mi_data <- mice(original_data, m = 20)
# mi_datasets <- complete(mi_data, action = "all")

# Method 1: Fit models first, then pool
# coxph() and Surv() come from the survival package (library(survival)).
models <- lapply(mi_datasets, function(d) {
  coxph(Surv(time, status) ~ treatment + age + sex, data = d)
})

# Combine the per-imputation fits with Rubin's rules; the FMI column
# reports the fraction of missing information per coefficient (see 9.5.6).
pooled <- pool_mi_models(models, model_type = "cox")
summary(pooled)
#> Multiple Imputation Results (Rubin's Rules)
#> ============================================
#> Number of imputations: 20
#> Model type: cox
#>
#>   Term          HR     Std.Err   95% CI            p-value   FMI
#>   treatment     1.456  0.156     [1.072, 1.978]    0.019     7.8%
#>   age           1.023  0.008     [1.007, 1.039]    0.004     4.5%
#>   sex           0.856  0.112     [0.687, 1.067]    0.168     5.2%

9.5.2 One-Step Fitting and Pooling

# Method 2: Fit and pool in one call
# Pass datasets + formula instead of pre-fitted models.
pooled <- pool_mi_models(
  datasets = mi_datasets,
  formula = Surv(time, status) ~ treatment + age + sex + bmi,
  model_type = "cox"
)

# Logistic regression example
pooled_logit <- pool_mi_models(
  datasets = mi_datasets,
  formula = diabetes ~ treatment + age + sex + bmi,
  model_type = "logistic",
  exponentiate = TRUE  # Output OR
)

9.5.3 Supported Model Types

Model Function Output
"lm" lm() Beta coefficients
"logistic" glm(family=binomial) Odds Ratio
"poisson" glm(family=poisson) Rate Ratio
"cox" coxph() Hazard Ratio
"negbin" glm.nb() Rate Ratio

9.5.4 Custom Estimates Pooling

# Pool any custom estimates (not just regression)
# Per-imputation difference in group means of the outcome
mean_diffs <- lapply(mi_datasets, function(d) {
  mean(d$outcome[d$treatment == 1]) - mean(d$outcome[d$treatment == 0])
})

# Estimate variances: two-sample variance of a difference in means,
# var1/n1 + var0/n0
vars <- lapply(mi_datasets, function(d) {
  n1 <- sum(d$treatment == 1)
  n0 <- sum(d$treatment == 0)
  var(d$outcome[d$treatment == 1])/n1 + var(d$outcome[d$treatment == 0])/n0
})

# NOTE(review): scalar variances are wrapped as 1x1 matrices, which
# suggests pool_custom_estimates() expects variance-covariance matrices
# — confirm against the function's documentation.
pooled_diff <- pool_custom_estimates(
  estimates = mean_diffs,
  variances = lapply(vars, function(v) matrix(v)),
  labels = "Mean Difference"
)
summary(pooled_diff)

9.5.5 Visualization

# Forest plot of pooled estimates
# exponentiate = TRUE displays ratio-scale estimates (e.g. HR/OR)
plot_mi_pooled(pooled, exponentiate = TRUE)

# Diagnostic plot: Fraction of Missing Information (see 9.5.6)
plot_mi_diagnostics(pooled, type = "fmi")

# Variance ratio diagnostic
plot_mi_diagnostics(pooled, type = "variance_ratio")

9.5.6 Understanding FMI

The Fraction of Missing Information (FMI) indicates how much the missing data affects each estimate:

FMI Level Interpretation
< 10% Low impact from missing data
10-30% Moderate impact
30-50% High impact
> 50% Very high impact; consider more imputations

9.6 Visualization Functions

9.6.1 Forest Plot

# From subgroup analysis
# null_value = 1 draws the no-effect reference line for ratio measures;
# log_scale = TRUE spaces the axis logarithmically (appropriate for HRs).
plot_forest(
  subgroup_results,
  estimate_col = "HR",
  lower_col = "lower95",
  upper_col = "upper95",
  null_value = 1,
  log_scale = TRUE,
  title = "Subgroup Analysis Forest Plot"
)

9.6.2 Kaplan-Meier Curves

# Kaplan-Meier curves by treatment group; risk_table and conf_int
# presumably add a number-at-risk table and confidence bands — see the
# function documentation for details.
plot_km_curve(
  data = dt,
  time = "surv_time",
  status = "status",
  group = "treatment",
  title = "Survival by Treatment Group",
  risk_table = TRUE,
  conf_int = TRUE
)

9.6.3 Propensity Score Plots

# PS distribution (uses the object returned by estimate_propensity_score)
plot_ps_distribution(ps_result, type = "density")

# Balance plot (uses the object returned by assess_balance)
plot_balance(balance_result, title = "SMD Before/After Matching")

# Calibration plot: predicted PS vs observed treatment indicator
plot_calibration(predicted = dt$ps, observed = dt$treatment)

9.7 Integration Example: Complete Workflow

Here’s a complete example combining multiple modules:

library(UKBAnalytica)
library(mice)
library(survival)

# 1. Multiple imputation (using mice); "pmm" = predictive mean matching
mi_data <- mice(original_data, m = 20, method = "pmm")
mi_datasets <- complete(mi_data, action = "all")

# 2. Propensity score weighting in each imputation
weighted_models <- lapply(mi_datasets, function(d) {
  # Estimate PS
  ps <- estimate_propensity_score(d, "treatment", 
                                   c("age", "sex", "bmi", "smoking"))$ps
  # Calculate weights
  d$weight <- calculate_weights(ps, d$treatment, "ate")
  
  # Fit weighted Cox model
  # `weights = weight` is resolved inside `data = d` by coxph()
  coxph(Surv(time, status) ~ treatment, data = d, weights = weight)
})

# 3. Pool results across imputations with Rubin's rules
pooled <- pool_mi_models(weighted_models, model_type = "cox")
summary(pooled)

# 4. Subgroup analysis (on complete-case for simplicity)
subgroup_results <- run_multi_subgroup(
  data = original_data[complete.cases(original_data), ],
  exposure = "treatment",
  outcome = "event",
  subgroup_vars = c("age_group", "sex", "diabetes"),
  model_type = "cox",
  endpoint = c("time", "status")
)

# 5. Visualize
plot_forest(subgroup_results)
plot_mi_pooled(pooled)

9.8 Summary

This chapter covered five advanced analysis modules:

  1. Subgroup Analysis: run_subgroup_analysis(), run_multi_subgroup()
  2. Propensity Score: estimate_propensity_score(), match_propensity(), calculate_weights()
  3. Mediation Analysis: run_mediation(), run_multi_mediator()
  4. MI Pooling: pool_mi_models(), pool_custom_estimates()
  5. Machine Learning: ukb_ml_model(), ukb_shap(), ukb_ml_survival() (detailed in Section 9.9)

Each module includes dedicated visualization functions and integrates with the standard UKBAnalytica workflow.


9.9 Machine Learning Module

The ML module provides a unified interface for training, evaluating, and interpreting machine learning models for UK Biobank data analysis.

9.9.1 Supported Models

Model Type Use Case
rf Random Forest General classification/regression
xgboost Gradient Boosting High-performance prediction
glmnet Elastic Net Feature selection, regularization
svm Support Vector Machine Complex boundaries
nnet Neural Network Non-linear patterns
logistic Logistic/Linear Baseline comparison

9.9.2 Basic Usage

library(UKBAnalytica)

# Train a Random Forest classifier
# split_ratio = 0.8 presumably reserves 80% for training; seed fixes the
# split for reproducibility — confirm in ukb_ml_model() docs.
ml_rf <- ukb_ml_model(
  diabetes ~ age + bmi + sbp + smoking + alcohol,
  data = ukb_data,
  model = "rf",
  task = "classification",
  split_ratio = 0.8,
  seed = 42
)

# View results (test-set AUC reported for classification tasks)
print(ml_rf)
#> UKB Machine Learning Model
#> ==========================
#> Model: Random Forest
#> Task: classification
#> Test AUC: 0.782

# Variable importance
ukb_ml_importance(ml_rf)

9.9.3 Model Evaluation

# Detailed metrics with confidence intervals
# ci_method = "delong" refers to the DeLong method for AUC CIs (via pROC)
metrics <- ukb_ml_metrics(ml_rf, ci = TRUE, ci_method = "delong")

# ROC curve
roc_result <- ukb_ml_roc(ml_rf)
print(roc_result)

# Calibration curve (predicted probability vs observed frequency)
cal_result <- ukb_ml_calibration(ml_rf)
plot(cal_result)

# Confusion matrix at a fixed classification threshold
cm_result <- ukb_ml_confusion(ml_rf, threshold = 0.5)
print(cm_result)

9.9.4 Cross-Validation

# 5-fold cross-validation, repeated 3 times (15 fits total)
cv_result <- ukb_ml_cv(
  diabetes ~ age + bmi + sbp + smoking,
  data = ukb_data,
  model = "rf",
  task = "classification",
  folds = 5,
  repeats = 3,
  seed = 42
)

# Metrics are summarized as mean +/- SD across folds/repeats
print(cv_result)
#> Metrics (Mean +/- SD):
#>   auc: 0.781 +/- 0.023
#>   accuracy: 0.742 +/- 0.018

9.9.5 Model Comparison

# Train multiple models with the same formula/data so the comparison
# is apples-to-apples
ml_rf <- ukb_ml_model(outcome ~ ., data, model = "rf")
ml_xgb <- ukb_ml_model(outcome ~ ., data, model = "xgboost")
ml_lasso <- ukb_ml_model(outcome ~ ., data, model = "glmnet")

# Compare performance across the fitted models
comparison <- ukb_ml_compare(ml_rf, ml_xgb, ml_lasso)
print(comparison)
plot(comparison)

9.9.6 SHAP Interpretation

# Requires: install.packages("fastshap")

# Compute SHAP values
# sample_n subsamples rows and nsim sets the number of Monte Carlo
# replicates — both trade accuracy for speed (see fastshap docs).
shap <- ukb_shap(ml_rf, sample_n = 1000, nsim = 100)
print(shap)

# Summary plot (global importance)
plot_shap_summary(shap, type = "beeswarm")
plot_shap_summary(shap, type = "bar")

# Dependence plot (single feature, colored by a second feature)
plot_shap_dependence(shap, feature = "age", color_feature = "sex")

# Force plot (single observation)
plot_shap_force(shap, row_id = 1, max_features = 10)

9.9.7 Survival ML Models

# Requires: install.packages("randomForestSRC")

# Random Survival Forest
# params is forwarded to the backend; ntree = number of trees
surv_rf <- ukb_ml_survival(
  Surv(time, event) ~ age + sex + bmi + smoking + diabetes,
  data = ukb_data,
  model = "rsf",
  params = list(ntree = 500)
)

# Discrimination is reported as the test-set C-index
print(surv_rf)
#> Test C-index: 0.752

# Predict survival probabilities at specific time points
pred <- ukb_ml_survival_predict(surv_rf, times = c(1, 3, 5, 10))

# Variable importance
ukb_ml_survival_importance(surv_rf)

# SHAP for survival model, evaluated at a single time point
surv_shap <- ukb_ml_survival_shap(surv_rf, time_point = 5, sample_n = 500)
plot_shap_summary(surv_shap)

9.9.8 Integration with Other Modules

# Use ML for propensity score estimation
ps_ml <- ukb_ml_model(
  treatment ~ age + sex + bmi + smoking,
  data = ukb_data,
  model = "xgboost",
  task = "classification",
  verbose = FALSE
)

# Extract predicted probabilities as PS
# NOTE(review): assumes type = "prob" returns a two-column matrix with
# P(treatment = 1) in column 2 — confirm against ukb_ml_predict() docs.
ukb_data$ps_ml <- ukb_ml_predict(ps_ml, newdata = ukb_data, type = "prob")[, 2]

# Use for IPTW analysis
# Hand-rolled ATE weights: 1/PS for treated, 1/(1-PS) for controls
# (same formula as calculate_weights(type = "ate"), see 9.3.4)
ukb_data$weight <- ifelse(
  ukb_data$treatment == 1,
  1 / ukb_data$ps_ml,
  1 / (1 - ukb_data$ps_ml)
)

9.9.9 Required Packages

Function Required Package
ukb_ml_model(model = "rf") ranger
ukb_ml_model(model = "xgboost") xgboost
ukb_ml_model(model = "glmnet") glmnet
ukb_ml_model(model = "svm") e1071
ukb_ml_model(model = "nnet") nnet
ukb_ml_roc() pROC
ukb_shap() fastshap
ukb_ml_survival(model = "rsf") randomForestSRC

Install with:

# One-time install of the ML-module dependencies (not needed per session)
install.packages(c("ranger", "xgboost", "glmnet", "pROC", "fastshap", "randomForestSRC"))

9.9.10 Required Packages (All Modules)

Module Required Package
Propensity Score Matching MatchIt
GBM for PS gbm
Mediation Analysis regmedint
MI Pooling mitools
Cox Models survival
Random Forest ranger
XGBoost xgboost
Elastic Net glmnet
SVM e1071
Neural Network nnet
ROC Analysis pROC
SHAP fastshap
Survival ML randomForestSRC

Install all with:

# One-time install of every optional dependency used in this chapter
install.packages(c(
  "MatchIt", "gbm", "regmedint", "mitools", "survival",
  "ranger", "xgboost", "glmnet", "e1071", "nnet",
  "pROC", "fastshap", "randomForestSRC"
))