Chapter 9 Advanced Analysis Modules

This chapter covers the advanced statistical analysis modules in UKBAnalytica, including subgroup analysis, propensity score methods, mediation analysis, multiple imputation pooling, and machineine learning is covered in Section 9.9 — including subgroup analysis, propensity score methods, mediation analysis, multiple imputation pooling, and machine learning.

9.1 Overview

Module File Purpose
Subgroup Analysis subgroup.R Stratified analysis with interaction p-values
Propensity Score propensity.R Propensity score matching (PSM) and inverse probability weighting (IPTW)
Mediation Analysis mediation.R Causal mediation (based on regmedint)
MI Pooling mi_pool.R Multiple imputation result combining
Visualization visualization.R Forest plots, K-M curves, diagnostic plots

9.2 Subgroup Analysis

Subgroup (stratified) analysis examines whether treatment effects vary across patient subgroups and calculates interaction p-values.

9.2.1 Basic Usage

library(UKBAnalytica)

# Run subgroup analysis by age group
# Cox model: `endpoint` supplies the time/status column names; `outcome`
# is also passed here but is used by the logistic/linear model types
# (see the parameter table in 9.2.3).
result <- run_subgroup_analysis(
  data = dt,
  exposure = "treatment",
  outcome = "event",
  subgroup_var = "age_group",
  covariates = c("sex", "bmi"),
  model_type = "cox",
  endpoint = c("time", "status")
)

# One row per subgroup level; p_interaction is identical across rows
# because it is a single test of effect heterogeneity across subgroups.
print(result)
#>     subgroup  n n_event    HR lower95 upper95  pvalue p_interaction
#> 1  <50 years 500     45  1.25    0.85    1.84   0.256         0.032
#> 2 50-65 years 800     92  1.58    1.12    2.23   0.009         0.032
#> 3  >65 years 400     78  0.89    0.62    1.28   0.532         0.032

9.2.2 Multiple Subgroup Variables

# Analyze multiple subgroup variables at once
# Same arguments as run_subgroup_analysis(), but subgroup_vars takes a
# character vector and the result stacks all subgroup analyses together.
results <- run_multi_subgroup(
  data = dt,
  exposure = "treatment",
  outcome = "event",
  subgroup_vars = c("age_group", "sex", "smoking_status"),
  covariates = c("bmi", "diabetes"),
  model_type = "cox",
  endpoint = c("time", "status")
)

# Create forest plot of the stacked subgroup results
plot_forest(results, title = "Subgroup Analysis: Treatment Effect")

9.2.3 Parameters

Parameter Description
exposure Treatment/exposure variable name
outcome Outcome variable (for logistic/linear)
subgroup_var Variable defining subgroups
model_type "cox", "logistic", or "linear"
endpoint For Cox: c("time_var", "status_var")
ref_level Reference level for subgroup variable

9.3 Propensity Score Methods

Propensity score methods help control for confounding in observational studies.

9.3.1 Estimate Propensity Scores

# Estimate propensity scores using logistic regression
ps_result <- estimate_propensity_score(
  data = dt,
  treatment = "treatment",
  covariates = c("age", "sex", "bmi", "smoking", "diabetes"),
  method = "logistic"  # or "gbm" for gradient boosting
)

# Add PS to data
# NOTE(review): assumes ps_result$ps is row-aligned with dt (same order,
# same length) — confirm this holds when dt contains missing covariates.
dt$ps <- ps_result$ps

# Visualize PS distribution (overlap between treatment groups)
plot_ps_distribution(ps_result, type = "histogram")
plot_ps_distribution(ps_result, type = "density")

9.3.2 Propensity Score Matching (PSM)

# Perform 1:1 nearest neighbor matching
matched <- match_propensity(
  data = dt,
  treatment = "treatment",
  ps = dt$ps,
  ratio = 1,
  caliper = 0.2,  # 0.2 * SD of PS
  method = "nearest"
)

# Check covariate balance (standardized mean differences) on the
# matched sample returned in matched$matched_data
balance <- assess_balance(
  data = matched$matched_data,
  treatment = "treatment",
  covariates = c("age", "sex", "bmi", "smoking", "diabetes")
)
print(balance)

# Visualize balance improvement
plot_balance(balance, title = "Covariate Balance: Before vs After Matching")

9.3.3 Inverse Probability of Treatment Weighting (IPTW)

# Calculate IPTW weights
# Weight formulas for each type are listed in the table in 9.3.4.
dt$weight <- calculate_weights(
  ps = dt$ps,
  treatment = dt$treatment,
  type = "ate"  # or "att", "atc"
)

# Run weighted analysis (weights column is referenced by name)
result <- run_weighted_analysis(
  data = dt,
  treatment = "treatment",
  outcome = "event",
  weights = "weight",
  model_type = "cox",
  endpoint = c("time", "status")
)
print(result)

9.3.4 Weight Types

Type Formula Use Case
"ate" T/PS + (1-T)/(1-PS) Average Treatment Effect
"att" T + (1-T)*PS/(1-PS) Effect on Treated
"atc" T*(1-PS)/PS + (1-T) Effect on Controls

9.4 Mediation Analysis

Causal mediation analysis decomposes the total effect into direct and indirect effects, using the regmedint package as the backend.

9.4.1 Basic Mediation Analysis

# Run mediation analysis
med_result <- run_mediation(
  data = dt,
  exposure = "treatment",
  mediator = "inflammation",
  outcome = "cvd_event",
  covariates = c("age", "sex", "bmi"),
  outcome_type = "cox",        # "linear", "logistic", or "cox"
  mediator_type = "linear",    # "linear" or "logistic"
  event_var = "status",        # For Cox models
  time_var = "time",           # For Cox models
  a0 = 0, a1 = 1,              # Exposure contrast
  m_cde = 0,                   # Mediator value for CDE
  boot = TRUE,                 # Bootstrap CIs
  nboot = 1000
)

# View results
# Effect definitions are listed in 9.4.2; note PM = TNIE / TE
# (here 0.156 / 0.354 = 44.1%).
summary(med_result)
#> Mediation Analysis Results
#> ==========================
#> Exposure: treatment (0 vs 1)
#> Mediator: inflammation
#> Outcome: cvd_event (Cox model)
#>
#>         Effect  Estimate   Std.Err   95% CI          P-value
#>         CDE     0.234      0.089     [0.060, 0.408]  0.008
#>         PNDE    0.198      0.082     [0.037, 0.359]  0.016
#>         TNIE    0.156      0.045     [0.068, 0.244]  <0.001
#>         TE      0.354      0.095     [0.168, 0.540]  <0.001
#>         PM      44.1%      -         -               -

9.4.2 Effect Definitions

Effect Description
CDE Controlled Direct Effect (setting M=m)
PNDE Pure Natural Direct Effect
TNIE Total Natural Indirect Effect
TE Total Effect (PNDE + TNIE)
PM Proportion Mediated (TNIE/TE)

9.4.3 Multiple Mediators

# Test multiple mediators
# NOTE(review): it is not shown here whether mediators are analyzed one
# at a time or adjusted for jointly — confirm in run_multi_mediator() docs.
multi_results <- run_multi_mediator(
  data = dt,
  exposure = "treatment",
  mediators = c("inflammation", "oxidative_stress", "insulin_resistance"),
  outcome = "cvd_event",
  covariates = c("age", "sex"),
  outcome_type = "cox",
  event_var = "status",
  time_var = "time"
)

# Forest plot of indirect effects (TNIE, defined in 9.4.2)
plot_mediation_forest(multi_results, effect_type = "tnie")

9.4.4 Visualization

# Three views of the same fitted mediation object:

# Effect bar chart
plot_mediation(med_result, type = "effects")

# Effect decomposition
plot_mediation(med_result, type = "decomposition")

# Path diagram
plot_mediation(med_result, type = "path")

9.5 Multiple Imputation Pooling

When using multiple imputation for missing data, results from each imputed dataset must be combined using Rubin’s Rules.

9.5.1 Basic Usage

# Assume mi_datasets is a list of imputed datasets from mice
# mi_data <- mice(original_data, m = 20)
# mi_datasets <- complete(mi_data, action = "all")

# Method 1: Fit models first, then pool
# coxph() and Surv() come from the survival package (library(survival)).
models <- lapply(mi_datasets, function(d) {
  coxph(Surv(time, status) ~ treatment + age + sex, data = d)
})

# Combine the per-imputation fits with Rubin's rules; the FMI column
# reports the fraction of missing information per coefficient (see 9.5.6).
pooled <- pool_mi_models(models, model_type = "cox")
summary(pooled)
#> Multiple Imputation Results (Rubin's Rules)
#> ============================================
#> Number of imputations: 20
#> Model type: cox
#>
#>   Term          HR     Std.Err   95% CI            p-value   FMI
#>   treatment     1.456  0.156     [1.072, 1.978]    0.019     7.8%
#>   age           1.023  0.008     [1.007, 1.039]    0.004     4.5%
#>   sex           0.856  0.112     [0.687, 1.067]    0.168     5.2%

9.5.2 One-Step Fitting and Pooling

# Method 2: Fit and pool in one call
# Pass datasets + formula instead of pre-fitted models.
pooled <- pool_mi_models(
  datasets = mi_datasets,
  formula = Surv(time, status) ~ treatment + age + sex + bmi,
  model_type = "cox"
)

# Logistic regression example
pooled_logit <- pool_mi_models(
  datasets = mi_datasets,
  formula = diabetes ~ treatment + age + sex + bmi,
  model_type = "logistic",
  exponentiate = TRUE  # Output OR
)

9.5.3 Supported Model Types

Model Function Output
"lm" lm() Beta coefficients
"logistic" glm(family=binomial) Odds Ratio
"poisson" glm(family=poisson) Rate Ratio
"cox" coxph() Hazard Ratio
"negbin" glm.nb() Rate Ratio

9.5.4 Custom Estimates Pooling

# Pool any custom estimates (not just regression)
# Per-imputation difference in group means of the outcome
mean_diffs <- lapply(mi_datasets, function(d) {
  mean(d$outcome[d$treatment == 1]) - mean(d$outcome[d$treatment == 0])
})

# Estimate variances: two-sample variance of a difference in means,
# var1/n1 + var0/n0
vars <- lapply(mi_datasets, function(d) {
  n1 <- sum(d$treatment == 1)
  n0 <- sum(d$treatment == 0)
  var(d$outcome[d$treatment == 1])/n1 + var(d$outcome[d$treatment == 0])/n0
})

# NOTE(review): scalar variances are wrapped as 1x1 matrices, which
# suggests pool_custom_estimates() expects variance-covariance matrices
# — confirm against the function's documentation.
pooled_diff <- pool_custom_estimates(
  estimates = mean_diffs,
  variances = lapply(vars, function(v) matrix(v)),
  labels = "Mean Difference"
)
summary(pooled_diff)

9.5.5 Visualization

# Forest plot of pooled estimates
# exponentiate = TRUE displays ratio-scale estimates (e.g. HR/OR)
plot_mi_pooled(pooled, exponentiate = TRUE)

# Diagnostic plot: Fraction of Missing Information (see 9.5.6)
plot_mi_diagnostics(pooled, type = "fmi")

# Variance ratio diagnostic
plot_mi_diagnostics(pooled, type = "variance_ratio")

9.5.6 Understanding FMI

The Fraction of Missing Information (FMI) indicates how much the missing data affects each estimate:

FMI Level Interpretation
< 10% Low impact from missing data
10-30% Moderate impact
30-50% High impact
> 50% Very high impact; consider more imputations

9.6 Visualization Functions

9.6.1 Forest Plot

# From subgroup analysis
# null_value = 1 draws the no-effect reference line for ratio measures;
# log_scale = TRUE spaces the axis logarithmically (appropriate for HRs).
plot_forest(
  subgroup_results,
  estimate_col = "HR",
  lower_col = "lower95",
  upper_col = "upper95",
  null_value = 1,
  log_scale = TRUE,
  title = "Subgroup Analysis Forest Plot"
)

9.6.2 Kaplan-Meier Curves

# Kaplan-Meier curves by treatment group; risk_table and conf_int
# presumably add a number-at-risk table and confidence bands — see the
# function documentation for details.
plot_km_curve(
  data = dt,
  time = "surv_time",
  status = "status",
  group = "treatment",
  title = "Survival by Treatment Group",
  risk_table = TRUE,
  conf_int = TRUE
)

9.6.3 Propensity Score Plots

# PS distribution (uses the object returned by estimate_propensity_score)
plot_ps_distribution(ps_result, type = "density")

# Balance plot (uses the object returned by assess_balance)
plot_balance(balance_result, title = "SMD Before/After Matching")

# Calibration plot: predicted PS vs observed treatment indicator
plot_calibration(predicted = dt$ps, observed = dt$treatment)

9.7 Integration Example: Complete Workflow

Here’s a complete example combining multiple modules:

library(UKBAnalytica)
library(mice)
library(survival)

# 1. Multiple imputation (using mice); "pmm" = predictive mean matching
mi_data <- mice(original_data, m = 20, method = "pmm")
mi_datasets <- complete(mi_data, action = "all")

# 2. Propensity score weighting in each imputation
weighted_models <- lapply(mi_datasets, function(d) {
  # Estimate PS
  ps <- estimate_propensity_score(d, "treatment", 
                                   c("age", "sex", "bmi", "smoking"))$ps
  # Calculate weights
  d$weight <- calculate_weights(ps, d$treatment, "ate")
  
  # Fit weighted Cox model
  # `weights = weight` is resolved inside `data = d` by coxph()
  coxph(Surv(time, status) ~ treatment, data = d, weights = weight)
})

# 3. Pool results across imputations with Rubin's rules
pooled <- pool_mi_models(weighted_models, model_type = "cox")
summary(pooled)

# 4. Subgroup analysis (on complete-case for simplicity)
subgroup_results <- run_multi_subgroup(
  data = original_data[complete.cases(original_data), ],
  exposure = "treatment",
  outcome = "event",
  subgroup_vars = c("age_group", "sex", "diabetes"),
  model_type = "cox",
  endpoint = c("time", "status")
)

# 5. Visualize
plot_forest(subgroup_results)
plot_mi_pooled(pooled)

9.8 Summary

This chapter covered five advanced analysis modules:

  1. Subgroup Analysis: run_subgroup_analysis(), run_multi_subgroup()
  2. Propensity Score: estimate_propensity_score(), match_propensity(), calculate_weights()
  3. Mediation Analysis: run_mediation(), run_multi_mediator()
  4. MI Pooling: pool_mi_models(), pool_custom_estimates()
  5. Machine Learning: ukb_ml_model(), ukb_shap(), ukb_ml_survival() (detailed in Section 9.9)

Each module includes dedicated visualization functions and integrates with the standard UKBAnalytica workflow.


9.9 Machine Learning Module

The ML module provides a unified interface for training, evaluating, and interpreting machine learning models for UK Biobank data analysis.

9.9.1 Supported Models

Model Type Use Case
rf Random Forest General classification/regression
xgboost Gradient Boosting High-performance prediction
glmnet Elastic Net Feature selection, regularization
svm Support Vector Machine Complex boundaries
nnet Neural Network Non-linear patterns
logistic Logistic/Linear Baseline comparison

9.9.2 Basic Usage

library(UKBAnalytica)

# Train a Random Forest classifier
# split_ratio = 0.8 presumably reserves 80% for training; seed fixes the
# split for reproducibility — confirm in ukb_ml_model() docs.
ml_rf <- ukb_ml_model(
  diabetes ~ age + bmi + sbp + smoking + alcohol,
  data = ukb_data,
  model = "rf",
  task = "classification",
  split_ratio = 0.8,
  seed = 42
)

# View results (test-set AUC reported for classification tasks)
print(ml_rf)
#> UKB Machine Learning Model
#> ==========================
#> Model: Random Forest
#> Task: classification
#> Test AUC: 0.782

# Variable importance
ukb_ml_importance(ml_rf)

9.9.3 Model Evaluation

# Detailed metrics with confidence intervals
# ci_method = "delong" refers to the DeLong method for AUC CIs (via pROC)
metrics <- ukb_ml_metrics(ml_rf, ci = TRUE, ci_method = "delong")

# ROC curve
roc_result <- ukb_ml_roc(ml_rf)
print(roc_result)

# Calibration curve (predicted probability vs observed frequency)
cal_result <- ukb_ml_calibration(ml_rf)
plot(cal_result)

# Confusion matrix at a fixed classification threshold
cm_result <- ukb_ml_confusion(ml_rf, threshold = 0.5)
print(cm_result)

9.9.4 Cross-Validation

# 5-fold cross-validation, repeated 3 times (15 fits total)
cv_result <- ukb_ml_cv(
  diabetes ~ age + bmi + sbp + smoking,
  data = ukb_data,
  model = "rf",
  task = "classification",
  folds = 5,
  repeats = 3,
  seed = 42
)

# Metrics are summarized as mean +/- SD across folds/repeats
print(cv_result)
#> Metrics (Mean +/- SD):
#>   auc: 0.781 +/- 0.023
#>   accuracy: 0.742 +/- 0.018

9.9.5 Model Comparison

# Train multiple models with the same formula/data so the comparison
# is apples-to-apples
ml_rf <- ukb_ml_model(outcome ~ ., data, model = "rf")
ml_xgb <- ukb_ml_model(outcome ~ ., data, model = "xgboost")
ml_lasso <- ukb_ml_model(outcome ~ ., data, model = "glmnet")

# Compare performance across the fitted models
comparison <- ukb_ml_compare(ml_rf, ml_xgb, ml_lasso)
print(comparison)
plot(comparison)

9.9.6 SHAP Interpretation

# Requires: install.packages("fastshap")

# Compute SHAP values
# sample_n subsamples rows and nsim sets the number of Monte Carlo
# replicates — both trade accuracy for speed (see fastshap docs).
shap <- ukb_shap(ml_rf, sample_n = 1000, nsim = 100)
print(shap)

# Summary plot (global importance)
plot_shap_summary(shap, type = "beeswarm")
plot_shap_summary(shap, type = "bar")

# Dependence plot (single feature, colored by a second feature)
plot_shap_dependence(shap, feature = "age", color_feature = "sex")

# Force plot (single observation)
plot_shap_force(shap, row_id = 1, max_features = 10)

9.9.7 Survival ML Models

# Requires: install.packages("randomForestSRC")

# Random Survival Forest
# params is forwarded to the backend; ntree = number of trees
surv_rf <- ukb_ml_survival(
  Surv(time, event) ~ age + sex + bmi + smoking + diabetes,
  data = ukb_data,
  model = "rsf",
  params = list(ntree = 500)
)

# Discrimination is reported as the test-set C-index
print(surv_rf)
#> Test C-index: 0.752

# Predict survival probabilities at specific time points
pred <- ukb_ml_survival_predict(surv_rf, times = c(1, 3, 5, 10))

# Variable importance
ukb_ml_survival_importance(surv_rf)

# SHAP for survival model, evaluated at a single time point
surv_shap <- ukb_ml_survival_shap(surv_rf, time_point = 5, sample_n = 500)
plot_shap_summary(surv_shap)

9.9.8 Integration with Other Modules

# Use ML for propensity score estimation
ps_ml <- ukb_ml_model(
  treatment ~ age + sex + bmi + smoking,
  data = ukb_data,
  model = "xgboost",
  task = "classification",
  verbose = FALSE
)

# Extract predicted probabilities as PS
# NOTE(review): assumes type = "prob" returns a two-column matrix with
# P(treatment = 1) in column 2 — confirm against ukb_ml_predict() docs.
ukb_data$ps_ml <- ukb_ml_predict(ps_ml, newdata = ukb_data, type = "prob")[, 2]

# Use for IPTW analysis
# Hand-rolled ATE weights: 1/PS for treated, 1/(1-PS) for controls
# (same formula as calculate_weights(type = "ate"), see 9.3.4)
ukb_data$weight <- ifelse(
  ukb_data$treatment == 1,
  1 / ukb_data$ps_ml,
  1 / (1 - ukb_data$ps_ml)
)

9.9.9 Required Packages

Function Required Package
ukb_ml_model(model = "rf") ranger
ukb_ml_model(model = "xgboost") xgboost
ukb_ml_model(model = "glmnet") glmnet
ukb_ml_model(model = "svm") e1071
ukb_ml_model(model = "nnet") nnet
ukb_ml_roc() pROC
ukb_shap() fastshap
ukb_ml_survival(model = "rsf") randomForestSRC

Install with:

# One-time install of the ML-module dependencies (not needed per session)
install.packages(c("ranger", "xgboost", "glmnet", "pROC", "fastshap", "randomForestSRC"))

9.9.10 Required Packages (All Modules)

Module Required Package
Propensity Score Matching MatchIt
GBM for PS gbm
Mediation Analysis regmedint
MI Pooling mitools
Cox Models survival
Random Forest ranger
XGBoost xgboost
Elastic Net glmnet
SVM e1071
Neural Network nnet
ROC Analysis pROC
SHAP fastshap
Survival ML randomForestSRC

Install all with:

# One-time install of every optional dependency used in this chapter
install.packages(c(
  "MatchIt", "gbm", "regmedint", "mitools", "survival",
  "ranger", "xgboost", "glmnet", "e1071", "nnet",
  "pROC", "fastshap", "randomForestSRC"
))