if (requireNamespace("pkgload", quietly = TRUE) && dir.exists("../R")) {
pkgload::load_all("..", quiet = TRUE)
}
library(UKBAnalytica)
library(data.table)
demo_raw <- ukb_demo()
dim(demo_raw)
#> [1] 5000 754
names(demo_raw)[1:12]
#> [1] "eid" "p31" "p53_i0" "p53_i1" "p53_i2" "p53_i3"
#> [7] "p21022" "p21001_i0" "p21001_i1" "p21001_i2" "p21001_i3" "p21000_i0"Exploring the relationship between Air pollution and incident arrhythmia: a UKBAnalytica demo case study
This document demonstrates an end-to-end UKBAnalytica workflow using ukb_demo() as a RAP-style raw table. The case study evaluates associations between air pollution exposure and incident cardiac arrhythmia. The workflow covers raw field extraction, phenotype definition, baseline preprocessing, participant flow, baseline table generation, Cox regression, and sensitivity analyses.
The demo data are synthetic and do not contain UK Biobank participant-level source records. Results are for workflow illustration only.
1 Load demo data
2 Specify variables from raw UKB field IDs
The analysis uses the following raw UKB fields:
- NO2:
p24016,p24018, andp24017;p24003is used for the 2010-exposure sensitivity analysis. - NOx:
p24004. - PM2.5:
p24006. - PM10:
p24019;p24005is used for the 2010-exposure sensitivity analysis. - Covariates: sex (
p31), baseline date (p53_i0), age (p21022), BMI (p21001_i0), smoking (p20116_i0), drinking (p20117_i0), Townsend deprivation index (p189), ethnicity (p21000_i0), and education (p6138_i0).
3 Define incident arrhythmia
The predefined Arrhythmia phenotype in UKBAnalytica captures broad cardiac arrhythmia using ICD-10 codes I44-I49 and procedure codes including K57.6, K59-K62, K64.1, and K72-K74 where available. Baseline history and prospective incident events are handled separately.
disease_catalog <- get_predefined_diseases()
arrhythmia_def <- disease_catalog["Arrhythmia"]
arrhythmia_def
#> $Arrhythmia
#> $Arrhythmia$name
#> [1] "Cardiac Arrhythmia"
#>
#> $Arrhythmia$icd10_pattern
#> [1] "^(I44|I45|I46|I47|I48|I49)"
#>
#> $Arrhythmia$icd9_pattern
#> NULL
#>
#> $Arrhythmia$sr_codes
#> NULL
#>
#> $Arrhythmia$death_icd10
#> [1] "^(I44|I45|I46|I47|I48|I49)"
#>
#> $Arrhythmia$opcs4_pattern
#> [1] "^(K576|K59|K60|K61|K62|K641|K72|K73|K74)"
#>
#> $Arrhythmia$first_occurrence_fields
#> [1] 131342 131344 131346 131348 131350 131352
#>
#> $Arrhythmia$first_occurrence_source_fields
#> NULL
#>
#> $Arrhythmia$cancer_icd10_pattern
#> NULL
#>
#> $Arrhythmia$cancer_histology
#> NULL
#>
#> $Arrhythmia$cancer_behaviour
#> NULL
#>
#> $Arrhythmia$algo_date_field
#> NULL
#>
#> $Arrhythmia$algo_source_field
#> NULL
demo_surv <- build_survival_dataset(
dt = demo_raw,
disease_definitions = arrhythmia_def,
prevalent_sources = c("ICD10", "OPCS4"),
outcome_sources = c("ICD10"),
baseline_col = "p53_i0",
primary_disease = "Arrhythmia",
censor_date = as.Date("2023-10-31"),
show_flow = TRUE
)
#> step n_before n_after
#> <char> <int> <int>
#> 1: Raw cohort 5000 5000
#> 2: After build_survival_dataset 5000 5000
#> 3: Keep non-missing outcome_status 5000 4880
#> 4: Exclude baseline prevalent Arrhythmia 4880 4880
#> 5: Keep valid outcome_surv_time 4880 4879
#> exclusion_rule excluded
#> <char> <int>
#> 1: None 0
#> 2: Function output row count check 0
#> 3: Exclude outcome_status is NA 120
#> 4: Keep Arrhythmia_history == 0 0
#> 5: Keep !is.na(outcome_surv_time) & outcome_surv_time >= 0 1
#> retained_from_prev retained_from_raw
#> <char> <char>
#> 1: 100.00% 100.00%
#> 2: 100.00% 100.00%
#> 3: 97.60% 97.60%
#> 4: 100.00% 97.60%
#> 5: 99.98% 97.58%
table(demo_surv$outcome_status, useNA = "ifany")
#>
#> 0 1 <NA>
#> 4601 279 120
participant_flow <- attr(demo_surv, "participant_flow")
participant_flow
#> step n_before n_after
#> <char> <int> <int>
#> 1: Raw cohort 5000 5000
#> 2: After build_survival_dataset 5000 5000
#> 3: Keep non-missing outcome_status 5000 4880
#> 4: Exclude baseline prevalent Arrhythmia 4880 4880
#> 5: Keep valid outcome_surv_time 4880 4879
#> exclusion_rule excluded
#> <char> <int>
#> 1: None 0
#> 2: Function output row count check 0
#> 3: Exclude outcome_status is NA 120
#> 4: Keep Arrhythmia_history == 0 0
#> 5: Keep !is.na(outcome_surv_time) & outcome_surv_time >= 0 1
#> retained_from_prev retained_from_raw
#> <num> <num>
#> 1: 1.0000000 1.0000
#> 2: 1.0000000 1.0000
#> 3: 0.9760000 0.9760
#> 4: 1.0000000 0.9760
#> 5: 0.9997951 0.9758outcome_status = 1 indicates incident arrhythmia, outcome_status = 0 indicates censoring, and NA indicates baseline prevalent arrhythmia.
4 Preprocess exposures and covariates
preprocess_baseline() maps raw UKB field names to standardized analysis variables. Townsend deprivation index is added through custom_mapping because it is not part of the default baseline variable set.
exposure_vars <- c(
"no2_2005", "no2_2006", "no2_2007", "no2_2010",
"nox", "pm25", "pm10_2007", "pm10_2010"
)
covariates <- c("age", "sex", "tdi", "ethnicity", "education", "smoking", "drinking")
custom_mapping <- list(
tdi = list(
ukb_col = "p189",
description = "Townsend deprivation index at recruitment"
)
)
demo_prepared <- preprocess_baseline(
df = demo_surv,
variables = c(exposure_vars, covariates),
custom_mapping = custom_mapping,
missing_action = "keep"
)
demo_prepared[, c(exposure_vars, covariates), with = FALSE][1:6]
#> no2_2005 no2_2006 no2_2007 no2_2010 nox pm25 pm10_2007 pm10_2010 age
#> <num> <num> <num> <num> <num> <num> <num> <num> <num>
#> 1: 25.87681 33.00003 21.65425 26.55422 38.93 9.58 20.47389 19.41812 65
#> 2: 23.50962 43.24436 25.87041 26.11995 33.46 10.46 14.56945 20.27382 40
#> 3: 24.25328 32.60647 35.07644 35.74929 21.42 9.66 18.44245 16.42343 40
#> 4: 33.71577 14.96466 59.59763 21.16373 19.74 11.59 17.89864 18.63921 56
#> 5: 25.59150 17.85625 29.26812 23.25018 23.60 10.03 20.56380 18.79965 52
#> 6: 40.89862 18.45907 24.00232 21.81041 41.88 10.53 16.16288 22.69649 62
#> sex tdi ethnicity education smoking drinking
#> <fctr> <num> <fctr> <fctr> <fctr> <fctr>
#> 1: Male -2.0376928 White High Previous Current
#> 2: Male 3.5060881 <NA> Medium Never Current
#> 3: Female -6.8393976 <NA> High Previous Current
#> 4: Female 0.2687084 White High Previous Current
#> 5: Female 0.8966171 White High Current Current
#> 6: Female 3.5251987 White High Previous CurrentNO2 is summarized as the mean of 2005, 2006, and 2007 estimates for the main analysis. The main PM10 exposure uses 2007 estimates. The 2010 NO2 and PM10 fields are retained for sensitivity analysis.
demo_prepared[, no2_main := rowMeans(
cbind(no2_2005, no2_2006, no2_2007),
na.rm = TRUE
)]
demo_prepared[is.nan(no2_main), no2_main := NA_real_]
main_exposures <- c("no2_main", "nox", "pm25", "pm10_2007")
sensitivity_exposures_2010 <- c("no2_2010", "nox", "pm25", "pm10_2010")
main_exposures_z <- paste0(main_exposures, "_z")
sensitivity_exposures_2010_z <- paste0(sensitivity_exposures_2010, "_z")
for (var in unique(c(main_exposures, sensitivity_exposures_2010))) {
demo_prepared[, (paste0(var, "_z")) := as.numeric(scale(get(var)))]
}
demo_prepared[, c(main_exposures, main_exposures_z), with = FALSE][1:6]
#> no2_main nox pm25 pm10_2007 no2_main_z nox_z pm25_z pm10_2007_z
#> <num> <num> <num> <num> <num> <num> <num> <num>
#> 1: 26.84370 38.93 9.58 20.47389 -0.4429355 -0.3086792 -0.39206038 0.5889900
#> 2: 30.87480 33.46 10.46 14.56945 0.3117576 -0.6611652 0.45551279 -2.4134918
#> 3: 30.64539 21.42 9.66 18.44245 0.2688093 -1.4370211 -0.31500827 -0.4440239
#> 4: 36.09268 19.74 11.59 17.89864 1.2886389 -1.5452801 1.54387379 -0.7205570
#> 5: 24.23862 23.60 10.03 20.56380 -0.9306518 -1.2965422 0.04135772 0.6347081
#> 6: 27.78667 41.88 10.53 16.16288 -0.2663944 -0.1185816 0.52293338 -1.60321275 Assemble the analytic cohort
The prospective cohort excludes participants with baseline prevalent arrhythmia, missing outcome status, invalid follow-up time, or missing values in the main exposures and adjustment covariates. The stepwise complete-case flow is retained for reporting.
analysis_data <- demo_prepared[
!is.na(outcome_status) &
!is.na(outcome_surv_time) &
outcome_surv_time > 0
]
complete_case_vars <- c(main_exposures_z, covariates)
analysis_data <- sensitivity_exclude_missing_covariates(
data = analysis_data,
covariates = complete_case_vars,
stepwise = TRUE,
verbose = TRUE
)
#> step variable n_before n_missing_in_remaining n_after n_removed
#> <int> <char> <int> <int> <int> <int>
#> 1: 1 no2_main_z 4879 0 4879 0
#> 2: 2 nox_z 4879 0 4879 0
#> 3: 3 pm25_z 4879 0 4879 0
#> 4: 4 pm10_2007_z 4879 0 4879 0
#> 5: 5 age 4879 0 4879 0
#> 6: 6 sex 4879 0 4879 0
#> 7: 7 tdi 4879 0 4879 0
#> 8: 8 ethnicity 4879 29 4850 29
#> 9: 9 education 4850 92 4758 92
#> 10: 10 smoking 4758 21 4737 21
#> 11: 11 drinking 4737 15 4722 15
#> retained_from_previous retained_from_input
#> <num> <num>
#> 1: 1.0000000 1.0000000
#> 2: 1.0000000 1.0000000
#> 3: 1.0000000 1.0000000
#> 4: 1.0000000 1.0000000
#> 5: 1.0000000 1.0000000
#> 6: 1.0000000 1.0000000
#> 7: 1.0000000 1.0000000
#> 8: 0.9940562 0.9940562
#> 9: 0.9810309 0.9751998
#> 10: 0.9955864 0.9708957
#> 11: 0.9968334 0.9678213
complete_case_flow <- attr(analysis_data, "complete_case_flow")
complete_case_flow
#> step variable n_before n_missing_in_remaining n_after n_removed
#> <int> <char> <int> <int> <int> <int>
#> 1: 1 no2_main_z 4879 0 4879 0
#> 2: 2 nox_z 4879 0 4879 0
#> 3: 3 pm25_z 4879 0 4879 0
#> 4: 4 pm10_2007_z 4879 0 4879 0
#> 5: 5 age 4879 0 4879 0
#> 6: 6 sex 4879 0 4879 0
#> 7: 7 tdi 4879 0 4879 0
#> 8: 8 ethnicity 4879 29 4850 29
#> 9: 9 education 4850 92 4758 92
#> 10: 10 smoking 4758 21 4737 21
#> 11: 11 drinking 4737 15 4722 15
#> retained_from_previous retained_from_input
#> <num> <num>
#> 1: 1.0000000 1.0000000
#> 2: 1.0000000 1.0000000
#> 3: 1.0000000 1.0000000
#> 4: 1.0000000 1.0000000
#> 5: 1.0000000 1.0000000
#> 6: 1.0000000 1.0000000
#> 7: 1.0000000 1.0000000
#> 8: 0.9940562 0.9940562
#> 9: 0.9810309 0.9751998
#> 10: 0.9955864 0.9708957
#> 11: 0.9968334 0.9678213
table(analysis_data$outcome_status)
#>
#> 0 1
#> 4455 2676 Baseline characteristics
Baseline characteristics are summarized by incident arrhythmia status. Air pollutants are shown on their original scales, while Cox models use standardized exposure variables.
table1 <- create_baseline_table(
data = analysis_data,
case_col = "outcome_status",
factor_cols = c("sex", "ethnicity", "education", "smoking", "drinking"),
continuous_cols = c("age", "tdi", main_exposures),
test = TRUE
)
print(table1, quote = FALSE, noSpaces = TRUE)
#> Stratified by outcome_status
#> 0 1 p test
#> n 4455 267
#> sex = Male (%) 2063 (46.3) 132 (49.4) 0.351
#> ethnicity = Others (%) 231 (5.2) 16 (6.0) 0.664
#> education (%) 0.923
#> Low 716 (16.1) 42 (15.7)
#> Medium 1006 (22.6) 58 (21.7)
#> High 2733 (61.3) 167 (62.5)
#> smoking (%) 0.670
#> Never 2404 (54.0) 144 (53.9)
#> Previous 1573 (35.3) 90 (33.7)
#> Current 478 (10.7) 33 (12.4)
#> drinking (%) 0.477
#> Never 208 (4.7) 12 (4.5)
#> Previous 154 (3.5) 13 (4.9)
#> Current 4093 (91.9) 242 (90.6)
#> age (mean (SD)) 56.51 (8.04) 56.60 (8.12) 0.861
#> tdi (mean (SD)) 0.03 (3.03) 0.10 (2.97) 0.712
#> no2_main (mean (SD)) 29.22 (5.37) 29.06 (4.80) 0.638
#> nox (mean (SD)) 43.72 (15.58) 42.78 (15.07) 0.337
#> pm25 (mean (SD)) 9.99 (1.05) 10.00 (0.95) 0.928
#> pm10_2007 (mean (SD)) 19.32 (1.96) 19.31 (2.01) 0.8937 Cox regression
Each pollutant is evaluated in a separate Cox proportional hazards model, adjusted for age, sex, Townsend deprivation index, ethnicity, education, smoking, and drinking. Hazard ratios are interpreted per one-standard-deviation increase in exposure.
pollutant_labels <- data.frame(
variable = main_exposures_z,
pollutant = c("NO2", "NOx", "PM2.5", "PM10"),
stringsAsFactors = FALSE
)
cox_results <- runmulti_cox(
data = analysis_data,
main_var = main_exposures_z,
covariates = covariates,
endpoint = c("outcome_surv_time", "outcome_status")
)
cox_results <- merge(cox_results, pollutant_labels, by = "variable", all.x = TRUE)
cox_results$p_bh <- p.adjust(cox_results$pvalue, method = "BH")
cox_results$p_bonferroni <- p.adjust(cox_results$pvalue, method = "bonferroni")
cox_results[order(cox_results$pvalue), ]
#> variable coef se z HR lower95 upper95
#> 2 nox_z -0.063747275 0.06483359 -0.98324464 0.9382421 0.8262823 1.065372
#> 1 no2_main_z -0.021747364 0.06167636 -0.35260455 0.9784874 0.8670741 1.104217
#> 4 pm25_z 0.003932429 0.06103574 0.06442830 1.0039402 0.8907465 1.131518
#> 3 pm10_2007_z -0.001610040 0.06166102 -0.02611114 0.9983913 0.8847383 1.126644
#> pvalue n n_event pollutant p_bh p_bonferroni
#> 2 0.3254871 4722 267 NOx 0.9791687 1
#> 1 0.7243849 4722 267 NO2 0.9791687 1
#> 4 0.9486292 4722 267 PM2.5 0.9791687 1
#> 3 0.9791687 4722 267 PM10 0.9791687 18 Sensitivity analysis 1: exclude early events
To reduce potential reverse causation, early incident arrhythmia events are excluded at 1-, 2-, and 4-year lag windows, and the adjusted Cox models are refitted.
fit_pollution_cox <- function(data, exposures, labels, lag_years = 0) {
out <- runmulti_cox(
data = data,
main_var = exposures,
covariates = covariates,
endpoint = c("outcome_surv_time", "outcome_status")
)
out$lag_years <- lag_years
merge(out, labels, by = "variable", all.x = TRUE)
}
lag_results <- lapply(c(0, 1, 2, 4), function(lag) {
lag_data <- if (lag == 0) {
analysis_data
} else {
sensitivity_exclude_early_events(
data = analysis_data,
endpoint = c("outcome_surv_time", "outcome_status"),
n_years = lag,
verbose = TRUE
)
}
fit_pollution_cox(lag_data, main_exposures_z, pollutant_labels, lag_years = lag)
})
lag_results <- data.table::rbindlist(lag_results, use.names = TRUE)
lag_results[, p_bh := p.adjust(pvalue, method = "BH"), by = lag_years]
as.data.frame(lag_results[order(pollutant, lag_years)])
#> variable coef se z HR lower95 upper95
#> 1 no2_main_z -0.021747364 0.06167636 -0.35260455 0.9784874 0.8670741 1.104217
#> 2 no2_main_z -0.033544328 0.06384072 -0.52543779 0.9670120 0.8532780 1.095906
#> 3 no2_main_z -0.026291084 0.06506894 -0.40404966 0.9740515 0.8574230 1.106544
#> 4 no2_main_z -0.017692038 0.07011299 -0.25233609 0.9824635 0.8563201 1.127189
#> 5 nox_z -0.063747275 0.06483359 -0.98324464 0.9382421 0.8262823 1.065372
#> 6 nox_z -0.073663585 0.06736998 -1.09341846 0.9289842 0.8140721 1.060117
#> 7 nox_z -0.082574976 0.06922629 -1.19282675 0.9207424 0.8039196 1.054541
#> 8 nox_z -0.075093548 0.07402062 -1.01449499 0.9276567 0.8023813 1.072491
#> 9 pm10_2007_z -0.001610040 0.06166102 -0.02611114 0.9983913 0.8847383 1.126644
#> 10 pm10_2007_z 0.017366439 0.06325203 0.27455938 1.0175181 0.8988804 1.151814
#> 11 pm10_2007_z 0.012027099 0.06463569 0.18607519 1.0120997 0.8916724 1.148792
#> 12 pm10_2007_z 0.007437510 0.06979918 0.10655583 1.0074652 0.8786520 1.155163
#> 13 pm25_z 0.003932429 0.06103574 0.06442830 1.0039402 0.8907465 1.131518
#> 14 pm25_z 0.009610136 0.06286195 0.15287685 1.0096565 0.8926176 1.142041
#> 15 pm25_z -0.001146960 0.06438523 -0.01781403 0.9988537 0.8804345 1.133200
#> 16 pm25_z -0.014898763 0.06980074 -0.21344707 0.9852117 0.8592411 1.129650
#> pvalue n n_event lag_years pollutant p_bh
#> 1 0.7243849 4722 267 0 NO2 0.9791687
#> 2 0.5992789 4706 251 1 NO2 0.8784954
#> 3 0.6861762 4696 241 2 NO2 0.9857872
#> 4 0.8007813 4662 207 4 NO2 0.9151414
#> 5 0.3254871 4722 267 0 NOx 0.9791687
#> 6 0.2742101 4706 251 1 NOx 0.8784954
#> 7 0.2329372 4696 241 2 NOx 0.9317489
#> 8 0.3103466 4662 207 4 NOx 0.9151414
#> 9 0.9791687 4722 267 0 PM10 0.9791687
#> 10 0.7836548 4706 251 1 PM10 0.8784954
#> 11 0.8523858 4696 241 2 PM10 0.9857872
#> 12 0.9151414 4662 207 4 PM10 0.9151414
#> 13 0.9486292 4722 267 0 PM2.5 0.9791687
#> 14 0.8784954 4706 251 1 PM2.5 0.8784954
#> 15 0.9857872 4696 241 2 PM2.5 0.9857872
#> 16 0.8309783 4662 207 4 PM2.5 0.91514149 Sensitivity analysis 2: 2010 exposure definition
NO2 and PM10 are redefined using 2010 exposure fields (p24003 and p24005), while NOx and PM2.5 are kept unchanged.
pollutant_labels_2010 <- data.frame(
variable = sensitivity_exposures_2010_z,
pollutant = c("NO2_2010", "NOx", "PM2.5", "PM10_2010"),
stringsAsFactors = FALSE
)
analysis_data_2010 <- sensitivity_exclude_missing_covariates(
data = demo_prepared[
!is.na(outcome_status) &
!is.na(outcome_surv_time) &
outcome_surv_time > 0
],
covariates = c(sensitivity_exposures_2010_z, covariates),
stepwise = FALSE,
verbose = TRUE
)
cox_results_2010 <- fit_pollution_cox(
data = analysis_data_2010,
exposures = sensitivity_exposures_2010_z,
labels = pollutant_labels_2010,
lag_years = 0
)
cox_results_2010$p_bh <- p.adjust(cox_results_2010$pvalue, method = "BH")
cox_results_2010
#> variable coef se z HR lower95 upper95
#> 1 no2_2010_z -0.090143975 0.06376187 -1.4137599 0.9137996 0.8064487 1.035441
#> 2 nox_z -0.063747275 0.06483359 -0.9832446 0.9382421 0.8262823 1.065372
#> 3 pm10_2010_z 0.023125198 0.06087924 0.3798536 1.0233947 0.9082861 1.153091
#> 4 pm25_z 0.003932429 0.06103574 0.0644283 1.0039402 0.8907465 1.131518
#> pvalue n n_event lag_years pollutant p_bh
#> 1 0.1574324 4722 267 0 NO2_2010 0.6297296
#> 2 0.3254871 4722 267 0 NOx 0.6509741
#> 3 0.7040541 4722 267 0 PM10_2010 0.9387388
#> 4 0.9486292 4722 267 0 PM2.5 0.9486292