Exploring the relationship between Air pollution and incident arrhythmia: a UKBAnalytica demo case study

This document demonstrates an end-to-end UKBAnalytica workflow using ukb_demo() as a RAP-style raw table. The case study evaluates associations between air pollution exposure and incident cardiac arrhythmia. The workflow covers raw field extraction, phenotype definition, baseline preprocessing, participant flow, baseline table generation, Cox regression, and sensitivity analyses.

The demo data are synthetic and do not contain UK Biobank participant-level source records. Results are for workflow illustration only.

1 Load demo data

if (requireNamespace("pkgload", quietly = TRUE) && dir.exists("../R")) {
  pkgload::load_all("..", quiet = TRUE)
}
library(UKBAnalytica)
library(data.table)

demo_raw <- ukb_demo()
dim(demo_raw)
#> [1] 5000  754
names(demo_raw)[1:12]
#>  [1] "eid"       "p31"       "p53_i0"    "p53_i1"    "p53_i2"    "p53_i3"   
#>  [7] "p21022"    "p21001_i0" "p21001_i1" "p21001_i2" "p21001_i3" "p21000_i0"

2 Specify variables from raw UKB field IDs

The analysis uses the following raw UKB fields:

NO2: p24016, p24018, and p24017; p24003 is used for the 2010-exposure sensitivity analysis.
NOx: p24004.
PM2.5: p24006.
PM10: p24019; p24005 is used for the 2010-exposure sensitivity analysis.
Covariates: sex (p31), baseline date (p53_i0), age (p21022), BMI (p21001_i0), smoking (p20116_i0), drinking (p20117_i0), Townsend deprivation index (p189), ethnicity (p21000_i0), and education (p6138_i0).

3 Define incident arrhythmia

The predefined Arrhythmia phenotype in UKBAnalytica captures broad cardiac arrhythmia using ICD-10 codes I44-I49 and procedure codes including K57.6, K59-K62, K64.1, and K72-K74 where available. Baseline history and prospective incident events are handled separately.

disease_catalog <- get_predefined_diseases()
arrhythmia_def <- disease_catalog["Arrhythmia"]
arrhythmia_def
#> $Arrhythmia
#> $Arrhythmia$name
#> [1] "Cardiac Arrhythmia"
#> 
#> $Arrhythmia$icd10_pattern
#> [1] "^(I44|I45|I46|I47|I48|I49)"
#> 
#> $Arrhythmia$icd9_pattern
#> NULL
#> 
#> $Arrhythmia$sr_codes
#> NULL
#> 
#> $Arrhythmia$death_icd10
#> [1] "^(I44|I45|I46|I47|I48|I49)"
#> 
#> $Arrhythmia$opcs4_pattern
#> [1] "^(K576|K59|K60|K61|K62|K641|K72|K73|K74)"
#> 
#> $Arrhythmia$first_occurrence_fields
#> [1] 131342 131344 131346 131348 131350 131352
#> 
#> $Arrhythmia$first_occurrence_source_fields
#> NULL
#> 
#> $Arrhythmia$cancer_icd10_pattern
#> NULL
#> 
#> $Arrhythmia$cancer_histology
#> NULL
#> 
#> $Arrhythmia$cancer_behaviour
#> NULL
#> 
#> $Arrhythmia$algo_date_field
#> NULL
#> 
#> $Arrhythmia$algo_source_field
#> NULL

demo_surv <- build_survival_dataset(
  dt = demo_raw,
  disease_definitions = arrhythmia_def,
  prevalent_sources = c("ICD10", "OPCS4"),
  outcome_sources = c("ICD10"),
  baseline_col = "p53_i0",
  primary_disease = "Arrhythmia",
  censor_date = as.Date("2023-10-31"),
  show_flow = TRUE
)
#>                                     step n_before n_after
#>                                   <char>    <int>   <int>
#> 1:                            Raw cohort     5000    5000
#> 2:          After build_survival_dataset     5000    5000
#> 3:       Keep non-missing outcome_status     5000    4880
#> 4: Exclude baseline prevalent Arrhythmia     4880    4880
#> 5:          Keep valid outcome_surv_time     4880    4879
#>                                             exclusion_rule excluded
#>                                                     <char>    <int>
#> 1:                                                    None        0
#> 2:                         Function output row count check        0
#> 3:                            Exclude outcome_status is NA      120
#> 4:                            Keep Arrhythmia_history == 0        0
#> 5: Keep !is.na(outcome_surv_time) & outcome_surv_time >= 0        1
#>    retained_from_prev retained_from_raw
#>                <char>            <char>
#> 1:            100.00%           100.00%
#> 2:            100.00%           100.00%
#> 3:             97.60%            97.60%
#> 4:            100.00%            97.60%
#> 5:             99.98%            97.58%

table(demo_surv$outcome_status, useNA = "ifany")
#> 
#>    0    1 <NA> 
#> 4601  279  120
participant_flow <- attr(demo_surv, "participant_flow")
participant_flow
#>                                     step n_before n_after
#>                                   <char>    <int>   <int>
#> 1:                            Raw cohort     5000    5000
#> 2:          After build_survival_dataset     5000    5000
#> 3:       Keep non-missing outcome_status     5000    4880
#> 4: Exclude baseline prevalent Arrhythmia     4880    4880
#> 5:          Keep valid outcome_surv_time     4880    4879
#>                                             exclusion_rule excluded
#>                                                     <char>    <int>
#> 1:                                                    None        0
#> 2:                         Function output row count check        0
#> 3:                            Exclude outcome_status is NA      120
#> 4:                            Keep Arrhythmia_history == 0        0
#> 5: Keep !is.na(outcome_surv_time) & outcome_surv_time >= 0        1
#>    retained_from_prev retained_from_raw
#>                 <num>             <num>
#> 1:          1.0000000            1.0000
#> 2:          1.0000000            1.0000
#> 3:          0.9760000            0.9760
#> 4:          1.0000000            0.9760
#> 5:          0.9997951            0.9758

outcome_status = 1 indicates incident arrhythmia, outcome_status = 0 indicates censoring, and NA indicates baseline prevalent arrhythmia.

4 Preprocess exposures and covariates

preprocess_baseline() maps raw UKB field names to standardized analysis variables. Townsend deprivation index is added through custom_mapping because it is not part of the default baseline variable set.

exposure_vars <- c(
  "no2_2005", "no2_2006", "no2_2007", "no2_2010",
  "nox", "pm25", "pm10_2007", "pm10_2010"
)

covariates <- c("age", "sex", "tdi", "ethnicity", "education", "smoking", "drinking")

custom_mapping <- list(
  tdi = list(
    ukb_col = "p189",
    description = "Townsend deprivation index at recruitment"
  )
)

demo_prepared <- preprocess_baseline(
  df = demo_surv,
  variables = c(exposure_vars, covariates),
  custom_mapping = custom_mapping,
  missing_action = "keep"
)

demo_prepared[, c(exposure_vars, covariates), with = FALSE][1:6]
#>    no2_2005 no2_2006 no2_2007 no2_2010   nox  pm25 pm10_2007 pm10_2010   age
#>       <num>    <num>    <num>    <num> <num> <num>     <num>     <num> <num>
#> 1: 25.87681 33.00003 21.65425 26.55422 38.93  9.58  20.47389  19.41812    65
#> 2: 23.50962 43.24436 25.87041 26.11995 33.46 10.46  14.56945  20.27382    40
#> 3: 24.25328 32.60647 35.07644 35.74929 21.42  9.66  18.44245  16.42343    40
#> 4: 33.71577 14.96466 59.59763 21.16373 19.74 11.59  17.89864  18.63921    56
#> 5: 25.59150 17.85625 29.26812 23.25018 23.60 10.03  20.56380  18.79965    52
#> 6: 40.89862 18.45907 24.00232 21.81041 41.88 10.53  16.16288  22.69649    62
#>       sex        tdi ethnicity education  smoking drinking
#>    <fctr>      <num>    <fctr>    <fctr>   <fctr>   <fctr>
#> 1:   Male -2.0376928     White      High Previous  Current
#> 2:   Male  3.5060881      <NA>    Medium    Never  Current
#> 3: Female -6.8393976      <NA>      High Previous  Current
#> 4: Female  0.2687084     White      High Previous  Current
#> 5: Female  0.8966171     White      High  Current  Current
#> 6: Female  3.5251987     White      High Previous  Current

NO2 is summarized as the mean of 2005, 2006, and 2007 estimates for the main analysis. The main PM10 exposure uses 2007 estimates. The 2010 NO2 and PM10 fields are retained for sensitivity analysis.

demo_prepared[, no2_main := rowMeans(
  cbind(no2_2005, no2_2006, no2_2007),
  na.rm = TRUE
)]
demo_prepared[is.nan(no2_main), no2_main := NA_real_]

main_exposures <- c("no2_main", "nox", "pm25", "pm10_2007")
sensitivity_exposures_2010 <- c("no2_2010", "nox", "pm25", "pm10_2010")

main_exposures_z <- paste0(main_exposures, "_z")
sensitivity_exposures_2010_z <- paste0(sensitivity_exposures_2010, "_z")

for (var in unique(c(main_exposures, sensitivity_exposures_2010))) {
  demo_prepared[, (paste0(var, "_z")) := as.numeric(scale(get(var)))]
}

demo_prepared[, c(main_exposures, main_exposures_z), with = FALSE][1:6]
#>    no2_main   nox  pm25 pm10_2007 no2_main_z      nox_z      pm25_z pm10_2007_z
#>       <num> <num> <num>     <num>      <num>      <num>       <num>       <num>
#> 1: 26.84370 38.93  9.58  20.47389 -0.4429355 -0.3086792 -0.39206038   0.5889900
#> 2: 30.87480 33.46 10.46  14.56945  0.3117576 -0.6611652  0.45551279  -2.4134918
#> 3: 30.64539 21.42  9.66  18.44245  0.2688093 -1.4370211 -0.31500827  -0.4440239
#> 4: 36.09268 19.74 11.59  17.89864  1.2886389 -1.5452801  1.54387379  -0.7205570
#> 5: 24.23862 23.60 10.03  20.56380 -0.9306518 -1.2965422  0.04135772   0.6347081
#> 6: 27.78667 41.88 10.53  16.16288 -0.2663944 -0.1185816  0.52293338  -1.6032127

5 Assemble the analytic cohort

The prospective cohort excludes participants with baseline prevalent arrhythmia, missing outcome status, invalid follow-up time, or missing values in the main exposures and adjustment covariates. The stepwise complete-case flow is retained for reporting.

analysis_data <- demo_prepared[
  !is.na(outcome_status) &
    !is.na(outcome_surv_time) &
    outcome_surv_time > 0
]

complete_case_vars <- c(main_exposures_z, covariates)

analysis_data <- sensitivity_exclude_missing_covariates(
  data = analysis_data,
  covariates = complete_case_vars,
  stepwise = TRUE,
  verbose = TRUE
)
#>      step    variable n_before n_missing_in_remaining n_after n_removed
#>     <int>      <char>    <int>                  <int>   <int>     <int>
#>  1:     1  no2_main_z     4879                      0    4879         0
#>  2:     2       nox_z     4879                      0    4879         0
#>  3:     3      pm25_z     4879                      0    4879         0
#>  4:     4 pm10_2007_z     4879                      0    4879         0
#>  5:     5         age     4879                      0    4879         0
#>  6:     6         sex     4879                      0    4879         0
#>  7:     7         tdi     4879                      0    4879         0
#>  8:     8   ethnicity     4879                     29    4850        29
#>  9:     9   education     4850                     92    4758        92
#> 10:    10     smoking     4758                     21    4737        21
#> 11:    11    drinking     4737                     15    4722        15
#>     retained_from_previous retained_from_input
#>                      <num>               <num>
#>  1:              1.0000000           1.0000000
#>  2:              1.0000000           1.0000000
#>  3:              1.0000000           1.0000000
#>  4:              1.0000000           1.0000000
#>  5:              1.0000000           1.0000000
#>  6:              1.0000000           1.0000000
#>  7:              1.0000000           1.0000000
#>  8:              0.9940562           0.9940562
#>  9:              0.9810309           0.9751998
#> 10:              0.9955864           0.9708957
#> 11:              0.9968334           0.9678213

complete_case_flow <- attr(analysis_data, "complete_case_flow")
complete_case_flow
#>      step    variable n_before n_missing_in_remaining n_after n_removed
#>     <int>      <char>    <int>                  <int>   <int>     <int>
#>  1:     1  no2_main_z     4879                      0    4879         0
#>  2:     2       nox_z     4879                      0    4879         0
#>  3:     3      pm25_z     4879                      0    4879         0
#>  4:     4 pm10_2007_z     4879                      0    4879         0
#>  5:     5         age     4879                      0    4879         0
#>  6:     6         sex     4879                      0    4879         0
#>  7:     7         tdi     4879                      0    4879         0
#>  8:     8   ethnicity     4879                     29    4850        29
#>  9:     9   education     4850                     92    4758        92
#> 10:    10     smoking     4758                     21    4737        21
#> 11:    11    drinking     4737                     15    4722        15
#>     retained_from_previous retained_from_input
#>                      <num>               <num>
#>  1:              1.0000000           1.0000000
#>  2:              1.0000000           1.0000000
#>  3:              1.0000000           1.0000000
#>  4:              1.0000000           1.0000000
#>  5:              1.0000000           1.0000000
#>  6:              1.0000000           1.0000000
#>  7:              1.0000000           1.0000000
#>  8:              0.9940562           0.9940562
#>  9:              0.9810309           0.9751998
#> 10:              0.9955864           0.9708957
#> 11:              0.9968334           0.9678213

table(analysis_data$outcome_status)
#> 
#>    0    1 
#> 4455  267

6 Baseline characteristics

Baseline characteristics are summarized by incident arrhythmia status. Air pollutants are shown on their original scales, while Cox models use standardized exposure variables.

table1 <- create_baseline_table(
  data = analysis_data,
  case_col = "outcome_status",
  factor_cols = c("sex", "ethnicity", "education", "smoking", "drinking"),
  continuous_cols = c("age", "tdi", main_exposures),
  test = TRUE
)

print(table1, quote = FALSE, noSpaces = TRUE)
#>                         Stratified by outcome_status
#>                          0             1             p     test
#>   n                      4455          267                     
#>   sex = Male (%)         2063 (46.3)   132 (49.4)    0.351     
#>   ethnicity = Others (%) 231 (5.2)     16 (6.0)      0.664     
#>   education (%)                                      0.923     
#>      Low                 716 (16.1)    42 (15.7)               
#>      Medium              1006 (22.6)   58 (21.7)               
#>      High                2733 (61.3)   167 (62.5)              
#>   smoking (%)                                        0.670     
#>      Never               2404 (54.0)   144 (53.9)              
#>      Previous            1573 (35.3)   90 (33.7)               
#>      Current             478 (10.7)    33 (12.4)               
#>   drinking (%)                                       0.477     
#>      Never               208 (4.7)     12 (4.5)                
#>      Previous            154 (3.5)     13 (4.9)                
#>      Current             4093 (91.9)   242 (90.6)              
#>   age (mean (SD))        56.51 (8.04)  56.60 (8.12)  0.861     
#>   tdi (mean (SD))        0.03 (3.03)   0.10 (2.97)   0.712     
#>   no2_main (mean (SD))   29.22 (5.37)  29.06 (4.80)  0.638     
#>   nox (mean (SD))        43.72 (15.58) 42.78 (15.07) 0.337     
#>   pm25 (mean (SD))       9.99 (1.05)   10.00 (0.95)  0.928     
#>   pm10_2007 (mean (SD))  19.32 (1.96)  19.31 (2.01)  0.893

7 Cox regression

Each pollutant is evaluated in a separate Cox proportional hazards model, adjusted for age, sex, Townsend deprivation index, ethnicity, education, smoking, and drinking. Hazard ratios are interpreted per one-standard-deviation increase in exposure.

pollutant_labels <- data.frame(
  variable = main_exposures_z,
  pollutant = c("NO2", "NOx", "PM2.5", "PM10"),
  stringsAsFactors = FALSE
)

cox_results <- runmulti_cox(
  data = analysis_data,
  main_var = main_exposures_z,
  covariates = covariates,
  endpoint = c("outcome_surv_time", "outcome_status")
)

cox_results <- merge(cox_results, pollutant_labels, by = "variable", all.x = TRUE)
cox_results$p_bh <- p.adjust(cox_results$pvalue, method = "BH")
cox_results$p_bonferroni <- p.adjust(cox_results$pvalue, method = "bonferroni")
cox_results[order(cox_results$pvalue), ]
#>      variable         coef         se           z        HR   lower95  upper95
#> 2       nox_z -0.063747275 0.06483359 -0.98324464 0.9382421 0.8262823 1.065372
#> 1  no2_main_z -0.021747364 0.06167636 -0.35260455 0.9784874 0.8670741 1.104217
#> 4      pm25_z  0.003932429 0.06103574  0.06442830 1.0039402 0.8907465 1.131518
#> 3 pm10_2007_z -0.001610040 0.06166102 -0.02611114 0.9983913 0.8847383 1.126644
#>      pvalue    n n_event pollutant      p_bh p_bonferroni
#> 2 0.3254871 4722     267       NOx 0.9791687            1
#> 1 0.7243849 4722     267       NO2 0.9791687            1
#> 4 0.9486292 4722     267     PM2.5 0.9791687            1
#> 3 0.9791687 4722     267      PM10 0.9791687            1

8 Sensitivity analysis 1: exclude early events

To reduce potential reverse causation, early incident arrhythmia events are excluded at 1-, 2-, and 4-year lag windows, and the adjusted Cox models are refitted.

fit_pollution_cox <- function(data, exposures, labels, lag_years = 0) {
  out <- runmulti_cox(
    data = data,
    main_var = exposures,
    covariates = covariates,
    endpoint = c("outcome_surv_time", "outcome_status")
  )
  out$lag_years <- lag_years
  merge(out, labels, by = "variable", all.x = TRUE)
}

lag_results <- lapply(c(0, 1, 2, 4), function(lag) {
  lag_data <- if (lag == 0) {
    analysis_data
  } else {
    sensitivity_exclude_early_events(
      data = analysis_data,
      endpoint = c("outcome_surv_time", "outcome_status"),
      n_years = lag,
      verbose = TRUE
    )
  }
  fit_pollution_cox(lag_data, main_exposures_z, pollutant_labels, lag_years = lag)
})

lag_results <- data.table::rbindlist(lag_results, use.names = TRUE)
lag_results[, p_bh := p.adjust(pvalue, method = "BH"), by = lag_years]
as.data.frame(lag_results[order(pollutant, lag_years)])
#>       variable         coef         se           z        HR   lower95  upper95
#> 1   no2_main_z -0.021747364 0.06167636 -0.35260455 0.9784874 0.8670741 1.104217
#> 2   no2_main_z -0.033544328 0.06384072 -0.52543779 0.9670120 0.8532780 1.095906
#> 3   no2_main_z -0.026291084 0.06506894 -0.40404966 0.9740515 0.8574230 1.106544
#> 4   no2_main_z -0.017692038 0.07011299 -0.25233609 0.9824635 0.8563201 1.127189
#> 5        nox_z -0.063747275 0.06483359 -0.98324464 0.9382421 0.8262823 1.065372
#> 6        nox_z -0.073663585 0.06736998 -1.09341846 0.9289842 0.8140721 1.060117
#> 7        nox_z -0.082574976 0.06922629 -1.19282675 0.9207424 0.8039196 1.054541
#> 8        nox_z -0.075093548 0.07402062 -1.01449499 0.9276567 0.8023813 1.072491
#> 9  pm10_2007_z -0.001610040 0.06166102 -0.02611114 0.9983913 0.8847383 1.126644
#> 10 pm10_2007_z  0.017366439 0.06325203  0.27455938 1.0175181 0.8988804 1.151814
#> 11 pm10_2007_z  0.012027099 0.06463569  0.18607519 1.0120997 0.8916724 1.148792
#> 12 pm10_2007_z  0.007437510 0.06979918  0.10655583 1.0074652 0.8786520 1.155163
#> 13      pm25_z  0.003932429 0.06103574  0.06442830 1.0039402 0.8907465 1.131518
#> 14      pm25_z  0.009610136 0.06286195  0.15287685 1.0096565 0.8926176 1.142041
#> 15      pm25_z -0.001146960 0.06438523 -0.01781403 0.9988537 0.8804345 1.133200
#> 16      pm25_z -0.014898763 0.06980074 -0.21344707 0.9852117 0.8592411 1.129650
#>       pvalue    n n_event lag_years pollutant      p_bh
#> 1  0.7243849 4722     267         0       NO2 0.9791687
#> 2  0.5992789 4706     251         1       NO2 0.8784954
#> 3  0.6861762 4696     241         2       NO2 0.9857872
#> 4  0.8007813 4662     207         4       NO2 0.9151414
#> 5  0.3254871 4722     267         0       NOx 0.9791687
#> 6  0.2742101 4706     251         1       NOx 0.8784954
#> 7  0.2329372 4696     241         2       NOx 0.9317489
#> 8  0.3103466 4662     207         4       NOx 0.9151414
#> 9  0.9791687 4722     267         0      PM10 0.9791687
#> 10 0.7836548 4706     251         1      PM10 0.8784954
#> 11 0.8523858 4696     241         2      PM10 0.9857872
#> 12 0.9151414 4662     207         4      PM10 0.9151414
#> 13 0.9486292 4722     267         0     PM2.5 0.9791687
#> 14 0.8784954 4706     251         1     PM2.5 0.8784954
#> 15 0.9857872 4696     241         2     PM2.5 0.9857872
#> 16 0.8309783 4662     207         4     PM2.5 0.9151414

9 Sensitivity analysis 2: 2010 exposure definition

NO2 and PM10 are redefined using 2010 exposure fields (p24003 and p24005), while NOx and PM2.5 are kept unchanged.

pollutant_labels_2010 <- data.frame(
  variable = sensitivity_exposures_2010_z,
  pollutant = c("NO2_2010", "NOx", "PM2.5", "PM10_2010"),
  stringsAsFactors = FALSE
)

analysis_data_2010 <- sensitivity_exclude_missing_covariates(
  data = demo_prepared[
    !is.na(outcome_status) &
      !is.na(outcome_surv_time) &
      outcome_surv_time > 0
  ],
  covariates = c(sensitivity_exposures_2010_z, covariates),
  stepwise = FALSE,
  verbose = TRUE
)

cox_results_2010 <- fit_pollution_cox(
  data = analysis_data_2010,
  exposures = sensitivity_exposures_2010_z,
  labels = pollutant_labels_2010,
  lag_years = 0
)
cox_results_2010$p_bh <- p.adjust(cox_results_2010$pvalue, method = "BH")
cox_results_2010
#>      variable         coef         se          z        HR   lower95  upper95
#> 1  no2_2010_z -0.090143975 0.06376187 -1.4137599 0.9137996 0.8064487 1.035441
#> 2       nox_z -0.063747275 0.06483359 -0.9832446 0.9382421 0.8262823 1.065372
#> 3 pm10_2010_z  0.023125198 0.06087924  0.3798536 1.0233947 0.9082861 1.153091
#> 4      pm25_z  0.003932429 0.06103574  0.0644283 1.0039402 0.8907465 1.131518
#>      pvalue    n n_event lag_years pollutant      p_bh
#> 1 0.1574324 4722     267         0  NO2_2010 0.6297296
#> 2 0.3254871 4722     267         0       NOx 0.6509741
#> 3 0.7040541 4722     267         0 PM10_2010 0.9387388
#> 4 0.9486292 4722     267         0     PM2.5 0.9486292