For manuscript “Development and validation of predictive models of early immune effector cell-associated hematotoxicity (eIPMs)” by Liang et al., 2024
library(openxlsx)
library(here)
library(janitor)
library(tidyverse)
library(labelled)
library(zoo)
library(latrend)
library(ggplot2)
library(ggnewscale)
library(scales)
library(ggrepel)
library(gtsummary)
library(prodlim)
library(ggsurvfit)
library(Epi)
library(pmsampsize)
library(caret)
library(rms)
library(tidymodels)
library(glmnet)
library(probably)
library(vip)
library(runway)
library(cutpointr)
library(dcurves)
library(survival)
library(patchwork)
theme_set(theme_bw())
theme_gtsummary_compact()
# Clinical data
# Already excludes the patients who died prior to day +14 after CAR T-cell infusion,
# who had more than 7 consecutive days of missing ANC values, or
# who had a second CAR T-cell infusion within 30 days of the first CAR T-cell infusion
df <-
read.xlsx(
here(
"dfs",
"Outputs",
"File for eIPM Supplement.xlsx"
),
detectDates = TRUE
)
# CBC data
df_cbc <-
read.xlsx(here("dfs", "EPIC Data Exports", "2024.05.29.hems.xlsx"),
detectDates = TRUE)
df_cbc <- clean_names(df_cbc)
df_cbc <- df_cbc %>%
filter(cart_num == 1)
# Create data frame with longitudinal ANC values
# If a lab draw occurs at or after 23:00, assign its ANC value to the following day
df_ancx <- df_cbc %>%
filter(upn %in% df$upn) %>%
select(upn, cart_date, date, time, ancx) %>%
mutate(time = as.POSIXct(time, format = "%H:%M:%S"),
new_date = as.Date(ifelse(
time >= as.POSIXct("23:00:00", format = "%H:%M:%S"),
date + days(1),
date
))) %>%
ungroup()
df_ancx <- df_ancx %>%
select(-date, -time) %>%
rename(date = new_date)
df_ancx <- df_ancx %>%
mutate(time_post_inf = as.numeric(date - cart_date)) %>%
filter(time_post_inf %in% 0:30) %>%
distinct(upn, time_post_inf, ancx, .keep_all = TRUE) %>% # Keep unique combinations of upn, time_post_inf, and ancx
group_by(upn, time_post_inf) %>%
slice_min(ancx) %>% # For each combination of upn and time_post_inf, keep the lowest ANC value
ungroup()
# Fill in missing time_post_inf and replicate UPNs by first creating df_extract
df_extract <- df %>%
distinct(upn, cart_date, datelast)
# Create replicate_ids() to replicate UPNs according to duration of follow-up
replicate_ids <- function(upn, cart_date, datelast) {
if (is.na(datelast) || (datelast - cart_date) >= 30) {
return(replicate(31, upn))
} else {
return(replicate(datelast - cart_date + 1, upn)) # Add 1 to account for day +0
}
}
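# Example (hypothetical UPN and dates): a patient last seen 10 days after
# infusion yields 11 replicates (days +0 through +10); follow-up of 30 or
# more days yields the full 31
replicate_ids("UPN001", as.Date("2024-01-01"), as.Date("2024-01-11"))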
# Create df_id with replicated UPNs
df_id <- df_extract %>%
dplyr::rowwise() %>%
do(data.frame(upn = replicate_ids(.$upn, .$cart_date, .$datelast))) %>%
group_by(upn) %>%
mutate(time_post_inf = row_number() - 1) %>%
ungroup()
# Merge df_id with df
df_ancx <- df_ancx %>%
group_by(upn) %>%
merge(.,
df_id,
by = c("upn", "time_post_inf"),
all.y = TRUE) %>%
ungroup()
# Fill in cart_date and date columns
df_ancx <- df_ancx %>%
group_by(upn) %>%
mutate(cart_date = as.Date(ifelse(
is.na(cart_date), max(cart_date, na.rm = TRUE), cart_date
)),
date = as.Date(cart_date + time_post_inf)) %>%
ungroup()
# Linearly interpolate missing ANC values for up to 7 consecutive NAs
df_ancx <- df_ancx %>%
arrange(upn, time_post_inf) %>%
group_by(upn) %>%
mutate(ancx = na.approx(ancx, maxgap = 7, rule = 2)) %>%
ungroup()
# Round ANC values to nearest integer divisible by 10 (like real ANC values)
df_ancx <- df_ancx %>%
mutate(ancx = round(ancx/10) * 10)
# Add UWIDs
df_ancx <- df_ancx %>%
inner_join(., df %>% select(upn, uwid), by = "upn") %>%
select(upn, uwid, cart_date, date, time_post_inf, ancx)
# Log10-transform ANC and select variables for clustering
df_anc <- df_ancx %>%
mutate(anc = ifelse(ancx == 0, 0.01, ancx),
anc = log10(anc)) %>%
select(upn, time_post_inf, anc)
var_label(df_anc$time_post_inf) <- "Days after CAR T-cell infusion"
var_label(df_anc$anc) <- "ANC"
# For eICAHT threshold of ANC ≤ 500 cells/μL
df_exceedances_below_501 <-
# ddply splits a data frame, applies a function, and combines results into a data frame
plyr::ddply(.data = df_ancx, .variables = c("upn", "uwid"), .fun = function(data) {
heatwaveR::exceedance(
data,
x = date,
y = ancx,
threshold = 501,
below = TRUE, # Whether to detect events below the threshold
minDuration = 1, # Minimum duration for acceptance of detected events
joinAcrossGaps = TRUE, # Whether to join events which occur before/after gap
maxGap = 2 # Maximum length of gap allowed for joining of events
)$exceedance
})
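# Toy illustration (synthetic values, not study data): two dips below the
# threshold separated by a single day above it are joined into one 5-day
# event because maxGap = 2
toy <- data.frame(date = as.Date("2024-01-01") + 0:6,
ancx = c(800, 400, 300, 600, 200, 100, 900))
heatwaveR::exceedance(toy, x = date, y = ancx, threshold = 501, below = TRUE,
minDuration = 1, joinAcrossGaps = TRUE, maxGap = 2)$exceedance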
# Create data frame containing maximum duration of events for each UPN
df_anc_500 <- df_exceedances_below_501 %>%
select(upn, uwid, exceedance_no, duration) %>%
group_by(upn, uwid) %>%
dplyr::summarize(duration_below_501_max = max(duration)) %>%
mutate(duration_below_501_max = ifelse( # Replace NAs with 0 (never below threshold)
is.na(duration_below_501_max),
0,
duration_below_501_max
))
# For eICAHT threshold of ANC ≤ 100 cells/μL
df_exceedances_below_101 <-
plyr::ddply(.data = df_ancx, .variables = c("upn"), .fun = function(data) {
heatwaveR::exceedance(
data,
x = date,
y = ancx,
threshold = 101,
below = TRUE,
minDuration = 1,
joinAcrossGaps = TRUE,
maxGap = 2,
maxPadLength = FALSE
)$exceedance
})
df_anc_100 <- df_exceedances_below_101 %>%
select(upn, exceedance_no, duration) %>%
group_by(upn) %>%
dplyr::summarize(duration_below_101_max = max(duration)) %>%
mutate(duration_below_101_max = ifelse(
is.na(duration_below_101_max),
0,
duration_below_101_max
))
# Create data frame with exceedances below each threshold for each UPN
df_icaht <- df_anc_500 %>%
left_join(., df_anc_100, by = "upn") %>%
ungroup()
# Create column with eICAHT grades
df_icaht <- df_icaht %>%
mutate(
icaht_grade = case_when(
duration_below_501_max == 0 & duration_below_101_max == 0 ~ "Grade 0",
duration_below_501_max %in% 1:6 & duration_below_101_max < 7 ~ "Grade 1",
duration_below_501_max %in% 7:13 & duration_below_101_max < 7 ~ "Grade 2",
(duration_below_501_max %in% 14:30 & duration_below_101_max < 7) |
(duration_below_501_max < 31 & duration_below_101_max %in% 7:13) ~ "Grade 3",
(duration_below_501_max >= 31 & duration_below_101_max < 14) | duration_below_101_max >= 14 ~ "Grade 4"
)
)
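# Sanity check of the grading logic on hypothetical duration pairs:
# (0, 0) -> Grade 0; (5, 0) -> Grade 1; (10, 5) -> Grade 2;
# (20, 8) -> Grade 3; (35, 0) and (10, 15) -> Grade 4
tibble(duration_below_501_max = c(0, 5, 10, 20, 35, 10),
duration_below_101_max = c(0, 0, 5, 8, 0, 15)) %>%
mutate(icaht_grade = case_when(
duration_below_501_max == 0 & duration_below_101_max == 0 ~ "Grade 0",
duration_below_501_max %in% 1:6 & duration_below_101_max < 7 ~ "Grade 1",
duration_below_501_max %in% 7:13 & duration_below_101_max < 7 ~ "Grade 2",
(duration_below_501_max %in% 14:30 & duration_below_101_max < 7) |
(duration_below_501_max < 31 & duration_below_101_max %in% 7:13) ~ "Grade 3",
(duration_below_501_max >= 31 & duration_below_101_max < 14) | duration_below_101_max >= 14 ~ "Grade 4"
))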
df_exceedances_below_501 <- df_exceedances_below_501 %>%
filter(!is.na(exceedance_no)) %>% # Filter out patients who never had neutropenia
select(upn,
exceedance_below_501_no = exceedance_no,
exceedance_below_501_date_start = date_start,
exceedance_below_501_date_end = date_end) %>%
mutate(exceedance_below_501_date_start = as.Date(exceedance_below_501_date_start), # Convert dates to class "Date"
exceedance_below_501_date_end = as.Date(exceedance_below_501_date_end))
# Add date of last available ANC value
df_last_time_post_inf <- df_ancx %>%
arrange(upn, time_post_inf) %>%
group_by(upn) %>%
slice_tail(n = 1) %>% # Select last row
ungroup() %>%
select(upn,
cart_date,
last_lab_date = date)
df_exceedances_below_501 <- df_exceedances_below_501 %>%
left_join(., df_last_time_post_inf, by = "upn")
# Classify patients who had exceedances below ANC ≤ 500 cells/μL
# starting between days 0-3 and ending on "last_lab_date" as grade 4 eICAHT
df_exceedances_below_501 <- df_exceedances_below_501 %>%
mutate(
icaht_grade_4 = ifelse(
exceedance_below_501_date_start - cart_date <= 3 &
exceedance_below_501_date_end == last_lab_date,
"Yes",
"No"
)
)
df_icaht <- df_icaht %>%
left_join(.,
df_exceedances_below_501 %>% select(upn, icaht_grade_4),
by = "upn")
df_icaht <- df_icaht %>%
mutate(
icaht_grade = dplyr::if_else(
icaht_grade_4 == "Yes",
"Grade 4",
icaht_grade,
missing = icaht_grade
)
) %>%
distinct(upn,
icaht_grade,
.keep_all = TRUE)
# Data frame contains automated grades assigned by {heatwaveR} approach, MSKCC manual code, and manual grades for discrepancies
df_check <-
read.xlsx(here("dfs",
"Outputs",
"Manual vs. Automated Early ICAHT Grades.xlsx")) %>%
clean_names()
# Filter to cases incorrectly graded by {heatwaveR} approach
df_check <- df_check %>%
filter(center == "FHCC",
disagreement_manual_heatwave_r == 1) %>%
select(upn = patient_id,
manual_icaht_grade) %>%
mutate(manual_icaht_grade = paste0("Grade ", manual_icaht_grade))
# Replace grades in df_icaht with correct grades
df_icaht <- df_icaht %>%
left_join(., df_check, by = "upn")
df_icaht <- df_icaht %>%
mutate(icaht_grade = ifelse(!is.na(manual_icaht_grade), manual_icaht_grade, icaht_grade)) %>%
select(-icaht_grade_4, -manual_icaht_grade)
# Add eICAHT grades to df
df <- df %>%
filter(upn %in% df_icaht$upn) %>%
left_join(., df_icaht %>% select(upn, icaht_grade), by = "upn")
df <- df %>%
mutate(
icaht_grade_num = case_when(
icaht_grade == "Grade 0" ~ 0,
icaht_grade == "Grade 1" ~ 1,
icaht_grade == "Grade 2" ~ 2,
icaht_grade == "Grade 3" ~ 3,
icaht_grade == "Grade 4" ~ 4
),
icaht_grade = fct_relevel(icaht_grade, "Grade 0"),
icaht_grade_3_4 = ifelse(icaht_grade_num %in% 3:4, 1, 0)
)
# Use {latrend} to create lcModel object based on reference assignments using lcModelPartition()
options(latrend.id = "upn", latrend.time = "time_post_inf")
df_anc <- df_anc %>%
left_join(., df %>% select(upn, icaht_grade_num), by = "upn") %>%
mutate(icaht_grade_latrend = icaht_grade_num + 1) # latrend requires integers ≥1
icaht_traj_assigns <- aggregate(icaht_grade_latrend ~ upn,
data = df_anc,
FUN = data.table::first)
icaht_model <-
lcModelPartition(
data = df_anc,
response = "anc",
trajectoryAssignments = icaht_traj_assigns$icaht_grade_latrend
)
clusterNames(icaht_model) <-
c("Grade 0", "Grade 1", "Grade 2", "Grade 3", "Grade 4")
icaht_model
Longitudinal cluster model using part
lcMethod specifying "undefined"
no arguments
Cluster sizes (K=5):
Grade 0 Grade 1 Grade 2 Grade 3 Grade 4
112 (16.2%) 366 (53%) 111 (16.1%) 67 (9.7%) 35 (5.1%)
Number of obs: 21378, strata (upn): 691
Scaled residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-6.16331 -0.23806 0.04851 0.00000 0.37705 4.65705
# Plot individual ANC trajectories (black lines) and cluster trajectories (colored lines)
df_icaht_traj <- trajectories(icaht_model) # Extract individual trajectories and cluster assignments
df_icaht_cluster_traj <- clusterTrajectories(icaht_model) # Extract cluster trajectories
df_hline <- data.frame(threshold = factor(c("≤100 cells/μL", "≤500 cells/μL")), y = c(100, 500)) # df to plot early ICAHT thresholds
# Create new labels including cluster percentages
cluster_percentages <- prop.table(table(df_icaht_traj$Cluster))
cluster_labels <- sprintf("%s (%d%%)",
names(cluster_percentages),
round(cluster_percentages * 100))
new_labels <- as_labeller(
c(
"Grade 0" = cluster_labels[1],
"Grade 1" = cluster_labels[2],
"Grade 2" = cluster_labels[3],
"Grade 3" = cluster_labels[4],
"Grade 4" = cluster_labels[5]
)
)
ggplot() +
geom_line(
data = df_icaht_traj,
aes(x = time_post_inf, y = anc, group = upn),
color = "black",
alpha = 0.5
) +
geom_line(data = df_icaht_cluster_traj,
aes(x = time_post_inf, y = anc, color = Cluster),
linewidth = 2,
show.legend = FALSE) +
new_scale_color() +
geom_hline(
data = df_hline,
aes(
yintercept = log10(y),
linetype = threshold,
colour = threshold
),
linewidth = 1
) +
scale_colour_manual(
values = c("red", "blue"),
labels = c("≤100 cells/μL", "≤500 cells/μL"),
name = "Early ICAHT Threshold"
) +
scale_linetype_manual(
values = c("dashed", "dashed"),
labels = c("≤100 cells/μL", "≤500 cells/μL"),
name = "Early ICAHT Threshold"
) +
facet_wrap(~ Cluster,
labeller = new_labels) +
scale_x_continuous(name = "Days after CAR T-cell infusion") +
scale_y_continuous(name = "ANC (log10(cells/μL))") +
theme(strip.text.x = element_text(size = 12),
legend.position = c(0.85, 0.35))
df <- df %>%
mutate(
ethnih_cat = ifelse(
ethnih == "Hispanic or Latino",
"Hispanic or Latino",
"Not Hispanic/Latino or Unknown"
),
ethnih_cat = fct_relevel(as.factor(ethnih_cat), "Not Hispanic/Latino or Unknown"),
raceethnih_cat = case_when(
racenih == "White" ~ "White",
racenih == "Black or African American" &
ethnih == "Not Hispanic or Latino" ~ "Non-Hispanic Black or African American",
.default = "Other"
),
raceethnih_cat = fct_relevel(as.factor(raceethnih_cat), "White"),
disease_cat =
factor(
disease_cat,
levels = c(
"Aggressive NHL",
"Other indolent NHL",
"ALL",
"Mantle cell lymphoma",
"MM/PCL"
)
),
LD_regimen_cat = factor(
LD_regimen_cat,
levels = c("Low-intensity Cy/Flu", "High-intensity Cy/Flu", "Other")
),
product_cat = as.factor(product_cat),
car_target = case_when(
product_cat == "Commercial CD19-targeted CAR T-cell product with CD28 costimulatory domain" |
product_cat == "Commercial CD19-targeted CAR T-cell product with 4-1BB costimulatory domain" |
product_infused == "JCAR014" |
product_infused == "JCAR021" ~ "CD19",
product_infused == "Investigational CD20-targeted product" ~ "CD20",
.default = "BCMA"
),
car_target = factor(car_target, levels = c("BCMA", "CD19", "CD20"))
)
var_label(df$cart_age) <- "Age"
var_label(df$gender_desc) <- "Sex"
var_label(df$racenih) <- "Race"
var_label(df$ethnih) <- "Ethnicity"
var_label(df$ethnih_cat) <- "Ethnicity"
var_label(df$raceethnih_cat) <- "Race/ethnicity"
var_label(df$disease_cat) <- "Disease"
var_label(df$prior_hct) <- "Prior HCT"
var_label(df$bm_disease_by_morph) <- "Bone marrow involvement by morphology (%)"
var_label(df$bm_disease_by_morph_log10) <- "Bone marrow involvement by morphology (log10(%))"
var_label(df$bm_disease_by_flow) <- "Bone marrow involvement by flow cytometry (%)"
var_label(df$bm_disease_by_flow_log10) <- "Bone marrow involvement by flow cytometry (log10(%))"
var_label(df$bridging_yn) <- "Bridging therapy"
var_label(df$LD_regimen_cat) <- "Lymphodepletion regimen"
var_label(df$LD_regimen_low_CyFlu_vs_other) <- "Lymphodepletion regimen (low-dose Cy/Flu vs. other)"
var_label(df$product_infused) <- "CAR T-cell product"
var_label(df$product_cat) <- "CAR T-cell product type"
var_label(df$car_target) <- "CAR target"
var_label(df$costim_domain) <- "Costimulatory domain"
var_label(df$cart_dose_total) <- "Total CAR T-cell dose (x 10^6 cells)"
var_label(df$cart_dose_total_log10) <- "Total CAR T-cell dose (log10(x 10^6 cells))"
var_label(df$anc_pre_ld) <- "Pre-LD ANC (x 10^3 cells/µL)"
var_label(df$hb_pre_ld) <- "Pre-LD Hb (g/dL)"
var_label(df$plt_pre_ld) <- "Pre-LD platelet count (x 10^3 cells/µL)"
var_label(df$anc_pre_ld_log10) <- "Pre-LD ANC (log10(x 10^3 cells/µL))"
var_label(df$anc_day_3_log10) <- "Day +3 ANC (log10(x 10^3 cells/µL))"
var_label(df$lym_pre_ld_log10) <- "Pre-LD ALC (log10(x 10^3 cells/µL))"
var_label(df$plt_pre_ld_log10) <- "Pre-LD platelet count (log10(x 10^3 /µL))"
var_label(df$plt_day_3_log10) <- "Day +3 platelet count (log10(x 10^3 /µL))"
var_label(df$ldh_pre_ld_log10) <- "Pre-LD LDH (log10(U/L))"
var_label(df$crp_pre_ld_log10) <- "Pre-LD CRP (log10(mg/L))"
var_label(df$crp_day_0_log10) <- "Day +0 CRP (log10(mg/L))"
var_label(df$crp_day_3_log10) <- "Day +3 CRP (log10(mg/L))"
var_label(df$crp_day_5_log10) <- "Day +5 CRP (log10(mg/L))"
var_label(df$crp_day_7_log10) <- "Day +7 CRP (log10(mg/L))"
var_label(df$crp_max_log10) <- "Peak CRP (log10(mg/L))"
var_label(df$ferritin_pre_ld_log10) <- "Pre-LD ferritin (log10(ng/mL))"
var_label(df$ferritin_day_0_log10) <- "Day +0 ferritin (log10(ng/mL))"
var_label(df$ferritin_day_3_log10) <- "Day +3 ferritin (log10(ng/mL))"
var_label(df$ferritin_day_5_log10) <- "Day +5 ferritin (log10(ng/mL))"
var_label(df$ferritin_day_7_log10) <- "Day +7 ferritin (log10(ng/mL))"
var_label(df$ferritin_max_log10) <- "Peak ferritin (log10(ng/mL))"
var_label(df$il6_pre_ld_log10) <- "Pre-LD IL-6 (log10(pg/mL))"
var_label(df$il6_day_0_log10) <- "Day +0 IL-6 (log10(pg/mL))"
var_label(df$il6_day_3_log10) <- "Day +3 IL-6 (log10(pg/mL))"
var_label(df$il6_day_5_log10) <- "Day +5 IL-6 (log10(pg/mL))"
var_label(df$il6_day_7_log10) <- "Day +7 IL-6 (log10(pg/mL))"
var_label(df$il6_max_log10) <- "Peak IL-6 (log10(pg/mL))"
var_label(df$fibrinogen_pre_ld_log10) <- "Pre-LD fibrinogen (log10(mg/dL))"
var_label(df$fibrinogen_day_0_log10) <- "Day +0 fibrinogen (log10(mg/dL))"
var_label(df$fibrinogen_day_3_log10) <- "Day +3 fibrinogen (log10(mg/dL))"
var_label(df$fibrinogen_day_5_log10) <- "Day +5 fibrinogen (log10(mg/dL))"
var_label(df$fibrinogen_day_7_log10) <- "Day +7 fibrinogen (log10(mg/dL))"
var_label(df$fibrinogen_min_log10) <- "Nadir fibrinogen (log10(mg/dL))"
var_label(df$d_dimer_pre_ld_log10) <- "Pre-LD D-dimer (log10(mg/L FEU))"
var_label(df$d_dimer_day_0_log10) <- "Day +0 D-dimer (log10(mg/L FEU))"
var_label(df$d_dimer_day_3_log10) <- "Day +3 D-dimer (log10(mg/L FEU))"
var_label(df$d_dimer_day_5_log10) <- "Day +5 D-dimer (log10(mg/L FEU))"
var_label(df$d_dimer_day_7_log10) <- "Day +7 D-dimer (log10(mg/L FEU))"
var_label(df$d_dimer_max_log10) <- "Peak D-dimer (log10(mg/L FEU))"
var_label(df$ptt_pre_ld_log10) <- "Pre-LD PTT (log10(seconds))"
var_label(df$ptt_day_0_log10) <- "Day +0 PTT (log10(seconds))"
var_label(df$ptt_day_3_log10) <- "Day +3 PTT (log10(seconds))"
var_label(df$ptt_day_5_log10) <- "Day +5 PTT (log10(seconds))"
var_label(df$ptt_day_7_log10) <- "Day +7 PTT (log10(seconds))"
var_label(df$ptt_max_log10) <- "Peak PTT (log10(seconds))"
var_label(df$icaht_grade) <- "eICAHT grade"
var_label(df$icaht_grade_3_4) <- "Grade 3-4 eICAHT"
var_label(df$gcsf) <- "Received G-CSF"
var_label(df$tpo_ra) <- "Received TPO-RA"
var_label(df$crs_grade) <- "CRS grade"
var_label(df$nt_grade) <- "Neurotoxicity grade"
var_label(df$iec_hs) <- "IEC-HS"
df %>%
select(
cart_age,
gender_desc,
racenih,
ethnih,
prior_hct,
disease_cat,
bm_disease_by_morph,
bm_disease_by_flow,
bridging_yn,
LD_regimen_cat,
cart_dose_total,
product_infused,
product_cat,
anc_pre_ld,
hb_pre_ld,
plt_pre_ld
) %>%
tbl_summary(
missing_text = "Missing",
type = list(all_continuous() ~ "continuous2"),
statistic = list(
all_continuous() ~ c("{median} ({p25}, {p75})", "{min} - {max}"),
all_categorical() ~ "{n} ({p}%)"
),
digits = all_categorical() ~ 0,
sort = list(everything() ~ "frequency")
) %>%
bold_labels()
Characteristic | N = 691¹ |
---|---|
Age | |
Median (IQR) | 61 (51, 68) |
Range | 19 - 84 |
Sex | |
Male | 437 (63%) |
Female | 254 (37%) |
Race | |
White | 583 (84%) |
Asian | 58 (8%) |
Black or African American | 18 (3%) |
American Indian or Alaska Native | 13 (2%) |
Multiple | 8 (1%) |
Unknown | 7 (1%) |
Native Hawaiian or other Pacific Islander | 4 (1%) |
Ethnicity | |
Not Hispanic or Latino | 629 (91%) |
Hispanic or Latino | 52 (8%) |
Unknown | 10 (1%) |
Prior HCT | 234 (34%) |
Disease | |
Aggressive NHL | 318 (46%) |
MM/PCL | 120 (17%) |
Other indolent NHL | 117 (17%) |
ALL | 99 (14%) |
Mantle cell lymphoma | 37 (5%) |
Bone marrow involvement by morphology (%) | |
Median (IQR) | 0 (0, 30) |
Range | 0 - 100 |
Missing | 267 |
Bone marrow involvement by flow cytometry (%) | |
Median (IQR) | 0 (0, 19) |
Range | 0 - 99 |
Missing | 252 |
Bridging therapy | 422 (61%) |
Lymphodepletion regimen | |
Low-intensity Cy/Flu | 403 (58%) |
High-intensity Cy/Flu | 253 (37%) |
Other | 35 (5%) |
Total CAR T-cell dose (x 10^6 cells) | |
Median (IQR) | 170 (111, 402) |
Range | 5 - 43,692 |
CAR T-cell product | |
JCAR014 | 193 (28%) |
YESCARTA | 140 (20%) |
BREYANZI | 87 (13%) |
CARVYKTI | 52 (8%) |
TECARTUS | 47 (7%) |
Investigational CD20-targeted product | 45 (7%) |
JCAR021 | 44 (6%) |
Investigational BCMA-targeted product | 38 (5%) |
ABECMA | 30 (4%) |
KYMRIAH | 15 (2%) |
CAR T-cell product type | |
Investigational CAR T-cell product with 4-1BB costimulatory domain | 320 (46%) |
Commercial CD19-targeted CAR T-cell product with CD28 costimulatory domain | 187 (27%) |
Commercial CD19-targeted CAR T-cell product with 4-1BB costimulatory domain | 102 (15%) |
Commercial BCMA-targeted CAR T-cell product | 82 (12%) |
Pre-LD ANC (x 10^3 cells/µL) | |
Median (IQR) | 2.84 (1.79, 4.29) |
Range | 0.00 - 54.03 |
Missing | 1 |
Pre-LD Hb (g/dL) | |
Median (IQR) | 10.70 (9.50, 12.35) |
Range | 7.00 - 17.10 |
Pre-LD platelet count (x 10^3 cells/µL) | |
Median (IQR) | 142 (89, 202) |
Range | 6 - 790 |
¹ n (%) |
df %>%
select(disease_cat, bridging_yn) %>%
tbl_summary(by = disease_cat) %>%
bold_labels()
Characteristic | Aggressive NHL, N = 318¹ | Other indolent NHL, N = 117¹ | ALL, N = 99¹ | Mantle cell lymphoma, N = 37¹ | MM/PCL, N = 120¹ |
---|---|---|---|---|---|
Bridging therapy | 153 (48%) | 69 (59%) | 59 (60%) | 33 (89%) | 108 (90%) |
¹ n (%) |
df_LD <-
read.xlsx(
here(
"dfs",
"EPIC Data Exports",
"2024.05.29.lymphodepletionRegimen.xlsx"
),
detectDates = TRUE
) %>%
clean_names()
df_LD <- df_LD %>% filter(cart_num == 1)
df_LD <- df_LD %>%
mutate(LD_regimen = case_when(
grepl("cyclophosphamide", dep_regimen, ignore.case = TRUE) &
grepl("fludarabine", dep_regimen, ignore.case = TRUE) ~ "Cy/Flu",
.default = "Other"
))
df_LD %>%
filter(
upn %in% df$upn,
LD_regimen == "Other") %>%
select(dep_regimen) %>%
tbl_summary()
Characteristic | N = 35¹ |
---|---|
dep_regimen | |
Bendamustine | 4 (11%) |
Bendamustine; Bendamustine | 2 (5.7%) |
Cyclophosphamide | 10 (29%) |
Cyclophosphamide + Etoposide + Dex; Etoposide | 1 (2.9%) |
Cyclophosphamide + Etoposide; Etoposide | 6 (17%) |
Cyclophosphamide + Mesna | 9 (26%) |
Cyclophosphamide + Mesna + Etoposide; Etoposide; Neulasta | 1 (2.9%) |
Fludarabine | 1 (2.9%) |
Gemcitabine; Start Growth factor; Cyclophosphamide + Mesna | 1 (2.9%) |
¹ n (%) |
df %>%
select(icaht_grade) %>%
tbl_summary(
statistic = list(all_categorical() ~ "{n} ({p}%)"),
digits = all_categorical() ~ 0,
sort = list(everything() ~ "frequency")
) %>%
bold_labels()
Characteristic | N = 691¹ |
---|---|
eICAHT grade | |
Grade 1 | 366 (53%) |
Grade 0 | 112 (16%) |
Grade 2 | 111 (16%) |
Grade 3 | 67 (10%) |
Grade 4 | 35 (5%) |
¹ n (%) |
df %>%
select(
icaht_grade_3_4,
disease_cat
) %>%
tbl_summary(
by = disease_cat) %>%
bold_labels()
Characteristic | Aggressive NHL, N = 318¹ | Other indolent NHL, N = 117¹ | ALL, N = 99¹ | Mantle cell lymphoma, N = 37¹ | MM/PCL, N = 120¹ |
---|---|---|---|---|---|
Grade 3-4 eICAHT | 35 (11%) | 15 (13%) | 36 (36%) | 4 (11%) | 12 (10%) |
¹ n (%) |
df %>%
select(crs_grade, nt_grade) %>%
tbl_summary(statistic = list(all_categorical() ~ "{n} ({p}%)"))
Characteristic | N = 691¹ |
---|---|
CRS grade | |
0 | 175 (25%) |
1 | 224 (32%) |
2 | 257 (37%) |
3 | 23 (3.3%) |
4 | 10 (1.4%) |
5 | 2 (0.3%) |
Neurotoxicity grade | |
0 | 411 (59%) |
1 | 72 (10%) |
2 | 84 (12%) |
3 | 103 (15%) |
4 | 19 (2.7%) |
5 | 2 (0.3%) |
¹ n (%) |
df %>%
select(gcsf, tpo_ra) %>%
tbl_summary() %>%
bold_labels()
Characteristic | N = 691¹ |
---|---|
Received G-CSF | 584 (85%) |
Received TPO-RA | 2 (0.3%) |
¹ n (%) |
df <- df %>%
mutate(
OS_status = ifelse(is.na(deathdat), 0, 1),
OS_months = as.duration(cart_date %--% datelast) / dmonths(1)
)
# Create data frame with OS landmarked at day +30
df_landmark_OS <- df %>%
filter(OS_months > 30/30.4167) %>%
mutate(OS_months_landmark = OS_months - 30/30.4167)
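# Sketch (assumed, not part of the manuscript output): the landmarked OS could
# be visualized with {ggsurvfit}, loaded above
survfit2(Surv(OS_months_landmark, OS_status) ~ 1, data = df_landmark_OS) %>%
ggsurvfit() +
labs(x = "Months after day +30 landmark", y = "Overall survival probability")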
quantile(prodlim(Hist(OS_months, OS_status) ~ 1, data = df, reverse = TRUE))
Quantiles of the potential follow up time distribution based on the Kaplan-Meier method
applied to the censored times reversing the roles of event status and censored.
Table of quantiles and corresponding confidence limits:
q quantile lower upper
<num> <num> <num> <num>
1: 0.00 NA NA NA
2: 0.25 62.98 58.28 71
3: 0.50 32.92 26.61 37
4: 0.75 12.25 11.40 14
5: 1.00 0.92 0.92 1
Median with interquartile range (IQR):
Median (IQR)
<char>
1: 32.92 (12.25;62.98)
surv_OS <- survfit(Surv(OS_months, OS_status) ~ 1, data = df)
surv_OS
Call: survfit(formula = Surv(OS_months, OS_status) ~ 1, data = df)
n events median 0.95LCL 0.95UCL
[1,] 691 307 34.6 27.1 42.6
# 3-year estimate
summary(surv_OS, times = 36)
Call: survfit(formula = Surv(OS_months, OS_status) ~ 1, data = df)
time n.risk n.event survival std.err lower 95% CI upper 95% CI
36 157 274 0.485 0.0231 0.441 0.532
Univariable logistic regression uses log10-transformed laboratory values, except pre-LD hemoglobin, which is kept on the original scale due to lack of convergence with log10(pre-LD hemoglobin)
tbl_uvregression <- df %>%
select(
icaht_grade_3_4,
cart_age,
gender_desc,
raceethnih_cat,
prior_hct,
disease_cat,
bm_disease_by_morph_log10,
bm_disease_by_flow_log10,
bridging_yn,
LD_regimen_cat,
LD_regimen_low_CyFlu_vs_other,
cart_dose_total_log10,
car_target,
costim_domain,
crs_grade,
nt_grade,
iec_hs,
anc_pre_ld_log10,
anc_day_3_log10,
hb_pre_ld,
plt_pre_ld_log10,
plt_day_3_log10,
ldh_pre_ld_log10,
crp_pre_ld_log10,
crp_day_0_log10,
crp_day_3_log10,
crp_day_5_log10,
crp_day_7_log10,
crp_max_log10,
ferritin_pre_ld_log10,
ferritin_day_0_log10,
ferritin_day_3_log10,
ferritin_day_5_log10,
ferritin_day_7_log10,
ferritin_max_log10,
il6_pre_ld_log10,
il6_day_0_log10,
il6_day_3_log10,
il6_day_5_log10,
il6_day_7_log10,
il6_max_log10,
fibrinogen_pre_ld_log10,
fibrinogen_day_0_log10,
fibrinogen_day_3_log10,
fibrinogen_day_5_log10,
fibrinogen_day_7_log10,
fibrinogen_min_log10,
d_dimer_pre_ld_log10,
d_dimer_day_0_log10,
d_dimer_day_3_log10,
d_dimer_day_5_log10,
d_dimer_day_7_log10,
d_dimer_max_log10,
ptt_pre_ld_log10,
ptt_day_0_log10,
ptt_day_3_log10,
ptt_day_5_log10,
ptt_day_7_log10,
ptt_max_log10
) %>%
tbl_uvregression(
method = glm,
y = icaht_grade_3_4,
method.args = list(family = binomial),
exponentiate = TRUE,
pvalue_fun = ~ style_pvalue(.x, digits = 2)
) %>%
add_nevent() %>%
bold_p() %>%
bold_labels()
tbl_uvregression
Characteristic | N | Event N | OR¹ | 95% CI¹ | p-value |
---|---|---|---|---|---|
Age | 691 | 102 | 0.97 | 0.96, 0.98 | <0.001 |
Sex | 691 | 102 | |||
Female | — | — | |||
Male | 0.84 | 0.55, 1.30 | 0.44 | ||
Race/ethnicity | 691 | 102 | |||
White | — | — | |||
Non-Hispanic Black or African American | 1.19 | 0.27, 3.73 | 0.79 | ||
Other | 0.69 | 0.32, 1.31 | 0.29 | ||
Prior HCT | 691 | 102 | |||
No | — | — | |||
Yes | 1.25 | 0.80, 1.92 | 0.31 | ||
Disease | 691 | 102 | |||
Aggressive NHL | — | — | |||
Other indolent NHL | 1.19 | 0.61, 2.23 | 0.60 | ||
ALL | 4.62 | 2.70, 7.95 | <0.001 | ||
Mantle cell lymphoma | 0.98 | 0.28, 2.65 | 0.97 | ||
MM/PCL | 0.90 | 0.43, 1.75 | 0.76 | ||
Bone marrow involvement by morphology (log10(%)) | 424 | 65 | 1.43 | 1.22, 1.68 | <0.001 |
Bone marrow involvement by flow cytometry (log10(%)) | 439 | 69 | 1.52 | 1.29, 1.81 | <0.001 |
Bridging therapy | 691 | 102 | |||
No | — | — | |||
Yes | 2.05 | 1.29, 3.35 | 0.003 | ||
Lymphodepletion regimen | 691 | 102 | |||
Low-intensity Cy/Flu | — | — | |||
High-intensity Cy/Flu | 1.45 | 0.93, 2.25 | 0.10 | ||
Other | 2.44 | 1.03, 5.35 | 0.031 | ||
Lymphodepletion regimen (low-dose Cy/Flu vs. other) | 691 | 102 | |||
Low-intensity Cy/Flu | — | — | |||
Other | 1.56 | 1.02, 2.38 | 0.040 | ||
Total CAR T-cell dose (log10(x 10^6 cells)) | 691 | 102 | 0.42 | 0.29, 0.60 | <0.001 |
CAR target | 691 | 102 | |||
BCMA | — | — | |||
CD19 | 1.78 | 0.98, 3.54 | 0.076 | ||
CD20 | 0.64 | 0.14, 2.15 | 0.51 | ||
Costimulatory domain | 691 | 102 | |||
4-1BB | — | — | |||
CD28 | 1.02 | 0.63, 1.62 | 0.93 | ||
Dual 4-1BB and CD28 | 0.40 | 0.09, 1.13 | 0.13 | ||
CRS grade | 691 | 102 | 2.01 | 1.60, 2.56 | <0.001 |
Neurotoxicity grade | 691 | 102 | 1.51 | 1.29, 1.76 | <0.001 |
IEC-HS | 691 | 102 | |||
No | — | — | |||
Yes | 3.12 | 1.41, 6.51 | 0.003 | ||
Pre-LD ANC (log10(x 10^3 cells/µL)) | 690 | 102 | 0.15 | 0.08, 0.24 | <0.001 |
Day +3 ANC (log10(x 10^3 cells/µL)) | 691 | 102 | 0.53 | 0.47, 0.61 | <0.001 |
Pre-LD Hb (g/dL) | 691 | 102 | 0.61 | 0.52, 0.70 | <0.001 |
Pre-LD platelet count (log10(x 10^3 /µL)) | 691 | 102 | 0.06 | 0.03, 0.11 | <0.001 |
Day +3 platelet count (log10(x 10^3 /µL)) | 691 | 102 | 0.06 | 0.03, 0.11 | <0.001 |
Pre-LD LDH (log10(U/L)) | 677 | 101 | 9.95 | 4.72, 21.5 | <0.001 |
Pre-LD CRP (log10(mg/L)) | 439 | 58 | 3.44 | 2.24, 5.43 | <0.001 |
Day +0 CRP (log10(mg/L)) | 644 | 85 | 3.24 | 2.21, 4.84 | <0.001 |
Day +3 CRP (log10(mg/L)) | 643 | 91 | 3.54 | 2.30, 5.65 | <0.001 |
Day +5 CRP (log10(mg/L)) | 444 | 81 | 3.36 | 2.10, 5.57 | <0.001 |
Day +7 CRP (log10(mg/L)) | 549 | 77 | 3.73 | 2.36, 6.08 | <0.001 |
Peak CRP (log10(mg/L)) | 683 | 98 | 16.7 | 7.91, 38.4 | <0.001 |
Pre-LD ferritin (log10(ng/mL)) | 486 | 69 | 5.57 | 3.42, 9.51 | <0.001 |
Day +0 ferritin (log10(ng/mL)) | 658 | 90 | 9.87 | 5.82, 17.5 | <0.001 |
Day +3 ferritin (log10(ng/mL)) | 649 | 94 | 9.24 | 5.65, 15.8 | <0.001 |
Day +5 ferritin (log10(ng/mL)) | 445 | 80 | 6.74 | 4.17, 11.5 | <0.001 |
Day +7 ferritin (log10(ng/mL)) | 555 | 81 | 5.49 | 3.65, 8.57 | <0.001 |
Peak ferritin (log10(ng/mL)) | 691 | 102 | 6.90 | 4.80, 10.2 | <0.001 |
Pre-LD IL-6 (log10(pg/mL)) | 130 | 14 | 4.13 | 1.58, 11.4 | 0.004 |
Day +0 IL-6 (log10(pg/mL)) | 336 | 41 | 2.95 | 1.68, 5.24 | <0.001 |
Day +3 IL-6 (log10(pg/mL)) | 476 | 71 | 1.44 | 1.11, 1.86 | 0.005 |
Day +5 IL-6 (log10(pg/mL)) | 356 | 60 | 1.56 | 1.18, 2.07 | 0.002 |
Day +7 IL-6 (log10(pg/mL)) | 413 | 59 | 2.13 | 1.53, 2.99 | <0.001 |
Peak IL-6 (log10(pg/mL)) | 551 | 79 | 2.60 | 2.00, 3.43 | <0.001 |
Pre-LD fibrinogen (log10(mg/dL)) | 378 | 60 | 3.87 | 0.53, 29.4 | 0.19 |
Day +0 fibrinogen (log10(mg/dL)) | 606 | 79 | 7.19 | 1.13, 46.4 | 0.037 |
Day +3 fibrinogen (log10(mg/dL)) | 613 | 85 | 3.85 | 0.66, 23.2 | 0.14 |
Day +5 fibrinogen (log10(mg/dL)) | 404 | 69 | 0.80 | 0.16, 4.14 | 0.79 |
Day +7 fibrinogen (log10(mg/dL)) | 508 | 70 | 0.68 | 0.17, 2.78 | 0.58 |
Nadir fibrinogen (log10(mg/dL)) | 661 | 94 | 0.07 | 0.03, 0.18 | <0.001 |
Pre-LD D-dimer (log10(mg/L FEU)) | 341 | 57 | 2.97 | 1.50, 5.92 | 0.002 |
Day +0 D-dimer (log10(mg/L FEU)) | 557 | 72 | 4.85 | 2.65, 9.00 | <0.001 |
Day +3 D-dimer (log10(mg/L FEU)) | 561 | 79 | 4.20 | 2.41, 7.41 | <0.001 |
Day +5 D-dimer (log10(mg/L FEU)) | 373 | 63 | 4.01 | 2.21, 7.44 | <0.001 |
Day +7 D-dimer (log10(mg/L FEU)) | 455 | 60 | 3.58 | 2.09, 6.19 | <0.001 |
Peak D-dimer (log10(mg/L FEU)) | 651 | 92 | 4.69 | 3.00, 7.49 | <0.001 |
Pre-LD PTT (log10(seconds)) | 455 | 71 | 5.04 | 0.18, 114 | 0.32 |
Day +0 PTT (log10(seconds)) | 502 | 69 | 7.63 | 0.43, 111 | 0.14 |
Day +3 PTT (log10(seconds)) | 494 | 74 | 7.22 | 1.05, 45.2 | 0.038 |
Day +5 PTT (log10(seconds)) | 298 | 55 | 4.70 | 0.78, 26.0 | 0.079 |
Day +7 PTT (log10(seconds)) | 402 | 59 | 4.81 | 0.67, 30.0 | 0.10 |
Peak PTT (log10(seconds)) | 606 | 91 | 9.09 | 2.90, 27.9 | <0.001 |
¹ OR = Odds Ratio, CI = Confidence Interval |
df_regression <- tbl_uvregression$table_body %>%
mutate(color = case_when(estimate < 1 & p.value < 0.05 ~ "Low",
estimate > 1 & p.value < 0.05 ~ "High",
.default = "Other"),
color = as.factor(color))
vp <-
ggplot(data = df_regression, aes(x = estimate, y = -log2(p.value), color = color)) +
geom_point() +
scale_color_manual(values = c("red", "darkgreen", "darkgray")) +
geom_hline(yintercept = -log2(0.05), color = "black", linetype = "dashed") +
geom_hline(yintercept = -log2(0.01), color = "black", linetype = "dashed") +
geom_vline(xintercept = 1, color = "black", linetype = "dashed") +
geom_text(aes(x = 19, y = 3, label = "p = 0.05"), color = "black", stat = "unique") +
geom_text(aes(x = 19, y = 8, label = "p = 0.01"), color = "black", stat = "unique") +
geom_text(aes(x = 2, y = 75, label = "OR = 1"), color = "black", stat = "unique") +
scale_x_continuous(name = "Odds of grade 3-4 eICAHT", limits = c(0, 20)) +
coord_cartesian(clip = "off") +
theme(legend.position = "none")
options(ggrepel.max.overlaps = Inf)
vp + geom_text_repel(data = filter(df_regression, p.value < 0.05),
aes(label = var_label))
df %>%
select(
cart_age,
gender_desc,
raceethnih_cat,
prior_hct,
disease_cat,
bm_disease_by_morph_log10,
bm_disease_by_flow_log10,
bridging_yn,
LD_regimen_cat,
LD_regimen_low_CyFlu_vs_other,
car_target,
costim_domain,
cart_dose_total_log10,
crs_grade,
nt_grade,
iec_hs,
anc_pre_ld_log10,
anc_day_3_log10,
hb_pre_ld,
plt_pre_ld_log10,
plt_day_3_log10,
ldh_pre_ld_log10,
crp_pre_ld_log10,
crp_day_0_log10,
crp_day_3_log10,
crp_day_5_log10,
crp_day_7_log10,
crp_max_log10,
ferritin_pre_ld_log10,
ferritin_day_0_log10,
ferritin_day_3_log10,
ferritin_day_5_log10,
ferritin_day_7_log10,
ferritin_max_log10,
il6_pre_ld_log10,
il6_day_0_log10,
il6_day_3_log10,
il6_day_5_log10,
il6_day_7_log10,
il6_max_log10,
fibrinogen_pre_ld_log10,
fibrinogen_day_0_log10,
fibrinogen_day_3_log10,
fibrinogen_day_5_log10,
fibrinogen_day_7_log10,
fibrinogen_min_log10,
d_dimer_pre_ld_log10,
d_dimer_day_0_log10,
d_dimer_day_3_log10,
d_dimer_day_5_log10,
d_dimer_day_7_log10,
d_dimer_max_log10,
ptt_pre_ld_log10,
ptt_day_0_log10,
ptt_day_3_log10,
ptt_day_5_log10,
ptt_day_7_log10,
ptt_max_log10
) %>%
tbl_summary(
missing_text = "Missing",
type = list(all_continuous() ~ "continuous2"),
statistic = list(
all_continuous() ~ c("{median} ({p25}, {p75})", "{min} - {max}"),
all_categorical() ~ "{n} ({p}%)"
),
digits = all_categorical() ~ 0,
sort = list(everything() ~ "frequency")
) %>%
bold_labels()
Characteristic | N = 691¹ |
---|---|
Age | |
Median (IQR) | 61 (51, 68) |
Range | 19 - 84 |
Sex | |
Male | 437 (63%) |
Female | 254 (37%) |
Race/ethnicity | |
White | 583 (84%) |
Other | 91 (13%) |
Non-Hispanic Black or African American | 17 (2%) |
Prior HCT | 234 (34%) |
Disease | |
Aggressive NHL | 318 (46%) |
MM/PCL | 120 (17%) |
Other indolent NHL | 117 (17%) |
ALL | 99 (14%) |
Mantle cell lymphoma | 37 (5%) |
Bone marrow involvement by morphology (log10(%)) | |
Median (IQR) | -2.00 (-2.00, 1.48) |
Range | -2.00 - 2.00 |
Missing | 267 |
Bone marrow involvement by flow cytometry (log10(%)) | |
Median (IQR) | -1.28 (-2.00, 1.29) |
Range | -3.00 - 1.99 |
Missing | 252 |
Bridging therapy | 422 (61%) |
Lymphodepletion regimen | |
Low-intensity Cy/Flu | 403 (58%) |
High-intensity Cy/Flu | 253 (37%) |
Other | 35 (5%) |
Lymphodepletion regimen (low-dose Cy/Flu vs. other) | |
Low-intensity Cy/Flu | 403 (58%) |
Other | 288 (42%) |
CAR target | |
CD19 | 526 (76%) |
BCMA | 120 (17%) |
CD20 | 45 (7%) |
Costimulatory domain | |
4-1BB | 459 (66%) |
CD28 | 187 (27%) |
Dual 4-1BB and CD28 | 45 (7%) |
Total CAR T-cell dose (log10(x 10^6 cells)) | |
Median (IQR) | 2.23 (2.04, 2.60) |
Range | 0.69 - 4.64 |
CRS grade | |
2 | 257 (37%) |
1 | 224 (32%) |
0 | 175 (25%) |
3 | 23 (3%) |
4 | 10 (1%) |
5 | 2 (0%) |
Neurotoxicity grade | |
0 | 411 (59%) |
3 | 103 (15%) |
2 | 84 (12%) |
1 | 72 (10%) |
4 | 19 (3%) |
5 | 2 (0%) |
IEC-HS | 33 (5%) |
Pre-LD ANC (log10(x 10^3 cells/µL)) | |
Median (IQR) | 0.45 (0.25, 0.63) |
Range | -2.00 - 1.73 |
Missing | 1 |
Day +3 ANC (log10(x 10^3 cells/µL)) | |
Median (IQR) | 3.04 (2.52, 3.29) |
Range | -2.00 - 3.95 |
Pre-LD Hb (g/dL) | |
Median (IQR) | 10.70 (9.50, 12.35) |
Range | 7.00 - 17.10 |
Pre-LD platelet count (log10(x 10^3 /µL)) | |
Median (IQR) | 2.15 (1.95, 2.31) |
Range | 0.78 - 2.90 |
Day +3 platelet count (log10(x 10^3 /µL)) | |
Median (IQR) | 5.00 (4.75, 5.18) |
Range | 3.70 - 5.68 |
Pre-LD LDH (log10(U/L)) | |
Median (IQR) | 2.26 (2.16, 2.41) |
Range | 1.66 - 4.35 |
Missing | 14 |
Pre-LD CRP (log10(mg/L)) | |
Median (IQR) | 0.81 (0.26, 1.38) |
Range | -0.70 - 2.40 |
Missing | 252 |
Day +0 CRP (log10(mg/L)) | |
Median (IQR) | 0.99 (0.52, 1.52) |
Range | -0.52 - 2.55 |
Missing | 47 |
Day +3 CRP (log10(mg/L)) | |
Median (IQR) | 1.54 (0.97, 1.97) |
Range | -0.70 - 2.68 |
Missing | 48 |
Day +5 CRP (log10(mg/L)) | |
Median (IQR) | 1.49 (1.03, 1.88) |
Range | -0.40 - 2.66 |
Missing | 247 |
Day +7 CRP (log10(mg/L)) | |
Median (IQR) | 1.35 (0.84, 1.75) |
Range | -0.40 - 2.61 |
Missing | 142 |
Peak CRP (log10(mg/L)) | |
Median (IQR) | 1.90 (1.51, 2.18) |
Range | -0.15 - 2.68 |
Missing | 8 |
Pre-LD ferritin (log10(ng/mL)) | |
Median (IQR) | 2.57 (2.08, 3.03) |
Range | 0.48 - 4.56 |
Missing | 205 |
Day +0 ferritin (log10(ng/mL)) | |
Median (IQR) | 2.67 (2.30, 3.06) |
Range | 1.15 - 4.24 |
Missing | 33 |
Day +3 ferritin (log10(ng/mL)) | |
Median (IQR) | 2.76 (2.35, 3.16) |
Range | 0.85 - 4.98 |
Missing | 42 |
Day +5 ferritin (log10(ng/mL)) | |
Median (IQR) | 2.84 (2.34, 3.21) |
Range | 1.00 - 5.28 |
Missing | 246 |
Day +7 ferritin (log10(ng/mL)) | |
Median (IQR) | 2.88 (2.40, 3.25) |
Range | 0.78 - 6.00 |
Missing | 136 |
Peak ferritin (log10(ng/mL)) | |
Median (IQR) | 3.10 (2.59, 3.54) |
Range | 1.15 - 6.00 |
Pre-LD IL-6 (log10(pg/mL)) | |
Median (IQR) | 0.78 (0.48, 1.15) |
Range | 0.00 - 2.50 |
Missing | 561 |
Day +0 IL-6 (log10(pg/mL)) | |
Median (IQR) | 0.90 (0.60, 1.33) |
Range | 0.00 - 3.07 |
Missing | 355 |
Day +3 IL-6 (log10(pg/mL)) | |
Median (IQR) | 1.65 (1.18, 2.34) |
Range | 0.00 - 4.53 |
Missing | 215 |
Day +5 IL-6 (log10(pg/mL)) | |
Median (IQR) | 1.68 (1.11, 2.47) |
Range | 0.30 - 4.52 |
Missing | 335 |
Day +7 IL-6 (log10(pg/mL)) | |
Median (IQR) | 1.58 (1.08, 2.15) |
Range | 0.00 - 4.23 |
Missing | 278 |
Peak IL-6 (log10(pg/mL)) | |
Median (IQR) | 2.20 (1.62, 3.09) |
Range | 0.30 - 5.32 |
Missing | 140 |
Pre-LD fibrinogen (log10(mg/dL)) | |
Median (IQR) | 2.61 (2.53, 2.71) |
Range | 2.11 - 2.95 |
Missing | 313 |
Day +0 fibrinogen (log10(mg/dL)) | |
Median (IQR) | 2.64 (2.56, 2.74) |
Range | 2.12 - 3.04 |
Missing | 85 |
Day +3 fibrinogen (log10(mg/dL)) | |
Median (IQR) | 2.70 (2.60, 2.78) |
Range | 2.15 - 3.00 |
Missing | 78 |
Day +5 fibrinogen (log10(mg/dL)) | |
Median (IQR) | 2.66 (2.56, 2.76) |
Range | 1.97 - 2.97 |
Missing | 287 |
Day +7 fibrinogen (log10(mg/dL)) | |
Median (IQR) | 2.62 (2.50, 2.73) |
Range | 1.89 - 2.97 |
Missing | 183 |
Nadir fibrinogen (log10(mg/dL)) | |
Median (IQR) | 2.41 (2.19, 2.53) |
Range | 1.53 - 2.95 |
Missing | 30 |
Pre-LD D-dimer (log10(mg/L FEU)) | |
Median (IQR) | -0.09 (-0.35, 0.20) |
Range | -0.57 - 1.51 |
Missing | 350 |
Day +0 D-dimer (log10(mg/L FEU)) | |
Median (IQR) | -0.11 (-0.34, 0.20) |
Range | -0.57 - 1.30 |
Missing | 134 |
Day +3 D-dimer (log10(mg/L FEU)) | |
Median (IQR) | 0.06 (-0.21, 0.38) |
Range | -0.57 - 1.50 |
Missing | 130 |
Day +5 D-dimer (log10(mg/L FEU)) | |
Median (IQR) | 0.11 (-0.17, 0.39) |
Range | -0.57 - 1.58 |
Missing | 318 |
Day +7 D-dimer (log10(mg/L FEU)) | |
Median (IQR) | 0.12 (-0.20, 0.45) |
Range | -0.57 - 1.60 |
Missing | 236 |
Peak D-dimer (log10(mg/L FEU)) | |
Median (IQR) | 0.41 (0.05, 0.80) |
Range | -0.55 - 1.60 |
Missing | 40 |
Pre-LD PTT (log10(seconds)) | |
Median (IQR) | 1.46 (1.41, 1.51) |
Range | 1.30 - 2.02 |
Missing | 236 |
Day +0 PTT (log10(seconds)) | |
Median (IQR) | 1.48 (1.45, 1.53) |
Range | 1.34 - 2.15 |
Missing | 189 |
Day +3 PTT (log10(seconds)) | |
Median (IQR) | 1.53 (1.48, 1.59) |
Range | 1.32 - 2.19 |
Missing | 197 |
Day +5 PTT (log10(seconds)) | |
Median (IQR) | 1.52 (1.46, 1.59) |
Range | 1.34 - 2.30 |
Missing | 393 |
Day +7 PTT (log10(seconds)) | |
Median (IQR) | 1.51 (1.45, 1.57) |
Range | 1.32 - 2.23 |
Missing | 289 |
Peak PTT (log10(seconds)) | |
Median (IQR) | 1.58 (1.52, 1.65) |
Range | 1.32 - 2.30 |
Missing | 85 |
¹ n (%) |
# Set Cox-Snell R squared = 0.1
# 6 predictors
pmsampsize(
type = "b",
nagrsquared = NA,
csrsquared = 0.1,
rsquared = NA,
parameters = 6,
shrinkage = 0.9,
prevalence = 0.16,
cstatistic = NA,
seed = 123,
rate = NA,
timepoint = NA,
meanfup = NA,
intercept = NA,
sd = NA,
mmoe = 1.1)
NB: Assuming 0.05 acceptable difference in apparent & adjusted R-squared
NB: Assuming 0.05 margin of error in estimation of intercept
NB: Events per Predictor Parameter (EPP) assumes prevalence = 0.16
Samp_size Shrinkage Parameter CS_Rsq Max_Rsq Nag_Rsq EPP
Criteria 1 510 0.900 6 0.1 0.585 0.171 13.60
Criteria 2 192 0.774 6 0.1 0.585 0.171 5.12
Criteria 3 207 0.900 6 0.1 0.585 0.171 5.52
Final 510 0.900 6 0.1 0.585 0.171 13.60
Minimum sample size required for new model development based on user inputs = 510,
with 82 events (assuming an outcome prevalence = 0.16) and an EPP = 13.6
Create training and test sets (70%/30%)
# Create data frame for tidy model
df_tidy <- df %>%
mutate(
disease_cat = ifelse(disease_cat == "ALL", "ALL", "Other"),
disease_cat = factor(disease_cat, levels = c("ALL", "Other")),
bridging_yn = factor(bridging_yn, levels = c("No", "Yes"))
) %>%
select(
upn,
icaht_grade_3_4,
disease_cat,
bridging_yn,
LD_regimen_low_CyFlu_vs_other,
anc_pre_ld_log10,
anc_day_3_log10,
plt_pre_ld_log10,
plt_day_3_log10,
ldh_pre_ld_log10,
ferritin_pre_ld_log10,
ferritin_day_0_log10,
ferritin_day_3_log10,
crp_day_3_log10,
d_dimer_day_3_log10
) %>%
mutate(icaht_grade_3_4 = factor(icaht_grade_3_4, levels = c("0", "1")))
# Split df into training set and testing set
set.seed(100)
df_split <-
initial_split(df_tidy, prop = 0.7, strata = icaht_grade_3_4)
df_train <- training(df_split)
df_test <- testing(df_split)
df_train %>%
count(icaht_grade_3_4) %>%
mutate(prop = n / sum(n))
icaht_grade_3_4 n prop
1 0 412 0.8530021
2 1 71 0.1469979
df_test %>%
count(icaht_grade_3_4) %>%
mutate(prop = n / sum(n))
icaht_grade_3_4 n prop
1 0 177 0.8509615
2 1 31 0.1490385
Build and train logistic regression model with LASSO regularization
# Create model recipe to pre-process data
recipe <- recipe(icaht_grade_3_4 ~ ., data = df_train) %>%
update_role(upn, new_role = "ID") %>% # Assign upn to identifier role (as opposed to predictor or outcome)
update_role_requirements("ID", bake = FALSE) %>%
step_impute_bag(
# Impute missing values using all other predictors
# No missing values for disease category, LD regimen, bridging therapy, pre-LD plt, or day +3 ANC/plt
anc_pre_ld_log10,
ldh_pre_ld_log10,
ferritin_pre_ld_log10,
ferritin_day_0_log10,
ferritin_day_3_log10,
crp_day_3_log10,
d_dimer_day_3_log10,
seed_val = 100
) %>%
# step_zv(all_numeric(), -all_outcomes()) %>% # Remove variables containing only a single value
step_normalize(all_numeric(), -all_outcomes()) %>% # Normalize numeric data
step_dummy(disease_cat, bridging_yn, LD_regimen_low_CyFlu_vs_other) # Create dummy variables (nominal to numeric binary)
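# Optional check (sketch): prep the recipe and bake the retained training set
# to inspect the imputed, normalized, dummy-coded predictors
prep(recipe) %>%
bake(new_data = NULL) %>%
glimpse()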
# Prep (apply pre-processing steps) and tidy the recipe (return data frame of pre-processing steps)
prep(recipe) %>%
tidy()
# A tibble: 3 × 6
number operation type trained skip id
<int> <chr> <chr> <lgl> <lgl> <chr>
1 1 step impute_bag TRUE FALSE impute_bag_mnEZ1
2 2 step normalize TRUE FALSE normalize_RkOpV
3 3 step dummy TRUE FALSE dummy_Tbmf1
# Tidy step_normalize (extract mean and SD)
prep(recipe) %>%
tidy(number = 2)
# A tibble: 20 × 4
terms statistic value id
<chr> <chr> <dbl> <chr>
1 anc_pre_ld_log10 mean 0.408 normalize_RkOpV
2 anc_day_3_log10 mean 2.61 normalize_RkOpV
3 plt_pre_ld_log10 mean 2.09 normalize_RkOpV
4 plt_day_3_log10 mean 4.96 normalize_RkOpV
5 ldh_pre_ld_log10 mean 2.32 normalize_RkOpV
6 ferritin_pre_ld_log10 mean 2.53 normalize_RkOpV
7 ferritin_day_0_log10 mean 2.69 normalize_RkOpV
8 ferritin_day_3_log10 mean 2.75 normalize_RkOpV
9 crp_day_3_log10 mean 1.42 normalize_RkOpV
10 d_dimer_day_3_log10 mean 0.103 normalize_RkOpV
11 anc_pre_ld_log10 sd 0.405 normalize_RkOpV
12 anc_day_3_log10 sd 1.33 normalize_RkOpV
13 plt_pre_ld_log10 sd 0.318 normalize_RkOpV
14 plt_day_3_log10 sd 0.310 normalize_RkOpV
15 ldh_pre_ld_log10 sd 0.241 normalize_RkOpV
16 ferritin_pre_ld_log10 sd 0.645 normalize_RkOpV
17 ferritin_day_0_log10 sd 0.525 normalize_RkOpV
18 ferritin_day_3_log10 sd 0.576 normalize_RkOpV
19 crp_day_3_log10 sd 0.645 normalize_RkOpV
20 d_dimer_day_3_log10 sd 0.388 normalize_RkOpV
# Build model specification
spec_model <- logistic_reg(penalty = 0.1, mixture = 1) %>%
set_engine("glmnet")
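# To see how parsnip maps this specification onto {glmnet}, translate() shows
# the underlying engine call (mixture = 1 becomes alpha = 1, i.e., LASSO)
spec_model %>% translate()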
# Create model formula
model_formula <-
icaht_grade_3_4 ~ disease_cat_Other + anc_pre_ld_log10 + plt_pre_ld_log10 + ldh_pre_ld_log10 + ferritin_pre_ld_log10
# Create model workflow (bundles recipe and model)
wf <- workflow() %>%
add_model(spec_model, formula = model_formula) %>%
add_recipe(recipe)
# Fit model on training set
fit_train <- wf %>%
fit(data = df_train)
# Extract estimated coefficients
fit_train %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 6 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.77 0.1
2 disease_cat_Other 0 0.1
3 anc_pre_ld_log10 -0.0669 0.1
4 plt_pre_ld_log10 -0.135 0.1
5 ldh_pre_ld_log10 0 0.1
6 ferritin_pre_ld_log10 0.00556 0.1
Tune LASSO parameters
# Build set of bootstrap resamples
set.seed(123)
bootstrap_resamples <-
bootstraps(df_train, strata = icaht_grade_3_4)
# Create tuning specifications
# Set penalty = tune() instead of to a single value
tune_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
# Use penalty() to set up an appropriate grid
lambda_grid <- grid_regular(penalty(), levels = 5)
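# penalty() defaults to a range of 10^-10 through 10^0, so five regular levels
# give penalties of 1e-10, 3.16e-8, 1e-5, 3.16e-3, and 1 (the values that
# appear in the tuning results below)
lambda_grid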
# Tune grid using workflow object
doParallel::registerDoParallel()
wf_tune <- workflow() %>%
add_model(tune_spec, formula = model_formula) %>%
add_recipe(recipe)
set.seed(2023)
lasso_grid <- tune_grid(wf_tune,
resamples = bootstrap_resamples,
grid = lambda_grid)
# lasso_grid <- tune_grid(
# wf %>% update_model(tune_spec, formula = model_formula),
# resamples = bootstrap_resamples,
# grid = lambda_grid
# )
lasso_grid %>% collect_metrics()
# A tibble: 15 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 accuracy binary 0.874 25 0.00302 Preproces…
2 0.0000000001 brier_class binary 0.0999 25 0.00189 Preproces…
3 0.0000000001 roc_auc binary 0.778 25 0.00824 Preproces…
4 0.0000000316 accuracy binary 0.874 25 0.00302 Preproces…
5 0.0000000316 brier_class binary 0.0999 25 0.00189 Preproces…
6 0.0000000316 roc_auc binary 0.778 25 0.00824 Preproces…
7 0.00001 accuracy binary 0.874 25 0.00302 Preproces…
8 0.00001 brier_class binary 0.0999 25 0.00189 Preproces…
9 0.00001 roc_auc binary 0.778 25 0.00824 Preproces…
10 0.00316 accuracy binary 0.875 25 0.00295 Preproces…
11 0.00316 brier_class binary 0.0997 25 0.00186 Preproces…
12 0.00316 roc_auc binary 0.778 25 0.00831 Preproces…
13 1 accuracy binary 0.855 25 0.00293 Preproces…
14 1 brier_class binary 0.124 25 0.00207 Preproces…
15 1 roc_auc binary 0.5 25 0 Preproces…
# Visualize performance with regularization parameter
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean, color = .metric)) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
geom_line(linewidth = 1.5) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
# Select highest AUROC
highest_auroc <- lasso_grid %>%
select_best(metric = "roc_auc")
# Finalize workflow
wf_final <-
finalize_workflow(wf_tune,
highest_auroc)
# Visualize most important variables
wf_final %>%
fit(data = df_train) %>%
extract_fit_parsnip() %>%
vi(lambda = highest_auroc$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
Fit final best model on training set and evaluate on test set
# Fit the final best model to the training set and evaluate the test set
final_fit <- last_fit(wf_final,
df_split)
# Extract trained workflow from final_fit
wf_final_trained <- final_fit %>%
extract_workflow()
wf_final_trained
══ Workflow [trained] ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
3 Recipe Steps
• step_impute_bag()
• step_normalize()
• step_dummy()
── Model ─────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.123300
2 2 2.56 0.112400
3 2 4.94 0.102400
4 3 7.05 0.093280
5 4 9.23 0.084990
6 4 11.35 0.077440
7 4 13.08 0.070560
8 4 14.52 0.064300
9 4 15.73 0.058580
10 4 16.75 0.053380
11 4 17.62 0.048640
12 4 18.36 0.044320
13 4 18.99 0.040380
14 4 19.54 0.036790
15 4 20.00 0.033520
16 4 20.40 0.030550
17 4 20.74 0.027830
18 4 21.04 0.025360
19 5 21.29 0.023110
20 5 21.52 0.021050
21 5 21.71 0.019180
22 5 21.88 0.017480
23 5 22.02 0.015930
24 5 22.14 0.014510
25 5 22.24 0.013220
26 5 22.33 0.012050
27 5 22.40 0.010980
28 5 22.47 0.010000
29 5 22.52 0.009114
30 5 22.56 0.008304
31 5 22.60 0.007566
32 5 22.63 0.006894
33 5 22.66 0.006282
34 5 22.68 0.005724
35 5 22.70 0.005215
36 5 22.71 0.004752
37 5 22.73 0.004330
38 5 22.74 0.003945
39 5 22.75 0.003595
40 5 22.75 0.003275
41 5 22.76 0.002984
42 5 22.76 0.002719
43 5 22.77 0.002478
44 5 22.77 0.002258
45 5 22.78 0.002057
46 5 22.78 0.001874
...
and 6 more lines.
# saveRDS(wf_final_trained, "eIPMPre.rds")
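# Sketch (hypothetical file and data frame): the saved workflow could later be
# reloaded and applied to new patients with the same predictor columns
# mod <- readRDS("eIPMPre.rds")
# augment(mod, new_data = new_patients)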
# Extract trained model from final_fit
final_fit_mod <- final_fit %>%
extract_fit_parsnip()
final_fit_mod
parsnip model object
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.123300
2 2 2.56 0.112400
3 2 4.94 0.102400
4 3 7.05 0.093280
5 4 9.23 0.084990
6 4 11.35 0.077440
7 4 13.08 0.070560
8 4 14.52 0.064300
9 4 15.73 0.058580
10 4 16.75 0.053380
11 4 17.62 0.048640
12 4 18.36 0.044320
13 4 18.99 0.040380
14 4 19.54 0.036790
15 4 20.00 0.033520
16 4 20.40 0.030550
17 4 20.74 0.027830
18 4 21.04 0.025360
19 5 21.29 0.023110
20 5 21.52 0.021050
21 5 21.71 0.019180
22 5 21.88 0.017480
23 5 22.02 0.015930
24 5 22.14 0.014510
25 5 22.24 0.013220
26 5 22.33 0.012050
27 5 22.40 0.010980
28 5 22.47 0.010000
29 5 22.52 0.009114
30 5 22.56 0.008304
31 5 22.60 0.007566
32 5 22.63 0.006894
33 5 22.66 0.006282
34 5 22.68 0.005724
35 5 22.70 0.005215
36 5 22.71 0.004752
37 5 22.73 0.004330
38 5 22.74 0.003945
39 5 22.75 0.003595
40 5 22.75 0.003275
41 5 22.76 0.002984
42 5 22.76 0.002719
43 5 22.77 0.002478
44 5 22.77 0.002258
45 5 22.78 0.002057
46 5 22.78 0.001874
47 5 22.78 0.001708
48 5 22.78 0.001556
49 5 22.78 0.001418
50 5 22.78 0.001292
51 5 22.79 0.001177
52 5 22.79 0.001073
# Extract coefficients at the selected penalty
final_fit_mod %>% tidy()
# A tibble: 6 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.97 0.00316
2 disease_cat_Other -0.212 0.00316
3 anc_pre_ld_log10 -0.533 0.00316
4 plt_pre_ld_log10 -0.278 0.00316
5 ldh_pre_ld_log10 0.477 0.00316
6 ferritin_pre_ld_log10 0.457 0.00316
# Evaluate final model on test set
final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.885 Preprocessor1_Model1
2 roc_auc binary 0.873 Preprocessor1_Model1
3 brier_class binary 0.0868 Preprocessor1_Model1
# Plot calibration curve for test set using {runway}
# Augment df_test (i.e., add columns for predictions)
df_mod_pre_ferritin_pre_ld <- wf_final_trained %>%
augment(new_data = df_test) %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
p_cal_pre_ferritin_pre_ld <- cal_plot(
df_mod_pre_ferritin_pre_ld,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE,
plot_title = "eIPMPre"
)
p_cal_pre_ferritin_pre_ld
cal_plot(
df_mod_pre_ferritin_pre_ld,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 4,
plot_title = "eIPMPre"
)
# Plot threshold-performance plot for test set
runway::threshperf_plot(
df_mod_pre_ferritin_pre_ld,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1"
)
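# The cutpoint summary below comes from {cutpointr}; a call of roughly this
# form (arguments assumed from the printed output) reproduces it
cp <- cutpointr(df_mod_pre_ferritin_pre_ld, .pred_1, icaht_grade_3_4,
pos_class = "1", direction = ">=",
method = maximize_metric, metric = youden)
summary(cp)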
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.873 208 31 177
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.1173 0.6191 0.7212 0.9355 0.6836 29 2 56 121
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.01485423 0.02350964 0.04473573 0.09134867 0.1566652
0 0.01485423 0.02220945 0.03954348 0.07580228 0.1142895
1 0.05883319 0.11199019 0.17600774 0.28972388 0.3986170
3rd Qu. 95% Max. SD NAs
0.1948714 0.6009895 0.9946001 0.1793896 0
0.1519401 0.3293760 0.6288580 0.1080376 0
0.6244815 0.9012574 0.9946001 0.2871989 0
df_mod_pre_ferritin_pre_ld <- df_mod_pre_ferritin_pre_ld %>%
mutate(icaht_grade_3_4 = fct_relevel(icaht_grade_3_4, c("0", "1")))
dcurves::dca(icaht_grade_3_4 ~ .pred_1, data = df_mod_pre_ferritin_pre_ld,
thresholds = seq(0, 0.35, by = 0.01)) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
ALL
# Define test subset
df_test_all <- df_test %>% filter(disease_cat == "ALL")
# Augment test subset (i.e., add columns for predictions)
df_mod_pre_ferritin_pre_ld_all <- wf_final_trained %>%
augment(new_data = df_test_all)
# Calculate accuracy and AUROC
model_metrics <- metric_set(yardstick::accuracy, roc_auc)
df_mod_pre_ferritin_pre_ld_all %>%
ungroup() %>%
model_metrics(truth = icaht_grade_3_4,
estimate = .pred_class,
.pred_1,
event_level = "second")
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.714
2 roc_auc binary 0.873
# Plot calibration curve using {runway}
df_mod_pre_ferritin_pre_ld_all <- df_mod_pre_ferritin_pre_ld_all %>%
mutate(.pred_class = as.numeric(as.character(.pred_class))) %>%
ungroup()
cal_plot(
df_mod_pre_ferritin_pre_ld_all,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 3,
plot_title = "eIPMPre (ALL)"
)
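# As above, the cutpoint summary below is assumed to come from an analogous
# cutpointr() call on the ALL test subset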
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.8727 21 11 10
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.2141 0.7091 0.8571 0.9091 0.8 10 1 2 8
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.03447581 0.04478461 0.1464598 0.2521307 0.3632346
0 0.03447581 0.03911477 0.0940455 0.1513242 0.1813287
1 0.11784828 0.16598329 0.2757682 0.5768907 0.5286036
3rd Qu. 95% Max. SD NAs
0.6139658 0.8931991 0.8944719 0.2951087 0
0.1740077 0.4605932 0.6139658 0.1670296 0
0.7603098 0.8938355 0.8944719 0.2930836 0
Other disease types
# Define other disease types test subset
df_test_other <- df_test %>% filter(disease_cat == "Other")
# Augment test subset (i.e., add columns for predictions)
df_mod_pre_ferritin_pre_ld_other <- wf_final_trained %>%
augment(new_data = df_test_other)
# Calculate accuracy and AUROC
df_mod_pre_ferritin_pre_ld_other %>%
ungroup() %>%
model_metrics(truth = icaht_grade_3_4,
estimate = .pred_class,
.pred_1,
event_level = "second")
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.904
2 roc_auc binary 0.846
# Plot calibration curve using {runway}
df_mod_pre_ferritin_pre_ld_other <- df_mod_pre_ferritin_pre_ld_other %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
cal_plot(
df_mod_pre_ferritin_pre_ld_other,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE
)
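# As above, the cutpoint summary below is assumed to come from an analogous
# cutpointr() call on the other disease types test subset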
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.8455 187 20 167
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.1173 0.6066 0.7273 0.9 0.7066 18 2 49 118
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.01485423 0.02281042 0.04275294 0.08216188 0.1334676
0 0.01485423 0.02136819 0.03939678 0.07378063 0.1102752
1 0.05883319 0.10424424 0.16568519 0.21035841 0.3271243
3rd Qu. 95% Max. SD NAs
0.1716741 0.3673488 0.9946001 0.1452267 0
0.1455395 0.3265144 0.6288580 0.1028384 0
0.3897169 0.9123707 0.9946001 0.2643194 0
Build and train a second logistic regression model with LASSO regularization, adding bridging therapy as a candidate predictor
# Create model formula
model_formula <-
icaht_grade_3_4 ~ disease_cat_Other + bridging_yn_Yes + anc_pre_ld_log10 +
plt_pre_ld_log10 + ldh_pre_ld_log10 + ferritin_pre_ld_log10
# Create model workflow (bundles recipe and model)
wf <- workflow() %>%
add_model(spec_model, formula = model_formula) %>%
add_recipe(recipe)
# Fit model on training set
fit_train <- wf %>%
fit(data = df_train)
# Extract estimated coefficients
fit_train %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.77 0.1
2 disease_cat_Other 0 0.1
3 bridging_yn_Yes 0 0.1
4 anc_pre_ld_log10 -0.0669 0.1
5 plt_pre_ld_log10 -0.135 0.1
6 ldh_pre_ld_log10 0 0.1
7 ferritin_pre_ld_log10 0.00556 0.1
Tune LASSO parameters
# Tune grid using workflow object
doParallel::registerDoParallel()
wf_tune <- workflow() %>%
add_model(tune_spec, formula = model_formula) %>%
add_recipe(recipe)
set.seed(2023)
lasso_grid <- tune_grid(wf_tune,
resamples = bootstrap_resamples,
grid = lambda_grid)
lasso_grid %>% collect_metrics()
# A tibble: 15 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 accuracy binary 0.876 25 0.00294 Preproces…
2 0.0000000001 brier_class binary 0.0999 25 0.00188 Preproces…
3 0.0000000001 roc_auc binary 0.775 25 0.00799 Preproces…
4 0.0000000316 accuracy binary 0.876 25 0.00294 Preproces…
5 0.0000000316 brier_class binary 0.0999 25 0.00188 Preproces…
6 0.0000000316 roc_auc binary 0.775 25 0.00799 Preproces…
7 0.00001 accuracy binary 0.876 25 0.00294 Preproces…
8 0.00001 brier_class binary 0.0999 25 0.00188 Preproces…
9 0.00001 roc_auc binary 0.775 25 0.00799 Preproces…
10 0.00316 accuracy binary 0.875 25 0.00286 Preproces…
11 0.00316 brier_class binary 0.0996 25 0.00184 Preproces…
12 0.00316 roc_auc binary 0.776 25 0.00814 Preproces…
13 1 accuracy binary 0.855 25 0.00293 Preproces…
14 1 brier_class binary 0.124 25 0.00207 Preproces…
15 1 roc_auc binary 0.5 25 0 Preproces…
# Visualize performance with regularization parameter
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean, color = .metric)) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
geom_line(linewidth = 1.5) +
facet_wrap( ~ .metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
# Select highest AUROC
highest_auroc <- lasso_grid %>%
select_best(metric = "roc_auc")
# Finalize workflow
wf_final <-
finalize_workflow(wf_tune,
highest_auroc)
# Visualize most important variables
wf_final %>%
fit(data = df_train) %>%
extract_fit_parsnip() %>%
vi(lambda = highest_auroc$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
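For glmnet fits, vi(lambda = ...) appears to report the model coefficients at the chosen penalty; because the recipe normalizes the predictors, the absolute value of each coefficient can be read directly as an importance score, which is what this bar chart displays.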
Fit final best model on training set and evaluate on test set
# Fit the final best model to the training set and evaluate the test set
final_fit <- last_fit(wf_final,
df_split)
# Extract trained workflow from final_fit
wf_final_trained <- final_fit %>%
extract_workflow()
wf_final_trained
══ Workflow [trained] ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
3 Recipe Steps
• step_impute_bag()
• step_normalize()
• step_dummy()
── Model ─────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.123300
2 2 2.56 0.112400
3 2 4.94 0.102400
4 3 7.05 0.093280
5 4 9.23 0.084990
6 4 11.35 0.077440
7 4 13.08 0.070560
8 4 14.52 0.064300
9 4 15.73 0.058580
10 4 16.75 0.053380
11 4 17.62 0.048640
12 4 18.36 0.044320
13 4 18.99 0.040380
14 4 19.54 0.036790
15 4 20.00 0.033520
16 4 20.40 0.030550
17 4 20.74 0.027830
18 5 21.08 0.025360
19 6 21.41 0.023110
20 6 21.71 0.021050
21 6 21.96 0.019180
22 6 22.17 0.017480
23 6 22.35 0.015930
24 6 22.50 0.014510
25 6 22.63 0.013220
26 6 22.74 0.012050
27 6 22.84 0.010980
28 6 22.92 0.010000
29 6 22.98 0.009114
30 6 23.04 0.008304
31 6 23.09 0.007566
32 6 23.13 0.006894
33 6 23.16 0.006282
34 6 23.19 0.005724
35 6 23.21 0.005215
36 6 23.23 0.004752
37 6 23.25 0.004330
38 6 23.26 0.003945
39 6 23.27 0.003595
40 6 23.28 0.003275
41 6 23.29 0.002984
42 6 23.30 0.002719
43 6 23.30 0.002478
44 6 23.31 0.002258
45 6 23.31 0.002057
46 6 23.31 0.001874
...
and 7 more lines.
# Extract trained model from final_fit
final_fit_mod <- final_fit %>%
extract_fit_parsnip()
final_fit_mod
parsnip model object
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.123300
2 2 2.56 0.112400
3 2 4.94 0.102400
4 3 7.05 0.093280
5 4 9.23 0.084990
6 4 11.35 0.077440
7 4 13.08 0.070560
8 4 14.52 0.064300
9 4 15.73 0.058580
10 4 16.75 0.053380
11 4 17.62 0.048640
12 4 18.36 0.044320
13 4 18.99 0.040380
14 4 19.54 0.036790
15 4 20.00 0.033520
16 4 20.40 0.030550
17 4 20.74 0.027830
18 5 21.08 0.025360
19 6 21.41 0.023110
20 6 21.71 0.021050
21 6 21.96 0.019180
22 6 22.17 0.017480
23 6 22.35 0.015930
24 6 22.50 0.014510
25 6 22.63 0.013220
26 6 22.74 0.012050
27 6 22.84 0.010980
28 6 22.92 0.010000
29 6 22.98 0.009114
30 6 23.04 0.008304
31 6 23.09 0.007566
32 6 23.13 0.006894
33 6 23.16 0.006282
34 6 23.19 0.005724
35 6 23.21 0.005215
36 6 23.23 0.004752
37 6 23.25 0.004330
38 6 23.26 0.003945
39 6 23.27 0.003595
40 6 23.28 0.003275
41 6 23.29 0.002984
42 6 23.30 0.002719
43 6 23.30 0.002478
44 6 23.31 0.002258
45 6 23.31 0.002057
46 6 23.31 0.001874
47 6 23.32 0.001708
48 6 23.32 0.001556
49 6 23.32 0.001418
50 6 23.32 0.001292
51 6 23.32 0.001177
52 6 23.33 0.001073
53 6 23.33 0.000977
# Show estimated coefficients at the selected penalty
final_fit_mod %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -2.20 0.00316
2 disease_cat_Other -0.246 0.00316
3 bridging_yn_Yes 0.421 0.00316
4 anc_pre_ld_log10 -0.541 0.00316
5 plt_pre_ld_log10 -0.242 0.00316
6 ldh_pre_ld_log10 0.466 0.00316
7 ferritin_pre_ld_log10 0.433 0.00316
# Evaluate final model on test set
final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.885 Preprocessor1_Model1
2 roc_auc binary 0.858 Preprocessor1_Model1
3 brier_class binary 0.0881 Preprocessor1_Model1
# Plot calibration curve for test set
# Using {probably}
final_fit %>%
collect_predictions() %>%
cal_plot_breaks(
truth = icaht_grade_3_4,
estimate = .pred_1,
event_level = "second",
num_breaks = 5
)
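cal_plot_breaks() bins the predicted probabilities and compares the mean prediction in each bin against the observed event rate. A minimal hand-rolled equivalent (a sketch for illustration, not part of the original analysis):
# Sketch only: manual calibration table with 5 equal-width probability
# bins, mirroring cal_plot_breaks(num_breaks = 5).
final_fit %>%
collect_predictions() %>%
mutate(bin = cut(.pred_1, breaks = seq(0, 1, length.out = 6),
include.lowest = TRUE)) %>%
group_by(bin) %>%
summarise(mean_pred = mean(.pred_1),
obs_rate = mean(icaht_grade_3_4 == "1"),
n = n())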
# Using {runway}
# Augment df_test (i.e., add columns for predictions)
df_mod_pre_ferritin_pre_ld_bridging <- wf_final_trained %>%
augment(new_data = df_test) %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
p_cal_pre_ferritin_pre_ld_bridging <- cal_plot(
df_mod_pre_ferritin_pre_ld_bridging,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE,
plot_title = "eIPMPre (+ bridging Y/N)"
)
p_cal_pre_ferritin_pre_ld_bridging
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.858 208 31 177
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.1921 0.5611 0.8077 0.7419 0.8192 23 8 32 145
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.01212559 0.02243276 0.04603704 0.09317768 0.1575520
0 0.01212559 0.02125369 0.04148493 0.07912474 0.1156375
1 0.06811707 0.08853358 0.18067349 0.29767356 0.3968701
3rd Qu. 95% Max. SD NAs
0.1972440 0.6122819 0.9945373 0.1808306 0
0.1586298 0.3269365 0.6508477 0.1095079 0
0.6323817 0.9080596 0.9945373 0.2927925 0
df_mod_pre_ferritin_pre_ld_bridging <- df_mod_pre_ferritin_pre_ld_bridging %>%
mutate(icaht_grade_3_4 = fct_relevel(icaht_grade_3_4, c("0", "1")))
dcurves::dca(
icaht_grade_3_4 ~ .pred_1,
data = df_mod_pre_ferritin_pre_ld_bridging,
thresholds = seq(0, 0.35, by = 0.01)
) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
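Decision curves plot net benefit against the threshold probability p_t; the fct_relevel() call above ensures "1" is treated as the event level. Net benefit at a given threshold is NB(p_t) = TP/n - (FP/n) * p_t / (1 - p_t). A sketch of computing it directly (for intuition only; {dcurves} handles this internally):
# Sketch only: net benefit of a prediction model at one threshold.
net_benefit <- function(truth, prob, pt) {
pred_pos <- prob >= pt # classify at threshold pt
tp <- sum(pred_pos & truth == "1") # true positives
fp <- sum(pred_pos & truth == "0") # false positives
n <- length(truth)
tp / n - (fp / n) * pt / (1 - pt)
}
net_benefit(df_mod_pre_ferritin_pre_ld_bridging$icaht_grade_3_4,
df_mod_pre_ferritin_pre_ld_bridging$.pred_1, pt = 0.10)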
Build and train logistic regression model with LASSO regularization
# Create model formula
model_formula <-
icaht_grade_3_4 ~ disease_cat_Other + LD_regimen_low_CyFlu_vs_other_Other + anc_pre_ld_log10 +
plt_pre_ld_log10 + ldh_pre_ld_log10 + ferritin_pre_ld_log10
# Create model workflow (bundles recipe and model)
wf <- workflow() %>%
add_model(spec_model, formula = model_formula) %>%
add_recipe(recipe)
# Fit model on training set
fit_train <- wf %>%
fit(data = df_train)
# Extract estimated coefficients
fit_train %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.77 0.1
2 disease_cat_Other 0 0.1
3 LD_regimen_low_CyFlu_vs_other_Other 0 0.1
4 anc_pre_ld_log10 -0.0669 0.1
5 plt_pre_ld_log10 -0.135 0.1
6 ldh_pre_ld_log10 0 0.1
7 ferritin_pre_ld_log10 0.00556 0.1
Tune LASSO parameters
# Tune grid using workflow object
doParallel::registerDoParallel()
wf_tune <- workflow() %>%
add_model(tune_spec, formula = model_formula) %>%
add_recipe(recipe)
set.seed(2023)
lasso_grid <- tune_grid(wf_tune,
resamples = bootstrap_resamples,
grid = lambda_grid)
lasso_grid %>% collect_metrics()
# A tibble: 15 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 accuracy binary 0.876 25 0.00311 Preproces…
2 0.0000000001 brier_class binary 0.0965 25 0.00187 Preproces…
3 0.0000000001 roc_auc binary 0.794 25 0.00787 Preproces…
4 0.0000000316 accuracy binary 0.876 25 0.00311 Preproces…
5 0.0000000316 brier_class binary 0.0965 25 0.00187 Preproces…
6 0.0000000316 roc_auc binary 0.794 25 0.00787 Preproces…
7 0.00001 accuracy binary 0.876 25 0.00311 Preproces…
8 0.00001 brier_class binary 0.0965 25 0.00187 Preproces…
9 0.00001 roc_auc binary 0.794 25 0.00787 Preproces…
10 0.00316 accuracy binary 0.876 25 0.00309 Preproces…
11 0.00316 brier_class binary 0.0965 25 0.00184 Preproces…
12 0.00316 roc_auc binary 0.793 25 0.00799 Preproces…
13 1 accuracy binary 0.855 25 0.00293 Preproces…
14 1 brier_class binary 0.124 25 0.00207 Preproces…
15 1 roc_auc binary 0.5 25 0 Preproces…
# Visualize performance across the regularization penalty
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean, color = .metric)) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
geom_line(linewidth = 1.5) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
# Select highest AUROC
highest_auroc <- lasso_grid %>%
select_best(metric = "roc_auc")
# Finalize workflow
wf_final <-
finalize_workflow(wf_tune,
highest_auroc)
# Visualize most important variables
wf_final %>%
fit(data = df_train) %>%
extract_fit_parsnip() %>%
vi(lambda = highest_auroc$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
Fit final best model on training set and evaluate on test set
# Fit the final best model to the training set and evaluate the test set
final_fit <- last_fit(wf_final,
df_split)
# Extract trained workflow from final_fit
wf_final_trained <- final_fit %>%
extract_workflow()
wf_final_trained
══ Workflow [trained] ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
3 Recipe Steps
• step_impute_bag()
• step_normalize()
• step_dummy()
── Model ─────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.123300
2 2 2.56 0.112400
3 2 4.94 0.102400
4 3 7.05 0.093280
5 4 9.23 0.084990
6 4 11.35 0.077440
7 4 13.08 0.070560
8 4 14.52 0.064300
9 4 15.73 0.058580
10 4 16.75 0.053380
11 4 17.62 0.048640
12 4 18.36 0.044320
13 5 19.23 0.040380
14 5 20.03 0.036790
15 5 20.72 0.033520
16 5 21.31 0.030550
17 5 21.82 0.027830
18 6 22.26 0.025360
19 6 22.67 0.023110
20 6 23.01 0.021050
21 6 23.30 0.019180
22 6 23.56 0.017480
23 6 23.77 0.015930
24 6 23.95 0.014510
25 6 24.11 0.013220
26 6 24.24 0.012050
27 6 24.36 0.010980
28 6 24.45 0.010000
29 6 24.53 0.009114
30 6 24.60 0.008304
31 6 24.66 0.007566
32 6 24.71 0.006894
33 6 24.75 0.006282
34 6 24.78 0.005724
35 6 24.81 0.005215
36 6 24.84 0.004752
37 6 24.86 0.004330
38 6 24.87 0.003945
39 6 24.89 0.003595
40 6 24.90 0.003275
41 6 24.91 0.002984
42 6 24.92 0.002719
43 6 24.92 0.002478
44 6 24.93 0.002258
45 6 24.94 0.002057
46 6 24.94 0.001874
...
and 8 more lines.
# Extract trained model from final_fit
final_fit_mod <- final_fit %>%
extract_fit_parsnip()
final_fit_mod
parsnip model object
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.123300
2 2 2.56 0.112400
3 2 4.94 0.102400
4 3 7.05 0.093280
5 4 9.23 0.084990
6 4 11.35 0.077440
7 4 13.08 0.070560
8 4 14.52 0.064300
9 4 15.73 0.058580
10 4 16.75 0.053380
11 4 17.62 0.048640
12 4 18.36 0.044320
13 5 19.23 0.040380
14 5 20.03 0.036790
15 5 20.72 0.033520
16 5 21.31 0.030550
17 5 21.82 0.027830
18 6 22.26 0.025360
19 6 22.67 0.023110
20 6 23.01 0.021050
21 6 23.30 0.019180
22 6 23.56 0.017480
23 6 23.77 0.015930
24 6 23.95 0.014510
25 6 24.11 0.013220
26 6 24.24 0.012050
27 6 24.36 0.010980
28 6 24.45 0.010000
29 6 24.53 0.009114
30 6 24.60 0.008304
31 6 24.66 0.007566
32 6 24.71 0.006894
33 6 24.75 0.006282
34 6 24.78 0.005724
35 6 24.81 0.005215
36 6 24.84 0.004752
37 6 24.86 0.004330
38 6 24.87 0.003945
39 6 24.89 0.003595
40 6 24.90 0.003275
41 6 24.91 0.002984
42 6 24.92 0.002719
43 6 24.92 0.002478
44 6 24.93 0.002258
45 6 24.94 0.002057
46 6 24.94 0.001874
47 6 24.94 0.001708
48 6 24.95 0.001556
49 6 24.95 0.001418
50 6 24.95 0.001292
51 6 24.95 0.001177
52 6 24.95 0.001073
53 6 24.95 0.000977
54 6 24.96 0.000890
# Show estimated coefficients at the selected penalty
final_fit_mod %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -2.39 0.0000000001
2 disease_cat_Other -0.281 0.0000000001
3 LD_regimen_low_CyFlu_vs_other_Other 0.883 0.0000000001
4 anc_pre_ld_log10 -0.545 0.0000000001
5 plt_pre_ld_log10 -0.377 0.0000000001
6 ldh_pre_ld_log10 0.461 0.0000000001
7 ferritin_pre_ld_log10 0.446 0.0000000001
# Evaluate final model on test set
final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.885 Preprocessor1_Model1
2 roc_auc binary 0.840 Preprocessor1_Model1
3 brier_class binary 0.0906 Preprocessor1_Model1
# Plot calibration curve for test set
# Using {probably}
final_fit %>%
collect_predictions() %>%
cal_plot_breaks(
truth = icaht_grade_3_4,
estimate = .pred_1,
event_level = "second",
num_breaks = 5
)
# Using {runway}
# Augment df_test (i.e., add columns for predictions)
df_mod_pre_ferritin_pre_ld_LD_regimen <- wf_final_trained %>%
augment(new_data = df_test) %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
p_cal_pre_ferritin_pre_ld_LD_regimen <- cal_plot(
df_mod_pre_ferritin_pre_ld_LD_regimen,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE,
plot_title = "eIPMPre (+ LD regimen)"
)
p_cal_pre_ferritin_pre_ld_LD_regimen
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.8398 208 31 177
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.1303 0.5562 0.7356 0.8387 0.7175 26 5 50 127
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.01074805 0.01601036 0.03927698 0.08968762 0.1545562
0 0.01074805 0.01448999 0.03428915 0.07383580 0.1122840
1 0.03575507 0.07427495 0.13775421 0.27510320 0.3959172
3rd Qu. 95% Max. SD NAs
0.1747263 0.5592538 0.9965745 0.1893149 0
0.1442390 0.3435103 0.7378751 0.1166286 0
0.6494173 0.9094487 0.9965745 0.3110581 0
df_mod_pre_ferritin_pre_ld_LD_regimen <- df_mod_pre_ferritin_pre_ld_LD_regimen %>%
mutate(icaht_grade_3_4 = fct_relevel(icaht_grade_3_4, c("0", "1")))
dcurves::dca(
icaht_grade_3_4 ~ .pred_1,
data = df_mod_pre_ferritin_pre_ld_LD_regimen,
thresholds = seq(0, 0.35, by = 0.01),
label = list(.pred_1 = "eIPMPre")
) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
Build and train logistic regression model with LASSO regularization
# Create model formula
model_formula <-
icaht_grade_3_4 ~ disease_cat_Other + anc_pre_ld_log10 + plt_pre_ld_log10 + ldh_pre_ld_log10 + ferritin_day_3_log10
# Create model workflow (bundles recipe and model)
wf <- workflow() %>%
add_model(spec_model, formula = model_formula) %>%
add_recipe(recipe)
# Fit model on training set
fit_train <- wf %>%
fit(data = df_train)
# Extract estimated coefficients
fit_train %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 6 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.78 0.1
2 disease_cat_Other 0 0.1
3 anc_pre_ld_log10 -0.0685 0.1
4 plt_pre_ld_log10 -0.0684 0.1
5 ldh_pre_ld_log10 0 0.1
6 ferritin_day_3_log10 0.162 0.1
Tune LASSO parameters
# Tune grid using workflow object
doParallel::registerDoParallel()
wf_tune <- workflow() %>%
add_model(tune_spec, formula = model_formula) %>%
add_recipe(recipe)
set.seed(2023)
lasso_grid <- tune_grid(wf_tune,
resamples = bootstrap_resamples,
grid = lambda_grid)
lasso_grid %>% collect_metrics()
# A tibble: 15 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 accuracy binary 0.873 25 0.00385 Preproces…
2 0.0000000001 brier_class binary 0.0972 25 0.00213 Preproces…
3 0.0000000001 roc_auc binary 0.814 25 0.00757 Preproces…
4 0.0000000316 accuracy binary 0.873 25 0.00385 Preproces…
5 0.0000000316 brier_class binary 0.0972 25 0.00213 Preproces…
6 0.0000000316 roc_auc binary 0.814 25 0.00757 Preproces…
7 0.00001 accuracy binary 0.873 25 0.00385 Preproces…
8 0.00001 brier_class binary 0.0972 25 0.00213 Preproces…
9 0.00001 roc_auc binary 0.814 25 0.00757 Preproces…
10 0.00316 accuracy binary 0.874 25 0.00356 Preproces…
11 0.00316 brier_class binary 0.0968 25 0.00208 Preproces…
12 0.00316 roc_auc binary 0.815 25 0.00764 Preproces…
13 1 accuracy binary 0.855 25 0.00293 Preproces…
14 1 brier_class binary 0.124 25 0.00207 Preproces…
15 1 roc_auc binary 0.5 25 0 Preproces…
# Visualize performance across the regularization penalty
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean, color = .metric)) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
geom_line(linewidth = 1.5) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
# Select highest AUROC
highest_auroc <- lasso_grid %>%
select_best(metric = "roc_auc")
# Finalize workflow
wf_final <-
finalize_workflow(wf_tune,
highest_auroc)
# Visualize most important variables
wf_final %>%
fit(data = df_train) %>%
extract_fit_parsnip() %>%
vi(lambda = highest_auroc$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
Fit final best model on training set and evaluate on test set
# Fit the final best model to the training set and evaluate the test set
final_fit <- last_fit(wf_final,
df_split)
# Extract trained workflow from final_fit
wf_final_trained <- final_fit %>%
extract_workflow()
wf_final_trained
══ Workflow [trained] ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
3 Recipe Steps
• step_impute_bag()
• step_normalize()
• step_dummy()
── Model ─────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.126200
2 3 3.15 0.115000
3 3 6.60 0.104800
4 3 9.30 0.095470
5 3 11.46 0.086990
6 4 13.32 0.079260
7 4 15.26 0.072220
8 4 16.87 0.065800
9 4 18.22 0.059960
10 4 19.36 0.054630
11 4 20.33 0.049780
12 4 21.15 0.045350
13 4 21.86 0.041330
14 4 22.47 0.037650
15 4 22.99 0.034310
16 4 23.43 0.031260
17 4 23.82 0.028480
18 4 24.15 0.025950
19 4 24.43 0.023650
20 4 24.67 0.021550
21 4 24.88 0.019630
22 5 25.06 0.017890
23 5 25.22 0.016300
24 5 25.36 0.014850
25 5 25.48 0.013530
26 5 25.58 0.012330
27 5 25.66 0.011230
28 5 25.73 0.010240
29 5 25.79 0.009327
30 5 25.85 0.008499
31 5 25.89 0.007744
32 5 25.93 0.007056
33 5 25.96 0.006429
34 5 25.98 0.005858
35 5 26.00 0.005337
36 5 26.02 0.004863
37 5 26.04 0.004431
38 5 26.05 0.004038
39 5 26.06 0.003679
40 5 26.07 0.003352
41 5 26.08 0.003054
42 5 26.08 0.002783
43 5 26.09 0.002536
44 5 26.09 0.002310
45 5 26.10 0.002105
46 5 26.10 0.001918
...
and 7 more lines.
# saveRDS(wf_final_trained, "eIPMPost.rds")
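If the trained workflow is saved as above, it can later be reloaded and applied to new patients directly; the recipe steps (imputation, normalization, dummy coding) travel with it, so no manual preprocessing is needed. A sketch, assuming the commented saveRDS() call was run:
# Sketch only: reload the saved workflow and generate predicted
# probabilities for new patients.
# wf_reloaded <- readRDS("eIPMPost.rds")
# predict(wf_reloaded, new_data = df_test, type = "prob")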
# Extract trained model from final_fit
final_fit_mod <- final_fit %>%
extract_fit_parsnip()
final_fit_mod
parsnip model object
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.126200
2 3 3.15 0.115000
3 3 6.60 0.104800
4 3 9.30 0.095470
5 3 11.46 0.086990
6 4 13.32 0.079260
7 4 15.26 0.072220
8 4 16.87 0.065800
9 4 18.22 0.059960
10 4 19.36 0.054630
11 4 20.33 0.049780
12 4 21.15 0.045350
13 4 21.86 0.041330
14 4 22.47 0.037650
15 4 22.99 0.034310
16 4 23.43 0.031260
17 4 23.82 0.028480
18 4 24.15 0.025950
19 4 24.43 0.023650
20 4 24.67 0.021550
21 4 24.88 0.019630
22 5 25.06 0.017890
23 5 25.22 0.016300
24 5 25.36 0.014850
25 5 25.48 0.013530
26 5 25.58 0.012330
27 5 25.66 0.011230
28 5 25.73 0.010240
29 5 25.79 0.009327
30 5 25.85 0.008499
31 5 25.89 0.007744
32 5 25.93 0.007056
33 5 25.96 0.006429
34 5 25.98 0.005858
35 5 26.00 0.005337
36 5 26.02 0.004863
37 5 26.04 0.004431
38 5 26.05 0.004038
39 5 26.06 0.003679
40 5 26.07 0.003352
41 5 26.08 0.003054
42 5 26.08 0.002783
43 5 26.09 0.002536
44 5 26.09 0.002310
45 5 26.10 0.002105
46 5 26.10 0.001918
47 5 26.10 0.001748
48 5 26.11 0.001592
49 5 26.11 0.001451
50 5 26.11 0.001322
51 5 26.11 0.001205
52 5 26.11 0.001098
53 5 26.11 0.001000
# Show estimated coefficients at the selected penalty
final_fit_mod %>%
tidy()
# A tibble: 6 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -2.10 0.00316
2 disease_cat_Other -0.177 0.00316
3 anc_pre_ld_log10 -0.548 0.00316
4 plt_pre_ld_log10 -0.199 0.00316
5 ldh_pre_ld_log10 0.446 0.00316
6 ferritin_day_3_log10 0.762 0.00316
# Evaluate final model on test set
final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.889 Preprocessor1_Model1
2 roc_auc binary 0.878 Preprocessor1_Model1
3 brier_class binary 0.0846 Preprocessor1_Model1
# Plot calibration curve for test set
# Using {probably}
final_fit %>%
collect_predictions() %>%
cal_plot_breaks(
truth = icaht_grade_3_4,
estimate = .pred_1,
event_level = "second",
num_breaks = 5
)
# Using {runway}
# Augment df_test (i.e., add columns for predictions)
df_mod_post_ferritin_only <- wf_final_trained %>%
augment(new_data = df_test) %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
p_cal_post_ferritin_only <- cal_plot(
df_mod_post_ferritin_only,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE,
plot_title = "eIPMPost"
)
p_cal_post_ferritin_only
cal_plot(
df_mod_post_ferritin_only,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 4,
plot_title = "eIPMPost"
)
# Plot threshold-performance plot for test set
runway::threshperf_plot(
df_mod_post_ferritin_only,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1"
)
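threshperf_plot() sweeps the classification threshold across its range and plots the resulting test characteristics (e.g., sensitivity, specificity, and predictive values), complementing the single Youden-optimal cutpoint reported below.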
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.8784 208 31 177
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.152 0.6184 0.7885 0.8387 0.7797 26 5 39 138
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.00656279 0.01650296 0.03949988 0.09172247 0.1564294
0 0.00656279 0.01591629 0.03520880 0.07534019 0.1108258
1 0.05078428 0.10410927 0.17463322 0.29974585 0.4168115
3rd Qu. 95% Max. SD NAs
0.1923062 0.5830375 0.9966972 0.1883222 0
0.1351734 0.3208616 0.6531640 0.1109754 0
0.6345243 0.9629399 0.9966972 0.3002167 0
df_mod_post_ferritin_only <- df_mod_post_ferritin_only %>%
mutate(icaht_grade_3_4 = fct_relevel(icaht_grade_3_4, c("0", "1")))
dcurves::dca(icaht_grade_3_4 ~ .pred_1, data = df_mod_post_ferritin_only,
thresholds = seq(0, 0.35, by = 0.01)) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
ALL (acute lymphoblastic leukemia)
# Augment test subset (i.e., add columns for predictions)
df_mod_post_ferritin_only_all <- wf_final_trained %>%
augment(new_data = df_test_all)
# Calculate accuracy and AUROC
model_metrics <- metric_set(yardstick::accuracy, roc_auc)
df_mod_post_ferritin_only_all %>%
ungroup() %>%
model_metrics(truth = icaht_grade_3_4,
estimate = .pred_class,
.pred_1,
event_level = "second")
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.714
2 roc_auc binary 0.9
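Note that metric_set() dispatches columns by metric type: accuracy consumes the hard class prediction supplied to estimate, while roc_auc uses the .pred_1 probability column passed after it (with event_level = "second" marking "1" as the event).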
# Plot calibration curve using {runway}
df_mod_post_ferritin_only_all <- df_mod_post_ferritin_only_all %>%
mutate(.pred_class = as.numeric(as.character(.pred_class))) %>%
ungroup()
cal_plot(
df_mod_post_ferritin_only_all,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 3,
plot_title = "eIPMPost (ALL)"
)
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.9 21 11 10
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.2266 0.8091 0.9048 0.9091 0.9 10 1 1 9
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.04809059 0.04819017 0.14749837 0.2266330 0.3752421
0 0.04809059 0.04813540 0.08353623 0.1650011 0.1830184
1 0.10333398 0.16498350 0.25168933 0.5439176 0.5499909
3rd Qu. 95% Max. SD NAs
0.6442408 0.9539433 0.9617106 0.3151846 0
0.1886184 0.4497377 0.6442408 0.1733286 0
0.8029840 0.9578269 0.9617106 0.3179675 0
Other disease types
# Define Other disease types test subset
df_test_other <- df_test %>% filter(disease_cat == "Other")
# Augment test subset (i.e., add columns for predictions)
df_mod_post_ferritin_only_other <- wf_final_trained %>%
augment(new_data = df_test_other)
# Calculate accuracy and AUROC
df_mod_post_ferritin_only_other %>%
ungroup() %>%
model_metrics(truth = icaht_grade_3_4,
estimate = .pred_class,
.pred_1,
event_level = "second")
# A tibble: 2 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 accuracy binary 0.909
2 roc_auc binary 0.856
# Plot calibration curve using {runway}
df_mod_post_ferritin_only_other <- df_mod_post_ferritin_only_other %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
cal_plot(
df_mod_post_ferritin_only_other,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE,
plot_title = "eIPMPost (Other disease types)"
)
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.8557 187 20 167
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.1194 0.6126 0.7326 0.9 0.7126 18 2 48 119
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.00656279 0.01608832 0.03616852 0.08339907 0.1318569
0 0.00656279 0.01591243 0.03426449 0.07012160 0.1065029
1 0.05078428 0.10217954 0.15663687 0.27126373 0.3435629
3rd Qu. 95% Max. SD NAs
0.1601082 0.3776642 0.9966972 0.1509132 0
0.1288125 0.3141927 0.6531640 0.1053369 0
0.3956410 0.9657956 0.9966972 0.2705217 0
Build and train logistic regression model with LASSO regularization
# Create model formula
model_formula <-
icaht_grade_3_4 ~ disease_cat_Other + bridging_yn_Yes + anc_pre_ld_log10 +
plt_pre_ld_log10 + ldh_pre_ld_log10 + ferritin_day_3_log10
# Create model workflow (bundles recipe and model)
wf <- workflow() %>%
add_model(spec_model, formula = model_formula) %>%
add_recipe(recipe)
# Fit model on training set
fit_train <- wf %>%
fit(data = df_train)
# Extract estimated coefficients
fit_train %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.78 0.1
2 disease_cat_Other 0 0.1
3 bridging_yn_Yes 0 0.1
4 anc_pre_ld_log10 -0.0685 0.1
5 plt_pre_ld_log10 -0.0684 0.1
6 ldh_pre_ld_log10 0 0.1
7 ferritin_day_3_log10 0.162 0.1
Tune LASSO parameters
# Tune grid using workflow object
doParallel::registerDoParallel()
wf_tune <- workflow() %>%
add_model(tune_spec, formula = model_formula) %>%
add_recipe(recipe)
set.seed(2023)
lasso_grid <- tune_grid(wf_tune,
resamples = bootstrap_resamples,
grid = lambda_grid)
lasso_grid %>% collect_metrics()
# A tibble: 15 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 accuracy binary 0.872 25 0.00317 Preproces…
2 0.0000000001 brier_class binary 0.0971 25 0.00204 Preproces…
3 0.0000000001 roc_auc binary 0.809 25 0.00721 Preproces…
4 0.0000000316 accuracy binary 0.872 25 0.00317 Preproces…
5 0.0000000316 brier_class binary 0.0971 25 0.00204 Preproces…
6 0.0000000316 roc_auc binary 0.809 25 0.00721 Preproces…
7 0.00001 accuracy binary 0.872 25 0.00317 Preproces…
8 0.00001 brier_class binary 0.0971 25 0.00204 Preproces…
9 0.00001 roc_auc binary 0.809 25 0.00721 Preproces…
10 0.00316 accuracy binary 0.872 25 0.00323 Preproces…
11 0.00316 brier_class binary 0.0966 25 0.00200 Preproces…
12 0.00316 roc_auc binary 0.810 25 0.00719 Preproces…
13 1 accuracy binary 0.855 25 0.00293 Preproces…
14 1 brier_class binary 0.124 25 0.00207 Preproces…
15 1 roc_auc binary 0.5 25 0 Preproces…
# Visualize performance across the regularization penalty
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean, color = .metric)) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
geom_line(linewidth = 1.5) +
facet_wrap( ~ .metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
# Select highest AUROC
highest_auroc <- lasso_grid %>%
select_best(metric = "roc_auc")
# Finalize workflow
wf_final <-
finalize_workflow(wf_tune,
highest_auroc)
# Visualize most important variables
wf_final %>%
fit(data = df_train) %>%
extract_fit_parsnip() %>%
vi(lambda = highest_auroc$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
Fit final best model on training set and evaluate on test set
# Fit the final best model to the training set and evaluate the test set
final_fit <- last_fit(wf_final,
df_split)
# Extract trained workflow from final_fit
wf_final_trained <- final_fit %>%
extract_workflow()
wf_final_trained
══ Workflow [trained] ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
3 Recipe Steps
• step_impute_bag()
• step_normalize()
• step_dummy()
── Model ─────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.126200
2 3 3.15 0.115000
3 3 6.60 0.104800
4 3 9.30 0.095470
5 3 11.46 0.086990
6 4 13.32 0.079260
7 4 15.26 0.072220
8 4 16.87 0.065800
9 4 18.22 0.059960
10 4 19.36 0.054630
11 4 20.33 0.049780
12 4 21.15 0.045350
13 4 21.86 0.041330
14 4 22.47 0.037650
15 4 22.99 0.034310
16 4 23.43 0.031260
17 4 23.82 0.028480
18 5 24.22 0.025950
19 5 24.58 0.023650
20 5 24.90 0.021550
21 5 25.17 0.019630
22 6 25.41 0.017890
23 6 25.62 0.016300
24 6 25.80 0.014850
25 6 25.95 0.013530
26 6 26.08 0.012330
27 6 26.19 0.011230
28 6 26.28 0.010240
29 6 26.36 0.009327
30 6 26.43 0.008499
31 6 26.49 0.007744
32 6 26.53 0.007056
33 6 26.57 0.006429
34 6 26.61 0.005858
35 6 26.64 0.005337
36 6 26.66 0.004863
37 6 26.68 0.004431
38 6 26.70 0.004038
39 6 26.71 0.003679
40 6 26.72 0.003352
41 6 26.73 0.003054
42 6 26.74 0.002783
43 6 26.75 0.002536
44 6 26.75 0.002310
45 6 26.76 0.002105
46 6 26.76 0.001918
...
and 8 more lines.
# Extract trained model from final_fit
final_fit_mod <- final_fit %>%
extract_fit_parsnip()
final_fit_mod
parsnip model object
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.126200
2 3 3.15 0.115000
3 3 6.60 0.104800
4 3 9.30 0.095470
5 3 11.46 0.086990
6 4 13.32 0.079260
7 4 15.26 0.072220
8 4 16.87 0.065800
9 4 18.22 0.059960
10 4 19.36 0.054630
11 4 20.33 0.049780
12 4 21.15 0.045350
13 4 21.86 0.041330
14 4 22.47 0.037650
15 4 22.99 0.034310
16 4 23.43 0.031260
17 4 23.82 0.028480
18 5 24.22 0.025950
19 5 24.58 0.023650
20 5 24.90 0.021550
21 5 25.17 0.019630
22 6 25.41 0.017890
23 6 25.62 0.016300
24 6 25.80 0.014850
25 6 25.95 0.013530
26 6 26.08 0.012330
27 6 26.19 0.011230
28 6 26.28 0.010240
29 6 26.36 0.009327
30 6 26.43 0.008499
31 6 26.49 0.007744
32 6 26.53 0.007056
33 6 26.57 0.006429
34 6 26.61 0.005858
35 6 26.64 0.005337
36 6 26.66 0.004863
37 6 26.68 0.004431
38 6 26.70 0.004038
39 6 26.71 0.003679
40 6 26.72 0.003352
41 6 26.73 0.003054
42 6 26.74 0.002783
43 6 26.75 0.002536
44 6 26.75 0.002310
45 6 26.76 0.002105
46 6 26.76 0.001918
47 6 26.77 0.001748
48 6 26.77 0.001592
49 6 26.77 0.001451
50 6 26.77 0.001322
51 6 26.77 0.001205
52 6 26.78 0.001098
53 6 26.78 0.001000
54 6 26.78 0.000911
# Show estimated coefficients at the selected penalty
final_fit_mod %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -2.39 0.00316
2 disease_cat_Other -0.197 0.00316
3 bridging_yn_Yes 0.473 0.00316
4 anc_pre_ld_log10 -0.562 0.00316
5 plt_pre_ld_log10 -0.153 0.00316
6 ldh_pre_ld_log10 0.434 0.00316
7 ferritin_day_3_log10 0.758 0.00316
# Evaluate final model on test set
final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.889 Preprocessor1_Model1
2 roc_auc binary 0.863 Preprocessor1_Model1
3 brier_class binary 0.0861 Preprocessor1_Model1
# Plot calibration curve for test set
# Using {probably}
final_fit %>%
collect_predictions() %>%
cal_plot_breaks(
truth = icaht_grade_3_4,
estimate = .pred_1,
event_level = "second",
num_breaks = 5
)
# Using {runway}
# Augment df_test (i.e., add columns for predictions)
df_mod_post_bridging <- wf_final_trained %>%
augment(new_data = df_test) %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
p_cal_post_bridging <- cal_plot(
df_mod_post_bridging,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE,
plot_title = "eIPMPost (+ bridging Y/N)"
)
p_cal_post_bridging
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.8631 208 31 177
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.175 0.5821 0.8029 0.7742 0.8079 24 7 34 143
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.005095741 0.01629999 0.03753716 0.08915127 0.1580132
0 0.005095741 0.01503095 0.03390703 0.07706598 0.1125861
1 0.060003793 0.07991962 0.17607529 0.28611468 0.4173873
3rd Qu. 95% Max. SD NAs
0.2015784 0.6020981 0.9967971 0.1914246 0
0.1468183 0.3419500 0.6807804 0.1146879 0
0.6448698 0.9672225 0.9967971 0.3065623 0
df_mod_post_bridging <- df_mod_post_bridging %>%
mutate(icaht_grade_3_4 = fct_relevel(icaht_grade_3_4, c("0", "1")))
dcurves::dca(
icaht_grade_3_4 ~ .pred_1,
data = df_mod_post_bridging,
thresholds = seq(0, 0.35, by = 0.01)
) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
Build and train logistic regression model with LASSO regularization
# Create model formula
model_formula <-
icaht_grade_3_4 ~ disease_cat_Other + anc_pre_ld_log10 + plt_pre_ld_log10 + ldh_pre_ld_log10 +
ferritin_day_3_log10 + crp_day_3_log10
# Create model workflow (bundles recipe and model)
wf <- workflow() %>%
add_model(spec_model, formula = model_formula) %>%
add_recipe(recipe)
# Fit model on training set
fit_train <- wf %>%
fit(data = df_train)
# Extract estimated coefficients
fit_train %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.78 0.1
2 disease_cat_Other 0 0.1
3 anc_pre_ld_log10 -0.0685 0.1
4 plt_pre_ld_log10 -0.0684 0.1
5 ldh_pre_ld_log10 0 0.1
6 ferritin_day_3_log10 0.162 0.1
7 crp_day_3_log10 0 0.1
Tune LASSO parameters
# Tune grid using workflow object
doParallel::registerDoParallel()
wf_tune <- workflow() %>%
add_model(tune_spec, formula = model_formula) %>%
add_recipe(recipe)
set.seed(2023)
lasso_grid <- tune_grid(wf_tune,
resamples = bootstrap_resamples,
grid = lambda_grid)
lasso_grid %>% collect_metrics()
# A tibble: 15 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 accuracy binary 0.873 25 0.00349 Preproces…
2 0.0000000001 brier_class binary 0.0965 25 0.00208 Preproces…
3 0.0000000001 roc_auc binary 0.823 25 0.00748 Preproces…
4 0.0000000316 accuracy binary 0.873 25 0.00349 Preproces…
5 0.0000000316 brier_class binary 0.0965 25 0.00208 Preproces…
6 0.0000000316 roc_auc binary 0.823 25 0.00748 Preproces…
7 0.00001 accuracy binary 0.873 25 0.00349 Preproces…
8 0.00001 brier_class binary 0.0965 25 0.00208 Preproces…
9 0.00001 roc_auc binary 0.823 25 0.00748 Preproces…
10 0.00316 accuracy binary 0.874 25 0.00304 Preproces…
11 0.00316 brier_class binary 0.0961 25 0.00201 Preproces…
12 0.00316 roc_auc binary 0.823 25 0.00757 Preproces…
13 1 accuracy binary 0.855 25 0.00293 Preproces…
14 1 brier_class binary 0.124 25 0.00207 Preproces…
15 1 roc_auc binary 0.5 25 0 Preproces…
# Visualize performance across the regularization penalty
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean, color = .metric)) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
geom_line(linewidth = 1.5) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
# Select highest AUROC
highest_auroc <- lasso_grid %>%
select_best(metric = "roc_auc")
# Finalize workflow
wf_final <-
finalize_workflow(wf_tune,
highest_auroc)
# Visualize most important variables
wf_final %>%
fit(data = df_train) %>%
extract_fit_parsnip() %>%
vi(lambda = highest_auroc$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
Fit final best model on training set and evaluate on test set
# Fit the final best model to the training set and evaluate the test set
final_fit <- last_fit(wf_final,
df_split)
# Extract trained workflow from final_fit
wf_final_trained <- final_fit %>%
extract_workflow()
wf_final_trained
══ Workflow [trained] ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
3 Recipe Steps
• step_impute_bag()
• step_normalize()
• step_dummy()
── Model ─────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.126200
2 3 3.15 0.115000
3 3 6.60 0.104800
4 3 9.30 0.095470
5 3 11.46 0.086990
6 4 13.32 0.079260
7 4 15.26 0.072220
8 4 16.87 0.065800
9 4 18.22 0.059960
10 4 19.36 0.054630
11 5 20.49 0.049780
12 5 21.49 0.045350
13 5 22.34 0.041330
14 5 23.08 0.037650
15 5 23.72 0.034310
16 5 24.28 0.031260
17 5 24.75 0.028480
18 5 25.17 0.025950
19 5 25.52 0.023650
20 6 25.85 0.021550
21 6 26.14 0.019630
22 6 26.38 0.017890
23 6 26.60 0.016300
24 6 26.78 0.014850
25 6 26.94 0.013530
26 6 27.07 0.012330
27 6 27.18 0.011230
28 6 27.28 0.010240
29 6 27.36 0.009327
30 6 27.43 0.008499
31 6 27.49 0.007744
32 6 27.54 0.007056
33 6 27.58 0.006429
34 6 27.62 0.005858
35 6 27.65 0.005337
36 6 27.67 0.004863
37 6 27.70 0.004431
38 6 27.71 0.004038
39 6 27.73 0.003679
40 6 27.74 0.003352
41 6 27.75 0.003054
42 6 27.76 0.002783
43 6 27.77 0.002536
44 6 27.78 0.002310
45 6 27.78 0.002105
46 6 27.78 0.001918
...
and 9 more lines.
# Extract trained model from final_fit
final_fit_mod <- final_fit %>%
extract_fit_parsnip()
final_fit_mod
parsnip model object
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.126200
2 3 3.15 0.115000
3 3 6.60 0.104800
4 3 9.30 0.095470
5 3 11.46 0.086990
6 4 13.32 0.079260
7 4 15.26 0.072220
8 4 16.87 0.065800
9 4 18.22 0.059960
10 4 19.36 0.054630
11 5 20.49 0.049780
12 5 21.49 0.045350
13 5 22.34 0.041330
14 5 23.08 0.037650
15 5 23.72 0.034310
16 5 24.28 0.031260
17 5 24.75 0.028480
18 5 25.17 0.025950
19 5 25.52 0.023650
20 6 25.85 0.021550
21 6 26.14 0.019630
22 6 26.38 0.017890
23 6 26.60 0.016300
24 6 26.78 0.014850
25 6 26.94 0.013530
26 6 27.07 0.012330
27 6 27.18 0.011230
28 6 27.28 0.010240
29 6 27.36 0.009327
30 6 27.43 0.008499
31 6 27.49 0.007744
32 6 27.54 0.007056
33 6 27.58 0.006429
34 6 27.62 0.005858
35 6 27.65 0.005337
36 6 27.67 0.004863
37 6 27.70 0.004431
38 6 27.71 0.004038
39 6 27.73 0.003679
40 6 27.74 0.003352
41 6 27.75 0.003054
42 6 27.76 0.002783
43 6 27.77 0.002536
44 6 27.78 0.002310
45 6 27.78 0.002105
46 6 27.78 0.001918
47 6 27.79 0.001748
48 6 27.79 0.001592
49 6 27.79 0.001451
50 6 27.80 0.001322
51 6 27.80 0.001205
52 6 27.80 0.001098
53 6 27.80 0.001000
54 6 27.80 0.000911
55 6 27.80 0.000830
# Show estimated coefficients at the selected penalty
final_fit_mod %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -2.08 0.0000000001
2 disease_cat_Other -0.336 0.0000000001
3 anc_pre_ld_log10 -0.563 0.0000000001
4 plt_pre_ld_log10 -0.246 0.0000000001
5 ldh_pre_ld_log10 0.429 0.0000000001
6 ferritin_day_3_log10 0.555 0.0000000001
7 crp_day_3_log10 0.534 0.0000000001
# Evaluate final model on test set
final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.889 Preprocessor1_Model1
2 roc_auc binary 0.866 Preprocessor1_Model1
3 brier_class binary 0.0881 Preprocessor1_Model1
# Plot calibration curve for test set
# Using {probably}
final_fit %>%
collect_predictions() %>%
cal_plot_breaks(
truth = icaht_grade_3_4,
estimate = .pred_1,
event_level = "second",
num_breaks = 5
)
# Using {runway}
# Augment df_test (i.e., add columns for predictions)
df_mod_post_crp <- wf_final_trained %>%
augment(new_data = df_test) %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
p_cal_post_crp <- cal_plot(
df_mod_post_crp,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE,
plot_title = "eIPMPost (+ day +3 CRP)"
)
p_cal_post_crp
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.8664 208 31 177
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.2223 0.6063 0.8462 0.7419 0.8644 23 8 24 153
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.004248709 0.01075153 0.03614793 0.08968716 0.1563897
0 0.004248709 0.01060042 0.03134483 0.06847338 0.1114428
1 0.049466971 0.10137204 0.19028993 0.27979496 0.4130221
3rd Qu. 95% Max. SD NAs
0.1770402 0.6453671 0.9971238 0.1947642 0
0.1462608 0.3451540 0.7361069 0.1220691 0
0.6923864 0.9713001 0.9971238 0.3071562 0
Build and train logistic regression model with LASSO regularization
# Create model formula
model_formula <-
icaht_grade_3_4 ~ disease_cat_Other + anc_pre_ld_log10 + plt_pre_ld_log10 + ldh_pre_ld_log10 +
ferritin_day_3_log10 + d_dimer_day_3_log10
# Create model workflow (bundles recipe and model)
wf <- workflow() %>%
add_model(spec_model, formula = model_formula) %>%
add_recipe(recipe)
# Fit model on training set
fit_train <- wf %>%
fit(data = df_train)
# Extract estimated coefficients
fit_train %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.78 0.1
2 disease_cat_Other 0 0.1
3 anc_pre_ld_log10 -0.0685 0.1
4 plt_pre_ld_log10 -0.0684 0.1
5 ldh_pre_ld_log10 0 0.1
6 ferritin_day_3_log10 0.162 0.1
7 d_dimer_day_3_log10 0 0.1
Tune LASSO parameters
# Tune grid using workflow object
doParallel::registerDoParallel()
wf_tune <- workflow() %>%
add_model(tune_spec, formula = model_formula) %>%
add_recipe(recipe)
set.seed(2023)
lasso_grid <- tune_grid(wf_tune,
resamples = bootstrap_resamples,
grid = lambda_grid)
lasso_grid %>% collect_metrics()
# A tibble: 15 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 accuracy binary 0.874 25 0.00305 Preproces…
2 0.0000000001 brier_class binary 0.0976 25 0.00216 Preproces…
3 0.0000000001 roc_auc binary 0.808 25 0.00802 Preproces…
4 0.0000000316 accuracy binary 0.874 25 0.00305 Preproces…
5 0.0000000316 brier_class binary 0.0976 25 0.00216 Preproces…
6 0.0000000316 roc_auc binary 0.808 25 0.00802 Preproces…
7 0.00001 accuracy binary 0.874 25 0.00305 Preproces…
8 0.00001 brier_class binary 0.0976 25 0.00216 Preproces…
9 0.00001 roc_auc binary 0.808 25 0.00802 Preproces…
10 0.00316 accuracy binary 0.875 25 0.00303 Preproces…
11 0.00316 brier_class binary 0.0972 25 0.00210 Preproces…
12 0.00316 roc_auc binary 0.809 25 0.00801 Preproces…
13 1 accuracy binary 0.855 25 0.00293 Preproces…
14 1 brier_class binary 0.124 25 0.00207 Preproces…
15 1 roc_auc binary 0.5 25 0 Preproces…
# Visualize performance across the regularization penalty
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean, color = .metric)) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
geom_line(linewidth = 1.5) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
# Select highest AUROC
highest_auroc <- lasso_grid %>%
select_best(metric = "roc_auc")
# Finalize workflow
wf_final <-
finalize_workflow(wf_tune,
highest_auroc)
# Visualize most important variables
wf_final %>%
fit(data = df_train) %>%
extract_fit_parsnip() %>%
vi(lambda = highest_auroc$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
Fit final best model on training set and evaluate on test set
# Fit the final best model to the training set and evaluate the test set
final_fit <- last_fit(wf_final,
df_split)
# Extract trained workflow from final_fit
wf_final_trained <- final_fit %>%
extract_workflow()
wf_final_trained
══ Workflow [trained] ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
3 Recipe Steps
• step_impute_bag()
• step_normalize()
• step_dummy()
── Model ─────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.126200
2 3 3.15 0.115000
3 3 6.60 0.104800
4 3 9.30 0.095470
5 3 11.46 0.086990
6 4 13.32 0.079260
7 4 15.26 0.072220
8 4 16.87 0.065800
9 4 18.22 0.059960
10 4 19.36 0.054630
11 4 20.33 0.049780
12 4 21.15 0.045350
13 4 21.86 0.041330
14 5 22.47 0.037650
15 5 23.03 0.034310
16 5 23.51 0.031260
17 5 23.93 0.028480
18 5 24.28 0.025950
19 5 24.58 0.023650
20 6 24.84 0.021550
21 6 25.09 0.019630
22 6 25.30 0.017890
23 6 25.48 0.016300
24 6 25.63 0.014850
25 6 25.76 0.013530
26 6 25.87 0.012330
27 6 25.97 0.011230
28 6 26.05 0.010240
29 6 26.12 0.009327
30 6 26.17 0.008499
31 6 26.22 0.007744
32 6 26.26 0.007056
33 6 26.30 0.006429
34 6 26.33 0.005858
35 6 26.35 0.005337
36 6 26.37 0.004863
37 6 26.39 0.004431
38 6 26.41 0.004038
39 6 26.42 0.003679
40 6 26.43 0.003352
41 6 26.44 0.003054
42 6 26.44 0.002783
43 6 26.45 0.002536
44 6 26.45 0.002310
45 6 26.46 0.002105
46 6 26.46 0.001918
...
and 7 more lines.
# Extract trained model from final_fit
final_fit_mod <- final_fit %>%
extract_fit_parsnip()
final_fit_mod
parsnip model object
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.126200
2 3 3.15 0.115000
3 3 6.60 0.104800
4 3 9.30 0.095470
5 3 11.46 0.086990
6 4 13.32 0.079260
7 4 15.26 0.072220
8 4 16.87 0.065800
9 4 18.22 0.059960
10 4 19.36 0.054630
11 4 20.33 0.049780
12 4 21.15 0.045350
13 4 21.86 0.041330
14 5 22.47 0.037650
15 5 23.03 0.034310
16 5 23.51 0.031260
17 5 23.93 0.028480
18 5 24.28 0.025950
19 5 24.58 0.023650
20 6 24.84 0.021550
21 6 25.09 0.019630
22 6 25.30 0.017890
23 6 25.48 0.016300
24 6 25.63 0.014850
25 6 25.76 0.013530
26 6 25.87 0.012330
27 6 25.97 0.011230
28 6 26.05 0.010240
29 6 26.12 0.009327
30 6 26.17 0.008499
31 6 26.22 0.007744
32 6 26.26 0.007056
33 6 26.30 0.006429
34 6 26.33 0.005858
35 6 26.35 0.005337
36 6 26.37 0.004863
37 6 26.39 0.004431
38 6 26.41 0.004038
39 6 26.42 0.003679
40 6 26.43 0.003352
41 6 26.44 0.003054
42 6 26.44 0.002783
43 6 26.45 0.002536
44 6 26.45 0.002310
45 6 26.46 0.002105
46 6 26.46 0.001918
47 6 26.46 0.001748
48 6 26.47 0.001592
49 6 26.47 0.001451
50 6 26.47 0.001322
51 6 26.47 0.001205
52 6 26.47 0.001098
53 6 26.47 0.001000
# Show estimated coefficients at the selected penalty
final_fit_mod %>%
tidy()
# A tibble: 7 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -2.01 0.00316
2 disease_cat_Other -0.277 0.00316
3 anc_pre_ld_log10 -0.548 0.00316
4 plt_pre_ld_log10 -0.200 0.00316
5 ldh_pre_ld_log10 0.430 0.00316
6 ferritin_day_3_log10 0.646 0.00316
7 d_dimer_day_3_log10 0.183 0.00316
# Evaluate final model on test set
final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.885 Preprocessor1_Model1
2 roc_auc binary 0.879 Preprocessor1_Model1
3 brier_class binary 0.0853 Preprocessor1_Model1
# Plot calibration curve for test set
# Using {probably}
final_fit %>%
collect_predictions() %>%
cal_plot_breaks(
truth = icaht_grade_3_4,
estimate = .pred_1,
event_level = "second",
num_breaks = 5
)
# Using {runway}
# Augment df_test (i.e., add columns for predictions)
df_mod_post_d_dimer <- wf_final_trained %>%
augment(new_data = df_test) %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
p_cal_post_d_dimer <- cal_plot(
df_mod_post_d_dimer,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE,
plot_title = "eIPMPost (+ day +3 D-dimer)"
)
p_cal_post_d_dimer
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.879 208 31 177
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.2175 0.6233 0.8606 0.7419 0.8814 23 8 21 156
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.007698222 0.01753568 0.04001359 0.09004261 0.1573276
0 0.007698222 0.01729843 0.03625101 0.07104828 0.1119078
1 0.043622126 0.10359214 0.19091415 0.30097656 0.4166597
3rd Qu. 95% Max. SD NAs
0.1841708 0.5847027 0.9969185 0.1895310 0
0.1552416 0.3695235 0.6825382 0.1146589 0
0.6277296 0.9627989 0.9969185 0.2984432 0
df_mod_post_d_dimer <- df_mod_post_d_dimer %>%
mutate(icaht_grade_3_4 = fct_relevel(icaht_grade_3_4, c("0", "1")))
dcurves::dca(icaht_grade_3_4 ~ .pred_1, data = df_mod_post_d_dimer,
thresholds = seq(0, 0.35, by = 0.01)) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
Build and train logistic regression model with LASSO regularization
# Create model formula
model_formula <-
icaht_grade_3_4 ~ disease_cat_Other + anc_day_3_log10 + plt_day_3_log10 + ldh_pre_ld_log10 + ferritin_day_3_log10
# Create model workflow (bundles recipe and model)
wf <- workflow() %>%
add_model(spec_model, formula = model_formula) %>%
add_recipe(recipe)
# Fit model on training set
fit_train <- wf %>%
fit(data = df_train)
# Extract estimated coefficients
fit_train %>%
extract_fit_parsnip() %>%
tidy()
# A tibble: 6 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.81 0.1
2 disease_cat_Other 0 0.1
3 anc_day_3_log10 -0.374 0.1
4 plt_day_3_log10 0 0.1
5 ldh_pre_ld_log10 0 0.1
6 ferritin_day_3_log10 0.0260 0.1
Tune LASSO parameters
# Tune grid using workflow object
doParallel::registerDoParallel()
wf_tune <- workflow() %>%
add_model(tune_spec, formula = model_formula) %>%
add_recipe(recipe)
set.seed(2023)
lasso_grid <- tune_grid(wf_tune,
resamples = bootstrap_resamples,
grid = lambda_grid)
lasso_grid %>% collect_metrics()
# A tibble: 15 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.0000000001 accuracy binary 0.874 25 0.00212 Preproces…
2 0.0000000001 brier_class binary 0.0985 25 0.00217 Preproces…
3 0.0000000001 roc_auc binary 0.834 25 0.00674 Preproces…
4 0.0000000316 accuracy binary 0.874 25 0.00212 Preproces…
5 0.0000000316 brier_class binary 0.0985 25 0.00217 Preproces…
6 0.0000000316 roc_auc binary 0.834 25 0.00674 Preproces…
7 0.00001 accuracy binary 0.874 25 0.00212 Preproces…
8 0.00001 brier_class binary 0.0985 25 0.00217 Preproces…
9 0.00001 roc_auc binary 0.834 25 0.00674 Preproces…
10 0.00316 accuracy binary 0.874 25 0.00210 Preproces…
11 0.00316 brier_class binary 0.0982 25 0.00208 Preproces…
12 0.00316 roc_auc binary 0.834 25 0.00684 Preproces…
13 1 accuracy binary 0.855 25 0.00293 Preproces…
14 1 brier_class binary 0.124 25 0.00207 Preproces…
15 1 roc_auc binary 0.5 25 0 Preproces…
# Visualize performance across the regularization penalty
lasso_grid %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean, color = .metric)) +
geom_errorbar(aes(ymin = mean - std_err, ymax = mean + std_err), alpha = 0.5) +
geom_line(linewidth = 1.5) +
facet_wrap( ~ .metric, scales = "free", nrow = 2) +
scale_x_log10() +
theme(legend.position = "none")
# Select highest AUROC
highest_auroc <- lasso_grid %>%
select_best(metric = "roc_auc")
# Finalize workflow
wf_final <-
finalize_workflow(wf_tune,
highest_auroc)
# Visualize most important variables
wf_final %>%
fit(data = df_train) %>%
extract_fit_parsnip() %>%
vi(lambda = highest_auroc$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL)
Fit final best model on training set and evaluate on test set
# Fit the final best model to the training set and evaluate the test set
final_fit <- last_fit(wf_final,
df_split)
# Extract trained workflow from final_fit
wf_final_trained <- final_fit %>%
extract_workflow()
wf_final_trained
══ Workflow [trained] ════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()
── Preprocessor ──────────────────────────────────────────────────────
3 Recipe Steps
• step_impute_bag()
• step_normalize()
• step_dummy()
── Model ─────────────────────────────────────────────────────────────
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.164100
2 1 3.98 0.149600
3 1 6.82 0.136300
4 1 8.96 0.124200
5 1 10.63 0.113100
6 2 12.06 0.103100
7 2 14.01 0.093930
8 2 15.64 0.085590
9 2 17.02 0.077980
10 2 18.19 0.071060
11 4 19.24 0.064740
12 4 20.44 0.058990
13 4 21.44 0.053750
14 4 22.30 0.048980
15 5 23.03 0.044620
16 5 23.76 0.040660
17 5 24.38 0.037050
18 5 24.91 0.033760
19 5 25.36 0.030760
20 5 25.74 0.028030
21 5 26.06 0.025540
22 5 26.33 0.023270
23 5 26.56 0.021200
24 5 26.76 0.019320
25 5 26.93 0.017600
26 5 27.07 0.016040
27 5 27.19 0.014610
28 5 27.29 0.013310
29 5 27.37 0.012130
30 5 27.44 0.011050
31 5 27.50 0.010070
32 5 27.55 0.009177
33 5 27.60 0.008362
34 5 27.63 0.007619
35 5 27.66 0.006942
36 5 27.69 0.006325
37 5 27.71 0.005764
38 5 27.73 0.005251
39 5 27.74 0.004785
40 5 27.75 0.004360
41 5 27.76 0.003973
42 5 27.77 0.003620
43 5 27.78 0.003298
44 5 27.79 0.003005
45 5 27.79 0.002738
46 5 27.80 0.002495
...
and 8 more lines.
# Extract trained model from final_fit
final_fit_mod <- final_fit %>%
extract_fit_parsnip()
final_fit_mod
parsnip model object
Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial", alpha = ~1)
Df %Dev Lambda
1 0 0.00 0.164100
2 1 3.98 0.149600
3 1 6.82 0.136300
4 1 8.96 0.124200
5 1 10.63 0.113100
6 2 12.06 0.103100
7 2 14.01 0.093930
8 2 15.64 0.085590
9 2 17.02 0.077980
10 2 18.19 0.071060
11 4 19.24 0.064740
12 4 20.44 0.058990
13 4 21.44 0.053750
14 4 22.30 0.048980
15 5 23.03 0.044620
16 5 23.76 0.040660
17 5 24.38 0.037050
18 5 24.91 0.033760
19 5 25.36 0.030760
20 5 25.74 0.028030
21 5 26.06 0.025540
22 5 26.33 0.023270
23 5 26.56 0.021200
24 5 26.76 0.019320
25 5 26.93 0.017600
26 5 27.07 0.016040
27 5 27.19 0.014610
28 5 27.29 0.013310
29 5 27.37 0.012130
30 5 27.44 0.011050
31 5 27.50 0.010070
32 5 27.55 0.009177
33 5 27.60 0.008362
34 5 27.63 0.007619
35 5 27.66 0.006942
36 5 27.69 0.006325
37 5 27.71 0.005764
38 5 27.73 0.005251
39 5 27.74 0.004785
40 5 27.75 0.004360
41 5 27.76 0.003973
42 5 27.77 0.003620
43 5 27.78 0.003298
44 5 27.79 0.003005
45 5 27.79 0.002738
46 5 27.80 0.002495
47 5 27.80 0.002273
48 5 27.80 0.002071
49 5 27.80 0.001887
50 5 27.81 0.001720
51 5 27.81 0.001567
52 5 27.81 0.001428
53 5 27.81 0.001301
54 5 27.81 0.001185
# Show estimated coefficients at the selected penalty
final_fit_mod %>%
tidy()
# A tibble: 6 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) -1.81 0.00316
2 disease_cat_Other -0.556 0.00316
3 anc_day_3_log10 -0.609 0.00316
4 plt_day_3_log10 -0.218 0.00316
5 ldh_pre_ld_log10 0.298 0.00316
6 ferritin_day_3_log10 0.576 0.00316
# Evaluate final model on test set
final_fit %>% collect_metrics()
# A tibble: 3 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy binary 0.861 Preprocessor1_Model1
2 roc_auc binary 0.877 Preprocessor1_Model1
3 brier_class binary 0.0923 Preprocessor1_Model1
# Plot calibration curve for test set
# Using {probably}
final_fit %>%
collect_predictions() %>%
cal_plot_breaks(
truth = icaht_grade_3_4,
estimate = .pred_1,
event_level = "second",
num_breaks = 5
)
# Using {runway}
# Augment df_test (i.e., add columns for predictions)
df_mod_post_day_3 <- wf_final_trained %>%
augment(new_data = df_test) %>%
mutate(.pred_class = as.numeric(as.character(.pred_class)))
p_cal_post_day_3 <- cal_plot(
df_mod_post_day_3,
outcome = "icaht_grade_3_4",
positive = "1",
prediction = ".pred_1",
n_bins = 0,
show_loess = TRUE,
plot_title = "eIPMPost (day +3 ANC/plt)"
)
p_cal_post_day_3
Method: maximize_metric
Predictor: .pred_1
Outcome: icaht_grade_3_4
Direction: >=
AUC n n_pos n_neg
0.8772 208 31 177
optimal_cutpoint youden acc sensitivity specificity tp fn fp tn
0.115 0.6812 0.774 0.9355 0.7458 29 2 45 132
Predictor summary:
Data Min. 5% 1st Qu. Median Mean
Overall 0.008789295 0.01859198 0.03869103 0.07791277 0.1499486
0 0.008789295 0.01834401 0.03576852 0.06418202 0.1082043
1 0.046506841 0.10745259 0.15332250 0.25756441 0.3882952
3rd Qu. 95% Max. SD NAs
0.1697938 0.5909424 0.9789442 0.1915267 0
0.1170436 0.3374924 0.7266540 0.1303143 0
0.5510965 0.9580952 0.9789442 0.2906861 0
df_mod_post_day_3 <- df_mod_post_day_3 %>%
mutate(icaht_grade_3_4 = fct_relevel(icaht_grade_3_4, c("0", "1")))
dcurves::dca(icaht_grade_3_4 ~ .pred_1,
data = df_mod_post_day_3,
thresholds = seq(0, 0.35, by = 0.01)) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
df_mod_all <- df_mod_pre_ferritin_pre_ld %>%
rename(predictions_pre_ferritin_pre_ld = .pred_1) %>%
left_join(
.,
df_mod_pre_ferritin_pre_ld_bridging %>% select(upn, predictions_pre_ferritin_pre_ld_bridging = .pred_1),
by = "upn"
) %>%
left_join(
.,
df_mod_pre_ferritin_pre_ld_LD_regimen %>% select(upn, predictions_pre_ferritin_pre_ld_LD_regimen = .pred_1),
by = "upn"
) %>%
left_join(
.,
df_mod_post_ferritin_only %>% select(upn, predictions_post_ferritin_only = .pred_1),
by = "upn"
) %>%
left_join(.,
df_mod_post_bridging %>% select(upn, predictions_post_bridging = .pred_1),
by = "upn") %>%
left_join(.,
df_mod_post_crp %>% select(upn, predictions_post_crp = .pred_1),
by = "upn") %>%
left_join(.,
df_mod_post_d_dimer %>% select(upn, predictions_post_d_dimer = .pred_1),
by = "upn") %>%
left_join(.,
df_mod_post_day_3 %>% select(upn, predictions_post_day_3 = .pred_1),
by = "upn") %>%
select(
upn,
icaht_grade_3_4,
predictions_pre_ferritin_pre_ld,
predictions_pre_ferritin_pre_ld_bridging,
predictions_pre_ferritin_pre_ld_LD_regimen,
predictions_post_ferritin_only,
predictions_post_bridging,
predictions_post_crp,
predictions_post_d_dimer,
predictions_post_day_3
)
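# The join chain above repeats one pattern; an equivalent sketch (a subset of
# models shown) collects the per-model frames in a named list and reduces over
# left_join(). Assumes each df_mod_* carries `upn` and `.pred_1`.
model_preds <- list(
  predictions_post_ferritin_only = df_mod_post_ferritin_only,
  predictions_post_bridging = df_mod_post_bridging,
  predictions_post_crp = df_mod_post_crp,
  predictions_post_d_dimer = df_mod_post_d_dimer,
  predictions_post_day_3 = df_mod_post_day_3
)
df_mod_all_alt <- purrr::reduce(
  purrr::imap(model_preds, ~ select(.x, upn, !!.y := .pred_1)),
  dplyr::left_join,
  by = "upn",
  .init = df_mod_pre_ferritin_pre_ld %>%
    select(upn, icaht_grade_3_4, predictions_pre_ferritin_pre_ld = .pred_1)
)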
dcurves::dca(
icaht_grade_3_4 ~ predictions_pre_ferritin_pre_ld + predictions_post_ferritin_only,
data = df_mod_all,
thresholds = seq(0, 0.35, by = 0.01),
label = list(
predictions_pre_ferritin_pre_ld = "eIPMPre",
predictions_post_ferritin_only = "eIPMPost"
)
) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
p_dca_pre_all <- dcurves::dca(
icaht_grade_3_4 ~ predictions_pre_ferritin_pre_ld + predictions_pre_ferritin_pre_ld_bridging +
predictions_pre_ferritin_pre_ld_LD_regimen,
data = df_mod_all,
thresholds = seq(0, 0.35, by = 0.01),
label = list(
predictions_pre_ferritin_pre_ld = "eIPMPre",
predictions_pre_ferritin_pre_ld_bridging = "eIPMPre (+ bridging Y/N)",
predictions_pre_ferritin_pre_ld_LD_regimen = "eIPMPre (+ LD regimen)"
)
) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
p_dca_post_all <- dcurves::dca(
icaht_grade_3_4 ~ predictions_post_ferritin_only + predictions_post_bridging + predictions_post_crp +
predictions_post_d_dimer + predictions_post_day_3,
data = df_mod_all,
thresholds = seq(0, 0.35, by = 0.01),
label = list(
predictions_post_ferritin_only = "eIPMPost",
predictions_post_bridging = "eIPMPost (+ bridging Y/N)",
predictions_post_crp ="eIPMPost (+ day +3 CRP)",
predictions_post_d_dimer ="eIPMPost (+ day +3 D-dimer)",
predictions_post_day_3 = "eIPMPost (day +3 ANC/plt)"
)
) %>%
plot(smooth = TRUE) +
labs(x = "Threshold Probability (Physician/Patient Preference)")
p <- p_dca_pre_all + p_dca_post_all
p + plot_layout(ncol = 1) + plot_annotation(tag_levels = "A")
# Create ROC data frames for each model
df_roc_pre_ferritin_pre_ld <- df_mod_pre_ferritin_pre_ld %>%
ungroup() %>%
roc_curve(truth = icaht_grade_3_4, .pred_1, event_level = "second")
df_roc_pre_ferritin_pre_ld_bridging <- df_mod_pre_ferritin_pre_ld_bridging %>%
ungroup() %>%
roc_curve(truth = icaht_grade_3_4, .pred_1, event_level = "second")
df_roc_pre_ferritin_pre_ld_LD_regimen <- df_mod_pre_ferritin_pre_ld_LD_regimen %>%
ungroup() %>%
roc_curve(truth = icaht_grade_3_4, .pred_1, event_level = "second")
df_roc_post_ferritin_only <- df_mod_post_ferritin_only %>%
ungroup() %>%
roc_curve(truth = icaht_grade_3_4, .pred_1, event_level = "second")
df_roc_post_bridging <- df_mod_post_bridging %>%
ungroup() %>%
roc_curve(truth = icaht_grade_3_4, .pred_1, event_level = "second")
df_roc_post_crp <- df_mod_post_crp %>%
ungroup() %>%
roc_curve(truth = icaht_grade_3_4, .pred_1, event_level = "second")
df_roc_post_d_dimer <- df_mod_post_d_dimer %>%
ungroup() %>%
roc_curve(truth = icaht_grade_3_4, .pred_1, event_level = "second")
df_roc_post_day_3 <- df_mod_post_day_3 %>%
ungroup() %>%
roc_curve(truth = icaht_grade_3_4, .pred_1, event_level = "second")
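# The eight roc_curve() calls above share one pattern; a compact sketch maps
# over a named list and binds rows with a model identifier (three models shown
# for illustration):
df_roc_all <- purrr::map_dfr(
  list(
    "eIPMPre" = df_mod_pre_ferritin_pre_ld,
    "eIPMPost" = df_mod_post_ferritin_only,
    "eIPMPost (day +3 ANC/plt)" = df_mod_post_day_3
  ),
  ~ roc_curve(ungroup(.x), truth = icaht_grade_3_4, .pred_1,
              event_level = "second"),
  .id = "model"
)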
# eIPMPres
p_roc_pre_all <- df_roc_pre_ferritin_pre_ld %>%
mutate(model = "eIPMPre") %>%
bind_rows(
df_roc_pre_ferritin_pre_ld_bridging %>% mutate(model = "eIPMPre (+ bridging Y/N)"),
df_roc_pre_ferritin_pre_ld_LD_regimen %>% mutate(model = "eIPMPre (+ LD regimen)")
) %>%
ggplot(aes(
x = 1 - specificity,
y = sensitivity,
color = model
)) +
geom_path(linewidth = 1) + # `size` is deprecated for lines in ggplot2 >= 3.4
geom_abline(lty = 3) +
labs(color = "Model")
# eIPMPosts
p_roc_post_all <- df_roc_post_ferritin_only %>%
mutate(model = "eIPMPost") %>%
bind_rows(
df_roc_post_bridging %>% mutate(model = "eIPMPost (+ bridging Y/N)"),
df_roc_post_crp %>% mutate(model = "eIPMPost (+ day +3 CRP)"),
df_roc_post_d_dimer %>% mutate(model = "eIPMPost (+ day +3 D-dimer)"),
df_roc_post_day_3 %>% mutate(model = "eIPMPost (day +3 ANC/plt)")
) %>%
ggplot(aes(
x = 1 - specificity,
y = sensitivity,
color = model
)) +
geom_path(linewidth = 1) + # `size` is deprecated for lines in ggplot2 >= 3.4
geom_abline(lty = 3) +
labs(color = "Model")
p <- p_roc_pre_all + p_roc_post_all
p + plot_layout(ncol = 1) + plot_annotation(tag_levels = "A")
tbl_train <- df %>%
filter(upn %in% df_train$upn) %>%
select(
cart_age,
gender_desc,
racenih,
ethnih,
prior_hct,
disease_cat,
bm_disease_by_morph,
bm_disease_by_flow,
bridging_yn,
LD_regimen_cat,
product_infused,
product_cat,
costim_domain,
anc_pre_ld,
hb_pre_ld,
plt_pre_ld
) %>%
tbl_summary(
missing_text = "Missing",
type = list(all_continuous() ~ "continuous2"),
statistic = list(
all_continuous() ~ c("{median} ({p25}, {p75})", "{min} - {max}"),
all_categorical() ~ "{n} ({p}%)"
),
digits = all_categorical() ~ 0,
sort = list(everything() ~ "frequency")
) %>%
bold_labels()
tbl_test <- df %>%
filter(upn %in% df_test$upn) %>%
select(
cart_age,
gender_desc,
racenih,
ethnih,
prior_hct,
disease_cat,
bm_disease_by_morph,
bm_disease_by_flow,
bridging_yn,
LD_regimen_cat,
product_infused,
product_cat,
costim_domain,
anc_pre_ld,
hb_pre_ld,
plt_pre_ld
) %>%
tbl_summary(
missing_text = "Missing",
type = list(all_continuous() ~ "continuous2"),
statistic = list(
all_continuous() ~ c("{median} ({p25}, {p75})", "{min} - {max}"),
all_categorical() ~ "{n} ({p}%)"
),
digits = all_categorical() ~ 0,
sort = list(everything() ~ "frequency")
) %>%
bold_labels()
tbl_merge(
tbls = list(tbl_train, tbl_test),
tab_spanner = c("**Training cohort**", "**Test cohort**")
)
Characteristic | Training cohort, N = 483¹ | Test cohort, N = 208¹ |
---|---|---|
Age | ||
Median (IQR) | 62 (50, 68) | 61 (53, 67) |
Range | 21 - 84 | 19 - 79 |
Sex | ||
Male | 306 (63%) | 131 (63%) |
Female | 177 (37%) | 77 (37%) |
Race | ||
White | 413 (86%) | 170 (82%) |
Asian | 35 (7%) | 23 (11%) |
Black or African American | 13 (3%) | 5 (2%) |
American Indian or Alaska Native | 11 (2%) | 2 (1%) |
Multiple | 6 (1%) | 2 (1%) |
Unknown | 3 (1%) | 4 (2%) |
Native Hawaiian or other Pacific Islander | 2 (0%) | 2 (1%) |
Ethnicity | ||
Not Hispanic or Latino | 440 (91%) | 189 (91%) |
Hispanic or Latino | 37 (8%) | 15 (7%) |
Unknown | 6 (1%) | 4 (2%) |
Prior HCT | 161 (33%) | 73 (35%) |
Disease | ||
Aggressive NHL | 216 (45%) | 102 (49%) |
Other indolent NHL | 84 (17%) | 33 (16%) |
MM/PCL | 79 (16%) | 41 (20%) |
ALL | 78 (16%) | 21 (10%) |
Mantle cell lymphoma | 26 (5%) | 11 (5%) |
Bone marrow involvement by morphology (%) | ||
Median (IQR) | 0 (0, 27) | 0 (0, 40) |
Range | 0 - 100 | 0 - 95 |
Missing | 188 | 79 |
Bone marrow involvement by flow cytometry (%) | ||
Median (IQR) | 0 (0, 20) | 0 (0, 18) |
Range | 0 - 99 | 0 - 97 |
Missing | 177 | 75 |
Bridging therapy | 292 (60%) | 130 (63%) |
Lymphodepletion regimen | ||
Low-intensity Cy/Flu | 284 (59%) | 119 (57%) |
High-intensity Cy/Flu | 175 (36%) | 78 (38%) |
Other | 24 (5%) | 11 (5%) |
CAR T-cell product | ||
JCAR014 | 134 (28%) | 59 (28%) |
YESCARTA | 94 (19%) | 46 (22%) |
BREYANZI | 64 (13%) | 23 (11%) |
CARVYKTI | 35 (7%) | 17 (8%) |
Investigational CD20-targeted product | 34 (7%) | 11 (5%) |
JCAR021 | 33 (7%) | 11 (5%) |
TECARTUS | 32 (7%) | 15 (7%) |
ABECMA | 22 (5%) | 8 (4%) |
Investigational BCMA-targeted product | 22 (5%) | 16 (8%) |
KYMRIAH | 13 (3%) | 2 (1%) |
CAR T-cell product type | ||
Investigational CAR T-cell product with 4-1BB costimulatory domain | 223 (46%) | 97 (47%) |
Commercial CD19-targeted CAR T-cell product with CD28 costimulatory domain | 126 (26%) | 61 (29%) |
Commercial CD19-targeted CAR T-cell product with 4-1BB costimulatory domain | 77 (16%) | 25 (12%) |
Commercial BCMA-targeted CAR T-cell product | 57 (12%) | 25 (12%) |
Costimulatory domain | ||
4-1BB | 323 (67%) | 136 (65%) |
CD28 | 126 (26%) | 61 (29%) |
Dual 4-1BB and CD28 | 34 (7%) | 11 (5%) |
Pre-LD ANC (x 10^3 cells/µL) | ||
Median (IQR) | 2.86 (1.84, 4.22) | 2.77 (1.58, 4.60) |
Range | 0.00 - 23.17 | 0.02 - 54.03 |
Missing | 1 | |
Pre-LD Hb (g/dL) | ||
Median (IQR) | 10.70 (9.50, 12.25) | 10.70 (9.60, 12.42) |
Range | 7.30 - 17.10 | 7.00 - 15.60 |
Pre-LD platelet count (x 10^3 cells/µL) | ||
Median (IQR) | 146 (94, 203) | 139 (82, 201) |
Range | 6 - 790 | 7 - 453 |
¹ n (%) |
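# The training- and test-cohort tables above are built from identical
# tbl_summary() calls; a small helper (sketch, equivalent output) would avoid
# the duplication:
cohort_vars <- c(
  "cart_age", "gender_desc", "racenih", "ethnih", "prior_hct", "disease_cat",
  "bm_disease_by_morph", "bm_disease_by_flow", "bridging_yn", "LD_regimen_cat",
  "product_infused", "product_cat", "costim_domain", "anc_pre_ld", "hb_pre_ld",
  "plt_pre_ld"
)
make_cohort_tbl <- function(upns) {
  df %>%
    filter(upn %in% upns) %>%
    select(all_of(cohort_vars)) %>%
    tbl_summary(
      missing_text = "Missing",
      type = list(all_continuous() ~ "continuous2"),
      statistic = list(
        all_continuous() ~ c("{median} ({p25}, {p75})", "{min} - {max}"),
        all_categorical() ~ "{n} ({p}%)"
      ),
      digits = all_categorical() ~ 0,
      sort = list(everything() ~ "frequency")
    ) %>%
    bold_labels()
}
tbl_merge(
  tbls = list(make_cohort_tbl(df_train$upn), make_cohort_tbl(df_test$upn)),
  tab_spanner = c("**Training cohort**", "**Test cohort**")
)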
df %>%
filter(upn %in% df_train$upn) %>%
plyr::summarize(
anc_pre_ld_5p = quantile(anc_pre_ld, 0.05, na.rm = TRUE),
plt_pre_ld_5p = quantile(plt_pre_ld, 0.05, na.rm = TRUE),
ldh_pre_ld_5p = quantile(ldh_pre_ld, 0.05, na.rm = TRUE),
ferritin_pre_ld_5p = quantile(ferritin_pre_ld, 0.05, na.rm = TRUE),
ferritin_day_3_5p = quantile(ferritin_day_3, 0.05, na.rm = TRUE)
)
  anc_pre_ld_5p plt_pre_ld_5p ldh_pre_ld_5p ferritin_pre_ld_5p ferritin_day_3_5p
1          0.71          24.1        114.35               27.1                68
df %>%
filter(upn %in% df_train$upn) %>%
plyr::summarize(
anc_pre_ld_95p = quantile(anc_pre_ld, 0.95, na.rm = TRUE),
plt_pre_ld_95p = quantile(plt_pre_ld, 0.95, na.rm = TRUE),
ldh_pre_ld_95p = quantile(ldh_pre_ld, 0.95, na.rm = TRUE),
ferritin_pre_ld_95p = quantile(ferritin_pre_ld, 0.95, na.rm = TRUE),
ferritin_day_3_95p = quantile(ferritin_day_3, 0.95, na.rm = TRUE)
)
  anc_pre_ld_95p plt_pre_ld_95p ldh_pre_ld_95p ferritin_pre_ld_95p ferritin_day_3_95p
1         7.9815          287.4         644.75              3904.1            4931.45
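# The same percentiles can be computed without the retired {plyr} dependency
# (a sketch using dplyr only):
df %>%
  filter(upn %in% df_train$upn) %>%
  summarise(across(
    c(anc_pre_ld, plt_pre_ld, ldh_pre_ld, ferritin_pre_ld, ferritin_day_3),
    list(p5 = ~ quantile(.x, 0.05, na.rm = TRUE),
         p95 = ~ quantile(.x, 0.95, na.rm = TRUE))
  ))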
# eIPMPre
df_mod_pre_ferritin_pre_ld <- df_mod_pre_ferritin_pre_ld %>%
left_join(., df_icaht %>% select(upn, duration_below_501_max), by = "upn") %>%
mutate(.pred_1_as_percentage = .pred_1 * 100)
# Calculate correlation coefficient and p-value
cor_test <- cor.test(
df_mod_pre_ferritin_pre_ld$.pred_1_as_percentage,
df_mod_pre_ferritin_pre_ld$duration_below_501_max
)
cor_coef <- cor_test$estimate
cor_coef
cor
0.5979965
p_value <- cor_test$p.value
p_value
[1] 1.472629e-21
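# Note: cor.test() defaults to Pearson's r. Because neutropenia duration is a
# skewed count, a rank-based check (sketch) may be worth reporting alongside it:
cor.test(
  df_mod_pre_ferritin_pre_ld$.pred_1_as_percentage,
  df_mod_pre_ferritin_pre_ld$duration_below_501_max,
  method = "spearman"
)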
p1 <- df_mod_pre_ferritin_pre_ld %>%
ggplot(aes(x = .pred_1_as_percentage, y = duration_below_501_max)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE,
color = "blue") + # Add linear regression line
labs(x = "Probability of Severe eICAHT (%) Predicted by eIPMPre",
y = "Duration of Neutropenia (Days)") +
annotate(
"text",
x = 90,
y = 5,
label = paste("r =", round(cor_coef, 2), "\np < 0.001"),
hjust = 0,
vjust = 1,
size = 4,
color = "red"
)
# eIPMPost
df_mod_post_ferritin_only <- df_mod_post_ferritin_only %>%
left_join(., df_icaht %>% select(upn, duration_below_501_max), by = "upn") %>%
mutate(.pred_1_as_percentage = .pred_1 * 100)
# Calculate correlation coefficient and p-value
cor_test <- cor.test(
df_mod_post_ferritin_only$.pred_1_as_percentage,
df_mod_post_ferritin_only$duration_below_501_max
)
cor_coef <- cor_test$estimate
cor_coef
cor
0.6136532
p_value <- cor_test$p.value
p_value
[1] 6.550573e-23
p2 <- df_mod_post_ferritin_only %>%
ggplot(aes(x = .pred_1_as_percentage, y = duration_below_501_max)) +
geom_point() +
geom_smooth(method = "lm",
se = FALSE,
color = "blue") + # Add linear regression line
labs(x = "Probability of Severe eICAHT (%) Predicted by eIPMPost",
y = "Duration of Neutropenia (Days)") +
annotate(
"text",
x = 90,
y = 5,
label = paste("r =", round(cor_coef, 2), "\np < 0.001"),
hjust = 0,
vjust = 1,
size = 4,
color = "red"
)
p <- p1 + p2
p + plot_layout(ncol = 1) + plot_annotation(tag_levels = "A")
# eIPMPre
df_mod_pre_ferritin_pre_ld <- df_mod_pre_ferritin_pre_ld %>%
left_join(., df %>% select(upn, OS_status, OS_months), by = "upn")
tbl_pre <- coxph(Surv(OS_months, OS_status) ~ .pred_1_as_percentage,
data = df_mod_pre_ferritin_pre_ld) %>%
tbl_regression(exponentiate = TRUE) %>%
bold_p()
# eIPMPost
df_mod_post_ferritin_only <- df_mod_post_ferritin_only %>%
left_join(., df %>% select(upn, OS_status, OS_months), by = "upn")
tbl_post <- coxph(Surv(OS_months, OS_status) ~ .pred_1_as_percentage,
data = df_mod_post_ferritin_only) %>%
tbl_regression(exponentiate = TRUE) %>%
bold_p()
tbl_stack(
list(tbl_pre, tbl_post),
group_header =
c(
"eIPMPre",
"eIPMPost"
)
) %>%
modify_header(label = "**Probability of severe eICAHT predicted by eIPM**")
Probability of severe eICAHT predicted by eIPM | HR¹ | 95% CI¹ | p-value |
---|---|---|---|
eIPMPre | |||
.pred_1_as_percentage | 1.04 | 1.03, 1.05 | <0.001 |
eIPMPost | |||
.pred_1_as_percentage | 1.04 | 1.03, 1.04 | <0.001 |
¹ HR = Hazard Ratio, CI = Confidence Interval |
# Impute missing data
df_car_hematotox <- df %>%
filter(upn %in% df_test$upn) %>%
select(upn,
icaht_grade_3_4,
anc_pre_ld,
hb_pre_ld,
plt_pre_ld,
crp_pre_ld,
ferritin_pre_ld)
recipe <- recipe(icaht_grade_3_4 ~ ., data = df_car_hematotox) %>%
update_role(upn, new_role = "ID") %>%
update_role_requirements("ID", bake = FALSE) %>%
step_impute_bag(anc_pre_ld,
hb_pre_ld,
plt_pre_ld,
crp_pre_ld,
ferritin_pre_ld,
seed_val = 100)
# Extract processed dataframe
df_car_hematotox <- recipe %>%
prep() %>%
bake(new_data = NULL)
# Calculate CAR-HEMATOTOX score (continuous)
# between() is used for the middle bands because `%in% c(75:175)` matches only
# whole numbers and would silently return NA for non-integer values (e.g.,
# ferritin); exact cutoff values, which the strict </> comparisons previously
# left as NA, are assigned per the published CAR-HEMATOTOX directions
df_car_hematotox <- df_car_hematotox %>%
  mutate(
    plt_hematotox_score = case_when(
      plt_pre_ld > 175 ~ 0,
      between(plt_pre_ld, 75, 175) ~ 1,
      plt_pre_ld < 75 ~ 2,
      is.na(plt_pre_ld) ~ NA_real_
    ),
    anc_hematotox_score = case_when(
      anc_pre_ld >= 1.2 ~ 0,
      anc_pre_ld < 1.2 ~ 1,
      is.na(anc_pre_ld) ~ NA_real_
    ),
    hb_hematotox_score = case_when(hb_pre_ld >= 9 ~ 0,
                                   hb_pre_ld < 9 ~ 1,
                                   is.na(hb_pre_ld) ~ NA_real_),
    crp_hematotox_score = case_when(crp_pre_ld <= 3 ~ 0,
                                    crp_pre_ld > 3 ~ 1,
                                    is.na(crp_pre_ld) ~ NA_real_),
    ferritin_hematotox_score = case_when(
      ferritin_pre_ld < 650 ~ 0,
      between(ferritin_pre_ld, 650, 2000) ~ 1,
      ferritin_pre_ld > 2000 ~ 2,
      is.na(ferritin_pre_ld) ~ NA_real_
    )
  )
# Calculate CAR-HEMATOTOX score (categorical)
df_car_hematotox <- df_car_hematotox %>%
mutate(
car_hematotox_score = rowSums(across(
plt_hematotox_score:ferritin_hematotox_score
)),
car_hematotox_categorical = case_when(car_hematotox_score < 2 ~ "Low",
car_hematotox_score >= 2 ~ "High"),
car_hematotox_categorical = as.factor(car_hematotox_categorical),
car_hematotox_categorical = fct_relevel(car_hematotox_categorical, c("Low", "High"))
)
var_label(df_car_hematotox$car_hematotox_score) <- "CAR-HEMATOTOX score (continuous)"
var_label(df_car_hematotox$car_hematotox_categorical) <- "CAR-HEMATOTOX score (categorical)"
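# Sanity check of the scoring logic on one hypothetical patient (illustrative
# values, not from the cohort): plt 60 (<75, 2 pts) + ANC 0.8 (<1.2, 1 pt) +
# Hb 8.5 (<9, 1 pt) + CRP 4.2 (>3, 1 pt) + ferritin 2500 (>2000, 2 pts)
# = 7 points -> car_hematotox_categorical == "High"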
# Sensitivity and specificity based on confusion matrix
df_matrix <- df_car_hematotox %>%
select(upn, icaht_grade_3_4, car_hematotox_categorical) %>%
mutate(car_hematotox_high = ifelse(car_hematotox_categorical == "High", 1, 0)) %>%
drop_na()
observed_value <- factor(df_matrix$icaht_grade_3_4, levels = c("1", "0"))
predicted_value <- factor(df_matrix$car_hematotox_high, levels = c("1", "0"))
matrix <-
confusionMatrix(data = predicted_value,
reference = observed_value,
positive = "1")
matrix
Confusion Matrix and Statistics
Reference
Prediction 1 0
1 28 87
0 1 69
Accuracy : 0.5243
95% CI : (0.4498, 0.5981)
No Information Rate : 0.8432
P-Value [Acc > NIR] : 1
Kappa : 0.1848
Mcnemar's Test P-Value : <2e-16
Sensitivity : 0.9655
Specificity : 0.4423
Pos Pred Value : 0.2435
Neg Pred Value : 0.9857
Prevalence : 0.1568
Detection Rate : 0.1514
Detection Prevalence : 0.6216
Balanced Accuracy : 0.7039
'Positive' Class : 1
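# Hand check against the 2x2 table above: sensitivity = 28/(28 + 1) = 0.9655;
# specificity = 69/(69 + 87) = 0.4423; PPV = 28/(28 + 87) = 0.2435;
# NPV = 69/(69 + 1) = 0.9857.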
# ROC
Epi::ROC(df_matrix$car_hematotox_high, df_matrix$icaht_grade_3_4)
devtools::session_info()
─ Session info ─────────────────────────────────────────────────────
setting value
version R version 4.4.1 (2024-06-14)
os macOS 15.2
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/Los_Angeles
date 2025-01-10
pandoc 3.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/aarch64/ (via rmarkdown)
─ Packages ─────────────────────────────────────────────────────────
package * version date (UTC) lib source
assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.4.0)
backports 1.5.0 2024-05-23 [1] CRAN (R 4.4.0)
base64enc 0.1-3 2015-07-28 [1] CRAN (R 4.4.0)
broom * 1.0.6 2024-05-17 [1] CRAN (R 4.4.0)
broom.helpers 1.15.0 2024-04-05 [1] CRAN (R 4.4.0)
bslib 0.8.0 2024-07-29 [1] CRAN (R 4.4.0)
cachem 1.1.0 2024-05-16 [1] CRAN (R 4.4.0)
caret * 6.0-94 2023-03-21 [1] CRAN (R 4.4.0)
checkmate 2.3.2 2024-07-29 [1] CRAN (R 4.4.0)
class 7.3-22 2023-05-03 [1] CRAN (R 4.4.1)
cli 3.6.3 2024-06-21 [1] CRAN (R 4.4.0)
cluster 2.1.6 2023-12-01 [1] CRAN (R 4.4.1)
cmprsk 2.2-12 2024-05-19 [1] CRAN (R 4.4.0)
codetools 0.2-20 2024-03-31 [1] CRAN (R 4.4.1)
colorspace 2.1-1 2024-07-26 [1] CRAN (R 4.4.0)
commonmark 1.9.1 2024-01-30 [1] CRAN (R 4.4.0)
cutpointr * 1.1.2 2022-04-13 [1] CRAN (R 4.4.0)
data.table 1.15.4 2024-03-30 [1] CRAN (R 4.4.0)
dcurves * 0.4.0 2022-12-23 [1] CRAN (R 4.4.0)
devtools 2.4.5 2022-10-11 [1] CRAN (R 4.4.0)
dials * 1.2.1 2024-02-22 [1] CRAN (R 4.4.0)
DiceDesign 1.10 2023-12-07 [1] CRAN (R 4.4.0)
digest 0.6.36 2024-06-23 [1] CRAN (R 4.4.0)
distill * 1.6 2023-10-06 [1] CRAN (R 4.4.0)
doParallel 1.0.17 2022-02-07 [1] CRAN (R 4.4.0)
downlit 0.4.4 2024-06-10 [1] CRAN (R 4.4.0)
dplyr * 1.1.4 2023-11-17 [1] CRAN (R 4.4.0)
e1071 1.7-14 2023-12-06 [1] CRAN (R 4.4.0)
ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.4.0)
Epi * 2.53 2024-07-18 [1] CRAN (R 4.4.0)
etm 1.1.1 2020-09-08 [1] CRAN (R 4.4.0)
evaluate 0.24.0 2024-06-10 [1] CRAN (R 4.4.0)
fansi 1.0.6 2023-12-08 [1] CRAN (R 4.4.0)
farver 2.1.2 2024-05-13 [1] CRAN (R 4.4.0)
fastmap 1.2.0 2024-05-15 [1] CRAN (R 4.4.0)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.4.0)
foreach 1.5.2 2022-02-02 [1] CRAN (R 4.4.0)
foreign 0.8-86 2023-11-28 [1] CRAN (R 4.4.1)
Formula 1.2-5 2023-02-24 [1] CRAN (R 4.4.0)
fs 1.6.4 2024-04-25 [1] CRAN (R 4.4.0)
furrr 0.3.1 2022-08-15 [1] CRAN (R 4.4.0)
future 1.33.2 2024-03-26 [1] CRAN (R 4.4.0)
future.apply 1.11.2 2024-03-28 [1] CRAN (R 4.4.0)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.4.0)
ggeasy 0.1.4 2023-03-12 [1] CRAN (R 4.4.0)
ggnewscale * 0.4.10 2024-02-08 [1] CRAN (R 4.4.0)
ggplot2 * 3.5.1 2024-04-23 [1] CRAN (R 4.4.0)
ggrepel * 0.9.5 2024-01-10 [1] CRAN (R 4.4.0)
ggsurvfit * 1.1.0 2024-05-08 [1] CRAN (R 4.4.0)
glmnet * 4.1-8 2023-08-22 [1] CRAN (R 4.4.0)
globals 0.16.3 2024-03-08 [1] CRAN (R 4.4.0)
glue 1.7.0 2024-01-09 [1] CRAN (R 4.4.0)
gower 1.0.1 2022-12-22 [1] CRAN (R 4.4.0)
GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.4.0)
gridExtra 2.3 2017-09-09 [1] CRAN (R 4.4.0)
gt 0.11.0 2024-07-09 [1] CRAN (R 4.4.0)
gtable 0.3.5 2024-04-22 [1] CRAN (R 4.4.0)
gtsummary * 1.7.2 2023-07-15 [1] CRAN (R 4.4.0)
hardhat 1.4.0 2024-06-02 [1] CRAN (R 4.4.0)
haven 2.5.4 2023-11-30 [1] CRAN (R 4.4.0)
heatwaveR 0.4.6 2021-10-27 [1] CRAN (R 4.4.0)
here * 1.0.1 2020-12-13 [1] CRAN (R 4.4.0)
highr 0.11 2024-05-26 [1] CRAN (R 4.4.0)
Hmisc * 5.1-3 2024-05-28 [1] CRAN (R 4.4.0)
hms 1.1.3 2023-03-21 [1] CRAN (R 4.4.0)
htmlTable 2.4.3 2024-07-21 [1] CRAN (R 4.4.0)
htmltools 0.5.8.1 2024-04-04 [1] CRAN (R 4.4.0)
htmlwidgets 1.6.4 2023-12-06 [1] CRAN (R 4.4.0)
httpuv 1.6.15 2024-03-26 [1] CRAN (R 4.4.0)
infer * 1.0.7 2024-03-25 [1] CRAN (R 4.4.0)
ipred 0.9-15 2024-07-18 [1] CRAN (R 4.4.0)
iterators 1.0.14 2022-02-05 [1] CRAN (R 4.4.0)
janitor * 2.2.0 2023-02-02 [1] CRAN (R 4.4.0)
jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.4.0)
jsonlite 1.8.8 2023-12-04 [1] CRAN (R 4.4.0)
knitr * 1.48 2024-07-07 [1] CRAN (R 4.4.0)
labeling 0.4.3 2023-08-29 [1] CRAN (R 4.4.0)
labelled * 2.13.0 2024-04-23 [1] CRAN (R 4.4.0)
later 1.3.2 2023-12-06 [1] CRAN (R 4.4.0)
latrend * 1.6.1 2024-05-15 [1] CRAN (R 4.4.0)
lattice * 0.22-6 2024-03-20 [1] CRAN (R 4.4.1)
lava 1.8.0 2024-03-05 [1] CRAN (R 4.4.0)
lhs 1.2.0 2024-06-30 [1] CRAN (R 4.4.0)
lifecycle 1.0.4 2023-11-07 [1] CRAN (R 4.4.0)
listenv 0.9.1 2024-01-29 [1] CRAN (R 4.4.0)
lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.4.0)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.4.0)
markdown 1.13 2024-06-04 [1] CRAN (R 4.4.0)
MASS 7.3-60.2 2024-04-26 [1] CRAN (R 4.4.1)
Matrix * 1.7-0 2024-04-26 [1] CRAN (R 4.4.1)
MatrixModels 0.5-3 2023-11-06 [1] CRAN (R 4.4.0)
matrixStats 1.3.0 2024-04-11 [1] CRAN (R 4.4.0)
memoise 2.0.1 2021-11-26 [1] CRAN (R 4.4.0)
mgcv 1.9-1 2023-12-21 [1] CRAN (R 4.4.1)
mime 0.12 2021-09-28 [1] CRAN (R 4.4.0)
miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.4.0)
modeldata * 1.4.0 2024-06-19 [1] CRAN (R 4.4.0)
ModelMetrics 1.2.2.2 2020-03-17 [1] CRAN (R 4.4.0)
multcomp 1.4-26 2024-07-18 [1] CRAN (R 4.4.0)
munsell 0.5.1 2024-04-01 [1] CRAN (R 4.4.0)
mvtnorm 1.2-5 2024-05-21 [1] CRAN (R 4.4.0)
nlme 3.1-164 2023-11-27 [1] CRAN (R 4.4.1)
nnet 7.3-19 2023-05-03 [1] CRAN (R 4.4.1)
numDeriv 2016.8-1.1 2019-06-06 [1] CRAN (R 4.4.0)
openxlsx * 4.2.5.2 2023-02-06 [1] CRAN (R 4.4.0)
parallelly 1.37.1 2024-02-29 [1] CRAN (R 4.4.0)
parsnip * 1.2.1 2024-03-22 [1] CRAN (R 4.4.0)
patchwork * 1.2.0 2024-01-08 [1] CRAN (R 4.4.0)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.4.0)
pkgbuild 1.4.4 2024-03-17 [1] CRAN (R 4.4.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.4.0)
pkgload 1.4.0 2024-06-28 [1] CRAN (R 4.4.0)
plyr 1.8.9 2023-10-02 [1] CRAN (R 4.4.0)
pmsampsize * 1.1.3 2023-12-06 [1] CRAN (R 4.4.0)
polspline 1.1.25 2024-05-10 [1] CRAN (R 4.4.0)
probably * 1.0.3 2024-02-23 [1] CRAN (R 4.4.0)
pROC 1.18.5 2023-11-01 [1] CRAN (R 4.4.0)
prodlim * 2024.06.25 2024-06-24 [1] CRAN (R 4.4.0)
profvis 0.3.8 2023-05-02 [1] CRAN (R 4.4.0)
promises 1.3.0 2024-04-05 [1] CRAN (R 4.4.0)
proxy 0.4-27 2022-06-09 [1] CRAN (R 4.4.0)
purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.4.0)
quantreg 5.98 2024-05-26 [1] CRAN (R 4.4.0)
R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.4.0)
R.oo 1.26.0 2024-01-24 [1] CRAN (R 4.4.0)
R.utils 2.12.3 2023-11-18 [1] CRAN (R 4.4.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.4.0)
rbibutils 2.2.16 2023-10-25 [1] CRAN (R 4.4.0)
Rcpp 1.0.13 2024-07-17 [1] CRAN (R 4.4.0)
Rdpack 2.6 2023-11-08 [1] CRAN (R 4.4.0)
readr * 2.1.5 2024-01-10 [1] CRAN (R 4.4.0)
recipes * 1.1.0 2024-07-04 [1] CRAN (R 4.4.0)
remotes 2.5.0 2024-03-17 [1] CRAN (R 4.4.0)
reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.4.0)
rlang 1.1.4 2024-06-04 [1] CRAN (R 4.4.0)
rmarkdown * 2.27 2024-05-17 [1] CRAN (R 4.4.0)
rms * 6.8-1 2024-05-27 [1] CRAN (R 4.4.0)
rpart 4.1.23 2023-12-05 [1] CRAN (R 4.4.1)
rprojroot 2.0.4 2023-11-05 [1] CRAN (R 4.4.0)
rsample * 1.2.1 2024-03-25 [1] CRAN (R 4.4.0)
rstudioapi 0.16.0 2024-03-24 [1] CRAN (R 4.4.0)
runway * 0.1.0 2024-08-08 [1] Github (ML4LHS/runway@e93d55e)
sandwich 3.1-0 2023-12-11 [1] CRAN (R 4.4.0)
sass 0.4.9 2024-03-15 [1] CRAN (R 4.4.0)
scales * 1.3.0 2023-11-28 [1] CRAN (R 4.4.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.4.0)
shape 1.4.6.1 2024-02-23 [1] CRAN (R 4.4.0)
shiny 1.8.1.1 2024-04-02 [1] CRAN (R 4.4.0)
snakecase 0.11.1 2023-08-27 [1] CRAN (R 4.4.0)
SparseM 1.84-2 2024-07-17 [1] CRAN (R 4.4.0)
stringi 1.8.4 2024-05-06 [1] CRAN (R 4.4.0)
stringr * 1.5.1 2023-11-14 [1] CRAN (R 4.4.0)
survival * 3.6-4 2024-04-24 [1] CRAN (R 4.4.1)
TH.data 1.1-2 2023-04-17 [1] CRAN (R 4.4.0)
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.4.0)
tidymodels * 1.2.0 2024-03-25 [1] CRAN (R 4.4.0)
tidyr * 1.3.1 2024-01-24 [1] CRAN (R 4.4.0)
tidyselect 1.2.1 2024-03-11 [1] CRAN (R 4.4.0)
tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.4.0)
timechange 0.3.0 2024-01-18 [1] CRAN (R 4.4.0)
timeDate 4032.109 2023-12-14 [1] CRAN (R 4.4.0)
tune * 1.2.1 2024-04-18 [1] CRAN (R 4.4.0)
tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.4.0)
urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.4.0)
usethis 2.2.3 2024-02-19 [1] CRAN (R 4.4.0)
utf8 1.2.4 2023-10-22 [1] CRAN (R 4.4.0)
vctrs 0.6.5 2023-12-01 [1] CRAN (R 4.4.0)
vip * 0.4.1 2023-08-21 [1] CRAN (R 4.4.0)
withr 3.0.1 2024-07-31 [1] CRAN (R 4.4.0)
workflows * 1.1.4 2024-02-19 [1] CRAN (R 4.4.0)
workflowsets * 1.1.0 2024-03-21 [1] CRAN (R 4.4.0)
xfun 0.46 2024-07-18 [1] CRAN (R 4.4.0)
xml2 1.3.6 2023-12-04 [1] CRAN (R 4.4.0)
xtable 1.8-4 2019-04-21 [1] CRAN (R 4.4.0)
yaml 2.3.10 2024-07-26 [1] CRAN (R 4.4.0)
yardstick * 1.3.1 2024-03-21 [1] CRAN (R 4.4.0)
zip 2.3.1 2024-01-27 [1] CRAN (R 4.4.0)
zoo * 1.8-12 2023-04-13 [1] CRAN (R 4.4.0)
[1] /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
────────────────────────────────────────────────────────────────────