Appendix D. Sample code used to implement the Cox proportional hazard model in the heart failure mortality example

From NorthShore Analytics
Jump to: navigation, search

In the listing below, the formatting is different from that of the actual code due to a different line length in the typeset font. User-defined packages NS.CA.statUtils, NS.CA.dataUtils, NS.CA.dateUtils, NS.CA.modelUtils, NS.CA.plotUtils and NS.CA.mathUtils are defined in Appendix E. R package manuals.

### Daniel Chertok
### (C) 2014 NorthShore University HealthSystem
### All rights reserved.

rm(list=ls())

library(debug)
library(reshape2)
library(date)
library(stringr)
library(plyr)
library(scales)
library(lubridate)

library(NS.CA.statUtils)
library(NS.CA.dataUtils)
library(NS.CA.dateUtils)
library(NS.CA.modelUtils)
library(NS.CA.plotUtils)
library(NS.CA.mathUtils)

# main module
# set up constants

testFrac <- 0.2
threshold <- 0.05
oma <- c( 0, 0, 3, 0 )
period <- years( 1 )
minCor <- 0.6
sensThreshold <- 0.01
signLev <- 0.95

years <- 2010:2012
today <- as.Date("2013-01-01")
lastDate <- as.Date("2012-01-01")
dataDir <- "../Data/"
resultDir <- "../Results/"
graphDir <- paste( resultDir, "Graphs", sep="" )
interimDir <- paste( resultDir, "Interim/", sep="" )
timeStamp <- timestamp.from.date()

# read the input file

admRaw <- read.csv( paste(dataDir, "HF_EOL.csv", sep="") )
outcomes <- c( "DEATH_365_DAYS" )
admRaw[, outcomes] <- as.factor( admRaw[, outcomes] )

# all dates will be used later

keyDates <- sort( unique( as.Date( admRaw$AS_OF_DATE ) ) )

# for analysis, use data up to today only

admRaw <- transform( admRaw, AS_OF_DATE=as.Date( AS_OF_DATE ) )
adm <- subset( admRaw, AS_OF_DATE <= lastDate )

# columns irrelevant to the analysis

excludeCol <- c( "PAT_ID" )

# columns where NAs can be turne into 0s

NAzero <- c("IND", "PRESCRIBED", "VISIT", "DAYS", "NUM" )
NAzeroCol <- unique( Reduce( c, sapply( NAzero, grep, colnames( adm ) ) ) )
NAzCol <- NAzeroCol[! colnames( adm[, NAzeroCol] ) %in% outcomes]
adm[, NAzCol] <- sapply( adm[, NAzCol], function( x ) 
  ifelse( is.na( x ), 0, x ) )

# independent of train / test breakdown

ejfrMed <- median( na.omit( admRaw$EJFR_NUM ) )
adm <- mutate( adm, EJFR_IND=ifelse( EJFR_NUM > 0, 1, 0 ),
               EJFR_NUM=ifelse( EJFR_NUM > 0, EJFR_NUM, ejfrMed ),
               EJFR_DIF=abs( EJFR_NUM - ejfrMed ),
               FEMALE_IND=ifelse( GENDER_CODE=="F", 1, 0 ) )
adm[is.na( adm$MARITAL_STATUS ),"MARITAL_STATUS"] <- "Unknown"
adm[is.na( adm$RACE_ETHNICITY ),"RACE_ETHNICITY"] <-
  NS.CA.mode( adm$RACE_ETHNICITY )

# indicator columns

indCol <- grep( "TOTAL", grep( "_IND|_PRESCRIBED|_GT", names( adm ), value=T ), 
                invert=T, value=T )
statusCol <- which( colnames( adm ) %in% outcomes )

# select random rows for survival

idCol <- "PAT_MRN_ID"
patRows <- c( rand.surv( adm[adm$AS_OF_DATE < lastDate, ], idCol, seed=1 ),
              rownames( adm[adm$AS_OF_DATE == lastDate, ] ) )
adm <- adm[patRows, ! colnames( adm ) %in% excludeCol]

# set.seed( NULL )   # random sampling
# set.seed( 1 )   # reproducible results
# testRows <- createDataPartition( adm[, statusCol], p=testFrac, list=F )
# testRows <- which( adm$AS_OF_DATE < as.Date( '2011-01-01' ))  # test 2010-2011
testRows <- which( adm$AS_OF_DATE == lastDate )  # test on MG 2012-01-01
# testRows <- which(adm$AS_OF_DATE == '2014-01-01')  # test on MG 2014-01-01

# set up filter arguments

rcnt <- c( "__MR", "__06_MO", "__12_MO", "__24_MO", "__36_MO", "__48_MO" )
rcntCol <- unique( Reduce( c, sapply( rcnt, grep, colnames( adm ) ) ) )

# use train set only

trainRows <- rownames( adm[-testRows, ] )

# remove or backfill sparse columns

dataSet <- rem.sparse.col( adm, trainRows, rcntCol, 0.2 )

# convert outcomes to numeric

dataSet[, outcomes] <- 
  as.numeric( levels( dataSet[, outcomes]  ) )[dataSet[, outcomes] ]

# eliminate indicators with only one value

dataTrain <- rem.single.val.col( dataSet[trainRows, ] )
dataTrainCol <- colnames( dataTrain )
dataSet <- dataSet[, dataTrainCol]
indCol <- indCol[indCol %in% dataTrainCol]


# univariate ethnicity

ethnicitySummary <- 
  multi.category.stats( dataTrain, "RACE_ETHNICITY", outcomes, sensThreshold, 
                        "Caucasian", "Other", c("0", "1") )
write.csv( ethnicitySummary, paste( interimDir, "Ethnicity_", timeStamp,
                                    ".csv", sep="" ), row.names=F )

# this comes in after the initial analysis

levels(dataSet$RACE_ETHNICITY) <- c(levels(dataSet$RACE_ETHNICITY),
                                    "AfroEuroAsian")
dataSet[! dataSet$RACE_ETHNICITY %in%
          c("Asian","African American", "Caucasian"), "RACE_ETHNICITY"] <-
  "Other"
dataSet[dataSet$RACE_ETHNICITY %in%
          c("Asian","African American", "Caucasian"), "RACE_ETHNICITY"] <-
  "AfroEuroAsian"

# marital status

maritalStatus <-
  multi.category.stats( dataTrain, "MARITAL_STATUS", outcomes, sensThreshold, 
                        "Married",  "Unknown", c( "1", "0" ) )
write.csv( maritalStatus, paste( interimDir, "MaritalStatus_", timeStamp,
                                  ".csv", sep="" ), row.names=F )
levels(dataSet$MARITAL_STATUS) <- c( levels( dataSet$MARITAL_STATUS ),
                                     "NotWidowed", "Other" )
dataSet[! dataSet$MARITAL_STATUS %in% c( "Married","Widowed" ),
        "MARITAL_STATUS"] <- "Other"
dataSet[dataSet$MARITAL_STATUS %in% c( "Married","Other" ), 
        "MARITAL_STATUS"] <- "NotWidowed"

eTs <- transform( subset( ethnicitySummary,
                          ! RACE_ETHNICITY %in% c( "TOTAL", "Caucasian" ) ),
                  Variable=paste( RACE_ETHNICITY, ": Caucasian" ) )
mSt <- transform( subset( maritalStatus,
                          ! MARITAL_STATUS %in% c( "TOTAL", "Married" ) ),
                  Variable=paste( MARITAL_STATUS, ": Married" ) )

uniOddsRatio <- odds.ratio.save( dataTrain, outcomes, indCol, idCol, 
                                 addList=list( eTs, mSt ), oRdir=interimDir )

outCol <- which( colnames( dataTrain ) %in% outcomes )
hiCorTab <- output.corr( dataTrain, indCol, dir=interimDir, 
                         timeStamp=timeStamp )

indSignif <- na.omit( uniOddsRatio[uniOddsRatio$Validity == '*', "Variable"] )
indSignif <- indSignif[!indSignif %in% as.character( hiCorTab$var2 )]

# manual adjustments to predictive variables 

exclCol <- c( "ADN_EVER"   # correlated variables
              , "BMI__06_MO"
              , "BMI__12_MO"
              , "WEIGHT__MR"
              , "WEIGHT__06_MO"
              , "WEIGHT__12_MO"
              , "WEIGHT__24_MO"
              , "NUM_CARD_SEEN_MOST_LST_24_MO"
              , "VISIT_PRACTICE_NAME"
              , "LDL__MR"
              , "CREAT__06_MO"
              , "DBP__06_MO"
              , "SBP__06_MO"
              , "NUM_KCC_VISIT"
              , "MR_VISIT_PRIM_SPEC"
              , "NOT_PRN_MED_TOTAL_PRESCRIBED"
              , "PACEMAKER_IND"
              , "VISIT_PHY_MG_IND"
              , "NONINSULIN_DIAB_PRESCRIBED"
              
              , "CANCER_DX"
              , "CHOL__MR"
              , "COMBOANTHYP_PRESCRIBED"
              , "DBP__12_MO"
              , "DBP__24_MO"
              , "SBP__12_MO"
              , "DISCHARGED_DISP_30_DAYS"
              , "DISCHARGED_DISP_365_DAYS"
              , "EJFR_IND"
              , "EJFR_DIF"
              , "EJFR_NUM"
              , "INSULIN_PRESCRIBED"
              , "MG_ONC_VISIT"
              , "NUM_HOSP_30_DAYS_CHF"
              , "NUM_HOSP_365_DAYS_CHF"
              , "NUM_MISSED_APPTS_365"
              , "ORALVASODILSPERF_PRESCRIBED"
              , "PLAT__MR"
              , "MR_PHY_ID"
              , "ACE_INHIBITOR_PRESCRIBED"
              , "SPIRON_PRESCRIBED"
              , "NUM_HOSP_365_DAYS_ICU"
              , "LOOPDIURETIC_PRESCRIBED"
              , "ASPIRIN_PRESCRIBED"
              , "CARDO_12MM_VISIT"
              
              , "INP_OBS_HF_365_DAYS"   # two look-ahead variables, can't use
              , "INP_OBS_HF_ADM_365_DAYS"
              
              , "ANTICOAG_PRESCRIBED"   # latest iteration testing on 2012
              , "FEMALE_IND"
              , "MI_IND"
              , "PL_DEPR_IND"
              , "METOLAZONE_PRESCRIBED"
              , "THIAZIDEDIUR_PRESCRIBED"
              , "TRIGL__MR"
              , "NUM_HOSP_30_DAYS"
              #, "NUM_HOSP_365_DAYS"
              , "COPD_IND"
              , "NUM_MISSED_APPTS_3_YRS"
              , "HDL__MR"
)
inclCol <- c( outcomes[[1]]
              , indSignif[indSignif %in% colnames( dataSet )]
              , "BETA_BLOCKER_PRESCRIBED"
              , "CSO_EVER"   # latest iteration testing on 2012
              #, "CARDO_24MM_VISIT"
              , "SBP__MR"
)

baseText <- paste( "HF EOL model validation for dates between", years[1], "and",
                   years[length(years)] )
testText <- paste( baseText, ", \n test =", testFrac, "of total",
                   "sensitivity threshold (% of population", "flagged) =",
                   threshold )

prSort <- run.glm.model( dataSet, trainRows, testRows, inclCol, exclCol, 
                         outcomes, uniOddsRatio, length( patRows ), 
                         testText=testText, oRdir=interimDir, 
                         coefDir=resultDir, varJoiner='+', timeStamp=timeStamp )

# plot the PPV / sensitivity graph

scenario <- "Full"
allRisk <- ppv.sens.risk( dataSet, prSort, scenario, baseText, 
                          graphDir=graphDir, resultDir=resultDir, 
                          timeStamp=timeStamp )

# lift table, top 5% and random 100 from top 5%

lift.table.save( prSort, nrow( dataSet ), admRaw$AS_OF_DATE, 
                 max( admRaw$AS_OF_DATE ), allRisk,
                 c( 0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1 ),
                 resultDir=resultDir, timeStamp=timeStamp )

# Cox proportional hazard model

prSortCox <- run.cox.model( dataTrain, dataSet[testRows, ], today, inclCol, 
                            exclCol, outcomes, coefDir=resultDir, 
                            timeStamp=timeStamp )

baseTextCox <- paste( "HF EOL Cox survival model validation for dates between", 
                      years[1], "and", years[length(years)] )
allRiskCox <- ppv.sens.risk( dataSet, prSortCox, scenario, baseTextCox, 
                             graphDir=graphDir, resultDir=resultDir, 
                             ppvFile="PPV_Sens_Cox_", riskFile="AllRisk_Cox_", 
                             timeStamp=timeStamp  )

# lift table, top 5% and random 100 from top 5%

lift.table.save( prSortCox, nrow( dataSet ), admRaw$AS_OF_DATE, 
                 max( admRaw$AS_OF_DATE ), allRiskCox,
                 c( 0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 1 ),
                 resultDir=resultDir, liftFile="Lift_Cox_", 
                 riskFile="HiRisk_Cox_", topRiskFile="HiRiskRand_Cox_", 
                 timeStamp=timeStamp )

                          

Listing D.1: Example: R code for Cox proportional hazard modeling.

The most recent version of the code in Listing D.1 can be found in HF_EOL.R.