Appendix C. Sample code used to process the running COPD example

From NorthShore Analytics
Jump to: navigation, search

In the listing below, the formatting is different from that of the actual code due to a different line length in the typeset font. User-defined packages NS.CA.statUtils, NS.CA.dataUtils, NS.CA.dateUtils, NS.CA.modelUtils, NS.CA.plotUtils and NS.CA.mathUtils are defined in Appendix E. R package manuals.

                                                  
### Daniel Chertok
### (C) 2014 NorthShore University HealthSystem
### All rights reserved

rm(list=ls())

library(debug)
library(reshape2)
library(date)
library(stringr)
library(plyr)
library(glmnet)
library(MASS)
library(scales)
library(lubridate)

library(NS.CA.statUtils)
library(NS.CA.dataUtils)
library(NS.CA.dateUtils)
library(NS.CA.modelUtils)
library(NS.CA.plotUtils)
library(NS.CA.mathUtils)

SPARSE_MIN <- 0.2

yearCsvName <- function( dataDir, name, year ) {
  paste(dataDir, name, year, ".csv", sep="")
}

# main module

years <- 2010:2014
today <- as.Date("2014-01-01")
dataDir <- "../Data/"
resultDir <- "../Results/"
graphDir <- paste( resultDir, "Graphs", sep="" )
interimDir <- paste( resultDir, "Interim/", sep="" )
timeStamp <- timestamp.from.date()

# read the input file

COPDadmRaw <- read.csv( paste(dataDir, "COPD_ALL_ALIVE.csv", sep="") )

outcomes <- c(
  "INP_OBS_COPD_ADM_365_DAYS",
  "DEATH_365D_IND"
)

# all dates will be used later

keyDates <- sort(unique(as.Date(COPDadmRaw[, "AS_OF_DATE"])))

# for analysis, use data up to today only

COPDadmRaw <- transform( COPDadmRaw, AS_OF_DATE=as.Date(AS_OF_DATE) )
COPDadm <- subset( COPDadmRaw, AS_OF_DATE < today )

# columns irrelevant to the analysis

excludeCol <- c("PAT_ID", "TEN_YR_RISK_PCT", outcomes[[2]])

# columns where NAs can be turne into 0s

NAzero <- c("IND", "PRESCRIBED", "VISIT", "DAYS", "NUM")
NAzeroCol <- unique( Reduce( c, sapply( NAzero, grep, colnames(COPDadm) ) ) )

COPDadm[, NAzeroCol] <- sapply(COPDadm[, NAzeroCol], function(x) ifelse(is.na(x), 0, x) )

# independent of train / test breakdown

COPDadm <- 
   transform( COPDadm, 
              EJFR_IND=ifelse( EJFR_NUM > 0, 1, 0 ),
              FEMALE_IND=
                 ifelse( GENDER_CODE=="F", 1, 0 ),
              HOSP_365_DAYS_COPD_IND=
                 ifelse( NUM_HOSP_365_DAYS_COPD > 0, 1, 0 ),
              ER_VISITS_365_DAYS_COPD_IND=
                 ifelse( NUM_ER_VISITS_365_DAYS_COPD > 0, 1, 0 ) )
COPDadm[is.na(COPDadm$MARITAL_STATUS),"MARITAL_STATUS"] <- "Unknown"
COPDadm[is.na(COPDadm$RACE_ETHNICITY),"RACE_ETHNICITY"] <- NS.CA.mode(COPDadm$RACE_ETHNICITY)
ftpPts <- na.to.colMeans(data.frame(ftp=COPDadm$FRAM_TOTAL_POINTS))
COPDadm$FRAM_TOTAL_POINTS <- ftpPts$FTP


# indicator columns

indCol <- grep( "TOTAL", grep("_IND|_PRESCRIBED|_GT", names(COPDadm), value=T), invert=T, value=T )
statusCol <- which(colnames(COPDadm) %in% outcomes[[1]])

# set up models

# set.seed(1)
testFrac <- 0.2
tRows <- createDataPartition( COPDadm[, statusCol], p=testFrac, list=F )

# train on all training dataset patients, test on MG patients only

mgRows <- which( COPDadm[, "MG_PCP_24M_SEEN_IND"] == 1 )
tMgRows <- tRows[tRows %in% mgRows]
nMgRows <- length( tMgRows )

COPDadm <- COPDadm[, ! names(COPDadm) %in% excludeCol]
dsfTestRows <- tMgRows
threshold <- 0.05
oma <- c(0, 0, 3, 0)
period <- years(1)

# set up test dates

fwdTestDates <- forward.test.dates( keyDates, today, period )
backTestDates <- backward.test.dates( keyDates, today, period )
midTestDates <- mid.test.dates( keyDates, today, period )

baseText <- paste( "COPD model validation for dates between", years[1], "and", years[length(years)] )

testModel <- list(
  Full=
  list( Model=glm.test,
        ModelRowArg=dsfTestRows,
        TestRows=dsfTestRows,
        Text=
           paste( baseText, ", \n test =", 
                  testFrac, "of total",
                  "sensitivity threshold (% of population", "flagged) =",
                  threshold ) ),

# set up filter arguments

rcnt <- c("__MR", "WEIGHT")
rcntCol <- unique( Reduce( c, sapply( rcnt, grep, colnames(COPDadm) ) ) )

minCor <- 0.6
sensThreshold <- 0.01
signLev <- 0.95

# run the test program

scenario <- "Full"

testRows <- testModel[[scenario]][["TestRows"]]

# use train set only

trainRows <- rownames( COPDadm[-testRows, ] )

# remove or backfill sparse columns

pctNA <- percent.NA.col( COPDadm[trainRows, rcntCol] )

# usable sparse columns

rareCol <- names( pctNA[pctNA <= SPARSE_MIN] )   
if ( length(rareCol) > 0 ) {
  rareData <- na.to.colMeans(COPDadm[, rareCol])
  COPDadm[, rareCol] <- rareData
}

# data too sparse to use

sparseCol <- names( pctNA[pctNA > SPARSE_MIN] )  
dataSet <- COPDadm[, ! colnames(COPDadm) %in% sparseCol]

# eliminate indicators with only one value

singleValCol <- which( sapply( dataSet[trainRows, ], function(c) length(unique(c)) ) <= 1 )
dataSet <- dataSet[, -singleValCol]
indCol <- indCol[indCol %in% colnames(dataSet)]

# indicator odds ratios

oRi <- odds.ratio.matrix(dataSet[trainRows, ], dataSet[trainRows, outcomes[[1]]], indCol, level=signLevel)

rnORbase <- rownames(oRi)
posCountMap <- rep( 1, length(rnORbase) )
names(posCountMap) <- rnORbase
posStat <- pos.stat( dataSet[trainRows, ], names(posCountMap), outcomes[[1]], posCountMap )

oddsRatioInd <- merge( oRi, posStat, by="row.names", all.x=T, rownames=T)
rownames(oddsRatioInd) <- oddsRatioInd$Row.names
outputCol <- c("IndPosCount", "IndPosPct", "IndPosOutcomePct", "Odds.Ratio", "CI.Lower", "CI.Upper", "Pr...z..", "Validity")

total <- data.frame(lapply(outputCol, function(c) NA))
colnames(total) <- outputCol
total <- 
   mutate( total, 
           Row.names="TOTAL",
           IndPosCount=nrow(dataSet[trainRows, ]),
           IndPosOutcomePct=
              nrow( dataSet[trainRows, ]
                 [dataSet[trainRows, outcomes[[1]]]==1,] ) /
                 nrow(dataSet[trainRows, ]) )
oRind <- rbind(oddsRatioInd[, c("Row.names", outputCol)], total)
colnames(oRind)[colnames(oRind) == "Row.names"] <- "Variable"
write.csv( oRind, paste(interimDir, "OR_", timeStamp, ".csv", sep=""), row.names=F )

indSignif <- na.omit(oRind[oRind$Validity == '*', "Variable"])

# numeric odds ratio

nc <- colnames(dataSet[, !colnames(dataSet) %in% c(indCol, "PAT_MRN_ID", outcomes[[1]])])
numCol <- nc[sapply( dataSet[, nc], is.numeric )]
oddsRatioNum <- 
   odds.ratio.num( dataSet[trainRows, ],
                   dataSet[trainRows, outcomes[[1]]], 
                   numCol,
                   level=signLev )

# combined univariate ratio

uniOutCol <- c( "Row.names", "Odds.Ratio", "Pr...z..", "CI.Lower", "CI.Upper", "Validity" )
oRn <- transform( oddsRatioNum, Row.names=rownames(oddsRatioNum) )

# univariate ethnicity

ethnicitySummary <-
  multi.category.stats( dataSet[trainRows, ], 
                        "RACE_ETHNICITY", 
                        outcomes[[1]],
                        sensThreshold, 
                        "Caucasian", 
                        "Other", 
                        c("0", "1") )
write.csv( ethnicitySummary, paste( interimDir, "Ethnicity_", timeStamp, ".csv", sep="" ), row.names=F )


levels(dataSet$RACE_ETHNICITY) <- c(levels(dataSet$RACE_ETHNICITY), "AfroEuroAsian")
dataSet[! dataSet$RACE_ETHNICITY %in% c("Asian","African American", "Caucasian"), "RACE_ETHNICITY"] <- "Other"

dataSet[dataSet$RACE_ETHNICITY %in% c("Asian","African American", "Caucasian"), "RACE_ETHNICITY"] <- "AfroEuroAsian"


# marital status

maritalStatus <- multi.category.stats( dataSet[trainRows, ], "MARITAL_STATUS", outcomes[[1]], sensThreshold, "Married",  "Unknown", c("1", "0") )
write.csv( maritalStatus, paste(interimDir, "MaritalStatus_", timeStamp, ".csv", sep=""), row.names=F )

levels(dataSet$MARITAL_STATUS) <- c(levels(dataSet$MARITAL_STATUS), "NotWidowed", "Other")
dataSet[! dataSet$MARITAL_STATUS %in% c("Married","Widowed"), "MARITAL_STATUS"] <- "Other"
dataSet[dataSet$MARITAL_STATUS %in% c("Married","Other"), "MARITAL_STATUS"] <- "NotWidowed"

eTs <- transform( subset( ethnicitySummary, ! RACE_ETHNICITY %in% c( "TOTAL", "Caucasian" ) ),
Row.names=paste( RACE_ETHNICITY, ": Caucasian" ) )
mSt <- transform( subset( maritalStatus, ! MARITAL_STATUS %in% c( "TOTAL", "Married" ) ), Row.names=paste( MARITAL_STATUS, ": Married" ) )
uniOddRatio <- rbind( oddsRatioInd[, uniOutCol], oRn[, uniOutCol], eTs[, uniOutCol], mSt[, uniOutCol] )
colnames(uniOddRatio)[colnames(uniOddRatio) == "Row.names"] <- "Variable"
write.csv( uniOddRatio, paste(interimDir, "UniOR_", timeStamp, ".csv", sep=""), row.names=F )

cDs <- colnames(dataSet)
hiCorPriorNum <- high.corr( dataSet[trainRows, sort( cDs[which( !cDs %in% indCol )] )], minCor, "Numerical factor correlation matrix")
write.csv( hiCorPriorNum, paste( interimDir, "CorVar_Prior_", timeStamp, ".csv", sep="" ), row.names=F )
hiCorNumDf <- high.corr.df( hiCorPriorNum, minCor )
write.csv( hiCorNumDf, paste( interimDir, "CorVar_PriorTable_", timeStamp, ".csv", sep="" ), row.names=F )

hiCorPriorInd <- high.corr( dataSet[trainRows, sort(indCol)], minCor, "Indicator correlation matrix", scaleArg=list( cex=0.75 ) )
write.csv( hiCorPriorInd, paste( interimDir, "CorVarInd_", timeStamp, ".csv", sep="" ), row.names=F )
hiCorIndDf <- high.corr.df( hiCorPriorInd, minCor )
write.csv( hiCorIndDf,paste( interimDir, "CorVar_PriorIndTable_", timeStamp, ".csv", sep="" ), row.names=F )
indSignif <- indSignif[!indSignif %in% hiCorIndDf$var2]

exclCol <- c(
  "MG_PCP_18M_SEEN_IND"
  , "LOOPDIURETIC_PRESCRIBED"
  , "NOT_PRN_MED_TOTAL_PRESCRIBED"
  , "DAYS_SINCE_LAST_EF"
  , "DISCHARGED_DISP_31_365_DAYS"
  , "NUM_HOSP_31_365_DAYS_COPD"
  , "NUM_ER_VISITS_31_365_DAYS_COPD"
  , "NUM_HOSP_31_365_DAYS_PNEU"
  , "NUM_ER_VISITS_31_365_DAYS_PNEU"
  , "TOTAL_LOS_HOSP_DAYS_LST_12_MO"
  , "NUM_HOSP_365_DAYS_COPD"
  , "NUM_ER_VISITS_365_DAYS_COPD"
  , "EJFR_NUM"
)
inclCol <- c( c(
  "PAT_AGE_YRS",
  "CARDO_24MM_VISIT",
  "PULMO_24MM_VISIT",
  "HOSP_30D_VISIT",
  "NUM_HOSP_365_DAYS_COPD",
  "NUM_ER_VISITS_365_DAYS_COPD",
  "NUM_ER_VISITS_365_DAYS_PNEU",
  "HOSP_12M_VISIT",
  "NUM_UNIQUE_SPECIALTIES",
  outcomes[[1]],
  numCol, indCol
) )
factorCol <- unique( subset( inclCol, ! inclCol %in% exclCol ) )
hiCor <- high.corr( dataSet[trainRows, sort( factorCol )], minCor, "Factor correlation matrix", scaleArg=c( cex=0.6 ) )
if ( Reduce( '*', dim(hiCor) ) > 0 ) {
  hiCorIndDf <- high.corr.df( hiCor, minCor )
}

train <- dataSet[, factorCol]
statusCol <- which(colnames(train) %in% outcomes[[1]])

# test on the "odds ratio" training dataset

fm <- glm("INP_OBS_COPD_ADM_365_DAYS ~ .", train[trainRows,], family='binomial', maxit=100, trace=T)
trainGlm <- auc.perf( fm, train[trainRows, ], train[trainRows, statusCol], type='response', text='Training set AUC' )

# test on "odds ratio" test dataset  selection

fullModel <- glm.test( train, statusCol, testRows, threshold, trace=T, maxit=100, text=testModel[[scenario]][["Text"]], oma=c(0,0,3,0), varJoiner='+' )
coefOut <- odds.ratio.stat( fullModel$model, uniOddRatio, signLev )
write.csv( coefOut, paste( interimDir, "OR_All_", timeStamp, ".csv", sep="" ), row.names=F )
coefSummary <- coef( summary( fullModel$model ) )
write.csv( coefSummary[order( rownames( coefSummary ) ), ], paste( resultDir, "Coef_All_", timeStamp, ".csv", sep="" ) )

# generate performance statistics

prSort <- perf.clsf.stat( fullModel$perf$predicted, nrow( COPDadmRaw ), train[testRows, outcomes[[1]]] )

# plot the PPV / sensitivity graph

g <- ppv.sens.plot( prSort, main=baseText )
print( g )
ggsave( file=paste( "PPV_Sens_", scenario, "_", timeStamp, ".pdf", sep="" ), path=graphDir, plot=g, width=11, height=8.5, units="in" )

# two.ord.plot( prSort, main=testModel[[scenario]][["Text"]] )

allRisk <- cbind( PAT_MRN_ID=dataSet[rownames(prSort),"PAT_MRN_ID"], prSort )
write.csv( allRisk, paste( resultDir, "AllRisk_", timeStamp, ".csv", sep="" ), row.names=F )

# do hospitalizations matter?

testNull <- dataSet[testRows, ]
hospLastYr <- which( testNull$NUM_HOSP_365_DAYS_COPD > 0 )
hospNextYr <-  which( testNull[, outcomes[[1]]] == 1 )
hospLastNextYr <- which( testNull[hospLastYr, outcomes[[1]]] == 1 )
numHospLastYr <- length( hospLastYr )
numHospNextYr <- length( hospNextYr )
numHospLastNextYr <- length( hospLastNextYr )
totalSize <- nrow( testNull )
nullHyp <- mutate ( 
     data.frame( Total.Population.Size=nrow( testNull ),
     Num.Total.Hosp.Last.Yr=numHospLastYr,
     Num.Hosp.Next.Yr=numHospNextYr,
     Num.Hosp.Last.Next.Yr=numHospLastNextYr,
     Pct.Hosp.COPD.Last.Yr=numHospLastYr / totalSize,
     Pct.Hosp.COPD.Last.Next.Yr=numHospLastNextYr /
        numHospLastYr,
     Pct.Flagged.Hosp.Total=numHospLastNextYr /
        length( hospNextYr ) ),
     F1=2.0 / ( 1.0 / Pct.Hosp.COPD.Last.Next.Yr +
        1.0 / Pct.Flagged.Hosp.Total ) )

write.csv( nullHyp, paste( interimDir, "NullHyp_", timeStamp, ".csv", sep="" ), row.names=F )

# lift table

pctList <- sort( c( 0.01, 0.05, 0.1, 0.2, nullHyp$Pct.Hosp.COPD.Last.Yr, 0.3, 0.5, 1  ) )
liftTable <- lift.table( prSort, pctList, nrow( dataSet ) )
mostRecent <- which( COPDadmRaw$AS_OF_DATE == today )
liftTable <- transform( liftTable, Num.Flagged.Most.Recent=NS.CA.round( Pct.Total * length( mostRecent ) ) )
write.csv( liftTable, paste( interimDir, "Lift_", timeStamp, ".csv", sep="" ), row.names=F )

# top 5 %

liftThreshold <- 0.05
hiRisk <- head( allRisk, liftThreshold * nrow( prSort ) )
write.csv( hiRisk, paste( resultDir, "HiRisk_", timeStamp, ".csv", sep="" ), row.names=F )

# random 100 from top 5%

write.csv( hiRisk[sample( 1:nrow(hiRisk), 100 ), ], paste( resultDir, "HiRiskRand100_", timeStamp, ".csv", sep="" ), row.names=F)

Listing C.1: Example: R code for linear regression.

The most recent version of the code in Listing C.1 can be found in COPDadm.R.