Section 1

Question 1

abmData <- read_dta("abm.dta")
abmData <- select(abmData, abm, age, sex, priorrx, gl, polys, wbc, bands, lymphs, month)
abmData <- drop_na(abmData)

Question 2

abmData <- mutate(abmData, season = case_when(
            month > 11 ~ "Winter",
            month > 2 ~ "Non-winter",
            TRUE ~ "Winter")
            )

abmData <- mutate(abmData, sex = as.factor(sex))

Question 3

Part A

plot(jitter(abmData$age), jitter(abmData$abm, factor=0.2),
     xlab="Age", ylab="ABM (0/1)", main="Age vs ABM")

plot(jitter(abmData$wbc), jitter(abmData$abm, factor=0.2),
     xlab="White Blood Count", ylab="ABM (0/1)", main="White Blood Count vs ABM")

plot(jitter(abmData$bands), jitter(abmData$abm, factor=0.2),
     xlab="Bands", ylab="ABM (0/1)", main="Bands vs ABM")

plot(jitter(abmData$gl), jitter(abmData$abm, factor=0.2),
     xlab="Glucose", ylab="ABM (0/1)", main="Glucose vs ABM")

plot(jitter(abmData$polys), jitter(abmData$abm, factor=0.2),
     xlab="Polymorphonuclear Leukocytes", ylab="ABM (0/1)", main="Polymorphonuclear Leukocytes vs ABM")

plot(jitter(abmData$lymphs[-37]), jitter(abmData$abm[-37], factor=0.2),
     xlab="Lymphs", ylab="ABM (0/1)", main="Lymphs vs ABM")

tab_sex <- table(abmData$sex, abmData$abm)
prop_sex <- prop.table(tab_sex, 1)
barplot(t(prop_sex[, c("1","0")]), beside=FALSE,
        main="Proportion of ABM by Sex", ylab="Proportion", names.arg = c("Male", "Female"), legend.text = c("ABM", "No ABM"))

tab_priorrx <- table(abmData$priorrx, abmData$abm)
prop_priorrx <- prop.table(tab_priorrx, 1)
barplot(t(prop_priorrx[, c("1","0")]), beside=FALSE,
        main="Proportion of ABM by Priorrx", ylab="Proportion", names.arg = c("No Priorrx", "Priorrx"), legend.text = c("ABM", "No ABM"))

tab_month <- table(abmData$month, abmData$abm)
prop_month <- prop.table(tab_month, 1)
barplot(t(prop_month[, c("1","0")]), beside=FALSE,
        main="Proportion of ABM by Month of Admission", ylab="Proportion", legend.text = c("ABM", "No ABM"))

tab_season <- table(abmData$season, abmData$abm)
prop_season <- prop.table(tab_season, 1)
barplot(t(prop_season[, c("1","0")]), beside=FALSE,
        main="Proportion of ABM by Season", ylab="Proportion", legend.text = c("ABM", "No ABM"))

Part B

initialModel <- glm(abm ~ age + sex + priorrx + gl + polys + wbc + bands + lymphs + month + season, data = abmData, family = binomial)
summary(initialModel)

## 
## Call:
## glm(formula = abm ~ age + sex + priorrx + gl + polys + wbc + 
##     bands + lymphs + month + season, family = binomial, data = abmData)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.408139   1.340322  -4.035 5.46e-05 ***
## age           0.030191   0.011952   2.526 0.011537 *  
## sex2          0.371320   0.415486   0.894 0.371482    
## priorrx       1.283335   0.477098   2.690 0.007148 ** 
## gl           -0.028062   0.006750  -4.157 3.22e-05 ***
## polys         0.038489   0.010383   3.707 0.000210 ***
## wbc           0.228834   0.048371   4.731 2.24e-06 ***
## bands         0.164062   0.039915   4.110 3.95e-05 ***
## lymphs        0.002489   0.008448   0.295 0.768316    
## month        -0.014900   0.070381  -0.212 0.832335    
## seasonWinter  1.956397   0.584528   3.347 0.000817 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 368.79  on 266  degrees of freedom
## Residual deviance: 153.79  on 256  degrees of freedom
## AIC: 175.79
## 
## Number of Fisher Scoring iterations: 6

simplifiedModel <- glm(abm ~ age + priorrx + gl + polys + wbc + bands + season, data = abmData, family = binomial)
summary(simplifiedModel)

## 
## Call:
## glm(formula = abm ~ age + priorrx + gl + polys + wbc + bands + 
##     season, family = binomial, data = abmData)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -5.082294   0.988304  -5.142 2.71e-07 ***
## age           0.029791   0.011932   2.497  0.01254 *  
## priorrx       1.271579   0.476033   2.671  0.00756 ** 
## gl           -0.028283   0.006788  -4.166 3.09e-05 ***
## polys         0.036350   0.006824   5.326 1.00e-07 ***
## wbc           0.228648   0.048174   4.746 2.07e-06 ***
## bands         0.167666   0.039862   4.206 2.60e-05 ***
## seasonWinter  1.948379   0.570658   3.414  0.00064 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 368.79  on 266  degrees of freedom
## Residual deviance: 154.72  on 259  degrees of freedom
## AIC: 170.72
## 
## Number of Fisher Scoring iterations: 6

anova(initialModel, simplifiedModel, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: abm ~ age + sex + priorrx + gl + polys + wbc + bands + lymphs + 
##     month + season
## Model 2: abm ~ age + priorrx + gl + polys + wbc + bands + season
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       256     153.79                     
## 2       259     154.72 -3 -0.93281   0.8175

I began by fitting a model using all explanatory terms and no interactions. This resulted in the effects of all but three parameters proving significant. I proceeded to remove the three non-significant terms and then to do an ANOVA between the complex and simplified models. The test resulted in no evidence for a difference between the two, confirming the selection of the simpler model which had significant main effects or all variables.

Part C

simplifiedModel$deviance

## [1] 154.722

simplifiedModel$df.residual

## [1] 259

1 - pchisq(simplifiedModel$deviance, simplifiedModel$df.residual)

## [1] 1

Here we see a residual deviance of 154.722 on 259 degrees of freedom which gives us a Chi-Square Goodness Of Fit statistic very close to 1. This suggests that our model fits the data very well, and possibly too well. This is a case of under-dispersion which should be noted, but is not necessary an issue.

Part D

plot(fitted(simplifiedModel), qresiduals(simplifiedModel), ylab = "Randomized Quantile Residuals")

Looking at the randomized quantile residuals plot, it appears that the residuals have an approximately constant variance. From this along with the Goodness Of Fit test, we can conclude that our model fits the data sufficiently well.

Question 4

Part A

gamFit <- gam(abm ~ s(age) + s(gl) + s(polys) + s(wbc) + s(bands) + season + priorrx, data = abmData, family = binomial)
plot(gamFit)

The GAM plots above display apparent quadratic relationships between abm~age and abm~gl. All other relationships appear linear.

Part B

complexNonLinearModel <- glm(abm ~ age + I(age^2) + gl + I(gl^2) + polys + I(polys^2) + wbc + I(wbc^2) + bands + I(bands^2) + season + priorrx, data = abmData, family = binomial)
summary(complexNonLinearModel)

## 
## Call:
## glm(formula = abm ~ age + I(age^2) + gl + I(gl^2) + polys + I(polys^2) + 
##     wbc + I(wbc^2) + bands + I(bands^2) + season + priorrx, family = binomial, 
##     data = abmData)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.4939339  2.1463000   1.162 0.245248    
## age          -0.2457584  0.0653383  -3.761 0.000169 ***
## I(age^2)      0.0040771  0.0009626   4.236 2.28e-05 ***
## gl           -0.1599344  0.0390230  -4.098 4.16e-05 ***
## I(gl^2)       0.0007359  0.0002119   3.473 0.000514 ***
## polys        -0.0154611  0.0347084  -0.445 0.655990    
## I(polys^2)    0.0005190  0.0003447   1.505 0.132203    
## wbc           0.1157136  0.2212001   0.523 0.600893    
## I(wbc^2)      0.0024663  0.0077862   0.317 0.751430    
## bands         0.1565298  0.0911011   1.718 0.085760 .  
## I(bands^2)   -0.0011625  0.0033551  -0.346 0.728982    
## seasonWinter  2.0918995  0.7086221   2.952 0.003157 ** 
## priorrx       1.8139735  0.6672927   2.718 0.006560 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 368.787  on 266  degrees of freedom
## Residual deviance:  99.548  on 254  degrees of freedom
## AIC: 125.55
## 
## Number of Fisher Scoring iterations: 8

simpleNonLinearModel <- glm(abm ~ age + I(age^2) + gl + I(gl^2) + polys + wbc + bands + season + priorrx, data = abmData, family = binomial)
summary(simpleNonLinearModel)

## 
## Call:
## glm(formula = abm ~ age + I(age^2) + gl + I(gl^2) + polys + wbc + 
##     bands + season + priorrx, family = binomial, data = abmData)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.5184019  1.7399793   0.873 0.382851    
## age          -0.2333576  0.0604045  -3.863 0.000112 ***
## I(age^2)      0.0039737  0.0009053   4.389 1.14e-05 ***
## gl           -0.1585199  0.0373872  -4.240 2.24e-05 ***
## I(gl^2)       0.0007419  0.0002047   3.625 0.000289 ***
## polys         0.0363385  0.0085889   4.231 2.33e-05 ***
## wbc           0.1719846  0.0549834   3.128 0.001760 ** 
## bands         0.1317812  0.0405073   3.253 0.001141 ** 
## seasonWinter  1.9637877  0.6812882   2.882 0.003946 ** 
## priorrx       1.5637366  0.6320470   2.474 0.013358 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 368.79  on 266  degrees of freedom
## Residual deviance: 102.34  on 257  degrees of freedom
## AIC: 122.34
## 
## Number of Fisher Scoring iterations: 7

anova(complexNonLinearModel, simpleNonLinearModel, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: abm ~ age + I(age^2) + gl + I(gl^2) + polys + I(polys^2) + wbc + 
##     I(wbc^2) + bands + I(bands^2) + season + priorrx
## Model 2: abm ~ age + I(age^2) + gl + I(gl^2) + polys + wbc + bands + season + 
##     priorrx
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       254     99.548                     
## 2       257    102.345 -3  -2.7963   0.4241

anova(simplifiedModel, simpleNonLinearModel, test = "Chisq")

## Analysis of Deviance Table
## 
## Model 1: abm ~ age + priorrx + gl + polys + wbc + bands + season
## Model 2: abm ~ age + I(age^2) + gl + I(gl^2) + polys + wbc + bands + season + 
##     priorrx
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)    
## 1       259     154.72                         
## 2       257     102.34  2   52.377 4.23e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Above, I built a model including all quadratic terms and found that the variables appearing to follow a curve in the GAM plots were also the only ones with significant non-linear effects in the output (age and gl). I proceeded to build another model with only these quadratic terms included and then did an ANOVA to confirm no significant difference between the two.

I then compared the simplified non-linear model to the one with only linear terms using an ANOVA. I found a significant difference between the two, with the non-linear model having a lower deviance with a difference of 52.38 on a difference of 2 degrees of freedom. This gives us evidence that the non-linear model is the better of the two.

Part C

complexInteractModel <- glm(abm ~ age + I(age^2) + I(gl^2)+ (gl + polys + wbc + bands)^2 + season + priorrx, data = abmData, family = binomial)

all.fits <- dredge(complexInteractModel)

topInteractionModel <- get.models(all.fits, 1)[[1]]
summary(topInteractionModel)

## 
## Call:
## glm(formula = abm ~ age + I(age^2) + bands + gl + I(gl^2) + polys + 
##     priorrx + season + wbc + 1, family = binomial, data = abmData)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   1.5184019  1.7399793   0.873 0.382851    
## age          -0.2333576  0.0604045  -3.863 0.000112 ***
## I(age^2)      0.0039737  0.0009053   4.389 1.14e-05 ***
## bands         0.1317812  0.0405073   3.253 0.001141 ** 
## gl           -0.1585199  0.0373872  -4.240 2.24e-05 ***
## I(gl^2)       0.0007419  0.0002047   3.625 0.000289 ***
## polys         0.0363385  0.0085889   4.231 2.33e-05 ***
## priorrx       1.5637366  0.6320470   2.474 0.013358 *  
## seasonWinter  1.9637877  0.6812882   2.882 0.003946 ** 
## wbc           0.1719846  0.0549834   3.128 0.001760 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 368.79  on 266  degrees of freedom
## Residual deviance: 102.34  on 257  degrees of freedom
## AIC: 122.34
## 
## Number of Fisher Scoring iterations: 7

Above, I fit a model with all two-way interaction terms between clinical variables and used the “dredge” function to determine the best combination of these terms. This resulted in a model without any interaction, but with all linear and quadratic terms included from before. This gives us evidence that there is no significant interaction between clinical variables in our data.

Part D

plot(fitted(topInteractionModel), qresiduals(topInteractionModel), ylab = "Randomized Quantile Residuals")

The residual plot shows approximately constant variance which suggests that our model fits the data well.

Part E

topInteractionModel$deviance

## [1] 102.3445

topInteractionModel$df.residual

## [1] 257

1 - pchisq(topInteractionModel$deviance, topInteractionModel$df.residual)

## [1] 1

For this model we have a deviance of 102.345 on 257 degrees of freedom, resulting in a Chi-square Goodness Of Fit statistic very close to 1. This suggests that our model fits the data very well and results in under dispersion. The level of under dispersion is not necessary concerning, however, and so we can judge this to be ok.

Section 2

Question 1 and 2

newData <- select(abmData, abm, age, gl, priorrx, season)

grouped <- newData %>%
  mutate(age_cat = cut(age, breaks = c(0, 5, 15, max(age)), include.lowest = TRUE)) %>%
  mutate(gl_cat = cut(gl, breaks = c(0, 80, max(gl)), include.lowest = TRUE)) %>%
  group_by(age_cat, gl_cat, priorrx, season) %>%
  summarise(abmRate = mean(abm), n = n(), y=sum(abm))

The variables must be grouped because working with grouped binomial data is very different than ungrouped. Grouped data allows us to estimamte parameters and compare trends between sets of observations rather than looking only at each observation individually. It also changes how we work with the data computationally, requiring us to check for sparsity based on the size of each group.

Question 3

groupedModel <- glm(cbind(y, n-y)~ (age_cat + gl_cat + priorrx + season)^2, data=grouped, family= binomial)

all.fits.grouped <- dredge(groupedModel)

bestGroupedModel <- get.models(all.fits.grouped, 1)[[1]]
summary(bestGroupedModel)

## 
## Call:
## glm(formula = cbind(y, n - y) ~ age_cat + gl_cat + priorrx + 
##     season + 1, family = binomial, data = grouped)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.04905    0.22349  -0.219 0.826282    
## age_cat(5,15]  -1.64117    0.44648  -3.676 0.000237 ***
## age_cat(15,84] -0.95213    0.31610  -3.012 0.002594 ** 
## gl_cat(80,240] -0.87728    0.37367  -2.348 0.018888 *  
## priorrx         1.24764    0.33688   3.703 0.000213 ***
## seasonWinter    1.66461    0.39887   4.173    3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 106.65  on 22  degrees of freedom
## Residual deviance:  33.27  on 17  degrees of freedom
## AIC: 81.466
## 
## Number of Fisher Scoring iterations: 5

Here I built a model with all variables and two way interactions included and used “dredge” to find the best combination of terms. This resulted in keeping only the main effects of all variables.

bestGroupedModel$deviance

## [1] 33.26956

bestGroupedModel$df.residual

## [1] 17

1 - pchisq(bestGroupedModel$deviance, bestGroupedModel$df.residual)

## [1] 0.01041695

plot(grouped$n, grouped$y/grouped$n, xlim = c(0, max(grouped$n))) %>%
abline(v=5, col="red")

plot(fitted(bestGroupedModel), qresiduals(groupedModel), ylab = "Randomized Quantile Residuals")

Here we have a a deviance of 33.269 on 17 degrees of freedom, giving us a low Goodness Of Fit statistic of 0.0104. This suggests that our model does not fit the data very well.

Also shown are a plot to check for sparsity with a red line at group size = 5, and the residuals plot which is hard to interpret but seems to show relatively constant variance.

Even with relatively constant variance, however, our low Goodness Of Fit tells us that the model does not sufficiently fit the data.

Question 5

modifiedData <- abmData %>%
  mutate(age_cat = cut(age, breaks = c(0, 5, 15, max(age)), include.lowest = TRUE)) %>%
  mutate(gl_cat = cut(gl, breaks = c(0, 80, max(gl)), include.lowest = TRUE))

ungroupedModel <- glm(abm ~ age_cat + gl_cat + priorrx + season + 1, family = binomial, data = modifiedData)
summary(ungroupedModel)

## 
## Call:
## glm(formula = abm ~ age_cat + gl_cat + priorrx + season + 1, 
##     family = binomial, data = modifiedData)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -0.04905    0.22349  -0.219 0.826282    
## age_cat(5,15]  -1.64117    0.44648  -3.676 0.000237 ***
## age_cat(15,84] -0.95213    0.31610  -3.012 0.002594 ** 
## gl_cat(80,240] -0.87728    0.37367  -2.348 0.018888 *  
## priorrx         1.24764    0.33688   3.703 0.000213 ***
## seasonWinter    1.66461    0.39887   4.173    3e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 368.79  on 266  degrees of freedom
## Residual deviance: 295.41  on 261  degrees of freedom
## AIC: 307.41
## 
## Number of Fisher Scoring iterations: 4

ungroupedModel$deviance

## [1] 295.4098

ungroupedModel$df.residual

## [1] 261

1 - pchisq(ungroupedModel$deviance, ungroupedModel$df.residual)

## [1] 0.07031723

plot(fitted(ungroupedModel), qresiduals(ungroupedModel), ylab = "Randomized Quantile Residuals")

After fitting the grouped binomial model to the ungrouped data we see that the model coefficients have stayed exactly the same but the Goodness Of Fit has changed. With a deviance of 295.4098 on 261 degrees of freedom, the Goodness Of Fit statistic increased to 0.0703. This puts the model fit on the edge, but within the acceptable range for fitting the data. I also produced a residual plot where we can see approximately constant varaince. Together, this tells us that while the grouped binomial model did not fit the grouped data, it does seem to fit the ungrouped data even if not very well.

STATS 330 Assignment 2

Liam Rohrer, lroh486, 973023817

2025-08-13

Section 1

Question 1

Question 2

Question 3

Part A

Part B

Part C

Part D

Question 4

Part A

Part B

Part C

Part D

Part E

Section 2

Question 1 and 2

Question 3

Question 5