import numpy as np
import pandas as pd
# Load the red-wine dataset (semicolon-delimited CSV).
data = pd.read_csv("winequality-red.csv", sep=";")
# Binarize the target: quality >= 6 -> 1 (high), otherwise 0 (low).
data["quality"] = (data["quality"] >= 6).astype(int)
X = data.drop("quality", axis=1).values
y = data["quality"].values
# Shuffle row indices reproducibly, then carve out a 70/30 train/test split.
np.random.seed(0)
idx = np.random.permutation(len(X))
cut = int(0.7 * len(X))
train_idx, test_idx = idx[:cut], idx[cut:]
trainFeatures, testFeatures = X[train_idx], X[test_idx]
trainTarget, testTarget = y[train_idx], y[test_idx]
#Calculating entropy
def entropy(y):
    """Shannon entropy (in bits) of an integer label vector y."""
    counts = np.bincount(y)
    # Drop empty classes so log2 never sees a zero probability.
    probs = counts[counts > 0] / len(y)
    return -(probs * np.log2(probs)).sum()
#Calculating information gain
def infoGain(X_col, y, t):
    """Information gain of splitting labels y on the test X_col <= t."""
    left_mask = X_col <= t
    y_left, y_right = y[left_mask], y[~left_mask]
    # A split that leaves one side empty carries no information.
    if min(len(y_left), len(y_right)) == 0:
        return 0
    n = len(y)
    weighted_child = (len(y_left) / n) * entropy(y_left) \
        + (len(y_right) / n) * entropy(y_right)
    return entropy(y) - weighted_child
#Splitting data based on best information gain
def splitInto(X, y):
    """Exhaustive search over every (feature, threshold) pair; returns the
    pair with the highest information gain, or (None, None) if no split
    with positive gain exists."""
    best = (-1, None, None)  # (gain, feature, threshold)
    for feat in range(X.shape[1]):
        column = X[:, feat]
        # Candidate thresholds are the distinct observed values.
        for thresh in np.unique(column):
            gain = infoGain(column, y, thresh)
            if gain > best[0]:
                best = (gain, feat, thresh)
    return best[1], best[2]
#Training the model with no depth limit
def train(X, y):
    """Recursively grow an unpruned decision tree.

    Returns a nested dict {"feature", "threshold", "left", "right"} for an
    internal node, or a bare class label for a leaf.
    """
    # Pure node: every observation already shares one class.
    if len(np.unique(y)) == 1:
        return y[0]
    feat, thresh = splitInto(X, y)
    # No informative split available: fall back to the majority class.
    if feat is None:
        return np.bincount(y).argmax()
    go_left = X[:, feat] <= thresh
    node = {"feature": feat, "threshold": thresh}
    node["left"] = train(X[go_left], y[go_left])
    node["right"] = train(X[~go_left], y[~go_left])
    return node
#Printing the tree for visual analysis
def printTree(tree, depth=0):
    """Pretty-print a trained tree, indenting one space per level."""
    pad = " " * depth
    if isinstance(tree, dict):
        print(pad + f"[X{tree['feature']} <= {tree['threshold']:.3f}]")
        printTree(tree["left"], depth + 1)
        printTree(tree["right"], depth + 1)
    else:
        # Leaves are bare class labels rather than dicts.
        print(pad + "Leaf:", tree)
#Creating a model with no depth limit
# tree_full is the fully-grown (unpruned) tree from train(): a nested dict
# of decision nodes, or a bare label if the training set were already pure.
tree_full = train(trainFeatures,trainTarget)
I converted wine quality into a binary variable to make it easier to classify and interpret predictions using a decision tree. I found that the minimum and maximum values for "quality" were 3 and 8, so I set 6 as my cutoff point. I divided rankings into either high (>=6) or low (<6) quality, which were represented by 1 and 0, respectively. I then divided my data into a training and testing set with a 70/30 split. I did this to use the largest training set possible, while still keeping enough data to accurately evaluate my model. The specific split was chosen by convention, as it is a common choice for training machine learning models.
The tree produced is long, including many splits. It appears to continue splitting nodes until all observations are correctly classified by the model. This suggests that the model is overfit.
Part 2¶
#Training the model with a given depth limit
def trainDepth(X, y, depth=0, stopping_depth=None):
    """Grow a decision tree like train(), but stop splitting once depth
    reaches stopping_depth (None means unlimited) and emit the majority
    class as the leaf label at that point."""
    # Pure node: nothing left to split.
    if len(np.unique(y)) == 1:
        return y[0]
    # Depth cap reached: truncate with the majority class.
    if stopping_depth is not None and depth >= stopping_depth:
        return np.bincount(y).argmax()
    feat, thresh = splitInto(X, y)
    if feat is None:
        return np.bincount(y).argmax()
    go_left = X[:, feat] <= thresh
    return {
        "feature": feat,
        "threshold": thresh,
        "left": trainDepth(X[go_left], y[go_left], depth + 1, stopping_depth),
        "right": trainDepth(X[~go_left], y[~go_left], depth + 1, stopping_depth),
    }
#Creating models with depth limits of 2, 3, and 4 and printing them for visual analysis
depth_limited_trees = [trainDepth(trainFeatures, trainTarget, stopping_depth=d)
                       for d in (2, 3, 4)]
tree_d2, tree_d3, tree_d4 = depth_limited_trees
for i, shallow_tree in enumerate(depth_limited_trees):
    if i:
        print()  # blank line between consecutive trees
    printTree(shallow_tree)
[X10 <= 10.500]
[X9 <= 0.550]
Leaf: 0
Leaf: 0
[X1 <= 0.865]
Leaf: 1
Leaf: 0
[X10 <= 10.500]
[X9 <= 0.550]
[X1 <= 0.450]
Leaf: 0
Leaf: 0
[X6 <= 103.000]
Leaf: 0
Leaf: 0
[X1 <= 0.865]
[X9 <= 0.580]
Leaf: 1
Leaf: 1
[X7 <= 0.994]
Leaf: 0
Leaf: 0
[X10 <= 10.500]
[X9 <= 0.550]
[X1 <= 0.450]
[X4 <= 0.081]
Leaf: 1
Leaf: 0
[X0 <= 7.300]
Leaf: 0
Leaf: 0
[X6 <= 103.000]
[X1 <= 0.360]
Leaf: 1
Leaf: 0
Leaf: 0
[X1 <= 0.865]
[X9 <= 0.580]
[X7 <= 0.996]
Leaf: 1
Leaf: 0
[X10 <= 11.300]
Leaf: 1
Leaf: 1
[X7 <= 0.994]
[X0 <= 5.600]
Leaf: 0
Leaf: 1
Leaf: 0
The trees created here are limited to 2, 3, and 4 levels of splits. This makes it much easier to understand what decisions are being made, but the significant restrictions in depth mean that the models are likely underfit to the data.
Part 3¶
#Predicting labels for individual observations
def predictSample(x, tree):
    """Walk a single observation x down the tree until a leaf label is hit."""
    # Iterative descent: a dict is an internal node, anything else a leaf.
    while isinstance(tree, dict):
        branch = "left" if x[tree["feature"]] <= tree["threshold"] else "right"
        tree = tree[branch]
    return tree
#Predicting labels for a set of observations
def predict(X, tree):
    """Array of predicted labels, one per row of X."""
    labels = [predictSample(row, tree) for row in X]
    return np.asarray(labels)
#Calculating the accuracy of the model on the given data
def accuracy(y, prediction):
    """Fraction of predictions that agree with the true labels."""
    matches = y == prediction
    return np.mean(matches)
#Calculating confusion matrix values for performance metrics
def confusionCounts(y_true, y_pred):
    """Return (TP, TN, FP, FN) for binary 0/1 label vectors."""
    counts = {}
    # (actual, predicted) pairs defining each confusion-matrix cell.
    for actual, predicted, key in ((1, 1, "tp"), (0, 0, "tn"),
                                   (0, 1, "fp"), (1, 0, "fn")):
        counts[key] = np.sum((y_true == actual) & (y_pred == predicted))
    return counts["tp"], counts["tn"], counts["fp"], counts["fn"]
#Calculating and displaying precision, recall, and false positive rates for model analysis
def performanceMetrics(y_true, y_pred):
    """Print confusion counts plus precision, recall, and FPR for the
    binary predictions y_pred against ground truth y_true."""
    tp, tn, fp, fn = confusionCounts(y_true, y_pred)
    print(f"TP={tp}, TN={tn}, FP={fp}, FN={fn}")

    def ratio(num, den):
        # Guard against zero denominators (e.g. no positive predictions).
        return num / den if den > 0 else 0.0

    print(f"Precision = {ratio(tp, tp + fp):.4f}")
    print(f"Recall = {ratio(tp, tp + fn):.4f}")
    print(f"FPR = {ratio(fp, fp + tn):.4f}")
    print()
#Creating a model for depth limit 6 which I found to be the best performing limit
tree_d6 = trainDepth(trainFeatures, trainTarget, stopping_depth=6)
#Printing out performance metrics on both training and testing data for the full model and the depth 6 model for comparison
print("Accuracy of unrestricted model on training data:", accuracy(trainTarget, predict(trainFeatures, tree_full)))
print("Accuracy of unrestricted model on testing data:", accuracy(testTarget, predict(testFeatures, tree_full)))
# Bug fix: performanceMetrics expects (y_true, y_pred). The original call
# passed (predictions, truth), which swaps FP with FN and therefore swaps
# the reported precision/recall and distorts the FPR.
performanceMetrics(testTarget, predict(testFeatures, tree_full))
print("Accuracy of depth 6 model on training data:", accuracy(trainTarget, predict(trainFeatures, tree_d6)))
print("Accuracy of depth 6 model on testing data:", accuracy(testTarget, predict(testFeatures, tree_d6)))
Accuracy of unrestricted model on training data: 1.0 Accuracy of unrestricted model on testing data: 0.75 TP=195, TN=165, FP=69, FN=51 Precision = 0.7386 Recall = 0.7927 FPR = 0.2949 Accuracy of depth 6 model on training data: 0.80875781948168 Accuracy of depth 6 model on testing data: 0.7729166666666667
Here, I've assessed the unrestricted model by calculating its overall accuracy, precision, recall, and false positive rate on the testing data. I've also displayed its accuracy on the training data. The model performs perfectly on the training data, but has an overall accuracy of 0.75 on the testing data. This tells us that the model is overfit.
Looking further at performance on the test set, we see a precision rate of 0.79 which tells us that correct predictions are reliable about 79% of the time. The model also missed some good wines with a slightly lower recall rate of 0.74. This means that 26% of the good wines weren't classified as such. Finally, we have a relatively high false positive rate, classifying about 24% of bad wines as good. Overall, the unrestricted model seems to be performing relatively well but is overfit to the data.
With some trial and error, I discovered that a depth limit of 6 gives the best result with an overall accuracy of 0.81 on the training data and 0.77 on the testing data.
Task 2 (Reflection)¶
Part 1:¶
Changing the splitting criterion will change the logic of how our model divides observations at each split. Currently, we're using maximum information gain, which reduces entropy as much as possible at each step and considers the entire class distribution. This prioritizes node purity and leads to deep splits in our tree. We could instead consider using minimum misclassification error. Misclassification error only cares about the majority class and so will make much less targeted splits. This will likely make for a simpler, shallower tree and lead to a higher risk of underfitting compared to the criterion we used.
Part 2:¶
I used my testing procedure in Task 1-C to compare the overall accuracy of the unrestricted model on the training data to its accuracy on the testing data. I found that the model performed perfectly on the training data, but only classified 75% of the test observations correctly. This indicates that the model is overfit to the data.
In comparison, I also assessed the same metrics for a model with a depth limit of 6. I found that, although it performed worse on the training data, it performed better on the testing data. This further confirms overfitting in the unrestricted model.