import numpy as np
import pandas as pd

#Reading in data
data = pd.read_csv("winequality-red.csv", sep=";")

#Converting quality to binary classification with a cutoff at 6
data["quality"] = (data["quality"] >= 6).astype(int)

X = data.drop("quality", axis=1).values
y = data["quality"].values

#Randomly splitting data into training and testing sets
np.random.seed(0)
idx = np.random.permutation(len(X))
split = int(0.7*len(X))

trainFeatures, testFeatures = X[idx[:split]], X[idx[split:]]
trainTarget, testTarget = y[idx[:split]], y[idx[split:]]

#Calculating entropy
def entropy(y):
    p = np.bincount(y)/len(y)
    p = p[p>0]
    return -np.sum(p*np.log2(p))

#Calculating information gain
def infoGain(X_col, y, t):
    p = entropy(y)
    left = y[X_col <= t]
    right = y[X_col > t]
    if len(left)==0 or len(right)==0:
        return 0
    c = (len(left)/len(y))*entropy(left) + (len(right)/len(y))*entropy(right)

    return p-c

#Splitting data based on best information gain
def splitInto(X,y):
    best_gain = -1
    best_feat = None
    best_thresh = None
    for f in range(X.shape[1]):
        for t in np.unique(X[:,f]):
            g = infoGain(X[:,f],y,t)
            if g > best_gain:
                best_gain = g
                best_feat = f
                best_thresh = t
    return best_feat,best_thresh

#Training the model with no depth limit
def train(X, y):
    if len(np.unique(y)) == 1:
        return y[0]
    f, t = splitInto(X, y)
    if f is None:
        return np.bincount(y).argmax()
    left = X[:, f] <= t
    right = X[:, f] > t

    return {
        "feature": f,
        "threshold": t,
        "left": train(X[left], y[left]),
        "right": train(X[right], y[right])
    }

#Printing the tree for visual analysis
def printTree(tree, depth=0):
    indent = "  " * depth
    if not isinstance(tree, dict):
        print(indent + "Leaf:", tree)
        return
    print(indent + f"[X{tree['feature']} <= {tree['threshold']:.3f}]")
    printTree(tree["left"], depth+1)
    printTree(tree["right"], depth+1)
    
#Creating a model with no depth limit
tree_full = train(trainFeatures,trainTarget)

#Trainiing the model with a given depth limit
def trainDepth(X,y,depth=0,stopping_depth=None):
    if len(np.unique(y))==1:
        return y[0]
    if stopping_depth is not None and depth>=stopping_depth:
        return np.bincount(y).argmax()
    f,t = splitInto(X,y)
    if f is None:
        return np.bincount(y).argmax()
    left = X[:,f] <= t
    right = X[:,f] > t

    return {
        "feature":f,
        "threshold":t,
        "left":trainDepth(X[left],y[left],depth+1,stopping_depth),
        "right":trainDepth(X[right],y[right],depth+1,stopping_depth)
    }

#Creating models with depth limits of 2, 3, and 4 and printing them for visual analysis
tree_d2 = trainDepth(trainFeatures,trainTarget,stopping_depth=2)
tree_d3 = trainDepth(trainFeatures,trainTarget,stopping_depth=3)
tree_d4 = trainDepth(trainFeatures,trainTarget,stopping_depth=4)

printTree(tree_d2)
print()
printTree(tree_d3)
print()
printTree(tree_d4)

[X10 <= 10.500]
  [X9 <= 0.550]
    Leaf: 0
    Leaf: 0
  [X1 <= 0.865]
    Leaf: 1
    Leaf: 0

[X10 <= 10.500]
  [X9 <= 0.550]
    [X1 <= 0.450]
      Leaf: 0
      Leaf: 0
    [X6 <= 103.000]
      Leaf: 0
      Leaf: 0
  [X1 <= 0.865]
    [X9 <= 0.580]
      Leaf: 1
      Leaf: 1
    [X7 <= 0.994]
      Leaf: 0
      Leaf: 0

[X10 <= 10.500]
  [X9 <= 0.550]
    [X1 <= 0.450]
      [X4 <= 0.081]
        Leaf: 1
        Leaf: 0
      [X0 <= 7.300]
        Leaf: 0
        Leaf: 0
    [X6 <= 103.000]
      [X1 <= 0.360]
        Leaf: 1
        Leaf: 0
      Leaf: 0
  [X1 <= 0.865]
    [X9 <= 0.580]
      [X7 <= 0.996]
        Leaf: 1
        Leaf: 0
      [X10 <= 11.300]
        Leaf: 1
        Leaf: 1
    [X7 <= 0.994]
      [X0 <= 5.600]
        Leaf: 0
        Leaf: 1
      Leaf: 0

#Predicting labels for individual observations
def predictSample(x,tree):
    if not isinstance(tree,dict):
        return tree
    if x[tree["feature"]] <= tree["threshold"]:
        return predictSample(x,tree["left"])
    else:
        return predictSample(x,tree["right"])

#Predicting labels for a set of observations
def predict(X,tree):
    return np.array([predictSample(x,tree) for x in X])

#Calculating the accuracy of the model on the given data
def accuracy(y,prediction):
    return np.mean(y==prediction)

#Calculating confusion matrix values for performance metrics
def confusionCounts(y_true, y_pred):
    tp = np.sum((y_true == 1) & (y_pred == 1))
    tn = np.sum((y_true == 0) & (y_pred == 0))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    
    return tp, tn, fp, fn

#Calculating and displaying precision, recall, and false positive rates for model analysis
def performanceMetrics(y_true, y_pred):
    tp, tn, fp, fn = confusionCounts(y_true, y_pred)
    print(f"TP={tp}, TN={tn}, FP={fp}, FN={fn}")
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0
    print(f"Precision = {precision:.4f}")
    print(f"Recall = {recall:.4f}")
    print(f"FPR = {fpr:.4f}")
    print()

#Creating a model for depth limit 6 which I found to be the best performing limit
tree_d6 = trainDepth(trainFeatures,trainTarget,stopping_depth=6)

#Printing out performance metrics on both training and testing data for the full model and the depth 6 model for comparison
print("Accuracy of unrestricted model on training data:",accuracy(trainTarget,predict(trainFeatures,tree_full)))

print("Accuracy of unrestricted model on testing data:",accuracy(testTarget,predict(testFeatures,tree_full)))
performanceMetrics(predict(testFeatures, tree_full), testTarget)

print("Accuracy of depth 6 model on training data:",accuracy(trainTarget,predict(trainFeatures,tree_d6)))
print("Accuracy of depth 6 model on testing data:",accuracy(testTarget,predict(testFeatures,tree_d6)))

Accuracy of unrestricted model on training data: 1.0
Accuracy of unrestricted model on testing data: 0.75
TP=195, TN=165, FP=69, FN=51
Precision = 0.7386
Recall = 0.7927
FPR = 0.2949

Accuracy of depth 6 model on training data: 0.80875781948168
Accuracy of depth 6 model on testing data: 0.7729166666666667

COMPSCI 361 Assignment 1¶

Liam Rohrer (973023817)¶

Task 1 (Coding)¶

Part 1¶

Part 2¶

Part 3¶

Task 2 (Reflection)¶

Part 1:¶

Part 2:¶