import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the labelled train / test splits.
train_raw, test_raw = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

#Vectorization
# The vocabulary and IDF weights are learnt from the training articles only; the test
# articles are mapped onto that same vocabulary, so both matrices share their columns.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_raw['Article'])
X_test = vectorizer.transform(test_raw['Article'])

#Report sizes
n_train_docs, n_train_terms = X_train.shape
n_test_docs, n_test_terms = X_test.shape
print(f"Train: {n_train_docs} articles, {n_train_terms} features")
print(f"Test: {n_test_docs}  articles, {n_test_terms} features")

#Show 5 example articles with their TF-IDF feature vectors
feature_names = vectorizer.get_feature_names_out()


def _top_tfidf_terms(sparse_row, names, k=5):
    """Return the ``k`` highest-weighted ``(term, weight)`` pairs of a single-row CSR matrix.

    ``sparse_row`` is one row sliced from the TF-IDF matrix and ``names`` maps a
    column index to its term. Pairs are ordered from the largest weight down.
    """
    # ``indices`` and ``data`` are parallel arrays over the stored entries, so a single
    # ordering serves both. (``nonzero()`` skips explicitly stored zeros and could
    # therefore drift out of step with ``data``.)
    order = np.argsort(sparse_row.data)[::-1][:k]
    return [(names[col], weight) for col, weight in zip(sparse_row.indices[order], sparse_row.data[order])]


example_rows = []
# Never ask for more rows than the training set actually contains.
for i in range(min(5, X_train.shape[0])):
    top_terms = _top_tfidf_terms(X_train[i], feature_names)
    example_rows.append({
        'Id': train_raw['Id'].iloc[i],
        'Category': train_raw['Category'].iloc[i],
        'Article (first 60 chars)': train_raw['Article'].iloc[i][:60],
        'Top-5 features (word: tfidf weight)': ', '.join(f"{term}: {weight:.3f}" for term, weight in top_terms)
    })

example_df = pd.DataFrame(example_rows)
print(example_df.to_string(index=False))


#Term frequency analysis — three plots

from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts over the training articles; the vocabulary learnt here is reused
# for the per-class counts further down.
cv = CountVectorizer()
X_counts = cv.fit_transform(train_raw['Article'])
words = cv.get_feature_names_out()

#Plot 1: Top-50 term frequency distribution
# Column sums give the number of occurrences of each term across the training set.
word_freq = np.asarray(X_counts.sum(axis=0)).ravel()
freq_series = pd.Series(word_freq, index=words).sort_values(ascending=False)
top50 = freq_series.iloc[:50]

bar_positions = range(50)
fig, ax = plt.subplots(figsize=(16, 5))
ax.bar(bar_positions, top50.values, color='steelblue', edgecolor='white')
ax.set_xticks(bar_positions)
ax.set_xticklabels(top50.index, rotation=90, fontsize=8)
ax.set_title('Top-50 Term Frequency Distribution (Train)', fontsize=13)
ax.set(xlabel='Term', ylabel='Total Count')
fig.tight_layout()
fig.savefig('task1_top50_freq.png', dpi=120)
plt.show()

#Plot 2: Term frequency distribution per class
is_tech = train_raw['Category'] == 'tech'
is_ent = train_raw['Category'] == 'entertainment'
tech_articles = train_raw.loc[is_tech, 'Article']
ent_articles = train_raw.loc[is_ent, 'Article']

# Count with the vectorizer fitted on the whole training set so that both classes
# are measured against the same vocabulary.
tech_freq = pd.Series(np.asarray(cv.transform(tech_articles).sum(axis=0)).ravel(), index=words)
ent_freq = pd.Series(np.asarray(cv.transform(ent_articles).sum(axis=0)).ravel(), index=words)

top_n = 20
tech_top = tech_freq.nlargest(top_n)
ent_top = ent_freq.nlargest(top_n)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
panels = zip(axes, (tech_top, ent_top), ('Tech', 'Entertainment'), ('steelblue', 'salmon'))
for panel, top_terms, class_title, colour in panels:
    # Reverse so that the most frequent term ends up at the top of the horizontal bars.
    panel.barh(top_terms.index[::-1], top_terms.values[::-1], color=colour)
    panel.set_title(f'Top {top_n} Terms — {class_title}', fontsize=12)
    panel.set_xlabel('Count')

fig.suptitle('Term Frequency Distribution per Class (Train)', fontsize=13)
fig.tight_layout()
fig.savefig('task1_per_class_freq.png', dpi=120)
plt.show()

#Plot 3: Class distribution (train and test sets)
# Both panels use one shared class order and one class -> colour mapping.
# value_counts() sorts by frequency, so plotting its raw output could give the same
# class a different bar position and colour in the train and the test panel.
class_order = sorted(set(train_raw['Category'].dropna()) | set(test_raw['Category'].dropna()))
# Same colours as the per-class term-frequency plot (tech = steelblue, entertainment = salmon).
class_colors = {'tech': 'steelblue', 'entertainment': 'salmon'}
bar_colors = [class_colors.get(label, 'grey') for label in class_order]

train_counts = train_raw['Category'].value_counts().reindex(class_order, fill_value=0)
test_counts = test_raw['Category'].value_counts().reindex(class_order, fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(9, 4), sharey=True)
for ax, counts, split in zip(axes, (train_counts, test_counts), ('Train', 'Test')):
    bars = ax.bar(counts.index, counts.values, color=bar_colors, edgecolor='white')
    ax.set_title(f'Class Distribution — {split}', fontsize=12)
    # Write the exact count just above each bar.
    for bar, val in zip(bars, counts.values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, str(val), ha='center', fontsize=11)
# The y axis is shared, so only the left panel carries the label.
axes[0].set_ylabel('Article Count')

plt.suptitle('Class Distribution (Train vs Test)', fontsize=13)
plt.tight_layout()
plt.savefig('task1_class_dist.png', dpi=120)
plt.show()

Train: 428 articles, 13518 features
Test: 106  articles, 13518 features
  Id      Category                                     Article (first 60 chars)                                              Top-5 features (word: tfidf weight)
1976          tech lifestyle governs mobile choice faster better funkier hardwa           phone: 0.313, bjorn: 0.308, dr: 0.229, ericsson: 0.225, cameras: 0.221
1797 entertainment french honour director parker british film director sir alan             vabres: 0.309, parker: 0.276, alan: 0.218, french: 0.205, sir: 0.203
1866 entertainment fockers fuel festive film chart comedy meet fockers topped f            fockers: 0.347, meet: 0.305, christmas: 0.304, day: 0.191, box: 0.183
1153 entertainment housewives lift channel 4 ratings debut us television hit de audience: 0.311, housewives: 0.271, channel: 0.255, share: 0.236, january: 0.212
 342 entertainment u2 desire number one u2 three prestigious grammy awards hit                    band: 0.262, u2: 0.200, bono: 0.187, album: 0.149, rock: 0.128

#TASK 2: Classification Models

import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Integer-encode the category strings. The mapping is learnt from the training
# labels and then applied unchanged to the test labels.
le = LabelEncoder().fit(train_raw['Category'])
y_train = le.transform(train_raw['Category'])
y_test = le.transform(test_raw['Category'])


#Task 2(a): Naive Bayes

nb = MultinomialNB().fit(X_train, y_train)

# feature_log_prob_ has one row per class and one column per vocabulary term:
# entry [c, j] is log P(word j | class c).
log_probs = nb.feature_log_prob_
class_labels = le.classes_

#(i) Top 20 most predictive words per class
for cls, class_log_probs in zip(class_labels, log_probs):
    # argsort is ascending, so the 20 largest sit at the end; reverse them to get best-first.
    top20_idx = np.argsort(class_log_probs)[-20:][::-1]
    top20_words = list(feature_names[top20_idx])
    print(f"\nTop 20 predictive words for {cls}:")
    print(top20_words)

#(ii) Top 20 most discriminative words (maximize P(word|tech) / P(word|ent))
# A ratio of probabilities is a difference of log-probabilities.
tech_idx, ent_idx = (int(code) for code in le.transform(['tech', 'entertainment']))
log_ratio = log_probs[tech_idx] - log_probs[ent_idx]
top20_disc_idx = np.argsort(log_ratio)[-20:][::-1]
top20_disc_words = list(feature_names[top20_disc_idx])
print("\nTop 20 most discriminative words (tech vs entertainment):")
print(top20_disc_words)


#Task 2(b): kNN

#Reduce to 2D with PCA so we can plot a decision boundary
# The projection is fitted on the training matrix only; the test matrix is mapped with the
# same two components. Both sparse TF-IDF matrices are converted to dense arrays first.
pca = PCA(n_components=2, random_state=42)
X_train_2d = pca.fit_transform(X_train.toarray())
X_test_2d = pca.transform(X_test.toarray())

def plot_decision_boundary(clf, X, y, title, ax, grid_points=300, pad_frac=0.1, class_names=None):
    """Draw the decision regions of a fitted 2-D classifier with the labelled points on top.

    The classifier is evaluated on a regular mesh covering the data; its predictions are
    drawn as filled contours and the samples in ``X`` are scattered over them, one
    colour / marker per class.

    Parameters
    ----------
    clf : fitted estimator whose ``predict`` accepts two-column input.
    X : array of shape (n_samples, 2) holding the points to scatter.
    y : integer class id for every row of ``X``.
    title : text used as the axes title.
    ax : Matplotlib axes to draw on.
    grid_points : number of mesh points along each axis.
    pad_frac : margin added around the data, as a fraction of each axis' span.
    class_names : optional legend labels indexed by class id. When omitted, the
        module-level label encoder's ``le.classes_`` is used.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if class_names is None:
        class_names = le.classes_

    def _axis_ticks(values):
        # Size the mesh relative to the data instead of using a fixed step and margin.
        # The PCA projections plotted in this file span well under one unit, so a fixed
        # 0.3 step with a +/-1 margin samples the boundary on only a handful of points.
        lo, hi = values.min(), values.max()
        span = hi - lo
        pad = pad_frac * span if span > 0 else 1.0
        return np.linspace(lo - pad, hi + pad, grid_points)

    xx, yy = np.meshgrid(_axis_ticks(X[:, 0]), _axis_ticks(X[:, 1]))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')

    colors = ['steelblue', 'salmon']
    markers = ['o', 's']
    # Plot only the classes that actually occur; colour and marker are chosen by class id
    # so a class keeps its appearance from one panel to the next.
    for cls in np.unique(y):
        mask = y == cls
        style = int(cls)
        ax.scatter(X[mask, 0], X[mask, 1], c=colors[style % len(colors)], marker=markers[style % len(markers)],
                   label=class_names[cls], edgecolors='k', linewidths=0.3, s=30)
    ax.set_title(title, fontsize=10)
    ax.set_xlabel('PCA Component 1')
    ax.set_ylabel('PCA Component 2')
    ax.legend(fontsize=8)

#Plot four combinations: k = 3 and 10, metric = euclidean and manhattan
fig, axes = plt.subplots(2, 2, figsize=(13, 10))
# Panels are filled row by row: k changes down the rows, the metric across the columns.
configs = [(k, metric) for k in (3, 10) for metric in ('euclidean', 'manhattan')]

for panel, (n_neighbors, distance) in zip(axes.ravel(), configs):
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=distance).fit(X_train_2d, y_train)
    plot_decision_boundary(knn, X_train_2d, y_train, f'k = {n_neighbors},  metric = {distance}', panel)

fig.suptitle('kNN Decision Boundaries (PCA 2D Projection)', fontsize=13)
fig.tight_layout()
fig.savefig('task2b_knn_boundaries.png', dpi=120)
plt.show()


#Task 2(c): SVM

#(i) Soft-margin linear SVM — vary C
# One panel per value of C, all fitted on the 2-D PCA projection of the training set.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
for panel, C in zip(axes, (0.01, 1, 100)):
    svm = SVC(kernel='linear', C=C).fit(X_train_2d, y_train)
    plot_decision_boundary(svm, X_train_2d, y_train, f'Linear SVM  C = {C}', panel)

fig.suptitle('Soft-Margin Linear SVM — Effect of C (PCA 2D)', fontsize=13)
fig.tight_layout()
fig.savefig('task2c_svm_linear.png', dpi=120)
plt.show()

#(ii) Hard-margin RBF kernel SVM — vary gamma (gamma = 1 / (2σ²))
#A very large C approximates a hard margin
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
for panel, gamma in zip(axes, (0.01, 0.1, 1.0)):
    svm = SVC(kernel='rbf', C=1e6, gamma=gamma).fit(X_train_2d, y_train)
    plot_decision_boundary(svm, X_train_2d, y_train, f'RBF SVM  gamma={gamma}  (hard margin)', panel)

fig.suptitle('Hard-Margin RBF SVM — Effect of gamma/σ (PCA 2D)', fontsize=13)
fig.tight_layout()
fig.savefig('task2c_svm_rbf.png', dpi=120)
plt.show()


# Task 2(d): ANN — Effect of number of hidden units

# One hidden layer of h ReLU units, trained with SGD at a constant learning rate of 0.01
# and max_iter = 100.
# NOTE(review): no weight-initialisation option is passed to MLPClassifier below, so the
# library's default initialiser is used rather than a uniform draw from [0, 0.1] —
# confirm whether the task requires that initialisation.
hidden_units_list = [2, 5, 20, 40]
sgd_settings = dict(
    activation='relu',
    solver='sgd',
    learning_rate='constant',
    learning_rate_init=0.01,
    max_iter=100,
    random_state=42,
)

final_losses = []
for h in hidden_units_list:
    mlp = MLPClassifier(hidden_layer_sizes=(h,), **sgd_settings)
    mlp.fit(X_train, y_train)
    # loss_ is the training loss reported by the fitted model.
    final_losses.append(mlp.loss_)
    print(f"h = {h:2d}:  final loss = {mlp.loss_:.4f}")

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(hidden_units_list, final_losses, marker='o', color='steelblue', linewidth=2, markersize=8)
ax.set_title('ANN: Final Training Loss vs Number of Hidden Units', fontsize=12)
ax.set(xlabel='Number of Hidden Units (h)', ylabel='Binary Cross-Entropy Loss (after 100 epochs)')
ax.set_xticks(hidden_units_list)
fig.tight_layout()
fig.savefig('task2d_ann_loss.png', dpi=120)
plt.show()

Top 20 predictive words for entertainment:
['film', 'best', 'said', 'show', 'band', 'music', 'year', 'awards', 'us', 'award', 'actor', 'album', 'star', 'chart', 'tv', 'also', 'number', 'oscar', 'top', 'new']

Top 20 predictive words for tech:
['said', 'people', 'mobile', 'software', 'games', 'phone', 'net', 'users', 'technology', 'mr', 'microsoft', 'virus', 'computer', 'broadband', 'new', 'use', 'could', 'would', 'digital', 'game']

Top 20 most discriminative words (tech vs entertainment):
['mobile', 'software', 'users', 'microsoft', 'games', 'net', 'technology', 'virus', 'phone', 'broadband', 'computer', 'phones', 'spam', 'mail', 'firms', 'use', 'spyware', 'online', 'pc', 'internet']

h =  2:  final loss = 0.3936
h =  5:  final loss = 0.4427
h = 20:  final loss = 0.4551
h = 40:  final loss = 0.4081

#TASK 3: Classification Quality Evaluation

from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

# Task 3(a): Hyperparameter investigation using 5-fold cross-validation

def _mean_cv_f1(estimator):
    """Return the mean F1 of ``estimator`` over 5-fold cross-validation on the training set."""
    return cross_val_score(estimator, X_train, y_train, cv=5, scoring='f1').mean()


#Naive Bayes: alpha (Laplace smoothing)
nb_alphas = [0.01, 0.1, 0.5, 1.0, 2.0, 5.0]
nb_cv_scores = [_mean_cv_f1(MultinomialNB(alpha=a)) for a in nb_alphas]

#kNN: adjusting number of neighbours k
knn_ks = [1, 3, 6, 12, 18, 24]
knn_cv_scores = [_mean_cv_f1(KNeighborsClassifier(n_neighbors=k)) for k in knn_ks]

#SVM: regularisation parameter C (RBF kernel, gamma fixed at 0.1)
#Low C = wide margin, more misclassifications allowed (underfitting)
#High C = narrow margin, fewer misclassifications (risk of overfitting)
svm_Cs = [0.01, 0.1, 1, 10, 100]
svm_cv_scores = [_mean_cv_f1(SVC(kernel='rbf', C=c, gamma=0.1)) for c in svm_Cs]

#ANN: number of hidden units h
#More units = more capacity to learn complex patterns, but may overfit
ann_hs = [2, 5, 20, 40]
ann_cv_scores = [
    _mean_cv_f1(MLPClassifier(hidden_layer_sizes=(h,), solver='sgd', learning_rate_init=0.01, max_iter=100, random_state=42))
    for h in ann_hs
]

#Plot all four hyperparameter sweeps
# One row per panel: (grid, scores, colour, title, x label, log-scaled x axis?)
sweeps = [
    (nb_alphas, nb_cv_scores, 'steelblue', 'NB: alpha vs CV F1', 'alpha', True),
    (knn_ks, knn_cv_scores, 'salmon', 'kNN: k vs CV F1', 'k', False),
    (svm_Cs, svm_cv_scores, 'seagreen', 'SVM: C vs CV F1', 'C', True),
    (ann_hs, ann_cv_scores, 'mediumpurple', 'ANN: hidden units vs CV F1', 'Hidden units (h)', False),
]

fig, axes = plt.subplots(2, 2, figsize=(13, 9))
for panel, (grid, scores, colour, heading, x_label, log_x) in zip(axes.ravel(), sweeps):
    panel.plot(grid, scores, marker='o', color=colour)
    panel.set_title(heading)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Mean CV F1')
    if log_x:
        # alpha and C are swept over several orders of magnitude.
        panel.set_xscale('log')

fig.suptitle('Task 3(a): Hyperparameter Impact on 5-Fold CV F1', fontsize=13)
fig.tight_layout()
fig.savefig('task3a_hyperparameters.png', dpi=120)
plt.show()

# Print best values
best_nb_pos = np.argmax(nb_cv_scores)
best_knn_pos = np.argmax(knn_cv_scores)
best_svm_pos = np.argmax(svm_cv_scores)
best_ann_pos = np.argmax(ann_cv_scores)
print(f"Best NB alpha: {nb_alphas[best_nb_pos]} (F1={max(nb_cv_scores):.4f})")
print(f"Best kNN k: {knn_ks[best_knn_pos]} (F1={max(knn_cv_scores):.4f})")
print(f"Best SVM C: {svm_Cs[best_svm_pos]} (F1={max(svm_cv_scores):.4f})")
print(f"Best ANN h: {ann_hs[best_ann_pos]} (F1={max(ann_cv_scores):.4f})")


#Task 3(b): Compare best models on the test set using F1 score

#Best hyperparameters from 3(a)
# Read each winner directly from the 3(a) sweeps rather than hard-coding it, so the models
# evaluated here are always the ones cross-validation actually selected. The SVM keeps the
# kernel and gamma that its C sweep was run with; otherwise the tuned C would not apply.
best_alpha = nb_alphas[int(np.argmax(nb_cv_scores))]
best_k = knn_ks[int(np.argmax(knn_cv_scores))]
best_C = svm_Cs[int(np.argmax(svm_cv_scores))]
best_h = ann_hs[int(np.argmax(ann_cv_scores))]

best_models = {
    'Naive Bayes': MultinomialNB(alpha=best_alpha),
    'kNN': KNeighborsClassifier(n_neighbors=best_k),
    'SVM': SVC(kernel='rbf', C=best_C, gamma=0.1),
    'ANN': MLPClassifier(hidden_layer_sizes=(best_h,), solver='sgd', learning_rate_init=0.01, max_iter=100, random_state=42)
}

print("\nTask 3(b): Test set F1 scores with best hyperparameters")
test_f1_results = {}
for name, clf in best_models.items():
    # Fit on the full training set, score on the held-out test set.
    clf.fit(X_train, y_train)
    f1 = f1_score(y_test, clf.predict(X_test))
    test_f1_results[name] = f1
    print(f"{name:15s}  F1 = {f1:.4f}")

#Bar chart comparison
fig, ax = plt.subplots(figsize=(8, 5))
colors = ['steelblue', 'salmon', 'seagreen', 'mediumpurple']
model_names = list(test_f1_results)
scores = [test_f1_results[name] for name in model_names]
bars = ax.bar(model_names, scores, color=colors, edgecolor='white')
# Keep the zoomed-in axis when every score is high, but lower the floor if a model
# scores under 0.85 so that its bar is not clipped out of view.
y_floor = max(0.0, min(0.85, min(scores) - 0.05))
ax.set_ylim(y_floor, 1.01)
ax.set_title('Task 3(b): Test F1 Score by Classifier', fontsize=13)
ax.set_ylabel('F1 Score')
for bar, val in zip(bars, scores):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.002, f'{val:.4f}', ha='center', fontsize=10)
plt.tight_layout()
plt.savefig('task3b_test_f1.png', dpi=120)
plt.show()


#Task 3(c): Effect of training set size

#For each fraction m of the training data, train on data[0 : int(m * N)] and evaluate F1 on both the training slice and the full test set
ms = [0.1, 0.3, 0.5, 0.7, 0.9]
N = X_train.shape[0]

train_f1s = {name: [] for name in best_models}
test_f1s = {name: [] for name in best_models}

for m in ms:
    n = int(m * N)
    Xm, ym = X_train[:n], y_train[:n]
    # The estimator objects stored in best_models are refitted in place on every slice.
    for name, clf in best_models.items():
        clf.fit(Xm, ym)
        train_pred = clf.predict(Xm)
        test_pred = clf.predict(X_test)
        train_f1s[name].append(f1_score(ym, train_pred))
        test_f1s[name].append(f1_score(y_test, test_pred))

colors_map = dict(zip(
    ('Naive Bayes', 'kNN', 'SVM', 'ANN'),
    ('steelblue', 'salmon', 'seagreen', 'mediumpurple'),
))

#Plot (i): Training F1 score vs m, then Plot (ii): Testing F1 score vs m
curves = (
    (train_f1s, 'Task 3(c): Train F1 vs Training Set Size', 'Train F1', 'task3c_train_f1.png'),
    (test_f1s, 'Task 3(c): Test F1 vs Training Set Size', 'Test F1', 'task3c_test_f1.png'),
)
for scores_by_model, heading, y_label, out_file in curves:
    fig, ax = plt.subplots(figsize=(8, 5))
    for name, scores in scores_by_model.items():
        ax.plot(ms, scores, marker='o', label=name, color=colors_map[name])
    ax.set_title(heading, fontsize=13)
    ax.set_xlabel('Fraction of training data (m)')
    ax.set_ylabel(y_label)
    ax.set_xticks(ms)
    ax.legend()
    plt.tight_layout()
    plt.savefig(out_file, dpi=120)
    plt.show()

Best NB alpha: 0.1 (F1=0.9839)
Best kNN k: 12 (F1=0.9795)
Best SVM C: 10 (F1=0.9883)
Best ANN h: 40 (F1=0.9813)

Task 3(b): Test set F1 scores with best hyperparameters
Naive Bayes      F1 = 0.9677
kNN              F1 = 0.9778
SVM              F1 = 0.9890
ANN              F1 = 0.9890

Document classification and model analysis using Naive Bayes, k-Nearest Neighbors, Support Vector Machines, and Artificial Neural Networks

Liam Black Rohrer

Task 1(a):

Task 1(b):

Task 2(a):

Task 2(b):

Task 2(c):

Task 2(d):

Task 3(a):

Task 3(b):

Task 3(c):