###Standard Naive Bayes Classifier Implementation

import csv
import math
from collections import defaultdict, Counter

#Loading in CSV files from the training set which have a class label
def load_csv(filepath):
    """Read a labelled CSV file into a list of ``(id, class, abstract)`` tuples.

    The file needs a header row naming the 'id', 'class' and 'abstract'
    columns. Surrounding whitespace is stripped from the class label only;
    the id and the abstract are returned exactly as the csv module yields them.
    """
    with open(filepath, newline = '', encoding = 'utf-8') as handle:
        return [
            (entry['id'], entry['class'].strip(), entry['abstract'])
            for entry in csv.DictReader(handle)
        ]

#Loading in CSV files from the test set which do not have a class label
def load_csv_no_class(filepath):
    """Read an unlabelled CSV file into a list of ``(id, None, abstract)`` tuples.

    Only the 'id' and 'abstract' columns are read. The middle slot is always
    None so the tuples have the same layout as those produced for labelled data.
    """
    with open(filepath, newline = '', encoding = 'utf-8') as handle:
        return [
            (entry['id'], None, entry['abstract'])
            for entry in csv.DictReader(handle)
        ]

#Converting the text to usable tokens
def tokenise(text):
    """Lower-case *text* and split it on runs of whitespace.

    Punctuation is left attached to the neighbouring word; an empty or
    whitespace-only string yields an empty list.
    """
    lowered = text.lower()
    return lowered.split()

#Naive Bayes Classifier implementation
class NaiveBayesClassifier:
    """Multinomial Naive Bayes text classifier with add-one (Laplace) smoothing.

    After train() has run the instance holds:
      classes         sorted list of the distinct labels seen
      log_prior       label -> log of the label's share of the training records
      log_likelihood  label -> {token -> log P(token | label)}, with the extra
                      key '__UNK__' used for tokens outside the vocabulary
      vocab           set of every token seen during training
    """

    def __init__(self):
        #All model state starts empty and is filled in by train()
        self.classes = []
        self.log_prior = {}
        self.log_likelihood = {}
        self.vocab = set()

    def train(self, records):
        """Fit the model on records shaped ``(id, label, abstract)``."""
        labels = []
        documents = []
        for record in records:
            labels.append(record[1])
            documents.append(tokenise(record[2]))

        self.classes = sorted(set(labels))
        self.vocab = set()
        for document in documents:
            self.vocab.update(document)

        #Log prior: share of training records carrying each label
        label_freq = Counter(labels)
        n_docs = len(labels)
        self.log_prior = {}
        for cls in self.classes:
            self.log_prior[cls] = math.log(label_freq[cls] / n_docs)

        #Token frequencies accumulated separately for every label
        token_freq = {cls: Counter() for cls in self.classes}
        for document, label in zip(documents, labels):
            token_freq[label].update(document)

        #Add-one smoothing: each vocabulary token gets one pseudo-count, so the
        #denominator grows by the vocabulary size
        vocab_size = len(self.vocab)
        for cls in self.classes:
            freq = token_freq[cls]
            denominator = sum(freq.values()) + vocab_size
            table = {}
            for token in self.vocab:
                table[token] = math.log((freq[token] + 1) / denominator)
            table['__UNK__'] = math.log(1 / denominator)
            self.log_likelihood[cls] = table

    def predict_one(self, abstract):
        """Return the label with the highest log-posterior score for *abstract*.

        Ties go to the label that sorts first, because classes is sorted and
        max() keeps the first maximum it meets.
        """
        tokens = tokenise(abstract)
        totals = {}
        for cls in self.classes:
            table = self.log_likelihood[cls]
            fallback = table['__UNK__']
            #Accumulate token by token, starting from the prior
            running = self.log_prior[cls]
            for token in tokens:
                running += table.get(token, fallback)
            totals[cls] = running
        return max(self.classes, key=totals.__getitem__)

#Compiling predictions for all observations in a given data set
def predict_all(classifier, records):
    """Return ``(id, predicted_label)`` for every record, in input order.

    Each record is indexed positionally: element 0 is the id and element 2 is
    the abstract passed to ``classifier.predict_one``.
    """
    results = []
    for record in records:
        label = classifier.predict_one(record[2])
        results.append((record[0], label))
    return results

#Saving compiled predictions to a CSV file for kaggle submission
def save_predictions(predictions, filepath):
    """Write *predictions* to *filepath* as CSV under an 'id,class' header row.

    *predictions* is an iterable of ``(id, label)`` rows. Any existing file at
    *filepath* is overwritten.
    """
    header = ['id', 'class']
    with open(filepath, 'w', newline = '', encoding = 'utf-8') as out:
        sink = csv.writer(out)
        sink.writerow(header)
        for row in predictions:
            sink.writerow(row)

#Testing accuracy of a model on a labeled data set
def accuracy(classifier, records):
    """Return the fraction of *records* whose label the classifier predicts.

    Records are ``(id, true_label, abstract)`` tuples. An empty *records*
    raises ZeroDivisionError.
    """
    hits = sum(
        1
        for _, expected, abstract in records
        if classifier.predict_one(abstract) == expected
    )
    return hits / len(records)



#Training on the training set
#load_csv reads the 'id', 'class' and 'abstract' columns of 'trg.csv'
train_records = load_csv('trg.csv')
naive_bayes = NaiveBayesClassifier()
naive_bayes.train(train_records)

#Print accuracy on training set
#The model is scored on the same records it was fitted on, so this figure is
#not an estimate of accuracy on unseen data
train_accuracy = accuracy(naive_bayes, train_records)
print("Training accuracy:", round(train_accuracy, 4))

#Predicting labels of the test set and saving predictions to a CSV file
#NOTE: a later section of this file also writes to 'predictions.csv'
test_records = load_csv_no_class('tst.csv')  # the label slot of every test record is None
predictions = predict_all(naive_bayes, test_records)
save_predictions(predictions, 'predictions.csv')

# Output of the code above:
# Training accuracy: 0.9778

###Improved Naive Bayes Classifier Implementation

import csv
import math
from collections import defaultdict, Counter
import re

#Loading in CSV files from the training set which have a class label
def load_csv(filepath):
    """Read a labelled CSV file into a list of ``(id, class, abstract)`` tuples.

    The file needs a header row naming the 'id', 'class' and 'abstract'
    columns. Surrounding whitespace is stripped from the class label only;
    the id and the abstract are returned exactly as the csv module yields them.
    """
    with open(filepath, newline = '', encoding = 'utf-8') as handle:
        return [
            (entry['id'], entry['class'].strip(), entry['abstract'])
            for entry in csv.DictReader(handle)
        ]

#Loading in CSV files from the test set which do not have a class label
def load_csv_no_class(filepath):
    """Read an unlabelled CSV file into a list of ``(id, None, abstract)`` tuples.

    Only the 'id' and 'abstract' columns are read. The middle slot is always
    None so the tuples have the same layout as those produced for labelled data.
    """
    with open(filepath, newline = '', encoding = 'utf-8') as handle:
        return [
            (entry['id'], None, entry['abstract'])
            for entry in csv.DictReader(handle)
        ]

#Converting the text to usable tokens, removing stop-words, and the commented out code for including bigrams
#Common English function words dropped by tokenise. Every entry must be
#lower-case, because tokenise lower-cases the text before filtering.
_STOPWORDS = frozenset({
    "the", "of", "and", "to", "a", "in", "is", "it", "you", "that", "he", "was", "for", "on",
    "are", "as", "with", "his", "they", "i", "at", "be", "this", "have", "from", "or", "one",
    "had", "by", "but", "not", "what", "all", "were", "we", "when", "your", "can", "said",
    "there", "use", "an", "each", "which", "she", "do", "how", "their", "if", "will",
})


def tokenise(text, *, bigrams = False):
    """Lower-case *text*, split it on whitespace and drop stop-words.

    Args:
        text: the raw string to tokenise.
        bigrams: when True, also append one 'left_right' token for every pair
            of adjacent surviving words. Off by default, so a plain
            ``tokenise(text)`` call returns unigrams only.

    Returns:
        A list of tokens: the unigrams in their original order, followed by
        the bigrams when requested.
    """
    words = [w for w in text.lower().split() if w not in _STOPWORDS]
    if not bigrams:
        return words
    #Pair each word with its successor; the last word has none
    pairs = ['_'.join(pair) for pair in zip(words, words[1:])]
    return words + pairs

#Naive Bayes Classifier implementation
class NaiveBayesClassifier:
    """Multinomial Naive Bayes text classifier with tunable smoothing and pruning.

    Args:
        alpha: additive (Laplace/Lidstone) smoothing constant; must be positive.
        min_occ: a token is kept in the vocabulary only if it occurs at least
            this many times across all training records.

    After train() has run the instance holds:
      classes         sorted list of the distinct labels seen
      log_prior       label -> log of the label's share of the training records
      log_likelihood  label -> {token -> log P(token | label)}, with the extra
                      key '__UNK__' used for tokens outside the vocabulary
      vocab           set of tokens that met the min_occ threshold

    Raises:
        ValueError: if alpha is not positive.
    """

    #Initializing class variables
    def __init__(self, alpha = 1, min_occ = 1):
        #A non-positive alpha makes the unseen-token probability zero or
        #negative, which math.log rejects; fail here with a clear message
        if alpha <= 0:
            raise ValueError(f"alpha must be positive, got {alpha!r}")
        self.alpha = alpha
        self.min_occ = min_occ
        self.classes = []
        self.log_prior = {}
        self.log_likelihood = {}
        self.vocab = set()

    #Training the model
    def train(self, records):
        """Fit the model on records shaped ``(id, label, abstract)``.

        Any state left by an earlier train() call is replaced.
        """
        labels = [r[1] for r in records]
        token_lists = [tokenise(r[2]) for r in records]
        self.classes = sorted(set(labels))

        #Calculating prior probabilities in log space
        label_counts = Counter(labels)
        n = len(labels)
        self.log_prior = {c: math.log(label_counts[c] / n) for c in self.classes}

        #Summing word counts per class and over the whole training set
        word_counts = {c: Counter() for c in self.classes}
        corpus_counts = Counter()
        for tokens, label in zip(token_lists, labels):
            word_counts[label].update(tokens)
            corpus_counts.update(tokens)

        #Applying minimum word occurrence threshold
        self.vocab = {t for t, seen in corpus_counts.items() if seen >= self.min_occ}

        #Calculating likelihoods in log space using laplace smoothing and an adjustable alpha value
        V = len(self.vocab)
        #Start from an empty table so labels from an earlier fit do not linger
        self.log_likelihood = {}
        for c in self.classes:
            counts = word_counts[c]
            if V == 0:
                #No token survived pruning, so tokens carry no evidence:
                #every token scores 0 and the priors alone decide
                self.log_likelihood[c] = {'__UNK__': 0.0}
                continue
            #Only retained tokens enter the denominator; counting pruned
            #tokens too would leave the retained probabilities summing to
            #less than one by a class-dependent amount
            total = sum(counts[t] for t in self.vocab) + self.alpha * V
            table = {t: math.log((counts[t] + self.alpha) / total) for t in self.vocab}
            table['__UNK__'] = math.log(self.alpha / total)
            self.log_likelihood[c] = table

    #Predicting the class of a single observation
    def predict_one(self, abstract):
        """Return the label with the highest log-posterior score for *abstract*.

        Tokens outside the vocabulary are scored with the '__UNK__' entry.
        Ties go to the label that sorts first.

        Raises:
            ValueError: if the classifier has no classes, i.e. train() has not
                been run on any records.
        """
        if not self.classes:
            raise ValueError("classifier has not been trained on any records")
        tokens = tokenise(abstract)
        scores = {}
        for c in self.classes:
            table = self.log_likelihood[c]
            unknown = table['__UNK__']
            score = self.log_prior[c]
            for t in tokens:
                score += table.get(t, unknown)
            scores[c] = score
        return max(scores, key=scores.get)

#Compiling predictions for all observations in a given data set
def predict_all(classifier, records):
    """Return ``(id, predicted_label)`` for every record, in input order.

    Each record is indexed positionally: element 0 is the id and element 2 is
    the abstract passed to ``classifier.predict_one``.
    """
    results = []
    for record in records:
        label = classifier.predict_one(record[2])
        results.append((record[0], label))
    return results

#Saving compiled predictions to a CSV file for kaggle submission
def save_predictions(predictions, filepath):
    """Write *predictions* to *filepath* as CSV under an 'id,class' header row.

    *predictions* is an iterable of ``(id, label)`` rows. Any existing file at
    *filepath* is overwritten.
    """
    header = ['id', 'class']
    with open(filepath, 'w', newline = '', encoding = 'utf-8') as out:
        sink = csv.writer(out)
        sink.writerow(header)
        for row in predictions:
            sink.writerow(row)

#Testing accuracy of a model on a labeled data set
def accuracy(classifier, records):
    """Return the fraction of *records* whose label the classifier predicts.

    Records are ``(id, true_label, abstract)`` tuples. An empty *records*
    raises ZeroDivisionError.
    """
    hits = sum(
        1
        for _, expected, abstract in records
        if classifier.predict_one(abstract) == expected
    )
    return hits / len(records)

#Cross validation to test hyper-parameters
def cross_validate(records, alpha = 1, min_occ = 1, k=5):
    """Estimate accuracy for one hyper-parameter setting with k-fold cross-validation.

    The records are cut, in their given order, into k contiguous folds of
    ``len(records) // k`` items each. Every fold is held out once while a
    fresh classifier is trained on all the other records. Records left over
    after the k-th fold are never held out but are always trained on.

    Args:
        records: list of ``(id, label, abstract)`` tuples.
        alpha: smoothing constant passed to NaiveBayesClassifier.
        min_occ: minimum token occurrence passed to NaiveBayesClassifier.
        k: number of folds; at least 2.

    Returns:
        The mean of the k validation accuracies, which is also printed.

    Raises:
        ValueError: if k is below 2 or there are fewer records than folds.
    """
    #With one fold there would be nothing left to train on
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k!r}")
    fold_size = len(records) // k
    #An empty validation fold would make accuracy() divide by zero
    if fold_size == 0:
        raise ValueError(f"need at least {k} records for {k}-fold cross-validation, got {len(records)}")

    accs = []
    for i in range(k):
        start, stop = i * fold_size, (i + 1) * fold_size
        val = records[start:stop]
        train = records[:start] + records[stop:]
        candidate = NaiveBayesClassifier(alpha=alpha, min_occ = min_occ)
        candidate.train(train)
        accs.append(accuracy(candidate, val))

    mean_accuracy = sum(accs) / len(accs)
    print(f"\nMean accuracy: {mean_accuracy:.4f}")
    return mean_accuracy



#Testing hyper-parameters with cross-validation
#load_csv reads the 'id', 'class' and 'abstract' columns of 'trg.csv'
train_records = load_csv('trg.csv')
#Each cross_validate call prints one mean accuracy; alpha stays at its default
#of 1 while min_occ is varied
print("Tuning minimum word occurrence:")
cross_validate(train_records, min_occ = 1, k = 5)
cross_validate(train_records, min_occ = 2, k = 5)
cross_validate(train_records, min_occ = 3, k = 5)
#min_occ stays at its default of 1 while alpha is varied
print("Tuning alpha:")
cross_validate(train_records, alpha = 2, k = 5)
cross_validate(train_records, alpha = 1, k = 5)
cross_validate(train_records, alpha = 0.5, k = 5)
cross_validate(train_records, alpha = 0.8, k = 5)
cross_validate(train_records, alpha = 0.6, k = 5)
cross_validate(train_records, alpha = 0.7, k = 5)


#Training on the training set using optimized hyper-parameters
#alpha = 0.7 has the highest mean accuracy in the output recorded below this
#code (0.9572); min_occ is left at its default of 1, which scored best of 1, 2, 3
opt_naive_bayes = NaiveBayesClassifier(alpha = 0.7)
opt_naive_bayes.train(train_records)

#Print accuracy on training set
#The model is scored on the same records it was fitted on, so this figure is
#not an estimate of accuracy on unseen data
train_accuracy = accuracy(opt_naive_bayes, train_records)
print("Training accuracy:", round(train_accuracy, 4))

#Predicting labels of the test set and saving predictions to a CSV file
#NOTE: an earlier section of this file also writes to 'predictions.csv'; this
#call overwrites that file
test_records = load_csv_no_class('tst.csv')
predictions = predict_all(opt_naive_bayes, test_records)
save_predictions(predictions, 'predictions.csv')

# Output of the code above:
# Tuning minimum word occurrence:
#
# Mean accuracy: 0.9517
#
# Mean accuracy: 0.9295
#
# Mean accuracy: 0.8562
# Tuning alpha:
#
# Mean accuracy: 0.9303
#
# Mean accuracy: 0.9517
#
# Mean accuracy: 0.9560
#
# Mean accuracy: 0.9553
#
# Mean accuracy: 0.9563
#
# Mean accuracy: 0.9572
# Training accuracy: 0.9908