import numpy as np
import matplotlib.pyplot as plt
import glob
def estimate_log_prob_params(strings):
    """
    Estimate the log multinomial probability parameters for a
    particular set of strings. This will be our model.

    Parameters
    ----------
    strings: list of string
        A list of strings

    Returns
    -------
    probs: dict
        word -> log probability in the model
    total: int
        Total number of words in the model, plus smoothing counts
    """
    probs = {}
    total = 0
    for s in strings:
        for word in s.split():
            if word not in probs:
                # Start each new word with a pseudocount of 1
                # (add-one smoothing) so no word ends up with
                # zero probability
                probs[word] = 1
                total += 1
            probs[word] += 1
            total += 1
    # Convert counts to log probabilities
    for word in probs:
        probs[word] = np.log(probs[word]/total)
    return (probs, total)
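As a quick sanity check, here is what the estimator produces on a tiny made-up corpus (toy strings, not from the debate data); the expected numbers in the comments just follow from the add-one smoothing above:

probs, total = estimate_log_prob_params(["the cat sat", "the dog sat"])
# 6 word occurrences + 4 smoothing pseudocounts gives total = 10
print(total)                  # 10
print(np.exp(probs["the"]))   # (2 + 1)/10 = 0.3
print(np.exp(probs["cat"]))   # (1 + 1)/10 = 0.2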
def get_log_probability_words(model, words):
    """
    Compute the log probability of a set of words using a model.

    Parameters
    ----------
    model: (probs, total)
        probs: dict
            word -> log probability in the model
        total: int
            Total number of words in the model, plus smoothing counts
    words: list of string
        Words to which to apply the model

    Returns
    -------
    float: The log probability of the set of words under the model
    """
    (probs, total) = model
    res = 0
    for word in words:
        if word not in probs:
            # If this word was never seen in training, give it a low
            # but nonzero probability before taking the log
            res += np.log(1/total)
        else:
            res += probs[word]
    return res
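Putting the two functions together gives the whole classification idea in miniature. Here is a hedged sketch on made-up two-class data (the strings and the two "speakers" are invented for illustration): build one model per class, then assign a test document to whichever model gives it the higher log probability.

model_a = estimate_log_prob_params(["apples and oranges", "apples are good"])
model_b = estimate_log_prob_params(["stocks and bonds", "bonds are safe"])
test = "apples are safe".split()
scores = [get_log_probability_words(m, test) for m in (model_a, model_b)]
print(np.argmax(scores))  # 0: the test words are more likely under model_a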
First, we load the first two debates and use them to build two different bag-of-words models: one for Trump and one for Clinton. Below is some code that uses lists so we don't have to repeat the same code too much.
categories = ["clinton", "trump"]
models = []
for c in categories:
    # Build one model per speaker from the first two debates
    strings = []
    for debate in [1, 2]:
        with open("text/2016Debates/{}{}.txt".format(c, debate)) as fin:
            strings.append(fin.read())
    model = estimate_log_prob_params(strings)
    models.append(model)
Next, we apply the models to each example in the third debate, and we get nearly perfect accuracy! In the process, we construct something called a "confusion matrix," where each row corresponds to the true class of the examples we're classifying, and each column counts how often those examples were classified as that class. The more counts along the diagonal, the better; for instance, the matrix [[10, 0], [1, 9]] would mean that 19 out of 20 examples were classified correctly.
# Apply the models to the test data from the third debate
confusion = np.zeros((2, 2), dtype=int)
for i, c in enumerate(categories):
    for f in glob.glob("text/2016Debates/{}3*.txt".format(c)):
        with open(f) as fin:
            words = fin.read().split()
        # Guess the speaker whose model assigns the highest log probability
        log_probs = [get_log_probability_words(models[j], words) for j in range(len(categories))]
        j = np.argmax(log_probs)
        confusion[i, j] += 1
#print(confusion, ":", np.sum(np.diag(confusion))/np.sum(confusion), "accuracy")
# Plot the confusion matrix: rows are the true speaker, columns are the guesses
plt.imshow(confusion)
for i in range(len(confusion)):
    for j in range(len(confusion)):
        plt.text(j, i, confusion[i, j])
plt.xticks(np.arange(len(confusion)), categories)
plt.yticks(np.arange(len(confusion)), categories)
plt.title("{:.1f}% Accuracy".format(100*np.sum(np.diag(confusion))/np.sum(confusion)));
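A note on the design: we sum log probabilities instead of multiplying raw probabilities because the product of thousands of numbers less than 1 underflows to zero in floating point, while the log-space sum stays perfectly representable. A standalone sketch (with made-up probabilities, separate from the pipeline above):

p = np.full(2000, 0.01)   # 2000 word probabilities of 0.01 each
print(np.prod(p))         # 0.0: the raw product underflows float64
print(np.sum(np.log(p)))  # about -9210.34: the log-space sum is fine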