Supervised Learning: model training and evaluation

Caleb O'Neel

Learning Objectives:

This project provides structured practice to help you:

  1. Understand the primary workflow in machine learning: (1) identifying a hypothesis set of models, (2) determining a loss/cost/error/objective function to minimize, and (3) minimizing that function through gradient descent
  2. Understand the inner workings of logistic regression and how linear models for classification can be developed.
  3. Gain practice in implementing machine learning algorithms from the most basic building blocks, understanding the math and programming behind them to achieve practical proficiency with the techniques
  4. Implement batch gradient descent and become familiar with how that technique is used and its dependence on the choice of learning rate
  5. Evaluate supervised learning algorithm performance through ROC curves and using cross validation
  6. Develop an understanding of the optimal minimum-misclassification-error classifier (the Bayes' classifier)

1. [50 points] Classification using logistic regression: build it from the ground up

I. Load, prepare, and plot your data

You are given some data for which you are tasked with constructing a classifier. The first step when facing any machine learning project: look at your data!

(a) Load the data.

(b) Do the data require any preprocessing due to missing values, scale differences, etc.? If so, how did you handle these issues?

Next, we walk through our key steps for model fitting: choose a hypothesis set of models to train (in this case, logistic regression); identify a cost function to measure the model fit to our training data; optimize model parameters to minimize cost (in this case using gradient descent). Once we've completed model fitting, we will evaluate the performance of our model and compare performance to another approach (a KNN classifier).

This is a classification problem, so right away we know a logistic rather than a linear regression would be more appropriate if we want to use some form of regression. The data appear fairly well segmented, and I expect that a logistic regression classifier will have a lot of success distinguishing between the two classes. There are several observations that do not conform to the general location of the other observations in their class, so I do not expect a near-perfect classification rate. That said, there does appear to be a general trend to the data, and there are enough observations that a logistic regression would be a good choice for finding this decision boundary.

b.

Looking at the data, there are no missing values, but there is a scale issue: x1 and x2 have fairly large differences in their mean and standard deviation values. To deal with this I will standardize both columns by subtracting the mean and dividing by the standard deviation.
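A minimal sketch of this standardization (the function name is an illustrative choice):

```python
import numpy as np

def standardize(X):
    """Standardize each column: subtract its mean, divide by its standard deviation."""
    return (X - X.mean(axis=0)) / X.std(axis=0)

# Example usage on a toy feature matrix (columns: x1, x2).
X = np.array([[1.0, 200.0], [2.0, 240.0], [3.0, 280.0]])
X_std = standardize(X)
print(X_std.mean(axis=0), X_std.std(axis=0))  # approximately 0 and 1 per column
```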

II. Stating the hypothesis set of models to evaluate (we'll use logistic regression)

Given that our data consists of two features, our logistic regression problem will be applied to a two-dimensional feature space. Recall that our logistic regression model is:

$$f(\mathbf{x}_i,\mathbf{w})=\sigma(\mathbf{w}^{\top} \mathbf{x}_i)$$

where the sigmoid function is defined as $\sigma(x) = \dfrac{e^x}{1+e^{x}}= \dfrac{1}{1+e^{-x}}$. Also, since this is a two-dimensional problem, we define $\mathbf{w}^{\top} \mathbf{x}_i = w_0 x_{i,0} + w_1 x_{i,1} + w_2 x_{i,2}$ and here, $\mathbf{x}_i=[x_{i,0}, x_{i,1}, x_{i,2}]^{\top}$, and $x_{i,0} \triangleq 1$

Remember from class that we interpret our logistic regression classifier output (or confidence score) as the conditional probability that the target variable for a given sample $y_i$ is from class "1", given the observed features, $\mathbf{x}_i$. For one sample, $(y_i, \mathbf{x}_i)$, this is given as:

$$P(Y=1|X=\mathbf{x}_i) = f(\mathbf{x}_i,\mathbf{w})=\sigma(\mathbf{w}^{\top} \mathbf{x}_i)$$

In the context of maximizing the likelihood of our parameters given the data, we define this to be the likelihood function $L(\mathbf{w}|y_i,\mathbf{x}_i)$, corresponding to one sample observation from the training dataset.

Aside: the careful reader will recognize this expression looks different from when we talk about the likelihood of our data given the true class label, typically expressed as $P(x|y)$, or the posterior probability of a class label given our data, typically expressed as $P(y|x)$. In the context of training a logistic regression model, the likelihood we are interested in is the likelihood function of our logistic regression parameters, $\mathbf{w}$. It's our goal to use this to choose the parameters to maximize the likelihood function.

III. Find the cost function that we can use to choose the model parameters, $\mathbf{w}$, that best fit the training data.

(c) What is the likelihood function that corresponds to all the $N$ samples in our training dataset that we will wish to maximize? Unlike the likelihood function written above which gives the likelihood function for a single training data pair $(y_i, \mathbf{x}_i)$, this question asks for the likelihood function for the entire training dataset $\{(y_1, \mathbf{x}_1), (y_2, \mathbf{x}_2), ..., (y_N, \mathbf{x}_N)\}$.

(d) Since a logarithm is a monotonic function, maximizing $f(x)$ is equivalent to maximizing $\ln [f(x)]$. Express the likelihood from the last question as a cost function of the model parameters, $C(\mathbf{w})$; that is, the negative of the logarithm of the likelihood.

(e) Calculate the gradient of the cost function with respect to the model parameters $\nabla_{\mathbf{w}}C(\mathbf{w})$. Express this in terms of the partial derivatives of the cost function with respect to each of the parameters, e.g. $\nabla_{\mathbf{w}}C(\mathbf{w}) = \left[\dfrac{\partial C}{\partial w_0}, \dfrac{\partial C}{\partial w_1}, \dfrac{\partial C}{\partial w_2}\right]$.

To simplify notation, please use $\mathbf{w}^{\top}\mathbf{x}$ instead of writing out $w_0 x_{i,0} + w_1 x_{i,1} + w_2 x_{i,2}$ each time it appears (where $x_{i,0} = 1$ for all $i$). You are also welcome to use $\sigma()$ to represent the sigmoid function. Lastly, this will be a function of the features, $x_{i,j}$ (with the first index in the subscript representing the observation and the second the feature); the targets, $y_i$; and the logistic regression model parameters, $w_j$.

(f) Write out the gradient descent update equation. This should clearly express how to update each weight from one step in gradient descent $w_j^{(k)}$ to the next $w_j^{(k+1)}$. There should be one equation for each logistic regression model parameter (or you can represent it in vectorized form). Assume that $\eta$ represents the learning rate.

c.

Since each $y_i$ is an independent Bernoulli outcome with success probability $\sigma(\mathbf{w}^{\top}\mathbf{x}_i)$, the likelihood of the full training dataset is the product of the per-sample likelihoods:

$$ L(\mathbf{w}|\mathbf{y},X) = \prod_{i = 1}^{N} P(y_{i}|\mathbf{x}_{i},\mathbf{w}) = \prod_{i = 1}^{N} \sigma(\mathbf{w}^{\top}\mathbf{x}_{i})^{y_{i}}\left(1-\sigma(\mathbf{w}^{\top}\mathbf{x}_{i})\right)^{1-y_{i}} $$

Where: $$\sigma(\mathbf{w}^{\top}\mathbf{x}_{i}) = \frac{1}{1+e^{-\mathbf{w}^{\top}\mathbf{x}_{i}}}$$

d.

$$ \ln L(\mathbf{w}|\mathbf{y},X) = \sum_{i=1}^{N} \left[ y_{i}\ln(\hat{y}_{i}) + (1-y_{i})\ln(1-\hat{y}_{i}) \right] $$

$$ C(\mathbf{w}) = -\ln L(\mathbf{w}|\mathbf{y},X) = -\sum_{i=1}^{N} \left[ y_{i}\ln(\hat{y}_{i}) + (1-y_{i})\ln(1-\hat{y}_{i}) \right] $$

In this case, $\hat{y}_{i}$ is our predicted value, the output of the sigmoid function:

$$ \hat{y}_{i} = \sigma(\mathbf{w}^{\top}\mathbf{x}_{i}) = \frac{1}{1+e^{-\mathbf{w}^{\top}\mathbf{x}_{i}}} $$

e.

$$ C(\mathbf{w}) = -\sum_{i=1}^{N} \left[ y_{i}\ln(\hat{y}_{i}) + (1-y_{i})\ln(1-\hat{y}_{i}) \right], \qquad \hat{y}_{i} = \sigma(\mathbf{w}^{\top}\mathbf{x}_{i}) $$

Applying the chain rule with the identity $\sigma'(z) = \sigma(z)(1-\sigma(z))$, the logarithm and sigmoid derivatives cancel so that each partial derivative reduces to the prediction error times the corresponding feature:

$$ \nabla_{\mathbf{w}}C(\mathbf{w}) = \left[\dfrac{\partial C}{\partial w_0}, \dfrac{\partial C}{\partial w_1}, \dfrac{\partial C}{\partial w_2}\right] $$

$$ \dfrac{\partial C}{\partial w_0} = \sum_{i=1}^N \left(\sigma(\mathbf{w}^{\top}\mathbf{x}_{i}) - y_i\right) \cdot 1 $$

$$ \dfrac{\partial C}{\partial w_1} = \sum_{i=1}^N \left(\sigma(\mathbf{w}^{\top}\mathbf{x}_{i}) - y_i\right) x_{i,1} $$

$$ \dfrac{\partial C}{\partial w_2} = \sum_{i=1}^N \left(\sigma(\mathbf{w}^{\top}\mathbf{x}_{i}) - y_i\right) x_{i,2} $$

f.

$$ w_j^{(k+1)} = w_j^{(k)} - \eta \dfrac{\partial C}{\partial w_{j}} $$

$$ w_j^{(k+1)} = w_j^{(k)} - \eta \sum_{i=1}^N \left(\sigma(\mathbf{w}^{\top}\mathbf{x}_{i}) - y_i\right) x_{i,j} $$

where $j$ indexes the model parameters (here $j \in \{0, 1, 2\}$, with $x_{i,0} = 1$ so that $w_0$ is the intercept). Note the minus sign: gradient descent moves against the gradient to reduce the cost.

IV. Implement gradient descent and your logistic regression algorithm

(g) Implement your logistic regression model.

This structure is strongly encouraged; however, you're welcome to adjust this to your needs (adding helper methods, modifying parameters, etc.).
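A minimal sketch of one possible implementation with batch gradient descent (the class name, parameters, and stopping rule here are illustrative choices, not a required interface):

```python
import numpy as np

class LogisticRegressionGD:
    """Logistic regression trained with batch gradient descent."""

    def __init__(self, learning_rate=0.01, max_iter=10000, tol=1e-6):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.tol = tol          # stop once the largest weight update is this small
        self.w = None
        self.cost_history = []  # cost at each iteration, for plotting

    @staticmethod
    def _sigmoid(z):
        # Clip z to avoid overflow in exp() for extreme arguments.
        return 1.0 / (1.0 + np.exp(-np.clip(z, -500, 500)))

    def _add_intercept(self, X):
        # Prepend a column of ones so w[0] plays the role of w_0 (x_{i,0} = 1).
        return np.column_stack([np.ones(len(X)), X])

    def _cost(self, Xb, y):
        # Negative log-likelihood (cross-entropy); eps guards against log(0).
        eps = 1e-12
        y_hat = self._sigmoid(Xb @ self.w)
        return -np.sum(y * np.log(y_hat + eps) + (1 - y) * np.log(1 - y_hat + eps))

    def fit(self, X, y):
        Xb = self._add_intercept(np.asarray(X, dtype=float))
        y = np.asarray(y, dtype=float)
        self.w = np.zeros(Xb.shape[1])
        for _ in range(self.max_iter):
            grad = Xb.T @ (self._sigmoid(Xb @ self.w) - y)  # dC/dw, vectorized
            step = self.learning_rate * grad
            self.w -= step  # descend: move against the gradient
            self.cost_history.append(self._cost(Xb, y))
            if np.max(np.abs(step)) < self.tol:
                break
        return self

    def predict_proba(self, X):
        return self._sigmoid(self._add_intercept(np.asarray(X, dtype=float)) @ self.w)

    def predict(self, X, threshold=0.5):
        return (self.predict_proba(X) >= threshold).astype(int)
```

The gradient line is the vectorized form of the update equation from part (f): `Xb.T @ (sigmoid(Xb @ w) - y)` stacks $\sum_i (\sigma(\mathbf{w}^{\top}\mathbf{x}_i) - y_i)\, x_{i,j}$ for every $j$ at once.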

h.

The learning rate determines how quickly the model adjusts the parameter weights. As shown in the graph above, the larger the learning rate, the faster the cost dropped towards 0. When the learning rate was 1, the cost function dropped close to 0 within the first few steps. This is reinforced by the output of the first 50 weights printed above, where the first handful of steps produce large changes in the weights before rapidly leveling off. Returning to the graph, the learning rate of $10^{-2}$ was the second fastest to approach 0, followed by $10^{-4}$, while the model with a rate of $10^{-6}$ made little progress because the minimum step-size threshold was reached almost immediately. A larger learning rate may help you arrive at the optimized weights faster, but if the learning rate is too large, you risk overshooting the minimum and never descending to the optimized cost and weights.

For the purposes of this data, I think a learning rate of $10^{-2}$ is best: it quickly minimizes the cost, but it does not risk being too large to optimize the weights effectively.
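A sketch of how the learning-rate comparison above could be generated, reusing the hypothetical LogisticRegressionGD class from the previous sketch (X_train and y_train are assumed to be the standardized features and binary labels):

```python
import matplotlib.pyplot as plt

# Train one model per learning rate and record the cost trajectory of each.
for lr in [1, 1e-2, 1e-4, 1e-6]:
    model = LogisticRegressionGD(learning_rate=lr, max_iter=5000)
    model.fit(X_train, y_train)
    plt.plot(model.cost_history, label=f"learning rate = {lr}")

plt.xlabel("Gradient descent iteration")
plt.ylabel("Cost (negative log-likelihood)")
plt.legend()
plt.show()
```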

V. Evaluate your model performance through cross validation

(i) Test the performance of your trained classifier using the K-folds cross validation resampling technique. The scikit-learn class StratifiedKFold may be helpful.

The purpose of k-fold cross validation is to get a more accurate sense of what the test error will be. Often a model only has access to the training data, and training error is not a reliable predictor of the test error rate; relying on it alone can mask over- or underfitting. One workaround is to split your training data into training and validation sets, but this limits the amount of data available to train your model. K-fold cross validation offers a partial solution: instead of a single split, you divide the data into k folds (in this case 10), train the model on all folds except the one being withheld, and then use the withheld fold as the test set. This process is repeated across all folds, which lets you train your model and fine-tune parameters while getting an idea of what the test error could be.
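A sketch of the resampling loop, assuming the LogisticRegressionGD class from earlier and NumPy arrays X (standardized features) and y (binary labels):

```python
import numpy as np
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
fold_accuracies = []

for train_idx, test_idx in skf.split(X, y):
    # Retrain from scratch on the 9 training folds, score on the held-out fold.
    model = LogisticRegressionGD(learning_rate=1e-2)
    model.fit(X[train_idx], y[train_idx])
    preds = model.predict(X[test_idx])
    fold_accuracies.append(np.mean(preds == y[test_idx]))

print(f"Mean CV accuracy: {np.mean(fold_accuracies):.3f}")
```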

I could not get my ROC curve working, unfortunately. However, I would expect the models to perform similarly on this dataset given the data distribution and shape.
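For reference, a sketch of how the ROC curve could be computed with scikit-learn, assuming labels y_true and matching confidence scores y_score accumulated from the held-out folds above:

```python
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

fpr, tpr, _ = roc_curve(y_true, y_score)  # sweep the decision threshold
plt.plot(fpr, tpr, label=f"AUC = {auc(fpr, tpr):.3f}")
plt.plot([0, 1], [0, 1], linestyle="--", label="chance")
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend()
plt.show()
```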


2. [30 points] Digits classification

An exploration of regularization, imbalanced classes, ROC and PR curves

Load your dataset from the MNIST dataset of handwritten digits, using the code provided below. MNIST has a training set of 60,000 examples, and a test set of 10,000 examples. The digits have been size-normalized and centered in a fixed-size image.

Your goal is to classify whether or not an example digit is a 3. Your binary classifier should predict $y=1$ if the digit is a 3, and $y=0$ otherwise. Create your dataset by transforming your labels into a binary format (3's are class 1, and all other digits are class 0).
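A minimal sketch of the label transformation, assuming integer digit labels y_train and y_test are already loaded (e.g. via sklearn.datasets.fetch_openml('mnist_784'), whose labels come back as strings):

```python
import numpy as np

# Binary target: 1 if the digit is a 3, 0 otherwise.
y_train_binary = (np.asarray(y_train).astype(int) == 3).astype(int)
y_test_binary = (np.asarray(y_test).astype(int) == 3).astype(int)
```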

(a) Plot 10 examples of each class (i.e. class $y=0$, which are not 3's and class $y=1$ which are 3's), from the training dataset.

(b) How many examples are present in each class? Show a histogram of samples by class. What fraction of samples are positive? What issues might this cause?

(c) Using a logistic regression classifier, apply lasso regularization, retrain the model, and evaluate its performance over a range of values of the regularization coefficient. You can implement this using the LogisticRegression module and activating the 'l1' penalty; the parameter $C$ is the inverse of the regularization strength. Vary the value of C logarithmically from $10^{-4}$ to $10^4$ (and make your x-axes logarithmic in scale), evaluating it at 20 different values of C. As you vary the regularization coefficient, plot the model's performance.

(d) Train and test a (1) logistic regression classifier with minimal regularization (using the Scikit Learn package, set penalty='l1', C=1e100 to approximate this), (2) a logistic regression classifier with the best value of the regularization parameter from the last section, (3) a Linear Discriminant Analysis (LDA) Classifier, and (4) a Random Forest (RF) classifier (using default parameters for the LDA and RF classifiers).

ANSWER

a.

b.
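A sketch of the class-balance check, assuming the binary labels from the sketch above:

```python
import numpy as np
import matplotlib.pyplot as plt

# Count training examples per binary class and plot the histogram.
values, counts = np.unique(y_train_binary, return_counts=True)
plt.bar(values, counts, tick_label=["not a 3 (y=0)", "3 (y=1)"])
plt.ylabel("Number of training examples")
plt.show()

print(f"Fraction of positive samples: {counts[1] / counts.sum():.4f}")
```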

c.
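A sketch of the regularization sweep, assuming X_train, X_test and the binary labels from the sketches above:

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

cs = np.logspace(-4, 4, 20)  # 20 values of C from 1e-4 to 1e4
test_accuracy = []

for c in cs:
    # liblinear supports the 'l1' penalty; C is the inverse regularization strength.
    clf = LogisticRegression(penalty="l1", C=c, solver="liblinear", max_iter=1000)
    clf.fit(X_train, y_train_binary)
    test_accuracy.append(clf.score(X_test, y_test_binary))

plt.semilogx(cs, test_accuracy)
plt.xlabel("C (inverse of regularization strength)")
plt.ylabel("Test accuracy")
plt.show()
```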

d.

In this section I created four different models to classify the data. The first was a lasso logistic regression with minimal regularization. The second was also a logistic regression, but using the best regularization value of C from the last section, which I judged to be $10^{-2}$. The third was a Linear Discriminant Analysis classifier, and the final model was a Random Forest classifier. After training and fitting these models and using them to create predictions, I plotted their ROC and Precision-Recall curves, which can be seen above.
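A sketch of how the four models could be fit and scored, under the same assumptions as the sketches above:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc

models = {
    "LR, minimal regularization": LogisticRegression(penalty="l1", C=1e100, solver="liblinear", max_iter=1000),
    "LR, C = 1e-2": LogisticRegression(penalty="l1", C=1e-2, solver="liblinear", max_iter=1000),
    "LDA": LinearDiscriminantAnalysis(),
    "Random Forest": RandomForestClassifier(),
}

for name, model in models.items():
    model.fit(X_train, y_train_binary)
    scores = model.predict_proba(X_test)[:, 1]  # confidence that the digit is a 3
    fpr, tpr, _ = roc_curve(y_test_binary, scores)
    print(f"{name}: AUC = {auc(fpr, tpr):.2f}")
```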

A ROC (Receiver Operating Characteristic) curve is used as an indicator of a model's accuracy, with the true positive rate on the y-axis and the false positive rate on the x-axis. The best models will have a true positive rate close to 1 and a false positive rate close to 0, meaning that the further to the upper left the curve is, the better the model. The AUC (Area Under the Curve) is the area beneath and to the right of the ROC curve, and thus the larger the area under the curve, the more accurate the model is deemed to be.

A Precision-Recall curve shares the same general concepts as a ROC curve, but the precision score is on the y-axis and the recall score is on the x-axis. Precision is calculated by dividing the total true positives by the sum of true positives and false positives. Recall is calculated by dividing the total true positives by the sum of true positives and false negatives. We want to maximize both the precision and recall scores, so the best PR curves sit in the upper right of the graph.
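In symbols, with $TP$, $FP$, and $FN$ denoting the counts of true positives, false positives, and false negatives:

$$ \text{Precision} = \frac{TP}{TP + FP}, \qquad \text{Recall} = \frac{TP}{TP + FN} $$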

For the AUC scores, the two Logistic Regression classifiers performed almost identically, with both achieving an Area Under the Curve score of 0.98. The Linear Discriminant Analysis classifier was not far behind with an AUC of 0.97. The Random Forest Classifier performed the worst with an AUC of 0.94.

We saw the same general trends reflected in the PR curve, with the two logistic regression models performing best, LDA not far behind, and the random forest classifier performing worst. In this graph, the second logistic regression model, with a regularization value of $10^{-2}$, appears to have performed slightly better than the default model with minimal regularization.

Because of its slightly better performance on the PR curve, I believe the second lasso logistic regression model was the best of these options, and I would choose to apply it to an unseen dataset because it appears to perform best on test data. However, the difference in performance from the logistic regression model with minimal regularization is very slight.

3. [20 points] Comparing the Bayes' decision rule with logistic regression

The phrase "Bayes' decision rule" is often used to describe a classifier decision rule that minimizes misclassification rate (equally penalizing false positives and false negatives) for a given problem. In this exercise you will first determine the Bayes' decision rule for a binary classification problem where you know the likelihood of data from each class.

This binary classification problem has two target classes with data distributed as exponential random variables:

$$P(x|C_i) = \lambda_i e^{-\lambda_i x}$$

Where $C_i$ represents the class from which the sample is drawn (0 or 1). This is known as the class-conditional likelihood, not surprisingly because it is the likelihood of the data conditioned on knowing what class it came from. This represents two separate density functions: one for the case when the class is 0 ($P(x|C_0)$) and one for when the class is 1 ($P(x|C_1)$). Assume that we know that $\lambda_0 = 5$ and $\lambda_1 = 1$ to fully-specify those distributions.

(a) Plot the probability density of each class-conditional distribution (i.e. the likelihood function), $P(x|C_0)$ and $P(x|C_1)$, on the same plot in the domain $x \in [0,2]$. You can use scipy's expon module for this. Note that the scale parameter for this module is defined as $1/\lambda$.

(b) Assuming the prior class distributions are $P(C_0)=P(C_1)=0.5$, determine the Bayes' decision rule using the information above. Remember that the Bayes Decision rule can be defined using the posterior distributions of the data; when $P(Y=1|x)>P(Y=0|x)$, predict Class 1, otherwise predict Class 0. In that way, you will assign the most probable class to the data based on the value of $x$. The decision rule will then be of the form:

If $x > x^*$, then predict Class 1, otherwise predict Class 0

Determine the value $x^*$ that minimizes misclassification (equally penalizing false positives and false negatives, and no penalty/reward for correct detections). Show your work in deriving this value.

(c) How does your answer in (b) relate to the plot you made in (a)?

(d) What if instead, $P(C_1)=2P(C_0)$; what would the new value of $x^*$ be in this case? Before computing the value, think through how you would expect it to change, then see if the math supports your conclusion.

(e) Load the test data in the file A3_Q3_test.csv, which follows the distributions above. Apply your Bayes decision rule to the data. What is the misclassification rate (error rate, or fraction of misclassified samples) of this decision rule? This should be the best that any algorithm could achieve (on average).

(f) Load the training data in the file A3_Q3_train.csv (which follows the distributions above) and train a logistic regression classifier on the data (using default parameters) from Scikit-Learn's LogisticRegression module. What is your misclassification error for your test dataset? How does this compare with the Bayes' classifier?

(g) What is your decision rule for the logistic regression model you just trained? To compute this, extract the parameters from your fit model (look for the coef_ and intercept_ attributes) and since the classes are balanced, the decision rule will be to classify a sample $x$ as Class 1 when your logistic regression sigmoid is greater than 0.5 (the halfway point from the two extremes of 0 and 1), since we assume $P(C_1|x)=\sigma(w_0 + w_1 x)$ in logistic regression. How does the decision rule from logistic regression compare with the Bayes' classifier?

ANSWER

a.
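A sketch of this plot using scipy's expon module, as the question suggests (recall that scale $= 1/\lambda$):

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import expon

x = np.linspace(0, 2, 500)
# Class-conditional likelihoods with lambda_0 = 5 and lambda_1 = 1.
plt.plot(x, expon.pdf(x, scale=1/5), label=r"$P(x|C_0)$, $\lambda_0 = 5$")
plt.plot(x, expon.pdf(x, scale=1/1), label=r"$P(x|C_1)$, $\lambda_1 = 1$")
plt.xlabel("x")
plt.ylabel("Class-conditional likelihood")
plt.legend()
plt.show()
```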

b.

$$ f_{k}(x) = P(X = x|Y = k) $$
$$ P(Y = k|X = x) = \frac{\pi_{k}f_{k}(x)}{\sum_{l=1}^{K}\pi_{l}f_{l}(x)} $$

With $P(x|C_i) = \lambda_i e^{-\lambda_i x}$, $\lambda_0 = 5$, $\lambda_1 = 1$, and equal priors $\pi_0 = \pi_1 = 0.5$, the boundary $x^{*}$ is where the two posteriors are equal. Since the denominators are identical, this reduces to equating the prior-weighted likelihoods:

$$ \frac{0.5\, e^{-x}}{0.5\, e^{-x} + 0.5 \cdot 5 e^{-5x}} = \frac{0.5 \cdot 5 e^{-5x}}{0.5\, e^{-x} + 0.5 \cdot 5 e^{-5x}} $$
$$ e^{-x} = 5 e^{-5x} $$
$$ -x = \ln 5 - 5x $$
$$ x^{*} = \frac{\ln 5}{4} \approx 0.4024 $$

c.

The decision boundary signifies the point where the more probable class for an observation switches from one to the other. The graph in part (a) shows the respective likelihoods of the two classes as a function of x, and the value calculated in part (b) is the x at which the likelihoods intersect, so that both classes are equally probable for the observation. For any x less than 0.4024, the class with $\lambda = 5$ has the higher likelihood of being the correct class; for values greater than 0.4024, the class with $\lambda = 1$ has the higher probability of being the correct class.

d.

If $P(C_{1}) = 2P(C_{0})$, class 1 becomes more probable a priori, so the classifier should assign class 1 more often. Since class 1 is predicted when $x > x^{*}$, I expect the decision boundary $x^{*}$ to decrease. My derivation of the new decision boundary can be seen below; setting the posteriors equal and cancelling the shared denominator $P(x)$ and the common factor $P(C_{0})$:

$$ P(x|C_{1})\,P(C_{1}) = P(x|C_{0})\,P(C_{0}) $$
$$ 2\,\lambda_{1} e^{-\lambda_{1} x} = \lambda_{0} e^{-\lambda_{0} x} $$
$$ 2 e^{-x} = 5 e^{-5x} $$
$$ \ln 2 - x = \ln 5 - 5x $$
$$ 4x = \ln\left(\frac{5}{2}\right) $$
$$ x^{*} = \frac{\ln(5/2)}{4} \approx 0.229 $$

The math supports this expectation: doubling the prior on class 1 lowers the decision boundary from 0.4024 to approximately 0.229, enlarging the region in which class 1 is predicted.

e.

The misclassification rate of the Bayes classifier I calculated above is 0.23395, or 23.395%.
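A sketch of how this rate could be computed, assuming A3_Q3_test.csv contains a feature column x and a label column y (hypothetical column names):

```python
import numpy as np
import pandas as pd

test = pd.read_csv("A3_Q3_test.csv")
x_star = np.log(5) / 4  # Bayes decision boundary from part (b)

preds = (test["x"] > x_star).astype(int)  # predict class 1 when x > x*
error_rate = np.mean(preds != test["y"])
print(f"Misclassification rate: {error_rate:.5f}")
```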

f.

The misclassification rate of this model is 0.234, which is just barely larger than the Bayes classification error I calculated in part (e). The Bayes classifier should hypothetically be the best you can do, so having a misclassification rate so similar to it signifies that logistic regression is a good model for this data.

g.

The decision boundary for the model is at 0.4034, which is extremely close to the 0.4024 I calculated for the Bayes decision boundary in part (b). This explains why the misclassification rates are virtually identical, and since the Bayes decision boundary is the best we can do, it suggests that logistic regression is a good choice for this data.
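A sketch of how this boundary could be extracted from a fitted scikit-learn model clf (assumed trained on the single feature x): the sigmoid crosses 0.5 exactly where $w_0 + w_1 x = 0$, i.e. at $x^{*} = -w_0 / w_1$.

```python
w0 = clf.intercept_[0]  # fitted intercept, w_0
w1 = clf.coef_[0][0]    # fitted slope, w_1
x_star_lr = -w0 / w1    # sigma(w0 + w1*x) = 0.5 when w0 + w1*x = 0
print(f"Logistic regression decision boundary: {x_star_lr:.4f}")
```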