Machine Learning Basics

Caleb O'Neel

Learning Objectives:

By successfully completing this assignment you will be able to...

Conceptual Questions

1

[5 points] For each part (a) through (d), indicate whether we would generally expect the performance of a flexible statistical learning method to be better or worse than an inflexible method. Justify your answer.

(a) The sample size $n$ is extremely large, and the number of predictors $p$ is small.

I would generally expect a flexible statistical learning method to perform better in this scenario. With an extremely large sample size and few predictors, a flexible approach can reduce bias substantially with only a minimal increase in variance, because the large sample size helps smooth the decision boundary.

(b) The number of predictors $p$ is extremely large, and the number of observations $n$ is small.

Small samples are more prone to overfitting. Overfitting is a much larger problem for highly flexible models and will likely lead to very high variance. I would expect a flexible model to perform worse in this scenario, and an inflexible model to perform better.

(c) The relationship between the predictors and response is highly non-linear.

A flexible model would be better in this scenario. The less flexible a model is, the closer to linear it becomes. If the relationship is highly non-linear, we do not want the decision boundary to be linear.

(d) The variance of the error terms, i.e. $\sigma^2 = Var(\epsilon)$, is extremely high.

If there is large variance in the error terms, a flexible model is more at risk of fitting that noise. I would expect a flexible model to perform worse in these conditions.

2

[5 points] For each of the following, (i) explain if each scenario is a classification or regression problem, (ii) indicate whether we are most interested in inference or prediction for that problem, and (iii) provide the sample size $n$ and number of predictors $p$ indicated for each scenario.

ANSWER

(a) We collect a set of data on the top 500 firms in the US. For each firm we record profit, number of employees, industry and the CEO salary. We are interested in understanding which factors affect CEO salary.

(i) This problem is better suited to regression. The response variable, CEO salary, is continuous rather than categorical. In this case a multiple linear regression would answer the question best.

(ii) Inference - we want to understand the relationship between the independent variables and CEO salary.

(iii) $n$ = 500, $p$ = 3

(b) We are considering launching a new product and wish to know whether it will be a success or a failure. We collect data on 20 similar products that were previously launched. For each product we have recorded whether it was a success or failure, price charged for the product, marketing budget, competition price, and ten other variables.

(i) Classification would be the better approach in this scenario. Here, we want to classify a product launch into one of two categories: success or failure. A classification model takes all the input variables, generates a prediction from them, and translates that prediction into one of the available classes.

(ii) Prediction

(iii) n = 20, p = 13

(c) We are interested in predicting the % change in the US dollar in relation to the weekly changes in the world stock markets. Hence we collect weekly data for all of 2012. For each week we record the % change in the dollar, the % change in the US market, the % change in the British market, and the % change in the German market.

(i) Regression is the better approach if we want to understand the percent change of a response variable, since we are not classifying an outcome. We want to see how something changes in relation to other variables and predict based on that, not put it into a "category".

(ii) Prediction

(iii) n = 52, p = 3

Practical Questions

3

[10 points] Classification II. The table below provides a training dataset containing six observations ($n=6$), three predictors ($p=3$), and one qualitative response variable.

Table 1. Dataset with $n=6$ observations in $p=3$ dimensions with a categorical response, $y$

| Obs. | $x_1$ | $x_2$ | $x_3$ | $y$ |
|------|-------|-------|-------|------|
| 1 | 0 | 3 | 0 | Red |
| 2 | 2 | 0 | 0 | Red |
| 3 | 0 | 1 | 3 | Red |
| 4 | 0 | 1 | 2 | Blue |
| 5 | -1 | 0 | 1 | Blue |
| 6 | 1 | 1 | 1 | Red |

We want to use this dataset to make a prediction for $y$ when $x_1=x_2=x_3=0$ using $K$-nearest neighbors. You are given some code below to get you started. Note: coding is only required for part (a), for (b)-(d) please provide your reasoning.

(a) Compute the Euclidean distance between each observation and the test point, $x_1=x_2=x_3=0$. Present your answer in a table similar in style to Table 1 with observations 1-6 as the row headers.

| Obs. | $x_1$ | $x_2$ | $x_3$ | $y$ | Distance |
|------|-------|-------|-------|------|----------|
| 1 | 0 | 3 | 0 | Red | 3.00 |
| 2 | 2 | 0 | 0 | Red | 2.00 |
| 3 | 0 | 1 | 3 | Red | 3.16 |
| 4 | 0 | 1 | 2 | Blue | 2.24 |
| 5 | -1 | 0 | 1 | Blue | 1.41 |
| 6 | 1 | 1 | 1 | Red | 1.73 |
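
These distances can be computed with a short script like the following (a minimal sketch using numpy; the starter code from the prompt is not reproduced here):

```python
import numpy as np

# Training observations from Table 1 and their labels
X = np.array([[ 0, 3, 0],
              [ 2, 0, 0],
              [ 0, 1, 3],
              [ 0, 1, 2],
              [-1, 0, 1],
              [ 1, 1, 1]])
y = np.array(["Red", "Red", "Red", "Blue", "Blue", "Red"])

# Test point at the origin
test_point = np.array([0, 0, 0])

# Euclidean distance from each observation to the test point
distances = np.sqrt(((X - test_point) ** 2).sum(axis=1))
for i, (d, label) in enumerate(zip(distances, y), start=1):
    print(f"Obs. {i}: distance = {d:.2f}, y = {label}")
```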

(b) What is our prediction with $K=1$? Why?

$K = 1$ means that we classify based on the single closest neighbor. In this case, observation 5 is the closest (distance 1.41) and is Blue, so we would classify the point (0, 0, 0) as Blue as well.

(c) What is our prediction with $K=3$? Why?

The three closest neighbors to the test point are observations 5, 6, and 2. Observation 5 is Blue, and observations 2 and 6 are Red. Since Red holds the majority, the point would be predicted to be Red using $K = 3$.

(d) If the Bayes decision boundary (the optimal decision boundary) in this problem is highly nonlinear, then would we expect the best value of $K$ to be large or small? Why?

For a highly non-linear relationship, a small $K$ value is generally better. This allows for more flexibility and less bias. The higher the $K$ value, the more linear the decision boundary becomes.

4

[20 points] Classification I: Creating a classification algorithm.

(a) Build a working version of a binary kNN classifier using the skeleton code below.
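
Since the skeleton code is not reproduced here, below is a minimal sketch of what a working binary kNN classifier might look like (the class and method names are my own, chosen to mirror the scikit-learn fit/predict convention):

```python
import numpy as np

class BinaryKNN:
    """A minimal k-nearest-neighbors classifier for binary targets."""

    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        # kNN is a lazy learner: "training" just stores the data
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)
        return self

    def predict(self, X):
        X = np.asarray(X, dtype=float)
        preds = np.empty(len(X), dtype=self.y_train.dtype)
        for i, x in enumerate(X):
            # Euclidean distance from x to every training point
            dists = np.sqrt(((self.X_train - x) ** 2).sum(axis=1))
            # Indices of the k closest training points
            nearest = np.argsort(dists)[: self.k]
            # Majority vote among the k neighbors' labels
            labels, counts = np.unique(self.y_train[nearest], return_counts=True)
            preds[i] = labels[np.argmax(counts)]
        return preds
```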

(b) Load the datasets to be evaluated here. Each includes training and test features ($\mathbf{X}$) and targets ($\mathbf{y}$) for both a low dimensional ($p = 2$ features/predictors) and a high dimensional ($p = 100$ features/predictors) dataset. For each of these datasets there are $n=100$ observations of each. They can be found in the data subfolder in the assignments folder on GitHub. Each file is labeled similar to A2_X_train_low.csv, which lets you know whether the dataset contains features, $X$, or targets, $y$; training or test data; and low or high dimensions.

(c) Train your classifier on first the low dimensional dataset and then the high dimensional dataset with $k=5$. Evaluate the classification performance on the corresponding test data for each. Calculate the time it takes to make the predictions in each case and the overall accuracy of each set of test data predictions.

(d) Compare your implementation's accuracy and computation time to the scikit learn KNeighborsClassifier class. How do the results and speed compare?

(e) Some supervised learning algorithms are more computationally intensive during training than testing. What are the drawbacks of the prediction process being slow?

ANSWER:

(b)
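
Assuming the files follow the naming pattern given in the prompt and sit in a local data/ subfolder (only A2_X_train_low.csv is named explicitly; the other file names are inferred from that pattern):

```python
import pandas as pd

# File names inferred from the pattern described in the prompt; the data/ path is assumed
X_train_low = pd.read_csv("data/A2_X_train_low.csv").values
y_train_low = pd.read_csv("data/A2_y_train_low.csv").values.ravel()
X_test_low = pd.read_csv("data/A2_X_test_low.csv").values
y_test_low = pd.read_csv("data/A2_y_test_low.csv").values.ravel()

X_train_high = pd.read_csv("data/A2_X_train_high.csv").values
y_train_high = pd.read_csv("data/A2_y_train_high.csv").values.ravel()
X_test_high = pd.read_csv("data/A2_X_test_high.csv").values
y_test_high = pd.read_csv("data/A2_y_test_high.csv").values.ravel()
```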

(c) Train your classifier on first the low dimensional dataset and then the high dimensional dataset with $k=5$. Evaluate the classification performance on the corresponding test data for each. Calculate the time it takes to make the predictions in each case and the overall accuracy of each set of test data predictions.
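
One way to answer this, reusing the BinaryKNN class and the arrays loaded in the sketches above (the evaluate helper is my own, wrapping the timing and accuracy logic):

```python
import time
import numpy as np

def evaluate(model, X_train, y_train, X_test, y_test, name):
    model.fit(X_train, y_train)
    start = time.time()              # time only the prediction step
    preds = model.predict(X_test)
    elapsed = time.time() - start
    accuracy = np.mean(preds == y_test)
    print(f"{name}: accuracy = {accuracy:.3f}, prediction time = {elapsed:.3f} s")
    return accuracy, elapsed

evaluate(BinaryKNN(k=5), X_train_low, y_train_low, X_test_low, y_test_low, "low-dim")
evaluate(BinaryKNN(k=5), X_train_high, y_train_high, X_test_high, y_test_high, "high-dim")
```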

(d) Compare your implementation's accuracy and computation time to the scikit learn KNeighborsClassifier class. How do the results and speed compare?
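
The comparison can reuse the same evaluate helper from the previous sketch, swapping in scikit-learn's implementation:

```python
from sklearn.neighbors import KNeighborsClassifier

# Same evaluation, with scikit-learn's kNN in place of the hand-rolled classifier
evaluate(KNeighborsClassifier(n_neighbors=5),
         X_train_low, y_train_low, X_test_low, y_test_low, "sklearn low-dim")
evaluate(KNeighborsClassifier(n_neighbors=5),
         X_train_high, y_train_high, X_test_high, y_test_high, "sklearn high-dim")
```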

(e) Some supervised learning algorithms are more computationally intensive during training than testing. What are the drawbacks of the prediction process being slow?

If the prediction process is slow, it lessens the model's value. Answers are often required nearly instantaneously, and if a model takes hours to run, it becomes too slow to be useful. Additionally, it is expensive to run a computationally heavy model for long periods.

For example, suppose a day trader used an algorithm that takes a series of variables at the start of the day and predicts how the price of a stock will move over that day. If the model took twelve hours to run, the information would no longer be useful no matter how accurate it was. Timing is a crucial element for most applications of machine learning.

There can be scenarios where it is acceptable for a model to take a long time to make predictions. If the inputs do not change and the information is not immediately needed, prioritizing accuracy at the expense of speed can be beneficial. But in most business settings speed is important.

5

[20 points] Bias-variance tradeoff I: Understanding the tradeoff. This exercise will illustrate the impact of the bias-variance tradeoff on classifier performance by looking at classifier decision boundaries.

ANSWER

(a) Create a synthetic dataset (with both features and targets). Use the make_moons module with the parameter noise=0.35 to generate 1000 random samples.

(b) Scatterplot your random samples with each class in a different color
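
A sketch covering parts (a) and (b), assuming matplotlib and scikit-learn are available (the random_state value is my own choice, added for reproducibility):

```python
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons

# (a) 1000 noisy two-class samples
X, y = make_moons(n_samples=1000, noise=0.35, random_state=0)

# (b) scatterplot with one color per class
plt.scatter(X[y == 0, 0], X[y == 0, 1], c="tab:blue", label="class 0", alpha=0.6)
plt.scatter(X[y == 1, 0], X[y == 1, 1], c="tab:orange", label="class 1", alpha=0.6)
plt.legend()
plt.xlabel("$x_1$")
plt.ylabel("$x_2$")
plt.show()
```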

(c) Create 3 different data subsets by selecting 100 of the 1000 data points at random three times. For each of these 100-sample datasets, fit three k-Nearest Neighbor classifiers with: $k = \{1, 25, 50\}$. This will result in 9 combinations (3 datasets, with 3 trained classifiers).
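
Continuing from the X and y generated above, one way to build the 9 combinations (the seed is again arbitrary):

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

rng = np.random.default_rng(0)
ks = [1, 25, 50]

# Three random 100-point subsets, each paired with three classifiers
fitted = {}    # (dataset index, k) -> fitted classifier
subsets = []
for i in range(3):
    idx = rng.choice(len(X), size=100, replace=False)
    X_sub, y_sub = X[idx], y[idx]
    subsets.append((X_sub, y_sub))
    for k in ks:
        fitted[(i, k)] = KNeighborsClassifier(n_neighbors=k).fit(X_sub, y_sub)
```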

(d) For each combination of dataset trained classifier, in a 3-by-3 grid, plot the decision boundary (similar in style to Figure 2.15 from Introduction to Statistical Learning). Each column should represent a different value of $k$ and each row should represent a different dataset.
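
A sketch of the 3-by-3 grid, reusing subsets, fitted, and ks from the previous snippet:

```python
import matplotlib.pyplot as plt
import numpy as np

# Grid of points covering the feature space, used to shade the decision regions
xx, yy = np.meshgrid(np.linspace(X[:, 0].min() - 1, X[:, 0].max() + 1, 200),
                     np.linspace(X[:, 1].min() - 1, X[:, 1].max() + 1, 200))
grid = np.c_[xx.ravel(), yy.ravel()]

fig, axes = plt.subplots(3, 3, figsize=(12, 12), sharex=True, sharey=True)
for i, (X_sub, y_sub) in enumerate(subsets):    # rows: datasets
    for j, k in enumerate(ks):                  # columns: values of k
        Z = fitted[(i, k)].predict(grid).reshape(xx.shape)
        ax = axes[i, j]
        ax.contourf(xx, yy, Z, alpha=0.3, cmap="coolwarm")
        ax.scatter(X_sub[:, 0], X_sub[:, 1], c=y_sub, cmap="coolwarm", s=15)
        ax.set_title(f"dataset {i + 1}, k = {k}")
plt.show()
```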

(e) What do you notice about the difference between the rows and the columns. Which decision boundaries appear to best separate the two classes of data? Which decision boundaries vary the most as the data change?

The larger $k$ gets, the more linear the decision boundary becomes. This aligns with expectations because a smaller $k$ has more flexibility and conforms more closely to the particular dataset. This flexibility makes it appear to fit the training data the closest, but often leads to excessively high variance on test data.

For each individual sample, the $k = 1$ boundary looks like the closest fit. However, its shape varies greatly from sample to sample, suggesting that it is overfit to the individual samples. Looking at the data in aggregate, I believe $k = 25$ does the best job of fitting the data.

(f) Explain the bias-variance tradeoff using the example of the plots you made in this exercise.

The bias-variance tradeoff is the concept that bias and variance are inversely related: when one increases, the other decreases. However, the increase or decrease is not necessarily proportionate. There is a point where the combined bias and variance is at its lowest, and the best models ideally use the parameters corresponding to that point.

A model is said to have less bias when it is more flexible. In our example, when $k = 1$ there is very little bias. There is, however, an extreme amount of variance. As you can see from the plots where $k = 1$, the boundary is very jagged and branches off in many directions. The true Bayes decision boundary is likely much smoother, meaning such a flexible decision boundary produces large variance.

Conversely, looking at the plots where $k = 50$, the boundary becomes significantly more linear. This linearity leads to less variance, but significantly more bias. It lacks flexibility to the point that it responds very little to trends in the data and produces an almost straight line.

In the plots where $k = 25$, you see a middle-ground approach. There is much more shape and less bias than with $k = 50$, but the boundary remains much smoother than the decision boundary for $k = 1$. This balances bias and variance better than either of the other two values of $k$.

6

[20 points] Bias-variance trade-off II: Quantifying the tradeoff. This exercise will explore the impact of the bias-variance tradeoff on classifier performance by looking at classifier decision boundaries.

Here, the value of $k$ determines how flexible our model is.

ANSWER

(a) Using the function created earlier to generate random samples (using the make_moons function), create a new set of 1000 random samples, and call this dataset your test set and the previously created dataset your training set.

(b) Train a kNN classifier on your training set for $k = 1,2,...500$. Apply each of these trained classifiers to both your training dataset and your test dataset and plot the classification error (fraction of mislabeled datapoints).
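
A sketch of the sweep over $k$, regenerating the question 5 training set and drawing a fresh test set (the seeds are my own choice):

```python
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier

# Training set from question 5 and a fresh test set of the same size
X_train, y_train = make_moons(n_samples=1000, noise=0.35, random_state=0)
X_test, y_test = make_moons(n_samples=1000, noise=0.35, random_state=1)

ks = range(1, 501)
train_err, test_err = [], []
for k in ks:
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    # classification error = 1 - accuracy
    train_err.append(1 - clf.score(X_train, y_train))
    test_err.append(1 - clf.score(X_test, y_test))

plt.plot(ks, train_err, label="training error")
plt.plot(ks, test_err, label="test error")
plt.xlabel("k")
plt.ylabel("classification error")
plt.legend()
plt.show()
```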

(c) What trend do you see in the results?

The test error rate plummets at the beginning for small values of $k$ and generally gets smaller until around $k = 100$, where it begins to steadily increase. The error rate is at its lowest and fairly level from around $k = 60$ to $k = 100$, and is at its highest when $k = 1$, closely followed by $k = 500$.

The error rate for the training set behaves differently. There is zero error when $k = 1$, because the model is evaluated on the same data it was trained on. More flexible models overfit to the data, and if the exact same data is used for evaluation the model will perform perfectly, even though it is unlikely to be effective on a new set of data.

(d) What values of $k$ represent high bias and which represent high variance?

$k = 1$ represents high variance, and $k = 500$ represents high bias.

(e) What is the optimal value of $k$ and why?

The optimal value of $k$ is 13 for this dataset. It yields the lowest error rate, at 0.105.

(f) In kNN classifiers, the value of k controls the flexibility of the model - what controls the flexibility of other models?

In other models - more specifically, linear regression - the flexibility of a model is determined by its degrees of freedom. The degrees of freedom increase as the number of parameters in the model increases. The parameters in this context are the additional independent variables in the model.

7

[20 points] Linear regression and nonlinear transformations. You're given training and testing data contained in files "A2_Q7_train.csv" and "A2_Q7_test.csv" in the "data" folder for this assignment. Your goal is to develop a regression algorithm from the training data that performs well on the test data.

Hint: Use the scikit learn LinearRegression module.

ANSWER

(a) Create a scatter plot of your training data.

(b) Estimate a linear regression model ($y = a_0 + a_1 x$) for the training data and calculate both the $R^2$ value and mean square error for the fit of that model for the training data. Also provide the equation representing the estimated model (e.g. $y = a_0 + a_1 x$, but with the estimated coefficients inserted). Consider this your baseline model against which you will compare other model options.
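
A sketch of the baseline fit (the feature and target column names x and y are assumptions; the actual CSV headers may differ):

```python
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# File names are taken from the prompt; the data/ path and column names are assumed
train = pd.read_csv("data/A2_Q7_train.csv")
X_train, y_train = train[["x"]].values, train["y"].values

model = LinearRegression().fit(X_train, y_train)
y_hat = model.predict(X_train)
print(f"y = {model.intercept_:.3f} + {model.coef_[0]:.3f} x")
print(f"R^2 = {r2_score(y_train, y_hat):.3f}, "
      f"MSE = {mean_squared_error(y_train, y_hat):.3f}")
```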

(c) If features can be nonlinearly transformed, a linear model may incorporate those non-linear feature transformation relationships in the training process. From looking at the scatter plot of the training data, choose a transformation of the predictor variable, $x$, that may make sense for these data. This will be a multiple regression model of the form $y = a_0 + a_1 x_1 + a_2 x_2 + \ldots + a_n x_n$. Here $x_i$ could be any transformation of $x$ - perhaps it's $\frac{1}{x}$, $log(x)$, $sin(x)$, $x^k$ (where $k$ is any power of your choosing). Provide the estimated equation for this multiple regression model (e.g. if you chose your predictors to be $x_1 = x$ and $x_2 = log(x)$, your model would be of the form $y = a_0 + a_1 x + a_2 log(x)$). Also provide the $R^2$ and mean square error of the fit for the training data.
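
As an illustration only (the right transformation depends on the scatter plot), here is how a polynomial model $y = a_0 + a_1 x + a_2 x^2 + a_3 x^3$ could be fit, reusing X_train and y_train from the baseline sketch:

```python
import numpy as np

# Example transformation: polynomial features x, x^2, x^3 (one of many reasonable choices)
X_poly = np.column_stack([X_train.ravel(),
                          X_train.ravel() ** 2,
                          X_train.ravel() ** 3])

poly_model = LinearRegression().fit(X_poly, y_train)
y_hat_poly = poly_model.predict(X_poly)
a0 = poly_model.intercept_
a1, a2, a3 = poly_model.coef_
print(f"y = {a0:.3f} + {a1:.3f} x + {a2:.3f} x^2 + {a3:.3f} x^3")
print(f"R^2 = {r2_score(y_train, y_hat_poly):.3f}, "
      f"MSE = {mean_squared_error(y_train, y_hat_poly):.3f}")
```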

(d) Using both of the models you created here in (b) and (c), plot the original data (as a scatter plot) and the two curves representing your models (each as a separate line).

(e) Using the models above, apply them to the test data and estimate the $R^2$ and mean square error of the test dataset.
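
Continuing from the two sketches above (model, poly_model, and the metric functions are reused; column names again assumed):

```python
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# Apply both fitted models to the held-out test data
test = pd.read_csv("data/A2_Q7_test.csv")
X_test, y_test = test[["x"]].values, test["y"].values

X_test_poly = np.column_stack([X_test.ravel(),
                               X_test.ravel() ** 2,
                               X_test.ravel() ** 3])

for name, m, feats in [("baseline", model, X_test),
                       ("transformed", poly_model, X_test_poly)]:
    pred = m.predict(feats)
    print(f"{name}: R^2 = {r2_score(y_test, pred):.3f}, "
          f"MSE = {mean_squared_error(y_test, pred):.3f}")
```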

(f) Which models perform better on the training data, and which on the test data? Why?

The transformed model worked significantly better on the training data, and slightly better on the test data. It performed significantly better on the training data because it had more flexibility. In a regression model, the more degrees of freedom a model has, the more flexible it becomes. Generally, the more flexible a model is, the better it performs on training data.

For the test data, the two models were almost identical, with the transformed model performing slightly better in terms of MSE. The linear model performs poorly because the data does not appear to be linear but the model is. The transformed model is non-linear and thus has a slight advantage in this regard.

(g) Imagine that the test data were significantly different from the training dataset. How might this affect the predictive capability of your model? Why?

If the test data is significantly different from the training data, the model is likely to perform very poorly on it. A model is built to predict data that follows a distribution similar to its training data. If the test data does not resemble the training data, then the model is far less effective. The training and test data must be similar for the model to be effective.