Background

In this report I have been tasked with predicting players' final batting averages for the 2018 season based on batting data from March and April 2018. The dataset provided comes from Fangraphs and includes 29 columns of information on 309 players. In the context of this study, a player's final batting average at the end of the season is the response (or dependent) variable, and the batting statistics included in the dataset constitute the independent variables.

Batting average is numeric rather than categorical, so the models will be evaluated on their $R^2$ and Mean Absolute Error (MAE) scores. $R^2$ measures the proportion of variation in the response variable that is explained by the independent variables in the model. An $R^2$ score ranges from 0.0 to 1.0, and the closer the score is to 1.0, the more of the variation in the dependent variable the independent variables explain. The Mean Absolute Error represents the average amount by which each predicted final batting average differs from a player's true final batting average. Unlike $R^2$, the closer this value is to zero, the more accurate the predictions are.
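As a quick illustration of the two metrics, here is a minimal sketch using scikit-learn's metrics functions. The batting averages shown are hypothetical values, not results from this study.

    # Minimal sketch of the two evaluation metrics, using hypothetical values.
    from sklearn.metrics import r2_score, mean_absolute_error

    # Hypothetical true and predicted final batting averages for five players.
    y_true = [0.251, 0.302, 0.275, 0.228, 0.290]
    y_pred = [0.260, 0.295, 0.270, 0.240, 0.281]

    print("R^2:", r2_score(y_true, y_pred))             # closer to 1.0 is better
    print("MAE:", mean_absolute_error(y_true, y_pred))  # closer to 0.0 is better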

Data

To supplement the data provided, I brought in two additional datasets that I believe to be correlated with batting average. The first was Statcast ball tracking data, and the second was batted ball information. These datasets are also downloaded from Fangraphs but are sourced from Statcast. I filtered both datasets to include only information from March and April of the 2018 MLB season. One of the primary motivations behind including these datasets is the theory that batted ball data offers a truer indication of a player's underlying skill set than a relatively small sample of counting statistics. If the models can find a correlation between batted ball data and final batting average, it may be more predictive than factors like BABIP, home runs, and even current batting average, which carry a considerable amount of variance in just a month's worth of at bats.

The first data source I brought in is Statcast ball tracking data. This dataset includes average exit velocity, launch angle, barrel%, hard hit%, and wOBA. These statistics effectively measure how hard a player hits the ball and the quality and consistency of contact. These factors should play a role in batting average because how hard a player hits a ball, and the angle at which the ball leaves the bat, often determine whether a batted ball turns into a hit or an out. Barrel and hard hit rates offer insight into how consistently a player makes quality contact. My expectation is that players who consistently make quality contact tend to produce higher batting averages.

I do, however, harbor several reservations about the effectiveness of this dataset. Statistics like launch angle and exit velocity are extremely predictive of the outcome of an individual batted ball, but averaged over the course of many at bats they may not reveal much. Players like Joey Gallo have high strikeout rates but tend to crush the ball when they make contact; such players likely have high average exit velocities and launch angles but still post poor batting averages. These averages may also punish high-contact players who put the ball in play more often but make weaker contact. Their average exit velocity and launch angle would be suppressed, yet putting the ball in play more often leads to more chances to get a hit.

The second dataset I brought in was Fangraphs batted ball data. This includes pull%, center%, opposite%, soft%, med%, and hard%, which represent where each batter tends to hit the ball and how hard they hit it. With the adoption of the shift and extreme pull-heavy, fly-ball hitting approaches, hitting for contact and spraying the ball around the field is becoming more of a lost art in baseball. It intuitively makes sense that players with a more even distribution of where they hit the ball tend to have a higher batting average. The soft%, med%, and hard% columns measure how hard a player tends to hit the ball. This may overlap with exit velocity and hard hit rate, but it also goes into more detail about the consistency with which a player makes good contact.

Preprocessing and Data Exploration

After merging the two external datasets, I needed to prepare the data for modeling. The data had no missing values for any of the observations. Almost all of the new columns I brought in from the external datasets required formatting because they were stored as percentage strings. I cleaned these columns and reformatted them as decimals.
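A minimal sketch of how this merge and percentage cleanup might look in pandas is shown below. The file names, join key, and column labels (e.g. "Barrel%", "Hard%") are assumptions for illustration; the actual code lives in the Cleaning, EDA, Merging appendix.

    import pandas as pd

    # Hypothetical file names and join key -- the real ones are in the code appendix.
    base = pd.read_csv("fangraphs_march_april_2018.csv")
    statcast = pd.read_csv("statcast_march_april_2018.csv")
    batted = pd.read_csv("batted_ball_march_april_2018.csv")

    # Merge the two external datasets onto the provided data by player.
    df = (base.merge(statcast, on="playerid", how="inner")
              .merge(batted, on="playerid", how="inner"))

    # Convert percentage strings like "42.3%" into decimals like 0.423.
    pct_cols = ["Barrel%", "HardHit%", "Pull%", "Cent%", "Oppo%", "Soft%", "Med%", "Hard%"]
    for col in pct_cols:
        df[col] = df[col].str.rstrip("%").astype(float) / 100.0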

I had several takeaways from my data exploration. As batting averages are calculated from at bats, one of my concerns was the amount of variance that observations with few plate appearances might introduce into the model. The number of plate appearances spanned from a minimum of 22 to a maximum of 125. This is a relatively wide range; my concern was that the model would treat these observations as equal when looking at the relationship between their independent and dependent variables, when in reality certain observations have a much larger sample size for their underlying metrics and batting average to smooth out. Because of this, I identified a linear regression weighted by the number of at bats as one of the models I would like to look at when creating my predictions. The graph below shows the distribution of at bats per player in the dataset.
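A plot like the one referenced above could be produced with a short matplotlib snippet such as the following; the column name "AB" is an assumption about how at bats are labeled in the merged dataframe.

    import matplotlib.pyplot as plt

    # Histogram of at bats per player (column name "AB" assumed).
    plt.hist(df["AB"], bins=20, edgecolor="black")
    plt.xlabel("At bats (March/April 2018)")
    plt.ylabel("Number of players")
    plt.title("Distribution of at bats per player")
    plt.show()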

Another problem I identified with the data was the fact that there were 41 columns and just 307 observations in the dataset. That many independent variables relative to that number of observations makes it extremely likely the model will overfit to the training data. Each new independent variable adds a dimension to the dataset, making it difficult for the model to determine the relationship between each variable and the response variable when there are so many dimensions to which any change can be attributed. This is less of a problem when there are thousands of observations, but with only a few hundred in our dataset it will almost certainly be an issue. Additionally, many of these variables are directly correlated with one another, which will lead to problems with multicollinearity down the road.

Most of these columns are likely to have little to no predictive power on full season batting average, but their mere presence can lead to the model picking up noise from them and mistakenly attributing significance to it. This noise will actually improve the $R^2$ and Mean Absolute Error of a model fit and tested on this specific dataset, but keeping these unimportant variables makes it less likely the model will translate effectively to new data. Because of this, I dropped several columns that I feel confident should not have any predictive signal for a player's batting average: PlayerID, Name, Team, Runs, RBIs, and stolen bases. After removing these, there are 34 independent variables remaining in the model. This is still far too many variables for the size of the dataset. To deal with the problems of overfitting and multicollinearity, I decided I would also test a Lasso regression and other models that introduce a regularization penalty.
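Dropping the identifier and counting columns is a one-line pandas operation. The column labels below, including the target column "FinalAVG", are assumptions about how the merged dataset is named; adjust them to match the actual data.

    # Column labels assumed for illustration; adjust to the merged dataset.
    drop_cols = ["playerid", "Name", "Team", "R", "RBI", "SB"]
    X = df.drop(columns=drop_cols + ["FinalAVG"])  # independent variables
    y = df["FinalAVG"]                             # response: final season batting average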

Modeling

Linear regression is perhaps the simplest and least sexy model in data science, but it is often the most effective in situations where the response variable is continuous and numeric. I tried four different iterations of a linear regression using the scikit-learn library for Python.

  1. Linear Regression on just the default dataset provided - Although I do believe that the datasets I added contain useful information, I wanted to make sure that the default dataset did not lead to a better model. I am concerned about overfitting due to the number of independent variables already in the dataset, and by adding over ten more columns I could have created an even bigger problem.

  2. Linear Regression on the entire dataset I created, including the Statcast and batted ball data.

  3. Linear Regression weighted by number of at bats - The scikit-learn Linear Regression model allows you to weight particular observations more heavily when fitting a model. In our dataset each observation represents one player's statistics, but because the number of at bats varies, each player's statistics are built on a different sample size. Because of this, I believe it could be useful to give extra weight to the players with more at bats behind their statistics, since they offer a larger sample size.

  4. Lasso Linear Regression - Lasso linear regression uses an L1 penalty on model coefficients to shrink the slope of certain variables, and in some cases removes them from the model entirely. The strength of the regularization penalty is determined by the lambda value (called alpha in scikit-learn). To find the lambda value that optimized my Lasso model's score, I used scikit-learn's random search tool, which samples from a list of designated parameter values you want to optimize over and reports back which set of parameters performed best. A sketch of this tuning, along with the weighted fit from the previous item, appears after this list.
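The sketch below shows how models 3 and 4 might be set up, assuming the X, y, and df["AB"] objects from earlier. It is illustrative only: the candidate alpha grid and search settings are assumptions, not the values used for the reported results.

    import numpy as np
    from sklearn.linear_model import LinearRegression, Lasso
    from sklearn.model_selection import RandomizedSearchCV

    # Model 3: ordinary least squares, weighting each player by their at bats.
    weighted_lr = LinearRegression()
    weighted_lr.fit(X, y, sample_weight=df["AB"])

    # Model 4: Lasso, with the penalty strength (alpha) tuned by random search.
    param_dist = {"alpha": np.logspace(-5, 0, 100)}  # candidate regularization strengths
    search = RandomizedSearchCV(
        Lasso(max_iter=10000),
        param_distributions=param_dist,
        n_iter=25,
        scoring="r2",
        cv=10,
        random_state=42,
    )
    search.fit(X, y)
    print("Best alpha:", search.best_params_["alpha"])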

When constructing and testing a model, a dataset is typically split into a training and test set for validation. This attempts to simulate how a model will perform on an unseen dataset after being trained on your initial data. However, since the dataset is small to begin with and I am already concerned about overfitting, I did not want to sacrifice a portion of my training data to create a separate validation set. Instead, I used the K-fold cross validation technique with 10 folds: I divided the data into tenths, fit the model on nine tenths of the data, tested it on the remaining tenth, and recorded the $R^2$ and MAE scores. I then reincorporated the tenth I had just tested on, set aside the second tenth of the data, and trained on the other nine tenths. This process is repeated ten times, once for each subset of the data, leaving ten $R^2$ and MAE scores for each model. I took the average of these scores for each model and used the averages to compare the models to one another.

Using cross validation lets you train on the entire dataset rather than sacrificing a large portion of it for testing. The downside is that it is expensive both computationally and in terms of time. However, linear regressions are cheap on both counts, making cross validation a feasible option for the models listed above. Below is the graph of how each of the four models performed when trained and scored on the entire dataset, and then when scored with K-fold cross validation. The second bar represents how well I believe each model would work when applied to a new dataset.
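The 10-fold scoring described above can be expressed with scikit-learn's cross_validate, which returns both metrics for every fold so they can be averaged. This is a sketch assuming the X and y objects from earlier; swap in any of the four regressions.

    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import cross_validate

    # Score a model with 10-fold cross validation, collecting R^2 and MAE per fold.
    cv_results = cross_validate(
        LinearRegression(),            # swap in any of the four models above
        X, y,
        cv=10,
        scoring=("r2", "neg_mean_absolute_error"),
    )
    mean_r2 = cv_results["test_r2"].mean()
    mean_mae = -cv_results["test_neg_mean_absolute_error"].mean()  # flip sign back to a positive error
    print(f"Mean R^2: {mean_r2:.3f}, Mean MAE: {mean_mae:.4f}")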

Unsurprisingly, the augmented dataset with the Statcast and batted ball data performed best on the training data; having more columns almost always leads to better training performance. However, this model had the worst performance on the test data, which is almost surely a sign that it was significantly overfit. As I hypothesized in my initial data exploration, the Lasso regression performed best on the test data by a fairly significant margin, achieving an $R^2$ of .313 on the validation data, significantly better than the second best score of .262. The Lasso model performed worse on the training data, as expected, because the regularization penalty generally accepts an increase in bias in exchange for a decrease in variance. The mean absolute error scores follow the same trend as the $R^2$ scores, but are significantly closer to one another. In the context of this study, an $R^2$ of .313 means that roughly 31% of the variance in a player's final batting average can be explained by the variables in this model.

I tested several additional models to see how they performed on the dataset. They included:

  1. Random Forest Regressor - Random forest regressors construct an ensemble of decision trees during training. In the context of this study, the model looks for players with similar statistical profiles, groups similar players together, and uses the average of each group's final batting average as the prediction.

  2. Support Vector Regressor (SVR) - The support vector regressor applies an L2 regularization penalty to the model coefficients. It defines an acceptable margin of error and attempts to minimize the error only for data points that fall outside that margin. This helps prevent the model from reacting to random noise in the dataset and avoids overfitting. Since its goal is not to minimize overall error on the training data, it generally performs worse on training data than most other models, but translates well to test data when the training data is noisy.

  3. Gradient Boosting Regressor - Gradient boosting functions similarly to the random forest regressor. They differ in that gradient boosted regressors use a collection of less complex trees with high bias and low variance and attempt to reduce the bias of the ensemble. Conversely, random forests build complex trees that are naturally low bias and high variance and attempt to reduce the variance by averaging them.

The graphs below show these models' $R^2$ and mean absolute error performance on validation data, with a simple linear regression and the Lasso model from the last section included as reference points. Unlike the regression models above, the data for these models was split into a training and test set rather than scored with cross validation. The models in this section are significantly more computationally expensive than the regressions previously discussed, and cross validating them would require a significant amount of time. A sketch of this evaluation appears below.
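The following is a minimal sketch of that single hold-out evaluation, assuming the X and y objects from earlier. The split fraction and default hyperparameters are assumptions rather than the tuned settings behind the reported scores, and the feature scaling in front of the SVR is my addition, since SVR is sensitive to feature scale.

    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
    from sklearn.svm import SVR
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import r2_score, mean_absolute_error

    # Single hold-out split instead of cross validation for these heavier models.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    models = {
        "Random Forest": RandomForestRegressor(random_state=42),
        "SVR": make_pipeline(StandardScaler(), SVR()),  # epsilon-insensitive loss + L2 penalty
        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    }

    for name, model in models.items():
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        print(f"{name}: R^2={r2_score(y_test, preds):.3f}, "
              f"MAE={mean_absolute_error(y_test, preds):.4f}")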

All of the models tested above perform better than a basic linear regression; however, only the Support Vector Regressor outperforms the Lasso regression model. The SVR performed significantly better than any other model, with an $R^2$ score of .345 and an MAE .0014 lower than the regression model. Its $R^2$ outperforms even the best performing Lasso regression from the previous section by a fairly significant margin.

The Support Vector Regressor performs best because it avoids overfitting to the training data. The inclusion of an acceptable margin of error, and the focus on minimizing only the error of points that fall outside that margin, means the model is not as susceptible to noise and randomness in the data. This margin acts as a built-in regularization penalty. The inclusion of so many independent variables in this model leads to a lot of noise; the SVR is able to compensate for this and produce strong test predictions.

Conclusion

The results show a continuation of the trend that the regularized, higher-bias models do best on the test data. From the initial data analysis, my biggest concern was that the number of independent variables relative to the number of total observations would lead to overfitting. This played out in the modeling. Ultimately the two models that performed best - Lasso regression and Support Vector Regression - were the models that included regularization penalties to reduce the influence of noise in the dataset.

If the number of independent variables was causing problems, why did I not simply prune back the number of variables in the dataset I trained on?

The simple answer is that I did try this. I built several iterations of datasets with a significantly reduced number of columns, and while these did help the test scores of some of the simple linear regressions and the random forest regressor, their scores still did not come close to the performance of the Lasso and SVR models. The problem with this approach is that we do not know which variables carry the most predictive signal. Additionally, we do not know whether a variable that appears to have little predictive ability on its own works well in tandem with another variable to boost the overall predictive power of the model. The Lasso and SVR models answer these questions for us, telling us which combination of variables works best and which variables should be excluded.

An important consideration when reflecting on the results of this study is its underlying purpose. If the question had been rephrased to “What factors in the dataset predict batting average?”, the report would change significantly. Rather than focusing on the models that generated the best overall prediction scores, I would focus on model interpretability. This would include verifying that model assumptions were met and examining the significance of the individual factors in the dataset. The results of such a study would focus more on the individual variables that correlate strongly with batting average than on the overall MAE and $R^2$ of the models.

Final Batting Average Predictions

Code Appendix

Cleaning, EDA, Merging

Modeling - Regression

Modeling - Other ML

Misc Data Subsets Used