The NHL playoffs are often regarded as one of the most random events in sports. By random, I mean that the expected winner does not win the playoffs very often. For example, the team that wins the Presidents' Trophy (the team with the most points in the regular season) has won the playoffs only 8 times in the 35 years the award has been given out. Additionally, the last team to win both the Presidents' Trophy and the Stanley Cup was the Chicago Blackhawks in the 2012-13 season. There are many possible explanations for this randomness. Braden Holtby, an NHL goalie, attributes it to the game being played and officiated differently: penalties are called less often because referees tend to let more things go than in the regular season, not wanting to impact the game so that the true best team wins. While this motivation is respectable, this officiating policy may give an advantage to more physical teams. Additionally, the NHL has an 82-game regular season schedule. Due to the physicality of the sport, teams cannot realistically perform their best every single game of the regular season. All this changes in the playoffs, because teams do not know when their last game will be played. Furthermore, since each round of the playoffs is a best-of-7 series, teams have more of an opportunity to scout their opponents. This is not done as much in the regular season, where a densely packed schedule has games against different teams every couple of days. Finally, there is inherent "puck luck" in hockey, because the puck bounces around a lot. A great recent example came in the first round of the 2022 NHL Playoffs. The Washington Capitals were leading the best-of-7 series 2 games to 1 against the top-seeded Florida Panthers, and were up 2-1 in Game 5 with 2 minutes remaining.
The Panthers pulled their goalie for an extra skater, and Capitals forward Garnet Hathaway barely missed an empty-net goal. Immediately after, the Panthers scored, won the game in overtime, and went on to take the series 4 games to 2. This highlights the "randomness": had Hathaway scored, the Capitals would have led the series 3-1 and would have been very likely to win it.
All these factors contribute to the NHL playoffs being very random. However, when I went to pick a final project, I could not help but notice that there are repeat winners in the NHL playoffs, including the back-to-back Stanley Cup winners Tampa Bay Lightning (2020 and 2021) and Pittsburgh Penguins (2016 and 2017). I believe there could be a reason for this, and that this reason could potentially generalize to all winners.
So, for my final project, I wanted to see whether the NHL playoffs really are random, or whether there is some relation between how well teams do in the regular season and how well they do in the playoffs. To start, I am going to take as the null hypothesis that the NHL playoffs are random, with the alternative hypothesis that there is a way to predict how well a team will do in the playoffs.
Before getting started, here is the list of required Python libraries:
For this project, all the data collected was from https://www.hockey-reference.com. This website contains all the regular season and playoff data for each team, as well as each team's individual game history. The regular season table for each year consists of every team's regular season wins, losses, points, goals scored, goalie save percentage, etc. Recall the goal is to see whether any of these factors, or some combination of them, has an impact on a team's playoff performance. The playoff data for each year consists of the same statistics found in the regular season, but for the playoffs. However, I only care about the number of wins a team has in the playoffs, since I am trying to see how regular season performance relates to the number of playoff wins. As a quick recap, the NHL playoffs currently work as a 16-team bracket where each round is a best-of-7 series. Additionally, one can see what round a team reached with a simple metric: each round takes 4 wins, so dividing a team's playoff wins by 4 (rounding down) gives the number of rounds it won.
So, if a team wins 16 games in the playoffs, then they win the Stanley Cup.
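That wins-to-rounds metric can be sketched as a small helper function (a sketch; the function name is my own):

```python
# Each best-of-7 round takes exactly 4 wins, so integer-dividing a team's
# total playoff wins by 4 gives the number of rounds it won.
def rounds_won(playoff_wins):
    """Rounds a team won, given its total playoff win count (0-16)."""
    return playoff_wins // 4

print(rounds_won(16))  # 4 rounds won -> Stanley Cup champion
print(rounds_won(5))   # won the first round, eliminated in the second
print(rounds_won(3))   # eliminated in the first round
```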
Here is an example of the 2018 regular season statistics table, the 2018 playoff table, and the Washington Capitals' team data:
Disclaimer:
Before scraping the data, it should be noted that I was unable to get the regular season statistics table using traditional web scraping methods (pandas read_html and Beautiful Soup). This is because the Team Statistics table on the website is improperly formatted. Fortunately, the website has the option of manually downloading the table as an Excel file, which can then be converted to a CSV file and read in with pandas.
These steps can then be repeated for each of the years of interest, with all the files placed in a directory that can then be read in with pandas read_csv. Unfortunately, I cannot host the data on a GitHub page or anywhere else, because that would violate the website's terms of use. All the other data I wanted from the website, however, could easily be read in using BeautifulSoup or pandas read_html. It should be noted that I chose to download the data manually because, after many hours of trying to read in the table with various scraping techniques, it still would not work, and I found it easier to download the data. Additionally, the professor said I was allowed to do this.
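As a sketch of that read-back workflow (the file-name pattern mirrors the `data/nhl_{year}.csv` convention used below, but the dataframes here are throwaway stand-ins written to a temporary directory, since the real files cannot be redistributed):

```python
import os
import tempfile

import pandas as pd

with tempfile.TemporaryDirectory() as data_dir:
    # stand-ins for the tables manually downloaded and converted to CSV
    for year in (2018, 2019):
        pd.DataFrame({"Team": ["Washington Capitals*"], "PTS": [105]}).to_csv(
            os.path.join(data_dir, f"nhl_{year}.csv"), index=False)

    # the read-back pattern used for the real data/ directory below
    season_tables = {
        year: pd.read_csv(os.path.join(data_dir, f"nhl_{year}.csv"))
        for year in (2018, 2019)
    }

print(sorted(season_tables))  # [2018, 2019]
```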
#necessary libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
from matplotlib import pyplot as plt
import sklearn.model_selection as ms
from scipy import stats
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import random
The code below gets the regular season data for each team from the years 2000 to 2022. It is important to note that I had to download all of the statistics tables into a directory, from which they are then read in with pandas' read_csv function. At the end, this code produces an array of dataframes for the regular season data from 2000-2022. I also saved the data from the current season, but stored it separately since that season is still being played.
#hockey-reference.com keys each team's data link on a team-name abbreviation. The abbreviations are not standardized and are not listed on the website, so I manually built a dictionary mapping each team name to its abbreviation
teams_with_abr = {'St. Louis Blues':'STL', 'Detroit Red Wings':'DET', 'Philadelphia Flyers' : 'PHI',
'New Jersey Devils' : 'NJD','Washington Capitals':'WSH', 'Dallas Stars':'DAL',
'Toronto Maple Leafs':'TOR', 'Florida Panthers' :'FLA', 'Colorado Avalanche' : 'COL',
'Ottawa Senators' : 'OTT' , 'Los Angeles Kings' : 'LAK' , 'Phoenix Coyotes' : 'PHX', 'Arizona Coyotes' : 'ARI',
'Pittsburgh Penguins' : 'PIT', 'Edmonton Oilers' : 'EDM' , 'San Jose Sharks' : 'SJS',
'Buffalo Sabres' : 'BUF', 'Carolina Hurricanes' : 'CAR', 'Vancouver Canucks' : 'VAN',
'Montreal Canadiens' : 'MTL' , 'Mighty Ducks of Anaheim' : 'MDA', 'Chicago Blackhawks' : 'CHI',
'Calgary Flames' : 'CGY' , 'New York Rangers' : 'NYR', 'Boston Bruins' :'BOS', 'Nashville Predators' : "NSH",
'New York Islanders' : 'NYI' , 'Tampa Bay Lightning' : 'TBL' , 'Atlanta Thrashers' : 'ATL' , 'Winnipeg Jets' : 'WPG',
'Vegas Golden Knights' : 'VEG', 'Seattle Kraken' : 'SEA', 'Anaheim Ducks' : 'ANA', 'Minnesota Wild' : 'MIN'
,'Columbus Blue Jackets':'CBJ'}
#all my data is from the years 2000 to 2022; the current 2022 season has not
#finished, so its data is saved separately to later make predictions on who will win the Stanley Cup this year
years = [i for i in range(2000, 2023)]
regular_season = []
#saving the dataframe for the current 2022 regular season separately
current_regular_season = []
#looping through each year
for year in years:
    #the 2004-05 NHL season was cancelled by the lockout, so skip 2005
    if year != 2005:
        #had to download the data manually and read it from a directory
        data = f"data/nhl_{year}.csv"
        #reading in the csv file as a pandas dataframe
        curr_df = pd.read_csv(data, index_col=[0])
        #tidying up the columns: the real header is stored in the first row
        curr_df.columns = curr_df.iloc[0]
        curr_df = curr_df.drop(curr_df.index[0])
        curr_df = curr_df.rename(columns={np.NaN: 'Team'})
        #dropping the last row (league averages)
        curr_df = curr_df[:-1]
        #a * in a team's name marks a playoff team, so only keep those rows.
        #Right before submitting this project, the website removed the * for the
        #2022 playoff teams, so for 2022 take the 16 teams with the most points instead
        if year == 2022:
            curr_df = curr_df.iloc[0:16]
        else:
            curr_df = curr_df.loc[curr_df['Team'].str.contains(r"\*")]
        #after filtering, strip the * from each team's name, since only playoff teams remain
        curr_df['Team'] = curr_df['Team'].str.strip("*")
        #converting desired columns into floats (previously stored as strings)
        float_cols = ['AvAge', 'PTS%', 'PIM/G', 'S%', 'SV%', 'SRS', 'SOS',
                      'GF/G', 'GA/G', 'PP%', 'PK%']
        curr_df[float_cols] = curr_df[float_cols].astype(float)
        #converting desired columns into ints (previously stored as strings)
        int_cols = ['W', 'L', 'OL', 'PTS', 'GF', 'GA']
        curr_df[int_cols] = curr_df[int_cols].astype(int)
        #creating additional columns that will be used later in the project
        curr_df['PlayoffWins'] = np.NaN
        curr_df['WinPctLast10'] = np.NaN
        curr_df['PlayoffRound'] = np.NaN
        #getting a team's id, used because each team's page is keyed by its id
        curr_df['ID'] = curr_df['Team'].map(teams_with_abr)
        #getting each team's win percentage in its last 10 games (it will be seen why later)
        for tm in curr_df['ID']:
            curr_team = pd.read_html(
                f"https://www.hockey-reference.com/teams/{tm}/{year}_games.html")
            curr_team = curr_team[0].tail(10)
            #for some reason the 2022 season's result column is labeled differently
            if year == 2022:
                curr_team = curr_team.rename(columns={'Unnamed: 7': 'Result'})
            else:
                curr_team = curr_team.rename(columns={'Unnamed: 6': 'Result'})
            #counting the team's wins, losses, and ties over its last 10 games
            game_result = curr_team['Result'].value_counts()
            #from 2000-2004 the NHL allowed ties, so check each possible combination of results
            if 'T' in game_result.index and 'L' in game_result.index and 'W' in game_result.index:
                win_percentage = game_result['W'] / (game_result['W'] + game_result['L'] + game_result['T'])
            elif 'W' in game_result.index and 'L' in game_result.index:
                win_percentage = game_result['W'] / (game_result['W'] + game_result['L'])
            elif 'W' in game_result.index and 'T' in game_result.index:
                win_percentage = game_result['W'] / (game_result['W'] + game_result['T'])
            elif 'W' in game_result.index:
                win_percentage = 1.0
            else:
                win_percentage = 0.0
            #saving the computed last-10 win percentage to the team's row for this year
            curr_df.loc[curr_df.loc[curr_df['ID'] == tm].index[0], ['WinPctLast10']] = win_percentage
        #saving the current year
        curr_df['year'] = year
        curr_df = curr_df[['Team', 'AvAge', 'GP', 'W', 'L', 'OL', 'PTS', 'PTS%', 'GF',
                           'GF/G', 'GA/G', 'PP', 'PPO', 'PP%', 'PPA', 'PPOA', 'GA', 'S%', 'SRS', 'SOS',
                           'PK%', 'PIM/G', 'SV%', 'PlayoffWins', 'WinPctLast10', 'ID', 'year']]
        #appending the dataframe to the array, unless it is the unfinished 2022 season
        if year != 2022:
            regular_season.append(curr_df)
        else:
            current_regular_season.append(curr_df)
#example dataframe from the 2004 regular season
regular_season[4].head(100)
Rk | Team | AvAge | GP | W | L | OL | PTS | PTS% | GF | GF/G | ... | S% | SRS | SOS | PK% | PIM/G | SV% | PlayoffWins | WinPctLast10 | ID | year |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Detroit Red Wings | 31.9 | 82 | 48 | 21 | 2 | 109 | 0.665 | 255 | 3.11 | ... | 10.3 | 0.72 | -0.09 | 86.75 | 11.5 | 0.912 | NaN | 0.6 | DET | 2004 |
2 | Tampa Bay Lightning | 28.1 | 82 | 46 | 22 | 6 | 106 | 0.646 | 245 | 2.99 | ... | 10.0 | 0.56 | -0.09 | 84.89 | 11.6 | 0.908 | NaN | 0.5 | TBL | 2004 |
3 | Boston Bruins | 27.9 | 82 | 41 | 19 | 7 | 104 | 0.634 | 209 | 2.55 | ... | 8.4 | 0.29 | 0.03 | 83.58 | 14.5 | 0.918 | NaN | 0.6 | BOS | 2004 |
4 | San Jose Sharks | 26.8 | 82 | 43 | 21 | 6 | 104 | 0.634 | 219 | 2.67 | ... | 9.5 | 0.40 | -0.04 | 85.27 | 13.1 | 0.923 | NaN | 0.8 | SJS | 2004 |
5 | Toronto Maple Leafs | 30.7 | 82 | 45 | 24 | 3 | 103 | 0.628 | 242 | 2.95 | ... | 10.7 | 0.48 | 0.02 | 83.42 | 17.5 | 0.906 | NaN | 0.7 | TOR | 2004 |
6 | Ottawa Senators | 27.0 | 82 | 43 | 23 | 6 | 102 | 0.622 | 262 | 3.20 | ... | 10.8 | 0.88 | -0.01 | 83.57 | 15.3 | 0.907 | NaN | 0.5 | OTT | 2004 |
7 | Vancouver Canucks | 27.5 | 82 | 43 | 24 | 5 | 101 | 0.616 | 235 | 2.87 | ... | 9.9 | 0.52 | 0.02 | 86.11 | 15.2 | 0.911 | NaN | 0.6 | VAN | 2004 |
8 | Philadelphia Flyers | 29.7 | 82 | 40 | 21 | 6 | 101 | 0.616 | 229 | 2.79 | ... | 9.5 | 0.47 | -0.05 | 83.33 | 16.3 | 0.911 | NaN | 0.4 | PHI | 2004 |
9 | Colorado Avalanche | 27.9 | 82 | 40 | 22 | 7 | 100 | 0.610 | 236 | 2.88 | ... | 9.8 | 0.48 | 0.02 | 83.75 | 15.4 | 0.915 | NaN | 0.2 | COL | 2004 |
10 | New Jersey Devils | 28.8 | 82 | 43 | 25 | 2 | 100 | 0.610 | 213 | 2.60 | ... | 8.8 | 0.54 | -0.06 | 85.34 | 10.7 | 0.918 | NaN | 0.6 | NJD | 2004 |
11 | Dallas Stars | 30.6 | 82 | 41 | 26 | 2 | 97 | 0.591 | 194 | 2.37 | ... | 8.8 | 0.22 | -0.01 | 85.85 | 13.7 | 0.908 | NaN | 0.5 | DAL | 2004 |
12 | Calgary Flames | 26.9 | 82 | 42 | 30 | 3 | 94 | 0.573 | 200 | 2.44 | ... | 8.9 | 0.33 | 0.04 | 84.68 | 17.0 | 0.916 | NaN | 0.6 | CGY | 2004 |
13 | Montreal Canadiens | 28.0 | 82 | 41 | 30 | 4 | 93 | 0.567 | 208 | 2.54 | ... | 9.2 | 0.23 | 0.04 | 82.48 | 12.5 | 0.918 | NaN | 0.4 | MTL | 2004 |
14 | St. Louis Blues | 30.0 | 82 | 39 | 30 | 2 | 91 | 0.555 | 191 | 2.33 | ... | 8.6 | -0.09 | -0.01 | 84.55 | 15.2 | 0.906 | NaN | 0.6 | STL | 2004 |
15 | Nashville Predators | 26.5 | 82 | 38 | 29 | 4 | 91 | 0.555 | 216 | 2.63 | ... | 9.7 | -0.05 | -0.04 | 81.77 | 16.3 | 0.907 | NaN | 0.5 | NSH | 2004 |
16 | New York Islanders | 27.7 | 82 | 38 | 29 | 4 | 91 | 0.555 | 237 | 2.89 | ... | 10.1 | 0.29 | -0.04 | 85.52 | 14.0 | 0.906 | NaN | 0.6 | NYI | 2004 |
16 rows × 27 columns
In the code below, I get all the playoff data from the 2000-2021 seasons and store the results as an array of dataframes. This is much easier than the previous step, because I can use requests and BeautifulSoup to parse the playoff statistics table for each year.
years = [i for i in range(2000, 2022)]
playoffs = []
stanley_cup_winners = []
#looping through each year
for year in years:
    #again, the NHL season was cancelled in 2005
    if year != 2005:
        #getting the url
        url = f"https://www.hockey-reference.com/playoffs/NHL_{year}.html"
        info = requests.get(url).text
        #parsing the table
        soup = BeautifulSoup(info, "html.parser")
        table = soup.find("table", id="teams")
        table_body = table.find('tbody')
        data = []
        rows = table_body.find_all('tr')
        for row in rows:
            cols = row.find_all('td')
            cols = [i.text.strip() for i in cols]
            data.append([i for i in cols if i])
        #creating the dataframe and labeling the columns
        df_table = pd.DataFrame(data, columns=['Team', 'GP', 'W', 'L', 'T', 'OW', 'OL', 'W-L%', 'G', 'GA', 'DIFF'])
        #dropping the last row since it holds column averages, which I am not interested in
        df_table = df_table[:-1]
        #converting columns that should be ints (previously strings)
        df_table[['W', 'GP', 'L', 'T', 'OW', 'OL', 'G', 'GA', 'DIFF']] = df_table[['W', 'GP', 'L', 'T', 'OW', 'OL', 'G', 'GA', 'DIFF']].astype(int)
        #converting columns that should be floats (previously strings)
        df_table[['W-L%']] = df_table[['W-L%']].astype(float)
        #saving the season's Stanley Cup winner (the team with the most wins among the playoff 16)
        lord_stanley = df_table.loc[df_table['W'] == df_table['W'].max()]['Team'].values[0]
        #appending the dataframe to the playoffs array
        playoffs.append(df_table)
        stanley_cup_winners.append(lord_stanley)
#prints the 2000 season dataframe
playoffs[0].head(20)
Team | GP | W | L | T | OW | OL | W-L% | G | GA | DIFF | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | New Jersey Devils | 23 | 16 | 7 | 0 | 1 | 1 | 0.696 | 61 | 39 | 22 |
1 | Dallas Stars | 23 | 14 | 9 | 0 | 2 | 1 | 0.609 | 52 | 46 | 6 |
2 | Colorado Avalanche | 17 | 11 | 6 | 0 | 1 | 1 | 0.647 | 43 | 32 | 11 |
3 | Philadelphia Flyers | 18 | 11 | 7 | 0 | 2 | 1 | 0.611 | 44 | 40 | 4 |
4 | Pittsburgh Penguins | 11 | 6 | 5 | 0 | 1 | 2 | 0.545 | 31 | 23 | 8 |
5 | Toronto Maple Leafs | 12 | 6 | 6 | 0 | 1 | 0 | 0.500 | 26 | 26 | 0 |
6 | Detroit Red Wings | 9 | 5 | 4 | 0 | 0 | 1 | 0.556 | 23 | 19 | 4 |
7 | San Jose Sharks | 12 | 5 | 7 | 0 | 0 | 0 | 0.417 | 27 | 37 | -10 |
8 | St. Louis Blues | 7 | 3 | 4 | 0 | 0 | 0 | 0.429 | 22 | 20 | 2 |
9 | Ottawa Senators | 6 | 2 | 4 | 0 | 0 | 1 | 0.333 | 10 | 17 | -7 |
10 | Edmonton Oilers | 5 | 1 | 4 | 0 | 0 | 0 | 0.200 | 11 | 14 | -3 |
11 | Phoenix Coyotes | 5 | 1 | 4 | 0 | 0 | 0 | 0.200 | 10 | 17 | -7 |
12 | Buffalo Sabres | 5 | 1 | 4 | 0 | 1 | 0 | 0.200 | 8 | 14 | -6 |
13 | Washington Capitals | 5 | 1 | 4 | 0 | 0 | 1 | 0.200 | 8 | 17 | -9 |
14 | Los Angeles Kings | 4 | 0 | 4 | 0 | 0 | 0 | 0.000 | 6 | 15 | -9 |
15 | Florida Panthers | 4 | 0 | 4 | 0 | 0 | 0 | 0.000 | 6 | 12 | -6 |
#printing out all the teams that have won the Stanley Cup in past years
stanley_cup_winners
['New Jersey Devils', 'Colorado Avalanche', 'Detroit Red Wings', 'New Jersey Devils', 'Tampa Bay Lightning', 'Carolina Hurricanes', 'Anaheim Ducks', 'Detroit Red Wings', 'Pittsburgh Penguins', 'Chicago Blackhawks', 'Boston Bruins', 'Los Angeles Kings', 'Chicago Blackhawks', 'Los Angeles Kings', 'Chicago Blackhawks', 'Pittsburgh Penguins', 'Pittsburgh Penguins', 'Washington Capitals', 'St. Louis Blues', 'Tampa Bay Lightning', 'Tampa Bay Lightning']
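A quick count of the list above (reproduced here so the snippet is self-contained) makes the repeat winners mentioned earlier easy to spot:

```python
from collections import Counter

# the 21 champions from the 2000-2021 seasons (no 2005 season), as printed above
stanley_cup_winners = [
    'New Jersey Devils', 'Colorado Avalanche', 'Detroit Red Wings',
    'New Jersey Devils', 'Tampa Bay Lightning', 'Carolina Hurricanes',
    'Anaheim Ducks', 'Detroit Red Wings', 'Pittsburgh Penguins',
    'Chicago Blackhawks', 'Boston Bruins', 'Los Angeles Kings',
    'Chicago Blackhawks', 'Los Angeles Kings', 'Chicago Blackhawks',
    'Pittsburgh Penguins', 'Pittsburgh Penguins', 'Washington Capitals',
    'St. Louis Blues', 'Tampa Bay Lightning', 'Tampa Bay Lightning',
]

# count championships per franchise
wins_per_team = Counter(stanley_cup_winners)
print(wins_per_team.most_common(3))
```

Only 12 distinct franchises account for these 21 championships, with Tampa Bay, Pittsburgh, and Chicago leading at three Cups each over the span.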
Currently we have an array of dataframes for both the regular season and the playoffs. To make data processing easier, we are going to combine all the dataframes into one and add in the number of wins each team had in the playoffs for the respective year. This way I no longer have to look at multiple dataframes and can focus on this single dataframe, called combined_regular.
Additionally, I chose not to include the 2020 playoffs in the data. As you may recall, this was the covid year for sports, and there were many outside factors involved, such as the playoffs being played in a different format. For example, the team that won the playoffs that season won 18 games instead of the standard 16 of previous years. There were also no fans, and all teams played in a bubble environment. For all these reasons, I thought this season did not represent typical hockey playoffs, so I chose not to include it.
#adding the playoff-wins column into each regular season dataframe, then combining all of the dataframes into one
for i in range(0, len(playoffs)):
    #index 19 corresponds to the excluded 2020 covid playoffs
    if i != 19:
        for tm in regular_season[i]['Team']:
            playoff_wins = playoffs[i].loc[playoffs[i]['Team'] == tm]['W'].values[0]
            regular_season[i].loc[regular_season[i].loc[regular_season[i]['Team'] == tm].index, 'PlayoffWins'] = playoff_wins
#concatenating all the dataframes together
combined_regular = pd.concat(regular_season)
#showing an example of the dataframe
combined_regular.head(100)
Rk | Team | AvAge | GP | W | L | OL | PTS | PTS% | GF | GF/G | ... | S% | SRS | SOS | PK% | PIM/G | SV% | PlayoffWins | WinPctLast10 | ID | year |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | St. Louis Blues | 28.7 | 82 | 51 | 19 | 1 | 114 | 0.695 | 248 | 3.02 | ... | 10.1 | 1.08 | 0.07 | 87.83 | 13.6 | 0.909 | 3.0 | 0.5 | STL | 2000 |
2 | Detroit Red Wings | 30.7 | 82 | 48 | 22 | 2 | 108 | 0.659 | 278 | 3.39 | ... | 10.6 | 0.88 | 0.05 | 85.85 | 12.1 | 0.903 | 5.0 | 0.5 | DET | 2000 |
3 | Philadelphia Flyers | 29.0 | 82 | 45 | 22 | 3 | 105 | 0.640 | 237 | 2.89 | ... | 9.5 | 0.58 | -0.13 | 86.71 | 14.9 | 0.908 | 11.0 | 0.7 | PHI | 2000 |
4 | New Jersey Devils | 27.8 | 82 | 45 | 24 | 5 | 103 | 0.628 | 251 | 3.06 | ... | 9.2 | 0.46 | -0.13 | 87.54 | 15.8 | 0.903 | 16.0 | 0.5 | NJD | 2000 |
5 | Washington Capitals | 29.0 | 82 | 44 | 24 | 2 | 102 | 0.622 | 227 | 2.77 | ... | 10.0 | 0.28 | -0.13 | 86.22 | 12.0 | 0.915 | 1.0 | 0.5 | WSH | 2000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
16 | Tampa Bay Lightning | 29.3 | 82 | 43 | 33 | 6 | 92 | 0.561 | 246 | 3.00 | ... | 9.5 | -0.18 | -0.08 | 81.59 | 11.4 | 0.887 | 1.0 | 0.5 | TBL | 2006 |
1 | Buffalo Sabres | 27.2 | 82 | 53 | 22 | 7 | 113 | 0.689 | 298 | 3.63 | ... | 12.3 | 0.64 | -0.16 | 81.35 | 14.6 | 0.906 | 9.0 | 0.7 | BUF | 2007 |
2 | Detroit Red Wings | 32.3 | 82 | 50 | 19 | 13 | 113 | 0.689 | 252 | 3.07 | ... | 9.1 | 0.72 | 0.05 | 84.56 | 12.0 | 0.905 | 10.0 | 0.5 | DET | 2007 |
3 | Nashville Predators | 27.4 | 82 | 51 | 23 | 8 | 110 | 0.671 | 266 | 3.24 | ... | 11.8 | 0.78 | 0.05 | 85.90 | 14.4 | 0.919 | 1.0 | 0.5 | NSH | 2007 |
4 | Anaheim Ducks | 28.5 | 82 | 48 | 20 | 14 | 110 | 0.671 | 254 | 3.10 | ... | 9.8 | 0.67 | 0.06 | 85.12 | 17.8 | 0.912 | 16.0 | 0.5 | ANA | 2007 |
100 rows × 27 columns
Now that we have collected all the data needed for the analysis, it is time to look at individual regular season statistics and see how they relate to a team's performance in the playoffs. The idea is that if we can identify a relation between one of these factors and playoff performance, it signals that we should look more closely at that factor, and hints that it may have a big impact on a team's playoff performance.
To start, I am going to look at each team's regular season points percentage, goalie save percentage, roster age, goals scored per game, and win percentage in its last 10 games. For each of these categories, we are going to see how it relates to playoff performance.
Brief explanation of intuition behind looking at each category:
I also wanted to look at how the average number of goals scored per game impacts playoff performance, because games are won by whoever scores the most goals. As a result, it would make intuitive sense that teams that score a lot of goals per game should also do well in the playoffs.
Finally, I thought that a team's win percentage in its last 10 games would have a big impact on its playoff performance. These last games are often seen as a tune-up before the playoffs, and it would make sense for teams that do well during this period to perform better in the playoffs.
It should be noted that, going into this, I do not expect a strong relation between any single category and playoff performance. If there were one, I think it would be talked about a lot more, and it does not make intuitive sense for a single statistic to shed that much insight into a sport as complex as hockey.
This function plots the relationship between the category passed as a parameter (PTS%, SV%, GF/G, AvAge, or WinPctLast10) and the number of playoff wins for each team over the years. It also uses statsmodels to compute information such as the p-value and r-squared for the linear regression.
We conduct hypothesis testing with statsmodels to see whether any of these individual factors is significant enough (p-value less than 0.05) to say that playoff performance is not truly random. However, it should be noted that even a p-value below 0.05 may not give much insight into why the relationship exists, and would not mean the project is done.
def playoff_performance(category, combined_regular_df, plot_title_name):
    #defining a linear regression
    lm = linear_model.LinearRegression()
    #dropping rows with missing playoff wins; in particular, all of the 2020 covid
    #season's playoff data is NaN because I did not record it, for the reasons specified above
    combined_regular_df = combined_regular_df.dropna(subset=['PlayoffWins'])
    #x_data represents the category we are looking at
    x_data = combined_regular_df[category].values.reshape(-1, 1)
    #going to be predicting on the number of playoff wins
    y_data = combined_regular_df['PlayoffWins']
    #fitting the model to the data
    model = lm.fit(x_data, y_data)
    #plotting the data as well as adding the regression line
    plt.figure(figsize=(10, 10))
    plt.scatter(combined_regular_df[category], combined_regular_df['PlayoffWins'], color='blue')
    plt.plot(x_data, model.predict(x_data), color='darkgoldenrod')
    plt.title(f'Regular Season {plot_title_name} vs Number of Wins in Playoffs', fontsize=18)
    plt.ylabel('Number of Wins in Playoffs', fontsize=12)
    plt.xlabel(f'Regular Season {plot_title_name}', fontsize=12)
    plt.show()
    #printing out the regression results from statsmodels
    new_x = sm.add_constant(x_data)
    regression = sm.OLS(y_data, new_x).fit()
    print(f"test: {regression.summary()}")
First looking at how regular season points percentage relates to number of wins in the playoffs.
playoff_performance('PTS%', combined_regular, 'PTS%')
test:                            OLS Regression Results
==============================================================================
Dep. Variable:            PlayoffWins   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     20.85
Date:                Sun, 15 May 2022   Prob (F-statistic):           7.10e-06
Time:                        22:36:38   Log-Likelihood:                -937.98
No. Observations:                 320   AIC:                             1880.
Df Residuals:                     318   BIC:                             1887.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -9.2719      3.230     -2.871      0.004     -15.627      -2.917
x1            23.7697      5.205      4.566      0.000      13.528      34.011
==============================================================================
Omnibus:                       32.922   Durbin-Watson:                   2.028
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               41.246
Skew:                           0.876   Prob(JB):                     1.11e-09
Kurtosis:                       2.851   Cond. No.                         28.3
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
The first thing we see is that all the data appears to be clustered between the .55 and .65 regular season points percentage range. This makes sense, because these teams were all good enough to make the playoffs. Additionally, at first glance there does not seem to be any relation between regular season points percentage and the number of playoff wins. The regression line indicates a relation, but the data does not appear to fit it very well.
However, the regression results tell a different story. The p-value is essentially 0, well below 0.05, which indicates that the relation between regular season points percentage and the number of playoff wins is very unlikely to be due to random chance. So, technically, we can reject the null hypothesis that the hockey playoffs are random and accept the alternative hypothesis. However, I would not feel comfortable stopping here. Although there is a relation, it does not look strong, and the plot does not look very linear. Additionally, the r-squared is about 0.062, meaning regular season points percentage explains only 6.2% of the variation in the number of playoff wins. So, while the two are related and one could say the NHL playoffs are not technically truly random, I do not think this model says much more than that they are very close to random.
As a result, we will continue with the other regular season statistics to get additional insight.
Now looking at how a goalie's regular season save percentage impacts the team's number of wins in the playoffs.
playoff_performance('SV%', combined_regular, 'Save Percentage')
test:                            OLS Regression Results
==============================================================================
Dep. Variable:            PlayoffWins   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.006
Method:                 Least Squares   F-statistic:                     2.947
Date:                Sun, 15 May 2022   Prob (F-statistic):             0.0870
Time:                        22:36:38   Log-Likelihood:                -946.66
No. Observations:                 320   AIC:                             1897.
Df Residuals:                     318   BIC:                             1905.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -45.1798     29.484     -1.532      0.126    -103.188      12.829
x1            55.5822     32.379      1.717      0.087      -8.122     119.286
==============================================================================
Omnibus:                       37.715   Durbin-Watson:                   1.989
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.067
Skew:                           0.957   Prob(JB):                     2.21e-11
Kurtosis:                       2.858   Cond. No.                         227.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
This plot looks even worse than the previous one and has a p-value of .087, so we can say that save percentage alone does not have much impact on a team's playoff wins. It should also be noted that most of the data falls between a .90 and .92 save percentage. This is considered good in the NHL, and many goalies post numbers in this range. So I was not expecting this alone to be a big factor in determining playoff wins, but it could matter when combined with additional factors: for example, a team that has a great goalie and also scores a lot of goals per game.
Now going to see how a team's average roster age impacts the number of playoff wins.
playoff_performance('AvAge', combined_regular, 'Average Roster Age')
test:                            OLS Regression Results
==============================================================================
Dep. Variable:            PlayoffWins   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.5573
Date:                Sun, 15 May 2022   Prob (F-statistic):              0.456
Time:                        22:36:38   Log-Likelihood:                -947.86
No. Observations:                 320   AIC:                             1900.
Df Residuals:                     318   BIC:                             1907.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.8239      6.177      0.133      0.894     -11.330      12.978
x1             0.1636      0.219      0.746      0.456      -0.268       0.595
==============================================================================
Omnibus:                       36.582   Durbin-Watson:                   1.975
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               46.967
Skew:                           0.933   Prob(JB):                     6.33e-11
Kurtosis:                       2.791   Cond. No.                         665.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Just as seen previously, there is not strong enough evidence to support roster age impacting the number of playoff wins. This can be seen from the graph, and from the p-value of 0.456: if age truly had no effect, we would expect to see a relationship at least this strong about 46% of the time. Additionally, the r-squared value is almost 0, meaning this model explains almost none of the variation in playoff wins.
Now going to see how a team's goals scored per game impacts the number of playoff wins.
playoff_performance('GF/G', combined_regular, 'Goals per Game')
test:                            OLS Regression Results
==============================================================================
Dep. Variable:            PlayoffWins   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     3.205
Date:                Sun, 15 May 2022   Prob (F-statistic):             0.0744
Time:                        22:36:38   Log-Likelihood:                -946.53
No. Observations:                 320   AIC:                             1897.
Df Residuals:                     318   BIC:                             1905.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.7001      2.656      0.264      0.792      -4.525       5.925
x1             1.6202      0.905      1.790      0.074      -0.160       3.401
==============================================================================
Omnibus:                       36.146   Durbin-Watson:                   1.975
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               46.356
Skew:                           0.928   Prob(JB):                     8.59e-11
Kurtosis:                       2.818   Cond. No.                         33.3
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Again, there is not enough evidence to support the claim that goals per game alone significantly impacts the number of playoff wins, since the p-value (0.074) is greater than 0.05. Additionally, the r-squared is 0.01, meaning this model explains only about 1 percent of the variation in playoff wins. However, even though these regular season statistics may not individually predict playoff wins, putting them all together could still give some insight.
Finally, we are going to look at how a team's win percentage in the last 10 games impacts the number of playoff wins.
playoff_performance('WinPctLast10', combined_regular, 'Win Percentage Last 10 Games')
test: OLS Regression Results
==============================================================================
Dep. Variable:            PlayoffWins   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.785
Date:                Sun, 15 May 2022   Prob (F-statistic):              0.183
Time:                        22:36:39   Log-Likelihood:                -947.24
No. Observations:                 320   AIC:                             1898.
Df Residuals:                     318   BIC:                             1906.
Df Model:                           1
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          4.1256      1.012      4.077      0.000       2.135       6.116
x1             2.3212      1.738      1.336      0.183      -1.097       5.740
==============================================================================
Omnibus:                       38.251   Durbin-Watson:                   1.959
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               49.991
Skew:                           0.966   Prob(JB):                     1.40e-11
Kurtosis:                       2.869   Cond. No.                         8.77
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Surprisingly, a team's win percentage in its last 10 games does not seem to have any significant individual impact on its playoff performance. This can again be seen from the p-value of 0.183 and the low r-squared value.
Going into this project I thought this would be one of the biggest factors in a team's playoff performance, mainly because sportscasters typically highlight it as very important and frame the playoffs as being all about which team gets "hot". Additionally, it is interesting to see that one team lost its final 10 regular season games and still went on to win the Stanley Cup (16 playoff wins).
Finally, it should be noted that the graph looks odd because there are only 11 possible values a team can have for the x-axis, and only 17 possible values for the y-axis. This causes a lot of overlapping points.
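A common way to handle this kind of overplotting (not done in the graphs above) is to add a small random "jitter" to each point before plotting, so stacked points become visible while the overall pattern is preserved. A quick sketch of the idea:

```python
import numpy as np

# Add a small random offset to heavily overlapping values so that,
# when plotted, identical points no longer sit exactly on top of
# each other. The win totals below are made up for illustration.
rng = np.random.default_rng(0)
wins = np.array([0, 0, 0, 4, 4, 16])                # overlapping y-values
jittered = wins + rng.uniform(-0.2, 0.2, wins.size)  # shift by at most 0.2
print(jittered)
```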
From the previous section, we saw that regular season statistics do not seem to have much individual impact on a team's playoff performance (number of wins). Yes, we did have sufficient evidence to reject the null hypothesis for the regular season points percentage model. However, the graph for that model was very scattered and not particularly linear, and the very low r-squared value tells us there are additional factors we must consider. So, even though we saw that the playoffs are not truly "random", that model did not inspire much confidence that the NHL playoffs are not "basically random". In the case of sporting events, "basically random" vs. "random" feels like the same thing.
So, we clearly need to look at additional factors. We could add many interaction terms to a linear regression model, but that would get unnecessarily complex. Since there are only 17 different possible values for the number of wins a team can have in the playoffs (0-16), it seems like a better idea to switch to a machine learning decision tree approach to predict the number of playoff wins.
To recap, looking at individual statistics is not good enough to predict a team's playoff performance. Also, we are no longer doing linear regression, because we now want to see how well we can predict the number of wins a team will get in the playoffs. The linear regression and plots above mainly served as a starting point to see if there were any general trends.
We will now be using machine learning to see if we can accurately predict the number of wins a team will get in the playoffs based on the many statistics collected for the regular season. This is more advanced than the previous section because we are now looking at a combination of factors instead of one and seeing how that relates to a team's playoff performance.
To start, we are going to use a random forest classifier and a decision tree classifier, with the number of playoff wins as the dependent data and various statistics collected from the regular season as the independent data. Here is a brief explanation of each statistic we are going to include for the independent data. It is important to note that all of this data comes from the regular season, and we are trying to see if it impacts playoff performance:
#dropping 2020 playoffs
combined_regular = combined_regular.dropna(subset=['PlayoffWins'])
#independent data
x_data = combined_regular[['AvAge', 'PTS%', 'PIM/G', 'S%', 'SV%','SRS', 'SOS', 'GF/G', 'GA/G', 'PP%', 'PK%', 'WinPctLast10']]
#dependent data
y_data = combined_regular['PlayoffWins']
#using both a decision tree classifier and a random forest classifier
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
#using cross_val_score and accuracy
scoresDTC = ms.cross_val_score(dtc,x_data,y_data,cv=5,scoring='accuracy')
scoresRFC = ms.cross_val_score(rfc,x_data,y_data,cv=5,scoring='accuracy')
#looking at accuracy, error, and standardized error of the two models
print(f"DTC: Mean Accuracy: {scoresDTC.mean()}, Error : {1 - scoresDTC.mean()}, Std: {scoresDTC.std()/ np.sqrt(5)}")
print(f"RFC: Mean Accuracy: {scoresRFC.mean()}, Error : {1 - scoresRFC.mean()}, Std: {scoresRFC.std()/ np.sqrt(5)}")
DTC: Mean Accuracy: 0.10625, Error : 0.89375, Std: 0.020916500663351885
RFC: Mean Accuracy: 0.190625, Error : 0.809375, Std: 0.014921670482891652
In the code above, we used cross_val_score() on a decision tree classifier and a random forest classifier to compute the accuracy of our two models, where the dependent data is the number of playoff wins and the independent data is the regular season statistics mentioned above. I used two classifiers because I wanted to see if there was a big difference in accuracy between them. I was only able to use a k-fold value of 5 because of the small sample size of the data. I also did not tune the hyperparameters for either model, because I just wanted a general sense of each model's accuracy.
Now, from these results we see that the decision tree classifier has an accuracy of around 10 percent and the random forest classifier has an accuracy of around 19 percent. Additionally, the low standard error for both classifiers shows low variation across the folds.
This accuracy is obviously very bad, and in general this model should not give you much confidence in predicting the number of wins a team will get in the playoffs. However, a purely random guess at the number of wins for each team would only be correct around 6 percent of the time. While this is an oversimplification, because not every team can have the same number of wins, it serves the general purpose of showing that this model is doing some learning and is not completely random.
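The roughly 6 percent figure comes from a uniform guess over the 17 possible win totals:

```python
# A uniform random guess over the 17 possible win totals (0 through 16)
# is correct 1/17 of the time, about 5.9 percent.
num_outcomes = 17
baseline = 1 / num_outcomes
print(f"Random-guess baseline accuracy: {baseline:.3f}")
```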
So, from this information, we can conduct a t-test of our model's predictions vs. random predictions and see if the results are statistically significant enough to reject the null hypothesis.
Now, we are going to conduct a t-test of our random forest classifier model vs a random model for generating the number of wins per team. It should be noted that we will use the random forest classifier because it is better from an accuracy standpoint.
rfc = RandomForestClassifier()
#redefining data for clarity purposes
x_data = combined_regular[['AvAge', 'PTS%', 'PIM/G', 'S%', 'SV%','SRS', 'SOS', 'GF/G', 'GA/G', 'PP%', 'PK%', 'WinPctLast10']]
y_data = combined_regular['PlayoffWins']
#used cross_val_predict because it gives a list of predicted values
predicted = ms.cross_val_predict(rfc,x_data,y_data,cv=5)
#random_acc list
random_acc = []
#represents accuracy of predicted list
accuracy = []
#looping through actual wins per team and predicted wins per team by model
for actual, curr_prediction in zip(y_data, predicted):
#giving a score of 1 if made correct prediction or 0 otherwise
if actual==curr_prediction:
accuracy.append(1)
else:
accuracy.append(0)
#computing the random value
random_val = random.randint(0,16)
if (random_val == actual):
random_acc.append(1)
else:
random_acc.append(0)
#conducting t test and returning p-value
result = stats.ttest_rel(accuracy,random_acc)
print(f"P-value of our model vs random model: {result.pvalue}")
P-value of our model vs random model: 5.141375877977624e-05
As we can see from running a t-test of our model vs a random one, we get a p-value of less than 0.05, which gives significant evidence to reject the null hypothesis. So, we can clearly say that our model is learning and that we have significant evidence to say hockey playoffs are not random.
It should be noted that, because of the small dataset, the p-value changes from iteration to iteration. However, it never got close to 0.05, so this is not a cause for worry. In addition, I originally considered running this test repeatedly and combining the resulting p-values using Fisher's method, but I realized this was unnecessary because the p-value was always significantly smaller than 0.05.
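For reference, had the p-values been borderline, the Fisher's-method idea mentioned above could be sketched like this; the p-values below are made up for illustration, and `scipy.stats.combine_pvalues` does the combining.

```python
from scipy import stats

# Combine p-values from several hypothetical runs of the t-test into a
# single p-value using Fisher's method. These p-values are made up.
pvalues = [5.1e-05, 8.3e-05, 2.4e-04, 1.1e-04, 6.7e-05]
statistic, combined_p = stats.combine_pvalues(pvalues, method='fisher')
print(f"Combined p-value: {combined_p}")
```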
From the above section, we had significant evidence to say that the NHL playoffs are indeed not random and that there are some outside factors that contribute to the number of wins a team will have in the playoffs. However, we also saw that our model had only around a 19 percent accuracy rate. As stated earlier, this should not inspire much confidence in our model's prediction capabilities. As of right now, I would say that if the playoffs are not completely random, then they are basically random.
After thinking about this, a possible reason for the low accuracy is that we have 17 possible outcomes for each team. For example, we are currently treating a team with 1 playoff win as completely different from a team with 2 playoff wins, even though both teams failed to make it out of the first round. So, my idea is to revise the question from predicting the number of wins a team will have to predicting what round a team will reach in the playoffs. This should make for better prediction capabilities and be more useful in general.
To recall, the playoffs currently work as a 16-team bracket with each round being a best-of-7 series. If this is confusing, it may help to know that it works the same way as the Sweet 16 in March Madness, except each round is a best-of-7 series instead of a single game. Here is a breakdown of the round a team will reach based on their number of playoff wins.
Note, there is not actually a round 5, I just set it up this way so that the Stanley Cup Winner has a special label.
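The wins-to-round mapping follows directly from the best-of-7 structure: each round won contributes exactly 4 wins, so integer division recovers the round a team reached.

```python
# Each playoff round won contributes exactly 4 wins, so integer-dividing
# a team's playoff win total by 4 gives the number of completed rounds,
# and adding 1 gives the round it reached (16 wins maps to the special
# "round 5" champion label).
def playoff_round(wins):
    return wins // 4 + 1

# 0-3 wins -> round 1, 4-7 -> round 2, ..., 16 -> round 5 (champion)
print([playoff_round(w) for w in [0, 3, 4, 7, 8, 12, 15, 16]])
```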
Now, we can do the same code done in the previous section but make the dependent variable the playoff round instead of number of playoff wins.
#defining what round a team will make it into playoffs based on number of playoff wins
combined_regular['PlayoffRound'] = (combined_regular['PlayoffWins']//4)+1
#getting same data as before
x_data = combined_regular[['AvAge', 'PTS%', 'PIM/G', 'S%', 'SV%','SRS', 'SOS', 'GF/G', 'GA/G', 'PP%', 'PK%', 'WinPctLast10']]
y_data = combined_regular['PlayoffRound']
#defining classifiers
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()
#using cross val score. This time we can do 10-fold cross validation since there will be more data inside each bucket because there are fewer overall buckets
scoresDTC = ms.cross_val_score(dtc,x_data,y_data,cv=10,scoring='accuracy')
scoresRFC = ms.cross_val_score(rfc,x_data,y_data,cv=10,scoring='accuracy')
#calculating accuracy
print(f"DTC: Mean Accuracy: {scoresDTC.mean()}, Error : {1 - scoresDTC.mean()}, Std Error: {scoresDTC.std()/ np.sqrt(10)}")
print(f"RFC: Mean Accuracy: {scoresRFC.mean()}, Error : {1 - scoresRFC.mean()}, Std Error: {scoresRFC.std()/ np.sqrt(10)}")
DTC: Mean Accuracy: 0.328125, Error : 0.671875, Std Error: 0.014823176532039278
RFC: Mean Accuracy: 0.4625, Error : 0.5375, Std Error: 0.02891258376555094
It looks like predicting the round instead of the number of wins resulted in better accuracy for both models. Previously, the decision tree classifier gave an accuracy of around 10 percent, but it now gives around 33 percent. Similarly, the random forest classifier went from around 19 percent to around 46 percent. This is overall great news. We also still see low standard error for both, which shows low variation across the folds. Additionally, it is important to note that I did not tune the hyperparameters for the classifiers because I wanted a general sense of their accuracy. When testing, I played around with the min_samples_leaf parameter and saw that setting it to 30+ gave an accuracy of around 50 percent. However, it also made the model predict every team as finishing in either round 1 or round 2. The fact that every team can get the same round prediction is a major flaw in the project, which will be explained more later.
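The min_samples_leaf experiment described above can be sketched as a simple sweep. The data here is synthetic, standing in for the real regular season statistics, so the exact accuracies will differ from the notebook's.

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Synthetic stand-in: 320 team-seasons, 12 statistics, 5 round labels.
X, y = make_classification(n_samples=320, n_features=12, n_informative=5,
                           n_classes=5, random_state=0)

# Sweep min_samples_leaf; larger values force coarser trees, which can
# raise accuracy while collapsing predictions onto the common classes.
for leaf in [1, 5, 15, 30]:
    rfc = RandomForestClassifier(min_samples_leaf=leaf, random_state=0)
    scores = cross_val_score(rfc, X, y, cv=10, scoring='accuracy')
    print(f"min_samples_leaf={leaf}: mean accuracy {scores.mean():.3f}")
```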
Overall, I think it should still be noted that these accuracies are not good. They are clearly better than random, but I still do not think these results should inspire much confidence. Regardless, I will still conduct a t-test of the new model against a random model to see if the new model is significantly better than random.
Now, we are going to conduct a t-test of our random forest classifier model vs a random model for generating the playoff round per team. It should be noted that we will use the random forest classifier because it is better from an accuracy standpoint.
# same as the code in previous section
combined_regular['PlayoffRound'] = (combined_regular['PlayoffWins']//4)+1
x_data = combined_regular[['AvAge', 'PTS%', 'PIM/G', 'S%', 'SV%','SRS', 'SOS', 'GF/G', 'GA/G', 'PP%', 'PK%', 'WinPctLast10']]
y_data = combined_regular['PlayoffRound']
rfc = RandomForestClassifier()
#again using cross_val_predict because it gives the prediction for all values
predicted = ms.cross_val_predict(rfc,x_data,y_data,cv=10)
random_acc = []
accuracy = []
#computing accuracy of our model and accuracy of random model.
for actual, curr_prediction in zip(y_data, predicted):
if actual==curr_prediction:
accuracy.append(1)
else:
accuracy.append(0)
#now random value can be between 1 and 5 for round
random_val = random.randint(1,5)
if (random_val == actual):
random_acc.append(1)
else:
random_acc.append(0)
#conducting t test and returning p-value
result = stats.ttest_rel(accuracy,random_acc)
print(f"P-value of our model vs random model: {result.pvalue}")
P-value of our model vs random model: 1.436309807381069e-10
As we can see from running a t-test of our model vs. a random one, we get a p-value of less than 0.05, which gives significant evidence to reject the null hypothesis. So, we can again say that our model is learning and that we have significant evidence that the hockey playoffs are not random, just as we saw in the previous test.
Before giving the conclusion to this project, I thought it would be fun to use this model to predict the winner of the 2022 playoffs, which are currently happening. Unfortunately, the Capitals are out, so the playoffs are no longer fun to watch, but the prediction could still be interesting.
Also, it is important to note that the prediction we are about to do should not be taken very seriously, because the model has no way of knowing that only a certain number of teams can be in each round. Specifically, at the end of the playoffs there will be exactly 8 teams that ended in round 1, 4 teams in round 2, 2 teams in round 3, 1 team in round 4 (the runner-up), and 1 team in round 5 (the winner). I tried to investigate creating a bracket-style classifier, but this was very complex and out of the scope of this project/class. To try and work around this limitation, I will run the prediction many times and see what round each team gets placed in the most. I will also be using the decision tree classifier, because the random forest classifier could not predict a team reaching a round above 3. This makes sense, because every season 14 of 16 teams are placed in the first 3 rounds, which is about 88 percent of the data. As stated before, the classifiers have no way of knowing this, which causes the issue of not being able to predict the higher rounds when generalizing the data. The decision tree classifier gave worse accuracy but generated round predictions across all rounds, so we are going to use that. Again, this is just for fun and not really the point of the project; the purpose was to see if the playoffs are random, which we already saw is not completely true.
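"The round each team gets placed in the most" is the modal prediction across runs. The code that follows approximates this by rounding the mean of the runs; taking the true mode directly could be sketched like this, with made-up predictions for illustration.

```python
from collections import Counter

# Made-up predictions: 4 runs of the classifier over 3 teams.
runs = [[1, 2, 1],
        [1, 2, 2],
        [1, 3, 2],
        [1, 2, 2]]

# Group the per-run predictions by team, then take each team's most
# common predicted round.
per_team = list(zip(*runs))
modal = [Counter(preds).most_common(1)[0][0] for preds in per_team]
print(modal)
```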
prediction_results = []
num_times = 100
#running prediction 100 times
for i in range(0, num_times):
    x_data = combined_regular[['AvAge', 'PTS%', 'PIM/G', 'S%', 'SV%','SRS', 'SOS', 'GF/G', 'GA/G', 'PP%', 'PK%', 'WinPctLast10']]
    y_data = combined_regular['PlayoffRound']
    dtc = DecisionTreeClassifier()
    #fitting previous years' data
    dtc.fit(x_data, y_data)
    #2022 regular season data
    x_data_curr = current_regular_season[0][['AvAge', 'PTS%', 'PIM/G', 'S%', 'SV%','SRS', 'SOS', 'GF/G', 'GA/G', 'PP%', 'PK%', 'WinPctLast10']]
    prediction_results.append(dtc.predict(x_data_curr))
#computing sum of all the results for each team
col_totals = [ sum(x) for x in zip(*prediction_results) ]
#dividing each column by num_times and rounding it to get true round prediction
final_pred = [round(x / num_times) for x in col_totals]
#associating prediction results with team names and displaying them as a dataframe:
current_regular_season[0]['RoundPrediction'] = final_pred
prediction_df = current_regular_season[0][['Team','RoundPrediction']]
#sorting rankings
prediction_df = prediction_df.sort_values(by=['RoundPrediction'], ascending=False)
prediction_df.head(100)
Rk | Team | RoundPrediction |
---|---|---|
15 | Nashville Predators | 5 |
1 | Florida Panthers | 4 |
5 | Minnesota Wild | 4 |
8 | Tampa Bay Lightning | 4 |
2 | Colorado Avalanche | 3 |
3 | Carolina Hurricanes | 2 |
4 | Toronto Maple Leafs | 2 |
9 | New York Rangers | 2 |
10 | Boston Bruins | 2 |
12 | Pittsburgh Penguins | 2 |
16 | Dallas Stars | 2 |
6 | Calgary Flames | 1 |
7 | St. Louis Blues | 1 |
11 | Edmonton Oilers | 1 |
13 | Washington Capitals | 1 |
14 | Los Angeles Kings | 1 |
My model says the Nashville Predators will win the Stanley Cup this year. Unfortunately, this is not possible, since they just lost in the first round 4 games to 0. We can also see that the Panthers, Wild, and Lightning have the next best odds to win. Of these teams, I think the Panthers will win, because they unfortunately beat the Capitals in the first round. Interestingly, my model predicted the Capitals and Kings to be first-round exits, which sadly just happened in real life.
Again, it should be noted that the model does not know the playoff structure, which is why it says multiple teams can end up in round 4, when only one can (the runner-up). Figuring out this bracket structure is out of the scope of this project. Additionally, the model was given every category with no prior weighting, so something like penalty minutes per game could influence the prediction as much as points percentage. In reality, points percentage is probably more important to a team's playoff success than penalty minutes per game. However, figuring out how to weight individual regular season statistics is very complex and out of the scope of this project. This also possibly explains why the model said the Nashville Predators would be Stanley Cup Champions this season, as they had the most penalty minutes per game. So, this could highlight a trend in previous years where teams with a lot of penalties perform well in the playoffs. Looking into this is out of the scope of the project, but it could be interesting to explore in future research.
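One way to check how a fitted tree actually used the statistics is its feature_importances_ attribute, which reports how much each column drove the splits, for example whether PIM/G mattered as much as PTS%. A sketch on synthetic data, with feature names mirroring the columns used above:

```python
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# Same column names as the real independent data; the data itself is
# synthetic, so the importances below are only illustrative.
features = ['AvAge', 'PTS%', 'PIM/G', 'S%', 'SV%', 'SRS', 'SOS',
            'GF/G', 'GA/G', 'PP%', 'PK%', 'WinPctLast10']
X, y = make_classification(n_samples=320, n_features=len(features),
                           n_informative=6, n_classes=5, random_state=0)
dtc = DecisionTreeClassifier(random_state=0).fit(X, y)

# Importances sum to 1; higher means the feature drove more splits.
for name, imp in sorted(zip(features, dtc.feature_importances_),
                        key=lambda pair: -pair[1]):
    print(f"{name}: {imp:.3f}")
```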
The main thing we saw from all this data analysis is that we had significant evidence to reject the null hypothesis that the NHL playoffs are random, in favor of the alternative hypothesis that outside factors help predict how well a team will do in the playoffs. However, we also saw that our best model was only able to predict what round a team will reach around 46 percent of the time. This is not very good, and it is not a model that should be used for betting purposes or even for fun. Additionally, my results may not be fully valid, because the models did not account for the NHL bracket structure and gave every regular season statistic equal footing (for example, a team's average age and its points percentage entered the model on equal terms). Accounting for these things added complexities that were out of the scope of the project. Also, when I did the t-tests, I gave equal likelihood to every possible number of wins and every possible round. This was done after a recommendation from the course staff, but it is not truly accurate, because at the end of the playoffs there will only be 8 teams that end in round 1, 4 in round 2, 2 in round 3, 1 in round 4 (the runner-up), and 1 in round 5 (the winner). So, this could mean my results are not statistically significant. Again, this is a bit out of the scope of this project, but my results should be taken with a grain of salt.
My viewpoint is that the hockey playoffs appear not to be completely random, but they are basically random, and anything can happen. I also think the difference between completely random and basically random is not a big one for sports fans. However, the NHL being unpredictable is not a bad thing. The NHL playoffs, and sports in general, are so great to watch precisely because anything can happen. If we could accurately predict the winner every time, there would be no point for the athletes to play the game. Randomness is what makes sports exciting.
Overall, this project was very fun to do and gave a lot of insight into data science and the randomness of the NHL. In the future, I can look to make a model that considers the NHL bracket structure and look at additional possible factors that may impact a team's performance in the playoffs. This tutorial mostly served as a starting point that can be explored further with additional research. Thank you for reading, and I hope this tutorial gave you a bit of insight into the randomness of the NHL and maybe some data science techniques.