Jupyter Notebook#

Lect 16: Subset Selection#

# Everyone's favorite standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# First, we're going to do all the data loading we've had for a while for this data set
auto = pd.read_csv('../../DataSets/Auto.csv')
auto = auto.replace('?', np.nan)
auto = auto.dropna()
auto.horsepower = auto.horsepower.astype('int')

#this shuffles my data set in advance so that i don't need to worry about it later 
auto = auto.sample(frac=1).reset_index(drop=True)


auto.head()

Let’s try to run subset selection on the auto data set! We’re going to use cylinders, horsepower, weight, and acceleration to predict mpg.

inputvars = ['cylinders','horsepower','weight', 'acceleration']

The first tool we are going to use is the itertools package, which gives us a way to get subsets of whatever size we want using the combinations command.

from itertools import combinations

The weird thing is it’s an iterator, so if I just try to print out what I want, it’s not helpful to me.

combinations(inputvars,2)

But if I use it in a for loop it does what I want!

for x in combinations(inputvars,2):
    print(x)

Here’s some code stolen from the last few days to run linear regression on a subset of the input variables and get the training error. Note that this has no train/test split.

def myscore(df,listofvars, outputvar = 'mpg'):
    '''
    This code returns the training mean squared error of a linear regression model. No train test split or anything like that. 
    '''
    if listofvars == 'null model':
        # Predict the mean of the output variable for any input 
        yhat = np.full(auto.shape[0],np.average(auto.mpg))
        return mean_squared_error(auto.mpg, yhat)
    
    else:
        X = df[list(listofvars)]
        y = df[outputvar]
        
        #build linear regression model
        model = LinearRegression()
        
        # fit model
        model.fit(X,y)

        #predict y
        ypred = model.predict(X)

        #view mean absolute error
        return mean_squared_error(y,ypred)
    

myvars = ('cylinders', 'acceleration')
print(f"MSE for the (cylinders, acceleration) model: {myscore(auto,myvars)}")

print(f"MSE for the null model: {myscore(auto,'null model')}")

Do this: Modify the code below by using the myscore function to get the training error for each possible model of size \(k=2\).

def getBestModel_df(df, k=2, outputvar = 'mpg'):
    '''
    This function will return the best model for a given number of input variables
    '''
    myvars = []
    myscores = []
    
    if k == 0:
        return pd.DataFrame({'Vars':'null model', 'Train Score':[myscore(auto,'null model')]})
    else:

        for Xs in combinations(inputvars,k):
            myvars.append(Xs) 
            myscores.append(None) #<--- Fix this None
                
        myResults = pd.DataFrame({'Vars':myvars, 'Train Score':myscores})
        return myResults

print("k = 2")
print(getBestModel_df(auto, k=2))
print("\n")
print("k = 0, so this is the null model")
print(getBestModel_df(auto, k=0))

Q: What is \(M_2\)? That is, what is the set of variables that give the lowest training error over all sets of size 2? Write a command to pull this automatically from the table above.

Hint: the command DataFrame.idxmin() will give you the row index of the minimum entry in a pandas series.

# Your code here 

def getBestModel(df, k=2, outputvar = 'mpg'):
    '''
    This function will return the best model for a given number of input variables
    '''
    myResults = getBestModel_df(df, k, outputvar)
    
    # Add code here to find the index of the minimum training score
    M_k = None
    return M_k

print(getBestModel(auto, k=2))

# Check to make sure this is right :-P
getBestModel(auto, k=2) == ('horsepower', 'weight')

Now, once we’ve found all the \(M_k\), we want to return the best model using the testing error. So, here’s a function that will take in a list of variables and give you the 5-fold CV score.

def myscore_cv(df,listofvars, outputvar = 'mpg'):


    if listofvars == 'null model':
        # Predict the mean of the output variable for any input
        test_scores = [] 
        for (train,test) in KFold(5).split(df):
            y_test = df[outputvar].iloc[train]
            yhat = np.full(y_test.shape[0],np.average(y_test))
            test_scores.append(mean_squared_error(y_test, yhat))
        return np.average(test_scores)
    else:
        X = df[list(listofvars)]
        y = df[outputvar]
        #build linear regression model
        model = LinearRegression()
        
        #use 5-fold CV to evaluate model
        scores = cross_val_score(model, X,y, 
                                scoring='neg_mean_squared_error',
                                cv=5)

        #view mean absolute error
        return np.average(np.absolute(scores))
    

myvars = ('cylinders', 'acceleration')
myscore_cv(auto,myvars)

Do this: Put it all together. Finish the for loop that runs through \(k=0\) to \(k=5\), and gets the set of variables giving the best size \(k\) model, and then computs the test error for each model. Which model would be returned by the best subset selection method?

for k in range(5):
    M_k = None #<-- Use the getBestModel function here to get the list of variables
    print(f"Best {k} variable model: {M_k}")
    M_k_CV = 0 #<-- Use the myscore_cv function here to get the test error
    print(f"CV Score: {M_k_CV:0.3f}")

Congratulations, we’re done!#

Written by Dr. Liz Munch, Michigan State University

Creative Commons License
This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.