import pandas as pd
import numpy as np
import scipy as sp
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import scale
%matplotlib inline
import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')
There are 14 predictors in the data set, the data set contains median house value for 506 neighborhoods around Boston city.
# Data from R ISLR package - write.csv(Boston, "Boston.csv", col.names = FALSE)
boston_df = pd.read_csv("Data/Boston.csv")
boston_df.head()
# LSTAT - % of population with low status; MEDV - median value of home
ax = boston_df.plot(x="lstat", y="medv", style="o", xlim=[0, 40])
ax.set_ylabel("medv")
# The statsmodels library provides a small subset of models, but has more emphasis on
# parameter estimation and statistical testing. The summary output is similar to R's
# summary function.
# X is an "array" of column values, y is a single column value
# X = boston_df[["lstat"]].values
X = boston_df.lstat.reshape(-1,1)
X = sm.add_constant(X) # add the intercept term
#y = boston_df["medv"].values
y = boston_df.medv
ols = sm.OLS(y, X).fit()
ols.summary()
# Scikit Learn provides a larger number of models, but has more of a Machine Learning POV
# and doesn't come with the statistical testing data shown above. However, it produces an
# identical linear model as shown below:
reg = LinearRegression()
X = boston_df.lstat.reshape(-1,1)
y = boston_df.medv
reg.fit(X, y)
(reg.intercept_, reg.coef_)
# Drawing the regression line on top of the scatterplot
ax = boston_df.plot(x="lstat", y="medv", style="o", xlim=[0, 40])
ax.set_ylabel("medv")
xs = range(int(np.min(X)), int(np.max(X)))
ys = [reg.predict([x]) for x in xs]
ax.plot(xs, ys, 'r', linewidth=2.5)
# Prediction
test_data = [[5], [10], [15]]
reg.predict(test_data)
boston_df.pred1 = reg.predict(X)
boston_df.resid1 = boston_df.medv - boston_df.pred1
sns.regplot(boston_df.pred1, boston_df.resid1)
# regression with 2 input columns
X = boston_df[["lstat", "age"]]
reg2 = LinearRegression()
reg2.fit(X, y)
(reg2.intercept_, reg2.coef_)
# regression using all input columns
xcols = boston_df.columns[0:-1]
X = boston_df[xcols]
reg3 = LinearRegression()
reg3.fit(X, y)
(reg3.intercept_, reg3.coef_)
# Plotting a fitted regression with R returns 4 graphs - Residuals vs Fitted, Normal Q-Q,
# Scale-Location (Standardized Residuals vs Fitted), and Residuals vs Leverage. Only the
# Q-Q plot is available from statsmodels. The residuals vs Fitted function is implemented
# below and is used for plot #1 and #3. The Residuals vs Leverage is TBD.
def residuals_vs_fitted(fitted, residuals, xlabel, ylabel):
plt.subplot(111)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
plt.scatter(fitted, residuals)
polyline = np.poly1d(np.polyfit(fitted, residuals, 2)) # model non-linearity with quadratic
xs = range(int(np.min(fitted)), int(np.max(fitted)))
plt.plot(xs, polyline(xs), color='r', linewidth=2.5)
def qq_plot(residuals):
sm.qqplot(residuals)
def standardize(xs):
xmean = np.mean(xs)
xstd = np.std(xs)
return (xs - xmean) / xstd
fitted = reg3.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")
fig = sm.qqplot(residuals, dist="norm", line="r")
residuals_vs_fitted(fitted, std_residuals, "Fitted", "Std.Residuals")
Python offers formula parsing support via the Patsy toolkit. StatsModels uses Patsy to provide formula parsing support for its models. But this can be easily implemented as temporary columns in Pandas dataframes as shown below.
# fitting medv ~ lstat * age
boston_df["lstat*age"] = boston_df["lstat"] * boston_df["age"]
reg5 = LinearRegression()
X = boston_df[["lstat", "age", "lstat*age"]]
y = boston_df["medv"]
reg5.fit(X, y)
(reg5.intercept_, reg5.coef_)
fitted = reg5.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")
# fitting medv ~ lstat + I(lstat^2)
boston_df["lstat^2"] = boston_df["lstat"] ** 2
reg6 = LinearRegression()
X = boston_df[["lstat", "lstat^2"]]
y = boston_df["medv"]
reg6.fit(X, y)
# save the predicted ys for given xs for future plot
lstats = boston_df["lstat"].values
xs = range(int(np.min(lstats)), int(np.max(lstats)))
ys6 = [reg6.predict([x, x*x]) for x in xs]
(reg6.intercept_, reg6.coef_)
fitted = reg6.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")
# fitting medv ~ poly(lstat,4). We already have lstat^2 and lstat from previous
boston_df["lstat^4"] = np.power(boston_df["lstat"], 4)
boston_df["lstat^3"] = np.power(boston_df["lstat"], 4)
X = boston_df[["lstat^4", "lstat^3", "lstat^2", "lstat"]]
y = boston_df["medv"]
reg7 = LinearRegression()
reg7.fit(X, y)
ys7 = [reg7.predict([x**4, x**3, x**2, x]) for x in xs]
(reg7.intercept_, reg7.coef_)
fitted = reg7.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted, residuals, "Fitted", "Residuals")
# Plot the different lines. Not that the green line (reg7) follows the distribution
# better than the red line (reg6).
ax = boston_df.plot(x="lstat", y="medv", style="o")
ax.set_ylabel("medv")
plt.plot(xs, ys6, color='r', linewidth=2.5)
plt.plot(xs, ys7, color='g', linewidth=2.5)
# Data from ISLR package: write.csv(Carseats, 'Carseats.csv', col.names=FALSE)
carseats_df = pd.read_csv("Data/Carseats.csv")
carseats_df.head()
# convert non-numeric to factors
carseats_df["ShelveLoc"] = pd.factorize(carseats_df["ShelveLoc"])[0]
carseats_df["Urban"] = pd.factorize(carseats_df["Urban"])[0]
carseats_df["US"] = pd.factorize(carseats_df["US"])[0]
# Sales ~ . + Income:Advertising + Age:Price
carseats_df["Income:Advertising"] = carseats_df["Income"] * carseats_df["Advertising"]
carseats_df["Age:Price"] = carseats_df["Age"] * carseats_df["Price"]
X = carseats_df[carseats_df[1:].columns]
y = carseats_df["Sales"]
reg = LinearRegression()
reg.fit(X, y)
(reg.intercept_, reg.coef_)
# R has a contrasts() function that shows how factors are encoded by default. We can do
# this manually using scikit-learn's OneHotEncoder
from sklearn.preprocessing import OneHotEncoder
colnames = ["ShelveLoc", "Urban", "US"]
enc = OneHotEncoder()
X = carseats_df[colnames]
enc.fit(X)
X_tr = enc.transform(X).toarray()
colnos = enc.n_values_
colnames_tr = []
for (idx, colname) in enumerate(colnames):
for i in range(0, colnos[idx]):
colnames_tr.append(colname + "_" + str(i))
col = 0
for colname_tr in colnames_tr:
carseats_df[colname_tr] = X_tr[:, col]
col = col + 1
del carseats_df["ShelveLoc"]
del carseats_df["Urban"]
del carseats_df["US"]
carseats_df[colnames_tr].head()
def regplot(x, y, xlabel, ylabel, dot_style, line_color):
x = x.values
y = y.values
reg = LinearRegression()
X = np.matrix(x).T
reg.fit(X, y)
ax = plt.scatter(x, y, marker=dot_style)
plt.xlabel(xlabel)
plt.ylabel(ylabel)
xs = range(int(np.min(x)), int(np.max(x)))
ys = [reg.predict(x) for x in xs]
plt.plot(xs, ys, color=line_color, linewidth=2.5)
regplot(carseats_df["Price"], carseats_df["Sales"], "Price", "Sales", 'o', 'r')
import statsmodels
# %load ../standard_import.txt
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
from sklearn.preprocessing import scale
import sklearn.linear_model as skl_lm
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
import statsmodels.formula.api as smf
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_seq_items', None)
#%config InlineBackend.figure_formats = {'pdf',}
%matplotlib inline
import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')
Datasets available on http://www-bcf.usc.edu/~gareth/ISL/data.html, which is introduced in chapter 2
advertising = pd.read_csv('Data/Advertising.csv', usecols=[1,2,3,4])
advertising
credit = pd.read_csv('Data/Credit.csv', usecols=list(range(1,12)))
credit['Student2'] = credit.Student.map({'No':0, 'Yes':1})
credit.head(3)
auto = pd.read_csv('Data/Auto.csv', na_values='?').dropna()
auto.info()
sns.regplot(advertising.TV, advertising.Sales, order=1, ci=None, scatter_kws={'color':'r'})
plt.xlim(-10,310)
plt.ylim(ymin=0);
Residual sum of squares
# Regression coefficients (Ordinary Least Squares)
regr = skl_lm.LinearRegression()
X = scale(advertising.TV, with_mean=True, with_std=False).reshape(-1,1)
y = advertising.Sales
regr.fit(X,y)
print(regr.intercept_)
print(regr.coef_)
print(regr.score(X,y))
# Create grid coordinates for plotting
B0 = np.linspace(regr.intercept_-2, regr.intercept_+2, 50)
B1 = np.linspace(regr.coef_-0.02, regr.coef_+0.02, 50)
xx, yy = np.meshgrid(B0, B1, indexing='xy')
Z = np.zeros((B0.size,B1.size))
# Calculate Z-values (RSS) based on grid of coefficients
for (i,j),v in np.ndenumerate(Z):
Z[i,j] =((y - (xx[i,j]+X.ravel()*yy[i,j]))**2).sum()/1000
# Minimized RSS
min_RSS = r'$\beta_0$, $\beta_1$ for minimized RSS'
min_rss = np.sum((regr.intercept_+regr.coef_*X - y.reshape(-1,1))**2)/1000
min_rss
fig = plt.figure(figsize=(15,6))
fig.suptitle('RSS - Regression coefficients', fontsize=20)
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122, projection='3d')
# Left plot
CS = ax1.contour(xx, yy, Z, cmap=cm.Set1, levels=[2.15, 2.2, 2.3, 2.5, 3])
ax1.scatter(regr.intercept_, regr.coef_[0], c='r', label=min_RSS)
ax1.clabel(CS, inline=True, fontsize=10, fmt='%1.1f')
# Right plot
ax2.plot_surface(xx, yy, Z, rstride=3, cstride=3, alpha=0.3)
ax2.contour(xx, yy, Z, zdir='z', offset=Z.min(), cmap=cm.Set1, alpha=0.4, levels=[2.15, 2.2, 2.3, 2.5, 3])
ax2.scatter3D(regr.intercept_, regr.coef_[0], min_rss, c='r', label=min_RSS)
ax2.set_zlabel('RSS')
ax2.set_zlim(Z.min(),Z.max())
ax2.set_ylim(0.02,0.07)
# settings common to both plots
for ax in fig.axes:
ax.set_xlabel(r'$\beta_0$', fontsize=17)
ax.set_ylabel(r'$\beta_1$', fontsize=17)
ax.set_yticks([0.03,0.04,0.05,0.06])
ax.legend()
It can define confidence level, t statistic, and p-value
est = smf.ols('Sales ~ TV', advertising).fit()
est.summary().tables[1]
# RSS with regression coefficients
((advertising.Sales - (est.params[0] + est.params[1]*advertising.TV))**2).sum()/1000
regr = skl_lm.LinearRegression()
X = advertising.TV.reshape(-1,1)
y = advertising.Sales
regr.fit(X,y)
print(regr.intercept_)
print(regr.coef_)
Sales_pred = regr.predict(X)
r2_score(y, Sales_pred)
est = smf.ols('Sales ~ Radio', advertising).fit()
est.summary().tables[1]
est = smf.ols('Sales ~ Newspaper', advertising).fit()
est.summary().tables[1]
est = smf.ols('Sales ~ TV + Radio + Newspaper', advertising).fit()
est.summary()
# Using the seaborn correlation plot.
sns.corrplot(advertising)
# Using Numpy
np.corrcoef(advertising, rowvar=None).round(3)
regr = skl_lm.LinearRegression()
X = advertising[['Radio', 'TV']].as_matrix()
y = advertising.Sales
regr.fit(X,y)
print(regr.coef_)
print(regr.intercept_)
# What are the min/max values of Radio & TV?
# Use these values to set up the grid for plotting.
advertising[['Radio', 'TV']].describe()
# Create a coordinate grid
Radio = np.arange(0,50)
TV = np.arange(0,300)
B1, B2 = np.meshgrid(Radio, TV, indexing='xy')
Z = np.zeros((TV.size, Radio.size))
for (i,j),v in np.ndenumerate(Z):
Z[i,j] =(regr.intercept_ + B1[i,j]*regr.coef_[0] + B2[i,j]*regr.coef_[1])
# Create plot
fig = plt.figure(figsize=(10,6))
fig.suptitle('Regression: Sales ~ Radio + TV Advertising', fontsize=20)
ax = axes3d.Axes3D(fig)
ax.plot_surface(B1, B2, Z, rstride=10, cstride=5, alpha=0.4)
ax.scatter3D(advertising.Radio, advertising.TV, advertising.Sales, c='r')
ax.set_xlabel('Radio')
ax.set_xlim(0,50)
ax.set_ylabel('TV')
ax.set_ylim(ymin=0)
ax.set_zlabel('Sales')
sns.pairplot(credit[['Balance','Age','Cards','Education','Income','Limit','Rating']])
est = smf.ols('Balance ~ Gender', credit).fit()
est.summary().tables[1]
est = smf.ols('Balance ~ Ethnicity', credit).fit()
est.summary().tables[1]
est = smf.ols('Sales ~ TV + Radio + TV*Radio', advertising).fit()
est.summary().tables[1]
est1 = smf.ols('Balance ~ Income + Student2', credit).fit()
regr1 = est1.params
est2 = smf.ols('Balance ~ Income + Income*Student2', credit).fit()
regr2 = est2.params
print('Regression 1 - without interaction term')
print(regr1)
print('\nRegression 2 - with interaction term')
print(regr2)
# Income (x-axis)
income = np.linspace(0,150)
# Balance without interaction term (y-axis)
student1 = np.linspace(regr1['Intercept']+regr1['Student2'],
regr1['Intercept']+regr1['Student2']+150*regr1['Income'])
non_student1 = np.linspace(regr1['Intercept'], regr1['Intercept']+150*regr1['Income'])
# Balance with iteraction term (y-axis)
student2 = np.linspace(regr2['Intercept']+regr2['Student2'],
regr2['Intercept']+regr2['Student2']+150*(regr2['Income']+regr2['Income:Student2']))
non_student2 = np.linspace(regr2['Intercept'], regr2['Intercept']+150*regr2['Income'])
# Create plot
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(12,5))
ax1.plot(income, student1, 'r', income, non_student1, 'k')
ax2.plot(income, student2, 'r', income, non_student2, 'k')
for ax in fig.axes:
ax.legend(['student', 'non-student'], loc=2)
ax.set_xlabel('Income')
ax.set_ylabel('Balance')
ax.set_ylim(ymax=1550)
# With Seaborn's regplot() you can easily plot higher order polynomials.
plt.scatter(auto.horsepower, auto.mpg, facecolors='None', edgecolors='k')
sns.regplot(auto.horsepower, auto.mpg, ci=None, label='Linear', scatter=False)
sns.regplot(auto.horsepower, auto.mpg, ci=None, label='Degree 2', order=2, scatter=False)
sns.regplot(auto.horsepower, auto.mpg, ci=None, label='Degree 5', order=5, scatter=False)
plt.legend()
plt.ylim(5,55)
plt.xlim(40,240);
auto['horsepower2'] = auto.horsepower**2
auto.head(3)
est = smf.ols('mpg ~ horsepower + horsepower2', auto).fit()
est.summary().tables[1]
regr = skl_lm.LinearRegression()
# Linear fit
X = auto.horsepower.reshape(-1,1)
y = auto.mpg
regr.fit(X, y)
auto['pred1'] = regr.predict(X)
auto['resid1'] = auto.mpg - auto.pred1
# Quadratic fit
X2 = auto[['horsepower', 'horsepower2']].as_matrix()
regr.fit(X2, y)
auto['pred2'] = regr.predict(X2)
auto['resid2'] = auto.mpg - auto.pred2
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(12,5))
# Left plot
sns.regplot(auto.pred1, auto.resid1, lowess=True, ax=ax1, line_kws={'color':'r', 'lw':1})
ax1.hlines(0,xmin=ax1.xaxis.get_data_interval()[0], xmax=ax1.xaxis.get_data_interval()[1], linestyles='dotted')
ax1.set_title('Residual Plot for Linear Fit')
# Right plot
sns.regplot(auto.pred2, auto.resid2, lowess=True, line_kws={'color':'r', 'lw':1}, ax=ax2)
ax2.hlines(0,xmin=ax2.xaxis.get_data_interval()[0], xmax=ax2.xaxis.get_data_interval()[1], linestyles='dotted')
ax2.set_title('Residual Plot for Quadratic Fit')
for ax in fig.axes:
ax.set_xlabel('Fitted values')
ax.set_ylabel('Residuals')
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(12,5))
# Left plot
ax1.scatter(credit.Limit, credit.Age, facecolor='None', edgecolor='r')
ax1.set_ylabel('Age')
# Right plot
ax2.scatter(credit.Limit, credit.Rating, facecolor='None', edgecolor='r')
ax2.set_ylabel('Rating')
for ax in fig.axes:
ax.set_xlabel('Limit')
ax.set_xticks([2000,4000,6000,8000,12000])
y = credit.Balance
# Regression for left plot
X = credit[['Age', 'Limit']].as_matrix()
regr1 = skl_lm.LinearRegression()
regr1.fit(scale(X.astype('float'), with_std=False), y)
print('Age/Limit\n',regr1.intercept_)
print(regr1.coef_)
# Regression for right plot
X2 = credit[['Rating', 'Limit']].as_matrix()
regr2 = skl_lm.LinearRegression()
regr2.fit(scale(X2.astype('float'), with_std=False), y)
print('\nRating/Limit\n',regr2.intercept_)
print(regr2.coef_)
# Create grid coordinates for plotting
B_Age = np.linspace(regr1.coef_[0]-3, regr1.coef_[0]+3, 100)
B_Limit = np.linspace(regr1.coef_[1]-0.02, regr1.coef_[1]+0.02, 100)
B_Rating = np.linspace(regr2.coef_[0]-3, regr2.coef_[0]+3, 100)
B_Limit2 = np.linspace(regr2.coef_[1]-0.2, regr2.coef_[1]+0.2, 100)
X1, Y1 = np.meshgrid(B_Limit, B_Age, indexing='xy')
X2, Y2 = np.meshgrid(B_Limit2, B_Rating, indexing='xy')
Z1 = np.zeros((B_Age.size,B_Limit.size))
Z2 = np.zeros((B_Rating.size,B_Limit2.size))
Limit_scaled = scale(credit.Limit.astype('float'), with_std=False)
Age_scaled = scale(credit.Age.astype('float'), with_std=False)
Rating_scaled = scale(credit.Rating.astype('float'), with_std=False)
# Calculate Z-values (RSS) based on grid of coefficients
for (i,j),v in np.ndenumerate(Z1):
Z1[i,j] =((y - (regr1.intercept_ + X1[i,j]*Limit_scaled + Y1[i,j]*Age_scaled))**2).sum()/1000000
for (i,j),v in np.ndenumerate(Z2):
Z2[i,j] =((y - (regr2.intercept_ + X2[i,j]*Limit_scaled+Y2[i,j]*Rating_scaled))**2).sum()/1000000
fig = plt.figure(figsize=(12,5))
fig.suptitle('RSS - Regression coefficients', fontsize=20)
ax1 = fig.add_subplot(121)
ax2 = fig.add_subplot(122)
min_RSS = r'$\beta_0$, $\beta_1$ for minimized RSS'
# Left plot
CS = ax1.contour(X1, Y1, Z1, cmap=cm.Set1, levels=[21.25, 21.5, 21.8])
ax1.scatter(regr1.coef_[1], regr1.coef_[0], c='r', label=min_RSS)
ax1.clabel(CS, inline=True, fontsize=10, fmt='%1.1f')
ax1.set_ylabel(r'$\beta_{Age}$', fontsize=17)
# Right plot
CS = ax2.contour(X2, Y2, Z2, cmap=cm.Set1, levels=[21.5, 21.8])
ax2.scatter(regr2.coef_[1], regr2.coef_[0], c='r', label=min_RSS)
ax2.clabel(CS, inline=True, fontsize=10, fmt='%1.1f')
ax2.set_ylabel(r'$\beta_{Rating}$', fontsize=17)
ax2.set_xticks([-0.1, 0, 0.1, 0.2])
for ax in fig.axes:
ax.set_xlabel(r'$\beta_{Limit}$', fontsize=17)
ax.legend()
est_Age = smf.ols('Age ~ Rating + Limit', credit).fit()
est_Rating = smf.ols('Rating ~ Age + Limit', credit).fit()
est_Limit = smf.ols('Limit ~ Age + Rating', credit).fit()
print(1/(1-est_Age.rsquared))
print(1/(1-est_Rating.rsquared))
print(1/(1-est_Limit.rsquared))