3.6 Lab: Linear Regression

#load csv into dataframe
%matplotlib inline
import pandas as pd

boston_df = pd.read_csv("/Users/rcphillips/Documents/Data/Boston.csv")

#inspect data for quality
boston_df.head()

	crim	zn	indus	nox	rm	age	dis	rad	tax	ptratio	black	lstat	medv
0	0.00632	18.0	2.31	0.538	6.575	65.2	4.0900	1	296	15.3	396.90	4.98	24.0
1	0.02731	0.0	7.07	0.469	6.421	78.9	4.9671	2	242	17.8	396.90	9.14	21.6
2	0.02729	0.0	7.07	0.469	7.185	61.1	4.9671	2	242	17.8	392.83	4.03	34.7
3	0.03237	0.0	2.18	0.458	6.998	45.8	6.0622	3	222	18.7	394.63	2.94	33.4
4	0.06905	0.0	2.18	0.458	7.147	54.2	6.0622	3	222	18.7	396.90	5.33	36.2

#import modelling package
from sklearn import linear_model

#predict medv using lstat
import numpy as np
lm = linear_model.LinearRegression()
#predictor value:
X=np.transpose(np.matrix(boston_df.lstat.values))
#value being estimated:
y=np.transpose(np.matrix(boston_df.medv.values))

lm.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

#gotta reshape the data  to fit it into the sklearn inputs
np.shape(boston_df.medv.values)

(506,)

np.shape(np.matrix(boston_df.medv.values))

(1, 506)

np.shape(np.transpose(np.matrix(boston_df.medv.values)))

(506, 1)

predictions=lm.predict(np.transpose(np.matrix(boston_df.lstat.values)))

side_by_side = pd.DataFrame(boston_df.medv)

side_by_side['predictions']=predictions

side_by_side['residuals']=side_by_side.medv-side_by_side.predictions

side_by_side

	medv	predictions	residuals
0	24.0	29.822595	-5.822595
1	21.6	25.870390	-4.270390
2	34.7	30.725142	3.974858
3	33.4	31.760696	1.639304
4	36.2	29.490078	6.709922
5	28.7	29.604084	-0.904084
6	22.9	22.744727	0.155273
7	27.1	16.360396	10.739604
8	16.5	6.118864	10.381136
9	18.9	18.307997	0.592003
10	15.0	15.125332	-0.125332
11	18.9	21.946686	-3.046686
12	21.7	19.628566	2.071434
13	20.4	26.706433	-6.306433
14	18.2	24.806335	-6.606335
15	19.9	26.506923	-6.606923
16	23.1	28.302516	-5.202516
17	17.5	20.616617	-3.116617
18	20.2	23.447764	-3.247764
19	18.2	23.837284	-5.637284
20	13.6	14.583803	-0.983803
21	19.6	21.414658	-1.814658
22	15.2	16.768917	-1.568917
23	14.5	15.666860	-1.166860
24	15.6	19.068036	-3.468036
25	13.9	18.868526	-4.968526
26	16.6	20.483610	-3.883610
27	14.8	18.136988	-3.336988
28	18.4	22.393209	-3.993209
29	21.0	23.172250	-2.172250
...	...	...	...
476	16.7	16.806919	-0.106919
477	12.0	10.888111	1.111889
478	14.6	17.424451	-2.824451
479	21.4	22.098694	-0.698694
480	23.0	24.350311	-1.350311
481	23.7	27.200459	-3.500459
482	25.0	27.893995	-2.893995
483	21.8	24.654327	-2.854327
484	20.6	21.880183	-1.280183
485	21.2	24.502319	-3.302319
486	19.1	20.322102	-1.222102
487	20.6	23.675776	-3.075776
488	15.2	17.395950	-2.195950
489	7.0	11.781158	-4.781158
490	8.1	6.356376	1.743624
491	13.6	17.386449	-3.786449
492	20.1	21.870682	-1.770682
493	21.8	23.143748	-1.343748
494	24.5	21.642670	2.857330
495	23.1	17.832972	5.267028
496	19.7	14.469798	5.230202
497	18.3	21.158145	-2.858145
498	21.2	22.279203	-1.079203
499	17.5	20.208096	-2.708096
500	16.8	20.939634	-4.139634
501	22.4	25.366864	-2.966864
502	20.6	25.927393	-5.327393
503	23.9	29.195563	-5.295563
504	22.0	28.397521	-6.397521
505	11.9	27.067452	-15.167452

506 rows × 3 columns

#extract interesting model features.
coef=lm.coef_
print('Coefficients: \n', coef)
from scipy import stats
slope, intercept, r_value, p_value, std_err = stats.linregress(list(boston_df.lstat.values), list(boston_df.medv.values))

print('slope,            intercept,      r_value,       p_value,        std_err' )
print(slope, intercept, r_value, p_value, std_err )

Coefficients: 
 [[-0.95004935]]
slope,            intercept,      r_value,       p_value,        std_err
-0.950049353758 34.5538408794 -0.737662726174 5.08110339439e-88 0.0387334162126

#plot data, and line of best fit
import matplotlib.pyplot as plt
lstat = list(boston_df.lstat.values)
medv= list(boston_df.medv.values)
a=plt.scatter(lstat,medv)
plt.plot(lstat,predictions, color='red')

[&lt;matplotlib.lines.Line2D at 0x1108115c0&gt;]

png

#plotting residuals
import seaborn as sns

sns.regplot(side_by_side.medv, side_by_side.predictions)

&lt;matplotlib.axes._subplots.AxesSubplot at 0x1142f0128&gt;

png

sns.regplot(side_by_side.medv, side_by_side.residuals)

&lt;matplotlib.axes._subplots.AxesSubplot at 0x114309400&gt;

png

Multiple Linear Regression

#predict medv using lstat and crim
import numpy as np
lm = linear_model.LinearRegression()
#predictor value:
X=np.transpose(np.matrix(boston_df.lstat.values))
#value being estimated:
a=np.transpose(np.matrix(boston_df.medv.values))
b=np.transpose(np.matrix(boston_df.crim.values))
y=np.append(a,b,1)

lm.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

coef=lm.coef_

import statsmodels.formula.api as smf

smf.OLS(X,y)

&lt;statsmodels.regression.linear_model.OLS at 0x114977080&gt;

results=smf.OLS('crim ~ crim ', boston_df).fit()

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

&lt;ipython-input-24-6996c1147a45&gt; in &lt;module&gt;()
----&gt; 1 results=smf.OLS('crim ~ crim ', boston_df).fit()


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
    629                  **kwargs):
    630         super(OLS, self).__init__(endog, exog, missing=missing,
--&gt; 631                                   hasconst=hasconst, **kwargs)
    632         if "weights" in self._init_keys:
    633             self._init_keys.remove("weights")


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
    524             weights = weights.squeeze()
    525         super(WLS, self).__init__(endog, exog, missing=missing,
--&gt; 526                                   weights=weights, hasconst=hasconst, **kwargs)
    527         nobs = self.exog.shape[0]
    528         weights = self.weights


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/regression/linear_model.py in __init__(self, endog, exog, **kwargs)
     93     """
     94     def __init__(self, endog, exog, **kwargs):
---&gt; 95         super(RegressionModel, self).__init__(endog, exog, **kwargs)
     96         self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
     97


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
    210 
    211     def __init__(self, endog, exog=None, **kwargs):
--&gt; 212         super(LikelihoodModel, self).__init__(endog, exog, **kwargs)
    213         self.initialize()
    214


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
     61         hasconst = kwargs.pop('hasconst', None)
     62         self.data = self._handle_data(endog, exog, missing, hasconst,
---&gt; 63                                       **kwargs)
     64         self.k_constant = self.data.k_constant
     65         self.exog = self.data.exog


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/base/model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
     86 
     87     def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
---&gt; 88         data = handle_data(endog, exog, missing, hasconst, **kwargs)
     89         # kwargs arrays could have changed, easier to just attach here
     90         for key in kwargs:


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/base/data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
    628     klass = handle_data_class_factory(endog, exog)
    629     return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
--&gt; 630                  **kwargs)


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/base/data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
     74             self.orig_endog = endog
     75             self.orig_exog = exog
---&gt; 76             self.endog, self.exog = self._convert_endog_exog(endog, exog)
     77 
     78         # this has side-effects, attaches k_constant and const_idx


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/base/data.py in _convert_endog_exog(self, endog, exog)
    471             raise ValueError("Pandas data cast to numpy dtype of object. "
    472                              "Check input data with np.asarray(data).")
--&gt; 473         return super(PandasData, self)._convert_endog_exog(endog, exog)
    474 
    475     @classmethod


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/base/data.py in _convert_endog_exog(self, endog, exog)
    310 
    311         # for consistent outputs if endog is (n,1)
--&gt; 312         yarr = self._get_yarr(endog)
    313         xarr = None
    314         if exog is not None:


/Users/rcphillips/anaconda3/lib/python3.5/site-packages/statsmodels/base/data.py in _get_yarr(self, endog)
    385             endog = data_util.struct_to_ndarray(endog)
    386         endog = np.asarray(endog)
--&gt; 387         if len(endog) == 1:  # never squeeze to a scalar
    388             if endog.ndim == 1:
    389                 return endog


TypeError: len() of unsized object

boston_df.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'black', 'lstat', 'medv'],
      dtype='object')

Linear Regression Example

3.6 Lab: Linear Regression

Multiple Linear Regression

Published

Category

Tags