
OLS Example with joint significance tests

In [1]: import numpy as np
   ....: from scipy import stats

In [2]: import statsmodels.api as sm

In [3]: import matplotlib
In [4]: import matplotlib.pyplot as plt

In [5]: from statsmodels.sandbox.regression.predstd import wls_prediction_std

Fix a random seed for these examples so they are reproducible:

In [6]: np.random.seed(9876789)

OLS: non-linear curve, but linear in the parameters

The mean function below includes sin(x1) and (x1 - 5)**2, so the fitted curve is non-linear in x1, but the model is linear in the coefficients beta, so ordinary least squares still applies.

In [7]: nsample = 50

In [8]: sig = 0.5

In [9]: x1 = np.linspace(0, 20, nsample)

In [10]: X = np.c_[x1, np.sin(x1), (x1-5)**2, np.ones(nsample)]
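Note that sm.OLS does not add an intercept automatically; the column of ones appended above serves as the constant term. As a sketch, the same design matrix could also be built with sm.add_constant (prepend=False keeps the constant as the last column, matching the layout used here):

X_alt = sm.add_constant(np.c_[x1, np.sin(x1), (x1 - 5)**2], prepend=False)
# np.allclose(X, X_alt) is True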

In [11]: beta = [0.5, 0.5, -0.02, 5.]

In [12]: y_true = np.dot(X, beta)

In [13]: y = y_true + sig * np.random.normal(size=nsample)

In [14]: plt.figure()
Out[14]: <matplotlib.figure.Figure at 0x6447b90>

In [15]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[15]: 
[<matplotlib.lines.Line2D at 0x62cf2d0>,
 <matplotlib.lines.Line2D at 0x62cfc10>]

In [16]: res = sm.OLS(y, X).fit()

In [17]: print(res.params)
[ 0.47040466  0.2931004  -0.01826292  5.24935422]

In [18]: print(res.bse)
[ 0.02853117  0.11215937  0.00250506  0.18499717]

In [19]: print(res.predict())
[  4.79278116   5.17262168   5.52726298   5.84073136   6.10281792   6.31075592   6.46967527   6.59175976   6.69424528   6.796588     6.91726779   7.07075203   7.26511865   7.50072896   7.77016828
   8.05946415   8.35038197   8.62342091   8.8610178    9.05043274   9.18584224   9.26929595   9.31037998   9.32464188   9.33103621   9.34881043   9.39434252   9.47845015   9.60461339   9.76840293
   9.95820778  10.15714295  10.34582361  10.50554994  10.62137947  10.68458209  10.69407437  10.65659757  10.58561009  10.49907624  10.41651482  10.3557922   10.33018692  10.34620807  10.4025259
  10.49019023  10.594101    10.69548911  10.77500019  10.81587443]

In [20]: prstd, iv_l, iv_u = wls_prediction_std(res)
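wls_prediction_std returns the standard error of a new observation together with the lower and upper bounds of the corresponding prediction interval. A minimal sketch of the same computation by hand, assuming the homoskedastic OLS fit res and the design X from above:

var_mean = (np.dot(X, res.cov_params()) * X).sum(axis=1)  # variance of the fitted mean at each row of X
prstd_manual = np.sqrt(res.mse_resid + var_mean)          # std error of a new observation
tq = stats.t.ppf(1 - 0.05 / 2., res.df_resid)             # two-sided 95% t quantile
iv_l_manual = res.fittedvalues - tq * prstd_manual
iv_u_manual = res.fittedvalues + tq * prstd_manual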

In [21]: plt.plot(x1, res.fittedvalues, 'r--.')
Out[21]: [<matplotlib.lines.Line2D at 0x62cb110>]

In [22]: plt.plot(x1, iv_u, 'r--')
Out[22]: [<matplotlib.lines.Line2D at 0x64476d0>]

In [23]: plt.plot(x1, iv_l, 'r--')
Out[23]: [<matplotlib.lines.Line2D at 0x57d8810>]

In [24]: plt.title('blue: true,   red: OLS')
Out[24]: <matplotlib.text.Text at 0x643b350>

In [25]: print(res.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.920
Model:                            OLS   Adj. R-squared:                  0.915
Method:                 Least Squares   F-statistic:                     175.9
Date:                Mon, 14 May 2012   Prob (F-statistic):           3.30e-25
Time:                        17:04:22   Log-Likelihood:                -38.308
No. Observations:                  50   AIC:                             84.62
Df Residuals:                      46   BIC:                             92.26
Df Model:                           3                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.4704      0.029     16.487      0.000         0.413     0.528
x2             0.2931      0.112      2.613      0.012         0.067     0.519
x3            -0.0183      0.003     -7.290      0.000        -0.023    -0.013
const          5.2494      0.185     28.375      0.000         4.877     5.622
==============================================================================
Omnibus:                        1.779   Durbin-Watson:                   2.359
Prob(Omnibus):                  0.411   Jarque-Bera (JB):                1.440
Skew:                           0.414   Prob(JB):                        0.487
Kurtosis:                       2.933   Cond. No.                         221.
==============================================================================

OLS with dummy variables

In [26]: sig = 1.

Suppose the observations come from 3 groups:

In [27]: xg = np.zeros(nsample, int)

In [28]: xg[20:40] = 1

In [29]: xg[40:] = 2

In [30]: print(xg)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2]

In [31]: dummy = (xg[:,None] == np.unique(xg)).astype(float)
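The line above builds the indicator (dummy) matrix by broadcasting: each column marks membership in one of the unique values of xg. A tiny worked example (values chosen here only for illustration):

xg_small = np.array([0, 0, 1, 2])
(xg_small[:, None] == np.unique(xg_small)).astype(float)
# array([[ 1.,  0.,  0.],
#        [ 1.,  0.,  0.],
#        [ 0.,  1.,  0.],
#        [ 0.,  0.,  1.]])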

Use group 0 as the benchmark (reference) group:

In [32]: X = np.c_[x1, dummy[:,1:], np.ones(nsample)]

In [33]: beta = [1., 3, -3, 10]

In [34]: y_true = np.dot(X, beta)

In [35]: y = y_true + sig * np.random.normal(size=nsample)

In [36]: plt.figure()
Out[36]: <matplotlib.figure.Figure at 0x62cbb50>

In [37]: plt.plot(x1, y, 'o', x1, y_true, 'b-')
Out[37]: 
[<matplotlib.lines.Line2D at 0x4948590>,
 <matplotlib.lines.Line2D at 0x4942a50>]

In [40]: res2 = sm.OLS(y, X).fit()

In [41]: print(res2.params)
[ 0.98475721  3.44866047 -2.68110961  9.77981018]

In [42]: print(res2.bse)
[ 0.06801982  0.64590456  1.05239527  0.35213863]
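
The corresponding 95% confidence intervals can be read off the results object directly (a quick check, not part of the original session):

print(res2.conf_int())   # one [lower, upper] row per coefficient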

In [43]: print(res2.predict())

In [44]: prstd, iv_l, iv_u = wls_prediction_std(res2)

In [45]: plt.plot(x1, res2.fittedvalues, 'r--.')
Out[45]: [<matplotlib.lines.Line2D at 0x580c190>]

In [46]: plt.plot(x1, iv_u, 'r--')
Out[46]: [<matplotlib.lines.Line2D at 0x580c4d0>]

In [47]: plt.plot(x1, iv_l, 'r--')
Out[47]: [<matplotlib.lines.Line2D at 0x580c150>]

In [48]: plt.title('blue: true,   red: OLS')
Out[48]: <matplotlib.text.Text at 0x5883a50>

In [49]: print(res2.summary())

In [50]: R = [[0, 1, 0, 0],
   ....:       [0, 0, 1, 0]]
   ....: 

F-test of the joint hypothesis R * beta = 0, i.e. that the coefficients on both dummy variables are equal to zero:

In [51]: print(res2.f_test(R))
<F test: F=array([[ 124.30756422]]), p=[[ 0.]], df_denom=46, df_num=2>

The null hypothesis of an identical constant across the 3 groups is strongly rejected (the p-value is essentially zero).
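
The same F statistic can be reproduced by comparing the residual sums of squares of the restricted model (without the group dummies) and the full model, which makes the joint test more concrete. A sketch using the variables defined above:

X_restricted = np.c_[x1, np.ones(nsample)]   # drop both dummy columns
res_restricted = sm.OLS(y, X_restricted).fit()
q = 2                                        # number of restrictions in R
F_manual = ((res_restricted.ssr - res2.ssr) / q) / (res2.ssr / res2.df_resid)
# F_manual agrees with the F value from res2.f_test(R) up to floating-point error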

In [52]: help(res2.f_test)
Help on method f_test in module statsmodels.base.model:

f_test(self, r_matrix, q_matrix=None, cov_p=None, scale=1.0, invcov=None) method of statsmodels.regression.linear_model.OLSResults instance
    Compute an Fcontrast/F-test for a contrast matrix.
    
    Here, matrix `r_matrix` is assumed to be non-singular. More precisely,
    
    r_matrix (pX pX.T) r_matrix.T
    
    is assumed invertible. Here, pX is the generalized inverse of the
    design matrix of the model. There can be problems in non-OLS models
    where the rank of the covariance of the noise is not full.
    
    Parameters
    ----------
    r_matrix : array-like
        q x p array where q is the number of restrictions to test and
        p is the number of regressors in the full model fit.
        If q is 1 then f_test(r_matrix).fvalue is equivalent to
        the square of t_test(r_matrix).t
    q_matrix : array-like
        q x 1 array, that represents the sum of each linear restriction.
        Default is all zeros for each restriction.
    scale : float, optional
        Default is 1.0 for no scaling.
    invcov : array-like, optional
        A qxq matrix to specify an inverse covariance
        matrix based on a restrictions matrix.
    
    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.longley.load()
    >>> data.exog = sm.add_constant(data.exog)
    >>> results = sm.OLS(data.endog, data.exog).fit()
    >>> A = np.identity(len(results.params))
    >>> A = A[:-1,:]
    
    This tests that each coefficient is jointly statistically
    significantly different from zero.
    
    >>> print results.f_test(A)
    <F contrast: F=330.28533923463488, p=4.98403052872e-10,
     df_denom=9, df_num=6>
    
    Compare this to
    
    >>> results.F
    330.2853392346658
    >>> results.F_p
    4.98403096572e-10
    
    >>> B = np.array(([0,1,-1,0,0,0,0],[0,0,0,0,1,-1,0]))
    
    This tests that the coefficient on the 2nd and 3rd regressors are
    equal and jointly that the coefficient on the 5th and 6th regressors
    are equal.
    
    >>> print results.f_test(B)
    <F contrast: F=9.740461873303655, p=0.00560528853174, df_denom=9,
     df_num=2>
    
    See also
    --------
    statsmodels.contrasts
    statsmodels.model.t_test

t-test of the null hypothesis that the effects of the 2nd and 3rd groups are equal, i.e. that their difference is zero:

In [53]: R = [0, 1, -1, 0]

In [54]: print(res2.t_test(R))
<T test: effect=array([ 6.12977008]), sd=array([[ 0.58029388]]), t=array([[ 10.5632168]]), p=array([[ 0.]]), df_denom=46>

With t ≈ 10.56 and a p-value of essentially zero, the null hypothesis of equal effects for the 2nd and 3rd groups is clearly rejected.
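
For reference, the contrast behind this t-test can be computed by hand; the quantities below match the effect, sd and t reported in the output above (a sketch):

Rv = np.array([0, 1, -1, 0])
effect = np.dot(Rv, res2.params)                         # difference of the two group effects
sd = np.sqrt(np.dot(Rv, np.dot(res2.cov_params(), Rv)))  # standard error of the contrast
t_stat = effect / sd                                     # matches the t value reported by res2.t_test above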

OLS with small group effects

In [55]: beta = [1., 0.3, -0.0, 10]

In [56]: y_true = np.dot(X, beta)

In [57]: y = y_true + sig * np.random.normal(size=nsample)

In [58]: res3 = sm.OLS(y, X).fit()

In [59]: print(res3.f_test(R))
<F test: F=array([[ 0.4929838]]), p=[[ 0.48613705]], df_denom=46, df_num=1>

With these small group effects, the null hypothesis of equal effects for the 2nd and 3rd groups (R is still the single restriction defined in In [53]) is not rejected (p ≈ 0.49).
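
For comparison, the joint hypothesis that both group effects are zero can also be tested on the small-effects fit, reusing the two-row restriction matrix from In [50] (a sketch; its output is not reproduced here):

R_joint = [[0, 1, 0, 0],
           [0, 0, 1, 0]]
print(res3.f_test(R_joint))   # joint F-test of both dummy coefficients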
