"""
Results containers and post-estimation diagnostics for IV models
"""
from __future__ import annotations
from linearmodels.compat.statsmodels import Summary
from collections.abc import Sequence
import datetime as dt
from functools import cached_property
from typing import Any, Union
import numpy
from numpy import (
array,
asarray,
c_,
diag,
empty,
isnan,
log,
ndarray,
ones,
sqrt,
squeeze,
)
from numpy.linalg import inv
import pandas
from pandas import DataFrame, Series, concat, to_numeric
import scipy.stats as stats
from statsmodels.iolib.summary import SimpleTable, fmt_2cols, fmt_params
from statsmodels.iolib.table import default_txt_fmt
import linearmodels
from linearmodels.iv._utility import annihilate, proj
from linearmodels.iv.data import IVData
from linearmodels.shared.base import _ModelComparison, _SummaryStr
from linearmodels.shared.hypotheses import (
InvalidTestStatistic,
WaldTestStatistic,
quadratic_form_test,
)
from linearmodels.shared.io import _str, add_star, pval_format
import linearmodels.typing.data
def stub_concat(
lists: Sequence[list[str] | tuple[str, ...]], sep: str = "="
) -> list[str]:
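    """Stack stub lists vertically, separated by a rule of ``sep`` characters."""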
col_size = max(max(map(len, stubs)) for stubs in lists)
out: list[str] = []
for stubs in lists:
out.extend(stubs)
out.append(sep * (col_size + 2))
return out[:-1]
def table_concat(lists: Sequence[list[list[str]]], sep: str = "=") -> list[list[str]]:
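    """Stack tables vertically, separated by a rule of ``sep`` characters."""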
col_sizes = []
for table in lists:
size = [[len(item) for item in row] for row in table]
size_arr = array(size)
col_sizes.append(list(asarray(size_arr.max(0))))
col_size = asarray(array(col_sizes).max(axis=0))
sep_cols: list[str] = [sep * (cs + 2) for cs in col_size]
out: list[list[str]] = []
for table in lists:
out.extend(table)
out.append(sep_cols)
return out[:-1]
class _LSModelResultsBase(_SummaryStr):
"""
Results from OLS model estimation
Parameters
----------
    results : dict[str, Any]
A dictionary of results from the model estimation.
model : _OLS
The model used to estimate parameters.
"""
def __init__(self, results: dict[str, Any], model: Any) -> None:
self._resid = results["eps"]
self._wresid = results["weps"]
self._params = results["params"]
self._cov = results["cov"]
self.model = model
self._r2 = results["r2"]
self._cov_type = results["cov_type"]
self._rss = results["residual_ss"]
self._tss = results["total_ss"]
self._s2 = results["s2"]
self._debiased = results["debiased"]
self._f_statistic = results["fstat"]
self._vars = results["vars"]
self._cov_config = results["cov_config"]
self._method = results["method"]
self._kappa = results.get("kappa", None)
self._datetime = dt.datetime.now()
self._cov_estimator = results["cov_estimator"]
self._original_index = results["original_index"]
self._fitted = results["fitted"]
self._df_model = results.get("df_model", self._params.shape[0])
@property
def cov_config(self) -> dict[str, Any]:
"""Parameter values from covariance estimator"""
return self._cov_config
@property
def cov_estimator(self) -> str:
"""Type of covariance estimator used to compute covariance"""
return self._cov_type
@property
def cov(self) -> DataFrame:
"""Estimated covariance of parameters"""
return self._cov
@property
def params(self) -> Series:
"""Estimated parameters"""
return self._params
@cached_property
def resids(self) -> Series:
"""Estimated residuals"""
return self._resid()
@cached_property
def fitted_values(self) -> Series:
"""Fitted values"""
return self._fitted()
@property
def idiosyncratic(self) -> Series:
"""
Idiosyncratic error
Notes
-----
Differs from resids since this is the estimated idiosyncratic shock
from the data. It has the same dimension as the dependent data.
The shape and nature of resids depends on the model estimated. These
estimates only depend on the model estimated through the estimation
of parameters and inclusion of effects, if any.
"""
return self.resids
@cached_property
def wresids(self) -> Series:
"""Weighted estimated residuals"""
return self._wresid()
@property
def nobs(self) -> int:
"""Number of observations"""
return self.model.dependent.shape[0]
@property
def df_resid(self) -> int:
"""Residual degree of freedom"""
return self.nobs - self.df_model
@property
def df_model(self) -> int:
"""Model degree of freedom"""
return int(self._df_model)
@property
def has_constant(self) -> bool:
"""Flag indicating the model includes a constant or equivalent"""
return self.model.has_constant
@property
def rsquared(self) -> float:
"""Coefficient of determination (R**2)"""
return self._r2
@property
def rsquared_adj(self) -> float:
"""Sample-size adjusted coefficient of determination (R**2)"""
n, k, c = self.nobs, self.df_model, int(self.has_constant)
return 1 - ((n - c) / (n - k)) * (1 - self._r2)
@property
def cov_type(self) -> str:
"""Covariance estimator used"""
return self._cov_type
@cached_property
def std_errors(self) -> Series:
"""Estimated parameter standard errors"""
std_errors = sqrt(diag(self.cov))
return Series(std_errors, index=self._vars, name="stderr")
@cached_property
def tstats(self) -> Series:
"""Parameter t-statistics"""
return Series(self._params / self.std_errors, name="tstat")
@cached_property
def pvalues(self) -> Series:
"""
        Parameter p-values. Uses a t(df_resid) distribution if ``debiased``
        is True, else the normal.
"""
if self.debiased:
pvals = 2 - 2 * stats.t.cdf(abs(self.tstats), self.df_resid)
else:
pvals = 2 - 2 * stats.norm.cdf(abs(self.tstats))
return Series(pvals, index=self._vars, name="pvalue")
@property
def total_ss(self) -> float:
"""Total sum of squares"""
return self._tss
@property
def model_ss(self) -> float:
"""Residual sum of squares"""
return self._tss - self._rss
@property
def resid_ss(self) -> float:
"""Residual sum of squares"""
return self._rss
@property
def s2(self) -> float:
"""Residual variance estimator"""
return self._s2
@property
def debiased(self) -> bool:
"""Flag indicating whether covariance uses a small-sample adjustment"""
return self._debiased
@property
def f_statistic(self) -> WaldTestStatistic:
"""
Model F-statistic
Returns
-------
WaldTestStatistic
            Test statistic for the null that all coefficients, excluding
            constant terms, are zero.
Notes
-----
        Despite the name, the test is always implemented using a
        quadratic-form test based on the estimated parameter covariance. The
        default is to use a chi2 distribution to compute p-values. If
        ``debiased`` is True, the statistic is divided by the number of
        parameters tested and an F-distribution is used.
This version of the F-statistic directly uses the model covariance
estimator and so is robust against the same specification issues.
"""
return self._f_statistic
@property
def method(self) -> str:
"""Method used to estimate model parameters"""
return self._method
def conf_int(self, level: float = 0.95) -> DataFrame:
"""
Confidence interval construction
Parameters
----------
level : float
Confidence level for interval
Returns
-------
DataFrame
            Confidence interval of the form [lower, upper] for each parameter
Notes
-----
        Uses a t(df_resid) distribution if ``debiased`` is True, else the
        normal.
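
        Examples
        --------
        An illustrative example, reusing the model from :meth:`wald_test`:

        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IV2SLS
        >>> data = wage.load()
        >>> formula = "np.log(wage) ~ 1 + exper + I(exper**2) + brthord + [educ ~ sibs]"
        >>> res = IV2SLS.from_formula(formula, data).fit()
        >>> ci = res.conf_int(level=0.90)  # columns are "lower" and "upper"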
"""
ci_quantiles = [(1 - level) / 2, 1 - (1 - level) / 2]
if self._debiased:
q = stats.t.ppf(ci_quantiles, self.df_resid)
else:
q = stats.norm.ppf(ci_quantiles)
q = q[None, :]
ci = asarray(self.params)[:, None] + asarray(self.std_errors)[:, None] * q
return DataFrame(ci, index=self._vars, columns=["lower", "upper"])
def _top_right(self) -> list[tuple[str, str]]:
f_stat = _str(self.f_statistic.stat)
if isnan(self.f_statistic.stat):
f_stat = " N/A"
return [
("R-squared:", _str(self.rsquared)),
("Adj. R-squared:", _str(self.rsquared_adj)),
("F-statistic:", f_stat),
("P-value (F-stat)", pval_format(self.f_statistic.pval)),
("Distribution:", str(self.f_statistic.dist_name)),
("", ""),
("", ""),
]
@property
def summary(self) -> Summary:
"""
Model estimation summary.
Returns
-------
Summary
Summary table of model estimation results
Notes
-----
Supports export to csv, html and latex using the methods ``summary.as_csv()``,
``summary.as_html()`` and ``summary.as_latex()``.
"""
title = self._method + " Estimation Summary"
mod = self.model
top_left = [
("Dep. Variable:", mod.dependent.cols[0]),
("Estimator:", self._method),
("No. Observations:", self.nobs),
("Date:", self._datetime.strftime("%a, %b %d %Y")),
("Time:", self._datetime.strftime("%H:%M:%S")),
("Cov. Estimator:", self._cov_type),
("", ""),
]
top_right = self._top_right()
stubs = []
vals = []
for stub, val in top_left:
stubs.append(stub)
vals.append([val])
table = SimpleTable(vals, txt_fmt=fmt_2cols, title=title, stubs=stubs)
        # Create summary table instance
        smry = Summary()
        # Top table: format and attach the right-hand column
        fmt = fmt_2cols
        fmt["data_fmts"][1] = "%18s"
top_right = [("%-21s" % (" " + k), v) for k, v in top_right]
stubs = []
vals = []
for stub, val in top_right:
stubs.append(stub)
vals.append([val])
table.extend_right(SimpleTable(vals, stubs=stubs))
smry.tables.append(table)
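        # Parameter table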
param_data = c_[
self.params.values[:, None],
self.std_errors.values[:, None],
self.tstats.values[:, None],
self.pvalues.values[:, None],
self.conf_int(),
]
data = []
for row in param_data:
txt_row = []
for i, v in enumerate(row):
f = _str
if i == 3:
f = pval_format
txt_row.append(f(v))
data.append(txt_row)
title = "Parameter Estimates"
table_stubs = list(self.params.index)
extra_text = []
if table_stubs:
header = [
"Parameter",
"Std. Err.",
"T-stat",
"P-value",
"Lower CI",
"Upper CI",
]
table = SimpleTable(
data, stubs=table_stubs, txt_fmt=fmt_params, headers=header, title=title
)
smry.tables.append(table)
else:
extra_text.append("Model contains no parameters")
extra_text = self._update_extra_text(extra_text)
if extra_text:
smry.add_extra_txt(extra_text)
return smry
def _update_extra_text(self, extra_text: list[str]) -> list[str]:
return extra_text
def wald_test(
self,
restriction: pandas.DataFrame | numpy.ndarray | None = None,
value: pandas.Series | numpy.ndarray | None = None,
*,
formula: str | list[str] | dict[str, float] | None = None,
) -> WaldTestStatistic:
r"""
Test linear equality constraints using a Wald test
Parameters
----------
restriction : {ndarray, DataFrame}
q by nvar array containing linear weights to apply to parameters
when forming the restrictions. It is not possible to use both
restriction and formula.
value : {ndarray, Series}
q element array containing the restricted values.
formula : {str, list[str]}
formulaic linear constraints. The simplest formats are one of:
* A single comma-separated string such as "x1=0, x2+x3=1"
* A list of strings where each element is a single constraint
such as ["x1=0", "x2+x3=1"]
* A single string without commas to test simple constraints such
as "x1=x2=x3=0"
* A dictionary where each key is a parameter restriction and
the corresponding value is the restriction value, e.g.,
{"x1": 0, "x2+x3": 1}.
It is not possible to use both ``restriction`` and ``formula``.
Returns
-------
WaldTestStatistic
Test statistic for null that restrictions are valid.
Notes
-----
Hypothesis test examines whether :math:`H_0:C\theta=v` where the
matrix C is ``restriction`` and v is ``value``. The test statistic
has a :math:`\chi^2_q` distribution where q is the number of rows in C.
Examples
--------
>>> import numpy as np
>>> from linearmodels.datasets import wage
>>> from linearmodels.iv import IV2SLS
>>> data = wage.load()
>>> formula = "np.log(wage) ~ 1 + exper + I(exper**2) + brthord + [educ ~ sibs]"
>>> res = IV2SLS.from_formula(formula, data).fit()
        Testing that experience is not needed in the model
>>> restriction = np.array([[0, 1, 0, 0, 0],
... [0, 0, 1, 0, 0]])
>>> value = np.array([0, 0])
>>> wald_res = res.wald_test(restriction, value)
Using the formula interface to test the same restrictions
>>> formula = "exper = I(exper**2) = 0"
>>> wald_res = res.wald_test(formula=formula)
Using the formula interface with a list
>>> wald_res = res.wald_test(formula=["exper = 0", "I(exper**2) = 0"])
"""
return quadratic_form_test(
self._params,
self.cov,
restriction=restriction,
value=value,
formula=formula,
)
class OLSResults(_LSModelResultsBase):
"""
Results from OLS model estimation
Parameters
----------
    results : dict[str, Any]
A dictionary of results from the model estimation.
model : _OLS
The model used to estimate parameters.
"""
def __init__(
self,
results: dict[str, Any],
model: linearmodels.iv.model._IVModelBase,
) -> None:
super().__init__(results, model)
def _out_of_sample(
self,
exog: linearmodels.typing.data.ArrayLike | None,
endog: linearmodels.typing.data.ArrayLike | None,
data: linearmodels.typing.data.ArrayLike | None,
missing: bool | None,
) -> DataFrame:
"""Interface between model predict and predict for OOS fits"""
if not (exog is None and endog is None) and data is not None:
raise ValueError(
"Predictions can only be constructed using one "
"of exog/endog or data, but not both."
)
pred = self.model.predict(self.params, exog=exog, endog=endog, data=data)
if not missing:
pred = pred.loc[pred.notnull().all(axis=1)]
return pred
def predict(
self,
exog: linearmodels.typing.data.ArrayLike | None = None,
endog: linearmodels.typing.data.ArrayLike | None = None,
*,
data: pandas.DataFrame | None = None,
fitted: bool = True,
idiosyncratic: bool = False,
missing: bool = False,
) -> DataFrame:
"""
In- and out-of-sample predictions
Parameters
----------
exog : array_like
Exogenous values to use in out-of-sample prediction (nobs by nexog)
endog : array_like
Endogenous values to use in out-of-sample prediction (nobs by nendog)
data : DataFrame
DataFrame to use for out-of-sample predictions when model was
constructed using a formula.
fitted : bool
Flag indicating whether to include the fitted values
idiosyncratic : bool
Flag indicating whether to include the estimated idiosyncratic shock
missing : bool
Flag indicating to adjust for dropped observations. If True, the
values returned will have the same size as the original input data
before filtering missing values. If False, then missing
observations will not be returned.
Returns
-------
DataFrame
DataFrame containing columns for all selected outputs
Notes
-----
        If ``exog``, ``endog`` and ``data`` are all None, in-sample
        predictions (fitted values) will be returned.

        If ``data`` is not None, then ``exog`` and ``endog`` must be None.
        Predictions from models constructed using formulas can be computed
        using either ``exog`` and ``endog``, which will treat these as arrays
        of values corresponding to the formula-processed data, or using
        ``data``, which will be processed using the formula used to construct
        the values corresponding to the original model specification.
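
        Examples
        --------
        An illustrative example, reusing the model from ``wald_test``:

        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IV2SLS
        >>> data = wage.load()
        >>> formula = "np.log(wage) ~ 1 + exper + I(exper**2) + brthord + [educ ~ sibs]"
        >>> res = IV2SLS.from_formula(formula, data).fit()
        >>> fitted = res.predict()  # in-sample fitted values
        >>> oos = res.predict(data=data.iloc[:5])  # formula applied to new data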
"""
if not (exog is None and endog is None and data is None):
return self._out_of_sample(exog, endog, data, missing)
out = []
if fitted:
out.append(self.fitted_values)
if idiosyncratic:
out.append(self.idiosyncratic)
if len(out) == 0:
raise ValueError("At least one output must be selected")
out_df: DataFrame = concat(out, axis=1)
if missing:
index = self._original_index
out_df = out_df.reindex(index)
return out_df
@property
def kappa(self) -> float:
"""k-class estimator value"""
return self._kappa
def _update_extra_text(self, extra_text: list[str]) -> list[str]:
instruments = self.model.instruments
if instruments.shape[1] > 0:
endog = self.model.endog
extra_text.append("Endogenous: " + ", ".join(endog.cols))
extra_text.append("Instruments: " + ", ".join(instruments.cols))
cov_descr = str(self._cov_estimator)
for line in cov_descr.split("\n"):
extra_text.append(line)
return extra_text
class AbsorbingLSResults(_LSModelResultsBase):
"""
    Results from absorbing least squares (AbsorbingLS) estimation
Parameters
----------
    results : dict[str, Any]
A dictionary of results from the model estimation.
model : AbsorbingLS
The model used to estimate parameters.
"""
def __init__(
self, results: dict[str, Any], model: linearmodels.iv.absorbing.AbsorbingLS
) -> None:
super().__init__(results, model)
self._absorbed_rsquared = results["absorbed_r2"]
self._absorbed_effects = results["absorbed_effects"]
def _top_right(self) -> list[tuple[str, str]]:
f_stat = _str(self.f_statistic.stat)
if isnan(self.f_statistic.stat):
f_stat = " N/A"
return [
("R-squared:", _str(self.rsquared)),
("Adj. R-squared:", _str(self.rsquared_adj)),
("F-statistic:", f_stat),
("P-value (F-stat):", pval_format(self.f_statistic.pval)),
("Distribution:", str(self.f_statistic.dist_name)),
("R-squared (No Effects):", _str(round(self.absorbed_rsquared, 5))),
("Variables Absorbed:", _str(self.df_absorbed)),
]
@property
def absorbed_rsquared(self) -> float:
"""Coefficient of determination (R**2), ignoring absorbed variables"""
return self._absorbed_rsquared
@cached_property
def absorbed_effects(self) -> DataFrame:
"""Fitted values from only absorbed terms"""
return self._absorbed_effects()
@property
def df_absorbed(self) -> int:
"""Number of variables absorbed"""
return self.df_model - self.params.shape[0]
class FirstStageResults(_SummaryStr):
"""
First stage estimation results and diagnostics
"""
def __init__(
self,
dep: IVData,
exog: IVData,
endog: IVData,
instr: IVData,
weights: IVData,
cov_type: str,
cov_config: dict[str, Any],
) -> None:
self.dep = dep
self.exog = exog
self.endog = endog
self.instr = instr
self.weights = weights
reg = c_[self.exog.ndarray, self.endog.ndarray]
self._reg = DataFrame(reg, columns=self.exog.cols + self.endog.cols)
self._cov_type = cov_type
self._cov_config = cov_config
@cached_property
def diagnostics(self) -> DataFrame:
"""
Post estimation diagnostics of first-stage fit
Returns
-------
DataFrame
DataFrame where each endogenous variable appears as a row and
the columns contain alternative measures. The columns are:
* rsquared - R-squared from regression of endogenous on exogenous
and instruments
* partial.rsquared - R-squared from regression of the exogenous
variable on instruments where both the exogenous variable and
the instrument have been orthogonalized to the exogenous
regressors in the model.
* f.stat - Test that all coefficients are zero in the model
used to estimate the partial R-squared. Uses a standard F-test
when the covariance estimator is unadjusted - otherwise uses a
Wald test statistic with a chi2 distribution.
* f.pval - P-value of the test that all coefficients are zero
in the model used to estimate the partial R-squared
* f.dist - Distribution of f.stat
            * shea.rsquared - Shea's r-squared, which measures the
              correlation between the orthogonalized endogenous regressor
              and the projected, orthogonalized instruments, where the
              orthogonalization is with respect to the other included
              variables in the model.
"""
from linearmodels.iv.model import _OLS, IV2SLS
endog, exog, instr, weights = self.endog, self.exog, self.instr, self.weights
w = sqrt(weights.ndarray)
z = w * instr.ndarray
nz = z.shape[1]
x = w * exog.ndarray
ez = annihilate(z, x)
individual_results = self.individual
out_df = DataFrame(
index=["rsquared", "partial.rsquared", "f.stat", "f.pval", "f.dist"],
columns=[],
)
for col in endog.pandas:
            # TODO: BUG in pandas-stubs
# https://github.com/pandas-dev/pandas-stubs/issues/97
y = w * endog.pandas[[col]].values
ey = annihilate(y, x)
partial = _OLS(ey, ez).fit(cov_type=self._cov_type, **self._cov_config)
full = individual_results[str(col)]
params = full.params.values[-nz:]
params = params[:, None]
c = asarray(full.cov)[-nz:, -nz:]
stat = params.T @ inv(c) @ params
stat = float(stat.squeeze())
if full.cov_type in ("homoskedastic", "unadjusted"):
df_denom = full.df_resid
stat /= params.shape[0]
else:
df_denom = None
w_test = WaldTestStatistic(
stat, null="", df=params.shape[0], df_denom=df_denom
)
inner = {
"rsquared": full.rsquared,
"partial.rsquared": partial.rsquared,
"f.stat": w_test.stat,
"f.pval": w_test.pval,
"f.dist": w_test.dist_name,
}
out_df[col] = Series(inner)
out_df = out_df.T
dep = self.dep
r2sls = IV2SLS(dep, exog, endog, instr, weights=weights).fit(
cov_type="unadjusted"
)
rols = _OLS(dep, self._reg, weights=weights).fit(cov_type="unadjusted")
shea = (rols.std_errors / r2sls.std_errors) ** 2
shea *= (1 - r2sls.rsquared) / (1 - rols.rsquared)
out_df["shea.rsquared"] = shea[out_df.index]
cols = [
"rsquared",
"partial.rsquared",
"shea.rsquared",
"f.stat",
"f.pval",
"f.dist",
]
out_df = out_df[cols]
for col in out_df:
try:
out_df[col] = to_numeric(out_df[col])
except ValueError:
# If an error is raised, ignore and keep the column
pass
return out_df
@cached_property
def individual(self) -> dict[str, OLSResults]:
"""
Individual model results from first-stage regressions
Returns
-------
dict
Dictionary containing first stage estimation results. Keys are
the variable names of the endogenous regressors.
"""
from linearmodels.iv.model import _OLS
exog_instr = DataFrame(
c_[self.exog.ndarray, self.instr.ndarray],
columns=self.exog.cols + self.instr.cols,
)
res: dict[str, OLSResults] = {}
for col in self.endog.pandas:
dep = self.endog.pandas[col]
mod = _OLS(dep, exog_instr, weights=self.weights.ndarray)
res[str(col)] = mod.fit(cov_type=self._cov_type, **self._cov_config)
return res
@property
def summary(self) -> Summary:
"""
Model estimation summary.
Returns
-------
Summary
Summary table of model estimation results
Notes
-----
Supports export to csv, html and latex using the methods ``summary.as_csv()``,
``summary.as_html()`` and ``summary.as_latex()``.
"""
smry = Summary()
if not self.individual:
table = SimpleTable([[]])
smry.tables.append(table)
smry.add_extra_txt(
["Model contains no endogenous variables. No first stage results."]
)
return smry
stubs_lookup = {
"rsquared": "R-squared",
"partial.rsquared": "Partial R-squared",
"shea.rsquared": "Shea's R-squared",
"f.stat": "Partial F-statistic",
"f.pval": "P-value (Partial F-stat)",
"f.dist": "Partial F-stat Distn",
}
diagnostics = self.diagnostics
vals = []
for c in diagnostics:
if c != "f.dist":
vals.append([_str(v) for v in diagnostics[c]])
else:
vals.append([v for v in diagnostics[c]])
stubs = [stubs_lookup[s] for s in list(diagnostics.columns)]
header = list(diagnostics.index)
params = []
for var in header:
res = self.individual[var]
v = c_[res.params.values, res.tstats.values]
params.append(v.ravel())
params_arr = array(params)
params_fmt = [[_str(val) for val in row] for row in params_arr.T]
for i in range(1, len(params_fmt), 2):
for j in range(len(params_fmt[i])):
params_fmt[i][j] = f"({params_fmt[i][j]})"
params_stub = []
for var in res.params.index:
params_stub.extend([var, ""])
title = "First Stage Estimation Results"
vals = table_concat((vals, params_fmt))
stubs = stub_concat((stubs, params_stub))
txt_fmt = default_txt_fmt.copy()
txt_fmt["data_aligns"] = "r"
txt_fmt["header_align"] = "r"
table = SimpleTable(
vals, headers=header, title=title, stubs=stubs, txt_fmt=txt_fmt
)
smry.tables.append(table)
extra_txt = [
"T-stats reported in parentheses",
"T-stats use same covariance type as original model",
]
smry.add_extra_txt(extra_txt)
return smry
class _CommonIVResults(OLSResults):
"""
Results from IV estimation
"""
def __init__(
self,
results: dict[str, Any],
model: linearmodels.iv.model._IVModelBase,
) -> None:
super().__init__(results, model)
self._liml_kappa = results.get("liml_kappa", None)
@property
def first_stage(self) -> FirstStageResults:
"""
First stage regression results
Returns
-------
FirstStageResults
Object containing results for diagnosing instrument relevance issues.
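
        Examples
        --------
        An illustrative example; the diagnostics are useful for detecting
        weak instruments.

        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IV2SLS
        >>> data = wage.load()
        >>> formula = "np.log(wage) ~ 1 + exper + I(exper**2) + brthord + [educ ~ sibs]"
        >>> res = IV2SLS.from_formula(formula, data).fit()
        >>> first_stage = res.first_stage
        >>> diag = first_stage.diagnostics  # DataFrame of per-endog measures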
"""
return FirstStageResults(
self.model.dependent,
self.model.exog,
self.model.endog,
self.model.instruments,
self.model.weights,
self._cov_type,
self._cov_config,
)
class IVResults(_CommonIVResults):
"""
Results from IV estimation
Parameters
----------
    results : dict[str, Any]
A dictionary of results from the model estimation.
model : {IV2SLS, IVLIML}
The model used to estimate parameters.
"""
def __init__(
self, results: dict[str, Any], model: linearmodels.iv.model._IVLSModelBase
) -> None:
super().__init__(results, model)
self._kappa = results.get("kappa", 1)
@cached_property
def sargan(self) -> InvalidTestStatistic | WaldTestStatistic:
r"""
Sargan test of overidentifying restrictions
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
Requires more instruments than endogenous variables
Tests the ratio of re-projected IV regression residual variance to
variance of the IV residuals.
        .. math ::

          n(1-\hat{\epsilon}^{\prime}M_{z}\hat{\epsilon}/
          \hat{\epsilon}^{\prime}\hat{\epsilon})\sim\chi_{v}^{2}

        where :math:`M_{z}` is the annihilator matrix where z is the set of
        instruments and :math:`\hat{\epsilon}` are the residuals from the IV
        estimator. The degrees of freedom are the difference between the
        number of instruments and the number of endogenous regressors.

        .. math ::

          v = n_{instr} - n_{endog}
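
        Examples
        --------
        An illustrative sketch, assuming the ``wage`` dataset's ``meduc``
        (mother's education) column provides a second instrument so that the
        model is overidentified:

        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IV2SLS
        >>> data = wage.load()
        >>> formula = "np.log(wage) ~ 1 + exper + [educ ~ sibs + meduc]"
        >>> res = IV2SLS.from_formula(formula, data).fit()
        >>> sargan_test = res.sargan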
"""
z = self.model.instruments.ndarray
nobs, ninstr = z.shape
nendog = self.model.endog.shape[1]
name = "Sargan's test of overidentification"
if ninstr - nendog == 0:
return InvalidTestStatistic(
"Test requires more instruments than endogenous variables.",
name=name,
)
eps = self.resids.values[:, None]
u = annihilate(eps, self.model._z)
stat = nobs * (1 - (u.T @ u) / (eps.T @ eps)).squeeze()
null = "The model is not overidentified."
return WaldTestStatistic(stat, null, ninstr - nendog, name=name)
@cached_property
def basmann(self) -> InvalidTestStatistic | WaldTestStatistic:
r"""
Basmann's test of overidentifying restrictions
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
Requires more instruments than endogenous variables
        The test is a small-sample version of Sargan's test with the same
        distribution.

        .. math ::

          s (n - n_{exog} - n_{instr}) / (n - s) \sim \chi^2_{v}

        where :math:`s` is Sargan's test statistic, :math:`n_{instr}` is the
        number of instruments, :math:`n_{exog}` is the number of exogenous
        regressors and :math:`n_{endog}` is the number of endogenous
        regressors. The degrees of freedom are the difference between the
        number of instruments and the number of endogenous regressors.

        .. math ::

          v = n_{instr} - n_{endog}
"""
mod = self.model
ninstr = mod.instruments.shape[1]
nobs, nendog = mod.endog.shape
nz = mod._z.shape[1]
name = "Basmann's test of overidentification"
if ninstr - nendog == 0:
return InvalidTestStatistic(
"Test requires more instruments than " "endogenous variables.",
name=name,
)
sargan_test = self.sargan
s = sargan_test.stat
stat = s * (nobs - nz) / (nobs - s)
return WaldTestStatistic(stat, sargan_test.null, sargan_test.df, name=name)
def _endogeneity_setup(
self, variables: str | list[str] | None = None
) -> tuple[ndarray, ndarray, ndarray, int, int, int, int]:
"""Setup function for some endogeneity iv"""
if isinstance(variables, str):
variables = [variables]
elif variables is not None and not isinstance(variables, list):
raise TypeError("variables must be a str or a list of str.")
nobs = self.model.dependent.shape[0]
e2 = asarray(self.resids.values)
nendog, nexog = self.model.endog.shape[1], self.model.exog.shape[1]
if variables is None:
assumed_exog = self.model.endog.ndarray
aug_exog = c_[self.model.exog.ndarray, assumed_exog]
still_endog = empty((nobs, 0))
else:
assert isinstance(variables, list)
assumed_exog = self.model.endog.pandas[variables].values
ex = [c for c in self.model.endog.cols if c not in variables]
still_endog = self.model.endog.pandas[ex].values
aug_exog = c_[self.model.exog.ndarray, assumed_exog]
ntested = assumed_exog.shape[1]
from linearmodels.iv import IV2SLS
mod = IV2SLS(
self.model.dependent, aug_exog, still_endog, self.model.instruments
)
e0 = mod.fit().resids.values[:, None]
z2 = c_[self.model.exog.ndarray, self.model.instruments.ndarray]
z1 = c_[z2, assumed_exog]
e1 = proj(e0, z1)
e2 = proj(e2, self.model.instruments.ndarray)
return e0, e1, e2, nobs, nexog, nendog, ntested
def durbin(self, variables: str | list[str] | None = None) -> WaldTestStatistic:
r"""
Durbin's test of exogeneity
Parameters
----------
variables : {str, list[str]}
List of variables to test for exogeneity. If None, all variables
are jointly tested.
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
Test statistic is difference between sum of squared OLS and sum of
squared IV residuals where each set of residuals has been projected
onto the set of instruments in the IV model.
Start by defining
.. math ::
          \delta = \hat{\epsilon}'_e P_{[z,w]} \hat{\epsilon}_e -
                   \hat{\epsilon}'_c P_{z} \hat{\epsilon}_c

        where :math:`\hat{\epsilon}_e` are the regression residuals from a
        model where ``variables`` are treated as exogenous,
        :math:`\hat{\epsilon}_c` are the regression residuals from the model
        leaving ``variables`` as endogenous, :math:`P_{[z,w]}` is a
        projection matrix onto the exogenous variables and instruments (`z`)
        as well as ``variables``, and :math:`P_{z}` is a projection matrix
        onto `z` only.
The test statistic is then
.. math ::
          n \delta / (\hat{\epsilon}'_e\hat{\epsilon}_e) \sim \chi^2_{q}
where :math:`q` is the number of variables tested.
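
        Examples
        --------
        An illustrative example testing the exogeneity of ``educ``:

        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IV2SLS
        >>> data = wage.load()
        >>> formula = "np.log(wage) ~ 1 + exper + I(exper**2) + brthord + [educ ~ sibs]"
        >>> res = IV2SLS.from_formula(formula, data).fit()
        >>> durbin_test = res.durbin()  # jointly tests all endogenous variables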
"""
null = "All endogenous variables are exogenous"
if variables is not None:
null = "Variables {} are exogenous".format(", ".join(variables))
e0, e1, e2, nobs, _, _, ntested = self._endogeneity_setup(variables)
stat = e1.T @ e1 - e2.T @ e2
stat /= (e0.T @ e0) / nobs
name = "Durbin test of exogeneity"
df = ntested
return WaldTestStatistic(float(squeeze(stat)), null, df, name=name)
def wu_hausman(self, variables: str | list[str] | None = None) -> WaldTestStatistic:
r"""
Wu-Hausman test of exogeneity
Parameters
----------
variables : {str, list[str]}
List of variables to test for exogeneity. If None, all variables
are jointly tested.
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
Test statistic is difference between sum of squared OLS and sum of
squared IV residuals where each set of residuals has been projected
onto the set of instruments in the IV model.
Start by defining
.. math ::
          \delta = \hat{\epsilon}'_e P_{[z,w]} \hat{\epsilon}_e -
                   \hat{\epsilon}'_c P_{z} \hat{\epsilon}_c

        where :math:`\hat{\epsilon}_e` are the regression residuals from a
        model where ``variables`` are treated as exogenous,
        :math:`\hat{\epsilon}_c` are the regression residuals from the model
        leaving ``variables`` as endogenous, :math:`P_{[z,w]}` is a
        projection matrix onto the exogenous variables and instruments (`z`)
        as well as ``variables``, and :math:`P_{z}` is a projection matrix
        onto `z` only.
The test statistic is then
.. math ::
\frac{\delta / q}{(\hat{\epsilon}'_e\hat{\epsilon}_e - \delta) / v}
        where :math:`q` is the number of variables tested,
:math:`v = n - n_{endog} - n_{exog} - q`. The test statistic has a
:math:`F_{q, v}` distribution.
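
        Examples
        --------
        An illustrative example; a single variable can also be tested by
        name.

        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IV2SLS
        >>> data = wage.load()
        >>> formula = "np.log(wage) ~ 1 + exper + I(exper**2) + brthord + [educ ~ sibs]"
        >>> res = IV2SLS.from_formula(formula, data).fit()
        >>> wh_test = res.wu_hausman("educ")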
"""
null = "All endogenous variables are exogenous"
if variables is not None:
null = "Variables {} are exogenous".format(", ".join(variables))
e0, e1, e2, nobs, nexog, nendog, ntested = self._endogeneity_setup(variables)
df = ntested
df_denom = nobs - nexog - nendog - ntested
delta = e1.T @ e1 - e2.T @ e2
stat = delta / df
stat /= (e0.T @ e0 - delta) / df_denom
stat = float(squeeze(stat))
name = "Wu-Hausman test of exogeneity"
return WaldTestStatistic(stat, null, df, df_denom, name=name)
@cached_property
def wooldridge_score(self) -> WaldTestStatistic:
r"""
Wooldridge's score test of exogeneity
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
Wooldridge's test examines whether there is correlation between the
        errors produced when the endogenous variables are treated as
exogenous so that the model can be fit by OLS, and the component of
the endogenous variables that cannot be explained by the instruments.
The test is implemented using a regression,
.. math ::
          1 = \gamma_1 \hat{\epsilon}_i \hat{v}_{1,i} + \ldots
              + \gamma_p \hat{\epsilon}_i \hat{v}_{p,i} + \eta_i
where :math:`\hat{v}_{j,i}` is the residual from regressing endogenous
variable :math:`x_j` on the exogenous variables and instruments.
        The test statistic is :math:`n\times R^2 \sim \chi^2_{p}`.
Implemented using the expression in Wooldridge (2002), Eq. 6.19
"""
from linearmodels.iv.model import _OLS
e = annihilate(self.model.dependent.ndarray, self.model._x)
r = annihilate(self.model.endog.ndarray, self.model._z)
nobs = e.shape[0]
r = annihilate(r, self.model._x)
res = _OLS(ones((nobs, 1)), r * e).fit(cov_type="unadjusted")
stat = res.nobs - res.resid_ss
df = self.model.endog.shape[1]
null = "Endogenous variables are exogenous"
name = "Wooldridge's score test of exogeneity"
return WaldTestStatistic(stat, null, df, name=name)
@cached_property
def wooldridge_regression(self) -> WaldTestStatistic:
r"""
Wooldridge's regression test of exogeneity
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
Wooldridge's test examines whether there is correlation between the
components of the endogenous variables that cannot be explained by
the instruments and the OLS regression residuals.
The test is implemented as an OLS where
.. math ::
y_i = x_{1i}\beta_i + x_{2i}\beta_2 + \hat{e}_i\gamma + \epsilon_i
where :math:`x_{1i}` are the exogenous regressors, :math:`x_{2i}` are
the endogenous regressors and :math:`\hat{e}_{i}` are the residuals
from regressing the endogenous variables on the exogenous variables
and instruments. The null is :math:`\gamma=0` and is implemented
using a Wald test. The covariance estimator used in the test is
identical to the covariance estimator used with ``fit``.
"""
from linearmodels.iv.model import _OLS
r = annihilate(self.model.endog.ndarray, self.model._z)
augx = c_[self.model._x, r]
mod = _OLS(self.model.dependent, augx)
res = mod.fit(cov_type=self.cov_type, **self.cov_config)
norig = self.model._x.shape[1]
test_params = asarray(res.params.values[norig:], dtype=float)
test_cov = res.cov.values[norig:, norig:]
stat = test_params.T @ inv(test_cov) @ test_params
df = len(test_params)
null = "Endogenous variables are exogenous"
name = "Wooldridge's regression test of exogeneity"
return WaldTestStatistic(stat, null, df, name=name)
@cached_property
def wooldridge_overid(self) -> InvalidTestStatistic | WaldTestStatistic:
r"""
Wooldridge's score test of overidentification
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
Wooldridge's test examines whether there is correlation between the
model residuals and the component of the instruments that is
orthogonal to the endogenous variables. Define :math:`\tilde{z}`
to be the residuals of the instruments regressed on the exogenous
variables and the first-stage fitted values of the endogenous
variables. The test is computed as a regression
.. math ::
          1 = \gamma_1 \hat{\epsilon}_i \tilde{z}_{i,1} + \ldots +
              \gamma_q \hat{\epsilon}_i \tilde{z}_{i,q} + \eta_i

        where :math:`q = n_{instr} - n_{endog}`. The test statistic is
        :math:`n\times R^2 \sim \chi^2_{q}`.
The order of the instruments does not affect this test.
"""
from linearmodels.iv.model import _OLS
exog, endog = self.model.exog, self.model.endog
instruments = self.model.instruments
nobs, nendog = endog.shape
ninstr = instruments.shape[1]
name = "Wooldridge's score test of overidentification"
if ninstr - nendog == 0:
return InvalidTestStatistic(
"Test requires more instruments than " "endogenous variables.",
name=name,
)
endog_hat = proj(endog.ndarray, c_[exog.ndarray, instruments.ndarray])
q = instruments.ndarray[:, : (ninstr - nendog)]
q_res = annihilate(q, c_[self.model.exog.ndarray, endog_hat])
test_functions = q_res * self.resids.values[:, None]
res = _OLS(ones((nobs, 1)), test_functions).fit(cov_type="unadjusted")
stat = res.nobs * res.rsquared
df = ninstr - nendog
null = "Model is not overidentified."
return WaldTestStatistic(stat, null, df, name=name)
@cached_property
def anderson_rubin(self) -> InvalidTestStatistic | WaldTestStatistic:
r"""
Anderson-Rubin test of overidentifying restrictions
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
The Anderson-Rubin test examines whether the value of :math:`\kappa`
computed for the LIML estimator is sufficiently close to one to
indicate the model is not overidentified. The test statistic is
.. math ::
n \ln(\hat{\kappa}) \sim \chi^2_{q}
where :math:`q = n_{instr} - n_{endog}`.
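
        Examples
        --------
        An illustrative sketch using LIML, assuming the ``wage`` dataset's
        ``meduc`` (mother's education) column provides a second instrument:

        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IVLIML
        >>> data = wage.load()
        >>> formula = "np.log(wage) ~ 1 + exper + [educ ~ sibs + meduc]"
        >>> res = IVLIML.from_formula(formula, data).fit()
        >>> ar_test = res.anderson_rubin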
"""
nobs, ninstr = self.model.instruments.shape
nendog = self.model.endog.shape[1]
name = "Anderson-Rubin test of overidentification"
if ninstr - nendog == 0:
return InvalidTestStatistic(
"Test requires more instruments than " "endogenous variables.",
name=name,
)
stat = nobs * log(self._liml_kappa)
df = ninstr - nendog
null = "The model is not overidentified."
return WaldTestStatistic(stat, null, df, name=name)
@cached_property
def basmann_f(self) -> InvalidTestStatistic | WaldTestStatistic:
r"""
Basmann's F test of overidentifying restrictions
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
Basmann's F test examines whether the value of :math:`\kappa`
computed for the LIML estimator is sufficiently close to one to
indicate the model is not overidentified. The test statistic is
.. math ::
          (\hat{\kappa} - 1) (n - n_{exog} - n_{instr}) / q
          \sim F_{q, n - n_{exog} - n_{instr}}
where :math:`q = n_{instr} - n_{endog}`.
"""
nobs, ninstr = self.model.instruments.shape
nendog, nexog = self.model.endog.shape[1], self.model.exog.shape[1]
name = "Basmann' F test of overidentification"
if ninstr - nendog == 0:
return InvalidTestStatistic(
"Test requires more instruments than " "endogenous variables.",
name=name,
)
df = ninstr - nendog
df_denom = nobs - (nexog + ninstr)
stat = (self._liml_kappa - 1) * df_denom / df
null = "The model is not overidentified."
return WaldTestStatistic(stat, null, df, df_denom=df_denom, name=name)
class IVGMMResults(_CommonIVResults):
"""
Results from GMM estimation of IV models
Parameters
----------
    results : dict[str, Any]
A dictionary of results from the model estimation.
model : {IVGMM, IVGMMCUE}
The model used to estimate parameters.
"""
def __init__(
self, results: dict[str, Any], model: linearmodels.iv.model._IVGMMBase
):
super().__init__(results, model)
self._weight_mat = results["weight_mat"]
self._weight_type = results["weight_type"]
self._weight_config = results["weight_config"]
self._iterations = results["iterations"]
self._j_stat = results["j_stat"]
@property
def weight_matrix(self) -> linearmodels.typing.data.Float64Array:
"""Weight matrix used in the final-step GMM estimation"""
return self._weight_mat
@property
def iterations(self) -> int:
"""Iterations used in GMM estimation"""
return self._iterations
@property
def weight_type(self) -> str:
"""Weighting matrix method used in estimation"""
return self._weight_type
@property
def weight_config(self) -> dict[str, Any]:
"""Weighting matrix configuration used in estimation"""
return self._weight_config
@property
def j_stat(self) -> InvalidTestStatistic | WaldTestStatistic:
r"""
J-test of overidentifying restrictions
Returns
-------
WaldTestStatistic
J statistic test of overidentifying restrictions
Notes
-----
The J statistic tests whether the moment conditions are sufficiently
close to zero to indicate that the model is not overidentified. The
statistic is defined as
.. math ::
n \bar{g}'W^{-1}\bar{g} \sim \chi^2_q
where :math:`\bar{g} = n^{-1}\sum \hat{\epsilon}_i z_i` where
:math:`z_i` includes both the exogenous variables and instruments and
:math:`\hat{\epsilon}_i` are the model residuals. :math:`W` is a consistent
estimator of the variance of :math:`\sqrt{n}\bar{g}`. The degree of
freedom is :math:`q = n_{instr} - n_{endog}`.
"""
return self._j_stat
def c_stat(self, variables: list[str] | str | None = None) -> WaldTestStatistic:
r"""
C-test of endogeneity
Parameters
----------
variables : {str, list[str]}
List of variables to test for exogeneity. If None, all variables
are jointly tested.
Returns
-------
WaldTestStatistic
Object containing test statistic, p-value, distribution and null
Notes
-----
        The C statistic is the difference between the J statistics of two
        GMM estimations, one of which assumes that one or more of the
        endogenous variables is actually exogenous. Both estimations use the
        same weighting matrix, which is required for the C statistic to be
        positive.

        The first model is estimated using GMM where one or more of the
        endogenous variables are assumed to be exogenous. This model would
        be relatively efficient if the assumption were true, and two
        quantities are computed: the J statistic, :math:`J_e`, and the
        moment weighting matrix, :math:`W_e`.

        WLOG assume the q variables tested are in the final q positions so
        that the first :math:`n_{exog} + n_{instr}` rows and columns
        correspond to the moment conditions in the original model. The
        second J statistic is computed using parameters estimated using the
        original moment conditions along with the upper-left block of
        :math:`W_e`. Denote this value as :math:`J_c` where c indicates
        consistent.
The test statistic is then
.. math ::
J_e - J_c \sim \chi^2_{m}
where :math:`m` is the number of variables whose exogeneity is being
tested.
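
        Examples
        --------
        An illustrative sketch, assuming the ``wage`` dataset's ``meduc``
        (mother's education) column provides a second instrument; the
        exogeneity of ``educ`` is tested.

        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IVGMM
        >>> data = wage.load()
        >>> formula = "np.log(wage) ~ 1 + exper + [educ ~ sibs + meduc]"
        >>> res = IVGMM.from_formula(formula, data).fit()
        >>> c_test = res.c_stat("educ")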
"""
dependent, instruments = self.model.dependent, self.model.instruments
exog, endog = self.model.exog, self.model.endog
if variables is None:
exog_e = c_[exog.ndarray, endog.ndarray]
nobs = exog_e.shape[0]
endog_e = empty((nobs, 0))
null = "All endogenous variables are exogenous"
else:
if isinstance(variables, list):
variable_lst = variables
elif isinstance(variables, str):
variable_lst = [variables]
else:
raise TypeError("variables must be a str or a list of str.")
exog_e = c_[exog.ndarray, endog.pandas[variable_lst].values]
ex = [c for c in endog.pandas if c not in variable_lst]
endog_e = endog.pandas[ex].values
null = "Variables {} are exogenous".format(", ".join(variable_lst))
from linearmodels.iv.model import IVGMM, IVGMMCUE
mod = IVGMM(dependent, exog_e, endog_e, instruments)
res_e = mod.fit(cov_type=self.cov_type, **self.cov_config)
assert isinstance(res_e, IVGMMResults)
j_e = res_e.j_stat.stat
x = self.model._x
y = self.model._y
z = self.model._z
nz = z.shape[1]
weight_mat_c = asarray(res_e.weight_matrix)[:nz, :nz]
params_c = mod.estimate_parameters(x, y, z, weight_mat_c)
assert isinstance(self.model, (IVGMM, IVGMMCUE))
j_c = self.model._j_statistic(params_c, weight_mat_c).stat
stat = j_e - j_c
df = exog_e.shape[1] - exog.shape[1]
return WaldTestStatistic(stat, null, df, name="C-statistic")
AnyResult = Union[IVResults, IVGMMResults, OLSResults]
class IVModelComparison(_ModelComparison):
"""
Comparison of multiple models
Parameters
----------
results : {list, dict}
Set of results to compare. If a dict, the keys will be used as model
names.
precision : {"tstats","std_errors", "std-errors", "pvalues"}
Estimator precision estimator to include in the comparison output.
Default is "tstats".
stars : bool
Add stars based on the p-value of the coefficient where 1, 2 and
3-stars correspond to p-values of 10%, 5% and 1%, respectively.
"""
_supported = (IVResults, IVGMMResults, OLSResults)
def __init__(
self,
results: Sequence[AnyResult] | dict[str, AnyResult],
*,
precision: str = "tstats",
stars: bool = False,
):
super().__init__(results, precision=precision, stars=stars)
@property
def rsquared_adj(self) -> Series:
"""Sample-size adjusted coefficients of determination (R**2)"""
return self._get_property("rsquared_adj")
@property
def estimator_method(self) -> Series:
"""Estimation methods"""
return self._get_property("_method")
@property
def cov_estimator(self) -> Series:
"""Covariance estimator descriptions"""
return self._get_property("cov_estimator")
@property
def summary(self) -> Summary:
"""
Model estimation summary.
Returns
-------
Summary
Summary table of model estimation results
Notes
-----
Supports export to csv, html and latex using the methods ``summary.as_csv()``,
``summary.as_html()`` and ``summary.as_latex()``.
"""
smry = Summary()
models = list(self._results.keys())
title = "Model Comparison"
stubs = [
"Dep. Variable",
"Estimator",
"No. Observations",
"Cov. Est.",
"R-squared",
"Adj. R-squared",
"F-statistic",
"P-value (F-stat)",
]
dep_name: dict[str, str] = {}
for key in self._results:
dep_name[key] = str(self._results[key].model.dependent.cols[0])
dep_names = Series(dep_name)
vals = concat(
[
dep_names,
self.estimator_method,
self.nobs,
self.cov_estimator,
self.rsquared,
self.rsquared_adj,
self.f_statistic,
],
axis=1,
)
vals_list = [[i for i in v] for v in vals.T.values]
vals_list[2] = [str(v) for v in vals_list[2]]
for i in range(4, len(vals_list)):
vals_list[i] = [_str(v) for v in vals_list[i]]
params = self.params
precision = getattr(self, self._precision)
pvalues = asarray(self.pvalues)
params_fmt = []
params_stub: list[str] = []
for i in range(len(params)):
formatted_and_starred = []
for v, pv in zip(params.values[i], pvalues[i]):
formatted_and_starred.append(add_star(_str(v), pv, self._stars))
params_fmt.append(formatted_and_starred)
precision_fmt = []
for v in precision.values[i]:
v_str = _str(v)
v_str = f"({v_str})" if v_str.strip() else v_str
precision_fmt.append(v_str)
params_fmt.append(precision_fmt)
params_stub.append(str(params.index[i]))
params_stub.append(" ")
vals_tab = table_concat((vals_list, params_fmt))
stubs = stub_concat((stubs, params_stub))
all_instr = []
for key in self._results:
res = self._results[key]
all_instr.append(res.model.instruments.cols)
ninstr = max(map(len, all_instr))
instruments = []
instrument_stub = ["Instruments"]
for i in range(ninstr):
if i > 0:
instrument_stub.append("")
row = []
for j in range(len(self._results)):
instr = all_instr[j]
if len(instr) > i:
row.append(instr[i])
else:
row.append("")
instruments.append(row)
if instruments:
vals_tab = table_concat((vals_tab, instruments))
stubs = stub_concat((stubs, instrument_stub))
txt_fmt = default_txt_fmt.copy()
txt_fmt["data_aligns"] = "r"
txt_fmt["header_align"] = "r"
table = SimpleTable(
vals_tab, headers=models, title=title, stubs=stubs, txt_fmt=txt_fmt
)
smry.tables.append(table)
prec_type = self._PRECISION_TYPES[self._precision]
smry.add_extra_txt([f"{prec_type} reported in parentheses"])
return smry
def compare(
results: dict[str, AnyResult] | Sequence[AnyResult],
*,
precision: str = "tstats",
stars: bool = False,
) -> IVModelComparison:
"""
Compare the results of multiple models
Parameters
----------
results : {list, dict}
Set of results to compare. If a dict, the keys will be used as model
names.
precision : {"tstats","std_errors", "std-errors", "pvalues"}
Estimator precision estimator to include in the comparison output.
Default is "tstats".
stars : bool
Add stars based on the p-value of the coefficient where 1, 2 and
3-stars correspond to p-values of 10%, 5% and 1%, respectively.
Returns
-------
IVModelComparison
The model comparison object.
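
    Examples
    --------
    An illustrative comparison; a formula without an endogenous block is
    estimated by OLS.

    >>> import numpy as np
    >>> from linearmodels.datasets import wage
    >>> from linearmodels.iv import IV2SLS
    >>> data = wage.load()
    >>> ols_res = IV2SLS.from_formula("np.log(wage) ~ 1 + exper + educ", data).fit()
    >>> formula = "np.log(wage) ~ 1 + exper + [educ ~ sibs]"
    >>> iv_res = IV2SLS.from_formula(formula, data).fit()
    >>> comparison = compare({"OLS": ols_res, "IV": iv_res}, stars=True)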
"""
return IVModelComparison(results, precision=precision, stars=stars)