Source code for linearmodels.iv.results

"""
Results containers and post-estimation diagnostics for IV models
"""
from __future__ import annotations

from linearmodels.compat.statsmodels import Summary

from collections.abc import Sequence
import datetime as dt
from functools import cached_property
from typing import Any, Union

from numpy import (
    array,
    asarray,
    c_,
    diag,
    empty,
    isnan,
    log,
    ndarray,
    ones,
    sqrt,
    squeeze,
)
from numpy.linalg import inv
from pandas import DataFrame, Series, concat, to_numeric
import scipy.stats as stats
from statsmodels.iolib.summary import SimpleTable, fmt_2cols, fmt_params
from statsmodels.iolib.table import default_txt_fmt

import linearmodels
from linearmodels.iv._utility import annihilate, proj
from linearmodels.iv.data import IVData
from linearmodels.shared.base import _ModelComparison, _SummaryStr
from linearmodels.shared.hypotheses import (
    InvalidTestStatistic,
    WaldTestStatistic,
    quadratic_form_test,
)
from linearmodels.shared.io import _str, add_star, pval_format
from linearmodels.typing import ArrayLike, Float64Array


def stub_concat(
    lists: Sequence[list[str] | tuple[str, ...]], sep: str = "="
) -> list[str]:
    col_size = max(max(map(len, stubs)) for stubs in lists)
    out: list[str] = []
    for stubs in lists:
        out.extend(stubs)
        out.append(sep * (col_size + 2))
    return out[:-1]


def table_concat(lists: Sequence[list[list[str]]], sep: str = "=") -> list[list[str]]:
    col_sizes = []
    for table in lists:
        size = [[len(item) for item in row] for row in table]
        size_arr = array(size)
        col_sizes.append(list(asarray(size_arr.max(0))))
    col_size = asarray(array(col_sizes).max(axis=0))
    sep_cols: list[str] = [sep * (cs + 2) for cs in col_size]
    out: list[list[str]] = []
    for table in lists:
        out.extend(table)
        out.append(sep_cols)
    return out[:-1]


class _LSModelResultsBase(_SummaryStr):
    """
    Results from OLS model estimation

    Parameters
    ----------
    results : dict[str, any]
        A dictionary of results from the model estimation.
    model : _OLS
        The model used to estimate parameters.
    """

    def __init__(self, results: dict[str, Any], model: Any) -> None:
        self._resid = results["eps"]
        self._wresid = results["weps"]
        self._params = results["params"]
        self._cov = results["cov"]
        self.model = model
        self._r2 = results["r2"]
        self._cov_type = results["cov_type"]
        self._rss = results["residual_ss"]
        self._tss = results["total_ss"]
        self._s2 = results["s2"]
        self._debiased = results["debiased"]
        self._f_statistic = results["fstat"]
        self._vars = results["vars"]
        self._cov_config = results["cov_config"]
        self._method = results["method"]
        self._kappa = results.get("kappa", None)
        self._datetime = dt.datetime.now()
        self._cov_estimator = results["cov_estimator"]
        self._original_index = results["original_index"]
        self._fitted = results["fitted"]
        self._df_model = results.get("df_model", self._params.shape[0])

    @property
    def cov_config(self) -> dict[str, Any]:
        """Parameter values from covariance estimator"""
        return self._cov_config

    @property
    def cov_estimator(self) -> str:
        """Type of covariance estimator used to compute covariance"""
        return self._cov_type

    @property
    def cov(self) -> DataFrame:
        """Estimated covariance of parameters"""
        return self._cov

    @property
    def params(self) -> Series:
        """Estimated parameters"""
        return self._params

    @cached_property
    def resids(self) -> Series:
        """Estimated residuals"""
        return self._resid()

    @cached_property
    def fitted_values(self) -> Series:
        """Fitted values"""
        return self._fitted()

    @property
    def idiosyncratic(self) -> Series:
        """
        Idiosyncratic error

        Notes
        -----
        Differs from resids since this is the estimated idiosyncratic shock
        from the data. It has the same dimension as the dependent data.
        The shape and nature of resids depends on the model estimated. These
        estimates only depend on the model estimated through the estimation
        of parameters and inclusion of effects, if any.
        """
        return self.resids

    @cached_property
    def wresids(self) -> Series:
        """Weighted estimated residuals"""
        return self._wresid()

    @property
    def nobs(self) -> int:
        """Number of observations"""
        return self.model.dependent.shape[0]

    @property
    def df_resid(self) -> int:
        """Residual degree of freedom"""
        return self.nobs - self.df_model

    @property
    def df_model(self) -> int:
        """Model degree of freedom"""
        return int(self._df_model)

    @property
    def has_constant(self) -> bool:
        """Flag indicating the model includes a constant or equivalent"""
        return self.model.has_constant

    @property
    def rsquared(self) -> float:
        """Coefficient of determination (R**2)"""
        return self._r2

    @property
    def rsquared_adj(self) -> float:
        """Sample-size adjusted coefficient of determination (R**2)"""
        n, k, c = self.nobs, self.df_model, int(self.has_constant)
        return 1 - ((n - c) / (n - k)) * (1 - self._r2)

    @property
    def cov_type(self) -> str:
        """Covariance estimator used"""
        return self._cov_type

    @cached_property
    def std_errors(self) -> Series:
        """Estimated parameter standard errors"""
        std_errors = sqrt(diag(self.cov))
        return Series(std_errors, index=self._vars, name="stderr")

    @cached_property
    def tstats(self) -> Series:
        """Parameter t-statistics"""
        return Series(self._params / self.std_errors, name="tstat")

    @cached_property
    def pvalues(self) -> Series:
        """
        Parameter p-vals. Uses t(df_resid) if ``debiased`` is True, else normal
        """
        if self.debiased:
            pvals = 2 - 2 * stats.t.cdf(abs(self.tstats), self.df_resid)
        else:
            pvals = 2 - 2 * stats.norm.cdf(abs(self.tstats))

        return Series(pvals, index=self._vars, name="pvalue")

    @property
    def total_ss(self) -> float:
        """Total sum of squares"""
        return self._tss

    @property
    def model_ss(self) -> float:
        """Residual sum of squares"""
        return self._tss - self._rss

    @property
    def resid_ss(self) -> float:
        """Residual sum of squares"""
        return self._rss

    @property
    def s2(self) -> float:
        """Residual variance estimator"""
        return self._s2

    @property
    def debiased(self) -> bool:
        """Flag indicating whether covariance uses a small-sample adjustment"""
        return self._debiased

    @property
    def f_statistic(self) -> WaldTestStatistic:
        """
        Model F-statistic

        Returns
        -------
        WaldTestStatistic
            Test statistic for null all coefficients excluding constant terms
            are zero.

        Notes
        -----
        Despite name, always implemented using a quadratic-form test based on
        estimated parameter covariance. Default is to use a chi2 distribution
        to compute p-values. If ``debiased`` is True, divides statistic by
        number of parameters tested and uses an F-distribution.

        This version of the F-statistic directly uses the model covariance
        estimator and so is robust against the same specification issues.
        """
        return self._f_statistic

    @property
    def method(self) -> str:
        """Method used to estimate model parameters"""
        return self._method

    def conf_int(self, level: float = 0.95) -> DataFrame:
        """
        Confidence interval construction

        Parameters
        ----------
        level : float
            Confidence level for interval

        Returns
        -------
        DataFrame
            Confidence interval of the form [lower, upper] for each parameters

        Notes
        -----
        Uses a t(df_resid) if ``debiased`` is True, else normal.
        """
        ci_quantiles = [(1 - level) / 2, 1 - (1 - level) / 2]
        if self._debiased:
            q = stats.t.ppf(ci_quantiles, self.df_resid)
        else:
            q = stats.norm.ppf(ci_quantiles)
        q = q[None, :]
        ci = asarray(self.params)[:, None] + asarray(self.std_errors)[:, None] * q
        return DataFrame(ci, index=self._vars, columns=["lower", "upper"])

    def _top_right(self) -> list[tuple[str, str]]:
        f_stat = _str(self.f_statistic.stat)
        if isnan(self.f_statistic.stat):
            f_stat = "      N/A"

        return [
            ("R-squared:", _str(self.rsquared)),
            ("Adj. R-squared:", _str(self.rsquared_adj)),
            ("F-statistic:", f_stat),
            ("P-value (F-stat)", pval_format(self.f_statistic.pval)),
            ("Distribution:", str(self.f_statistic.dist_name)),
            ("", ""),
            ("", ""),
        ]

    @property
    def summary(self) -> Summary:
        """
        Model estimation summary.

        Returns
        -------
        Summary
            Summary table of model estimation results

        Notes
        -----
        Supports export to csv, html and latex  using the methods ``summary.as_csv()``,
        ``summary.as_html()`` and ``summary.as_latex()``.
        """

        title = self._method + " Estimation Summary"
        mod = self.model
        top_left = [
            ("Dep. Variable:", mod.dependent.cols[0]),
            ("Estimator:", self._method),
            ("No. Observations:", self.nobs),
            ("Date:", self._datetime.strftime("%a, %b %d %Y")),
            ("Time:", self._datetime.strftime("%H:%M:%S")),
            ("Cov. Estimator:", self._cov_type),
            ("", ""),
        ]

        top_right = self._top_right()

        stubs = []
        vals = []
        for stub, val in top_left:
            stubs.append(stub)
            vals.append([val])
        table = SimpleTable(vals, txt_fmt=fmt_2cols, title=title, stubs=stubs)

        # create summary table instance
        smry = Summary()
        # Top Table
        # Parameter table
        fmt = fmt_2cols
        fmt["data_fmts"][1] = "%18s"

        top_right = [("%-21s" % ("  " + k), v) for k, v in top_right]
        stubs = []
        vals = []
        for stub, val in top_right:
            stubs.append(stub)
            vals.append([val])
        table.extend_right(SimpleTable(vals, stubs=stubs))
        smry.tables.append(table)

        param_data = c_[
            self.params.values[:, None],
            self.std_errors.values[:, None],
            self.tstats.values[:, None],
            self.pvalues.values[:, None],
            self.conf_int(),
        ]
        data = []
        for row in param_data:
            txt_row = []
            for i, v in enumerate(row):
                f = _str
                if i == 3:
                    f = pval_format
                txt_row.append(f(v))
            data.append(txt_row)
        title = "Parameter Estimates"
        table_stubs = list(self.params.index)
        extra_text = []
        if table_stubs:
            header = [
                "Parameter",
                "Std. Err.",
                "T-stat",
                "P-value",
                "Lower CI",
                "Upper CI",
            ]
            table = SimpleTable(
                data, stubs=table_stubs, txt_fmt=fmt_params, headers=header, title=title
            )
            smry.tables.append(table)
        else:
            extra_text.append("Model contains no parameters")

        extra_text = self._update_extra_text(extra_text)
        if extra_text:
            smry.add_extra_txt(extra_text)

        return smry

    def _update_extra_text(self, extra_text: list[str]) -> list[str]:
        return extra_text

    def wald_test(
        self,
        restriction: DataFrame | ndarray | None = None,
        value: Series | ndarray | None = None,
        *,
        formula: str | list[str] | dict[str, float] | None = None,
    ) -> WaldTestStatistic:
        r"""
        Test linear equality constraints using a Wald test

        Parameters
        ----------
        restriction : {ndarray, DataFrame}
            q by nvar array containing linear weights to apply to parameters
            when forming the restrictions. It is not possible to use both
            restriction and formula.
        value : {ndarray, Series}
            q element array containing the restricted values.
        formula : {str, list[str]}
            formulaic linear constraints. The simplest formats are one of:

            * A single comma-separated string such as "x1=0, x2+x3=1"
            * A list of strings where each element is a single constraint
              such as ["x1=0", "x2+x3=1"]
            * A single string without commas to test simple constraints such
              as "x1=x2=x3=0"
            * A dictionary where each key is a parameter restriction and
              the corresponding value is the restriction value, e.g.,
              {"x1": 0, "x2+x3": 1}.

            It is not possible to use both ``restriction`` and ``formula``.

        Returns
        -------
        WaldTestStatistic
            Test statistic for null that restrictions are valid.

        Notes
        -----
        Hypothesis test examines whether :math:`H_0:C\theta=v` where the
        matrix C is ``restriction`` and v is ``value``. The test statistic
        has a :math:`\chi^2_q` distribution where q is the number of rows in C.

        Examples
        --------
        >>> import numpy as np
        >>> from linearmodels.datasets import wage
        >>> from linearmodels.iv import IV2SLS
        >>> data = wage.load()
        >>> formula = "np.log(wage) ~ 1 + exper + I(exper**2) + brthord + [educ ~ sibs]"
        >>> res = IV2SLS.from_formula(formula, data).fit()

        Testing the experience is not needed in the model

        >>> restriction = np.array([[0, 1, 0, 0, 0],
        ...                         [0, 0, 1, 0, 0]])
        >>> value = np.array([0, 0])
        >>> wald_res = res.wald_test(restriction, value)

        Using the formula interface to test the same restrictions

        >>> formula = "exper = I(exper**2) = 0"
        >>> wald_res = res.wald_test(formula=formula)

        Using the formula interface with a list

        >>> wald_res = res.wald_test(formula=["exper = 0", "I(exper**2) = 0"])
        """
        return quadratic_form_test(
            self._params,
            self.cov,
            restriction=restriction,
            value=value,
            formula=formula,
        )


[docs] class OLSResults(_LSModelResultsBase): """ Results from OLS model estimation Parameters ---------- results : dict[str, any] A dictionary of results from the model estimation. model : _OLS The model used to estimate parameters. """ def __init__( self, results: dict[str, Any], model: linearmodels.iv.model._IVModelBase, ) -> None: super().__init__(results, model) def _out_of_sample( self, exog: ArrayLike | None, endog: ArrayLike | None, data: ArrayLike | None, missing: bool | None, ) -> DataFrame: """Interface between model predict and predict for OOS fits""" if not (exog is None and endog is None) and data is not None: raise ValueError( "Predictions can only be constructed using one " "of exog/endog or data, but not both." ) pred = self.model.predict(self.params, exog=exog, endog=endog, data=data) if not missing: pred = pred.loc[pred.notnull().all(1)] return pred
[docs] def predict( self, exog: ArrayLike | None = None, endog: ArrayLike | None = None, *, data: DataFrame | None = None, fitted: bool = True, idiosyncratic: bool = False, missing: bool = False, ) -> DataFrame: """ In- and out-of-sample predictions Parameters ---------- exog : array_like Exogenous values to use in out-of-sample prediction (nobs by nexog) endog : array_like Endogenous values to use in out-of-sample prediction (nobs by nendog) data : DataFrame DataFrame to use for out-of-sample predictions when model was constructed using a formula. fitted : bool Flag indicating whether to include the fitted values idiosyncratic : bool Flag indicating whether to include the estimated idiosyncratic shock missing : bool Flag indicating to adjust for dropped observations. If True, the values returned will have the same size as the original input data before filtering missing values. If False, then missing observations will not be returned. Returns ------- DataFrame DataFrame containing columns for all selected outputs Notes ----- If `exog`, `endog` and `data` are all `None`, in-sample predictions (fitted values) will be returned. If `data` is not none, then `exog` and `endog` must be none. Predictions from models constructed using formulas can be computed using either `exog` and `endog`, which will treat these are arrays of values corresponding to the formula-process data, or using `data` which will be processed using the formula used to construct the values corresponding to the original model specification. """ if not (exog is None and endog is None and data is None): return self._out_of_sample(exog, endog, data, missing) out = [] if fitted: out.append(self.fitted_values) if idiosyncratic: out.append(self.idiosyncratic) if len(out) == 0: raise ValueError("At least one output must be selected") out_df: DataFrame = concat(out, axis=1) if missing: index = self._original_index out_df = out_df.reindex(index) return out_df
@property def kappa(self) -> float: """k-class estimator value""" return self._kappa def _update_extra_text(self, extra_text: list[str]) -> list[str]: instruments = self.model.instruments if instruments.shape[1] > 0: endog = self.model.endog extra_text.append("Endogenous: " + ", ".join(endog.cols)) extra_text.append("Instruments: " + ", ".join(instruments.cols)) cov_descr = str(self._cov_estimator) for line in cov_descr.split("\n"): extra_text.append(line) return extra_text
[docs] class AbsorbingLSResults(_LSModelResultsBase): """ Results from IV estimation Parameters ---------- results : dict[str, any] A dictionary of results from the model estimation. model : AbsorbingLS The model used to estimate parameters. """ def __init__( self, results: dict[str, Any], model: linearmodels.iv.absorbing.AbsorbingLS ) -> None: super().__init__(results, model) self._absorbed_rsquared = results["absorbed_r2"] self._absorbed_effects = results["absorbed_effects"] def _top_right(self) -> list[tuple[str, str]]: f_stat = _str(self.f_statistic.stat) if isnan(self.f_statistic.stat): f_stat = " N/A" return [ ("R-squared:", _str(self.rsquared)), ("Adj. R-squared:", _str(self.rsquared_adj)), ("F-statistic:", f_stat), ("P-value (F-stat):", pval_format(self.f_statistic.pval)), ("Distribution:", str(self.f_statistic.dist_name)), ("R-squared (No Effects):", _str(round(self.absorbed_rsquared, 5))), ("Variables Absorbed:", _str(self.df_absorbed)), ] @property def absorbed_rsquared(self) -> float: """Coefficient of determination (R**2), ignoring absorbed variables""" return self._absorbed_rsquared @cached_property def absorbed_effects(self) -> DataFrame: """Fitted values from only absorbed terms""" return self._absorbed_effects() @property def df_absorbed(self) -> int: """Number of variables absorbed""" return self.df_model - self.params.shape[0]
[docs] class FirstStageResults(_SummaryStr): """ First stage estimation results and diagnostics """ def __init__( self, dep: IVData, exog: IVData, endog: IVData, instr: IVData, weights: IVData, cov_type: str, cov_config: dict[str, Any], ) -> None: self.dep = dep self.exog = exog self.endog = endog self.instr = instr self.weights = weights reg = c_[self.exog.ndarray, self.endog.ndarray] self._reg = DataFrame(reg, columns=self.exog.cols + self.endog.cols) self._cov_type = cov_type self._cov_config = cov_config @cached_property def diagnostics(self) -> DataFrame: """ Post estimation diagnostics of first-stage fit Returns ------- DataFrame DataFrame where each endogenous variable appears as a row and the columns contain alternative measures. The columns are: * rsquared - R-squared from regression of endogenous on exogenous and instruments * partial.rsquared - R-squared from regression of the exogenous variable on instruments where both the exogenous variable and the instrument have been orthogonalized to the exogenous regressors in the model. * f.stat - Test that all coefficients are zero in the model used to estimate the partial R-squared. Uses a standard F-test when the covariance estimator is unadjusted - otherwise uses a Wald test statistic with a chi2 distribution. * f.pval - P-value of the test that all coefficients are zero in the model used to estimate the partial R-squared * f.dist - Distribution of f.stat * shea.rsquared - Shea's r-squared which measures the correlation between the projected and orthogonalized instrument on the orthogonalized endogenous regressor where the orthogonalization is with respect to the other included variables in the model. """ from linearmodels.iv.model import _OLS, IV2SLS endog, exog, instr, weights = self.endog, self.exog, self.instr, self.weights w = sqrt(weights.ndarray) z = w * instr.ndarray nz = z.shape[1] x = w * exog.ndarray ez = annihilate(z, x) individual_results = self.individual out_df = DataFrame( index=["rsquared", "partial.rsquared", "f.stat", "f.pval", "f.dist"], columns=[], ) for col in endog.pandas: # TODO: BUG in pandas-stube # https://github.com/pandas-dev/pandas-stubs/issues/97 y = w * endog.pandas[[col]].values ey = annihilate(y, x) partial = _OLS(ey, ez).fit(cov_type=self._cov_type, **self._cov_config) full = individual_results[str(col)] params = full.params.values[-nz:] params = params[:, None] c = asarray(full.cov)[-nz:, -nz:] stat = params.T @ inv(c) @ params stat = float(stat.squeeze()) if full.cov_type in ("homoskedastic", "unadjusted"): df_denom = full.df_resid stat /= params.shape[0] else: df_denom = None w_test = WaldTestStatistic( stat, null="", df=params.shape[0], df_denom=df_denom ) inner = { "rsquared": full.rsquared, "partial.rsquared": partial.rsquared, "f.stat": w_test.stat, "f.pval": w_test.pval, "f.dist": w_test.dist_name, } out_df[col] = Series(inner) out_df = out_df.T dep = self.dep r2sls = IV2SLS(dep, exog, endog, instr, weights=weights).fit( cov_type="unadjusted" ) rols = _OLS(dep, self._reg, weights=weights).fit(cov_type="unadjusted") shea = (rols.std_errors / r2sls.std_errors) ** 2 shea *= (1 - r2sls.rsquared) / (1 - rols.rsquared) out_df["shea.rsquared"] = shea[out_df.index] cols = [ "rsquared", "partial.rsquared", "shea.rsquared", "f.stat", "f.pval", "f.dist", ] out_df = out_df[cols] for col in out_df: try: out_df[col] = to_numeric(out_df[col]) except ValueError: # If an error is raised, ignore and keep the column pass return out_df @cached_property def individual(self) -> dict[str, OLSResults]: """ Individual model results from first-stage regressions Returns ------- dict Dictionary containing first stage estimation results. Keys are the variable names of the endogenous regressors. """ from linearmodels.iv.model import _OLS exog_instr = DataFrame( c_[self.exog.ndarray, self.instr.ndarray], columns=self.exog.cols + self.instr.cols, ) res: dict[str, OLSResults] = {} for col in self.endog.pandas: dep = self.endog.pandas[col] mod = _OLS(dep, exog_instr, weights=self.weights.ndarray) res[str(col)] = mod.fit(cov_type=self._cov_type, **self._cov_config) return res @property def summary(self) -> Summary: """ Model estimation summary. Returns ------- Summary Summary table of model estimation results Notes ----- Supports export to csv, html and latex using the methods ``summary.as_csv()``, ``summary.as_html()`` and ``summary.as_latex()``. """ smry = Summary() if not self.individual: table = SimpleTable([[]]) smry.tables.append(table) smry.add_extra_txt( ["Model contains no endogenous variables. No first stage results."] ) return smry stubs_lookup = { "rsquared": "R-squared", "partial.rsquared": "Partial R-squared", "shea.rsquared": "Shea's R-squared", "f.stat": "Partial F-statistic", "f.pval": "P-value (Partial F-stat)", "f.dist": "Partial F-stat Distn", } diagnostics = self.diagnostics vals = [] for c in diagnostics: if c != "f.dist": vals.append([_str(v) for v in diagnostics[c]]) else: vals.append([v for v in diagnostics[c]]) stubs = [stubs_lookup[s] for s in list(diagnostics.columns)] header = list(diagnostics.index) params = [] for var in header: res = self.individual[var] v = c_[res.params.values, res.tstats.values] params.append(v.ravel()) params_arr = array(params) params_fmt = [[_str(val) for val in row] for row in params_arr.T] for i in range(1, len(params_fmt), 2): for j in range(len(params_fmt[i])): params_fmt[i][j] = f"({params_fmt[i][j]})" params_stub = [] for var in res.params.index: params_stub.extend([var, ""]) title = "First Stage Estimation Results" vals = table_concat((vals, params_fmt)) stubs = stub_concat((stubs, params_stub)) txt_fmt = default_txt_fmt.copy() txt_fmt["data_aligns"] = "r" txt_fmt["header_align"] = "r" table = SimpleTable( vals, headers=header, title=title, stubs=stubs, txt_fmt=txt_fmt ) smry.tables.append(table) extra_txt = [ "T-stats reported in parentheses", "T-stats use same covariance type as original model", ] smry.add_extra_txt(extra_txt) return smry
class _CommonIVResults(OLSResults): """ Results from IV estimation """ def __init__( self, results: dict[str, Any], model: linearmodels.iv.model._IVModelBase, ) -> None: super().__init__(results, model) self._liml_kappa = results.get("liml_kappa", None) @property def first_stage(self) -> FirstStageResults: """ First stage regression results Returns ------- FirstStageResults Object containing results for diagnosing instrument relevance issues. """ return FirstStageResults( self.model.dependent, self.model.exog, self.model.endog, self.model.instruments, self.model.weights, self._cov_type, self._cov_config, )
[docs] class IVResults(_CommonIVResults): """ Results from IV estimation Parameters ---------- results : dict[str, any] A dictionary of results from the model estimation. model : {IV2SLS, IVLIML} The model used to estimate parameters. """ def __init__( self, results: dict[str, Any], model: linearmodels.iv.model._IVLSModelBase ) -> None: super().__init__(results, model) self._kappa = results.get("kappa", 1) @cached_property def sargan(self) -> InvalidTestStatistic | WaldTestStatistic: r""" Sargan test of overidentifying restrictions Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- Requires more instruments than endogenous variables Tests the ratio of re-projected IV regression residual variance to variance of the IV residuals. .. math :: n(1-\hat{\epsilon}^{\prime}M_{Z}\hat{\epsilon}/ \hat{\epsilon}^{\prime}\hat{\epsilon})\sim\chi_{v}^{2} where :math:`M_{z}` is the annihilator matrix where z is the set of instruments and :math:`\hat{\epsilon}` are the residuals from the IV estimator. The degree of freedom is the difference between the number of instruments and the number of endogenous regressors. .. math :: v = n_{instr} - n_{exog} """ z = self.model.instruments.ndarray nobs, ninstr = z.shape nendog = self.model.endog.shape[1] name = "Sargan's test of overidentification" if ninstr - nendog == 0: return InvalidTestStatistic( "Test requires more instruments than endogenous variables.", name=name, ) eps = self.resids.values[:, None] u = annihilate(eps, self.model._z) stat = nobs * (1 - (u.T @ u) / (eps.T @ eps)).squeeze() null = "The model is not overidentified." return WaldTestStatistic(stat, null, ninstr - nendog, name=name) @cached_property def basmann(self) -> InvalidTestStatistic | WaldTestStatistic: r""" Basmann's test of overidentifying restrictions Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- Requires more instruments than endogenous variables Tests is a small-sample version of Sargan's test that has the same distribution. .. math :: s (n - n_{instr}) / (n - s) \sim \chi^2_{v} where :math:`n_{instr}` is the number of instruments, :math:`n_{exog}` is the number of exogenous regressors and :math:`n_{endog}` is the number of endogenous regressors. The degree of freedom is the difference between the number of instruments and the number of endogenous regressors. .. math :: v = n_{instr} - n_{exog} """ mod = self.model ninstr = mod.instruments.shape[1] nobs, nendog = mod.endog.shape nz = mod._z.shape[1] name = "Basmann's test of overidentification" if ninstr - nendog == 0: return InvalidTestStatistic( "Test requires more instruments than " "endogenous variables.", name=name, ) sargan_test = self.sargan s = sargan_test.stat stat = s * (nobs - nz) / (nobs - s) return WaldTestStatistic(stat, sargan_test.null, sargan_test.df, name=name) def _endogeneity_setup( self, variables: str | list[str] | None = None ) -> tuple[ndarray, ndarray, ndarray, int, int, int, int]: """Setup function for some endogeneity iv""" if isinstance(variables, str): variables = [variables] elif variables is not None and not isinstance(variables, list): raise TypeError("variables must be a str or a list of str.") nobs = self.model.dependent.shape[0] e2 = asarray(self.resids.values) nendog, nexog = self.model.endog.shape[1], self.model.exog.shape[1] if variables is None: assumed_exog = self.model.endog.ndarray aug_exog = c_[self.model.exog.ndarray, assumed_exog] still_endog = empty((nobs, 0)) else: assert isinstance(variables, list) assumed_exog = self.model.endog.pandas[variables].values ex = [c for c in self.model.endog.cols if c not in variables] still_endog = self.model.endog.pandas[ex].values aug_exog = c_[self.model.exog.ndarray, assumed_exog] ntested = assumed_exog.shape[1] from linearmodels.iv import IV2SLS mod = IV2SLS( self.model.dependent, aug_exog, still_endog, self.model.instruments ) e0 = mod.fit().resids.values[:, None] z2 = c_[self.model.exog.ndarray, self.model.instruments.ndarray] z1 = c_[z2, assumed_exog] e1 = proj(e0, z1) e2 = proj(e2, self.model.instruments.ndarray) return e0, e1, e2, nobs, nexog, nendog, ntested
[docs] def durbin(self, variables: str | list[str] | None = None) -> WaldTestStatistic: r""" Durbin's test of exogeneity Parameters ---------- variables : {str, list[str]} List of variables to test for exogeneity. If None, all variables are jointly tested. Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- Test statistic is difference between sum of squared OLS and sum of squared IV residuals where each set of residuals has been projected onto the set of instruments in the IV model. Start by defining .. math :: \delta & = \hat{\epsilon}'_e P_{[z,w]} \hat{\epsilon}_e - \hat{\epsilon}'_c P_{z} \hat{\epsilon}_c where :math:`\hat{\epsilon}_e` are the regression residuals from a model where ``vars`` are treated as exogenous, :math:`\hat{\epsilon}_c` are the regression residuals from the model leaving ``vars`` as endogenous, :math:`P_{[z,w]}` is a projection matrix onto the exogenous variables and instruments (`z`) as well as ``vars``, and :math:`P_{z}` is a projection matrix only onto `z`. The test statistic is then .. math :: \delta / (\hat{\epsilon}'_e\hat{\epsilon}_e) / n \sim \chi^2_{q} where :math:`q` is the number of variables tested. """ null = "All endogenous variables are exogenous" if variables is not None: null = "Variables {} are exogenous".format(", ".join(variables)) e0, e1, e2, nobs, _, _, ntested = self._endogeneity_setup(variables) stat = e1.T @ e1 - e2.T @ e2 stat /= (e0.T @ e0) / nobs name = "Durbin test of exogeneity" df = ntested return WaldTestStatistic(float(squeeze(stat)), null, df, name=name)
[docs] def wu_hausman(self, variables: str | list[str] | None = None) -> WaldTestStatistic: r""" Wu-Hausman test of exogeneity Parameters ---------- variables : {str, list[str]} List of variables to test for exogeneity. If None, all variables are jointly tested. Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- Test statistic is difference between sum of squared OLS and sum of squared IV residuals where each set of residuals has been projected onto the set of instruments in the IV model. Start by defining .. math :: \delta & = \hat{\epsilon}'_e P_{[z,w]} \hat{\epsilon}_e - \hat{\epsilon}'_c P_{z} \hat{\epsilon}_c where :math:`\hat{\epsilon}_e` are the regression residuals from a model where ``vars`` are treated as exogenous, :math:`\hat{\epsilon}_c` are the regression residuals from the model leaving ``vars`` as endogenous, :math:`P_{[z,w]}` is a projection matrix onto the exogenous variables and instruments (`z`) as well as ``vars``, and :math:`P_{z}` is a projection matrix only onto `z`. The test statistic is then .. math :: \frac{\delta / q}{(\hat{\epsilon}'_e\hat{\epsilon}_e - \delta) / v} where :math:`q` is the number of variables iv, :math:`v = n - n_{endog} - n_{exog} - q`. The test statistic has a :math:`F_{q, v}` distribution. """ null = "All endogenous variables are exogenous" if variables is not None: null = "Variables {} are exogenous".format(", ".join(variables)) e0, e1, e2, nobs, nexog, nendog, ntested = self._endogeneity_setup(variables) df = ntested df_denom = nobs - nexog - nendog - ntested delta = e1.T @ e1 - e2.T @ e2 stat = delta / df stat /= (e0.T @ e0 - delta) / df_denom stat = float(squeeze(stat)) name = "Wu-Hausman test of exogeneity" return WaldTestStatistic(stat, null, df, df_denom, name=name)
@cached_property def wooldridge_score(self) -> WaldTestStatistic: r""" Wooldridge's score test of exogeneity Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- Wooldridge's test examines whether there is correlation between the errors produced when the endogenous variable are treated as exogenous so that the model can be fit by OLS, and the component of the endogenous variables that cannot be explained by the instruments. The test is implemented using a regression, .. math :: 1 = \gamma_1 \hat{\epsilon}_1 \hat{v}_{1,i} + \ldots + \gamma_p \hat{\epsilon}_1 \hat{v}_{p,i} + \eta_i where :math:`\hat{v}_{j,i}` is the residual from regressing endogenous variable :math:`x_j` on the exogenous variables and instruments. The test is a :math:`n\times R^2 \sim \chi^2_{p}`. Implemented using the expression in Wooldridge (2002), Eq. 6.19 """ from linearmodels.iv.model import _OLS e = annihilate(self.model.dependent.ndarray, self.model._x) r = annihilate(self.model.endog.ndarray, self.model._z) nobs = e.shape[0] r = annihilate(r, self.model._x) res = _OLS(ones((nobs, 1)), r * e).fit(cov_type="unadjusted") stat = res.nobs - res.resid_ss df = self.model.endog.shape[1] null = "Endogenous variables are exogenous" name = "Wooldridge's score test of exogeneity" return WaldTestStatistic(stat, null, df, name=name) @cached_property def wooldridge_regression(self) -> WaldTestStatistic: r""" Wooldridge's regression test of exogeneity Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- Wooldridge's test examines whether there is correlation between the components of the endogenous variables that cannot be explained by the instruments and the OLS regression residuals. The test is implemented as an OLS where .. math :: y_i = x_{1i}\beta_i + x_{2i}\beta_2 + \hat{e}_i\gamma + \epsilon_i where :math:`x_{1i}` are the exogenous regressors, :math:`x_{2i}` are the endogenous regressors and :math:`\hat{e}_{i}` are the residuals from regressing the endogenous variables on the exogenous variables and instruments. The null is :math:`\gamma=0` and is implemented using a Wald test. The covariance estimator used in the test is identical to the covariance estimator used with ``fit``. """ from linearmodels.iv.model import _OLS r = annihilate(self.model.endog.ndarray, self.model._z) augx = c_[self.model._x, r] mod = _OLS(self.model.dependent, augx) res = mod.fit(cov_type=self.cov_type, **self.cov_config) norig = self.model._x.shape[1] test_params = asarray(res.params.values[norig:], dtype=float) test_cov = res.cov.values[norig:, norig:] stat = test_params.T @ inv(test_cov) @ test_params df = len(test_params) null = "Endogenous variables are exogenous" name = "Wooldridge's regression test of exogeneity" return WaldTestStatistic(stat, null, df, name=name) @cached_property def wooldridge_overid(self) -> InvalidTestStatistic | WaldTestStatistic: r""" Wooldridge's score test of overidentification Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- Wooldridge's test examines whether there is correlation between the model residuals and the component of the instruments that is orthogonal to the endogenous variables. Define :math:`\tilde{z}` to be the residuals of the instruments regressed on the exogenous variables and the first-stage fitted values of the endogenous variables. The test is computed as a regression .. math :: 1 = \gamma_1 \hat{\epsilon}_i \tilde{z}_{i,1} + \ldots + \gamma_q \hat{\epsilon}_i \tilde{z}_{i,q} where :math:`q = n_{instr} - n_{endog}`. The test is a :math:`n\times R^2 \sim \chi^2_{q}`. The order of the instruments does not affect this test. """ from linearmodels.iv.model import _OLS exog, endog = self.model.exog, self.model.endog instruments = self.model.instruments nobs, nendog = endog.shape ninstr = instruments.shape[1] name = "Wooldridge's score test of overidentification" if ninstr - nendog == 0: return InvalidTestStatistic( "Test requires more instruments than " "endogenous variables.", name=name, ) endog_hat = proj(endog.ndarray, c_[exog.ndarray, instruments.ndarray]) q = instruments.ndarray[:, : (ninstr - nendog)] q_res = annihilate(q, c_[self.model.exog.ndarray, endog_hat]) test_functions = q_res * self.resids.values[:, None] res = _OLS(ones((nobs, 1)), test_functions).fit(cov_type="unadjusted") stat = res.nobs * res.rsquared df = ninstr - nendog null = "Model is not overidentified." return WaldTestStatistic(stat, null, df, name=name) @cached_property def anderson_rubin(self) -> InvalidTestStatistic | WaldTestStatistic: r""" Anderson-Rubin test of overidentifying restrictions Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- The Anderson-Rubin test examines whether the value of :math:`\kappa` computed for the LIML estimator is sufficiently close to one to indicate the model is not overidentified. The test statistic is .. math :: n \ln(\hat{\kappa}) \sim \chi^2_{q} where :math:`q = n_{instr} - n_{endog}`. """ nobs, ninstr = self.model.instruments.shape nendog = self.model.endog.shape[1] name = "Anderson-Rubin test of overidentification" if ninstr - nendog == 0: return InvalidTestStatistic( "Test requires more instruments than " "endogenous variables.", name=name, ) stat = nobs * log(self._liml_kappa) df = ninstr - nendog null = "The model is not overidentified." return WaldTestStatistic(stat, null, df, name=name) @cached_property def basmann_f(self) -> InvalidTestStatistic | WaldTestStatistic: r""" Basmann's F test of overidentifying restrictions Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- Basmann's F test examines whether the value of :math:`\kappa` computed for the LIML estimator is sufficiently close to one to indicate the model is not overidentified. The test statistic is .. math :: \hat{\kappa} (n -n_{instr})/q \sim F_{q, n - n_{instr}} where :math:`q = n_{instr} - n_{endog}`. """ nobs, ninstr = self.model.instruments.shape nendog, nexog = self.model.endog.shape[1], self.model.exog.shape[1] name = "Basmann' F test of overidentification" if ninstr - nendog == 0: return InvalidTestStatistic( "Test requires more instruments than " "endogenous variables.", name=name, ) df = ninstr - nendog df_denom = nobs - (nexog + ninstr) stat = (self._liml_kappa - 1) * df_denom / df null = "The model is not overidentified." return WaldTestStatistic(stat, null, df, df_denom=df_denom, name=name)
[docs] class IVGMMResults(_CommonIVResults): """ Results from GMM estimation of IV models Parameters ---------- results : dict[str, any] A dictionary of results from the model estimation. model : {IVGMM, IVGMMCUE} The model used to estimate parameters. """ def __init__( self, results: dict[str, Any], model: linearmodels.iv.model._IVGMMBase ): super().__init__(results, model) self._weight_mat = results["weight_mat"] self._weight_type = results["weight_type"] self._weight_config = results["weight_config"] self._iterations = results["iterations"] self._j_stat = results["j_stat"] @property def weight_matrix(self) -> Float64Array: """Weight matrix used in the final-step GMM estimation""" return self._weight_mat @property def iterations(self) -> int: """Iterations used in GMM estimation""" return self._iterations @property def weight_type(self) -> str: """Weighting matrix method used in estimation""" return self._weight_type @property def weight_config(self) -> dict[str, Any]: """Weighting matrix configuration used in estimation""" return self._weight_config @property def j_stat(self) -> InvalidTestStatistic | WaldTestStatistic: r""" J-test of overidentifying restrictions Returns ------- WaldTestStatistic J statistic test of overidentifying restrictions Notes ----- The J statistic tests whether the moment conditions are sufficiently close to zero to indicate that the model is not overidentified. The statistic is defined as .. math :: n \bar{g}'W^{-1}\bar{g} \sim \chi^2_q where :math:`\bar{g} = n^{-1}\sum \hat{\epsilon}_i z_i` where :math:`z_i` includes both the exogenous variables and instruments and :math:`\hat{\epsilon}_i` are the model residuals. :math:`W` is a consistent estimator of the variance of :math:`\sqrt{n}\bar{g}`. The degree of freedom is :math:`q = n_{instr} - n_{endog}`. """ return self._j_stat
[docs] def c_stat(self, variables: list[str] | str | None = None) -> WaldTestStatistic: r""" C-test of endogeneity Parameters ---------- variables : {str, list[str]} List of variables to test for exogeneity. If None, all variables are jointly tested. Returns ------- WaldTestStatistic Object containing test statistic, p-value, distribution and null Notes ----- The C statistic iv the difference between the model estimated by assuming one or more of the endogenous variables is actually exogenous. The test is implemented as the difference between the J statistic s of two GMM estimations where both use the same weighting matrix. The use of a common weighting matrix is required for the C statistic to be positive. The first model is a estimated uses GMM estimation where one or more of the endogenous variables are assumed to be endogenous. The model would be relatively efficient if the assumption were true, and two quantities are computed, the J statistic, :math:`J_e`, and the moment weighting matrix, :math:`W_e`. WLOG assume the q variables tested are in the final q positions so that the first :math:`n_{exog} + n_{instr}` rows and columns correspond to the moment conditions in the original model. The second J statistic is computed using parameters estimated using the original moment conditions along with the upper left block of :math:`W_e`. Denote this values as :math:`J_c` where the c is used to indicate consistent. The test statistic is then .. math :: J_e - J_c \sim \chi^2_{m} where :math:`m` is the number of variables whose exogeneity is being tested. """ dependent, instruments = self.model.dependent, self.model.instruments exog, endog = self.model.exog, self.model.endog if variables is None: exog_e = c_[exog.ndarray, endog.ndarray] nobs = exog_e.shape[0] endog_e = empty((nobs, 0)) null = "All endogenous variables are exogenous" else: if isinstance(variables, list): variable_lst = variables elif isinstance(variables, str): variable_lst = [variables] else: raise TypeError("variables must be a str or a list of str.") exog_e = c_[exog.ndarray, endog.pandas[variable_lst].values] ex = [c for c in endog.pandas if c not in variable_lst] endog_e = endog.pandas[ex].values null = "Variables {} are exogenous".format(", ".join(variable_lst)) from linearmodels.iv.model import IVGMM, IVGMMCUE mod = IVGMM(dependent, exog_e, endog_e, instruments) res_e = mod.fit(cov_type=self.cov_type, **self.cov_config) assert isinstance(res_e, IVGMMResults) j_e = res_e.j_stat.stat x = self.model._x y = self.model._y z = self.model._z nz = z.shape[1] weight_mat_c = asarray(res_e.weight_matrix)[:nz, :nz] params_c = mod.estimate_parameters(x, y, z, weight_mat_c) assert isinstance(self.model, (IVGMM, IVGMMCUE)) j_c = self.model._j_statistic(params_c, weight_mat_c).stat stat = j_e - j_c df = exog_e.shape[1] - exog.shape[1] return WaldTestStatistic(stat, null, df, name="C-statistic")
AnyResult = Union[IVResults, IVGMMResults, OLSResults]
[docs] class IVModelComparison(_ModelComparison): """ Comparison of multiple models Parameters ---------- results : {list, dict} Set of results to compare. If a dict, the keys will be used as model names. precision : {"tstats","std_errors", "std-errors", "pvalues"} Estimator precision estimator to include in the comparison output. Default is "tstats". stars : bool Add stars based on the p-value of the coefficient where 1, 2 and 3-stars correspond to p-values of 10%, 5% and 1%, respectively. """ _supported = (IVResults, IVGMMResults, OLSResults) def __init__( self, results: Sequence[AnyResult] | dict[str, AnyResult], *, precision: str = "tstats", stars: bool = False, ): super().__init__(results, precision=precision, stars=stars) @property def rsquared_adj(self) -> Series: """Sample-size adjusted coefficients of determination (R**2)""" return self._get_property("rsquared_adj") @property def estimator_method(self) -> Series: """Estimation methods""" return self._get_property("_method") @property def cov_estimator(self) -> Series: """Covariance estimator descriptions""" return self._get_property("cov_estimator") @property def summary(self) -> Summary: """ Model estimation summary. Returns ------- Summary Summary table of model estimation results Notes ----- Supports export to csv, html and latex using the methods ``summary.as_csv()``, ``summary.as_html()`` and ``summary.as_latex()``. """ smry = Summary() models = list(self._results.keys()) title = "Model Comparison" stubs = [ "Dep. Variable", "Estimator", "No. Observations", "Cov. Est.", "R-squared", "Adj. R-squared", "F-statistic", "P-value (F-stat)", ] dep_name: dict[str, str] = {} for key in self._results: dep_name[key] = str(self._results[key].model.dependent.cols[0]) dep_names = Series(dep_name) vals = concat( [ dep_names, self.estimator_method, self.nobs, self.cov_estimator, self.rsquared, self.rsquared_adj, self.f_statistic, ], axis=1, ) vals_list = [[i for i in v] for v in vals.T.values] vals_list[2] = [str(v) for v in vals_list[2]] for i in range(4, len(vals_list)): vals_list[i] = [_str(v) for v in vals_list[i]] params = self.params precision = getattr(self, self._precision) pvalues = asarray(self.pvalues) params_fmt = [] params_stub: list[str] = [] for i in range(len(params)): formatted_and_starred = [] for v, pv in zip(params.values[i], pvalues[i]): formatted_and_starred.append(add_star(_str(v), pv, self._stars)) params_fmt.append(formatted_and_starred) precision_fmt = [] for v in precision.values[i]: v_str = _str(v) v_str = f"({v_str})" if v_str.strip() else v_str precision_fmt.append(v_str) params_fmt.append(precision_fmt) params_stub.append(str(params.index[i])) params_stub.append(" ") vals_tab = table_concat((vals_list, params_fmt)) stubs = stub_concat((stubs, params_stub)) all_instr = [] for key in self._results: res = self._results[key] all_instr.append(res.model.instruments.cols) ninstr = max(map(len, all_instr)) instruments = [] instrument_stub = ["Instruments"] for i in range(ninstr): if i > 0: instrument_stub.append("") row = [] for j in range(len(self._results)): instr = all_instr[j] if len(instr) > i: row.append(instr[i]) else: row.append("") instruments.append(row) if instruments: vals_tab = table_concat((vals_tab, instruments)) stubs = stub_concat((stubs, instrument_stub)) txt_fmt = default_txt_fmt.copy() txt_fmt["data_aligns"] = "r" txt_fmt["header_align"] = "r" table = SimpleTable( vals_tab, headers=models, title=title, stubs=stubs, txt_fmt=txt_fmt ) smry.tables.append(table) prec_type = self._PRECISION_TYPES[self._precision] smry.add_extra_txt([f"{prec_type} reported in parentheses"]) return smry
[docs] def compare( results: dict[str, AnyResult] | Sequence[AnyResult], *, precision: str = "tstats", stars: bool = False, ) -> IVModelComparison: """ Compare the results of multiple models Parameters ---------- results : {list, dict} Set of results to compare. If a dict, the keys will be used as model names. precision : {"tstats","std_errors", "std-errors", "pvalues"} Estimator precision estimator to include in the comparison output. Default is "tstats". stars : bool Add stars based on the p-value of the coefficient where 1, 2 and 3-stars correspond to p-values of 10%, 5% and 1%, respectively. Returns ------- IVModelComparison The model comparison object. """ return IVModelComparison(results, precision=precision, stars=stars)