Source code for linearmodels.system.results

from __future__ import annotations

from linearmodels.compat.statsmodels import Summary

import datetime as dt
from functools import cached_property
from typing import Any, Union

import numpy as np
import pandas
from pandas import DataFrame, Series, concat
from scipy import stats
from statsmodels.iolib.summary import SimpleTable, fmt_2cols

import linearmodels
from linearmodels.shared.base import _SummaryStr
from linearmodels.shared.hypotheses import InvalidTestStatistic, WaldTestStatistic
from linearmodels.shared.io import _str, format_wide, param_table, pval_format
from linearmodels.shared.utility import AttrDict
import linearmodels.typing.data

__all__ = ["Equation", "SystemResults", "SystemEquationResult", "GMMSystemResults"]

Equation = Union[
    tuple[linearmodels.typing.data.ArrayLike, linearmodels.typing.data.ArrayLike],
    dict[str, linearmodels.typing.data.ArrayLike],
]


class _CommonResults(_SummaryStr):
    def __init__(self, results: AttrDict) -> None:
        self._method = results.method
        self._params = results.params
        self._cov = results.cov
        self._param_names = results.param_names
        self._debiased = results.debiased
        self._r2 = results.r2
        self._resid = results.resid
        self._wresid = results.wresid
        self._fitted = results.fitted
        self._nobs = results.nobs
        self._df_model = results.df_model
        self._df_resid = self._nobs - self._df_model
        self._index = results.index
        self._iter = results.iter
        self._cov_type = results.cov_type
        self._tss = results.total_ss
        self._rss = results.resid_ss
        self._datetime = dt.datetime.now()
        self._cov_estimator = results.cov_estimator
        self._cov_config = results.cov_config
        self._original_index = results.original_index

    @property
    def method(self) -> str:
        """Estimation method"""
        return self._method

    @property
    def cov(self) -> DataFrame:
        """Estimated covariance of parameters"""
        return DataFrame(self._cov, index=self._param_names, columns=self._param_names)

    @property
    def cov_estimator(self) -> str:
        """Type of covariance estimator used to compute covariance"""
        return self._cov_type

    @property
    def cov_config(self) -> dict[str, bool]:
        """Configuration of covariance estimator used to compute covariance"""
        return self._cov_config

    @property
    def iterations(self) -> int:
        """Number of iterations of the GLS executed"""
        return self._iter

    @property
    def debiased(self) -> bool:
        """Flag indicating whether covariance uses a small-sample adjustment"""
        return self._debiased

    @property
    def params(self) -> Series:
        """Estimated parameters"""
        return Series(self._params.squeeze(), index=self._param_names, name="params")

    @property
    def std_errors(self) -> Series:
        """Estimated parameter standard errors"""
        std_errors = np.sqrt(np.diag(self.cov))
        return Series(std_errors, index=self._param_names, name="stderr")

    @property
    def tstats(self) -> Series:
        """Parameter t-statistics"""
        return Series(self.params / self.std_errors, name="tstat")

    @cached_property
    def pvalues(self) -> Series:
        """
        Parameter p-vals. Uses t(df_resid) if ``debiased`` is True, else normal
        """
        if self.debiased:
            pvals = 2 * (1 - stats.t.cdf(np.abs(self.tstats), self._df_resid))
        else:
            pvals = 2 * (1 - stats.norm.cdf(np.abs(self.tstats)))

        return Series(pvals, index=self._param_names, name="pvalue")

    @property
    def rsquared(self) -> float:
        r"""
        Coefficient of determination (R2)

        Returns
        -------
        float
            The coefficient of determinations.

        Notes
        -----
        The overall R2 is similar to Judge's system R2 since no weighting is
        used. These two only differ if one or more equations do not include
        constants. It is defined as

        .. math::

           1 - \frac{\sum_i \sum_j \hat{\epsilon}_{ij}^2}
               {\sum_i \sum_j \hat{\eta}_{ij}^2}

        where :math:`\eta` is the residual from a regression on only a
        constant. Note that if a constant is not present in an equation
        then the term in the denominator is **not** demeaned so that
        :math:`\hat{\eta}_{ij}=y_{ij}`.
        """
        return self._r2

    @property
    def total_ss(self) -> float:
        """Total sum of squares"""
        return self._tss

    @property
    def model_ss(self) -> float:
        """Residual sum of squares"""
        return self._tss - self._rss

    @property
    def resid_ss(self) -> float:
        """Residual sum of squares"""
        return self._rss

    @property
    def nobs(self) -> int:
        """Number of observations"""
        return self._nobs

    @property
    def df_resid(self) -> int:
        """Residual degree of freedom"""
        return self._df_resid

    @property
    def df_model(self) -> int:
        """Model degree of freedom"""
        return self._df_model

    def conf_int(self, level: float = 0.95) -> DataFrame:
        """
        Confidence interval construction

        Parameters
        ----------
        level : float
            Confidence level for interval

        Returns
        -------
        DataFrame
            Confidence interval of the form [lower, upper] for each parameters

        Notes
        -----
        Uses a t(df_resid) if ``debiased`` is True, else normal.
        """
        ci_quantiles = [(1 - level) / 2, 1 - (1 - level) / 2]
        if self._debiased:
            q = stats.t.ppf(ci_quantiles, self.df_resid)
        else:
            q = stats.norm.ppf(ci_quantiles)
        q = q[None, :]
        params = np.asarray(self.params)[:, None]
        ci = params + np.asarray(self.std_errors)[:, None] * q
        return DataFrame(ci, index=self._param_names, columns=["lower", "upper"])



[docs]
class SystemResults(_CommonResults):
    """
    Results from Seemingly Unrelated Regression Estimators

    Parameters
    ----------
    results : AttrDict
        Dictionary of model estimation results
    """

    def __init__(self, results: AttrDict) -> None:
        super().__init__(results)
        self._individual = AttrDict()
        for key in results.individual:
            self._individual[key] = SystemEquationResult(results.individual[key])
        self._system_r2 = results.system_r2
        self._sigma = results.sigma
        self._model = results.model
        self._constraints = results.constraints
        self._num_constraints = "None"
        if results.constraints is not None:
            self._num_constraints = str(results.constraints.r.shape[0])
        self._weight_estimtor = results.get("weight_estimator", None)

    @property
    def model(self) -> linearmodels.system.model.IV3SLS:
        """Model used in estimation"""
        return self._model

    @property
    def equations(self) -> AttrDict:
        """Individual equation results"""
        return self._individual

    @property
    def equation_labels(self) -> list[str]:
        """Individual equation labels"""
        return list(self._individual.keys())

    @property
    def resids(self) -> DataFrame:
        """Estimated residuals"""
        return DataFrame(self._resid, index=self._index, columns=self.equation_labels)

    @property
    def fitted_values(self) -> DataFrame:
        """Fitted values"""
        return DataFrame(self._fitted, index=self._index, columns=self.equation_labels)

    def _out_of_sample(
        self,
        equations: dict[str, dict[str, linearmodels.typing.data.ArrayLike]] | None,
        data: pandas.DataFrame | None,
        missing: bool,
        dataframe: bool,
    ) -> dict[str, Series] | DataFrame:
        if equations is not None and data is not None:
            raise ValueError(
                "Predictions can only be constructed using one "
                "of eqns or data, but not both."
            )
        pred: DataFrame = self.model.predict(
            self.params, equations=equations, data=data, eval_env=3
        )
        if dataframe:
            if missing:
                pred = pred.dropna(how="all", axis=1)
            return pred

        pred_dict = {str(col): pred[col] for col in pred}
        if missing:
            for col, val in pred_dict.items():
                pred_dict[col] = val.dropna()

        return pred_dict


[docs]
    def predict(
        self,
        equations: (
            dict[str, dict[str, linearmodels.typing.data.ArrayLike]] | None
        ) = None,
        *,
        data: pandas.DataFrame | None = None,
        fitted: bool = True,
        idiosyncratic: bool = False,
        missing: bool = False,
        dataframe: bool = False,
    ) -> pandas.DataFrame | dict:
        """
        In- and out-of-sample predictions

        Parameters
        ----------
        equations : dict
            Dictionary-like structure containing exogenous and endogenous
            variables.  Each key is an equations label and must
            match the labels used to fit the model. Each value must be either a tuple
            of the form (exog, endog) or a dictionary with keys "exog" and "endog".
            If predictions are not required for one of more of the model equations,
            these keys can be omitted.
        data : DataFrame
            DataFrame to use for out-of-sample predictions when model was
            constructed using a formula.
        fitted : bool
            Flag indicating whether to include the fitted values
        idiosyncratic : bool
            Flag indicating whether to include the estimated idiosyncratic shock
        missing : bool
            Flag indicating to adjust for dropped observations.  if True, the
            values returns will have the same size as the original input data
            before filtering missing values
        dataframe : bool
            Flag indicating to return output as a dataframe. If False, a
            dictionary is returned using the equation labels as keys.

        Returns
        -------
        predictions : {DataFrame, dict}
            DataFrame or dictionary containing selected outputs

        Notes
        -----
        If `equations` and `data` are both `None`, in-sample predictions
        (fitted values) will be returned.

        If `data` is not none, then `equations` must be none.
        Predictions from models constructed using formulas can
        be computed using either `equations`, which will treat these are
        arrays of values corresponding to the formula-process data, or using
        `data` which will be processed using the formula used to construct the
        values corresponding to the original model specification.

        When using `exog` and `endog`, the regressor array for a particular
        equation is assembled as
        `[equations[eqn]["exog"], equations[eqn]["endog"]]` where `eqn` is
        an equation label. These must correspond to the columns in the
        estimated model.
        """
        if equations is not None or data is not None:
            return self._out_of_sample(equations, data, missing, dataframe)
        if not (fitted or idiosyncratic):
            raise ValueError("At least one output must be selected")
        out: dict[str, DataFrame] | DataFrame
        if dataframe:
            if fitted and not idiosyncratic:
                out = self.fitted_values
            elif idiosyncratic and not fitted:
                out = self.resids
            else:
                out = {
                    "fitted_values": self.fitted_values,
                    "idiosyncratic": self.resids,
                }
        else:
            out = {}
            for key in self.equation_labels:
                vals = []
                if fitted:
                    vals.append(self.fitted_values[[key]])
                if idiosyncratic:
                    vals.append(self.resids[[key]])
                out[key] = concat(vals, axis=1)

        if missing:
            if isinstance(out, DataFrame):
                out = out.reindex(self._original_index)
            else:
                for key in out:
                    out[key] = out[key].reindex(self._original_index)

        return out


    @property
    def wresids(self) -> DataFrame:
        """Weighted estimated residuals"""
        return DataFrame(self._wresid, index=self._index, columns=self.equation_labels)

    @property
    def sigma(self) -> DataFrame:
        """Estimated residual covariance"""
        return self._sigma

    @property
    def system_rsquared(self) -> Series:
        r"""
        Alternative measure of system fit

        Returns
        -------
        Series
            The measures of overall system fit.

        Notes
        -----
        McElroy's R2 is defined as

        .. math::

           1 - \frac{SSR_{\Omega}}{TSS_{\Omega}}

        where

        .. math::

           SSR_{\Omega} = \hat{\epsilon}^\prime\hat{\Omega}^{-1}\hat{\epsilon}

        and

        .. math::

           TSS_{\Omega} = \hat{\eta}^\prime\hat{\Omega}^{-1}\hat{\eta}

        where :math:`\eta` is the residual from a regression on only a constant.

        Judge's system R2 is defined as

        .. math::

           1 - \frac{\sum_i \sum_j \hat{\epsilon}_ij^2}{\sum_i \sum_j \hat{\eta}_ij^2}

        where :math:`\eta` is the residual from a regression on only a constant.

        Berndt's system R2 is defined as

        .. math::

           1 - \frac{|\hat{\Sigma}_\epsilon|}{|\hat{\Sigma}_\eta|}

        where :math:`\hat{\Sigma}_\epsilon` and :math:`\hat{\Sigma}_\eta` are the
        estimated covariances :math:`\epsilon` and :math:`\eta`, respectively.

        Dhrymes's system R2 is defined as a weighted average of the R2 of each
        equation

        .. math::

            \sum__i w_i R^2_i

        where the weight is

        .. math::

           w_i = \frac{\hat{\Sigma}_{\eta}^{[ii]}}{\tr{\hat{\Sigma}_{\eta}}}

        the ratio of the variance the dependent in an equation to the total
        variance of all dependent variables.
        """
        return self._system_r2

    @property
    def summary(self) -> Summary:
        """
        Model estimation summary.

        Returns
        -------
        Summary
            Summary table of model estimation results

        Notes
        -----
        Supports export to csv, html and latex  using the methods ``summary.as_csv()``,
        ``summary.as_html()`` and ``summary.as_latex()``.
        """

        title = "System " + self._method + " Estimation Summary"

        top_left = [
            ("Estimator:", self._method),
            ("No. Equations.:", str(len(self.equation_labels))),
            ("No. Observations:", str(self.resids.shape[0])),
            ("Date:", self._datetime.strftime("%a, %b %d %Y")),
            ("Time:", self._datetime.strftime("%H:%M:%S")),
            ("", ""),
            ("", ""),
        ]

        top_right = [
            ("Overall R-squared:", _str(self.rsquared)),
            ("McElroy's R-squared:", _str(self.system_rsquared.mcelroy)),
            ("Judge's (OLS) R-squared:", _str(self.system_rsquared.judge)),
            ("Berndt's R-squared:", _str(self.system_rsquared.berndt)),
            ("Dhrymes's R-squared:", _str(self.system_rsquared.dhrymes)),
            ("Cov. Estimator:", self._cov_type),
            ("Num. Constraints: ", self._num_constraints),
        ]

        stubs = []
        vals = []
        for stub, val in top_left:
            stubs.append(stub)
            vals.append([val])
        table = SimpleTable(vals, txt_fmt=fmt_2cols, title=title, stubs=stubs)

        # create summary table instance
        smry = Summary()
        # Top Table
        # Parameter table
        fmt = fmt_2cols
        fmt["data_fmts"][1] = "%10s"

        top_right = [("%-21s" % ("  " + k), v) for k, v in top_right]
        stubs = []
        vals = []
        for stub, val in top_right:
            stubs.append(stub)
            vals.append([val])
        table.extend_right(SimpleTable(vals, stubs=stubs))
        smry.tables.append(table)

        for i, eqlabel in enumerate(self.equation_labels):
            last_row = i == (len(self.equation_labels) - 1)
            results = self.equations[eqlabel]
            dep_name = results.dependent
            title = f"Equation: {eqlabel}, Dependent Variable: {dep_name}"
            pad_bottom = results.instruments is not None and not last_row
            smry.tables.append(param_table(results, title, pad_bottom=pad_bottom))
            if results.instruments:
                formatted = format_wide(results.instruments, 80)
                if not last_row:
                    formatted.append([" "])
                smry.tables.append(SimpleTable(formatted, headers=["Instruments"]))
        extra_text = ["Covariance Estimator:"]
        for line in str(self._cov_estimator).split("\n"):
            extra_text.append(line)
        if self._weight_estimtor:
            extra_text.append("Weight Estimator:")
            for line in str(self._weight_estimtor).split("\n"):
                extra_text.append(line)
        smry.add_extra_txt(extra_text)

        return smry


[docs]
    def breusch_pagan(self) -> WaldTestStatistic | InvalidTestStatistic:
        r"""
        Breusch-Pagan LM test for no cross-correlation

        Returns
        -------
        WaldTestStatistic
            Test statistic for null all correlations are zero.

        Notes
        -----
        The null hypothesis is that the shock correlations are all 0, and
        so there are no gains to using GLS estimation in the system estimator.
        When the null is rejected, there should be efficiency gains to using
        GLS as long the regressors are not common to all models.

        The Breusch-Pagan test statistic is defined as

        .. math::

           LM = n \sum_{i=1}^k \sum_{j=i+1}^k \hat{\rho}_{ij}^2

        where :math:`\hat{\rho}_{ij}` is the sample residual correlation
        between series i and j. n is the sample size. It has an asymptotic
        :math:`\chi^2_{k(k-1)/2}` distribution. See [1]_ for details.

        References
        ----------
        .. [1] Greene, William H. Econometric analysis. Pearson Education, 2003.
        """
        name = "Breusch-Pagan LM Test"
        resids = self.resids
        if resids.shape[1] == 1:
            return InvalidTestStatistic(
                "Cannot test correlation when the system contains a single "
                "dependent variable.",
                name=name,
            )
        r = np.corrcoef(resids.T)
        k = r.shape[0]
        distinct_corr = np.tril(r, -1)
        stat = self.resids.shape[0] * (distinct_corr**2).sum()
        return WaldTestStatistic(
            stat,
            "Residuals are uncorrelated",
            k * (k - 1) // 2,
            name=name,
        )



[docs]
    def likelihood_ratio(self) -> WaldTestStatistic | InvalidTestStatistic:
        r"""
        Likelihood ratio test of no cross-correlation

        Returns
        -------
        WaldTestStatistic
            Test statistic that the covariance is diagonal.

        Notes
        -----
        The null hypothesis is that the shock covariance matrix is diagonal,
        and so all correlations are 0. In this case, there are no gains to
        using GLS estimation in the system estimator.

        When the null is rejected, there should be efficiency gains to using
        GLS as long the regressors are not common to all models.

        The LR test statistic is defined as

        .. math::

           LR=n\left[\sum_{i=1}^{k}\log\hat{\sigma}_i^2
              -\log\left|\hat{\Sigma}\right|\right]

        where :math:`\hat{\sigma}_i^2` is the sample residual variance for
        series i and :math:`\hat{\Sigma}` is the residual covariance.
        n is the sample size. It has an asymptotic :math:`\chi^2_{k(k-1)/2}`
        distribution. The asymptotic distribution of the likelihood ratio
        test requires homoskedasticity. See [1]_ for details.

        References
        ----------
        .. [1] Greene, William H. Econometric analysis. Pearson Education, 2003.
        """
        name = "Likelihood Ratio Test for Diagonal Covariance"
        resids = np.asarray(self.resids)
        if resids.shape[1] == 1:
            return InvalidTestStatistic(
                "Cannot test covariance structure when the system contains a single "
                "dependent variable.",
                name=name,
            )
        sigma = resids.T @ resids / resids.shape[0]
        nobs, k = resids.shape
        _, logdet = np.linalg.slogdet(sigma)
        stat = nobs * (np.log(np.diag(sigma)).sum() - logdet)
        return WaldTestStatistic(
            stat,
            "Covariance is diagonal",
            k * (k - 1) // 2,
            name=name,
        )




class SystemEquationResult(_CommonResults):
    """
    Results from a single equation of a Seemingly Unrelated Regression

    Parameters
    ----------
    results : AttrDict
        Dictionary of model estimation results
    """

    def __init__(self, results: AttrDict) -> None:
        super().__init__(results)
        self._eq_label = results.eq_label
        self._dependent = results.dependent
        self._f_statistic = results.f_stat
        self._r2a = results.r2a
        self._instruments = results.instruments
        self._endog = results.endog
        self._weight_estimator = results.get("weight_estimator", None)

    @property
    def equation_label(self) -> str:
        """Equation label"""
        return self._eq_label

    @property
    def dependent(self) -> dict[str, DataFrame]:
        """Name of dependent variable"""
        return self._dependent

    @property
    def instruments(self) -> dict[str, DataFrame | None]:
        """Instruments used in estimation.  None if all variables assumed exogenous."""
        return self._instruments

    @property
    def summary(self) -> Summary:
        """
        Model estimation summary.

        Returns
        -------
        Summary
            Summary table of model estimation results

        Notes
        -----
        Supports export to csv, html and latex  using the methods ``summary.as_csv()``,
        ``summary.as_html()`` and ``summary.as_latex()``.
        """

        title = self._method + " Estimation Summary"

        top_left = [
            ("Eq. Label:", self.equation_label),
            ("Dep. Variable:", self.dependent),
            ("Estimator:", self._method),
            ("No. Observations:", self.nobs),
            ("Date:", self._datetime.strftime("%a, %b %d %Y")),
            ("Time:", self._datetime.strftime("%H:%M:%S")),
            ("", ""),
        ]

        top_right = [
            ("R-squared:", _str(self.rsquared)),
            ("Adj. R-squared:", _str(self.rsquared_adj)),
            ("Cov. Estimator:", self._cov_type),
            ("F-statistic:", _str(self.f_statistic.stat)),
            ("P-value (F-stat)", pval_format(self.f_statistic.pval)),
            ("Distribution:", str(self.f_statistic.dist_name)),
            ("", ""),
        ]

        stubs = []
        vals = []
        for stub, val in top_left:
            stubs.append(stub)
            vals.append([val])
        table = SimpleTable(vals, txt_fmt=fmt_2cols, title=title, stubs=stubs)

        # create summary table instance
        smry = Summary()
        # Top Table
        # Parameter table
        fmt = fmt_2cols
        fmt["data_fmts"][1] = "%10s"

        top_right = [("%-21s" % ("  " + k), v) for k, v in top_right]
        stubs = []
        vals = []
        for stub, val in top_right:
            stubs.append(stub)
            vals.append([val])
        table.extend_right(SimpleTable(vals, stubs=stubs))
        smry.tables.append(table)
        smry.tables.append(param_table(self, "Parameter Estimates", pad_bottom=True))

        extra_text = []
        instruments = self._instruments
        if instruments:
            endog = self._endog
            extra_text = [
                "Endogenous: " + ", ".join(endog),
                "Instruments: " + ", ".join(instruments),
            ]

        extra_text.append("Covariance Estimator:")
        for line in str(self._cov_estimator).split("\n"):
            extra_text.append(line)
        if self._weight_estimator:
            extra_text.append("Weight Estimator:")
            for line in str(self._weight_estimator).split("\n"):
                extra_text.append(line)
        smry.add_extra_txt(extra_text)

        return smry

    @property
    def f_statistic(self) -> WaldTestStatistic:
        """
        Model F-statistic

        Returns
        -------
        WaldTestStatistic
            Test statistic for null all coefficients excluding constant terms
            are zero.

        Notes
        -----
        Despite name, always implemented using a quadratic-form test based on
        estimated parameter covariance. Default is to use a chi2 distribution
        to compute p-values. If ``debiased`` is True, divides statistic by
        number of parameters tested and uses an F-distribution.

        This version of the F-statistic directly uses the model covariance
        estimator and so is robust against the same specification issues.
        """
        return self._f_statistic

    @property
    def resids(self) -> Series:
        """Estimated residuals"""
        return Series(self._resid.squeeze(), index=self._index, name="resid")

    @property
    def wresids(self) -> Series:
        """Weighted estimated residuals"""
        return Series(self._wresid.squeeze(), index=self._index, name="wresid")

    @property
    def fitted_values(self) -> Series:
        """Fitted values"""
        return Series(self._fitted.squeeze(), index=self._index, name="fitted_values")

    @property
    def rsquared_adj(self) -> float:
        """Sample-size adjusted coefficient of determination (R**2)"""
        return self._r2a



[docs]
class GMMSystemResults(SystemResults):
    """
    Results from GMM System Estimators

    Parameters
    ----------
    results : AttrDict
        Dictionary of model estimation results
    """

    def __init__(self, results: AttrDict) -> None:
        super().__init__(results)
        self._wmat = results.wmat
        self._weight_type = results.weight_type
        self._weight_config = results.weight_config
        self._j_stat = results.j_stat

    @property
    def w(self) -> linearmodels.typing.data.Float64Array:
        """GMM weight matrix used in estimation"""
        return self._wmat

    @property
    def weight_type(self) -> str:
        """Type of weighting used in GMM estimation"""
        return self._weight_type

    @property
    def weight_config(self) -> dict[str, Any]:
        """Weight configuration options used in GMM estimation"""
        return self._weight_config

    @property
    def j_stat(self) -> WaldTestStatistic:
        r"""
        J-test of overidentifying restrictions

        Returns
        -------
        WaldTestStatistic
            J statistic  test of overidentifying restrictions

        Notes
        -----
        The J statistic tests whether the moment conditions are sufficiently
        close to zero to indicate that the model is not overidentified. The
        statistic is defined as

        .. math ::

          n \bar{g}'W^{-1}\bar{g} \sim \chi^2_q

        where :math:`\bar{g}` is the average of the moment conditional and
        :math:`W` is a consistent estimator of the variance of
        :math:`\sqrt{n}\bar{g}`. The degree of freedom is
        :math:`q = n_{instr} - n_{endog}`.
        """
        return self._j_stat