Source code for linearmodels.iv.data

"""
A data abstraction that allow multiple input data formats
"""

from __future__ import annotations

import copy
from typing import Any, Union

import numpy as np
import pandas
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_string_dtype

import linearmodels.typing.data

dim_err = "{0} has too many dims.  Maximum is 2, actual is {1}"
type_err = "Only ndarrays, DataArrays and Series and DataFrames are supported"


def convert_columns(
    s: pd.Series, drop_first: bool
) -> linearmodels.typing.data.AnyPandas:
    if isinstance(s.dtype, pd.CategoricalDtype):
        out = pd.get_dummies(s, drop_first=drop_first)
        # TODO: Remove once pandas typing fixed
        out.columns = pd.Index([str(s.name) + "." + str(c) for c in out])
        return out
    return s


def expand_categoricals(x: pandas.DataFrame, drop_first: bool) -> pd.DataFrame:
    if x.shape[1] == 0:
        return x
    return pd.concat(
        [convert_columns(x[c], drop_first) for c in x.columns], axis=1, sort=False
    )



[docs]
class IVData:
    """
    Type abstraction for use in univariate models.

    Parameters
    ----------
    x : {ndarray, Series, DataFrame, DataArray}
        Data to wrap and standardize.  If None, then nobs must be provided to
        produce an IVData instance with shape (nobs, 0).
    var_name : str
        Variable name to use when naming variables in NumPy arrays or
        xarray DataArrays
    nobs : int
        Number of observation, used when `x` is None. If `x` is array_like,
        then nobs is used to check the number of observations in `x`.
    convert_dummies : bool
        Flat indicating whether pandas categoricals or string input data
        should be converted to dummy variables
    drop_first : bool
        Flag indicating to drop first dummy category
    """

    def __init__(
        self,
        x: IVDataLike | None,
        var_name: str = "x",
        nobs: int | None = None,
        convert_dummies: bool = True,
        drop_first: bool = True,
    ):
        if isinstance(x, IVData):
            self.__dict__.update(copy.deepcopy(x.__dict__))
            return
        if x is None and nobs is not None:
            x = np.empty((nobs, 0))
        elif x is None:
            raise ValueError("nobs required when x is None")

        self.original = x
        assert x is not None
        xndim = x.ndim
        if xndim > 2:
            raise ValueError(dim_err.format(var_name, xndim))

        if isinstance(x, np.ndarray):
            x = x.astype(dtype=np.float64)
            if xndim == 1:
                x = x.reshape((x.shape[0], -1))

            self._ndarray = x.astype(np.float64)
            index = list(range(x.shape[0]))
            if x.shape[1] == 1:
                cols = [var_name]
            else:
                cols = [var_name + f".{i}" for i in range(x.shape[1])]
            self._pandas = pd.DataFrame(x, index=index, columns=cols)
            self._row_labels = index
            self._col_labels = cols

        elif isinstance(x, (pd.Series, pd.DataFrame)):
            if isinstance(x, pd.Series):
                name = var_name if not x.name else x.name
                x = pd.DataFrame({name: x})
            copied = False
            columns = list(x.columns)
            if len(set(columns)) != len(columns):
                raise ValueError(
                    "DataFrame contains duplicate column names. "
                    "All column names must be distinct"
                )
            all_numeric = True
            for col in x:
                c = x[col]
                if (
                    is_string_dtype(c.dtype)
                    and c.map(lambda v: isinstance(v, str)).all()
                ):
                    c = c.astype("category")
                    if not copied:
                        x = x.copy()
                        copied = True
                    x[col] = c
                dt = c.dtype
                all_numeric = all_numeric and is_numeric_dtype(dt)
                if not (is_numeric_dtype(dt) or isinstance(dt, pd.CategoricalDtype)):
                    raise ValueError(
                        "Only numeric, string  or categorical " "data permitted"
                    )

            if convert_dummies:
                x = expand_categoricals(x, drop_first)

            self._pandas = x
            self._ndarray = np.asarray(self._pandas)
            if all_numeric or convert_dummies:
                self._ndarray = self._ndarray.astype(np.float64)
            self._row_labels = list(x.axes[0])
            self._col_labels = list(x.axes[1])

        else:
            try:
                import xarray as xr
            except ImportError:
                raise TypeError(type_err)
            if isinstance(x, xr.DataArray):
                if x.ndim == 1:
                    x = xr.concat([x], dim=var_name)
                    assert isinstance(x, xr.DataArray)
                    x = x.transpose()
                assert isinstance(x, xr.DataArray)
                index = list(x.coords[x.dims[0]].values)
                xr_col_values = x.coords[x.dims[1]].values
                xr_cols = list(xr_col_values)
                if is_numeric_dtype(xr_col_values.dtype):
                    xr_cols = [var_name + f".{i}" for i in range(x.shape[1])]
                self._ndarray = x.values.astype(np.float64)
                self._pandas = pd.DataFrame(self._ndarray, columns=xr_cols, index=index)
                self._row_labels = index
                self._col_labels = xr_cols
            else:
                raise TypeError(type_err)

        if nobs is not None:
            if self._ndarray.shape[0] != nobs:
                msg = "Array required to have {nobs} obs, has " "{act}".format(
                    nobs=nobs, act=self._ndarray.shape[0]
                )
                raise ValueError(msg)

    @property
    def pandas(self) -> pd.DataFrame:
        """DataFrame view of data"""
        return self._pandas

    @property
    def ndarray(self) -> linearmodels.typing.data.NumericArray:
        """ndarray view of data, always 2d"""
        return self._ndarray

    @property
    def shape(self) -> tuple[int, int]:
        """Tuple containing shape"""
        return self._ndarray.shape[0], self._ndarray.shape[1]

    @property
    def ndim(self) -> int:
        """Number of dimensions"""
        return self._ndarray.ndim

    @property
    def cols(self) -> list[Any]:
        """Column labels"""
        return self._col_labels

    @property
    def rows(self) -> list[Any]:
        """Row labels (index)"""
        return self._row_labels

    @property
    def labels(self) -> dict[int, Any]:
        """Dictionary containing row and column labels keyed by axis"""
        return {0: self._row_labels, 1: self._col_labels}

    @property
    def isnull(self) -> pd.Series:
        return self._pandas.isnull().any(axis=1)


[docs]
    def drop(self, locs: linearmodels.typing.data.ArrayLike) -> None:
        locs = np.asarray(locs)
        self._pandas = self.pandas.loc[~locs]
        self._ndarray = self._ndarray[~locs]
        self._row_labels = list(pd.Series(self._row_labels).loc[~locs])




IVDataLike = Union[IVData, linearmodels.typing.data.ArrayLike]