Source code for netneurotools.stats.regression

"""Functions for calculating regression."""

from itertools import combinations

import numpy as np
from tqdm import tqdm
import scipy.stats as sstats
from joblib import Parallel, delayed
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_array


def _add_constant(data):
    """
    Add a constant (i.e., intercept) term to `data`.

    Parameters
    ----------
    data : (N, M) array_like
        Samples by features data array

    Returns
    -------
    data : (N, F) np.ndarray
        Where `F` is `M + 1`

    Examples
    --------
    >>> from netneurotools import stats

    >>> A = np.zeros((5, 5))
    >>> Ac = stats._add_constant(A)
    >>> Ac
    array([[0., 0., 0., 0., 0., 1.],
           [0., 0., 0., 0., 0., 1.],
           [0., 0., 0., 0., 0., 1.],
           [0., 0., 0., 0., 0., 1.],
           [0., 0., 0., 0., 0., 1.]])
    """
    data = check_array(data, ensure_2d=False)
    return np.column_stack([data, np.ones(len(data))])


[docs] def residualize(X, Y, Xc=None, Yc=None, normalize=True, add_intercept=True): """ Return residuals of regression equation from `Y ~ X`. Parameters ---------- X : (N[, R]) array_like Coefficient matrix of `R` variables for `N` subjects Y : (N[, F]) array_like Dependent variable matrix of `F` variables for `N` subjects Xc : (M[, R]) array_like, optional Coefficient matrix of `R` variables for `M` subjects. If not specified then `X` is used to estimate betas. Default: None Yc : (M[, F]) array_like, optional Dependent variable matrix of `F` variables for `M` subjects. If not specified then `Y` is used to estimate betas. Default: None normalize : bool, optional Whether to normalize (i.e., z-score) residuals. Will use residuals from `Yc ~ Xc` for generating mean and variance. Default: True add_intercept : bool, optional Whether to add intercept to `X` (and `Xc`, if provided). The intercept will not be removed, just used in beta estimation. Default: True Returns ------- Yr : (N, F) numpy.ndarray Residuals of `Y ~ X` Notes ----- If both `Xc` and `Yc` are provided, these are used to calculate betas which are then applied to `X` and `Y`. """ if ((Yc is None and Xc is not None) or (Yc is not None and Xc is None)): raise ValueError('If processing against a comparative group, you must ' 'provide both `Xc` and `Yc`.') X, Y = np.asarray(X), np.asarray(Y) if Yc is None: Xc, Yc = X.copy(), Y.copy() else: Xc, Yc = np.asarray(Xc), np.asarray(Yc) # add intercept to regressors if requested and calculate fit if add_intercept: X, Xc = _add_constant(X), _add_constant(Xc) betas, *_ = np.linalg.lstsq(Xc, Yc, rcond=None) # remove intercept from regressors and betas for calculation of residuals if add_intercept: betas = betas[:-1] X, Xc = X[:, :-1], Xc[:, :-1] # calculate residuals Yr = Y - (X @ betas) Ycr = Yc - (Xc @ betas) if normalize: Yr = sstats.zmap(Yr, compare=Ycr) return Yr
[docs] def get_dominance_stats(X, y, use_adjusted_r_sq=True, verbose=False, n_jobs=1): """ Return the dominance analysis statistics for multilinear regression. This is a rewritten & simplified version of [DA1]_. It is briefly tested against the original package, but still in early stages. Please feel free to report any bugs. Warning: Still work-in-progress. Parameters might change! Parameters ---------- X : (N, M) array_like Input data y : (N,) array_like Target values use_adjusted_r_sq : bool, optional Whether to use adjusted r squares. Default: True verbose : bool, optional Whether to print debug messages. Default: False n_jobs : int, optional The number of jobs to run in parallel. Default: 1 Returns ------- model_metrics : dict The dominance metrics, currently containing `individual_dominance`, `partial_dominance`, `total_dominance`, and `full_r_sq`. model_r_sq : dict Contains all model r squares Notes ----- Example usage .. code:: python from netneurotools.stats import get_dominance_stats from sklearn.datasets import load_boston X, y = load_boston(return_X_y=True) model_metrics, model_r_sq = get_dominance_stats(X, y) To compare with [DA1]_, use `use_adjusted_r_sq=False` .. code:: python from dominance_analysis import Dominance_Datasets from dominance_analysis import Dominance boston_dataset=Dominance_Datasets.get_boston() dominance_regression=Dominance(data=boston_dataset, target='House_Price',objective=1) incr_variable_rsquare=dominance_regression.incremental_rsquare() dominance_regression.dominance_stats() References ---------- .. [DA1] https://github.com/dominance-analysis/dominance-analysis """ # this helps to remove one element from a tuple def remove_ret(tpl, elem): lst = list(tpl) lst.remove(elem) return tuple(lst) # sklearn linear regression wrapper def get_reg_r_sq(X, y, use_adjusted_r_sq=True): lin_reg = LinearRegression() lin_reg.fit(X, y) yhat = lin_reg.predict(X) SS_Residual = sum((y - yhat) ** 2) SS_Total = sum((y - np.mean(y)) ** 2) r_squared = 1 - (float(SS_Residual)) / SS_Total adjusted_r_squared = 1 - (1 - r_squared) * \ (len(y) - 1) / (len(y) - X.shape[1] - 1) if use_adjusted_r_sq: return adjusted_r_squared else: return r_squared # helper function to compute r_sq for a given idx_tuple def compute_r_sq(idx_tuple): return idx_tuple, get_reg_r_sq(X[:, idx_tuple], y, use_adjusted_r_sq=use_adjusted_r_sq) # generate all predictor combinations in list (num of predictors) of lists n_predictor = X.shape[-1] # n_comb_len_group = n_predictor - 1 predictor_combs = [list(combinations(range(n_predictor), i)) for i in range(1, n_predictor + 1)] if verbose: print(f"[Dominance analysis] Generated \ {len([v for i in predictor_combs for v in i])} combinations") model_r_sq = dict() results = Parallel(n_jobs=n_jobs)( delayed(compute_r_sq)(idx_tuple) for len_group in tqdm(predictor_combs, desc='num-of-predictor loop', disable=not verbose) for idx_tuple in tqdm(len_group, desc='insider loop', disable=not verbose)) # extract r_sq from results for idx_tuple, r_sq in results: model_r_sq[idx_tuple] = r_sq if verbose: print(f"[Dominance analysis] Acquired {len(model_r_sq)} r^2's") # getting all model metrics model_metrics = dict([]) # individual dominance individual_dominance = [] for i_pred in range(n_predictor): individual_dominance.append(model_r_sq[(i_pred,)]) individual_dominance = np.array(individual_dominance).reshape(1, -1) model_metrics["individual_dominance"] = individual_dominance # partial dominance partial_dominance = [[] for _ in range(n_predictor - 1)] for i_len in range(n_predictor - 1): i_len_combs = list(combinations(range(n_predictor), i_len + 2)) for j_node in range(n_predictor): j_node_sel = [v for v in i_len_combs if j_node in v] reduced_list = [remove_ret(comb, j_node) for comb in j_node_sel] diff_values = [ model_r_sq[j_node_sel[i]] - model_r_sq[reduced_list[i]] for i in range(len(reduced_list))] partial_dominance[i_len].append(np.mean(diff_values)) # save partial dominance partial_dominance = np.array(partial_dominance) model_metrics["partial_dominance"] = partial_dominance # get total dominance total_dominance = np.mean( np.r_[individual_dominance, partial_dominance], axis=0) # test and save total dominance assert np.allclose(total_dominance.sum(), model_r_sq[tuple(range(n_predictor))]), \ "Sum of total dominance is not equal to full r square!" model_metrics["total_dominance"] = total_dominance # save full r^2 model_metrics["full_r_sq"] = model_r_sq[tuple(range(n_predictor))] return model_metrics, model_r_sq