Source code for netneurotools.stats

# -*- coding: utf-8 -*-
"""Functions for performing statistical preprocessing and analyses."""

import warnings
from itertools import combinations

import numpy as np
from tqdm import tqdm
from scipy import optimize, spatial, special, stats as sstats
try:  # scipy >= 1.8.0
    from scipy.stats._stats_py import _chk2_asarray
except ImportError:  # scipy < 1.8.0
    from scipy.stats.stats import _chk2_asarray
from sklearn.utils.validation import check_random_state
from sklearn.linear_model import LinearRegression
from joblib import Parallel, delayed


from . import utils
from .metrics import _graph_laplacian

try:
    from numba import njit
    use_numba = True
except ImportError:
    use_numba = False


def residualize(X, Y, Xc=None, Yc=None, normalize=True, add_intercept=True):
    """
    Return residuals of regression equation from `Y ~ X`.

    Parameters
    ----------
    X : (N[, R]) array_like
        Coefficient matrix of `R` variables for `N` subjects
    Y : (N[, F]) array_like
        Dependent variable matrix of `F` variables for `N` subjects
    Xc : (M[, R]) array_like, optional
        Coefficient matrix of `R` variables for `M` subjects. If not
        specified then `X` is used to estimate betas. Default: None
    Yc : (M[, F]) array_like, optional
        Dependent variable matrix of `F` variables for `M` subjects. If not
        specified then `Y` is used to estimate betas. Default: None
    normalize : bool, optional
        Whether to normalize (i.e., z-score) residuals. Will use residuals
        from `Yc ~ Xc` for generating mean and variance. Default: True
    add_intercept : bool, optional
        Whether to add intercept to `X` (and `Xc`, if provided). The
        intercept will not be removed, just used in beta estimation.
        Default: True

    Returns
    -------
    Yr : (N, F) numpy.ndarray
        Residuals of `Y ~ X`

    Notes
    -----
    If both `Xc` and `Yc` are provided, these are used to calculate betas
    which are then applied to `X` and `Y`.
    """
    if ((Yc is None and Xc is not None) or (Yc is not None and Xc is None)):
        raise ValueError('If processing against a comparative group, you '
                         'must provide both `Xc` and `Yc`.')

    X, Y = np.asarray(X), np.asarray(Y)

    if Yc is None:
        Xc, Yc = X.copy(), Y.copy()
    else:
        Xc, Yc = np.asarray(Xc), np.asarray(Yc)

    # add intercept to regressors if requested and calculate fit
    if add_intercept:
        X, Xc = utils.add_constant(X), utils.add_constant(Xc)
    betas, *rest = np.linalg.lstsq(Xc, Yc, rcond=None)

    # remove intercept from regressors and betas for calculation of residuals
    if add_intercept:
        betas = betas[:-1]
        X, Xc = X[:, :-1], Xc[:, :-1]

    # calculate residuals
    Yr = Y - (X @ betas)
    Ycr = Yc - (Xc @ betas)

    if normalize:
        Yr = sstats.zmap(Yr, compare=Ycr)

    return Yr
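
# Illustrative sketch (not part of the original module): regress a nuisance
# variable out of two dependent measures. `_example_residualize` and all data
# below are hypothetical, for demonstration only.
def _example_residualize():
    rng = np.random.default_rng(1234)
    age = rng.normal(50, 10, size=(100, 1))    # N subjects x 1 regressor
    thickness = 3.0 - 0.01 * age + rng.normal(0, 0.1, size=(100, 2))
    resid = residualize(age, thickness)        # z-scored residuals of Y ~ X
    return resid                               # shape: (100, 2)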
def get_mad_outliers(data, thresh=3.5):
    """
    Determine which samples in `data` are outliers.

    Uses the Median Absolute Deviation for determining whether datapoints
    are outliers

    Parameters
    ----------
    data : (N, M) array_like
        Data array where `N` is samples and `M` is features
    thresh : float, optional
        Modified z-score. Observations with a modified z-score (based on the
        median absolute deviation) greater than this value will be
        classified as outliers. Default: 3.5

    Returns
    -------
    outliers : (N,) numpy.ndarray
        Boolean array where True indicates an outlier

    Notes
    -----
    Taken directly from https://stackoverflow.com/a/22357811

    References
    ----------
    Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
    Handle Outliers", The ASQC Basic References in Quality Control:
    Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.

    Examples
    --------
    >>> import numpy as np
    >>> from netneurotools import stats

    Create array with three samples of four features each:

    >>> X = np.array([[0, 5, 10, 15], [1, 4, 11, 16], [100, 100, 100, 100]])
    >>> X
    array([[  0,   5,  10,  15],
           [  1,   4,  11,  16],
           [100, 100, 100, 100]])

    Determine which sample(s) is outlier:

    >>> outliers = stats.get_mad_outliers(X)
    >>> outliers
    array([False, False,  True])
    """
    data = np.asarray(data)

    if data.ndim == 1:
        data = np.vstack(data)
    if data.ndim > 2:
        data = data.reshape(len(data), -1)

    median = np.nanmedian(data, axis=0)
    diff = np.nansum((data - median)**2, axis=-1)
    diff = np.sqrt(diff)
    med_abs_deviation = np.median(diff)
    modified_z_score = 0.6745 * diff / med_abs_deviation

    return modified_z_score > thresh
def permtest_1samp(a, popmean, axis=0, n_perm=1000, seed=0):
    """
    Non-parametric equivalent of :py:func:`scipy.stats.ttest_1samp`.

    Generates two-tailed p-value for hypothesis of whether `a` differs from
    `popmean` using permutation tests

    Parameters
    ----------
    a : array_like
        Sample observations
    popmean : float or array_like
        Expected value in null hypothesis. If array_like then it must have
        the same shape as `a` excluding the `axis` dimension
    axis : int or None, optional
        Axis along which to compute test. If None, compute over the whole
        array of `a`. Default: 0
    n_perm : int, optional
        Number of permutations to assess. Unless `a` is very small along
        `axis` this will approximate a randomization test via Monte Carlo
        simulations. Default: 1000
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation. Set to None for "randomness".
        Default: 0

    Returns
    -------
    stat : float or numpy.ndarray
        Difference from `popmean`
    pvalue : float or numpy.ndarray
        Non-parametric p-value

    Notes
    -----
    Providing multiple values to `popmean` to run *independent* tests in
    parallel is not currently supported.

    The lowest p-value that can be returned by this function is equal to
    1 / (`n_perm` + 1).

    Examples
    --------
    >>> import numpy as np
    >>> from netneurotools import stats

    >>> np.random.seed(7654567)  # set random seed for reproducible results
    >>> rvs = np.random.normal(loc=5, scale=10, size=(50, 2))

    Test if mean of random sample is equal to true mean, and different mean.
    We reject the null hypothesis in the second case and don't reject it in
    the first case.

    >>> stats.permtest_1samp(rvs, 5.0)
    (array([-0.985602  , -0.05204969]), array([0.48551449, 0.95904096]))
    >>> stats.permtest_1samp(rvs, 0.0)
    (array([4.014398  , 4.94795031]), array([0.00699301, 0.000999  ]))

    Example using axis and non-scalar dimension for population mean

    >>> stats.permtest_1samp(rvs, [5.0, 0.0])
    (array([-0.985602 , 4.94795031]), array([0.48551449, 0.000999  ]))
    >>> stats.permtest_1samp(rvs.T, [5.0, 0.0], axis=1)
    (array([-0.985602 , 4.94795031]), array([0.51548452, 0.000999  ]))
    """
    a, popmean, axis = _chk2_asarray(a, popmean, axis)
    rs = check_random_state(seed)

    if a.size == 0:
        return np.nan, np.nan

    # ensure popmean will broadcast to `a` correctly
    if popmean.ndim != a.ndim:
        popmean = np.expand_dims(popmean, axis=axis)

    # center `a` around `popmean` and calculate original mean
    zeroed = a - popmean
    # divide by one forces coercion to float if ndim = 0
    true_mean = zeroed.mean(axis=axis) / 1
    abs_mean = np.abs(true_mean)

    # this for loop is not _the fastest_ but is memory efficient
    # the broadcasting alt. would mean storing zeroed.size * n_perm in memory
    permutations = np.ones(true_mean.shape)
    for _ in range(n_perm):
        flipped = zeroed * rs.choice([-1, 1], size=zeroed.shape)  # sign flip
        permutations += np.abs(flipped.mean(axis=axis)) >= abs_mean

    pvals = permutations / (n_perm + 1)  # + 1 in denom accounts for true_mean

    return true_mean, pvals
def permtest_rel(a, b, axis=0, n_perm=1000, seed=0):
    """
    Non-parametric equivalent of :py:func:`scipy.stats.ttest_rel`.

    Generates two-tailed p-value for hypothesis of whether related samples
    `a` and `b` differ using permutation tests

    Parameters
    ----------
    a, b : array_like
        Sample observations. These arrays must have the same shape.
    axis : int or None, optional
        Axis along which to compute test. If None, compute over whole arrays
        of `a` and `b`. Default: 0
    n_perm : int, optional
        Number of permutations to assess. Unless `a` and `b` are very small
        along `axis` this will approximate a randomization test via Monte
        Carlo simulations. Default: 1000
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation. Set to None for "randomness".
        Default: 0

    Returns
    -------
    stat : float or numpy.ndarray
        Average difference between `a` and `b`
    pvalue : float or numpy.ndarray
        Non-parametric p-value

    Notes
    -----
    The lowest p-value that can be returned by this function is equal to
    1 / (`n_perm` + 1).

    Examples
    --------
    >>> import numpy as np
    >>> from netneurotools import stats

    >>> np.random.seed(12345678)  # set random seed for reproducible results
    >>> rvs1 = np.random.normal(loc=5, scale=10, size=500)
    >>> rvs2 = (np.random.normal(loc=5, scale=10, size=500)
    ...         + np.random.normal(scale=0.2, size=500))
    >>> stats.permtest_rel(rvs1, rvs2)  # doctest: +SKIP
    (-0.16506275161572695, 0.8021978021978022)

    >>> rvs3 = (np.random.normal(loc=8, scale=10, size=500)
    ...         + np.random.normal(scale=0.2, size=500))
    >>> stats.permtest_rel(rvs1, rvs3)  # doctest: +SKIP
    (2.40533726097883, 0.000999000999000999)
    """
    a, b, axis = _chk2_asarray(a, b, axis)
    rs = check_random_state(seed)

    if a.shape[axis] != b.shape[axis]:
        raise ValueError('Provided arrays do not have same length along axis')

    if a.size == 0 or b.size == 0:
        return np.nan, np.nan

    # calculate original difference in means
    ab = np.stack([a, b], axis=0)
    if ab.ndim < 3:
        ab = np.expand_dims(ab, axis=-1)
    # divide by one forces coercion to float if ndim = 0
    true_diff = np.squeeze(np.diff(ab, axis=0)).mean(axis=axis) / 1
    abs_true = np.abs(true_diff)

    # idx array
    reidx = np.meshgrid(*[range(f) for f in ab.shape], indexing='ij')

    permutations = np.ones(true_diff.shape)
    for _ in range(n_perm):
        # use this to re-index (i.e., swap along) the first axis of `ab`
        swap = rs.random_sample(ab.shape[:-1]).argsort(axis=axis)
        reidx[0] = np.repeat(swap[..., np.newaxis], ab.shape[-1], axis=-1)
        # recompute difference between `a` and `b` (i.e., first axis of `ab`)
        pdiff = np.squeeze(np.diff(ab[tuple(reidx)], axis=0)).mean(axis=axis)
        permutations += np.abs(pdiff) >= abs_true

    pvals = permutations / (n_perm + 1)  # + 1 in denom accounts for true_diff

    return true_diff, pvals
def permtest_pearsonr(a, b, axis=0, n_perm=1000, resamples=None, seed=0):
    """
    Non-parametric equivalent of :py:func:`scipy.stats.pearsonr`.

    Generates two-tailed p-value for hypothesis of whether samples `a` and
    `b` are correlated using permutation tests

    Parameters
    ----------
    a,b : (N[, M]) array_like
        Sample observations. These arrays must have the same length and
        either an equivalent number of columns or be broadcastable
    axis : int or None, optional
        Axis along which to compute test. If None, compute over whole arrays
        of `a` and `b`. Default: 0
    n_perm : int, optional
        Number of permutations to assess. Unless `a` and `b` are very small
        along `axis` this will approximate a randomization test via Monte
        Carlo simulations. Default: 1000
    resamples : (N, P) array_like, optional
        Resampling array used to shuffle `a` when generating null
        distribution of correlations. This array must have the same length
        as `a` and `b` and should have at least the same number of columns
        as `n_perm` (if it has more, only the first `n_perm` columns will be
        used). When not specified a standard permutation is used to shuffle
        `a`. Default: None
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation. Set to None for "randomness".
        Default: 0

    Returns
    -------
    corr : float or numpy.ndarray
        Correlations
    pvalue : float or numpy.ndarray
        Non-parametric p-value

    Notes
    -----
    The lowest p-value that can be returned by this function is equal to
    1 / (`n_perm` + 1).

    Examples
    --------
    >>> import numpy as np
    >>> from netneurotools import datasets, stats

    >>> np.random.seed(12345678)  # set random seed for reproducible results
    >>> x, y = datasets.make_correlated_xy(corr=0.1, size=100)
    >>> stats.permtest_pearsonr(x, y)  # doctest: +SKIP
    (0.10032564626876286, 0.3046953046953047)

    >>> x, y = datasets.make_correlated_xy(corr=0.5, size=100)
    >>> stats.permtest_pearsonr(x, y)  # doctest: +SKIP
    (0.500040365781984, 0.000999000999000999)

    Also works with multiple columns by either broadcasting the smaller
    array to the larger:

    >>> z = x + np.random.normal(loc=1, size=100)
    >>> stats.permtest_pearsonr(x, np.column_stack([y, z]))
    (array([0.50004037, 0.25843187]), array([0.000999  , 0.01098901]))

    or by using matching columns in the two arrays (e.g., `x` and `y` vs
    `a` and `b`):

    >>> a, b = datasets.make_correlated_xy(corr=0.9, size=100)
    >>> stats.permtest_pearsonr(np.column_stack([x, a]), np.column_stack([y, b]))
    (array([0.50004037, 0.89927523]), array([0.000999, 0.000999]))
    """  # noqa
    a, b, axis = _chk2_asarray(a, b, axis)
    rs = check_random_state(seed)

    if len(a) != len(b):
        raise ValueError('Provided arrays do not have same length')

    if a.size == 0 or b.size == 0:
        return np.nan, np.nan

    if resamples is not None:
        if n_perm > resamples.shape[-1]:
            raise ValueError('Number of permutations requested exceeds size '
                             'of resampling array.')

    # divide by one forces coercion to float if ndim = 0
    true_corr = efficient_pearsonr(a, b)[0] / 1
    abs_true = np.abs(true_corr)

    permutations = np.ones(true_corr.shape)
    for perm in range(n_perm):
        # permute `a` and determine whether correlations exceed original
        if resamples is None:
            ap = a[rs.permutation(len(a))]
        else:
            ap = a[resamples[:, perm]]
        permutations += np.abs(efficient_pearsonr(ap, b)[0]) >= abs_true

    pvals = permutations / (n_perm + 1)  # + 1 in denom accounts for true_corr

    return true_corr, pvals
def efficient_pearsonr(a, b, ddof=1, nan_policy='propagate'):
    """
    Compute correlation of matching columns in `a` and `b`.

    Parameters
    ----------
    a,b : array_like
        Sample observations. These arrays must have the same length and
        either an equivalent number of columns or be broadcastable
    ddof : int, optional
        Degrees of freedom correction in the calculation of the standard
        deviation. Default: 1
    nan_policy : str, optional
        Defines how to handle when input contains nan. 'propagate' returns
        nan, 'raise' throws an error, 'omit' performs the calculations
        ignoring nan values. Default: 'propagate'

    Returns
    -------
    corr : float or numpy.ndarray
        Pearson's correlation coefficient between matching columns of inputs
    pval : float or numpy.ndarray
        Two-tailed p-values

    Notes
    -----
    If either input contains nan and nan_policy is set to 'omit', both
    arrays will be masked to omit the nan entries.

    Examples
    --------
    >>> import numpy as np
    >>> from netneurotools import datasets, stats

    Generate some not-very-correlated and some highly-correlated data:

    >>> np.random.seed(12345678)  # set random seed for reproducible results
    >>> x1, y1 = datasets.make_correlated_xy(corr=0.1, size=100)
    >>> x2, y2 = datasets.make_correlated_xy(corr=0.8, size=100)

    Calculate both correlations simultaneously:

    >>> stats.efficient_pearsonr(np.c_[x1, x2], np.c_[y1, y2])
    (array([0.10032565, 0.79961189]), array([3.20636135e-01, 1.97429944e-23]))
    """
    a, b, axis = _chk2_asarray(a, b, 0)
    if len(a) != len(b):
        raise ValueError('Provided arrays do not have same length')

    if a.size == 0 or b.size == 0:
        return np.nan, np.nan

    if nan_policy not in ('propagate', 'raise', 'omit'):
        raise ValueError(f'Value for nan_policy "{nan_policy}" not allowed')

    a, b = a.reshape(len(a), -1), b.reshape(len(b), -1)
    if (a.shape[1] != b.shape[1]):
        a, b = np.broadcast_arrays(a, b)

    mask = np.logical_or(np.isnan(a), np.isnan(b))
    if nan_policy == 'raise' and np.any(mask):
        raise ValueError('Input cannot contain NaN when nan_policy is '
                         '"raise"')
    elif nan_policy == 'omit':
        # avoid making copies of the data, if possible
        a = np.ma.masked_array(a, mask, copy=False, fill_value=np.nan)
        b = np.ma.masked_array(b, mask, copy=False, fill_value=np.nan)

    with np.errstate(invalid='ignore'):
        corr = (sstats.zscore(a, ddof=ddof, nan_policy=nan_policy)
                * sstats.zscore(b, ddof=ddof, nan_policy=nan_policy))

    sumfunc, n_obs = np.sum, len(a)
    if nan_policy == 'omit':
        corr = corr.filled(np.nan)
        sumfunc = np.nansum
        n_obs = np.squeeze(np.sum(np.logical_not(np.isnan(corr)), axis=0))

    corr = sumfunc(corr, axis=0) / (n_obs - 1)
    corr = np.squeeze(np.clip(corr, -1, 1)) / 1

    # taken from scipy.stats
    ab = (n_obs / 2) - 1
    prob = 2 * special.btdtr(ab, ab, 0.5 * (1 - np.abs(corr)))

    return corr, prob
def _gen_rotation(seed=None):
    """
    Generate random matrix for rotating spherical coordinates.

    Parameters
    ----------
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation

    Returns
    -------
    rotate_{l,r} : (3, 3) numpy.ndarray
        Rotations for left and right hemisphere coordinates, respectively
    """
    rs = check_random_state(seed)

    # for reflecting across Y-Z plane
    reflect = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 1]])

    # generate rotation for left
    rotate_l, temp = np.linalg.qr(rs.normal(size=(3, 3)))
    rotate_l = rotate_l @ np.diag(np.sign(np.diag(temp)))
    if np.linalg.det(rotate_l) < 0:
        rotate_l[:, 0] = -rotate_l[:, 0]

    # reflect the left rotation across Y-Z plane
    rotate_r = reflect @ rotate_l @ reflect

    return rotate_l, rotate_r
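
# Sanity-check sketch (not part of the original module): rotations produced
# by `_gen_rotation` should be orthogonal with determinant +1, and the right
# rotation is the left one reflected across the Y-Z plane.
def _example_gen_rotation():
    rotate_l, rotate_r = _gen_rotation(seed=1234)
    assert np.allclose(rotate_l.T @ rotate_l, np.eye(3))  # orthogonal
    assert np.isclose(np.linalg.det(rotate_l), 1.0)       # proper rotation
    assert np.isclose(np.linalg.det(rotate_r), 1.0)       # det unchanged by
    #                                                       double reflection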
def gen_spinsamples(coords, hemiid, n_rotate=1000, check_duplicates=True,
                    method='original', exact=False, seed=None, verbose=False,
                    return_cost=False):
    """
    Return a resampling array for `coords` obtained from rotations / spins.

    Using the method initially proposed in [ST1]_ (and later modified +
    updated based on findings in [ST2]_ and [ST3]_), this function applies
    random rotations to the user-supplied `coords` in order to generate a
    resampling array that preserves its spatial embedding. Rotations are
    generated for one hemisphere and mirrored for the other (see `hemiid`
    for more information).

    Due to irregular sampling of `coords` and the randomness of the
    rotations it is possible that some "rotations" may resample with
    replacement (i.e., will not be a true permutation). The likelihood of
    this can be reduced by either increasing the sampling density of
    `coords` or changing the ``method`` parameter (see Notes for more
    information on the latter).

    Parameters
    ----------
    coords : (N, 3) array_like
        X, Y, Z coordinates of `N` nodes/parcels/regions/vertices defined on
        a sphere
    hemiid : (N,) array_like
        Array denoting hemisphere designation of coordinates in `coords`,
        where values should be {0, 1} denoting the different hemispheres.
        Rotations are generated for one hemisphere and mirrored across the
        y-axis for the other hemisphere.
    n_rotate : int, optional
        Number of rotations to generate. Default: 1000
    check_duplicates : bool, optional
        Whether to check for and attempt to avoid duplicate resamplings. A
        warning will be raised if duplicates cannot be avoided. Setting to
        True may increase the runtime of this function! Default: True
    method : {'original', 'vasa', 'hungarian'}, optional
        Method by which to match non- and rotated coordinates. Specifying
        'original' will use the method described in [ST1]_. Specifying
        'vasa' will use the method described in [ST4]_. Specifying
        'hungarian' will use the Hungarian algorithm to minimize the global
        cost of reassignment (will dramatically increase runtime).
        Default: 'original'
    seed : {int, np.random.RandomState instance, None}, optional
        Seed for random number generation. Default: None
    verbose : bool, optional
        Whether to print occasional status messages. Default: False
    return_cost : bool, optional
        Whether to return cost array (specified as Euclidean distance) for
        each coordinate for each rotation. Default: False

    Returns
    -------
    spinsamples : (N, `n_rotate`) numpy.ndarray
        Resampling matrix to use in permuting data based on supplied
        `coords`.
    cost : (N, `n_rotate`,) numpy.ndarray
        Cost (specified as Euclidean distance) of re-assigning each
        coordinate for every rotation in `spinsamples`. Only provided if
        `return_cost` is True.

    Notes
    -----
    By default, this function uses the minimum Euclidean distance between
    the original coordinates and the new, rotated coordinates to generate a
    resampling array after each spin. Unfortunately, this can (with some
    frequency) lead to multiple coordinates being re-assigned the same
    value:

    >>> from netneurotools import stats as nnstats
    >>> coords = [[0, 0, 1], [1, 0, 0], [0, 0, 1], [1, 0, 0]]
    >>> hemi = [0, 0, 1, 1]
    >>> nnstats.gen_spinsamples(coords, hemi, n_rotate=1, seed=1,
    ...                         method='original', check_duplicates=False)
    array([[0],
           [0],
           [2],
           [3]])

    While this is reasonable in most circumstances, if you feel incredibly
    strongly about having a perfect "permutation" (i.e., all indices appear
    once and exactly once in the resampling), you can set the ``method``
    parameter to either 'vasa' or 'hungarian':

    >>> nnstats.gen_spinsamples(coords, hemi, n_rotate=1, seed=1,
    ...                         method='vasa', check_duplicates=False)
    array([[1],
           [0],
           [2],
           [3]])
    >>> nnstats.gen_spinsamples(coords, hemi, n_rotate=1, seed=1,
    ...                         method='hungarian', check_duplicates=False)
    array([[0],
           [1],
           [2],
           [3]])

    Note that setting this parameter may increase the runtime of the
    function (especially for `method='hungarian'`). Refer to [ST1]_ for
    information on why the default (i.e., ``exact`` set to False) suffices
    in most cases.

    For the original MATLAB implementation of this function refer to [ST5]_.

    References
    ----------
    .. [ST1] Alexander-Bloch, A., Shou, H., Liu, S., Satterthwaite, T. D.,
       Glahn, D. C., Shinohara, R. T., Vandekar, S. N., & Raznahan, A.
       (2018). On testing for spatial correspondence between maps of human
       brain structure and function. NeuroImage, 178, 540-51.

    .. [ST2] Blaser, R., & Fryzlewicz, P. (2016). Random Rotation Ensembles.
       Journal of Machine Learning Research, 17(4), 1-26.

    .. [ST3] Lefèvre, J., Pepe, A., Muscato, J., De Guio, F., Girard, N.,
       Auzias, G., & Germanaud, D. (2018). SPANOL (SPectral ANalysis of
       Lobes): A Spectral Clustering Framework for Individual and Group
       Parcellation of Cortical Surfaces in Lobes. Frontiers in
       Neuroscience, 12, 354.

    .. [ST4] Váša, F., Seidlitz, J., Romero-Garcia, R., Whitaker, K. J.,
       Rosenthal, G., Vértes, P. E., ... & Jones, P. B. (2018). Adolescent
       tuning of association cortex in human structural brain networks.
       Cerebral Cortex, 28(1), 281-294.

    .. [ST5] https://github.com/spin-test/spin-test
    """
    methods = ['original', 'vasa', 'hungarian']
    if method not in methods:
        raise ValueError('Provided method "{}" invalid. Must be one of {}.'
                         .format(method, methods))

    if exact:
        warnings.warn('The `exact` parameter will no longer be supported in '
                      'an upcoming release. Please use the `method` '
                      'parameter instead.', DeprecationWarning, stacklevel=3)
        if exact == 'vasa' and method == 'original':
            method = 'vasa'
        elif exact and method == 'original':
            method = 'hungarian'

    seed = check_random_state(seed)

    coords = np.asanyarray(coords)
    hemiid = np.squeeze(np.asanyarray(hemiid, dtype='int8'))

    # check supplied coordinate shape
    if coords.shape[-1] != 3 or coords.squeeze().ndim != 2:
        raise ValueError('Provided `coords` must be of shape (N, 3), not {}'
                         .format(coords.shape))

    # ensure hemisphere designation array is correct
    if hemiid.ndim != 1:
        raise ValueError('Provided `hemiid` array must be one-dimensional.')
    if len(coords) != len(hemiid):
        raise ValueError('Provided `coords` and `hemiid` must have the same '
                         'length. Provided lengths: coords = {}, hemiid = {}'
                         .format(len(coords), len(hemiid)))
    if np.max(hemiid) > 1 or np.min(hemiid) < 0:
        raise ValueError('Hemiid must have values in {0, 1} denoting left '
                         'and right hemisphere coordinates, respectively. '
                         + 'Provided array contains values: {}'
                         .format(np.unique(hemiid)))

    # empty array to store resampling indices
    spinsamples = np.zeros((len(coords), n_rotate), dtype=int)
    cost = np.zeros((len(coords), n_rotate))
    inds = np.arange(len(coords), dtype=int)

    # generate rotations and resampling array!
    msg, warned = '', False
    for n in range(n_rotate):
        count, duplicated = 0, True

        if verbose:
            msg = 'Generating spin {:>5} of {:>5}'.format(n, n_rotate)
            print(msg, end='\r', flush=True)

        while duplicated and count < 500:
            count, duplicated = count + 1, False
            resampled = np.zeros(len(coords), dtype='int32')

            # rotate each hemisphere separately
            for h, rot in enumerate(_gen_rotation(seed=seed)):
                hinds = (hemiid == h)
                coor = coords[hinds]
                if len(coor) == 0:
                    continue

                # if we need an "exact" mapping (i.e., each node needs to be
                # assigned EXACTLY once) then we have to calculate the full
                # distance matrix which is a nightmare with respect to memory
                # for anything that isn't parcellated data.
                # that is, don't do this with vertex coordinates!
                if method == 'vasa':
                    dist = spatial.distance_matrix(coor, coor @ rot)
                    # min of max a la Vasa et al., 2018
                    col = np.zeros(len(coor), dtype='int32')
                    for _ in range(len(dist)):
                        # find parcel whose closest neighbor is farthest
                        # away overall; assign to that
                        row = dist.min(axis=1).argmax()
                        col[row] = dist[row].argmin()
                        cost[inds[hinds][row], n] = dist[row, col[row]]
                        # set to -inf and inf so they can't be assigned again
                        dist[row] = -np.inf
                        dist[:, col[row]] = np.inf
                # optimization of total cost using Hungarian algorithm. this
                # may result in certain parcels having higher cost than with
                # `method='vasa'` but should always result in the total cost
                # being lower #tradeoffs
                elif method == 'hungarian':
                    dist = spatial.distance_matrix(coor, coor @ rot)
                    row, col = optimize.linear_sum_assignment(dist)
                    cost[hinds, n] = dist[row, col]
                # if nodes can be assigned multiple targets, we can simply
                # use the absolute minimum of the distances (no optimization
                # required) which is _much_ lighter on memory
                # huge thanks to https://stackoverflow.com/a/47779290 for
                # this memory-efficient method
                elif method == 'original':
                    dist, col = spatial.cKDTree(coor @ rot).query(coor, 1)
                    cost[hinds, n] = dist

                resampled[hinds] = inds[hinds][col]

            # if we want to check for duplicates ensure that we don't have
            # any
            if check_duplicates:
                if np.any(np.all(resampled[:, None] == spinsamples[:, :n],
                                 axis=0)):
                    duplicated = True
                # if our "spin" is identical to the input then that's no
                # good
                elif np.all(resampled == inds):
                    duplicated = True

        # if we broke out because we tried 500 rotations and couldn't
        # generate a new one, warn that we're using duplicate rotations and
        # give up. this should only be triggered if check_duplicates is set
        # to True
        if count == 500 and not warned:
            warnings.warn('Duplicate rotations used. Check resampling array '
                          'to determine real number of unique permutations.',
                          stacklevel=2)
            warned = True

        spinsamples[:, n] = resampled

    if verbose:
        print(' ' * len(msg) + '\b' * len(msg), end='', flush=True)

    if return_cost:
        return spinsamples, cost

    return spinsamples
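
# Usage sketch (not part of the original module): the resampling array from
# `gen_spinsamples` can be fed to `permtest_pearsonr` via its `resamples`
# argument so the null distribution preserves spatial embedding. All data
# below are synthetic and purely illustrative.
def _example_spin_permutation():
    rng = np.random.default_rng(1234)
    coords = rng.normal(size=(20, 3))
    coords /= np.linalg.norm(coords, axis=1, keepdims=True)  # points on a sphere
    hemiid = np.repeat([0, 1], 10)
    spins = gen_spinsamples(coords, hemiid, n_rotate=100, seed=1234)
    x, y = rng.normal(size=20), rng.normal(size=20)
    corr, pval = permtest_pearsonr(x, y, n_perm=100, resamples=spins)
    return corr, pval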
def get_dominance_stats(X, y, use_adjusted_r_sq=True, verbose=False,
                        n_jobs=1):
    """
    Return the dominance analysis statistics for multilinear regression.

    This is a rewritten & simplified version of [DA1]_. It is briefly tested
    against the original package, but still in early stages. Please feel
    free to report any bugs.

    Warning: Still work-in-progress. Parameters might change!

    Parameters
    ----------
    X : (N, M) array_like
        Input data
    y : (N,) array_like
        Target values
    use_adjusted_r_sq : bool, optional
        Whether to use adjusted r squares. Default: True
    verbose : bool, optional
        Whether to print debug messages. Default: False
    n_jobs : int, optional
        The number of jobs to run in parallel. Default: 1

    Returns
    -------
    model_metrics : dict
        The dominance metrics, currently containing `individual_dominance`,
        `partial_dominance`, `total_dominance`, and `full_r_sq`.
    model_r_sq : dict
        Contains all model r squares

    Notes
    -----
    Example usage

    .. code:: python

        from netneurotools.stats import get_dominance_stats
        from sklearn.datasets import load_boston
        X, y = load_boston(return_X_y=True)
        model_metrics, model_r_sq = get_dominance_stats(X, y)

    To compare with [DA1]_, use `use_adjusted_r_sq=False`

    .. code:: python

        from dominance_analysis import Dominance_Datasets
        from dominance_analysis import Dominance
        boston_dataset = Dominance_Datasets.get_boston()
        dominance_regression = Dominance(data=boston_dataset,
                                         target='House_Price', objective=1)
        incr_variable_rsquare = dominance_regression.incremental_rsquare()
        dominance_regression.dominance_stats()

    References
    ----------
    .. [DA1] https://github.com/dominance-analysis/dominance-analysis
    """
    # this helps to remove one element from a tuple
    def remove_ret(tpl, elem):
        lst = list(tpl)
        lst.remove(elem)
        return tuple(lst)

    # sklearn linear regression wrapper
    def get_reg_r_sq(X, y, use_adjusted_r_sq=True):
        lin_reg = LinearRegression()
        lin_reg.fit(X, y)
        yhat = lin_reg.predict(X)
        SS_Residual = sum((y - yhat) ** 2)
        SS_Total = sum((y - np.mean(y)) ** 2)
        r_squared = 1 - (float(SS_Residual)) / SS_Total
        adjusted_r_squared = 1 - (1 - r_squared) * \
            (len(y) - 1) / (len(y) - X.shape[1] - 1)
        if use_adjusted_r_sq:
            return adjusted_r_squared
        else:
            return r_squared

    # helper function to compute r_sq for a given idx_tuple
    def compute_r_sq(idx_tuple):
        return idx_tuple, get_reg_r_sq(X[:, idx_tuple], y,
                                       use_adjusted_r_sq=use_adjusted_r_sq)

    # generate all predictor combinations in list (num of predictors) of
    # lists
    n_predictor = X.shape[-1]
    # n_comb_len_group = n_predictor - 1
    predictor_combs = [list(combinations(range(n_predictor), i))
                       for i in range(1, n_predictor + 1)]
    if verbose:
        print(f"[Dominance analysis] Generated "
              f"{len([v for i in predictor_combs for v in i])} combinations")

    model_r_sq = dict()
    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_r_sq)(idx_tuple)
        for len_group in tqdm(predictor_combs, desc='num-of-predictor loop',
                              disable=not verbose)
        for idx_tuple in tqdm(len_group, desc='insider loop',
                              disable=not verbose))
    # extract r_sq from results
    for idx_tuple, r_sq in results:
        model_r_sq[idx_tuple] = r_sq
    if verbose:
        print(f"[Dominance analysis] Acquired {len(model_r_sq)} r^2's")

    # getting all model metrics
    model_metrics = dict([])

    # individual dominance
    individual_dominance = []
    for i_pred in range(n_predictor):
        individual_dominance.append(model_r_sq[(i_pred,)])
    individual_dominance = np.array(individual_dominance).reshape(1, -1)
    model_metrics["individual_dominance"] = individual_dominance

    # partial dominance
    partial_dominance = [[] for _ in range(n_predictor - 1)]
    for i_len in range(n_predictor - 1):
        i_len_combs = list(combinations(range(n_predictor), i_len + 2))
        for j_node in range(n_predictor):
            j_node_sel = [v for v in i_len_combs if j_node in v]
            reduced_list = [remove_ret(comb, j_node) for comb in j_node_sel]
            diff_values = [
                model_r_sq[j_node_sel[i]] - model_r_sq[reduced_list[i]]
                for i in range(len(reduced_list))]
            partial_dominance[i_len].append(np.mean(diff_values))

    # save partial dominance
    partial_dominance = np.array(partial_dominance)
    model_metrics["partial_dominance"] = partial_dominance

    # get total dominance
    total_dominance = np.mean(
        np.r_[individual_dominance, partial_dominance], axis=0)
    # test and save total dominance
    assert np.allclose(total_dominance.sum(),
                       model_r_sq[tuple(range(n_predictor))]), \
        "Sum of total dominance is not equal to full r square!"
    model_metrics["total_dominance"] = total_dominance
    # save full r^2
    model_metrics["full_r_sq"] = model_r_sq[tuple(range(n_predictor))]

    return model_metrics, model_r_sq
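
# Usage sketch (not part of the original module): `load_boston`, used in the
# docstring above, was removed from recent scikit-learn releases; the
# diabetes dataset is a drop-in substitute for trying this function out.
# Restricting to five predictors keeps the 2**5 - 1 = 31 submodels quick to
# fit.
def _example_get_dominance_stats():
    from sklearn.datasets import load_diabetes
    X, y = load_diabetes(return_X_y=True)
    model_metrics, model_r_sq = get_dominance_stats(X[:, :5], y)
    return model_metrics["total_dominance"]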
def network_pearsonr(annot1, annot2, weight):
    r"""
    Calculate Pearson correlation between two annotation vectors.

    .. warning::
        Test before use.

    Parameters
    ----------
    annot1 : (N,) array_like
        First annotation vector, demean will be applied.
    annot2 : (N,) array_like
        Second annotation vector, demean will be applied.
    weight : (N, N) array_like
        Weight matrix. Diagonal elements should be 1.

    Returns
    -------
    corr : float
        Network correlation between `annot1` and `annot2`

    Notes
    -----
    If Pearson correlation is represented as

    .. math::
        \rho_{x,y} = \dfrac{
            \mathrm{sum}(I \times (\hat{x} \otimes \hat{y}))
        }{
            \sigma_x \sigma_y
        }

    the network correlation is defined analogously as

    .. math::
        \rho_{x,y,G} = \dfrac{
            \mathrm{sum}(W \times (\hat{x} \otimes \hat{y}))
        }{
            \sigma_{x,W} \sigma_{y,W}
        }

    where :math:`\hat{x}` and :math:`\hat{y}` are the demeaned annotation
    vectors. The weight matrix :math:`W` is used to represent the network
    structure. It is usually of the form :math:`W = \exp(-kL)` where
    :math:`L` is the length matrix and :math:`k` is a decay parameter.

    Example using shortest path length as weight

    .. code:: python

        spl, _ = distance_wei_floyd(D)  # input should be distance matrix
        spl_wei = 1 / np.exp(spl)
        netcorr = network_pearsonr(annot1, annot2, spl_wei)

    Example using (inverse) effective resistance as weight

    .. code:: python

        R_eff = effective_resistance(W)
        R_eff_norm = R_eff / np.max(R_eff)
        W = 1 / R_eff_norm
        W = W / np.max(W)
        np.fill_diagonal(W, 1.0)
        netcorr = network_pearsonr(annot1, annot2, W)

    References
    ----------
    .. [1] Coscia, M. (2021). Pearson correlations on complex networks.
       Journal of Complex Networks, 9(6), cnab036.
       https://doi.org/10.1093/comnet/cnab036

    See Also
    --------
    netneurotools.stats.network_pearsonr_pairwise
    """
    annot1 = annot1 - np.mean(annot1)
    annot2 = annot2 - np.mean(annot2)
    upper = np.sum(np.multiply(weight, np.outer(annot1, annot2)))
    lower1 = np.sum(np.multiply(weight, np.outer(annot1, annot1)))
    lower2 = np.sum(np.multiply(weight, np.outer(annot2, annot2)))
    return upper / np.sqrt(lower1) / np.sqrt(lower2)
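
# Sanity-check sketch (not part of the original module): with the identity
# matrix as `weight`, the network correlation reduces to the ordinary
# Pearson correlation.
def _example_network_pearsonr_identity():
    rng = np.random.default_rng(1234)
    x, y = rng.normal(size=100), rng.normal(size=100)
    r_net = network_pearsonr(x, y, np.eye(100))
    assert np.isclose(r_net, np.corrcoef(x, y)[0, 1])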
def network_pearsonr_numba(annot1, annot2, weight):
    """
    Numba version of :meth:`netneurotools.stats.network_pearsonr`.

    .. warning::
        Test before use.

    Parameters
    ----------
    annot1 : (N,) array_like
        First annotation vector, demean will be applied.
    annot2 : (N,) array_like
        Second annotation vector, demean will be applied.
    weight : (N, N) array_like
        Weight matrix. Diagonal elements should be 1.

    Returns
    -------
    corr : float
        Network correlation between `annot1` and `annot2`
    """
    n = annot1.shape[0]
    annot1 = annot1 - np.mean(annot1)
    annot2 = annot2 - np.mean(annot2)
    upper, lower1, lower2 = 0.0, 0.0, 0.0
    for i in range(n):
        for j in range(n):
            upper += annot1[i] * annot2[j] * weight[i, j]
            lower1 += annot1[i] * annot1[j] * weight[i, j]
            lower2 += annot2[i] * annot2[j] * weight[i, j]
    return upper / np.sqrt(lower1) / np.sqrt(lower2)
if use_numba:
    network_pearsonr_numba = njit(network_pearsonr_numba)


def _cross_outer(annot_mat):
    """
    Calculate cross outer product of input matrix.

    This function is only used in `network_pearsonr_pairwise`.

    Parameters
    ----------
    annot_mat : (N, D) array_like
        Input matrix

    Returns
    -------
    cross_outer : (N, N, D, D) numpy.ndarray
        Cross outer product of `annot_mat`
    """
    n_samp, n_feat = annot_mat.shape
    cross_outer = np.empty((n_samp, n_samp, n_feat, n_feat), annot_mat.dtype)
    for a in range(n_samp):
        for b in range(n_samp):
            for c in range(n_feat):
                for d in range(n_feat):
                    cross_outer[a, b, c, d] = \
                        annot_mat[a, c] * annot_mat[b, d]
    return cross_outer


if use_numba:
    # ("float64[:,:,:,::1](float64[:,::1])")
    _cross_outer = njit(_cross_outer)


def _multiply_sum(cross_outer, weight):
    """
    Multiply and sum cross outer product.

    This function is only used in `network_pearsonr_pairwise`.

    Parameters
    ----------
    cross_outer : (N, N, D, D) array_like
        Cross outer product of `annot_mat`
    weight : (D, D) array_like
        Weight matrix

    Returns
    -------
    cross_outer_after : (N, N) numpy.ndarray
        Result of multiplying and summing `cross_outer`
    """
    n_samp, _, n_dim, _ = cross_outer.shape
    cross_outer_after = np.empty((n_samp, n_samp), cross_outer.dtype)
    for i in range(n_samp):
        for j in range(n_samp):
            curr_sum = 0.0
            for k in range(n_dim):
                for l in range(n_dim):  # noqa: E741
                    curr_sum += weight[k, l] * cross_outer[i, j, k, l]
            cross_outer_after[i, j] = curr_sum
    return cross_outer_after


if use_numba:
    # ("float64[:,::1](float64[:,:,:,::1],float64[:,::1])")
    _multiply_sum = njit(_multiply_sum)
def network_pearsonr_pairwise(annot_mat, weight):
    """
    Calculate pairwise network correlation between rows of `annot_mat`.

    .. warning::
        Test before use.

    Parameters
    ----------
    annot_mat : (N, D) array_like
        Input matrix
    weight : (D, D) array_like
        Weight matrix. Diagonal elements should be 1.

    Returns
    -------
    corr_mat : (N, N) numpy.ndarray
        Pairwise network correlation matrix

    Notes
    -----
    This is a faster version of :meth:`netneurotools.stats.network_pearsonr`
    for calculating pairwise network correlation between rows of
    `annot_mat`. Check :meth:`netneurotools.stats.network_pearsonr` for
    details.

    See Also
    --------
    netneurotools.stats.network_pearsonr
    """
    annot_mat_demean = annot_mat - np.mean(annot_mat, axis=1, keepdims=True)
    if use_numba:
        cross_outer = _cross_outer(annot_mat_demean)
        cross_outer_after = _multiply_sum(cross_outer, weight)
    else:
        # https://stackoverflow.com/questions/24839481/python-matrix-outer-product
        cross_outer = np.einsum(
            'ac,bd->abcd', annot_mat_demean, annot_mat_demean)
        cross_outer_after = np.sum(
            np.multiply(cross_outer, weight), axis=(2, 3))
    # translating the two lines below in numba does not speed up much
    lower = np.sqrt(np.diagonal(cross_outer_after))
    return cross_outer_after / np.einsum('i,j', lower, lower)
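
# Sanity-check sketch (not part of the original module): the pairwise
# version should agree with `network_pearsonr` applied to individual row
# pairs.
def _example_network_pearsonr_pairwise():
    rng = np.random.default_rng(1234)
    annot_mat = rng.normal(size=(4, 20))   # 4 annotations over 20 nodes
    weight = np.eye(20)                    # identity weight for simplicity
    corr_mat = network_pearsonr_pairwise(annot_mat, weight)
    expected = network_pearsonr(annot_mat[0], annot_mat[1], weight)
    assert np.isclose(corr_mat[0, 1], expected)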
def _onehot_quadratic_form_broadcast(Q_star):
    """
    Calculate one-hot quadratic form of input matrix.

    This function is only used in `effective_resistance`.

    Parameters
    ----------
    Q_star : (N, N) array_like
        Input matrix

    Returns
    -------
    R_eff : (N, N) numpy.ndarray
        One-hot quadratic form of `Q_star`
    """
    n = Q_star.shape[0]
    R_eff = np.empty((n, n), Q_star.dtype)
    for i in range(n):
        for j in range(n):
            R_eff[i, j] = \
                Q_star[i, i] - Q_star[j, i] - Q_star[i, j] + Q_star[j, j]
    return R_eff


if use_numba:
    # ("float64[:,::1](float64[:,::1])")
    _onehot_quadratic_form_broadcast = njit(_onehot_quadratic_form_broadcast)
def effective_resistance(W, directed=True):
    """
    Calculate effective resistance matrix.

    The effective resistance between two nodes in a graph, often used in the
    context of electrical networks, is a measure that stems from the inverse
    of the Laplacian matrix of the graph.

    .. warning::
        Test before use.

    Parameters
    ----------
    W : (N, N) array_like
        Weight matrix.
    directed : bool, optional
        Whether the graph is directed. This is used to determine whether to
        turn on the :code:`hermitian=True` option in
        :func:`numpy.linalg.pinv`. When you are using a symmetric weight
        matrix (which, being real-valued, implies hermitian), you can set
        this to False for better performance. Default: True

    Returns
    -------
    R_eff : (N, N) numpy.ndarray
        Effective resistance matrix

    Notes
    -----
    The effective resistance between two nodes :math:`i` and :math:`j` is
    defined as

    .. math::
        R_{ij} = (e_i - e_j)^T Q^* (e_i - e_j)

    where :math:`Q^*` is the Moore-Penrose pseudoinverse of the Laplacian
    matrix :math:`L` of the graph, and :math:`e_i` is the :math:`i`-th
    standard basis vector.

    References
    ----------
    .. [1] Ellens, W., Spieksma, F. M., Van Mieghem, P., Jamakovic, A., &
       Kooij, R. E. (2011). Effective graph resistance. Linear Algebra and
       Its Applications, 435(10), 2491-2506.
       https://doi.org/10.1016/j.laa.2011.02.024

    See Also
    --------
    netneurotools.stats.network_polarisation
    """
    L = _graph_laplacian(W)
    Q_star = np.linalg.pinv(L, hermitian=not directed)
    if use_numba:
        R_eff = _onehot_quadratic_form_broadcast(Q_star)
    else:
        Q_star_diag = np.diag(Q_star)
        R_eff = \
            Q_star_diag[:, np.newaxis] \
            - Q_star \
            - Q_star.T \
            + Q_star_diag[np.newaxis, :]
    return R_eff
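
# Sanity-check sketch (not part of the original module), assuming
# `_graph_laplacian` builds the standard combinatorial Laplacian L = D - W:
# on a path graph with unit-weight edges, the effective resistance between
# the two end nodes equals the number of edges (unit resistors in series).
def _example_effective_resistance():
    W = np.zeros((4, 4))
    for i in range(3):
        W[i, i + 1] = W[i + 1, i] = 1.0    # path graph 0 - 1 - 2 - 3
    R_eff = effective_resistance(W, directed=False)
    assert np.isclose(R_eff[0, 3], 3.0)    # three unit resistors in series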
def _polariz_diff(vec):
    """
    Calculate difference between positive and negative parts of a vector.

    This function is only used in `network_polarisation`.

    Parameters
    ----------
    vec : (N,) array_like
        Input vector. Must have both positive and negative values.

    Returns
    -------
    vec_diff : (N,) numpy.ndarray
        Difference between positive and negative parts of `vec`
    """
    # positive part, scaled to a maximum of 1
    vec_pos = np.maximum(vec, 0.0)
    vec_pos /= np.max(vec_pos)
    # negative part (in absolute value), scaled to a maximum of 1
    vec_neg = np.minimum(vec, 0.0)
    vec_neg = np.abs(vec_neg)
    vec_neg /= np.max(vec_neg)
    return (vec_pos - vec_neg)


if use_numba:
    _polariz_diff = njit(_polariz_diff)


def _quadratic_form(W, vec_left, vec_right, squared=False):
    """
    Calculate quadratic form :math:`v_{left}^T W v_{right}`.

    Parameters
    ----------
    W : (N, N) array_like
        Input matrix.
    vec_left : (N,) array_like
        Left weight vector.
    vec_right : (N,) array_like
        Right weight vector.
    squared : bool, optional
        Whether to square the input weight matrix. Default: False

    Returns
    -------
    quadratic_form : float
        Quadratic form from `W`, `vec_left`, and `vec_right`
    """
    # numpy equivalent: (vec_left.T @ W @ vec_right)[0, 0]
    # numba may require contiguous 2-d input, e.g.,
    # vec = np.ascontiguousarray(vec[np.newaxis, :])
    n = W.shape[0]
    ret = 0.0
    for i in range(n):
        for j in range(n):
            if squared:
                ret += vec_left[i] * vec_right[j] * W[i, j]**2
            else:
                ret += vec_left[i] * vec_right[j] * W[i, j]
    return ret


if use_numba:
    _quadratic_form = njit(_quadratic_form)
def network_polarisation(vec, W, directed=True):
    r"""
    Calculate polarisation of a vector on a graph.

    Network polarisation is a measure of polarization taking into account
    all three factors below [1]_:

    - how extreme the opinions of the people are
    - how much they organize into echo chambers, and
    - how these echo chambers organize in the network

    .. warning::
        Test before use.

    Parameters
    ----------
    vec : (N,) array_like
        Polarization vector. Must have both positive and negative values.
        Will be normalized between -1 and 1 internally.
    W : (N, N) array_like
        Weight matrix.
    directed : bool, optional
        Whether the graph is directed. This is used to determine whether to
        turn on the :code:`hermitian=True` option in
        :func:`numpy.linalg.pinv`. When you are using a symmetric weight
        matrix (which, being real-valued, implies hermitian), you can set
        this to False for better performance. Default: True

    Returns
    -------
    polariz : float
        Polarization of `vec` on `W`

    Notes
    -----
    The measure is based on the generalized Euclidean distance, defined as

    .. math::
        \delta_{G, o} = \sqrt{(o^+ - o^-)^T Q^* (o^+ - o^-)}

    where :math:`o^+` and :math:`o^-` are the positive and negative parts
    of the polarization vector, and :math:`Q^*` is the Moore-Penrose
    pseudoinverse of the Laplacian matrix :math:`L` of the graph. Check
    :func:`effective_resistance` for similarity.

    References
    ----------
    .. [1] Hohmann, M., Devriendt, K., & Coscia, M. (2023). Quantifying
       ideological polarization on a network using generalized Euclidean
       distance. Science Advances, 9(9), eabq2044.
       https://doi.org/10.1126/sciadv.abq2044

    See Also
    --------
    netneurotools.stats.effective_resistance
    """
    L = _graph_laplacian(W)
    Q_star = np.linalg.pinv(L, hermitian=not directed)
    diff = _polariz_diff(vec)
    if use_numba:
        polariz_sq = _quadratic_form(Q_star, diff, diff, squared=False)
    else:
        polariz_sq = (diff.T @ Q_star @ diff)
    return np.sqrt(polariz_sq)
def network_variance(vec, D):
    r"""
    Calculate variance of a vector on a graph.

    Network variance is a measure of variance taking into account the
    network structure.

    .. warning::
        Test before use.

    Parameters
    ----------
    vec : (N,) array_like
        Input vector. Must be all positive. Will be normalized internally
        as a probability distribution.
    D : (N, N) array_like
        Distance matrix.

    Returns
    -------
    network_variance : float
        Network variance of `vec` on `D`

    Notes
    -----
    The network variance is defined as

    .. math::
        var(p) = \frac{1}{2} \sum_{i, j} p(i) p(j) d^2(i,j)

    where :math:`p` is the probability distribution of `vec`, and
    :math:`d(i,j)` is the distance between node :math:`i` and :math:`j`.
    The distance matrix :math:`D` can make use of effective resistance or
    its square root.

    Example using effective resistance as weight matrix

    .. code:: python

        R_eff = effective_resistance(W)
        netvar = network_variance(vec, R_eff)

    References
    ----------
    .. [1] Devriendt, K., Martin-Gutierrez, S., & Lambiotte, R. (2022).
       Variance and covariance of distributions on graphs. SIAM Review,
       64(2), 343-359. https://doi.org/10.1137/20M1361328

    See Also
    --------
    netneurotools.stats.network_covariance
    """
    p = vec / np.sum(vec)
    return 0.5 * (p.T @ np.multiply(D, D) @ p)
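
# Sanity-check sketch (not part of the original module): probability mass
# placed on adjacent nodes should give a smaller network variance than the
# same mass placed on distant nodes.
def _example_network_variance():
    # pairwise distances on a path graph 0 - 1 - 2 - 3
    D = np.abs(np.arange(4)[:, None] - np.arange(4)[None, :]).astype(float)
    concentrated = np.array([0.5, 0.5, 0.0, 0.0])   # var = 0.25
    spread = np.array([0.5, 0.0, 0.0, 0.5])         # var = 2.25
    assert network_variance(concentrated, D) < network_variance(spread, D)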
def network_variance_numba(vec, D):
    """
    Numba version of :meth:`netneurotools.stats.network_variance`.

    Network variance is a measure of variance taking into account the
    network structure.

    .. warning::
        Test before use.

    Parameters
    ----------
    vec : (N,) array_like
        Input vector. Must be all positive. Will be normalized internally
        as a probability distribution.
    D : (N, N) array_like
        Distance matrix.

    Returns
    -------
    network_variance : float
        Network variance of `vec` on `D`
    """
    p = vec / np.sum(vec)
    return 0.5 * _quadratic_form(D, p, p, squared=True)
if use_numba:
    network_variance_numba = njit(network_variance_numba)
def network_covariance(joint_pmat, D, calc_marginal=True):
    r"""
    Calculate covariance of a joint probability matrix on a graph.

    .. warning::
        Test before use.

    Parameters
    ----------
    joint_pmat : (N, N) array_like
        Joint probability matrix. Please make sure that it is valid.
    D : (N, N) array_like
        Distance matrix.
    calc_marginal : bool, optional
        Whether to calculate marginal variance. It will be marginally
        faster if :code:`calc_marginal=False` (returning marginal variances
        as 0). Default: True

    Returns
    -------
    network_covariance : float
        Covariance of `joint_pmat` on `D`
    var_p : float
        Marginal variance of `joint_pmat` on `D`.
        Will be 0 if :code:`calc_marginal=False`
    var_q : float
        Marginal variance of `joint_pmat` on `D`.
        Will be 0 if :code:`calc_marginal=False`

    Notes
    -----
    The network covariance is defined as

    .. math::
        cov(P) = \frac{1}{2} \sum_{i, j} [p(i) q(j) - P(i,j)] d^2(i,j)

    where :math:`P` is the joint probability matrix, :math:`p` and
    :math:`q` are the marginal probability distributions of `joint_pmat`,
    and :math:`d(i,j)` is the distance between node :math:`i` and
    :math:`j`.

    Check :func:`network_variance` for usage.

    References
    ----------
    .. [1] Devriendt, K., Martin-Gutierrez, S., & Lambiotte, R. (2022).
       Variance and covariance of distributions on graphs. SIAM Review,
       64(2), 343-359. https://doi.org/10.1137/20M1361328

    See Also
    --------
    netneurotools.stats.network_variance
    """
    p = np.sum(joint_pmat, axis=1)
    q = np.sum(joint_pmat, axis=0)
    D_sq = np.multiply(D, D)
    cov = p.T @ D_sq @ q - np.sum(np.multiply(joint_pmat, D_sq))
    if calc_marginal:
        var_p = p.T @ D_sq @ p
        var_q = q.T @ D_sq @ q
    else:
        var_p, var_q = 0, 0
    return 0.5 * cov, 0.5 * var_p, 0.5 * var_q
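
# Sanity-check sketch (not part of the original module): if the joint
# distribution factorizes into the outer product of its marginals
# (independence), the network covariance is exactly zero.
def _example_network_covariance():
    D = np.abs(np.arange(4)[:, None] - np.arange(4)[None, :]).astype(float)
    p = np.array([0.1, 0.2, 0.3, 0.4])
    joint_pmat = np.outer(p, p)            # P(i, j) = p(i) * q(j)
    cov, var_p, var_q = network_covariance(joint_pmat, D)
    assert np.isclose(cov, 0.0)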
def network_covariance_numba(joint_pmat, D, calc_marginal=True):
    """
    Numba version of :meth:`netneurotools.stats.network_covariance`.

    .. warning::
        Test before use.

    Parameters
    ----------
    joint_pmat : (N, N) array_like
        Joint probability matrix. Please make sure that it is valid.
    D : (N, N) array_like
        Distance matrix.
    calc_marginal : bool, optional
        Whether to calculate marginal variance. It will be marginally
        faster if :code:`calc_marginal=False` (returning marginal variances
        as 0). Default: True

    Returns
    -------
    network_covariance : float
        Covariance of `joint_pmat` on `D`
    var_p : float
        Marginal variance of `joint_pmat` on `D`.
        Will be 0 if :code:`calc_marginal=False`
    var_q : float
        Marginal variance of `joint_pmat` on `D`.
        Will be 0 if :code:`calc_marginal=False`
    """
    n = joint_pmat.shape[0]
    p = np.sum(joint_pmat, axis=1)
    q = np.sum(joint_pmat, axis=0)
    cov = 0.0
    var_p, var_q = 0.0, 0.0
    for i in range(n):
        for j in range(n):
            cov += (p[i] * q[j] - joint_pmat[i, j]) * D[i, j]**2
            if calc_marginal:
                var_p += p[i] * p[j] * D[i, j]**2
                var_q += q[i] * q[j] * D[i, j]**2
    return 0.5 * cov, 0.5 * var_p, 0.5 * var_q
if use_numba:
    network_covariance_numba = njit(network_covariance_numba)