Source code for netneurotools.modularity.modules

"""Functions for working with network modules."""

import bct
import numpy as np
from sklearn.utils.validation import check_random_state
from scipy import optimize
from scipy.cluster import hierarchy


def _get_relabels(c1, c2):
    """
    Find mapping of labels from `c1` to `c2`.

    Parameters
    ----------
    c1, c2 : (N,) array_like
        Cluster labels for `N` subjects

    Returns
    -------
    src, tar : numpy.ndarray
        Source-target mapping of labels in `c1`
    """

    def _match(c1s, c2s):
        intersect = len(np.intersect1d(c1s, c2s))
        return (len(c1s) - intersect) + (len(c2s) - intersect)

    # get unique IDs of clusters in both solutions
    ids1, ids2 = np.unique(c1), np.unique(c2)

    idxs = np.arange(len(c1))
    assignments = np.ones((len(ids1), len(ids2)), dtype=int) * -1

    for n, i in enumerate(ids1):
        c1s = idxs[c1 == i]
        assignments[n] = [_match(c1s, idxs[c2 == f]) for f in ids2]

    idx1, idx2 = optimize.linear_sum_assignment(assignments)

    return ids1[idx1], ids2[idx2]


[docs] def match_cluster_labels(source, target): """ Align cluster labels in `source` to those in `target`. Uses :func:`scipy.optimize.linear_sum_assignment` to match solutions. If `source` has fewer clusters than `target` the returned assignments may be discontinuous (see Examples for more information). Parameters ---------- source : (N,) array_like Cluster labels for `N` subjects, to be re-labelled target : (N,) array_like Cluster labels for `N` subjects, to which `source` is mapped Returns ------- matched : (N,) array_like Re-labelled `source` with cluster assignments "matched" to `target` Examples -------- >>> from netneurotools import modularity When cluster labels are perfectly matched but e.g., inverted the function will find a perfect mapping: >>> a = np.array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0]) >>> b = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) >>> modularity.match_cluster_labels(a, b) array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1]) However, the mapping will work even when cluster assignments between the two solutions aren't perfectly matched. The function will simply choose a re-labelling that generates the "best" alignment between labels: >>> a = np.array([0, 0, 0, 2, 2, 2, 2, 1, 1, 1]) >>> b = np.array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0]) >>> modularity.match_cluster_labels(a, b) array([1, 1, 1, 0, 0, 0, 0, 2, 2, 2]) If the source assignment has fewer clusters than the target the returned values may be discontinuous: >>> modularity.match_cluster_labels(b, a) array([0, 0, 0, 2, 2, 2, 2, 2, 2, 2]) """ # try and match the source to target src, tar = _get_relabels(source, target) # if there are a different number of clusters then handle elegantly. # elegantly here means we renumber the clusters so that they start at 1 src_m = np.setdiff1d(np.unique(source), src) if len(src_m) > 0: tar_m = np.arange(tar.max() + 1, tar.max() + 1 + len(src_m)) src, tar = np.append(src, src_m), np.append(tar, tar_m) # now re-label things based on the matched assignments sidx = src.argsort() src, tar = src[sidx], tar[sidx] matched = tar[np.searchsorted(src, source)] return matched
[docs] def match_assignments(assignments, target=None, seed=None): """ Re-label clusters in columns of `assignments` to best match `target`. Uses :func:`~.cluster.match_cluster_labels` to align cluster assignments. Parameters ---------- assignments : (N, M) array_like Array of `M` clustering assignments for `N` subjects target : (N,) array_like, optional Target clustering assignments to which all columns should be matched. If provided as an integer the relevant column in `assignments` will be selected. If not specified a (semi-)random column in `assignments` is chosen; because of the potential discontinuity introduced when matching an N-cluster solution to an N+1-cluster solution, the "random" target columns will be one `assignments` with the lowest cluster number. See Examples for more information. Default: None seed : {int, np.random.RandomState instance, None}, optional Seed for random number generation; only used if `target` is not provided. Default: None Returns ------- assignments : (N, M) numpy.ndarray Provided array with re-labeled cluster solutions to better match across `M` assignments Examples -------- >>> from netneurotools import modularity First we can construct a matrix of `N` samples clustered `M` times (in this case, `M` is three) . Since cluster labels are generally arbitrary we can see that, while the same clusters were found each time, they were given different labels: >>> assignments = np.array([[0, 0, 1], ... [0, 0, 1], ... [0, 0, 1], ... [1, 2, 0], ... [1, 2, 0], ... [1, 2, 0], ... [2, 1, 2], ... [2, 1, 2]]) We would like to match the assignments so they're all the same. Since one of the columns will be randomly picked as the "target" solution, we provide a `seed` to ensure reproducibility in the selection: >>> modularity.match_assignments(assignments, seed=1234) array([[1, 1, 1], [1, 1, 1], [1, 1, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [2, 2, 2], [2, 2, 2]]) Alternatively, if `assignments` has clustering solutions with different numbers of clusters and no `target` is specified, the chosen `target` will be one of the columns with the smallest number of clusters: >>> assignments = np.array([[0, 0, 1], ... [0, 0, 1], ... [0, 0, 1], ... [1, 2, 0], ... [1, 2, 0], ... [1, 2, 0], ... [1, 1, 2], ... [1, 1, 2]]) >>> modularity.match_assignments(assignments) array([[0, 0, 0], [0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [1, 1, 1], [1, 2, 2], [1, 2, 2]]) """ assignments = np.asarray(assignments).copy() # pick a random assignment with the lowest # of clusters as "target" if target is None: rs = check_random_state(seed) cm = assignments.max(axis=0) mask = cm == cm.min() target = assignments[:, mask][:, rs.choice(mask.sum())] # use the specified column of the matrix elif isinstance(target, int): target = assignments[:, target] # assume that target is an iterable we can use (just check the length) else: if len(target) != len(assignments): raise ValueError('Length of target clustering solution must be ' 'identical to length of provided array.') # iterate through all assignments and try and match them to the target for n, source in enumerate(assignments.T): assignments[:, n] = match_cluster_labels(source, target) return assignments
[docs] def reorder_assignments(assignments, consensus=None, col_sort=True, row_sort=True, return_index=True, seed=None): """ Relabel and reorders rows / columns of `assignments` to "look better". Relabels cluster solutions in `assignments` so that distinct clustering solutions have similar cluster labels. Then, swaps columns of `assignments` so that similar clustering solutions are placed near each other. Finally, swaps rows of `assignments` so that subjects with similar clustering profiles are placed near each other. Uses hierarchical clustering to generate re-ordering of columns and rows Parameters ---------- assignments : (N, M) array_like Array of `M` clustering assignments for `N` subjects consensus : (N,) array_like, optional "Final" clustering solution for `N` subjects. If provided, reordering of rows will be constrained by cluster assignment. Default: None {row,col}_sort : bool, optional If True, sort the {rows, columns}. Default: True return_index : bool, optional Whether to return the row and column indices used to re-order `assignments` in addition to the re-ordered matrix. Default: True seed : {int, np.random.RandomState instance, None}, optional Seed for random number generation. Default: None Returns ------- reordered : (N, M) numpy.ndarray Provided array with both rows and columns re-ordered index : tuple Indices used to reorder `assignments` to generate `reordered` output """ def _reorder_rows(arr): """Return indices of rows in `arr` after hierarchical clustering.""" link = hierarchy.linkage(arr, method='average', metric='hamming') return hierarchy.dendrogram(link, no_plot=True)['leaves'] # first, relabel the columns to try and match across assignments; this will # make our reordering procedure work a bit better! assignments = match_assignments(assignments, seed=seed) if col_sort: # get max cluster number for each partition max_cl = assignments.max(axis=0) # if different assignments have different numbers of detected clusters if len(np.unique(max_cl)) > 1: # first sort based on the number of clusters in each assignment col_idx = max_cl.argsort() assignments, max_cl = assignments[:, col_idx], max_cl[col_idx] # then, within assignments with the same number of clusters reorder # assignments (columns) reordered, splits = [], np.where(np.diff(max_cl) != 0)[0] + 1 col_idx = np.split(col_idx, splits) for n, cl in enumerate(np.split(assignments, splits, axis=1)): idx = _reorder_rows(cl.T) col_idx[n] = col_idx[n][idx] reordered += [cl[:, idx]] col_idx = list(np.hstack(col_idx)) assignments = np.column_stack(reordered) # otherwise all assignments have same number of detected clusters so # just sort them all else: col_idx = list(_reorder_rows(assignments.T)) assignments = assignments[:, col_idx] if row_sort: # if a consensus was provided reorder rows based on cluster assignment if consensus is not None: # sort subjects by their cluster assignment in the consensus for # each cluster, then reorder subjects (rows) reordered, row_idx = [], [] for cl in np.unique(consensus): cl, = np.where(consensus == cl) idx = list(cl[_reorder_rows(assignments[cl])]) reordered += [assignments[idx]] row_idx += idx assignments = np.vstack(reordered) # otherwise, just do a massive reordering of all the rows else: row_idx = list(_reorder_rows(assignments)) assignments = assignments[row_idx] if return_index: return assignments, np.ix_(row_idx, col_idx) return assignments
def agreement_matrix(assignments): """ Compute the agreement matrix from cluster solutions in `assignments`. Parameters ---------- assignments : (N, M) array_like Array of `M` clustering solutions for `N` samples (e.g., subjects, nodes, etc). Values of array should be integer-based cluster assignment labels Returns ------- agreement: (N, N) array_like Agreement matrix where each entry indicates the number of times a pair of samples was assigned to the same cluster. """ assignments = np.asarray(assignments) N, M = assignments.shape agreement = np.zeros((N, N), dtype=np.int32) for k in range(M): labels = assignments[:, k] for lbl in np.unique(labels): nodes = np.where(labels == lbl)[0] agreement[np.ix_(nodes, nodes)] += 1 np.fill_diagonal(agreement, 0) return agreement def consensus_clustering(agreement, threshold, n_it=10): """ Perform consensus clustering on an agreement matrix. This algorithm, first introduced in [1]_, iteratively thresholds the agreement matrix and applies community detection to identify stable clustering partitions. The procedure terminates when a single unique partition is obtained or when no supra-threshold agreement remains. Parameters ---------- agreement : (N, N) array_like Agreement matrix indicating the frequency with which pairs of samples were assigned to the same cluster threshold : float Threshold value used to remove weak agreement entries n_it : int, optional Number of community detection iterations per consensus step. Default: 10 Returns ------- consensus : (N,) numpy.ndarray Consensus cluster labels Notes ----- This algorithm is adapted from `consensus_und` of the Brain Connectivity Toolbox [2]_. References ---------- .. [1] Lancichinetti, A., & Fortunato, S. (2012). Consensus clustering in complex networks. Scientific reports, 2(1), 336. .. [2] Rubinov, M., & Sporns, O. (2010). Complex network measures of brain connectivity: uses and interpretations. Neuroimage, 52(3), 1059-1069. """ n = len(agreement) while True: thresholded_agreement = np.where(agreement >= threshold, agreement, 0) np.fill_diagonal(thresholded_agreement, 0) if np.all(thresholded_agreement == 0): ci_unique = np.arange(n) break else: ci = np.zeros((n, n_it)) for i in range(n_it): ci[:, i], _ = bct.community_louvain(thresholded_agreement, B='modularity') ci_unique = _unique_partitions(ci) if ci_unique.shape[1] > 1: agreement = agreement_matrix(ci) / n_it else: break return np.squeeze(ci_unique + 1)
[docs] def find_consensus(assignments, null_func=np.mean, return_agreement=False, seed=None): """ Find consensus clustering labels from cluster solutions in `assignments`. Parameters ---------- assignments : (N, M) array_like Array of `M` clustering solutions for `N` samples (e.g., subjects, nodes, etc). Values of array should be integer-based cluster assignment labels null_func : callable, optional Function used to generate null model when performing consensus-based clustering. Must accept a 2D array as input and return a single value. Default: :func:`numpy.mean` return_agreement : bool, optional Whether to return the thresholded N x N agreement matrix used in generating the final consensus clustering solution. Default: False seed : {int, np.random.RandomState instance, None}, optional Seed for random number generation. Used when permuting cluster assignments during generation of null model. Default: None Returns ------- consensus : (N,) numpy.ndarray Consensus cluster labels References ---------- Bassett, D. S., Porter, M. A., Wymbs, N. F., Grafton, S. T., Carlson, J. M., & Mucha, P. J. (2013). Robust detection of dynamic community structure in networks. Chaos: An Interdisciplinary Journal of Nonlinear Science, 23(1), 013142. """ rs = check_random_state(seed) samp, comm = assignments.shape # create agreement matrix from input community assignments and convert to # probability matrix by dividing by `comm` agreement = agreement_matrix(assignments) / comm # generate null agreement matrix and use to create threshold null_assign = np.column_stack([rs.permutation(i) for i in assignments.T]) null_agree = agreement_matrix(null_assign) / comm threshold = null_func(null_agree) # run consensus clustering on agreement matrix after thresholding consensus = consensus_clustering(agreement, threshold, n_it=10) if return_agreement: return consensus.astype(int), agreement * (agreement > threshold) return consensus.astype(int)
[docs] def consensus_modularity(adjacency, gamma=1, B='modularity', repeats=250, null_func=np.mean, seed=None): """ Find community assignments from `adjacency` through consensus. Performs `repeats` iterations of community detection on `adjacency` and then uses consensus clustering on the resulting community assignments. Parameters ---------- adjacency : (N, N) array_like Adjacency matrix (weighted/non-weighted) on which to perform consensus community detection. gamma : float, optional Resolution parameter for modularity maximization. Default: 1 B : str or (N, N) array_like, optional Null model to use for consensus clustering. If `str`, must be one of ['modularity', 'potts', 'negative_sym', 'negative_asym']. Default: 'modularity' repeats : int, optional Number of times to repeat Louvain algorithm clustering. Default: 250 null_func : callable, optional Function used to generate null model when performing consensus-based clustering. Must accept a 2D array as input and return a single value. Default: `np.mean` seed : {int, np.random.RandomState instance, None}, optional Seed for random number generation. Default: None Returns ------- consensus : (N,) np.ndarray Consensus-derived community assignments Q_all : array_like Optimized modularity over all `repeats` community assignments zrand_all : array_like z-Rand score over all pairs of `repeats` community assignment vectors References ---------- Bassett, D. S., Porter, M. A., Wymbs, N. F., Grafton, S. T., Carlson, J. M., & Mucha, P. J. (2013). Robust detection of dynamic community structure in networks. Chaos: An Interdisciplinary Journal of Nonlinear Science, 23(1), 013142. """ # generate community partitions `repeat` times comms, Q_all = zip(*[bct.community_louvain(adjacency, gamma=gamma, B=B) for i in range(repeats)]) comms = np.column_stack(comms) # find consensus cluster assignments across all partitoning solutions consensus = find_consensus(comms, null_func=null_func, seed=seed) # get z-rand statistics for partition similarity (n.b. can take a while) zrand_all = _zrand_partitions(comms) return consensus, np.array(Q_all), zrand_all
[docs] def zrand(X, Y): """ Calculate the z-Rand index of two community assignments. Communities are relabeled to consecutive integers for the comparison. Parameters ---------- X, Y : (n, 1) array_like Community assignment vectors to compare Returns ------- z_rand : float Z-rand index References ---------- Amanda L. Traud, Eric D. Kelsic, Peter J. Mucha, and Mason A. Porter. (2011). Comparing Community Structure to Characteristics in Online Collegiate Social Networks. SIAM Review, 53, 526-543. """ X = np.squeeze(X) Y = np.squeeze(Y) if X.ndim > 1 or Y.ndim > 1: raise ValueError('X and Y must have only one-dimension each. ' 'Please check inputs.') if len(X) != len(Y): raise ValueError('X and Y must have the same length.') X = np.unique(X, return_inverse=True)[1] Y = np.unique(Y, return_inverse=True)[1] # Check for undefined single-community case if X.max() == 0 or Y.max() == 0: raise ValueError('X and Y must contain at least 2 distinct communities ' 'each. z-Rand is undefined for single-community ' 'inputs (standard deviation=0).') # Calculate the contigency table + calculate row/column marginals kx = X.max() + 1 ky = Y.max() + 1 nij = np.bincount(X * ky + Y, minlength=kx * ky).reshape(kx, ky) ni = nij.sum(axis=1) nj = nij.sum(axis=0) n = len(X) M = n * (n - 1) // 2 M1 = np.sum(ni * (ni - 1)) / 2 M2 = np.sum(nj * (nj - 1)) / 2 wab = np.sum(nij * (nij - 1)) / 2 mod = n * (n**2 - 3 * n - 2) C1 = mod - 8 * (n + 1) * M1 + 4 * np.sum(ni**3) C2 = mod - 8 * (n + 1) * M2 + 4 * np.sum(nj**3) a = M / 16 b = ((4 * M1 - 2 * M)**2) * ((4 * M2 - 2 * M)**2) / (256 * (M**2)) c = C1 * C2 / (16 * n * (n - 1) * (n - 2)) d = ((((4 * M1 - 2 * M)**2) - (4 * C1) - (4 * M)) * (((4 * M2 - 2 * M)**2) - (4 * C2) - (4 * M)) / (64 * n * (n - 1) * (n - 2) * (n - 3))) sigw2 = a - b + c + d # catch any negatives if sigw2 < 0: return 0 z_rand = (wab - ((M1 * M2) / M)) / np.sqrt(sigw2) return z_rand
def _zrand_partitions(communities): """ Calculate z-Rand for all pairs of assignments in `communities`. Iterates through every pair of community assignment vectors in `communities` and calculates the z-Rand score to assess their similarity. Parameters ---------- communities : (S, R) array_like Community assignments for `S` samples over `R` partitions Returns ------- all_zrand : array_like z-Rand score over all pairs of `R` partitions of community assignments """ n_partitions = communities.shape[-1] all_zrand = np.zeros(int(n_partitions * (n_partitions - 1) / 2)) for c1 in range(n_partitions): for c2 in range(c1 + 1, n_partitions): idx = int((c1 * n_partitions) + c2 - ((c1 + 1) * (c1 + 2) // 2)) all_zrand[idx] = zrand(communities[:, c1], communities[:, c2]) return all_zrand def _unique_partitions(communities): """ Identify unique clustering partitions from a set of solutions. Relabels each partition in `communities` to use consecutive integer labels starting from zero, then removes duplicate partitions across columns. Parameters ---------- communities : (N, M) array_like Array of `M` clustering solutions for `N` samples Returns ------- unique : (N, K) numpy.ndarray Array containing the `K` unique clustering partitions identified from `communities` """ # Relabel partitions to consecutive integers n, m = communities.shape for i in range(m): communities[:, i] = np.unique(communities[:, i], return_inverse=True)[1] # Remove duplicate partitions seen = set() unique_cols = [] for i in range(m): key = tuple(communities[:, i]) if key not in seen: seen.add(key) unique_cols.append(communities[:, i]) return np.column_stack(unique_cols)
[docs] def get_modularity(adjacency, comm, gamma=1): """ Calculate modularity contribution for each community in `comm`. Parameters ---------- adjacency : (N, N) array_like Adjacency (e.g., correlation) matrix comm : (N,) array_like Community assignment vector splitting `N` subjects into `G` groups gamma : float, optional Resolution parameter used in original modularity maximization. Default: 1 Returns ------- comm_q : (G,) ndarray Relative modularity for each community See Also -------- netneurotools.modularity.get_modularity_z netneurotools.modularity.get_modularity_sig """ adjacency, comm = np.asarray(adjacency), np.asarray(comm) s = adjacency.sum() B = adjacency - (gamma * np.outer(adjacency.sum(axis=1), adjacency.sum(axis=0)) / s) # find modularity contribution of each community communities = np.unique(comm) comm_q = np.empty(shape=communities.size) for n, ci in enumerate(communities): inds = comm == ci comm_q[n] = B[np.ix_(inds, inds)].sum() / s return comm_q
[docs] def get_modularity_z(adjacency, comm, gamma=1, n_perm=10000, seed=None): """ Calculate average z-score of community assignments by permutation. Parameters ---------- adjacency : (N, N) array_like Adjacency (correlation) matrix comm : (N,) array_like Community assignment vector splitting `N` subjects into `G` groups gamma : float, optional Resolution parameter used in original modularity maximization. Default: 1 n_perm : int, optional Number of permutations. Default: 10000 seed : {int, np.random.RandomState instance, None}, optional Seed for random number generation. Default: None Returns ------- q_z : float Average Z-score of modularity of communities See Also -------- netneurotools.modularity.get_modularity netneurotools.modularity.get_modularity_sig """ rs = check_random_state(seed) real_qs = get_modularity(adjacency, comm, gamma) simu_qs = np.empty(shape=(np.unique(comm).size, n_perm)) for perm in range(n_perm): simu_qs[:, perm] = get_modularity(adjacency, rs.permutation(comm), gamma) # avoid instances where dist.std(1) == 0 std = simu_qs.std(axis=1) if np.any(std == 0): return np.mean(real_qs - simu_qs.mean(axis=1)) else: return np.mean((real_qs - simu_qs.mean(axis=1)) / std)
[docs] def get_modularity_sig(adjacency, comm, gamma=1, n_perm=10000, alpha=0.01, seed=None): """ Calculate significance of community assignments in `comm` by permutation. Parameters ---------- adjacency : (N, N) array_like Adjacency (correlation) matrix comm : (N,) array_like Community assignment vector gamma : float Resolution parameter used in original modularity maximization n_perm : int, optional Number of permutations to test against. Default: 10000 alpha : (0,1) float, optional Alpha level to assess significance. Default: 0.01 seed : {int, np.random.RandomState instance, None}, optional Seed for random number generation. Default: None Returns ------- ndarray Significance of each community in `comm` (boolean) See Also -------- netneurotools.modularity.get_modularity_z netneurotools.modularity.get_modularity_sig """ rs = check_random_state(seed) real_qs = get_modularity(adjacency, comm, gamma) simu_qs = np.empty(shape=(np.unique(comm).size, n_perm)) for perm in range(n_perm): simu_qs[:, perm] = get_modularity(adjacency, rs.permutation(comm), gamma) q_sig = real_qs > np.percentile(simu_qs, 100 * (1 - alpha), axis=1) return q_sig