Source code for prody.utilities.catchall

"""This module defines miscellaneous utility functions that is public to users."""

import numpy as np

from prody import PY3K
from .misctools import addEnds, interpY, index, isListLike
from .checkers import checkCoords
from .logger import LOGGER


__all__ = ['calcTree', 'clusterMatrix', 'showLines', 'showMatrix', 
           'reorderMatrix', 'findSubgroups', 'getCoords',  
           'getLinkage', 'getTreeFromLinkage', 'clusterSubfamilies']

class LinkageError(Exception):
    pass

[docs]def clusterSubfamilies(similarities, n_clusters=0, linkage='all', method='tsne', cutoff=0.0, **kwargs): """Perform clustering based on members of the *ensemble* projected into lower a reduced dimension. :arg similarities: a matrix of similarities for each structure in the ensemble, such as RMSD-matrix, dynamics-based spectral overlap, sequence similarity :type similarities: :class:`~numpy.ndarray` :arg n_clusters: the number of clusters to generate. If **0**, will scan a range of number of clusters and return the best one based on highest silhouette score. Default is **0**. :type n_clusters: int :arg linkage: if **all**, will test all linkage types (ward, average, complete, single). Otherwise will use only the one(s) given as input. Default is **all**. :type linkage: str, list, tuple, :class:`~numpy.ndarray` :arg method: if set to **spectral**, will generate a Kirchoff matrix based on the cutoff value given and use that as input as clustering instead of the values themselves. Default is **tsne**. :type method: str :arg cutoff: only used if *method* is set to **spectral**. This value is used for generating the Kirchoff matrix to use for generating clusters when doing spectral clustering. Default is **0.0**. :type cutoff: float """ # Import necessary packages try: from sklearn.manifold import SpectralEmbedding from sklearn.cluster import AgglomerativeClustering from sklearn.metrics import silhouette_score from sklearn.manifold import TSNE except ImportError: raise ImportError('need sklearn module') ''' try: import Bio except ImportError: raise ImportError('Phylo module could not be imported. ' 'Reinstall ProDy or install Biopython ' 'to solve the problem.') ''' # Check inputs to make sure are of valid types/values if not isinstance(similarities, np.ndarray): raise TypeError('similarities should be a numpy ndarray') dim = similarities.shape if dim[0] != dim[1]: raise ValueError('similarities must be a square matrix') if n_clusters != 0: if not isinstance(n_clusters, int): raise TypeError('clusters must be an instance of int') if n_clusters < 1: raise ValueError('clusters must be a positive integer') elif n_clusters > similarities.shape[0]: raise ValueError('clusters can\'t be longer than similarities matrix') nclusts = range(n_clusters,n_clusters+1) else: nclusts = range(2,10,1) if linkage != 'all': # Check if given input for linkage is list-like if isListLike(linkage): for val in linkage: if val.lower() not in ['ward', 'average', 'complete', 'single']: raise ValueError('linkage must be one or more of: \'ward\', \'average\', \'complete\', or \'single\'') if len(linkage) > 4: raise ValueError('linkage must be one or more of: \'ward\', \'average\', \'complete\', or \'single\'') linkages = [ x.lower() for x in linkage ] # If not, check if it is a valid string and method name else: if not isinstance(linkage, str): raise TypeError('linkage must be an instance of str or list-like of strs') if linkage not in ['ward', 'average', 'complete', 'single']: raise ValueError('linkage must one or more of: \'ward\', \'average\', \'complete\', or \'single\'') linkages = [linkage] else: linkages = ['ward', 'average', 'complete', 'single'] if method != 'tsne': if not isinstance(method, str): raise TypeError('method must be an instance of str') if method != 'spectral': raise ValueError('method must be either \'tsne\' or \'spectral\'') if not isinstance(cutoff, float): raise TypeError('cutoff must be an instance of float') best_score = -1 best_nclust = 0 best_link = '' best_labels = [] # Scan over range of clusters for x in nclusts: if method == 'tsne': embedding = TSNE(n_components=2) transform = embedding.fit_transform(similarities) else: kirchhoff = np.where(similarities > cutoff, 0, -1) embedding = SpectralEmbedding(n_components=2) transform = embedding.fit_transform(kirchhoff) for link in linkages: clustering = AgglomerativeClustering(linkage=link, n_clusters=x) clustering.fit(transform) silhouette_avg = silhouette_score(transform, clustering.labels_) if silhouette_avg > best_score: best_score = silhouette_avg best_nclust = x best_link = link best_labels = clustering.labels_ return best_labels
def getCoords(data): try: data = (data._getCoords() if hasattr(data, '_getCoords') else data.getCoords()) except AttributeError: try: checkCoords(data) except TypeError: raise TypeError('data must be a Numpy array or an object ' 'with `getCoords` method') return data
[docs]def getLinkage(names, tree): """ Obtain the :func:`~scipy.cluster.hierarchy.linkage` matrix encoding ``tree``. :arg names: a list of names, the order determines the values in the linkage matrix :type names: list, :class:`~numpy.ndarray` :arg tree: tree to be converted :type tree: :class:`~Bio.Phylo.BaseTree.Tree` """ tree_terminals = tree.get_terminals() if len(tree_terminals) != len(names): raise ValueError('inconsistent number of terminals in tree and names') terminals = [None] * len(names) for clade in tree_terminals: i = index(names, clade.name) terminals[i] = clade n = len(terminals) nonterminals = [c for c in reversed(tree.get_nonterminals())] if len(nonterminals) != n-1: raise LinkageError('wrong number of terminal clades') Z = np.zeros((n-1, 4)) root = tree.root def _indexOfClade(clade): if clade.is_terminal(): i = index(terminals, clade) else: i = index(nonterminals, clade) + n return i def _height_of(clade): if clade.is_terminal(): height = 0 else: height = max(_height_of(c) + c.branch_length for c in clade.clades) return height def _dfs(clade): if clade.is_terminal(): return i = _indexOfClade(clade) clade_a = clade.clades[0] clade_b = clade.clades[1] a = _indexOfClade(clade_a) b = _indexOfClade(clade_b) l = min(a, b) r = max(a, b) Z[i-n, 0] = l Z[i-n, 1] = r Z[i-n, 2] = _height_of(clade) * 2. Z[i-n, 3] = clade.count_terminals() _dfs(clade_a) _dfs(clade_b) _dfs(root) return Z
[docs]def getTreeFromLinkage(names, linkage): """ Obtain the tree encoded by ``linkage``. :arg names: a list of names, the order should correspond to the values in linkage :type names: list, :class:`~numpy.ndarray` :arg linkage: linkage matrix :type linkage: :class:`~numpy.ndarray` """ try: import Bio except ImportError: raise ImportError('Phylo module could not be imported. ' 'Reinstall ProDy or install Biopython ' 'to solve the problem.') from Bio.Phylo.BaseTree import Tree, Clade if not isinstance(linkage, np.ndarray): raise TypeError('linkage must be a numpy.ndarray instance') if linkage.ndim != 2: raise LinkageError('linkage must be a 2-dimensional matrix') if linkage.shape[1] != 4: raise LinkageError('linkage must have exactly 4 columns') n_terms = len(names) if linkage.shape[0] != n_terms-1: raise LinkageError('linkage must have exactly len(names)-1 rows') clades = [] heights = [] for name in names: clade = Clade(None, name) clades.append(clade) heights.append(0.) for link in linkage: l = int(link[0]) r = int(link[1]) height = link[2] left = clades[l] right = clades[r] lh = heights[l] rh = heights[r] left.branch_length = height - lh right.branch_length = height - rh clade = Clade(None, None) clade.clades.append(left) clade.clades.append(right) clades.append(clade) heights.append(height) return Tree(clade)
[docs]def calcTree(names, distance_matrix, method='upgma', linkage=False): """ Given a distance matrix, it creates an returns a tree structure. :arg names: a list of names :type names: list, :class:`~numpy.ndarray` :arg distance_matrix: a square matrix with length of ensemble. If numbers does not match *names* it will raise an error :type distance_matrix: :class:`~numpy.ndarray` :arg method: method used for constructing the tree. Acceptable options are ``"upgma"``, ``"nj"``, or methods supported by :func:`~scipy.cluster.hierarchy.linkage` such as ``"single"``, ``"average"``, ``"ward"``, etc. Default is ``"upgma"`` :type method: str :arg linkage: whether the linkage matrix is returned. Note that NJ trees do not support linkage :type linkage: bool """ try: import Bio except ImportError: raise ImportError('Phylo module could not be imported. ' 'Reinstall ProDy or install Biopython ' 'to solve the problem.') from .TreeConstruction import DistanceMatrix, DistanceTreeConstructor if len(names) != distance_matrix.shape[0] or len(names) != distance_matrix.shape[1]: raise ValueError("Mismatch between the sizes of matrix and names.") method = method.lower().strip() if method in ['ward', 'single', 'average', 'weighted', 'centroid', 'median']: from scipy.cluster.hierarchy import linkage as hlinkage from scipy.spatial.distance import squareform Z = hlinkage(squareform(distance_matrix), method=method) tree = getTreeFromLinkage(names, Z) else: matrix = [] k = 1 Z = None for row in distance_matrix: matrix.append(list(row[:k])) k = k + 1 if isinstance(names, np.ndarray): names = names.tolist() dm = DistanceMatrix(names, matrix) constructor = DistanceTreeConstructor() method = method.strip().lower() if method == 'nj': tree = constructor.nj(dm) elif method == 'upgma': tree = constructor.upgma(dm) if linkage: Z = getLinkage(names, tree) else: raise ValueError('Method can be only either "nj", "upgma" or ' 'hierarchical clustering such as "single", "average", etc.') for node in tree.get_nonterminals(): node.name = None if linkage: return tree, Z else: return tree
def writeTree(filename, tree, format_str='newick'): """ Write a tree to file using Biopython. :arg filename: name for output file :type filename: str :arg tree: a square matrix with length of ensemble. If numbers does not match *names* it will raise an error :type tree: :class:`~Bio.Phylo.BaseTree.Tree` :arg format_str: a string specifying the format for the tree :type format_str: str """ try: from Bio import Phylo except ImportError: raise ImportError('Phylo module could not be imported. ' 'Reinstall ProDy or install Biopython ' 'to solve the problem.') if not isinstance(filename, str): raise TypeError('filename should be a string') if not isinstance(tree, Phylo.BaseTree.Tree): raise TypeError('tree should be a Biopython.Phylo Tree object') if not isinstance(format_str, str): raise TypeError('format_str should be a string') Phylo.write(tree, filename, format_str)
[docs]def clusterMatrix(distance_matrix=None, similarity_matrix=None, labels=None, return_linkage=None, **kwargs): """ Cluster a distance matrix using scipy.cluster.hierarchy and return the sorted matrix, indices used for sorting, sorted labels (if **labels** are passed), and linkage matrix (if **return_linkage** is **True**). :arg distance_matrix: an N-by-N matrix containing some measure of distance such as 1. - seqid_matrix (Hamming distance), rmsds, or distances in PCA space :type distance_matrix: :class:`~numpy.ndarray` :arg similarity_matrix: an N-by-N matrix containing some measure of similarity such as sequence identity, mode-mode overlap, or spectral overlap. Each element will be subtracted from 1. to get distance, so make sure this is reasonable. :type similarity_matrix: :class:`~numpy.ndarray` :arg labels: labels for each matrix row that can be returned sorted :type labels: list :arg no_plot: if **True**, don't plot the dendrogram. default is **True** :type no_plot: bool :arg reversed: if set to **True**, then the sorting indices will be reversed. :type reversed: bool Other arguments for :func:`~scipy.hierarchy.linkage` and :func:`~scipy.hierarchy.dendrogram` can also be provided and will be taken as **kwargs**. """ import scipy.cluster.hierarchy as sch from scipy import spatial if similarity_matrix is None and distance_matrix is None: raise ValueError('Please provide a distance matrix or a similarity matrix') orientation = kwargs.pop('orientiation', 'right') reversed = kwargs.pop('reversed', False) no_plot = kwargs.pop('no_plot', True) if distance_matrix is None: matrix = similarity_matrix distance_matrix = 1. - similarity_matrix else: matrix = distance_matrix formatted_distance_matrix = spatial.distance.squareform(distance_matrix) linkage_matrix = sch.linkage(formatted_distance_matrix, **kwargs) sorting_dendrogram = sch.dendrogram(linkage_matrix, orientation=orientation, labels=labels, no_plot=no_plot) indices = sorting_dendrogram['leaves'] sorted_labels = sorting_dendrogram['ivl'] if reversed: indices = indices[::-1] sorted_labels = sorted_labels[::-1] sorted_matrix = matrix[indices, :] sorted_matrix = sorted_matrix[:, indices] return_vals = [sorted_matrix, indices] if labels is not None: return_vals.append(sorted_labels) if return_linkage: return_vals.append(linkage_matrix) return tuple(return_vals) # convert to tuple to avoid [pylint] E0632:Possible unbalanced tuple unpacking
[docs]def showLines(*args, **kwargs): """ Show 1-D data using :func:`~matplotlib.axes.Axes.plot`. :arg x: (optional) x coordinates. *x* can be an 1-D array or a 2-D matrix of column vectors. :type x: :class:`~numpy.ndarray` :arg y: data array. *y* can be an 1-D array or a 2-D matrix of column vectors. :type y: :class:`~numpy.ndarray` :arg dy: an array of variances of *y* which will be plotted as a band along *y*. It should have the same shape with *y*. :type dy: :class:`~numpy.ndarray` :arg lower: an array of lower bounds which will be plotted as a band along *y*. It should have the same shape with *y* and should be paired with *upper*. :type lower: :class:`~numpy.ndarray` :arg upper: an array of upper bounds which will be plotted as a band along *y*. It should have the same shape with *y* and should be paired with *lower*. :type upper: :class:`~numpy.ndarray` :arg alpha: the transparency of the band(s) for plotting *dy*. :type alpha: float :arg beta: the transparency of the band(s) for plotting *miny* and *maxy*. :type beta: float :arg ticklabels: user-defined tick labels for x-axis. :type ticklabels: list """ # note for developers: this function serves as a low-level # plotting function which provides basic utilities for other # plotting functions. Therefore showFigure is not handled # in this function as it should be already handled in the caller. ticklabels = kwargs.pop('ticklabels', None) dy = kwargs.pop('dy', None) miny = kwargs.pop('lower', None) maxy = kwargs.pop('upper', None) alpha = kwargs.pop('alpha', 0.5) beta = kwargs.pop('beta', 0.25) gap = kwargs.pop('gap', False) labels = kwargs.pop('label', None) from matplotlib import cm, ticker from matplotlib.pyplot import figure, gca, xlim from .drawtools import IndexFormatter ax = gca() lines = ax.plot(*args, **kwargs) polys = [] for i, line in enumerate(lines): color = line.get_color() x, y = line.get_data() if gap: x_new, y_new = addEnds(x, y) line.set_data(x_new, y_new) else: x_new, y_new = x, y if labels is not None: if np.isscalar(labels): line.set_label(labels) else: try: line.set_label(labels[i]) except IndexError: raise ValueError('The number of labels ({0}) and that of y ({1}) do not match.' .format(len(labels), len(line))) # the following function needs to be here so that line exists def sub_array(a, i, tag='a'): ndim = 0 if a is not None: if np.isscalar(a[0]): ndim = 1 # a plain list (array) else: ndim = 2 # a nested list (array) else: return None if ndim == 1: _a = a else: try: _a = a[i] except IndexError: raise ValueError('The number of {2} ({0}) and that of y ({1}) do not match.' .format(len(miny), len(line), tag)) if len(_a) != len(y): raise ValueError('The shapes of {2} ({0}) and y ({1}) do not match.' .format(len(_miny), len(y), tag)) return _a if miny is not None and maxy is not None: _miny = sub_array(miny, i) _maxy = sub_array(maxy, i) if gap: _, _miny = addEnds(x, _miny) _, _maxy = addEnds(x, _maxy) poly = ax.fill_between(x_new, _miny, _maxy, alpha=beta, facecolor=color, edgecolor=None, linewidth=1, antialiased=True) polys.append(poly) if dy is not None: _dy = sub_array(dy, i) if gap: _, _dy = addEnds(x, _dy) poly = ax.fill_between(x_new, y_new-_dy, y_new+_dy, alpha=alpha, facecolor=color, edgecolor=None, linewidth=1, antialiased=True) polys.append(poly) ax.margins(x=0) if ticklabels is not None: if callable(ticklabels): ax.get_xaxis().set_major_formatter(ticker.FuncFormatter(ticklabels)) else: ax.get_xaxis().set_major_formatter(IndexFormatter(ticklabels)) ax.xaxis.set_major_locator(ticker.AutoLocator()) ax.xaxis.set_minor_locator(ticker.AutoMinorLocator()) return lines, polys
[docs]def showMatrix(matrix, x_array=None, y_array=None, **kwargs): """Show a matrix using :meth:`~matplotlib.axes.Axes.imshow`. Curves on x- and y-axis can be added. :arg matrix: matrix to be displayed :type matrix: :class:`~numpy.ndarray` :arg x_array: data to be plotted above the matrix :type x_array: :class:`~numpy.ndarray` :arg y_array: data to be plotted on the left side of the matrix :type y_array: :class:`~numpy.ndarray` :arg percentile: a percentile threshold to remove outliers, i.e. only showing data within *p*-th to *100-p*-th percentile :type percentile: float :arg interactive: turn on or off the interactive options :type interactive: bool :arg xtickrotation: how much to rotate the xticklabels in degrees default is 0 :type xtickrotation: float """ from matplotlib import ticker from matplotlib.gridspec import GridSpec from matplotlib.collections import LineCollection from matplotlib.pyplot import gca, sca, sci, colorbar, subplot from .drawtools import drawTree, IndexFormatter p = kwargs.pop('percentile', None) vmin = vmax = None if p is not None: vmin = np.percentile(matrix, p) vmax = np.percentile(matrix, 100-p) vmin = kwargs.pop('vmin', vmin) vmax = kwargs.pop('vmax', vmax) vcenter = kwargs.pop('vcenter', None) norm = kwargs.pop('norm', None) if vcenter is not None and norm is None: if PY3K: try: from matplotlib.colors import DivergingNorm except ImportError: from matplotlib.colors import TwoSlopeNorm as DivergingNorm norm = DivergingNorm(vmin=vmin, vcenter=0., vmax=vmax) else: LOGGER.warn('vcenter cannot be used in Python 2 so norm remains None') lw = kwargs.pop('linewidth', 1) W = H = kwargs.pop('ratio', 6) ticklabels = kwargs.pop('ticklabels', None) xticklabels = kwargs.pop('xticklabels', ticklabels) yticklabels = kwargs.pop('yticklabels', ticklabels) xtickrotation = kwargs.pop('xtickrotation', 0.) show_colorbar = kwargs.pop('colorbar', True) cb_extend = kwargs.pop('cb_extend', 'neither') allticks = kwargs.pop('allticks', False) # this argument is temporary and will be replaced by better implementation interactive = kwargs.pop('interactive', True) cmap = kwargs.pop('cmap', 'jet') origin = kwargs.pop('origin', 'lower') try: from Bio import Phylo except ImportError: raise ImportError('Phylo module could not be imported. ' 'Reinstall ProDy or install Biopython ' 'to solve the problem.') tree_mode_y = isinstance(y_array, Phylo.BaseTree.Tree) tree_mode_x = isinstance(x_array, Phylo.BaseTree.Tree) if x_array is not None and y_array is not None: nrow = 2; ncol = 2 i = 1; j = 1 width_ratios = [1, W] height_ratios = [1, H] aspect = 'auto' elif x_array is not None and y_array is None: nrow = 2; ncol = 1 i = 1; j = 0 width_ratios = [W] height_ratios = [1, H] aspect = 'auto' elif x_array is None and y_array is not None: nrow = 1; ncol = 2 i = 0; j = 1 width_ratios = [1, W] height_ratios = [H] aspect = 'auto' else: nrow = 1; ncol = 1 i = 0; j = 0 width_ratios = [W] height_ratios = [H] aspect = kwargs.pop('aspect', None) main_index = (i, j) upper_index = (i-1, j) left_index = (i, j-1) complex_layout = nrow > 1 or ncol > 1 ax1 = ax2 = ax3 = None if complex_layout: gs = GridSpec(nrow, ncol, width_ratios=width_ratios, height_ratios=height_ratios, hspace=0., wspace=0.) ## draw matrix if complex_layout: ax3 = subplot(gs[main_index]) else: ax3 = gca() im = ax3.imshow(matrix, aspect=aspect, vmin=vmin, vmax=vmax, norm=norm, cmap=cmap, origin=origin, **kwargs) #ax3.set_xlim([-0.5, matrix.shape[0]+0.5]) #ax3.set_ylim([-0.5, matrix.shape[1]+0.5]) if xticklabels is not None: ax3.xaxis.set_major_formatter(IndexFormatter(xticklabels)) if yticklabels is not None and ncol == 1: ax3.yaxis.set_major_formatter(IndexFormatter(yticklabels)) if allticks: ax3.xaxis.set_major_locator(ticker.IndexLocator(offset=0.5, base=1.)) ax3.yaxis.set_major_locator(ticker.IndexLocator(offset=0.5, base=1.)) else: locator = ticker.AutoLocator() locator.set_params(integer=True) minor_locator = ticker.AutoMinorLocator() ax3.xaxis.set_major_locator(locator) ax3.xaxis.set_minor_locator(minor_locator) locator = ticker.AutoLocator() locator.set_params(integer=True) minor_locator = ticker.AutoMinorLocator() ax3.yaxis.set_major_locator(locator) ax3.yaxis.set_minor_locator(minor_locator) if ncol > 1: ax3.yaxis.set_major_formatter(ticker.NullFormatter()) ## draw x_ and y_array lines = [] if nrow > 1: ax1 = subplot(gs[upper_index]) if tree_mode_x: Y, X = drawTree(x_array, label_func=None, orientation='vertical', inverted=True) miny = min(Y.values()) maxy = max(Y.values()) minx = min(X.values()) maxx = max(X.values()) ax1.set_xlim(minx-.5, maxx+.5) ax1.set_ylim(miny, 1.05*maxy) else: ax1.set_xticklabels([]) y = x_array xp, yp = interpY(y) points = np.array([xp, yp]).T.reshape(-1, 1, 2) segments = np.concatenate([points[:-1], points[1:]], axis=1) lcy = LineCollection(segments, array=yp, linewidths=lw, cmap=cmap) lines.append(lcy) ax1.add_collection(lcy) ax1.set_xlim(xp.min()-.5, xp.max()+.5) ax1.set_ylim(yp.min(), yp.max()) if ax3.xaxis_inverted(): ax2.invert_xaxis() ax1.axis('off') if ncol > 1: ax2 = subplot(gs[left_index]) if tree_mode_y: X, Y = drawTree(y_array, label_func=None, inverted=True) miny = min(Y.values()) maxy = max(Y.values()) minx = min(X.values()) maxx = max(X.values()) ax2.set_ylim(miny-.5, maxy+.5) ax2.set_xlim(minx, 1.05*maxx) else: ax2.set_xticklabels([]) y = y_array xp, yp = interpY(y) points = np.array([yp, xp]).T.reshape(-1, 1, 2) segments = np.concatenate([points[:-1], points[1:]], axis=1) lcx = LineCollection(segments, array=yp, linewidths=lw, cmap=cmap) lines.append(lcx) ax2.add_collection(lcx) ax2.set_xlim(yp.min(), yp.max()) ax2.set_ylim(xp.min()-.5, xp.max()+.5) ax2.invert_xaxis() if ax3.yaxis_inverted(): ax2.invert_yaxis() ax2.axis('off') ## draw colorbar sca(ax3) cb = None if show_colorbar: if nrow > 1: axes = [ax1, ax2, ax3] while None in axes: axes.remove(None) s = H / (H + 1.) cb = colorbar(mappable=im, ax=axes, anchor=(0, 0), shrink=s, extend=cb_extend) else: cb = colorbar(mappable=im, extend=cb_extend) sca(ax3) sci(im) if interactive: from prody.utilities import ImageCursor from matplotlib.pyplot import connect cursor = ImageCursor(ax3, im) connect('button_press_event', cursor.onClick) ax3.tick_params(axis='x', rotation=xtickrotation) return im, lines, cb
[docs]def reorderMatrix(names, matrix, tree, axis=None): """ Reorder a matrix based on a tree and return the reordered matrix and indices for reordering other things. :arg names: a list of names associated with the rows of the matrix These names must match the ones used to generate the tree :type names: list :arg matrix: any square matrix :type matrix: :class:`~numpy.ndarray` :arg tree: any tree from :func:`calcTree` :type tree: :class:`~Bio.Phylo.BaseTree.Tree` :arg axis: along which axis the matrix should be reordered. Default is **None** which reorder along all the axes :type axis: int """ try: from Bio import Phylo except ImportError: raise ImportError('Phylo module could not be imported. ' 'Reinstall ProDy or install Biopython ' 'to solve the problem.') try: if matrix.ndim != 2: raise ValueError('matrix should be a 2D matrix.') except AttributeError: raise TypeError('matrix should be a numpy array.') if np.shape(matrix)[0] != np.shape(matrix)[1]: raise ValueError('matrix should be a square matrix') names = np.asarray(names) if np.isscalar(names): raise TypeError('names should be list-like') if not len(names): raise TypeError('names is empty') if not isinstance(tree, Phylo.BaseTree.Tree): raise TypeError('tree should be a BioPython Tree') if len(names) != len(matrix): raise ValueError('names should have entries for each matrix row/column') terminals = tree.get_terminals() if len(names) != len(terminals): raise ValueError('names should have entries for each tree terminal') if len(terminals) != len(matrix): raise ValueError('matrix should have a row for each tree terminal') indices = [] for terminal in terminals: name = terminal.name locs = np.where(names == name)[0] if not len(locs): raise ValueError('inconsistent names and tree: %s not in names'%name) if len(locs) > 1: raise ValueError('inconsistent names and tree: duplicate name %s in names'%name) indices.append(locs[0]) if axis is not None: I = [np.arange(s) for s in matrix.shape] axes = [axis] if np.isscalar(axis) else axis for ax in axes: I[ax] = indices else: I = [indices] * matrix.ndim rmatrix = matrix[np.ix_(*I)] return rmatrix, indices
[docs]def findSubgroups(tree, c, method='naive', **kwargs): """ Divide **tree** into subgroups using a criterion **method** and a cutoff **c**. Returns a list of lists with labels divided into subgroups. """ method = method.lower().strip() terminals = tree.get_terminals() names = [clade.name for clade in terminals] Z = None if method != 'naive': try: Z = getLinkage(names, tree) except LinkageError: print('Failed to build linkage; fall back to naive criterion') method = 'naive' if method == 'naive': subgroups = [[names[0]]] for i in range(len(terminals)-1): curr_clade = terminals[i] next_clade = terminals[i + 1] d = tree.distance(curr_clade, next_clade) if d > c: subgroups.append([]) subgroups[-1].append(next_clade.name) else: from scipy.cluster.hierarchy import fcluster T = fcluster(Z, c, criterion=method, **kwargs) labels = np.unique(T) subgroups = [[] for _ in range(len(labels))] for i, t in enumerate(T): subgroups[t-1].append(names[i]) return subgroups