Release Notes
v2.0 series come with new and improved sequence, structure, and dynamics analysis features. See release notes for details.
How to Cite
Bakan A, Meireles LM, Bahar I ProDy: Protein Dynamics Inferred from Theory and Experiments
Bioinformatics 2011 27(11):1575-1577.
Bakan A, Dutta A, Mao W, Liu Y, Chennubhotla C, Lezon TR, Bahar I Evol and ProDy for Bridging Protein Sequence Evolution and Structural Dynamics
Bioinformatics 2014 30(18):2681-2683.
Source code for prody.utilities.misctools

"""This module defines miscellaneous utility functions."""
import re

from numpy import unique, linalg, diag, sqrt, dot, chararray, divide, zeros_like, zeros, allclose, ceil, abs
from numpy import diff, where, insert, nan, isnan, loadtxt, array, round, average, min, max, delete, vstack
from numpy import sign, arange, asarray, ndarray, subtract, power, sum, isscalar, empty, triu, tril, median
from numpy import alltrue
from collections import Counter
import numbers

from prody import PY3K
from .logger import LOGGER

from Bio.Data import IUPACData

from xml.etree.ElementTree import Element

__all__ = ['Everything', 'Cursor', 'ImageCursor', 'rangeString', 'alnum', 'importLA', 'dictElement',
           'intorfloat', 'startswith', 'showFigure', 'countBytes', 'sqrtm',
           'saxsWater', 'count', 'addEnds', 'copy', 'dictElementLoop', 'index',
           'getDataPath', 'openData', 'chr2', 'toChararray', 'interpY', 'cmp', 'pystr',
           'getValue', 'indentElement', 'isPDB', 'isURL', 'isListLike', 'isSymmetric', 'makeSymmetric',
           'getDistance', 'fastin', 'createStringIO', 'div0', 'wmean', 'bin2dec', 'wrapModes', 
           'fixArraySize', 'decToHybrid36', 'hybrid36ToDec', 'DTYPE', 'checkIdentifiers', 'split', 'mad']

DTYPE = array(['a']).dtype.char  # 'S' for PY2K and 'U' for PY3K
CURSORS = []

# Note that the chain id can be blank (space). Examples:
# 3TT1, 3tt1A, 3tt1:A, 3tt1_A, 3tt1-A, 3tt1 A
isPDB = re.compile('^[A-Za-z0-9]{4}[ -_:]{,1}[A-Za-z0-9 ]{,1}$').match

# django url validation regex
isURL = re.compile(
        r'^(?:http|ftp)s?://' # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
        r'localhost|' #localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
        r'(?::\d+)?' # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE).match

[docs]class Everything(object):
    """A place for everything."""

    def __contains__(self, what):

        return True

class Cursor(object):
    def __init__(self, ax):
        import matplotlib.patheffects as PathEffects

        self.ax = ax
        self.lx = ax.axhline(color='k', linestyle='--', linewidth=0.)  # the horiz line
        self.ly = ax.axvline(color='k', linestyle='--', linewidth=0.)  # the vert line

        # text location in axes coords
        self.txt = ax.text(0., 1., '', color='k', verticalalignment='bottom')
        self.txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='w')])
        
        # preserve the cursor reference
        global CURSORS
        CURSORS.append(self)

    def onClick(self, event):
        from matplotlib.pyplot import draw

        if event.inaxes != self.ax:
            return

        if event.button == 1:
            self.show(event)
        elif event.button == 3:
            self.clear(event)

        draw()

    def show(self, event):
        x, y = event.xdata, event.ydata
        # update the line positions
        self.lx.set_ydata(y)
        self.ly.set_xdata(x)

        self.lx.set_linewidth(.75)
        self.ly.set_linewidth(.75)

        self.txt.set_text(' x=%1.2f, y=%1.2f' % (x, y))
        self.txt.set_position((x, y))

    def clear(self, event):
        self.lx.set_linewidth(0.)
        self.ly.set_linewidth(0.)

        self.txt.set_text('')

class ImageCursor(Cursor):
    def __init__(self, ax, image, atoms=None):
        super(ImageCursor, self).__init__(ax)
        self.image = image
        self.atoms = atoms
    
    def show(self, event):
        x, y = event.xdata, event.ydata
        # update the line positions
        self.lx.set_ydata(y)
        self.ly.set_xdata(x)

        self.lx.set_linewidth(1.)
        self.ly.set_linewidth(1.)

        data = self.get_cursor_data(event)
        if data is None:
            return
            
        i, j, v = data

        if v > 1e-4 and v < 1e4:
            template = ' x=%s, y=%s [%s]'
        else:
            template = ' x=%s, y=%s [%s]'
        if self.atoms is None:
            self.txt.set_text(template % (j, i, v)) 
        else:
            seq = self.atoms.getSequence()
            resnums = self.atoms.getResnums()

            a = seq[j] + str(resnums[j])
            b = seq[i] + str(resnums[i])
            self.txt.set_text(template % (a, b, str(v)))
        self.txt.set_position((x, y))

    def get_cursor_data(self, event):
        """Get the cursor data for a given event"""
        from matplotlib.transforms import Bbox, BboxTransform

        aximg = self.image
        xmin, xmax, ymin, ymax = aximg.get_extent()
        if aximg.origin == 'upper':
            ymin, ymax = ymax, ymin

        arr = aximg.get_array()
        data_extent = Bbox([[ymin, xmin], [ymax, xmax]])
        array_extent = Bbox([[0, 0], arr.shape[:2]])
        trans = BboxTransform(boxin=data_extent, boxout=array_extent)
        y, x = event.ydata, event.xdata
        point = trans.transform_point([y, x])
        if any(isnan(point)):
            return None
        i, j = point.astype(int)
        # Clip the coordinates at array bounds
        if not (0 <= i < arr.shape[0]) or not (0 <= j < arr.shape[1]):
            return None
        else:
            return i, j, arr[i, j]

[docs]def rangeString(lint, sep=' ', rng=' to ', exc=False, pos=True):
    """Returns a structured string for a given list of integers.

    :arg lint: integer list or array
    :arg sep: range or number separator
    :arg rng: range symbol
    :arg exc: set **True** if range symbol is exclusive
    :arg pos: only consider zero and positive integers

    .. ipython:: python

       from prody.utilities import rangeString
       lint = [1, 2, 3, 4, 10, 15, 16, 17]
       rangeString(lint)
       rangeString(lint, sep=',', rng='-')
       rangeString(lint, ',', ':', exc=True)"""

    ints = unique(lint)
    if len(ints) == 0:
        return ''
    if pos and ints[0] < 0:
        ints = ints[ints > -1]

    prev = ints[0]
    lint = [[prev]]
    for i in ints[1:]:
        if i - prev > 1:
            lint.append([i])
        else:
            lint[-1].append(i)
        prev = i
    exc = int(exc)
    return sep.join([str(l[0]) if len(l) == 1 else
                     str(l[0]) + rng + str(l[-1] + exc) for l in lint])


[docs]def alnum(string, alt='_', trim=False, single=False):
    """Replace non alpha numeric characters with *alt*.  If *trim* is **True**
    remove preceding and trailing *arg* characters.  If *single* is **True**,
    contain only a single joining *alt* character. """

    result = ''
    multi = not bool(single)
    prev = None
    for char in string:
        if char.isalnum():
            result += char
            prev = char
        else:
            if multi or prev != alt:
                result += alt
            prev = alt
    trim = int(bool(trim))
    result = result[trim * (result[0] == alt):
                    len(result) - trim * (result[-1] == alt)]
    return result


[docs]def importLA():
    """Returns one of :mod:`scipy.linalg` or :mod:`numpy.linalg`."""

    try:
        import scipy.linalg as linalg
    except ImportError:
        try:
            import numpy.linalg as linalg
        except:
            raise ImportError('scipy.linalg or numpy.linalg is required for '
                              'NMA and structure alignment calculations')
    return linalg

def createStringIO():
    if PY3K:
        from io import StringIO
    else:
        from StringIO import StringIO
    return StringIO()

[docs]def dictElement(element, prefix=None, number_multiples=False):
    """Returns a dictionary built from the children of *element*, which must be
    a :class:`xml.etree.ElementTree.Element` instance. Keys of the dictionary
    are *tag* of children without the *prefix*, or namespace. Values depend on
    the content of the child. If a child does not have any children, its text
    attribute is the value. If a child has children, then the child is the
    value.
    """
    
    dict_ = {}
    length = False
    if isinstance(prefix, str):
        length = len(prefix)

    prev_tag = ''
    for child in element:
        tag = child.tag

        if length and tag.startswith(prefix):
            tag = tag[length:]

        if tag != prev_tag:
            prev_tag = tag
            i = 0
        else:
            i += 1

        if number_multiples:
            tag = tag + '{:>4}'.format(str(i))
            
        if len(child) == 0:
            if child.text is None:
                dict_[tag] = child.items()
            else:
                dict_[tag] = child.text
        else:
            dict_[tag] = child

    return dict_

def dictElementLoop(dict_, keys=None, prefix=None, number_multiples=False):

    if isinstance(keys, str):
        keys = [keys]

    if not keys:
        keys = dict_.keys()

    for orig_key in keys:
        item = dict_[orig_key]
        if isinstance(item, Element):
            dict2 = dictElement(dict_[orig_key], prefix, number_multiples)
            finished = False
            while not finished:
                dict3 = dict2.copy()
                try:
                    key = dict2.keys()[0]
                    dict2[key] = dictElement(dict2[key], prefix, number_multiples)
                except:
                    finished = True
                else:
                    dict2 = dict3
                    for key in dict2.keys():
                        dict2[key] = dictElement(dict2[key], prefix, number_multiples)

            dict_[orig_key] = dict2

    return dict_

[docs]def intorfloat(x):
    """Returns ``int(x)``, or ``float(x)`` upon :exc:`ValueError`."""

    try:
        return int(x)
    except ValueError:
        return float(x)


[docs]def startswith(this, that):
    """Returns **True** if *this* or *that* starts with the other."""

    if len(this) < len(that):
        return that.startswith(this)
    else:
        return this.startswith(that)


[docs]def showFigure():
    """Call :func:`~matplotlib.pyplot.show` function with ``block=False``
    argument to avoid blocking behavior in non-interactive sessions.  If
    *block* keyword argument is not recognized, try again without it."""

    from matplotlib.pyplot import show
    try:
        show(block=False)
    except TypeError:
        show()


[docs]def countBytes(arrays, base=False):
    """Returns total number of bytes consumed by elements of arrays.  If
    *base* is **True**, use number of bytes from the base array."""

    if base:
        getbase = lambda arr: arr if arr.base is None else getbase(arr.base)
        nbytes = lambda arr: getbase(arr).nbytes
    else:
        nbytes = lambda arr: arr.nbytes

    return sum(nbytes(arr) for arr in arrays)

[docs]def sqrtm(matrix):
    """Returns the square root of a matrix."""
    (U,S,VT)=linalg.svd(matrix)
    D = diag(sqrt(S))
    return dot(dot(U,D),VT)

def getMasses(elements):
    """Gets the mass atom. """
    
    # mass_dict = {'C':12,'N':14,'S':32,'O':16,'H':1}
    mass_dict = IUPACData.atom_weights

    if isinstance(elements, str):
        return mass_dict[elements.capitalize()]
    else:
        masses = zeros(len(elements))
        for i,element in enumerate(elements):
            if element.capitalize() in mass_dict:
                masses[i] = mass_dict[element.capitalize()]
            else:
                masses[i] = 0.
        return masses

def count(L, a=None):
    return len([b for b in L if b is a])

[docs]def addEnds(x, y, axis=0):
    """Finds breaks in *x*, extends them by one position and adds **nan** at the 
    corresponding position in *y*. *x* needs to be an 1-D array, *y* can be a 
    matrix of column (or row) vectors"""

    d = diff(x)
    counter = Counter(d)
    step = counter.most_common(1)[0][0]

    breaks = where(d != step)[0]
    for b in reversed(breaks):
        x = insert(x, b+1, x[b]+step)
        y = insert(y, b+1, nan, axis=axis)

    return x, y

def copy(x):
    if x is None:
        return None
    elif isinstance(x, list):
        x = list(x)
    else:
        try:
            x = x.copy()
        except AttributeError:
            from copy import copy as shallow_copy
            
            x = shallow_copy(x)
    return x

def pystr(a):
    b = a
    if PY3K:
        if hasattr(a, 'decode'):
            b = a.decode() 
    else:
        if hasattr(a, 'encode'):
            b = a.encode() 
    return b

def getDataPath(filename):
    import pkg_resources
    return pkg_resources.resource_filename('prody.utilities', 'datafiles/%s'%filename)

def openData(filename, mode='r'):
    return open(getDataPath(filename), mode)

def saxsWater():
    filename = getDataPath('saxs_water.dat')
    return loadtxt(filename, delimiter=',')

def chr2(a):
    try:
        c = chr(a)
    except TypeError:
        c = str(a)
    return c

def toChararray(arr, aligned=False):
    arr = array(arr, dtype='|S')
    try:
        ndim, dtype_, shape = arr.ndim, arr.dtype, arr.shape
    except AttributeError:
        raise TypeError('arr is not a Numpy array')

    if ndim < 1:
        raise ValueError('arr.ndim should be at least 1')
    if dtype_.char != 'S':
        raise ValueError('arr must be a character array')

    if ndim != 2:
        n_seq = shape[0]
        l_seq = dtype_.itemsize
        new_arr = chararray((n_seq, l_seq))
        for i, s in enumerate(arr):
            for j in range(l_seq):
                if j < len(s):
                    new_arr[i, j] = chr2(s[j])
                else:
                    if aligned:
                        raise ValueError('arr does not the same lengths')
                    new_arr[i, j] = '.'
    else:
        new_arr = array(arr, dtype='|S1')
    return new_arr

def interpY(Y):
    Y = asarray(Y, dtype=float)
    n = len(Y)
    X = arange(n)

    dy = (Y.max() - Y.min()) / n

    Xp = [X[0]]; Yp = [Y[0]]
    for i in range(n-1):
        y1, y2 = Y[i], Y[i+1]
        x1, x2 = X[i], X[i+1]
        if abs(y2 - y1) > dy:
            sdy = sign(y2 - y1)*dy
            yp = arange(Y[i]+sdy, Y[i+1], sdy)
            xp = (yp - y1)/(y2 - y1)*(x2 - x1) + x1
            Xp.extend(xp)
            Yp.extend(yp)

        Xp.append(x2)
        Yp.append(y2)
    return array(Xp), array(Yp)

def cmp(a, b):
    return (a > b) - (a < b)

def getValue(dict_, attr, default=None):
    value = default
    if attr in dict_:
        value = dict_[attr]
        if default is not None:
            try:
                if value.ndim == 0:
                    value = type(default)(value)
            except:
                pass
    return value

def indentElement(elem, level=0):
    i = "\n" + level*"  "
    j = "\n" + (level-1)*"  "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + "  "
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
        for subelem in elem:
            indentElement(subelem, level+1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = j
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = j
    return elem 

def isListLike(a):
    return isinstance(a, (list, tuple, ndarray))

def getDistance(coords1, coords2, unitcell=None):

    diff = coords1 - coords2
    if unitcell is not None:
        diff = subtract(diff, round(diff/unitcell)*unitcell, diff)
    return sqrt(power(diff, 2, diff).sum(axis=-1))

def fastin(a, B):
    for b in reversed(B):
        if a is b:
            return True
    return False

[docs]def div0(a, b, defval=0.):
    """ Performs ``true_divide`` but ignores the error when division by zero 
    (result is set to zero instead). """

    from numpy import errstate, true_divide, isfinite, isscalar
    
    with errstate(divide='ignore', invalid='ignore'):
        c = true_divide(a, b)
        if isscalar(c):
            if not isfinite(c):
                c = defval
        else:
            c[~isfinite(c)] = defval  # -inf inf NaN
    return c

[docs]def wmean(array, weights, axis=None):
    """Calculates the weighted average of *array* given *axis*."""

    try:
        avg = average(array, axis=axis, weights=weights)
    except ZeroDivisionError:
        numer = sum(array*weights, axis=axis)
        denom = sum(weights, axis=axis)
        avg = div0(numer, denom, nan)
    return avg

[docs]def bin2dec(x):
    """Converts the binary array to decimal."""

    y = 0
    for i,j in enumerate(x):
        if j: y += 1<<i
    return y


def wrapModes(modes):
    if hasattr(modes, 'getArray'):
        try:
            modes = [mode for mode in modes]
        except TypeError:
            modes = [modes]
    else:
        if isscalar(modes[0]):
            modes = [modes]
    return modes

[docs]def fixArraySize(arr, sizes, value=0):
    """Makes sure that **arr** is of **sizes**. If not, pad with **value**."""

    if not isinstance(arr, ndarray):
        raise TypeError('arr has to be a numpy ndarray')

    if not isinstance(sizes, tuple):
        raise TypeError('sizes has to be a tuple')

    shapes = arr.shape
    if shapes == sizes:
        return arr

    arr2 = empty(sizes, dtype=arr.dtype)
    arr2.fill(value)

    common_sizes = [min((a, b)) for a, b in zip(sizes, shapes)]
    slices = tuple(slice(s) for s in common_sizes)
    arr2[slices] = arr[slices]

    return arr2

[docs]def isSymmetric(M, rtol=1e-05, atol=1e-08):
    """Checks if the matrix is symmetric."""
    if M.shape != M.T.shape:
        return False
    return allclose(M, M.T, rtol=rtol, atol=atol)

[docs]def makeSymmetric(M):
    """Makes sure the matrix is symmetric."""
    
    if isSymmetric(M):
        return M

    # make square 
    n, m = M.shape
    l = max((n, m))
    M = fixArraySize(M, (l, l))

    # determine which part of the matrix has values
    U = triu(M, k=1)
    L = tril(M, k=-1)

    if U.sum() == 0:
        M += L.T
    elif L.sum() == 0:
        M += U.T
    else:
        M = (M + M.T) / 2.
    return M

def index(A, a):
    if isinstance(A, list):
        return A.index(a)
    else:
        A = asarray(A)
        return where(A==a)[0][0]


[docs]def checkIdentifiers(*pdb, **kwargs):
    """Check whether *pdb* identifiers are valid, and replace invalid ones
    with **None** in place."""
    format = kwargs.get('format', 'pdb')
    identifiers = []
    append = identifiers.append
    for pid in pdb:
        try:
            pid = pid.strip().lower()
        except AttributeError:
            LOGGER.warn('{0} is not a valid identifier.'.format(repr(pid)))
            append(None)
        else:
            if format != 'emd' and not (len(pid) == 4 and pid.isalnum()):
                LOGGER.warn('{0} is not a valid identifier.'
                            .format(repr(pid)))
                append(None)
            elif format == 'emd' and not (len(pid) in [4, 5] and pid.isalnum()):
                LOGGER.warn('{0} is not a valid identifier.'
                            .format(repr(pid)))
                append(None)
            else:
                append(pid)
    return identifiers

def decToBase36(integer):
    """Converts a decimal number to base 36.
    Based on https://wikivisually.com/wiki/Base36
    """

    chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    
    sign = '-' if integer < 0 else ''
    integer = abs(integer)
    result = ''
    
    while integer > 0:
        integer, remainder = divmod(integer, 36)
        result = chars[remainder]+result

    if result == '':
        result = '0'

    return sign+result


[docs]def decToHybrid36(x, resnum=False):
    """Convert a regular decimal number to a string in hybrid36 format"""
    if not isinstance(x, numbers.Integral):
        raise TypeError('x should be an integer')

    if not resnum:
        if x < 100000:
            return str(x)

        start = 10*36**4 # decToBase36(start) = A0000
        return decToBase36(int(x) + (start - 100000))
    else:
        if x < 10000:
            return str(x)

        start = 10*36**3 # decToBase36(start) = A000
        return decToBase36(int(x) + (start - 10000))        


def base36ToDec(x):
    chars = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if x[0] == '-':
        sign = '-'
        x = x[1:]
    else:
        sign = ''

    result = 0
    for i, entry in enumerate(reversed(x)):
        result += (chars.find(entry)*36**(i))

    return int(sign + str(result))


[docs]def hybrid36ToDec(x, resnum=False):
    """Convert string in hybrid36 format to a regular decimal number"""
    if not isinstance(x, str):
        raise TypeError('x should be a string')
    
    isnumeric = alltrue([y.isdigit() for y in x])
    if isnumeric:
        return int(x)

    if not resnum:
        start = 10*36**4 # decToBase36(start) = A0000
        return base36ToDec(x) - start + 100000
    else:
        start = 10*36**3 # decToBase36(start) = A000
        return base36ToDec(x) - start + 10000        


def split(string, shlex=False):
    if shlex:
        try:
            import shlex
        except ImportError:
            raise ImportError('Use of the shlex option requires the '
                              'installation of the shlex package.')
        else:
            return shlex.split(string)
    else:
        return string.split()


def packmolRenumChains(ag):
    chainids = ag.getChids()
    new_chainids = zeros(chainids.shape[0], dtype=DTYPE + '2')
    chid_mem = []
    chid_alt = 0

    num_chars = int(chainids.dtype.str[2:])
    for j, chid in enumerate(chainids):
        if j == 0:
            chid_mem.append(chid)
        else:
            if chid != chainids[j-1]:
                chid_alt += 1

                if chid_alt >= 10**num_chars:
                    num_chars += 1
                    new_chainids = array(new_chainids, dtype=new_chainids.dtype.str[:2]+str(num_chars))

                if chid_alt > 1 and len(chid_mem) > 1 and chid == chid_mem[-2]:
                    chid_mem.append(chid)

        new_chainids[j] = "{}".format(chid_alt)

    ag.setChids(new_chainids)
    return ag

def mad(x):
    try:
        from scipy.stats import median_absolute_deviation as _mad
    except ImportError:
        try:
            from scipy.stats import median_abs_deviation as _mad
        except ImportError:
            def _mad(x):
                med = median(x)
                return median(abs(x - med))
    
    return _mad(x)