Source code for prody.apps.evol_apps.evol_rankorder

"""Refine MSA application."""

__author__ = 'Ahmet Bakan, Anindita Dutta'

from ..apptools import DevelApp
import numpy as np

__all__ = ['evol_rankorder']

APP = DevelApp('rankorder', 'identify highly coevolving pairs of residues')

APP.setExample(
"""This application identifies that top ranking pairs of residues that \
coevolve based on their mutual information.  By default coevolution is \
reported for pairs that are at least 3 residues apart in sequence. A z-score \
normalization can be applied to the mutinfo matrix to identify coevolving \
pairs.  The following examples show how to use with default as well as \
additional options:

    $ evol rankorder piwi_refined_mutinfo.txt -z

    $ evol rankorder piwi_refined_mutinfo.txt --msa piwi_refined.slx \
--label AGO6_ARATH""", [])


APP.addArgument('mutinfo',
    help='mutual information matrix')

APP.addGroup('input', 'input options')
APP.addArgument('-z', '--zscore',
    dest='zscore',
    action='store_true',
    help='apply zscore for identifying top ranked coevolving pairs',
    group='input'
    )
APP.addArgument('-d', '--delimiter',
    dest='delimiter',
    help='delimiter used in mutual information matrix file',
    type=str,
    metavar='STR',
    default=None,
    group='input'
    )
APP.addArgument('-p', '--pdb',
    dest='pdb',
    help='PDB file that contains same number of residues as the mutual '
    'information matrix, output residue numbers will be based on PDB file',
    default=None,
    type=str,
    metavar='STR',
    group='input'
    )
APP.addArgument('-m', '--msa',
    dest='msa',
    help='MSA file used for building the mutual info matrix, '
    'output residue numbers will be based on the most complete sequence '
    'in MSA if a PDB file or sequence label is not specified',
    default=None,
    type=str,
    metavar='STR',
    group='input'
    )
APP.addArgument('-l', '--label',
    dest='label',
    help='label in MSA file for output residue numbers',
    default=None,
    type=str,
    metavar='STR',
    group='input'
    )
APP.addGroup('output', 'output options')
APP.addArgument('-n', '--num-pairs',
    dest='numpairs',
    help='number of top ranking residue pairs to list',
    default=100,
    type=int,
    metavar='INT',
    group='output'
    )
APP.addArgument('-q', '--seq-sep',
    dest='seqsep',
    help='report coevolution for residue pairs that are sequentially '
    'separated by input value',
    default=3,
    type=int,
    metavar='INT',
    group='output'
    )
APP.addArgument('-t', '--min-dist',
    dest='dist',
    help='report coevolution for residue pairs whose CA atoms are spatially '
    'separated by at least the input value, used when a PDB file is given '
    'and --use-dist is true',
    default=10.0,
    type=float,
    metavar='FLOAT',
    group='output'
    )
APP.addArgument('-u', '--use-dist',
    dest='usedist',
    action='store_true',
    help='use structural separation to report coevolving pairs',
    group='output'
    )
APP.addArgument('-o', '--outname',
    dest='outname',
    help='output filename, default is mutinfo_rankorder.txt',
    type=str,
    metavar='STR',
    group='output'
    )

def calcAllDist(coordset):

    from prody import calcDistance
    shape = coordset.shape
    distance = np.zeros((shape[1], shape[1]))

    for i in range(shape[1]):
        temp = np.tile(coordset[0,i,:], (1, shape[1], 1))
        distance[:,i] = calcDistance(coordset, temp)

    return distance

[docs]def evol_rankorder(mutinfo, **kwargs): from prody import parseMSA, LOGGER, PY3K from prody import parsePDB, calcMSAOccupancy, trimAtomsUsingMSA from prody.utilities import openFile, splitSeqLabel from os.path import splitext delimiter = kwargs.get('delimiter') mi = np.loadtxt(str(mutinfo), delimiter=delimiter) ndim, shape = mi.ndim, mi.shape if ndim != 2 or shape[0] != shape[1]: raise ValueError('mutinfo must contain a square matrix') msa, label, msaflag = kwargs.get('msa'), kwargs.get('label'), False pdb, pdbflag = kwargs.get('pdb'), False resnum = None if msa is not None: msa = parseMSA(msa) if msa.numResidues() != shape[0]: LOGGER.info('Input MSA and mutinfo do not have similar no ' 'of residues, ignoring MSA') else: index = msa.getIndex(label) try: if index is None: if label is not None: LOGGER.info('Could not find given label in MSA, ' 'using complete sequence from MSA') occ = calcMSAOccupancy(msa._msa, 'row') index = np.where(occ == occ.max())[0][0] label, start, end = splitSeqLabel(msa[index].getLabel(True)) else: label, start, end = splitSeqLabel(msa[index].getLabel(True)) except: LOGGER.info('Could not extract resnums from MSA') else: msaflag = True if pdb is not None: from prody import parsePDB try: pdb = parsePDB(pdb) except: LOGGER.info('Could not parse PDB, ignoring PDB input') else: chains = list(pdb.iterChains()) for chain in chains: sel = chain.select('protein and name CA') if sel.numAtoms() == shape[0]: resnum = sel.getResnums() coordset = sel.getCoordsets() distance = calcAllDist(coordset) pdbflag = True label = pdb.getTitle() LOGGER.info('Residue numbers will be based on pdb: ' '{0}'.format(pdb.getTitle())) break else: try: sel = trimAtomsUsingMSA(sel, msa, chain=chain.getChid()) if sel.numAtoms() == shape[0]: resnum = sel.getResnums() coordset = sel.getCoordsets() distance = calcAllDist(coordset) pdbflag = True label = pdb.getTitle() LOGGER.info('Residue numbers will be based on pdb: ' '{0}'.format(pdb.getTitle())) break except: LOGGER.info('Number of residues in PDB does not match ' 'mutinfo matrix and no MSA was provided to ' 'align the PDB against, so ignoring PDB input') if not pdbflag: if msaflag: if (start and end is not None) and (start < end): resnum = np.arange(start, end+1) if len(resnum) != shape[0]: LOGGER.info('Label: {0}/{1}-{2} and mutinfo do ' 'not have similar no of residues, using ' 'serial indexing'.format(label, start, end)) label = 'Serial Index' resnum = np.arange(1, shape[0]+1) else: LOGGER.info('Residue numbers will be based on MSA and label: ' '{0}'.format(label)) else: LOGGER.info('Could not identify residue indexes from MSA' ' using serial indexing') label = 'Serial Index' resnum = np.arange(1, shape[0]+1) else: LOGGER.info('MSA or PDB not given or does not match mutinfo, ' 'using serial indexing') resnum = np.arange(1, shape[0]+1) LOGGER.info('Residue numbers start and end with {0}-{1}'. format(str(resnum[0]), str(resnum[-1]))) outname = kwargs.get('outname') if outname is None: outname, ext = splitext(str(mutinfo)) if ext.lower() == '.gz': outname, _ = splitext(str(mutinfo)) else: outname, ext = splitext(str(outname)) if ext is None: ext = '.txt' outname += '_rankorder' + ext zscore = kwargs.get('zscore') if zscore: LOGGER.info('zscore normalization applied such that each column ' 'has 0 mean and standard deviation 1') header = 'Serial\tRow\tColumn\tZscore' mi = (mi - mi.mean(0)) / mi.std(0) else: header = 'Serial\tRow\tColumn\tMI' mi_ind_start, mi_ind_end = np.tril_indices(shape[0], k=-1) mi_matrix = mi[mi_ind_start, mi_ind_end] sorted_index = mi_matrix.argsort(axis=None)[::-1] row = mi_ind_start[sorted_index] column = mi_ind_end[sorted_index] count = 1 i = 0 if PY3K: mode = 'w' else: mode = 'wb' f = openFile(outname, mode) if label is None: label = 'Serial Index' numpairs = kwargs.get('numpairs') size = len(row) seqsep = kwargs.get('seqsep') if not kwargs.get('usedist') or not pdbflag: if kwargs.get('usedist'): LOGGER.info('use-struct-sep set to true, but PDB not given or ' 'incorrect residue number. Using sequence separation') else: if pdbflag: LOGGER.info('use-dist not set, using sequence separation' ' to report coevolving pairs') f.write(('Label: '+ label + '\t' + 'Residue Numbers: ' + str(resnum[0]) + '-' + str(resnum[-1]) + '\tSequence Separation:' + str(seqsep) + '\n')) if pdbflag: f.write((header + '\tDistance\n')) while count <=numpairs and i < size: if row[i] > (column[i] + seqsep): f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'. format(count, resnum[row[i]], resnum[column[i]], mi[row[i], column[i]], distance[row[i], column[i]])) count += 1 i += 1 else: f.write((header + '\n')) while count <=numpairs and i < size: if row[i] > (column[i] + seqsep): f.write('{0}\t{1}\t{2}\t{3:.3f}\n'. format(count, resnum[row[i]], resnum[column[i]], mi[row[i], column[i]])) count += 1 i += 1 else: structsep = kwargs.get('dist') f.write(('Label: '+ label + '\t' + 'Residue Numbers: ' + str(resnum[0]) + '-' + str(resnum[-1]) + 'Distance Cutoff:' + str(structsep) + '\n')) f.write((header + '\tDistance\n')) while count <=numpairs and i < size: if distance[row[i], column[i]] > structsep: f.write('{0}\t{1}\t{2}\t{3:.3f}\t{4:.2f}\n'. format(count, resnum[row[i]], resnum[column[i]], mi[row[i], column[i]], distance[row[i], column[i]])) count += 1 i += 1 f.close()
APP.setFunction(evol_rankorder)