Release Notes
v2.0 series come with new and improved sequence, structure, and dynamics analysis features. See release notes for details.
How to Cite
Bakan A, Meireles LM, Bahar I ProDy: Protein Dynamics Inferred from Theory and Experiments
Bioinformatics 2011 27(11):1575-1577.
Bakan A, Dutta A, Mao W, Liu Y, Chennubhotla C, Lezon TR, Bahar I Evol and ProDy for Bridging Protein Sequence Evolution and Structural Dynamics
Bioinformatics 2014 30(18):2681-2683.
Source code for prody.apps.prody_apps.prody_pca

"""Perform PCA/EDA calculations and output the results in plain text, NMD, and
graphical formats."""

from ..apptools import *
from .nmaoptions import *
from . import nmaoptions
from numbers import Integral

DEFAULTS = {}
HELPTEXT = {}
for key, txt, val in [
    ('aligned', 'trajectory is already aligned', False),
    ('outproj', 'write projections onto PCs', False),
    ('figproj', 'save projections onto specified subspaces, e.g. '
                '"1,2" for projections onto PCs 1 and 2; '
                '"1,2 1,3" for projections onto PCs 1,2 and 1, 3; '
                '"1 1,2,3" for projections onto PCs 1 and 1, 2, 3', ''),]:

    DEFAULTS[key] = val
    HELPTEXT[key] = txt

DEFAULTS.update(nmaoptions.DEFAULTS)
HELPTEXT.update(nmaoptions.HELPTEXT)

DEFAULTS['prefix'] = '_pca'

__all__ = ['prody_pca']

[docs]def prody_pca(coords, **kwargs):
    """Perform PCA calculations for PDB or DCD format *coords* file.

    """

    for key in DEFAULTS:
        if not key in kwargs:
            kwargs[key] = DEFAULTS[key]

    from os.path import isdir, splitext, join
    outdir = kwargs.get('outdir')
    if not isdir(outdir):
        raise IOError('{0} is not a valid path'.format(repr(outdir)))

    import prody
    LOGGER = prody.LOGGER

    prefix = kwargs.get('prefix')
    nmodes = kwargs.get('nmodes')
    selstr = kwargs.get('select')

    ext = splitext(coords)[1].lower()
    if ext == '.gz':
        ext = splitext(coords[:-3])[1].lower()

    if ext == '.dcd':
        pdb = kwargs.get('psf') or kwargs.get('pdb')
        if pdb:
            if splitext(pdb)[1].lower() == '.psf':
                pdb = prody.parsePSF(pdb)
            else:
                pdb = prody.parsePDB(pdb)
        dcd = prody.DCDFile(coords)
        if prefix == '_pca' or prefix == '_eda':
            prefix = dcd.getTitle() + prefix

        if len(dcd) < 2:
            raise ValueError('DCD file must have multiple frames')
        if pdb:
            if pdb.numAtoms() == dcd.numAtoms():
                select = pdb.select(selstr)
                dcd.setAtoms(select)
                LOGGER.info('{0} atoms are selected for calculations.'
                            .format(len(select)))
            else:
                select = pdb.select(selstr)
                if select.numAtoms() != dcd.numAtoms():
                    raise ValueError('number of selected atoms ({0}) does '
                                     'not match number of atoms in the DCD '
                                     'file ({1})'.format(select.numAtoms(),
                                                           dcd.numAtoms()))
                if pdb.numCoordsets():
                    dcd.setCoords(select.getCoords())

        else:
            select = prody.AtomGroup()
            select.setCoords(dcd.getCoords())
        pca = prody.PCA(dcd.getTitle())
        if len(dcd) > 1000:
            pca.buildCovariance(dcd, aligned=kwargs.get('aligned'))
            pca.calcModes(nmodes)
            ensemble = dcd
        else:
            ensemble = dcd[:]
            if not kwargs.get('aligned'):
                ensemble.iterpose(quiet=True)
            pca.performSVD(ensemble)
        nmodes = pca.numModes()

    else:
        pdb = prody.parsePDB(coords)
        if pdb.numCoordsets() < 2:
            raise ValueError('PDB file must contain multiple models')

        if prefix == '_pca' or prefix == '_eda':
            prefix = pdb.getTitle() + prefix

        select = pdb.select(selstr)
        LOGGER.info('{0} atoms are selected for calculations.'
                    .format(len(select)))
        if select is None:
            raise ValueError('selection {0} do not match any atoms'
                                .format(repr(selstr)))
        LOGGER.info('{0} atoms will be used for PCA calculations.'
                    .format(len(select)))
        ensemble = prody.Ensemble(select)
        pca = prody.PCA(pdb.getTitle())
        if not kwargs.get('aligned'):
            ensemble.iterpose()
        pca.performSVD(ensemble)


    LOGGER.info('Writing numerical output.')
    if kwargs.get('outnpz'):
        prody.saveModel(pca, join(outdir, prefix))

    prody.writeNMD(join(outdir, prefix + '.nmd'), pca[:nmodes], select)

    extend = kwargs.get('extend')
    if extend:
        if pdb:
            if extend == 'all':
                extended = prody.extendModel(pca[:nmodes], select, pdb)
            else:
                extended = prody.extendModel(pca[:nmodes], select,
                                             select | pdb.bb)
            prody.writeNMD(join(outdir, prefix + '_extended_' +
                           extend + '.nmd'), *extended)
        else:
            prody.LOGGER.warn('Model could not be extended, provide a PDB or '
                              'PSF file.')
    outall = kwargs.get('outall')
    delim = kwargs.get('numdelim')
    ext = kwargs.get('numext')
    format = kwargs.get('numformat')

    if outall or kwargs.get('outeig'):
        prody.writeArray(join(outdir, prefix + '_evectors'+ext),
                         pca.getArray(), delimiter=delim, format=format)
        prody.writeArray(join(outdir, prefix + '_evalues'+ext),
                         pca.getEigvals(), delimiter=delim, format=format)
    if outall or kwargs.get('outcov'):
        prody.writeArray(join(outdir, prefix + '_covariance'+ext),
                         pca.getCovariance(), delimiter=delim, format=format)
    if outall or kwargs.get('outcc') or kwargs.get('outhm'):
        cc = prody.calcCrossCorr(pca)
        if outall or kwargs.get('outcc'):
            prody.writeArray(join(outdir, prefix + '_cross-correlations' +
                             ext), cc, delimiter=delim, format=format)
        if outall or kwargs.get('outhm'):
            resnums = select.getResnums()
            hmargs = {} if resnums is None else {'resnums': resnums}
            prody.writeHeatmap(join(outdir, prefix + '_cross-correlations.hm'),
                               cc, xlabel='Residue', ylabel='Residue',
                               title=pca.getTitle() + ' cross-correlations',
                               **hmargs)

    if outall or kwargs.get('outsf'):
        prody.writeArray(join(outdir, prefix + '_sqfluct'+ext),
                         prody.calcSqFlucts(pca), delimiter=delim,
                         format=format)
    if outall or kwargs.get('outproj'):
        prody.writeArray(join(outdir, prefix + '_proj'+ext),
                         prody.calcProjection(ensemble, pca), delimiter=delim,
                         format=format)

    figall = kwargs.get('figall')
    cc = kwargs.get('figcc')
    sf = kwargs.get('figsf')
    sp = kwargs.get('figproj')

    if figall or cc or sf or sp:
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warning('Matplotlib could not be imported. '
                           'Figures are not saved.')
        else:
            prody.SETTINGS['auto_show'] = False
            LOGGER.info('Saving graphical output.')
            format = kwargs.get('figformat')
            width = kwargs.get('figwidth')
            height = kwargs.get('figheight')
            dpi = kwargs.get('figdpi')

            format = format.lower()
            if figall or cc:
                plt.figure(figsize=(width, height))
                prody.showCrossCorr(pca)
                plt.savefig(join(outdir, prefix + '_cc.'+format),
                    dpi=dpi, format=format)
                plt.close('all')
            if figall or sf:
                plt.figure(figsize=(width, height))
                prody.showSqFlucts(pca)
                plt.savefig(join(outdir, prefix + '_sf.'+format),
                    dpi=dpi, format=format)
                plt.close('all')
            if figall or sp:
                indices = []
                for item in sp.split():
                    try:
                        if '-' in item:
                            item = item.split('-')
                            if len(item) == 2:
                                indices.append(list(range(int(item[0])-1,
                                                          int(item[1]))))
                        elif ',' in item:
                            indices.append([int(i)-1 for i in item.split(',')])
                        else:
                            indices.append(int(item)-1)
                    except:
                        pass
                for index in indices:
                        plt.figure(figsize=(width, height))
                        prody.showProjection(ensemble, pca[index])
                        if isinstance(index, Integral):
                            index = [index]
                        index = [str(i+1) for i in index]
                        plt.savefig(join(outdir, prefix + '_proj_' +
                            '_'.join(index) + '.' + format),
                            dpi=dpi, format=format)
                        plt.close('all')


_ = list(HELPTEXT)
_.sort()
for key in _:

    prody_pca.__doc__ += """
    :arg {0}: {1}, default is ``{2!r}``""".format(key, HELPTEXT[key],
                                                  DEFAULTS[key])


def addCommand(commands):

    subparser = commands.add_parser('pca',
        help='perform principal component analysis calculations')

    subparser.add_argument('--quiet', help="suppress info messages to stderr",
        action=Quiet, nargs=0)

    subparser.add_argument('--examples', action=UsageExample, nargs=0,
        help='show usage examples and exit')

    subparser.set_defaults(usage_example=
    """This command performs PCA (or EDA) calculations for given multi-model \
PDB structure or DCD format trajectory file and outputs results in NMD \
format.  If a PDB identifier is given, structure file will be downloaded from \
the PDB FTP server.  DCD files may be accompanied with PDB or PSF files to \
enable atoms selections.

Fetch pdb 2k39, perform PCA calculations, and output NMD file:

  $ prody pca 2k39

Fetch pdb 2k39 and perform calculations for backbone of residues up to 71, \
and save all output and figure files:

  $ prody pca 2k39 --select "backbone and resnum < 71" -a -A

Perform EDA of MDM2 trajectory:

  $ prody eda mdm2.dcd

Perform EDA for backbone atoms:

  $ prody eda mdm2.dcd --pdb mdm2.pdb --select backbone""",
    test_examples=[0, 1])

    group = addNMAParameters(subparser)

    group = addNMAOutput(subparser)

    group.add_argument('-j', '--projection', dest='outproj',
        action='store_true',
        default=DEFAULTS['outproj'], help=HELPTEXT['outproj'])

    group = addNMAOutputOptions(subparser, '_pca')

    group = addNMAFigures(subparser)

    group.add_argument('-J', '--projection-figure', dest='figproj', type=str,
        default=DEFAULTS['figproj'], metavar='STR',
        help=HELPTEXT['figproj'])

    group = addNMAFigureOptions(subparser)

    group = subparser.add_mutually_exclusive_group()
    group.add_argument('--psf', help='PSF filename')
    group.add_argument('--pdb', help='PDB filename')
    subparser.add_argument('--aligned', dest='aligned', action='store_true',
        default=DEFAULTS['aligned'], help=HELPTEXT['aligned'])

    subparser.add_argument('dcd', help='file in DCD or PDB format')

    subparser.set_defaults(func=lambda ns: prody_pca(ns.dcd, **ns.__dict__))
    subparser.set_defaults(subparser=subparser)