"""Blast Protein Data Bank for structures matching a user given sequence."""
import os.path
from ..apptools import *
__all__ = ['prody_blast']
def readFirstSequenceFasta(filename):
"""Returns first sequence from a file."""
fasta = open(filename)
seq = []
title = ''
first = True
for line in fasta:
if line[0] == '>':
if first:
title = line[1:].strip()
first = False
else:
break
else:
seq.append( line.strip() )
fasta.close()
return title, ''.join(seq)
[docs]def prody_blast(sequence, **kwargs):
"""Blast search PDB and download hits.
:arg sequence: sequence or file in fasta format
:arg identity: percent sequence identity for blast search, default is 90.0
:type identity: float
:arg overlap: percent sequence overlap between sequences, default is 90.0
:type overlap: float
:arg outdir: download uncompressed PDB files to given directory
:type outdir: str
:arg gzip: write compressed PDB file
*Blast Parameters*
:arg filename: a *filename* to save the results in XML format
:type filename: str
:arg hitlist_size: search parameters, default is 250
:type hitlist_size: int
:arg expect: search parameters, default is 1e-10
:type expect: float
:arg sleep: how long to wait to reconnect for results, default is 2
sleep time is doubled when results are not ready.
:type sleep: int
:arg timeout: when to give up waiting for results. default is 30
:type timeout: int"""
import prody
LOGGER = prody.LOGGER
title = None
if os.path.isfile(sequence):
title, sequence = readFirstSequenceFasta(sequence)
LOGGER.info("First sequence ({0}) is parsed from {1}."
.format(title, repr(sequence)))
if not sequence.isalpha() or not sequence.isupper():
raise ValueError("{0} is not a valid sequence or a file"
.format(repr(sequence)))
outdir = kwargs.get('outdir')
identity, overlap = kwargs.get('identity', 90), kwargs.get('overlap', 90)
if not 0 < identity < 100:
raise ValueError('identity must be between 0 and 100')
if not 0 < overlap < 100:
raise ValueError('overlap must be between 0 and 100')
filename = kwargs.get('filename', None)
hitlist_size = kwargs.get('hitlist_size', 250)
expect = kwargs.get('expect', 1e-10)
sleep, timeout = kwargs.get('sleep', 2), kwargs.get('timeout', 30)
blast_results = prody.blastPDB(sequence,filename=filename,
hitlist_size=hitlist_size, expect=expect,
sleep=sleep, timeout=timeout)
if not blast_results.isSuccess:
raise IOError('blast search timed out, please try again')
hits = blast_results.getHits(percent_identity=identity,
percent_overlap=overlap)
#sort hits by decreasing percent identity
hits2 = []
for pdb in hits:
hits2.append( (-hits[pdb]['percent_identity'], pdb) )
hits2.sort()
stdout = kwargs.get('stdout', False)
if not stdout:
finalHits = []
else:
from sys import stdout
for identity, pdb in hits2:
chain = hits[pdb]['chain_id']
percent_identity = hits[pdb]['percent_identity']
title = hits[pdb]['title']
if stdout:
stdout.write(pdb + ' ' + chain + ' ' +
('%5.1f%%' % (percent_identity)) + ' ' + title)
else:
finalHits.append((pdb, chain, ('%5.1f%%' % (percent_identity)),
title))
# download hits if --outdir is given
if outdir:
LOGGER.info('Downloading hits to ' + outdir)
pdblist = [ pdb for identity, pdb in hits2 ]
pdblist2 = prody.fetchPDB(pdblist, outdir,
compressed=kwargs.get('gzip'), copy=True)
if not stdout:
return finalHits
def addCommand(commands):
subparser = commands.add_parser('blast',
help='blast search Protein Data Bank')
subparser.add_argument('--quiet', help="suppress info messages to stderr",
action=Quiet, nargs=0)
subparser.add_argument('--examples', action=UsageExample, nargs=0,
help='show usage examples and exit')
subparser.set_defaults(usage_example=
"""Blast search PDB for the first sequence in a fasta file:
$ prody blast seq.fasta -i 70
Blast search PDB for the sequence argument:
$ prody blast MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQ\
KESTLHLVLRLRGG
Blast search PDB for avidin structures, download files, and align all files \
onto the 2avi structure:
$ prody blast -d . ARKCSLTGKWTNDLGSNMTIGAVNSRGEFTGTYITAVTATSNEIKESPLHGTQNTIN\
KRTQPTFGFTVNWKFSESTTVFT
$ prody align 2avi.pdb *pdb """)
subparser.add_argument('-i', '--identity', dest='identity', type=float,
default=90.0, metavar='FLOAT',
help='percent sequence identity (default: %(default)s)')
subparser.add_argument('-o', '--overlap', dest='overlap', type=float,
default=90.0, metavar='FLOAT',
help='percent sequence overlap (default: %(default)s)')
subparser.add_argument('-d', '--output-dir', dest='outdir', type=str,
default=None, metavar='PATH',
help='download uncompressed PDB files to given directory')
subparser.add_argument('-z', '--gzip', dest='gzip', action='store_true',
default=False, help='write compressed PDB file')
subparser.add_argument('sequence', type=str,
help='sequence or file in fasta format')
group = subparser.add_argument_group('Blast Parameters')
group.add_argument('-f', '--filename', dest='filename', type=str,
default=None, metavar='STR',
help='a filename to save the results in XML format')
group.add_argument('-e', '--expect', dest='expect', type=float,
default=1e-10, metavar='FLOAT', help='blast search parameter')
group.add_argument('-l', '--hit-list-size', dest='hitlist_size', type=int,
default=250, metavar='INT', help='blast search parameter')
group.add_argument('-s', '--sleep-time', dest='sleep', type=int,
default=2, metavar='INT',
help='how long to wait to reconnect for results '
'(sleep time is doubled when results are not ready)')
group.add_argument('-t', '--timeout', dest='timeout', type=int,
default=30, metavar='INT',
help='when to give up waiting for results')
subparser.set_defaults(func=lambda ns: prody_blast(
ns.__dict__.pop('sequence'), **ns.__dict__))
subparser.set_defaults(subparser=subparser)
subparser.set_defaults(stdout=True)