# -*- coding: utf-8 -*-
"""This module defines functions for Dali searching Protein Data Bank."""
import re
import numpy as np
from prody.atomic import Atomic, AtomGroup, AtomMap
from prody.proteins.pdbfile import _getPDBid
from prody.measure import getRMSD, getTransformation
from prody.utilities import checkCoords, checkWeights, createStringIO
from prody import LOGGER, PY3K
from prody import parsePDB, writePDBStream
from prody.ensemble import Ensemble
from prody.ensemble import PDBEnsemble
import os
__all__ = ['DaliRecord', 'searchDali',
           'daliFilterMultimer', 'daliFilterMultimers']
def searchDali(pdb, chain=None, subset='fullPDB', daliURL=None, **kwargs):
    """Search the Dali server with a PDB ID (or local PDB file) and chain ID.

    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/

    :arg pdb: PDB code, local PDB file, or :class:`.Atomic` object for the
        protein to be searched

    :arg chain: chain identifier (only one chain can be assigned for PDB)
    :type chain: str

    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str

    :arg timeout: amount of time until the query times out in seconds,
        default is 120
    :type timeout: int
    """
    import requests

    LOGGER.timeit('_dali')
    timeout = kwargs.pop('timeout', 120)

    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"

    if isinstance(pdb, Atomic):
        atoms = pdb
        chain_set = set(atoms.getChids())
        if chain and chain not in chain_set:
            raise ValueError('input structure (%s) does not have chain %s'
                             % (atoms.getTitle(), chain))
        if len(chain_set) > 1:
            if not chain:
                raise TypeError('the structure (%s) contains more than one chain, '
                                'therefore a chain identifier needs to be specified'
                                % pdb.getTitle())
            atoms = atoms.select('chain ' + chain)
        else:
            chain = chain_set.pop()

        stream = createStringIO()
        writePDBStream(stream, atoms)
        data = stream.getvalue()
        stream.close()
        files = {"file1": data}

        pdbId = atoms.getTitle()
        pdb_chain = ''
        dali_title = 'Title_' + pdbId + chain
    elif isinstance(pdb, str):
        if os.path.isfile(pdb):
            atoms = parsePDB(pdb)
            chain_set = set(atoms.getChids())

            filename = os.path.basename(pdb)
            filename, ext = os.path.splitext(filename)
            if ext.lower() == '.gz':
                filename2, ext2 = os.path.splitext(filename)
                if ext2.lower() == '.pdb':
                    filename = filename2
            pdbId = filename

            if chain and chain not in chain_set:
                raise ValueError('input PDB file does not have chain ' + chain)

            if len(chain_set) > 1:
                if not chain:
                    raise TypeError('PDB file (%s) contains more than one chain, '
                                    'therefore a chain identifier needs to be '
                                    'specified' % pdb)
                atoms = atoms.select('chain ' + chain)
                stream = createStringIO()
                writePDBStream(stream, atoms)
                data = stream.getvalue()
                stream.close()
            else:
                data = open(pdb, "rb")
                chain = chain_set.pop()
            files = {"file1": data}

            pdb_chain = ''
            dali_title = 'Title_' + pdbId + chain
        else:
            pdbId, ch = _getPDBid(pdb)
            if not chain:
                chain = ch
            if not chain:
                raise TypeError('a chain identifier is needed for the search')
            pdb_chain = pdbId + chain
            dali_title = 'Title_' + pdb_chain
            files = ''
    else:
        raise TypeError('pdb should be a PDB code, a PDB file, or an Atomic object')

    parameters = {'cd1': pdb_chain, 'method': 'search',
                  'title': dali_title, 'address': ''}
    request = requests.post(daliURL, parameters, files=files)

    try_error = 3
    while try_error >= 0:
        try:
            url = request.url
            break
        except Exception:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                continue
            else:
                url = request.url
                break

    if url.split('.')[-1].lower() in ['html', 'php']:
        # Dali redirects to an interactive results page; strip the page name
        # to keep the job directory URL.
        url = url.replace(url.split('/')[-1], '')

    LOGGER.debug('Submitted Dali search for PDB "{0}{1}".'.format(pdbId, chain))
    LOGGER.info(url)
    LOGGER.clear()

    return DaliRecord(url, pdbId, chain, subset=subset, timeout=timeout, **kwargs)
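
# A minimal usage sketch (assumes network access to the Dali server; PDB
# entry 1p38 chain A is an arbitrary example query):
#
#     dali_rec = searchDali('1p38', 'A')
#     while not dali_rec.isSuccess:
#         dali_rec.fetch()
#     pdb_ids = dali_rec.getPDBs()
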
class DaliRecord(object):

    """A class to store results from a Dali PDB search."""

    def __init__(self, url, pdbId, chain, subset='fullPDB', localFile=False,
                 **kwargs):
        """Instantiate a DaliRecord object instance.

        :arg url: url of the Dali results page or local Dali results file
        :arg pdbId: PDB code for the searched protein
        :arg chain: chain identifier (only one chain can be assigned for PDB)
        :arg subset: fullPDB, PDB25, PDB50, PDB90. Ignored if localFile=True
            (url is a local file)
        :arg localFile: whether the provided url is a path to a local Dali
            results file
        """
        self._url = url
        self._pdbId = pdbId
        self._chain = chain
        subset = subset.upper()
        if subset == "FULLPDB" or subset not in ["PDB25", "PDB50", "PDB90"]:
            self._subset = ""
        else:
            self._subset = "-" + subset[3:]
        timeout = kwargs.pop('timeout', 120)

        self._title = pdbId + '-' + chain
        self._alignPDB = None
        self._filterDict = None
        self._max_index = None
        self.fetch(self._url, localFile=localFile, timeout=timeout, **kwargs)
    def fetch(self, url=None, localFile=False, **kwargs):
        """Get the Dali record from a url or file.

        :arg url: url of the Dali results page or local Dali results file.
            If None, the url already associated with the DaliRecord object
            is used.
        :type url: str

        :arg localFile: whether the provided url is a path to a local Dali
            results file
        :type localFile: bool

        :arg timeout: amount of time until the query times out in seconds,
            default is 120
        :type timeout: int

        :arg localfolder: folder in which to save the downloaded results file,
            default is the current folder
        :type localfolder: str
        """
        if localFile:
            with open(url, 'r') as dali_file:
                data = dali_file.read()
        else:
            import requests

            if url is None:
                url = self._url

            sleep = 2
            timeout = kwargs.pop('timeout', 120)
            LOGGER.timeit('_dali')
            log_message = ''
            try_error = 3

            while True:
                LOGGER.write('Connecting to Dali for search results...')
                LOGGER.clear()
                try:
                    html = requests.get(url).content
                except Exception:
                    try_error -= 1
                    if try_error >= 0:
                        LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                        continue
                    else:
                        html = requests.get(url).content

                if PY3K:
                    html = html.decode()

                if html.find('Status: Queued') > -1:
                    log_message = '(Dali search is queued)...'
                elif html.find('Status: Running') > -1:
                    log_message = '(Dali search is running)...'
                elif html.find('Your job') == -1 and html.find('.txt') > -1:
                    break
                elif html.find('ERROR:') > -1:
                    LOGGER.warn(': Dali search reported an ERROR!')
                    self.isSuccess = False
                    return False

                # back off geometrically, capped at 20 seconds
                sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
                if LOGGER.timing('_dali') > timeout:
                    LOGGER.warn(': Dali search has timed out. \nThe results can be obtained later using the fetch() method.')
                    self.isSuccess = False
                    return False
                LOGGER.sleep(int(sleep), 'to reconnect to Dali ' + log_message)
                LOGGER.clear()

            LOGGER.clear()
            LOGGER.report('Dali results were fetched in %.1fs.', '_dali')

            # the results page links a <job>-90.txt file; derive the job name
            # from it, then download the subset-specific results file
            file_name = re.search('=.+-90\\.txt', html).group()[1:]
            file_name = file_name[:-7]
            data = requests.get(url + file_name + self._subset + '.txt').content
            if PY3K:
                data = data.decode()

            localfolder = kwargs.pop('localfolder', '.')
            if file_name.lower().startswith('s001'):
                temp_name = self._pdbId + self._chain
            else:
                temp_name = file_name
            temp_name += self._subset + '_dali.txt'
            if localfolder != '.' and not os.path.exists(localfolder):
                os.mkdir(localfolder)
            with open(localfolder + os.sep + temp_name, "w") as file_temp:
                file_temp.write(html + '\n' + url + file_name + self._subset +
                                '.txt' + '\n' + data)

        data_list = data.strip().split('# ')
        # data_list[3]: No: Chain Z rmsd lali nres %id PDB Description
        # data_list[4]: Structural equivalences
        # data_list[5]: Translation-rotation matrices
        map_temp_dict = dict()
        lines = data_list[4].strip().split('\n')
        self._lines_4 = lines
        # fixed-width parse of the structural equivalences; only the first
        # five extracted columns (hit index plus the aligned residue ranges
        # of the query and the hit) are used below
        mapping_temp = np.genfromtxt(lines[1:],
                                     delimiter=(4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28),
                                     usecols=[0,3,5,7,9,12,15,15,18,21],
                                     dtype='|i4')
        if mapping_temp.ndim == 1:
            # a single equivalence line yields a 1-D array; promote to 2-D
            mapping_temp = np.array([mapping_temp])
        for map_i in mapping_temp:
            if map_i[0] not in map_temp_dict:
                map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
            else:
                map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]])
        self._max_index = max(mapping_temp[:, 2])
        self._mapping = map_temp_dict
        self._data = data_list[3]

        lines = data_list[3].strip().split('\n')
        daliInfo = np.genfromtxt(lines[1:], delimiter=(4,3,6,5,5,5,6,5,57),
                                 usecols=[0,2,3,4,5,6,7,8],
                                 dtype=[('id', '<i4'), ('pdb_chain', '|U6'),
                                        ('Z', '<f4'), ('rmsd', '<f4'),
                                        ('len_align', '<i4'), ('nres', '<i4'),
                                        ('identity', '<i4'), ('title', '|U70')])
        if daliInfo.ndim == 0:
            daliInfo = np.array([daliInfo])
        self._daliInfo = daliInfo

        pdbListAll = []
        dali_temp_dict = dict()
        for temp in self._daliInfo:
            temp_dict = dict()
            # the U6/U70 dtypes above already yield unicode strings, so no
            # decoding is needed under Python 3
            pdb_chain = str(temp[1].strip()[0:6])
            temp_dict['pdbId'] = pdbid = pdb_chain[0:4].lower()
            temp_dict['chainId'] = chid = pdb_chain[5:6]
            temp_dict['pdb_chain'] = pdb_chain = pdbid + chid
            temp_dict['Z'] = temp[2]
            temp_dict['rmsd'] = temp[3]
            temp_dict['len_align'] = temp[4]
            temp_dict['nres'] = temp[5]
            temp_dict['identity'] = temp[6]
            # convert 1-based residue ranges to 0-based indices
            mapping = (np.array(map_temp_dict[temp[0]]) - 1).tolist()
            temp_dict['mapping'] = mapping
            temp_dict['map_ref'] = [x for map_i in mapping
                                    for x in range(map_i[0], map_i[1] + 1)]
            temp_dict['map_sel'] = [x for map_i in mapping
                                    for x in range(map_i[2], map_i[3] + 1)]
            dali_temp_dict[pdb_chain] = temp_dict
            pdbListAll.append(pdb_chain)
        self._pdbListAll = tuple(pdbListAll)
        self._pdbList = self._pdbListAll
        self._alignPDB = dali_temp_dict
        LOGGER.info('Obtained ' + str(len(pdbListAll)) + ' PDB chains from Dali for '
                    + self._pdbId + self._chain + '.')
        self.isSuccess = True
        return True
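
    # A minimal sketch of reloading previously saved results (assumes a file
    # such as '1p38A_dali.txt' was written by an earlier fetch() call; the
    # actual file name depends on the Dali job name):
    #
    #     dali_rec = DaliRecord('1p38A_dali.txt', '1p38', 'A', localFile=True)
    #     hits = dali_rec.getHits()
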
    def getPDBs(self, filtered=True):
        """Returns the PDB list (filters may be applied)"""
        if self._alignPDB is None:
            LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')
            return None

        if filtered:
            return self._pdbList
        return self._pdbListAll

    def getHits(self):
        """Returns the dictionary of hits associated with the DaliRecord"""
        if self._alignPDB is None:
            LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')
            return None

        return self._alignPDB

    def getFilterList(self):
        """Returns a list of PDB IDs and chains for the entries that were
        filtered out"""
        filterDict = self._filterDict
        if filterDict is None:
            raise ValueError('You cannot obtain the list of filtered out entries '
                             'before doing any filtering.')

        temp_str = ', '.join([str(len(filterDict['len'])), str(len(filterDict['rmsd'])),
                              str(len(filterDict['Z'])), str(len(filterDict['identity']))])
        LOGGER.info('Filtered out [' + temp_str + '] for [length, RMSD, Z, identity]')
        return self._filterList

    def getMapping(self, key):
        """Get the mapping for a particular entry in the DaliRecord"""
        if self._alignPDB is None:
            LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')
            return None

        try:
            info = self._alignPDB[key]
            mapping = [info['map_ref'], info['map_sel']]
        except KeyError:
            return None
        return mapping

    def getMappings(self):
        """Get all mappings in the DaliRecord"""
        if self._alignPDB is None:
            LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')
            return None

        map_dict = {}
        for key in self._alignPDB:
            info = self._alignPDB[key]
            map_dict[key] = [info['map_ref'], info['map_sel']]
        return map_dict

    mappings = property(getMappings)
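
    # A sketch of retrieving mappings (assumes a successful search and that
    # the hypothetical key '3h9fA' appears among the hits):
    #
    #     mapping = dali_rec.getMapping('3h9fA')   # [ref_indices, sel_indices]
    #     all_mappings = dali_rec.mappings         # dict keyed by pdbId + chain
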
    def filter(self, cutoff_len=None, cutoff_rmsd=None, cutoff_Z=None,
               cutoff_identity=None):
        """Filters out PDBs from the PDB list and returns the filtered list.

        PDBs that satisfy any of the following criteria will be filtered out:
        (1) length of aligned residues < cutoff_len (must be an integer or a
            float between 0 and 1);
        (2) RMSD < cutoff_rmsd (must be a positive number);
        (3) Z score < cutoff_Z (must be a positive number);
        (4) identity > cutoff_identity (must be an integer or a float
            between 0 and 1).
        """
        if self._max_index is None:
            LOGGER.warn('DaliRecord has no data. Please use the fetch() method.')
            return None

        if cutoff_len is None:
            cutoff_len = 0
        elif not isinstance(cutoff_len, (float, int)):
            raise TypeError('cutoff_len must be a float or an integer')
        elif 0 < cutoff_len <= 1:
            # interpret as a fraction of the longest alignment
            cutoff_len = int(cutoff_len * self._max_index)
        elif 0 < cutoff_len <= self._max_index:
            cutoff_len = int(cutoff_len)
        else:
            raise ValueError('cutoff_len must be a float between 0 and 1, '
                             'or an int not greater than the max length')

        if cutoff_rmsd is None:
            cutoff_rmsd = 0
        elif not isinstance(cutoff_rmsd, (float, int)):
            raise TypeError('cutoff_rmsd must be a float or an integer')
        elif cutoff_rmsd >= 0:
            cutoff_rmsd = float(cutoff_rmsd)
        else:
            raise ValueError('cutoff_rmsd must be a number not less than 0')

        if cutoff_Z is None:
            cutoff_Z = 0
        elif not isinstance(cutoff_Z, (float, int)):
            raise TypeError('cutoff_Z must be a float or an integer')
        elif cutoff_Z >= 0:
            cutoff_Z = float(cutoff_Z)
        else:
            raise ValueError('cutoff_Z must be a number not less than 0')

        if cutoff_identity is None or cutoff_identity == 0:
            cutoff_identity = 100
        elif not isinstance(cutoff_identity, (float, int)):
            raise TypeError('cutoff_identity must be a float or an integer')
        elif 0 < cutoff_identity <= 1:
            # interpret as a fraction and convert to a percentage
            cutoff_identity = float(cutoff_identity * 100)
        elif 0 < cutoff_identity <= 100:
            cutoff_identity = float(cutoff_identity)
        else:
            raise ValueError('cutoff_identity must be a float between 0 and 1, '
                             'or a number between 0 and 100')

        daliInfo = self._alignPDB
        if daliInfo is None:
            raise ValueError('Dali Record does not have any data yet. Please run fetch.')

        pdbListAll = self._pdbListAll
        missing_ind_dict = dict()
        ref_indices_set = set(range(self._max_index))
        filterListLen = []
        filterListRMSD = []
        filterListZ = []
        filterListIdentity = []

        # keep the first PDB (the query PDB)
        for pdb_chain in pdbListAll[1:]:
            temp_dict = daliInfo[pdb_chain]
            # filter on len_align, rmsd, Z and identity
            if temp_dict['len_align'] < cutoff_len:
                filterListLen.append(pdb_chain)
                continue
            if temp_dict['rmsd'] < cutoff_rmsd:
                filterListRMSD.append(pdb_chain)
                continue
            if temp_dict['Z'] < cutoff_Z:
                filterListZ.append(pdb_chain)
                continue
            if temp_dict['identity'] > cutoff_identity:
                filterListIdentity.append(pdb_chain)
                continue
            # record query residues that are not covered by this hit
            temp_diff = list(ref_indices_set - set(temp_dict['map_ref']))
            for diff_i in temp_diff:
                if diff_i not in missing_ind_dict:
                    missing_ind_dict[diff_i] = 1
                else:
                    missing_ind_dict[diff_i] += 1
        self._missing_ind_dict = missing_ind_dict
        filterList = filterListLen + filterListRMSD + filterListZ + filterListIdentity
        filterDict = {'len': filterListLen, 'rmsd': filterListRMSD,
                      'Z': filterListZ, 'identity': filterListIdentity}
        self._filterList = filterList
        self._filterDict = filterDict
        self._pdbList = [self._pdbListAll[0]] + [item for item in self._pdbListAll[1:]
                                                 if item not in filterList]
        LOGGER.info(str(len(filterList)) + ' PDBs have been filtered out from '
                    + str(len(pdbListAll)) + ' Dali hits (remaining: '
                    + str(len(pdbListAll) - len(filterList)) + ').')
        return self._pdbList
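
    # A sketch of typical filtering (the cutoff values are arbitrary
    # examples): keep hits covering at least half the query length with a
    # Z score of at least 10, then inspect what was rejected.
    #
    #     pdb_ids = dali_rec.filter(cutoff_len=0.5, cutoff_Z=10)
    #     rejected = dali_rec.getFilterList()
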
    def getTitle(self):
        """Returns the title of the record"""
        return self._title
def daliFilterMultimer(atoms, dali_rec, n_chains=None):
    """Filters multimers to only include chains with Dali mappings.

    :arg atoms: the multimer to be filtered
    :type atoms: :class:`.Atomic`

    :arg dali_rec: the DaliRecord object with which to filter chains
    :type dali_rec: :class:`.DaliRecord`

    :arg n_chains: number of mapped chains required for the multimer to be
        returned. If None (default), any number of mapped chains is accepted.
    :type n_chains: int
    """
    if not isinstance(atoms, Atomic):
        raise TypeError("atoms should be an Atomic object")
    if not isinstance(dali_rec, DaliRecord):
        raise TypeError("dali_rec should be a DaliRecord")
    if dali_rec._alignPDB is None:
        raise AttributeError("Dali Record does not have any data yet. Please run fetch.")

    numChains = 0
    atommap = None
    for i, chain in enumerate(atoms.iterChains()):
        m = dali_rec.getMapping(chain.getTitle()[:4] + chain.getChid())
        if m is not None:
            numChains += 1
            if atommap is None:
                atommap = chain
            else:
                atommap += chain
    if n_chains is None or numChains == n_chains:
        return atommap
    else:
        return None
def daliFilterMultimers(structures, dali_rec, n_chains=None):
    """A wrapper for daliFilterMultimer to apply it to multiple structures."""
    dali_ags = []
    for entry in structures:
        result = daliFilterMultimer(entry, dali_rec, n_chains)
        if result is not None:
            dali_ags.append(result)
    return dali_ags
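
# A sketch of filtering parsed hits down to mapped dimers (assumes dali_rec
# is a fetched DaliRecord and that the hit PDB files can be downloaded):
#
#     ags = parsePDB(dali_rec.getPDBs(), subset='ca')
#     dimers = daliFilterMultimers(ags, dali_rec, n_chains=2)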