Source code for prody.database.dali

# -*- coding: utf-8 -*-
"""This module defines functions for Dali searching Protein Data Bank."""

import re
import numpy as np
from prody.atomic import Atomic, AtomGroup, AtomMap
from prody.proteins.pdbfile import _getPDBid
from prody.measure import getRMSD, getTransformation
from prody.utilities import checkCoords, checkWeights, createStringIO
from prody import LOGGER, PY3K
from prody import parsePDB, writePDBStream
from prody.ensemble import Ensemble
from prody.ensemble import PDBEnsemble
import os

__all__ = ['DaliRecord', 'searchDali', 
           'daliFilterMultimer', 'daliFilterMultimers']

def searchDali(pdb, chain=None, subset='fullPDB', daliURL=None, **kwargs):
    """Search Dali server with input of PDB ID (or local PDB file) and chain ID.
    Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/

    :arg pdb: PDB code or local PDB file for the protein to be searched
    :type pdb: str, :class:`.Atomic`

    :arg chain: chain identifier (only one chain can be assigned for PDB)
    :type chain: str

    :arg subset: fullPDB, PDB25, PDB50, PDB90
    :type subset: str
    """
    import requests

    LOGGER.timeit('_dali')
    timeout = kwargs.pop('timeout', 120)

    if daliURL is None:
        daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi"

    if isinstance(pdb, Atomic):
        atoms = pdb
        chain_set = set(atoms.getChids())
        if chain and chain not in chain_set:
            raise ValueError('input structure (%s) does not have chain %s'
                             % (atoms.getTitle(), chain))
        if len(chain_set) > 1:
            if not chain:
                raise TypeError('the structure (%s) contains more than one chain, '
                                'therefore a chain identifier needs to be specified'
                                % pdb.getTitle())
            atoms = atoms.select('chain ' + chain)
        else:
            chain = chain_set.pop()

        stream = createStringIO()
        writePDBStream(stream, atoms)
        data = stream.getvalue()
        stream.close()
        files = {"file1": data}

        pdbId = atoms.getTitle()
        pdb_chain = ''
        dali_title = 'Title_' + pdbId + chain
    elif isinstance(pdb, str):
        if os.path.isfile(pdb):
            atoms = parsePDB(pdb)
            chain_set = set(atoms.getChids())

            filename = os.path.basename(pdb)
            filename, ext = os.path.splitext(filename)
            if ext.lower() == '.gz':
                filename2, ext2 = os.path.splitext(filename)
                if ext2.lower() == '.pdb':
                    filename = filename2
            pdbId = filename

            if chain and chain not in chain_set:
                raise ValueError('input PDB file does not have chain ' + chain)
            if len(chain_set) > 1:
                if not chain:
                    raise TypeError('PDB file (%s) contains more than one chain, '
                                    'therefore a chain identifier needs to be specified'
                                    % pdb)
                atoms = atoms.select('chain ' + chain)
                stream = createStringIO()
                writePDBStream(stream, atoms)
                data = stream.getvalue()
                stream.close()
            else:
                data = open(pdb, "rb")
                chain = chain_set.pop()
            files = {"file1": data}
            # case: multiple chains; apply fetch multiple times?
            pdb_chain = ''
            dali_title = 'Title_' + pdbId + chain
        else:
            pdbId, ch = _getPDBid(pdb)
            if not chain:
                chain = ch
            if not chain:
                raise TypeError('a chain identifier is needed for the search')
            pdb_chain = pdbId + chain
            dali_title = 'Title_' + pdb_chain
            files = ''

    parameters = {'cd1': pdb_chain, 'method': 'search',
                  'title': dali_title, 'address': ''}
    request = requests.post(daliURL, parameters, files=files)
    try_error = 3
    while try_error >= 0:
        try:
            url = request.url
            break
        except Exception:
            try_error -= 1
            if try_error >= 0:
                LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                continue
            else:
                url = request.url
                break

    if url.split('.')[-1].lower() in ['html', 'php']:
        url = url.replace(url.split('/')[-1], '')

    LOGGER.debug('Submitted Dali search for PDB "{0}{1}".'.format(pdbId, chain))
    LOGGER.info(url)
    LOGGER.clear()

    return DaliRecord(url, pdbId, chain, subset=subset, timeout=timeout, **kwargs)
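# Usage sketch (not executed on import; assumes network access to the Dali
# server, and the query "1p38" chain "A" is an arbitrary illustration):
#
#     from prody import searchDali
#     dali_rec = searchDali('1p38', 'A', subset='PDB25')
#     if dali_rec.isSuccess:
#         print(dali_rec.getPDBs()[:5])    # first few matching PDB chains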
class DaliRecord(object):

    """A class to store results from Dali PDB search."""

    def __init__(self, url, pdbId, chain, subset='fullPDB', localFile=False, **kwargs):
        """Instantiate a DaliRecord object instance.

        :arg url: url of Dali results page or local dali results file
        :arg pdbId: PDB code for searched protein
        :arg chain: chain identifier (only one chain can be assigned for PDB)
        :arg subset: fullPDB, PDB25, PDB50, PDB90. Ignored if localFile=True (url is a local file)
        :arg localFile: whether provided url is a path for a local dali results file
        """
        self._url = url
        self._pdbId = pdbId
        self._chain = chain
        subset = subset.upper()
        if subset == "FULLPDB" or subset not in ["PDB25", "PDB50", "PDB90"]:
            self._subset = ""
        else:
            self._subset = "-" + subset[3:]
        timeout = kwargs.pop('timeout', 120)

        self._title = pdbId + '-' + chain
        self._alignPDB = None
        self._filterDict = None
        self._max_index = None
        self.fetch(self._url, localFile=localFile, timeout=timeout, **kwargs)
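    # Construction from a saved results file (sketch; the file name follows
    # the pattern fetch() uses when saving results, but is hypothetical here):
    #
    #     rec = DaliRecord('1p38A_dali.txt', '1p38', 'A', localFile=True)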
    def fetch(self, url=None, localFile=False, **kwargs):
        """Get Dali record from url or file.

        :arg url: url of Dali results page or local dali results file
            If None then the url already associated with the DaliRecord object is used.
        :type url: str

        :arg localFile: whether provided url is a path for a local dali results file
        :type localFile: bool

        :arg timeout: amount of time until the query times out in seconds
            default value is 120
        :type timeout: int

        :arg localfolder: folder in which to find the local file
            default is the current folder
        :type localfolder: str
        """
        if localFile:
            dali_file = open(url, 'r')
            data = dali_file.read()
            dali_file.close()
        else:
            import requests

            if url is None:
                url = self._url

            sleep = 2
            timeout = kwargs.pop('timeout', 120)
            LOGGER.timeit('_dali')
            log_message = ''
            try_error = 3

            while True:
                LOGGER.write('Connecting to Dali for search results...')
                LOGGER.clear()
                try:
                    html = requests.get(url).content
                except Exception:
                    try_error -= 1
                    if try_error >= 0:
                        LOGGER.sleep(2, '. Connection error happened. Trying to reconnect...')
                        continue
                    else:
                        html = requests.get(url).content

                if PY3K:
                    html = html.decode()

                if html.find('Status: Queued') > -1:
                    log_message = '(Dali search is queued)...'
                elif html.find('Status: Running') > -1:
                    log_message = '(Dali search is running)...'
                elif html.find('Your job') == -1 and html.find('.txt') > -1:
                    break
                elif html.find('ERROR:') > -1:
                    LOGGER.warn(': Dali search reported an ERROR!')
                    self.isSuccess = False
                    return False

                sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
                if LOGGER.timing('_dali') > timeout:
                    LOGGER.warn(': Dali search has timed out. \nThe results can be '
                                'obtained later using the fetch() method.')
                    self.isSuccess = False
                    return False
                LOGGER.sleep(int(sleep), 'to reconnect to Dali ' + log_message)
                LOGGER.clear()

            LOGGER.clear()
            LOGGER.report('Dali results were fetched in %.1fs.', '_dali')

            lines = html.strip().split('\n')
            file_name = re.search('=.+-90\\.txt', html).group()[1:]
            file_name = file_name[:-7]
            data = requests.get(url + file_name + self._subset + '.txt').content
            if PY3K:
                data = data.decode()

            localfolder = kwargs.pop('localfolder', '.')
            if file_name.lower().startswith('s001'):
                temp_name = self._pdbId + self._chain
            else:
                temp_name = file_name
            temp_name += self._subset + '_dali.txt'
            if localfolder != '.' and not os.path.exists(localfolder):
                os.mkdir(localfolder)
            with open(localfolder + os.sep + temp_name, "w") as file_temp:
                file_temp.write(html + '\n' + url + file_name + self._subset + '.txt'
                                + '\n' + data)

        data_list = data.strip().split('# ')

        # No: Chain Z rmsd lali nres %id PDB Description -> data_list[3]
        # Structural equivalences -> data_list[4]
        # Translation-rotation matrices -> data_list[5]
        map_temp_dict = dict()
        lines = data_list[4].strip().split('\n')
        self._lines_4 = lines
        mapping_temp = np.genfromtxt(lines[1:],
                                     delimiter=(4,1,14,6,2,4,4,5,2,4,4,3,5,4,3,5,6,3,5,4,3,5,28),
                                     usecols=[0,3,5,7,9,12,15,15,18,21], dtype='|i4')
        # [0,3,5,7,9,12,15,15,18,21] -> [index, residue_a, residue_b, residue_i_a,
        #                                residue_i_b, resid_a, resid_b, resid_i_a, resid_i_b]
        for map_i in mapping_temp:
            if map_i[0] not in map_temp_dict:
                map_temp_dict[map_i[0]] = [[map_i[1], map_i[2], map_i[3], map_i[4]]]
            else:
                map_temp_dict[map_i[0]].append([map_i[1], map_i[2], map_i[3], map_i[4]])
        self._max_index = max(mapping_temp[:, 2])
        self._mapping = map_temp_dict
        self._data = data_list[3]
        lines = data_list[3].strip().split('\n')
        daliInfo = np.genfromtxt(lines[1:], delimiter=(4,3,6,5,5,5,6,5,57),
                                 usecols=[0,2,3,4,5,6,7,8],
                                 dtype=[('id', '<i4'), ('pdb_chain', '|U6'),
                                        ('Z', '<f4'), ('rmsd', '<f4'),
                                        ('len_align', '<i4'), ('nres', '<i4'),
                                        ('identity', '<i4'), ('title', '|U70')])
        if daliInfo.ndim == 0:
            daliInfo = np.array([daliInfo])
        pdbListAll = []
        self._daliInfo = daliInfo
        dali_temp_dict = dict()
        for temp in self._daliInfo:
            temp_dict = dict()
            pdb_chain = temp[1].strip()[0:6]
            # U6 and U70 dtypes in np.genfromtxt yield unicode strings in daliInfo
            pdb_chain = str(pdb_chain)
            temp_dict['pdbId'] = pdbid = pdb_chain[0:4].lower()
            temp_dict['chainId'] = chid = pdb_chain[5:6]
            temp_dict['pdb_chain'] = pdb_chain = pdbid + chid
            temp_dict['Z'] = temp[2]
            temp_dict['rmsd'] = temp[3]
            temp_dict['len_align'] = temp[4]
            temp_dict['nres'] = temp[5]
            temp_dict['identity'] = temp[6]
            temp_dict['mapping'] = (np.array(map_temp_dict[temp[0]]) - 1).tolist()
            temp_dict['map_ref'] = [x for map_i in temp_dict['mapping']
                                    for x in range(map_i[0], map_i[1] + 1)]
            temp_dict['map_sel'] = [x for map_i in temp_dict['mapping']
                                    for x in range(map_i[2], map_i[3] + 1)]
            dali_temp_dict[pdb_chain] = temp_dict
            pdbListAll.append(pdb_chain)
        self._pdbListAll = tuple(pdbListAll)
        self._pdbList = self._pdbListAll
        self._alignPDB = dali_temp_dict
        LOGGER.info('Obtained ' + str(len(pdbListAll)) + ' PDB chains from Dali for '
                    + self._pdbId + self._chain + '.')
        self.isSuccess = True
        return True
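    # If a search times out, fetch() can be called again later, as the timeout
    # warning above suggests (sketch; query and timeout are illustrative):
    #
    #     dali_rec = searchDali('1p38', 'A', timeout=5)    # may time out
    #     if not dali_rec.isSuccess:
    #         dali_rec.fetch()                             # retry later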
    def getPDBs(self, filtered=True):
        """Returns PDB list (filters may be applied)"""
        if self._alignPDB is None:
            LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')

        if filtered:
            return self._pdbList
        return self._pdbListAll
    def getHits(self):
        """Returns the dictionary associated with the DaliRecord"""
        if self._alignPDB is None:
            LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')
        return self._alignPDB
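    # Each value in the getHits() dictionary is a per-hit dict with the fields
    # parsed in fetch() (sketch; the key '1r39A' is illustrative):
    #
    #     info = dali_rec.getHits()['1r39A']
    #     print(info['Z'], info['rmsd'], info['len_align'], info['identity'])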
    def getFilterList(self):
        """Returns a list of PDB IDs and chains for the entries that were filtered out"""
        filterDict = self._filterDict
        if filterDict is None:
            raise ValueError('You cannot obtain the list of filtered out entries '
                             'before doing any filtering.')

        temp_str = ', '.join([str(len(filterDict['len'])), str(len(filterDict['rmsd'])),
                              str(len(filterDict['Z'])), str(len(filterDict['identity']))])
        LOGGER.info('Filtered out [' + temp_str + '] for [length, RMSD, Z, identity]')
        return self._filterList
    def getMapping(self, key):
        """Get mapping for a particular entry in the DaliRecord"""
        if self._alignPDB is None:
            LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')
            return None

        try:
            info = self._alignPDB[key]
            mapping = [info['map_ref'], info['map_sel']]
        except KeyError:
            return None
        return mapping
    def getMappings(self):
        """Get all mappings in the DaliRecord"""
        if self._alignPDB is None:
            LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')
            return None

        map_dict = {}
        for key in self._alignPDB:
            info = self._alignPDB[key]
            map_dict[key] = [info['map_ref'], info['map_sel']]
        return map_dict
    mappings = property(getMappings)
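    # Mapping access sketch (the key '1r39A' is illustrative): getMapping()
    # returns [map_ref, map_sel], the aligned residue indices of the query
    # and of the hit:
    #
    #     ref_idx, sel_idx = dali_rec.getMapping('1r39A')
    #     all_maps = dali_rec.mappings    # the same data for every hit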
    def filter(self, cutoff_len=None, cutoff_rmsd=None, cutoff_Z=None, cutoff_identity=None):
        """Filters out PDBs from the PDBList and returns the PDB list.
        PDBs that satisfy any of the following criteria will be filtered out.
        (1) Length of aligned residues < cutoff_len (must be an integer or a float between 0 and 1);
        (2) RMSD < cutoff_rmsd (must be a positive number);
        (3) Z score < cutoff_Z (must be a positive number);
        (4) Identity > cutoff_identity (must be an integer or a float between 0 and 1).
        """
        if self._max_index is None:
            LOGGER.warn('DaliRecord has no data. Please use the fetch() method.')
            return None

        if cutoff_len is None:
            cutoff_len = 0
        elif not isinstance(cutoff_len, (float, int)):
            raise TypeError('cutoff_len must be a float or an integer')
        elif 0 < cutoff_len <= 1:
            cutoff_len = int(cutoff_len * self._max_index)
        elif 0 < cutoff_len <= self._max_index:
            cutoff_len = int(cutoff_len)
        else:
            raise ValueError('cutoff_len must be a float between 0 and 1, '
                             'or an int not greater than the max length')

        if cutoff_rmsd is None:
            cutoff_rmsd = 0
        elif not isinstance(cutoff_rmsd, (float, int)):
            raise TypeError('cutoff_rmsd must be a float or an integer')
        elif cutoff_rmsd >= 0:
            cutoff_rmsd = float(cutoff_rmsd)
        else:
            raise ValueError('cutoff_rmsd must be a number not less than 0')

        if cutoff_Z is None:
            cutoff_Z = 0
        elif not isinstance(cutoff_Z, (float, int)):
            raise TypeError('cutoff_Z must be a float or an integer')
        elif cutoff_Z >= 0:
            cutoff_Z = float(cutoff_Z)
        else:
            raise ValueError('cutoff_Z must be a number not less than 0')

        if cutoff_identity is None or cutoff_identity == 0:
            cutoff_identity = 100
        elif not isinstance(cutoff_identity, (float, int)):
            raise TypeError('cutoff_identity must be a float or an integer')
        elif 0 < cutoff_identity <= 1:
            cutoff_identity = float(cutoff_identity * 100)
        elif 0 < cutoff_identity <= 100:
            cutoff_identity = float(cutoff_identity)
        else:
            raise ValueError('cutoff_identity must be a float between 0 and 1, '
                             'or a number between 0 and 100')

        daliInfo = self._alignPDB
        if daliInfo is None:
            raise ValueError('Dali Record does not have any data yet. Please run fetch.')

        pdbListAll = self._pdbListAll
        missing_ind_dict = dict()
        ref_indices_set = set(range(self._max_index))
        filterListLen = []
        filterListRMSD = []
        filterListZ = []
        filterListIdentity = []

        # keep the first PDB (query PDB)
        for pdb_chain in pdbListAll[1:]:
            temp_dict = daliInfo[pdb_chain]
            # filter: len_align, identity, rmsd, Z
            if temp_dict['len_align'] < cutoff_len:
                filterListLen.append(pdb_chain)
                continue
            if temp_dict['rmsd'] < cutoff_rmsd:
                filterListRMSD.append(pdb_chain)
                continue
            if temp_dict['Z'] < cutoff_Z:
                filterListZ.append(pdb_chain)
                continue
            if temp_dict['identity'] > cutoff_identity:
                filterListIdentity.append(pdb_chain)
                continue
            temp_diff = list(ref_indices_set - set(temp_dict['map_ref']))
            for diff_i in temp_diff:
                if diff_i not in missing_ind_dict:
                    missing_ind_dict[diff_i] = 1
                else:
                    missing_ind_dict[diff_i] += 1
        self._missing_ind_dict = missing_ind_dict
        filterList = filterListLen + filterListRMSD + filterListZ + filterListIdentity
        filterDict = {'len': filterListLen, 'rmsd': filterListRMSD,
                      'Z': filterListZ, 'identity': filterListIdentity}
        self._filterList = filterList
        self._filterDict = filterDict
        self._pdbList = [self._pdbListAll[0]] + [item for item in self._pdbListAll[1:]
                                                 if item not in filterList]
        LOGGER.info(str(len(filterList)) + ' PDBs have been filtered out from '
                    + str(len(pdbListAll)) + ' Dali hits (remaining: '
                    + str(len(pdbListAll) - len(filterList)) + ').')
        return self._pdbList
    def getTitle(self):
        """Return the title of the record"""
        return self._title
def daliFilterMultimer(atoms, dali_rec, n_chains=None):
    """Filters multimers to only include chains with Dali mappings.

    :arg atoms: the multimer to be filtered
    :type atoms: :class:`.Atomic`

    :arg dali_rec: the DaliRecord object with which to filter chains
    :type dali_rec: :class:`.DaliRecord`

    :arg n_chains: number of mapped chains required for the multimer to be
        returned; default **None** accepts any number
    :type n_chains: int
    """
    if not isinstance(atoms, Atomic):
        raise TypeError("atoms should be an Atomic object")
    if not isinstance(dali_rec, DaliRecord):
        raise TypeError("dali_rec should be a DaliRecord")
    try:
        keys = dali_rec._alignPDB
    except Exception:
        raise AttributeError("Dali Record does not have any data yet. Please run fetch.")

    numChains = 0
    atommap = None
    for i, chain in enumerate(atoms.iterChains()):
        m = dali_rec.getMapping(chain.getTitle()[:4] + chain.getChid())
        if m is not None:
            numChains += 1
            if atommap is None:
                atommap = chain
            else:
                atommap += chain
    if n_chains is None or numChains == n_chains:
        return atommap
    else:
        return None
def daliFilterMultimers(structures, dali_rec, n_chains=None):
    """A wrapper that applies daliFilterMultimer to multiple structures."""
    dali_ags = []
    for entry in structures:
        result = daliFilterMultimer(entry, dali_rec, n_chains)
        if result is not None:
            dali_ags.append(result)
    return dali_ags
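# Multimer filtering sketch (assumes dali_rec is a fetched DaliRecord; the IDs
# and the dimer requirement are illustrative). parsePDB returns a list of
# AtomGroups when given several identifiers:
#
#     ids = [pdb_chain[:4] for pdb_chain in dali_rec.getPDBs()]  # drop chain ids
#     ags = parsePDB(ids)                    # full structures, may be multimeric
#     dimers = daliFilterMultimers(ags, dali_rec, n_chains=2)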