# -*- coding: utf-8 -*-
"""This module defines functions for interfacing with the Pfam database."""

__author__ = 'Anindita Dutta, Ahmet Bakan, Cihan Kaya'

import re
from numbers import Integral
from os.path import join, isfile
from io import BytesIO

import numpy as np

from prody import LOGGER, PY3K
from prody.utilities import makePath, openURL, gunzip, openFile, dictElement
from prody.utilities import relpath
from prody.proteins import parsePDB
if PY3K:
    import urllib.parse as urllib
    import urllib.request as urllib2
else:
    import urllib
    import urllib2
__all__ = ['searchPfam', 'fetchPfamMSA', 'parsePfamPDBs']

FASTA = 'fasta'
SELEX = 'selex'
STOCKHOLM = 'stockholm'

DOWNLOAD_FORMATS = set(['seed', 'full', 'ncbi', 'metagenomics',
                        'rp15', 'rp35', 'rp55', 'rp75', 'uniprot'])
FORMAT_OPTIONS = ({'format': set([FASTA, SELEX, STOCKHOLM]),
                   'order': set(['tree', 'alphabetical']),
                   'inserts': set(['lower', 'upper']),
                   'gaps': set(['mixed', 'dots', 'dashes', 'none'])})

MINSEQLEN = 16

prefix = 'https://pfam.xfam.org/'
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary. Matching Pfam accessions
    as keys map to e-values and alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file. Sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier. The UniProt ID of the specified chain, or of the first
    protein chain, will be used for searching the Pfam database."""
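    # A minimal usage sketch (hypothetical; 'P19491' is an assumed example
    # UniProt ID, and the result keys reflect the parsing done below):
    #
    #     matches = searchPfam('P19491')
    #     for acc, match in matches.items():
    #         print(acc, match['id'], match['locations'])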
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except:
            # not an MSA file; read it as plain text and strip whitespace
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())
    # cElementTree was removed in Python 3.9; ElementTree is equivalent here
    import xml.etree.ElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')
        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        # the results page URL is turned into a tab-separated download URL
        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = (results_url.replace('results', 'download') +
                            '?' + enc_res_params)
        result_request = urllib2.Request(modified_res_url)

        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))
        try:
            tsv = urllib2.urlopen(result_request).read()
        except Exception:
            raise ValueError('No matching Pfam domains were found.')
        matches = {}
        if PY3K:
            tsv = tsv.decode()

        # the first TSV line holds the column names; the last line is empty
        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        root = {}
        for i, line in enumerate(lines[1:-1]):
            root[i] = {}
            for j, key in enumerate(keys):
                root[i][key] = line.split('\t')[j]

        for child in root.values():
            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {}
            matches[pfam_id]['accession'] = accession
            matches[pfam_id]['class'] = 'Domain'
            matches[pfam_id]['id'] = child['Family id']
            matches[pfam_id]['locations'] = {}
            matches[pfam_id]['locations']['ali_end'] = child['Ali. End']
            matches[pfam_id]['locations']['ali_start'] = child['Ali. Start']
            matches[pfam_id]['locations']['bitscore'] = child['Bit Score']
            matches[pfam_id]['locations']['end'] = child['Env. End']
            matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value']
            matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value']
            matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0'
            matches[pfam_id]['locations']['hmm_end'] = child['Model End']
            matches[pfam_id]['locations']['hmm_start'] = child['Model Start']
            matches[pfam_id]['locations']['start'] = child['Env. Start']
            matches[pfam_id]['type'] = 'Pfam-A'
        return matches
    else:
        if len(seq) <= 5:
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
                polymers = []  # avoid a NameError in the loop below
            else:
                chid = seq[4:].upper()

            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'
                                .format(idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break

            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = prefix + 'protein/' + seq + '?output=xml'
            else:
                url = prefix + 'protein/' + idcode + '?output=xml'
        else:
            url = prefix + 'protein/' + seq + '?output=xml'
        LOGGER.debug('Retrieving Pfam search results: ' + url)
        xml = None
        while LOGGER.timing('_pfam') < timeout:
            try:
                xml = openURL(url, timeout=timeout).read()
            except Exception:
                pass
            else:
                if xml not in ['PEND', 'RUN']:
                    break

        if not xml:
            raise IOError('Pfam search timed out or failed to parse results '
                          'XML, check URL: ' + url)
        else:
            LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

        if xml.find(b'There was a system error on your last request.') > 0:
            LOGGER.warn('No Pfam matches found for: ' + seq)
            return None
        elif xml.find(b'No valid UniProt accession or ID') > 0:
            try:
                url = prefix + 'protein/' + accession + '?output=xml'
                LOGGER.debug('Retrieving Pfam search results: ' + url)
                xml = openURL(url, timeout=timeout).read()
            except Exception:
                try:
                    ag = parsePDB(seq, subset='ca')
                    ag_seq = ag.getSequence()
                    return searchPfam(ag_seq)
                except Exception:
                    raise ValueError('No valid UniProt accession or ID for: ' + seq)

        if xml.find(b'No valid UniProt accession or ID') > 0:
            try:
                url = 'https://uniprot.org/uniprot/' + accession + '.xml'
                xml = openURL(url, timeout=timeout).read()
                root = ET.XML(xml)
                accession = root[0][0].text
                url = prefix + 'protein/' + accession + '?output=xml'
                LOGGER.debug('Retrieving Pfam search results: ' + url)
                xml = openURL(url, timeout=timeout).read()
            except Exception:
                raise ValueError('No valid UniProt accession or ID for: ' + seq)

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)
    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        key = '{' + prefix + '}'
        results = dictElement(root[0], key)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')

    return matches
def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
    """Returns a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``,
        ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``,
        ``'rp75'`` or ``'uniprot'``, where rp stands for representative
        proteomes

    :arg compressed: gzip the downloaded MSA file, default is **False**

    *Alignment Options*

    :arg format: a Pfam supported MSA file format, one of ``'selex'``
        (default), ``'stockholm'`` or ``'fasta'``

    :arg order: ordering of sequences, ``'tree'`` (default) or
        ``'alphabetical'``

    :arg inserts: letter case for inserts, ``'upper'`` (default) or ``'lower'``

    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
        ``'mixed'`` or **None** for unaligned

    *Other Options*

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60

    :arg outname: out filename, default is input ``'acc_alignment.format'``

    :arg folder: output folder, default is ``'.'``"""
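    # A minimal usage sketch (hypothetical; 'PF00074' is an assumed example
    # accession, and the call assumes the Pfam server at `prefix` is reachable):
    #
    #     filepath = fetchPfamMSA('PF00074', alignment='seed',
    #                             format='fasta', gaps='dashes')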
    url = prefix + 'family/acc?id=' + acc
    handle = openURL(url, timeout=int(kwargs.get('timeout', 60)))
    orig_acc = acc
    acc = handle.readline().strip()
    if PY3K:
        acc = acc.decode()
    url_flag = False

    if not re.search('(?<=PF)[0-9]{5}$', acc):
        raise ValueError('{0} is not a valid Pfam ID or Accession Code'
                         .format(repr(orig_acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError('alignment must be one of full, seed, ncbi, '
                         'metagenomics, rp15, rp35, rp55, rp75 or uniprot')
    if alignment == 'ncbi' or alignment == 'metagenomics' or alignment == 'uniprot':
        url = (prefix + 'family/' + acc + '/alignment/' +
               alignment + '/gzipped')
        url_flag = True
        extension = '.sth'
    else:
        if not kwargs:
            url = (prefix + 'family/' + acc + '/alignment/' +
                   alignment + '/gzipped')
            url_flag = True
            extension = '.sth'
        else:
            align_format = kwargs.get('format', 'selex').lower()

            if align_format not in FORMAT_OPTIONS['format']:
                raise ValueError('alignment format must be of type selex, '
                                 'stockholm or fasta. MSF not supported')

            if align_format == SELEX:
                align_format, extension = 'pfam', '.slx'
            elif align_format == FASTA:
                extension = '.fasta'
            else:
                extension = '.sth'

            gaps = str(kwargs.get('gaps', 'dashes')).lower()
            if gaps not in FORMAT_OPTIONS['gaps']:
                raise ValueError('gaps must be of type mixed, dots, dashes, '
                                 'or none')

            inserts = kwargs.get('inserts', 'upper').lower()
            if inserts not in FORMAT_OPTIONS['inserts']:
                raise ValueError('inserts must be of type lower or upper')

            order = kwargs.get('order', 'tree').lower()
            if order not in FORMAT_OPTIONS['order']:
                raise ValueError('order must be of type tree or alphabetical')

            url = (prefix + 'family/' + acc + '/alignment/'
                   + alignment + '/format?format=' + align_format +
                   '&alnType=' + alignment + '&order=' + order[0] +
                   '&case=' + inserts[0] + '&gaps=' + gaps + '&download=1')
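            # For example (illustrative), acc='PF00074', alignment='seed',
            # format='fasta', order='tree', inserts='upper' and gaps='dashes'
            # yield:
            #   https://pfam.xfam.org/family/PF00074/alignment/seed/format
            #       ?format=fasta&alnType=seed&order=t&case=u&gaps=dashes&download=1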
    response = openURL(url, timeout=int(kwargs.get('timeout', 60)))
    outname = kwargs.get('outname', None)
    if not outname:
        outname = orig_acc
    folder = str(kwargs.get('folder', '.'))
    filepath = join(makePath(folder), outname + '_' + alignment + extension)
    if compressed:
        filepath = filepath + '.gz'
        if url_flag:
            f_out = open(filepath, 'wb')
        else:
            f_out = openFile(filepath, 'wb')
        f_out.write(response.read())
        f_out.close()
    else:
        if url_flag:
            gunzip(response.read(), filepath)
        else:
            with open(filepath, 'wb') as f_out:
                f_out.write(response.read())

    filepath = relpath(filepath)
    LOGGER.info('Pfam MSA for {0} is written as {1}.'
                .format(orig_acc, filepath))

    return filepath
def parsePfamPDBs(query, data=None, **kwargs):
    """Returns a list of :class:`.AtomGroup` objects containing sections of
    chains that correspond to a particular Pfam domain family. These are
    defined by alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided, the corresponding UniProt ID is used.
        If this returns multiple matches, then start or end must also be
        provided. This query is also used for label refinement of the Pfam
        domain MSA.
    :type query: str

    :arg data: If a list is given, the data list from the Pfam mapping table
        will be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The Pfam domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The Pfam domain that ends closest to this will be selected.
    :type end: int
    """
    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            start_diff = []
            for i, key in enumerate(pfam_matches):
                start_diff.append(int(pfam_matches[key]['locations'][0]['start']) - start)
            start_diff = np.array(start_diff)
            pfam_acc = keys[np.where(abs(start_diff) == min(abs(start_diff)))[0][0]]
        elif isinstance(end, Integral):
            end_diff = []
            for i, key in enumerate(pfam_matches):
                end_diff.append(int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.where(abs(end_diff) == min(abs(end_diff)))[0][0]]
        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')
    from ftplib import FTP
    from .uniprot import queryUniprot

    # fetch the gzipped Pfam-to-PDB mapping table from the EBI FTP site
    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()
    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC',
              'UniprotAcc', 'UniprotResnumRange']

    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]
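    # Each matching pdbmap row is tab-separated with trailing semicolons and,
    # per the `fields` list above, maps a PDB chain to a Pfam domain and a
    # UniProt accession with its residue range.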
    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except Exception:
            LOGGER.warn('No Uniprot record found for {0}'.format(data_dict['PDB_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except KeyError:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue

            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break

            if found:
                break
        if found:
            # use this_header rather than header to avoid shadowing the flag
            this_header = headers[i]
            chain_accessions = [dbref.accession
                                for dbref in this_header[data_dict['chain']].dbrefs]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(np.array(chain_accessions) ==
                                          data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError('There is no accession for a chain in the Header')
            except Exception:
                LOGGER.warn('Could not map domains in {0}'
                            .format(data_dict['PDB_ID']
                                    + data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = this_header[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(data_dict['chain'])
                                   ).getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(ag.getResnums() ==
                                                    right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            # map the Pfam (UniProt) residue range onto this chain's residue indices
            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if data is not None:
        if isinstance(data, list):
            data.extend(data_dicts)
        else:
            LOGGER.warn('data should be a list in order to get output')

    return results