Release Notes
v2.0 series come with new and improved sequence, structure, and dynamics analysis features. See release notes for details.
How to Cite
Bakan A, Meireles LM, Bahar I ProDy: Protein Dynamics Inferred from Theory and Experiments
Bioinformatics 2011 27(11):1575-1577.
Bakan A, Dutta A, Mao W, Liu Y, Chennubhotla C, Lezon TR, Bahar I Evol and ProDy for Bridging Protein Sequence Evolution and Structural Dynamics
Bioinformatics 2014 30(18):2681-2683.
Source code for prody.database.cath

# -*- coding: utf-8 -*-
"""This module defines functions for query CATH database."""

from os import getcwd, remove
from os.path import sep as pathsep
from os.path import isdir, isfile, join, split, splitext, normpath

import re
from numbers import Integral

from prody import LOGGER, SETTINGS, PY3K
from prody.proteins import parsePDB
from prody.utilities import makePath, gunzip, relpath, copyFile, openURL
from prody.utilities import sympath, indentElement, isPDB, isURL, isListLike

if PY3K:
    import urllib.parse as urllib
    import urllib.request as urllib2
else:
    import urllib
    import urllib2

import xml.etree.ElementTree as ET

__all__ = ['CATHDB', 'CATHElement']

CATH_URL = 'http://download.cathdb.info'
CATH_NAME_PATH = '/cath/releases/latest-release/cath-classification-data/cath-names.txt'
CATH_DOMAIN_PATH = '/cath/releases/latest-release/cath-classification-data/cath-domain-list.txt'
CATH_BOUNDARY_PATH = '/cath/releases/latest-release/cath-classification-data/cath-domain-boundaries-seqreschopping.txt'

isCATH = re.compile('^[0-9.]+$').match
# Note that if the chain id is blank, it will be represented by "0" in CATH domain ID.
isDomain = re.compile('^[A-Za-z0-9]{5}[0-9]{2}$').match

[docs]class CATHElement(ET.Element):
    """This class handles individual elements from the cath tree 
    including tree navigation and the getting of associated CATH domains, 
    PDB IDs and selection strings."""
    def __init__(self, tag, attrib={}, parent=None, **extra):
        super(CATHElement, self).__init__(tag, attrib, **extra)
        self._parent = parent

    def __getitem__(self, index):
        if isinstance(index, Integral):
            return super(CATHElement, self).__getitem__(index)
        else:
            return self.attrib[index]

    def __repr__(self):
        if self.isDomain():
            return '<CATHElement: {0}>'.format(self.id)
        return '<CATHElement: {0} ({1} members)>'.format(self.id, len(self))

    def isDomain(self):
        return isDomain(self.id)

    def getID(self):
        return self.tag

    def setID(self, value):
        self.tag = value
    
    id = property(getID, setID)

    def getchildren(self):
        children = super(CATHElement, self).getchildren()
        collection = CATHCollection(children, self)
        return collection

    children = property(getchildren)

    def getparent(self):
        return self._parent

    def setparent(self, value):
        self._parent = value
    
    parent = property(getparent, setparent)

    def find(self, key):
        for child in self:
            if child.tag == key:
                return child
        return None

    def numDomains(self):
        n_leaves = 0
        if self.isDomain():
            n_leaves = 1
        else:
            for child in self:
                if child.isDomain():
                    n_leaves += 1
                else:
                    n_leaves += child.numDomains()
        return n_leaves

    def getDomains(self):
        leaves = []
        if self.isDomain():
            leaves.append(self)
        else:
            for child in self:
                if child.isDomain():
                    leaves.append(child)
                else:
                    leaves.extend(child.getDomains())
        return leaves

    def getDomainData(self, attr):
        leaves = self.getDomains()
        data = []

        for leaf in leaves:
            data.append(leaf[attr])
        return data

    def getPDBs(self, chain=True):
        leaves = self.getDomains()
        pdbs = []

        for leaf in leaves:
            pdb_id = leaf.attrib['pdb']
            if chain:
                pdb_id += leaf.attrib['chain']
            pdbs.append(pdb_id)
        
        return pdbs
    
    def getSelstrs(self):
        leaves = self.getDomains()
        data = []

        for leaf in leaves:
            rangestr = leaf.attrib['range']
            selstr = range2selstr(rangestr)
            data.append(selstr)
        return data

    def getCATH(self):
        try:
            cath = self.attrib['cath']
        except KeyError:
            if self.parent is not None:
                cath = self.parent.cath
            else:
                cath = None # this is the case for the root
        return cath

    cath = property(getCATH)

    def getName(self):
        try: 
            name = self.attrib['name'].lstrip(':')
            if name == '':
                name = 'Superfamily %s'%self.cath
        except KeyError:
            if self.parent is not None:
                name = self.parent.name
            else:
                name = None # this is the case for the root
        return name

    name = property(getName)


[docs]    def parsePDBs(self, **kwargs):
        """Load PDB into memory as :class:`.AtomGroup` instances using :func:`.parsePDB` and 
        perform selection based on residue ranges given by CATH."""
        
        pdbs = self.getPDBs(True)
        selstrs = self.getSelstrs()
        header = kwargs.get('header', False)
        model = kwargs.get('model', None)

        LOGGER.timeit('_cath_parsePDB')
        LOGGER.info('Parsing {0} PDB files...'.format(len(pdbs)))
        ret = parsePDB(*pdbs, **kwargs)

        if model != 0:
            headers = None
            if header:
                prots, headers = ret
            else:
                prots = ret

            if not isinstance(prots, list):
                prots = [prots]

                if header:
                    headers = [headers]
                    ret = (prots, headers)
                else:
                    ret = prots
                    
            LOGGER.info('Extracting domains...')
            for i in range(len(prots)):
                sel = prots[i].select(selstrs[i])
                prots[i] = sel
        LOGGER.report('CATH domains are parsed and extracted in %.2fs', '_cath_parsePDB')

        return ret


class CATHCollection(CATHElement):
    """"This class handles collections of elements from the CATH tree 
    including the listing of names and IDs."""
    def __init__(self, items, element=None):
        if element is not None:
            tag = element.tag
            attrib = element.attrib
        else:
            tag = 'cath'
            attrib = {}

        super(CATHCollection, self).__init__(tag=tag, attrib=attrib)

        if not isListLike(items):
            items = [items]

        parents = []
        for item in items:
            self.append(item)
            parents.append(item.parent)

        uniq_parents = set(parents)
        if len(uniq_parents) == 1:
            self._parent = parents[0]

    def __repr__(self):
        names = []
        n = 0
        for child in self:
            n += 1
            if n > 100:
                names.append('...')
                break
            else:
                names.append(repr(child))

        return '<CATHCollection:\n[%s]>'%'\n'.join(names)

    def isDomain(self):
        return False

    def getCATH(self):
        cath = []
        for child in self:
            cath.append(child.attrib['cath'])
        return cath

    cath = property(getCATH)

    def getName(self):
        names = []
        for child in self:
            names.append(child.name)
        return names

    name = property(getName)

    def getID(self):
        tags = []
        for child in self:
            tags.append(child.tag)
        return tags
    
    id = property(getID)


[docs]class CATHDB(ET.ElementTree):
    """This class is for handing the data in the CATH database. 
    It facilitates tree navigation and ID-based searching."""

    def __init__(self, source=CATH_URL):
        """The class is initialised using data from *source*, which 
        is downloaded from the CATH website by default, but can 
        also be parsed from an XML file."""
        self.root = CATHElement('root')
        super(CATHDB, self).__init__(self.root)
        self._source = source
        self._raw_names = None
        self._raw_domains = None
        self._raw_boundaries = None
        self._map = {'root': self.root}

        # this property is only for debug purposes and will not be copied
        self._unclassified = {}

        if source is not None:
            self.update()
    
    def __repr__(self):
        return "<CATHDB: {0} members>".format(len(self.root))

    def reset(self):
        # by removing the immediate children of root, 
        # all the descendants will be removed
        for child in self.root:
            self.root.remove(child)

        self._unclassified = {}

[docs]    def update(self, source=None):
        """Update data and files from CATH."""

        self._source = source = self._source or source
        self.reset()
        if source is None:
            return

        LOGGER.timeit('_cath_update')
        
        type_ = 0
        tree = None
        if isinstance(source, str):
            if isfile(source):
                type_ = 1
            elif isURL(source):
                type_ = 0
            else:
                type_ = 2
        elif hasattr(source, 'read'):
                type_ = 1
        else:
            raise TypeError('source must be either an url, file name, file handle, '
                            'or text in xml format')

        if type_ == 0:
            LOGGER.info('Fetching data from CATH...')
            self._fetch()

            LOGGER.info('Parsing CATH files...')
            self._parse()
        elif type_ == 1:
            LOGGER.info('Reading data from the local xml file...')
            tree = ET.parse(source)
        elif type_ == 2:
            LOGGER.info('Parsing input string...')
            tree = ET.fromstring(source)

        # post-processing
        if type_ > 0:
            root = tree.getroot()
            nodes = root.iter()

            # remove prefix from node tags
            for node in nodes:
                node.tag = node.tag.lstrip('id.')

            # convert int to str
            length_nodes = root.findall('.//*[@length]')
            for node in length_nodes:
                node.attrib['length'] = int(node.attrib['length'])
            
            copy2(root, self.root)
            self._update_map()

        LOGGER.report('CATH local database built in %.2fs.', '_cath_update')

    def _update_map(self, parent=None):
        if parent is None:
            parent = self.root
        for child in parent:
            self._map[child.id] = child
            self._update_map(child)

    def copy(self):
        db = CATHDB(source=None)
        copy2(self.root, db.root)

        db._map = self._map
        db._source = self._source
        db._raw_names = self._raw_names
        db._raw_domains = self._raw_domains
        db._raw_boundaries = self._raw_boundaries
        return db

[docs]    def save(self, filename='cath.xml'):
        """Write local CATH database to an XML file. *filename* can either be a 
        file name or a handle."""

        LOGGER.timeit('_cath_write')

        if not isinstance(filename, str):
            try:
                fn = filename.name
            except AttributeError:
                fn = repr(filename)
            f = filename
        else:
            fn = filename

        LOGGER.info('Writing data to {0}...'.format(fn))

        if not len(self.root):
            raise ValueError('local database has not been built, '
                             'please call update() first')

        tree = self.copy()
        root = tree.getroot()

        # convert int to str
        length_nodes = root.findall('.//*[@length]')
        for node in length_nodes:
            node.attrib['length'] = str(node.attrib['length'])
        
        # add prefix to node tags
        nodes = root.iter()
        for node in nodes:
            node.tag = 'id.' + node.tag

        # add indentation to nodes
        indentElement(root)
        
        if isinstance(filename, str):
            f = open(filename, 'wb')
        tree.write(f, encoding='utf-8')
        f.close()

        LOGGER.report('CATH local database saved in %.2fs.', '_cath_write')

[docs]    def find(self, identifier='root'):
        """Find an :class:`xml.etree.Element` object given *identifier*."""

        if len(self.root) is None:
            raise ValueError('local database has not been built, '
                             'please call update() first')
        if identifier != 'root':
            try:
                data = self._map[identifier]
            except KeyError:
                iter = self.iter(identifier)
                try:
                    data = next(iter)
                except StopIteration:
                    data = None
        else:
            data = self.root
        return data

    def _fetch(self):
        """Download CATH files via HTTP."""

        try:
            response = urllib2.urlopen(CATH_URL + CATH_NAME_PATH)
            name_data = response.read()
        except Exception as error:
            raise type(error)('HTTP connection problem when extracting CATH names: '\
                              + repr(error))

        try:
            response = urllib2.urlopen(CATH_URL + CATH_DOMAIN_PATH)
            domain_data = response.read()
        except Exception as error:
            raise type(error)('HTTP connection problem when extracting domain information: '\
                              + repr(error))

        try:
            response = urllib2.urlopen(CATH_URL + CATH_BOUNDARY_PATH)
            boundary_data = response.read()
        except Exception as error:
            raise type(error)('HTTP connection problem when extracting domain boundaries: '\
                              + repr(error))

        #if PY3K:
        name_data = name_data.decode('utf-8')
        domain_data = domain_data.decode('utf-8')
        boundary_data = boundary_data.decode('utf-8')

        self._raw_names = name_data
        self._raw_domains = domain_data
        self._raw_boundaries = boundary_data
        
    def _parse(self):
        """Parse CATH files into a :class:`~xml.etree.ElementTree` structure. 
        The leaf nodes are tagged by domain names, e.g. 3hgnA02, and other nodes 
        are tagged by their full CATH identifiers, e.g. 2.40.10.10. The parsing 
        process assumes the domain file is organized in a topdown fashion, i.e. 
        1 -> 1.20 -> 1.20.120 -> 1.20.120.1000. """
        
        name_data = self._raw_names 
        domain_data = self._raw_domains
        boundary_data = self._raw_boundaries

        root = self.root

        # parsing the name file
        lines = name_data.splitlines()
        for line in lines:
            text = line.strip()
            if not text:
                continue
            if text.startswith('#'):
                continue
            items = text.split('    ')

            cath_id, domain_id, cath_name = items
            #pdb_id = domain_id[:4]
            #chain = domain_id[4]

            level = getLevel(cath_id)
            attrib = {'cath': cath_id,
                      'rep': domain_id,
                      'name': cath_name,}
                      #'pdb': pdb_id,
                      #'chain': chain}
            
            if level == 1:
                node = CATHElement(cath_id, attrib=attrib, parent=root)
                root.append(node)
                self._map[cath_id] = node
            else:
                parent_id = getParent(cath_id)
                #parent = findNodeByCATH(root, parent_id)
                parent = self.find(parent_id)
                if parent is None:
                    LOGGER.warn('error encountered when building the tree: {0}'
                                .format(cath_id))
                    continue
                node = CATHElement(cath_id, attrib=attrib, parent=parent)
                parent.append(node)
                self._map[cath_id] = node

        # parsing the domain file
        lines = domain_data.splitlines()
        for line in lines:
            # Column 1:  CATH domain name (seven characters)
            # Column 2:  Class number
            # Column 3:  Architecture number
            # Column 4:  Topology number
            # Column 5:  Homologous superfamily number
            # Column 6:  S35 sequence cluster number
            # Column 7:  S60 sequence cluster number
            # Column 8:  S95 sequence cluster number
            # Column 9:  S100 sequence cluster number
            # Column 10: S100 sequence count number
            # Column 11: Domain length
            # Column 12: Structure resolution (Angstroms)
            #            (999.000 for NMR structures and 1000.000 for obsolete PDB entries)
            text = line.strip()
            if not text:
                continue
            if text.startswith('#'):
                continue
            items = text.split()

            domain_id = items[0]
            pdb_id = domain_id[:4]
            chain = domain_id[4]
            if chain == '0':
                chain = ''
            
            cath_id = '.'.join(items[1:5])
            #S35, S60, S95, S100 = items[5:9]
            length = int(items[10])
            #resolution = float(items[11])

            attrib = {'cath': cath_id,
                      'pdb': pdb_id,
                      'chain': chain,
                      'length': length,
                      'range': ''}

            #parent = findNodeByCATH(root, cath_id)
            parent = self.find(cath_id)
            if parent is None:
                LOGGER.warn('error encountered when assigning domains: {0}'
                            .format(domain_id))
                continue
            node = CATHElement(domain_id, attrib=attrib, parent=parent)
            parent.append(node)
            self._map[domain_id] = node

        # parsing the boundary file
        lines = boundary_data.splitlines()
        for line in lines:
            # Example: 3hgnA02	13-111,228-240
            text = line.strip()
            if not text:
                continue
            if text.startswith('#'):
                continue
            items = text.split()

            domain_id, resrange = items

            if domain_id in self._map:
                node = self.find(domain_id)
            else:
                self._unclassified[domain_id] = resrange
                continue

            if node is None:
                LOGGER.warn('error encountered when assigning domain boundaries: {0}'
                            .format(domain_id))
                continue
            node.attrib['range'] = resrange
    
[docs]    def search(self, identifier):
        """Search the *identifier* against the local CATH database and return a list 
        of :class:`~xml.etree.ElementTree` instances. *identifier* can be a 4-digit 
        PDB ID plus an 1-digit chain ID (optional), 7-digit domain ID, or CATH ID."""

        ret = None
        if isCATH(identifier):
            ret = self.find(identifier)
        elif isDomain(identifier):
            identifier = identifier[:4].lower() + identifier[-3:]
            ret = self.find(identifier)
        elif isPDB(identifier):
            pdb_id = identifier[:4].lower()
            chain = identifier[4:]
            for gap in ' -_:':
                chain = chain.replace(gap, '')
            
            root = self.find()
            nodes = root.findall('.//*[@pdb="%s"]'%pdb_id)
            if chain.strip() != '':
                for i in range(len(nodes)-1, -1, -1):
                    node = nodes[i]
                    if node.attrib['chain'] != chain:
                        del nodes[i]
            ret = nodes
        else:
            raise ValueError('identifier can be a 4-digit PDB ID plus 1-digit chain ID '
                             '(optional), 7-digit domain ID, or CATH ID.')
        
        if ret is not None:
            if isinstance(ret, list):
                ret = CATHCollection(ret)
        return ret

# Note: all the following functions should be kept hidden from users
def splitCATH(cath_id, cumul=False):
    fields = cath_id.split('.')

    if cumul:
        fields_ = list(fields)
        for i in range(len(fields)):
            fields_[i] = '.'.join(fields[:i+1])
        fields = fields_
    return fields

def findNodeByCATH(root, cath_id):
    fields = splitCATH(cath_id, cumul=True)

    node = root
    for field in fields:
        node = node.find(field)
        if node is None:
            return None
    return node

def getParent(cath_id):
    fields = splitCATH(cath_id)
    parent = '.'.join(f for f in fields[:-1])
    return parent

def getLevel(cath_id):
    return len(splitCATH(cath_id))

def isLeaf(cath_id):
    level = getLevel(cath_id)

    if level > 4:
        raise ValueError('invalid cath_id: {0}'.format(cath_id))

    return level == 4

def copy2(a, b):
    """Copy a tree structure from *a* to *b*"""
    for c in a:
        node = CATHElement(c.tag, c.attrib, b)
        b.append(node)
        copy2(c, node)

def range2selstr(rangestr):
    if rangestr.strip() == '':
        return None
    frags = rangestr.split(',')
    sels = []
    for frag in frags:
        try:
            fromtos = frag.split('-')
            if len(fromtos) == 2:
                fro, to = fromtos
            else:
                LOGGER.warn('range "%s" is irregular'%rangestr)
                fro = '1'
                to = fromtos[-1]
            fro_num = intResnum(fro)
            to_num = intResnum(to)

            if fro_num > to_num:
                LOGGER.warn('range "%s" is irregular'%rangestr)
                to_num = fro_num
                fro_num = 1
            fro = str(fro_num)
            to = str(to_num)
        except ValueError:
            print('error occurred when parsing "%s"'%rangestr)
            continue
        sels.append('resindex %s to %s'%(fro, to))
    selstr = ' or '.join(sels)
    return selstr

def intResnum(resnum):
    match = re.findall('[0-9\\-]+', resnum)
    num = None
    if match:
        num = int(match[0])
    return num