Source code for prody.proteins.pdbfile

# -*- coding: utf-8 -*-
"""This module defines functions for parsing and writing `PDB files`_.

.. _PDB files: http://www.wwpdb.org/documentation/format32/v3.2.html"""

from collections import defaultdict
import os.path
import time
from numbers import Integral

import numpy as np

from prody.atomic import AtomGroup, Atom, Selection
from prody.atomic import flags
from prody.atomic import ATOMIC_FIELDS
from prody.utilities import openFile, isListLike
from prody.utilities.misctools import decToHybrid36, hybrid36ToDec, packmolRenumChains
from prody import LOGGER, SETTINGS

from .header import getHeaderDict, buildBiomolecules, assignSecstr, isHelix, isSheet
from .localpdb import fetchPDB
from .ciffile import parseMMCIF
from .emdfile import parseEMD

__all__ = ['parsePDBStream', 'parsePDB', 'parseChainsList', 'parsePQR',
           'writePDBStream', 'writePDB', 'writeChainsList', 'writePQR',
           'writePQRStream']

MAX_N_ATOM = 99999 
MAX_N_RES = 9999

class PDBParseError(Exception):
    pass

_parsePQRdoc = """
    :arg title: title of the :class:`.AtomGroup` instance, default is the
        PDB filename or PDB identifier
    :type title: str

    :arg ag: :class:`.AtomGroup` instance for storing data parsed from PDB
        file, number of atoms in *ag* and number of atoms parsed from the PDB
        file must be the same and atoms in *ag* and those in PDB file must
        be in the same order.  Non-coordinate data stored in *ag* will be
        overwritten with those parsed from the file.
    :type ag: :class:`.AtomGroup`

    :arg chain: chain identifiers for parsing specific chains, e.g.
        ``chain='A'``, ``chain='B'``, ``chain='DE'``, by default all
        chains are parsed
    :type chain: str

    :arg subset: a predefined keyword to parse subset of atoms, valid keywords
        are ``'calpha'`` (``'ca'``), ``'backbone'`` (``'bb'``), or **None**
        (read all atoms), e.g. ``subset='bb'``
    :type subset: str
"""

_parsePDBdoc = _parsePQRdoc + """
    :arg model: model index or None (read all models), e.g. ``model=10``
    :type model: int, list

    :arg header: if **True** PDB header content will be parsed and returned
    :type header: bool

    :arg altloc: if a location indicator is passed, such as ``'A'`` or ``'B'``,
         only indicated alternate locations will be parsed as the single
         coordinate set of the AtomGroup,  if *altloc* is set **True** all
         alternate locations will be parsed and each will be appended as a
         distinct coordinate set, default is ``"A"``
    :type altloc: str

    :arg biomol: if **True**, biomolecules are obtained by transforming the
        coordinates using information from header section will be returned.
        This option uses :func:`.buildBiomolecules` and as noted there, 
        atoms in biomolecules are ordered according to the original chain 
        IDs. Chains may have the same chain ID, in which case they are given 
        different segment names.
        Default is **False**
    :type biomol: bool

    :arg secondary: if **True**, secondary structure information from header
        section will be assigned to atoms.
        Default is **False**
    :type secondary: bool

    If ``model=0`` and ``header=True``, return header dictionary only.

    """

_PDBSubsets = {'ca': 'ca', 'calpha': 'ca', 'bb': 'bb', 'backbone': 'bb'}

[docs]def parsePDB(*pdb, **kwargs): """Returns an :class:`.AtomGroup` and/or dictionary containing header data parsed from a PDB file. This function extends :func:`.parsePDBStream`. See :ref:`parsepdb` for a detailed usage example. :arg pdb: one PDB identifier or filename, or a list of them. If needed, PDB files are downloaded using :func:`.fetchPDB()` function. You can also provide arguments that you would like passed on to fetchPDB(). :arg extend_biomol: whether to extend the list of results with a list rather than appending, which can create a mixed list, especially when biomol=True. Default value is False to reproduce previous behaviour. This value is ignored when result is not a list (header=True or model=0). :type extend_biomol: bool """ extend_biomol = kwargs.pop('extend_biomol', False) n_pdb = len(pdb) if n_pdb == 0: raise ValueError('Please provide a PDB ID or filename') if n_pdb == 1: if isListLike(pdb[0]) or isinstance(pdb[0], dict): pdb = pdb[0] n_pdb = len(pdb) if n_pdb == 1: return _parsePDB(pdb[0], **kwargs) else: results = [] lstkwargs = {} for key in kwargs: argval = kwargs.get(key) if np.isscalar(argval): argval = [argval]*n_pdb lstkwargs[key] = argval start = time.time() LOGGER.progress('Retrieving {0} PDB structures...' .format(n_pdb), n_pdb, '_prody_parsePDB') for i, p in enumerate(pdb): kwargs = {} for key in lstkwargs: kwargs[key] = lstkwargs[key][i] c = kwargs.get('chain','') LOGGER.update(i, 'Retrieving {0}...'.format(p+c), label='_prody_parsePDB') result = _parsePDB(p, **kwargs) if not isinstance(result, tuple): if isinstance(result, dict): result = (None, result) else: result = (result, None) results.append(result) results = list(zip(*results)) LOGGER.finish() for i in reversed(range(len(results))): if all(j is None for j in results[i]): results.pop(i) if len(results) == 1: results = results[0] results = list(results) model = kwargs.get('model') header = kwargs.get('header', False) if model != 0 and header: numPdbs = len(results[0]) else: numPdbs = len(results) if extend_biomol: results_old = results results = [] for entry in results_old: if isinstance(entry, AtomGroup): results.append(entry) else: results.extend(entry) LOGGER.info('{0} PDBs were parsed in {1:.2f}s.' .format(numPdbs, time.time()-start)) return results
def _getPDBid(pdb): l = len(pdb) if l == 4: pdbid, chain = pdb, '' elif l == 5: pdbid = pdb[:4]; chain = pdb[-1] elif ':' in pdb: i = pdb.find(':') pdbid = pdb[:i]; chain = pdb[i+1:] else: raise IOError('{0} is not a valid filename or a valid PDB ' 'identifier.'.format(pdb)) if not pdbid.isalnum(): raise IOError('{0} is not a valid filename or a valid PDB ' 'identifier.'.format(pdb)) if chain != '' and not chain.isalnum(): raise IOError('{0} is not a valid chain identifier.'.format(chain)) return pdbid, chain def _parsePDB(pdb, **kwargs): title = kwargs.get('title', None) chain = '' if not os.path.isfile(pdb): pdb, chain = _getPDBid(pdb) if title is None: title = pdb kwargs['title'] = title filename = fetchPDB(pdb, **kwargs) if filename is None: try: LOGGER.info("Trying to parse mmCIF file instead") return parseMMCIF(pdb+chain, **kwargs) except: try: LOGGER.info("Trying to parse EMD file instead") return parseEMD(pdb+chain, **kwargs) except: raise IOError('PDB file for {0} could not be downloaded.' .format(pdb)) pdb = filename if title is None: title, ext = os.path.splitext(os.path.split(pdb)[1]) if ext == '.gz': title, ext = os.path.splitext(title) if len(title) == 7 and title.startswith('pdb'): title = title[3:] kwargs['title'] = title pdb = openFile(pdb, 'rt') if chain != '': kwargs['chain'] = chain result = parsePDBStream(pdb, **kwargs) pdb.close() return result parsePDB.__doc__ += _parsePDBdoc
[docs]def parsePDBStream(stream, **kwargs): """Returns an :class:`.AtomGroup` and/or dictionary containing header data parsed from a stream of PDB lines. :arg stream: Anything that implements the method ``readlines`` (e.g. :class:`file`, buffer, stdin)""" model = kwargs.get('model') header = kwargs.get('header', False) assert isinstance(header, bool), 'header must be a boolean' chain = kwargs.get('chain') subset = kwargs.get('subset') altloc = kwargs.get('altloc', 'A') packmol = kwargs.get('packmol', False) auto_bonds = SETTINGS.get('auto_bonds') get_bonds = kwargs.get('bonds', auto_bonds) if model is not None: if isinstance(model, Integral): if model < 0: raise ValueError('model must be greater than 0') else: raise TypeError('model must be an integer, {0} is invalid' .format(str(model))) title_suffix = '' if subset: try: subset = _PDBSubsets[subset.lower()] except AttributeError: raise TypeError('subset must be a string') except KeyError: raise ValueError('{0} is not a valid subset' .format(repr(subset))) title_suffix = '_' + subset if chain is not None: if not isinstance(chain, str): raise TypeError('chain must be a string') elif len(chain) == 0: raise ValueError('chain must not be an empty string') title_suffix = chain + title_suffix ag = kwargs.pop('ag', None) if ag is not None: if not isinstance(ag, AtomGroup): raise TypeError('ag must be an AtomGroup instance') n_csets = ag.numCoordsets() elif model != 0: ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix) n_csets = 0 biomol = kwargs.get('biomol', False) auto_secondary = SETTINGS.get('auto_secondary') secondary = kwargs.get('secondary', auto_secondary) split = 0 hd = None if model != 0: LOGGER.timeit() try: lines = stream.readlines() except AttributeError as err: try: lines = stream.read().split('\n') except AttributeError: raise err if not len(lines): raise ValueError('empty PDB file or stream') if header or biomol or secondary: hd, split = getHeaderDict(lines) bonds = [] if get_bonds else None _parsePDBLines(ag, lines, split, model, chain, subset, altloc, bonds=bonds) if bonds: try: ag.setBonds(bonds) except ValueError: LOGGER.warn('Bonds read from CONECT records do not apply to subset so were not added') if ag.numAtoms() > 0: LOGGER.report('{0} atoms and {1} coordinate set(s) were ' 'parsed in %.2fs.'.format(ag.numAtoms(), ag.numCoordsets() - n_csets)) else: ag = None LOGGER.warn('Atomic data could not be parsed, please ' 'check the input file.') elif header: hd, split = getHeaderDict(stream) if ag is not None and isinstance(hd, dict): if secondary: if auto_secondary: try: ag = assignSecstr(hd, ag) except ValueError: pass else: ag = assignSecstr(hd, ag) if biomol: ag = buildBiomolecules(hd, ag) if isinstance(ag, list): LOGGER.info('Biomolecular transformations were applied, {0} ' 'biomolecule(s) are returned.'.format(len(ag))) else: LOGGER.info('Biomolecular transformations were applied to the ' 'coordinate data.') if packmol: ag = packmolRenumChains(ag) if model != 0: if header: return ag, hd else: return ag else: return hd
parsePDBStream.__doc__ += _parsePDBdoc
[docs]def parsePQR(filename, **kwargs): """Returns an :class:`.AtomGroup` containing data parsed from PDB lines. :arg filename: a PQR filename :type filename: str""" title = kwargs.get('title', kwargs.get('name')) chain = kwargs.get('chain') subset = kwargs.get('subset') if not os.path.isfile(filename): raise IOError('No such file: {0}'.format(repr(filename))) if title is None: fn, ext = os.path.splitext(os.path.split(filename)[1]) if ext == '.gz': fn, ext = os.path.splitext(fn) title = fn.lower() title_suffix = '' if subset: try: subset = _PDBSubsets[subset.lower()] except AttributeError: raise TypeError('subset must be a string') except KeyError: raise ValueError('{0} is not a valid subset' .format(repr(subset))) title_suffix = '_' + subset if chain is not None: if not isinstance(chain, str): raise TypeError('chain must be a string') elif len(chain) == 0: raise ValueError('chain must not be an empty string') title_suffix = '_' + chain + title_suffix if 'ag' in kwargs: ag = kwargs['ag'] if not isinstance(ag, AtomGroup): raise TypeError('ag must be an AtomGroup instance') n_csets = ag.numCoordsets() else: ag = AtomGroup(title + title_suffix) n_csets = 0 pqr = openFile(filename, 'rt') lines = pqr.readlines() pqr.close() LOGGER.timeit() ag = _parsePDBLines(ag, lines, split=0, model=1, chain=chain, subset=subset, altloc_torf=False, format='pqr') if ag.numAtoms() > 0: LOGGER.report('{0} atoms and {1} coordinate sets were ' 'parsed in %.2fs.'.format(ag.numAtoms(), ag.numCoordsets() - n_csets)) return ag else: return None
parsePQR.__doc__ += _parsePQRdoc def _parsePDBLines(atomgroup, lines, split, model, chain, subset, altloc_torf, format='PDB', bonds=None): """Returns an AtomGroup. See also :func:`.parsePDBStream()`. :arg lines: PDB/PQR lines :arg split: starting index for coordinate data lines""" format = format.upper() isPDB = format == 'PDB' if subset: if subset == 'ca': subset = set(('CA',)) elif subset in 'bb': subset = flags.BACKBONE only_subset = True protein_resnames = flags.AMINOACIDS else: only_subset = False if chain is None: only_chains = False else: only_chains = True onlycoords = False n_atoms = atomgroup.numAtoms() if n_atoms > 0: asize = n_atoms else: asize = len(lines) - split addcoords = False if atomgroup.numCoordsets() > 0: addcoords = True alength = asize coordinates = np.zeros((asize, 3), dtype=float) atomnames = np.zeros(asize, dtype=ATOMIC_FIELDS['name'].dtype) resnames = np.zeros(asize, dtype=ATOMIC_FIELDS['resname'].dtype) resnums = np.zeros(asize, dtype=ATOMIC_FIELDS['resnum'].dtype) chainids = np.zeros(asize, dtype=ATOMIC_FIELDS['chain'].dtype) hetero = np.zeros(asize, dtype=bool) termini = np.zeros(asize, dtype=bool) altlocs = np.zeros(asize, dtype=ATOMIC_FIELDS['altloc'].dtype) icodes = np.zeros(asize, dtype=ATOMIC_FIELDS['icode'].dtype) serials = np.zeros(asize, dtype=ATOMIC_FIELDS['serial'].dtype) charges = np.zeros(asize, dtype=ATOMIC_FIELDS['charge'].dtype) if isPDB: segnames = np.zeros(asize, dtype=ATOMIC_FIELDS['segment'].dtype) elements = np.zeros(asize, dtype=ATOMIC_FIELDS['element'].dtype) bfactors = np.zeros(asize, dtype=ATOMIC_FIELDS['beta'].dtype) occupancies = np.zeros(asize, dtype=ATOMIC_FIELDS['occupancy'].dtype) anisou = None siguij = None else: radii = np.zeros(asize, dtype=ATOMIC_FIELDS['radius'].dtype) asize = 2000 # increase array length by this much when needed start = split stop = len(lines) nmodel = 0 # if a specific model is requested, skip lines until that one if isPDB and model is not None and model != 1: for i in range(split, len(lines)): if lines[i][:5] == 'MODEL': nmodel += 1 if model == nmodel: start = i+1 stop = len(lines) break if nmodel != model: raise PDBParseError('model {0} is not found'.format(model)) if isinstance(altloc_torf, str): if altloc_torf.strip() != 'A': LOGGER.info('Parsing alternate locations {0}.' .format(altloc_torf)) which_altlocs = ' ' + ''.join(altloc_torf.split()) else: which_altlocs = ' A' altloc_torf = False else: which_altlocs = ' A' altloc_torf = True acount = 0 coordsets = None altloc = defaultdict(list) i = start END = False warned_5_digit = False dec = True while i < stop: line = lines[i] if not isPDB: fields = line.split() if len(fields) == 10: fields.insert(4, '') elif len(fields) != 11: LOGGER.warn('wrong number of fields for PQR format at line %d'%i) i += 1 continue if isPDB: startswith = line[0:6].strip() else: startswith = fields[0] if startswith == 'ATOM' or startswith == 'HETATM': if isPDB: atomname = line[12:16].strip() resname = line[17:21].strip() else: atomname= fields[2] resname = fields[3] if only_subset: if not (atomname in subset and resname in protein_resnames): i += 1 continue if isPDB: chid = line[21] else: chid = fields[4] if only_chains: if not chid in chain: i += 1 continue if isPDB: alt = line[16] if alt not in which_altlocs: altloc[alt].append((line, i)) i += 1 continue else: alt = ' ' try: if isPDB: coordinates[acount, 0] = line[30:38] coordinates[acount, 1] = line[38:46] coordinates[acount, 2] = line[46:54] else: coordinates[acount, 0] = fields[6] coordinates[acount, 1] = fields[7] coordinates[acount, 2] = fields[8] except: if acount >= n_atoms > 0: if nmodel == 0: raise ValueError(format + 'file and AtomGroup ag must ' 'have same number of atoms') LOGGER.warn('Discarding model {0}, which contains {1} more ' 'atoms than first model does.' .format(nmodel+1,acount-n_atoms+1)) acount = 0 nmodel += 1 coordinates = np.zeros((n_atoms, 3), dtype=float) if isPDB: while lines[i][:6] != 'ENDMDL': i += 1 else: raise PDBParseError('invalid or missing coordinate(s) at ' 'line {0}'.format(i+1)) if onlycoords: acount += 1 i += 1 continue serial_str = line[6:11] if isPDB else fields[1] try: serials[acount] = int(serial_str) except ValueError: try: isnumeric = np.alltrue([x.isdigit() for x in serial_str]) if not isnumeric and serial_str == serial_str.upper(): serials[acount] = hybrid36ToDec(serial_str) else: # lower case is found in hexadecimal PDB files serials[acount] = int(serial_str, 16) except ValueError: if acount > 0: LOGGER.warn('failed to parse serial number in line {0}. Assigning it by incrementing.' .format(i)) serials[acount] = serials[acount-1]+1 else: LOGGER.warn('failed to parse serial number in line {0}. Assigning it as 1.' .format(i)) serials[acount] = 1 altlocs[acount] = alt atomnames[acount] = atomname resnames[acount] = resname chainids[acount] = chid if isPDB: resnum_str = line[22:26] if dec: try: resnum = int(resnum_str) except ValueError: dec = False icode = line[26] if icode.isdigit() and dec: if not warned_5_digit: LOGGER.warn('parsed 5 digit residue number including numeric insertion code') warned_5_digit = True resnum = int(str(resnum) + icode) else: icodes[acount] = icode if dec and acount > 2 and resnums[acount-2] > resnum and resnums[acount-2] >= MAX_N_RES: dec = False if not dec: resnum = resnum_str try: isnumeric = np.alltrue([x.isdigit() for x in resnum_str]) if not isnumeric and resnum_str == resnum_str.upper(): resnum = hybrid36ToDec(resnum_str, resnum=True) else: # lower case is found in hexadecimal PDB files resnum = int(resnum_str, 16) except ValueError: if acount > 0: LOGGER.warn('failed to parse residue number in line {0}. Assigning it by incrementing.' .format(i)) resnum = resnums[acount-1]+1 else: LOGGER.warn('failed to parse residue number in line {0}. Assigning it as 1.' .format(i)) resnum = 1 resnums[acount] = resnum else: resnum = fields[5] if resnum[-1].isalpha(): icode = resnum[-1] else: icode = ' ' try: resnums[acount] = int(resnum) except ValueError: try: resnums[acount] = int(resnum, 16) except ValueError: LOGGER.warn('failed to parse residue number in line {0}. Assigning it by incrementing.' .format(i)) resnums[acount] = resnums[acount-1]+1 icodes[acount] = icode if isPDB: try: occupancies[acount] = line[54:60] except: LOGGER.warn('failed to parse occupancy at line {0}' .format(i)) try: bfactors[acount] = line[60:66] except: LOGGER.warn('failed to parse beta-factor at line {0}' .format(i)) hetero[acount] = startswith[0] == 'H' segnames[acount] = line[72:76] elements[acount] = line[76:78] try: charges[acount] = int(line[79] + line[78]) except: charges[acount] = 0 else: try: charges[acount] = fields[9] except: LOGGER.warn('failed to parse charge at line {0}' .format(i)) try: radii[acount] = fields[10] except: LOGGER.warn('failed to parse radius at line {0}' .format(i)) acount += 1 if n_atoms == 0 and acount >= alength: # if arrays are short extend them with zeros alength += asize coordinates = np.concatenate( (coordinates, np.zeros((asize, 3), float))) atomnames = np.concatenate((atomnames, np.zeros(asize, ATOMIC_FIELDS['name'].dtype))) resnames = np.concatenate((resnames, np.zeros(asize, ATOMIC_FIELDS['resname'].dtype))) resnums = np.concatenate((resnums, np.zeros(asize, ATOMIC_FIELDS['resnum'].dtype))) chainids = np.concatenate((chainids, np.zeros(asize, ATOMIC_FIELDS['chain'].dtype))) hetero = np.concatenate((hetero, np.zeros(asize, bool))) termini = np.concatenate((termini, np.zeros(asize, bool))) altlocs = np.concatenate((altlocs, np.zeros(asize, ATOMIC_FIELDS['altloc'].dtype))) icodes = np.concatenate((icodes, np.zeros(asize, ATOMIC_FIELDS['icode'].dtype))) serials = np.concatenate((serials, np.zeros(asize, ATOMIC_FIELDS['serial'].dtype))) if isPDB: bfactors = np.concatenate((bfactors, np.zeros(asize, ATOMIC_FIELDS['beta'].dtype))) occupancies = np.concatenate((occupancies, np.zeros(asize, ATOMIC_FIELDS['occupancy'].dtype))) segnames = np.concatenate((segnames, np.zeros(asize, ATOMIC_FIELDS['segment'].dtype))) elements = np.concatenate((elements, np.zeros(asize, ATOMIC_FIELDS['element'].dtype))) if anisou is not None: anisou = np.concatenate((anisou, np.zeros((asize, 6), ATOMIC_FIELDS['anisou'].dtype))) if siguij is not None: siguij = np.concatenate((siguij, np.zeros((asize, 6), ATOMIC_FIELDS['siguij'].dtype))) else: charges = np.concatenate((charges, np.zeros(asize, ATOMIC_FIELDS['charge'].dtype))) radii = np.concatenate((radii, np.zeros(asize, ATOMIC_FIELDS['radius'].dtype))) elif startswith == 'CONECT': if bonds is not None: atom_serial = line[6:11] bonded1_serial = line[11:16] bonds.append([int(atom_serial), int(bonded1_serial)]) bonded2_serial = line[16:21] if len(bonded2_serial.strip()): bonds.append([int(atom_serial), int(bonded2_serial)]) bonded3_serial = line[21:26] if len(bonded3_serial.strip()): bonds.append([int(atom_serial), int(bonded3_serial)]) bonded4_serial = line[27:31] if len(bonded4_serial.strip()): bonds.append([int(atom_serial), int(bonded4_serial)]) elif not onlycoords and (startswith == 'TER ' or startswith.strip() == 'TER'): termini[acount - 1] = True elif startswith == 'ENDMDL' or startswith[:3] == 'END': if acount == 0: # If there is no atom record between ENDMDL & END skip to next i += 1 continue if model is not None: if bonds is None: i += 1 break else: i += 1 for j in range(i, stop): if lines[j].startswith('CONECT'): i = j break continue diff = stop - i - 1 END = diff < acount if coordsets is not None: END = END or nmodel >= coordsets.shape[0] if onlycoords: if acount < n_atoms: LOGGER.warn('Discarding model {0}, which contains ' '{1} fewer atoms than the first model ' 'does.'.format(nmodel+1, n_atoms-acount)) else: coordsets[nmodel] = coordinates nmodel += 1 acount = 0 if not END: coordinates = coordsets[nmodel] else: if acount != n_atoms > 0: raise ValueError('PDB file and AtomGroup ag must have ' 'same number of atoms') # this is where to decide if more coordsets should be expected if END: coordinates.resize((acount, 3), refcheck=False) if addcoords: atomgroup.addCoordset(coordinates) else: atomgroup._setCoords(coordinates) else: coordsets = np.zeros((int(diff//acount+1), acount, 3)) coordsets[0] = coordinates[:acount] onlycoords = True atomnames.resize(acount, refcheck=False) resnames.resize(acount, refcheck=False) resnums.resize(acount, refcheck=False) chainids.resize(acount, refcheck=False) hetero.resize(acount, refcheck=False) termini.resize(acount, refcheck=False) altlocs.resize(acount, refcheck=False) icodes.resize(acount, refcheck=False) serials.resize(acount, refcheck=False) if not only_subset: atomnames = np.char.strip(atomnames) resnames = np.char.strip(resnames) atomgroup.setNames(atomnames) atomgroup.setResnames(resnames) atomgroup.setResnums(resnums) atomgroup.setChids(chainids) atomgroup.setFlags('hetatm', hetero) atomgroup.setFlags('pdbter', termini) atomgroup.setAltlocs(altlocs) atomgroup.setIcodes(np.char.strip(icodes)) atomgroup.setSerials(serials) if isPDB: bfactors.resize(acount, refcheck=False) occupancies.resize(acount, refcheck=False) segnames.resize(acount, refcheck=False) elements.resize(acount, refcheck=False) atomgroup.setBetas(bfactors) atomgroup.setOccupancies(occupancies) atomgroup.setSegnames(np.char.strip(segnames)) atomgroup.setElements(np.char.strip(elements)) from prody.utilities.misctools import getMasses atomgroup.setMasses(getMasses(np.char.strip(elements))) if anisou is not None: anisou.resize((acount, 6), refcheck=False) atomgroup.setAnisous(anisou / 10000) if siguij is not None: siguij.resize((acount, 6), refcheck=False) atomgroup.setAnistds(siguij / 10000) else: radii.resize(acount, refcheck=False) atomgroup.setRadii(radii) charges.resize(acount, refcheck=False) atomgroup.setCharges(charges) nmodel += 1 n_atoms = acount acount = 0 coordinates = np.zeros((n_atoms, 3), dtype=float) if altloc and altloc_torf: _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames) altloc = defaultdict(list) if END: break elif isPDB and startswith == 'ANISOU': if anisou is None: anisou = True anisou = np.zeros((alength, 6), dtype=ATOMIC_FIELDS['anisou'].dtype) try: index = acount - 1 anisou[index, 0] = line[28:35] anisou[index, 1] = line[35:42] anisou[index, 2] = line[43:49] anisou[index, 3] = line[49:56] anisou[index, 4] = line[56:63] anisou[index, 5] = line[63:70] except: LOGGER.warn('failed to parse anisotropic temperature ' 'factors at line {0}'.format(i)) elif isPDB and startswith =='SIGUIJ': if siguij is None: siguij = np.zeros((alength, 6), dtype=ATOMIC_FIELDS['siguij'].dtype) try: index = acount - 1 siguij[index, 0] = line[28:35] siguij[index, 1] = line[35:42] siguij[index, 2] = line[43:49] siguij[index, 3] = line[49:56] siguij[index, 4] = line[56:63] siguij[index, 5] = line[63:70] except: LOGGER.warn('failed to parse standard deviations of ' 'anisotropic temperature factors at line {0}'.format(i)) elif startswith =='SIGATM': pass i += 1 if onlycoords: if acount == atomgroup.numAtoms(): coordsets[nmodel] = coordinates nmodel += 1 del coordinates coordsets.resize((nmodel, atomgroup.numAtoms(), 3), refcheck=False) if addcoords: atomgroup.addCoordset(coordsets) else: atomgroup._setCoords(coordsets) elif not END: # this means last line was an ATOM line, so atomgroup is not decorated coordinates.resize((acount, 3), refcheck=False) if addcoords: atomgroup.addCoordset(coordinates) else: atomgroup._setCoords(coordinates) atomnames.resize(acount, refcheck=False) resnames.resize(acount, refcheck=False) resnums.resize(acount, refcheck=False) chainids.resize(acount, refcheck=False) hetero.resize(acount, refcheck=False) termini.resize(acount, refcheck=False) altlocs.resize(acount, refcheck=False) icodes.resize(acount, refcheck=False) serials.resize(acount, refcheck=False) if not only_subset: atomnames = np.char.strip(atomnames) resnames = np.char.strip(resnames) atomgroup.setNames(atomnames) atomgroup.setResnames(resnames) atomgroup.setResnums(resnums) atomgroup.setChids(chainids) atomgroup.setFlags('hetatm', hetero) atomgroup.setFlags('pdbter', termini) atomgroup.setAltlocs(altlocs) atomgroup.setIcodes(np.char.strip(icodes)) atomgroup.setSerials(serials) if isPDB: if anisou is not None: anisou.resize((acount, 6), refcheck=False) atomgroup.setAnisous(anisou / 10000) if siguij is not None: siguij.resize((acount, 6), refcheck=False) atomgroup.setAnistds(siguij / 10000) bfactors.resize(acount, refcheck=False) occupancies.resize(acount, refcheck=False) segnames.resize(acount, refcheck=False) elements.resize(acount, refcheck=False) atomgroup.setSegnames(np.char.strip(segnames)) atomgroup.setElements(np.char.strip(elements)) from prody.utilities.misctools import getMasses atomgroup.setMasses(getMasses(np.char.strip(elements))) atomgroup.setBetas(bfactors) atomgroup.setOccupancies(occupancies) else: radii.resize(acount, refcheck=False) atomgroup.setRadii(radii) charges.resize(acount, refcheck=False) atomgroup.setCharges(charges) if altloc and altloc_torf: _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames) return atomgroup def _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames): altloc_keys = list(altloc) altloc_keys.sort() indices = {} for key in altloc_keys: xyz = atomgroup.getCoords() success = 0 lines = altloc[key] for line, i in lines: aan = line[12:16].strip() arn = line[17:21].strip() ach = line[21] ari = int(line[22:26].split()[0]) rn, ids, ans = indices.get((ach, ari), (None, None, None)) if ids is None: ids = indices.get(ach, None) if ids is None: ids = (chainids == ach).nonzero()[0] indices[ach] = ids ids = ids[resnums[ids] == ari] if len(ids) == 0: LOGGER.warn("failed to parse altloc {0} at line {1}, " "residue not present for altloc 'A'".format( repr(key), i+1)) continue rn = resnames[ids[0]] ans = atomnames[ids] indices[(ach, ari)] = (rn, ids, ans) if rn != arn: LOGGER.warn("failed to parse altloc {0} at line {1}, " "residue name mismatch (expected {2}, " "parsed {3})".format(repr(key), i+1, repr(rn), repr(arn))) continue index = ids[(ans == aan).nonzero()[0]] if len(index) != 1: LOGGER.warn("failed to parse altloc {0} at line {1}, atom" " {2} not found in the residue" .format(repr(key), i+1, repr(aan))) continue try: xyz[index[0], 0] = float(line[30:38]) xyz[index[0], 1] = float(line[38:46]) xyz[index[0], 2] = float(line[46:54]) except: LOGGER.warn('failed to parse altloc {0} at line {1}, could' ' not read coordinates'.format(repr(key), i+1)) continue success += 1 LOGGER.info('{0} out of {1} altloc {2} lines were parsed.' .format(success, len(lines), repr(key))) if success > 0: LOGGER.info('Altloc {0} is appended as a coordinate set to ' 'atomgroup {1}.'.format(repr(key), atomgroup.getTitle())) atomgroup.addCoordset(xyz, label='altloc ' + key) PDBLINE = ('{0:6s}{1:5d} {2:4s}{3:1s}' '{4:4s}{5:1s}{6:4d}{7:1s} ' '{8:8.3f}{9:8.3f}{10:8.3f}' '{11:6.2f}{12:6.2f} ' '{13:4s}{14:2s}\n') #HELIXLINE = ('HELIX %3d %3s %-3s %1s %4d%1s %-3s %1s %4d%1s%2d' # ' %5d\n') HELIXLINE = ('HELIX {serNum:3d} {helixID:>3s} ' '{initResName:<3s} {initChainID:1s} {initSeqNum:4d}{initICode:1s} ' '{endResName:<3s} {endChainID:1s} {endSeqNum:4d}{endICode:1s}' '{helixClass:2d} {length:5d}\n') SHEETLINE = ('SHEET {strand:3d} {sheetID:>3s}{numStrands:2d} ' '{initResName:3s} {initChainID:1s}{initSeqNum:4d}{initICode:1s} ' '{endResName:3s} {endChainID:1s}{endSeqNum:4d}{endICode:1s}{sense:2d} \n') PDBLINE_LT100K = ('%-6s%5d %-4s%1s%-4s%1s%4d%1s ' '%8.3f%8.3f%8.3f%6.2f%6.2f ' '%4s%2s%2s\n') # Residue number PDBLINE_GE10K = ('%-6s%5d %-4s%1s%-4s%1s%4x%1s ' '%8.3f%8.3f%8.3f%6.2f%6.2f ' '%4s%2s%2s\n') # Serial number PDBLINE_GE100K = ('%-6s%5x %-4s%1s%-4s%1s%4d%1s ' '%8.3f%8.3f%8.3f%6.2f%6.2f ' '%4s%2s%2s\n') # Both PDBLINE_GE100K_GE10K = ('%-6s%5x %-4s%1s%-4s%1s%4x%1s ' '%8.3f%8.3f%8.3f%6.2f%6.2f ' '%4s%2s%2s\n') # All cases PDBLINE_H36 = ('%-6s%5s %-4s%1s%-4s%1s%4s%1s ' '%8.3f%8.3f%8.3f%6.2f%6.2f ' '%4s%2s%2s\n') ANISOULINE_LT100K = ('%-6s%5d %-4s%1s%-4s%1s%4d%1s ' '%7d%7d%7d%7d%7d%7d ' '%4s%2s%2s\n') # Residue number ANISOULINE_GE10K = ('%-6s%5d %-4s%1s%-4s%1s%4x%1s ' '%7d%7d%7d%7d%7d%7d ' '%4s%2s%2s\n') # Serial number ANISOULINE_GE100K = ('%-6s%5x %-4s%1s%-4s%1s%4d%1s ' '%7d%7d%7d%7d%7d%7d ' '%4s%2s%2s\n') # Both ANISOULINE_GE100K_GE10K = ('%-6s%5x %-4s%1s%-4s%1s%4x%1s ' '%7d%7d%7d%7d%7d%7d ' '%4s%2s%2s\n') # All cases ANISOULINE_H36 = ('%-6s%5s %-4s%1s%-4s%1s%4s%1s ' '%7d%7d%7d%7d%7d%7d ' '%4s%2s%2s\n') _writePDBdoc = """ :arg atoms: an object with atom and coordinate data :arg csets: coordinate set indices, default is all coordinate sets :arg beta: a list or array of number to be outputted in beta column :arg occupancy: a list or array of number to be outputted in occupancy column :arg hybrid36: whether to use hybrid36 format for atoms with serial greater than 99999. Hexadecimal is used otherwise. Default is False :type hybrid36: bool """
[docs]def writeChainsList(chains, filename): """ Write a text file containing a list of chains that can be parsed. :arg chains: a list of :class:`.Chain` objects :type chains: list :arg filename: the name of the file to be written :type filename: str """ fo = open(filename,'w') for chain in chains: fo.write(chain.getTitle() + ' ' + chain.getChid() + '\n') fo.close() return
[docs]def parseChainsList(filename): """ Parse a set of PDBs and extract chains based on a list in a text file. :arg filename: the name of the file to be read :type filename: str Returns: lists containing an :class:'.AtomGroup' for each PDB, the headers for those PDBs, and the requested :class:`.Chain` objects """ fi = open(filename,'r') lines = fi.readlines() fi.close() pdb_ids = [] ags = [] headers = [] chains = [] num_lines = len(lines) LOGGER.progress('Starting', num_lines, '_prody_parseChainsList') for i, line in enumerate(lines): LOGGER.update(i, 'Parsing lines...', label='_prody_parseChainsList') pdb_id = line.split()[0].split('_')[0] if not pdb_id in pdb_ids: pdb_ids.append(pdb_id) ag, header = parsePDB(pdb_id, compressed=False, \ subset=line.split()[0].split('_')[1], header=True) ags.append(ag) headers.append(header) chains.append(ag.getHierView()[line.strip().split()[1]]) LOGGER.finish() LOGGER.info('{0} PDBs have been parsed and {1} chains have been extracted. \ '.format(len(ags),len(chains))) return ags, headers, chains
[docs]def writePDBStream(stream, atoms, csets=None, **kwargs): """Write *atoms* in PDB format to a *stream*. :arg stream: anything that implements a :meth:`write` method (e.g. file, buffer, stdout) :arg renumber: whether to renumber atoms with serial indices Default is **True** :type renumber: bool """ renumber = kwargs.get('renumber', True) remark = str(atoms) try: coordsets = atoms._getCoordsets(csets) except AttributeError: try: coordsets = atoms._getCoords() except AttributeError: raise TypeError('atoms must be an object with coordinate sets') if coordsets is not None: coordsets = [coordsets] else: if coordsets.ndim == 2: coordsets = [coordsets] if coordsets is None: raise ValueError('atoms does not have any coordinate sets') try: acsi = atoms.getACSIndex() except AttributeError: try: atoms = atoms.getAtoms() except AttributeError: raise TypeError('atoms must be an Atomic instance or an object ' 'with `getAtoms` method') else: if atoms is None: raise ValueError('atoms is not associated with an Atomic ' 'instance') try: acsi = atoms.getACSIndex() except AttributeError: raise TypeError('atoms does not have a valid type') try: atoms.getIndex() except AttributeError: pass else: atoms = atoms.select('all') n_atoms = atoms.numAtoms() hybrid36 = kwargs.get('hybrid36', False) occupancy = kwargs.get('occupancy') if occupancy is None: occupancies = atoms._getOccupancies() if occupancies is None: occupancies = np.zeros(n_atoms, float) else: occupancies = np.array(occupancy) if len(occupancies) != n_atoms: raise ValueError('len(occupancy) must be equal to number of atoms') beta = kwargs.get('beta') if beta is None: bfactors = atoms._getBetas() if bfactors is None: bfactors = np.zeros(n_atoms, float) else: bfactors = np.array(beta) if len(bfactors) != n_atoms: raise ValueError('len(beta) must be equal to number of atoms') atomnames = atoms.getNames() if atomnames is None: raise ValueError('atom names are not set') for i, an in enumerate(atomnames): if len(an) < 4: atomnames[i] = ' ' + an s_or_u = np.array(['a']).dtype.char altlocs = atoms._getAltlocs() if altlocs is None: altlocs = np.zeros(n_atoms, s_or_u + '1') resnames = atoms._getResnames() if resnames is None: resnames = ['UNK'] * n_atoms chainids = atoms._getChids() if chainids is None: chainids = np.zeros(n_atoms, s_or_u + '1') resnums = atoms._getResnums() if resnums is None: resnums = np.ones(n_atoms, int) serials = atoms._getSerials() if serials is None or renumber: serials = np.arange(n_atoms, dtype=int) + 1 icodes = atoms._getIcodes() if icodes is None: icodes = np.zeros(n_atoms, s_or_u + '1') hetero = ['ATOM'] * n_atoms heteroflags = atoms._getFlags('hetatm') if heteroflags is None: heteroflags = atoms._getFlags('hetero') if heteroflags is not None: hetero = np.array(hetero, s_or_u + '6') hetero[heteroflags] = 'HETATM' elements = atoms._getElements() if elements is None: elements = np.zeros(n_atoms, s_or_u + '1') else: elements = np.char.rjust(elements, 2) segments = atoms._getSegnames() if segments is None: segments = np.zeros(n_atoms, s_or_u + '6') charges = atoms._getCharges() charges2 = np.empty(n_atoms, s_or_u + '2') if charges is not None: for i, charge in enumerate(charges): charges2[i] = str(abs(int(charge))) if np.sign(charge) == -1: charges2[i] += '-' else: charges2[i] += '+' if charges2[i] == '0+': charges2[i] = ' ' anisous = atoms._getAnisous() if anisous is not None: anisous = np.array(anisous * 10000, dtype=int) # write remarks stream.write('REMARK {0}\n'.format(remark)) # write secondary structures (if any) secondary = kwargs.get('secondary', True) secstrs = atoms._getSecstrs() if secstrs is not None and secondary: secindices = atoms._getSecindices() secclasses = atoms._getSecclasses() secids = atoms._getSecids() # write helices for i in range(1,max(secindices)+1): torf = np.logical_and(isHelix(secstrs), secindices==i) if torf.any(): helix_resnums = resnums[torf] helix_chainids = chainids[torf] helix_resnames = resnames[torf] helix_secclasses = secclasses[torf] helix_secids = secids[torf] helix_icodes = icodes[torf] L = helix_resnums[-1] - helix_resnums[0] + 1 stream.write(HELIXLINE.format(serNum=i, helixID=helix_secids[0], initResName=helix_resnames[0], initChainID=helix_chainids[0], initSeqNum=helix_resnums[0], initICode=helix_icodes[0], endResName=helix_resnames[-1], endChainID=helix_chainids[-1], endSeqNum=helix_resnums[-1], endICode=helix_icodes[-1], helixClass=helix_secclasses[0], length=L)) # write strands torf_all_sheets = isSheet(secstrs) sheet_secids = secids[torf_all_sheets] for sheet_id in np.unique(sheet_secids): torf_strands_in_sheet = np.logical_and(torf_all_sheets, secids==sheet_id) strand_indices = secindices[torf_strands_in_sheet] numStrands = len(np.unique(strand_indices)) for i in np.unique(strand_indices): torf_strand = np.logical_and(torf_strands_in_sheet, secindices==i) strand_resnums = resnums[torf_strand] strand_chainids = chainids[torf_strand] strand_resnames = resnames[torf_strand] strand_secclasses = secclasses[torf_strand] strand_icodes = icodes[torf_strand] stream.write(SHEETLINE.format(strand=i, sheetID=sheet_id, numStrands=numStrands, initResName=strand_resnames[0], initChainID=strand_chainids[0], initSeqNum=strand_resnums[0], initICode=strand_icodes[0], endResName=strand_resnames[-1], endChainID=strand_chainids[-1], endSeqNum=strand_resnums[-1], endICode=strand_icodes[-1], sense=strand_secclasses[0])) pass # write atoms multi = len(coordsets) > 1 write = stream.write for m, coords in enumerate(coordsets): if multi: write('MODEL{0:9d}\n'.format(m+1)) if not hybrid36: # We need to check whether serial and residue numbers become hexadecimal reached_max_n_atom = False reached_max_n_res = False pdbline = PDBLINE_LT100K anisouline = ANISOULINE_LT100K else: warned_hybrid36 = False for i, xyz in enumerate(coords): if hybrid36: pdbline = PDBLINE_H36 anisouline = ANISOULINE_H36 if not warned_hybrid36: LOGGER.warn('hybrid36 format is being used') warned_hybrid36 = True else: if not (reached_max_n_atom or reached_max_n_res) and (i == MAX_N_ATOM or serials[i] > MAX_N_ATOM): reached_max_n_atom = True pdbline = PDBLINE_GE100K anisouline = ANISOULINE_GE100K LOGGER.warn('Indices are exceeding 99999 and hexadecimal format is being used for indices') elif not (reached_max_n_atom or reached_max_n_res) and resnums[i] > MAX_N_RES: reached_max_n_res = True pdbline = PDBLINE_GE10K anisouline = ANISOULINE_GE10K LOGGER.warn('Resnums are exceeding 9999 and hexadecimal format is being used for resnums') elif reached_max_n_atom and not reached_max_n_res and resnums[i] > MAX_N_RES: reached_max_n_res = True pdbline = PDBLINE_GE100K_GE10K anisouline = ANISOULINE_GE100K_GE10K LOGGER.warn('Resnums are exceeding 9999 and hexadecimal format is being used for indices and resnums') elif reached_max_n_res and not reached_max_n_atom and (i == MAX_N_ATOM or serials[i] > MAX_N_ATOM): reached_max_n_atom = True pdbline = PDBLINE_GE100K_GE10K anisouline = ANISOULINE_GE100K_GE10K LOGGER.warn('Indices are exceeding 99999 and hexadecimal format is being used for indices and resnums') if hybrid36: serial = decToHybrid36(serials[i]) resnum = decToHybrid36(resnums[i], resnum=True) else: serial = serials[i] resnum = resnums[i] write(pdbline % (hetero[i], serial, atomnames[i], altlocs[i], resnames[i], chainids[i], resnum, icodes[i], xyz[0], xyz[1], xyz[2], occupancies[i], bfactors[i], segments[i], elements[i], charges2[i])) if anisous is not None: anisou = anisous[i] write(anisouline % ("ANISOU", serial, atomnames[i], altlocs[i], resnames[i], chainids[i], resnum, icodes[i], anisou[0], anisou[1], anisou[2], anisou[3], anisou[4], anisou[5], segments[i], elements[i], charges2[i])) if atoms.getFlags('pdbter') is not None and atoms.getFlags('pdbter')[i]: write('TER\n') if multi: write('ENDMDL\n') altlocs = np.zeros(n_atoms, s_or_u + '1')
writePDBStream.__doc__ += _writePDBdoc
[docs]def writePDB(filename, atoms, csets=None, autoext=True, **kwargs): """Write *atoms* in PDB format to a file with name *filename* and return *filename*. If *filename* ends with :file:`.gz`, a compressed file will be written. :arg renumber: whether to renumber atoms with serial indices Default is **True** :type renumber: bool """ if not ('.pdb' in filename or '.pdb.gz' in filename or '.ent' in filename or '.ent.gz' in filename): filename += '.pdb' out = openFile(filename, 'wt') writePDBStream(out, atoms, csets, **kwargs) out.close() return filename
writePDB.__doc__ += _writePDBdoc + """ :arg autoext: when not present, append extension :file:`.pdb` to *filename* """
[docs]def writePQRStream(stream, atoms, **kwargs): if isinstance(atoms, Atom): atoms = Selection(atoms.getAtomGroup(), [atoms.getIndex()], atoms.getACSIndex(), 'index ' + str(atoms.getIndex())) n_atoms = atoms.numAtoms() atomnames = atoms.getNames() if atomnames is None: raise RuntimeError('atom names are not set') for i, an in enumerate(atomnames): lenan = len(an) if lenan < 4: atomnames[i] = ' ' + an elif lenan > 4: atomnames[i] = an[:4] s_or_u = np.array(['a']).dtype.char resnames = atoms._getResnames() if resnames is None: resnames = ['UNK'] * n_atoms resnums = atoms._getResnums() if resnums is None: resnums = np.ones(n_atoms, int) chainids = atoms._getChids() if chainids is None: chainids = np.zeros(n_atoms, s_or_u + '1') charges = atoms._getCharges() if charges is None: charges = np.zeros(n_atoms, float) radii = atoms._getRadii() if radii is None: radii = np.zeros(n_atoms, float) icodes = atoms._getIcodes() if icodes is None: icodes = np.zeros(n_atoms, s_or_u + '1') hetero = ['ATOM'] * n_atoms heteroflags = atoms._getFlags('hetatm') if heteroflags is None: heteroflags = atoms._getFlags('hetero') if heteroflags is not None: hetero = np.array(hetero, s_or_u + '6') hetero[heteroflags] = 'HETATM' altlocs = atoms._getAltlocs() if altlocs is None: altlocs = np.zeros(n_atoms, s_or_u + '1') format = ('{0:6s} {1:5d} {2:4s} {3:1s}' + '{4:4s} {5:1s} {6:4d} {7:1s} ' + '{8:8.3f} {9:8.3f} {10:8.3f}' + '{11:8.4f} {12:7.4f}\n').format coords = atoms._getCoords() write = stream.write for i, xyz in enumerate(coords): write(format(hetero[i], i+1, atomnames[i], altlocs[i], resnames[i], chainids[i], int(resnums[i]), icodes[i], xyz[0], xyz[1], xyz[2], charges[i], radii[i]))
[docs]def writePQR(filename, atoms, **kwargs): """Write *atoms* in PQR format to a file with name *filename*. Only current coordinate set is written. Returns *filename* upon success. If *filename* ends with :file:`.gz`, a compressed file will be written.""" stream = openFile(filename, 'w') writePQRStream(stream, atoms, **kwargs) stream.close() return filename