Release Notes
The v2.0 series comes with new and improved sequence, structure, and dynamics analysis features. See the release notes for details.
How to Cite
Bakan A, Meireles LM, Bahar I. ProDy: Protein Dynamics Inferred from Theory and Experiments.
Bioinformatics 2011 27(11):1575-1577.
Bakan A, Dutta A, Mao W, Liu Y, Chennubhotla C, Lezon TR, Bahar I. Evol and ProDy for Bridging Protein Sequence Evolution and Structural Dynamics.
Bioinformatics 2014 30(18):2681-2683.
Source code for prody.database.quartataweb

# -*- coding: utf-8 -*-
"""This module defines classes and functions for browsing QuartataWeb.

Based on code written by the CHARMM-GUI team (http://charmm-gui.org) and modified by James Krieger

This suite uses the following software:
- python Splinter package (https://splinter.readthedocs.org/en/latest/)
- a web browser, such as Google Chrome or Mozilla Firefox
- the corresponding driver such as chromedriver (https://sites.google.com/a/chromium.org/chromedriver/downloads)
   for Chrome or geckodriver (https://github.com/mozilla/geckodriver/releases) for Firefox
"""

from prody import PY3K, LOGGER
from prody.utilities import openFile

import numpy as np
import os

# Names exported by ``from <this module> import *``.
__all__ = ['QuartataWebBrowser', 'QuartataChemicalRecord', 'searchQuartataWeb',
           'initializeBrowser']


class QuartataWebBrowser(object):
    """Class to browse the QuartataWeb website.

    :arg data_source: source database for QuartataWeb analysis
        options are ``"DrugBank"`` or ``"STITCH"``. Default is ``"DrugBank"``
    :type data_source: str

    :arg drug_group: group of drugs if using DrugBank
        options are ``"Approved"`` or ``"All"``. Default is ``"All"``
    :type drug_group: str

    :arg input_type: number corresponding to the input type, options are 
        ``1`` (Chemical and/or target) or 
        ``2`` (A list of chemicals, targets or chemical combinations). 
        Default is ``1``
    :type input_type: int

    :arg query_type: number corresponding to the query type. Options are 
        dependent on input_type. 
        
        With input_type 1, they are:
        * ``1`` (chemical-target interaction)
        * ``2`` (chemical-chemical similarity)
        * ``3`` (target-target similarity)

        With input_type 2, they are:
        * ``1`` (chemicals)
        * ``2`` (targets)
        * ``3`` (chemical combinations)

        Default is ``1``
    :type query_type: int

    :arg data: data to enter into the box or boxes. This varies depending on input type 
        and query type, but will always be a list of strings.
        
        For input_type 1, a list with two items is expected. These will be one of the 
        following depending on query_type:
        * With query_type 1, the first would be a chemical and the second a target. 
            One of these can also be left blank.
        * With query_type 2, the first would be a chemical and the second a chemical.
        * With query_type 3, the first would be a target and the second a target.

        For input_type 2, a list with any length is expected. These will be one of the 
        following depending on query_type:
        * With query_type 1, these would be chemicals. 
        * With query_type 2, these would be targets.
        * With query_type 3, these would be pairs of chemicals, separated by semicolons.
    :type data: list

    :arg num_predictions: number of predictions to show or consider in addition to 
        known interactions. Default is ``0``. 
        With DrugBank and input_type 1, a second number can be provided in a list 
        for secondary interactions.
    :type num_predictions: int, list

    :arg browser_type: browser type for navigation
        Default is ``"Chrome"``
    :type browser_type: str

    :arg job_id: job ID for accessing previous jobs
        Default is ``None``
    :type job_id: int        

    :arg tsv: a filename for a file that contains the results 
        or a file to save the results in tsv format
    :type tsv: str

    :arg chem_type: which set of chemicals *tsv* refers to, 
        ``"known"`` or ``"predicted"``. Default is ``"known"``
    :type chem_type: str
    """

    # root address shared by every page this class visits
    _BASE_URL = "http://quartata.csb.pitt.edu"

    def __init__(self, data_source=None, drug_group=None, input_type=None, query_type=None, 
                 data=None, num_predictions=None, browser_type=None, job_id=None, 
                 tsv=None, chem_type='known'):

        self.browser_type = None
        self.browser = None

        self.data_source = None
        self.drug_group = None
        self.input_type = None
        self.query_type = None
        self.data = None
        self.num_predictions = None

        # parsed results, all keyed by chem_type ('known' / 'predicted')
        self.chemical_data = {}
        self.fields = {}
        self.num_fields = {}
        self.num_rows = {}

        self.job_id = job_id

        self.filename = None
        self.no_data = {'known': True, 'predicted': True}
        if tsv is not None:
            try:
                self.parseChemicals(tsv, chem_type)
            except Exception:
                raise ValueError('please provide a valid filename')

        # the browser has to exist before any setter refreshes the home page
        self.setBrowserType(browser_type)
        self.setDataSource(data_source)
        self.setDrugGroup(drug_group)
        self.setInputType(input_type)
        self.setQueryType(query_type)
        self.setData(data)
        self.setNumPredictions(num_predictions)

    def _refreshHomePage(self):
        """Re-enter the current settings on the home page after a setter ran."""
        # NOTE(review): ``self.no_data`` is a non-empty dict, so this test is
        # always true and the page is refreshed even when results were read
        # from a file. Kept as in the original; confirm the intended condition.
        if self.no_data:
            self.updateHomePage()

    def _jobURL(self, page):
        """Return the URL of *page* (``'result'`` or ``'download'``) for the 
        current job, using the STITCH variant when DrugBank is not selected."""
        suffix = '' if self.data_source == 'DrugBank' else '_stitch'
        return '{0}/quartata_{1}{2}.php?job_id={3}'.format(
            self._BASE_URL, page, suffix, self.job_id)

    def updateHomePage(self):
        """Update the home page with data from setting variables"""
        def fill(name, value):
            self.browser.find_by_name(name)[0].fill(value)

        url = self._BASE_URL

        if self.data_source == 'DrugBank':
            url += '/index'
        else:
            url += '/index_stitch'

        if self.input_type == 2:
            url += '_2'

        url += '.php'
        self.browser.visit(url)

        if self.data_source == 'DrugBank':
            # first radio button is "Approved", second is "All"
            if self.drug_group == 'Approved':
                self.browser.find_by_name('db_type')[0].click()
            else:
                self.browser.find_by_name('db_type')[1].click()

        if self.query_type is not None:
            self.browser.find_by_name('pattern')[self.query_type - 1].click()

        if self.data is not None:
            if self.input_type == 1:
                if self.query_type == 1:
                    fill('q_drug_1', self.data[0])
                    fill('q_target_1', self.data[1])
                elif self.query_type == 2:
                    fill('q_drug_1', self.data[0])
                    fill('q_drug_2', self.data[1])
                else:
                    fill('q_target_1', self.data[0])
                    fill('q_target_2', self.data[1])
            else:
                # exactly one box is filled; the original if/if/else also 
                # filled the pairs box for query_type 1
                if self.query_type == 1:
                    fill('q_drugs', '\n'.join(self.data))
                elif self.query_type == 2:
                    fill('q_targets', '\n'.join(self.data))
                else:
                    fill('q_drug_pairs', '\n'.join(self.data))

        if self.num_predictions is not None:
            fill('pred_n', self.num_predictions[0])
            if self.data_source == 'DrugBank' and self.input_type == 1:
                fill('pred_n_2nd', self.num_predictions[1])

    def setDataSource(self, data_source):
        """Set data_source and update home page
        
        :arg data_source: source database for QuartataWeb analysis
            options are ``"DrugBank"`` or ``"STITCH"``. Default is ``"DrugBank"``
        :type data_source: str
        """
        if data_source is None:
            data_source = 'DrugBank'
        elif not isinstance(data_source, str):
            raise TypeError('data_source should be a string or None')
        elif data_source.lower() == 'drugbank':
            data_source = 'DrugBank'
        elif data_source.lower() == 'stitch':
            data_source = 'STITCH'
        else:
            raise ValueError('data_source should be DrugBank, STITCH or None')

        self.data_source = data_source
        self._refreshHomePage()

    def setDrugGroup(self, group):
        """Set drug_group and update home page
        
        :arg group: group of drugs if using DrugBank
            options are ``"Approved"`` or ``"All"``. Default is ``"All"``
        :type group: str
        """
        if self.data_source == 'DrugBank':
            if group is None:
                group = 'All'
            elif not isinstance(group, str):
                raise TypeError('group must be string or None')
            elif group.lower() == 'all':
                group = 'All'
            elif group.lower() == 'approved':
                group = 'Approved'
            else:
                raise ValueError('group should be approved, all or None')

            self.drug_group = group
            self._refreshHomePage()

        elif group is not None:
            LOGGER.warn('there are no groups when using STITCH')

    def setInputType(self, input_type):
        """Set input_type and update home page
        
        :arg input_type: number corresponding to the input type, options are 
            ``1`` (Chemical and/or target) or 
            ``2`` (A list of chemicals, targets or chemical combinations). 
            Default is ``1``
        :type input_type: int
        """        
        if input_type is None:
            input_type = 1
        elif not isinstance(input_type, int):
            raise TypeError('input_type should be an integer (1 or 2) or None')
        elif input_type not in (1, 2):
            raise ValueError('input_type should be 1, 2 or None')

        self.input_type = input_type
        self._refreshHomePage()

    def setQueryType(self, query_type):
        """Set query_type and update home page
        
        :arg query_type: number corresponding to the query type. Options are 
            dependent on input_type. 
            
            With input_type 1, they are:
            * ``1`` (chemical-target interaction)
            * ``2`` (chemical-chemical similarity)
            * ``3`` (target-target similarity)

            With input_type 2, they are:
            * ``1`` (chemicals)
            * ``2`` (targets)
            * ``3`` (chemical combinations)

            Default is ``1``
        :type query_type: int
        """
        if query_type is None:
            query_type = 1
        elif not isinstance(query_type, int):
            raise TypeError(
                'query_type should be an integer (1, 2 or 3) or None')
        elif query_type not in (1, 2, 3):
            raise ValueError('query_type should be 1, 2, 3 or None')

        self.query_type = query_type
        self._refreshHomePage()

    def setData(self, data):
        """Set data and update home page
        
        :arg data: data to enter into the box or boxes. This varies depending on input type 
            and query type, but will always be a list of strings.
            
            For input_type 1, a list with two items is expected. These will be one of the 
            following depending on query_type:
            * With query_type 1, the first would be a chemical and the second a target. 
                One of these can also be left blank.
            * With query_type 2, the first would be a chemical and the second a chemical.
            * With query_type 3, the first would be a target and the second a target.

            For input_type 2, a list with any length is expected. These will be one of the 
            following depending on query_type:
            * With query_type 1, these would be chemicals. 
            * With query_type 2, these would be targets.
            * With query_type 3, these would be pairs of chemicals, separated by semicolons.
        :type data: list

        A copy of *data* is stored, so the list passed in is never modified.
        """
        if data is None:
            LOGGER.warn('data is not set')
        elif not isinstance(data, list):
            raise TypeError('data should be a list')
        else:
            if not all(isinstance(item, str) for item in data):
                raise TypeError('data should be a list of strings')

            # copy before padding so the caller's list is left untouched
            data = list(data)

            if self.input_type == 1:
                if len(data) > 2:
                    raise ValueError(
                        'data can only have two values with input_type 1')

                # both boxes are always filled, so pad missing entries with blanks
                data.extend([''] * (2 - len(data)))

            elif self.input_type == 2 and self.query_type == 3:
                # chemical combinations: every entry has to be a ';'-separated pair
                for item in data:
                    if ';' not in item:
                        raise ValueError(
                            'each item in data must be a pair with ; as delimiter')

        self.data = data
        self._refreshHomePage()

    def setNumPredictions(self, num_predictions):
        """Set num_predictions and update home page
        
        :arg num_predictions: number of predictions to show or consider in addition to 
            known interactions. Default is ``0``. 
            With DrugBank and input_type 1, a second number can be provided in a list 
            for secondary interactions.
        :type num_predictions: int, list

        The value is stored as a two-item list; a missing second number is 
        taken as ``0``.
        """
        if num_predictions is None:
            num_predictions = 0

        if not isinstance(num_predictions, (int, list)):
            raise TypeError(
                'num_predictions should be an integer, a list or None')

        if isinstance(num_predictions, int):
            num_predictions = [num_predictions, 0]
        else:
            num_predictions = list(num_predictions)
            if len(num_predictions) == 1:
                num_predictions.append(0)
            elif len(num_predictions) != 2:
                raise ValueError('num_predictions should have one or two values')

            if not all(isinstance(num, int) for num in num_predictions):
                raise TypeError(
                    'num_predictions should be an integer, a list of integers or None')

        if num_predictions[0] > 100:
            raise ValueError('1st num_predictions must be <= 100')

        if num_predictions[1] > 20:
            raise ValueError('2nd num_predictions must be <= 20')

        self.num_predictions = num_predictions
        self._refreshHomePage()

    def setBrowserType(self, browser_type):
        """Set browser_type and update home page
        
        :arg browser_type: browser type for navigation
            Default is ``"Chrome"``
        :type browser_type: str
        """
        # NOTE(review): always true for a non-empty dict, see _refreshHomePage
        if self.no_data:
            self.browser_type, self.browser = initializeBrowser(browser_type)
            self.updateHomePage()

    def setJObID(self, job_id):
        """Set job_id and view results
        
        :arg job_id: job ID for accessing previous jobs
            Default is ``None``
        :type job_id: int
        """
        self.job_id = job_id
        # NOTE(review): always true for a non-empty dict, see _refreshHomePage
        if self.no_data:
            self.viewResults()

    # correctly spelt alias; the original name is kept for existing callers
    setJobID = setJObID

    def viewResults(self):
        """View results by clicking submit or using a job_id"""
        if self.job_id is None or self.browser.url.find('index') != -1:
            self.browser.find_by_name('submit')[0].click()
            # the job id is whatever follows the last '=' of the resulting URL
            self.job_id = self.browser.url.split('_')[-1].split('=')[-1]
        else:
            self.browser.visit(self._jobURL('result'))

    def goToDownloads(self):
        """Go to downloads page"""
        if self.job_id is None:
            self.viewResults()

        self.browser.visit(self._jobURL('download'))

    def goToWorkDir(self):
        """Go to working directory"""
        if self.job_id is None:
            self.viewResults()

        url = '{0}/work/{1}'.format(self._BASE_URL, self.job_id)
        self.browser.visit(url)

    def parseChemicals(self, filename=None, chem_type='known'):
        """Go to working directory and parse chemicals for query protein.
        Updates self.chemical_data

        :arg filename: tab-separated file to read the chemicals from if it 
            exists, otherwise the file to write the downloaded table to. 
            Default is the filename used previously, if any
        :type filename: str

        :arg chem_type: which table to parse, ``"known"`` or ``"predicted"``. 
            Default is ``"known"``
        :type chem_type: str

        :returns: ``True`` if data for *chem_type* is available afterwards, 
            ``False`` if anything went wrong (errors are not propagated)
        """
        if filename is None:
            filename = self.filename

        try:
            if filename is not None:
                if not self.no_data[chem_type]:
                    # already parsed, nothing to do
                    return True

                if not isinstance(filename, str):
                    raise TypeError('filename should be a string')

                if os.path.isfile(filename):
                    # read the contents
                    LOGGER.info('reading chemicals from {0}'.format(filename))
                    stream = openFile(filename, 'rt')
                    try:
                        lines = stream.readlines()
                    finally:
                        stream.close()
                    self.no_data[chem_type] = False
                else:
                    # filename contains a filename for writing
                    self.no_data[chem_type] = True

                self.filename = filename

            if self.no_data[chem_type]:
                self.goToWorkDir()
                
                if self.data_source == 'DrugBank':
                    data_filename = '%s_drugs_for_query_protein.txt' % chem_type
                else:
                    data_filename = '%s_chemicals_for_query_protein.txt' % chem_type

                self.browser.find_by_text(data_filename)[0].click()
                
                import requests
                html = requests.get(self.browser.url).content
                if PY3K:
                    html = html.decode()

                if filename is not None:
                    LOGGER.info('writing chemicals to {0}'.format(filename))
                    with open(filename, 'w') as out:
                        out.write(html)

                lines = html.split('\n')

            # drop line endings so that the header and the dtype detection see
            # the same text whether the table came from a file or from the web
            lines = [line.rstrip('\r\n') for line in lines]

            self.fields[chem_type] = lines[0].split('\t')
            self.num_fields[chem_type] = len(self.fields[chem_type])

            self.num_rows[chem_type] = len(lines[1:])
            if lines[-1].strip() == '':
                self.num_rows[chem_type] -= 1

            # column types are guessed from the first data row
            dtypes = []
            for i, item in enumerate(lines[1].split('\t')):
                if item.isnumeric():
                    dtypes.append((self.fields[chem_type][i], int))
                elif item.find('.') != -1 and item.replace('.','0').isnumeric():
                    dtypes.append((self.fields[chem_type][i], float))
                else:
                    dtypes.append((self.fields[chem_type][i], object))

            self.chemical_data[chem_type] = np.empty(self.num_rows[chem_type], dtype=dtypes)

            for i, line in enumerate(lines[1:self.num_rows[chem_type]+1]):
                items = line.strip().split('\t')
                if len(items) != self.num_fields[chem_type]:
                    raise ValueError('line {0} has the wrong number of fields'.format(i+1))

                for j, item in enumerate(items):
                    self.chemical_data[chem_type][i][j] = item
        except Exception:
            # any failure just marks the data as unavailable; the caller 
            # learns about it through the return value
            self.no_data[chem_type] = True
        else:
            self.no_data[chem_type] = False

        return not self.no_data[chem_type]

    def quit(self):
        """Close the browser if one was opened."""
        if self.browser is not None:
            self.browser.quit()


class QuartataChemicalRecord(object):
    """Class for handling chemical data from QuartataWebBrowser"""

    def __init__(self, data_source=None, drug_group=None, input_type=None, query_type=None, 
                 data=None, num_predictions=None, browser_type=None, job_id=None, 
                 filename=None):
        """Instantiate a QuartataChemicalRecord object instance.
        Inputs are the same as QuartataWebBrowser.
        """
        self._chemData = None
        self._filterDict = None
        self._filterList = None
        self.data_source = data_source 
        self.drug_group = drug_group
        self.input_type = input_type
        self.query_type = query_type
        self.data = data
        self.num_predictions = num_predictions
        self.job_id = job_id
        self.filename = filename

        self.isSuccess = self.fetch(data_source, drug_group, input_type, query_type,
                                    data, num_predictions, browser_type, job_id, filename)

    def fetch(self, data_source=None, drug_group=None, input_type=None, query_type=None, 
              data=None, num_predictions=None, browser_type=None, job_id=None, filename=None):
        """Fetch data

        Arguments left as ``None`` fall back to the values given at 
        instantiation. A :class:`QuartataWebBrowser` is created, the known 
        chemicals are parsed and, when the first num_predictions value is 
        positive, the predicted ones as well.

        :returns: the result of the last parse that was run
        :raises ValueError: if no data was provided here or at instantiation
        """
        if data_source is None:
            data_source = self.data_source
        if drug_group is None:
            drug_group = self.drug_group
        if input_type is None:
            input_type = self.input_type
        if query_type is None:
            query_type = self.query_type
        if data is None:
            data = self.data

        if data is None:
            raise ValueError('data cannot be None')

        if num_predictions is None:
            num_predictions = self.num_predictions
        if job_id is None:
            job_id = self.job_id
        if filename is None:
            filename = self.filename

        self.qwb = QuartataWebBrowser(data_source, drug_group, input_type, query_type,
                                      data, num_predictions, browser_type, job_id, filename)
        
        isSuccess = self.qwb.parseChemicals()
        if self.qwb.num_predictions[0] > 0:
            isSuccess = self.qwb.parseChemicals(chem_type='predicted')

        self.qwb.quit()

        self._chemData = self.qwb.chemical_data
        if self._chemData is None:
            raise ValueError('no chemical data could be retrieved')

        chem_temp_dict = dict()
        listAll = []
        for records in self._chemData.values():
            for record in records:
                # columns are read by position: ID, name, weight, SMILES, score
                chem_name = record[1]
                chem_temp_dict[chem_name] = {
                    'DB_ID': record[0],
                    'chemical_name': chem_name,
                    'mol_weight': record[2],
                    'SMILES': record[3],
                    'conf_score': record[4],
                }
                listAll.append(chem_name)

        # assigned outside the loop so the attributes exist even when 
        # nothing could be parsed
        self._listAll = tuple(listAll)
        self._list = self._listAll
        self._chemDict = chem_temp_dict
        
        return isSuccess

    def getChemicalList(self, filtered=True):
        """Returns chemical list (filters may be applied)"""
        if not self.isSuccess:
            LOGGER.warn('Quartata Chemical Record does not have any data yet. '
                        'Please run fetch again, possibly with different parameters.')
        
        if filtered:
            return self._list
        return self._listAll

    def getSMILESList(self, filtered=True):
        """Returns SMILES list (filters may be applied)"""
        if not self.isSuccess:
            LOGGER.warn('Quartata Chemical Record does not have any data yet. '
                        'Please run fetch again, possibly with different parameters.')
        
        # the parsed tables are keyed by chemical type, not by column, so the 
        # unfiltered list is also built from the per-chemical dictionary
        names = self._list if filtered else self._listAll
        return [self._chemDict[key]['SMILES'] for key in names]

    def getParticularSMILES(self, key):
        """Returns SMILES for a particular chemical"""
        if not self.isSuccess:
            LOGGER.warn('Quartata Chemical Record does not have any data yet. '
                        'Please run fetch again, possibly with different parameters.')

        return self._chemDict[key]['SMILES']

    def getFilterList(self):
        """Returns a list of chemicals for the entries that were filtered out"""
        
        filterDict = self._filterDict
        if filterDict is None:
            raise ValueError('You cannot obtain the list of filtered out entries before doing any filtering.')

        temp_str = ', '.join([str(len(filterDict['lower_MW'])), str(len(filterDict['upper_MW'])), 
                              str(len(filterDict['conf_score']))])
        LOGGER.info('Filtered out [' + temp_str + '] for [lower weight, upper weight, confidence score]')
        return self._filterList

    def filter(self, lower_weight=None, upper_weight=None, cutoff_score=None):
        """Filters out chemicals from the list and returns the updated list.
        Chemicals that satisfy any of the following criterion will be filtered out.
        (1) Molecular weight < lower_weight (must be a positive number);
        (2) Molecular weight > upper_weight (must be a positive number);
        (3) Confidence score < cutoff_score (must be a positive number);

        A value of ``None`` (or ``0``) disables the corresponding criterion.

        Please note that every time this function is run, this overrides any previous runs.
        Therefore, please provide all filters at once.
        """
        if not self.isSuccess:
            LOGGER.warn('Quartata Chemical Record does not have any data yet. '
                        'Please run fetch again, possibly with different parameters.')
            return None

        if lower_weight is None:
            lower_weight = 0
        elif not isinstance(lower_weight, (float, int)):
            raise TypeError('lower_weight must be a float or an integer')
        if lower_weight >= 0:
            lower_weight = float(lower_weight)
        else:
            raise ValueError('lower_weight must be a number not less than 0')
            
        if upper_weight is None:
            upper_weight = 0
        elif not isinstance(upper_weight, (float, int)):
            raise TypeError('upper_weight must be a float or an integer')
        if upper_weight >= 0:
            upper_weight = float(upper_weight)
        else:
            raise ValueError('upper_weight must be a number not less than 0')
            
        if cutoff_score is None:
            cutoff_score = 0
        elif not isinstance(cutoff_score, (float, int)):
            raise TypeError('cutoff_score must be a float or an integer')
        elif cutoff_score >= 0:
            cutoff_score = float(cutoff_score)
        else:
            raise ValueError('cutoff_score must be a number not less than 0')

        quartataInfo = self._chemDict
        if quartataInfo is None:
            raise ValueError("Quartata Chemical Record does not have any data yet. Please run fetch.")

        filterListLowerMW = []
        filterListUpperMW = []
        filterListConf = []
        
        # each chemical is reported under the first criterion it fails
        for chem in self._listAll:
            temp_dict = quartataInfo[chem]

            if temp_dict['mol_weight'] < lower_weight:
                filterListLowerMW.append(chem)
            elif upper_weight > 0 and temp_dict['mol_weight'] > upper_weight:
                filterListUpperMW.append(chem)
            elif temp_dict['conf_score'] < cutoff_score:
                filterListConf.append(chem)

        filterList = filterListLowerMW + filterListUpperMW + filterListConf
        filterDict = {'lower_MW': filterListLowerMW, 'upper_MW': filterListUpperMW, 'conf_score': filterListConf}
        self._filterList = filterList
        self._filterDict = filterDict

        removed = set(filterList)
        self._list = [item for item in self._listAll if item not in removed]
        LOGGER.info(str(len(self._listAll)-len(self._list)) + ' chemicals have been filtered out from '+str(len(self._listAll))+' QuartataWeb hits (remaining: '+str(len(self._list))+').')
        return self._list
    

def searchQuartataWeb(data_source=None, drug_group=None, input_type=None, query_type=None, 
                      data=None, num_predictions=None, browser_type=None, job_id=None, 
                      filename=None, result_type='Chemical'):
    """Wrapper function for searching QuartataWeb.

    :arg result_type: type of results to get from QuartataWeb.
        So far only ``'Chemical'`` is supported.
    :type result_type: str

    All other arguments are the same as :class:`.QuartataWebBrowser`.

    Returns a :class:`.QuartataChemicalRecord`, or ``None`` (after a warning) 
    for any other *result_type*.
    """
    # bail out early for anything but the single supported result type
    if result_type != 'Chemical':
        LOGGER.warn('No other result types are supported yet')
        return None

    return QuartataChemicalRecord(data_source=data_source,
                                  drug_group=drug_group,
                                  input_type=input_type,
                                  query_type=query_type,
                                  data=data,
                                  num_predictions=num_predictions,
                                  browser_type=browser_type,
                                  job_id=job_id,
                                  filename=filename)


def initializeBrowser(browser_type, url=None):
    """Open a splinter browser and visit *url*.

    :arg browser_type: name of the browser to use. If ``None``, Chrome is 
        tried first and Firefox second
    :type browser_type: str

    :arg url: page to visit once the browser is open. 
        Default is the QuartataWeb home page
    :type url: str

    :returns: a tuple of the browser type that was used and the browser

    :raises TypeError: if *browser_type* is neither a string nor ``None``
    :raises ImportError: if splinter is not installed
    :raises ValueError: if no browser could be started
    """
    # cheap argument check first, before any optional package is imported
    if browser_type is not None and not isinstance(browser_type, str):
        raise TypeError('browser_type should be a string or None')

    try:
        from splinter import Browser
    except ImportError:
        raise ImportError('Browser module could not be imported. '
                            'install splinter package to solve the problem.')
    else:
        from selenium.common.exceptions import WebDriverException
    
    if url is None:
        url = "http://quartata.csb.pitt.edu"

    def launch(name):
        """Start browser *name* and visit *url*, closing it again on failure."""
        browser = Browser(name)
        try:
            browser.visit(url)
        except WebDriverException:
            # do not leave a stray window behind before a fallback is tried
            try:
                browser.quit()
            except Exception:
                pass
            raise
        return browser

    if browser_type is not None:
        try:
            browser = launch(browser_type)
        except WebDriverException:
            raise ValueError('No web driver found for browser_type. '
                                'Please specify a different browser type or download an appropriate driver.')
        return browser_type, browser

    for candidate in ('chrome', 'firefox'):
        try:
            browser = launch(candidate)
        except WebDriverException:
            continue
        return candidate, browser

    raise ValueError('No web driver found for Chrome or Firefox. '
                        'Please specify a different browser type or download an appropriate driver.')