Source code for add_chembl_compound_properties

"""
Add ChEMBL compound properties to the dataset.
"""

import sqlite3

import pandas as pd

from dataset import Dataset
import sanity_checks


########### Add Compound Properties Based on ChEMBL Data ###########

[docs]
def get_first_publication_cpd_date(
    chembl_con: sqlite3.Connection, limit_to_literature: bool
) -> pd.DataFrame:
    """
    Determine the earliest document year recorded in ChEMBL for every
    parent compound (column name: first_publication_cpd).

    With limit_to_literature set to True only documents with src_id = 1
    are taken into account, i.e., the result is the first appearance of the
    compound in the literature according to ChEMBL.
    Otherwise documents from all sources are considered.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param limit_to_literature: Restrict the query to literature sources
        (docs.src_id = 1) if True.
    :type limit_to_literature: bool
    :return: Pandas DataFrame with one row per distinct
        (parent_molregno, first_publication_cpd) combination.
    :rtype: pd.DataFrame
    """
    # Salts are mapped onto their parent via molecule_hierarchy,
    # so the year is collected on the level of the parent compound.
    # Note: the INNER JOIN that follows the LEFT JOIN discards documents
    # without compound records, i.e., the LEFT JOIN acts like an inner join.
    query = """
    SELECT DISTINCT docs.year, mh.parent_molregno
    FROM docs
    LEFT JOIN compound_records cr
        ON docs.doc_id = cr.doc_id
    INNER JOIN molecule_hierarchy mh 
        ON cr.molregno = mh.molregno   -- cr.molregno = salt_molregno
    WHERE docs.year is not null
    """
    if limit_to_literature:
        query += """    and docs.src_id = 1"""
    years_per_parent = pd.read_sql_query(query, con=chembl_con)

    # Broadcast the minimum year of each parent to all of its rows ...
    earliest_year = years_per_parent.groupby("parent_molregno")["year"].transform(
        "min"
    )
    years_per_parent["first_publication_cpd"] = earliest_year

    # ... and collapse the now identical rows of a parent into one.
    result_columns = ["parent_molregno", "first_publication_cpd"]
    return years_per_parent[result_columns].drop_duplicates()




[docs]
def get_chembl_properties_and_structures(
    chembl_con: sqlite3.Connection,
) -> pd.DataFrame:
    """
    Query the compound_properties table (e.g., alogp, #hydrogen bond
    acceptors / donors, etc.) together with the structure information
    (InChI, InChI key and canonical smiles) from compound_structures.

    Both tables are joined via molecule_hierarchy.parent_molregno,
    i.e., only rows of compounds that occur as a parent are returned.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :return: Pandas DataFrame with parent_molregno, the compound properties
        and the structures, one row per distinct combination.
    :rtype: pd.DataFrame
    """
    query = """
    SELECT DISTINCT mh.parent_molregno, 
        cp.mw_freebase, cp.alogp, cp.hba, cp.hbd, cp.psa, cp.rtb, cp.ro3_pass, cp.num_ro5_violations, 
        cp.cx_most_apka, cp.cx_most_bpka, cp.cx_logp, cp.cx_logd, cp.molecular_species, cp.full_mwt, 
        cp.aromatic_rings, cp.heavy_atoms, cp.qed_weighted, cp.mw_monoisotopic, cp.full_molformula, 
        cp.hba_lipinski, cp.hbd_lipinski, cp.num_lipinski_ro5_violations, 
        struct.standard_inchi, struct.standard_inchi_key, struct.canonical_smiles
    FROM compound_properties cp
    INNER JOIN molecule_hierarchy mh
        ON cp.molregno = mh.parent_molregno
    INNER JOIN compound_structures struct
        ON mh.parent_molregno = struct.molregno
    """

    # DISTINCT removes the duplicates that arise because a parent
    # matches one molecule_hierarchy row per salt / child compound.
    return pd.read_sql_query(query, con=chembl_con)




[docs]
def calculate_ligand_efficiency_metrics(dataset: Dataset):
    """
    Derive ligand efficiency metrics from the mean pchembl value of each
    compound-target pair and store them as new columns of dataset.df_result.
    The metrics follow these ligand efficiency (LE) formulas:

    .. math::
        LE &= \\frac{\\Delta G}{HA}
            \\qquad \\qquad \\text{where } \\Delta G = - RT \\ln(K_d)
            \\text{, } - RT\\ln(K_i)
            \\text{,  or} - RT\\ln(IC_{50})

        LE &= \\frac{2.303 \\cdot 298 \\cdot 0.00199 \\cdot pchembl \\_ value} {heavy \\_ atoms}

        BEI &= \\frac{pchembl \\_ mean \\cdot 1000}{mw \\_ freebase}

        SEI &= \\frac{pchembl \\_ mean \\cdot 100}{PSA}

        LLE &= pchembl \\_ mean - ALOGP

    All four metrics are computed twice because they depend on the pchembl
    values: once with the values based on binding + functional assays
    (suffix BF) and once with the values based on binding assays only
    (suffix B).

    LE, BEI and SEI are only assigned for rows whose denominator
    (heavy_atoms, mw_freebase, psa) differs from 0; the remaining rows
    of a newly created column stay empty (NaN).

    :param dataset: Dataset with compound-target pairs.
        dataset.df_result is replaced by a DataFrame that includes the
        ligand efficiency metrics as float64 columns.
    :type dataset: Dataset
    """
    for suffix in ("BF", "B"):
        # Fetch the frame anew in every pass: it is rebound by astype() below.
        frame = dataset.df_result
        pchembl_mean = frame[f"pchembl_value_mean_{suffix}"]

        # LE: scaled by 2.303 * R * T, skipped where heavy_atoms is 0
        heavy_atoms = frame["heavy_atoms"]
        frame.loc[heavy_atoms != 0, f"LE_{suffix}"] = (
            pchembl_mean / heavy_atoms * (2.303 * 298 * 0.00199)
        )

        # BEI: skipped where mw_freebase is 0
        mw_freebase = frame["mw_freebase"]
        frame.loc[mw_freebase != 0, f"BEI_{suffix}"] = (
            pchembl_mean * 1000 / mw_freebase
        )

        # SEI: skipped where psa is 0
        psa = frame["psa"]
        frame.loc[psa != 0, f"SEI_{suffix}"] = pchembl_mean * 100 / psa

        # LLE: plain difference, no denominator to guard
        frame[f"LLE_{suffix}"] = pchembl_mean - frame["alogp"]

        metric_dtypes = {
            f"{metric}_{suffix}": "float64" for metric in ("LE", "BEI", "SEI", "LLE")
        }
        dataset.df_result = frame.astype(metric_dtypes)




[docs]
def get_atc_classification(chembl_con: sqlite3.Connection) -> pd.DataFrame:
    """
    Collect the ATC classifications (level 1) of all parent compounds from
    the atc_classification and molecule_atc_classification tables.

    Every annotation is written as '<level1>_<level1_description>'.
    If a parent_molregno has several level 1 annotations, they are sorted
    alphabetically and concatenated into a single string
    with ' | ' as a separator.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :return: Pandas DataFrame with the columns parent_molregno and atc_level1.
    :rtype: pd.DataFrame
    """
    query = """
    SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description
    FROM atc_classification atc
    INNER JOIN molecule_atc_classification matc
        ON atc.level5 = matc.level5
    INNER JOIN molecule_hierarchy mh
        ON matc.molregno = mh.molregno
    """
    annotations = pd.read_sql_query(query, con=chembl_con)

    # Full label of a single annotation: code and description
    annotations["l1_full"] = (
        annotations["level1"] + "_" + annotations["level1_description"]
    )

    separator = " | "

    def join_sorted(labels):
        """Concatenate the labels of one parent in alphabetical order."""
        return separator.join(sorted(labels))

    # Give every row of a parent the combined label of that parent,
    # then keep only one row per parent.
    labels_per_parent = annotations.groupby("parent_molregno")["l1_full"]
    annotations["atc_level1"] = labels_per_parent.transform(join_sorted)

    return annotations[["parent_molregno", "atc_level1"]].drop_duplicates()




[docs]
def add_all_chembl_compound_properties(
    dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool
):
    """
    Extend the compound-target pairs in dataset.df_result with
    ChEMBL-based compound properties. In this order, the following is added:

    - the first publication date of a compound (first_publication_cpd)
    - ChEMBL compound properties
    - InChI, InChI key and canonical smiles
    - ligand efficiency metrics
    - ATC classifications

    Every table is attached with a left join on parent_molregno.
    The queried compound properties and ATC annotations are additionally
    stored on the dataset (dataset.df_cpd_props, dataset.atc_levels).

    :param dataset: Dataset with compound-target pairs.
        Will be updated to include compound properties.
    :type dataset: Dataset
    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param limit_to_literature: Base first_publication_cpd on literature sources only if True.
        Base it on all available sources otherwise.
    :type limit_to_literature: bool
    """
    # First publication of each compound
    first_publications = get_first_publication_cpd_date(
        chembl_con, limit_to_literature
    )
    dataset.df_result = pd.merge(
        dataset.df_result, first_publications, on="parent_molregno", how="left"
    )

    # Compound properties and structures
    compound_props = get_chembl_properties_and_structures(chembl_con)
    dataset.df_cpd_props = compound_props
    dataset.df_result = pd.merge(
        dataset.df_result, compound_props, on="parent_molregno", how="left"
    )
    sanity_checks.check_compound_props(dataset.df_result, compound_props)

    # Ligand efficiency metrics (require the compound properties added above)
    calculate_ligand_efficiency_metrics(dataset)
    sanity_checks.check_ligand_efficiency_metrics(dataset.df_result)

    # ATC classifications
    atc_annotations = get_atc_classification(chembl_con)
    dataset.atc_levels = atc_annotations
    dataset.df_result = pd.merge(
        dataset.df_result, atc_annotations, on="parent_molregno", how="left"
    )
    sanity_checks.check_atc(dataset.df_result, atc_annotations)