Source code for add_chembl_compound_properties

"""
Add ChEMBL compound properties to the dataset.
"""

import sqlite3

import pandas as pd

from dataset import Dataset
import sanity_checks


########### Add Compound Properties Based on ChEMBL Data ###########
[docs] def get_first_publication_cpd_date( chembl_con: sqlite3.Connection, limit_to_literature: bool ) -> pd.DataFrame: """ Query and calculate the first publication of a compound based on ChEMBL data (column name: first_publication_cpd). If limit_to_literature is True, this corresponds to the first appearance of the compound in the literature according to ChEMBL. Otherwise this is the first appearance in any source in ChEMBL. :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :param limit_to_literature: Base first_publication_cpd on literature sources only if True. :type limit_to_literature: bool :return: Pandas DataFrame with parent_molregno and first_publication_cpd from ChEMBL. :rtype: pd.DataFrame """ # information about salts is aggregated in the parent sql = """ SELECT DISTINCT docs.year, mh.parent_molregno FROM docs LEFT JOIN compound_records cr ON docs.doc_id = cr.doc_id INNER JOIN molecule_hierarchy mh ON cr.molregno = mh.molregno -- cr.molregno = salt_molregno WHERE docs.year is not null """ if limit_to_literature: sql += """ and docs.src_id = 1""" df_docs = pd.read_sql_query(sql, con=chembl_con) df_docs["first_publication_cpd"] = df_docs.groupby("parent_molregno")[ "year" ].transform("min") df_docs = df_docs[["parent_molregno", "first_publication_cpd"]].drop_duplicates() return df_docs
[docs] def get_chembl_properties_and_structures( chembl_con: sqlite3.Connection, ) -> pd.DataFrame: """ Get compound properties from the compound_properties table (e.g., alogp, #hydrogen bond acceptors / donors, etc.). Get InChI, InChI key and canonical smiles. :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :return: Pandas DataFrame with compound properties and structures for all compound ids in ChEMBL :rtype: pd.DataFrame """ sql = """ SELECT DISTINCT mh.parent_molregno, cp.mw_freebase, cp.alogp, cp.hba, cp.hbd, cp.psa, cp.rtb, cp.ro3_pass, cp.num_ro5_violations, cp.cx_most_apka, cp.cx_most_bpka, cp.cx_logp, cp.cx_logd, cp.molecular_species, cp.full_mwt, cp.aromatic_rings, cp.heavy_atoms, cp.qed_weighted, cp.mw_monoisotopic, cp.full_molformula, cp.hba_lipinski, cp.hbd_lipinski, cp.num_lipinski_ro5_violations, struct.standard_inchi, struct.standard_inchi_key, struct.canonical_smiles FROM compound_properties cp INNER JOIN molecule_hierarchy mh ON cp.molregno = mh.parent_molregno INNER JOIN compound_structures struct ON mh.parent_molregno = struct.molregno """ df_cpd_props = pd.read_sql_query(sql, con=chembl_con) return df_cpd_props
[docs] def calculate_ligand_efficiency_metrics(dataset: Dataset): """ Calculate and add the ligand efficiency metrics for the compounds based on the mean pchembl values for a compound-target pair and the following ligand efficiency (LE) formulas: .. math:: LE &= \\frac{\\Delta G}{HA} \\qquad \\qquad \\text{where } \\Delta G = - RT \\ln(K_d) \\text{, } - RT\\ln(K_i) \\text{, or} - RT\\ln(IC_{50}) LE &= \\frac{2.303 \\cdot 298 \\cdot 0.00199 \\cdot pchembl \\_ value} {heavy \\_ atoms} BEI &= \\frac{pchembl \\_ mean \\cdot 1000}{mw \\_ freebase} SEI &= \\frac{pchembl \\_ mean \\cdot 100}{PSA} LLE &= pchembl \\_ mean - ALOGP Since LE metrics are based on pchembl values, they are calculated twice. Once for the pchembl values based on binding + functional assays (BF) and once for the pchembl values based on binding assays only (B). :param dataset: Dataset with compound-target pairs. Will be updated to include ligand efficiency metrics. :type dataset: Dataset """ for suffix in ["BF", "B"]: dataset.df_result.loc[dataset.df_result["heavy_atoms"] != 0, f"LE_{suffix}"] = ( dataset.df_result[f"pchembl_value_mean_{suffix}"] / dataset.df_result["heavy_atoms"] * (2.303 * 298 * 0.00199) ) dataset.df_result.loc[ dataset.df_result["mw_freebase"] != 0, f"BEI_{suffix}" ] = ( dataset.df_result[f"pchembl_value_mean_{suffix}"] * 1000 / dataset.df_result["mw_freebase"] ) dataset.df_result.loc[dataset.df_result["psa"] != 0, f"SEI_{suffix}"] = ( dataset.df_result[f"pchembl_value_mean_{suffix}"] * 100 / dataset.df_result["psa"] ) dataset.df_result[f"LLE_{suffix}"] = ( dataset.df_result[f"pchembl_value_mean_{suffix}"] - dataset.df_result["alogp"] ) dataset.df_result = dataset.df_result.astype( { f"LE_{suffix}": "float64", f"BEI_{suffix}": "float64", f"SEI_{suffix}": "float64", f"LLE_{suffix}": "float64", } )
[docs] def get_atc_classification(chembl_con: sqlite3.Connection) -> pd.DataFrame: """ Query ATC classifications (level 1) from the atc_classification and molecule_atc_classification tables. ATC level annotations for the same parent_molregno are combined into one description that concatenates all descriptions sorted alphabetically into one string with ' | ' as a separator. :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :return: Pandas DataFrame with ATC annotations in ChEMBL. :rtype: pd.DataFrame """ sql = """ SELECT DISTINCT mh.parent_molregno, atc.level1, atc.level1_description FROM atc_classification atc INNER JOIN molecule_atc_classification matc ON atc.level5 = matc.level5 INNER JOIN molecule_hierarchy mh ON matc.molregno = mh.molregno """ atc_levels = pd.read_sql_query(sql, con=chembl_con) atc_levels["l1_full"] = ( atc_levels["level1"] + "_" + atc_levels["level1_description"] ) # Combine ATC level annotations between_str_join = " | " atc_levels["atc_level1"] = atc_levels.groupby(["parent_molregno"])[ "l1_full" ].transform(lambda x: between_str_join.join(sorted(x))) atc_levels = atc_levels[["parent_molregno", "atc_level1"]].drop_duplicates() return atc_levels
[docs] def add_all_chembl_compound_properties( dataset: Dataset, chembl_con: sqlite3.Connection, limit_to_literature: bool ): """ Add ChEMBL-based compound properties to the given compound-target pairs, specifically: - the first publication date of a compound (first_publication_cpd) - ChEMBL compound properties - InChI, InChI key and canonical smiles - ligand efficiency metrics - ATC classifications :param dataset: Dataset with compound-target pairs. Will be updated to include compound properties. :type dataset: Dataset :param chembl_con: Sqlite3 connection to ChEMBL database. :type chembl_con: sqlite3.Connection :param limit_to_literature: Base first_publication_cpd on literature sources only if True. Base it on all available sources otherwise. :type limit_to_literature: bool """ df_docs = get_first_publication_cpd_date(chembl_con, limit_to_literature) dataset.df_result = dataset.df_result.merge( df_docs, on="parent_molregno", how="left" ) df_cpd_props = get_chembl_properties_and_structures(chembl_con) dataset.df_cpd_props = df_cpd_props dataset.df_result = dataset.df_result.merge( df_cpd_props, on="parent_molregno", how="left" ) sanity_checks.check_compound_props(dataset.df_result, df_cpd_props) calculate_ligand_efficiency_metrics(dataset) sanity_checks.check_ligand_efficiency_metrics(dataset.df_result) atc_levels = get_atc_classification(chembl_con) dataset.atc_levels = atc_levels dataset.df_result = dataset.df_result.merge( atc_levels, on="parent_molregno", how="left" ) sanity_checks.check_atc(dataset.df_result, atc_levels)