# Source code for get_drug_mechanism_ct_pairs

"""
Get and add compound-target pairs based on information 
in the drug_mechanism table.
"""

import logging
import sqlite3

import pandas as pd

from dataset import Dataset
import sanity_checks


########### Extract Drug-Target Interactions From the drug_mechanism Table ###########
def get_drug_mechanisms_interactions(chembl_con: sqlite3.Connection) -> pd.DataFrame:
    """
    Extract the known compound-target interactions from the ChEMBL drug_mechanism table.

    Note: While the interactions are mostly between drugs and targets,
    the table also includes some known interactions between compounds
    with a max_phase < 4 and their targets.

    Only entries with a disease_efficacy of 1 are taken into account,
    i.e., the target is believed to play a role in the efficacy of the drug.
    Entries without a target id (tid) are skipped.

    *disease_efficacy: Flag to show whether the target assigned is believed
    to play a role in the efficacy of the drug in the indication(s)
    for which it is approved (1 = yes, 0 = no).*

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :return: Pandas DataFrame with the distinct (parent_molregno, tid) pairs
        from the drug_mechanism table with disease relevance.
    :rtype: pd.DataFrame
    """
    # Compounds are reported via their parent (molecule_hierarchy.parent_molregno).
    # No molecule_dictionary column is selected; the inner join only restricts
    # the result to parents that have a molecule_dictionary entry.
    sql = """
    SELECT DISTINCT mh.parent_molregno, dm.tid
    FROM drug_mechanism dm
    INNER JOIN molecule_hierarchy mh
        ON dm.molregno = mh.molregno
    INNER JOIN molecule_dictionary md
        ON mh.parent_molregno = md.molregno
    WHERE dm.disease_efficacy = 1
        and dm.tid is not null
    """
    df_dti = pd.read_sql_query(sql, con=chembl_con)
    return df_dti
def get_relevant_tid_mappings(chembl_con: sqlite3.Connection) -> pd.DataFrame:
    """
    Get DataFrame with mappings from target id to their related target ids
    based on the target_relations table.

    The following mappings are considered:

    +-------------------------------+-----------------------+----------------+
    |protein family                 | -[superset of]->      | single protein |
    +-------------------------------+-----------------------+----------------+
    |protein complex                | -[superset of]->      | single protein |
    +-------------------------------+-----------------------+----------------+
    |protein complex group          | -[superset of]->      | single protein |
    +-------------------------------+-----------------------+----------------+
    |single protein                 | -[equivalent to]->    | single protein |
    +-------------------------------+-----------------------+----------------+
    |chimeric protein               | -[superset of]->      | single protein |
    +-------------------------------+-----------------------+----------------+
    |protein-protein interaction    | -[superset of]->      | single protein |
    +-------------------------------+-----------------------+----------------+

    These mappings can be used to increase the number of target ids
    for which there is data in the drug_mechanism table.
    For example, for *protein family -[superset of]-> single protein* this means:
    If there is a known relevant interaction between a compound and a protein family,
    interactions between the compound and single proteins of that protein family
    are considered to be known interactions as well.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :return: Pandas DataFrame with mappings from tid to related tid
        for the defined subset of target relations. Rows are grouped
        in the order of the table above.
    :rtype: pd.DataFrame
    """
    sql = """
    SELECT DISTINCT tr.tid, tr.relationship, tr.related_tid,
        td1.pref_name as pref_name_1,
        td1.target_type as target_type_1,
        td1.organism as organism_1,
        td2.pref_name as pref_name_2,
        td2.target_type as target_type_2,
        td2.organism as organism_2
    FROM target_relations tr
    INNER JOIN target_dictionary td1
        ON tr.tid = td1.tid
    INNER JOIN target_dictionary td2
        ON tr.related_tid = td2.tid
    """
    df_related_targets = pd.read_sql_query(sql, con=chembl_con)

    # (type of the source target, relationship) pairs that are mapped;
    # the related target always has to be a single protein.
    relevant_relations = [
        ("PROTEIN FAMILY", "SUPERSET OF"),
        ("PROTEIN COMPLEX", "SUPERSET OF"),
        ("PROTEIN COMPLEX GROUP", "SUPERSET OF"),
        ("SINGLE PROTEIN", "EQUIVALENT TO"),
        ("CHIMERIC PROTEIN", "SUPERSET OF"),
        ("PROTEIN-PROTEIN INTERACTION", "SUPERSET OF"),
    ]

    related_is_single_protein = df_related_targets["target_type_2"] == "SINGLE PROTEIN"
    # One filtered frame per relation, concatenated in the order listed above.
    mappings = [
        df_related_targets[
            (df_related_targets["target_type_1"] == source_type)
            & related_is_single_protein
            & (df_related_targets["relationship"] == relationship)
        ]
        for source_type, relationship in relevant_relations
    ]
    relevant_tid_mappings = pd.concat(mappings)
    return relevant_tid_mappings
def add_annotations_to_drug_mechanisms_cti(
    chembl_con: sqlite3.Connection, cpd_target_pairs: pd.DataFrame
) -> pd.DataFrame:
    """
    Add additional information to the compound-target pairs
    from the drug_mechanism table to match the information that is present
    in the compound-target pairs table based on activities.

    The input DataFrame is not modified; the annotations are added to a copy.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param cpd_target_pairs: Pandas DataFrame with compound-target pairs
        from the drug_mechanism table
        (needs the columns parent_molregno and tid).
    :type cpd_target_pairs: pd.DataFrame
    :return: New pandas DataFrame with the additional annotations.
    :rtype: pd.DataFrame
    """
    # Work on a copy so that the columns added below do not leak
    # into the DataFrame owned by the caller.
    cpd_target_pairs = cpd_target_pairs.copy()

    ##### Set columns existing in the df_results table. #####
    # None of the targets from the drug mechanism table have any mutation annotation,
    # hence tid_mutation = tid
    cpd_target_pairs["tid_mutation"] = cpd_target_pairs["tid"].astype("str")
    cpd_target_pairs["cpd_target_pair"] = [
        f"{a}_{b}"
        for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
    ]
    cpd_target_pairs["cpd_target_pair_mutation"] = [
        f"{a}_{b}"
        for a, b in zip(
            cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid_mutation"]
        )
    ]
    # New column: is the compound target pair in the drug_mechanism table?
    # Always True here because every pair passed in stems from that table.
    cpd_target_pairs["pair_mutation_in_dm_table"] = True
    cpd_target_pairs["pair_in_dm_table"] = True

    ##### Query and combine compound information with compound-target pairs #####
    sql = """
    SELECT md.molregno as parent_molregno,
        md.chembl_id as parent_chemblid,
        md.pref_name as parent_pref_name,
        md.max_phase,
        md.first_approval,
        md.usan_year,
        md.black_box_warning,
        md.prodrug,
        md.oral,
        md.parenteral,
        md.topical
    FROM molecule_dictionary md
    """
    df_compound_info = pd.read_sql_query(sql, con=chembl_con)
    cpd_target_pairs = cpd_target_pairs.merge(
        df_compound_info, on="parent_molregno", how="left"
    )

    ##### Query and combine target information with compound-target pairs #####
    sql = """
    SELECT td.tid,
        td.chembl_id as target_chembl_id,
        td.pref_name as target_pref_name,
        td.target_type,
        td.organism
    FROM target_dictionary td
    """
    df_target_info = pd.read_sql_query(sql, con=chembl_con)
    # Fix problems with null not being recognised as None:
    # organisms read as the literal string "null" are replaced by a missing value.
    is_null_string = df_target_info["organism"].astype(str) == "null"
    df_target_info.loc[is_null_string, "organism"] = None
    cpd_target_pairs = cpd_target_pairs.merge(df_target_info, on="tid", how="left")

    return cpd_target_pairs
def get_drug_mechanism_ct_pairs(chembl_con: sqlite3.Connection) -> pd.DataFrame:
    """
    Get compound-target pairs from the drug_mechanism table
    with all the columns that are present
    in the compound-target pairs based on activities.
    Relevant mappings of target ids to related target ids are taken into account.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :return: Pandas DataFrame with compound-target interactions
        from the drug_mechanism table.
    :rtype: pd.DataFrame
    """
    # get known compound-target interactions (CTI) from the drug_mechanism table
    df_dti = get_drug_mechanisms_interactions(chembl_con)

    # Query target_relations for related target ids to increase the number
    # of target ids for which there is data in the drug_mechanism table.
    relevant_tid_mappings = get_relevant_tid_mappings(chembl_con)

    # table with mapped target ids: every direct pair whose tid has a relevant
    # mapping yields an additional pair (compound, related_tid)
    df_dti_mapped_targets = df_dti.merge(relevant_tid_mappings, on="tid", how="inner")
    mapped_pairs = df_dti_mapped_targets[["parent_molregno", "related_tid"]].rename(
        columns={"related_tid": "tid"}
    )

    # combine CTIs from drug_mechanism table with mapped CTIs;
    # a pair may be reachable both directly and via a mapping, hence drop_duplicates
    direct_pairs = df_dti[["parent_molregno", "tid"]]
    cpd_target_pairs = pd.concat([direct_pairs, mapped_pairs]).drop_duplicates()

    cpd_target_pairs = add_annotations_to_drug_mechanisms_cti(
        chembl_con, cpd_target_pairs
    )
    return cpd_target_pairs
########### Add Compounds From the drug_mechanism Table to the Dataset ###########
def add_dm_filtering_columns(dataset: Dataset):
    """
    Add filtering columns related to the drug_mechanism table.

    - pair_mutation_in_dm_table: pair is in dm table (incl. mutations)
    - pair_in_dm_table: pair is in dm table (excl. mutations)
    - keep_for_binding: use to limit to binding assays

    The columns are written to ``dataset.df_result`` in place;
    existing columns with these names are overwritten.

    :param dataset: Dataset whose ``df_result`` holds the compound-target pairs
        and whose ``drug_mechanism_pairs_set`` holds the pairs
        of the drug_mechanism table.
    :type dataset: Dataset
    """
    df_result = dataset.df_result
    dm_pairs = dataset.drug_mechanism_pairs_set

    # True if the compound-target pair (taking mutation annotations into account)
    # is in the drug_mechanism table.
    df_result["pair_mutation_in_dm_table"] = df_result["cpd_target_pair_mutation"].isin(
        dm_pairs
    )

    # True if the compound-target pair (NOT taking mutation annotations into account)
    # is in the drug_mechanism table.
    df_result["pair_in_dm_table"] = df_result["cpd_target_pair"].isin(dm_pairs)

    # True if the row should be kept when limiting the dataset
    # to data based on binding assays.
    # Rows are kept if
    # - there is a binding data-based pchembl value or
    # - the compound-target pair (including mutation info) is in the drug_mechanism table
    df_result["keep_for_binding"] = (
        df_result["pchembl_value_mean_B"].notnull()
        | df_result["pair_mutation_in_dm_table"]
    )
def add_drug_mechanism_ct_pairs(dataset: Dataset, chembl_con: sqlite3.Connection):
    """
    Add compound-target pairs from the drug_mechanism table
    that are not in the dataset based on the initial ChEMBL query.
    These are compound-target pairs for which there is no associated pchembl value data.
    Since the pairs are known interactions,
    they are added to the dataset despite not having a pchembl value.
    Add the set of compound-target pairs in the drug_mechanism table and
    the set of targets in the drug_mechanism table to the dataset.

    ``dataset`` is updated in place: ``drug_mechanism_pairs_set`` and
    ``drug_mechanism_targets_set`` are set and ``df_result`` is replaced
    by the extended table including the drug_mechanism filtering columns.

    :param dataset: Dataset with compound-target pairs
        based on ChEMBL activity data in ``df_result``.
    :type dataset: Dataset
    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    """
    cpd_target_pairs = get_drug_mechanism_ct_pairs(chembl_con)

    # Keys have the form "<parent_molregno>_<tid>".
    dataset.drug_mechanism_pairs_set = {
        f"{a}_{b}"
        for a, b in zip(cpd_target_pairs["parent_molregno"], cpd_target_pairs["tid"])
    }
    dataset.drug_mechanism_targets_set = set(cpd_target_pairs["tid"])

    ##### Limit the drug_mechanism pairs to the ones that are not yet in the dataset. #####
    # Mutation annotations are taken into account.
    # Therefore, *(cpd A, target B without mutation)* will be added
    # if a pchembl is present for *(cpd A, target B with mutation C)*
    # but not for *(cpd A, target B without mutation)*.
    existing_pairs = set(dataset.df_result["cpd_target_pair_mutation"])
    cpd_target_pairs = cpd_target_pairs[
        ~cpd_target_pairs["cpd_target_pair_mutation"].isin(existing_pairs)
    ].copy()
    logging.debug(
        "#Pairs not yet present based on binding or functional assays: %s",
        len(cpd_target_pairs),
    )

    # Combined data of existing query with new compound-target pairs.
    dataset.df_result = pd.concat([dataset.df_result, cpd_target_pairs])

    add_dm_filtering_columns(dataset)

    sanity_checks.check_pairs_without_pchembl_are_in_drug_mechanisms(dataset.df_result)