Source code for get_activity_ct_pairs

"""
Get initial set of compound-target pairs with an associated activity
for the dataset.
"""

import sqlite3

import numpy as np
import pandas as pd

from dataset import Dataset


########### Get Initial Compound-Target Data From ChEMBL ###########

[docs]
def get_compound_target_pairs_with_pchembl(
    chembl_con: sqlite3.Connection,
    limit_to_literature: bool,
) -> pd.DataFrame:
    """
    Query ChEMBL activities and related assay for compound-target pairs
    with an associated pchembl value.
    Compound-target pairs are required to have a pchembl value.
    Salt forms of compounds are mapped to their parent form.
    If limit_to_literature is true, only literature sources will be considered.
    Otherwise, all sources are included.
    Includes information about targets, mutations and year of publication (based on docs).

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param limit_to_literature: Include only literature sources if True.
        Include all available sources otherwise.
    :type limit_to_literature: bool
    :return: Pandas DataFrame with compound-target pairs with a pchembl value.
    :rtype: pd.DataFrame
    """
    # NOTE: DO NOT USE DISTINCT
    # This query does not capture act.activity_id.
    # There can be mulitple activities with different activity_ids but the same queries values.
    # For accurate mean, median, and max pchembl these additional rows are important.
    sql = """
    SELECT act.pchembl_value, 
        md.molregno as parent_molregno, md.chembl_id as parent_chemblid, md.pref_name as parent_pref_name,
        md.max_phase, md.first_approval, md.usan_year, md.black_box_warning, 
        md.prodrug, md.oral, md.parenteral, md.topical, 
        ass.assay_type, ass.tid, 
        vs.mutation,
        td.chembl_id as target_chembl_id, td.pref_name as target_pref_name, td.target_type, td.organism, 
        docs.year
    FROM activities act
    INNER JOIN molecule_hierarchy mh 
        ON act.molregno = mh.molregno         -- act.molregno = salt_molregno
    INNER JOIN molecule_dictionary md
        ON mh.parent_molregno = md.molregno   -- compound information based on parent compound
    INNER JOIN assays ass 
        ON  act.assay_id = ass.assay_id
    LEFT JOIN variant_sequences vs
        ON ass.variant_id = vs.variant_id
    INNER JOIN target_dictionary td
        ON ass.tid = td.tid
    LEFT JOIN docs
        ON act.doc_id = docs.doc_id
    WHERE act.pchembl_value is not null
        and act.potential_duplicate = 0
        and act.standard_relation = '='
        and act.data_validity_comment is null
        and td.tid <>22226                    -- exclude unchecked targets
        and td.target_type like '%PROTEIN%'
    """
    if limit_to_literature:
        sql += """    and docs.src_id = 1"""

    df_mols = pd.read_sql_query(sql, con=chembl_con)

    # Set relevant combinations of columns for easier processing later
    # target_id_mutation
    df_mols["tid_mutation"] = np.where(
        df_mols["mutation"].notnull(),
        df_mols["tid"].astype("str") + "_" + df_mols["mutation"],
        df_mols["tid"].astype("str"),
    )
    df_mols["cpd_target_pair"] = [
        f"{a}_{b}" for a, b in zip(df_mols["parent_molregno"], df_mols["tid"])
    ]
    df_mols["cpd_target_pair_mutation"] = [
        f"{a}_{b}" for a, b in zip(df_mols["parent_molregno"], df_mols["tid_mutation"])
    ]

    return df_mols



########### Calculate Mean, Median, Max pchembl Values for Each Compound-Target Pair ###########

[docs]
def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
    """
    Aggregate the information about compound-target pairs for which
    there is more than one entry into one entry.
    Compound-target pairs are considered equal if parent_molregno (internal compound ID)
    and tid_mutation (target ID + mutation annotations) are equal.

    The following values are aggregated:

    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | pchembl_value_mean                            | mean pchembl value for a compound-target pair                                                 |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | pchembl_value_max                             | maximum pchembl value for a compound-target pair                                              |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | pchembl_value_median                          | median pchembl value for a compound-target pair                                               |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | first_publication_cpd_target_pair             | first publication in ChEMBL with this compound-target pair                                    |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | first_publication_cpd_target_pair_w_pchembl   | first publication in ChEMBL with this compound-target pair and an associated pchembl value    |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+

    :param df: Pandas DataFrame with compound-target pairs for which
        the information should be aggregated.
    :type df: pd.DataFrame
    :param suffix: Suffix indicating the type of the given DataFrame,
        e.g., _B for binding assays, _BF for binding+functional assays.
    :type suffix: str
    :return: Pandas DataFrame with 'parent_molregno', 'tid_mutation', and the aggregated columns.
    :rtype: pd.DataFrame
    """
    # pchembl mean, max, median
    df[f"pchembl_value_mean_{suffix}"] = df.groupby(
        ["parent_molregno", "tid_mutation"]
    )["pchembl_value"].transform("mean")
    df[f"pchembl_value_max_{suffix}"] = df.groupby(["parent_molregno", "tid_mutation"])[
        "pchembl_value"
    ].transform("max")
    df[f"pchembl_value_median_{suffix}"] = df.groupby(
        ["parent_molregno", "tid_mutation"]
    )["pchembl_value"].transform("median")

    # first publication of pair
    df[f"first_publication_cpd_target_pair_{suffix}"] = df.groupby(
        ["parent_molregno", "tid_mutation"]
    )["year"].transform("min")

    # first publication of pair with pchembl value
    df_mols_all_first_publication_pchembl = (
        df[df["pchembl_value"].notnull()]
        .groupby(["parent_molregno", "tid_mutation"])["year"]
        .min()
        .reset_index()
        .rename(
            columns={"year": f"first_publication_cpd_target_pair_w_pchembl_{suffix}"}
        )
    )
    df = df.merge(
        df_mols_all_first_publication_pchembl,
        on=["parent_molregno", "tid_mutation"],
        how="left",
    )

    # return relevant summarised information without duplicates
    df = df[
        [
            "parent_molregno",
            "tid_mutation",
            f"pchembl_value_mean_{suffix}",
            f"pchembl_value_max_{suffix}",
            f"pchembl_value_median_{suffix}",
            f"first_publication_cpd_target_pair_{suffix}",
            f"first_publication_cpd_target_pair_w_pchembl_{suffix}",
        ]
    ].drop_duplicates()

    return df



########### Get Aggregated Compound-Target Pair Information ###########

[docs]
def get_aggregated_compound_target_pairs_with_pchembl(
    chembl_con: sqlite3.Connection,
    limit_to_literature: bool,
) -> pd.DataFrame:
    """
    Get dataset of compound target-pairs with an associated pchembl value
    with pchembl and publication dates aggregated into one entry per pair.

    Values are aggregated for

    - a subset of the initial dataset based on binding and functional assays (suffix '_BF') and
    - a subset of the initial dataset set on only binding assays (suffix '_B').

    Therefore, there are two columns for pchembl_value_mean, _max, _median,
    first_publication_cpd_target_pair and first_publication_cpd_target_pair_w_pchembl,
    one with the suffix '_BF' based on binding + functional data
    and one with the suffix '_B' based on only binding data.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param limit_to_literature: Include only literature sources if True.
        Include all available sources otherwise.
    :type limit_to_literature: bool
    :return: Pandas Dataframe with compound-target pairs
        based on ChEMBL activity data aggregated into one entry per compound-target pair.
    :rtype: pd.DataFrame
    """
    df_mols = get_compound_target_pairs_with_pchembl(
        chembl_con,
        limit_to_literature,
    )

    # Summarise the information for binding and functional assays
    suffix = "BF"
    df_mols_bf = df_mols[
        (df_mols["assay_type"] == "B") | (df_mols["assay_type"] == "F")
    ].copy()
    df_mols_bf = get_average_info(df_mols_bf, suffix)

    # Summarise the information for only binding assays
    suffix = "B"
    df_mols_b = df_mols[df_mols["assay_type"] == "B"].copy()
    df_mols_b = get_average_info(df_mols_b, suffix)

    # Combine both into one table with two columns per value
    # (one with suffix '_BF' for binding+functional and one with suffix '_B' for binding).
    # df_mols_B is a subset of the compound-target pairs of df_mols_BF
    df_combined = df_mols_bf.merge(
        df_mols_b, on=["parent_molregno", "tid_mutation"], how="left"
    )
    # Merge with other information from df_mols
    # left merge because df_mols may contain assays that are
    # of other types than binding / functional
    df_combined = df_combined.merge(
        df_mols.drop(columns=["pchembl_value", "year", "assay_type"]).drop_duplicates(),
        on=["parent_molregno", "tid_mutation"],
        how="left",
    )

    return df_combined




[docs]
def get_aggregated_activity_ct_pairs(
    chembl_con: sqlite3.Connection,
    limit_to_literature: bool,
) -> Dataset:
    """
    Wrapper for get_aggregated_compound_target_pairs_with_pchembl,
    initialising a dataset.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param limit_to_literature: Include only literature sources if True.
        Include all available sources otherwise.
    :type limit_to_literature: bool
    :return: Dataset with a pandas Dataframe with compound-target pairs
        based on ChEMBL activity data aggregated into one entry per compound-target pair.
    :rtype: Dataset
    """
    df_result = get_aggregated_compound_target_pairs_with_pchembl(
        chembl_con, limit_to_literature
    )

    dataset = Dataset(
        df_result,
        set(),
        set(),
        pd.DataFrame(),
        pd.DataFrame(),
    )
    return dataset