"""
Get initial set of compound-target pairs with an associated activity
for the dataset.
"""
import sqlite3
import numpy as np
import pandas as pd
from dataset import Dataset
########### Get Initial Compound-Target Data From ChEMBL ###########
def get_compound_target_pairs_with_pchembl(
    chembl_con: sqlite3.Connection,
    limit_to_literature: bool,
) -> pd.DataFrame:
    """
    Load all ChEMBL activity rows that have a pchembl value, one row per activity.

    Each row combines the activity's pchembl value with information about the
    parent compound (salt forms are mapped to their parent via molecule_hierarchy),
    the assay type, the target, the variant mutation (if any) and the
    publication year of the associated document.

    Rows are kept only if the pchembl value is set, the activity is not flagged
    as a potential duplicate, the standard relation is '=', there is no data
    validity comment, the target is not tid 22226 and the target type
    contains 'PROTEIN'.

    Three helper columns are added to the query result:

    - tid_mutation: the tid, followed by '_<mutation>' when a mutation is annotated
    - cpd_target_pair: '<parent_molregno>_<tid>'
    - cpd_target_pair_mutation: '<parent_molregno>_<tid_mutation>'

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param limit_to_literature: If True, restrict the query to documents with src_id = 1
        (literature sources). Otherwise all sources are included.
    :type limit_to_literature: bool
    :return: Pandas DataFrame with one row per matching activity.
    :rtype: pd.DataFrame
    """
    # NOTE: DO NOT USE DISTINCT
    # The query does not select act.activity_id, so several activities with
    # different activity_ids can produce rows with identical queried values.
    # These repeated rows must be kept: they are needed for an accurate
    # mean, median and max pchembl per compound-target pair.
    base_query = """
SELECT act.pchembl_value,
md.molregno as parent_molregno, md.chembl_id as parent_chemblid, md.pref_name as parent_pref_name,
md.max_phase, md.first_approval, md.usan_year, md.black_box_warning,
md.prodrug, md.oral, md.parenteral, md.topical,
ass.assay_type, ass.tid,
vs.mutation,
td.chembl_id as target_chembl_id, td.pref_name as target_pref_name, td.target_type, td.organism,
docs.year
FROM activities act
INNER JOIN molecule_hierarchy mh
ON act.molregno = mh.molregno -- act.molregno = salt_molregno
INNER JOIN molecule_dictionary md
ON mh.parent_molregno = md.molregno -- compound information based on parent compound
INNER JOIN assays ass
ON act.assay_id = ass.assay_id
LEFT JOIN variant_sequences vs
ON ass.variant_id = vs.variant_id
INNER JOIN target_dictionary td
ON ass.tid = td.tid
LEFT JOIN docs
ON act.doc_id = docs.doc_id
WHERE act.pchembl_value is not null
and act.potential_duplicate = 0
and act.standard_relation = '='
and act.data_validity_comment is null
and td.tid <>22226 -- exclude unchecked targets
and td.target_type like '%PROTEIN%'
"""
    # Optional extra WHERE condition appended to the end of the query text.
    literature_clause = " and docs.src_id = 1" if limit_to_literature else ""
    pairs = pd.read_sql_query(base_query + literature_clause, con=chembl_con)

    # Derived identifier columns for easier processing later.
    # Target ID, extended by the mutation annotation where one exists.
    tid_text = pairs["tid"].astype("str")
    has_mutation = pairs["mutation"].notnull()
    pairs["tid_mutation"] = np.where(
        has_mutation,
        tid_text + "_" + pairs["mutation"],
        tid_text,
    )

    # Compound-target identifiers, without and with the mutation annotation.
    join_ids = "{}_{}".format
    compound_ids = pairs["parent_molregno"]
    pairs["cpd_target_pair"] = list(map(join_ids, compound_ids, pairs["tid"]))
    pairs["cpd_target_pair_mutation"] = list(
        map(join_ids, compound_ids, pairs["tid_mutation"])
    )
    return pairs
########### Calculate Mean, Median, Max pchembl Values for Each Compound-Target Pair ###########
def get_average_info(df: pd.DataFrame, suffix: str) -> pd.DataFrame:
    """
    Aggregate the information about compound-target pairs for which
    there is more than one entry into one entry.

    Compound-target pairs are considered equal if parent_molregno (internal compound ID)
    and tid_mutation (target ID + mutation annotations) are equal.

    The following values are aggregated
    (every resulting column name is extended by '_<suffix>'):

    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | pchembl_value_mean                            | mean pchembl value for a compound-target pair                                                 |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | pchembl_value_max                             | maximum pchembl value for a compound-target pair                                              |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | pchembl_value_median                          | median pchembl value for a compound-target pair                                               |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | first_publication_cpd_target_pair             | first publication in ChEMBL with this compound-target pair                                    |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+
    | first_publication_cpd_target_pair_w_pchembl   | first publication in ChEMBL with this compound-target pair and an associated pchembl value    |
    +-----------------------------------------------+-----------------------------------------------------------------------------------------------+

    The given DataFrame is not modified.

    :param df: Pandas DataFrame with compound-target pairs for which
        the information should be aggregated. Must contain the columns
        'parent_molregno', 'tid_mutation', 'pchembl_value' and 'year'.
    :type df: pd.DataFrame
    :param suffix: Suffix indicating the type of the given DataFrame,
        e.g., 'B' for binding assays, 'BF' for binding+functional assays.
        The underscore separating it from the column name is added here.
    :type suffix: str
    :return: Pandas DataFrame with 'parent_molregno', 'tid_mutation', and the aggregated columns,
        without duplicate rows.
    :rtype: pd.DataFrame
    """
    pair_keys = ["parent_molregno", "tid_mutation"]
    mean_col = f"pchembl_value_mean_{suffix}"
    max_col = f"pchembl_value_max_{suffix}"
    median_col = f"pchembl_value_median_{suffix}"
    first_pub_col = f"first_publication_cpd_target_pair_{suffix}"
    first_pub_pchembl_col = f"first_publication_cpd_target_pair_w_pchembl_{suffix}"

    grouped = df.groupby(pair_keys)

    # Build the per-row summary on a fresh frame holding only the key columns.
    # assign() returns a new DataFrame, so the caller's df never receives
    # the aggregate columns.
    summary = df[pair_keys].assign(
        **{
            # pchembl mean, max, median
            mean_col: grouped["pchembl_value"].transform("mean"),
            max_col: grouped["pchembl_value"].transform("max"),
            median_col: grouped["pchembl_value"].transform("median"),
            # first publication of pair
            first_pub_col: grouped["year"].transform("min"),
        }
    )

    # first publication of pair with pchembl value
    first_pub_with_pchembl = (
        df.loc[df["pchembl_value"].notnull(), pair_keys + ["year"]]
        .groupby(pair_keys)["year"]
        .min()
        .reset_index()
        .rename(columns={"year": first_pub_pchembl_col})
    )
    summary = summary.merge(first_pub_with_pchembl, on=pair_keys, how="left")

    # return relevant summarised information without duplicates
    return summary[
        pair_keys
        + [mean_col, max_col, median_col, first_pub_col, first_pub_pchembl_col]
    ].drop_duplicates()
########### Get Aggregated Compound-Target Pair Information ###########
def get_aggregated_compound_target_pairs_with_pchembl(
    chembl_con: sqlite3.Connection,
    limit_to_literature: bool,
) -> pd.DataFrame:
    """
    Build the dataset of compound-target pairs with an associated pchembl value,
    with pchembl values and publication years aggregated into one entry per pair.

    The aggregation is performed twice:

    - on the rows from binding and functional assays (columns ending in '_BF') and
    - on the rows from binding assays only (columns ending in '_B').

    The result therefore holds two versions of pchembl_value_mean, _max, _median,
    first_publication_cpd_target_pair and first_publication_cpd_target_pair_w_pchembl:
    one ending in '_BF' and one ending in '_B'.
    The remaining per-pair columns of the initial query are merged back in.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param limit_to_literature: Include only literature sources if True.
        Include all available sources otherwise.
    :type limit_to_literature: bool
    :return: Pandas DataFrame with compound-target pairs
        based on ChEMBL activity data aggregated into one entry per compound-target pair.
    :rtype: pd.DataFrame
    """
    pair_keys = ["parent_molregno", "tid_mutation"]
    all_pairs = get_compound_target_pairs_with_pchembl(
        chembl_con,
        limit_to_literature,
    )

    is_binding = all_pairs["assay_type"] == "B"
    is_functional = all_pairs["assay_type"] == "F"

    # Aggregates over binding + functional assays.
    # Each subset is copied so the aggregation works on its own frame.
    summary_bf = get_average_info(all_pairs[is_binding | is_functional].copy(), "BF")

    # Aggregates over binding assays only.
    summary_b = get_average_info(all_pairs[is_binding].copy(), "B")

    # One table with two columns per aggregated value ('_BF' and '_B').
    # The binding-only pairs are a subset of the binding+functional pairs,
    # so a left merge onto the '_BF' table keeps every pair.
    aggregated = summary_bf.merge(summary_b, on=pair_keys, how="left")

    # Attach the remaining columns from the initial query.
    # This is a left merge because the initial data may also contain assays
    # of types other than binding / functional; those pairs are not wanted.
    pair_annotations = all_pairs.drop(
        columns=["pchembl_value", "year", "assay_type"]
    ).drop_duplicates()
    return aggregated.merge(pair_annotations, on=pair_keys, how="left")
########### Wrap Aggregated Compound-Target Pairs in a Dataset ###########
def get_aggregated_activity_ct_pairs(
    chembl_con: sqlite3.Connection,
    limit_to_literature: bool,
) -> Dataset:
    """
    Run get_aggregated_compound_target_pairs_with_pchembl and
    wrap its result in a newly initialised Dataset.

    The aggregated DataFrame is passed as the first Dataset argument;
    the remaining four arguments are two empty sets and two empty DataFrames.

    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :param limit_to_literature: Include only literature sources if True.
        Include all available sources otherwise.
    :type limit_to_literature: bool
    :return: Dataset holding a pandas DataFrame with compound-target pairs
        based on ChEMBL activity data aggregated into one entry per compound-target pair.
    :rtype: Dataset
    """
    return Dataset(
        get_aggregated_compound_target_pairs_with_pchembl(
            chembl_con, limit_to_literature
        ),
        set(),
        set(),
        pd.DataFrame(),
        pd.DataFrame(),
    )