"""
Methods related to cleaning the dataset.
"""
import logging
import sqlite3
import pandas as pd
from dataset import Dataset
########### Remove Irrelevant Compounds ###########
def remove_compounds_without_smiles_and_mixtures(
    dataset: Dataset, chembl_con: sqlite3.Connection
):
    """
    Remove

    - compounds without a SMILES
    - compounds with a SMILES containing a dot (mixtures and salts).

    Since compound information is aggregated for the parents of salts,
    the number of SMILES with a dot is relatively low.

    :param dataset: Dataset with compound-target pairs.
        Will be updated to only include
        compound-target pairs with a smiles that does not contain a '.'
    :type dataset: Dataset
    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    """
    # Double-check that rows with a SMILES containing a '.' are the parent structures,
    # i.e., there was no error in using salt information instead of parent information.
    sql = """
    SELECT DISTINCT mh.molregno as salt_molregno, mh.parent_molregno
    FROM molecule_hierarchy mh
    """
    df_hierarchy = pd.read_sql_query(sql, con=chembl_con)

    smiles_with_dot = dataset.df_result[
        dataset.df_result["canonical_smiles"].notnull()
        & dataset.df_result["canonical_smiles"].str.contains(".", regex=False)
    ][["canonical_smiles", "parent_molregno"]].drop_duplicates()

    # Hoisted: the original code rebuilt this set in four separate places.
    parents_with_dot = set(smiles_with_dot["parent_molregno"])

    for parent_molregno in parents_with_dot:
        # The molregno must occur as a parent in the hierarchy table.
        assert (
            len(df_hierarchy[df_hierarchy["parent_molregno"] == parent_molregno]) > 0
        ), (
            f"The compound with the parent_molregno {parent_molregno} does not occur "
            f"as a parent_molregno in the molecule_hierarchy."
        )
        # If the molregno also occurs as a salt, it must be its own parent.
        df_salt_molregno = df_hierarchy[
            df_hierarchy["salt_molregno"] == parent_molregno
        ]
        assert df_salt_molregno["salt_molregno"].equals(
            df_salt_molregno["parent_molregno"]
        ), (
            f"The compound with the parent_molregno {parent_molregno} occurs as a salt "
            f"but has a different parent in the molecule_hierarchy."
        )

    # Double-check that the SMILES is indeed the SMILES for the parent structure.
    sql = """
    SELECT DISTINCT mh.parent_molregno, struct.canonical_smiles
    FROM molecule_hierarchy mh
    INNER JOIN compound_structures struct
    ON mh.parent_molregno = struct.molregno
    """
    df_parent_smiles = pd.read_sql_query(sql, con=chembl_con)

    for parent_molregno in parents_with_dot:
        parent_smiles_in_chembl = df_parent_smiles[
            df_parent_smiles["parent_molregno"] == parent_molregno
        ]["canonical_smiles"].item()
        parent_smiles_in_df = smiles_with_dot[
            smiles_with_dot["parent_molregno"] == parent_molregno
        ]["canonical_smiles"].item()
        assert parent_smiles_in_chembl == parent_smiles_in_df, (
            f"The smiles for the compound {parent_molregno} ({parent_smiles_in_df}) "
            f"in the dataframe is not the same as "
            f"the smiles for the compound in ChEMBL ({parent_smiles_in_chembl})."
        )

    # Remove rows that contain a SMILES with a dot or that don't have a SMILES.
    len_missing_smiles = len(
        dataset.df_result[dataset.df_result["canonical_smiles"].isnull()]
    )
    len_smiles_w_dot = len(
        dataset.df_result[dataset.df_result["parent_molregno"].isin(parents_with_dot)]
    )
    logging.debug("#Compounds without a SMILES: %s", len_missing_smiles)
    logging.debug("#SMILES with a dot: %s", len_smiles_w_dot)

    dataset.df_result = dataset.df_result[
        (dataset.df_result["canonical_smiles"].notnull())
        & ~(dataset.df_result["parent_molregno"].isin(parents_with_dot))
    ]
########### General Cleaning Steps ###########
def clean_none_values(dataset: Dataset):
    """
    Normalise missing values: change nan values and empty strings to None.
    """
    df = dataset.df_result
    # nan / NaT -> None for consistency across columns
    df = df.where(pd.notnull(df), None)
    # empty strings -> None as well
    dataset.df_result = df.replace("", None).reset_index(drop=True)
def set_types_to_int(dataset, calculate_rdkit):
    """
    Set the type of relevant columns to Int64.

    Uses pandas' nullable Int64 dtype so that missing values survive the cast.

    :param dataset: Dataset whose df_result is updated in place.
    :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties
    """
    chembl_int_columns = [
        "first_approval",
        "usan_year",
        "first_publication_cpd_target_pair_BF",
        "first_publication_cpd_target_pair_w_pchembl_BF",
        "first_publication_cpd_target_pair_B",
        "first_publication_cpd_target_pair_w_pchembl_B",
        "first_publication_cpd",
        "hba",
        "hbd",
        "rtb",
        "num_ro5_violations",
        "aromatic_rings",
        "heavy_atoms",
        "hba_lipinski",
        "hbd_lipinski",
        "num_lipinski_ro5_violations",
    ]
    dataset.df_result = dataset.df_result.astype(
        {column: "Int64" for column in chembl_int_columns}
    )

    if calculate_rdkit:
        rdkit_int_columns = [
            "num_aliphatic_carbocycles",
            "num_aliphatic_heterocycles",
            "num_aliphatic_rings",
            "num_aromatic_carbocycles",
            "num_aromatic_heterocycles",
            "num_aromatic_rings",
            "num_heteroatoms",
            "num_saturated_carbocycles",
            "num_saturated_heterocycles",
            "num_saturated_rings",
            "ring_count",
            "num_stereocentres",
            "aromatic_atoms",
            "aromatic_c",
            "aromatic_n",
            "aromatic_hetero",
        ]
        dataset.df_result = dataset.df_result.astype(
            {column: "Int64" for column in rdkit_int_columns}
        )
def round_floats(dataset, decimal_places=4):
    """
    Round float columns to <decimal_places> decimal places.
    This does not apply to max_phase.

    :param dataset: Dataset whose df_result is updated in place.
    :param decimal_places: Number of decimal places to round to, defaults to 4
    :return: The updated DataFrame with rounded float columns.
    """
    # Iterate dtypes directly; the previous enumerate() index was unused.
    for col, dtype in dataset.df_result.dtypes.items():
        # max_phase is deliberately excluded from rounding.
        if dtype in ("float64", "Float64") and col != "max_phase":
            dataset.df_result[col] = dataset.df_result[col].round(
                decimals=decimal_places
            )
    return dataset.df_result
def reorder_columns(dataset, calculate_rdkit):
    """
    Reorder the columns in the DataFrame.

    :param dataset: Dataset whose df_result is updated in place.
    :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties
    :raises AssertionError: if the number of columns changes during reordering.
    """
    len_columns_before = len(dataset.df_result.columns)
    compound_target_pair_columns = [
        "parent_molregno",
        "parent_chemblid",
        "parent_pref_name",
        "max_phase",
        "first_approval",
        "usan_year",
        "black_box_warning",
        "prodrug",
        "oral",
        "parenteral",
        "topical",
        "tid",
        "mutation",
        "target_chembl_id",
        "target_pref_name",
        "target_type",
        "organism",
        "tid_mutation",
        "cpd_target_pair",
        "cpd_target_pair_mutation",
    ]
    aggregated_values = [
        "pchembl_value_mean_BF",
        "pchembl_value_max_BF",
        "pchembl_value_median_BF",
        "first_publication_cpd_target_pair_BF",
        "first_publication_cpd_target_pair_w_pchembl_BF",
        "pchembl_value_mean_B",
        "pchembl_value_max_B",
        "pchembl_value_median_B",
        "first_publication_cpd_target_pair_B",
        "first_publication_cpd_target_pair_w_pchembl_B",
    ]
    dti_annotations = ["therapeutic_target", "DTI"]
    first_publication_cpd = ["first_publication_cpd"]
    chembl_compound_props = [
        "mw_freebase",
        "alogp",
        "hba",
        "hbd",
        "psa",
        "rtb",
        "ro3_pass",
        "num_ro5_violations",
        "cx_most_apka",
        "cx_most_bpka",
        "cx_logp",
        "cx_logd",
        "molecular_species",
        "full_mwt",
        "aromatic_rings",
        "heavy_atoms",
        "qed_weighted",
        "mw_monoisotopic",
        "full_molformula",
        "hba_lipinski",
        "hbd_lipinski",
        "num_lipinski_ro5_violations",
    ]
    chembl_structures = ["standard_inchi", "standard_inchi_key", "canonical_smiles"]
    ligand_efficiency_metrics = [
        "LE_B",
        "BEI_B",
        "SEI_B",
        "LLE_B",
        "LE_BF",
        "BEI_BF",
        "SEI_BF",
        "LLE_BF",
    ]
    chembl_target_annotations = ["atc_level1", "target_class_l1", "target_class_l2"]
    rdkit_columns = [
        "fraction_csp3",
        "ring_count",
        "num_aliphatic_rings",
        "num_aliphatic_carbocycles",
        "num_aliphatic_heterocycles",
        "num_aromatic_rings",
        "num_aromatic_carbocycles",
        "num_aromatic_heterocycles",
        "num_saturated_rings",
        "num_saturated_carbocycles",
        "num_saturated_heterocycles",
        "num_stereocentres",
        "num_heteroatoms",
        "aromatic_atoms",
        "aromatic_c",
        "aromatic_n",
        "aromatic_hetero",
        "scaffold_w_stereo",
        "scaffold_wo_stereo",
    ]
    filtering_columns = [
        "pair_mutation_in_dm_table",
        "pair_in_dm_table",
        "keep_for_binding",
    ]

    # Build the column order once; the RDKit block sits between the target
    # annotations and the filtering columns and is only present when the
    # RDKit-based properties were calculated.
    # (Previously the whole concatenation was duplicated in both branches.)
    columns = (
        compound_target_pair_columns
        + aggregated_values
        + dti_annotations
        + first_publication_cpd
        + chembl_compound_props
        + chembl_structures
        + ligand_efficiency_metrics
        + chembl_target_annotations
    )
    if calculate_rdkit:
        columns += rdkit_columns
    columns += filtering_columns

    dataset.df_result = dataset.df_result[columns]

    len_columns_after = len(dataset.df_result.columns)
    assert len_columns_before == len_columns_after, (
        f"Different number of columns after reordering "
        f"(before: {len_columns_before}, after: {len_columns_after})."
    )
def clean_dataset(dataset: Dataset, calculate_rdkit: bool) -> pd.DataFrame:
    """
    Clean the dataset by

    - changing nan values and empty strings to None
    - setting the type of relevant columns to Int64
    - rounding floats to 4 decimal places (with the exception of max_phase which is not rounded)
    - reordering columns
    - sorting rows by cpd_target_pair_mutation

    :param dataset: Dataset with compound-target pairs.
        Will be updated to clean version with the updates described above.
    :type dataset: Dataset
    :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties
    :type calculate_rdkit: bool
    :return: The cleaned DataFrame (dataset.df_result).
    :rtype: pd.DataFrame
    """
    clean_none_values(dataset)
    set_types_to_int(dataset, calculate_rdkit)
    round_floats(dataset, decimal_places=4)
    reorder_columns(dataset, calculate_rdkit)
    # Deterministic row order for reproducible output.
    dataset.df_result = dataset.df_result.sort_values(
        by=["cpd_target_pair_mutation"]
    ).reset_index(drop=True)
    # Fix: the signature declares a DataFrame return, but the original
    # implementation implicitly returned None.
    return dataset.df_result