Source code for clean_dataset

"""
Methods related to cleaning the dataset.
"""

import logging
import sqlite3

import pandas as pd

from dataset import Dataset


########### Remove Irrelevant Compounds ###########

[docs]
def remove_compounds_without_smiles_and_mixtures(
    dataset: Dataset, chembl_con: sqlite3.Connection
) -> None:
    """
    Remove

    - compounds without a smiles
    - compounds with smiles containing a dot (mixtures and salts).

    Since compound information is aggregated for the parents of salts,
    the number of smiles with a dot is relatively low.

    Before anything is removed, every compound with a dot in its smiles
    is double-checked against ChEMBL (molecule_hierarchy and
    compound_structures) to make sure that parent information
    rather than salt information was used.

    :param dataset: Dataset with compound-target pairs.
        Will be updated to only include
        compound-target pairs with a smiles that does not contain a '.'
    :type dataset: Dataset
    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :raises AssertionError: If a compound with a dot in its smiles
        is not a parent in the molecule_hierarchy, is listed as a salt
        of a different parent, or has a different smiles in ChEMBL.
    :raises ValueError: If a checked compound does not have exactly one
        smiles in ChEMBL or in the dataset.
    """
    df_result = dataset.df_result

    # na=False treats missing smiles as "no dot", so no separate notnull mask is needed.
    smiles_with_dot = df_result[
        df_result["canonical_smiles"].str.contains(".", regex=False, na=False)
    ][["canonical_smiles", "parent_molregno"]].drop_duplicates()

    _check_dot_smiles_against_chembl(smiles_with_dot, chembl_con)

    # Remove rows that contain a SMILES with a dot or that don't have a SMILES.
    # All rows of an affected parent are removed, hence the lookup by parent_molregno.
    missing_smiles = df_result["canonical_smiles"].isnull()
    has_smiles_with_dot = df_result["parent_molregno"].isin(
        set(smiles_with_dot["parent_molregno"])
    )
    logging.debug("#Compounds without a SMILES: %s", int(missing_smiles.sum()))
    logging.debug("#SMILES with a dot: %s", int(has_smiles_with_dot.sum()))

    dataset.df_result = df_result[~missing_smiles & ~has_smiles_with_dot]


def _check_dot_smiles_against_chembl(
    smiles_with_dot: pd.DataFrame, chembl_con: sqlite3.Connection
) -> None:
    """
    Verify that the compounds with a dot in their smiles are parent structures in ChEMBL.

    :param smiles_with_dot: Unique pairs of canonical_smiles and parent_molregno
        for the smiles that contain a dot.
    :type smiles_with_dot: pd.DataFrame
    :param chembl_con: Sqlite3 connection to ChEMBL database.
    :type chembl_con: sqlite3.Connection
    :raises AssertionError: If one of the consistency checks fails.
    :raises ValueError: If a checked compound does not have exactly one
        smiles in ChEMBL or in smiles_with_dot.
    """
    parents_with_dot = set(smiles_with_dot["parent_molregno"])

    # Double-check that rows with a SMILES containing a '.' are the parent structures,
    # i.e., there was no error in using salt information instead of parent information.
    sql = """
    SELECT DISTINCT mh.molregno as salt_molregno, mh.parent_molregno
    FROM molecule_hierarchy mh
    """
    df_hierarchy = pd.read_sql_query(sql, con=chembl_con)

    # Build the lookups once instead of scanning the full hierarchy for every compound.
    known_parents = set(df_hierarchy["parent_molregno"])
    salts_with_other_parent = set(
        df_hierarchy.loc[
            df_hierarchy["salt_molregno"] != df_hierarchy["parent_molregno"],
            "salt_molregno",
        ]
    )

    # Explicit raises (instead of assert statements) keep the checks active under -O.
    for parent_molregno in parents_with_dot:
        if parent_molregno not in known_parents:
            raise AssertionError(
                f"The compound with the parent_molregno {parent_molregno} "
                "does not occur as a parent_molregno in the molecule_hierarchy."
            )
        if parent_molregno in salts_with_other_parent:
            raise AssertionError(
                f"The compound with the parent_molregno {parent_molregno} "
                "occurs as a salt but has a different parent in the molecule_hierarchy."
            )

    # Double-check that the SMILES is indeed the SMILES for the parent structure.
    sql = """
    SELECT DISTINCT mh.parent_molregno, struct.canonical_smiles
    FROM molecule_hierarchy mh
    INNER JOIN compound_structures struct
        ON mh.parent_molregno = struct.molregno
    """
    df_parent_smiles = pd.read_sql_query(sql, con=chembl_con)
    # Restrict the ChEMBL table to the relevant parents once,
    # so that the per-compound lookups below only touch a small table.
    df_parent_smiles = df_parent_smiles[
        df_parent_smiles["parent_molregno"].isin(parents_with_dot)
    ]

    for parent_molregno in parents_with_dot:
        # .item() requires exactly one smiles per parent and raises a ValueError otherwise.
        parent_smiles_in_chembl = df_parent_smiles.loc[
            df_parent_smiles["parent_molregno"] == parent_molregno,
            "canonical_smiles",
        ].item()
        parent_smiles_in_df = smiles_with_dot.loc[
            smiles_with_dot["parent_molregno"] == parent_molregno,
            "canonical_smiles",
        ].item()
        if parent_smiles_in_chembl != parent_smiles_in_df:
            raise AssertionError(
                f"The smiles for the compound {parent_molregno} "
                f"({parent_smiles_in_df}) in the dataframe is not the same as "
                f"the smiles for the compound in ChEMBL ({parent_smiles_in_chembl})."
            )



########### General Cleaning Steps ###########

[docs]
def clean_none_values(dataset: Dataset) -> None:
    """
    Change nan values and empty strings to None for consistency.

    The index of the DataFrame is reset afterwards.

    :param dataset: Dataset with compound-target pairs.
        Its df_result is replaced by the cleaned DataFrame.
    :type dataset: Dataset
    """
    # Change all None / nan values to None
    df_result = dataset.df_result.where(pd.notnull(dataset.df_result), None)
    # Replace empty strings with None.
    # The mapping form is used on purpose: with a scalar to_replace and value=None,
    # older pandas versions ignore the explicit None and forward-fill instead.
    dataset.df_result = df_result.replace({"": None}).reset_index(drop=True)




[docs]
def set_types_to_int(dataset, calculate_rdkit):
    """
    Cast the integer-valued columns of dataset.df_result to Int64.

    :param dataset: Dataset with compound-target pairs.
        Its df_result is replaced by the DataFrame with the updated types.
    :param calculate_rdkit: If true, the RDKit-based count columns
        are cast to Int64 as well.
    """
    chembl_int_columns = (
        "first_approval",
        "usan_year",
        "first_publication_cpd_target_pair_BF",
        "first_publication_cpd_target_pair_w_pchembl_BF",
        "first_publication_cpd_target_pair_B",
        "first_publication_cpd_target_pair_w_pchembl_B",
        "first_publication_cpd",
        "hba",
        "hbd",
        "rtb",
        "num_ro5_violations",
        "aromatic_rings",
        "heavy_atoms",
        "hba_lipinski",
        "hbd_lipinski",
        "num_lipinski_ro5_violations",
    )
    rdkit_int_columns = (
        "num_aliphatic_carbocycles",
        "num_aliphatic_heterocycles",
        "num_aliphatic_rings",
        "num_aromatic_carbocycles",
        "num_aromatic_heterocycles",
        "num_aromatic_rings",
        "num_heteroatoms",
        "num_saturated_carbocycles",
        "num_saturated_heterocycles",
        "num_saturated_rings",
        "ring_count",
        "num_stereocentres",
        "aromatic_atoms",
        "aromatic_c",
        "aromatic_n",
        "aromatic_hetero",
    )

    # The ChEMBL-based columns are always converted.
    chembl_types = dict.fromkeys(chembl_int_columns, "Int64")
    dataset.df_result = dataset.df_result.astype(chembl_types)

    if not calculate_rdkit:
        return

    # The RDKit-based columns only exist if they were calculated.
    rdkit_types = dict.fromkeys(rdkit_int_columns, "Int64")
    dataset.df_result = dataset.df_result.astype(rdkit_types)




[docs]
def round_floats(dataset, decimal_places=4):
    """
    Round float columns to <decimal_places> decimal places.
    This does not apply to max_phase.

    :param dataset: Dataset with compound-target pairs.
        The float columns of its df_result are rounded.
    :param decimal_places: Number of decimal places to round to.
    :return: The df_result of the dataset with the rounded columns.
    """
    df_result = dataset.df_result

    # is_float_dtype covers every float dtype (numpy and nullable pandas floats),
    # not only the ones spelled "float64" / "Float64".
    float_columns = [
        col
        for col, dtype in df_result.dtypes.items()
        if col != "max_phase" and pd.api.types.is_float_dtype(dtype)
    ]

    for col in float_columns:
        df_result[col] = df_result[col].round(decimals=decimal_places)

    return dataset.df_result




[docs]
def reorder_columns(dataset, calculate_rdkit):
    """
    Reorder the columns in the DataFrame.

    The dataset is only updated if the reordering keeps all columns;
    otherwise it is left unchanged and an error is raised.

    :param dataset: Dataset with compound-target pairs.
        Its df_result is replaced by the DataFrame with the reordered columns.
    :param calculate_rdkit: True if the DataFrame contains
        the RDKit-based columns, which are then included in the ordering.
    :raises KeyError: If an expected column is missing from the DataFrame.
    :raises AssertionError: If the DataFrame contains columns
        that are not part of the expected ordering.
    """
    compound_target_pair_columns = [
        "parent_molregno",
        "parent_chemblid",
        "parent_pref_name",
        "max_phase",
        "first_approval",
        "usan_year",
        "black_box_warning",
        "prodrug",
        "oral",
        "parenteral",
        "topical",
        "tid",
        "mutation",
        "target_chembl_id",
        "target_pref_name",
        "target_type",
        "organism",
        "tid_mutation",
        "cpd_target_pair",
        "cpd_target_pair_mutation",
    ]
    aggregated_values = [
        "pchembl_value_mean_BF",
        "pchembl_value_max_BF",
        "pchembl_value_median_BF",
        "first_publication_cpd_target_pair_BF",
        "first_publication_cpd_target_pair_w_pchembl_BF",
        "pchembl_value_mean_B",
        "pchembl_value_max_B",
        "pchembl_value_median_B",
        "first_publication_cpd_target_pair_B",
        "first_publication_cpd_target_pair_w_pchembl_B",
    ]
    dti_annotations = ["therapeutic_target", "DTI"]
    first_publication_cpd = ["first_publication_cpd"]
    chembl_compound_props = [
        "mw_freebase",
        "alogp",
        "hba",
        "hbd",
        "psa",
        "rtb",
        "ro3_pass",
        "num_ro5_violations",
        "cx_most_apka",
        "cx_most_bpka",
        "cx_logp",
        "cx_logd",
        "molecular_species",
        "full_mwt",
        "aromatic_rings",
        "heavy_atoms",
        "qed_weighted",
        "mw_monoisotopic",
        "full_molformula",
        "hba_lipinski",
        "hbd_lipinski",
        "num_lipinski_ro5_violations",
    ]
    chembl_structures = ["standard_inchi", "standard_inchi_key", "canonical_smiles"]
    ligand_efficiency_metrics = [
        "LE_B",
        "BEI_B",
        "SEI_B",
        "LLE_B",
        "LE_BF",
        "BEI_BF",
        "SEI_BF",
        "LLE_BF",
    ]
    chembl_target_annotations = ["atc_level1", "target_class_l1", "target_class_l2"]
    rdkit_columns = [
        "fraction_csp3",
        "ring_count",
        "num_aliphatic_rings",
        "num_aliphatic_carbocycles",
        "num_aliphatic_heterocycles",
        "num_aromatic_rings",
        "num_aromatic_carbocycles",
        "num_aromatic_heterocycles",
        "num_saturated_rings",
        "num_saturated_carbocycles",
        "num_saturated_heterocycles",
        "num_stereocentres",
        "num_heteroatoms",
        "aromatic_atoms",
        "aromatic_c",
        "aromatic_n",
        "aromatic_hetero",
        "scaffold_w_stereo",
        "scaffold_wo_stereo",
    ]
    filtering_columns = [
        "pair_mutation_in_dm_table",
        "pair_in_dm_table",
        "keep_for_binding",
    ]

    # The RDKit-based columns are placed directly before the filtering columns.
    columns = (
        compound_target_pair_columns
        + aggregated_values
        + dti_annotations
        + first_publication_cpd
        + chembl_compound_props
        + chembl_structures
        + ligand_efficiency_metrics
        + chembl_target_annotations
        + (rdkit_columns if calculate_rdkit else [])
        + filtering_columns
    )

    # Selecting raises a KeyError if an expected column is missing.
    df_reordered = dataset.df_result[columns]

    # Validate before assigning, so that a failed check does not leave
    # the dataset without the columns that are not part of the ordering.
    len_columns_before = len(dataset.df_result.columns)
    len_columns_after = len(df_reordered.columns)
    if len_columns_before != len_columns_after:
        expected_columns = set(columns)
        unexpected_columns = [
            col for col in dataset.df_result.columns if col not in expected_columns
        ]
        raise AssertionError(
            "Different number of columns after reordering "
            f"(before: {len_columns_before}, after: {len_columns_after}). "
            f"Columns that are not part of the ordering: {unexpected_columns}"
        )

    dataset.df_result = df_reordered




[docs]
def clean_dataset(dataset: Dataset, calculate_rdkit: bool) -> pd.DataFrame:
    """
    Clean the dataset by

    - changing nan values and empty strings to None
    - setting the type of relevant columns to Int64
    - rounding floats to 4 decimal places (with the exception of max_phase which is not rounded)
    - reordering columns
    - sorting rows by cpd_target_pair_mutation

    :param dataset: Dataset with compound-target pairs.
        Will be updated to clean version with the updates described above.
    :type dataset: Dataset
    :param calculate_rdkit: True if the DataFrame contains RDKit-based compound properties
    :type calculate_rdkit: bool
    :return: The cleaned DataFrame, i.e., the updated dataset.df_result.
    :rtype: pd.DataFrame
    """
    clean_none_values(dataset)
    set_types_to_int(dataset, calculate_rdkit)
    round_floats(dataset, decimal_places=4)
    reorder_columns(dataset, calculate_rdkit)
    dataset.df_result = dataset.df_result.sort_values(
        by=["cpd_target_pair_mutation"]
    ).reset_index(drop=True)
    # Return the cleaned DataFrame as promised by the return annotation.
    return dataset.df_result