Source code for get_stats

"""
Get statistics of dataset for final results and debugging.
"""

import logging
import pandas as pd

from dataset import Dataset


##### Logging Stats #####

[docs]
def get_stats_columns() -> tuple[list[str], list[str]]:
    """
    Get the relevant columns for which stats should be calculated
    and a list of descriptions corresponding to the columns.
    """
    df_columns = [
        "parent_molregno",
        "tid",
        "tid_mutation",
        "cpd_target_pair",
        "cpd_target_pair_mutation",
    ]
    columns_descs = [
        "compound ID",
        "target ID",
        "target ID with mutation annotations",
        "compound-target pair",
        "compound-target pair with mutation annotations",
    ]
    return df_columns, columns_descs




[docs]
def get_stats_for_column(
    df: pd.DataFrame,
    column: str,
    columns_desc: str,
) -> list[list[str, str, int]]:
    """
    Calculate the number of unique values in df[column] and various subsets of df.

    :param df: Pandas Dataframe for which the number of unique values should be calculated
    :type df: pd.DataFrame
    :param column: Column of df that the values should be calculated for
    :type column: str
    :param columns_desc: Description of the column
    :type columns_desc: str
    :return: List of results in the format [column_name, subset_type, size]
    :rtype: list[list[str, str, int]]
    """
    return [
        [column, columns_desc, "all", df[column].nunique()],
        [
            column,
            columns_desc,
            "comparators",
            df[df["DTI"].isin(["DT"])][column].nunique(),
        ],
        [column, columns_desc, "drugs", df[df["DTI"] == "D_DT"][column].nunique()],
        [
            column,
            columns_desc,
            "candidates",
            df[df["DTI"].isin(["C0_DT", "C1_DT", "C2_DT", "C3_DT"])][column].nunique(),
        ],
        [
            column,
            columns_desc,
            "candidates_phase_3",
            df[df["DTI"] == "C3_DT"][column].nunique(),
        ],
        [
            column,
            columns_desc,
            "candidates_phase_2",
            df[df["DTI"] == "C2_DT"][column].nunique(),
        ],
        [
            column,
            columns_desc,
            "candidates_phase_1",
            df[df["DTI"] == "C1_DT"][column].nunique(),
        ],
        [
            column,
            columns_desc,
            "candidates_phase_0",
            df[df["DTI"] == "C0_DT"][column].nunique(),
        ],
    ]



##### Debugging Stats #####

[docs]
def get_dataset_sizes(df: pd.DataFrame, label: str) -> pd.DataFrame:
    """
    Calculate the number of unique compounds, targets and pairs
    for df and df limited to drugs.

    :param df: Pandas DataFrame for which the dataset sizes should be calculated.
    :type df: pd.DataFrame
    :param label: Description of pipeline step (e.g., initial query).
    :type label: str
    :return: Pandas DataFrame with calculated unique counts.
    :rtype: pd.DataFrame
    """
    stats = {"step": label}

    if "DTI" in df.columns:
        # drugs = compounds of a compound-target pair with a known interaction
        df_drugs = df[df["DTI"] == "D_DT"]
    else:
        df_drugs = df[df["max_phase"] == 4]

    df_columns, _ = get_stats_columns()
    for column in df_columns:
        stats[f"{column}_all"] = df[column].nunique()
        stats[f"{column}_drugs"] = df_drugs[column].nunique()

    df_stats = pd.DataFrame([stats])
    return df_stats




[docs]
def add_dataset_sizes(
    dataset: Dataset,
    df: pd.DataFrame,
    label: str,
):
    """
    Count and add representative counts of df used for debugging to the dataset.

    :param dataset: Dataset with compound-target pairs and debugging sizes.
    :type dataset: Dataset
    :param df: Pandas DataFrame with current compound-target pairs
    :type df: pd.DataFrame
    :param label: Description of pipeline step (e.g., initial query).
    :type label: str
    """
    df_stats = get_dataset_sizes(df, label)

    dataset.df_sizes_all = pd.concat([dataset.df_sizes_all, df_stats])

    # restrict to data with any pchembl value (any data with a pchembl,
    # even if it is based on only functional data)
    # these statistics are purely based on removing
    # compound-target pairs without pchembl information,
    # i.e., the subset of the dataset is determined by the given df and not recalculated
    df_copy = df.copy()
    df_pchembl = df_copy.dropna(
        subset=[x for x in df_copy.columns if x.startswith("pchembl_value")], how="all"
    )
    df_stats = get_dataset_sizes(df_pchembl, label)
    dataset.df_sizes_pchembl = pd.concat([dataset.df_sizes_pchembl, df_stats])




[docs]
def add_debugging_info(
    dataset: Dataset,
    df: pd.DataFrame,
    label: str,
):
    """
    Wrapper for add_dataset_sizes.
    Handles logging level.
    """
    if logging.DEBUG >= logging.root.level:
        add_dataset_sizes(dataset, df, label)