Source code for output

"""
Write the dataset, subsets and related statistics to files 
and to the command line.
"""

import logging
import os
import pandas as pd
import sanity_checks

from arguments import OutputArgs, CalculationArgs
from dataset import Dataset
import get_stats


##### Writing Output #####

[docs]
def write_output(
    df: pd.DataFrame,
    filename: str,
    out: OutputArgs,
) -> list[str]:
    """
    Write DataFrame df to <filename>.csv and/or <filename>.xlsx.

    Which formats are written is controlled by out.write_to_csv and
    out.write_to_excel. A ValueError raised during the Excel export is
    printed and not re-raised; a leftover xlsx file is removed in that
    case and "xlsx" is not part of the returned list.

    :param df: Pandas Dataframe to write to output file.
    :type df: pd.DataFrame
    :param filename: Filename to write the output to
        (the file extension is appended by this function)
    :type filename: str
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    :return: Returns list of types of files that were written (csv and/or xlsx)
    :rtype: list[str]
    """
    written_types = []

    if out.write_to_csv:
        csv_path = f"{filename}.csv"
        df.to_csv(csv_path, sep=out.delimiter, index=False)
        written_types.append("csv")

    if out.write_to_excel:
        xlsx_path = f"{filename}.xlsx"
        try:
            with pd.ExcelWriter(xlsx_path, engine="xlsxwriter") as workbook_writer:
                # enable ZIP64 extensions on the underlying xlsxwriter workbook
                workbook_writer.book.use_zip64()
                df.to_excel(workbook_writer, index=False)
        except ValueError as err:
            # e.g. the full dataset may be too large to write to excel;
            # remove the leftover file so nobody mistakes it for valid output
            if os.path.exists(xlsx_path):
                os.remove(xlsx_path)
            print(err)
        else:
            written_types.append("xlsx")

    return written_types




[docs]
def output_stats(
    df: pd.DataFrame,
    output_file: str,
    out: OutputArgs,
):
    """
    Collect per-column statistics for df, log them at debug level
    and write them to output_file.

    The columns to summarise and their descriptions are supplied by
    get_stats.get_stats_columns(). They are documented as the unique-value
    counts of:

    - parent_molregno (compound ID)
    - tid (target ID)
    - tid_mutation (target ID + mutation annotations)
    - cpd_target_pair (compound-target pairs)
    - cpd_target_pair_mutation (compound-target pairs including mutation annotations)

    :param df: Pandas Dataframe for which the stats should be calculated
    :type df: pd.DataFrame
    :param output_file: Path and filename to write the dataset stats to
    :type output_file: str
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    logging.debug("Stats for %s", output_file)

    all_rows = []
    column_names, column_descriptions = get_stats.get_stats_columns()
    for name, description in zip(column_names, column_descriptions):
        logging.debug("Stats for column %s:", name)
        rows = get_stats.get_stats_for_column(df, name, description)
        all_rows.extend(rows)
        # each row is laid out like the header below:
        # index 2 is the subset type, index 3 the count
        for row in rows:
            logging.debug("%20s %s", row[2], row[3])

    header = ["column", "column_description", "subset_type", "counts"]
    df_stats = pd.DataFrame(all_rows, columns=header)
    write_output(df_stats, output_file, out)




[docs]
def write_and_check_output(
    df: pd.DataFrame,
    filename: str,
    assay_type: str,
    args: CalculationArgs,
    out: OutputArgs,
):
    """
    Write df to file, check the written files against df
    and write the accompanying stats to <filename>_stats.

    :param df: Pandas Dataframe to write to output file.
    :type df: pd.DataFrame
    :param filename: Filename to write the output to (should not include the file extension)
    :type filename: str
    :param assay_type: Types of assays df contains information about.
        Options: "BF" (binding+functional),
        "B" (binding),
        "all" (contains both BF and B information)
    :type assay_type: str
    :param args: Arguments related to how to calculate the dataset
    :type args: CalculationArgs
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    # only the file types that were actually written get checked
    written_types = write_output(df, filename, out)
    include_rdkit = args.calculate_rdkit
    sanity_checks.test_equality(df, filename, assay_type, written_types, include_rdkit)

    stats_filename = f"{filename}_stats"
    output_stats(df, stats_filename, out)



##### Output Specific Results #####

[docs]
def write_full_dataset_to_file(
    dataset: Dataset,
    args: CalculationArgs,
    out: OutputArgs,
):
    """
    Write dataset.df_result to out.output_path if out.write_full_dataset is set.

    The output filename (without extension) is
    ChEMBL<chembl_version>_CTI_<limited_flag>_full_dataset.
    Nothing is written if out.write_full_dataset is falsy.

    :param dataset: Dataset with compound-target pairs.
    :type dataset: Dataset
    :param args: Arguments related to how to calculate the dataset
    :type args: CalculationArgs
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    if not out.write_full_dataset:
        return

    target_dir = out.output_path
    basename = f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset"
    full_dataset_path = os.path.join(target_dir, basename)
    # the assay type passed on for the full dataset is "all"
    write_and_check_output(dataset.df_result, full_dataset_path, "all", args, out)




[docs]
def write_debug_sizes(
    dataset: Dataset,
    out: OutputArgs,
):
    """
    Output counts at various points during calculating the final dataset for debugging.

    Writes dataset.df_sizes_all to debug_full_df_sizes and
    dataset.df_sizes_pchembl to debug_pchembl_df_sizes, both in out.output_path.

    :param dataset: Dataset with compound-target pairs and debugging sizes.
    :type dataset: Dataset
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    # Size of full dataset at different points.
    full_sizes_path = os.path.join(out.output_path, "debug_full_df_sizes")
    write_output(dataset.df_sizes_all, full_sizes_path, out)

    # Size of dataset with any pchembl values at different points.
    # This includes data for which we only have pchembl data
    # for functional assays but not for binding assays.
    pchembl_sizes_path = os.path.join(out.output_path, "debug_pchembl_df_sizes")
    write_output(dataset.df_sizes_pchembl, pchembl_sizes_path, out)