Source code for output

"""
Write the dataset, subsets and related statistics to files 
and to the command line.
"""

import logging
import os
import pandas as pd
import sanity_checks

from arguments import OutputArgs, CalculationArgs
from dataset import Dataset
import get_stats


##### Writing Output #####
def write_output(
    df: pd.DataFrame,
    filename: str,
    out: OutputArgs,
) -> list[str]:
    """
    Write DataFrame df to output file(s) named <filename>.<extension>.

    Depending on the output arguments, the DataFrame is written to
    <filename>.csv and/or <filename>.xlsx.
    A ValueError raised while writing the Excel file is not propagated:
    the (partial) xlsx file is removed, a warning is logged and
    "xlsx" is not included in the returned list.

    :param df: Pandas Dataframe to write to output file.
    :type df: pd.DataFrame
    :param filename: Filename to write the output to
        (should not include the file extension)
    :type filename: str
    :param out: Arguments related to how to output the dataset
        (write_to_csv, write_to_excel and delimiter are used)
    :type out: OutputArgs
    :return: Returns list of types of files that was written to (csv and/or xlsx)
    :rtype: list[str]
    """
    file_type_list = []

    if out.write_to_csv:
        df.to_csv(f"{filename}.csv", sep=out.delimiter, index=False)
        file_type_list.append("csv")

    if out.write_to_excel:
        excel_file = f"{filename}.xlsx"
        try:
            with pd.ExcelWriter(excel_file, engine="xlsxwriter") as writer:
                writer.book.use_zip64()
                df.to_excel(writer, index=False)
        except ValueError as e:
            # full dataset may be too large to write to excel
            # remove empty file in case of error to avoid confusion
            if os.path.exists(excel_file):
                os.remove(excel_file)
            logging.warning("Could not write %s: %s", excel_file, e)
        else:
            # Only report xlsx once the writer was closed without an error,
            # so the returned list never names a file that was removed above.
            file_type_list.append("xlsx")

    return file_type_list
def output_stats(
    df: pd.DataFrame,
    output_file: str,
    out: OutputArgs,
):
    """
    Collect per-column statistics for df and write them to output_file.

    The columns to summarise and their descriptions are taken from
    get_stats.get_stats_columns(). According to the original documentation
    these are the numbers of unique values in:

    - parent_molregno (compound ID)
    - tid (target ID)
    - tid_mutation (target ID + mutation annotations)
    - cpd_target_pair (compound-target pairs)
    - cpd_target_pair_mutation (compound-target pairs
      including mutation annotations)

    Every collected row is also logged at debug level.

    :param df: Pandas Dataframe for which the stats should be calculated
    :type df: pd.DataFrame
    :param output_file: Path and filename to write the dataset stats to
    :type output_file: str
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    logging.debug("Stats for %s", output_file)

    collected_rows = []
    column_names, descriptions = get_stats.get_stats_columns()
    for name, description in zip(column_names, descriptions):
        logging.debug("Stats for column %s:", name)
        rows = get_stats.get_stats_for_column(df, name, description)
        collected_rows.extend(rows)
        # Positions 2 and 3 of a row end up in the
        # "subset_type" and "counts" columns below.
        for row in rows:
            logging.debug("%20s %s", row[2], row[3])

    header = ["column", "column_description", "subset_type", "counts"]
    stats_table = pd.DataFrame(collected_rows, columns=header)
    write_output(stats_table, output_file, out)
def write_and_check_output(
    df: pd.DataFrame,
    filename: str,
    assay_type: str,
    args: CalculationArgs,
    out: OutputArgs,
):
    """
    Write df to file, check the written files and output the stats for df.

    The steps are:

    1. write df to <filename>.<extension> using write_output,
    2. pass df and the list of written file types
       to sanity_checks.test_equality,
    3. write the stats for df to <filename>_stats using output_stats.

    :param df: Pandas Dataframe to write to output file.
    :type df: pd.DataFrame
    :param filename: Filename to write the output to
        (should not include the file extension)
    :type filename: str
    :param assay_type: Types of assays df contains information about.
        Options: "BF" (binding+functional),
        "B" (binding),
        "all" (contains both BF and B information)
    :type assay_type: str
    :param args: Arguments related to how to calculate the dataset
    :type args: CalculationArgs
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    written_file_types = write_output(df, filename, out)

    sanity_checks.test_equality(
        df,
        filename,
        assay_type,
        written_file_types,
        args.calculate_rdkit,
    )

    stats_filename = f"{filename}_stats"
    output_stats(df, stats_filename, out)
##### Output Specific Results #####
def write_full_dataset_to_file(
    dataset: Dataset,
    args: CalculationArgs,
    out: OutputArgs,
):
    """
    Write the full dataset (dataset.df_result) to output_path
    if out.write_full_dataset is set, otherwise do nothing.

    The file is named
    ChEMBL<chembl_version>_CTI_<limited_flag>_full_dataset
    and is written and checked using write_and_check_output
    with the assay type "all".

    :param dataset: Dataset with compound-target pairs.
    :type dataset: Dataset
    :param args: Arguments related to how to calculate the dataset
    :type args: CalculationArgs
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    if not out.write_full_dataset:
        return

    full_dataset_name = os.path.join(
        out.output_path,
        f"ChEMBL{args.chembl_version}_CTI_{args.limited_flag}_full_dataset",
    )
    write_and_check_output(dataset.df_result, full_dataset_name, "all", args, out)
def write_debug_sizes(
    dataset: Dataset,
    out: OutputArgs,
):
    """
    Output counts at various points during calculating
    the final dataset for debugging.

    Two tables are written to out.output_path:

    - debug_full_df_sizes: dataset.df_sizes_all,
      size of full dataset at different points.
    - debug_pchembl_df_sizes: dataset.df_sizes_pchembl,
      size of dataset with any pchembl values at different points.
      This includes data for which we only have pchembl data
      for functional assays but not for binding assays.

    :param dataset: Dataset with compound-target pairs and debugging sizes.
    :type dataset: Dataset
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """
    # (attribute of dataset holding the sizes, name of the output file)
    debug_tables = (
        ("df_sizes_all", "debug_full_df_sizes"),
        ("df_sizes_pchembl", "debug_pchembl_df_sizes"),
    )
    for attribute, basename in debug_tables:
        target = os.path.join(out.output_path, basename)
        write_output(getattr(dataset, attribute), target, out)