"""
Main workflow to calculate the compound-target pairs dataset.
"""
import logging
import sqlite3
from arguments import OutputArgs, CalculationArgs
import add_filtering_columns
import get_activity_ct_pairs
import add_chembl_compound_properties
import add_chembl_target_class_annotations
import get_drug_mechanism_ct_pairs
import add_dti_annotations
import add_rdkit_compound_descriptors
import clean_dataset
import get_stats
import output
import sanity_checks
[docs]
def get_ct_pair_dataset(
    chembl_con: sqlite3.Connection, args: CalculationArgs, out: OutputArgs
):
    """
    Run the full compound-target pair workflow and write the result.

    The steps are executed strictly in sequence on a single dataset object
    that is created by the first step and then handed to every later step;
    none of the later steps' return values are used.  After most steps a
    labelled snapshot is passed to ``get_stats.add_debugging_info``.

    :param chembl_con: Sqlite3 connection to ChEMBL database
    :type chembl_con: sqlite3.Connection
    :param args: Arguments related to how to calculate the dataset
        (``limit_to_literature`` and ``calculate_rdkit`` are read here,
        the object itself is forwarded to several steps)
    :type args: CalculationArgs
    :param out: Arguments related to how to output the dataset
    :type out: OutputArgs
    """

    def snapshot(label):
        # dataset.df_result is looked up at call time, so each snapshot
        # sees whatever the attribute refers to after the preceding step.
        get_stats.add_debugging_info(dataset, dataset.df_result, label)

    # --- collect compound-target pairs -------------------------------------
    logging.info("get_aggregated_activity_ct_pairs")
    literature_only = args.limit_to_literature
    dataset = get_activity_ct_pairs.get_aggregated_activity_ct_pairs(
        chembl_con, literature_only
    )
    snapshot("activity ct-pairs")

    logging.info("add_cti_from_drug_mechanisms")
    get_drug_mechanism_ct_pairs.add_drug_mechanism_ct_pairs(dataset, chembl_con)
    snapshot("dm ct-pairs")

    # --- annotate -----------------------------------------------------------
    logging.info("add_cti_annotations")
    add_dti_annotations.add_dti_annotations(dataset)
    snapshot("DTI annotations")

    logging.info("add_all_chembl_compound_properties")
    add_chembl_compound_properties.add_all_chembl_compound_properties(
        dataset, chembl_con, literature_only
    )
    snapshot("ChEMBL props")

    logging.info("remove_compounds_without_smiles_and_mixtures")
    clean_dataset.remove_compounds_without_smiles_and_mixtures(dataset, chembl_con)
    snapshot("removed smiles")

    logging.info("add_chembl_target_class_annotations")
    add_chembl_target_class_annotations.add_chembl_target_class_annotations(
        dataset, chembl_con, args, out
    )
    snapshot("tclass annotations")

    # RDKit descriptors are optional; the same flag is also forwarded to the
    # cleaning step below.
    with_rdkit = args.calculate_rdkit
    if with_rdkit:
        logging.info("add_rdkit_compound_descriptors")
        add_rdkit_compound_descriptors.add_rdkit_compound_descriptors(dataset)
        snapshot("RDKit props")

    # --- clean, check, output ----------------------------------------------
    logging.info("clean_dataset")
    clean_dataset.clean_dataset(dataset, with_rdkit)
    snapshot("clean df")

    logging.info("sanity_checks")
    sanity_checks.sanity_checks(dataset)

    logging.info("add_filtering_columns")
    add_filtering_columns.add_filtering_columns(dataset, args, out)

    logging.info("write_full_dataset_to_file")
    output.write_full_dataset_to_file(dataset, args, out)

    # The debug size report is only written when the root logger's level is
    # DEBUG or lower.
    if logging.root.level <= logging.DEBUG:
        output.write_debug_sizes(dataset, out)