Source code for arguments

"""
Dataclasses related to handling arguments, 
specifically arguments related to how to calculate or output the dataset. 
"""

import argparse

from dataclasses import dataclass



[docs]
@dataclass(frozen=True)
class CalculationArgs:
    """
    Immutable bundle of the settings that control how the dataset is calculated.

    Fields, in constructor order:

    - chembl_version:
        ChEMBL version; it is used in the names of the output files.
    - calculate_rdkit:
        Whether RDKit-based compound properties should be calculated.
    - limit_to_literature:
        Whether only literature sources should be included.
    - limited_flag:
        String counterpart of limit_to_literature for use in file names.
    - min_nof_cpds_bf:
        Minimum number of compounds per target for the BF subset.
    - min_nof_cpds_b:
        Minimum number of compounds per target for the B subset.
    """

    chembl_version: str
    calculate_rdkit: bool
    limit_to_literature: bool
    limited_flag: str
    min_nof_cpds_bf: int
    min_nof_cpds_b: int




[docs]
@dataclass(frozen=True)
class OutputArgs:
    """
    Immutable bundle of the settings that control how the dataset is written.

    Fields, in constructor order:

    - output_path:
        Path the output files are written to.
    - delimiter:
        Delimiter used in csv output.
    - write_to_csv:
        Whether the output should be written to csv.
    - write_to_excel:
        Whether the output should be written to excel.
    - write_full_dataset:
        Whether the full dataset should be written to output.
    - write_bf:
        Whether the subsets based on binding+functional data
        should be written to output.
    - write_b:
        Whether the subsets based on binding data only
        should be written to output.
    """

    output_path: str
    delimiter: str
    write_to_csv: bool
    write_to_excel: bool
    write_full_dataset: bool
    write_bf: bool
    write_b: bool




[docs]
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """
    Get arguments with argparse.

    :param argv: Argument strings to parse.
        If None (default), argparse reads the arguments from ``sys.argv``.
        Passing an explicit list allows calling this function programmatically,
        e.g., from tests.
    :type argv: list[str] | None
    :return: Populated argparse.Namespace
    :rtype: argparse.Namespace
    :raises SystemExit: Raised by argparse if the arguments are invalid,
        including the case where --sqlite is set without --chembl.
    """
    parser = argparse.ArgumentParser(
        description="Extract the compound-target pairs dataset from ChEMBL. \
            The full dataset plus filtering columns for binding vs. binding+functional data \
            will always be written to csv. \
            Additional outputs and output types can be chosen with the parameters below."
    )

    parser.add_argument(
        "--chembl",
        "-v",
        dest="chembl_version",
        metavar="<version>",
        type=str,
        default=None,
        help="ChEMBL version. \
            Latest version if None. \
            Required if a path to a SQLite database is provided, \
            i.e., if --sqlite is set. (default: None)",
    )
    parser.add_argument(
        "--sqlite",
        "-s",
        metavar="<path>",
        type=str,
        default=None,
        help="Path to SQLite database. \
            ChEMBL is downloaded as an SQLite database \
            and handled by chembl_downloader if None. (default: None)",
    )
    parser.add_argument(
        "--output",
        "-o",
        dest="output_path",
        metavar="<path>",
        type=str,
        required=True,
        help="Path to write the output file(s) to. (required)",
    )
    parser.add_argument(
        "--delimiter",
        "-d",
        metavar="<delimiter>",
        type=str,
        default=";",
        help="Delimiter in output csv-files.  (default: ;)",
    )
    parser.add_argument(
        "--all_sources",
        action="store_true",
        help="If this is set, the dataset is calculated based on all sources in ChEMBL. \
            This includes data from BindingDB which may skew the results. \
            Default (not set): the dataset is calculated based on only literature data.",
    )
    parser.add_argument(
        "--rdkit",
        dest="calculate_rdkit",
        action="store_true",
        help="Calculate RDKit-based compound properties.",
    )
    parser.add_argument(
        "--excel",
        dest="write_to_excel",
        action="store_true",
        help="Write the results to excel. Note: this may fail if the output is too large.",
    )
    parser.add_argument(
        "--BF",
        dest="write_bf",
        action="store_true",
        help="Write binding+functional data subsets.",
    )
    parser.add_argument(
        "--B", dest="write_b", action="store_true", help="Write binding data subsets."
    )
    parser.add_argument(
        "--debug", action="store_true", help="Log additional debugging information."
    )
    # argparse falls back to sys.argv when argv is None.
    args = parser.parse_args(argv)

    # The help text of --chembl documents the version as required whenever
    # --sqlite is set. Enforce this here so that a missing version is reported
    # as a usage error instead of being passed on as None.
    if args.sqlite is not None and args.chembl_version is None:
        parser.error("--chembl is required if --sqlite is set.")

    return args




[docs]
def get_args() -> tuple[argparse.Namespace, CalculationArgs, OutputArgs]:
    """
    Parse the command-line arguments and combine them with fixed defaults.

    Settings that cannot be chosen on the command line are set here:
    the minimum number of compounds per target is 100 for both subsets,
    and the full dataset is always written to csv.

    :return: parsed arguments,
        arguments related to how to calculate the dataset as CalculationArgs,
        arguments related to how to output the dataset as OutputArgs
    :rtype: tuple[argparse.Namespace, CalculationArgs, OutputArgs]
    """
    parsed = parse_args()

    # Literature-only is the default; --all_sources switches it off.
    literature_only = not parsed.all_sources

    calculation = CalculationArgs(
        chembl_version=parsed.chembl_version,
        calculate_rdkit=parsed.calculate_rdkit,
        limit_to_literature=literature_only,
        # String form of the flag; it is used in file names.
        limited_flag="literature_only" if literature_only else "all_sources",
        min_nof_cpds_bf=100,
        min_nof_cpds_b=100,
    )

    output = OutputArgs(
        output_path=parsed.output_path,
        delimiter=parsed.delimiter,
        # csv output is not optional.
        write_to_csv=True,
        write_to_excel=parsed.write_to_excel,
        # The full dataset, including the filtering columns for
        # binding vs. binding+functional data, is always written.
        write_full_dataset=True,
        write_bf=parsed.write_bf,
        write_b=parsed.write_b,
    )

    return parsed, calculation, output