"""
Dataclasses related to handling arguments,
specifically arguments related to how to calculate or output the dataset.
"""
import argparse
from dataclasses import dataclass
[docs]
@dataclass(frozen=True)
class CalculationArgs:
    """
    Immutable bundle of settings that control how the dataset is calculated.

    :ivar chembl_version: Version of ChEMBL, used in output file names
    :ivar calculate_rdkit: Whether RDKit-based compound properties
        should be calculated
    :ivar limit_to_literature: Whether only literature sources are included
    :ivar limited_flag: String counterpart of limit_to_literature,
        used in file names
    :ivar min_nof_cpds_bf: Minimum number of compounds per target
        for the BF (binding+functional) subset
    :ivar min_nof_cpds_b: Minimum number of compounds per target
        for the B (binding) subset
    """

    # Field order is part of the constructor signature; keep it stable.
    chembl_version: str
    calculate_rdkit: bool
    limit_to_literature: bool
    limited_flag: str
    min_nof_cpds_bf: int
    min_nof_cpds_b: int
[docs]
@dataclass(frozen=True)
class OutputArgs:
    """
    Immutable bundle of settings that control how the dataset is written out.

    :ivar output_path: Path to write output files to
    :ivar delimiter: Delimiter used in csv output
    :ivar write_to_csv: Whether output should be written to csv
    :ivar write_to_excel: Whether output should be written to excel
    :ivar write_full_dataset: Whether the full dataset should be written
    :ivar write_bf: Whether subsets based on binding+functional data
        should be written
    :ivar write_b: Whether subsets based on binding data only
        should be written
    """

    # Field order is part of the constructor signature; keep it stable.
    output_path: str
    delimiter: str
    write_to_csv: bool
    write_to_excel: bool
    write_full_dataset: bool
    write_bf: bool
    write_b: bool
[docs]
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """
    Get arguments with argparse.

    :param argv: Argument strings to parse.
        If None (default), argparse reads them from ``sys.argv[1:]``,
        which is the behaviour of calling this function without arguments.
        Passing an explicit list allows programmatic use and testing.
    :type argv: list[str] | None
    :return: Populated argparse.Namespace
    :rtype: argparse.Namespace
    """
    # Long texts are built with implicit string-literal concatenation
    # rather than backslash continuations inside a single literal,
    # so that source indentation never ends up inside the stored strings.
    parser = argparse.ArgumentParser(
        description=(
            "Extract the compound-target pairs dataset from ChEMBL. "
            "The full dataset plus filtering columns "
            "for binding vs. binding+functional data "
            "will always be written to csv. "
            "Additional outputs and output types "
            "can be chosen with the parameters below."
        )
    )

    parser.add_argument(
        "--chembl",
        "-v",
        dest="chembl_version",
        metavar="<version>",
        type=str,
        default=None,
        help=(
            "ChEMBL version. "
            "Latest version if None. "
            "Required if a path to a SQLite database is provided, "
            "i.e., if --sqlite is set. (default: None)"
        ),
    )

    parser.add_argument(
        "--sqlite",
        "-s",
        metavar="<path>",
        type=str,
        default=None,
        help=(
            "Path to SQLite database. "
            "ChEMBL is downloaded as an SQLite database "
            "and handled by chembl_downloader if None. (default: None)"
        ),
    )

    parser.add_argument(
        "--output",
        "-o",
        dest="output_path",
        metavar="<path>",
        type=str,
        required=True,
        help="Path to write the output file(s) to. (required)",
    )

    parser.add_argument(
        "--delimiter",
        "-d",
        metavar="<delimiter>",
        type=str,
        default=";",
        help="Delimiter in output csv-files. (default: ;)",
    )

    parser.add_argument(
        "--all_sources",
        action="store_true",
        help=(
            "If this is set, "
            "the dataset is calculated based on all sources in ChEMBL. "
            "This includes data from BindingDB which may skew the results. "
            "Default (not set): "
            "the dataset is calculated based on only literature data."
        ),
    )

    parser.add_argument(
        "--rdkit",
        dest="calculate_rdkit",
        action="store_true",
        help="Calculate RDKit-based compound properties.",
    )

    parser.add_argument(
        "--excel",
        dest="write_to_excel",
        action="store_true",
        help=(
            "Write the results to excel. "
            "Note: this may fail if the output is too large."
        ),
    )

    parser.add_argument(
        "--BF",
        dest="write_bf",
        action="store_true",
        help="Write binding+functional data subsets.",
    )

    parser.add_argument(
        "--B",
        dest="write_b",
        action="store_true",
        help="Write binding data subsets.",
    )

    parser.add_argument(
        "--debug",
        action="store_true",
        help="Log additional debugging information.",
    )

    # argv=None makes argparse fall back to sys.argv[1:].
    return parser.parse_args(argv)
[docs]
def get_args() -> tuple[argparse.Namespace, CalculationArgs, OutputArgs]:
    """
    Parse the command line and bundle the settings into argument dataclasses.

    Settings without a command-line option are fixed here:
    both minimum compound counts are 100,
    and csv output of the full dataset is always enabled.

    :return: parsed arguments,
        arguments related to how to calculate the dataset as CalculationArgs,
        arguments related to how to output the dataset as OutputArgs
    :rtype: tuple[argparse.Namespace, CalculationArgs, OutputArgs]
    """
    parsed = parse_args()

    # Literature-only is the default; --all_sources switches it off.
    literature_only = not parsed.all_sources
    # String form of the same switch, used in file names.
    if literature_only:
        source_label = "literature_only"
    else:
        source_label = "all_sources"

    calculation = CalculationArgs(
        chembl_version=parsed.chembl_version,
        calculate_rdkit=parsed.calculate_rdkit,
        limit_to_literature=literature_only,
        limited_flag=source_label,
        min_nof_cpds_bf=100,
        min_nof_cpds_b=100,
    )

    output = OutputArgs(
        output_path=parsed.output_path,
        delimiter=parsed.delimiter,
        # The results are always written to csv.
        write_to_csv=True,
        write_to_excel=parsed.write_to_excel,
        # The full dataset, including the filtering columns for
        # binding vs. binding+functional data, is always written.
        write_full_dataset=True,
        write_bf=parsed.write_bf,
        write_b=parsed.write_b,
    )

    return parsed, calculation, output