#!/usr/bin/env python3
"""
This script runs EXCAVATE-HT, a pipeline that generates guide RNA libraries for a given locus, with optional pairing, and off-target detection.
"""
from . import ap
from . import offtargets as ot
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Seq import MutableSeq
import re
import argparse
import os
from pathlib import Path
import sys
excavate_banner = r"""
_______ ___________ _ _____ ____________ __ ________
/ ____/ |/ / ____/ | | / / |/_ __/ ____/ / / / /_ __/
/ __/ | / / / /| | | / / /| | / / / __/______/ /_/ / / /
/ /___ / / /___/ ___ | |/ / ___ |/ / / /__/_____/ __ / / /
/_____//_/|_\____/_/ |_|___/_/ |_/_/ /_____/ /_/ /_/ /_/
⠀⠀⠀⠀⠀⠀⠀⠀⢀⣀⣀⣀⣀⣀⣀⡀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠀⠀⠀⠀⠀⢀⣀⡿⠿⠿⠿⠿⠿⠿⢿⣀⣀⣀⣀⣀⡀⠀⠀
⠀⠀⠀⠀⠀⠀⠸⠿⣇⣀⣀⣀⣀⣀⣀⣸⠿⢿⣿⣿⣿⡇⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠻⠿⠿⠿⠿⠿⣿⣿⣀⡸⠿⢿⣿⡇⠀⠀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣤⣤⣿⣿⣿⣧⣤⡼⠿⢧⣤⡀
⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⣤⣤⣿⣿⣿⣿⠛⢻⣿⡇⠀⢸⣿⡇
⠀⠀⠀⠀⠀⠀⠀⠀⣤⣤⣿⣿⣿⣿⠛⠛⠀⢸⣿⡇⠀⢸⣿⡇
⠀⠀⠀⠀⠀⠀⢠⣤⣿⣿⣿⣿⠛⠛⠀⠀⠀⢸⣿⡇⠀⢸⣿⡇
⠀⠀⠀⠀⢰⣶⣾⣿⣿⣿⠛ ⠀⠀⠀⠀⠀⠈⠛⢳⣶⡞⠛⠁⠀
⢰⣶⡎⠉⢹⣿⡏⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⢸⣿⣷⣶⡎⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
⠀⠉⠉⠉⠁⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀
"""
[docs]
def add_common_args(parser):
parser.add_argument(
'--pairing-method',
nargs='?',
const='r',
choices=['r', 'fp', 't'],
type=str,
help="Enable pairing of gRNA to output a dual-guide library. Default is 'r': pairing all guides that target different SNPs together. 'fp': pairing guides about a fixed point. 't': tiled pairing"
)
parser.add_argument(
"-f",
"--fixed-points-list",
type=str,
help="1 or more fixed points, comma-separated (genomic coordinate without chr#, eg: 11989251,12002042,...]) in your locus to pair guides around."
)
"""Add arguments common to multiple subcommands"""
parser.add_argument(
"-o", "--output_dir",
required=True,
type=str,
help="Output directory. Folder name if you are already in the excavate folder.",
)
[docs]
def add_generate_parser(subparsers):
"""Add the generate subcommand parser"""
gen_parser = subparsers.add_parser(
"generate",
help="Generate complete gRNA library using inputted variant data"
)
gen_parser.add_argument("--vcf", type=str, required=True, help="path to the vcf file, comma-separated if more than one (cell_line.vcf.gz,1000genomes.vcf.gz). If using a phased cell-line vcf, put that first.")
gen_parser.add_argument("--var_type", type=str, required=True, help="'cell-line' or 'population', with a number and comma-separated if more than one (eg: cell-line,population1,population2)")
gen_parser.add_argument("--chrom_fa", type=str, required=True, help="path to the chromosome fasta file for your locus of interest (eg: ch1sequence.fasta)")
gen_parser.add_argument("--locus", type=str, required=True, help="chr#:start-end")
gen_parser.add_argument(
"--af-threshold",
type=str,
nargs="?",
default='0.1',
const='0.1',
help="allele frequency threshold between 0 and 1 (default = 0.1)"
)
gen_parser.add_argument( #this needs to be an option, not a flag
"--cas",
type=str,
help="One of SpCas9, SpCas9_NG, enAsCas12a, or SaCas9",
)
gen_parser.add_argument( #this needs to be an option, not a flag
"--pam-list",
type=str,
help="PAM sequences, if not using one of the supported Cas-species. Comma-separated. Use IUPAC codes.",
)
gen_parser.add_argument( #this needs to be an option, not a flag
"--orient",
choices=['3prime', '5prime'],
type=str,
help="PAM orientation",
)
gen_parser.add_argument(
"-g",
"--guide-length",
type=int,
nargs="?",
default=20,
const=20,
help="guide length in base pairs (defaut = 20)"
)
gen_parser.add_argument(
"-m",
"--max_snppos_in_protospacer",
type=int,
nargs="?",
default=10,
const=10,
help="maximum distance in bp of SNP from PAM sequence (default = 10)"
)
gen_parser.add_argument(
"--off-targets",
action="store_true",
help="Run off-target analysis: exact matches across genome and 1-bp mismatches in chromosome. Uses Bowtie1.",
)
gen_parser.add_argument(
"--download-hg38-index",
action="store_true",
help="Download prebuilt hg38 Bowtie indexes (GRCh38_noalt_as) into outdir/bowtie_index and use them.",
)
gen_parser.add_argument(
"--genome-index-prefix",
type=str,
default=None,
help="Bowtie1 index prefix for the genome FASTA (e.g., /path/to/index/hg38_bt1). "
"If missing or index files are not found, EXCAVATE-HT will build it next to this prefix.",
)
gen_parser.add_argument(
"--genome_fa",
type=str,
required=False,
help="path to the whole genome fasta file for your organism"
)
gen_parser.add_argument(
"--chrom-index-prefix",
type=str,
default=None,
help="Bowtie1 index prefix for the chromosome FASTA (e.g., /path/to/index/chr1_bt1). "
"If missing or index files are not found, EXCAVATE-HT will build it next to this prefix.",
)
gen_parser.add_argument(
"--bowtie-threads",
type=int,
default=None,
help="Number of threads for Bowtie1. Defaults to NSLOTS if set (HPC), otherwise 4.",
)
gen_parser.add_argument(
"--split-phased",
action='store_true',
help="Enable splitting of gRNA libraries by cell-line phasing.",
)
gen_parser.add_argument(
"--summary",
action='store_true',
help="Enable to output a summary table for each gRNA library.",
)
gen_parser.add_argument(
"--per-vcf",
action='store_true',
help="Enable to save single-gRNA libraries for each VCF file, split by allele.",
)
add_common_args(gen_parser)
return gen_parser
[docs]
def add_pair_parser(subparsers):
"""Add the pair subcommand parser"""
pair_parser = subparsers.add_parser(
"pair",
help="Pair your single-gRNA library"
)
pair_parser.add_argument(
"-i", "--input-library",
required=True,
type=str,
help="Path to input single-gRNA library file"
)
add_common_args(pair_parser)
return pair_parser
[docs]
def initialize_cas_obj(args):
# Initialize cas_obj based on inputted Cas enzyme or custom PAM and orientation
if args.cas is not None and args.pam_list is not None:
raise ValueError('Both Cas and PAM-list given, please only input one of these.')
elif args.cas is not None:
cas_name = args.cas
if hasattr(ap, cas_name):
cas_obj = getattr(ap, cas_name)
else:
raise ValueError(f"Invalid Cas enzyme name: {cas_name}, choose one of SpCas9, SpCas9-NG, enAsCas12a, or SaCas9. Otherwise, input custom PAMs")
elif args.pam_list is not None:
if args.orient is not None:
pam_list = args.pam_list.split(',')
cas_obj = ap.create_custom_cas_obj(pam_list, args.orient)
else:
raise ValueError("Custom PAM orientation not provided. Input --orient 5prime or 3prime")
else:
raise ValueError("Neither Cas nor custom PAM provided. Please input one of these.")
return cas_obj
[docs]
def load_variant_data(vcf_file, var_type, locus, af_threshold):
# check if vcf file path exists
for f in vcf_file:
if not os.path.exists(f):
raise FileNotFoundError(f"VCF file '{f}' does not exist.")
# check if var_types are valid
valid_types = {'cell-line', 'population'}
def get_chars_before_number(text): # function to store only the letter chars from var_type, to aid checking if the input is valid
for i, char in enumerate(text):
if char.isdigit():
return text[:i]
return text
for vt in var_type:
vt_letters = get_chars_before_number(vt)
if vt_letters not in valid_types:
raise ValueError(f"Invalid variant type: {vt}. Must be one of {valid_types}.")
# check if all inputs are the same length
if not (len(vcf_file) == len(var_type)):
raise ValueError("Arguments vcf_file and var_type must have the same number of comma-separated values.")
# check if af threshold is a valid float
try:
af_threshold = float(af_threshold)
if not (0 <= af_threshold <= 1):
raise ValueError("Allele frequency threshold must be between 0 and 1.")
except ValueError:
raise ValueError("Invalid allele frequency threshold. Must be a number between 0 and 1.")
# executing create_gens to create dataframe of SNP genotypes from VCF file data
gens_dict = {} # Dictionary to store gens dataframes
for i in range(len(vcf_file)): # For each vcf file provided
key = f"gens_df_{var_type[i]}" # Create a unique key for each iteration
gens_dict[key] = ap.create_gens(vcf_file[i], locus, var_type[i], af_threshold)
for i in range(len(vcf_file)):
key = f"gens_df_{var_type[i]}" # Create a unique key for each iteration
try:
gens_dict[key] = ap.create_gens(vcf_file[i], locus, var_type[i], af_threshold)
except Exception as e:
raise RuntimeError(f"Failed to create gens data for file {vcf_file[i]} with variant type {var_type[i]}: {e}")
return gens_dict
[docs]
def run_off_targets(
args,
all_guides_unique,
genome_fa,
cas_obj,
outdir,
chrom_name,
chrom_fasta_path,
):
"""
Run Bowtie1-based off-target analysis.
Parameters
----------
args : argparse.Namespace
Must include: off_targets, genome_index_prefix, chrom_index_prefix, bowtie_threads
Optional: no_index_build (if you add it)
all_guides_unique : pd.DataFrame (or your guide table type)
genome_fa : str | Path
Path to genome FASTA (used for index auto-build if enabled)
cas_obj : object
Your CasParameters/whatever ot.add_bowtie_offtargets expects
outdir : str | Path
Output directory for caching indexes/tmp files
chrom_name : str
e.g. "chr1"
chrom_fasta_path : str | Path
Path to chromosome FASTA (used for index auto-build if enabled)
"""
def _default_threads(user_threads: int | None) -> int:
if user_threads is not None:
return max(1, int(user_threads))
nslots = os.environ.get("NSLOTS")
if nslots and nslots.isdigit():
return max(1, int(nslots))
return 4
# If user didn't request off-targets, do nothing
if not getattr(args, "off_targets", False):
return all_guides_unique
outdir = Path(outdir)
outdir.mkdir(parents=True, exist_ok=True)
print(
"Beginning off-target analysis. This may take time depending on genome/index availability.\n"
"Tip: on HPC, submit as a job; on laptop, prevent sleep / power loss."
)
# Threads
threads = _default_threads(getattr(args, "bowtie_threads", None))
# Decide index prefixes (defaults into outdir so we can write/cache)
if args.download_hg38_index:
genome_index_prefix = ot.download_hg38_bt2(outdir)
elif getattr(args, "genome_index_prefix", None) is None:
genome_index_prefix = str(outdir / "bowtie_index" / "genome_bt1")
else:
genome_index_prefix = str(args.genome_index_prefix)
if getattr(args, "chrom_index_prefix", None) is None:
chr_index_prefix = str(outdir / "bowtie_index" / f"{chrom_name}_bt1")
else:
chr_index_prefix = str(args.chrom_index_prefix)
# tmp directory
tmp_dir = outdir / "tmp_offtargets"
tmp_dir.mkdir(parents=True, exist_ok=True)
try:
all_guides_final = ot.add_bowtie_offtargets(
all_guides_unique,
cas_obj,
genome_index_prefix=genome_index_prefix,
chr_index_prefix=chr_index_prefix,
ensure_index=True,
genome_fasta_for_autobuild=str(genome_fa),
chr_fasta_for_autobuild=str(chrom_fasta_path),
threads_genome=threads,
threads_chr=threads,
tmp_dir=str(tmp_dir),
exact_col="exact_matches_in_genome",
mm1_col=f"1bp_mismatch_matches_in_{chrom_name}",
)
print("Off-target analysis completed!")
return all_guides_final
except Exception as e:
raise RuntimeError(f"Failed to perform off-targets analysis: {e}") from e
[docs]
def apply_pairing(all_guides, method, fixed_points=None):
if method == 'r':
return ap.random_pair(all_guides)
elif method == 'fp':
return ap.fixed_point_pair(all_guides, fixed_points)
elif method == 't':
return ap.tiling_pair(all_guides)
else:
raise ValueError(f"Unsupported pairing method: {method}")
[docs]
def run_generate(args):
print(excavate_banner)
# get output directory
outdir = args.output_dir
cas_obj = initialize_cas_obj(args)
print(f'Initialized Cas object: {cas_obj.name}')
# Use inputted VCF files to generate dataframe of SNPs to find gRNA for.
# Extract all args from args
vcf_file = args.vcf.split(',')
var_type = args.var_type.split(',')
#checking validity of locus arg, and saving start and end for downstream uses (mainly verifying if inputted fixed-points are within the locus)
locus = args.locus
region_match = re.match(r"chr\w+:(\d+)-(\d+)", args.locus)
if not region_match:
raise ValueError("Locus format must be chr:start-end")
start = int(region_match.group(1))
end = int(region_match.group(2))
af_threshold = args.af_threshold
# create list of snps according to criteria, for each given vcf file and var type
gens_dict = load_variant_data(vcf_file, var_type, locus, af_threshold)
print(f'Loaded variant data from {vcf_file}')
# Make chr Seq object using chr fasta - this is not in a function, shoud it be? maybe
fa_file = args.chrom_fa
if not os.path.exists(fa_file):
raise FileNotFoundError(f"Chromosome fasta file '{fa_file}' does not exist.")
else:
try:
chseq = ap.makeseq(fa_file)
except Exception as e:
raise RuntimeError(f"Failed to load chromosome sequence from file {fa_file}: {e}")
print(f'Loaded chromosome sequence from {fa_file}')
# Create allelic sequences for each VCF file, with each of the present versions of the SNPs
seq_dict = {}
try:
for i in gens_dict: # for each set of snps
key = f"{i}_allele1_seq"
seq_dict[key] = ap.getaltseq(gens_dict[i], chseq, snpform = 'allele1')
key = f"{i}_allele2_seq"
seq_dict[key] = ap.getaltseq(gens_dict[i], chseq, snpform = 'allele2')
except Exception as e:
raise RuntimeError(f"Failed to create allelic sequences from {gens_dict[i]}: {e}")
# Find guides using gens_dict[i remove the last 8 chars, to give name of corresponding gens df], for each seq.
# check if args for find_guides are valid
max_snppos_in_protospacer = args.max_snppos_in_protospacer
guide_length = args.guide_length
if not (1 <= guide_length):
raise ValueError("guide length must be at least 1.")
if not (1 <= max_snppos_in_protospacer <= guide_length):
raise ValueError("maximum SNP pos in protospacer must be between 1 and guide length.")
# run find guides with valid args
guides_dict = {}
for i in seq_dict:
key = f"{i}_guides"
guides_dict[key] = ap.find_guides(gens_dict[str(i)[:-12]], seq_dict[i], cas_obj, max_snppos_in_protospacer, guide_length)
# save individual guides dfs in separate folder in output directory if --per-vcf included
if args.per_vcf:
os.makedirs(os.path.join(outdir, "single_gRNA_libraries_per_vcf"), exist_ok=True)
for lib in guides_dict:
single_gRNA_libs_per_vcf_output = os.path.join(outdir, "single_gRNA_libraries_per_vcf", f"{lib}.csv")
guides_dict[lib].to_csv(single_gRNA_libs_per_vcf_output, index=False)
print(f'Saved single-gRNA libraries for each VCF file provided, split by allele, in {outdir}/single_gRNA_libraries_per_vcf.')
# combine all gRNA libraries and add annotations about variant presence and alt allele frequencies
print('Combining all single-gRNA libraries...')
all_guides = pd.concat(list(guides_dict.values()), ignore_index=True)
all_guides_unique = ap.all_guides_var_info(gens_dict, all_guides)
all_guides_unique = all_guides_unique.reset_index(drop = True)
# Off-target analysis, if enabled
if args.off_targets:
print("Preparing to run off-targets analysis...")
chrom_fasta_path = args.chrom_fa
chrom_name = ot.chrom_name_from_fasta(chrom_fasta_path)
if args.genome_fa:
if not os.path.exists(genome_fa):
raise FileNotFoundError(f"Whole genome fasta file '{genome_fa}' does not exist.")
else:
genome_fa = args.genome_fa
else:
genome_fa = None
all_guides_final = run_off_targets(
args=args,
all_guides_unique=all_guides_unique,
genome_fa=genome_fa,
cas_obj=cas_obj,
outdir=outdir,
chrom_name=chrom_name,
chrom_fasta_path=chrom_fasta_path,
)
else:
all_guides_final = all_guides_unique
# Save final all_guides_unique df with or without off-targets analysis in outdir
# make sure outdir exists (although we already did this above, can maybe remove)
os.makedirs(outdir, exist_ok=True)
# save all_guides df in output directory as a csv file, and bed file for viewing in a genome browser
all_guides_final_output = os.path.join(outdir, f"{locus}_gRNA.csv")
all_guides_final.to_csv(all_guides_final_output, index=False)
all_guides_final_bed = ap.output_bed_format(all_guides_final)
all_guides_final_bed_output = os.path.join(outdir, f"{locus}_gRNA_viewing.bed")
all_guides_final_bed.to_csv(all_guides_final_bed_output, sep='\t', header=False, index=False)
print(f'Combined and annotated single-gRNA library saved in {outdir}')
# Split by phasing, if enabled
# check validity of fixed-points-lists if enabled. Checking here because pairing happens in each if block sepately. Do not want to check validity multiple times.
# first, check if fixed-points given if pair == fp
fixed_points_list = args.fixed_points_list
if args.pairing_method == 'fp':
if fixed_points_list is None:
raise ValueError("Fixed point pairing enabled, please input 1 or more fixed points using --fixed-points-list")
else:
fixed_points_list = fixed_points_list.split(',')
# second, check if points are valid (int, and fall between locus start and end)
for point in fixed_points_list:
try:
point_int = int(point)
except ValueError:
raise ValueError(f"Invalid fixed point: {point} is not an integer")
if not (start < point_int < end):
raise ValueError(f"Fixed point {point} is out of locus range {start}-{end}")
# if all is gucci, go ahead with splitting by phasing and pairing
if args.split_phased:
# split
phased_vcf = vcf_file[0]
try:
split_list = ap.split_phased(all_guides_final, phased_vcf, locus)
all_guides_allele1 = split_list[0]
all_guides_allele2 = split_list[1]
#save allele 1 csv and bed files
all_guides_allele1_output = os.path.join(outdir, f"{locus}_gRNA_allele1.csv")
all_guides_allele1.to_csv(all_guides_allele1_output, index=False)
all_guides_allele1_bed = ap.output_bed_format(all_guides_allele1)
all_guides_allele1_bed_output = os.path.join(outdir, f"{locus}_gRNA_allele1_viewing.bed")
all_guides_allele1_bed.to_csv(all_guides_allele1_bed_output, sep='\t', header=False, index=False)
#save allele 2 csv and bed files
all_guides_allele2_output = os.path.join(outdir, f"{locus}_gRNA_allele2.csv")
all_guides_allele2.to_csv(all_guides_allele2_output, index=False)
all_guides_allele2_bed = ap.output_bed_format(all_guides_allele2)
all_guides_allele2_bed_output = os.path.join(outdir, f"{locus}_gRNA_allele2_viewing.bed")
all_guides_allele2_bed.to_csv(all_guides_allele2_bed_output, sep='\t', header=False, index=False)
print(f'Single-gRNA library split by phasing, saved in {outdir}')
except Exception as e:
raise RuntimeError(f"Failed to split all_guides dataframe by phased alleles: {e}")
# if pairing enabled, pair each allele's guides according to specified method
if args.pairing_method:
all_guides_allele1_paired = apply_pairing(all_guides_allele1, args.pairing_method, fixed_points_list)
all_guides_allele2_paired = apply_pairing(all_guides_allele2, args.pairing_method, fixed_points_list)
all_guides_allele1_paired_output = os.path.join(outdir, f"{locus}_gRNA_allele1_paired.csv")
all_guides_allele2_paired_output = os.path.join(outdir, f"{locus}_gRNA_allele2_paired.csv")
all_guides_allele1_paired.to_csv(all_guides_allele1_paired_output, index=False)
all_guides_allele2_paired.to_csv(all_guides_allele2_paired_output, index=False)
print(f'Paired dual-guide libraries for each allele saved in {outdir}')
# if split-phased disabled, don't split, perform pairing on all_guides_final
else:
if args.pairing_method:
all_guides_paired = apply_pairing(all_guides_final, args.pairing_method, fixed_points_list)
all_guides_paired_output = os.path.join(outdir, f"{locus}_gRNA_paired.csv")
all_guides_paired.to_csv(all_guides_paired_output, index=False)
print(f'Paired dual-guide library saved in {outdir}')
# Generate summary table if --summary enabled
if args.summary:
if args.split_phased:
all_guides_allele1_summary = ap.targetable_vars(all_guides_allele1)
all_guides_allele2_summary = ap.targetable_vars(all_guides_allele2)
all_guides_allele1_summary_output = os.path.join(outdir, f"{locus}_gRNA_allele1_summary.csv")
all_guides_allele2_summary_output = os.path.join(outdir, f"{locus}_gRNA_allele2_summary.csv")
all_guides_allele1_summary.to_csv(all_guides_allele1_summary_output, index=False)
all_guides_allele2_summary.to_csv(all_guides_allele2_summary_output, index=False)
print('Summary tables saved')
else:
all_guides_summary = ap.targetable_vars(all_guides_final)
all_guides_summary_output = os.path.join(outdir, f"{locus}_gRNA_summary.csv")
all_guides_summary.to_csv(all_guides_summary_output, index=False)
print('Summary table saved')
print('EXCAVATE-HT has finished running! Enjoy your gRNA libraries! :)')
return
[docs]
def run_pairing(args):
print(excavate_banner)
from pathlib import Path
# get output directory
outdir = args.output_dir
#sort input library by values of SNP position and guide start
input_library_path = args.input_library
if not os.path.exists(input_library_path):
raise FileNotFoundError(f"input library file '{input_library_path}' does not exist.")
input_library_path_obj = Path(input_library_path)
input_library_name = input_library_path_obj.stem # Get filename without extension
all_guides = pd.read_csv(input_library_path_obj)
all_guides = all_guides.sort_values(by=["SNP position","start"]).reset_index(drop=True)
#check if fp
fixed_points_list = args.fixed_points_list
if args.pairing_method == 'fp':
if fixed_points_list is None:
raise ValueError("Fixed point pairing enabled, please input 1 or more fixed points using --fixed-points-list")
else:
fixed_points_list = fixed_points_list.split(',')
# second, check if points are valid int
for point in fixed_points_list:
try:
point_int = int(point)
except ValueError:
raise ValueError(f"Invalid fixed point: {point} is not an integer")
# if pairing enabled, pair each allele's guides according to specified method
all_guides_paired = apply_pairing(all_guides, args.pairing_method, fixed_points_list)
if all_guides_paired.empty:
raise ValueError("No pairs made. Check if fixed-points fall within the given genomic locus")
os.makedirs(outdir, exist_ok=True)
all_guides_paired_output = os.path.join(outdir, f"{input_library_name}_paired.csv")
all_guides_paired.to_csv(all_guides_paired_output, index=False)
print(f'Paired dual-guide library saved in {outdir}')
return
[docs]
def main():
"""Main entry point for the gRNA pipeline"""
parser = argparse.ArgumentParser(
description="EXCAVATE-HT: a pipeline to identify targetable genomic variants and generate allele-specific single and paired-gRNA libraries",
prog="excavate"
)
# Add version argument
parser.add_argument(
"--version",
action="version",
version="%(prog)s 1.0.0"
)
# Create subparsers
subparsers = parser.add_subparsers(
dest="command",
title="subcommands",
description="Available pipeline commands",
help="Use 'excavate-ht <command> --help' for command-specific help"
)
# Add subcommand parsers
gen_parser = add_generate_parser(subparsers)
pair_parser = add_pair_parser(subparsers)
# Parse arguments
if len(sys.argv) == 1:
parser.print_help()
sys.exit(0)
args = parser.parse_args()
# Handle case where subcommand is provided but no arguments
if not hasattr(args, 'output_dir'):
if args.command == "generate":
gen_parser.print_help()
elif args.command == "pair":
pair_parser.print_help()
else:
parser.print_help()
sys.exit(0)
# Execute appropriate command
if args.command == "generate":
run_generate(args)
elif args.command == "pair":
run_pairing(args)
if __name__ == "__main__":
main()