#!/usr/bin/env bash
# Module 5: BAM Processing + Variant Calling Pipeline (GATK4)
# Usage: bash variant_pipeline.sh <sample_name>

set -euo pipefail

SAMPLE="${1:-SRR7890001}"
REF="data/reference/chr22.fa"
BAM="results/${SAMPLE}/aligned/${SAMPLE}_sorted.bam"
VCF_DIR="results/${SAMPLE}/variants"
THREADS=4

mkdir -p "${VCF_DIR}"

echo "=== Variant Calling Pipeline: ${SAMPLE} ==="

# Step 1: Mark duplicates
echo "[1/6] Marking duplicates with Picard..."
picard MarkDuplicates \
  -I "${BAM}" \
  -O "${VCF_DIR}/${SAMPLE}_markdup.bam" \
  -M "${VCF_DIR}/${SAMPLE}_markdup_metrics.txt" \
  --VALIDATION_STRINGENCY LENIENT \
  2> "${VCF_DIR}/${SAMPLE}_markdup.log"
samtools index "${VCF_DIR}/${SAMPLE}_markdup.bam"

# Step 2: BQSR — generate recalibration table
echo "[2/6] Base Quality Score Recalibration (BQSR)..."
# NOTE: Requires known-sites VCF (dbSNP). For teaching, we skip BQSR with a flag.
# In production: download dbSNP VCF for your genome build first.
echo "  (Skipping BQSR for teaching purposes — in production use --known-sites dbsnp.vcf.gz)"

# Step 3: HaplotypeCaller — GVCF mode
echo "[3/6] Calling variants with GATK HaplotypeCaller..."
gatk HaplotypeCaller \
  -R "${REF}" \
  -I "${VCF_DIR}/${SAMPLE}_markdup.bam" \
  -O "${VCF_DIR}/${SAMPLE}.g.vcf.gz" \
  -ERC GVCF \
  --native-pair-hmm-threads "${THREADS}" \
  2> "${VCF_DIR}/${SAMPLE}_haplotypecaller.log"

# Step 4: GenotypeGVCFs
echo "[4/6] Genotyping GVCFs..."
gatk GenotypeGVCFs \
  -R "${REF}" \
  -V "${VCF_DIR}/${SAMPLE}.g.vcf.gz" \
  -O "${VCF_DIR}/${SAMPLE}_raw.vcf.gz" \
  2> "${VCF_DIR}/${SAMPLE}_genotype.log"

# Step 5: Hard filter (teaching shortcut — VQSR requires many samples)
echo "[5/6] Hard filtering variants..."
# SNPs
gatk SelectVariants -R "${REF}" \
  -V "${VCF_DIR}/${SAMPLE}_raw.vcf.gz" \
  --select-type-to-include SNP \
  -O "${VCF_DIR}/${SAMPLE}_raw_snps.vcf.gz"

gatk VariantFiltration -R "${REF}" \
  -V "${VCF_DIR}/${SAMPLE}_raw_snps.vcf.gz" \
  --filter-expression "QD < 2.0"  --filter-name "QD2" \
  --filter-expression "FS > 60.0" --filter-name "FS60" \
  --filter-expression "MQ < 40.0" --filter-name "MQ40" \
  -O "${VCF_DIR}/${SAMPLE}_filtered_snps.vcf.gz"

# Step 6: Summary stats
echo "[6/6] Variant summary..."
bcftools stats "${VCF_DIR}/${SAMPLE}_filtered_snps.vcf.gz" \
  | grep "^SN" \
  | tee "${VCF_DIR}/${SAMPLE}_variant_stats.txt"

echo ""
echo "Done! VCF at: ${VCF_DIR}/${SAMPLE}_filtered_snps.vcf.gz"
echo "Load in IGV alongside the BAM to see variants in context."
