#!/usr/bin/env bash
# Module 6: RNA-seq Pipeline — Alignment + Quantification
# Usage: bash rnaseq_pipeline.sh <sample_name>

set -euo pipefail

SAMPLE="${1:-SRR7890001}"
REF="data/reference/chr22.fa"
GTF="data/reference/chr22.gtf"
RAW_DIR="data/raw"
RNASEQ_DIR="results/${SAMPLE}/rnaseq"
THREADS=4

mkdir -p "${RNASEQ_DIR}/hisat2" "${RNASEQ_DIR}/counts" "${RNASEQ_DIR}/salmon"

echo "=== RNA-seq Pipeline: ${SAMPLE} ==="

# --- PATH A: Alignment-based (HISAT2 + featureCounts) ---

# Build HISAT2 index (one-time)
if [ ! -f "data/reference/hisat2_index/genome.1.ht2" ]; then
  echo "[0/A] Building HISAT2 index..."
  mkdir -p data/reference/hisat2_index
  hisat2-build "${REF}" data/reference/hisat2_index/genome \
    2> data/reference/hisat2_build.log
fi

# Align with HISAT2 (splice-aware)
echo "[1/A] Aligning with HISAT2..."
hisat2 \
  -x data/reference/hisat2_index/genome \
  -1 "${RAW_DIR}/${SAMPLE}_R1.fastq.gz" \
  -2 "${RAW_DIR}/${SAMPLE}_R2.fastq.gz" \
  -p "${THREADS}" \
  --rg-id "${SAMPLE}" \
  --rg "SM:${SAMPLE}" \
  --dta \
  2> "${RNASEQ_DIR}/hisat2/${SAMPLE}_hisat2.log" \
  | samtools sort -@ "${THREADS}" -o "${RNASEQ_DIR}/hisat2/${SAMPLE}_sorted.bam"
# Index as a separate step — `samtools index` needs a real file on disk, it
# cannot index a coordinate-sorted BAM streamed on stdin.
samtools index "${RNASEQ_DIR}/hisat2/${SAMPLE}_sorted.bam"

# Count reads per gene with featureCounts
echo "[2/A] Counting reads with featureCounts..."
featureCounts \
  -T "${THREADS}" \
  -p --countReadPairs \
  -a "${GTF}" \
  -o "${RNASEQ_DIR}/counts/${SAMPLE}_counts.txt" \
  "${RNASEQ_DIR}/hisat2/${SAMPLE}_sorted.bam" \
  2> "${RNASEQ_DIR}/counts/${SAMPLE}_featureCounts.log"

echo ""
echo "Alignment-based counts: ${RNASEQ_DIR}/counts/${SAMPLE}_counts.txt"
echo ""
echo "Next: open day2/module6_rnaseq/deseq2_analysis.R to run DE analysis"
echo "      (or the Jupyter notebook for the Python/pydeseq2 version)"
