#!/usr/bin/env python3
"""
Import PSAT 8/9 CSV batches into a single SQLite table.

Why this design?
- The CSV "ID" column may repeat across batches. We therefore store it as `csv_id`
  and use an AUTOINCREMENT primary key (`row_id`) plus a UNIQUE constraint on
  (batch_file, csv_id). This avoids collisions while preserving the original IDs.
- Fast + safe inserts: wrapped in a transaction, parameterized inserts, and a few PRAGMAs.

Usage examples:
  python import_psat_csv_to_sqlite.py --db psat.db --glob "psat_8_9_practice_batch_*.csv"
  python import_psat_csv_to_sqlite.py --db psat.db --folder /path/to/folder

After importing, example queries:
  - Total rows:        SELECT COUNT(*) FROM psat_items;
  - By section:        SELECT section, COUNT(*) FROM psat_items GROUP BY section;
  - By difficulty:     SELECT difficulty, COUNT(*) FROM psat_items GROUP BY difficulty;
"""

import argparse
import csv
import glob
import os
import sqlite3
from typing import List, Optional

# Exact header names every CSV batch file must contain (order irrelevant).
REQUIRED_COLUMNS = ["ID","Section","Difficulty","Question","Options","Answer","Explanation"]

# Schema: AUTOINCREMENT surrogate key (row_id) plus a UNIQUE index on
# (batch_file, csv_id), because the CSV "ID" column may repeat across batches
# (see module docstring). The secondary indexes support the section/difficulty
# GROUP BY queries shown in the module docstring.
DDL = """
CREATE TABLE IF NOT EXISTS psat_items (
  row_id      INTEGER PRIMARY KEY AUTOINCREMENT,
  batch_file  TEXT    NOT NULL,
  csv_id      INTEGER,
  section     TEXT    NOT NULL,
  difficulty  TEXT    NOT NULL,
  question    TEXT    NOT NULL,
  options     TEXT    NOT NULL,
  answer      TEXT    NOT NULL,
  explanation TEXT
);
CREATE UNIQUE INDEX IF NOT EXISTS ux_psat_items_batch_csvid ON psat_items(batch_file, csv_id);
CREATE INDEX IF NOT EXISTS ix_psat_items_section ON psat_items(section);
CREATE INDEX IF NOT EXISTS ix_psat_items_difficulty ON psat_items(difficulty);
"""

def find_csvs(folder: Optional[str] = None, pattern: Optional[str] = None) -> List[str]:
    """Locate the CSV files to import.

    Precedence: an explicit glob *pattern* wins over *folder*; with neither,
    fall back to the expected batch naming pattern in the current directory.

    Args:
        folder: directory whose ``*.csv`` files should be imported.
        pattern: glob pattern selecting the CSV files directly.

    Returns:
        Sorted list of paths, filtered to regular files (a glob could also
        match a directory named e.g. ``something.csv``).
    """
    if pattern:
        candidates = sorted(glob.glob(pattern))
    elif folder:
        candidates = sorted(glob.glob(os.path.join(folder, "*.csv")))
    else:
        # Default: expected batch naming pattern in the current working directory.
        candidates = sorted(glob.glob("psat_8_9_practice_batch_*.csv"))
    return [p for p in candidates if os.path.isfile(p)]

def ensure_columns(header: List[str]):
    """Validate that *header* contains every column in REQUIRED_COLUMNS.

    Raises:
        ValueError: listing the absent columns (in canonical order) together
            with the header actually found in the file.
    """
    present = set(header)
    missing = [column for column in REQUIRED_COLUMNS if column not in present]
    if missing:
        raise ValueError(f"CSV missing required columns: {missing}. Found header: {header}")

def import_file(conn: sqlite3.Connection, csv_path: str, verbose: bool = True) -> int:
    """Import one CSV batch file into the ``psat_items`` table.

    All rows are read first, then inserted in a single transaction using
    INSERT OR IGNORE, so re-importing the same batch file is idempotent
    (duplicates are skipped by the UNIQUE (batch_file, csv_id) index).

    Args:
        conn: open SQLite connection with the psat_items schema created.
        csv_path: path to the CSV batch file.
        verbose: print a one-line per-file summary when True.

    Returns:
        Number of rows actually inserted (ignored duplicates excluded).

    Raises:
        ValueError: if the CSV header is missing required columns.
    """
    # utf-8-sig transparently strips a UTF-8 BOM if one is present.
    with open(csv_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        ensure_columns(reader.fieldnames or [])
        batch = os.path.basename(csv_path)
        rows = [
            (
                batch,                                 # batch_file
                safe_int(row.get("ID")),               # csv_id
                (row.get("Section") or "").strip(),    # section
                (row.get("Difficulty") or "").strip(), # difficulty
                (row.get("Question") or "").strip(),   # question
                (row.get("Options") or "").strip(),    # options
                (row.get("Answer") or "").strip(),     # answer
                (row.get("Explanation") or "").strip() # explanation
            )
            for row in reader
        ]
    with conn:  # transaction: commit on success, rollback on exception
        cur = conn.executemany(
            """INSERT OR IGNORE INTO psat_items
               (batch_file, csv_id, section, difficulty, question, options, answer, explanation)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
            rows
        )
        # DB-API 2.0: rowcount is -1 (never None) when undetermined, so the
        # original `is not None` guard could let -1 through; clamp at zero.
        inserted = max(cur.rowcount, 0)
    if verbose:
        print(f"Imported {inserted:>5} rows from {os.path.basename(csv_path)}")
    return inserted

def safe_int(x) -> Optional[int]:
    """Best-effort conversion of *x* to int, returning None on failure.

    The CSV "ID" column may be blank or malformed, so parse failures are
    reported as None rather than raised. Only the expected conversion errors
    are caught — a bare ``except Exception`` would also hide real bugs.
    """
    try:
        return int(str(x).strip())
    except (TypeError, ValueError):
        return None

def main():
    """CLI entry point: discover CSV batch files and import them into SQLite."""
    ap = argparse.ArgumentParser(description="Import PSAT 8/9 CSV batches into SQLite")
    ap.add_argument("--db", required=True, help="Path to SQLite database file to create/use (e.g., psat.db)")
    group = ap.add_mutually_exclusive_group()
    group.add_argument("--glob", help="Glob pattern for CSVs (e.g., 'psat_8_9_practice_batch_*.csv')")
    group.add_argument("--folder", help="Folder containing CSVs")
    ap.add_argument("--verbose", action="store_true", help="Print progress")
    args = ap.parse_args()

    csv_files = find_csvs(folder=args.folder, pattern=args.glob)
    if not csv_files:
        raise SystemExit("No CSV files found. Use --glob or --folder to point to your CSVs.")

    if args.verbose:
        print(f"Found {len(csv_files)} CSV files. Creating/connecting to DB: {args.db}")

    conn = sqlite3.connect(args.db)
    try:
        # Performance pragmas (safe defaults; adjust if needed)
        conn.execute("PRAGMA journal_mode = WAL;")
        conn.execute("PRAGMA synchronous = NORMAL;")
        conn.execute("PRAGMA temp_store = MEMORY;")
        conn.execute("PRAGMA mmap_size = 30000000000;")  # 30GB hint (ignored if unsupported)

        # Create schema. executescript() runs the multi-statement DDL natively;
        # the previous manual split on ";" fed a stray whitespace-only fragment
        # to execute() and would break on any ";" inside a SQL string literal.
        conn.executescript(DDL)

        total = 0
        for path in csv_files:
            total += import_file(conn, path, verbose=args.verbose)

        if args.verbose:
            print(f"Done. Inserted {total} rows in total.")
    finally:
        conn.close()

# Run the importer only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
