midnight_season_1_tanking_s…/parse_dungeon.py

"""Parse dungeon HTML files from method.gg to extract boss info, abilities, and assets.

Usage: poetry run python parse_dungeon.py dungeons/<filename>.html
"""

import base64
import os
import re
import sys

from lxml import etree


def slugify(text: str) -> str:
    """Turn arbitrary text into a lowercase, underscore-separated identifier.

    Every run of characters outside ``a-z``, ``A-Z`` and ``0-9`` becomes a
    single underscore; leading and trailing underscores are then removed and
    the result is lowercased.
    """
    underscored = re.sub(r"[^a-zA-Z0-9]+", "_", text)
    trimmed = underscored.strip("_")
    return trimmed.lower()


def save_base64_image(src: str, filename: str) -> str | None:
    """Extract base64 image data and save to file. Returns filename if saved."""
    if not src.startswith("data:image/"):
        return None
    match = re.match(r"data:image/(\w+);base64,(.*)", src)
    if not match:
        return None
    ext = match.group(1)
    if ext == "jpeg":
        ext = "jpg"
    data = base64.b64decode(match.group(2))
    full_path = f"assets/{filename}.{ext}"
    with open(full_path, "wb") as f:
        f.write(data)
    print(f"  Saved: {full_path} ({len(data)} bytes)")
    return full_path


def _element_text(element) -> str:
    """Return the stripped text rendering of ``element`` with markup removed."""
    return etree.tostring(element, method="text", encoding="unicode").strip()


def _print_banner(title: str) -> None:
    """Print a blank line, then ``title`` framed by two 80-column ``=`` rules."""
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)


def _enclosing_section_slug(element, fallback: str) -> str:
    """Slugify the heading of the nearest ancestor that holds a section title.

    Walks up from ``element`` and, at the first ancestor containing a
    ``guide-section-title`` div with an ``<h2>`` inside it, returns the slug
    of that heading. Returns ``fallback`` when no ancestor has one.
    """
    ancestor = element.getparent()
    while ancestor is not None:
        # contains() rather than an exact @class match, so title divs carrying
        # extra classes are found — same rule as the section listing in main().
        headings = ancestor.xpath(
            './/div[contains(@class, "guide-section-title")]//h2'
        )
        if headings:
            return slugify(_element_text(headings[0]))
        ancestor = ancestor.getparent()
    return fallback


def main(html_file: str) -> None:
    """Print a textual report of a dungeon guide page and save its images.

    The report (written to stdout) lists the dungeon title, the guide section
    headings, up to 3000 characters of each section's text, and every Wowhead
    spell link with up to 300 characters of surrounding context. Embedded
    base64 images (boss renders, the dungeon header, mob icons and the dungeon
    selector thumbnails) are written to the ``assets/`` directory via
    ``save_base64_image``.

    Args:
        html_file: Path to the saved HTML page. Its base name (without the
            extension) is slugified and used to name the header image.

    Raises:
        ValueError: If the parser yields no root element (e.g. an empty file).
    """
    # Derive dungeon slug from filename
    dungeon_slug_clean = slugify(os.path.splitext(os.path.basename(html_file))[0])

    root = etree.parse(html_file, etree.HTMLParser()).getroot()
    if root is None:
        # Fail with a clear message instead of an AttributeError further down.
        raise ValueError(f"No HTML content could be parsed from {html_file!r}")

    os.makedirs("assets", exist_ok=True)

    # ── Dungeon title ──
    for heading in root.xpath('//h1[contains(@class, "main-title")]'):
        print("=== DUNGEON TITLE ===")
        print(_element_text(heading))

    # ── Guide sections ──
    section_headings = root.xpath('//div[contains(@class, "guide-section-title")]/h2')
    print("\n=== GUIDE SECTIONS ===")
    for heading in section_headings:
        print(f"  {_element_text(heading)}")

    # ── Full section content ──
    _print_banner("=== FULL SECTION CONTENT ===")
    for title_div in root.xpath('//div[contains(@class, "guide-section-title")]'):
        h2 = title_div.find(".//h2")
        if h2 is None:
            continue
        container = title_div.getparent()
        if container is None:
            continue
        title = _element_text(h2)

        # The title div's parent is treated as the whole section; collapse
        # blank-line runs and repeated spaces so the dump stays readable.
        full_text = _element_text(container)
        full_text = re.sub(r"\n\s*\n", "\n\n", full_text)
        full_text = re.sub(r"  +", " ", full_text)

        print(f"\n{'─' * 60}")
        print(f"SECTION: {title}")
        print(f"{'─' * 60}")
        print(full_text[:3000])
        if len(full_text) > 3000:
            print(f"... [truncated, total {len(full_text)} chars]")

    # ── Boss images ──
    _print_banner("=== EXTRACTING BOSS IMAGES ===")
    for i, img in enumerate(root.xpath('//img[contains(@class, "section-boss")]')):
        # Name the file after the enclosing section, else by position.
        save_base64_image(img.get("src", ""), _enclosing_section_slug(img, f"boss_{i}"))

    # ── Dungeon header image ──
    for img in root.xpath('//img[contains(@class, "boss--render")]'):
        save_base64_image(img.get("src", ""), f"{dungeon_slug_clean}_header")

    # ── Mob icons ──
    for i, icon_img in enumerate(root.xpath('//div[contains(@class, "mob-icon")]/img')):
        # The mob name is looked up under the icon div's parent element.
        mob_header_div = icon_img.getparent().getparent()
        name_matches = (
            mob_header_div.xpath('.//h4[contains(@class, "mob-name")]')
            if mob_header_div is not None
            else []
        )
        if name_matches:
            mob_name_clean = slugify(_element_text(name_matches[0]))
        else:
            mob_name_clean = f"mob_{i}"
        save_base64_image(icon_img.get("src", ""), f"icon_{mob_name_clean}")

    # ── Wowhead spell links ──
    _print_banner("=== SPELLS WITH CONTEXT ===")
    spell_links = root.xpath('//a[contains(@href, "wowhead.com") and contains(@href, "spell")]')
    for link in spell_links:
        href = link.get("href", "")
        spell_name = _element_text(link)

        # Context is the text of the link's parent, whitespace-normalised and
        # capped at 300 characters.
        parent = link.getparent()
        if parent is not None:
            context = re.sub(r"\s+", " ", _element_text(parent))[:300]
        else:
            context = ""

        spell_id_match = re.search(r"spell=(\d+)", href)
        spell_id = spell_id_match.group(1) if spell_id_match else "unknown"

        print(f"\n  Spell: {spell_name} (ID: {spell_id})")
        print(f"  Context: {context}")

    # ── Dungeon selector images ──
    _print_banner("=== DUNGEON SELECTOR IMAGES ===")
    for link in root.xpath('//a[contains(@class, "boss-guide-link")]'):
        img = link.find(".//img")
        if img is None:
            continue
        alt = img.get("alt", "")
        # The alt text carries a " Mythic+ Guide" suffix; drop it for the name.
        dungeon_name_clean = slugify(re.sub(r" Mythic\+ Guide", "", alt))
        saved = save_base64_image(img.get("src", ""), f"dungeon_{dungeon_name_clean}")
        if saved:
            print(f"    {alt}")


if __name__ == "__main__":
    # The HTML file path is the single required positional argument.
    if len(sys.argv) >= 2:
        main(sys.argv[1])
    else:
        print("Usage: poetry run python parse_dungeon.py dungeons/<filename>.html")
        sys.exit(1)