"""Parse dungeon HTML files from method.gg to extract boss info, abilities, and assets.

Usage: poetry run python parse_dungeon.py dungeons/<dungeon>.html
"""
import base64
import os
import re
import sys
from lxml import etree
def slugify(text: str) -> str:
    """Turn arbitrary text into a lowercase, underscore-separated identifier.

    Every run of characters other than ASCII letters and digits becomes a
    single underscore; leading and trailing underscores are removed.
    """
    underscored = re.sub(r"[^a-zA-Z0-9]+", "_", text)
    trimmed = underscored.strip("_")
    return trimmed.lower()
def save_base64_image(src: str, filename: str) -> str | None:
"""Extract base64 image data and save to file. Returns filename if saved."""
if not src.startswith("data:image/"):
return None
match = re.match(r"data:image/(\w+);base64,(.*)", src)
if not match:
return None
ext = match.group(1)
if ext == "jpeg":
ext = "jpg"
data = base64.b64decode(match.group(2))
full_path = f"assets/{filename}.{ext}"
with open(full_path, "wb") as f:
f.write(data)
print(f" Saved: {full_path} ({len(data)} bytes)")
return full_path
def _text_of(element) -> str:
    """Return the text content of an lxml element (markup removed), stripped."""
    return etree.tostring(element, method="text", encoding="unicode").strip()


def _print_banner(heading: str) -> None:
    """Print a blank line, then ``heading`` framed by two 80-character rules."""
    print("\n" + "=" * 80)
    print(heading)
    print("=" * 80)


def _report_titles(root) -> None:
    """Print the dungeon title(s) and the list of guide section headings."""
    for title_el in root.xpath('//h1[contains(@class, "main-title")]'):
        print("=== DUNGEON TITLE ===")
        print(_text_of(title_el))

    print("\n=== GUIDE SECTIONS ===")
    for heading in root.xpath('//div[contains(@class, "guide-section-title")]/h2'):
        print(f" {_text_of(heading)}")


def _report_section_content(root) -> None:
    """Print the text of every guide section, truncated to 3000 characters."""
    _print_banner("=== FULL SECTION CONTENT ===")
    rule = "─" * 60
    for title_div in root.xpath('//div[contains(@class, "guide-section-title")]'):
        h2 = title_div.find(".//h2")
        if h2 is None:
            continue
        # The section body is taken to be everything under the title's parent.
        container = title_div.getparent()
        if container is None:
            continue

        full_text = _text_of(container)
        # Collapse blank-line runs and repeated spaces left behind by markup.
        full_text = re.sub(r"\n\s*\n", "\n\n", full_text)
        full_text = re.sub(r" +", " ", full_text)

        print(f"\n{rule}")
        print(f"SECTION: {_text_of(h2)}")
        print(rule)
        print(full_text[:3000])
        if len(full_text) > 3000:
            print(f"... [truncated, total {len(full_text)} chars]")


def _save_boss_images(root) -> None:
    """Save each boss image, named after the nearest enclosing guide section."""
    _print_banner("=== EXTRACTING BOSS IMAGES ===")
    boss_images = root.xpath('//img[contains(@class, "section-boss")]')
    for i, img in enumerate(boss_images):
        section_name = f"boss_{i}"
        # Walk outwards until an ancestor contains a section title. The title
        # div is selected with contains(@class, ...) like the other section
        # queries in this file, so divs carrying extra classes are found too.
        for ancestor in img.iterancestors():
            headings = ancestor.xpath(
                './/div[contains(@class, "guide-section-title")]//h2'
            )
            if headings:
                # Keep the indexed fallback if the title slugifies to nothing.
                section_name = slugify(_text_of(headings[0])) or section_name
                break
        save_base64_image(img.get("src", ""), section_name)


def _save_header_images(root, dungeon_slug_clean: str) -> None:
    """Save the dungeon header render as ``<dungeon_slug_clean>_header``."""
    for img in root.xpath('//img[contains(@class, "boss--render")]'):
        save_base64_image(img.get("src", ""), f"{dungeon_slug_clean}_header")


def _save_mob_icons(root) -> None:
    """Save each mob icon as ``icon_<mob name>``, or ``icon_mob_<index>``."""
    mob_icons = root.xpath('//div[contains(@class, "mob-icon")]/img')
    for i, icon_img in enumerate(mob_icons):
        mob_name_clean = f"mob_{i}"
        # The mob name is looked up under the parent of the icon's wrapper div.
        mob_header_div = icon_img.getparent().getparent()
        if mob_header_div is not None:
            name_el = mob_header_div.find('.//h4[@class="mob-name"]')
            if name_el is not None:
                mob_name_clean = slugify(_text_of(name_el)) or mob_name_clean
        save_base64_image(icon_img.get("src", ""), f"icon_{mob_name_clean}")


def _report_spells(root) -> None:
    """Print every Wowhead spell link with its ID and surrounding text."""
    _print_banner("=== SPELLS WITH CONTEXT ===")
    spell_links = root.xpath(
        '//a[contains(@href, "wowhead.com") and contains(@href, "spell")]'
    )
    for link in spell_links:
        parent = link.getparent()
        if parent is not None:
            # Context is the parent's text, whitespace-normalised and capped.
            context = re.sub(r"\s+", " ", _text_of(parent))[:300]
        else:
            context = ""
        id_match = re.search(r"spell=(\d+)", link.get("href", ""))
        spell_id = id_match.group(1) if id_match else "unknown"
        print(f"\n Spell: {_text_of(link)} (ID: {spell_id})")
        print(f" Context: {context}")


def _save_dungeon_selector_images(root) -> None:
    """Save the thumbnail of each dungeon selector link as ``dungeon_<name>``."""
    _print_banner("=== DUNGEON SELECTOR IMAGES ===")
    for link in root.xpath('//a[contains(@class, "boss-guide-link")]'):
        img = link.find(".//img")
        if img is None:
            continue
        alt = img.get("alt", "")
        dungeon_name = re.sub(r" Mythic\+ Guide", "", alt)
        saved = save_base64_image(
            img.get("src", ""), f"dungeon_{slugify(dungeon_name)}"
        )
        if saved:
            print(f" {alt}")


def main(html_file: str) -> None:
    """Parse a saved dungeon guide page, print a report, and save its images.

    The report (dungeon title, section headings, section text, spell links)
    goes to stdout. Embedded base64 images (boss images, header render, mob
    icons, dungeon selector thumbnails) are written to the ``assets/``
    directory, which is created if missing.

    Args:
        html_file: Path to the HTML file. Its base name without extension is
            slugified and used as the prefix of the header image file name.
    """
    # Derive dungeon slug from filename
    dungeon_slug = os.path.splitext(os.path.basename(html_file))[0]
    dungeon_slug_clean = slugify(dungeon_slug)

    tree = etree.parse(html_file, etree.HTMLParser())
    root = tree.getroot()
    os.makedirs("assets", exist_ok=True)

    _report_titles(root)
    _report_section_content(root)
    _save_boss_images(root)
    _save_header_images(root, dungeon_slug_clean)
    _save_mob_icons(root)
    _report_spells(root)
    _save_dungeon_selector_images(root)
# Script entry point: the first command-line argument is the HTML file to parse.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if not cli_args:
        # No input file given: show usage and exit with a failure status.
        print("Usage: poetry run python parse_dungeon.py dungeons/.html")
        sys.exit(1)
    main(cli_args[0])