176 lines
5.9 KiB
Python
176 lines
5.9 KiB
Python
"""Parse dungeon HTML files from method.gg to extract boss info, abilities, and assets.
|
|
|
|
Usage: poetry run python parse_dungeon.py dungeons/<filename>.html
|
|
"""
|
|
|
|
import base64
|
|
import os
|
|
import re
|
|
import sys
|
|
|
|
from lxml import etree
|
|
|
|
|
|
def slugify(text: str) -> str:
    """Turn arbitrary text into a lowercase, underscore-separated identifier.

    Every run of characters outside ``a-z``, ``A-Z`` and ``0-9`` collapses
    into a single underscore; underscores at either end are dropped and the
    result is lowercased.
    """
    underscored = re.sub(r"[^a-zA-Z0-9]+", "_", text)
    trimmed = underscored.strip("_")
    return trimmed.lower()
|
|
|
|
|
|
def save_base64_image(src: str, filename: str) -> str | None:
|
|
"""Extract base64 image data and save to file. Returns filename if saved."""
|
|
if not src.startswith("data:image/"):
|
|
return None
|
|
match = re.match(r"data:image/(\w+);base64,(.*)", src)
|
|
if not match:
|
|
return None
|
|
ext = match.group(1)
|
|
if ext == "jpeg":
|
|
ext = "jpg"
|
|
data = base64.b64decode(match.group(2))
|
|
full_path = f"assets/{filename}.{ext}"
|
|
with open(full_path, "wb") as f:
|
|
f.write(data)
|
|
print(f" Saved: {full_path} ({len(data)} bytes)")
|
|
return full_path
|
|
|
|
|
|
# Maximum number of characters of a section's text echoed to the console.
_SECTION_PREVIEW_CHARS = 3000


def _element_text(element) -> str:
    """Return the text content of *element* and its descendants, stripped."""
    return etree.tostring(element, method="text", encoding="unicode").strip()


def _print_banner(label: str) -> None:
    """Print the full-width banner that introduces a report section."""
    print("\n" + "=" * 80)
    print(f"=== {label} ===")
    print("=" * 80)


def _report_titles_and_sections(root) -> None:
    """Print the dungeon title(s) and the list of guide section headings."""
    for heading in root.xpath('//h1[contains(@class, "main-title")]'):
        print("=== DUNGEON TITLE ===")
        print(_element_text(heading))

    print("\n=== GUIDE SECTIONS ===")
    for heading in root.xpath('//div[contains(@class, "guide-section-title")]/h2'):
        print(f" {_element_text(heading)}")


def _report_section_content(root) -> None:
    """Print a whitespace-normalised, length-capped text dump of each section."""
    _print_banner("FULL SECTION CONTENT")

    rule = "─" * 60
    for title_div in root.xpath('//div[contains(@class, "guide-section-title")]'):
        h2 = title_div.find(".//h2")
        container = title_div.getparent()
        if h2 is None or container is None:
            continue

        # The section body is taken to be everything under the title's parent.
        full_text = _element_text(container)
        full_text = re.sub(r"\n\s*\n", "\n\n", full_text)
        full_text = re.sub(r" +", " ", full_text)

        print(f"\n{rule}")
        print(f"SECTION: {_element_text(h2)}")
        print(rule)
        print(full_text[:_SECTION_PREVIEW_CHARS])
        if len(full_text) > _SECTION_PREVIEW_CHARS:
            print(f"... [truncated, total {len(full_text)} chars]")


def _section_slug_for(img, fallback: str) -> str:
    """Return the slug of the section heading nearest to *img*, else *fallback*.

    Walks up from *img* and uses the first heading found beneath the closest
    ancestor that contains any section title.
    """
    for ancestor in img.iterancestors():
        # contains(@class, ...) keeps this consistent with the other selectors;
        # an exact @class match misses title divs that carry extra classes.
        headings = ancestor.xpath(
            './/div[contains(@class, "guide-section-title")]//h2'
        )
        if headings:
            return slugify(_element_text(headings[0]))
    return fallback


def _save_boss_images(root) -> None:
    """Save every boss image, named after the guide section it belongs to."""
    _print_banner("EXTRACTING BOSS IMAGES")

    # NOTE(review): images that resolve to the same section slug are written to
    # the same path, so later ones overwrite earlier ones - confirm intended.
    for i, img in enumerate(root.xpath('//img[contains(@class, "section-boss")]')):
        save_base64_image(img.get("src", ""), _section_slug_for(img, f"boss_{i}"))


def _save_header_images(root, dungeon_slug: str) -> None:
    """Save the dungeon header render(s) as ``<dungeon_slug>_header``."""
    for img in root.xpath('//img[contains(@class, "boss--render")]'):
        save_base64_image(img.get("src", ""), f"{dungeon_slug}_header")


def _save_mob_icons(root) -> None:
    """Save each mob icon, named after the mob heading that sits next to it."""
    for i, icon in enumerate(root.xpath('//div[contains(@class, "mob-icon")]/img')):
        # The name heading is looked up under the icon wrapper's parent.
        mob_header = icon.getparent().getparent()
        names = (
            mob_header.xpath('.//h4[contains(@class, "mob-name")]')
            if mob_header is not None
            else []
        )
        mob_slug = slugify(_element_text(names[0])) if names else f"mob_{i}"
        save_base64_image(icon.get("src", ""), f"icon_{mob_slug}")


def _report_spells(root) -> None:
    """Print every Wowhead spell link with its id and surrounding text."""
    _print_banner("SPELLS WITH CONTEXT")

    spell_links = root.xpath(
        '//a[contains(@href, "wowhead.com") and contains(@href, "spell")]'
    )
    for link in spell_links:
        href = link.get("href", "")
        spell_name = _element_text(link)

        parent = link.getparent()
        if parent is not None:
            # Collapse whitespace and cap the length to keep the output readable.
            context = re.sub(r"\s+", " ", _element_text(parent))[:300]
        else:
            context = ""

        id_match = re.search(r"spell=(\d+)", href)
        spell_id = id_match.group(1) if id_match else "unknown"

        print(f"\n Spell: {spell_name} (ID: {spell_id})")
        print(f" Context: {context}")


def _save_dungeon_selector_images(root) -> None:
    """Save the thumbnails from the dungeon selector and list those saved."""
    _print_banner("DUNGEON SELECTOR IMAGES")

    for link in root.xpath('//a[contains(@class, "boss-guide-link")]'):
        img = link.find(".//img")
        if img is None:
            continue
        alt = img.get("alt", "")
        dungeon_name = alt.replace(" Mythic+ Guide", "")
        if save_base64_image(img.get("src", ""), f"dungeon_{slugify(dungeon_name)}"):
            print(f" {alt}")


def main(html_file: str) -> None:
    """Parse a saved dungeon guide page, print a report and extract its images.

    The report (title, section list, section text, spell links) goes to
    stdout. Embedded base64 images (boss art, header render, mob icons,
    dungeon selector thumbnails) are written to the ``assets/`` directory.

    Args:
        html_file: Path to the saved HTML page. Its base name, slugified, is
            used to name the header image.

    Raises:
        SystemExit: If the parser produced no root element for *html_file*.
    """
    dungeon_slug = slugify(os.path.splitext(os.path.basename(html_file))[0])

    root = etree.parse(html_file, etree.HTMLParser()).getroot()
    if root is None:
        # NOTE(review): presumably what the recovering HTML parser yields for
        # an empty file - fail with a message rather than an AttributeError.
        sys.exit(f"No HTML content could be parsed from {html_file}")

    os.makedirs("assets", exist_ok=True)

    _report_titles_and_sections(root)
    _report_section_content(root)
    _save_boss_images(root)
    _save_header_images(root, dungeon_slug)
    _save_mob_icons(root)
    _report_spells(root)
    _save_dungeon_selector_images(root)
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the first positional argument is the HTML file to parse.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: poetry run python parse_dungeon.py dungeons/<filename>.html")
        sys.exit(1)
    main(cli_args[0])
|