initial commit
This commit is contained in:
175
parse_dungeon.py
Normal file
175
parse_dungeon.py
Normal file
@@ -0,0 +1,175 @@
|
||||
"""Parse dungeon HTML files from method.gg to extract boss info, abilities, and assets.
|
||||
|
||||
Usage: poetry run python parse_dungeon.py dungeons/<filename>.html
|
||||
"""
|
||||
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
from lxml import etree
|
||||
|
||||
|
||||
def slugify(text: str) -> str:
    """Turn arbitrary text into a lowercase, underscore-separated identifier.

    Every run of characters outside ``[a-zA-Z0-9]`` acts as a separator;
    leading and trailing separators are dropped. Text without any ASCII
    alphanumerics yields an empty string.
    """
    # Collect the alphanumeric runs instead of substituting the gaps between
    # them; joining the runs gives the same single-underscore separation.
    tokens = re.findall(r"[a-zA-Z0-9]+", text)
    joined = "_".join(tokens)
    return joined.lower()
|
||||
|
||||
|
||||
def save_base64_image(src: str, filename: str) -> str | None:
|
||||
"""Extract base64 image data and save to file. Returns filename if saved."""
|
||||
if not src.startswith("data:image/"):
|
||||
return None
|
||||
match = re.match(r"data:image/(\w+);base64,(.*)", src)
|
||||
if not match:
|
||||
return None
|
||||
ext = match.group(1)
|
||||
if ext == "jpeg":
|
||||
ext = "jpg"
|
||||
data = base64.b64decode(match.group(2))
|
||||
full_path = f"assets/{filename}.{ext}"
|
||||
with open(full_path, "wb") as f:
|
||||
f.write(data)
|
||||
print(f" Saved: {full_path} ({len(data)} bytes)")
|
||||
return full_path
|
||||
|
||||
|
||||
def _element_text(element) -> str:
    """Return the stripped text content of an lxml element, without its tail."""
    # with_tail=False: tostring() otherwise appends the text that FOLLOWS the
    # element (e.g. the rest of the sentence after an <a>), which is not part
    # of the element itself.
    return etree.tostring(
        element, method="text", encoding="unicode", with_tail=False
    ).strip()


def _print_banner(heading: str) -> None:
    """Print a heading framed by 80-column rules, preceded by a blank line."""
    print("\n" + "=" * 80)
    print(heading)
    print("=" * 80)


def main(html_file: str) -> None:
    """Parse a saved dungeon guide page, print its contents and save its images.

    Prints the dungeon title, the guide section headings, the (truncated) text
    of every section, and every wowhead spell link with surrounding context.
    Base64-embedded boss images, the header render, mob icons and dungeon
    selector images are written to ``assets/`` via ``save_base64_image``.

    Args:
        html_file: Path to the HTML file to parse.
    """
    # The dungeon slug comes from the file name and names the header image.
    dungeon_slug = os.path.splitext(os.path.basename(html_file))[0]
    dungeon_slug_clean = slugify(dungeon_slug)

    tree = etree.parse(html_file, etree.HTMLParser())
    root = tree.getroot()

    os.makedirs("assets", exist_ok=True)

    # ── Dungeon title ──
    for title_el in root.xpath('//h1[contains(@class, "main-title")]'):
        print("=== DUNGEON TITLE ===")
        print(_element_text(title_el))

    # ── Guide sections ──
    print("\n=== GUIDE SECTIONS ===")
    for heading in root.xpath('//div[contains(@class, "guide-section-title")]/h2'):
        print(f"  {_element_text(heading)}")

    # ── Full section content ──
    _print_banner("=== FULL SECTION CONTENT ===")
    for section_title_div in root.xpath('//div[contains(@class, "guide-section-title")]'):
        h2 = section_title_div.find(".//h2")
        if h2 is None:
            continue
        # The title div's parent is treated as the container of the section.
        container = section_title_div.getparent()
        if container is None:
            continue

        full_text = _element_text(container)
        full_text = re.sub(r"\n\s*\n", "\n\n", full_text)  # collapse blank-line runs
        full_text = re.sub(r" +", " ", full_text)  # collapse runs of spaces

        print(f"\n{'─' * 60}")
        print(f"SECTION: {_element_text(h2)}")
        print(f"{'─' * 60}")
        print(full_text[:3000])
        if len(full_text) > 3000:
            print(f"... [truncated, total {len(full_text)} chars]")

    # ── Boss images ──
    _print_banner("=== EXTRACTING BOSS IMAGES ===")
    for i, img in enumerate(root.xpath('//img[contains(@class, "section-boss")]')):
        section_name = f"boss_{i}"
        # Walk outwards until an ancestor contains a section title. The class
        # test uses contains() so it agrees with the section selectors above.
        for ancestor in img.iterancestors():
            headings = ancestor.xpath(
                './/div[contains(@class, "guide-section-title")]//h2'
            )
            if headings:
                # Keep the positional fallback if the title slugifies to "".
                section_name = slugify(_element_text(headings[0])) or section_name
                break
        save_base64_image(img.get("src", ""), section_name)

    # ── Dungeon header image ──
    # Every match is written to the same file name, so the last one wins.
    for img in root.xpath('//img[contains(@class, "boss--render")]'):
        save_base64_image(img.get("src", ""), f"{dungeon_slug_clean}_header")

    # ── Mob icons ──
    for i, icon_img in enumerate(root.xpath('//div[contains(@class, "mob-icon")]/img')):
        # The mob name is looked up under the grandparent of the icon <img>.
        mob_header_div = icon_img.getparent().getparent()
        name_el = (
            mob_header_div.find('.//h4[@class="mob-name"]')
            if mob_header_div is not None
            else None
        )
        if name_el is not None:
            mob_name_clean = slugify(_element_text(name_el))
        else:
            mob_name_clean = f"mob_{i}"
        save_base64_image(icon_img.get("src", ""), f"icon_{mob_name_clean}")

    # ── Wowhead spell links ──
    _print_banner("=== SPELLS WITH CONTEXT ===")
    spell_links = root.xpath(
        '//a[contains(@href, "wowhead.com") and contains(@href, "spell")]'
    )
    for link in spell_links:
        href = link.get("href", "")
        spell_name = _element_text(link)

        # Context is the whitespace-normalised text of the link's parent,
        # capped at 300 characters.
        parent = link.getparent()
        if parent is not None:
            context = re.sub(r"\s+", " ", _element_text(parent))[:300]
        else:
            context = ""

        spell_id_match = re.search(r"spell=(\d+)", href)
        spell_id = spell_id_match.group(1) if spell_id_match else "unknown"

        print(f"\n  Spell: {spell_name} (ID: {spell_id})")
        print(f"  Context: {context}")

    # ── Dungeon selector images ──
    _print_banner("=== DUNGEON SELECTOR IMAGES ===")
    for link in root.xpath('//a[contains(@class, "boss-guide-link")]'):
        img = link.find(".//img")
        if img is None:
            continue
        alt = img.get("alt", "")
        # The alt text minus the " Mythic+ Guide" suffix names the file.
        dungeon_name = re.sub(r" Mythic\+ Guide", "", alt)
        saved = save_base64_image(
            img.get("src", ""), f"dungeon_{slugify(dungeon_name)}"
        )
        if saved:
            print(f"  {alt}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The script needs at least one positional argument: the HTML file to
    # parse. Any further arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: poetry run python parse_dungeon.py dungeons/<filename>.html")
        sys.exit(1)
    main(cli_args[0])
|
||||
Reference in New Issue
Block a user