"""Parse dungeon HTML files from method.gg to extract boss info, abilities, and assets. Usage: poetry run python parse_dungeon.py dungeons/.html """ import base64 import os import re import sys from lxml import etree def slugify(text: str) -> str: return re.sub(r"[^a-zA-Z0-9]+", "_", text).strip("_").lower() def save_base64_image(src: str, filename: str) -> str | None: """Extract base64 image data and save to file. Returns filename if saved.""" if not src.startswith("data:image/"): return None match = re.match(r"data:image/(\w+);base64,(.*)", src) if not match: return None ext = match.group(1) if ext == "jpeg": ext = "jpg" data = base64.b64decode(match.group(2)) full_path = f"assets/{filename}.{ext}" with open(full_path, "wb") as f: f.write(data) print(f" Saved: {full_path} ({len(data)} bytes)") return full_path def main(html_file: str) -> None: # Derive dungeon slug from filename dungeon_slug = os.path.splitext(os.path.basename(html_file))[0] dungeon_slug_clean = slugify(dungeon_slug) parser = etree.HTMLParser() tree = etree.parse(html_file, parser) root = tree.getroot() os.makedirs("assets", exist_ok=True) # ── Dungeon title ── titles = root.xpath('//h1[contains(@class, "main-title")]') for t in titles: print("=== DUNGEON TITLE ===") print(etree.tostring(t, method="text", encoding="unicode").strip()) # ── Guide sections ── sections = root.xpath('//div[contains(@class, "guide-section-title")]/h2') print("\n=== GUIDE SECTIONS ===") for s in sections: print(f" {etree.tostring(s, method='text', encoding='unicode').strip()}") # ── Full section content ── print("\n" + "=" * 80) print("=== FULL SECTION CONTENT ===") print("=" * 80) guide_sections = root.xpath('//div[contains(@class, "guide-section-title")]') for section_title_div in guide_sections: h2 = section_title_div.find(".//h2") if h2 is None: continue title = etree.tostring(h2, method="text", encoding="unicode").strip() parent = section_title_div.getparent() if parent is None: continue full_text = etree.tostring(parent, method="text", encoding="unicode").strip() full_text = re.sub(r"\n\s*\n", "\n\n", full_text) full_text = re.sub(r" +", " ", full_text) print(f"\n{'─' * 60}") print(f"SECTION: {title}") print(f"{'─' * 60}") print(full_text[:3000]) if len(full_text) > 3000: print(f"... [truncated, total {len(full_text)} chars]") # ── Boss images ── print("\n" + "=" * 80) print("=== EXTRACTING BOSS IMAGES ===") print("=" * 80) boss_images = root.xpath('//img[contains(@class, "section-boss")]') for i, img in enumerate(boss_images): src = img.get("src", "") ancestor = img section_name = f"boss_{i}" while ancestor is not None: ancestor = ancestor.getparent() if ancestor is not None: title_el = ancestor.find('.//div[@class="guide-section-title"]//h2') if title_el is not None: section_name = slugify( etree.tostring(title_el, method="text", encoding="unicode").strip() ) break save_base64_image(src, section_name) # ── Dungeon header image ── header_imgs = root.xpath('//img[contains(@class, "boss--render")]') for img in header_imgs: src = img.get("src", "") save_base64_image(src, f"{dungeon_slug_clean}_header") # ── Mob icons ── mob_icons = root.xpath('//div[contains(@class, "mob-icon")]/img') for i, icon_img in enumerate(mob_icons): src = icon_img.get("src", "") mob_header_div = icon_img.getparent().getparent() name_el = ( mob_header_div.find('.//h4[@class="mob-name"]') if mob_header_div is not None else None ) if name_el is not None: mob_name = etree.tostring(name_el, method="text", encoding="unicode").strip() mob_name_clean = slugify(mob_name) else: mob_name_clean = f"mob_{i}" save_base64_image(src, f"icon_{mob_name_clean}") # ── Wowhead spell links ── print("\n" + "=" * 80) print("=== SPELLS WITH CONTEXT ===") print("=" * 80) spell_links = root.xpath('//a[contains(@href, "wowhead.com") and contains(@href, "spell")]') for link in spell_links: href = link.get("href", "") spell_name = etree.tostring(link, method="text", encoding="unicode").strip() parent = link.getparent() if parent is not None: context = etree.tostring(parent, method="text", encoding="unicode").strip() context = re.sub(r"\s+", " ", context)[:300] else: context = "" spell_id_match = re.search(r"spell=(\d+)", href) spell_id = spell_id_match.group(1) if spell_id_match else "unknown" print(f"\n Spell: {spell_name} (ID: {spell_id})") print(f" Context: {context}") # ── Dungeon selector images ── print("\n" + "=" * 80) print("=== DUNGEON SELECTOR IMAGES ===") print("=" * 80) dungeon_links = root.xpath('//a[contains(@class, "boss-guide-link")]') for link in dungeon_links: img = link.find(".//img") if img is not None: alt = img.get("alt", "") src = img.get("src", "") dungeon_name = re.sub(r" Mythic\+ Guide", "", alt) dungeon_name_clean = slugify(dungeon_name) saved = save_base64_image(src, f"dungeon_{dungeon_name_clean}") if saved: print(f" {alt}") if __name__ == "__main__": if len(sys.argv) < 2: print("Usage: poetry run python parse_dungeon.py dungeons/.html") sys.exit(1) main(sys.argv[1])