# geography-anki/playground/extract-urls/main.py

import scrapy
from scrapy.http import Response, TextResponse
from scrapy.exceptions import CloseSpider


class CountrySpider(scrapy.Spider):
    """Crawl Wikipedia's list of sovereign states and emit each country's infobox rows.

    The crawl has two hops:

    1. Fetch the "List of sovereign states" page and collect the country
       links from its sortable wikitable(s).
    2. Fetch every linked country page and yield the serialized ``<tr>``
       elements of its infobox table(s).
    """

    name = "list_of_sovereign_states"

    # Entry page of the crawl.
    LIST_URL = "https://en.wikipedia.org/wiki/List_of_sovereign_states"

    # Selects the href of the bold link in the first cell of each table row.
    # ``td[1][...]`` first picks the first <td> and then filters it by style;
    # writing ``td[1 and ...]`` would coerce ``1`` to a boolean ``true`` and
    # silently drop the positional restriction.
    # Rows whose inline style mentions "background" are skipped
    # (presumably separator/heading rows -- verify against the live page).
    # NOTE(review): the expression relies on <tbody> being present in the
    # HTML the server returns, not only in the browser-rendered DOM.
    COUNTRY_LINK_XPATH = (
        "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]"
        "/tbody/tr[not(contains(@style, 'background'))]"
        "/td[1][contains(@style, 'vertical-align:top;')]"
        "/b/a/@href"
    )

    # Selects every row of any table whose class attribute contains "infobox".
    INFOBOX_ROWS_XPATH = "//table[contains(@class, 'infobox')]/tbody/tr"

    def start_requests(self):
        """Return the single seed request for the list-of-states page."""
        return [scrapy.Request(url=self.LIST_URL, callback=self.extract_country_urls)]

    def extract_country_urls(self, response: TextResponse):
        """Return one request per country link found on the list page.

        Args:
            response: The downloaded list-of-states page.

        Returns:
            A list of ``scrapy.Request`` objects, each handled by ``parse``.
        """
        hrefs = response.xpath(self.COUNTRY_LINK_XPATH).getall()

        # urljoin resolves site-relative hrefs ("/wiki/...") against the page
        # URL and leaves already-absolute hrefs intact, which plain string
        # concatenation with the host would corrupt.
        return [scrapy.Request(url=response.urljoin(href), callback=self.parse) for href in hrefs]

    def parse(self, response: TextResponse):
        """Yield the infobox rows of a country page.

        Args:
            response: The downloaded country page.

        Yields:
            A dict ``{"country": rows}`` where ``rows`` is the list of
            serialized infobox ``<tr>`` elements (empty if none matched).
        """
        infobox_rows = response.xpath(self.INFOBOX_ROWS_XPATH).getall()

        yield {"country": infobox_rows}