import scrapy
from scrapy.http import Response, TextResponse
from scrapy.exceptions import CloseSpider


class CountrySpider(scrapy.Spider):
    """Crawl Wikipedia's list of sovereign states and emit each country's infobox rows.

    The crawl has two hops: the list page is scanned for links to country
    articles, then every country article is fetched and the rows of its
    infobox table are yielded as raw HTML strings.
    """

    name = "list_of_sovereign_states"

    def start_requests(self):
        """Return the single seed request for the list-of-sovereign-states page."""
        return [
            scrapy.Request(
                url="https://en.wikipedia.org/wiki/List_of_sovereign_states",
                callback=self.extract_country_urls,
            )
        ]

    def extract_country_urls(self, response: TextResponse):
        """Build one request per country article linked from the list page.

        Args:
            response: The downloaded list page.

        Returns:
            A list of ``scrapy.Request`` objects, each handled by ``parse``.
        """
        hrefs = response.xpath(
            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]"
            # Rows carrying a background style are skipped (presumably
            # header/separator rows -- confirm against the live page markup).
            "/tbody/tr[not(contains(@style, 'background'))]"
            # `td[1 and ...]` would coerce the 1 to boolean true and match any
            # cell; chaining the predicates restricts the match to the first cell.
            "/td[1][contains(@style, 'vertical-align:top;')]"
            "/b/a/@href"
        ).getall()
        # urljoin resolves each href against the response URL instead of
        # concatenating it onto a hard-coded host.
        return [
            scrapy.Request(url=response.urljoin(href), callback=self.parse)
            for href in hrefs
        ]

    def parse(self, response: TextResponse):
        """Yield the infobox rows of a country article.

        Args:
            response: The downloaded country article page.

        Yields:
            A dict whose ``"country"`` key holds the list of serialized
            ``<tr>`` elements found in any table with the ``infobox`` class.
        """
        infobox_rows = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr"
        ).getall()
        yield {"country": infobox_rows}