Files

28 lines
1009 B
Python

import scrapy
from scrapy.http import Response, TextResponse
from scrapy.exceptions import CloseSpider
class CountrySpider(scrapy.Spider):
    """Spider that walks Wikipedia's list of sovereign states.

    The crawl has three steps:

    1. ``start_requests`` fetches the "List of sovereign states" page.
    2. ``extract_country_urls`` pulls the per-country article links out of
       the sortable wikitable and schedules one request per link.
    3. ``parse`` yields the raw HTML of every infobox row on a country page.
    """

    name = "list_of_sovereign_states"

    def start_requests(self):
        """Return the single seed request for the list page.

        Returns:
            A one-element list holding the request for the list of sovereign
            states, handled by ``extract_country_urls``.
        """
        return [
            scrapy.Request(
                url="https://en.wikipedia.org/wiki/List_of_sovereign_states",
                callback=self.extract_country_urls,
            )
        ]

    def extract_country_urls(self, response: TextResponse):
        """Schedule one request per country linked from the list page.

        Args:
            response: The downloaded list-of-sovereign-states page.

        Returns:
            A list of requests, one per extracted link, each handled by
            ``parse``.
        """
        # The cell predicate uses ``position() = 1`` explicitly: a bare
        # ``1 and ...`` is coerced to boolean true in XPath 1.0 and would
        # match every styled cell in the row, not just the first one.
        country_hrefs = response.xpath(
            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]"
            "/tbody/tr[not(contains(@style, 'background'))]"
            "/td[position() = 1 and contains(@style, 'vertical-align:top;')]"
            "/b/a/@href"
        ).getall()
        # ``response.follow`` resolves each href against the page URL, so
        # relative and absolute links are both handled without hard-coding
        # the host.
        return [response.follow(href, callback=self.parse) for href in country_hrefs]

    def parse(self, response: TextResponse):
        """Yield the infobox rows of a single country page.

        Args:
            response: The downloaded country article.

        Yields:
            A dict whose ``"country"`` key holds the list of serialized
            ``<tr>`` elements found in tables whose class contains
            ``infobox``.
        """
        # Collect the serialized infobox rows.
        infobox_rows = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr"
        ).getall()
        yield {"country": infobox_rows}