28 lines
1009 B
Python
28 lines
1009 B
Python
import scrapy
|
|
from scrapy.http import Response, TextResponse
|
|
from scrapy.exceptions import CloseSpider
|
|
|
|
|
|
class CountrySpider(scrapy.Spider):
    """Crawl Wikipedia's "List of sovereign states" and emit infobox rows.

    The crawl has three steps:

    1. ``start_requests`` fetches the list page.
    2. ``extract_country_urls`` pulls the bold links out of the first cell
       of each table row and schedules one request per link.
    3. ``parse`` yields the infobox table rows found on each linked page.
    """

    name = "list_of_sovereign_states"

    def start_requests(self):
        """Return the single seed request for the list page."""
        return [
            scrapy.Request(
                url="https://en.wikipedia.org/wiki/List_of_sovereign_states",
                callback=self.extract_country_urls,
            )
        ]

    def extract_country_urls(self, response: TextResponse):
        """Return one request per link found in the list table.

        Rows whose ``style`` attribute mentions ``background`` are skipped.
        Only bold links (``b/a``) inside the first ``td`` of a row are
        followed, and only when that cell carries the
        ``vertical-align:top;`` style.
        """
        # The position test and the style test must be separate predicates.
        # Written as ``td[1 and contains(...)]`` the ``1`` is coerced to the
        # boolean ``true`` by XPath's ``and``, which silently drops the
        # "first cell only" restriction and matches every styled cell.
        hrefs = response.xpath(
            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]"
            "/tbody/tr[not(contains(@style, 'background'))]"
            "/td[1][contains(@style, 'vertical-align:top;')]"
            "/b/a/@href"
        ).getall()

        # Resolve each href against the page that contained it rather than
        # gluing it onto a hard-coded host, so absolute and protocol-relative
        # hrefs are handled as well as site-relative ones.
        return [
            scrapy.Request(url=response.urljoin(href), callback=self.parse)
            for href in hrefs
        ]

    def parse(self, response: TextResponse):
        """Yield one item holding the infobox rows of the fetched page.

        The item is ``{"country": rows}`` where ``rows`` is the list of
        serialized ``<tr>`` elements of every table whose class contains
        ``infobox``. The list is empty when the page has no such table.
        """
        # Collect the raw infobox rows; no further field extraction is done.
        infobox_rows = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr"
        ).getall()

        yield {"country": infobox_rows}
|