chore: add dev playground
This commit is contained in:
27
playground/extract-urls/main.py
Normal file
27
playground/extract-urls/main.py
Normal file
@@ -0,0 +1,27 @@
|
||||
import scrapy
|
||||
from scrapy.http import Response, TextResponse
|
||||
from scrapy.exceptions import CloseSpider
|
||||
|
||||
|
||||
class CountrySpider(scrapy.Spider):
    """Playground spider: follow every country linked from Wikipedia's
    "List of sovereign states" page and emit the rows of its infobox.

    Flow: ``start_requests`` -> ``extract_country_urls`` -> ``parse``.
    """

    name = "list_of_sovereign_states"

    # Entry page holding the table of countries.
    LIST_URL = "https://en.wikipedia.org/wiki/List_of_sovereign_states"

    # href of the bold link in the FIRST cell of each table row.
    # - Rows whose inline style mentions 'background' are skipped
    #   (presumably header/separator rows -- verify against the live page).
    # - `td[1][...]` is deliberate: the earlier form `td[1 and ...]` turned the
    #   `1` into a boolean `true`, so it matched *any* styled cell rather than
    #   only the first one.
    COUNTRY_LINKS_XPATH = (
        "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]"
        "/tbody/tr[not(contains(@style, 'background'))]"
        "/td[1][contains(@style, 'vertical-align:top;')]/b/a/@href"
    )

    # Every row of any table carrying the 'infobox' class.
    INFOBOX_ROWS_XPATH = "//table[contains(@class, 'infobox')]/tbody/tr"

    def start_requests(self):
        """Return the single seed request for the list page.

        The response is routed to ``extract_country_urls`` instead of the
        default ``parse`` callback.
        """
        return [
            scrapy.Request(url=self.LIST_URL, callback=self.extract_country_urls),
        ]

    def extract_country_urls(self, response: TextResponse):
        """Build one request per country link found on the list page.

        Args:
            response: The downloaded list page.

        Returns:
            A list of ``scrapy.Request`` objects, one per extracted href, each
            handled by ``parse``.
        """
        hrefs = response.xpath(self.COUNTRY_LINKS_XPATH).getall()

        # urljoin resolves relative hrefs against the page URL and leaves
        # absolute ones intact, so the host is not hard-coded a second time.
        return [
            scrapy.Request(url=response.urljoin(href), callback=self.parse)
            for href in hrefs
        ]

    def parse(self, response: TextResponse):
        """Yield the infobox rows of a single country page.

        Args:
            response: The downloaded country page.

        Yields:
            A dict with one key, ``"country"``, mapped to the list of matched
            infobox ``<tr>`` elements serialized as strings (empty list when
            the page has no matching rows).
        """
        infobox_rows = response.xpath(self.INFOBOX_ROWS_XPATH).getall()

        yield {"country": infobox_rows}
|
||||
Reference in New Issue
Block a user