chore: add capitals spider
This commit is contained in:
@@ -4,6 +4,7 @@
|
||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
from wikipedia_country_scraper.items import WikipediaCountryScraperItem
|
||||
|
||||
|
||||
class WikipediaCountryScraperItem(scrapy.Item):
|
||||
@@ -24,3 +25,8 @@ class AnthemsItem(scrapy.Item):
|
||||
|
||||
file_urls = scrapy.Field()
|
||||
files = scrapy.Field()
|
||||
|
||||
|
||||
class CapitalsItem(scrapy.Item):
    """Scrapy item pairing a country with its capital(s)."""

    # Name of the country as scraped from the source page.
    country_name = scrapy.Field()
    # Capital name(s) scraped for that country.
    capitals = scrapy.Field()
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
import pathlib

import scrapy
from scrapy.http import TextResponse

from wikipedia_country_scraper.items import CapitalsItem
|
||||
|
||||
|
||||
class CapitalsSpider(scrapy.Spider):
    """Scrape country names and their capitals from a Wikipedia list page.

    Every selected table row that has a linked name in its first cell yields
    one ``CapitalsItem`` holding that name and the list of link texts found
    directly inside the row's second cell.
    """

    name = "capitals"
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages"
    ]
    custom_settings = {
        "FEEDS": {
            # The feed file location is derived from this module's resolved
            # path (``parents[4]``), so it does not depend on the current
            # working directory of the crawl.
            pathlib.Path(__file__).resolve().parents[4]
            / "data"
            / "scrapy"
            / "raw_country_data"
            / "capitals.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
        },
    }

    def parse(self, response: TextResponse):
        """Yield one ``CapitalsItem`` per table row of the response.

        Args:
            response: The downloaded page to extract rows from.

        Yields:
            CapitalsItem: ``country_name`` is the first linked text in the
            row's first ``td``; ``capitals`` is the list of texts of ``a``
            elements with a ``title`` that are direct children of the row's
            second ``td`` (possibly empty).
        """
        # Rows whose inline style is exactly "background:#ccc;" are excluded
        # (presumably the alphabetical separator rows - verify against page).
        rows = response.xpath("//table[@class='wikitable']/tbody/tr[not(@style='background:#ccc;')]")

        # Country and capitals are extracted from the SAME row so they cannot
        # drift out of alignment when a row has no (or several) country links.
        for row in rows:
            country = row.xpath("td[1]//a[@title]/text()").get()
            if country is None:
                # No linked country name in this row (e.g. a header row made
                # of ``th`` cells): nothing to pair the capitals with.
                continue

            capital_item = CapitalsItem()
            capital_item["country_name"] = country
            capital_item["capitals"] = row.xpath("td[2]/a[@title]/text()").getall()
            yield capital_item
|
||||
Reference in New Issue
Block a user