chore: add capitals spider

This commit is contained in:
2022-06-26 16:32:07 +01:00
parent 101f4a4080
commit 0d2b379a28
3 changed files with 276 additions and 348 deletions

View File

@@ -4,6 +4,7 @@
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from wikipedia_country_scraper.items import WikipediaCountryScraperItem
class WikipediaCountryScraperItem(scrapy.Item):
@@ -24,3 +25,8 @@ class AnthemsItem(scrapy.Item):
file_urls = scrapy.Field()
files = scrapy.Field()
class CapitalsItem(scrapy.Item):
    """Scrapy item holding one country together with its capital(s)."""

    # Country name as scraped from the source page.
    country_name = scrapy.Field()
    # Capital name(s) for the country; the capitals spider stores a list of strings here.
    capitals = scrapy.Field()

View File

@@ -0,0 +1,37 @@
import pathlib

import scrapy
from scrapy.http import TextResponse

from wikipedia_country_scraper.items import CapitalsItem
class CapitalsSpider(scrapy.Spider):
    """Scrape country/capital pairs from Wikipedia's list of capitals in native languages.

    Every table row that yields a country name produces one ``CapitalsItem``
    whose ``capitals`` field is the list of capital names linked in that row.
    Results are exported as JSON through the ``FEEDS`` setting below.
    """

    name = "capitals"
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages"
    ]
    custom_settings = {
        "FEEDS": {
            # Output path is resolved relative to this file: its fifth ancestor
            # directory, then data/scrapy/raw_country_data/capitals.json.
            pathlib.Path(__file__).resolve().parents[4]
            / "data"
            / "scrapy"
            / "raw_country_data"
            / "capitals.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
        },
    }

    def parse(self, response: TextResponse):
        """Yield one ``CapitalsItem`` per table row that names a country.

        Country and capitals are read from the same ``<tr>``, so a row with a
        missing or extra link in its first cell cannot shift the pairing of
        the rows that follow it.
        """
        # Rows carrying this inline grey background are excluded, as in the page's
        # non-data rows.
        rows = response.xpath("//table[@class='wikitable']/tbody/tr[not(@style='background:#ccc;')]")

        for row in rows:
            # First titled link text in the first cell is taken as the country name.
            country = row.xpath("td[1]//a[@title]/text()").get()
            if country is None:
                # No country link in this row (e.g. a header row built from <th>
                # cells): there is nothing to attach capitals to.
                continue

            capital_item = CapitalsItem()
            capital_item["country_name"] = country
            # Only titled links that are direct children of the second cell count
            # as capitals; the list may be empty.
            capital_item["capitals"] = row.xpath("td[2]/a[@title]/text()").getall()
            yield capital_item