chore: add anthems spider

2022-06-25 17:01:53 +01:00
parent e865018fd9
commit 1156976823
8 changed files with 260 additions and 146 deletions

View File

@@ -0,0 +1,3 @@
+#!/bin/bash
+source .venv/bin/activate
+scrapy crawl anthems
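One note on the invocation: `scrapy crawl` resolves spiders by their `name` attribute, which the new spider sets to "anthems", not by the class name. The same crawl can also be started from Python; a minimal sketch, assuming it is run from the project root with the virtualenv active:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load settings.py, then let Scrapy layer the spider's own
# custom_settings on top when the crawl is created.
process = CrawlerProcess(get_project_settings())
process.crawl("anthems")  # looked up by the spider's `name` attribute
process.start()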

View File

@@ -15,3 +15,12 @@ class WikipediaCountryScraperItem(scrapy.Item):
     file_urls = scrapy.Field()
     files = scrapy.Field()
+
+
+class AnthemsItem(scrapy.Item):
+    country_name = scrapy.Field()
+    native_anthem_title = scrapy.Field()
+    english_title = scrapy.Field()
+    file_urls = scrapy.Field()
+    files = scrapy.Field()
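For context, the two file-related fields follow the FilesPipeline contract: the spider fills `file_urls` with URLs to download, and the pipeline records its results in `files`. A hypothetical item with placeholder values (the URL is illustrative, not from the source):

from wikipedia_country_scraper.items import AnthemsItem

item = AnthemsItem()
item["country_name"] = "Exampleland"
item["native_anthem_title"] = "Example Anthem"
item["english_title"] = "Example Anthem"
# The pipeline downloads these and writes the outcome into item["files"],
# so the spider never sets "files" itself.
item["file_urls"] = ["https://upload.wikimedia.org/wikipedia/commons/a/ab/Example_anthem.oga"]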

View File

@@ -26,3 +26,12 @@ class WikipediaCountryScraperFilesPipeline(FilesPipeline):
             return f"files/flags/{filename}"
         elif filename.endswith(".ogg") or filename.endswith(".oga"):
             return f"files/anthems/{filename}"
+
+
+class AnthemDownloadFilesPipeline(FilesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
+        # The filename is everything after the last "/" of the media URL.
+        anthem_filename = re.search(r"([^\/]*)$", request.url)
+        if isinstance(anthem_filename, re.Match):
+            if (filename := anthem_filename[1]).endswith(".ogg") or filename.endswith(".oga"):
+                return f"files/anthems/{filename}"
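To make `file_path` concrete: the regex keeps everything after the last "/" of the media URL, i.e. the bare filename, and only .ogg/.oga files get a storage path. A quick check against a made-up URL:

import re

url = "https://upload.wikimedia.org/wikipedia/commons/a/ab/Example_anthem.oga"
match = re.search(r"([^\/]*)$", url)  # final path segment
assert match is not None
print(match[1])  # Example_anthem.oga -> stored as files/anthems/Example_anthem.oga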

View File

@@ -65,10 +65,10 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
-    # "scrapy.pipelines.files.FilesPipeline": 1
-}
+# ITEM_PIPELINES = {
+#     "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
+#     # "scrapy.pipelines.files.FilesPipeline": 1
+# }
 FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy" / "raw_country_data")
 
 # Enable and configure the AutoThrottle extension (disabled by default)
@@ -92,10 +92,10 @@ FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-FEEDS = {
-    pathlib.Path(__file__).resolve().parents[3]
-    / "data"
-    / "scrapy"
-    / "raw_country_data"
-    / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
-}
+# FEEDS = {
+#     pathlib.Path(__file__).resolve().parents[3]
+#     / "data"
+#     / "scrapy"
+#     / "raw_country_data"
+#     / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+# }
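The global ITEM_PIPELINES and FEEDS are commented out because each spider now carries its own copies in custom_settings, which Scrapy merges over the project settings at crawl time (spider values win). A small check, assuming it is run inside the project:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get("FILES_STORE"))         # still defined globally, shared by both spiders
print(settings.getdict("ITEM_PIPELINES"))  # now empty at project level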

View File

@@ -0,0 +1,65 @@
+import pathlib
+import re
+
+import scrapy
+from scrapy.http import TextResponse
+
+from wikipedia_country_scraper.items import AnthemsItem
+
+
+class AnthemsSpider(scrapy.Spider):
+    name = "anthems"
+    start_urls = ["https://en.wikipedia.org/wiki/List_of_national_anthems"]
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.AnthemDownloadFilesPipeline": 100},
+        "FEEDS": {
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "anthems.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }
+
+    def parse(self, response: TextResponse):
+        country_names = []
+        native_anthem_titles = []
+        english_titles = []
+        anthem_urls = []
+
+        # Country names are the first header cell of each table row.
+        _country_names = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/th[1]")
+        for link in _country_names:
+            if (country_name := link.xpath("a/text()").get()) is not None:
+                country_names.append(country_name)
+
+        # Native-language titles are the first data cell; rows with several
+        # official languages carry one linked title each, joined with newlines.
+        _native_anthem_titles = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/td[1]")
+        for link in _native_anthem_titles:
+            titles = link.xpath("a/text()").getall()
+            native_anthem_title = titles[0] if len(titles) == 1 else "\n".join(titles)
+            native_anthem_titles.append(native_anthem_title)
+
+        # English translations sit in a <small> tag; the regex strips the
+        # leading quote/punctuation characters.
+        for link in _native_anthem_titles:
+            if (english_title := link.xpath("small/text()").get()) is not None:
+                english_titles.append(re.search(r"(?:[\W]*)(?P<title>[^\"]*)", english_title)["title"])
+            else:
+                english_titles.append(None)
+
+        # The audio link lives in column 5 of the first table (recognised
+        # countries) and column 6 of the second (partially recognised).
+        _country_names = response.xpath("//table[contains(@class, 'wikitable')]/tbody")
+        for index, link in enumerate(_country_names):
+            if index == 0:
+                recognised_countries = link.xpath("tr/td[5]")
+                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in recognised_countries)
+            elif index == 1:
+                partially_recognised_countries = link.xpath("tr/td[6]")
+                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in partially_recognised_countries)
+
+        for country_name, native_anthem_title, english_title, anthem_url in zip(
+            country_names, native_anthem_titles, english_titles, anthem_urls
+        ):
+            anthem_item = AnthemsItem()
+            anthem_item["country_name"] = country_name
+            anthem_item["native_anthem_title"] = native_anthem_title
+            anthem_item["english_title"] = english_title
+            # Skip the download when a row has no audio link.
+            anthem_item["file_urls"] = [f"https://en.wikipedia.org{anthem_url}"] if anthem_url is not None else []
+            yield anthem_item
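After a run, the feed can be spot-checked from the repository root; a sketch assuming the crawl finished and the path mirrors the FEEDS entry above:

import json
import pathlib

feed = pathlib.Path("data/scrapy/raw_country_data/anthems.json")
anthems = json.loads(feed.read_text(encoding="utf8"))
print(len(anthems), "anthem records")
print(anthems[0]["country_name"], "-", anthems[0]["english_title"])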

View File

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import pathlib
 import re
 
 import scrapy
@@ -10,6 +11,16 @@ from wikipedia_country_scraper.items import WikipediaCountryScraperItem
 class CountrydownloaderSpider(scrapy.Spider):
     name = "CountrydownloaderSpider"
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 100},
+        "FEEDS": {
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }
 
     def start_requests(self):
         return [