chore: add anthems spider

01_scrapy/wikipedia_country_scraper/download_anthems.sh (new executable file, +3)

@@ -0,0 +1,3 @@
#!/bin/bash
source .venv/bin/activate
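# The spider declares name = "anthems", which is the name "scrapy crawl" expects.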
scrapy crawl anthems

01_scrapy/wikipedia_country_scraper/items.py

@@ -15,3 +15,12 @@ class WikipediaCountryScraperItem(scrapy.Item):

     file_urls = scrapy.Field()
     files = scrapy.Field()
+
+
+class AnthemsItem(scrapy.Item):
+    country_name = scrapy.Field()
+    native_anthem_title = scrapy.Field()
+    english_title = scrapy.Field()
+
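+    # file_urls / files are the field names Scrapy's built-in FilesPipeline expects:
+    # it downloads every URL in file_urls and records the results under files.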
+    file_urls = scrapy.Field()
+    files = scrapy.Field()

01_scrapy/wikipedia_country_scraper/pipelines.py

@@ -26,3 +26,12 @@ class WikipediaCountryScraperFilesPipeline(FilesPipeline):
             return f"files/flags/{filename}"
         elif filename.endswith(".ogg") or filename.endswith(".oga"):
             return f"files/anthems/{filename}"
+
+
+class AnthemDownloadFilesPipeline(FilesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
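+        # Everything after the last "/" of the download URL becomes the stored file name.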
+        anthem_filename = re.search(r"([^\/]*)$", request.url)
+
+        if isinstance(anthem_filename, re.Match):
+            if (filename := anthem_filename[1]).endswith(".ogg") or filename.endswith(".oga"):
+                return f"files/anthems/{filename}"

01_scrapy/wikipedia_country_scraper/settings.py

@@ -65,10 +65,10 @@ DOWNLOADER_MIDDLEWARES = {

 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
-    # "scrapy.pipelines.files.FilesPipeline": 1
-}
+# ITEM_PIPELINES = {
+#     "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
+#     # "scrapy.pipelines.files.FilesPipeline": 1
+# }
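+# Each spider now configures ITEM_PIPELINES and FEEDS via its own custom_settings.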
 FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy" / "raw_country_data")

 # Enable and configure the AutoThrottle extension (disabled by default)

@@ -92,10 +92,10 @@ FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy

 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

-FEEDS = {
-    pathlib.Path(__file__).resolve().parents[3]
-    / "data"
-    / "scrapy"
-    / "raw_country_data"
-    / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
-}
+# FEEDS = {
+#     pathlib.Path(__file__).resolve().parents[3]
+#     / "data"
+#     / "scrapy"
+#     / "raw_country_data"
+#     / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+# }

01_scrapy/wikipedia_country_scraper/spiders/anthems.py (new file, +65)

@@ -0,0 +1,65 @@
import pathlib
import re

import scrapy
from scrapy.http import TextResponse

from wikipedia_country_scraper.items import AnthemsItem


class AnthemsSpider(scrapy.Spider):
    name = "anthems"
    start_urls = ["https://en.wikipedia.org/wiki/List_of_national_anthems"]
    custom_settings = {
        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.AnthemDownloadFilesPipeline": 100},
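        # parents[4] walks up from spiders/ to the repository root, so the feed
        # lands alongside the other scraped data in data/scrapy/raw_country_data.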
"FEEDS": {
|
||||
pathlib.Path(__file__).resolve().parents[4]
|
||||
/ "data"
|
||||
/ "scrapy"
|
||||
/ "raw_country_data"
|
||||
/ "anthems.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
|
||||
},
|
||||
}
|
||||
|
||||
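    # parse() reads the wikitable columns separately (country, native title,
    # English title, audio link) and zips them back together row by row.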
    def parse(self, response: TextResponse):
        country_names = []
        native_anthem_titles = []
        english_titles = []
        anthem_urls = []

        _country_names = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/th[1]")
        for link in _country_names:
            if (country_name := link.xpath("a/text()").get()) is not None:
                country_names.append(country_name)

        _native_anthem_titles = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/td[1]")
        for link in _native_anthem_titles:
            titles = link.xpath("a/text()").getall()
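            # A cell may hold a multi-part linked title; join the parts, or fall back to None for empty cells.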
            native_anthem_title = "\n".join(titles) if titles else None
            native_anthem_titles.append(native_anthem_title)

        for link in _native_anthem_titles:
            if (english_title := link.xpath("small/text()").get()) is not None:
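                # Strip leading punctuation (the opening quotation mark) and capture text up to the closing quote.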
                english_titles.append(re.search(r"(?:[\W]*)(?P<title>[^\"]*)", english_title)["title"])
            else:
                english_titles.append(None)

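        # The first wikitable lists recognised states (audio link in column 5),
        # the second partially recognised states (audio link in column 6).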
        _tables = response.xpath("//table[contains(@class, 'wikitable')]/tbody")
        for index, table in enumerate(_tables):
            if index == 0:
                recognised_countries = table.xpath("tr/td[5]")
                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in recognised_countries)
            elif index == 1:
                partially_recognised_countries = table.xpath("tr/td[6]")
                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in partially_recognised_countries)

        for country_name, native_anthem_title, english_title, anthem_url in zip(
            country_names, native_anthem_titles, english_titles, anthem_urls
        ):
            anthem_item = AnthemsItem()
            anthem_item["country_name"] = country_name
            anthem_item["native_anthem_title"] = native_anthem_title
            anthem_item["english_title"] = english_title
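            # FilesPipeline cannot handle a None entry, so rows without an audio link get an empty list.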
anthem_item["file_urls"] = [f"https://en.wikipedia.org{anthem_url}" if anthem_url is not None else None]
|
||||
|
||||
yield anthem_item

01_scrapy/wikipedia_country_scraper/spiders/countrydownloader.py

@@ -1,5 +1,6 @@
 from __future__ import annotations

+import pathlib
 import re

 import scrapy
@@ -10,6 +11,16 @@ from wikipedia_country_scraper.items import WikipediaCountryScraperItem

 class CountrydownloaderSpider(scrapy.Spider):
     name = "CountrydownloaderSpider"
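+    # Same pattern as AnthemsSpider: pipeline and feed move from settings.py into per-spider custom_settings.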
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 100},
+        "FEEDS": {
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }

     def start_requests(self):
         return [