chore: add anthems spider

2022-06-25 17:01:53 +01:00
parent e865018fd9
commit 1156976823
8 changed files with 260 additions and 146 deletions

View File

@@ -0,0 +1,3 @@
+#!/bin/bash
+source .venv/bin/activate
+scrapy crawl anthems
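One note on the invocation: `scrapy crawl` resolves spiders by their `name` attribute, which the new spider sets to "anthems", not by the class name. The same crawl can also be started from Python; a minimal sketch, assuming it is run from the project root with the virtualenv active:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load settings.py, then let Scrapy layer the spider's own
# custom_settings on top when the crawl is created.
process = CrawlerProcess(get_project_settings())
process.crawl("anthems")  # looked up by the spider's `name` attribute
process.start()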

View File

@@ -15,3 +15,12 @@ class WikipediaCountryScraperItem(scrapy.Item):
     file_urls = scrapy.Field()
     files = scrapy.Field()
+
+
+class AnthemsItem(scrapy.Item):
+    country_name = scrapy.Field()
+    native_anthem_title = scrapy.Field()
+    english_title = scrapy.Field()
+    file_urls = scrapy.Field()
+    files = scrapy.Field()
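For context, the two file-related fields follow the FilesPipeline contract: the spider fills `file_urls` with URLs to download, and the pipeline records its results in `files`. A hypothetical item with placeholder values (the URL is illustrative, not from the source):

from wikipedia_country_scraper.items import AnthemsItem

item = AnthemsItem()
item["country_name"] = "Exampleland"
item["native_anthem_title"] = "Example Anthem"
item["english_title"] = "Example Anthem"
# The pipeline downloads these and writes the outcome into item["files"],
# so the spider never sets "files" itself.
item["file_urls"] = ["https://upload.wikimedia.org/wikipedia/commons/a/ab/Example_anthem.oga"]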

View File

@@ -26,3 +26,12 @@ class WikipediaCountryScraperFilesPipeline(FilesPipeline):
             return f"files/flags/{filename}"
         elif filename.endswith(".ogg") or filename.endswith(".oga"):
             return f"files/anthems/{filename}"
+
+
+class AnthemDownloadFilesPipeline(FilesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
+        # The filename is everything after the last "/" of the media URL.
+        anthem_filename = re.search(r"([^\/]*)$", request.url)
+        if isinstance(anthem_filename, re.Match):
+            if (filename := anthem_filename[1]).endswith(".ogg") or filename.endswith(".oga"):
+                return f"files/anthems/{filename}"
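To make `file_path` concrete: the regex keeps everything after the last "/" of the media URL, i.e. the bare filename, and only .ogg/.oga files get a storage path. A quick check against a made-up URL:

import re

url = "https://upload.wikimedia.org/wikipedia/commons/a/ab/Example_anthem.oga"
match = re.search(r"([^\/]*)$", url)  # final path segment
assert match is not None
print(match[1])  # Example_anthem.oga -> stored as files/anthems/Example_anthem.oga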

View File

@@ -65,10 +65,10 @@ DOWNLOADER_MIDDLEWARES = {
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
-    # "scrapy.pipelines.files.FilesPipeline": 1
-}
+# ITEM_PIPELINES = {
+#     "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
+#     # "scrapy.pipelines.files.FilesPipeline": 1
+# }
 FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy" / "raw_country_data")
 
 # Enable and configure the AutoThrottle extension (disabled by default)
@@ -92,10 +92,10 @@ FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
-FEEDS = {
-    pathlib.Path(__file__).resolve().parents[3]
-    / "data"
-    / "scrapy"
-    / "raw_country_data"
-    / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
-}
+# FEEDS = {
+#     pathlib.Path(__file__).resolve().parents[3]
+#     / "data"
+#     / "scrapy"
+#     / "raw_country_data"
+#     / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+# }
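The global ITEM_PIPELINES and FEEDS are commented out because each spider now carries its own copies in custom_settings, which Scrapy merges over the project settings at crawl time (spider values win). A small check, assuming it is run inside the project:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
print(settings.get("FILES_STORE"))         # still defined globally, shared by both spiders
print(settings.getdict("ITEM_PIPELINES"))  # now empty at project level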

View File

@@ -0,0 +1,65 @@
+import pathlib
+import re
+
+import scrapy
+from scrapy.http import TextResponse
+
+from wikipedia_country_scraper.items import AnthemsItem
+
+
+class AnthemsSpider(scrapy.Spider):
+    name = "anthems"
+    start_urls = ["https://en.wikipedia.org/wiki/List_of_national_anthems"]
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.AnthemDownloadFilesPipeline": 100},
+        "FEEDS": {
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "anthems.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }
+
+    def parse(self, response: TextResponse):
+        country_names = []
+        native_anthem_titles = []
+        english_titles = []
+        anthem_urls = []
+
+        # Country names are the first header cell of each table row.
+        _country_names = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/th[1]")
+        for link in _country_names:
+            if (country_name := link.xpath("a/text()").get()) is not None:
+                country_names.append(country_name)
+
+        # Native-language titles are the first data cell; rows with several
+        # official languages carry one linked title each, joined with newlines.
+        _native_anthem_titles = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/td[1]")
+        for link in _native_anthem_titles:
+            titles = link.xpath("a/text()").getall()
+            native_anthem_title = titles[0] if len(titles) == 1 else "\n".join(titles)
+            native_anthem_titles.append(native_anthem_title)
+
+        # English translations sit in a <small> tag; the regex strips the
+        # leading quote/punctuation characters.
+        for link in _native_anthem_titles:
+            if (english_title := link.xpath("small/text()").get()) is not None:
+                english_titles.append(re.search(r"(?:[\W]*)(?P<title>[^\"]*)", english_title)["title"])
+            else:
+                english_titles.append(None)
+
+        # The audio link lives in column 5 of the first table (recognised
+        # countries) and column 6 of the second (partially recognised).
+        _country_names = response.xpath("//table[contains(@class, 'wikitable')]/tbody")
+        for index, link in enumerate(_country_names):
+            if index == 0:
+                recognised_countries = link.xpath("tr/td[5]")
+                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in recognised_countries)
+            elif index == 1:
+                partially_recognised_countries = link.xpath("tr/td[6]")
+                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in partially_recognised_countries)
+
+        for country_name, native_anthem_title, english_title, anthem_url in zip(
+            country_names, native_anthem_titles, english_titles, anthem_urls
+        ):
+            anthem_item = AnthemsItem()
+            anthem_item["country_name"] = country_name
+            anthem_item["native_anthem_title"] = native_anthem_title
+            anthem_item["english_title"] = english_title
+            # Skip the download when a row has no audio link.
+            anthem_item["file_urls"] = [f"https://en.wikipedia.org{anthem_url}"] if anthem_url is not None else []
+            yield anthem_item
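After a run, the feed can be spot-checked from the repository root; a sketch assuming the crawl finished and the path mirrors the FEEDS entry above:

import json
import pathlib

feed = pathlib.Path("data/scrapy/raw_country_data/anthems.json")
anthems = json.loads(feed.read_text(encoding="utf8"))
print(len(anthems), "anthem records")
print(anthems[0]["country_name"], "-", anthems[0]["english_title"])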

View File

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import pathlib
 import re
 
 import scrapy
@@ -10,6 +11,16 @@ from wikipedia_country_scraper.items import WikipediaCountryScraperItem
 class CountrydownloaderSpider(scrapy.Spider):
     name = "CountrydownloaderSpider"
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 100},
+        "FEEDS": {
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }
 
     def start_requests(self):
         return [