chore: add anthems spider

01_scrapy/wikipedia_country_scraper/download_anthems.sh (new executable file)
@@ -0,0 +1,3 @@
+#!/bin/bash
+source .venv/bin/activate
+scrapy crawl anthems

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
@@ -15,3 +15,12 @@ class WikipediaCountryScraperItem(scrapy.Item):
     file_urls = scrapy.Field()
     files = scrapy.Field()
+
+
+class AnthemsItem(scrapy.Item):
+    country_name = scrapy.Field()
+    native_anthem_title = scrapy.Field()
+    english_title = scrapy.Field()
+
+    file_urls = scrapy.Field()
+    files = scrapy.Field()
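Aside, not part of the diff: `AnthemsItem` reuses the stock `FilesPipeline` field convention, where the pipeline reads download URLs from `item["file_urls"]` and records download results in `item["files"]`. A minimal sketch of a populated item (the URL is illustrative, not taken from the crawl):

```python
from wikipedia_country_scraper.items import AnthemsItem

item = AnthemsItem()
item["country_name"] = "Albania"
item["file_urls"] = ["https://upload.wikimedia.org/wikipedia/commons/8/83/Hymni_i_Flamurit.oga"]
# After FilesPipeline runs, item["files"] holds one dict per download,
# with "url", "path", "checksum" and "status" keys.
```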

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py
@@ -26,3 +26,12 @@ class WikipediaCountryScraperFilesPipeline(FilesPipeline):
             return f"files/flags/{filename}"
         elif filename.endswith(".ogg") or filename.endswith(".oga"):
             return f"files/anthems/{filename}"
+
+
+class AnthemDownloadFilesPipeline(FilesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
+        flag_filename = re.search(r"([^\/]*)$", request.url)
+
+        if isinstance(flag_filename, re.Match):
+            if (filename := flag_filename[1]).endswith(".ogg") or filename.endswith(".oga"):
+                return f"files/anthems/{filename}"
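Not part of the diff: a standalone sketch of the `file_path` logic above, checking the filename regex and the suffix test against an illustrative URL:

```python
import re


def anthem_path(url: str) -> str | None:
    # Mirrors AnthemDownloadFilesPipeline.file_path above.
    match = re.search(r"([^\/]*)$", url)
    if isinstance(match, re.Match):
        if (filename := match[1]).endswith(".ogg") or filename.endswith(".oga"):
            return f"files/anthems/{filename}"
    return None


print(anthem_path("https://upload.wikimedia.org/wikipedia/commons/8/83/Anthem.ogg"))
# files/anthems/Anthem.ogg
print(anthem_path("https://example.org/page.html"))
# None -- FilesPipeline expects a path for every request, so non-audio
# URLs would need a fallback branch in the real pipeline.
```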

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/settings.py
@@ -65,10 +65,10 @@ DOWNLOADER_MIDDLEWARES = {

 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-ITEM_PIPELINES = {
-    "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
-    # "scrapy.pipelines.files.FilesPipeline": 1
-}
+# ITEM_PIPELINES = {
+#     "wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 300,
+#     # "scrapy.pipelines.files.FilesPipeline": 1
+# }
 FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy" / "raw_country_data")

 # Enable and configure the AutoThrottle extension (disabled by default)
@@ -92,10 +92,10 @@ FILES_STORE = str(pathlib.Path(__file__).resolve().parents[3] / "data" / "scrapy
 # HTTPCACHE_IGNORE_HTTP_CODES = []
 # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

-FEEDS = {
-    pathlib.Path(__file__).resolve().parents[3]
-    / "data"
-    / "scrapy"
-    / "raw_country_data"
-    / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
-}
+# FEEDS = {
+#     pathlib.Path(__file__).resolve().parents[3]
+#     / "data"
+#     / "scrapy"
+#     / "raw_country_data"
+#     / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+# }

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/anthems.py (new file)
@@ -0,0 +1,65 @@
+import pathlib
+import re
+
+import scrapy
+from scrapy.http import TextResponse
+
+from wikipedia_country_scraper.items import AnthemsItem
+
+
+class AnthemsSpider(scrapy.Spider):
+    name = "anthems"
+    start_urls = ["https://en.wikipedia.org/wiki/List_of_national_anthems"]
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.AnthemDownloadFilesPipeline": 100},
+        "FEEDS": {
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "anthems.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }
+
+    def parse(self, response: TextResponse):
+        country_names = []
+        native_anthem_titles = []
+        english_titles = []
+        anthem_urls = []
+
+        _country_names = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/th[1]")
+        for link in _country_names:
+            if (country_name := link.xpath("a/text()").get()) is not None:
+                country_names.append(country_name)
+
+        _native_anthem_titles = response.xpath("//table[contains(@class, 'wikitable')]/tbody/tr/td[1]")
+        for link in _native_anthem_titles:
+            titles = link.xpath("a/text()").getall()
+            native_anthem_title = titles[0] if len(titles) == 1 else "\n".join(titles)
+            native_anthem_titles.append(native_anthem_title)
+
+        for link in _native_anthem_titles:
+            if (english_title := link.xpath("small/text()").get()) is not None:
+                english_titles.append(re.search(r"(?:[\W]*)(?P<title>[^\"]*)", english_title)["title"])
+            else:
+                english_titles.append(None)
+
+        _country_names = response.xpath("//table[contains(@class, 'wikitable')]/tbody")
+        for index, link in enumerate(_country_names):
+            if index == 0:
+                recognised_countries = link.xpath("tr/td[5]")
+                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in recognised_countries)
+            elif index == 1:
+                partially_recognised_countries = link.xpath("tr/td[6]")
+                anthem_urls.extend(anthem_url.xpath("a/@href").get() for anthem_url in partially_recognised_countries)
+
+        for country_name, native_anthem_title, english_title, anthem_url in zip(
+            country_names, native_anthem_titles, english_titles, anthem_urls
+        ):
+            anthem_item = AnthemsItem()
+            anthem_item["country_name"] = country_name
+            anthem_item["native_anthem_title"] = native_anthem_title
+            anthem_item["english_title"] = english_title
+            anthem_item["file_urls"] = [f"https://en.wikipedia.org{anthem_url}" if anthem_url is not None else None]
+
+            yield anthem_item
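One caveat about the `zip()` at the end of `parse`, shown standalone below (not part of the diff): `zip()` truncates to the shortest input, so a length mismatch between the four parallel lists would silently drop anthems. On Python 3.10+, `strict=True` raises instead:

```python
country_names = ["Afghanistan", "Albania"]
anthem_urls = ["/wiki/File:A.ogg"]  # one URL short, e.g. a row without a link

for name, url in zip(country_names, anthem_urls):
    print(name, url)  # only Afghanistan is emitted; Albania is dropped

try:
    list(zip(country_names, anthem_urls, strict=True))
except ValueError as err:
    print(err)  # zip() argument 2 is shorter than argument 1
```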

01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import pathlib
 import re

 import scrapy
@@ -10,6 +11,16 @@ from wikipedia_country_scraper.items import WikipediaCountryScraperItem

 class CountrydownloaderSpider(scrapy.Spider):
     name = "CountrydownloaderSpider"
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.WikipediaCountryScraperFilesPipeline": 100},
+        "FEEDS": {
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "countries.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }

     def start_requests(self):
         return [
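Not part of the diff: the spiders use `parents[4]` where `settings.py` uses `parents[3]` because spider modules sit one directory deeper, so both expressions resolve to the same repository root. A sketch assuming the standard `scrapy startproject` layout:

```python
import pathlib

# Assumed standard Scrapy project layout:
settings_py = pathlib.Path(
    "repo/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/settings.py"
)
spider_py = pathlib.Path(
    "repo/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py"
)

assert settings_py.parents[3] == spider_py.parents[4] == pathlib.Path("repo")
```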

README.md
@@ -23,9 +23,13 @@ Using selectors:

 Download files/images:
 <https://docs.scrapy.org/en/latest/topics/media-pipeline.html>
+Setting pipelines per spider:
+<https://stackoverflow.com/a/34647090>

 Exporting JSON:
 <https://docs.scrapy.org/en/latest/topics/feed-exports.html#std-setting-FEEDS>
+Setting exports per spider:
+<https://stackoverflow.com/a/53322959>

 ### new project

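A minimal sketch of the per-spider configuration pattern those two answers describe (names here are illustrative, not from this repo): `custom_settings` is a class attribute that overrides the project-wide `settings.py` for one spider only.

```python
import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"
    # Overrides ITEM_PIPELINES and FEEDS from settings.py for this spider only.
    custom_settings = {
        "ITEM_PIPELINES": {"myproject.pipelines.ExamplePipeline": 100},
        "FEEDS": {"example.json": {"format": "json", "encoding": "utf8"}},
    }
```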
File diff suppressed because one or more lines are too long