chore: add flags spider

2022-06-26 17:18:31 +01:00
parent 7e95992654
commit eda5e67058
4 changed files with 98 additions and 2 deletions
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
@@ -29,3 +29,11 @@ class AnthemsItem(scrapy.Item):
 class CapitalsItem(scrapy.Item):
    country_name = scrapy.Field()
    capitals = scrapy.Field()
+
+
+class FlagsItem(scrapy.Item):
+    """Item carrying the flag data scraped for a single country."""
+
+    # Country name, taken from the link text on the country list page.
+    country_name = scrapy.Field()
+    # Raw HTML of the paragraph selected from the flag's article page.
+    flag_description_html = scrapy.Field()
+
+    # URLs to download; the flags spider stores the flag image URL here.
+    file_urls = scrapy.Field()
+    # Never assigned by the spider; presumably filled with the download results
+    # by the files pipeline ("files" is Scrapy's default result field) - confirm.
+    files = scrapy.Field()
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py
@@ -33,5 +33,19 @@ class AnthemDownloadFilesPipeline(FilesPipeline):
        flag_filename = re.search(r"([^\/]*)$", request.url)

        if isinstance(flag_filename, re.Match):
-            if (filename := flag_filename[1]).endswith("ogg") or filename.endswith("oga"):
+            if (
+                (filename := flag_filename[1]).endswith("ogg")
+                or filename.endswith("oga")
+                or filename.endswith("mp3")
+                or filename.endswith("wav")
+            ):
                return f"files/anthems/{filename}"
+
+
+class FlagDownloadFilesPipeline(FilesPipeline):
+    """Files pipeline that stores downloaded SVG flags under ``files/flags/``."""
+
+    def file_path(self, request, response=None, info=None, *, item=None):
+        """Return the storage path for a downloaded flag file.
+
+        The path is ``files/flags/<name>``, where ``<name>`` is the last path
+        segment of ``request.url``. ``None`` is returned for any URL that does
+        not end in ``.svg``.
+        """
+        # Everything after the final "/" of the URL is the file name.
+        filename = request.url.rsplit("/", 1)[-1]
+
+        # Bind the name first and test it afterwards. Writing
+        # ``filename := name.endswith(".svg")`` binds the boolean result instead,
+        # which stored every flag as "files/flags/True".
+        if filename.endswith(".svg"):
+            return f"files/flags/{filename}"
+
+        # Non-SVG downloads get no path, same as the anthem pipeline's handling
+        # of unsupported extensions.
+        return None
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py
@@ -32,7 +32,6 @@ class CapitalsSpider(scrapy.Spider):

        for country, capitals in zip(country_names, capital_names):
            capital_item = CapitalsItem()
-
            capital_item["country_name"] = country
            capital_item["capitals"] = capitals

--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py
@@ -0,0 +1,75 @@
+import pathlib
+
+import scrapy
+from scrapy.http import TextResponse
+from wikipedia_country_scraper.items import FlagsItem
+
+
+class FlagsSpider(scrapy.Spider):
+    """Collect a flag image and a flag description for every listed country.
+
+    Crawl order: the list page in ``start_urls`` -> each country page -> the
+    flag's file page (image) and the flag's article (description). Two items
+    are emitted per country: one carrying ``flag_description_html`` and one
+    carrying ``file_urls``.
+    """
+
+    name = "FlagsSpider"
+    start_urls = [
+        "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages"
+    ]
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.FlagDownloadFilesPipeline": 100},
+        "FEEDS": {
+            # parents[4] is the directory that contains "01_scrapy".
+            pathlib.Path(__file__).resolve().parents[4]
+            / "data"
+            / "scrapy"
+            / "raw_country_data"
+            / "flags.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
+        },
+    }
+
+    def parse(self, response: TextResponse):
+        """Request the page of every country linked from the list tables.
+
+        Yields one ``scrapy.Request`` per titled link in the first cell of each
+        table row, passing the link text on as ``country_name``.
+        """
+        # Rows carrying the grey background style are excluded.
+        rows = response.xpath("//table[@class='wikitable']/tbody/tr[not(@style='background:#ccc;')]")
+
+        # Name and href are read from the same anchor so they cannot get out of
+        # step when an anchor has no text node (or more than one).
+        for link in rows.xpath("td[1]//a[@title]"):
+            country_name = link.xpath("text()").get()
+            country_href = link.xpath("@href").get()
+            if country_name is None or country_href is None:
+                continue
+
+            yield scrapy.Request(
+                url=response.urljoin(country_href),
+                callback=self.get_country_page,
+                cb_kwargs={"country_name": country_name},
+            )
+
+    def get_country_page(self, response: TextResponse, country_name: str):
+        """Follow the flag links found in a country page's infobox.
+
+        The first matching link is treated as the flag's file page and the
+        second as the flag's article. Pages with fewer than two matching links
+        are logged and skipped.
+        """
+        flag_links = response.xpath(
+            "//table[contains(@class, 'infobox')]/tbody/tr/td/div/div[1]/div/a[not(contains(@href, 'cite_note'))]/@href"
+        ).getall()
+
+        # Not every infobox matches the expected layout; skip instead of
+        # raising IndexError inside the callback.
+        if len(flag_links) < 2:
+            self.logger.warning("No flag links found for %s (%s)", country_name, response.url)
+            return
+
+        flag_image_href, flag_description_href = flag_links[:2]
+
+        yield scrapy.Request(
+            url=response.urljoin(flag_description_href),
+            callback=self.get_flag_description,
+            cb_kwargs={"country_name": country_name},
+        )
+
+        yield scrapy.Request(
+            url=response.urljoin(flag_image_href),
+            callback=self.get_flag_image,
+            cb_kwargs={"country_name": country_name},
+        )
+
+    def get_flag_description(self, response: TextResponse, country_name: str):
+        """Yield an item holding the first non-empty paragraph of the flag article.
+
+        ``flag_description_html`` is the paragraph's raw HTML, or ``None`` when
+        the page has no matching paragraph.
+        """
+        # ``.get()`` returns the first match only.
+        flag_description_html = response.xpath(
+            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
+        ).get()
+
+        flags_item = FlagsItem()
+        flags_item["country_name"] = country_name
+        flags_item["flag_description_html"] = flag_description_html
+
+        yield flags_item
+
+    def get_flag_image(self, response: TextResponse, country_name: str):
+        """Yield an item whose ``file_urls`` points at the full-size flag image.
+
+        Nothing is yielded when the file page has no full-image link.
+        """
+        image_href = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
+
+        if image_href is None:
+            self.logger.warning("No flag image found for %s (%s)", country_name, response.url)
+            return
+
+        flags_item = FlagsItem()
+        flags_item["country_name"] = country_name
+        # urljoin resolves protocol-relative ("//host/path"), absolute and
+        # relative hrefs alike; prefixing "https://" by hand produced
+        # "https:////host/path" for protocol-relative links.
+        flags_item["file_urls"] = [response.urljoin(image_href)]
+
+        yield flags_item