chore: remove anthem from scraper

2022-06-24 20:48:56 +01:00
parent 0bd759a002
commit 34d6980cac
4 changed files with 174 additions and 53 deletions
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
@@ -9,11 +9,8 @@ import scrapy
 class WikipediaCountryScraperItem(scrapy.Item):
    country_url = scrapy.Field()
    short_country_name = scrapy.Field()
-    country = scrapy.Field()
-    flag_description = scrapy.Field()
-    anthem = scrapy.Field()
-    anthem_url = scrapy.Field()
-    anthem_file_url = scrapy.Field()
+    country_html = scrapy.Field()
+    flag_html = scrapy.Field()

    file_urls = scrapy.Field()
    files = scrapy.Field()
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py
@@ -19,13 +19,15 @@ class CountrydownloaderSpider(scrapy.Spider):
        ]

    def extract_country_urls(self, response: TextResponse):
+        """Extract urls of all countries from https://en.wikipedia.org/wiki/List_of_sovereign_states."""
+
        country_urls_xpath = response.xpath(
            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
        ).getall()

        for url in country_urls_xpath:
-            # for url in country_urls_xpath[:3]:
            regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
+
            yield scrapy.Request(
                url=f"https://en.wikipedia.org{url}",
                callback=self.extract_country_information,
@@ -40,6 +42,8 @@ class CountrydownloaderSpider(scrapy.Spider):
            )

    def extract_country_information(self, response: TextResponse, country_item: dict):
+        """Extract html of country sidebar on each country page."""
+
        country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall()

        flag_image_url = response.xpath(
@@ -49,13 +53,9 @@ class CountrydownloaderSpider(scrapy.Spider):
            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
        ).get()

-        anthem_page_url = response.xpath(
-            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
-        ).get()
-
        country_item = {
            **country_item,
-            "country": country_information_xpath,
+            "country_html": country_information_xpath,
        }

        yield scrapy.Request(
@@ -65,16 +65,18 @@ class CountrydownloaderSpider(scrapy.Spider):
                "country_item": country_item,
                "urls": {
                    "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
-                    "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",
                },
            },
        )

    def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):
+        """Extract the html of the first paragraph on each country flag page."""
+
        flag_description_xpath = response.xpath(
            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
        ).get()
-        country_item = {**country_item, "flag_description": flag_description_xpath}
+
+        country_item = {**country_item, "flag_html": flag_description_xpath}

        yield scrapy.Request(
            url=urls["flag_image_url"],
@@ -86,37 +88,18 @@ class CountrydownloaderSpider(scrapy.Spider):
        )

    def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):
+        """Extract the image URL for each country flag."""
+
        flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
+
        country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}

-        yield scrapy.Request(
-            url=urls["anthem_page_url"],
-            callback=self.extract_anthem_file,
-            cb_kwargs={
-                "country_item": country_item,
-                "urls": urls,
-            },
-        )
-
-    def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
-        anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
-        _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()
-
-        anthem_file_url = next(
-            (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None
-        )
-
+        # yield the country item containing scraped data
        country_scrapy_item = WikipediaCountryScraperItem()
        country_scrapy_item["country_url"] = country_item["country_url"]
        country_scrapy_item["short_country_name"] = country_item["short_country_name"]
-        country_scrapy_item["country"] = country_item["country"]
-        country_scrapy_item["flag_description"] = country_item["flag_description"]
-        country_scrapy_item["anthem"] = anthem_text
-        country_scrapy_item["anthem_url"] = urls["anthem_page_url"]
-        country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}"
-        country_scrapy_item["file_urls"] = [
-            country_item["flag_image_url"],
-            f"https://en.wikipedia.org{anthem_file_url}",
-        ]
+        country_scrapy_item["country_html"] = country_item["country_html"]
+        country_scrapy_item["flag_html"] = country_item["flag_html"]
+        country_scrapy_item["file_urls"] = [country_item["flag_image_url"]]

        yield country_scrapy_item
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import re
+
+import scrapy
+from scrapy.http import TextResponse
+
+from wikipedia_country_scraper.items import WikipediaCountryScraperItem
+
+
+class CountrydownloaderSpider(scrapy.Spider):  # copy of the spider that still follows the anthem page (see file name)
+    name = "CountrydownloaderSpider"  # NOTE(review): presumably identical to the live spider's name - confirm no clash
+
+    def start_requests(self):  # single seed request: the Wikipedia list of sovereign states
+        return [
+            scrapy.Request(
+                url="https://en.wikipedia.org/wiki/List_of_sovereign_states", callback=self.extract_country_urls
+            )
+        ]
+
+    def extract_country_urls(self, response: TextResponse):  # yields one Request per country href found
+        country_urls_xpath = response.xpath(
+            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
+        ).getall()
+
+        for url in country_urls_xpath:
+            # "[^$]" excludes only a literal "$", so the group captures everything after "/wiki/"
+            regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
+            yield scrapy.Request(
+                url=f"https://en.wikipedia.org{url}",
+                callback=self.extract_country_information,
+                cb_kwargs={
+                    "country_item": {
+                        "country_url": f"https://en.wikipedia.org{url}",
+                        "short_country_name": regex_match["short_country_name"]
+                        if isinstance(regex_match, re.Match)
+                        else None,  # href contained no "/wiki/" segment
+                    }
+                },
+            )
+
+    def extract_country_information(self, response: TextResponse, country_item: dict):  # infobox rows + follow-up links
+        country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall()
+
+        flag_image_url = response.xpath(
+            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
+        ).get()
+        flag_description_url = response.xpath(
+            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
+        ).get()
+
+        anthem_page_url = response.xpath(
+            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
+        ).get()
+
+        country_item = {
+            **country_item,
+            "country": country_information_xpath,
+        }
+
+        yield scrapy.Request(
+            url=f"https://en.wikipedia.org{flag_description_url}",  # NOTE(review): ends in "None" if .get() found nothing - confirm
+            callback=self.extract_flag_description,
+            cb_kwargs={
+                "country_item": country_item,
+                "urls": {  # URLs visited later in the callback chain
+                    "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
+                    "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",  # NOTE(review): presumably "...orgNone" without an anthem audio link
+                },
+            },
+        )
+
+    def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):  # flag article paragraph
+        flag_description_xpath = response.xpath(
+            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
+        ).get()
+        country_item = {**country_item, "flag_description": flag_description_xpath}
+
+        yield scrapy.Request(
+            url=urls["flag_image_url"],  # next hop: the flag image page
+            callback=self.extract_flag_images,
+            cb_kwargs={
+                "country_item": country_item,
+                "urls": urls,
+            },
+        )
+
+    def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):  # full-size flag file link
+        flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
+        country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}  # assumes a protocol-relative href - TODO confirm
+
+        yield scrapy.Request(
+            url=urls["anthem_page_url"],  # last hop: the anthem page
+            callback=self.extract_anthem_file,
+            cb_kwargs={
+                "country_item": country_item,
+                "urls": urls,
+            },
+        )
+
+    def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):  # final callback: yields the item
+        anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
+        _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()
+
+        anthem_file_url = next(  # first href ending in .ogg/.oga, or None when there is none
+            (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None
+        )
+
+        country_scrapy_item = WikipediaCountryScraperItem()
+        country_scrapy_item["country_url"] = country_item["country_url"]
+        country_scrapy_item["short_country_name"] = country_item["short_country_name"]
+        country_scrapy_item["country"] = country_item["country"]  # NOTE(review): field dropped from items.py in this commit; presumably KeyError - confirm
+        country_scrapy_item["flag_description"] = country_item["flag_description"]  # NOTE(review): field also dropped from items.py
+        country_scrapy_item["anthem"] = anthem_text  # NOTE(review): field also dropped from items.py
+        country_scrapy_item["anthem_url"] = urls["anthem_page_url"]  # NOTE(review): field also dropped from items.py
+        country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}"  # ends in "None" when no audio link matched
+        country_scrapy_item["file_urls"] = [
+            country_item["flag_image_url"],
+            f"https://en.wikipedia.org{anthem_file_url}",  # same "None" suffix case as anthem_file_url above
+        ]
+
+        yield country_scrapy_item