chore: try get anthem from page
@@ -24,7 +24,7 @@ class CountrydownloaderSpider(scrapy.Spider):
         ).getall()
 
         for url in country_urls_xpath:
-        # for url in country_urls_xpath[:3]:
+        # for url in country_urls_xpath[:3]:
             regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
             yield scrapy.Request(
                 url=f"https://en.wikipedia.org{url}",
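The loop above derives each country's short name from its /wiki/ href via a named capture group. A minimal standalone check of that pattern, with a hypothetical sample href (not taken from an actual crawl):

import re

# Same pattern as the spider; "[^$]*" simply means "any run of
# characters other than a literal '$'", i.e. the rest of the href.
url = "/wiki/Albania"
regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
if regex_match:
    print(regex_match.group("short_country_name"))  # Albania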
@@ -49,17 +49,13 @@ class CountrydownloaderSpider(scrapy.Spider):
             "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
         ).get()
 
-        anthem_file_url = response.xpath(
-            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//source[contains(@type, 'audio/ogg')]/@src"
-        ).get()
-        anthem_item = response.xpath(
-            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]"
+        anthem_page_url = response.xpath(
+            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]/a/@href"
         ).get()
 
         country_item = {
             **country_item,
             "country": country_information_xpath,
-            "anthem": anthem_item,
         }
 
         yield scrapy.Request(
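The replacement XPath reads the anthem's own page link out of the infobox instead of the embedded audio/ogg source. It can be exercised outside the spider with parsel, the selector library Scrapy itself uses; the infobox HTML below is a stripped-down stand-in for real Wikipedia markup:

from parsel import Selector

html = """
<table class="infobox"><tbody>
  <tr><td class="anthem"><a href="/wiki/Himni_i_Flamurit">Himni i Flamurit</a></td></tr>
</tbody></table>
"""

selector = Selector(text=html)
anthem_page_url = selector.xpath(
    "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]/a/@href"
).get()
print(anthem_page_url)  # /wiki/Himni_i_Flamurit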
@@ -69,7 +65,7 @@ class CountrydownloaderSpider(scrapy.Spider):
                 "country_item": country_item,
                 "urls": {
                     "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
-                    "anthem_file_url": f"https:{anthem_file_url}",
+                    "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",
                 },
             },
         )
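The urls dict travels through cb_kwargs, Scrapy's mechanism for passing state to the next callback: every key of the dict arrives as a keyword argument of that callback. A toy spider illustrating the hand-off (spider name, URL, and payload are illustrative only):

import scrapy

class ChainSpider(scrapy.Spider):
    name = "chain"
    start_urls = ["https://example.com"]

    def parse(self, response):
        # The dict passed here becomes keyword arguments of parse_detail.
        yield scrapy.Request(
            url="https://example.com/detail",
            callback=self.parse_detail,
            cb_kwargs={"country_item": {"country": "Albania"}},
        )

    def parse_detail(self, response, country_item):
        # Same dict, one request later.
        self.logger.info("carried over: %s", country_item)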
@@ -93,26 +89,28 @@ class CountrydownloaderSpider(scrapy.Spider):
         flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
         country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}
 
-        country_scrapy_item = WikipediaCountryScraperItem()
+        yield scrapy.Request(
+            url=urls["anthem_file_url"],
+            callback=self.extract_anthem_file,
+            cb_kwargs={
+                "country_item": country_item,
+                "urls": urls,
+            },
+        )
+
+    def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
+        anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
+        anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").get()
+
+        country_scrapy_item = WikipediaCountryScraperItem()
         country_scrapy_item["country_url"] = country_item["country_url"]
         country_scrapy_item["short_country_name"] = country_item["short_country_name"]
         country_scrapy_item["country"] = country_item["country"]
         country_scrapy_item["flag_description"] = country_item["flag_description"]
-        country_scrapy_item["anthem"] = country_item["anthem"]
+        country_scrapy_item["anthem"] = anthem_text
+        country_scrapy_item["file_urls"] = [
+            country_item["flag_image_url"],
+            f"https://en.wikipedia.org{anthem_file_url}",
+        ]
 
-        country_scrapy_item["file_urls"] = [country_item["flag_image_url"], urls["anthem_file_url"]]
-
         yield country_scrapy_item
-
-        # def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
-        #     anthem_file_xpath = response.xpath(
-        #         "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
-        #     ).get()
-
-        #     country_scrapy_item = WikipediaCountryScraperItem()
-        #     country_scrapy_item["country_url"] = country_item["country_url"]
-        #     country_scrapy_item["short_country_name"] = country_item["short_country_name"]
-        #     country_scrapy_item["country"] = country_item["country"]
-        #     country_scrapy_item["flag_description"] = country_item["flag_description"]
-        #     country_scrapy_item["file_urls"] = [country_item["flag_image_url"], f"https:{anthem_file_xpath}"]
-        yield
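Populating file_urls only triggers downloads if FilesPipeline is enabled. Assuming this project relies on Scrapy's stock pipeline and its default field names (file_urls in, files out), the settings.py side would look roughly like:

# settings.py (sketch): FilesPipeline reads each item's "file_urls",
# downloads the files, and records the results under "files".
ITEM_PIPELINES = {
    "scrapy.pipelines.files.FilesPipeline": 1,
}
FILES_STORE = "downloaded_files"  # local target directory; path is an example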