chore: try alternative flag spider

2022-06-26 17:46:04 +01:00
parent b5eec4550d
commit 44a8365c53
2 changed files with 4496 additions and 19 deletions
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py
@@ -5,6 +5,9 @@ from scrapy.http import TextResponse
 from wikipedia_country_scraper.items import FlagsItem


+EXCLUDED = ["Réunion"]  # country names for which no country-page request is yielded
+
+
 class FlagsSpider(scrapy.Spider):
    name = "FlagsSpider"
    start_urls = [
@@ -30,46 +33,55 @@ class FlagsSpider(scrapy.Spider):
        ]

        for country_name, country_url in zip(country_names, country_urls):
-            yield scrapy.Request(
-                url=country_url, callback=self.get_country_page, cb_kwargs={"country_name": country_name}
-            )
+            if country_name not in EXCLUDED:
+                yield scrapy.Request(
+                    url=country_url, callback=self.get_country_page, cb_kwargs={"country_name": country_name}
+                )

    def get_country_page(self, response: TextResponse, country_name: str):
        flag_data = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr/td/div/div/div/a[not(contains(@href, 'cite_note'))]/@href"
        ).getall()

-        flag_image_url = flag_data[0]
-        flag_description_url = flag_data[1]
+        try:  # first attempt: both hrefs come from the deepest infobox path (td/div/div/div/a)
+            flag_image_url = flag_data[0]
+            flag_description_url = flag_data[1]
+        except IndexError:  # fewer than two hrefs matched; retry with shallower infobox paths
+            flag_image_data = response.xpath(
+                "//table[contains(@class, 'infobox')]/tbody/tr/td/div/div/a[not(contains(@href, 'cite_note'))]/@href"
+            ).getall()
+            flag_description_data = response.xpath(
+                "//table[contains(@class, 'infobox')]/tbody/tr/td/div/a[not(contains(@href, 'cite_note'))]/@href"
+            ).getall()
+            flag_image_url = flag_image_data[0]  # NOTE(review): unguarded; an empty match raises IndexError
+            flag_description_url = flag_description_data[1]  # NOTE(review): index 1 of a separate query, unguarded

        yield scrapy.Request(
            url=f"https://en.wikipedia.org{flag_description_url}",
            callback=self.get_flag_description,
-            cb_kwargs={"country_name": country_name},
+            cb_kwargs={  # also carries the absolute image-page URL for the callback to request next
+                "country_name": country_name,
+                "urls": {"flag_image_url": f"https://en.wikipedia.org{flag_image_url}"},
+            },
        )

-        yield scrapy.Request(
-            url=f"https://en.wikipedia.org{flag_image_url}",
-            callback=self.get_flag_image,
-            cb_kwargs={"country_name": country_name},
-        )
-
-    def get_flag_description(self, response: TextResponse, country_name: str):
+    def get_flag_description(self, response: TextResponse, country_name: str, urls: dict):
        flag_description_result = response.xpath(
            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
        ).get()

-        flags_item = FlagsItem()
-        flags_item["country_name"] = country_name
-        flags_item["flag_description_html"] = flag_description_result
+        yield scrapy.Request(  # no item is emitted here; the description is handed on to get_flag_image
+            url=urls["flag_image_url"],  # absolute URL placed in cb_kwargs by get_country_page
+            callback=self.get_flag_image,
+            cb_kwargs={"country_name": country_name, "flag_description_html": flag_description_result},
+        )

-        yield flags_item
-
-    def get_flag_image(self, response: TextResponse, country_name: str):
+    def get_flag_image(self, response: TextResponse, country_name: str, flag_description_html: str):
        flag_image_result = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()

        flags_item = FlagsItem()
        flags_item["country_name"] = country_name
+        flags_item["flag_description_html"] = flag_description_html  # passed on by get_flag_description
        flags_item["file_urls"] = [f"https:{flag_image_result}"]

        yield flags_item
--- a/playground/downloaded_data_inspection_lab/flags.json
+++ b/playground/downloaded_data_inspection_lab/flags.json