chore: comment out backup spider

This commit is contained in:
2022-06-24 21:52:54 +01:00
parent f9d364506a
commit 59df2f02dd

View File

@@ -1,122 +1,122 @@
from __future__ import annotations # from __future__ import annotations
import re # import re
import scrapy # import scrapy
from scrapy.http import TextResponse # from scrapy.http import TextResponse
from wikipedia_country_scraper.items import WikipediaCountryScraperItem # from wikipedia_country_scraper.items import WikipediaCountryScraperItem
class CountrydownloaderSpider(scrapy.Spider):
    """Crawl Wikipedia's list of sovereign states and emit one item per country.

    The crawl is a chain of callbacks, each forwarding the partially-built
    ``country_item`` dict via ``cb_kwargs``:

        start_requests -> extract_country_urls -> extract_country_information
        -> extract_flag_description -> extract_flag_images -> extract_anthem_file

    The final callback yields a ``WikipediaCountryScraperItem`` whose
    ``file_urls`` field feeds Scrapy's FilesPipeline (flag image + anthem audio).
    """

    name = "CountrydownloaderSpider"

    # Every href scraped from the pages is site-relative; resolve against this host.
    BASE_URL = "https://en.wikipedia.org"

    def start_requests(self):
        """Seed the crawl with the list-of-sovereign-states index page."""
        return [
            scrapy.Request(
                url="https://en.wikipedia.org/wiki/List_of_sovereign_states",
                callback=self.extract_country_urls,
            )
        ]

    def extract_country_urls(self, response: TextResponse):
        """Extract each country's page link from the index table and follow it.

        Yields one request per country, carrying the country URL and the
        short (URL-path) country name in ``cb_kwargs``.
        """
        # NOTE(review): "td[1 and contains(...)]" is a *boolean* predicate
        # (1 is truthy), equivalent to td[contains(...)] — not positional
        # td[1]. If first-cell-only was intended, it should read
        # td[position()=1 and contains(...)]. Left unchanged to preserve
        # current behavior; confirm intent against the live page.
        country_urls = response.xpath(
            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
        ).getall()
        for url in country_urls:
            # Everything after "/wiki/" is the short, URL-encoded country name.
            # (Original pattern used [^$]* — '$' in a character class is a
            # literal dollar, so that was just an obscure spelling of .*)
            regex_match = re.search(r"/wiki/(?P<short_country_name>.*)", url)
            country_url = f"{self.BASE_URL}{url}"
            yield scrapy.Request(
                url=country_url,
                callback=self.extract_country_information,
                cb_kwargs={
                    "country_item": {
                        "country_url": country_url,
                        "short_country_name": regex_match["short_country_name"]
                        if regex_match is not None
                        else None,
                    }
                },
            )

    def extract_country_information(self, response: TextResponse, country_item: dict):
        """Scrape the country infobox rows plus flag/anthem link targets.

        Forwards the flag-image and anthem page URLs to the next callback.
        """
        country_information_rows = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr"
        ).getall()
        # NOTE(review): these positional XPaths assume a fixed infobox layout;
        # any of the three may come back None for some countries — confirm.
        flag_image_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
        ).get()
        flag_description_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
        ).get()
        anthem_page_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
        ).get()
        country_item = {
            **country_item,
            "country": country_information_rows,
        }
        yield scrapy.Request(
            url=f"{self.BASE_URL}{flag_description_url}",
            callback=self.extract_flag_description,
            cb_kwargs={
                "country_item": country_item,
                "urls": {
                    "flag_image_url": f"{self.BASE_URL}{flag_image_url}",
                    "anthem_page_url": f"{self.BASE_URL}{anthem_page_url}",
                },
            },
        )

    def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):
        """Grab the first non-empty paragraph of the flag page as the description."""
        flag_description = response.xpath(
            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
        ).get()
        country_item = {**country_item, "flag_description": flag_description}
        yield scrapy.Request(
            url=urls["flag_image_url"],
            callback=self.extract_flag_images,
            cb_kwargs={
                "country_item": country_item,
                "urls": urls,
            },
        )

    def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):
        """Resolve the full-resolution flag image URL from the file page."""
        # The fullImageLink href is protocol-relative ("//upload.wikimedia.org/...").
        flag_image_href = response.xpath(
            "//div[contains(@class, 'fullImageLink')]/a/@href"
        ).get()
        country_item = {**country_item, "flag_image_url": f"https:{flag_image_href}"}
        yield scrapy.Request(
            url=urls["anthem_page_url"],
            callback=self.extract_anthem_file,
            cb_kwargs={
                "country_item": country_item,
                "urls": urls,
            },
        )

    def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
        """Scrape the anthem page and assemble the final scraped item.

        Yields a ``WikipediaCountryScraperItem`` whose ``file_urls`` drives
        the FilesPipeline downloads (flag image + anthem audio, when found).
        """
        anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
        candidate_urls = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()
        # First .ogg/.oga link is the playable anthem file; may be absent.
        anthem_file_url = next(
            (file for file in candidate_urls if file.endswith((".ogg", ".oga"))), None
        )
        # Fix: previously a missing anthem file produced the garbage URL
        # "https://en.wikipedia.orgNone" in both fields; keep None instead
        # and exclude it from the FilesPipeline download list.
        anthem_file_full_url = (
            f"{self.BASE_URL}{anthem_file_url}" if anthem_file_url is not None else None
        )
        country_scrapy_item = WikipediaCountryScraperItem()
        country_scrapy_item["country_url"] = country_item["country_url"]
        country_scrapy_item["short_country_name"] = country_item["short_country_name"]
        country_scrapy_item["country"] = country_item["country"]
        country_scrapy_item["flag_description"] = country_item["flag_description"]
        country_scrapy_item["anthem"] = anthem_text
        country_scrapy_item["anthem_url"] = urls["anthem_page_url"]
        country_scrapy_item["anthem_file_url"] = anthem_file_full_url
        country_scrapy_item["file_urls"] = [
            u for u in (country_item["flag_image_url"], anthem_file_full_url) if u is not None
        ]
        yield country_scrapy_item