chore: update playground

2022-06-25 23:22:59 +01:00
parent d51e803baf
commit 101f4a4080
5 changed files with 2760 additions and 621 deletions
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py
@@ -1,122 +0,0 @@
-# from __future__ import annotations
-
-# import re
-
-# import scrapy
-# from scrapy.http import TextResponse
-
-# from wikipedia_country_scraper.items import WikipediaCountryScraperItem
-
-
-# class CountrydownloaderSpider(scrapy.Spider):
-#     name = "CountrydownloaderSpider"
-
-#     def start_requests(self):
-#         return [
-#             scrapy.Request(
-#                 url="https://en.wikipedia.org/wiki/List_of_sovereign_states", callback=self.extract_country_urls
-#             )
-#         ]
-
-#     def extract_country_urls(self, response: TextResponse):
-#         country_urls_xpath = response.xpath(
-#             "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
-#         ).getall()
-
-#         for url in country_urls_xpath:
-#             # for url in country_urls_xpath[:3]:
-#             regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
-#             yield scrapy.Request(
-#                 url=f"https://en.wikipedia.org{url}",
-#                 callback=self.extract_country_information,
-#                 cb_kwargs={
-#                     "country_item": {
-#                         "country_url": f"https://en.wikipedia.org{url}",
-#                         "short_country_name": regex_match["short_country_name"]
-#                         if isinstance(regex_match, re.Match)
-#                         else None,
-#                     }
-#                 },
-#             )
-
-#     def extract_country_information(self, response: TextResponse, country_item: dict):
-#         country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall()
-
-#         flag_image_url = response.xpath(
-#             "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
-#         ).get()
-#         flag_description_url = response.xpath(
-#             "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
-#         ).get()
-
-#         anthem_page_url = response.xpath(
-#             "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
-#         ).get()
-
-#         country_item = {
-#             **country_item,
-#             "country": country_information_xpath,
-#         }
-
-#         yield scrapy.Request(
-#             url=f"https://en.wikipedia.org{flag_description_url}",
-#             callback=self.extract_flag_description,
-#             cb_kwargs={
-#                 "country_item": country_item,
-#                 "urls": {
-#                     "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
-#                     "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",
-#                 },
-#             },
-#         )
-
-#     def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):
-#         flag_description_xpath = response.xpath(
-#             "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
-#         ).get()
-#         country_item = {**country_item, "flag_description": flag_description_xpath}
-
-#         yield scrapy.Request(
-#             url=urls["flag_image_url"],
-#             callback=self.extract_flag_images,
-#             cb_kwargs={
-#                 "country_item": country_item,
-#                 "urls": urls,
-#             },
-#         )
-
-#     def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):
-#         flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
-#         country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}
-
-#         yield scrapy.Request(
-#             url=urls["anthem_page_url"],
-#             callback=self.extract_anthem_file,
-#             cb_kwargs={
-#                 "country_item": country_item,
-#                 "urls": urls,
-#             },
-#         )
-
-#     def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
-#         anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
-#         _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()
-
-#         anthem_file_url = next(
-#             (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None
-#         )
-
-#         country_scrapy_item = WikipediaCountryScraperItem()
-#         country_scrapy_item["country_url"] = country_item["country_url"]
-#         country_scrapy_item["short_country_name"] = country_item["short_country_name"]
-#         country_scrapy_item["country"] = country_item["country"]
-#         country_scrapy_item["flag_description"] = country_item["flag_description"]
-#         country_scrapy_item["anthem"] = anthem_text
-#         country_scrapy_item["anthem_url"] = urls["anthem_page_url"]
-#         country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}"
-#         country_scrapy_item["file_urls"] = [
-#             country_item["flag_image_url"],
-#             f"https://en.wikipedia.org{anthem_file_url}",
-#         ]
-
-#         yield country_scrapy_item