chore: comment out backup spider

This commit is contained in:
2022-06-24 21:52:54 +01:00
parent f9d364506a
commit 59df2f02dd

View File

@@ -1,122 +1,122 @@
from __future__ import annotations # from __future__ import annotations
import re # import re
import scrapy # import scrapy
from scrapy.http import TextResponse # from scrapy.http import TextResponse
from wikipedia_country_scraper.items import WikipediaCountryScraperItem # from wikipedia_country_scraper.items import WikipediaCountryScraperItem
class CountrydownloaderSpider(scrapy.Spider):
    """Crawl Wikipedia's list of sovereign states and emit one item per country.

    The crawl is a chain of callbacks, each forwarding the partially-built
    ``country_item`` dict via ``cb_kwargs``:

        start_requests -> extract_country_urls -> extract_country_information
        -> extract_flag_description -> extract_flag_images -> extract_anthem_file

    The final callback yields a ``WikipediaCountryScraperItem`` whose
    ``file_urls`` field feeds Scrapy's FilesPipeline (flag image + anthem audio).
    """

    name = "CountrydownloaderSpider"

    # Every href scraped from the pages is site-relative; resolve against this host.
    BASE_URL = "https://en.wikipedia.org"

    def start_requests(self):
        """Seed the crawl with the list-of-sovereign-states index page."""
        return [
            scrapy.Request(
                url="https://en.wikipedia.org/wiki/List_of_sovereign_states",
                callback=self.extract_country_urls,
            )
        ]

    def extract_country_urls(self, response: TextResponse):
        """Extract each country's page link from the index table and follow it.

        Yields one request per country, carrying the country URL and the
        short (URL-path) country name in ``cb_kwargs``.
        """
        # NOTE(review): "td[1 and contains(...)]" is a *boolean* predicate
        # (1 is truthy), equivalent to td[contains(...)] — not positional
        # td[1]. If first-cell-only was intended, it should read
        # td[position()=1 and contains(...)]. Left unchanged to preserve
        # current behavior; confirm intent against the live page.
        country_urls = response.xpath(
            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
        ).getall()
        for url in country_urls:
            # Everything after "/wiki/" is the short, URL-encoded country name.
            # (Original pattern used [^$]* — '$' in a character class is a
            # literal dollar, so that was just an obscure spelling of .*)
            regex_match = re.search(r"/wiki/(?P<short_country_name>.*)", url)
            country_url = f"{self.BASE_URL}{url}"
            yield scrapy.Request(
                url=country_url,
                callback=self.extract_country_information,
                cb_kwargs={
                    "country_item": {
                        "country_url": country_url,
                        "short_country_name": regex_match["short_country_name"]
                        if regex_match is not None
                        else None,
                    }
                },
            )

    def extract_country_information(self, response: TextResponse, country_item: dict):
        """Scrape the country infobox rows plus flag/anthem link targets.

        Forwards the flag-image and anthem page URLs to the next callback.
        """
        country_information_rows = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr"
        ).getall()
        # NOTE(review): these positional XPaths assume a fixed infobox layout;
        # any of the three may come back None for some countries — confirm.
        flag_image_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
        ).get()
        flag_description_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
        ).get()
        anthem_page_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
        ).get()
        country_item = {
            **country_item,
            "country": country_information_rows,
        }
        yield scrapy.Request(
            url=f"{self.BASE_URL}{flag_description_url}",
            callback=self.extract_flag_description,
            cb_kwargs={
                "country_item": country_item,
                "urls": {
                    "flag_image_url": f"{self.BASE_URL}{flag_image_url}",
                    "anthem_page_url": f"{self.BASE_URL}{anthem_page_url}",
                },
            },
        )

    def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):
        """Grab the first non-empty paragraph of the flag page as the description."""
        flag_description = response.xpath(
            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
        ).get()
        country_item = {**country_item, "flag_description": flag_description}
        yield scrapy.Request(
            url=urls["flag_image_url"],
            callback=self.extract_flag_images,
            cb_kwargs={
                "country_item": country_item,
                "urls": urls,
            },
        )

    def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):
        """Resolve the full-resolution flag image URL from the file page."""
        # The fullImageLink href is protocol-relative ("//upload.wikimedia.org/...").
        flag_image_href = response.xpath(
            "//div[contains(@class, 'fullImageLink')]/a/@href"
        ).get()
        country_item = {**country_item, "flag_image_url": f"https:{flag_image_href}"}
        yield scrapy.Request(
            url=urls["anthem_page_url"],
            callback=self.extract_anthem_file,
            cb_kwargs={
                "country_item": country_item,
                "urls": urls,
            },
        )

    def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
        """Scrape the anthem page and assemble the final scraped item.

        Yields a ``WikipediaCountryScraperItem`` whose ``file_urls`` drives
        the FilesPipeline downloads (flag image + anthem audio, when found).
        """
        anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
        candidate_urls = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()
        # First .ogg/.oga link is the playable anthem file; may be absent.
        anthem_file_url = next(
            (file for file in candidate_urls if file.endswith((".ogg", ".oga"))), None
        )
        # Fix: previously a missing anthem file produced the garbage URL
        # "https://en.wikipedia.orgNone" in both fields; keep None instead
        # and exclude it from the FilesPipeline download list.
        anthem_file_full_url = (
            f"{self.BASE_URL}{anthem_file_url}" if anthem_file_url is not None else None
        )
        country_scrapy_item = WikipediaCountryScraperItem()
        country_scrapy_item["country_url"] = country_item["country_url"]
        country_scrapy_item["short_country_name"] = country_item["short_country_name"]
        country_scrapy_item["country"] = country_item["country"]
        country_scrapy_item["flag_description"] = country_item["flag_description"]
        country_scrapy_item["anthem"] = anthem_text
        country_scrapy_item["anthem_url"] = urls["anthem_page_url"]
        country_scrapy_item["anthem_file_url"] = anthem_file_full_url
        country_scrapy_item["file_urls"] = [
            u for u in (country_item["flag_image_url"], anthem_file_full_url) if u is not None
        ]
        yield country_scrapy_item