diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py index 0d6a041..23acf29 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader_old_anthem.py @@ -1,122 +1,122 @@ -from __future__ import annotations +# from __future__ import annotations -import re +# import re -import scrapy -from scrapy.http import TextResponse +# import scrapy +# from scrapy.http import TextResponse -from wikipedia_country_scraper.items import WikipediaCountryScraperItem +# from wikipedia_country_scraper.items import WikipediaCountryScraperItem -class CountrydownloaderSpider(scrapy.Spider): - name = "CountrydownloaderSpider" +# class CountrydownloaderSpider(scrapy.Spider): +# name = "CountrydownloaderSpider" - def start_requests(self): - return [ - scrapy.Request( - url="https://en.wikipedia.org/wiki/List_of_sovereign_states", callback=self.extract_country_urls - ) - ] +# def start_requests(self): +# return [ +# scrapy.Request( +# url="https://en.wikipedia.org/wiki/List_of_sovereign_states", callback=self.extract_country_urls +# ) +# ] - def extract_country_urls(self, response: TextResponse): - country_urls_xpath = response.xpath( - "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href" - ).getall() +# def extract_country_urls(self, response: TextResponse): +# country_urls_xpath = response.xpath( +# "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href" +# ).getall() - for url in country_urls_xpath: - # for url in 
country_urls_xpath[:3]: - regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url) - yield scrapy.Request( - url=f"https://en.wikipedia.org{url}", - callback=self.extract_country_information, - cb_kwargs={ - "country_item": { - "country_url": f"https://en.wikipedia.org{url}", - "short_country_name": regex_match["short_country_name"] - if isinstance(regex_match, re.Match) - else None, - } - }, - ) +# for url in country_urls_xpath: +# # for url in country_urls_xpath[:3]: +# regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url) +# yield scrapy.Request( +# url=f"https://en.wikipedia.org{url}", +# callback=self.extract_country_information, +# cb_kwargs={ +# "country_item": { +# "country_url": f"https://en.wikipedia.org{url}", +# "short_country_name": regex_match["short_country_name"] +# if isinstance(regex_match, re.Match) +# else None, +# } +# }, +# ) - def extract_country_information(self, response: TextResponse, country_item: dict): - country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall() +# def extract_country_information(self, response: TextResponse, country_item: dict): +# country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall() - flag_image_url = response.xpath( - "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href" - ).get() - flag_description_url = response.xpath( - "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href" - ).get() +# flag_image_url = response.xpath( +# "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href" +# ).get() +# flag_description_url = response.xpath( +# "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href" +# ).get() - anthem_page_url = response.xpath( - "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href" - ).get() +# anthem_page_url = response.xpath( +# "//table[contains(@class, 
'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href" +# ).get() - country_item = { - **country_item, - "country": country_information_xpath, - } +# country_item = { +# **country_item, +# "country": country_information_xpath, +# } - yield scrapy.Request( - url=f"https://en.wikipedia.org{flag_description_url}", - callback=self.extract_flag_description, - cb_kwargs={ - "country_item": country_item, - "urls": { - "flag_image_url": f"https://en.wikipedia.org{flag_image_url}", - "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}", - }, - }, - ) +# yield scrapy.Request( +# url=f"https://en.wikipedia.org{flag_description_url}", +# callback=self.extract_flag_description, +# cb_kwargs={ +# "country_item": country_item, +# "urls": { +# "flag_image_url": f"https://en.wikipedia.org{flag_image_url}", +# "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}", +# }, +# }, +# ) - def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict): - flag_description_xpath = response.xpath( - "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]" - ).get() - country_item = {**country_item, "flag_description": flag_description_xpath} +# def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict): +# flag_description_xpath = response.xpath( +# "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]" +# ).get() +# country_item = {**country_item, "flag_description": flag_description_xpath} - yield scrapy.Request( - url=urls["flag_image_url"], - callback=self.extract_flag_images, - cb_kwargs={ - "country_item": country_item, - "urls": urls, - }, - ) +# yield scrapy.Request( +# url=urls["flag_image_url"], +# callback=self.extract_flag_images, +# cb_kwargs={ +# "country_item": country_item, +# "urls": urls, +# }, +# ) - def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict): - 
flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get() - country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"} +# def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict): +# flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get() +# country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"} - yield scrapy.Request( - url=urls["anthem_page_url"], - callback=self.extract_anthem_file, - cb_kwargs={ - "country_item": country_item, - "urls": urls, - }, - ) +# yield scrapy.Request( +# url=urls["anthem_page_url"], +# callback=self.extract_anthem_file, +# cb_kwargs={ +# "country_item": country_item, +# "urls": urls, +# }, +# ) - def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict): - anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get() - _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall() +# def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict): +# anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get() +# _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall() - anthem_file_url = next( - (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None - ) +# anthem_file_url = next( +# (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None +# ) - country_scrapy_item = WikipediaCountryScraperItem() - country_scrapy_item["country_url"] = country_item["country_url"] - country_scrapy_item["short_country_name"] = country_item["short_country_name"] - country_scrapy_item["country"] = country_item["country"] - country_scrapy_item["flag_description"] = country_item["flag_description"] - country_scrapy_item["anthem"] = anthem_text - country_scrapy_item["anthem_url"] = urls["anthem_page_url"] - 
country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}" - country_scrapy_item["file_urls"] = [ - country_item["flag_image_url"], - f"https://en.wikipedia.org{anthem_file_url}", - ] +# country_scrapy_item = WikipediaCountryScraperItem() +# country_scrapy_item["country_url"] = country_item["country_url"] +# country_scrapy_item["short_country_name"] = country_item["short_country_name"] +# country_scrapy_item["country"] = country_item["country"] +# country_scrapy_item["flag_description"] = country_item["flag_description"] +# country_scrapy_item["anthem"] = anthem_text +# country_scrapy_item["anthem_url"] = urls["anthem_page_url"] +# country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}" +# country_scrapy_item["file_urls"] = [ +# country_item["flag_image_url"], +# f"https://en.wikipedia.org{anthem_file_url}", +# ] - yield country_scrapy_item +# yield country_scrapy_item