diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py index a58577d..f19f175 100644 --- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py +++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/countrydownloader.py @@ -24,7 +24,7 @@ class CountrydownloaderSpider(scrapy.Spider): ).getall() for url in country_urls_xpath: - # for url in country_urls_xpath[:3]: + # for url in country_urls_xpath[:3]: regex_match = re.search(r"\/wiki\/(?P[^$]*)", url) yield scrapy.Request( url=f"https://en.wikipedia.org{url}", @@ -49,17 +49,13 @@ class CountrydownloaderSpider(scrapy.Spider): "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href" ).get() - anthem_file_url = response.xpath( - "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//source[contains(@type, 'audio/ogg')]/@src" - ).get() - anthem_item = response.xpath( - "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]" + anthem_page_url = response.xpath( + "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]/a/@href" ).get() country_item = { **country_item, "country": country_information_xpath, - "anthem": anthem_item, } yield scrapy.Request( @@ -69,7 +65,7 @@ class CountrydownloaderSpider(scrapy.Spider): "country_item": country_item, "urls": { "flag_image_url": f"https://en.wikipedia.org{flag_image_url}", - "anthem_file_url": f"https:{anthem_file_url}", + "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}", }, }, ) @@ -93,26 +89,28 @@ class CountrydownloaderSpider(scrapy.Spider): flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get() country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"} - country_scrapy_item = WikipediaCountryScraperItem() + yield scrapy.Request( + url=urls["anthem_file_url"], + callback=self.extract_anthem_file, + cb_kwargs={ + "country_item": country_item, + "urls": urls, + }, + ) + def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict): + anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get() + anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").get() + + country_scrapy_item = WikipediaCountryScraperItem() country_scrapy_item["country_url"] = country_item["country_url"] country_scrapy_item["short_country_name"] = country_item["short_country_name"] country_scrapy_item["country"] = country_item["country"] country_scrapy_item["flag_description"] = country_item["flag_description"] - country_scrapy_item["anthem"] = country_item["anthem"] + country_scrapy_item["anthem"] = anthem_text + country_scrapy_item["file_urls"] = [ + country_item["flag_image_url"], + f"https://en.wikipedia.org{anthem_file_url}", + ] - country_scrapy_item["file_urls"] = [country_item["flag_image_url"], urls["anthem_file_url"]] - - yield country_scrapy_item - - # def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict): - # anthem_file_xpath = response.xpath( - # "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href" - # ).get() - - # country_scrapy_item = WikipediaCountryScraperItem() - # country_scrapy_item["country_url"] = country_item["country_url"] - # country_scrapy_item["short_country_name"] = country_item["short_country_name"] - # country_scrapy_item["country"] = country_item["country"] - # country_scrapy_item["flag_description"] = country_item["flag_description"] - # country_scrapy_item["file_urls"] = [country_item["flag_image_url"], f"https:{anthem_file_xpath}"] + yield