chore: try to get anthem from page
This commit is contained in:
@@ -24,7 +24,7 @@ class CountrydownloaderSpider(scrapy.Spider):
|
|||||||
).getall()
|
).getall()
|
||||||
|
|
||||||
for url in country_urls_xpath:
|
for url in country_urls_xpath:
|
||||||
# for url in country_urls_xpath[:3]:
|
# for url in country_urls_xpath[:3]:
|
||||||
regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
|
regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
|
||||||
yield scrapy.Request(
|
yield scrapy.Request(
|
||||||
url=f"https://en.wikipedia.org{url}",
|
url=f"https://en.wikipedia.org{url}",
|
||||||
@@ -49,17 +49,13 @@ class CountrydownloaderSpider(scrapy.Spider):
|
|||||||
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
|
"//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
|
||||||
).get()
|
).get()
|
||||||
|
|
||||||
anthem_file_url = response.xpath(
|
anthem_page_url = response.xpath(
|
||||||
"//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//source[contains(@type, 'audio/ogg')]/@src"
|
"//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]/a/@href"
|
||||||
).get()
|
|
||||||
anthem_item = response.xpath(
|
|
||||||
"//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]"
|
|
||||||
).get()
|
).get()
|
||||||
|
|
||||||
country_item = {
|
country_item = {
|
||||||
**country_item,
|
**country_item,
|
||||||
"country": country_information_xpath,
|
"country": country_information_xpath,
|
||||||
"anthem": anthem_item,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
yield scrapy.Request(
|
yield scrapy.Request(
|
||||||
@@ -69,7 +65,7 @@ class CountrydownloaderSpider(scrapy.Spider):
|
|||||||
"country_item": country_item,
|
"country_item": country_item,
|
||||||
"urls": {
|
"urls": {
|
||||||
"flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
|
"flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
|
||||||
"anthem_file_url": f"https:{anthem_file_url}",
|
"anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
@@ -93,26 +89,28 @@ class CountrydownloaderSpider(scrapy.Spider):
|
|||||||
flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
|
flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
|
||||||
country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}
|
country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}
|
||||||
|
|
||||||
country_scrapy_item = WikipediaCountryScraperItem()
|
yield scrapy.Request(
|
||||||
|
url=urls["anthem_file_url"],
|
||||||
|
callback=self.extract_anthem_file,
|
||||||
|
cb_kwargs={
|
||||||
|
"country_item": country_item,
|
||||||
|
"urls": urls,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
|
||||||
|
anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
|
||||||
|
anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").get()
|
||||||
|
|
||||||
|
country_scrapy_item = WikipediaCountryScraperItem()
|
||||||
country_scrapy_item["country_url"] = country_item["country_url"]
|
country_scrapy_item["country_url"] = country_item["country_url"]
|
||||||
country_scrapy_item["short_country_name"] = country_item["short_country_name"]
|
country_scrapy_item["short_country_name"] = country_item["short_country_name"]
|
||||||
country_scrapy_item["country"] = country_item["country"]
|
country_scrapy_item["country"] = country_item["country"]
|
||||||
country_scrapy_item["flag_description"] = country_item["flag_description"]
|
country_scrapy_item["flag_description"] = country_item["flag_description"]
|
||||||
country_scrapy_item["anthem"] = country_item["anthem"]
|
country_scrapy_item["anthem"] = anthem_text
|
||||||
|
country_scrapy_item["file_urls"] = [
|
||||||
|
country_item["flag_image_url"],
|
||||||
|
f"https://en.wikipedia.org{anthem_file_url}",
|
||||||
|
]
|
||||||
|
|
||||||
country_scrapy_item["file_urls"] = [country_item["flag_image_url"], urls["anthem_file_url"]]
|
yield
|
||||||
|
|
||||||
yield country_scrapy_item
|
|
||||||
|
|
||||||
# def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
|
|
||||||
# anthem_file_xpath = response.xpath(
|
|
||||||
# "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
|
|
||||||
# ).get()
|
|
||||||
|
|
||||||
# country_scrapy_item = WikipediaCountryScraperItem()
|
|
||||||
# country_scrapy_item["country_url"] = country_item["country_url"]
|
|
||||||
# country_scrapy_item["short_country_name"] = country_item["short_country_name"]
|
|
||||||
# country_scrapy_item["country"] = country_item["country"]
|
|
||||||
# country_scrapy_item["flag_description"] = country_item["flag_description"]
|
|
||||||
# country_scrapy_item["file_urls"] = [country_item["flag_image_url"], f"https:{anthem_file_xpath}"]
|
|
||||||
|
|||||||
Reference in New Issue
Block a user