chore: remove anthem from scraper
@@ -9,11 +9,8 @@ import scrapy

class WikipediaCountryScraperItem(scrapy.Item):
    country_url = scrapy.Field()
    short_country_name = scrapy.Field()
    country = scrapy.Field()
    flag_description = scrapy.Field()
    anthem = scrapy.Field()
    anthem_url = scrapy.Field()
    anthem_file_url = scrapy.Field()
    country_html = scrapy.Field()
    flag_html = scrapy.Field()

    file_urls = scrapy.Field()
    files = scrapy.Field()
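The `file_urls` and `files` fields follow the naming convention of Scrapy's built-in FilesPipeline: URLs listed in `file_urls` are downloaded automatically, and the pipeline records the results (path, checksum, original URL) in `files`. For that to happen the pipeline has to be enabled in the project settings; a minimal sketch, where the FILES_STORE value is an assumption:

# settings.py -- minimal sketch; FILES_STORE value is an assumption
ITEM_PIPELINES = {
    "scrapy.pipelines.files.FilesPipeline": 1,
}
FILES_STORE = "downloads"  # hypothetical local directory for the downloaded files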
@@ -19,13 +19,15 @@ class CountrydownloaderSpider(scrapy.Spider):
        ]

    def extract_country_urls(self, response: TextResponse):
        """Extract urls of all countries from https://en.wikipedia.org/wiki/List_of_sovereign_states."""

        country_urls_xpath = response.xpath(
            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
        ).getall()

        for url in country_urls_xpath:
            # for url in country_urls_xpath[:3]:
            regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)

            yield scrapy.Request(
                url=f"https://en.wikipedia.org{url}",
                callback=self.extract_country_information,
@@ -40,6 +42,8 @@ class CountrydownloaderSpider(scrapy.Spider):
            )

    def extract_country_information(self, response: TextResponse, country_item: dict):
        """Extract html of country sidebar on each country page."""

        country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall()

        flag_image_url = response.xpath(
@@ -49,13 +53,9 @@ class CountrydownloaderSpider(scrapy.Spider):
            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
        ).get()

        anthem_page_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
        ).get()

        country_item = {
            **country_item,
            "country": country_information_xpath,
            "country_html": country_information_xpath,
        }

        yield scrapy.Request(
@@ -65,16 +65,18 @@ class CountrydownloaderSpider(scrapy.Spider):
                "country_item": country_item,
                "urls": {
                    "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
                    "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",
                },
            },
        )

    def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):
        """Extract the html of the first paragraph on each country flag page."""

        flag_description_xpath = response.xpath(
            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
        ).get()
        country_item = {**country_item, "flag_description": flag_description_xpath}

        country_item = {**country_item, "flag_html": flag_description_xpath}

        yield scrapy.Request(
            url=urls["flag_image_url"],
@@ -86,37 +88,18 @@ class CountrydownloaderSpider(scrapy.Spider):
        )

    def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):
        """Extract the image URL for each country flag."""

        flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()

        country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}

        yield scrapy.Request(
            url=urls["anthem_page_url"],
            callback=self.extract_anthem_file,
            cb_kwargs={
                "country_item": country_item,
                "urls": urls,
            },
        )

    def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
        anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
        _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()

        anthem_file_url = next(
            (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None
        )

        # yield the country item containing scraped data
        country_scrapy_item = WikipediaCountryScraperItem()
        country_scrapy_item["country_url"] = country_item["country_url"]
        country_scrapy_item["short_country_name"] = country_item["short_country_name"]
        country_scrapy_item["country"] = country_item["country"]
        country_scrapy_item["flag_description"] = country_item["flag_description"]
        country_scrapy_item["anthem"] = anthem_text
        country_scrapy_item["anthem_url"] = urls["anthem_page_url"]
        country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}"
        country_scrapy_item["file_urls"] = [
            country_item["flag_image_url"],
            f"https://en.wikipedia.org{anthem_file_url}",
        ]
        country_scrapy_item["country_html"] = country_item["country_html"]
        country_scrapy_item["flag_html"] = country_item["flag_html"]
        country_scrapy_item["file_urls"] = [country_item["flag_image_url"]]

        yield country_scrapy_item
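One note on the `short_country_name` capture used in `extract_country_urls` above and in the full file below: inside a character class `$` is a literal character, so `[^$]*` simply grabs everything after `/wiki/`. A standalone sketch, where the sample href is hypothetical:

import re

# "/wiki/Albania" is a made-up sample href from the countries table
match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", "/wiki/Albania")
print(match["short_country_name"] if isinstance(match, re.Match) else None)  # -> Albania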
@@ -0,0 +1,122 @@
from __future__ import annotations

import re

import scrapy
from scrapy.http import TextResponse

from wikipedia_country_scraper.items import WikipediaCountryScraperItem


class CountrydownloaderSpider(scrapy.Spider):
    name = "CountrydownloaderSpider"

    def start_requests(self):
        return [
            scrapy.Request(
                url="https://en.wikipedia.org/wiki/List_of_sovereign_states", callback=self.extract_country_urls
            )
        ]

    def extract_country_urls(self, response: TextResponse):
        country_urls_xpath = response.xpath(
            "//table[contains(@class, 'sortable') and contains(@class, 'wikitable')]/tbody/tr[not(contains(@style, 'background'))]/td[1 and contains(@style, 'vertical-align:top;')]/b/a/@href"
        ).getall()

        for url in country_urls_xpath:
            # for url in country_urls_xpath[:3]:
            regex_match = re.search(r"\/wiki\/(?P<short_country_name>[^$]*)", url)
            yield scrapy.Request(
                url=f"https://en.wikipedia.org{url}",
                callback=self.extract_country_information,
                cb_kwargs={
                    "country_item": {
                        "country_url": f"https://en.wikipedia.org{url}",
                        "short_country_name": regex_match["short_country_name"]
                        if isinstance(regex_match, re.Match)
                        else None,
                    }
                },
            )

    def extract_country_information(self, response: TextResponse, country_item: dict):
        country_information_xpath = response.xpath("//table[contains(@class, 'infobox')]/tbody/tr").getall()

        flag_image_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[1]/a/@href"
        ).get()
        flag_description_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr[2]/td/div/div[1]/div[2]/a/@href"
        ).get()

        anthem_page_url = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr/td[contains(@class, 'anthem')]//span[contains(@class, 'audio')]/a/@href"
        ).get()

        country_item = {
            **country_item,
            "country": country_information_xpath,
        }

        yield scrapy.Request(
            url=f"https://en.wikipedia.org{flag_description_url}",
            callback=self.extract_flag_description,
            cb_kwargs={
                "country_item": country_item,
                "urls": {
                    "flag_image_url": f"https://en.wikipedia.org{flag_image_url}",
                    "anthem_page_url": f"https://en.wikipedia.org{anthem_page_url}",
                },
            },
        )

    def extract_flag_description(self, response: TextResponse, country_item: dict, urls: dict):
        flag_description_xpath = response.xpath(
            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
        ).get()
        country_item = {**country_item, "flag_description": flag_description_xpath}

        yield scrapy.Request(
            url=urls["flag_image_url"],
            callback=self.extract_flag_images,
            cb_kwargs={
                "country_item": country_item,
                "urls": urls,
            },
        )

    def extract_flag_images(self, response: TextResponse, country_item: dict, urls: dict):
        flag_image_xpath = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()
        country_item = {**country_item, "flag_image_url": f"https:{flag_image_xpath}"}

        yield scrapy.Request(
            url=urls["anthem_page_url"],
            callback=self.extract_anthem_file,
            cb_kwargs={
                "country_item": country_item,
                "urls": urls,
            },
        )

    def extract_anthem_file(self, response: TextResponse, country_item: dict, urls: dict):
        anthem_text = response.xpath("//div[@id='mw-content-text']/div/p").get()
        _anthem_file_url = response.xpath("//tr[contains(@class, 'haudio')]//a/@href").getall()

        anthem_file_url = next(
            (file for file in _anthem_file_url if file.endswith(".ogg") or file.endswith(".oga")), None
        )

        country_scrapy_item = WikipediaCountryScraperItem()
        country_scrapy_item["country_url"] = country_item["country_url"]
        country_scrapy_item["short_country_name"] = country_item["short_country_name"]
        country_scrapy_item["country"] = country_item["country"]
        country_scrapy_item["flag_description"] = country_item["flag_description"]
        country_scrapy_item["anthem"] = anthem_text
        country_scrapy_item["anthem_url"] = urls["anthem_page_url"]
        country_scrapy_item["anthem_file_url"] = f"https://en.wikipedia.org{anthem_file_url}"
        country_scrapy_item["file_urls"] = [
            country_item["flag_image_url"],
            f"https://en.wikipedia.org{anthem_file_url}",
        ]

        yield country_scrapy_item
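To try the spider end to end, it can be run through Scrapy's CrawlerProcess; a minimal sketch, where the spider's import path and the output feed name are assumptions:

from scrapy.crawler import CrawlerProcess

# Hypothetical module path; it matches the package name used by the items import above.
from wikipedia_country_scraper.spiders.countrydownloader import CountrydownloaderSpider

process = CrawlerProcess(settings={"FEEDS": {"countries.json": {"format": "json"}}})
process.crawl(CountrydownloaderSpider)
process.start()  # blocks until the crawl finishes

From inside the project directory, the equivalent CLI call is `scrapy crawl CountrydownloaderSpider -o countries.json`.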