chore: add flags spider
This commit is contained in:
@@ -29,3 +29,11 @@ class AnthemsItem(scrapy.Item):
|
|||||||
class CapitalsItem(scrapy.Item):
    """Scraped record pairing a country with its capitals."""

    # Name of the country this record belongs to.
    country_name = scrapy.Field()
    # Capital(s) scraped for that country.
    capitals = scrapy.Field()
|
||||||
|
|
||||||
|
|
||||||
|
class FlagsItem(scrapy.Item):
    """Scraped record describing a country's flag.

    The spider emits a record carrying either the flag description or the
    flag image URL(s) for a given country.
    """

    # Name of the country the flag belongs to.
    country_name = scrapy.Field()
    # Raw HTML of the paragraph describing the flag.
    flag_description_html = scrapy.Field()

    # URLs of the flag files to download (input field of Scrapy's files pipeline).
    file_urls = scrapy.Field()
    # Download results; presumably filled in by the files pipeline, not by the spider.
    files = scrapy.Field()
|
||||||
|
|||||||
@@ -33,5 +33,19 @@ class AnthemDownloadFilesPipeline(FilesPipeline):
|
|||||||
flag_filename = re.search(r"([^\/]*)$", request.url)
|
flag_filename = re.search(r"([^\/]*)$", request.url)
|
||||||
|
|
||||||
if isinstance(flag_filename, re.Match):
|
if isinstance(flag_filename, re.Match):
|
||||||
if (filename := flag_filename[1]).endswith("ogg") or filename.endswith("oga"):
|
if (
|
||||||
|
(filename := flag_filename[1]).endswith("ogg")
|
||||||
|
or filename.endswith("oga")
|
||||||
|
or filename.endswith("mp3")
|
||||||
|
or filename.endswith("wav")
|
||||||
|
):
|
||||||
return f"files/anthems/{filename}"
|
return f"files/anthems/{filename}"
|
||||||
|
|
||||||
|
|
||||||
|
class FlagDownloadFilesPipeline(FilesPipeline):
    """Files pipeline that stores downloaded SVG flags under ``files/flags/``."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path for the file fetched by ``request``.

        SVG files are stored as ``files/flags/<original file name>``, where the
        file name is the last path segment of the request URL. Any other file
        falls back to the default path chosen by the base pipeline, so a path
        string is always returned.

        Args:
            request: Download request whose ``url`` names the file.
            response: Download response, forwarded to the base pipeline.
            info: Pipeline spider info, forwarded to the base pipeline.
            item: Item that triggered the download, forwarded to the base pipeline.

        Returns:
            The path of the file relative to the files store.
        """
        # The last URL segment is the file name (empty if the URL ends in "/").
        filename = request.url.rsplit("/", 1)[-1]

        # NOTE: the suffix test must be applied to the name itself; binding the
        # result of ``endswith`` with ``:=`` would store a bool, not the name.
        if filename.endswith(".svg"):
            return f"files/flags/{filename}"

        return super().file_path(request, response=response, info=info, item=item)
|
||||||
|
|||||||
@@ -32,7 +32,6 @@ class CapitalsSpider(scrapy.Spider):
|
|||||||
|
|
||||||
for country, capitals in zip(country_names, capital_names):
|
for country, capitals in zip(country_names, capital_names):
|
||||||
capital_item = CapitalsItem()
|
capital_item = CapitalsItem()
|
||||||
|
|
||||||
capital_item["country_name"] = country
|
capital_item["country_name"] = country
|
||||||
capital_item["capitals"] = capitals
|
capital_item["capitals"] = capitals
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,75 @@
|
|||||||
|
import pathlib
|
||||||
|
|
||||||
|
import scrapy
|
||||||
|
from scrapy.http import TextResponse
|
||||||
|
from wikipedia_country_scraper.items import FlagsItem
|
||||||
|
|
||||||
|
|
||||||
|
class FlagsSpider(scrapy.Spider):
    """Scrape flag descriptions and flag image files for each listed country.

    Crawl flow:
      1. ``parse`` reads the country list table and follows each country link.
      2. ``get_country_page`` reads the flag links from the country infobox and
         follows both the flag image page and the flag description page.
      3. ``get_flag_description`` and ``get_flag_image`` each yield a
         ``FlagsItem`` (description HTML, or image URL for download).
    """

    name = "FlagsSpider"
    start_urls = [
        "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages"
    ]
    custom_settings = {
        # Downloads every URL in ``file_urls`` through the flag files pipeline.
        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.FlagDownloadFilesPipeline": 100},
        # Items are exported as JSON below the directory four levels above this file.
        "FEEDS": {
            pathlib.Path(__file__).resolve().parents[4]
            / "data"
            / "scrapy"
            / "raw_country_data"
            / "flags.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2}
        },
    }

    def parse(self, response: TextResponse):
        """Follow the link of every country found in the list table.

        Args:
            response: Response of the country list page.

        Yields:
            One request per country link, carrying ``country_name`` to
            ``get_country_page``.
        """
        rows = response.xpath("//table[@class='wikitable']/tbody/tr[not(@style='background:#ccc;')]")

        # Name and href are read from the same anchor so they can never get out
        # of step (two separate queries zipped together misalign as soon as one
        # anchor has no direct text node).
        for anchor in rows.xpath("td[1]//a[@title]"):
            country_name = anchor.xpath("text()").get()
            country_href = anchor.xpath("@href").get()
            if not country_name or not country_href:
                continue

            yield scrapy.Request(
                url=response.urljoin(country_href),
                callback=self.get_country_page,
                cb_kwargs={"country_name": country_name},
            )

    def get_country_page(self, response: TextResponse, country_name: str):
        """Follow the flag image and flag description links of a country page.

        The first matching infobox link is treated as the flag image page and
        the second as the flag description page. Pages without both links are
        skipped with a warning.

        Args:
            response: Response of the country article.
            country_name: Country name forwarded from ``parse``.

        Yields:
            A request for the flag description page and one for the flag image page.
        """
        flag_links = response.xpath(
            "//table[contains(@class, 'infobox')]/tbody/tr/td/div/div[1]/div/a[not(contains(@href, 'cite_note'))]/@href"
        ).getall()

        if len(flag_links) < 2:
            self.logger.warning("No flag links found for %s (%s)", country_name, response.url)
            return

        flag_image_href, flag_description_href = flag_links[0], flag_links[1]

        yield scrapy.Request(
            url=response.urljoin(flag_description_href),
            callback=self.get_flag_description,
            cb_kwargs={"country_name": country_name},
        )

        yield scrapy.Request(
            url=response.urljoin(flag_image_href),
            callback=self.get_flag_image,
            cb_kwargs={"country_name": country_name},
        )

    def get_flag_description(self, response: TextResponse, country_name: str):
        """Yield an item holding the first non-empty paragraph of the flag article.

        Args:
            response: Response of the flag description page.
            country_name: Country name forwarded from ``get_country_page``.

        Yields:
            A ``FlagsItem`` with ``country_name`` and ``flag_description_html``
            (``None`` when no paragraph matches).
        """
        description_html = response.xpath(
            "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]"
        ).get()

        flags_item = FlagsItem()
        flags_item["country_name"] = country_name
        flags_item["flag_description_html"] = description_html

        yield flags_item

    def get_flag_image(self, response: TextResponse, country_name: str):
        """Yield an item whose ``file_urls`` points at the full-size flag file.

        Args:
            response: Response of the flag file page.
            country_name: Country name forwarded from ``get_country_page``.

        Yields:
            A ``FlagsItem`` with ``country_name`` and ``file_urls``; nothing
            when the page has no full-size image link.
        """
        image_href = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get()

        if not image_href:
            self.logger.warning("No flag image link found for %s (%s)", country_name, response.url)
            return

        flags_item = FlagsItem()
        flags_item["country_name"] = country_name
        # ``urljoin`` resolves protocol-relative ("//host/path"), absolute and
        # relative hrefs; prefixing "https://" by hand would corrupt the first two.
        flags_item["file_urls"] = [response.urljoin(image_href)]

        yield flags_item
|
||||||
Reference in New Issue
Block a user