From eda5e670582d6863d981e2f7ffb922a73cecf4c6 Mon Sep 17 00:00:00 2001
From: Daniel Tomlinson
Date: Sun, 26 Jun 2022 17:18:31 +0100
Subject: [PATCH] chore: add flags spider

---
 .../wikipedia_country_scraper/items.py     |  8 ++
 .../wikipedia_country_scraper/pipelines.py | 16 +++-
 .../spiders/capitals.py                    |  1 -
 .../spiders/flags.py                       | 75 +++++++++++++++++++
 4 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py

diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
index 3492afb..2926b42 100644
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/items.py
@@ -29,3 +29,11 @@ class AnthemsItem(scrapy.Item):
 class CapitalsItem(scrapy.Item):
     country_name = scrapy.Field()
     capitals = scrapy.Field()
+
+
+class FlagsItem(scrapy.Item):
+    country_name = scrapy.Field()
+    flag_description_html = scrapy.Field()
+
+    file_urls = scrapy.Field()
+    files = scrapy.Field()
diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py
index 4bcaef9..f381daf 100644
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/pipelines.py
@@ -33,5 +33,19 @@ class AnthemDownloadFilesPipeline(FilesPipeline):
         flag_filename = re.search(r"([^\/]*)$", request.url)
 
         if isinstance(flag_filename, re.Match):
-            if (filename := flag_filename[1]).endswith("ogg") or filename.endswith("oga"):
+            if (
+                (filename := flag_filename[1]).endswith("ogg")
+                or filename.endswith("oga")
+                or filename.endswith("mp3")
+                or filename.endswith("wav")
+            ):
                 return f"files/anthems/{filename}"
+
+
+class FlagDownloadFilesPipeline(FilesPipeline):
+    def file_path(self, request, response=None, info=None, *, item=None):
+        flag_filename = re.search(r"([^\/]*)$", request.url)
+
+        if isinstance(flag_filename, re.Match):
+            if (filename := flag_filename[1]).endswith(".svg"):
+                return f"files/flags/{filename}"
diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py
index bf02974..2126116 100644
--- a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/capitals.py
@@ -32,7 +32,6 @@ class CapitalsSpider(scrapy.Spider):
 
         for country, capitals in zip(country_names, capital_names):
             capital_item = CapitalsItem()
-
             capital_item["country_name"] = country
             capital_item["capitals"] = capitals
 
diff --git a/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py
new file mode 100644
index 0000000..2bc8005
--- /dev/null
+++ b/01_scrapy/wikipedia_country_scraper/wikipedia_country_scraper/spiders/flags.py
@@ -0,0 +1,75 @@
+import pathlib
+
+import scrapy
+from scrapy.http import TextResponse
+from wikipedia_country_scraper.items import FlagsItem
+
+
+class FlagsSpider(scrapy.Spider):
+    name = "FlagsSpider"
+    start_urls = [
+        "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_and_their_capitals_in_native_languages"
+    ]
+    custom_settings = {
+        "ITEM_PIPELINES": {"wikipedia_country_scraper.pipelines.FlagDownloadFilesPipeline": 100},
{"wikipedia_country_scraper.pipelines.FlagDownloadFilesPipeline": 100}, + "FEEDS": { + pathlib.Path(__file__).resolve().parents[4] + / "data" + / "scrapy" + / "raw_country_data" + / "flags.json": {"format": "json", "encoding": "utf8", "store_empty": False, "indent": 2} + }, + } + + def parse(self, response: TextResponse): + _country = response.xpath("//table[@class='wikitable']/tbody/tr[not(@style='background:#ccc;')]") + + country_names = [country_name.get() for country_name in _country.xpath("td[1]//a[@title]/text()")] + country_urls = [ + f"https://en.wikipedia.org{country_url.get()}" for country_url in _country.xpath("td[1]//a[@title]/@href") + ] + + for country_name, country_url in zip(country_names, country_urls): + yield scrapy.Request( + url=country_url, callback=self.get_country_page, cb_kwargs={"country_name": country_name} + ) + + def get_country_page(self, response: TextResponse, country_name: str): + flag_data = response.xpath( + "//table[contains(@class, 'infobox')]/tbody/tr/td/div/div[1]/div/a[not(contains(@href, 'cite_note'))]/@href" + ).getall() + + flag_image_url = flag_data[0] + flag_description_url = flag_data[1] + + yield scrapy.Request( + url=f"https://en.wikipedia.org{flag_description_url}", + callback=self.get_flag_description, + cb_kwargs={"country_name": country_name}, + ) + + yield scrapy.Request( + url=f"https://en.wikipedia.org{flag_image_url}", + callback=self.get_flag_image, + cb_kwargs={"country_name": country_name}, + ) + + def get_flag_description(self, response: TextResponse, country_name: str): + flag_description_result = response.xpath( + "//div[contains(@id, 'mw-content-text')]/div/p[not(contains(@class, 'mw-empty-elt'))]" + ).get() + + flags_item = FlagsItem() + flags_item["country_name"] = country_name + flags_item["flag_description_html"] = flag_description_result + + yield flags_item + + def get_flag_image(self, response: TextResponse, country_name: str): + flag_image_result = response.xpath("//div[contains(@class, 'fullImageLink')]/a/@href").get() + + flags_item = FlagsItem() + flags_item["country_name"] = country_name + flags_item["file_urls"] = [f"https://{flag_image_result}"] + + yield flags_item